author     Linus Torvalds <torvalds@linux-foundation.org>   2011-07-24 12:07:03 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2011-07-24 12:07:03 -0400
commit     5fabc487c96819dd12ddb9414835d170fd9cd6d5
tree       01532d492e5074b0d3add29bf92ebf9a9d161e9e
parent     c61264f98c1a974ee6f545f61a4ab33b141d6bda
parent     3f68b0318bbbd61bf08478ab99a149f0d9e5156e
Merge branch 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits)
KVM: IOMMU: Disable device assignment without interrupt remapping
KVM: MMU: trace mmio page fault
KVM: MMU: mmio page fault support
KVM: MMU: reorganize struct kvm_shadow_walk_iterator
KVM: MMU: lockless walking shadow page table
KVM: MMU: do not need atomicly to set/clear spte
KVM: MMU: introduce the rules to modify shadow page table
KVM: MMU: abstract some functions to handle fault pfn
KVM: MMU: filter out the mmio pfn from the fault pfn
KVM: MMU: remove bypass_guest_pf
KVM: MMU: split kvm_mmu_free_page
KVM: MMU: count used shadow pages on prepareing path
KVM: MMU: rename 'pt_write' to 'emulate'
KVM: MMU: cleanup for FNAME(fetch)
KVM: MMU: optimize to handle dirty bit
KVM: MMU: cache mmio info on page fault path
KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup the code
KVM: MMU: do not update slot bitmap if spte is nonpresent
KVM: MMU: fix walking shadow page table
KVM guest: KVM Steal time registration
...
102 files changed, 12320 insertions, 3679 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index aa47be71df4c..40cc653984ee 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1159,10 +1159,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
1159 | for all guests. | 1159 | for all guests. |
1160 | Default is 1 (enabled) if in 64bit or 32bit-PAE mode | 1160 | Default is 1 (enabled) if in 64bit or 32bit-PAE mode |
1161 | 1161 | ||
1162 | kvm-intel.bypass_guest_pf= | ||
1163 | [KVM,Intel] Disables bypassing of guest page faults | ||
1164 | on Intel chips. Default is 1 (enabled) | ||
1165 | |||
1166 | kvm-intel.ept= [KVM,Intel] Disable extended page tables | 1162 | kvm-intel.ept= [KVM,Intel] Disable extended page tables |
1167 | (virtualized MMU) support on capable Intel chips. | 1163 | (virtualized MMU) support on capable Intel chips. |
1168 | Default is 1 (enabled) | 1164 | Default is 1 (enabled) |
@@ -1737,6 +1733,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
1737 | no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page | 1733 | no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page |
1738 | fault handling. | 1734 | fault handling. |
1739 | 1735 | ||
1736 | no-steal-acc [X86,KVM] Disable paravirtualized steal time accounting. | ||
1737 | Steal time is computed, but won't influence scheduler | ||
1738 | behaviour. | ||
1739 | |||
1740 | nolapic [X86-32,APIC] Do not enable or use the local APIC. | 1740 | nolapic [X86-32,APIC] Do not enable or use the local APIC. |
1741 | 1741 | ||
1742 | nolapic_timer [X86-32,APIC] Do not use the local APIC timer. | 1742 | nolapic_timer [X86-32,APIC] Do not use the local APIC timer. |
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 42542eb802ca..b0e4b9cd6a66 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt | |||
@@ -180,6 +180,19 @@ KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time. | |||
180 | If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 | 180 | If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 |
181 | cpus max. | 181 | cpus max. |
182 | 182 | ||
183 | On powerpc using book3s_hv mode, the vcpus are mapped onto virtual | ||
184 | threads in one or more virtual CPU cores. (This is because the | ||
185 | hardware requires all the hardware threads in a CPU core to be in the | ||
186 | same partition.) The KVM_CAP_PPC_SMT capability indicates the number | ||
187 | of vcpus per virtual core (vcore). The vcore id is obtained by | ||
188 | dividing the vcpu id by the number of vcpus per vcore. The vcpus in a | ||
189 | given vcore will always be in the same physical core as each other | ||
190 | (though that might be a different physical core from time to time). | ||
191 | Userspace can control the threading (SMT) mode of the guest by its | ||
192 | allocation of vcpu ids. For example, if userspace wants | ||
193 | single-threaded guest vcpus, it should make all vcpu ids be a multiple | ||
194 | of the number of vcpus per vcore. | ||
195 | |||
183 | 4.8 KVM_GET_DIRTY_LOG (vm ioctl) | 196 | 4.8 KVM_GET_DIRTY_LOG (vm ioctl) |
184 | 197 | ||
185 | Capability: basic | 198 | Capability: basic |
@@ -1143,15 +1156,10 @@ Assigns an IRQ to a passed-through device. | |||
1143 | 1156 | ||
1144 | struct kvm_assigned_irq { | 1157 | struct kvm_assigned_irq { |
1145 | __u32 assigned_dev_id; | 1158 | __u32 assigned_dev_id; |
1146 | __u32 host_irq; | 1159 | __u32 host_irq; /* ignored (legacy field) */ |
1147 | __u32 guest_irq; | 1160 | __u32 guest_irq; |
1148 | __u32 flags; | 1161 | __u32 flags; |
1149 | union { | 1162 | union { |
1150 | struct { | ||
1151 | __u32 addr_lo; | ||
1152 | __u32 addr_hi; | ||
1153 | __u32 data; | ||
1154 | } guest_msi; | ||
1155 | __u32 reserved[12]; | 1163 | __u32 reserved[12]; |
1156 | }; | 1164 | }; |
1157 | }; | 1165 | }; |
@@ -1239,8 +1247,10 @@ Type: vm ioctl | |||
1239 | Parameters: struct kvm_assigned_msix_nr (in) | 1247 | Parameters: struct kvm_assigned_msix_nr (in) |
1240 | Returns: 0 on success, -1 on error | 1248 | Returns: 0 on success, -1 on error |
1241 | 1249 | ||
1242 | Set the number of MSI-X interrupts for an assigned device. This service can | 1250 | Set the number of MSI-X interrupts for an assigned device. The number is |
1243 | only be called once in the lifetime of an assigned device. | 1251 | reset again by terminating the MSI-X assignment of the device via |
1252 | KVM_DEASSIGN_DEV_IRQ. Calling this service more than once at any earlier | ||
1253 | point will fail. | ||
1244 | 1254 | ||
1245 | struct kvm_assigned_msix_nr { | 1255 | struct kvm_assigned_msix_nr { |
1246 | __u32 assigned_dev_id; | 1256 | __u32 assigned_dev_id; |
@@ -1291,6 +1301,135 @@ Returns the tsc frequency of the guest. The unit of the return value is | |||
1291 | KHz. If the host has unstable tsc this ioctl returns -EIO instead as an | 1301 | KHz. If the host has unstable tsc this ioctl returns -EIO instead as an |
1292 | error. | 1302 | error. |
1293 | 1303 | ||
1304 | 4.56 KVM_GET_LAPIC | ||
1305 | |||
1306 | Capability: KVM_CAP_IRQCHIP | ||
1307 | Architectures: x86 | ||
1308 | Type: vcpu ioctl | ||
1309 | Parameters: struct kvm_lapic_state (out) | ||
1310 | Returns: 0 on success, -1 on error | ||
1311 | |||
1312 | #define KVM_APIC_REG_SIZE 0x400 | ||
1313 | struct kvm_lapic_state { | ||
1314 | char regs[KVM_APIC_REG_SIZE]; | ||
1315 | }; | ||
1316 | |||
1317 | Reads the Local APIC registers and copies them into the input argument. The | ||
1318 | data format and layout are the same as documented in the architecture manual. | ||
1319 | |||
1320 | 4.57 KVM_SET_LAPIC | ||
1321 | |||
1322 | Capability: KVM_CAP_IRQCHIP | ||
1323 | Architectures: x86 | ||
1324 | Type: vcpu ioctl | ||
1325 | Parameters: struct kvm_lapic_state (in) | ||
1326 | Returns: 0 on success, -1 on error | ||
1327 | |||
1328 | #define KVM_APIC_REG_SIZE 0x400 | ||
1329 | struct kvm_lapic_state { | ||
1330 | char regs[KVM_APIC_REG_SIZE]; | ||
1331 | }; | ||
1332 | |||
1333 | Copies the input argument into the Local APIC registers. The data format | ||
1334 | and layout are the same as documented in the architecture manual. | ||
1335 | |||
1336 | 4.58 KVM_IOEVENTFD | ||
1337 | |||
1338 | Capability: KVM_CAP_IOEVENTFD | ||
1339 | Architectures: all | ||
1340 | Type: vm ioctl | ||
1341 | Parameters: struct kvm_ioeventfd (in) | ||
1342 | Returns: 0 on success, !0 on error | ||
1343 | |||
1344 | This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address | ||
1345 | within the guest. A guest write in the registered address will signal the | ||
1346 | provided event instead of triggering an exit. | ||
1347 | |||
1348 | struct kvm_ioeventfd { | ||
1349 | __u64 datamatch; | ||
1350 | __u64 addr; /* legal pio/mmio address */ | ||
1351 | __u32 len; /* 1, 2, 4, or 8 bytes */ | ||
1352 | __s32 fd; | ||
1353 | __u32 flags; | ||
1354 | __u8 pad[36]; | ||
1355 | }; | ||
1356 | |||
1357 | The following flags are defined: | ||
1358 | |||
1359 | #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) | ||
1360 | #define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) | ||
1361 | #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) | ||
1362 | |||
1363 | If datamatch flag is set, the event will be signaled only if the written value | ||
1364 | to the registered address is equal to datamatch in struct kvm_ioeventfd. | ||
1365 | |||
1366 | 4.62 KVM_CREATE_SPAPR_TCE | ||
1367 | |||
1368 | Capability: KVM_CAP_SPAPR_TCE | ||
1369 | Architectures: powerpc | ||
1370 | Type: vm ioctl | ||
1371 | Parameters: struct kvm_create_spapr_tce (in) | ||
1372 | Returns: file descriptor for manipulating the created TCE table | ||
1373 | |||
1374 | This creates a virtual TCE (translation control entry) table, which | ||
1375 | is an IOMMU for PAPR-style virtual I/O. It is used to translate | ||
1376 | logical addresses used in virtual I/O into guest physical addresses, | ||
1377 | and provides a scatter/gather capability for PAPR virtual I/O. | ||
1378 | |||
1379 | /* for KVM_CAP_SPAPR_TCE */ | ||
1380 | struct kvm_create_spapr_tce { | ||
1381 | __u64 liobn; | ||
1382 | __u32 window_size; | ||
1383 | }; | ||
1384 | |||
1385 | The liobn field gives the logical IO bus number for which to create a | ||
1386 | TCE table. The window_size field specifies the size of the DMA window | ||
1387 | which this TCE table will translate - the table will contain one 64 | ||
1388 | bit TCE entry for every 4kiB of the DMA window. | ||
1389 | |||
1390 | When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE | ||
1391 | table has been created using this ioctl(), the kernel will handle it | ||
1392 | in real mode, updating the TCE table. H_PUT_TCE calls for other | ||
1393 | liobns will cause a vm exit and must be handled by userspace. | ||
1394 | |||
1395 | The return value is a file descriptor which can be passed to mmap(2) | ||
1396 | to map the created TCE table into userspace. This lets userspace read | ||
1397 | the entries written by kernel-handled H_PUT_TCE calls, and also lets | ||
1398 | userspace update the TCE table directly which is useful in some | ||
1399 | circumstances. | ||
1400 | |||
1401 | 4.63 KVM_ALLOCATE_RMA | ||
1402 | |||
1403 | Capability: KVM_CAP_PPC_RMA | ||
1404 | Architectures: powerpc | ||
1405 | Type: vm ioctl | ||
1406 | Parameters: struct kvm_allocate_rma (out) | ||
1407 | Returns: file descriptor for mapping the allocated RMA | ||
1408 | |||
1409 | This allocates a Real Mode Area (RMA) from the pool allocated at boot | ||
1410 | time by the kernel. An RMA is a physically-contiguous, aligned region | ||
1411 | of memory used on older POWER processors to provide the memory which | ||
1412 | will be accessed by real-mode (MMU off) accesses in a KVM guest. | ||
1413 | POWER processors support a set of sizes for the RMA that usually | ||
1414 | includes 64MB, 128MB, 256MB and some larger powers of two. | ||
1415 | |||
1416 | /* for KVM_ALLOCATE_RMA */ | ||
1417 | struct kvm_allocate_rma { | ||
1418 | __u64 rma_size; | ||
1419 | }; | ||
1420 | |||
1421 | The return value is a file descriptor which can be passed to mmap(2) | ||
1422 | to map the allocated RMA into userspace. The mapped area can then be | ||
1423 | passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the | ||
1424 | RMA for a virtual machine. The size of the RMA in bytes (which is | ||
1425 | fixed at host kernel boot time) is returned in the rma_size field of | ||
1426 | the argument structure. | ||
1427 | |||
1428 | The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl | ||
1429 | is supported; 2 if the processor requires all virtual machines to have | ||
1430 | an RMA, or 1 if the processor can use an RMA but doesn't require it, | ||
1431 | because it supports the Virtual RMA (VRMA) facility. | ||
1432 | |||
1294 | 5. The kvm_run structure | 1433 | 5. The kvm_run structure |
1295 | 1434 | ||
1296 | Application code obtains a pointer to the kvm_run structure by | 1435 | Application code obtains a pointer to the kvm_run structure by |
@@ -1473,6 +1612,23 @@ Userspace can now handle the hypercall and when it's done modify the gprs as | |||
1473 | necessary. Upon guest entry all guest GPRs will then be replaced by the values | 1612 | necessary. Upon guest entry all guest GPRs will then be replaced by the values |
1474 | in this struct. | 1613 | in this struct. |
1475 | 1614 | ||
1615 | /* KVM_EXIT_PAPR_HCALL */ | ||
1616 | struct { | ||
1617 | __u64 nr; | ||
1618 | __u64 ret; | ||
1619 | __u64 args[9]; | ||
1620 | } papr_hcall; | ||
1621 | |||
1622 | This is used on 64-bit PowerPC when emulating a pSeries partition, | ||
1623 | e.g. with the 'pseries' machine type in qemu. It occurs when the | ||
1624 | guest does a hypercall using the 'sc 1' instruction. The 'nr' field | ||
1625 | contains the hypercall number (from the guest R3), and 'args' contains | ||
1626 | the arguments (from the guest R4 - R12). Userspace should put the | ||
1627 | return code in 'ret' and any extra returned values in args[]. | ||
1628 | The possible hypercalls are defined in the Power Architecture Platform | ||
1629 | Requirements (PAPR) document available from www.power.org (free | ||
1630 | developer registration required to access it). | ||
1631 | |||
1476 | /* Fix the size of the union. */ | 1632 | /* Fix the size of the union. */ |
1477 | char padding[256]; | 1633 | char padding[256]; |
1478 | }; | 1634 | }; |
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index f46aa58389ca..5dc972c09b55 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt | |||
@@ -165,6 +165,10 @@ Shadow pages contain the following information: | |||
165 | Contains the value of efer.nxe for which the page is valid. | 165 | Contains the value of efer.nxe for which the page is valid. |
166 | role.cr0_wp: | 166 | role.cr0_wp: |
167 | Contains the value of cr0.wp for which the page is valid. | 167 | Contains the value of cr0.wp for which the page is valid. |
168 | role.smep_andnot_wp: | ||
169 | Contains the value of cr4.smep && !cr0.wp for which the page is valid | ||
170 | (pages for which this is true are different from other pages; see the | ||
171 | treatment of cr0.wp=0 below). | ||
168 | gfn: | 172 | gfn: |
169 | Either the guest page table containing the translations shadowed by this | 173 | Either the guest page table containing the translations shadowed by this |
170 | page, or the base page frame for linear translations. See role.direct. | 174 | page, or the base page frame for linear translations. See role.direct. |
@@ -317,6 +321,20 @@ on fault type: | |||
317 | 321 | ||
318 | (user write faults generate a #PF) | 322 | (user write faults generate a #PF) |
319 | 323 | ||
324 | In the first case there is an additional complication if CR4.SMEP is | ||
325 | enabled: since we've turned the page into a kernel page, the kernel may now | ||
326 | execute it. We handle this by also setting spte.nx. If we get a user | ||
327 | fetch or read fault, we'll change spte.u=1 and spte.nx=gpte.nx back. | ||
328 | |||
329 | To prevent an spte that was converted into a kernel page with cr0.wp=0 | ||
330 | from being written by the kernel after cr0.wp has changed to 1, we make | ||
331 | the value of cr0.wp part of the page role. This means that an spte created | ||
332 | with one value of cr0.wp cannot be used when cr0.wp has a different value - | ||
333 | it will simply be missed by the shadow page lookup code. A similar issue | ||
334 | exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after | ||
335 | changing cr4.smep to 1. To avoid this, the value of !cr0.wp && cr4.smep | ||
336 | is also made a part of the page role. | ||
337 | |||
320 | Large pages | 338 | Large pages |
321 | =========== | 339 | =========== |
322 | 340 | ||
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index d079aed27e03..50317809113d 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt | |||
@@ -185,3 +185,37 @@ MSR_KVM_ASYNC_PF_EN: 0x4b564d02 | |||
185 | 185 | ||
186 | Currently type 2 APF will be always delivered on the same vcpu as | 186 | Currently type 2 APF will be always delivered on the same vcpu as |
187 | type 1 was, but guest should not rely on that. | 187 | type 1 was, but guest should not rely on that. |
188 | |||
189 | MSR_KVM_STEAL_TIME: 0x4b564d03 | ||
190 | |||
191 | data: 64-byte aligned physical address of a memory area which must be | ||
192 | in guest RAM, plus an enable bit in bit 0. This memory is expected to | ||
193 | hold a copy of the following structure: | ||
194 | |||
195 | struct kvm_steal_time { | ||
196 | __u64 steal; | ||
197 | __u32 version; | ||
198 | __u32 flags; | ||
199 | __u32 pad[12]; | ||
200 | } | ||
201 | |||
202 | whose data will be filled in by the hypervisor periodically. Only one | ||
203 | write, or registration, is needed for each VCPU. The interval between | ||
204 | updates of this structure is arbitrary and implementation-dependent. | ||
205 | The hypervisor may update this structure at any time it sees fit until | ||
206 | anything with bit0 == 0 is written to it. Guest is required to make sure | ||
207 | this structure is initialized to zero. | ||
208 | |||
209 | Fields have the following meanings: | ||
210 | |||
211 | version: a sequence counter. In other words, guest has to check | ||
212 | this field before and after grabbing time information and make | ||
213 | sure they are both equal and even. An odd version indicates an | ||
214 | in-progress update. | ||
215 | |||
216 | flags: At this point, always zero. May be used to indicate | ||
217 | changes in this structure in the future. | ||
218 | |||
219 | steal: the amount of time in which this vCPU did not run, in | ||
220 | nanoseconds. Time during which the vcpu is idle will not be | ||
221 | reported as steal time. | ||
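A guest-side sketch of the version protocol described above, assuming the structure layout shown; the memory-barrier choice is illustrative.

#include <stdint.h>

struct kvm_steal_time {
	uint64_t steal;
	uint32_t version;
	uint32_t flags;
	uint32_t pad[12];
};

/* Sample 'steal' consistently: retry while version is odd (update in
 * progress) or changed between the two reads. */
static uint64_t read_steal_ns(volatile struct kvm_steal_time *st)
{
	uint32_t v1, v2;
	uint64_t steal;

	do {
		v1 = st->version;
		__sync_synchronize();
		steal = st->steal;
		__sync_synchronize();
		v2 = st->version;
	} while ((v1 & 1) || v1 != v2);

	return steal;
}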
diff --git a/Documentation/virtual/kvm/nested-vmx.txt b/Documentation/virtual/kvm/nested-vmx.txt new file mode 100644 index 000000000000..8ed937de1163 --- /dev/null +++ b/Documentation/virtual/kvm/nested-vmx.txt | |||
@@ -0,0 +1,251 @@ | |||
1 | Nested VMX | ||
2 | ========== | ||
3 | |||
4 | Overview | ||
5 | --------- | ||
6 | |||
7 | On Intel processors, KVM uses Intel's VMX (Virtual-Machine eXtensions) | ||
8 | to easily and efficiently run guest operating systems. Normally, these guests | ||
9 | *cannot* themselves be hypervisors running their own guests, because in VMX, | ||
10 | guests cannot use VMX instructions. | ||
11 | |||
12 | The "Nested VMX" feature adds this missing capability - of running guest | ||
13 | hypervisors (which use VMX) with their own nested guests. It does so by | ||
14 | allowing a guest to use VMX instructions, and correctly and efficiently | ||
15 | emulating them using the single level of VMX available in the hardware. | ||
16 | |||
17 | We describe in much greater detail the theory behind the nested VMX feature, | ||
18 | its implementation and its performance characteristics, in the OSDI 2010 paper | ||
19 | "The Turtles Project: Design and Implementation of Nested Virtualization", | ||
20 | available at: | ||
21 | |||
22 | http://www.usenix.org/events/osdi10/tech/full_papers/Ben-Yehuda.pdf | ||
23 | |||
24 | |||
25 | Terminology | ||
26 | ----------- | ||
27 | |||
28 | Single-level virtualization has two levels - the host (KVM) and the guests. | ||
29 | In nested virtualization, we have three levels: The host (KVM), which we call | ||
30 | L0, the guest hypervisor, which we call L1, and its nested guest, which we | ||
31 | call L2. | ||
32 | |||
33 | |||
34 | Known limitations | ||
35 | ----------------- | ||
36 | |||
37 | The current code supports running Linux guests under KVM guests. | ||
38 | Only 64-bit guest hypervisors are supported. | ||
39 | |||
40 | Additional patches for running Windows under guest KVM, and Linux under | ||
41 | guest VMware server, and support for nested EPT, are currently running in | ||
42 | the lab, and will be sent as follow-on patchsets. | ||
43 | |||
44 | |||
45 | Running nested VMX | ||
46 | ------------------ | ||
47 | |||
48 | The nested VMX feature is disabled by default. It can be enabled by giving | ||
49 | the "nested=1" option to the kvm-intel module. | ||
50 | |||
51 | No modifications are required to user space (qemu). However, qemu's default | ||
52 | emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be | ||
53 | explicitly enabled, by giving qemu one of the following options: | ||
54 | |||
55 | -cpu host (emulated CPU has all features of the real CPU) | ||
56 | |||
57 | -cpu qemu64,+vmx (add just the vmx feature to a named CPU type) | ||
58 | |||
59 | |||
60 | ABIs | ||
61 | ---- | ||
62 | |||
63 | Nested VMX aims to present a standard and (eventually) fully-functional VMX | ||
64 | implementation for a guest hypervisor to use. As such, the official | ||
65 | specification of the ABI that it provides is Intel's VMX specification, | ||
66 | namely volume 3B of their "Intel 64 and IA-32 Architectures Software | ||
67 | Developer's Manual". Not all of VMX's features are currently fully supported, | ||
68 | but the goal is to eventually support them all, starting with the VMX features | ||
69 | which are used in practice by popular hypervisors (KVM and others). | ||
70 | |||
71 | As a VMX implementation, nested VMX presents a VMCS structure to L1. | ||
72 | As mandated by the spec, other than the two fields revision_id and abort, | ||
73 | this structure is *opaque* to its user, who is not supposed to know or care | ||
74 | about its internal structure. Rather, the structure is accessed through the | ||
75 | VMREAD and VMWRITE instructions. | ||
76 | Still, for debugging purposes, KVM developers might be interested to know the | ||
77 | internals of this structure; This is struct vmcs12 from arch/x86/kvm/vmx.c. | ||
78 | |||
79 | The name "vmcs12" refers to the VMCS that L1 builds for L2. In the code we | ||
80 | also have "vmcs01", the VMCS that L0 built for L1, and "vmcs02" is the VMCS | ||
81 | which L0 builds to actually run L2 - how this is done is explained in the | ||
82 | aforementioned paper. | ||
83 | |||
84 | For convenience, we repeat the content of struct vmcs12 here. If the internals | ||
85 | of this structure change, this can break live migration across KVM versions. | ||
86 | VMCS12_REVISION (from vmx.c) should be changed if struct vmcs12 or its inner | ||
87 | struct shadow_vmcs is ever changed. | ||
88 | |||
89 | typedef u64 natural_width; | ||
90 | struct __packed vmcs12 { | ||
91 | /* According to the Intel spec, a VMCS region must start with | ||
92 | * these two user-visible fields */ | ||
93 | u32 revision_id; | ||
94 | u32 abort; | ||
95 | |||
96 | u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ | ||
97 | u32 padding[7]; /* room for future expansion */ | ||
98 | |||
99 | u64 io_bitmap_a; | ||
100 | u64 io_bitmap_b; | ||
101 | u64 msr_bitmap; | ||
102 | u64 vm_exit_msr_store_addr; | ||
103 | u64 vm_exit_msr_load_addr; | ||
104 | u64 vm_entry_msr_load_addr; | ||
105 | u64 tsc_offset; | ||
106 | u64 virtual_apic_page_addr; | ||
107 | u64 apic_access_addr; | ||
108 | u64 ept_pointer; | ||
109 | u64 guest_physical_address; | ||
110 | u64 vmcs_link_pointer; | ||
111 | u64 guest_ia32_debugctl; | ||
112 | u64 guest_ia32_pat; | ||
113 | u64 guest_ia32_efer; | ||
114 | u64 guest_pdptr0; | ||
115 | u64 guest_pdptr1; | ||
116 | u64 guest_pdptr2; | ||
117 | u64 guest_pdptr3; | ||
118 | u64 host_ia32_pat; | ||
119 | u64 host_ia32_efer; | ||
120 | u64 padding64[8]; /* room for future expansion */ | ||
121 | natural_width cr0_guest_host_mask; | ||
122 | natural_width cr4_guest_host_mask; | ||
123 | natural_width cr0_read_shadow; | ||
124 | natural_width cr4_read_shadow; | ||
125 | natural_width cr3_target_value0; | ||
126 | natural_width cr3_target_value1; | ||
127 | natural_width cr3_target_value2; | ||
128 | natural_width cr3_target_value3; | ||
129 | natural_width exit_qualification; | ||
130 | natural_width guest_linear_address; | ||
131 | natural_width guest_cr0; | ||
132 | natural_width guest_cr3; | ||
133 | natural_width guest_cr4; | ||
134 | natural_width guest_es_base; | ||
135 | natural_width guest_cs_base; | ||
136 | natural_width guest_ss_base; | ||
137 | natural_width guest_ds_base; | ||
138 | natural_width guest_fs_base; | ||
139 | natural_width guest_gs_base; | ||
140 | natural_width guest_ldtr_base; | ||
141 | natural_width guest_tr_base; | ||
142 | natural_width guest_gdtr_base; | ||
143 | natural_width guest_idtr_base; | ||
144 | natural_width guest_dr7; | ||
145 | natural_width guest_rsp; | ||
146 | natural_width guest_rip; | ||
147 | natural_width guest_rflags; | ||
148 | natural_width guest_pending_dbg_exceptions; | ||
149 | natural_width guest_sysenter_esp; | ||
150 | natural_width guest_sysenter_eip; | ||
151 | natural_width host_cr0; | ||
152 | natural_width host_cr3; | ||
153 | natural_width host_cr4; | ||
154 | natural_width host_fs_base; | ||
155 | natural_width host_gs_base; | ||
156 | natural_width host_tr_base; | ||
157 | natural_width host_gdtr_base; | ||
158 | natural_width host_idtr_base; | ||
159 | natural_width host_ia32_sysenter_esp; | ||
160 | natural_width host_ia32_sysenter_eip; | ||
161 | natural_width host_rsp; | ||
162 | natural_width host_rip; | ||
163 | natural_width paddingl[8]; /* room for future expansion */ | ||
164 | u32 pin_based_vm_exec_control; | ||
165 | u32 cpu_based_vm_exec_control; | ||
166 | u32 exception_bitmap; | ||
167 | u32 page_fault_error_code_mask; | ||
168 | u32 page_fault_error_code_match; | ||
169 | u32 cr3_target_count; | ||
170 | u32 vm_exit_controls; | ||
171 | u32 vm_exit_msr_store_count; | ||
172 | u32 vm_exit_msr_load_count; | ||
173 | u32 vm_entry_controls; | ||
174 | u32 vm_entry_msr_load_count; | ||
175 | u32 vm_entry_intr_info_field; | ||
176 | u32 vm_entry_exception_error_code; | ||
177 | u32 vm_entry_instruction_len; | ||
178 | u32 tpr_threshold; | ||
179 | u32 secondary_vm_exec_control; | ||
180 | u32 vm_instruction_error; | ||
181 | u32 vm_exit_reason; | ||
182 | u32 vm_exit_intr_info; | ||
183 | u32 vm_exit_intr_error_code; | ||
184 | u32 idt_vectoring_info_field; | ||
185 | u32 idt_vectoring_error_code; | ||
186 | u32 vm_exit_instruction_len; | ||
187 | u32 vmx_instruction_info; | ||
188 | u32 guest_es_limit; | ||
189 | u32 guest_cs_limit; | ||
190 | u32 guest_ss_limit; | ||
191 | u32 guest_ds_limit; | ||
192 | u32 guest_fs_limit; | ||
193 | u32 guest_gs_limit; | ||
194 | u32 guest_ldtr_limit; | ||
195 | u32 guest_tr_limit; | ||
196 | u32 guest_gdtr_limit; | ||
197 | u32 guest_idtr_limit; | ||
198 | u32 guest_es_ar_bytes; | ||
199 | u32 guest_cs_ar_bytes; | ||
200 | u32 guest_ss_ar_bytes; | ||
201 | u32 guest_ds_ar_bytes; | ||
202 | u32 guest_fs_ar_bytes; | ||
203 | u32 guest_gs_ar_bytes; | ||
204 | u32 guest_ldtr_ar_bytes; | ||
205 | u32 guest_tr_ar_bytes; | ||
206 | u32 guest_interruptibility_info; | ||
207 | u32 guest_activity_state; | ||
208 | u32 guest_sysenter_cs; | ||
209 | u32 host_ia32_sysenter_cs; | ||
210 | u32 padding32[8]; /* room for future expansion */ | ||
211 | u16 virtual_processor_id; | ||
212 | u16 guest_es_selector; | ||
213 | u16 guest_cs_selector; | ||
214 | u16 guest_ss_selector; | ||
215 | u16 guest_ds_selector; | ||
216 | u16 guest_fs_selector; | ||
217 | u16 guest_gs_selector; | ||
218 | u16 guest_ldtr_selector; | ||
219 | u16 guest_tr_selector; | ||
220 | u16 host_es_selector; | ||
221 | u16 host_cs_selector; | ||
222 | u16 host_ss_selector; | ||
223 | u16 host_ds_selector; | ||
224 | u16 host_fs_selector; | ||
225 | u16 host_gs_selector; | ||
226 | u16 host_tr_selector; | ||
227 | }; | ||
228 | |||
229 | |||
230 | Authors | ||
231 | ------- | ||
232 | |||
233 | These patches were written by: | ||
234 | Abel Gordon, abelg <at> il.ibm.com | ||
235 | Nadav Har'El, nyh <at> il.ibm.com | ||
236 | Orit Wasserman, oritw <at> il.ibm.com | ||
237 | Ben-Ami Yassor, benami <at> il.ibm.com | ||
238 | Muli Ben-Yehuda, muli <at> il.ibm.com | ||
239 | |||
240 | With contributions by: | ||
241 | Anthony Liguori, aliguori <at> us.ibm.com | ||
242 | Mike Day, mdday <at> us.ibm.com | ||
243 | Michael Factor, factor <at> il.ibm.com | ||
244 | Zvi Dubitzky, dubi <at> il.ibm.com | ||
245 | |||
246 | And valuable reviews by: | ||
247 | Avi Kivity, avi <at> redhat.com | ||
248 | Gleb Natapov, gleb <at> redhat.com | ||
249 | Marcelo Tosatti, mtosatti <at> redhat.com | ||
250 | Kevin Tian, kevin.tian <at> intel.com | ||
251 | and others. | ||
diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt index 3ab969c59046..2b7ce190cde4 100644 --- a/Documentation/virtual/kvm/ppc-pv.txt +++ b/Documentation/virtual/kvm/ppc-pv.txt | |||
@@ -68,9 +68,11 @@ page that contains parts of supervisor visible register state. The guest can | |||
68 | map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. | 68 | map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. |
69 | 69 | ||
70 | With this hypercall issued the guest always gets the magic page mapped at the | 70 | With this hypercall issued the guest always gets the magic page mapped at the |
71 | desired location in effective and physical address space. For now, we always | 71 | desired location. The first parameter indicates the effective address when the |
72 | map the page to -4096. This way we can access it using absolute load and store | 72 | MMU is enabled. The second parameter indicates the address in real mode, if |
73 | functions. The following instruction reads the first field of the magic page: | 73 | applicable to the target. For now, we always map the page to -4096. This way we |
74 | can access it using absolute load and store functions. The following | ||
75 | instruction reads the first field of the magic page: | ||
74 | 76 | ||
75 | ld rX, -4096(0) | 77 | ld rX, -4096(0) |
76 | 78 | ||
diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h index 2eb0a981a09a..32551d304cd7 100644 --- a/arch/ia64/include/asm/paravirt.h +++ b/arch/ia64/include/asm/paravirt.h | |||
@@ -281,6 +281,10 @@ paravirt_init_missing_ticks_accounting(int cpu) | |||
281 | pv_time_ops.init_missing_ticks_accounting(cpu); | 281 | pv_time_ops.init_missing_ticks_accounting(cpu); |
282 | } | 282 | } |
283 | 283 | ||
284 | struct jump_label_key; | ||
285 | extern struct jump_label_key paravirt_steal_enabled; | ||
286 | extern struct jump_label_key paravirt_steal_rq_enabled; | ||
287 | |||
284 | static inline int | 288 | static inline int |
285 | paravirt_do_steal_accounting(unsigned long *new_itm) | 289 | paravirt_do_steal_accounting(unsigned long *new_itm) |
286 | { | 290 | { |
diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c index a21d7bb9c69c..100868216c55 100644 --- a/arch/ia64/kernel/paravirt.c +++ b/arch/ia64/kernel/paravirt.c | |||
@@ -634,6 +634,8 @@ struct pv_irq_ops pv_irq_ops = { | |||
634 | * pv_time_ops | 634 | * pv_time_ops |
635 | * time operations | 635 | * time operations |
636 | */ | 636 | */ |
637 | struct jump_label_key paravirt_steal_enabled; | ||
638 | struct jump_label_key paravirt_steal_rq_enabled; | ||
637 | 639 | ||
638 | static int | 640 | static int |
639 | ia64_native_do_steal_accounting(unsigned long *new_itm) | 641 | ia64_native_do_steal_accounting(unsigned long *new_itm) |
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index c0d842cfd012..e30442c539ce 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h | |||
@@ -179,8 +179,9 @@ extern const char *powerpc_base_platform; | |||
179 | #define LONG_ASM_CONST(x) 0 | 179 | #define LONG_ASM_CONST(x) 0 |
180 | #endif | 180 | #endif |
181 | 181 | ||
182 | 182 | #define CPU_FTR_HVMODE LONG_ASM_CONST(0x0000000200000000) | |
183 | #define CPU_FTR_HVMODE_206 LONG_ASM_CONST(0x0000000800000000) | 183 | #define CPU_FTR_ARCH_201 LONG_ASM_CONST(0x0000000400000000) |
184 | #define CPU_FTR_ARCH_206 LONG_ASM_CONST(0x0000000800000000) | ||
184 | #define CPU_FTR_CFAR LONG_ASM_CONST(0x0000001000000000) | 185 | #define CPU_FTR_CFAR LONG_ASM_CONST(0x0000001000000000) |
185 | #define CPU_FTR_IABR LONG_ASM_CONST(0x0000002000000000) | 186 | #define CPU_FTR_IABR LONG_ASM_CONST(0x0000002000000000) |
186 | #define CPU_FTR_MMCRA LONG_ASM_CONST(0x0000004000000000) | 187 | #define CPU_FTR_MMCRA LONG_ASM_CONST(0x0000004000000000) |
@@ -401,9 +402,10 @@ extern const char *powerpc_base_platform; | |||
401 | CPU_FTR_MMCRA | CPU_FTR_CP_USE_DCBTZ | \ | 402 | CPU_FTR_MMCRA | CPU_FTR_CP_USE_DCBTZ | \ |
402 | CPU_FTR_STCX_CHECKS_ADDRESS) | 403 | CPU_FTR_STCX_CHECKS_ADDRESS) |
403 | #define CPU_FTRS_PPC970 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ | 404 | #define CPU_FTRS_PPC970 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ |
404 | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ | 405 | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_201 | \ |
405 | CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA | \ | 406 | CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA | \ |
406 | CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS) | 407 | CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS | \ |
408 | CPU_FTR_HVMODE) | ||
407 | #define CPU_FTRS_POWER5 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ | 409 | #define CPU_FTRS_POWER5 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ |
408 | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ | 410 | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ |
409 | CPU_FTR_MMCRA | CPU_FTR_SMT | \ | 411 | CPU_FTR_MMCRA | CPU_FTR_SMT | \ |
@@ -417,13 +419,13 @@ extern const char *powerpc_base_platform; | |||
417 | CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \ | 419 | CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \ |
418 | CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_CFAR) | 420 | CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_CFAR) |
419 | #define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ | 421 | #define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ |
420 | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_HVMODE_206 |\ | 422 | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\ |
421 | CPU_FTR_MMCRA | CPU_FTR_SMT | \ | 423 | CPU_FTR_MMCRA | CPU_FTR_SMT | \ |
422 | CPU_FTR_COHERENT_ICACHE | \ | 424 | CPU_FTR_COHERENT_ICACHE | \ |
423 | CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ | 425 | CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ |
424 | CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \ | 426 | CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \ |
425 | CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ | 427 | CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ |
426 | CPU_FTR_ICSWX | CPU_FTR_CFAR) | 428 | CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE) |
427 | #define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ | 429 | #define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ |
428 | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ | 430 | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ |
429 | CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ | 431 | CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ |
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index f5dfe3411f64..8057f4f6980f 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h | |||
@@ -61,19 +61,22 @@ | |||
61 | #define EXC_HV H | 61 | #define EXC_HV H |
62 | #define EXC_STD | 62 | #define EXC_STD |
63 | 63 | ||
64 | #define EXCEPTION_PROLOG_1(area) \ | 64 | #define __EXCEPTION_PROLOG_1(area, extra, vec) \ |
65 | GET_PACA(r13); \ | 65 | GET_PACA(r13); \ |
66 | std r9,area+EX_R9(r13); /* save r9 - r12 */ \ | 66 | std r9,area+EX_R9(r13); /* save r9 - r12 */ \ |
67 | std r10,area+EX_R10(r13); \ | 67 | std r10,area+EX_R10(r13); \ |
68 | std r11,area+EX_R11(r13); \ | ||
69 | std r12,area+EX_R12(r13); \ | ||
70 | BEGIN_FTR_SECTION_NESTED(66); \ | 68 | BEGIN_FTR_SECTION_NESTED(66); \ |
71 | mfspr r10,SPRN_CFAR; \ | 69 | mfspr r10,SPRN_CFAR; \ |
72 | std r10,area+EX_CFAR(r13); \ | 70 | std r10,area+EX_CFAR(r13); \ |
73 | END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ | 71 | END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ |
74 | GET_SCRATCH0(r9); \ | 72 | mfcr r9; \ |
75 | std r9,area+EX_R13(r13); \ | 73 | extra(vec); \ |
76 | mfcr r9 | 74 | std r11,area+EX_R11(r13); \ |
75 | std r12,area+EX_R12(r13); \ | ||
76 | GET_SCRATCH0(r10); \ | ||
77 | std r10,area+EX_R13(r13) | ||
78 | #define EXCEPTION_PROLOG_1(area, extra, vec) \ | ||
79 | __EXCEPTION_PROLOG_1(area, extra, vec) | ||
77 | 80 | ||
78 | #define __EXCEPTION_PROLOG_PSERIES_1(label, h) \ | 81 | #define __EXCEPTION_PROLOG_PSERIES_1(label, h) \ |
79 | ld r12,PACAKBASE(r13); /* get high part of &label */ \ | 82 | ld r12,PACAKBASE(r13); /* get high part of &label */ \ |
@@ -85,13 +88,65 @@ | |||
85 | mtspr SPRN_##h##SRR1,r10; \ | 88 | mtspr SPRN_##h##SRR1,r10; \ |
86 | h##rfid; \ | 89 | h##rfid; \ |
87 | b . /* prevent speculative execution */ | 90 | b . /* prevent speculative execution */ |
88 | #define EXCEPTION_PROLOG_PSERIES_1(label, h) \ | 91 | #define EXCEPTION_PROLOG_PSERIES_1(label, h) \ |
89 | __EXCEPTION_PROLOG_PSERIES_1(label, h) | 92 | __EXCEPTION_PROLOG_PSERIES_1(label, h) |
90 | 93 | ||
91 | #define EXCEPTION_PROLOG_PSERIES(area, label, h) \ | 94 | #define EXCEPTION_PROLOG_PSERIES(area, label, h, extra, vec) \ |
92 | EXCEPTION_PROLOG_1(area); \ | 95 | EXCEPTION_PROLOG_1(area, extra, vec); \ |
93 | EXCEPTION_PROLOG_PSERIES_1(label, h); | 96 | EXCEPTION_PROLOG_PSERIES_1(label, h); |
94 | 97 | ||
98 | #define __KVMTEST(n) \ | ||
99 | lbz r10,HSTATE_IN_GUEST(r13); \ | ||
100 | cmpwi r10,0; \ | ||
101 | bne do_kvm_##n | ||
102 | |||
103 | #define __KVM_HANDLER(area, h, n) \ | ||
104 | do_kvm_##n: \ | ||
105 | ld r10,area+EX_R10(r13); \ | ||
106 | stw r9,HSTATE_SCRATCH1(r13); \ | ||
107 | ld r9,area+EX_R9(r13); \ | ||
108 | std r12,HSTATE_SCRATCH0(r13); \ | ||
109 | li r12,n; \ | ||
110 | b kvmppc_interrupt | ||
111 | |||
112 | #define __KVM_HANDLER_SKIP(area, h, n) \ | ||
113 | do_kvm_##n: \ | ||
114 | cmpwi r10,KVM_GUEST_MODE_SKIP; \ | ||
115 | ld r10,area+EX_R10(r13); \ | ||
116 | beq 89f; \ | ||
117 | stw r9,HSTATE_SCRATCH1(r13); \ | ||
118 | ld r9,area+EX_R9(r13); \ | ||
119 | std r12,HSTATE_SCRATCH0(r13); \ | ||
120 | li r12,n; \ | ||
121 | b kvmppc_interrupt; \ | ||
122 | 89: mtocrf 0x80,r9; \ | ||
123 | ld r9,area+EX_R9(r13); \ | ||
124 | b kvmppc_skip_##h##interrupt | ||
125 | |||
126 | #ifdef CONFIG_KVM_BOOK3S_64_HANDLER | ||
127 | #define KVMTEST(n) __KVMTEST(n) | ||
128 | #define KVM_HANDLER(area, h, n) __KVM_HANDLER(area, h, n) | ||
129 | #define KVM_HANDLER_SKIP(area, h, n) __KVM_HANDLER_SKIP(area, h, n) | ||
130 | |||
131 | #else | ||
132 | #define KVMTEST(n) | ||
133 | #define KVM_HANDLER(area, h, n) | ||
134 | #define KVM_HANDLER_SKIP(area, h, n) | ||
135 | #endif | ||
136 | |||
137 | #ifdef CONFIG_KVM_BOOK3S_PR | ||
138 | #define KVMTEST_PR(n) __KVMTEST(n) | ||
139 | #define KVM_HANDLER_PR(area, h, n) __KVM_HANDLER(area, h, n) | ||
140 | #define KVM_HANDLER_PR_SKIP(area, h, n) __KVM_HANDLER_SKIP(area, h, n) | ||
141 | |||
142 | #else | ||
143 | #define KVMTEST_PR(n) | ||
144 | #define KVM_HANDLER_PR(area, h, n) | ||
145 | #define KVM_HANDLER_PR_SKIP(area, h, n) | ||
146 | #endif | ||
147 | |||
148 | #define NOTEST(n) | ||
149 | |||
95 | /* | 150 | /* |
96 | * The common exception prolog is used for all except a few exceptions | 151 | * The common exception prolog is used for all except a few exceptions |
97 | * such as a segment miss on a kernel address. We have to be prepared | 152 | * such as a segment miss on a kernel address. We have to be prepared |
@@ -164,57 +219,58 @@ | |||
164 | .globl label##_pSeries; \ | 219 | .globl label##_pSeries; \ |
165 | label##_pSeries: \ | 220 | label##_pSeries: \ |
166 | HMT_MEDIUM; \ | 221 | HMT_MEDIUM; \ |
167 | DO_KVM vec; \ | ||
168 | SET_SCRATCH0(r13); /* save r13 */ \ | 222 | SET_SCRATCH0(r13); /* save r13 */ \ |
169 | EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, EXC_STD) | 223 | EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, \ |
224 | EXC_STD, KVMTEST_PR, vec) | ||
170 | 225 | ||
171 | #define STD_EXCEPTION_HV(loc, vec, label) \ | 226 | #define STD_EXCEPTION_HV(loc, vec, label) \ |
172 | . = loc; \ | 227 | . = loc; \ |
173 | .globl label##_hv; \ | 228 | .globl label##_hv; \ |
174 | label##_hv: \ | 229 | label##_hv: \ |
175 | HMT_MEDIUM; \ | 230 | HMT_MEDIUM; \ |
176 | DO_KVM vec; \ | 231 | SET_SCRATCH0(r13); /* save r13 */ \ |
177 | SET_SCRATCH0(r13); /* save r13 */ \ | 232 | EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, \ |
178 | EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, EXC_HV) | 233 | EXC_HV, KVMTEST, vec) |
179 | 234 | ||
180 | #define __MASKABLE_EXCEPTION_PSERIES(vec, label, h) \ | 235 | #define __SOFTEN_TEST(h) \ |
181 | HMT_MEDIUM; \ | ||
182 | DO_KVM vec; \ | ||
183 | SET_SCRATCH0(r13); /* save r13 */ \ | ||
184 | GET_PACA(r13); \ | ||
185 | std r9,PACA_EXGEN+EX_R9(r13); /* save r9, r10 */ \ | ||
186 | std r10,PACA_EXGEN+EX_R10(r13); \ | ||
187 | lbz r10,PACASOFTIRQEN(r13); \ | 236 | lbz r10,PACASOFTIRQEN(r13); \ |
188 | mfcr r9; \ | ||
189 | cmpwi r10,0; \ | 237 | cmpwi r10,0; \ |
190 | beq masked_##h##interrupt; \ | 238 | beq masked_##h##interrupt |
191 | GET_SCRATCH0(r10); \ | 239 | #define _SOFTEN_TEST(h) __SOFTEN_TEST(h) |
192 | std r10,PACA_EXGEN+EX_R13(r13); \ | 240 | |
193 | std r11,PACA_EXGEN+EX_R11(r13); \ | 241 | #define SOFTEN_TEST_PR(vec) \ |
194 | std r12,PACA_EXGEN+EX_R12(r13); \ | 242 | KVMTEST_PR(vec); \ |
195 | ld r12,PACAKBASE(r13); /* get high part of &label */ \ | 243 | _SOFTEN_TEST(EXC_STD) |
196 | ld r10,PACAKMSR(r13); /* get MSR value for kernel */ \ | 244 | |
197 | mfspr r11,SPRN_##h##SRR0; /* save SRR0 */ \ | 245 | #define SOFTEN_TEST_HV(vec) \ |
198 | LOAD_HANDLER(r12,label##_common) \ | 246 | KVMTEST(vec); \ |
199 | mtspr SPRN_##h##SRR0,r12; \ | 247 | _SOFTEN_TEST(EXC_HV) |
200 | mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \ | 248 | |
201 | mtspr SPRN_##h##SRR1,r10; \ | 249 | #define SOFTEN_TEST_HV_201(vec) \ |
202 | h##rfid; \ | 250 | KVMTEST(vec); \ |
203 | b . /* prevent speculative execution */ | 251 | _SOFTEN_TEST(EXC_STD) |
204 | #define _MASKABLE_EXCEPTION_PSERIES(vec, label, h) \ | 252 | |
205 | __MASKABLE_EXCEPTION_PSERIES(vec, label, h) | 253 | #define __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra) \ |
254 | HMT_MEDIUM; \ | ||
255 | SET_SCRATCH0(r13); /* save r13 */ \ | ||
256 | __EXCEPTION_PROLOG_1(PACA_EXGEN, extra, vec); \ | ||
257 | EXCEPTION_PROLOG_PSERIES_1(label##_common, h); | ||
258 | #define _MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra) \ | ||
259 | __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra) | ||
206 | 260 | ||
207 | #define MASKABLE_EXCEPTION_PSERIES(loc, vec, label) \ | 261 | #define MASKABLE_EXCEPTION_PSERIES(loc, vec, label) \ |
208 | . = loc; \ | 262 | . = loc; \ |
209 | .globl label##_pSeries; \ | 263 | .globl label##_pSeries; \ |
210 | label##_pSeries: \ | 264 | label##_pSeries: \ |
211 | _MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_STD) | 265 | _MASKABLE_EXCEPTION_PSERIES(vec, label, \ |
266 | EXC_STD, SOFTEN_TEST_PR) | ||
212 | 267 | ||
213 | #define MASKABLE_EXCEPTION_HV(loc, vec, label) \ | 268 | #define MASKABLE_EXCEPTION_HV(loc, vec, label) \ |
214 | . = loc; \ | 269 | . = loc; \ |
215 | .globl label##_hv; \ | 270 | .globl label##_hv; \ |
216 | label##_hv: \ | 271 | label##_hv: \ |
217 | _MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_HV) | 272 | _MASKABLE_EXCEPTION_PSERIES(vec, label, \ |
273 | EXC_HV, SOFTEN_TEST_HV) | ||
218 | 274 | ||
219 | #ifdef CONFIG_PPC_ISERIES | 275 | #ifdef CONFIG_PPC_ISERIES |
220 | #define DISABLE_INTS \ | 276 | #define DISABLE_INTS \ |
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index fd8201dddd4b..1c324ff55ea8 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h | |||
@@ -29,6 +29,10 @@ | |||
29 | #define H_LONG_BUSY_ORDER_100_SEC 9905 /* Long busy, hint that 100sec \ | 29 | #define H_LONG_BUSY_ORDER_100_SEC 9905 /* Long busy, hint that 100sec \ |
30 | is a good time to retry */ | 30 | is a good time to retry */ |
31 | #define H_LONG_BUSY_END_RANGE 9905 /* End of long busy range */ | 31 | #define H_LONG_BUSY_END_RANGE 9905 /* End of long busy range */ |
32 | |||
33 | /* Internal value used in book3s_hv kvm support; not returned to guests */ | ||
34 | #define H_TOO_HARD 9999 | ||
35 | |||
32 | #define H_HARDWARE -1 /* Hardware error */ | 36 | #define H_HARDWARE -1 /* Hardware error */ |
33 | #define H_FUNCTION -2 /* Function not supported */ | 37 | #define H_FUNCTION -2 /* Function not supported */ |
34 | #define H_PRIVILEGE -3 /* Caller not privileged */ | 38 | #define H_PRIVILEGE -3 /* Caller not privileged */ |
@@ -100,6 +104,7 @@ | |||
100 | #define H_PAGE_SET_ACTIVE H_PAGE_STATE_CHANGE | 104 | #define H_PAGE_SET_ACTIVE H_PAGE_STATE_CHANGE |
101 | #define H_AVPN (1UL<<(63-32)) /* An avpn is provided as a sanity test */ | 105 | #define H_AVPN (1UL<<(63-32)) /* An avpn is provided as a sanity test */ |
102 | #define H_ANDCOND (1UL<<(63-33)) | 106 | #define H_ANDCOND (1UL<<(63-33)) |
107 | #define H_LOCAL (1UL<<(63-35)) | ||
103 | #define H_ICACHE_INVALIDATE (1UL<<(63-40)) /* icbi, etc. (ignored for IO pages) */ | 108 | #define H_ICACHE_INVALIDATE (1UL<<(63-40)) /* icbi, etc. (ignored for IO pages) */ |
104 | #define H_ICACHE_SYNCHRONIZE (1UL<<(63-41)) /* dcbst, icbi, etc (ignored for IO pages */ | 109 | #define H_ICACHE_SYNCHRONIZE (1UL<<(63-41)) /* dcbst, icbi, etc (ignored for IO pages */ |
105 | #define H_COALESCE_CAND (1UL<<(63-42)) /* page is a good candidate for coalescing */ | 110 | #define H_COALESCE_CAND (1UL<<(63-42)) /* page is a good candidate for coalescing */ |
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index d2ca5ed3877b..a4f6c85431f8 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h | |||
@@ -22,6 +22,10 @@ | |||
22 | 22 | ||
23 | #include <linux/types.h> | 23 | #include <linux/types.h> |
24 | 24 | ||
25 | /* Select powerpc specific features in <linux/kvm.h> */ | ||
26 | #define __KVM_HAVE_SPAPR_TCE | ||
27 | #define __KVM_HAVE_PPC_SMT | ||
28 | |||
25 | struct kvm_regs { | 29 | struct kvm_regs { |
26 | __u64 pc; | 30 | __u64 pc; |
27 | __u64 cr; | 31 | __u64 cr; |
@@ -272,4 +276,15 @@ struct kvm_guest_debug_arch { | |||
272 | #define KVM_INTERRUPT_UNSET -2U | 276 | #define KVM_INTERRUPT_UNSET -2U |
273 | #define KVM_INTERRUPT_SET_LEVEL -3U | 277 | #define KVM_INTERRUPT_SET_LEVEL -3U |
274 | 278 | ||
279 | /* for KVM_CAP_SPAPR_TCE */ | ||
280 | struct kvm_create_spapr_tce { | ||
281 | __u64 liobn; | ||
282 | __u32 window_size; | ||
283 | }; | ||
284 | |||
285 | /* for KVM_ALLOCATE_RMA */ | ||
286 | struct kvm_allocate_rma { | ||
287 | __u64 rma_size; | ||
288 | }; | ||
289 | |||
275 | #endif /* __LINUX_KVM_POWERPC_H */ | 290 | #endif /* __LINUX_KVM_POWERPC_H */ |
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 0951b17f4eb5..7b1f0e0fc653 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h | |||
@@ -64,8 +64,12 @@ | |||
64 | #define BOOK3S_INTERRUPT_PROGRAM 0x700 | 64 | #define BOOK3S_INTERRUPT_PROGRAM 0x700 |
65 | #define BOOK3S_INTERRUPT_FP_UNAVAIL 0x800 | 65 | #define BOOK3S_INTERRUPT_FP_UNAVAIL 0x800 |
66 | #define BOOK3S_INTERRUPT_DECREMENTER 0x900 | 66 | #define BOOK3S_INTERRUPT_DECREMENTER 0x900 |
67 | #define BOOK3S_INTERRUPT_HV_DECREMENTER 0x980 | ||
67 | #define BOOK3S_INTERRUPT_SYSCALL 0xc00 | 68 | #define BOOK3S_INTERRUPT_SYSCALL 0xc00 |
68 | #define BOOK3S_INTERRUPT_TRACE 0xd00 | 69 | #define BOOK3S_INTERRUPT_TRACE 0xd00 |
70 | #define BOOK3S_INTERRUPT_H_DATA_STORAGE 0xe00 | ||
71 | #define BOOK3S_INTERRUPT_H_INST_STORAGE 0xe20 | ||
72 | #define BOOK3S_INTERRUPT_H_EMUL_ASSIST 0xe40 | ||
69 | #define BOOK3S_INTERRUPT_PERFMON 0xf00 | 73 | #define BOOK3S_INTERRUPT_PERFMON 0xf00 |
70 | #define BOOK3S_INTERRUPT_ALTIVEC 0xf20 | 74 | #define BOOK3S_INTERRUPT_ALTIVEC 0xf20 |
71 | #define BOOK3S_INTERRUPT_VSX 0xf40 | 75 | #define BOOK3S_INTERRUPT_VSX 0xf40 |
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index d62e703f1214..98da010252a3 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h | |||
@@ -24,20 +24,6 @@ | |||
24 | #include <linux/kvm_host.h> | 24 | #include <linux/kvm_host.h> |
25 | #include <asm/kvm_book3s_asm.h> | 25 | #include <asm/kvm_book3s_asm.h> |
26 | 26 | ||
27 | struct kvmppc_slb { | ||
28 | u64 esid; | ||
29 | u64 vsid; | ||
30 | u64 orige; | ||
31 | u64 origv; | ||
32 | bool valid : 1; | ||
33 | bool Ks : 1; | ||
34 | bool Kp : 1; | ||
35 | bool nx : 1; | ||
36 | bool large : 1; /* PTEs are 16MB */ | ||
37 | bool tb : 1; /* 1TB segment */ | ||
38 | bool class : 1; | ||
39 | }; | ||
40 | |||
41 | struct kvmppc_bat { | 27 | struct kvmppc_bat { |
42 | u64 raw; | 28 | u64 raw; |
43 | u32 bepi; | 29 | u32 bepi; |
@@ -67,11 +53,22 @@ struct kvmppc_sid_map { | |||
67 | #define VSID_POOL_SIZE (SID_CONTEXTS * 16) | 53 | #define VSID_POOL_SIZE (SID_CONTEXTS * 16) |
68 | #endif | 54 | #endif |
69 | 55 | ||
56 | struct hpte_cache { | ||
57 | struct hlist_node list_pte; | ||
58 | struct hlist_node list_pte_long; | ||
59 | struct hlist_node list_vpte; | ||
60 | struct hlist_node list_vpte_long; | ||
61 | struct rcu_head rcu_head; | ||
62 | u64 host_va; | ||
63 | u64 pfn; | ||
64 | ulong slot; | ||
65 | struct kvmppc_pte pte; | ||
66 | }; | ||
67 | |||
70 | struct kvmppc_vcpu_book3s { | 68 | struct kvmppc_vcpu_book3s { |
71 | struct kvm_vcpu vcpu; | 69 | struct kvm_vcpu vcpu; |
72 | struct kvmppc_book3s_shadow_vcpu *shadow_vcpu; | 70 | struct kvmppc_book3s_shadow_vcpu *shadow_vcpu; |
73 | struct kvmppc_sid_map sid_map[SID_MAP_NUM]; | 71 | struct kvmppc_sid_map sid_map[SID_MAP_NUM]; |
74 | struct kvmppc_slb slb[64]; | ||
75 | struct { | 72 | struct { |
76 | u64 esid; | 73 | u64 esid; |
77 | u64 vsid; | 74 | u64 vsid; |
@@ -81,7 +78,6 @@ struct kvmppc_vcpu_book3s { | |||
81 | struct kvmppc_bat dbat[8]; | 78 | struct kvmppc_bat dbat[8]; |
82 | u64 hid[6]; | 79 | u64 hid[6]; |
83 | u64 gqr[8]; | 80 | u64 gqr[8]; |
84 | int slb_nr; | ||
85 | u64 sdr1; | 81 | u64 sdr1; |
86 | u64 hior; | 82 | u64 hior; |
87 | u64 msr_mask; | 83 | u64 msr_mask; |
@@ -93,7 +89,13 @@ struct kvmppc_vcpu_book3s { | |||
93 | u64 vsid_max; | 89 | u64 vsid_max; |
94 | #endif | 90 | #endif |
95 | int context_id[SID_CONTEXTS]; | 91 | int context_id[SID_CONTEXTS]; |
96 | ulong prog_flags; /* flags to inject when giving a 700 trap */ | 92 | |
93 | struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; | ||
94 | struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; | ||
95 | struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; | ||
96 | struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG]; | ||
97 | int hpte_cache_count; | ||
98 | spinlock_t mmu_lock; | ||
97 | }; | 99 | }; |
98 | 100 | ||
99 | #define CONTEXT_HOST 0 | 101 | #define CONTEXT_HOST 0 |
@@ -110,8 +112,10 @@ extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong ea, ulong ea_mask) | |||
110 | extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask); | 112 | extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask); |
111 | extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end); | 113 | extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end); |
112 | extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr); | 114 | extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr); |
115 | extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr); | ||
113 | extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu); | 116 | extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu); |
114 | extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu); | 117 | extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu); |
118 | extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu); | ||
115 | extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); | 119 | extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); |
116 | extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr); | 120 | extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr); |
117 | extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu); | 121 | extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu); |
@@ -123,19 +127,22 @@ extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu); | |||
123 | extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte); | 127 | extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte); |
124 | extern int kvmppc_mmu_hpte_sysinit(void); | 128 | extern int kvmppc_mmu_hpte_sysinit(void); |
125 | extern void kvmppc_mmu_hpte_sysexit(void); | 129 | extern void kvmppc_mmu_hpte_sysexit(void); |
130 | extern int kvmppc_mmu_hv_init(void); | ||
126 | 131 | ||
127 | extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); | 132 | extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); |
128 | extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); | 133 | extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); |
129 | extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec); | 134 | extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec); |
135 | extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags); | ||
130 | extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, | 136 | extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, |
131 | bool upper, u32 val); | 137 | bool upper, u32 val); |
132 | extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); | 138 | extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); |
133 | extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); | 139 | extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); |
134 | extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); | 140 | extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); |
135 | 141 | ||
136 | extern ulong kvmppc_trampoline_lowmem; | 142 | extern void kvmppc_handler_lowmem_trampoline(void); |
137 | extern ulong kvmppc_trampoline_enter; | 143 | extern void kvmppc_handler_trampoline_enter(void); |
138 | extern void kvmppc_rmcall(ulong srr0, ulong srr1); | 144 | extern void kvmppc_rmcall(ulong srr0, ulong srr1); |
145 | extern void kvmppc_hv_entry_trampoline(void); | ||
139 | extern void kvmppc_load_up_fpu(void); | 146 | extern void kvmppc_load_up_fpu(void); |
140 | extern void kvmppc_load_up_altivec(void); | 147 | extern void kvmppc_load_up_altivec(void); |
141 | extern void kvmppc_load_up_vsx(void); | 148 | extern void kvmppc_load_up_vsx(void); |
@@ -147,15 +154,32 @@ static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) | |||
147 | return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu); | 154 | return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu); |
148 | } | 155 | } |
149 | 156 | ||
150 | static inline ulong dsisr(void) | 157 | extern void kvm_return_point(void); |
158 | |||
159 | /* Also add subarch specific defines */ | ||
160 | |||
161 | #ifdef CONFIG_KVM_BOOK3S_32_HANDLER | ||
162 | #include <asm/kvm_book3s_32.h> | ||
163 | #endif | ||
164 | #ifdef CONFIG_KVM_BOOK3S_64_HANDLER | ||
165 | #include <asm/kvm_book3s_64.h> | ||
166 | #endif | ||
167 | |||
168 | #ifdef CONFIG_KVM_BOOK3S_PR | ||
169 | |||
170 | static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) | ||
151 | { | 171 | { |
152 | ulong r; | 172 | return to_book3s(vcpu)->hior; |
153 | asm ( "mfdsisr %0 " : "=r" (r) ); | ||
154 | return r; | ||
155 | } | 173 | } |
156 | 174 | ||
157 | extern void kvm_return_point(void); | 175 | static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, |
158 | static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu); | 176 | unsigned long pending_now, unsigned long old_pending) |
177 | { | ||
178 | if (pending_now) | ||
179 | vcpu->arch.shared->int_pending = 1; | ||
180 | else if (old_pending) | ||
181 | vcpu->arch.shared->int_pending = 0; | ||
182 | } | ||
159 | 183 | ||
160 | static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) | 184 | static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) |
161 | { | 185 | { |
@@ -244,6 +268,120 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) | |||
244 | return to_svcpu(vcpu)->fault_dar; | 268 | return to_svcpu(vcpu)->fault_dar; |
245 | } | 269 | } |
246 | 270 | ||
271 | static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) | ||
272 | { | ||
273 | ulong crit_raw = vcpu->arch.shared->critical; | ||
274 | ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); | ||
275 | bool crit; | ||
276 | |||
277 | /* Truncate crit indicators in 32 bit mode */ | ||
278 | if (!(vcpu->arch.shared->msr & MSR_SF)) { | ||
279 | crit_raw &= 0xffffffff; | ||
280 | crit_r1 &= 0xffffffff; | ||
281 | } | ||
282 | |||
283 | /* Critical section when crit == r1 */ | ||
284 | crit = (crit_raw == crit_r1); | ||
285 | /* ... and we're in supervisor mode */ | ||
286 | crit = crit && !(vcpu->arch.shared->msr & MSR_PR); | ||
287 | |||
288 | return crit; | ||
289 | } | ||
290 | #else /* CONFIG_KVM_BOOK3S_PR */ | ||
291 | |||
292 | static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) | ||
293 | { | ||
294 | return 0; | ||
295 | } | ||
296 | |||
297 | static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, | ||
298 | unsigned long pending_now, unsigned long old_pending) | ||
299 | { | ||
300 | } | ||
301 | |||
302 | static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) | ||
303 | { | ||
304 | vcpu->arch.gpr[num] = val; | ||
305 | } | ||
306 | |||
307 | static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) | ||
308 | { | ||
309 | return vcpu->arch.gpr[num]; | ||
310 | } | ||
311 | |||
312 | static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) | ||
313 | { | ||
314 | vcpu->arch.cr = val; | ||
315 | } | ||
316 | |||
317 | static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) | ||
318 | { | ||
319 | return vcpu->arch.cr; | ||
320 | } | ||
321 | |||
322 | static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val) | ||
323 | { | ||
324 | vcpu->arch.xer = val; | ||
325 | } | ||
326 | |||
327 | static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu) | ||
328 | { | ||
329 | return vcpu->arch.xer; | ||
330 | } | ||
331 | |||
332 | static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val) | ||
333 | { | ||
334 | vcpu->arch.ctr = val; | ||
335 | } | ||
336 | |||
337 | static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu) | ||
338 | { | ||
339 | return vcpu->arch.ctr; | ||
340 | } | ||
341 | |||
342 | static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val) | ||
343 | { | ||
344 | vcpu->arch.lr = val; | ||
345 | } | ||
346 | |||
347 | static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu) | ||
348 | { | ||
349 | return vcpu->arch.lr; | ||
350 | } | ||
351 | |||
352 | static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val) | ||
353 | { | ||
354 | vcpu->arch.pc = val; | ||
355 | } | ||
356 | |||
357 | static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu) | ||
358 | { | ||
359 | return vcpu->arch.pc; | ||
360 | } | ||
361 | |||
362 | static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) | ||
363 | { | ||
364 | ulong pc = kvmppc_get_pc(vcpu); | ||
365 | |||
366 | /* Load the instruction manually if it failed to do so in the | ||
367 | * exit path */ | ||
368 | if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) | ||
369 | kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false); | ||
370 | |||
371 | return vcpu->arch.last_inst; | ||
372 | } | ||
373 | |||
374 | static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) | ||
375 | { | ||
376 | return vcpu->arch.fault_dar; | ||
377 | } | ||
378 | |||
379 | static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) | ||
380 | { | ||
381 | return false; | ||
382 | } | ||
383 | #endif | ||
384 | |||
247 | /* Magic register values loaded into r3 and r4 before the 'sc' assembly | 385 | /* Magic register values loaded into r3 and r4 before the 'sc' assembly |
248 | * instruction for the OSI hypercalls */ | 386 | * instruction for the OSI hypercalls */ |
249 | #define OSI_SC_MAGIC_R3 0x113724FA | 387 | #define OSI_SC_MAGIC_R3 0x113724FA |
@@ -251,12 +389,4 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) | |||
251 | 389 | ||
252 | #define INS_DCBZ 0x7c0007ec | 390 | #define INS_DCBZ 0x7c0007ec |
253 | 391 | ||
254 | /* Also add subarch specific defines */ | ||
255 | |||
256 | #ifdef CONFIG_PPC_BOOK3S_32 | ||
257 | #include <asm/kvm_book3s_32.h> | ||
258 | #else | ||
259 | #include <asm/kvm_book3s_64.h> | ||
260 | #endif | ||
261 | |||
262 | #endif /* __ASM_KVM_BOOK3S_H__ */ | 392 | #endif /* __ASM_KVM_BOOK3S_H__ */ |
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 4cadd612d575..e43fe42b9875 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h | |||
@@ -20,9 +20,13 @@ | |||
20 | #ifndef __ASM_KVM_BOOK3S_64_H__ | 20 | #ifndef __ASM_KVM_BOOK3S_64_H__ |
21 | #define __ASM_KVM_BOOK3S_64_H__ | 21 | #define __ASM_KVM_BOOK3S_64_H__ |
22 | 22 | ||
23 | #ifdef CONFIG_KVM_BOOK3S_PR | ||
23 | static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) | 24 | static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) |
24 | { | 25 | { |
25 | return &get_paca()->shadow_vcpu; | 26 | return &get_paca()->shadow_vcpu; |
26 | } | 27 | } |
28 | #endif | ||
29 | |||
30 | #define SPAPR_TCE_SHIFT 12 | ||
27 | 31 | ||
28 | #endif /* __ASM_KVM_BOOK3S_64_H__ */ | 32 | #endif /* __ASM_KVM_BOOK3S_64_H__ */ |
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index d5a8a3861635..ef7b3688c3b6 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h | |||
@@ -60,6 +60,36 @@ kvmppc_resume_\intno: | |||
60 | 60 | ||
61 | #else /*__ASSEMBLY__ */ | 61 | #else /*__ASSEMBLY__ */ |
62 | 62 | ||
63 | /* | ||
64 | * This struct goes in the PACA on 64-bit processors. It is used | ||
65 | * to store host state that needs to be saved when we enter a guest | ||
66 | * and restored when we exit, but isn't specific to any particular | ||
67 | * guest or vcpu. It also has some scratch fields used by the guest | ||
68 | * exit code. | ||
69 | */ | ||
70 | struct kvmppc_host_state { | ||
71 | ulong host_r1; | ||
72 | ulong host_r2; | ||
73 | ulong host_msr; | ||
74 | ulong vmhandler; | ||
75 | ulong scratch0; | ||
76 | ulong scratch1; | ||
77 | u8 in_guest; | ||
78 | |||
79 | #ifdef CONFIG_KVM_BOOK3S_64_HV | ||
80 | struct kvm_vcpu *kvm_vcpu; | ||
81 | struct kvmppc_vcore *kvm_vcore; | ||
82 | unsigned long xics_phys; | ||
83 | u64 dabr; | ||
84 | u64 host_mmcr[3]; | ||
85 | u32 host_pmc[8]; | ||
86 | u64 host_purr; | ||
87 | u64 host_spurr; | ||
88 | u64 host_dscr; | ||
89 | u64 dec_expires; | ||
90 | #endif | ||
91 | }; | ||
92 | |||
63 | struct kvmppc_book3s_shadow_vcpu { | 93 | struct kvmppc_book3s_shadow_vcpu { |
64 | ulong gpr[14]; | 94 | ulong gpr[14]; |
65 | u32 cr; | 95 | u32 cr; |
@@ -73,17 +103,12 @@ struct kvmppc_book3s_shadow_vcpu { | |||
73 | ulong shadow_srr1; | 103 | ulong shadow_srr1; |
74 | ulong fault_dar; | 104 | ulong fault_dar; |
75 | 105 | ||
76 | ulong host_r1; | ||
77 | ulong host_r2; | ||
78 | ulong handler; | ||
79 | ulong scratch0; | ||
80 | ulong scratch1; | ||
81 | ulong vmhandler; | ||
82 | u8 in_guest; | ||
83 | |||
84 | #ifdef CONFIG_PPC_BOOK3S_32 | 106 | #ifdef CONFIG_PPC_BOOK3S_32 |
85 | u32 sr[16]; /* Guest SRs */ | 107 | u32 sr[16]; /* Guest SRs */ |
108 | |||
109 | struct kvmppc_host_state hstate; | ||
86 | #endif | 110 | #endif |
111 | |||
87 | #ifdef CONFIG_PPC_BOOK3S_64 | 112 | #ifdef CONFIG_PPC_BOOK3S_64 |
88 | u8 slb_max; /* highest used guest slb entry */ | 113 | u8 slb_max; /* highest used guest slb entry */ |
89 | struct { | 114 | struct { |
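The kvmppc_host_state block introduced above is per physical CPU rather than per guest, and the paca.h hunk further down embeds it in struct paca_struct, so host C code can reach it through get_paca(). A minimal illustrative sketch under that assumption; the helper name is made up and is not part of this series:

    #include <asm/paca.h>

    /* Hypothetical helper: record whether this CPU is currently inside
     * guest context, using the kvm_hstate field added to the paca. */
    static inline void set_in_guest(u8 state)
    {
            get_paca()->kvm_hstate.in_guest = state;
    }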
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h index 9c9ba3d59b1b..a90e09188777 100644 --- a/arch/powerpc/include/asm/kvm_booke.h +++ b/arch/powerpc/include/asm/kvm_booke.h | |||
@@ -93,4 +93,8 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) | |||
93 | return vcpu->arch.fault_dear; | 93 | return vcpu->arch.fault_dear; |
94 | } | 94 | } |
95 | 95 | ||
96 | static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu) | ||
97 | { | ||
98 | return vcpu->arch.shared->msr; | ||
99 | } | ||
96 | #endif /* __ASM_KVM_BOOKE_H__ */ | 100 | #endif /* __ASM_KVM_BOOKE_H__ */ |
diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h index 7a2a565f88c4..adbfca9dd100 100644 --- a/arch/powerpc/include/asm/kvm_e500.h +++ b/arch/powerpc/include/asm/kvm_e500.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. | 2 | * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. |
3 | * | 3 | * |
4 | * Author: Yu Liu, <yu.liu@freescale.com> | 4 | * Author: Yu Liu, <yu.liu@freescale.com> |
5 | * | 5 | * |
@@ -29,17 +29,25 @@ struct tlbe{ | |||
29 | u32 mas7; | 29 | u32 mas7; |
30 | }; | 30 | }; |
31 | 31 | ||
32 | #define E500_TLB_VALID 1 | ||
33 | #define E500_TLB_DIRTY 2 | ||
34 | |||
35 | struct tlbe_priv { | ||
36 | pfn_t pfn; | ||
37 | unsigned int flags; /* E500_TLB_* */ | ||
38 | }; | ||
39 | |||
40 | struct vcpu_id_table; | ||
41 | |||
32 | struct kvmppc_vcpu_e500 { | 42 | struct kvmppc_vcpu_e500 { |
33 | /* Unmodified copy of the guest's TLB. */ | 43 | /* Unmodified copy of the guest's TLB. */ |
34 | struct tlbe *guest_tlb[E500_TLB_NUM]; | 44 | struct tlbe *gtlb_arch[E500_TLB_NUM]; |
35 | /* TLB that's actually used when the guest is running. */ | ||
36 | struct tlbe *shadow_tlb[E500_TLB_NUM]; | ||
37 | /* Pages which are referenced in the shadow TLB. */ | ||
38 | struct page **shadow_pages[E500_TLB_NUM]; | ||
39 | 45 | ||
40 | unsigned int guest_tlb_size[E500_TLB_NUM]; | 46 | /* KVM internal information associated with each guest TLB entry */ |
41 | unsigned int shadow_tlb_size[E500_TLB_NUM]; | 47 | struct tlbe_priv *gtlb_priv[E500_TLB_NUM]; |
42 | unsigned int guest_tlb_nv[E500_TLB_NUM]; | 48 | |
49 | unsigned int gtlb_size[E500_TLB_NUM]; | ||
50 | unsigned int gtlb_nv[E500_TLB_NUM]; | ||
43 | 51 | ||
44 | u32 host_pid[E500_PID_NUM]; | 52 | u32 host_pid[E500_PID_NUM]; |
45 | u32 pid[E500_PID_NUM]; | 53 | u32 pid[E500_PID_NUM]; |
@@ -53,6 +61,10 @@ struct kvmppc_vcpu_e500 { | |||
53 | u32 mas5; | 61 | u32 mas5; |
54 | u32 mas6; | 62 | u32 mas6; |
55 | u32 mas7; | 63 | u32 mas7; |
64 | |||
65 | /* vcpu id table */ | ||
66 | struct vcpu_id_table *idt; | ||
67 | |||
56 | u32 l1csr0; | 68 | u32 l1csr0; |
57 | u32 l1csr1; | 69 | u32 l1csr1; |
58 | u32 hid0; | 70 | u32 hid0; |
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 186f150b9b89..cc22b282d755 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h | |||
@@ -25,15 +25,23 @@ | |||
25 | #include <linux/interrupt.h> | 25 | #include <linux/interrupt.h> |
26 | #include <linux/types.h> | 26 | #include <linux/types.h> |
27 | #include <linux/kvm_types.h> | 27 | #include <linux/kvm_types.h> |
28 | #include <linux/threads.h> | ||
29 | #include <linux/spinlock.h> | ||
28 | #include <linux/kvm_para.h> | 30 | #include <linux/kvm_para.h> |
31 | #include <linux/list.h> | ||
32 | #include <linux/atomic.h> | ||
29 | #include <asm/kvm_asm.h> | 33 | #include <asm/kvm_asm.h> |
34 | #include <asm/processor.h> | ||
30 | 35 | ||
31 | #define KVM_MAX_VCPUS 1 | 36 | #define KVM_MAX_VCPUS NR_CPUS |
37 | #define KVM_MAX_VCORES NR_CPUS | ||
32 | #define KVM_MEMORY_SLOTS 32 | 38 | #define KVM_MEMORY_SLOTS 32 |
33 | /* memory slots that are not exposed to userspace */ | 39 | /* memory slots that are not exposed to userspace */ |
34 | #define KVM_PRIVATE_MEM_SLOTS 4 | 40 | #define KVM_PRIVATE_MEM_SLOTS 4 |
35 | 41 | ||
42 | #ifdef CONFIG_KVM_MMIO | ||
36 | #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 | 43 | #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 |
44 | #endif | ||
37 | 45 | ||
38 | /* We don't currently support large pages. */ | 46 | /* We don't currently support large pages. */ |
39 | #define KVM_HPAGE_GFN_SHIFT(x) 0 | 47 | #define KVM_HPAGE_GFN_SHIFT(x) 0 |
@@ -57,6 +65,10 @@ struct kvm; | |||
57 | struct kvm_run; | 65 | struct kvm_run; |
58 | struct kvm_vcpu; | 66 | struct kvm_vcpu; |
59 | 67 | ||
68 | struct lppaca; | ||
69 | struct slb_shadow; | ||
70 | struct dtl; | ||
71 | |||
60 | struct kvm_vm_stat { | 72 | struct kvm_vm_stat { |
61 | u32 remote_tlb_flush; | 73 | u32 remote_tlb_flush; |
62 | }; | 74 | }; |
@@ -133,9 +145,74 @@ struct kvmppc_exit_timing { | |||
133 | }; | 145 | }; |
134 | }; | 146 | }; |
135 | 147 | ||
148 | struct kvmppc_pginfo { | ||
149 | unsigned long pfn; | ||
150 | atomic_t refcnt; | ||
151 | }; | ||
152 | |||
153 | struct kvmppc_spapr_tce_table { | ||
154 | struct list_head list; | ||
155 | struct kvm *kvm; | ||
156 | u64 liobn; | ||
157 | u32 window_size; | ||
158 | struct page *pages[0]; | ||
159 | }; | ||
160 | |||
161 | struct kvmppc_rma_info { | ||
162 | void *base_virt; | ||
163 | unsigned long base_pfn; | ||
164 | unsigned long npages; | ||
165 | struct list_head list; | ||
166 | atomic_t use_count; | ||
167 | }; | ||
168 | |||
136 | struct kvm_arch { | 169 | struct kvm_arch { |
170 | #ifdef CONFIG_KVM_BOOK3S_64_HV | ||
171 | unsigned long hpt_virt; | ||
172 | unsigned long ram_npages; | ||
173 | unsigned long ram_psize; | ||
174 | unsigned long ram_porder; | ||
175 | struct kvmppc_pginfo *ram_pginfo; | ||
176 | unsigned int lpid; | ||
177 | unsigned int host_lpid; | ||
178 | unsigned long host_lpcr; | ||
179 | unsigned long sdr1; | ||
180 | unsigned long host_sdr1; | ||
181 | int tlbie_lock; | ||
182 | int n_rma_pages; | ||
183 | unsigned long lpcr; | ||
184 | unsigned long rmor; | ||
185 | struct kvmppc_rma_info *rma; | ||
186 | struct list_head spapr_tce_tables; | ||
187 | unsigned short last_vcpu[NR_CPUS]; | ||
188 | struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; | ||
189 | #endif /* CONFIG_KVM_BOOK3S_64_HV */ | ||
137 | }; | 190 | }; |
138 | 191 | ||
192 | /* | ||
193 | * Struct for a virtual core. | ||
194 | * Note: entry_exit_count combines an entry count in the bottom 8 bits | ||
195 | * and an exit count in the next 8 bits. This is so that we can | ||
196 | * atomically increment the entry count iff the exit count is 0 | ||
197 | * without taking the lock. | ||
198 | */ | ||
199 | struct kvmppc_vcore { | ||
200 | int n_runnable; | ||
201 | int n_blocked; | ||
202 | int num_threads; | ||
203 | int entry_exit_count; | ||
204 | int n_woken; | ||
205 | int nap_count; | ||
206 | u16 pcpu; | ||
207 | u8 vcore_running; | ||
208 | u8 in_guest; | ||
209 | struct list_head runnable_threads; | ||
210 | spinlock_t lock; | ||
211 | }; | ||
212 | |||
213 | #define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) | ||
214 | #define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8) | ||
215 | |||
139 | struct kvmppc_pte { | 216 | struct kvmppc_pte { |
140 | ulong eaddr; | 217 | ulong eaddr; |
141 | u64 vpage; | 218 | u64 vpage; |
@@ -163,16 +240,18 @@ struct kvmppc_mmu { | |||
163 | bool (*is_dcbz32)(struct kvm_vcpu *vcpu); | 240 | bool (*is_dcbz32)(struct kvm_vcpu *vcpu); |
164 | }; | 241 | }; |
165 | 242 | ||
166 | struct hpte_cache { | 243 | struct kvmppc_slb { |
167 | struct hlist_node list_pte; | 244 | u64 esid; |
168 | struct hlist_node list_pte_long; | 245 | u64 vsid; |
169 | struct hlist_node list_vpte; | 246 | u64 orige; |
170 | struct hlist_node list_vpte_long; | 247 | u64 origv; |
171 | struct rcu_head rcu_head; | 248 | bool valid : 1; |
172 | u64 host_va; | 249 | bool Ks : 1; |
173 | u64 pfn; | 250 | bool Kp : 1; |
174 | ulong slot; | 251 | bool nx : 1; |
175 | struct kvmppc_pte pte; | 252 | bool large : 1; /* PTEs are 16MB */ |
253 | bool tb : 1; /* 1TB segment */ | ||
254 | bool class : 1; | ||
176 | }; | 255 | }; |
177 | 256 | ||
178 | struct kvm_vcpu_arch { | 257 | struct kvm_vcpu_arch { |
@@ -187,6 +266,9 @@ struct kvm_vcpu_arch { | |||
187 | ulong highmem_handler; | 266 | ulong highmem_handler; |
188 | ulong rmcall; | 267 | ulong rmcall; |
189 | ulong host_paca_phys; | 268 | ulong host_paca_phys; |
269 | struct kvmppc_slb slb[64]; | ||
270 | int slb_max; /* 1 + index of last valid entry in slb[] */ | ||
271 | int slb_nr; /* total number of entries in SLB */ | ||
190 | struct kvmppc_mmu mmu; | 272 | struct kvmppc_mmu mmu; |
191 | #endif | 273 | #endif |
192 | 274 | ||
@@ -195,13 +277,19 @@ struct kvm_vcpu_arch { | |||
195 | u64 fpr[32]; | 277 | u64 fpr[32]; |
196 | u64 fpscr; | 278 | u64 fpscr; |
197 | 279 | ||
280 | #ifdef CONFIG_SPE | ||
281 | ulong evr[32]; | ||
282 | ulong spefscr; | ||
283 | ulong host_spefscr; | ||
284 | u64 acc; | ||
285 | #endif | ||
198 | #ifdef CONFIG_ALTIVEC | 286 | #ifdef CONFIG_ALTIVEC |
199 | vector128 vr[32]; | 287 | vector128 vr[32]; |
200 | vector128 vscr; | 288 | vector128 vscr; |
201 | #endif | 289 | #endif |
202 | 290 | ||
203 | #ifdef CONFIG_VSX | 291 | #ifdef CONFIG_VSX |
204 | u64 vsr[32]; | 292 | u64 vsr[64]; |
205 | #endif | 293 | #endif |
206 | 294 | ||
207 | #ifdef CONFIG_PPC_BOOK3S | 295 | #ifdef CONFIG_PPC_BOOK3S |
@@ -209,22 +297,27 @@ struct kvm_vcpu_arch { | |||
209 | u32 qpr[32]; | 297 | u32 qpr[32]; |
210 | #endif | 298 | #endif |
211 | 299 | ||
212 | #ifdef CONFIG_BOOKE | ||
213 | ulong pc; | 300 | ulong pc; |
214 | ulong ctr; | 301 | ulong ctr; |
215 | ulong lr; | 302 | ulong lr; |
216 | 303 | ||
217 | ulong xer; | 304 | ulong xer; |
218 | u32 cr; | 305 | u32 cr; |
219 | #endif | ||
220 | 306 | ||
221 | #ifdef CONFIG_PPC_BOOK3S | 307 | #ifdef CONFIG_PPC_BOOK3S |
222 | ulong shadow_msr; | ||
223 | ulong hflags; | 308 | ulong hflags; |
224 | ulong guest_owned_ext; | 309 | ulong guest_owned_ext; |
310 | ulong purr; | ||
311 | ulong spurr; | ||
312 | ulong dscr; | ||
313 | ulong amr; | ||
314 | ulong uamor; | ||
315 | u32 ctrl; | ||
316 | ulong dabr; | ||
225 | #endif | 317 | #endif |
226 | u32 vrsave; /* also USPRG0 */ | 318 | u32 vrsave; /* also USPRG0 */ |
227 | u32 mmucr; | 319 | u32 mmucr; |
320 | ulong shadow_msr; | ||
228 | ulong sprg4; | 321 | ulong sprg4; |
229 | ulong sprg5; | 322 | ulong sprg5; |
230 | ulong sprg6; | 323 | ulong sprg6; |
@@ -249,6 +342,7 @@ struct kvm_vcpu_arch { | |||
249 | u32 pvr; | 342 | u32 pvr; |
250 | 343 | ||
251 | u32 shadow_pid; | 344 | u32 shadow_pid; |
345 | u32 shadow_pid1; | ||
252 | u32 pid; | 346 | u32 pid; |
253 | u32 swap_pid; | 347 | u32 swap_pid; |
254 | 348 | ||
@@ -258,6 +352,9 @@ struct kvm_vcpu_arch { | |||
258 | u32 dbcr1; | 352 | u32 dbcr1; |
259 | u32 dbsr; | 353 | u32 dbsr; |
260 | 354 | ||
355 | u64 mmcr[3]; | ||
356 | u32 pmc[8]; | ||
357 | |||
261 | #ifdef CONFIG_KVM_EXIT_TIMING | 358 | #ifdef CONFIG_KVM_EXIT_TIMING |
262 | struct mutex exit_timing_lock; | 359 | struct mutex exit_timing_lock; |
263 | struct kvmppc_exit_timing timing_exit; | 360 | struct kvmppc_exit_timing timing_exit; |
@@ -272,8 +369,12 @@ struct kvm_vcpu_arch { | |||
272 | struct dentry *debugfs_exit_timing; | 369 | struct dentry *debugfs_exit_timing; |
273 | #endif | 370 | #endif |
274 | 371 | ||
372 | #ifdef CONFIG_PPC_BOOK3S | ||
373 | ulong fault_dar; | ||
374 | u32 fault_dsisr; | ||
375 | #endif | ||
376 | |||
275 | #ifdef CONFIG_BOOKE | 377 | #ifdef CONFIG_BOOKE |
276 | u32 last_inst; | ||
277 | ulong fault_dear; | 378 | ulong fault_dear; |
278 | ulong fault_esr; | 379 | ulong fault_esr; |
279 | ulong queued_dear; | 380 | ulong queued_dear; |
@@ -288,25 +389,47 @@ struct kvm_vcpu_arch { | |||
288 | u8 dcr_is_write; | 389 | u8 dcr_is_write; |
289 | u8 osi_needed; | 390 | u8 osi_needed; |
290 | u8 osi_enabled; | 391 | u8 osi_enabled; |
392 | u8 hcall_needed; | ||
291 | 393 | ||
292 | u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ | 394 | u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ |
293 | 395 | ||
294 | struct hrtimer dec_timer; | 396 | struct hrtimer dec_timer; |
295 | struct tasklet_struct tasklet; | 397 | struct tasklet_struct tasklet; |
296 | u64 dec_jiffies; | 398 | u64 dec_jiffies; |
399 | u64 dec_expires; | ||
297 | unsigned long pending_exceptions; | 400 | unsigned long pending_exceptions; |
401 | u16 last_cpu; | ||
402 | u8 ceded; | ||
403 | u8 prodded; | ||
404 | u32 last_inst; | ||
405 | |||
406 | struct lppaca *vpa; | ||
407 | struct slb_shadow *slb_shadow; | ||
408 | struct dtl *dtl; | ||
409 | struct dtl *dtl_end; | ||
410 | |||
411 | struct kvmppc_vcore *vcore; | ||
412 | int ret; | ||
413 | int trap; | ||
414 | int state; | ||
415 | int ptid; | ||
416 | wait_queue_head_t cpu_run; | ||
417 | |||
298 | struct kvm_vcpu_arch_shared *shared; | 418 | struct kvm_vcpu_arch_shared *shared; |
299 | unsigned long magic_page_pa; /* phys addr to map the magic page to */ | 419 | unsigned long magic_page_pa; /* phys addr to map the magic page to */ |
300 | unsigned long magic_page_ea; /* effect. addr to map the magic page to */ | 420 | unsigned long magic_page_ea; /* effect. addr to map the magic page to */ |
301 | 421 | ||
302 | #ifdef CONFIG_PPC_BOOK3S | 422 | #ifdef CONFIG_KVM_BOOK3S_64_HV |
303 | struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; | 423 | struct kvm_vcpu_arch_shared shregs; |
304 | struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; | 424 | |
305 | struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; | 425 | struct list_head run_list; |
306 | struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG]; | 426 | struct task_struct *run_task; |
307 | int hpte_cache_count; | 427 | struct kvm_run *kvm_run; |
308 | spinlock_t mmu_lock; | ||
309 | #endif | 428 | #endif |
310 | }; | 429 | }; |
311 | 430 | ||
431 | #define KVMPPC_VCPU_BUSY_IN_HOST 0 | ||
432 | #define KVMPPC_VCPU_BLOCKED 1 | ||
433 | #define KVMPPC_VCPU_RUNNABLE 2 | ||
434 | |||
312 | #endif /* __POWERPC_KVM_HOST_H__ */ | 435 | #endif /* __POWERPC_KVM_HOST_H__ */ |

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 9345238edecf..d121f49d62b8 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h | |||
@@ -33,6 +33,9 @@ | |||
33 | #else | 33 | #else |
34 | #include <asm/kvm_booke.h> | 34 | #include <asm/kvm_booke.h> |
35 | #endif | 35 | #endif |
36 | #ifdef CONFIG_KVM_BOOK3S_64_HANDLER | ||
37 | #include <asm/paca.h> | ||
38 | #endif | ||
36 | 39 | ||
37 | enum emulation_result { | 40 | enum emulation_result { |
38 | EMULATE_DONE, /* no further processing */ | 41 | EMULATE_DONE, /* no further processing */ |
@@ -42,6 +45,7 @@ enum emulation_result { | |||
42 | EMULATE_AGAIN, /* something went wrong. go again */ | 45 | EMULATE_AGAIN, /* something went wrong. go again */ |
43 | }; | 46 | }; |
44 | 47 | ||
48 | extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); | ||
45 | extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); | 49 | extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); |
46 | extern char kvmppc_handlers_start[]; | 50 | extern char kvmppc_handlers_start[]; |
47 | extern unsigned long kvmppc_handler_len; | 51 | extern unsigned long kvmppc_handler_len; |
@@ -109,6 +113,27 @@ extern void kvmppc_booke_exit(void); | |||
109 | 113 | ||
110 | extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); | 114 | extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); |
111 | extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); | 115 | extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); |
116 | extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); | ||
117 | |||
118 | extern long kvmppc_alloc_hpt(struct kvm *kvm); | ||
119 | extern void kvmppc_free_hpt(struct kvm *kvm); | ||
120 | extern long kvmppc_prepare_vrma(struct kvm *kvm, | ||
121 | struct kvm_userspace_memory_region *mem); | ||
122 | extern void kvmppc_map_vrma(struct kvm *kvm, | ||
123 | struct kvm_userspace_memory_region *mem); | ||
124 | extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); | ||
125 | extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, | ||
126 | struct kvm_create_spapr_tce *args); | ||
127 | extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, | ||
128 | struct kvm_allocate_rma *rma); | ||
129 | extern struct kvmppc_rma_info *kvm_alloc_rma(void); | ||
130 | extern void kvm_release_rma(struct kvmppc_rma_info *ri); | ||
131 | extern int kvmppc_core_init_vm(struct kvm *kvm); | ||
132 | extern void kvmppc_core_destroy_vm(struct kvm *kvm); | ||
133 | extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, | ||
134 | struct kvm_userspace_memory_region *mem); | ||
135 | extern void kvmppc_core_commit_memory_region(struct kvm *kvm, | ||
136 | struct kvm_userspace_memory_region *mem); | ||
112 | 137 | ||
113 | /* | 138 | /* |
114 | * Cuts out inst bits with ordering according to spec. | 139 | * Cuts out inst bits with ordering according to spec. |
@@ -151,4 +176,20 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); | |||
151 | 176 | ||
152 | void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); | 177 | void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); |
153 | 178 | ||
179 | #ifdef CONFIG_KVM_BOOK3S_64_HV | ||
180 | static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) | ||
181 | { | ||
182 | paca[cpu].kvm_hstate.xics_phys = addr; | ||
183 | } | ||
184 | |||
185 | extern void kvm_rma_init(void); | ||
186 | |||
187 | #else | ||
188 | static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) | ||
189 | {} | ||
190 | |||
191 | static inline void kvm_rma_init(void) | ||
192 | {} | ||
193 | #endif | ||
194 | |||
154 | #endif /* __POWERPC_KVM_PPC_H__ */ | 195 | #endif /* __POWERPC_KVM_PPC_H__ */ |
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index d865bd909c7d..b445e0af4c2b 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h | |||
@@ -90,13 +90,19 @@ extern char initial_stab[]; | |||
90 | 90 | ||
91 | #define HPTE_R_PP0 ASM_CONST(0x8000000000000000) | 91 | #define HPTE_R_PP0 ASM_CONST(0x8000000000000000) |
92 | #define HPTE_R_TS ASM_CONST(0x4000000000000000) | 92 | #define HPTE_R_TS ASM_CONST(0x4000000000000000) |
93 | #define HPTE_R_KEY_HI ASM_CONST(0x3000000000000000) | ||
93 | #define HPTE_R_RPN_SHIFT 12 | 94 | #define HPTE_R_RPN_SHIFT 12 |
94 | #define HPTE_R_RPN ASM_CONST(0x3ffffffffffff000) | 95 | #define HPTE_R_RPN ASM_CONST(0x0ffffffffffff000) |
95 | #define HPTE_R_FLAGS ASM_CONST(0x00000000000003ff) | ||
96 | #define HPTE_R_PP ASM_CONST(0x0000000000000003) | 96 | #define HPTE_R_PP ASM_CONST(0x0000000000000003) |
97 | #define HPTE_R_N ASM_CONST(0x0000000000000004) | 97 | #define HPTE_R_N ASM_CONST(0x0000000000000004) |
98 | #define HPTE_R_G ASM_CONST(0x0000000000000008) | ||
99 | #define HPTE_R_M ASM_CONST(0x0000000000000010) | ||
100 | #define HPTE_R_I ASM_CONST(0x0000000000000020) | ||
101 | #define HPTE_R_W ASM_CONST(0x0000000000000040) | ||
102 | #define HPTE_R_WIMG ASM_CONST(0x0000000000000078) | ||
98 | #define HPTE_R_C ASM_CONST(0x0000000000000080) | 103 | #define HPTE_R_C ASM_CONST(0x0000000000000080) |
99 | #define HPTE_R_R ASM_CONST(0x0000000000000100) | 104 | #define HPTE_R_R ASM_CONST(0x0000000000000100) |
105 | #define HPTE_R_KEY_LO ASM_CONST(0x0000000000000e00) | ||
100 | 106 | ||
101 | #define HPTE_V_1TB_SEG ASM_CONST(0x4000000000000000) | 107 | #define HPTE_V_1TB_SEG ASM_CONST(0x4000000000000000) |
102 | #define HPTE_V_VRMA_MASK ASM_CONST(0x4001ffffff000000) | 108 | #define HPTE_V_VRMA_MASK ASM_CONST(0x4001ffffff000000) |
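The extra HPTE_R_* masks added above name the key, WIMG and reference/change fields of the second doubleword of a hashed page table entry. A small hedged snippet (not from this series) showing how such masks combine with the existing RPN shift:

    /* Decode a few fields from the second HPTE doubleword 'r'
     * using the masks defined above (illustrative only). */
    static inline unsigned long hpte_r_rpn(unsigned long r)
    {
            return (r & HPTE_R_RPN) >> HPTE_R_RPN_SHIFT;
    }

    static inline unsigned long hpte_r_wimg(unsigned long r)
    {
            return (r & HPTE_R_WIMG) >> 3;  /* WIMG occupies bits 3-6 */
    }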
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 74126765106a..a6da12859959 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h | |||
@@ -147,9 +147,12 @@ struct paca_struct { | |||
147 | struct dtl_entry *dtl_curr; /* pointer corresponding to dtl_ridx */ | 147 | struct dtl_entry *dtl_curr; /* pointer corresponding to dtl_ridx */ |
148 | 148 | ||
149 | #ifdef CONFIG_KVM_BOOK3S_HANDLER | 149 | #ifdef CONFIG_KVM_BOOK3S_HANDLER |
150 | #ifdef CONFIG_KVM_BOOK3S_PR | ||
150 | /* We use this to store guest state in */ | 151 | /* We use this to store guest state in */ |
151 | struct kvmppc_book3s_shadow_vcpu shadow_vcpu; | 152 | struct kvmppc_book3s_shadow_vcpu shadow_vcpu; |
152 | #endif | 153 | #endif |
154 | struct kvmppc_host_state kvm_hstate; | ||
155 | #endif | ||
153 | }; | 156 | }; |
154 | 157 | ||
155 | extern struct paca_struct *paca; | 158 | extern struct paca_struct *paca; |
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 1b422381fc16..368f72f79808 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h | |||
@@ -150,18 +150,22 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) | |||
150 | #define REST_16VSRSU(n,b,base) REST_8VSRSU(n,b,base); REST_8VSRSU(n+8,b,base) | 150 | #define REST_16VSRSU(n,b,base) REST_8VSRSU(n,b,base); REST_8VSRSU(n+8,b,base) |
151 | #define REST_32VSRSU(n,b,base) REST_16VSRSU(n,b,base); REST_16VSRSU(n+16,b,base) | 151 | #define REST_32VSRSU(n,b,base) REST_16VSRSU(n,b,base); REST_16VSRSU(n+16,b,base) |
152 | 152 | ||
153 | #define SAVE_EVR(n,s,base) evmergehi s,s,n; stw s,THREAD_EVR0+4*(n)(base) | 153 | /* |
154 | #define SAVE_2EVRS(n,s,base) SAVE_EVR(n,s,base); SAVE_EVR(n+1,s,base) | 154 | * b = base register for addressing, o = base offset from register of 1st EVR |
155 | #define SAVE_4EVRS(n,s,base) SAVE_2EVRS(n,s,base); SAVE_2EVRS(n+2,s,base) | 155 | * n = first EVR, s = scratch |
156 | #define SAVE_8EVRS(n,s,base) SAVE_4EVRS(n,s,base); SAVE_4EVRS(n+4,s,base) | 156 | */ |
157 | #define SAVE_16EVRS(n,s,base) SAVE_8EVRS(n,s,base); SAVE_8EVRS(n+8,s,base) | 157 | #define SAVE_EVR(n,s,b,o) evmergehi s,s,n; stw s,o+4*(n)(b) |
158 | #define SAVE_32EVRS(n,s,base) SAVE_16EVRS(n,s,base); SAVE_16EVRS(n+16,s,base) | 158 | #define SAVE_2EVRS(n,s,b,o) SAVE_EVR(n,s,b,o); SAVE_EVR(n+1,s,b,o) |
159 | #define REST_EVR(n,s,base) lwz s,THREAD_EVR0+4*(n)(base); evmergelo n,s,n | 159 | #define SAVE_4EVRS(n,s,b,o) SAVE_2EVRS(n,s,b,o); SAVE_2EVRS(n+2,s,b,o) |
160 | #define REST_2EVRS(n,s,base) REST_EVR(n,s,base); REST_EVR(n+1,s,base) | 160 | #define SAVE_8EVRS(n,s,b,o) SAVE_4EVRS(n,s,b,o); SAVE_4EVRS(n+4,s,b,o) |
161 | #define REST_4EVRS(n,s,base) REST_2EVRS(n,s,base); REST_2EVRS(n+2,s,base) | 161 | #define SAVE_16EVRS(n,s,b,o) SAVE_8EVRS(n,s,b,o); SAVE_8EVRS(n+8,s,b,o) |
162 | #define REST_8EVRS(n,s,base) REST_4EVRS(n,s,base); REST_4EVRS(n+4,s,base) | 162 | #define SAVE_32EVRS(n,s,b,o) SAVE_16EVRS(n,s,b,o); SAVE_16EVRS(n+16,s,b,o) |
163 | #define REST_16EVRS(n,s,base) REST_8EVRS(n,s,base); REST_8EVRS(n+8,s,base) | 163 | #define REST_EVR(n,s,b,o) lwz s,o+4*(n)(b); evmergelo n,s,n |
164 | #define REST_32EVRS(n,s,base) REST_16EVRS(n,s,base); REST_16EVRS(n+16,s,base) | 164 | #define REST_2EVRS(n,s,b,o) REST_EVR(n,s,b,o); REST_EVR(n+1,s,b,o) |
165 | #define REST_4EVRS(n,s,b,o) REST_2EVRS(n,s,b,o); REST_2EVRS(n+2,s,b,o) | ||
166 | #define REST_8EVRS(n,s,b,o) REST_4EVRS(n,s,b,o); REST_4EVRS(n+4,s,b,o) | ||
167 | #define REST_16EVRS(n,s,b,o) REST_8EVRS(n,s,b,o); REST_8EVRS(n+8,s,b,o) | ||
168 | #define REST_32EVRS(n,s,b,o) REST_16EVRS(n,s,b,o); REST_16EVRS(n+16,s,b,o) | ||
165 | 169 | ||
166 | /* Macros to adjust thread priority for hardware multithreading */ | 170 | /* Macros to adjust thread priority for hardware multithreading */ |
167 | #define HMT_VERY_LOW or 31,31,31 # very low priority | 171 | #define HMT_VERY_LOW or 31,31,31 # very low priority |
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index c5cae0dd176c..ddbe57ae8584 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h | |||
@@ -189,6 +189,9 @@ | |||
189 | #define SPRN_CTR 0x009 /* Count Register */ | 189 | #define SPRN_CTR 0x009 /* Count Register */ |
190 | #define SPRN_DSCR 0x11 | 190 | #define SPRN_DSCR 0x11 |
191 | #define SPRN_CFAR 0x1c /* Come From Address Register */ | 191 | #define SPRN_CFAR 0x1c /* Come From Address Register */ |
192 | #define SPRN_AMR 0x1d /* Authority Mask Register */ | ||
193 | #define SPRN_UAMOR 0x9d /* User Authority Mask Override Register */ | ||
194 | #define SPRN_AMOR 0x15d /* Authority Mask Override Register */ | ||
192 | #define SPRN_ACOP 0x1F /* Available Coprocessor Register */ | 195 | #define SPRN_ACOP 0x1F /* Available Coprocessor Register */ |
193 | #define SPRN_CTRLF 0x088 | 196 | #define SPRN_CTRLF 0x088 |
194 | #define SPRN_CTRLT 0x098 | 197 | #define SPRN_CTRLT 0x098 |
@@ -232,22 +235,28 @@ | |||
232 | #define LPCR_VPM0 (1ul << (63-0)) | 235 | #define LPCR_VPM0 (1ul << (63-0)) |
233 | #define LPCR_VPM1 (1ul << (63-1)) | 236 | #define LPCR_VPM1 (1ul << (63-1)) |
234 | #define LPCR_ISL (1ul << (63-2)) | 237 | #define LPCR_ISL (1ul << (63-2)) |
238 | #define LPCR_VC_SH (63-2) | ||
235 | #define LPCR_DPFD_SH (63-11) | 239 | #define LPCR_DPFD_SH (63-11) |
236 | #define LPCR_VRMA_L (1ul << (63-12)) | 240 | #define LPCR_VRMA_L (1ul << (63-12)) |
237 | #define LPCR_VRMA_LP0 (1ul << (63-15)) | 241 | #define LPCR_VRMA_LP0 (1ul << (63-15)) |
238 | #define LPCR_VRMA_LP1 (1ul << (63-16)) | 242 | #define LPCR_VRMA_LP1 (1ul << (63-16)) |
243 | #define LPCR_VRMASD_SH (63-16) | ||
239 | #define LPCR_RMLS 0x1C000000 /* impl dependent rmo limit sel */ | 244 | #define LPCR_RMLS 0x1C000000 /* impl dependent rmo limit sel */ |
245 | #define LPCR_RMLS_SH (63-37) | ||
240 | #define LPCR_ILE 0x02000000 /* !HV irqs set MSR:LE */ | 246 | #define LPCR_ILE 0x02000000 /* !HV irqs set MSR:LE */ |
241 | #define LPCR_PECE 0x00007000 /* powersave exit cause enable */ | 247 | #define LPCR_PECE 0x00007000 /* powersave exit cause enable */ |
242 | #define LPCR_PECE0 0x00004000 /* ext. exceptions can cause exit */ | 248 | #define LPCR_PECE0 0x00004000 /* ext. exceptions can cause exit */ |
243 | #define LPCR_PECE1 0x00002000 /* decrementer can cause exit */ | 249 | #define LPCR_PECE1 0x00002000 /* decrementer can cause exit */ |
244 | #define LPCR_PECE2 0x00001000 /* machine check etc can cause exit */ | 250 | #define LPCR_PECE2 0x00001000 /* machine check etc can cause exit */ |
245 | #define LPCR_MER 0x00000800 /* Mediated External Exception */ | 251 | #define LPCR_MER 0x00000800 /* Mediated External Exception */ |
252 | #define LPCR_LPES 0x0000000c | ||
246 | #define LPCR_LPES0 0x00000008 /* LPAR Env selector 0 */ | 253 | #define LPCR_LPES0 0x00000008 /* LPAR Env selector 0 */ |
247 | #define LPCR_LPES1 0x00000004 /* LPAR Env selector 1 */ | 254 | #define LPCR_LPES1 0x00000004 /* LPAR Env selector 1 */ |
255 | #define LPCR_LPES_SH 2 | ||
248 | #define LPCR_RMI 0x00000002 /* real mode is cache inhibit */ | 256 | #define LPCR_RMI 0x00000002 /* real mode is cache inhibit */ |
249 | #define LPCR_HDICE 0x00000001 /* Hyp Decr enable (HV,PR,EE) */ | 257 | #define LPCR_HDICE 0x00000001 /* Hyp Decr enable (HV,PR,EE) */ |
250 | #define SPRN_LPID 0x13F /* Logical Partition Identifier */ | 258 | #define SPRN_LPID 0x13F /* Logical Partition Identifier */ |
259 | #define LPID_RSVD 0x3ff /* Reserved LPID for partn switching */ | ||
251 | #define SPRN_HMER 0x150 /* Hardware m? error recovery */ | 260 | #define SPRN_HMER 0x150 /* Hardware m? error recovery */ |
252 | #define SPRN_HMEER 0x151 /* Hardware m? enable error recovery */ | 261 | #define SPRN_HMEER 0x151 /* Hardware m? enable error recovery */ |
253 | #define SPRN_HEIR 0x153 /* Hypervisor Emulated Instruction Register */ | 262 | #define SPRN_HEIR 0x153 /* Hypervisor Emulated Instruction Register */ |
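The new *_SH companions to the LPCR masks above let multi-bit fields be placed by shifting as well as tested by masking. A hedged example of installing one such field; the function and variable names are invented for illustration:

    /* Install a new RMLS (real-mode limit selector) value into an LPCR image. */
    static unsigned long set_lpcr_rmls(unsigned long lpcr, unsigned long rmls)
    {
            lpcr &= ~(unsigned long)LPCR_RMLS;
            lpcr |= (rmls << LPCR_RMLS_SH) & LPCR_RMLS;
            return lpcr;
    }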
@@ -298,6 +307,7 @@ | |||
298 | #define SPRN_HASH1 0x3D2 /* Primary Hash Address Register */ | 307 | #define SPRN_HASH1 0x3D2 /* Primary Hash Address Register */ |
299 | #define SPRN_HASH2 0x3D3 /* Secondary Hash Address Register */ | 308 | #define SPRN_HASH2 0x3D3 /* Secondary Hash Address Register */ |
300 | #define SPRN_HID0 0x3F0 /* Hardware Implementation Register 0 */ | 309 | #define SPRN_HID0 0x3F0 /* Hardware Implementation Register 0 */ |
310 | #define HID0_HDICE_SH (63 - 23) /* 970 HDEC interrupt enable */ | ||
301 | #define HID0_EMCP (1<<31) /* Enable Machine Check pin */ | 311 | #define HID0_EMCP (1<<31) /* Enable Machine Check pin */ |
302 | #define HID0_EBA (1<<29) /* Enable Bus Address Parity */ | 312 | #define HID0_EBA (1<<29) /* Enable Bus Address Parity */ |
303 | #define HID0_EBD (1<<28) /* Enable Bus Data Parity */ | 313 | #define HID0_EBD (1<<28) /* Enable Bus Data Parity */ |
@@ -353,6 +363,13 @@ | |||
353 | #define SPRN_IABR2 0x3FA /* 83xx */ | 363 | #define SPRN_IABR2 0x3FA /* 83xx */ |
354 | #define SPRN_IBCR 0x135 /* 83xx Insn Breakpoint Control Reg */ | 364 | #define SPRN_IBCR 0x135 /* 83xx Insn Breakpoint Control Reg */ |
355 | #define SPRN_HID4 0x3F4 /* 970 HID4 */ | 365 | #define SPRN_HID4 0x3F4 /* 970 HID4 */ |
366 | #define HID4_LPES0 (1ul << (63-0)) /* LPAR env. sel. bit 0 */ | ||
367 | #define HID4_RMLS2_SH (63 - 2) /* Real mode limit bottom 2 bits */ | ||
368 | #define HID4_LPID5_SH (63 - 6) /* partition ID bottom 4 bits */ | ||
369 | #define HID4_RMOR_SH (63 - 22) /* real mode offset (16 bits) */ | ||
370 | #define HID4_LPES1 (1 << (63-57)) /* LPAR env. sel. bit 1 */ | ||
371 | #define HID4_RMLS0_SH (63 - 58) /* Real mode limit top bit */ | ||
372 | #define HID4_LPID1_SH 0 /* partition ID top 2 bits */ | ||
356 | #define SPRN_HID4_GEKKO 0x3F3 /* Gekko HID4 */ | 373 | #define SPRN_HID4_GEKKO 0x3F3 /* Gekko HID4 */ |
357 | #define SPRN_HID5 0x3F6 /* 970 HID5 */ | 374 | #define SPRN_HID5 0x3F6 /* 970 HID5 */ |
358 | #define SPRN_HID6 0x3F9 /* BE HID 6 */ | 375 | #define SPRN_HID6 0x3F9 /* BE HID 6 */ |
@@ -802,28 +819,28 @@ | |||
802 | mfspr rX,SPRN_SPRG_PACA; \ | 819 | mfspr rX,SPRN_SPRG_PACA; \ |
803 | FTR_SECTION_ELSE_NESTED(66); \ | 820 | FTR_SECTION_ELSE_NESTED(66); \ |
804 | mfspr rX,SPRN_SPRG_HPACA; \ | 821 | mfspr rX,SPRN_SPRG_HPACA; \ |
805 | ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66) | 822 | ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66) |
806 | 823 | ||
807 | #define SET_PACA(rX) \ | 824 | #define SET_PACA(rX) \ |
808 | BEGIN_FTR_SECTION_NESTED(66); \ | 825 | BEGIN_FTR_SECTION_NESTED(66); \ |
809 | mtspr SPRN_SPRG_PACA,rX; \ | 826 | mtspr SPRN_SPRG_PACA,rX; \ |
810 | FTR_SECTION_ELSE_NESTED(66); \ | 827 | FTR_SECTION_ELSE_NESTED(66); \ |
811 | mtspr SPRN_SPRG_HPACA,rX; \ | 828 | mtspr SPRN_SPRG_HPACA,rX; \ |
812 | ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66) | 829 | ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66) |
813 | 830 | ||
814 | #define GET_SCRATCH0(rX) \ | 831 | #define GET_SCRATCH0(rX) \ |
815 | BEGIN_FTR_SECTION_NESTED(66); \ | 832 | BEGIN_FTR_SECTION_NESTED(66); \ |
816 | mfspr rX,SPRN_SPRG_SCRATCH0; \ | 833 | mfspr rX,SPRN_SPRG_SCRATCH0; \ |
817 | FTR_SECTION_ELSE_NESTED(66); \ | 834 | FTR_SECTION_ELSE_NESTED(66); \ |
818 | mfspr rX,SPRN_SPRG_HSCRATCH0; \ | 835 | mfspr rX,SPRN_SPRG_HSCRATCH0; \ |
819 | ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66) | 836 | ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66) |
820 | 837 | ||
821 | #define SET_SCRATCH0(rX) \ | 838 | #define SET_SCRATCH0(rX) \ |
822 | BEGIN_FTR_SECTION_NESTED(66); \ | 839 | BEGIN_FTR_SECTION_NESTED(66); \ |
823 | mtspr SPRN_SPRG_SCRATCH0,rX; \ | 840 | mtspr SPRN_SPRG_SCRATCH0,rX; \ |
824 | FTR_SECTION_ELSE_NESTED(66); \ | 841 | FTR_SECTION_ELSE_NESTED(66); \ |
825 | mtspr SPRN_SPRG_HSCRATCH0,rX; \ | 842 | mtspr SPRN_SPRG_HSCRATCH0,rX; \ |
826 | ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66) | 843 | ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66) |
827 | 844 | ||
828 | #else /* CONFIG_PPC_BOOK3S_64 */ | 845 | #else /* CONFIG_PPC_BOOK3S_64 */ |
829 | #define GET_SCRATCH0(rX) mfspr rX,SPRN_SPRG_SCRATCH0 | 846 | #define GET_SCRATCH0(rX) mfspr rX,SPRN_SPRG_SCRATCH0 |
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 0f0ad9fa01c1..9ec0b39f9ddc 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h | |||
@@ -318,6 +318,7 @@ | |||
318 | #define ESR_ILK 0x00100000 /* Instr. Cache Locking */ | 318 | #define ESR_ILK 0x00100000 /* Instr. Cache Locking */ |
319 | #define ESR_PUO 0x00040000 /* Unimplemented Operation exception */ | 319 | #define ESR_PUO 0x00040000 /* Unimplemented Operation exception */ |
320 | #define ESR_BO 0x00020000 /* Byte Ordering */ | 320 | #define ESR_BO 0x00020000 /* Byte Ordering */ |
321 | #define ESR_SPV 0x00000080 /* Signal Processing operation */ | ||
321 | 322 | ||
322 | /* Bit definitions related to the DBCR0. */ | 323 | /* Bit definitions related to the DBCR0. */ |
323 | #if defined(CONFIG_40x) | 324 | #if defined(CONFIG_40x) |
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 36e1c8a29be8..54b935f2f5de 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c | |||
@@ -128,6 +128,7 @@ int main(void) | |||
128 | DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page)); | 128 | DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page)); |
129 | /* paca */ | 129 | /* paca */ |
130 | DEFINE(PACA_SIZE, sizeof(struct paca_struct)); | 130 | DEFINE(PACA_SIZE, sizeof(struct paca_struct)); |
131 | DEFINE(PACA_LOCK_TOKEN, offsetof(struct paca_struct, lock_token)); | ||
131 | DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index)); | 132 | DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index)); |
132 | DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start)); | 133 | DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start)); |
133 | DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack)); | 134 | DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack)); |
@@ -187,7 +188,9 @@ int main(void) | |||
187 | DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1)); | 188 | DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1)); |
188 | DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int)); | 189 | DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int)); |
189 | DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int)); | 190 | DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int)); |
191 | DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use)); | ||
190 | DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx)); | 192 | DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx)); |
193 | DEFINE(LPPACA_YIELDCOUNT, offsetof(struct lppaca, yield_count)); | ||
191 | DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx)); | 194 | DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx)); |
192 | #endif /* CONFIG_PPC_STD_MMU_64 */ | 195 | #endif /* CONFIG_PPC_STD_MMU_64 */ |
193 | DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp)); | 196 | DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp)); |
@@ -198,11 +201,6 @@ int main(void) | |||
198 | DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time)); | 201 | DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time)); |
199 | DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time)); | 202 | DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time)); |
200 | DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save)); | 203 | DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save)); |
201 | #ifdef CONFIG_KVM_BOOK3S_64_HANDLER | ||
202 | DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu)); | ||
203 | DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb)); | ||
204 | DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max)); | ||
205 | #endif | ||
206 | #endif /* CONFIG_PPC64 */ | 204 | #endif /* CONFIG_PPC64 */ |
207 | 205 | ||
208 | /* RTAS */ | 206 | /* RTAS */ |
@@ -397,67 +395,160 @@ int main(void) | |||
397 | DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); | 395 | DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); |
398 | DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); | 396 | DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); |
399 | DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave)); | 397 | DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave)); |
398 | DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr)); | ||
399 | DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr)); | ||
400 | #ifdef CONFIG_ALTIVEC | ||
401 | DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr)); | ||
402 | DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr)); | ||
403 | #endif | ||
404 | #ifdef CONFIG_VSX | ||
405 | DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr)); | ||
406 | #endif | ||
407 | DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); | ||
408 | DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr)); | ||
409 | DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); | ||
410 | DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); | ||
411 | DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc)); | ||
412 | #ifdef CONFIG_KVM_BOOK3S_64_HV | ||
413 | DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr)); | ||
414 | DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0)); | ||
415 | DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1)); | ||
416 | DEFINE(VCPU_SPRG0, offsetof(struct kvm_vcpu, arch.shregs.sprg0)); | ||
417 | DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.shregs.sprg1)); | ||
418 | DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2)); | ||
419 | DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3)); | ||
420 | #endif | ||
400 | DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4)); | 421 | DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4)); |
401 | DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5)); | 422 | DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5)); |
402 | DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6)); | 423 | DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6)); |
403 | DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7)); | 424 | DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7)); |
404 | DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid)); | 425 | DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid)); |
426 | DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1)); | ||
405 | DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared)); | 427 | DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared)); |
406 | DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr)); | 428 | DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr)); |
429 | DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr)); | ||
407 | 430 | ||
408 | /* book3s */ | 431 | /* book3s */ |
432 | #ifdef CONFIG_KVM_BOOK3S_64_HV | ||
433 | DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); | ||
434 | DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1)); | ||
435 | DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid)); | ||
436 | DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); | ||
437 | DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1)); | ||
438 | DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock)); | ||
439 | DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter)); | ||
440 | DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu)); | ||
441 | DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); | ||
442 | DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); | ||
443 | DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); | ||
444 | DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); | ||
445 | #endif | ||
409 | #ifdef CONFIG_PPC_BOOK3S | 446 | #ifdef CONFIG_PPC_BOOK3S |
447 | DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); | ||
448 | DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); | ||
410 | DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip)); | 449 | DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip)); |
411 | DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr)); | 450 | DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr)); |
412 | DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr)); | 451 | DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr)); |
452 | DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr)); | ||
453 | DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr)); | ||
454 | DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr)); | ||
455 | DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor)); | ||
456 | DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl)); | ||
457 | DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr)); | ||
413 | DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem)); | 458 | DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem)); |
414 | DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter)); | 459 | DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter)); |
415 | DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler)); | 460 | DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler)); |
416 | DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall)); | 461 | DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall)); |
417 | DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); | 462 | DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); |
463 | DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec)); | ||
464 | DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires)); | ||
465 | DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions)); | ||
466 | DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa)); | ||
467 | DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr)); | ||
468 | DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc)); | ||
469 | DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb)); | ||
470 | DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max)); | ||
471 | DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); | ||
472 | DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu)); | ||
473 | DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); | ||
474 | DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); | ||
475 | DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); | ||
476 | DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap)); | ||
477 | DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid)); | ||
478 | DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); | ||
479 | DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); | ||
480 | DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); | ||
418 | DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) - | 481 | DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) - |
419 | offsetof(struct kvmppc_vcpu_book3s, vcpu)); | 482 | offsetof(struct kvmppc_vcpu_book3s, vcpu)); |
420 | DEFINE(SVCPU_CR, offsetof(struct kvmppc_book3s_shadow_vcpu, cr)); | 483 | DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige)); |
421 | DEFINE(SVCPU_XER, offsetof(struct kvmppc_book3s_shadow_vcpu, xer)); | 484 | DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv)); |
422 | DEFINE(SVCPU_CTR, offsetof(struct kvmppc_book3s_shadow_vcpu, ctr)); | 485 | DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb)); |
423 | DEFINE(SVCPU_LR, offsetof(struct kvmppc_book3s_shadow_vcpu, lr)); | 486 | |
424 | DEFINE(SVCPU_PC, offsetof(struct kvmppc_book3s_shadow_vcpu, pc)); | 487 | #ifdef CONFIG_PPC_BOOK3S_64 |
425 | DEFINE(SVCPU_R0, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[0])); | 488 | #ifdef CONFIG_KVM_BOOK3S_PR |
426 | DEFINE(SVCPU_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[1])); | 489 | # define SVCPU_FIELD(x, f) DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f)) |
427 | DEFINE(SVCPU_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[2])); | 490 | #else |
428 | DEFINE(SVCPU_R3, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[3])); | 491 | # define SVCPU_FIELD(x, f) |
429 | DEFINE(SVCPU_R4, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[4])); | 492 | #endif |
430 | DEFINE(SVCPU_R5, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[5])); | 493 | # define HSTATE_FIELD(x, f) DEFINE(x, offsetof(struct paca_struct, kvm_hstate.f)) |
431 | DEFINE(SVCPU_R6, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[6])); | 494 | #else /* 32-bit */ |
432 | DEFINE(SVCPU_R7, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[7])); | 495 | # define SVCPU_FIELD(x, f) DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, f)) |
433 | DEFINE(SVCPU_R8, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[8])); | 496 | # define HSTATE_FIELD(x, f) DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, hstate.f)) |
434 | DEFINE(SVCPU_R9, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[9])); | 497 | #endif |
435 | DEFINE(SVCPU_R10, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[10])); | 498 | |
436 | DEFINE(SVCPU_R11, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[11])); | 499 | SVCPU_FIELD(SVCPU_CR, cr); |
437 | DEFINE(SVCPU_R12, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[12])); | 500 | SVCPU_FIELD(SVCPU_XER, xer); |
438 | DEFINE(SVCPU_R13, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[13])); | 501 | SVCPU_FIELD(SVCPU_CTR, ctr); |
439 | DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1)); | 502 | SVCPU_FIELD(SVCPU_LR, lr); |
440 | DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2)); | 503 | SVCPU_FIELD(SVCPU_PC, pc); |
441 | DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu, | 504 | SVCPU_FIELD(SVCPU_R0, gpr[0]); |
442 | vmhandler)); | 505 | SVCPU_FIELD(SVCPU_R1, gpr[1]); |
443 | DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu, | 506 | SVCPU_FIELD(SVCPU_R2, gpr[2]); |
444 | scratch0)); | 507 | SVCPU_FIELD(SVCPU_R3, gpr[3]); |
445 | DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu, | 508 | SVCPU_FIELD(SVCPU_R4, gpr[4]); |
446 | scratch1)); | 509 | SVCPU_FIELD(SVCPU_R5, gpr[5]); |
447 | DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu, | 510 | SVCPU_FIELD(SVCPU_R6, gpr[6]); |
448 | in_guest)); | 511 | SVCPU_FIELD(SVCPU_R7, gpr[7]); |
449 | DEFINE(SVCPU_FAULT_DSISR, offsetof(struct kvmppc_book3s_shadow_vcpu, | 512 | SVCPU_FIELD(SVCPU_R8, gpr[8]); |
450 | fault_dsisr)); | 513 | SVCPU_FIELD(SVCPU_R9, gpr[9]); |
451 | DEFINE(SVCPU_FAULT_DAR, offsetof(struct kvmppc_book3s_shadow_vcpu, | 514 | SVCPU_FIELD(SVCPU_R10, gpr[10]); |
452 | fault_dar)); | 515 | SVCPU_FIELD(SVCPU_R11, gpr[11]); |
453 | DEFINE(SVCPU_LAST_INST, offsetof(struct kvmppc_book3s_shadow_vcpu, | 516 | SVCPU_FIELD(SVCPU_R12, gpr[12]); |
454 | last_inst)); | 517 | SVCPU_FIELD(SVCPU_R13, gpr[13]); |
455 | DEFINE(SVCPU_SHADOW_SRR1, offsetof(struct kvmppc_book3s_shadow_vcpu, | 518 | SVCPU_FIELD(SVCPU_FAULT_DSISR, fault_dsisr); |
456 | shadow_srr1)); | 519 | SVCPU_FIELD(SVCPU_FAULT_DAR, fault_dar); |
520 | SVCPU_FIELD(SVCPU_LAST_INST, last_inst); | ||
521 | SVCPU_FIELD(SVCPU_SHADOW_SRR1, shadow_srr1); | ||
457 | #ifdef CONFIG_PPC_BOOK3S_32 | 522 | #ifdef CONFIG_PPC_BOOK3S_32 |
458 | DEFINE(SVCPU_SR, offsetof(struct kvmppc_book3s_shadow_vcpu, sr)); | 523 | SVCPU_FIELD(SVCPU_SR, sr); |
459 | #endif | 524 | #endif |
460 | #else | 525 | #ifdef CONFIG_PPC64 |
526 | SVCPU_FIELD(SVCPU_SLB, slb); | ||
527 | SVCPU_FIELD(SVCPU_SLB_MAX, slb_max); | ||
528 | #endif | ||
529 | |||
530 | HSTATE_FIELD(HSTATE_HOST_R1, host_r1); | ||
531 | HSTATE_FIELD(HSTATE_HOST_R2, host_r2); | ||
532 | HSTATE_FIELD(HSTATE_HOST_MSR, host_msr); | ||
533 | HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler); | ||
534 | HSTATE_FIELD(HSTATE_SCRATCH0, scratch0); | ||
535 | HSTATE_FIELD(HSTATE_SCRATCH1, scratch1); | ||
536 | HSTATE_FIELD(HSTATE_IN_GUEST, in_guest); | ||
537 | |||
538 | #ifdef CONFIG_KVM_BOOK3S_64_HV | ||
539 | HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); | ||
540 | HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore); | ||
541 | HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys); | ||
542 | HSTATE_FIELD(HSTATE_MMCR, host_mmcr); | ||
543 | HSTATE_FIELD(HSTATE_PMC, host_pmc); | ||
544 | HSTATE_FIELD(HSTATE_PURR, host_purr); | ||
545 | HSTATE_FIELD(HSTATE_SPURR, host_spurr); | ||
546 | HSTATE_FIELD(HSTATE_DSCR, host_dscr); | ||
547 | HSTATE_FIELD(HSTATE_DABR, dabr); | ||
548 | HSTATE_FIELD(HSTATE_DECEXP, dec_expires); | ||
549 | #endif /* CONFIG_KVM_BOOK3S_64_HV */ | ||
550 | |||
551 | #else /* CONFIG_PPC_BOOK3S */ | ||
461 | DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); | 552 | DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); |
462 | DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); | 553 | DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); |
463 | DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); | 554 | DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); |
@@ -467,7 +558,7 @@ int main(void) | |||
467 | DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); | 558 | DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); |
468 | DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); | 559 | DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); |
469 | #endif /* CONFIG_PPC_BOOK3S */ | 560 | #endif /* CONFIG_PPC_BOOK3S */ |
470 | #endif | 561 | #endif /* CONFIG_KVM */ |
471 | 562 | ||
472 | #ifdef CONFIG_KVM_GUEST | 563 | #ifdef CONFIG_KVM_GUEST |
473 | DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared, | 564 | DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared, |
@@ -497,6 +588,13 @@ int main(void) | |||
497 | DEFINE(TLBCAM_MAS7, offsetof(struct tlbcam, MAS7)); | 588 | DEFINE(TLBCAM_MAS7, offsetof(struct tlbcam, MAS7)); |
498 | #endif | 589 | #endif |
499 | 590 | ||
591 | #if defined(CONFIG_KVM) && defined(CONFIG_SPE) | ||
592 | DEFINE(VCPU_EVR, offsetof(struct kvm_vcpu, arch.evr[0])); | ||
593 | DEFINE(VCPU_ACC, offsetof(struct kvm_vcpu, arch.acc)); | ||
594 | DEFINE(VCPU_SPEFSCR, offsetof(struct kvm_vcpu, arch.spefscr)); | ||
595 | DEFINE(VCPU_HOST_SPEFSCR, offsetof(struct kvm_vcpu, arch.host_spefscr)); | ||
596 | #endif | ||
597 | |||
500 | #ifdef CONFIG_KVM_EXIT_TIMING | 598 | #ifdef CONFIG_KVM_EXIT_TIMING |
501 | DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu, | 599 | DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu, |
502 | arch.timing_exit.tv32.tbu)); | 600 | arch.timing_exit.tv32.tbu)); |
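The asm-offsets hunk above folds the long run of per-register DEFINE() lines into the SVCPU_FIELD()/HSTATE_FIELD() wrappers defined at its top; both forms boil down to the same thing, emitting offsetof() results as constants that assembly code can use to address C structures. A minimal user-space sketch of that idea, using a made-up struct and printing the constants instead of routing them through kbuild's generated assembly:

	/* Sketch of the asm-offsets idea: compute struct member offsets at
	 * compile time so assembly can address C structures by constant.
	 * The struct and the DEFINE() body here are illustrative, not the
	 * kernel's kbuild machinery. */
	#include <stddef.h>
	#include <stdio.h>

	struct shadow_vcpu_example {
		unsigned long gpr[14];
		unsigned long cr;
		unsigned long xer;
	};

	/* The kernel's DEFINE() emits the value into generated assembly;
	 * printing it is enough to show where the numbers come from. */
	#define DEFINE(sym, val) printf("#define %-12s %zu\n", #sym, (size_t)(val))

	int main(void)
	{
		DEFINE(SVCPU_R0,  offsetof(struct shadow_vcpu_example, gpr[0]));
		DEFINE(SVCPU_R13, offsetof(struct shadow_vcpu_example, gpr[13]));
		DEFINE(SVCPU_CR,  offsetof(struct shadow_vcpu_example, cr));
		DEFINE(SVCPU_XER, offsetof(struct shadow_vcpu_example, xer));
		return 0;
	}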
diff --git a/arch/powerpc/kernel/cpu_setup_power7.S b/arch/powerpc/kernel/cpu_setup_power7.S index 4f9a93fcfe07..76797c5105d6 100644 --- a/arch/powerpc/kernel/cpu_setup_power7.S +++ b/arch/powerpc/kernel/cpu_setup_power7.S | |||
@@ -45,12 +45,12 @@ _GLOBAL(__restore_cpu_power7) | |||
45 | blr | 45 | blr |
46 | 46 | ||
47 | __init_hvmode_206: | 47 | __init_hvmode_206: |
48 | /* Disable CPU_FTR_HVMODE_206 and exit if MSR:HV is not set */ | 48 | /* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */ |
49 | mfmsr r3 | 49 | mfmsr r3 |
50 | rldicl. r0,r3,4,63 | 50 | rldicl. r0,r3,4,63 |
51 | bnelr | 51 | bnelr |
52 | ld r5,CPU_SPEC_FEATURES(r4) | 52 | ld r5,CPU_SPEC_FEATURES(r4) |
53 | LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE_206) | 53 | LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE) |
54 | xor r5,r5,r6 | 54 | xor r5,r5,r6 |
55 | std r5,CPU_SPEC_FEATURES(r4) | 55 | std r5,CPU_SPEC_FEATURES(r4) |
56 | blr | 56 | blr |
@@ -61,19 +61,23 @@ __init_LPCR: | |||
61 | * LPES = 0b01 (HSRR0/1 used for 0x500) | 61 | * LPES = 0b01 (HSRR0/1 used for 0x500) |
62 | * PECE = 0b111 | 62 | * PECE = 0b111 |
63 | * DPFD = 4 | 63 | * DPFD = 4 |
64 | * HDICE = 0 | ||
65 | * VC = 0b100 (VPM0=1, VPM1=0, ISL=0) | ||
66 | * VRMASD = 0b10000 (L=1, LP=00) | ||
64 | * | 67 | * |
65 | * Other bits untouched for now | 68 | * Other bits untouched for now |
66 | */ | 69 | */ |
67 | mfspr r3,SPRN_LPCR | 70 | mfspr r3,SPRN_LPCR |
68 | ori r3,r3,(LPCR_LPES0|LPCR_LPES1) | 71 | li r5,1 |
69 | xori r3,r3, LPCR_LPES0 | 72 | rldimi r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2 |
70 | ori r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2) | 73 | ori r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2) |
71 | li r5,7 | ||
72 | sldi r5,r5,LPCR_DPFD_SH | ||
73 | andc r3,r3,r5 | ||
74 | li r5,4 | 74 | li r5,4 |
75 | sldi r5,r5,LPCR_DPFD_SH | 75 | rldimi r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3 |
76 | or r3,r3,r5 | 76 | clrrdi r3,r3,1 /* clear HDICE */ |
77 | li r5,4 | ||
78 | rldimi r3,r5, LPCR_VC_SH, 0 | ||
79 | li r5,0x10 | ||
80 | rldimi r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5 | ||
77 | mtspr SPRN_LPCR,r3 | 81 | mtspr SPRN_LPCR,r3 |
78 | isync | 82 | isync |
79 | blr | 83 | blr |
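The reworked __init_LPCR sequence above swaps the earlier ori/andc masking for rldimi, which inserts an n-bit value into the destination register at a given position while leaving every other LPCR bit untouched. A rough C equivalent of that insert-a-bit-field pattern (the shift values below are placeholders, not the real LPCR_*_SH definitions from asm/reg.h):

	#include <stdint.h>
	#include <stdio.h>

	/* Insert the low 'width' bits of 'val' into 'reg' at bit position
	 * 'shift', preserving all other bits -- roughly what the rldimi
	 * instructions in the hunk above do. */
	static uint64_t insert_field(uint64_t reg, uint64_t val,
				     unsigned shift, unsigned width)
	{
		uint64_t mask = ((1ULL << width) - 1) << shift;

		return (reg & ~mask) | ((val << shift) & mask);
	}

	int main(void)
	{
		/* Placeholder shift positions, for illustration only. */
		enum { DPFD_SH = 52, LPES_SH = 2 };
		uint64_t lpcr = 0;

		lpcr = insert_field(lpcr, 4, DPFD_SH, 3); /* DPFD = 4    */
		lpcr = insert_field(lpcr, 1, LPES_SH, 2); /* LPES = 0b01 */
		printf("LPCR = 0x%016llx\n", (unsigned long long)lpcr);
		return 0;
	}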
diff --git a/arch/powerpc/kernel/cpu_setup_ppc970.S b/arch/powerpc/kernel/cpu_setup_ppc970.S index 27f2507279d8..12fac8df01c5 100644 --- a/arch/powerpc/kernel/cpu_setup_ppc970.S +++ b/arch/powerpc/kernel/cpu_setup_ppc970.S | |||
@@ -76,7 +76,7 @@ _GLOBAL(__setup_cpu_ppc970) | |||
76 | /* Do nothing if not running in HV mode */ | 76 | /* Do nothing if not running in HV mode */ |
77 | mfmsr r0 | 77 | mfmsr r0 |
78 | rldicl. r0,r0,4,63 | 78 | rldicl. r0,r0,4,63 |
79 | beqlr | 79 | beq no_hv_mode |
80 | 80 | ||
81 | mfspr r0,SPRN_HID0 | 81 | mfspr r0,SPRN_HID0 |
82 | li r11,5 /* clear DOZE and SLEEP */ | 82 | li r11,5 /* clear DOZE and SLEEP */ |
@@ -90,7 +90,7 @@ _GLOBAL(__setup_cpu_ppc970MP) | |||
90 | /* Do nothing if not running in HV mode */ | 90 | /* Do nothing if not running in HV mode */ |
91 | mfmsr r0 | 91 | mfmsr r0 |
92 | rldicl. r0,r0,4,63 | 92 | rldicl. r0,r0,4,63 |
93 | beqlr | 93 | beq no_hv_mode |
94 | 94 | ||
95 | mfspr r0,SPRN_HID0 | 95 | mfspr r0,SPRN_HID0 |
96 | li r11,0x15 /* clear DOZE and SLEEP */ | 96 | li r11,0x15 /* clear DOZE and SLEEP */ |
@@ -109,6 +109,14 @@ load_hids: | |||
109 | sync | 109 | sync |
110 | isync | 110 | isync |
111 | 111 | ||
112 | /* Try to set LPES = 01 in HID4 */ | ||
113 | mfspr r0,SPRN_HID4 | ||
114 | clrldi r0,r0,1 /* clear LPES0 */ | ||
115 | ori r0,r0,HID4_LPES1 /* set LPES1 */ | ||
116 | sync | ||
117 | mtspr SPRN_HID4,r0 | ||
118 | isync | ||
119 | |||
112 | /* Save away cpu state */ | 120 | /* Save away cpu state */ |
113 | LOAD_REG_ADDR(r5,cpu_state_storage) | 121 | LOAD_REG_ADDR(r5,cpu_state_storage) |
114 | 122 | ||
@@ -117,11 +125,21 @@ load_hids: | |||
117 | std r3,CS_HID0(r5) | 125 | std r3,CS_HID0(r5) |
118 | mfspr r3,SPRN_HID1 | 126 | mfspr r3,SPRN_HID1 |
119 | std r3,CS_HID1(r5) | 127 | std r3,CS_HID1(r5) |
120 | mfspr r3,SPRN_HID4 | 128 | mfspr r4,SPRN_HID4 |
121 | std r3,CS_HID4(r5) | 129 | std r4,CS_HID4(r5) |
122 | mfspr r3,SPRN_HID5 | 130 | mfspr r3,SPRN_HID5 |
123 | std r3,CS_HID5(r5) | 131 | std r3,CS_HID5(r5) |
124 | 132 | ||
133 | /* See if we successfully set LPES1 to 1; if not we are in Apple mode */ | ||
134 | andi. r4,r4,HID4_LPES1 | ||
135 | bnelr | ||
136 | |||
137 | no_hv_mode: | ||
138 | /* Disable CPU_FTR_HVMODE and exit, since we don't have HV mode */ | ||
139 | ld r5,CPU_SPEC_FEATURES(r4) | ||
140 | LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE) | ||
141 | andc r5,r5,r6 | ||
142 | std r5,CPU_SPEC_FEATURES(r4) | ||
125 | blr | 143 | blr |
126 | 144 | ||
127 | /* Called with no MMU context (typically MSR:IR/DR off) to | 145 | /* Called with no MMU context (typically MSR:IR/DR off) to |
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index a85f4874cba7..41b02c792aa3 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S | |||
@@ -40,7 +40,6 @@ __start_interrupts: | |||
40 | .globl system_reset_pSeries; | 40 | .globl system_reset_pSeries; |
41 | system_reset_pSeries: | 41 | system_reset_pSeries: |
42 | HMT_MEDIUM; | 42 | HMT_MEDIUM; |
43 | DO_KVM 0x100; | ||
44 | SET_SCRATCH0(r13) | 43 | SET_SCRATCH0(r13) |
45 | #ifdef CONFIG_PPC_P7_NAP | 44 | #ifdef CONFIG_PPC_P7_NAP |
46 | BEGIN_FTR_SECTION | 45 | BEGIN_FTR_SECTION |
@@ -50,82 +49,73 @@ BEGIN_FTR_SECTION | |||
50 | * state loss at this time. | 49 | * state loss at this time. |
51 | */ | 50 | */ |
52 | mfspr r13,SPRN_SRR1 | 51 | mfspr r13,SPRN_SRR1 |
53 | rlwinm r13,r13,47-31,30,31 | 52 | rlwinm. r13,r13,47-31,30,31 |
54 | cmpwi cr0,r13,1 | 53 | beq 9f |
55 | bne 1f | 54 | |
56 | b .power7_wakeup_noloss | 55 | /* waking up from powersave (nap) state */ |
57 | 1: cmpwi cr0,r13,2 | 56 | cmpwi cr1,r13,2 |
58 | bne 1f | ||
59 | b .power7_wakeup_loss | ||
60 | /* Total loss of HV state is fatal, we could try to use the | 57 | /* Total loss of HV state is fatal, we could try to use the |
61 | * PIR to locate a PACA, then use an emergency stack etc... | 58 | * PIR to locate a PACA, then use an emergency stack etc... |
62 | * but for now, let's just stay stuck here | 59 | * but for now, let's just stay stuck here |
63 | */ | 60 | */ |
64 | 1: cmpwi cr0,r13,3 | 61 | bgt cr1,. |
65 | beq . | 62 | GET_PACA(r13) |
66 | END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206) | 63 | |
64 | #ifdef CONFIG_KVM_BOOK3S_64_HV | ||
65 | lbz r0,PACAPROCSTART(r13) | ||
66 | cmpwi r0,0x80 | ||
67 | bne 1f | ||
68 | li r0,0 | ||
69 | stb r0,PACAPROCSTART(r13) | ||
70 | b kvm_start_guest | ||
71 | 1: | ||
72 | #endif | ||
73 | |||
74 | beq cr1,2f | ||
75 | b .power7_wakeup_noloss | ||
76 | 2: b .power7_wakeup_loss | ||
77 | 9: | ||
78 | END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) | ||
67 | #endif /* CONFIG_PPC_P7_NAP */ | 79 | #endif /* CONFIG_PPC_P7_NAP */ |
68 | EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD) | 80 | EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD, |
81 | NOTEST, 0x100) | ||
69 | 82 | ||
70 | . = 0x200 | 83 | . = 0x200 |
71 | _machine_check_pSeries: | 84 | machine_check_pSeries_1: |
72 | HMT_MEDIUM | 85 | /* This is moved out of line as it can be patched by FW, but |
73 | DO_KVM 0x200 | 86 | * some code path might still want to branch into the original |
74 | SET_SCRATCH0(r13) | 87 | * vector |
75 | EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, EXC_STD) | 88 | */ |
89 | b machine_check_pSeries | ||
76 | 90 | ||
77 | . = 0x300 | 91 | . = 0x300 |
78 | .globl data_access_pSeries | 92 | .globl data_access_pSeries |
79 | data_access_pSeries: | 93 | data_access_pSeries: |
80 | HMT_MEDIUM | 94 | HMT_MEDIUM |
81 | DO_KVM 0x300 | ||
82 | SET_SCRATCH0(r13) | 95 | SET_SCRATCH0(r13) |
96 | #ifndef CONFIG_POWER4_ONLY | ||
83 | BEGIN_FTR_SECTION | 97 | BEGIN_FTR_SECTION |
84 | GET_PACA(r13) | 98 | b data_access_check_stab |
85 | std r9,PACA_EXSLB+EX_R9(r13) | 99 | data_access_not_stab: |
86 | std r10,PACA_EXSLB+EX_R10(r13) | 100 | END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB) |
87 | mfspr r10,SPRN_DAR | 101 | #endif |
88 | mfspr r9,SPRN_DSISR | 102 | EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD, |
89 | srdi r10,r10,60 | 103 | KVMTEST_PR, 0x300) |
90 | rlwimi r10,r9,16,0x20 | ||
91 | mfcr r9 | ||
92 | cmpwi r10,0x2c | ||
93 | beq do_stab_bolted_pSeries | ||
94 | ld r10,PACA_EXSLB+EX_R10(r13) | ||
95 | std r11,PACA_EXGEN+EX_R11(r13) | ||
96 | ld r11,PACA_EXSLB+EX_R9(r13) | ||
97 | std r12,PACA_EXGEN+EX_R12(r13) | ||
98 | GET_SCRATCH0(r12) | ||
99 | std r10,PACA_EXGEN+EX_R10(r13) | ||
100 | std r11,PACA_EXGEN+EX_R9(r13) | ||
101 | std r12,PACA_EXGEN+EX_R13(r13) | ||
102 | EXCEPTION_PROLOG_PSERIES_1(data_access_common, EXC_STD) | ||
103 | FTR_SECTION_ELSE | ||
104 | EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD) | ||
105 | ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_SLB) | ||
106 | 104 | ||
107 | . = 0x380 | 105 | . = 0x380 |
108 | .globl data_access_slb_pSeries | 106 | .globl data_access_slb_pSeries |
109 | data_access_slb_pSeries: | 107 | data_access_slb_pSeries: |
110 | HMT_MEDIUM | 108 | HMT_MEDIUM |
111 | DO_KVM 0x380 | ||
112 | SET_SCRATCH0(r13) | 109 | SET_SCRATCH0(r13) |
113 | GET_PACA(r13) | 110 | EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) |
114 | std r3,PACA_EXSLB+EX_R3(r13) | 111 | std r3,PACA_EXSLB+EX_R3(r13) |
115 | mfspr r3,SPRN_DAR | 112 | mfspr r3,SPRN_DAR |
116 | std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */ | ||
117 | mfcr r9 | ||
118 | #ifdef __DISABLED__ | 113 | #ifdef __DISABLED__ |
119 | /* Keep that around for when we re-implement dynamic VSIDs */ | 114 | /* Keep that around for when we re-implement dynamic VSIDs */ |
120 | cmpdi r3,0 | 115 | cmpdi r3,0 |
121 | bge slb_miss_user_pseries | 116 | bge slb_miss_user_pseries |
122 | #endif /* __DISABLED__ */ | 117 | #endif /* __DISABLED__ */ |
123 | std r10,PACA_EXSLB+EX_R10(r13) | 118 | mfspr r12,SPRN_SRR1 |
124 | std r11,PACA_EXSLB+EX_R11(r13) | ||
125 | std r12,PACA_EXSLB+EX_R12(r13) | ||
126 | GET_SCRATCH0(r10) | ||
127 | std r10,PACA_EXSLB+EX_R13(r13) | ||
128 | mfspr r12,SPRN_SRR1 /* and SRR1 */ | ||
129 | #ifndef CONFIG_RELOCATABLE | 119 | #ifndef CONFIG_RELOCATABLE |
130 | b .slb_miss_realmode | 120 | b .slb_miss_realmode |
131 | #else | 121 | #else |
@@ -147,24 +137,16 @@ data_access_slb_pSeries: | |||
147 | .globl instruction_access_slb_pSeries | 137 | .globl instruction_access_slb_pSeries |
148 | instruction_access_slb_pSeries: | 138 | instruction_access_slb_pSeries: |
149 | HMT_MEDIUM | 139 | HMT_MEDIUM |
150 | DO_KVM 0x480 | ||
151 | SET_SCRATCH0(r13) | 140 | SET_SCRATCH0(r13) |
152 | GET_PACA(r13) | 141 | EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) |
153 | std r3,PACA_EXSLB+EX_R3(r13) | 142 | std r3,PACA_EXSLB+EX_R3(r13) |
154 | mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ | 143 | mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ |
155 | std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */ | ||
156 | mfcr r9 | ||
157 | #ifdef __DISABLED__ | 144 | #ifdef __DISABLED__ |
158 | /* Keep that around for when we re-implement dynamic VSIDs */ | 145 | /* Keep that around for when we re-implement dynamic VSIDs */ |
159 | cmpdi r3,0 | 146 | cmpdi r3,0 |
160 | bge slb_miss_user_pseries | 147 | bge slb_miss_user_pseries |
161 | #endif /* __DISABLED__ */ | 148 | #endif /* __DISABLED__ */ |
162 | std r10,PACA_EXSLB+EX_R10(r13) | 149 | mfspr r12,SPRN_SRR1 |
163 | std r11,PACA_EXSLB+EX_R11(r13) | ||
164 | std r12,PACA_EXSLB+EX_R12(r13) | ||
165 | GET_SCRATCH0(r10) | ||
166 | std r10,PACA_EXSLB+EX_R13(r13) | ||
167 | mfspr r12,SPRN_SRR1 /* and SRR1 */ | ||
168 | #ifndef CONFIG_RELOCATABLE | 150 | #ifndef CONFIG_RELOCATABLE |
169 | b .slb_miss_realmode | 151 | b .slb_miss_realmode |
170 | #else | 152 | #else |
@@ -184,26 +166,46 @@ instruction_access_slb_pSeries: | |||
184 | hardware_interrupt_pSeries: | 166 | hardware_interrupt_pSeries: |
185 | hardware_interrupt_hv: | 167 | hardware_interrupt_hv: |
186 | BEGIN_FTR_SECTION | 168 | BEGIN_FTR_SECTION |
187 | _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt, EXC_STD) | 169 | _MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt, |
170 | EXC_HV, SOFTEN_TEST_HV) | ||
171 | KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502) | ||
188 | FTR_SECTION_ELSE | 172 | FTR_SECTION_ELSE |
189 | _MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt, EXC_HV) | 173 | _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt, |
190 | ALT_FTR_SECTION_END_IFCLR(CPU_FTR_HVMODE_206) | 174 | EXC_STD, SOFTEN_TEST_HV_201) |
175 | KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500) | ||
176 | ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) | ||
191 | 177 | ||
192 | STD_EXCEPTION_PSERIES(0x600, 0x600, alignment) | 178 | STD_EXCEPTION_PSERIES(0x600, 0x600, alignment) |
179 | KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x600) | ||
180 | |||
193 | STD_EXCEPTION_PSERIES(0x700, 0x700, program_check) | 181 | STD_EXCEPTION_PSERIES(0x700, 0x700, program_check) |
182 | KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x700) | ||
183 | |||
194 | STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable) | 184 | STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable) |
185 | KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800) | ||
195 | 186 | ||
196 | MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer) | 187 | MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer) |
197 | MASKABLE_EXCEPTION_HV(0x980, 0x980, decrementer) | 188 | MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer) |
198 | 189 | ||
199 | STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a) | 190 | STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a) |
191 | KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00) | ||
192 | |||
200 | STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b) | 193 | STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b) |
194 | KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xb00) | ||
201 | 195 | ||
202 | . = 0xc00 | 196 | . = 0xc00 |
203 | .globl system_call_pSeries | 197 | .globl system_call_pSeries |
204 | system_call_pSeries: | 198 | system_call_pSeries: |
205 | HMT_MEDIUM | 199 | HMT_MEDIUM |
206 | DO_KVM 0xc00 | 200 | #ifdef CONFIG_KVM_BOOK3S_64_HANDLER |
201 | SET_SCRATCH0(r13) | ||
202 | GET_PACA(r13) | ||
203 | std r9,PACA_EXGEN+EX_R9(r13) | ||
204 | std r10,PACA_EXGEN+EX_R10(r13) | ||
205 | mfcr r9 | ||
206 | KVMTEST(0xc00) | ||
207 | GET_SCRATCH0(r13) | ||
208 | #endif | ||
207 | BEGIN_FTR_SECTION | 209 | BEGIN_FTR_SECTION |
208 | cmpdi r0,0x1ebe | 210 | cmpdi r0,0x1ebe |
209 | beq- 1f | 211 | beq- 1f |
@@ -220,6 +222,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) | |||
220 | rfid | 222 | rfid |
221 | b . /* prevent speculative execution */ | 223 | b . /* prevent speculative execution */ |
222 | 224 | ||
225 | KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00) | ||
226 | |||
223 | /* Fast LE/BE switch system call */ | 227 | /* Fast LE/BE switch system call */ |
224 | 1: mfspr r12,SPRN_SRR1 | 228 | 1: mfspr r12,SPRN_SRR1 |
225 | xori r12,r12,MSR_LE | 229 | xori r12,r12,MSR_LE |
@@ -228,6 +232,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) | |||
228 | b . | 232 | b . |
229 | 233 | ||
230 | STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step) | 234 | STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step) |
235 | KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xd00) | ||
231 | 236 | ||
232 | /* At 0xe??? we have a bunch of hypervisor exceptions, we branch | 237 | /* At 0xe??? we have a bunch of hypervisor exceptions, we branch |
233 | * out of line to handle them | 238 | * out of line to handle them |
@@ -262,30 +267,93 @@ vsx_unavailable_pSeries_1: | |||
262 | 267 | ||
263 | #ifdef CONFIG_CBE_RAS | 268 | #ifdef CONFIG_CBE_RAS |
264 | STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error) | 269 | STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error) |
270 | KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1202) | ||
265 | #endif /* CONFIG_CBE_RAS */ | 271 | #endif /* CONFIG_CBE_RAS */ |
272 | |||
266 | STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint) | 273 | STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint) |
274 | KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x1300) | ||
275 | |||
267 | #ifdef CONFIG_CBE_RAS | 276 | #ifdef CONFIG_CBE_RAS |
268 | STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance) | 277 | STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance) |
278 | KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1602) | ||
269 | #endif /* CONFIG_CBE_RAS */ | 279 | #endif /* CONFIG_CBE_RAS */ |
280 | |||
270 | STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist) | 281 | STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist) |
282 | KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x1700) | ||
283 | |||
271 | #ifdef CONFIG_CBE_RAS | 284 | #ifdef CONFIG_CBE_RAS |
272 | STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal) | 285 | STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal) |
286 | KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1802) | ||
273 | #endif /* CONFIG_CBE_RAS */ | 287 | #endif /* CONFIG_CBE_RAS */ |
274 | 288 | ||
275 | . = 0x3000 | 289 | . = 0x3000 |
276 | 290 | ||
277 | /*** Out of line interrupts support ***/ | 291 | /*** Out of line interrupts support ***/ |
278 | 292 | ||
293 | /* moved from 0x200 */ | ||
294 | machine_check_pSeries: | ||
295 | .globl machine_check_fwnmi | ||
296 | machine_check_fwnmi: | ||
297 | HMT_MEDIUM | ||
298 | SET_SCRATCH0(r13) /* save r13 */ | ||
299 | EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, | ||
300 | EXC_STD, KVMTEST, 0x200) | ||
301 | KVM_HANDLER_SKIP(PACA_EXMC, EXC_STD, 0x200) | ||
302 | |||
303 | #ifndef CONFIG_POWER4_ONLY | ||
304 | /* moved from 0x300 */ | ||
305 | data_access_check_stab: | ||
306 | GET_PACA(r13) | ||
307 | std r9,PACA_EXSLB+EX_R9(r13) | ||
308 | std r10,PACA_EXSLB+EX_R10(r13) | ||
309 | mfspr r10,SPRN_DAR | ||
310 | mfspr r9,SPRN_DSISR | ||
311 | srdi r10,r10,60 | ||
312 | rlwimi r10,r9,16,0x20 | ||
313 | #ifdef CONFIG_KVM_BOOK3S_PR | ||
314 | lbz r9,HSTATE_IN_GUEST(r13) | ||
315 | rlwimi r10,r9,8,0x300 | ||
316 | #endif | ||
317 | mfcr r9 | ||
318 | cmpwi r10,0x2c | ||
319 | beq do_stab_bolted_pSeries | ||
320 | mtcrf 0x80,r9 | ||
321 | ld r9,PACA_EXSLB+EX_R9(r13) | ||
322 | ld r10,PACA_EXSLB+EX_R10(r13) | ||
323 | b data_access_not_stab | ||
324 | do_stab_bolted_pSeries: | ||
325 | std r11,PACA_EXSLB+EX_R11(r13) | ||
326 | std r12,PACA_EXSLB+EX_R12(r13) | ||
327 | GET_SCRATCH0(r10) | ||
328 | std r10,PACA_EXSLB+EX_R13(r13) | ||
329 | EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD) | ||
330 | #endif /* CONFIG_POWER4_ONLY */ | ||
331 | |||
332 | KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300) | ||
333 | KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380) | ||
334 | KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400) | ||
335 | KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480) | ||
336 | KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900) | ||
337 | KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982) | ||
338 | |||
339 | .align 7 | ||
279 | /* moved from 0xe00 */ | 340 | /* moved from 0xe00 */ |
280 | STD_EXCEPTION_HV(., 0xe00, h_data_storage) | 341 | STD_EXCEPTION_HV(., 0xe02, h_data_storage) |
281 | STD_EXCEPTION_HV(., 0xe20, h_instr_storage) | 342 | KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0xe02) |
282 | STD_EXCEPTION_HV(., 0xe40, emulation_assist) | 343 | STD_EXCEPTION_HV(., 0xe22, h_instr_storage) |
283 | STD_EXCEPTION_HV(., 0xe60, hmi_exception) /* need to flush cache ? */ | 344 | KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe22) |
345 | STD_EXCEPTION_HV(., 0xe42, emulation_assist) | ||
346 | KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe42) | ||
347 | STD_EXCEPTION_HV(., 0xe62, hmi_exception) /* need to flush cache ? */ | ||
348 | KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62) | ||
284 | 349 | ||
285 | /* moved from 0xf00 */ | 350 | /* moved from 0xf00 */ |
286 | STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor) | 351 | STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor) |
352 | KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf00) | ||
287 | STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable) | 353 | STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable) |
354 | KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20) | ||
288 | STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable) | 355 | STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable) |
356 | KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40) | ||
289 | 357 | ||
290 | /* | 358 | /* |
291 | * An interrupt came in while soft-disabled; clear EE in SRR1, | 359 | * An interrupt came in while soft-disabled; clear EE in SRR1, |
@@ -317,14 +385,6 @@ masked_Hinterrupt: | |||
317 | hrfid | 385 | hrfid |
318 | b . | 386 | b . |
319 | 387 | ||
320 | .align 7 | ||
321 | do_stab_bolted_pSeries: | ||
322 | std r11,PACA_EXSLB+EX_R11(r13) | ||
323 | std r12,PACA_EXSLB+EX_R12(r13) | ||
324 | GET_SCRATCH0(r10) | ||
325 | std r10,PACA_EXSLB+EX_R13(r13) | ||
326 | EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD) | ||
327 | |||
328 | #ifdef CONFIG_PPC_PSERIES | 388 | #ifdef CONFIG_PPC_PSERIES |
329 | /* | 389 | /* |
330 | * Vectors for the FWNMI option. Share common code. | 390 | * Vectors for the FWNMI option. Share common code. |
@@ -334,14 +394,8 @@ do_stab_bolted_pSeries: | |||
334 | system_reset_fwnmi: | 394 | system_reset_fwnmi: |
335 | HMT_MEDIUM | 395 | HMT_MEDIUM |
336 | SET_SCRATCH0(r13) /* save r13 */ | 396 | SET_SCRATCH0(r13) /* save r13 */ |
337 | EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD) | 397 | EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD, |
338 | 398 | NOTEST, 0x100) | |
339 | .globl machine_check_fwnmi | ||
340 | .align 7 | ||
341 | machine_check_fwnmi: | ||
342 | HMT_MEDIUM | ||
343 | SET_SCRATCH0(r13) /* save r13 */ | ||
344 | EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, EXC_STD) | ||
345 | 399 | ||
346 | #endif /* CONFIG_PPC_PSERIES */ | 400 | #endif /* CONFIG_PPC_PSERIES */ |
347 | 401 | ||
@@ -376,7 +430,11 @@ slb_miss_user_pseries: | |||
376 | /* KVM's trampoline code needs to be close to the interrupt handlers */ | 430 | /* KVM's trampoline code needs to be close to the interrupt handlers */ |
377 | 431 | ||
378 | #ifdef CONFIG_KVM_BOOK3S_64_HANDLER | 432 | #ifdef CONFIG_KVM_BOOK3S_64_HANDLER |
433 | #ifdef CONFIG_KVM_BOOK3S_PR | ||
379 | #include "../kvm/book3s_rmhandlers.S" | 434 | #include "../kvm/book3s_rmhandlers.S" |
435 | #else | ||
436 | #include "../kvm/book3s_hv_rmhandlers.S" | ||
437 | #endif | ||
380 | #endif | 438 | #endif |
381 | 439 | ||
382 | .align 7 | 440 | .align 7 |
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 5ecf54cfa7d4..fe37dd0dfd17 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S | |||
@@ -656,7 +656,7 @@ load_up_spe: | |||
656 | cmpi 0,r4,0 | 656 | cmpi 0,r4,0 |
657 | beq 1f | 657 | beq 1f |
658 | addi r4,r4,THREAD /* want THREAD of last_task_used_spe */ | 658 | addi r4,r4,THREAD /* want THREAD of last_task_used_spe */ |
659 | SAVE_32EVRS(0,r10,r4) | 659 | SAVE_32EVRS(0,r10,r4,THREAD_EVR0) |
660 | evxor evr10, evr10, evr10 /* clear out evr10 */ | 660 | evxor evr10, evr10, evr10 /* clear out evr10 */ |
661 | evmwumiaa evr10, evr10, evr10 /* evr10 <- ACC = 0 * 0 + ACC */ | 661 | evmwumiaa evr10, evr10, evr10 /* evr10 <- ACC = 0 * 0 + ACC */ |
662 | li r5,THREAD_ACC | 662 | li r5,THREAD_ACC |
@@ -676,7 +676,7 @@ load_up_spe: | |||
676 | stw r4,THREAD_USED_SPE(r5) | 676 | stw r4,THREAD_USED_SPE(r5) |
677 | evlddx evr4,r10,r5 | 677 | evlddx evr4,r10,r5 |
678 | evmra evr4,evr4 | 678 | evmra evr4,evr4 |
679 | REST_32EVRS(0,r10,r5) | 679 | REST_32EVRS(0,r10,r5,THREAD_EVR0) |
680 | #ifndef CONFIG_SMP | 680 | #ifndef CONFIG_SMP |
681 | subi r4,r5,THREAD | 681 | subi r4,r5,THREAD |
682 | stw r4,last_task_used_spe@l(r3) | 682 | stw r4,last_task_used_spe@l(r3) |
@@ -787,13 +787,11 @@ _GLOBAL(giveup_spe) | |||
787 | addi r3,r3,THREAD /* want THREAD of task */ | 787 | addi r3,r3,THREAD /* want THREAD of task */ |
788 | lwz r5,PT_REGS(r3) | 788 | lwz r5,PT_REGS(r3) |
789 | cmpi 0,r5,0 | 789 | cmpi 0,r5,0 |
790 | SAVE_32EVRS(0, r4, r3) | 790 | SAVE_32EVRS(0, r4, r3, THREAD_EVR0) |
791 | evxor evr6, evr6, evr6 /* clear out evr6 */ | 791 | evxor evr6, evr6, evr6 /* clear out evr6 */ |
792 | evmwumiaa evr6, evr6, evr6 /* evr6 <- ACC = 0 * 0 + ACC */ | 792 | evmwumiaa evr6, evr6, evr6 /* evr6 <- ACC = 0 * 0 + ACC */ |
793 | li r4,THREAD_ACC | 793 | li r4,THREAD_ACC |
794 | evstddx evr6, r4, r3 /* save off accumulator */ | 794 | evstddx evr6, r4, r3 /* save off accumulator */ |
795 | mfspr r6,SPRN_SPEFSCR | ||
796 | stw r6,THREAD_SPEFSCR(r3) /* save spefscr register value */ | ||
797 | beq 1f | 795 | beq 1f |
798 | lwz r4,_MSR-STACK_FRAME_OVERHEAD(r5) | 796 | lwz r4,_MSR-STACK_FRAME_OVERHEAD(r5) |
799 | lis r3,MSR_SPE@h | 797 | lis r3,MSR_SPE@h |
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index f8f0bc7f1d4f..3a70845a51c7 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S | |||
@@ -73,7 +73,6 @@ _GLOBAL(power7_idle) | |||
73 | b . | 73 | b . |
74 | 74 | ||
75 | _GLOBAL(power7_wakeup_loss) | 75 | _GLOBAL(power7_wakeup_loss) |
76 | GET_PACA(r13) | ||
77 | ld r1,PACAR1(r13) | 76 | ld r1,PACAR1(r13) |
78 | REST_NVGPRS(r1) | 77 | REST_NVGPRS(r1) |
79 | REST_GPR(2, r1) | 78 | REST_GPR(2, r1) |
@@ -87,7 +86,6 @@ _GLOBAL(power7_wakeup_loss) | |||
87 | rfid | 86 | rfid |
88 | 87 | ||
89 | _GLOBAL(power7_wakeup_noloss) | 88 | _GLOBAL(power7_wakeup_noloss) |
90 | GET_PACA(r13) | ||
91 | ld r1,PACAR1(r13) | 89 | ld r1,PACAR1(r13) |
92 | ld r4,_MSR(r1) | 90 | ld r4,_MSR(r1) |
93 | ld r5,_NIP(r1) | 91 | ld r5,_NIP(r1) |
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index efeb88184182..0a5a899846bb 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c | |||
@@ -167,7 +167,7 @@ void setup_paca(struct paca_struct *new_paca) | |||
167 | * if we do a GET_PACA() before the feature fixups have been | 167 | * if we do a GET_PACA() before the feature fixups have been |
168 | * applied | 168 | * applied |
169 | */ | 169 | */ |
170 | if (cpu_has_feature(CPU_FTR_HVMODE_206)) | 170 | if (cpu_has_feature(CPU_FTR_HVMODE)) |
171 | mtspr(SPRN_SPRG_HPACA, local_paca); | 171 | mtspr(SPRN_SPRG_HPACA, local_paca); |
172 | #endif | 172 | #endif |
173 | mtspr(SPRN_SPRG_PACA, local_paca); | 173 | mtspr(SPRN_SPRG_PACA, local_paca); |
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 91e52df3d81d..ec2d0edeb134 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c | |||
@@ -96,6 +96,7 @@ void flush_fp_to_thread(struct task_struct *tsk) | |||
96 | preempt_enable(); | 96 | preempt_enable(); |
97 | } | 97 | } |
98 | } | 98 | } |
99 | EXPORT_SYMBOL_GPL(flush_fp_to_thread); | ||
99 | 100 | ||
100 | void enable_kernel_fp(void) | 101 | void enable_kernel_fp(void) |
101 | { | 102 | { |
@@ -145,6 +146,7 @@ void flush_altivec_to_thread(struct task_struct *tsk) | |||
145 | preempt_enable(); | 146 | preempt_enable(); |
146 | } | 147 | } |
147 | } | 148 | } |
149 | EXPORT_SYMBOL_GPL(flush_altivec_to_thread); | ||
148 | #endif /* CONFIG_ALTIVEC */ | 150 | #endif /* CONFIG_ALTIVEC */ |
149 | 151 | ||
150 | #ifdef CONFIG_VSX | 152 | #ifdef CONFIG_VSX |
@@ -186,6 +188,7 @@ void flush_vsx_to_thread(struct task_struct *tsk) | |||
186 | preempt_enable(); | 188 | preempt_enable(); |
187 | } | 189 | } |
188 | } | 190 | } |
191 | EXPORT_SYMBOL_GPL(flush_vsx_to_thread); | ||
189 | #endif /* CONFIG_VSX */ | 192 | #endif /* CONFIG_VSX */ |
190 | 193 | ||
191 | #ifdef CONFIG_SPE | 194 | #ifdef CONFIG_SPE |
@@ -213,6 +216,7 @@ void flush_spe_to_thread(struct task_struct *tsk) | |||
213 | #ifdef CONFIG_SMP | 216 | #ifdef CONFIG_SMP |
214 | BUG_ON(tsk != current); | 217 | BUG_ON(tsk != current); |
215 | #endif | 218 | #endif |
219 | tsk->thread.spefscr = mfspr(SPRN_SPEFSCR); | ||
216 | giveup_spe(tsk); | 220 | giveup_spe(tsk); |
217 | } | 221 | } |
218 | preempt_enable(); | 222 | preempt_enable(); |
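The EXPORT_SYMBOL_GPL() additions in process.c (and the smp_send_reschedule export further down) are what let these helpers be called from code built as a module, such as kvm.ko. The export pattern itself is the standard one; a generic sketch of a GPL module consuming an exported symbol (example_flush_state is hypothetical, not a kernel API):

	/* Sketch of a GPL-licensed module calling a helper that built-in
	 * code has published with EXPORT_SYMBOL_GPL(). */
	#include <linux/module.h>
	#include <linux/init.h>

	extern void example_flush_state(void);	/* exported by built-in code */

	static int __init demo_init(void)
	{
		example_flush_state();
		return 0;
	}

	static void __exit demo_exit(void)
	{
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");	/* required: the symbol is GPL-only */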
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 79fca2651b65..22051ef04bd9 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c | |||
@@ -375,6 +375,9 @@ void __init check_for_initrd(void) | |||
375 | 375 | ||
376 | int threads_per_core, threads_shift; | 376 | int threads_per_core, threads_shift; |
377 | cpumask_t threads_core_mask; | 377 | cpumask_t threads_core_mask; |
378 | EXPORT_SYMBOL_GPL(threads_per_core); | ||
379 | EXPORT_SYMBOL_GPL(threads_shift); | ||
380 | EXPORT_SYMBOL_GPL(threads_core_mask); | ||
378 | 381 | ||
379 | static void __init cpu_init_thread_core_maps(int tpc) | 382 | static void __init cpu_init_thread_core_maps(int tpc) |
380 | { | 383 | { |
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index a88bf2713d41..532054f24ecb 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c | |||
@@ -63,6 +63,7 @@ | |||
63 | #include <asm/kexec.h> | 63 | #include <asm/kexec.h> |
64 | #include <asm/mmu_context.h> | 64 | #include <asm/mmu_context.h> |
65 | #include <asm/code-patching.h> | 65 | #include <asm/code-patching.h> |
66 | #include <asm/kvm_ppc.h> | ||
66 | 67 | ||
67 | #include "setup.h" | 68 | #include "setup.h" |
68 | 69 | ||
@@ -580,6 +581,8 @@ void __init setup_arch(char **cmdline_p) | |||
580 | /* Initialize the MMU context management stuff */ | 581 | /* Initialize the MMU context management stuff */ |
581 | mmu_context_init(); | 582 | mmu_context_init(); |
582 | 583 | ||
584 | kvm_rma_init(); | ||
585 | |||
583 | ppc64_boot_msg(0x15, "Setup Done"); | 586 | ppc64_boot_msg(0x15, "Setup Done"); |
584 | } | 587 | } |
585 | 588 | ||
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 8ebc6700b98d..09a85a9045d6 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c | |||
@@ -243,6 +243,7 @@ void smp_send_reschedule(int cpu) | |||
243 | if (likely(smp_ops)) | 243 | if (likely(smp_ops)) |
244 | smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE); | 244 | smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE); |
245 | } | 245 | } |
246 | EXPORT_SYMBOL_GPL(smp_send_reschedule); | ||
246 | 247 | ||
247 | void arch_send_call_function_single_ipi(int cpu) | 248 | void arch_send_call_function_single_ipi(int cpu) |
248 | { | 249 | { |
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 1a0141426cda..f19d9777d3c1 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c | |||
@@ -1387,10 +1387,7 @@ void SPEFloatingPointException(struct pt_regs *regs) | |||
1387 | int code = 0; | 1387 | int code = 0; |
1388 | int err; | 1388 | int err; |
1389 | 1389 | ||
1390 | preempt_disable(); | 1390 | flush_spe_to_thread(current); |
1391 | if (regs->msr & MSR_SPE) | ||
1392 | giveup_spe(current); | ||
1393 | preempt_enable(); | ||
1394 | 1391 | ||
1395 | spefscr = current->thread.spefscr; | 1392 | spefscr = current->thread.spefscr; |
1396 | fpexc_mode = current->thread.fpexc_mode; | 1393 | fpexc_mode = current->thread.fpexc_mode; |
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 5f3cff83e089..33aa715dab28 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c | |||
@@ -387,8 +387,10 @@ static void kvmppc_44x_invalidate(struct kvm_vcpu *vcpu, | |||
387 | } | 387 | } |
388 | } | 388 | } |
389 | 389 | ||
390 | void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode) | 390 | void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) |
391 | { | 391 | { |
392 | int usermode = vcpu->arch.shared->msr & MSR_PR; | ||
393 | |||
392 | vcpu->arch.shadow_pid = !usermode; | 394 | vcpu->arch.shadow_pid = !usermode; |
393 | } | 395 | } |
394 | 396 | ||
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 105b6918b23e..78133deb4b64 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig | |||
@@ -20,7 +20,6 @@ config KVM | |||
20 | bool | 20 | bool |
21 | select PREEMPT_NOTIFIERS | 21 | select PREEMPT_NOTIFIERS |
22 | select ANON_INODES | 22 | select ANON_INODES |
23 | select KVM_MMIO | ||
24 | 23 | ||
25 | config KVM_BOOK3S_HANDLER | 24 | config KVM_BOOK3S_HANDLER |
26 | bool | 25 | bool |
@@ -28,16 +27,22 @@ config KVM_BOOK3S_HANDLER | |||
28 | config KVM_BOOK3S_32_HANDLER | 27 | config KVM_BOOK3S_32_HANDLER |
29 | bool | 28 | bool |
30 | select KVM_BOOK3S_HANDLER | 29 | select KVM_BOOK3S_HANDLER |
30 | select KVM_MMIO | ||
31 | 31 | ||
32 | config KVM_BOOK3S_64_HANDLER | 32 | config KVM_BOOK3S_64_HANDLER |
33 | bool | 33 | bool |
34 | select KVM_BOOK3S_HANDLER | 34 | select KVM_BOOK3S_HANDLER |
35 | 35 | ||
36 | config KVM_BOOK3S_PR | ||
37 | bool | ||
38 | select KVM_MMIO | ||
39 | |||
36 | config KVM_BOOK3S_32 | 40 | config KVM_BOOK3S_32 |
37 | tristate "KVM support for PowerPC book3s_32 processors" | 41 | tristate "KVM support for PowerPC book3s_32 processors" |
38 | depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT | 42 | depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT |
39 | select KVM | 43 | select KVM |
40 | select KVM_BOOK3S_32_HANDLER | 44 | select KVM_BOOK3S_32_HANDLER |
45 | select KVM_BOOK3S_PR | ||
41 | ---help--- | 46 | ---help--- |
42 | Support running unmodified book3s_32 guest kernels | 47 | Support running unmodified book3s_32 guest kernels |
43 | in virtual machines on book3s_32 host processors. | 48 | in virtual machines on book3s_32 host processors. |
@@ -50,8 +55,8 @@ config KVM_BOOK3S_32 | |||
50 | config KVM_BOOK3S_64 | 55 | config KVM_BOOK3S_64 |
51 | tristate "KVM support for PowerPC book3s_64 processors" | 56 | tristate "KVM support for PowerPC book3s_64 processors" |
52 | depends on EXPERIMENTAL && PPC_BOOK3S_64 | 57 | depends on EXPERIMENTAL && PPC_BOOK3S_64 |
53 | select KVM | ||
54 | select KVM_BOOK3S_64_HANDLER | 58 | select KVM_BOOK3S_64_HANDLER |
59 | select KVM | ||
55 | ---help--- | 60 | ---help--- |
56 | Support running unmodified book3s_64 and book3s_32 guest kernels | 61 | Support running unmodified book3s_64 and book3s_32 guest kernels |
57 | in virtual machines on book3s_64 host processors. | 62 | in virtual machines on book3s_64 host processors. |
@@ -61,10 +66,34 @@ config KVM_BOOK3S_64 | |||
61 | 66 | ||
62 | If unsure, say N. | 67 | If unsure, say N. |
63 | 68 | ||
69 | config KVM_BOOK3S_64_HV | ||
70 | bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" | ||
71 | depends on KVM_BOOK3S_64 | ||
72 | ---help--- | ||
73 | Support running unmodified book3s_64 guest kernels in | ||
74 | virtual machines on POWER7 and PPC970 processors that have | ||
75 | hypervisor mode available to the host. | ||
76 | |||
77 | If you say Y here, KVM will use the hardware virtualization | ||
78 | facilities of POWER7 (and later) processors, meaning that | ||
79 | guest operating systems will run at full hardware speed | ||
80 | using supervisor and user modes. However, this also means | ||
81 | that KVM is not usable under PowerVM (pHyp), is only usable | ||
82 | on POWER7 (or later) processors and PPC970-family processors, | ||
83 | and cannot emulate a different processor from the host processor. | ||
84 | |||
85 | If unsure, say N. | ||
86 | |||
87 | config KVM_BOOK3S_64_PR | ||
88 | def_bool y | ||
89 | depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV | ||
90 | select KVM_BOOK3S_PR | ||
91 | |||
64 | config KVM_440 | 92 | config KVM_440 |
65 | bool "KVM support for PowerPC 440 processors" | 93 | bool "KVM support for PowerPC 440 processors" |
66 | depends on EXPERIMENTAL && 44x | 94 | depends on EXPERIMENTAL && 44x |
67 | select KVM | 95 | select KVM |
96 | select KVM_MMIO | ||
68 | ---help--- | 97 | ---help--- |
69 | Support running unmodified 440 guest kernels in virtual machines on | 98 | Support running unmodified 440 guest kernels in virtual machines on |
70 | 440 host processors. | 99 | 440 host processors. |
@@ -89,6 +118,7 @@ config KVM_E500 | |||
89 | bool "KVM support for PowerPC E500 processors" | 118 | bool "KVM support for PowerPC E500 processors" |
90 | depends on EXPERIMENTAL && E500 | 119 | depends on EXPERIMENTAL && E500 |
91 | select KVM | 120 | select KVM |
121 | select KVM_MMIO | ||
92 | ---help--- | 122 | ---help--- |
93 | Support running unmodified E500 guest kernels in virtual machines on | 123 | Support running unmodified E500 guest kernels in virtual machines on |
94 | E500 host processors. | 124 | E500 host processors. |
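The Kconfig hunk above introduces a mutually exclusive choice for 64-bit Book3S hosts: KVM_BOOK3S_64_HV uses the hypervisor facilities of POWER7 and PPC970 processors, while KVM_BOOK3S_64_PR, which defaults to y whenever HV is not selected, keeps the existing problem-state flavour and pulls in KVM_MMIO via KVM_BOOK3S_PR. In C sources the split typically shows up as CONFIG_* conditionals; a schematic, compile-anywhere sketch (names and strings are illustrative only):

	/* Schematic only: shows how a source file might pick between the two
	 * flavours introduced by the Kconfig hunk above. */
	#include <stdio.h>

	#ifdef CONFIG_KVM_BOOK3S_64_HV
	#define KVM_FLAVOUR "HV (hypervisor mode, POWER7/PPC970 hosts)"
	#else
	#define KVM_FLAVOUR "PR (problem state)"
	#endif

	int main(void)
	{
		printf("book3s_64 KVM flavour: %s\n", KVM_FLAVOUR);
		return 0;
	}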
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 4d6863823f69..08428e2c188d 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile | |||
@@ -38,24 +38,42 @@ kvm-e500-objs := \ | |||
38 | e500_emulate.o | 38 | e500_emulate.o |
39 | kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs) | 39 | kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs) |
40 | 40 | ||
41 | kvm-book3s_64-objs := \ | 41 | kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ |
42 | $(common-objs-y) \ | 42 | ../../../virt/kvm/coalesced_mmio.o \ |
43 | fpu.o \ | 43 | fpu.o \ |
44 | book3s_paired_singles.o \ | 44 | book3s_paired_singles.o \ |
45 | book3s.o \ | 45 | book3s_pr.o \ |
46 | book3s_emulate.o \ | 46 | book3s_emulate.o \ |
47 | book3s_interrupts.o \ | 47 | book3s_interrupts.o \ |
48 | book3s_mmu_hpte.o \ | 48 | book3s_mmu_hpte.o \ |
49 | book3s_64_mmu_host.o \ | 49 | book3s_64_mmu_host.o \ |
50 | book3s_64_mmu.o \ | 50 | book3s_64_mmu.o \ |
51 | book3s_32_mmu.o | 51 | book3s_32_mmu.o |
52 | kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs) | 52 | |
53 | kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ | ||
54 | book3s_hv.o \ | ||
55 | book3s_hv_interrupts.o \ | ||
56 | book3s_64_mmu_hv.o | ||
57 | kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ | ||
58 | book3s_hv_rm_mmu.o \ | ||
59 | book3s_64_vio_hv.o \ | ||
60 | book3s_hv_builtin.o | ||
61 | |||
62 | kvm-book3s_64-module-objs := \ | ||
63 | ../../../virt/kvm/kvm_main.o \ | ||
64 | powerpc.o \ | ||
65 | emulate.o \ | ||
66 | book3s.o \ | ||
67 | $(kvm-book3s_64-objs-y) | ||
68 | |||
69 | kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs) | ||
53 | 70 | ||
54 | kvm-book3s_32-objs := \ | 71 | kvm-book3s_32-objs := \ |
55 | $(common-objs-y) \ | 72 | $(common-objs-y) \ |
56 | fpu.o \ | 73 | fpu.o \ |
57 | book3s_paired_singles.o \ | 74 | book3s_paired_singles.o \ |
58 | book3s.o \ | 75 | book3s.o \ |
76 | book3s_pr.o \ | ||
59 | book3s_emulate.o \ | 77 | book3s_emulate.o \ |
60 | book3s_interrupts.o \ | 78 | book3s_interrupts.o \ |
61 | book3s_mmu_hpte.o \ | 79 | book3s_mmu_hpte.o \ |
@@ -70,3 +88,4 @@ obj-$(CONFIG_KVM_E500) += kvm.o | |||
70 | obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o | 88 | obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o |
71 | obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o | 89 | obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o |
72 | 90 | ||
91 | obj-y += $(kvm-book3s_64-builtin-objs-y) | ||
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 0f95b5cce033..f68a34d16035 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/kvm_host.h> | 17 | #include <linux/kvm_host.h> |
18 | #include <linux/err.h> | 18 | #include <linux/err.h> |
19 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
20 | #include "trace.h" | ||
21 | 20 | ||
22 | #include <asm/reg.h> | 21 | #include <asm/reg.h> |
23 | #include <asm/cputable.h> | 22 | #include <asm/cputable.h> |
@@ -28,25 +27,17 @@ | |||
28 | #include <asm/kvm_ppc.h> | 27 | #include <asm/kvm_ppc.h> |
29 | #include <asm/kvm_book3s.h> | 28 | #include <asm/kvm_book3s.h> |
30 | #include <asm/mmu_context.h> | 29 | #include <asm/mmu_context.h> |
30 | #include <asm/page.h> | ||
31 | #include <linux/gfp.h> | 31 | #include <linux/gfp.h> |
32 | #include <linux/sched.h> | 32 | #include <linux/sched.h> |
33 | #include <linux/vmalloc.h> | 33 | #include <linux/vmalloc.h> |
34 | #include <linux/highmem.h> | 34 | #include <linux/highmem.h> |
35 | 35 | ||
36 | #include "trace.h" | ||
37 | |||
36 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU | 38 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU |
37 | 39 | ||
38 | /* #define EXIT_DEBUG */ | 40 | /* #define EXIT_DEBUG */ |
39 | /* #define DEBUG_EXT */ | ||
40 | |||
41 | static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, | ||
42 | ulong msr); | ||
43 | |||
44 | /* Some compatibility defines */ | ||
45 | #ifdef CONFIG_PPC_BOOK3S_32 | ||
46 | #define MSR_USER32 MSR_USER | ||
47 | #define MSR_USER64 MSR_USER | ||
48 | #define HW_PAGE_SIZE PAGE_SIZE | ||
49 | #endif | ||
50 | 41 | ||
51 | struct kvm_stats_debugfs_item debugfs_entries[] = { | 42 | struct kvm_stats_debugfs_item debugfs_entries[] = { |
52 | { "exits", VCPU_STAT(sum_exits) }, | 43 | { "exits", VCPU_STAT(sum_exits) }, |
@@ -77,100 +68,11 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) | |||
77 | { | 68 | { |
78 | } | 69 | } |
79 | 70 | ||
80 | void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | ||
81 | { | ||
82 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
83 | memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb)); | ||
84 | memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu, | ||
85 | sizeof(get_paca()->shadow_vcpu)); | ||
86 | to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max; | ||
87 | #endif | ||
88 | |||
89 | #ifdef CONFIG_PPC_BOOK3S_32 | ||
90 | current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; | ||
91 | #endif | ||
92 | } | ||
93 | |||
94 | void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) | ||
95 | { | ||
96 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
97 | memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb)); | ||
98 | memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, | ||
99 | sizeof(get_paca()->shadow_vcpu)); | ||
100 | to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max; | ||
101 | #endif | ||
102 | |||
103 | kvmppc_giveup_ext(vcpu, MSR_FP); | ||
104 | kvmppc_giveup_ext(vcpu, MSR_VEC); | ||
105 | kvmppc_giveup_ext(vcpu, MSR_VSX); | ||
106 | } | ||
107 | |||
108 | static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) | ||
109 | { | ||
110 | ulong smsr = vcpu->arch.shared->msr; | ||
111 | |||
112 | /* Guest MSR values */ | ||
113 | smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE; | ||
114 | /* Process MSR values */ | ||
115 | smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; | ||
116 | /* External providers the guest reserved */ | ||
117 | smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext); | ||
118 | /* 64-bit Process MSR values */ | ||
119 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
120 | smsr |= MSR_ISF | MSR_HV; | ||
121 | #endif | ||
122 | vcpu->arch.shadow_msr = smsr; | ||
123 | } | ||
124 | |||
125 | void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) | ||
126 | { | ||
127 | ulong old_msr = vcpu->arch.shared->msr; | ||
128 | |||
129 | #ifdef EXIT_DEBUG | ||
130 | printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); | ||
131 | #endif | ||
132 | |||
133 | msr &= to_book3s(vcpu)->msr_mask; | ||
134 | vcpu->arch.shared->msr = msr; | ||
135 | kvmppc_recalc_shadow_msr(vcpu); | ||
136 | |||
137 | if (msr & MSR_POW) { | ||
138 | if (!vcpu->arch.pending_exceptions) { | ||
139 | kvm_vcpu_block(vcpu); | ||
140 | vcpu->stat.halt_wakeup++; | ||
141 | |||
142 | /* Unset POW bit after we woke up */ | ||
143 | msr &= ~MSR_POW; | ||
144 | vcpu->arch.shared->msr = msr; | ||
145 | } | ||
146 | } | ||
147 | |||
148 | if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) != | ||
149 | (old_msr & (MSR_PR|MSR_IR|MSR_DR))) { | ||
150 | kvmppc_mmu_flush_segments(vcpu); | ||
151 | kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); | ||
152 | |||
153 | /* Preload magic page segment when in kernel mode */ | ||
154 | if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) { | ||
155 | struct kvm_vcpu_arch *a = &vcpu->arch; | ||
156 | |||
157 | if (msr & MSR_DR) | ||
158 | kvmppc_mmu_map_segment(vcpu, a->magic_page_ea); | ||
159 | else | ||
160 | kvmppc_mmu_map_segment(vcpu, a->magic_page_pa); | ||
161 | } | ||
162 | } | ||
163 | |||
164 | /* Preload FPU if it's enabled */ | ||
165 | if (vcpu->arch.shared->msr & MSR_FP) | ||
166 | kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); | ||
167 | } | ||
168 | |||
169 | void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) | 71 | void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) |
170 | { | 72 | { |
171 | vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu); | 73 | vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu); |
172 | vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags; | 74 | vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags; |
173 | kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec); | 75 | kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec); |
174 | vcpu->arch.mmu.reset_msr(vcpu); | 76 | vcpu->arch.mmu.reset_msr(vcpu); |
175 | } | 77 | } |
176 | 78 | ||
@@ -204,11 +106,13 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec) | |||
204 | static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, | 106 | static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, |
205 | unsigned int vec) | 107 | unsigned int vec) |
206 | { | 108 | { |
109 | unsigned long old_pending = vcpu->arch.pending_exceptions; | ||
110 | |||
207 | clear_bit(kvmppc_book3s_vec2irqprio(vec), | 111 | clear_bit(kvmppc_book3s_vec2irqprio(vec), |
208 | &vcpu->arch.pending_exceptions); | 112 | &vcpu->arch.pending_exceptions); |
209 | 113 | ||
210 | if (!vcpu->arch.pending_exceptions) | 114 | kvmppc_update_int_pending(vcpu, vcpu->arch.pending_exceptions, |
211 | vcpu->arch.shared->int_pending = 0; | 115 | old_pending); |
212 | } | 116 | } |
213 | 117 | ||
214 | void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) | 118 | void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) |
@@ -225,8 +129,8 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) | |||
225 | 129 | ||
226 | void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) | 130 | void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) |
227 | { | 131 | { |
228 | to_book3s(vcpu)->prog_flags = flags; | 132 | /* might as well deliver this straight away */ |
229 | kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM); | 133 | kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags); |
230 | } | 134 | } |
231 | 135 | ||
232 | void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) | 136 | void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) |
@@ -266,21 +170,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) | |||
266 | { | 170 | { |
267 | int deliver = 1; | 171 | int deliver = 1; |
268 | int vec = 0; | 172 | int vec = 0; |
269 | ulong flags = 0ULL; | 173 | bool crit = kvmppc_critical_section(vcpu); |
270 | ulong crit_raw = vcpu->arch.shared->critical; | ||
271 | ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); | ||
272 | bool crit; | ||
273 | |||
274 | /* Truncate crit indicators in 32 bit mode */ | ||
275 | if (!(vcpu->arch.shared->msr & MSR_SF)) { | ||
276 | crit_raw &= 0xffffffff; | ||
277 | crit_r1 &= 0xffffffff; | ||
278 | } | ||
279 | |||
280 | /* Critical section when crit == r1 */ | ||
281 | crit = (crit_raw == crit_r1); | ||
282 | /* ... and we're in supervisor mode */ | ||
283 | crit = crit && !(vcpu->arch.shared->msr & MSR_PR); | ||
284 | 174 | ||
285 | switch (priority) { | 175 | switch (priority) { |
286 | case BOOK3S_IRQPRIO_DECREMENTER: | 176 | case BOOK3S_IRQPRIO_DECREMENTER: |
@@ -315,7 +205,6 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) | |||
315 | break; | 205 | break; |
316 | case BOOK3S_IRQPRIO_PROGRAM: | 206 | case BOOK3S_IRQPRIO_PROGRAM: |
317 | vec = BOOK3S_INTERRUPT_PROGRAM; | 207 | vec = BOOK3S_INTERRUPT_PROGRAM; |
318 | flags = to_book3s(vcpu)->prog_flags; | ||
319 | break; | 208 | break; |
320 | case BOOK3S_IRQPRIO_VSX: | 209 | case BOOK3S_IRQPRIO_VSX: |
321 | vec = BOOK3S_INTERRUPT_VSX; | 210 | vec = BOOK3S_INTERRUPT_VSX; |
@@ -346,7 +235,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) | |||
346 | #endif | 235 | #endif |
347 | 236 | ||
348 | if (deliver) | 237 | if (deliver) |
349 | kvmppc_inject_interrupt(vcpu, vec, flags); | 238 | kvmppc_inject_interrupt(vcpu, vec, 0); |
350 | 239 | ||
351 | return deliver; | 240 | return deliver; |
352 | } | 241 | } |
@@ -392,64 +281,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) | |||
392 | } | 281 | } |
393 | 282 | ||
394 | /* Tell the guest about our interrupt status */ | 283 | /* Tell the guest about our interrupt status */ |
395 | if (*pending) | 284 | kvmppc_update_int_pending(vcpu, *pending, old_pending); |
396 | vcpu->arch.shared->int_pending = 1; | ||
397 | else if (old_pending) | ||
398 | vcpu->arch.shared->int_pending = 0; | ||
399 | } | ||
400 | |||
401 | void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) | ||
402 | { | ||
403 | u32 host_pvr; | ||
404 | |||
405 | vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB; | ||
406 | vcpu->arch.pvr = pvr; | ||
407 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
408 | if ((pvr >= 0x330000) && (pvr < 0x70330000)) { | ||
409 | kvmppc_mmu_book3s_64_init(vcpu); | ||
410 | to_book3s(vcpu)->hior = 0xfff00000; | ||
411 | to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; | ||
412 | } else | ||
413 | #endif | ||
414 | { | ||
415 | kvmppc_mmu_book3s_32_init(vcpu); | ||
416 | to_book3s(vcpu)->hior = 0; | ||
417 | to_book3s(vcpu)->msr_mask = 0xffffffffULL; | ||
418 | } | ||
419 | |||
420 | /* If we are in hypervisor level on 970, we can tell the CPU to | ||
421 | * treat DCBZ as 32 bytes store */ | ||
422 | vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32; | ||
423 | if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) && | ||
424 | !strcmp(cur_cpu_spec->platform, "ppc970")) | ||
425 | vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; | ||
426 | |||
427 | /* Cell performs badly if MSR_FEx are set. So let's hope nobody | ||
428 | really needs them in a VM on Cell and force disable them. */ | ||
429 | if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be")) | ||
430 | to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1); | ||
431 | |||
432 | #ifdef CONFIG_PPC_BOOK3S_32 | ||
433 | /* 32 bit Book3S always has 32 byte dcbz */ | ||
434 | vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; | ||
435 | #endif | ||
436 | |||
437 | /* On some CPUs we can execute paired single operations natively */ | ||
438 | asm ( "mfpvr %0" : "=r"(host_pvr)); | ||
439 | switch (host_pvr) { | ||
440 | case 0x00080200: /* lonestar 2.0 */ | ||
441 | case 0x00088202: /* lonestar 2.2 */ | ||
442 | case 0x70000100: /* gekko 1.0 */ | ||
443 | case 0x00080100: /* gekko 2.0 */ | ||
444 | case 0x00083203: /* gekko 2.3a */ | ||
445 | case 0x00083213: /* gekko 2.3b */ | ||
446 | case 0x00083204: /* gekko 2.4 */ | ||
447 | case 0x00083214: /* gekko 2.4e (8SE) - retail HW2 */ | ||
448 | case 0x00087200: /* broadway */ | ||
449 | vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS; | ||
450 | /* Enable HID2.PSE - in case we need it later */ | ||
451 | mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29)); | ||
452 | } | ||
453 | } | 285 | } |
454 | 286 | ||
455 | pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) | 287 | pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) |
@@ -471,44 +303,6 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) | |||
471 | return gfn_to_pfn(vcpu->kvm, gfn); | 303 | return gfn_to_pfn(vcpu->kvm, gfn); |
472 | } | 304 | } |
473 | 305 | ||
474 | /* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To | ||
475 | * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to | ||
476 | * emulate 32 bytes dcbz length. | ||
477 | * | ||
478 | * The Book3s_64 inventors also realized this case and implemented a special bit | ||
479 | * in the HID5 register, which is a hypervisor ressource. Thus we can't use it. | ||
480 | * | ||
481 | * My approach here is to patch the dcbz instruction on executing pages. | ||
482 | */ | ||
483 | static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) | ||
484 | { | ||
485 | struct page *hpage; | ||
486 | u64 hpage_offset; | ||
487 | u32 *page; | ||
488 | int i; | ||
489 | |||
490 | hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); | ||
491 | if (is_error_page(hpage)) { | ||
492 | kvm_release_page_clean(hpage); | ||
493 | return; | ||
494 | } | ||
495 | |||
496 | hpage_offset = pte->raddr & ~PAGE_MASK; | ||
497 | hpage_offset &= ~0xFFFULL; | ||
498 | hpage_offset /= 4; | ||
499 | |||
500 | get_page(hpage); | ||
501 | page = kmap_atomic(hpage, KM_USER0); | ||
502 | |||
503 | /* patch dcbz into reserved instruction, so we trap */ | ||
504 | for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++) | ||
505 | if ((page[i] & 0xff0007ff) == INS_DCBZ) | ||
506 | page[i] &= 0xfffffff7; | ||
507 | |||
508 | kunmap_atomic(page, KM_USER0); | ||
509 | put_page(hpage); | ||
510 | } | ||
511 | |||
512 | static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data, | 306 | static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data, |
513 | struct kvmppc_pte *pte) | 307 | struct kvmppc_pte *pte) |
514 | { | 308 | { |
@@ -606,519 +400,6 @@ mmio: | |||
606 | return EMULATE_DO_MMIO; | 400 | return EMULATE_DO_MMIO; |
607 | } | 401 | } |
608 | 402 | ||
609 | static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
610 | { | ||
611 | ulong mp_pa = vcpu->arch.magic_page_pa; | ||
612 | |||
613 | if (unlikely(mp_pa) && | ||
614 | unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) { | ||
615 | return 1; | ||
616 | } | ||
617 | |||
618 | return kvm_is_visible_gfn(vcpu->kvm, gfn); | ||
619 | } | ||
620 | |||
621 | int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, | ||
622 | ulong eaddr, int vec) | ||
623 | { | ||
624 | bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE); | ||
625 | int r = RESUME_GUEST; | ||
626 | int relocated; | ||
627 | int page_found = 0; | ||
628 | struct kvmppc_pte pte; | ||
629 | bool is_mmio = false; | ||
630 | bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false; | ||
631 | bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false; | ||
632 | u64 vsid; | ||
633 | |||
634 | relocated = data ? dr : ir; | ||
635 | |||
636 | /* Resolve real address if translation turned on */ | ||
637 | if (relocated) { | ||
638 | page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data); | ||
639 | } else { | ||
640 | pte.may_execute = true; | ||
641 | pte.may_read = true; | ||
642 | pte.may_write = true; | ||
643 | pte.raddr = eaddr & KVM_PAM; | ||
644 | pte.eaddr = eaddr; | ||
645 | pte.vpage = eaddr >> 12; | ||
646 | } | ||
647 | |||
648 | switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { | ||
649 | case 0: | ||
650 | pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12)); | ||
651 | break; | ||
652 | case MSR_DR: | ||
653 | case MSR_IR: | ||
654 | vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); | ||
655 | |||
656 | if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR) | ||
657 | pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12)); | ||
658 | else | ||
659 | pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12)); | ||
660 | pte.vpage |= vsid; | ||
661 | |||
662 | if (vsid == -1) | ||
663 | page_found = -EINVAL; | ||
664 | break; | ||
665 | } | ||
666 | |||
667 | if (vcpu->arch.mmu.is_dcbz32(vcpu) && | ||
668 | (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { | ||
669 | /* | ||
670 | * If we do the dcbz hack, we have to NX on every execution, | ||
671 | * so we can patch the executing code. This renders our guest | ||
672 | * NX-less. | ||
673 | */ | ||
674 | pte.may_execute = !data; | ||
675 | } | ||
676 | |||
677 | if (page_found == -ENOENT) { | ||
678 | /* Page not found in guest PTE entries */ | ||
679 | vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); | ||
680 | vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; | ||
681 | vcpu->arch.shared->msr |= | ||
682 | (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); | ||
683 | kvmppc_book3s_queue_irqprio(vcpu, vec); | ||
684 | } else if (page_found == -EPERM) { | ||
685 | /* Storage protection */ | ||
686 | vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); | ||
687 | vcpu->arch.shared->dsisr = | ||
688 | to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE; | ||
689 | vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; | ||
690 | vcpu->arch.shared->msr |= | ||
691 | (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); | ||
692 | kvmppc_book3s_queue_irqprio(vcpu, vec); | ||
693 | } else if (page_found == -EINVAL) { | ||
694 | /* Page not found in guest SLB */ | ||
695 | vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); | ||
696 | kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); | ||
697 | } else if (!is_mmio && | ||
698 | kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { | ||
699 | /* The guest's PTE is not mapped yet. Map on the host */ | ||
700 | kvmppc_mmu_map_page(vcpu, &pte); | ||
701 | if (data) | ||
702 | vcpu->stat.sp_storage++; | ||
703 | else if (vcpu->arch.mmu.is_dcbz32(vcpu) && | ||
704 | (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) | ||
705 | kvmppc_patch_dcbz(vcpu, &pte); | ||
706 | } else { | ||
707 | /* MMIO */ | ||
708 | vcpu->stat.mmio_exits++; | ||
709 | vcpu->arch.paddr_accessed = pte.raddr; | ||
710 | r = kvmppc_emulate_mmio(run, vcpu); | ||
711 | if ( r == RESUME_HOST_NV ) | ||
712 | r = RESUME_HOST; | ||
713 | } | ||
714 | |||
715 | return r; | ||
716 | } | ||
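
The kvmppc_handle_pagefault() hunk removed above synthesizes an identity PTE when address translation is off (MSR_DR/IR clear): the guest effective address is used directly, masked by KVM_PAM, with all permissions granted, and the vpage is later tagged with a VSID_REAL* marker by the switch that follows. A condensed, standalone model of that branch is sketched below; the struct and the KVM_PAM value are simplified stand-ins for illustration, not the kernel definitions.

/*
 * Model of the "translation off" branch of kvmppc_handle_pagefault():
 * raddr = eaddr & KVM_PAM, all permissions allowed, 4K virtual page
 * number in vpage. Types and constant are illustrative only.
 */
#include <stdint.h>
#include <stdbool.h>

#define KVM_PAM 0x0fffffffffffffffULL   /* illustrative physical address mask */

struct pte_model {
	uint64_t raddr, eaddr, vpage;
	bool may_read, may_write, may_execute;
};

static struct pte_model real_mode_pte(uint64_t eaddr)
{
	return (struct pte_model){
		.raddr = eaddr & KVM_PAM,
		.eaddr = eaddr,
		.vpage = eaddr >> 12,           /* 4K virtual page number */
		.may_read = true, .may_write = true, .may_execute = true,
	};
}
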
717 | |||
718 | static inline int get_fpr_index(int i) | ||
719 | { | ||
720 | #ifdef CONFIG_VSX | ||
721 | i *= 2; | ||
722 | #endif | ||
723 | return i; | ||
724 | } | ||
725 | |||
726 | /* Give up external provider (FPU, Altivec, VSX) */ | ||
727 | void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) | ||
728 | { | ||
729 | struct thread_struct *t = ¤t->thread; | ||
730 | u64 *vcpu_fpr = vcpu->arch.fpr; | ||
731 | #ifdef CONFIG_VSX | ||
732 | u64 *vcpu_vsx = vcpu->arch.vsr; | ||
733 | #endif | ||
734 | u64 *thread_fpr = (u64*)t->fpr; | ||
735 | int i; | ||
736 | |||
737 | if (!(vcpu->arch.guest_owned_ext & msr)) | ||
738 | return; | ||
739 | |||
740 | #ifdef DEBUG_EXT | ||
741 | printk(KERN_INFO "Giving up ext 0x%lx\n", msr); | ||
742 | #endif | ||
743 | |||
744 | switch (msr) { | ||
745 | case MSR_FP: | ||
746 | giveup_fpu(current); | ||
747 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) | ||
748 | vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; | ||
749 | |||
750 | vcpu->arch.fpscr = t->fpscr.val; | ||
751 | break; | ||
752 | case MSR_VEC: | ||
753 | #ifdef CONFIG_ALTIVEC | ||
754 | giveup_altivec(current); | ||
755 | memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); | ||
756 | vcpu->arch.vscr = t->vscr; | ||
757 | #endif | ||
758 | break; | ||
759 | case MSR_VSX: | ||
760 | #ifdef CONFIG_VSX | ||
761 | __giveup_vsx(current); | ||
762 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) | ||
763 | vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; | ||
764 | #endif | ||
765 | break; | ||
766 | default: | ||
767 | BUG(); | ||
768 | } | ||
769 | |||
770 | vcpu->arch.guest_owned_ext &= ~msr; | ||
771 | current->thread.regs->msr &= ~msr; | ||
772 | kvmppc_recalc_shadow_msr(vcpu); | ||
773 | } | ||
774 | |||
775 | static int kvmppc_read_inst(struct kvm_vcpu *vcpu) | ||
776 | { | ||
777 | ulong srr0 = kvmppc_get_pc(vcpu); | ||
778 | u32 last_inst = kvmppc_get_last_inst(vcpu); | ||
779 | int ret; | ||
780 | |||
781 | ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); | ||
782 | if (ret == -ENOENT) { | ||
783 | ulong msr = vcpu->arch.shared->msr; | ||
784 | |||
785 | msr = kvmppc_set_field(msr, 33, 33, 1); | ||
786 | msr = kvmppc_set_field(msr, 34, 36, 0); | ||
787 | vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0); | ||
788 | kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); | ||
789 | return EMULATE_AGAIN; | ||
790 | } | ||
791 | |||
792 | return EMULATE_DONE; | ||
793 | } | ||
794 | |||
795 | static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr) | ||
796 | { | ||
797 | |||
798 | /* Need to do paired single emulation? */ | ||
799 | if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)) | ||
800 | return EMULATE_DONE; | ||
801 | |||
802 | /* Read out the instruction */ | ||
803 | if (kvmppc_read_inst(vcpu) == EMULATE_DONE) | ||
804 | /* Need to emulate */ | ||
805 | return EMULATE_FAIL; | ||
806 | |||
807 | return EMULATE_AGAIN; | ||
808 | } | ||
809 | |||
810 | /* Handle external providers (FPU, Altivec, VSX) */ | ||
811 | static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, | ||
812 | ulong msr) | ||
813 | { | ||
814 | struct thread_struct *t = ¤t->thread; | ||
815 | u64 *vcpu_fpr = vcpu->arch.fpr; | ||
816 | #ifdef CONFIG_VSX | ||
817 | u64 *vcpu_vsx = vcpu->arch.vsr; | ||
818 | #endif | ||
819 | u64 *thread_fpr = (u64*)t->fpr; | ||
820 | int i; | ||
821 | |||
822 | /* When we have paired singles, we emulate in software */ | ||
823 | if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) | ||
824 | return RESUME_GUEST; | ||
825 | |||
826 | if (!(vcpu->arch.shared->msr & msr)) { | ||
827 | kvmppc_book3s_queue_irqprio(vcpu, exit_nr); | ||
828 | return RESUME_GUEST; | ||
829 | } | ||
830 | |||
831 | /* We already own the ext */ | ||
832 | if (vcpu->arch.guest_owned_ext & msr) { | ||
833 | return RESUME_GUEST; | ||
834 | } | ||
835 | |||
836 | #ifdef DEBUG_EXT | ||
837 | printk(KERN_INFO "Loading up ext 0x%lx\n", msr); | ||
838 | #endif | ||
839 | |||
840 | current->thread.regs->msr |= msr; | ||
841 | |||
842 | switch (msr) { | ||
843 | case MSR_FP: | ||
844 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) | ||
845 | thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; | ||
846 | |||
847 | t->fpscr.val = vcpu->arch.fpscr; | ||
848 | t->fpexc_mode = 0; | ||
849 | kvmppc_load_up_fpu(); | ||
850 | break; | ||
851 | case MSR_VEC: | ||
852 | #ifdef CONFIG_ALTIVEC | ||
853 | memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); | ||
854 | t->vscr = vcpu->arch.vscr; | ||
855 | t->vrsave = -1; | ||
856 | kvmppc_load_up_altivec(); | ||
857 | #endif | ||
858 | break; | ||
859 | case MSR_VSX: | ||
860 | #ifdef CONFIG_VSX | ||
861 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) | ||
862 | thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; | ||
863 | kvmppc_load_up_vsx(); | ||
864 | #endif | ||
865 | break; | ||
866 | default: | ||
867 | BUG(); | ||
868 | } | ||
869 | |||
870 | vcpu->arch.guest_owned_ext |= msr; | ||
871 | |||
872 | kvmppc_recalc_shadow_msr(vcpu); | ||
873 | |||
874 | return RESUME_GUEST; | ||
875 | } | ||
876 | |||
877 | int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, | ||
878 | unsigned int exit_nr) | ||
879 | { | ||
880 | int r = RESUME_HOST; | ||
881 | |||
882 | vcpu->stat.sum_exits++; | ||
883 | |||
884 | run->exit_reason = KVM_EXIT_UNKNOWN; | ||
885 | run->ready_for_interrupt_injection = 1; | ||
886 | |||
887 | trace_kvm_book3s_exit(exit_nr, vcpu); | ||
888 | kvm_resched(vcpu); | ||
889 | switch (exit_nr) { | ||
890 | case BOOK3S_INTERRUPT_INST_STORAGE: | ||
891 | vcpu->stat.pf_instruc++; | ||
892 | |||
893 | #ifdef CONFIG_PPC_BOOK3S_32 | ||
894 | /* We set segments as unused segments when invalidating them. So | ||
895 | * treat the respective fault as segment fault. */ | ||
896 | if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] | ||
897 | == SR_INVALID) { | ||
898 | kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); | ||
899 | r = RESUME_GUEST; | ||
900 | break; | ||
901 | } | ||
902 | #endif | ||
903 | |||
904 | /* only care about PTEG not found errors, but leave NX alone */ | ||
905 | if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) { | ||
906 | r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); | ||
907 | vcpu->stat.sp_instruc++; | ||
908 | } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && | ||
909 | (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { | ||
910 | /* | ||
911 | * XXX If we do the dcbz hack we use the NX bit to flush&patch the page, | ||
912 | * so we can't use the NX bit inside the guest. Let's cross our fingers | ||
913 | * that no guest that needs the dcbz hack uses NX. | ||
914 | */ | ||
915 | kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); | ||
916 | r = RESUME_GUEST; | ||
917 | } else { | ||
918 | vcpu->arch.shared->msr |= | ||
919 | to_svcpu(vcpu)->shadow_srr1 & 0x58000000; | ||
920 | kvmppc_book3s_queue_irqprio(vcpu, exit_nr); | ||
921 | r = RESUME_GUEST; | ||
922 | } | ||
923 | break; | ||
924 | case BOOK3S_INTERRUPT_DATA_STORAGE: | ||
925 | { | ||
926 | ulong dar = kvmppc_get_fault_dar(vcpu); | ||
927 | vcpu->stat.pf_storage++; | ||
928 | |||
929 | #ifdef CONFIG_PPC_BOOK3S_32 | ||
930 | /* We set segments as unused segments when invalidating them. So | ||
931 | * treat the respective fault as segment fault. */ | ||
932 | if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) { | ||
933 | kvmppc_mmu_map_segment(vcpu, dar); | ||
934 | r = RESUME_GUEST; | ||
935 | break; | ||
936 | } | ||
937 | #endif | ||
938 | |||
939 | /* The only case we need to handle is missing shadow PTEs */ | ||
940 | if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) { | ||
941 | r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); | ||
942 | } else { | ||
943 | vcpu->arch.shared->dar = dar; | ||
944 | vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; | ||
945 | kvmppc_book3s_queue_irqprio(vcpu, exit_nr); | ||
946 | r = RESUME_GUEST; | ||
947 | } | ||
948 | break; | ||
949 | } | ||
950 | case BOOK3S_INTERRUPT_DATA_SEGMENT: | ||
951 | if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) { | ||
952 | vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); | ||
953 | kvmppc_book3s_queue_irqprio(vcpu, | ||
954 | BOOK3S_INTERRUPT_DATA_SEGMENT); | ||
955 | } | ||
956 | r = RESUME_GUEST; | ||
957 | break; | ||
958 | case BOOK3S_INTERRUPT_INST_SEGMENT: | ||
959 | if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) { | ||
960 | kvmppc_book3s_queue_irqprio(vcpu, | ||
961 | BOOK3S_INTERRUPT_INST_SEGMENT); | ||
962 | } | ||
963 | r = RESUME_GUEST; | ||
964 | break; | ||
965 | /* We're good on these - the host merely wanted to get our attention */ | ||
966 | case BOOK3S_INTERRUPT_DECREMENTER: | ||
967 | vcpu->stat.dec_exits++; | ||
968 | r = RESUME_GUEST; | ||
969 | break; | ||
970 | case BOOK3S_INTERRUPT_EXTERNAL: | ||
971 | vcpu->stat.ext_intr_exits++; | ||
972 | r = RESUME_GUEST; | ||
973 | break; | ||
974 | case BOOK3S_INTERRUPT_PERFMON: | ||
975 | r = RESUME_GUEST; | ||
976 | break; | ||
977 | case BOOK3S_INTERRUPT_PROGRAM: | ||
978 | { | ||
979 | enum emulation_result er; | ||
980 | ulong flags; | ||
981 | |||
982 | program_interrupt: | ||
983 | flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull; | ||
984 | |||
985 | if (vcpu->arch.shared->msr & MSR_PR) { | ||
986 | #ifdef EXIT_DEBUG | ||
987 | printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); | ||
988 | #endif | ||
989 | if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) != | ||
990 | (INS_DCBZ & 0xfffffff7)) { | ||
991 | kvmppc_core_queue_program(vcpu, flags); | ||
992 | r = RESUME_GUEST; | ||
993 | break; | ||
994 | } | ||
995 | } | ||
996 | |||
997 | vcpu->stat.emulated_inst_exits++; | ||
998 | er = kvmppc_emulate_instruction(run, vcpu); | ||
999 | switch (er) { | ||
1000 | case EMULATE_DONE: | ||
1001 | r = RESUME_GUEST_NV; | ||
1002 | break; | ||
1003 | case EMULATE_AGAIN: | ||
1004 | r = RESUME_GUEST; | ||
1005 | break; | ||
1006 | case EMULATE_FAIL: | ||
1007 | printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", | ||
1008 | __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); | ||
1009 | kvmppc_core_queue_program(vcpu, flags); | ||
1010 | r = RESUME_GUEST; | ||
1011 | break; | ||
1012 | case EMULATE_DO_MMIO: | ||
1013 | run->exit_reason = KVM_EXIT_MMIO; | ||
1014 | r = RESUME_HOST_NV; | ||
1015 | break; | ||
1016 | default: | ||
1017 | BUG(); | ||
1018 | } | ||
1019 | break; | ||
1020 | } | ||
1021 | case BOOK3S_INTERRUPT_SYSCALL: | ||
1022 | if (vcpu->arch.osi_enabled && | ||
1023 | (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) && | ||
1024 | (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) { | ||
1025 | /* MOL hypercalls */ | ||
1026 | u64 *gprs = run->osi.gprs; | ||
1027 | int i; | ||
1028 | |||
1029 | run->exit_reason = KVM_EXIT_OSI; | ||
1030 | for (i = 0; i < 32; i++) | ||
1031 | gprs[i] = kvmppc_get_gpr(vcpu, i); | ||
1032 | vcpu->arch.osi_needed = 1; | ||
1033 | r = RESUME_HOST_NV; | ||
1034 | } else if (!(vcpu->arch.shared->msr & MSR_PR) && | ||
1035 | (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { | ||
1036 | /* KVM PV hypercalls */ | ||
1037 | kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); | ||
1038 | r = RESUME_GUEST; | ||
1039 | } else { | ||
1040 | /* Guest syscalls */ | ||
1041 | vcpu->stat.syscall_exits++; | ||
1042 | kvmppc_book3s_queue_irqprio(vcpu, exit_nr); | ||
1043 | r = RESUME_GUEST; | ||
1044 | } | ||
1045 | break; | ||
1046 | case BOOK3S_INTERRUPT_FP_UNAVAIL: | ||
1047 | case BOOK3S_INTERRUPT_ALTIVEC: | ||
1048 | case BOOK3S_INTERRUPT_VSX: | ||
1049 | { | ||
1050 | int ext_msr = 0; | ||
1051 | |||
1052 | switch (exit_nr) { | ||
1053 | case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP; break; | ||
1054 | case BOOK3S_INTERRUPT_ALTIVEC: ext_msr = MSR_VEC; break; | ||
1055 | case BOOK3S_INTERRUPT_VSX: ext_msr = MSR_VSX; break; | ||
1056 | } | ||
1057 | |||
1058 | switch (kvmppc_check_ext(vcpu, exit_nr)) { | ||
1059 | case EMULATE_DONE: | ||
1060 | /* everything ok - let's enable the ext */ | ||
1061 | r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr); | ||
1062 | break; | ||
1063 | case EMULATE_FAIL: | ||
1064 | /* we need to emulate this instruction */ | ||
1065 | goto program_interrupt; | ||
1066 | break; | ||
1067 | default: | ||
1068 | /* nothing to worry about - go again */ | ||
1069 | break; | ||
1070 | } | ||
1071 | break; | ||
1072 | } | ||
1073 | case BOOK3S_INTERRUPT_ALIGNMENT: | ||
1074 | if (kvmppc_read_inst(vcpu) == EMULATE_DONE) { | ||
1075 | vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu, | ||
1076 | kvmppc_get_last_inst(vcpu)); | ||
1077 | vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu, | ||
1078 | kvmppc_get_last_inst(vcpu)); | ||
1079 | kvmppc_book3s_queue_irqprio(vcpu, exit_nr); | ||
1080 | } | ||
1081 | r = RESUME_GUEST; | ||
1082 | break; | ||
1083 | case BOOK3S_INTERRUPT_MACHINE_CHECK: | ||
1084 | case BOOK3S_INTERRUPT_TRACE: | ||
1085 | kvmppc_book3s_queue_irqprio(vcpu, exit_nr); | ||
1086 | r = RESUME_GUEST; | ||
1087 | break; | ||
1088 | default: | ||
1089 | /* Ugh - bork here! What did we get? */ | ||
1090 | printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", | ||
1091 | exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1); | ||
1092 | r = RESUME_HOST; | ||
1093 | BUG(); | ||
1094 | break; | ||
1095 | } | ||
1096 | |||
1097 | |||
1098 | if (!(r & RESUME_HOST)) { | ||
1099 | /* To avoid clobbering exit_reason, only check for signals if | ||
1100 | * we aren't already exiting to userspace for some other | ||
1101 | * reason. */ | ||
1102 | if (signal_pending(current)) { | ||
1103 | #ifdef EXIT_DEBUG | ||
1104 | printk(KERN_EMERG "KVM: Going back to host\n"); | ||
1105 | #endif | ||
1106 | vcpu->stat.signal_exits++; | ||
1107 | run->exit_reason = KVM_EXIT_INTR; | ||
1108 | r = -EINTR; | ||
1109 | } else { | ||
1110 | /* In case an interrupt came in that was triggered | ||
1111 | * from userspace (like DEC), we need to check what | ||
1112 | * to inject now! */ | ||
1113 | kvmppc_core_deliver_interrupts(vcpu); | ||
1114 | } | ||
1115 | } | ||
1116 | |||
1117 | trace_kvm_book3s_reenter(r, vcpu); | ||
1118 | |||
1119 | return r; | ||
1120 | } | ||
1121 | |||
1122 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | 403 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) |
1123 | { | 404 | { |
1124 | return 0; | 405 | return 0; |
@@ -1179,69 +460,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
1179 | return 0; | 460 | return 0; |
1180 | } | 461 | } |
1181 | 462 | ||
1182 | int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | ||
1183 | struct kvm_sregs *sregs) | ||
1184 | { | ||
1185 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
1186 | int i; | ||
1187 | |||
1188 | sregs->pvr = vcpu->arch.pvr; | ||
1189 | |||
1190 | sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1; | ||
1191 | if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { | ||
1192 | for (i = 0; i < 64; i++) { | ||
1193 | sregs->u.s.ppc64.slb[i].slbe = vcpu3s->slb[i].orige | i; | ||
1194 | sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv; | ||
1195 | } | ||
1196 | } else { | ||
1197 | for (i = 0; i < 16; i++) | ||
1198 | sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i]; | ||
1199 | |||
1200 | for (i = 0; i < 8; i++) { | ||
1201 | sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw; | ||
1202 | sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw; | ||
1203 | } | ||
1204 | } | ||
1205 | |||
1206 | return 0; | ||
1207 | } | ||
1208 | |||
1209 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | ||
1210 | struct kvm_sregs *sregs) | ||
1211 | { | ||
1212 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
1213 | int i; | ||
1214 | |||
1215 | kvmppc_set_pvr(vcpu, sregs->pvr); | ||
1216 | |||
1217 | vcpu3s->sdr1 = sregs->u.s.sdr1; | ||
1218 | if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { | ||
1219 | for (i = 0; i < 64; i++) { | ||
1220 | vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv, | ||
1221 | sregs->u.s.ppc64.slb[i].slbe); | ||
1222 | } | ||
1223 | } else { | ||
1224 | for (i = 0; i < 16; i++) { | ||
1225 | vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]); | ||
1226 | } | ||
1227 | for (i = 0; i < 8; i++) { | ||
1228 | kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false, | ||
1229 | (u32)sregs->u.s.ppc32.ibat[i]); | ||
1230 | kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true, | ||
1231 | (u32)(sregs->u.s.ppc32.ibat[i] >> 32)); | ||
1232 | kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false, | ||
1233 | (u32)sregs->u.s.ppc32.dbat[i]); | ||
1234 | kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true, | ||
1235 | (u32)(sregs->u.s.ppc32.dbat[i] >> 32)); | ||
1236 | } | ||
1237 | } | ||
1238 | |||
1239 | /* Flush the MMU after messing with the segments */ | ||
1240 | kvmppc_mmu_pte_flush(vcpu, 0, 0); | ||
1241 | |||
1242 | return 0; | ||
1243 | } | ||
1244 | |||
1245 | int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | 463 | int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) |
1246 | { | 464 | { |
1247 | return -ENOTSUPP; | 465 | return -ENOTSUPP; |
@@ -1296,202 +514,3 @@ out: | |||
1296 | mutex_unlock(&kvm->slots_lock); | 514 | mutex_unlock(&kvm->slots_lock); |
1297 | return r; | 515 | return r; |
1298 | } | 516 | } |
1299 | |||
1300 | int kvmppc_core_check_processor_compat(void) | ||
1301 | { | ||
1302 | return 0; | ||
1303 | } | ||
1304 | |||
1305 | struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) | ||
1306 | { | ||
1307 | struct kvmppc_vcpu_book3s *vcpu_book3s; | ||
1308 | struct kvm_vcpu *vcpu; | ||
1309 | int err = -ENOMEM; | ||
1310 | unsigned long p; | ||
1311 | |||
1312 | vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); | ||
1313 | if (!vcpu_book3s) | ||
1314 | goto out; | ||
1315 | |||
1316 | vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *) | ||
1317 | kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL); | ||
1318 | if (!vcpu_book3s->shadow_vcpu) | ||
1319 | goto free_vcpu; | ||
1320 | |||
1321 | vcpu = &vcpu_book3s->vcpu; | ||
1322 | err = kvm_vcpu_init(vcpu, kvm, id); | ||
1323 | if (err) | ||
1324 | goto free_shadow_vcpu; | ||
1325 | |||
1326 | p = __get_free_page(GFP_KERNEL|__GFP_ZERO); | ||
1327 | /* the real shared page fills the last 4k of our page */ | ||
1328 | vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096); | ||
1329 | if (!p) | ||
1330 | goto uninit_vcpu; | ||
1331 | |||
1332 | vcpu->arch.host_retip = kvm_return_point; | ||
1333 | vcpu->arch.host_msr = mfmsr(); | ||
1334 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
1335 | /* default to book3s_64 (970fx) */ | ||
1336 | vcpu->arch.pvr = 0x3C0301; | ||
1337 | #else | ||
1338 | /* default to book3s_32 (750) */ | ||
1339 | vcpu->arch.pvr = 0x84202; | ||
1340 | #endif | ||
1341 | kvmppc_set_pvr(vcpu, vcpu->arch.pvr); | ||
1342 | vcpu_book3s->slb_nr = 64; | ||
1343 | |||
1344 | /* remember where some real-mode handlers are */ | ||
1345 | vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem; | ||
1346 | vcpu->arch.trampoline_enter = kvmppc_trampoline_enter; | ||
1347 | vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem; | ||
1348 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
1349 | vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall; | ||
1350 | #else | ||
1351 | vcpu->arch.rmcall = (ulong)kvmppc_rmcall; | ||
1352 | #endif | ||
1353 | |||
1354 | vcpu->arch.shadow_msr = MSR_USER64; | ||
1355 | |||
1356 | err = kvmppc_mmu_init(vcpu); | ||
1357 | if (err < 0) | ||
1358 | goto uninit_vcpu; | ||
1359 | |||
1360 | return vcpu; | ||
1361 | |||
1362 | uninit_vcpu: | ||
1363 | kvm_vcpu_uninit(vcpu); | ||
1364 | free_shadow_vcpu: | ||
1365 | kfree(vcpu_book3s->shadow_vcpu); | ||
1366 | free_vcpu: | ||
1367 | vfree(vcpu_book3s); | ||
1368 | out: | ||
1369 | return ERR_PTR(err); | ||
1370 | } | ||
1371 | |||
1372 | void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) | ||
1373 | { | ||
1374 | struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); | ||
1375 | |||
1376 | free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); | ||
1377 | kvm_vcpu_uninit(vcpu); | ||
1378 | kfree(vcpu_book3s->shadow_vcpu); | ||
1379 | vfree(vcpu_book3s); | ||
1380 | } | ||
1381 | |||
1382 | extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); | ||
1383 | int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | ||
1384 | { | ||
1385 | int ret; | ||
1386 | double fpr[32][TS_FPRWIDTH]; | ||
1387 | unsigned int fpscr; | ||
1388 | int fpexc_mode; | ||
1389 | #ifdef CONFIG_ALTIVEC | ||
1390 | vector128 vr[32]; | ||
1391 | vector128 vscr; | ||
1392 | unsigned long uninitialized_var(vrsave); | ||
1393 | int used_vr; | ||
1394 | #endif | ||
1395 | #ifdef CONFIG_VSX | ||
1396 | int used_vsr; | ||
1397 | #endif | ||
1398 | ulong ext_msr; | ||
1399 | |||
1400 | /* No need to go into the guest when all we do is going out */ | ||
1401 | if (signal_pending(current)) { | ||
1402 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
1403 | return -EINTR; | ||
1404 | } | ||
1405 | |||
1406 | /* Save FPU state in stack */ | ||
1407 | if (current->thread.regs->msr & MSR_FP) | ||
1408 | giveup_fpu(current); | ||
1409 | memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr)); | ||
1410 | fpscr = current->thread.fpscr.val; | ||
1411 | fpexc_mode = current->thread.fpexc_mode; | ||
1412 | |||
1413 | #ifdef CONFIG_ALTIVEC | ||
1414 | /* Save Altivec state in stack */ | ||
1415 | used_vr = current->thread.used_vr; | ||
1416 | if (used_vr) { | ||
1417 | if (current->thread.regs->msr & MSR_VEC) | ||
1418 | giveup_altivec(current); | ||
1419 | memcpy(vr, current->thread.vr, sizeof(current->thread.vr)); | ||
1420 | vscr = current->thread.vscr; | ||
1421 | vrsave = current->thread.vrsave; | ||
1422 | } | ||
1423 | #endif | ||
1424 | |||
1425 | #ifdef CONFIG_VSX | ||
1426 | /* Save VSX state in stack */ | ||
1427 | used_vsr = current->thread.used_vsr; | ||
1428 | if (used_vsr && (current->thread.regs->msr & MSR_VSX)) | ||
1429 | __giveup_vsx(current); | ||
1430 | #endif | ||
1431 | |||
1432 | /* Remember the MSR with disabled extensions */ | ||
1433 | ext_msr = current->thread.regs->msr; | ||
1434 | |||
1435 | /* XXX we get called with irq disabled - change that! */ | ||
1436 | local_irq_enable(); | ||
1437 | |||
1438 | /* Preload FPU if it's enabled */ | ||
1439 | if (vcpu->arch.shared->msr & MSR_FP) | ||
1440 | kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); | ||
1441 | |||
1442 | ret = __kvmppc_vcpu_entry(kvm_run, vcpu); | ||
1443 | |||
1444 | local_irq_disable(); | ||
1445 | |||
1446 | current->thread.regs->msr = ext_msr; | ||
1447 | |||
1448 | /* Make sure we save the guest FPU/Altivec/VSX state */ | ||
1449 | kvmppc_giveup_ext(vcpu, MSR_FP); | ||
1450 | kvmppc_giveup_ext(vcpu, MSR_VEC); | ||
1451 | kvmppc_giveup_ext(vcpu, MSR_VSX); | ||
1452 | |||
1453 | /* Restore FPU state from stack */ | ||
1454 | memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); | ||
1455 | current->thread.fpscr.val = fpscr; | ||
1456 | current->thread.fpexc_mode = fpexc_mode; | ||
1457 | |||
1458 | #ifdef CONFIG_ALTIVEC | ||
1459 | /* Restore Altivec state from stack */ | ||
1460 | if (used_vr && current->thread.used_vr) { | ||
1461 | memcpy(current->thread.vr, vr, sizeof(current->thread.vr)); | ||
1462 | current->thread.vscr = vscr; | ||
1463 | current->thread.vrsave = vrsave; | ||
1464 | } | ||
1465 | current->thread.used_vr = used_vr; | ||
1466 | #endif | ||
1467 | |||
1468 | #ifdef CONFIG_VSX | ||
1469 | current->thread.used_vsr = used_vsr; | ||
1470 | #endif | ||
1471 | |||
1472 | return ret; | ||
1473 | } | ||
1474 | |||
1475 | static int kvmppc_book3s_init(void) | ||
1476 | { | ||
1477 | int r; | ||
1478 | |||
1479 | r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0, | ||
1480 | THIS_MODULE); | ||
1481 | |||
1482 | if (r) | ||
1483 | return r; | ||
1484 | |||
1485 | r = kvmppc_mmu_hpte_sysinit(); | ||
1486 | |||
1487 | return r; | ||
1488 | } | ||
1489 | |||
1490 | static void kvmppc_book3s_exit(void) | ||
1491 | { | ||
1492 | kvmppc_mmu_hpte_sysexit(); | ||
1493 | kvm_exit(); | ||
1494 | } | ||
1495 | |||
1496 | module_init(kvmppc_book3s_init); | ||
1497 | module_exit(kvmppc_book3s_exit); | ||
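
The book3s.c code removed above patches guest dcbz instructions into a reserved encoding so they trap, then recognizes the patched form in the program-interrupt handler by masking off the register fields. The sketch below isolates just that masking logic as a standalone helper; the INS_DCBZ value is an assumption for illustration (the usual PowerPC dcbz encoding), since the header defining it is not part of this diff.

/*
 * Standalone sketch of the dcbz patch test used in the removed
 * kvmppc_patch_dcbz()/program-interrupt code. The mask 0xff0007ff keeps
 * the primary and extended opcode bits and ignores the RA/RB register
 * fields, so any "dcbz rA,rB" matches.
 */
#include <stdint.h>
#include <stdbool.h>

#define INS_DCBZ 0x7c0007ec  /* assumed dcbz encoding, for illustration */

static bool is_dcbz(uint32_t insn)
{
	return (insn & 0xff0007ff) == INS_DCBZ;
}

/* Clearing bit 0x8 yields the reserved encoding the exit handler matches
 * against (INS_DCBZ & 0xfffffff7), so execution traps and can be emulated. */
static uint32_t patch_dcbz(uint32_t insn)
{
	return is_dcbz(insn) ? (insn & 0xfffffff7) : insn;
}
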
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index d7889ef3211e..c6d3e194b6b4 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c | |||
@@ -41,36 +41,36 @@ static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu) | |||
41 | } | 41 | } |
42 | 42 | ||
43 | static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( | 43 | static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( |
44 | struct kvmppc_vcpu_book3s *vcpu_book3s, | 44 | struct kvm_vcpu *vcpu, |
45 | gva_t eaddr) | 45 | gva_t eaddr) |
46 | { | 46 | { |
47 | int i; | 47 | int i; |
48 | u64 esid = GET_ESID(eaddr); | 48 | u64 esid = GET_ESID(eaddr); |
49 | u64 esid_1t = GET_ESID_1T(eaddr); | 49 | u64 esid_1t = GET_ESID_1T(eaddr); |
50 | 50 | ||
51 | for (i = 0; i < vcpu_book3s->slb_nr; i++) { | 51 | for (i = 0; i < vcpu->arch.slb_nr; i++) { |
52 | u64 cmp_esid = esid; | 52 | u64 cmp_esid = esid; |
53 | 53 | ||
54 | if (!vcpu_book3s->slb[i].valid) | 54 | if (!vcpu->arch.slb[i].valid) |
55 | continue; | 55 | continue; |
56 | 56 | ||
57 | if (vcpu_book3s->slb[i].tb) | 57 | if (vcpu->arch.slb[i].tb) |
58 | cmp_esid = esid_1t; | 58 | cmp_esid = esid_1t; |
59 | 59 | ||
60 | if (vcpu_book3s->slb[i].esid == cmp_esid) | 60 | if (vcpu->arch.slb[i].esid == cmp_esid) |
61 | return &vcpu_book3s->slb[i]; | 61 | return &vcpu->arch.slb[i]; |
62 | } | 62 | } |
63 | 63 | ||
64 | dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n", | 64 | dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n", |
65 | eaddr, esid, esid_1t); | 65 | eaddr, esid, esid_1t); |
66 | for (i = 0; i < vcpu_book3s->slb_nr; i++) { | 66 | for (i = 0; i < vcpu->arch.slb_nr; i++) { |
67 | if (vcpu_book3s->slb[i].vsid) | 67 | if (vcpu->arch.slb[i].vsid) |
68 | dprintk(" %d: %c%c%c %llx %llx\n", i, | 68 | dprintk(" %d: %c%c%c %llx %llx\n", i, |
69 | vcpu_book3s->slb[i].valid ? 'v' : ' ', | 69 | vcpu->arch.slb[i].valid ? 'v' : ' ', |
70 | vcpu_book3s->slb[i].large ? 'l' : ' ', | 70 | vcpu->arch.slb[i].large ? 'l' : ' ', |
71 | vcpu_book3s->slb[i].tb ? 't' : ' ', | 71 | vcpu->arch.slb[i].tb ? 't' : ' ', |
72 | vcpu_book3s->slb[i].esid, | 72 | vcpu->arch.slb[i].esid, |
73 | vcpu_book3s->slb[i].vsid); | 73 | vcpu->arch.slb[i].vsid); |
74 | } | 74 | } |
75 | 75 | ||
76 | return NULL; | 76 | return NULL; |
@@ -81,7 +81,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, | |||
81 | { | 81 | { |
82 | struct kvmppc_slb *slb; | 82 | struct kvmppc_slb *slb; |
83 | 83 | ||
84 | slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), eaddr); | 84 | slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr); |
85 | if (!slb) | 85 | if (!slb) |
86 | return 0; | 86 | return 0; |
87 | 87 | ||
@@ -180,7 +180,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, | |||
180 | return 0; | 180 | return 0; |
181 | } | 181 | } |
182 | 182 | ||
183 | slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr); | 183 | slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr); |
184 | if (!slbe) | 184 | if (!slbe) |
185 | goto no_seg_found; | 185 | goto no_seg_found; |
186 | 186 | ||
@@ -320,10 +320,10 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) | |||
320 | esid_1t = GET_ESID_1T(rb); | 320 | esid_1t = GET_ESID_1T(rb); |
321 | slb_nr = rb & 0xfff; | 321 | slb_nr = rb & 0xfff; |
322 | 322 | ||
323 | if (slb_nr > vcpu_book3s->slb_nr) | 323 | if (slb_nr > vcpu->arch.slb_nr) |
324 | return; | 324 | return; |
325 | 325 | ||
326 | slbe = &vcpu_book3s->slb[slb_nr]; | 326 | slbe = &vcpu->arch.slb[slb_nr]; |
327 | 327 | ||
328 | slbe->large = (rs & SLB_VSID_L) ? 1 : 0; | 328 | slbe->large = (rs & SLB_VSID_L) ? 1 : 0; |
329 | slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0; | 329 | slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0; |
@@ -344,38 +344,35 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) | |||
344 | 344 | ||
345 | static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr) | 345 | static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr) |
346 | { | 346 | { |
347 | struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); | ||
348 | struct kvmppc_slb *slbe; | 347 | struct kvmppc_slb *slbe; |
349 | 348 | ||
350 | if (slb_nr > vcpu_book3s->slb_nr) | 349 | if (slb_nr > vcpu->arch.slb_nr) |
351 | return 0; | 350 | return 0; |
352 | 351 | ||
353 | slbe = &vcpu_book3s->slb[slb_nr]; | 352 | slbe = &vcpu->arch.slb[slb_nr]; |
354 | 353 | ||
355 | return slbe->orige; | 354 | return slbe->orige; |
356 | } | 355 | } |
357 | 356 | ||
358 | static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr) | 357 | static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr) |
359 | { | 358 | { |
360 | struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); | ||
361 | struct kvmppc_slb *slbe; | 359 | struct kvmppc_slb *slbe; |
362 | 360 | ||
363 | if (slb_nr > vcpu_book3s->slb_nr) | 361 | if (slb_nr > vcpu->arch.slb_nr) |
364 | return 0; | 362 | return 0; |
365 | 363 | ||
366 | slbe = &vcpu_book3s->slb[slb_nr]; | 364 | slbe = &vcpu->arch.slb[slb_nr]; |
367 | 365 | ||
368 | return slbe->origv; | 366 | return slbe->origv; |
369 | } | 367 | } |
370 | 368 | ||
371 | static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) | 369 | static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) |
372 | { | 370 | { |
373 | struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); | ||
374 | struct kvmppc_slb *slbe; | 371 | struct kvmppc_slb *slbe; |
375 | 372 | ||
376 | dprintk("KVM MMU: slbie(0x%llx)\n", ea); | 373 | dprintk("KVM MMU: slbie(0x%llx)\n", ea); |
377 | 374 | ||
378 | slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, ea); | 375 | slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); |
379 | 376 | ||
380 | if (!slbe) | 377 | if (!slbe) |
381 | return; | 378 | return; |
@@ -389,13 +386,12 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) | |||
389 | 386 | ||
390 | static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu) | 387 | static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu) |
391 | { | 388 | { |
392 | struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); | ||
393 | int i; | 389 | int i; |
394 | 390 | ||
395 | dprintk("KVM MMU: slbia()\n"); | 391 | dprintk("KVM MMU: slbia()\n"); |
396 | 392 | ||
397 | for (i = 1; i < vcpu_book3s->slb_nr; i++) | 393 | for (i = 1; i < vcpu->arch.slb_nr; i++) |
398 | vcpu_book3s->slb[i].valid = false; | 394 | vcpu->arch.slb[i].valid = false; |
399 | 395 | ||
400 | if (vcpu->arch.shared->msr & MSR_IR) { | 396 | if (vcpu->arch.shared->msr & MSR_IR) { |
401 | kvmppc_mmu_flush_segments(vcpu); | 397 | kvmppc_mmu_flush_segments(vcpu); |
@@ -464,7 +460,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, | |||
464 | ulong mp_ea = vcpu->arch.magic_page_ea; | 460 | ulong mp_ea = vcpu->arch.magic_page_ea; |
465 | 461 | ||
466 | if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { | 462 | if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { |
467 | slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea); | 463 | slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); |
468 | if (slb) | 464 | if (slb) |
469 | gvsid = slb->vsid; | 465 | gvsid = slb->vsid; |
470 | } | 466 | } |
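
The book3s_64_mmu.c changes above move the guest SLB array from the vcpu_book3s container to vcpu->arch but leave the lookup itself unchanged: 256MB segments are matched on ea >> 28, 1TB segments (tb set) on ea >> 40. A minimal model of that search is sketched below; the struct is a simplified stand-in mirroring the field names, not the kernel's kvmppc_slb.

/*
 * Minimal model of kvmppc_mmu_book3s_64_find_slbe(): compare the
 * effective address's ESID against each valid SLB entry, using the
 * 1TB-segment ESID when the entry's tb flag is set.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

struct slb_model {
	bool valid;
	bool tb;        /* 1TB segment */
	uint64_t esid;
	uint64_t vsid;
};

static struct slb_model *find_slbe(struct slb_model *slb, int nr, uint64_t ea)
{
	uint64_t esid = ea >> 28;       /* 256MB segment index */
	uint64_t esid_1t = ea >> 40;    /* 1TB segment index */

	for (int i = 0; i < nr; i++) {
		if (!slb[i].valid)
			continue;
		if (slb[i].esid == (slb[i].tb ? esid_1t : esid))
			return &slb[i];
	}
	return NULL;
}
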
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c new file mode 100644 index 000000000000..bc3a2ea94217 --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c | |||
@@ -0,0 +1,180 @@ | |||
1 | /* | ||
2 | * This program is free software; you can redistribute it and/or modify | ||
3 | * it under the terms of the GNU General Public License, version 2, as | ||
4 | * published by the Free Software Foundation. | ||
5 | * | ||
6 | * This program is distributed in the hope that it will be useful, | ||
7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
9 | * GNU General Public License for more details. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * along with this program; if not, write to the Free Software | ||
13 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
14 | * | ||
15 | * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | ||
16 | */ | ||
17 | |||
18 | #include <linux/types.h> | ||
19 | #include <linux/string.h> | ||
20 | #include <linux/kvm.h> | ||
21 | #include <linux/kvm_host.h> | ||
22 | #include <linux/highmem.h> | ||
23 | #include <linux/gfp.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/hugetlb.h> | ||
26 | |||
27 | #include <asm/tlbflush.h> | ||
28 | #include <asm/kvm_ppc.h> | ||
29 | #include <asm/kvm_book3s.h> | ||
30 | #include <asm/mmu-hash64.h> | ||
31 | #include <asm/hvcall.h> | ||
32 | #include <asm/synch.h> | ||
33 | #include <asm/ppc-opcode.h> | ||
34 | #include <asm/cputable.h> | ||
35 | |||
36 | /* For now use fixed-size 16MB page table */ | ||
37 | #define HPT_ORDER 24 | ||
38 | #define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ | ||
39 | #define HPT_HASH_MASK (HPT_NPTEG - 1) | ||
40 | |||
41 | /* Pages in the VRMA are 16MB pages */ | ||
42 | #define VRMA_PAGE_ORDER 24 | ||
43 | #define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ | ||
44 | |||
45 | /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ | ||
46 | #define MAX_LPID_970 63 | ||
47 | #define NR_LPIDS (LPID_RSVD + 1) | ||
48 | unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)]; | ||
49 | |||
50 | long kvmppc_alloc_hpt(struct kvm *kvm) | ||
51 | { | ||
52 | unsigned long hpt; | ||
53 | unsigned long lpid; | ||
54 | |||
55 | hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN, | ||
56 | HPT_ORDER - PAGE_SHIFT); | ||
57 | if (!hpt) { | ||
58 | pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n"); | ||
59 | return -ENOMEM; | ||
60 | } | ||
61 | kvm->arch.hpt_virt = hpt; | ||
62 | |||
63 | do { | ||
64 | lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS); | ||
65 | if (lpid >= NR_LPIDS) { | ||
66 | pr_err("kvm_alloc_hpt: No LPIDs free\n"); | ||
67 | free_pages(hpt, HPT_ORDER - PAGE_SHIFT); | ||
68 | return -ENOMEM; | ||
69 | } | ||
70 | } while (test_and_set_bit(lpid, lpid_inuse)); | ||
71 | |||
72 | kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18); | ||
73 | kvm->arch.lpid = lpid; | ||
74 | |||
75 | pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid); | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | void kvmppc_free_hpt(struct kvm *kvm) | ||
80 | { | ||
81 | clear_bit(kvm->arch.lpid, lpid_inuse); | ||
82 | free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); | ||
83 | } | ||
84 | |||
85 | void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) | ||
86 | { | ||
87 | unsigned long i; | ||
88 | unsigned long npages = kvm->arch.ram_npages; | ||
89 | unsigned long pfn; | ||
90 | unsigned long *hpte; | ||
91 | unsigned long hash; | ||
92 | struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo; | ||
93 | |||
94 | if (!pginfo) | ||
95 | return; | ||
96 | |||
97 | /* VRMA can't be > 1TB */ | ||
98 | if (npages > 1ul << (40 - kvm->arch.ram_porder)) | ||
99 | npages = 1ul << (40 - kvm->arch.ram_porder); | ||
100 | /* Can't use more than 1 HPTE per HPTEG */ | ||
101 | if (npages > HPT_NPTEG) | ||
102 | npages = HPT_NPTEG; | ||
103 | |||
104 | for (i = 0; i < npages; ++i) { | ||
105 | pfn = pginfo[i].pfn; | ||
106 | if (!pfn) | ||
107 | break; | ||
108 | /* can't use hpt_hash since va > 64 bits */ | ||
109 | hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; | ||
110 | /* | ||
111 | * We assume that the hash table is empty and no | ||
112 | * vcpus are using it at this stage. Since we create | ||
113 | * at most one HPTE per HPTEG, we just assume entry 7 | ||
114 | * is available and use it. | ||
115 | */ | ||
116 | hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7)); | ||
117 | hpte += 7 * 2; | ||
118 | /* HPTE low word - RPN, protection, etc. */ | ||
119 | hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C | | ||
120 | HPTE_R_M | PP_RWXX; | ||
121 | wmb(); | ||
122 | hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | | ||
123 | (i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED | | ||
124 | HPTE_V_LARGE | HPTE_V_VALID; | ||
125 | } | ||
126 | } | ||
127 | |||
128 | int kvmppc_mmu_hv_init(void) | ||
129 | { | ||
130 | unsigned long host_lpid, rsvd_lpid; | ||
131 | |||
132 | if (!cpu_has_feature(CPU_FTR_HVMODE)) | ||
133 | return -EINVAL; | ||
134 | |||
135 | memset(lpid_inuse, 0, sizeof(lpid_inuse)); | ||
136 | |||
137 | if (cpu_has_feature(CPU_FTR_ARCH_206)) { | ||
138 | host_lpid = mfspr(SPRN_LPID); /* POWER7 */ | ||
139 | rsvd_lpid = LPID_RSVD; | ||
140 | } else { | ||
141 | host_lpid = 0; /* PPC970 */ | ||
142 | rsvd_lpid = MAX_LPID_970; | ||
143 | } | ||
144 | |||
145 | set_bit(host_lpid, lpid_inuse); | ||
146 | /* rsvd_lpid is reserved for use in partition switching */ | ||
147 | set_bit(rsvd_lpid, lpid_inuse); | ||
148 | |||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) | ||
153 | { | ||
154 | } | ||
155 | |||
156 | static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) | ||
157 | { | ||
158 | kvmppc_set_msr(vcpu, MSR_SF | MSR_ME); | ||
159 | } | ||
160 | |||
161 | static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, | ||
162 | struct kvmppc_pte *gpte, bool data) | ||
163 | { | ||
164 | return -ENOENT; | ||
165 | } | ||
166 | |||
167 | void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) | ||
168 | { | ||
169 | struct kvmppc_mmu *mmu = &vcpu->arch.mmu; | ||
170 | |||
171 | if (cpu_has_feature(CPU_FTR_ARCH_206)) | ||
172 | vcpu->arch.slb_nr = 32; /* POWER7 */ | ||
173 | else | ||
174 | vcpu->arch.slb_nr = 64; | ||
175 | |||
176 | mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; | ||
177 | mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; | ||
178 | |||
179 | vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; | ||
180 | } | ||
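
The new book3s_64_mmu_hv.c file above sizes the hashed page table at 16MB (HPT_ORDER 24, 128-byte PTEGs) and, when mapping the VRMA, places exactly one HPTE per PTEG in slot 7. The sketch below reproduces only the byte-offset arithmetic behind kvmppc_map_vrma(), using the constants from the new file; the helper itself is illustrative, not a kernel API.

/*
 * Byte offset of the HPTE used for VRMA page i: hash the page index with
 * the fixed VRMA VSID, shift by 7 to reach the 128-byte PTEG, then skip
 * to slot 7 (each HPTE is two 64-bit words, i.e. 16 bytes — hence the
 * "hpte += 7 * 2" on a u64 pointer in the kernel code).
 */
#include <stdint.h>

#define HPT_ORDER      24                       /* 16MB hashed page table */
#define HPT_NPTEG      (1ul << (HPT_ORDER - 7)) /* 128 bytes per PTEG */
#define HPT_HASH_MASK  (HPT_NPTEG - 1)
#define VRMA_VSID      0x1ffffffUL

static unsigned long vrma_hpte_offset(unsigned long i)
{
	unsigned long hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;

	return (hash << 7) + 7 * 16;
}
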
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c new file mode 100644 index 000000000000..ea0f8c537c28 --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c | |||
@@ -0,0 +1,73 @@ | |||
1 | /* | ||
2 | * This program is free software; you can redistribute it and/or modify | ||
3 | * it under the terms of the GNU General Public License, version 2, as | ||
4 | * published by the Free Software Foundation. | ||
5 | * | ||
6 | * This program is distributed in the hope that it will be useful, | ||
7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
9 | * GNU General Public License for more details. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * along with this program; if not, write to the Free Software | ||
13 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
14 | * | ||
15 | * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | ||
16 | * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com> | ||
17 | */ | ||
18 | |||
19 | #include <linux/types.h> | ||
20 | #include <linux/string.h> | ||
21 | #include <linux/kvm.h> | ||
22 | #include <linux/kvm_host.h> | ||
23 | #include <linux/highmem.h> | ||
24 | #include <linux/gfp.h> | ||
25 | #include <linux/slab.h> | ||
26 | #include <linux/hugetlb.h> | ||
27 | #include <linux/list.h> | ||
28 | |||
29 | #include <asm/tlbflush.h> | ||
30 | #include <asm/kvm_ppc.h> | ||
31 | #include <asm/kvm_book3s.h> | ||
32 | #include <asm/mmu-hash64.h> | ||
33 | #include <asm/hvcall.h> | ||
34 | #include <asm/synch.h> | ||
35 | #include <asm/ppc-opcode.h> | ||
36 | #include <asm/kvm_host.h> | ||
37 | #include <asm/udbg.h> | ||
38 | |||
39 | #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) | ||
40 | |||
41 | long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, | ||
42 | unsigned long ioba, unsigned long tce) | ||
43 | { | ||
44 | struct kvm *kvm = vcpu->kvm; | ||
45 | struct kvmppc_spapr_tce_table *stt; | ||
46 | |||
47 | /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ | ||
48 | /* liobn, ioba, tce); */ | ||
49 | |||
50 | list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { | ||
51 | if (stt->liobn == liobn) { | ||
52 | unsigned long idx = ioba >> SPAPR_TCE_SHIFT; | ||
53 | struct page *page; | ||
54 | u64 *tbl; | ||
55 | |||
56 | /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p window_size=0x%x\n", */ | ||
57 | /* liobn, stt, stt->window_size); */ | ||
58 | if (ioba >= stt->window_size) | ||
59 | return H_PARAMETER; | ||
60 | |||
61 | page = stt->pages[idx / TCES_PER_PAGE]; | ||
62 | tbl = (u64 *)page_address(page); | ||
63 | |||
64 | /* FIXME: Need to validate the TCE itself */ | ||
65 | /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */ | ||
66 | tbl[idx % TCES_PER_PAGE] = tce; | ||
67 | return H_SUCCESS; | ||
68 | } | ||
69 | } | ||
70 | |||
71 | /* Didn't find the liobn, punt it to userspace */ | ||
72 | return H_TOO_HARD; | ||
73 | } | ||
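
The H_PUT_TCE handler added above locates a TCE by splitting the I/O bus address into a page of the TCE table and a 64-bit slot within that page. The sketch below shows just that index arithmetic; the SPAPR_TCE_SHIFT and host page size values are assumptions for illustration (the shift is defined elsewhere in the kernel headers, not in this diff).

/*
 * Index arithmetic from kvmppc_h_put_tce(): idx selects the TCE,
 * idx / TCES_PER_PAGE selects the backing page of the table, and
 * idx % TCES_PER_PAGE selects the u64 entry within that page.
 */
#include <stdint.h>

#define SPAPR_TCE_SHIFT 12                     /* assumed: 4K IOMMU pages */
#define PAGE_SIZE_BYTES 4096ul                 /* illustrative host page size */
#define TCES_PER_PAGE   (PAGE_SIZE_BYTES / sizeof(uint64_t))

struct tce_slot {
	unsigned long page_index;  /* which backing page of the TCE table */
	unsigned long slot;        /* which 64-bit entry within that page */
};

static struct tce_slot tce_locate(unsigned long ioba)
{
	unsigned long idx = ioba >> SPAPR_TCE_SHIFT;

	return (struct tce_slot){ idx / TCES_PER_PAGE, idx % TCES_PER_PAGE };
}
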
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c index 1dd5a1ddfd0d..88c8f26add02 100644 --- a/arch/powerpc/kvm/book3s_exports.c +++ b/arch/powerpc/kvm/book3s_exports.c | |||
@@ -20,8 +20,11 @@ | |||
20 | #include <linux/module.h> | 20 | #include <linux/module.h> |
21 | #include <asm/kvm_book3s.h> | 21 | #include <asm/kvm_book3s.h> |
22 | 22 | ||
23 | EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter); | 23 | #ifdef CONFIG_KVM_BOOK3S_64_HV |
24 | EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem); | 24 | EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline); |
25 | #else | ||
26 | EXPORT_SYMBOL_GPL(kvmppc_handler_trampoline_enter); | ||
27 | EXPORT_SYMBOL_GPL(kvmppc_handler_lowmem_trampoline); | ||
25 | EXPORT_SYMBOL_GPL(kvmppc_rmcall); | 28 | EXPORT_SYMBOL_GPL(kvmppc_rmcall); |
26 | EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); | 29 | EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); |
27 | #ifdef CONFIG_ALTIVEC | 30 | #ifdef CONFIG_ALTIVEC |
@@ -30,3 +33,5 @@ EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); | |||
30 | #ifdef CONFIG_VSX | 33 | #ifdef CONFIG_VSX |
31 | EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx); | 34 | EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx); |
32 | #endif | 35 | #endif |
36 | #endif | ||
37 | |||
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c new file mode 100644 index 000000000000..cc0d7f1b19ab --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv.c | |||
@@ -0,0 +1,1269 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | ||
3 | * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. | ||
4 | * | ||
5 | * Authors: | ||
6 | * Paul Mackerras <paulus@au1.ibm.com> | ||
7 | * Alexander Graf <agraf@suse.de> | ||
8 | * Kevin Wolf <mail@kevin-wolf.de> | ||
9 | * | ||
10 | * Description: KVM functions specific to running on Book 3S | ||
11 | * processors in hypervisor mode (specifically POWER7 and later). | ||
12 | * | ||
13 | * This file is derived from arch/powerpc/kvm/book3s.c, | ||
14 | * by Alexander Graf <agraf@suse.de>. | ||
15 | * | ||
16 | * This program is free software; you can redistribute it and/or modify | ||
17 | * it under the terms of the GNU General Public License, version 2, as | ||
18 | * published by the Free Software Foundation. | ||
19 | */ | ||
20 | |||
21 | #include <linux/kvm_host.h> | ||
22 | #include <linux/err.h> | ||
23 | #include <linux/slab.h> | ||
24 | #include <linux/preempt.h> | ||
25 | #include <linux/sched.h> | ||
26 | #include <linux/delay.h> | ||
27 | #include <linux/fs.h> | ||
28 | #include <linux/anon_inodes.h> | ||
29 | #include <linux/cpumask.h> | ||
30 | #include <linux/spinlock.h> | ||
31 | #include <linux/page-flags.h> | ||
32 | |||
33 | #include <asm/reg.h> | ||
34 | #include <asm/cputable.h> | ||
35 | #include <asm/cacheflush.h> | ||
36 | #include <asm/tlbflush.h> | ||
37 | #include <asm/uaccess.h> | ||
38 | #include <asm/io.h> | ||
39 | #include <asm/kvm_ppc.h> | ||
40 | #include <asm/kvm_book3s.h> | ||
41 | #include <asm/mmu_context.h> | ||
42 | #include <asm/lppaca.h> | ||
43 | #include <asm/processor.h> | ||
44 | #include <asm/cputhreads.h> | ||
45 | #include <asm/page.h> | ||
46 | #include <linux/gfp.h> | ||
47 | #include <linux/sched.h> | ||
48 | #include <linux/vmalloc.h> | ||
49 | #include <linux/highmem.h> | ||
50 | |||
51 | /* | ||
52 | * For now, limit memory to 64GB and require it to be large pages. | ||
53 | * This value is chosen because it makes the ram_pginfo array be | ||
54 | * 64kB in size, which is about as large as we want to be trying | ||
55 | * to allocate with kmalloc. | ||
56 | */ | ||
57 | #define MAX_MEM_ORDER 36 | ||
58 | |||
59 | #define LARGE_PAGE_ORDER 24 /* 16MB pages */ | ||
60 | |||
61 | /* #define EXIT_DEBUG */ | ||
62 | /* #define EXIT_DEBUG_SIMPLE */ | ||
63 | /* #define EXIT_DEBUG_INT */ | ||
64 | |||
65 | void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | ||
66 | { | ||
67 | local_paca->kvm_hstate.kvm_vcpu = vcpu; | ||
68 | local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore; | ||
69 | } | ||
70 | |||
71 | void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) | ||
72 | { | ||
73 | } | ||
74 | |||
75 | static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu); | ||
76 | static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu); | ||
77 | |||
78 | void kvmppc_vcpu_block(struct kvm_vcpu *vcpu) | ||
79 | { | ||
80 | u64 now; | ||
81 | unsigned long dec_nsec; | ||
82 | |||
83 | now = get_tb(); | ||
84 | if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu)) | ||
85 | kvmppc_core_queue_dec(vcpu); | ||
86 | if (vcpu->arch.pending_exceptions) | ||
87 | return; | ||
88 | if (vcpu->arch.dec_expires != ~(u64)0) { | ||
89 | dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC / | ||
90 | tb_ticks_per_sec; | ||
91 | hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), | ||
92 | HRTIMER_MODE_REL); | ||
93 | } | ||
94 | |||
95 | kvmppc_vcpu_blocked(vcpu); | ||
96 | |||
97 | kvm_vcpu_block(vcpu); | ||
98 | vcpu->stat.halt_wakeup++; | ||
99 | |||
100 | if (vcpu->arch.dec_expires != ~(u64)0) | ||
101 | hrtimer_try_to_cancel(&vcpu->arch.dec_timer); | ||
102 | |||
103 | kvmppc_vcpu_unblocked(vcpu); | ||
104 | } | ||
105 | |||
106 | void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) | ||
107 | { | ||
108 | vcpu->arch.shregs.msr = msr; | ||
109 | } | ||
110 | |||
111 | void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) | ||
112 | { | ||
113 | vcpu->arch.pvr = pvr; | ||
114 | } | ||
115 | |||
116 | void kvmppc_dump_regs(struct kvm_vcpu *vcpu) | ||
117 | { | ||
118 | int r; | ||
119 | |||
120 | pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id); | ||
121 | pr_err("pc = %.16lx msr = %.16llx trap = %x\n", | ||
122 | vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap); | ||
123 | for (r = 0; r < 16; ++r) | ||
124 | pr_err("r%2d = %.16lx r%d = %.16lx\n", | ||
125 | r, kvmppc_get_gpr(vcpu, r), | ||
126 | r+16, kvmppc_get_gpr(vcpu, r+16)); | ||
127 | pr_err("ctr = %.16lx lr = %.16lx\n", | ||
128 | vcpu->arch.ctr, vcpu->arch.lr); | ||
129 | pr_err("srr0 = %.16llx srr1 = %.16llx\n", | ||
130 | vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1); | ||
131 | pr_err("sprg0 = %.16llx sprg1 = %.16llx\n", | ||
132 | vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1); | ||
133 | pr_err("sprg2 = %.16llx sprg3 = %.16llx\n", | ||
134 | vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3); | ||
135 | pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n", | ||
136 | vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr); | ||
137 | pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar); | ||
138 | pr_err("fault dar = %.16lx dsisr = %.8x\n", | ||
139 | vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); | ||
140 | pr_err("SLB (%d entries):\n", vcpu->arch.slb_max); | ||
141 | for (r = 0; r < vcpu->arch.slb_max; ++r) | ||
142 | pr_err(" ESID = %.16llx VSID = %.16llx\n", | ||
143 | vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv); | ||
144 | pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n", | ||
145 | vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1, | ||
146 | vcpu->arch.last_inst); | ||
147 | } | ||
148 | |||
149 | struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) | ||
150 | { | ||
151 | int r; | ||
152 | struct kvm_vcpu *v, *ret = NULL; | ||
153 | |||
154 | mutex_lock(&kvm->lock); | ||
155 | kvm_for_each_vcpu(r, v, kvm) { | ||
156 | if (v->vcpu_id == id) { | ||
157 | ret = v; | ||
158 | break; | ||
159 | } | ||
160 | } | ||
161 | mutex_unlock(&kvm->lock); | ||
162 | return ret; | ||
163 | } | ||
164 | |||
165 | static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) | ||
166 | { | ||
167 | vpa->shared_proc = 1; | ||
168 | vpa->yield_count = 1; | ||
169 | } | ||
170 | |||
171 | static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, | ||
172 | unsigned long flags, | ||
173 | unsigned long vcpuid, unsigned long vpa) | ||
174 | { | ||
175 | struct kvm *kvm = vcpu->kvm; | ||
176 | unsigned long pg_index, ra, len; | ||
177 | unsigned long pg_offset; | ||
178 | void *va; | ||
179 | struct kvm_vcpu *tvcpu; | ||
180 | |||
181 | tvcpu = kvmppc_find_vcpu(kvm, vcpuid); | ||
182 | if (!tvcpu) | ||
183 | return H_PARAMETER; | ||
184 | |||
185 | flags >>= 63 - 18; | ||
186 | flags &= 7; | ||
187 | if (flags == 0 || flags == 4) | ||
188 | return H_PARAMETER; | ||
189 | if (flags < 4) { | ||
190 | if (vpa & 0x7f) | ||
191 | return H_PARAMETER; | ||
192 | /* registering new area; convert logical addr to real */ | ||
193 | pg_index = vpa >> kvm->arch.ram_porder; | ||
194 | pg_offset = vpa & (kvm->arch.ram_psize - 1); | ||
195 | if (pg_index >= kvm->arch.ram_npages) | ||
196 | return H_PARAMETER; | ||
197 | if (kvm->arch.ram_pginfo[pg_index].pfn == 0) | ||
198 | return H_PARAMETER; | ||
199 | ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT; | ||
200 | ra |= pg_offset; | ||
201 | va = __va(ra); | ||
202 | if (flags <= 1) | ||
203 | len = *(unsigned short *)(va + 4); | ||
204 | else | ||
205 | len = *(unsigned int *)(va + 4); | ||
206 | if (pg_offset + len > kvm->arch.ram_psize) | ||
207 | return H_PARAMETER; | ||
208 | switch (flags) { | ||
209 | case 1: /* register VPA */ | ||
210 | if (len < 640) | ||
211 | return H_PARAMETER; | ||
212 | tvcpu->arch.vpa = va; | ||
213 | init_vpa(vcpu, va); | ||
214 | break; | ||
215 | case 2: /* register DTL */ | ||
216 | if (len < 48) | ||
217 | return H_PARAMETER; | ||
218 | if (!tvcpu->arch.vpa) | ||
219 | return H_RESOURCE; | ||
220 | len -= len % 48; | ||
221 | tvcpu->arch.dtl = va; | ||
222 | tvcpu->arch.dtl_end = va + len; | ||
223 | break; | ||
224 | case 3: /* register SLB shadow buffer */ | ||
225 | if (len < 8) | ||
226 | return H_PARAMETER; | ||
227 | if (!tvcpu->arch.vpa) | ||
228 | return H_RESOURCE; | ||
229 | tvcpu->arch.slb_shadow = va; | ||
230 | len = (len - 16) / 16; | ||
231 | tvcpu->arch.slb_shadow = va; | ||
232 | break; | ||
233 | } | ||
234 | } else { | ||
235 | switch (flags) { | ||
236 | case 5: /* unregister VPA */ | ||
237 | if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl) | ||
238 | return H_RESOURCE; | ||
239 | tvcpu->arch.vpa = NULL; | ||
240 | break; | ||
241 | case 6: /* unregister DTL */ | ||
242 | tvcpu->arch.dtl = NULL; | ||
243 | break; | ||
244 | case 7: /* unregister SLB shadow buffer */ | ||
245 | tvcpu->arch.slb_shadow = NULL; | ||
246 | break; | ||
247 | } | ||
248 | } | ||
249 | return H_SUCCESS; | ||
250 | } | ||
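
do_h_register_vpa() above decodes the hcall's flags argument by shifting right by (63 - 18) and masking with 7, which isolates a 3-bit subfunction held in IBM bits 16-18 of the doubleword: per the switch in the new code, 1-3 register the VPA/DTL/SLB shadow buffer, 5-7 unregister them, and 0 or 4 are rejected. The sketch below is a small standalone illustration of that decode, not a PAPR reference implementation.

/*
 * Extract the H_REGISTER_VPA subfunction from the flags doubleword,
 * mirroring "flags >>= 63 - 18; flags &= 7;" in do_h_register_vpa().
 */
#include <stdio.h>

static unsigned int vpa_subfunc(unsigned long flags)
{
	return (flags >> (63 - 18)) & 7;
}

int main(void)
{
	/* a flags word with IBM bits 16-18 = 0b001 selects "register VPA" */
	unsigned long flags = 1ul << (63 - 18);

	printf("subfunction = %u\n", vpa_subfunc(flags));	/* prints 1 */
	return 0;
}
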
251 | |||
252 | int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) | ||
253 | { | ||
254 | unsigned long req = kvmppc_get_gpr(vcpu, 3); | ||
255 | unsigned long target, ret = H_SUCCESS; | ||
256 | struct kvm_vcpu *tvcpu; | ||
257 | |||
258 | switch (req) { | ||
259 | case H_CEDE: | ||
260 | vcpu->arch.shregs.msr |= MSR_EE; | ||
261 | vcpu->arch.ceded = 1; | ||
262 | smp_mb(); | ||
263 | if (!vcpu->arch.prodded) | ||
264 | kvmppc_vcpu_block(vcpu); | ||
265 | else | ||
266 | vcpu->arch.prodded = 0; | ||
267 | smp_mb(); | ||
268 | vcpu->arch.ceded = 0; | ||
269 | break; | ||
270 | case H_PROD: | ||
271 | target = kvmppc_get_gpr(vcpu, 4); | ||
272 | tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); | ||
273 | if (!tvcpu) { | ||
274 | ret = H_PARAMETER; | ||
275 | break; | ||
276 | } | ||
277 | tvcpu->arch.prodded = 1; | ||
278 | smp_mb(); | ||
279 | if (vcpu->arch.ceded) { | ||
280 | if (waitqueue_active(&vcpu->wq)) { | ||
281 | wake_up_interruptible(&vcpu->wq); | ||
282 | vcpu->stat.halt_wakeup++; | ||
283 | } | ||
284 | } | ||
285 | break; | ||
286 | case H_CONFER: | ||
287 | break; | ||
288 | case H_REGISTER_VPA: | ||
289 | ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4), | ||
290 | kvmppc_get_gpr(vcpu, 5), | ||
291 | kvmppc_get_gpr(vcpu, 6)); | ||
292 | break; | ||
293 | default: | ||
294 | return RESUME_HOST; | ||
295 | } | ||
296 | kvmppc_set_gpr(vcpu, 3, ret); | ||
297 | vcpu->arch.hcall_needed = 0; | ||
298 | return RESUME_GUEST; | ||
299 | } | ||
300 | |||
301 | static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, | ||
302 | struct task_struct *tsk) | ||
303 | { | ||
304 | int r = RESUME_HOST; | ||
305 | |||
306 | vcpu->stat.sum_exits++; | ||
307 | |||
308 | run->exit_reason = KVM_EXIT_UNKNOWN; | ||
309 | run->ready_for_interrupt_injection = 1; | ||
310 | switch (vcpu->arch.trap) { | ||
311 | /* We're good on these - the host merely wanted to get our attention */ | ||
312 | case BOOK3S_INTERRUPT_HV_DECREMENTER: | ||
313 | vcpu->stat.dec_exits++; | ||
314 | r = RESUME_GUEST; | ||
315 | break; | ||
316 | case BOOK3S_INTERRUPT_EXTERNAL: | ||
317 | vcpu->stat.ext_intr_exits++; | ||
318 | r = RESUME_GUEST; | ||
319 | break; | ||
320 | case BOOK3S_INTERRUPT_PERFMON: | ||
321 | r = RESUME_GUEST; | ||
322 | break; | ||
323 | case BOOK3S_INTERRUPT_PROGRAM: | ||
324 | { | ||
325 | ulong flags; | ||
326 | /* | ||
327 | * Normally program interrupts are delivered directly | ||
328 | * to the guest by the hardware, but we can get here | ||
329 | * as a result of a hypervisor emulation interrupt | ||
330 | * (e40) getting turned into a 700 by BML RTAS. | ||
331 | */ | ||
332 | flags = vcpu->arch.shregs.msr & 0x1f0000ull; | ||
333 | kvmppc_core_queue_program(vcpu, flags); | ||
334 | r = RESUME_GUEST; | ||
335 | break; | ||
336 | } | ||
337 | case BOOK3S_INTERRUPT_SYSCALL: | ||
338 | { | ||
339 | /* hcall - punt to userspace */ | ||
340 | int i; | ||
341 | |||
342 | if (vcpu->arch.shregs.msr & MSR_PR) { | ||
343 | /* sc 1 from userspace - reflect to guest syscall */ | ||
344 | kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL); | ||
345 | r = RESUME_GUEST; | ||
346 | break; | ||
347 | } | ||
348 | run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3); | ||
349 | for (i = 0; i < 9; ++i) | ||
350 | run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i); | ||
351 | run->exit_reason = KVM_EXIT_PAPR_HCALL; | ||
352 | vcpu->arch.hcall_needed = 1; | ||
353 | r = RESUME_HOST; | ||
354 | break; | ||
355 | } | ||
356 | /* | ||
357 | * We get these next two if the guest does a bad real-mode access, | ||
358 | * as we have enabled VRMA (virtualized real mode area) mode in the | ||
359 | * LPCR. We just generate an appropriate DSI/ISI to the guest. | ||
360 | */ | ||
361 | case BOOK3S_INTERRUPT_H_DATA_STORAGE: | ||
362 | vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr; | ||
363 | vcpu->arch.shregs.dar = vcpu->arch.fault_dar; | ||
364 | kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0); | ||
365 | r = RESUME_GUEST; | ||
366 | break; | ||
367 | case BOOK3S_INTERRUPT_H_INST_STORAGE: | ||
368 | kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE, | ||
369 | 0x08000000); | ||
370 | r = RESUME_GUEST; | ||
371 | break; | ||
372 | /* | ||
373 | * This occurs if the guest executes an illegal instruction. | ||
374 | * We just generate a program interrupt to the guest, since | ||
375 | * we don't emulate any guest instructions at this stage. | ||
376 | */ | ||
377 | case BOOK3S_INTERRUPT_H_EMUL_ASSIST: | ||
378 | kvmppc_core_queue_program(vcpu, 0x80000); | ||
379 | r = RESUME_GUEST; | ||
380 | break; | ||
381 | default: | ||
382 | kvmppc_dump_regs(vcpu); | ||
383 | printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", | ||
384 | vcpu->arch.trap, kvmppc_get_pc(vcpu), | ||
385 | vcpu->arch.shregs.msr); | ||
386 | r = RESUME_HOST; | ||
387 | BUG(); | ||
388 | break; | ||
389 | } | ||
390 | |||
391 | |||
392 | if (!(r & RESUME_HOST)) { | ||
393 | /* To avoid clobbering exit_reason, only check for signals if | ||
394 | * we aren't already exiting to userspace for some other | ||
395 | * reason. */ | ||
396 | if (signal_pending(tsk)) { | ||
397 | vcpu->stat.signal_exits++; | ||
398 | run->exit_reason = KVM_EXIT_INTR; | ||
399 | r = -EINTR; | ||
400 | } else { | ||
401 | kvmppc_core_deliver_interrupts(vcpu); | ||
402 | } | ||
403 | } | ||
404 | |||
405 | return r; | ||
406 | } | ||
407 | |||
408 | int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | ||
409 | struct kvm_sregs *sregs) | ||
410 | { | ||
411 | int i; | ||
412 | |||
413 | memset(sregs, 0, sizeof(struct kvm_sregs)); | ||
414 | sregs->pvr = vcpu->arch.pvr; | ||
415 | |||
416 | for (i = 0; i < vcpu->arch.slb_max; i++) { | ||
417 | sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige; | ||
418 | sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; | ||
419 | } | ||
420 | |||
421 | return 0; | ||
422 | } | ||
423 | |||
424 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | ||
425 | struct kvm_sregs *sregs) | ||
426 | { | ||
427 | int i, j; | ||
428 | |||
429 | kvmppc_set_pvr(vcpu, sregs->pvr); | ||
430 | |||
431 | j = 0; | ||
432 | for (i = 0; i < vcpu->arch.slb_nr; i++) { | ||
433 | if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) { | ||
434 | vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe; | ||
435 | vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv; | ||
436 | ++j; | ||
437 | } | ||
438 | } | ||
439 | vcpu->arch.slb_max = j; | ||
440 | |||
441 | return 0; | ||
442 | } | ||
443 | |||
444 | int kvmppc_core_check_processor_compat(void) | ||
445 | { | ||
446 | if (cpu_has_feature(CPU_FTR_HVMODE)) | ||
447 | return 0; | ||
448 | return -EIO; | ||
449 | } | ||
450 | |||
451 | struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) | ||
452 | { | ||
453 | struct kvm_vcpu *vcpu; | ||
454 | int err = -EINVAL; | ||
455 | int core; | ||
456 | struct kvmppc_vcore *vcore; | ||
457 | |||
458 | core = id / threads_per_core; | ||
459 | if (core >= KVM_MAX_VCORES) | ||
460 | goto out; | ||
461 | |||
462 | err = -ENOMEM; | ||
463 | vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); | ||
464 | if (!vcpu) | ||
465 | goto out; | ||
466 | |||
467 | err = kvm_vcpu_init(vcpu, kvm, id); | ||
468 | if (err) | ||
469 | goto free_vcpu; | ||
470 | |||
471 | vcpu->arch.shared = &vcpu->arch.shregs; | ||
472 | vcpu->arch.last_cpu = -1; | ||
473 | vcpu->arch.mmcr[0] = MMCR0_FC; | ||
474 | vcpu->arch.ctrl = CTRL_RUNLATCH; | ||
475 | /* default to host PVR, since we can't spoof it */ | ||
476 | vcpu->arch.pvr = mfspr(SPRN_PVR); | ||
477 | kvmppc_set_pvr(vcpu, vcpu->arch.pvr); | ||
478 | |||
479 | kvmppc_mmu_book3s_hv_init(vcpu); | ||
480 | |||
481 | /* | ||
482 | * Some vcpus may start out in stopped state. If we initialize | ||
483 | * them to busy-in-host state they will stop other vcpus in the | ||
484 | * vcore from running. Instead we initialize them to blocked | ||
485 | * state, effectively considering them to be stopped until we | ||
486 | * see the first run ioctl for them. | ||
487 | */ | ||
488 | vcpu->arch.state = KVMPPC_VCPU_BLOCKED; | ||
489 | |||
490 | init_waitqueue_head(&vcpu->arch.cpu_run); | ||
491 | |||
492 | mutex_lock(&kvm->lock); | ||
493 | vcore = kvm->arch.vcores[core]; | ||
494 | if (!vcore) { | ||
495 | vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL); | ||
496 | if (vcore) { | ||
497 | INIT_LIST_HEAD(&vcore->runnable_threads); | ||
498 | spin_lock_init(&vcore->lock); | ||
499 | } | ||
500 | kvm->arch.vcores[core] = vcore; | ||
501 | } | ||
502 | mutex_unlock(&kvm->lock); | ||
503 | |||
504 | if (!vcore) | ||
505 | goto free_vcpu; | ||
506 | |||
507 | spin_lock(&vcore->lock); | ||
508 | ++vcore->num_threads; | ||
509 | ++vcore->n_blocked; | ||
510 | spin_unlock(&vcore->lock); | ||
511 | vcpu->arch.vcore = vcore; | ||
512 | |||
513 | return vcpu; | ||
514 | |||
515 | free_vcpu: | ||
516 | kfree(vcpu); | ||
517 | out: | ||
518 | return ERR_PTR(err); | ||
519 | } | ||
520 | |||
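For orientation, the core = id / threads_per_core division above groups vcpus into virtual cores: on an SMT4 host (threads_per_core = 4), for example, vcpu ids 0-3 share vcore 0 and ids 4-7 share vcore 1. kvmppc_run_core() below runs all runnable vcpus of a vcore together on one physical core.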
521 | void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) | ||
522 | { | ||
523 | kvm_vcpu_uninit(vcpu); | ||
524 | kfree(vcpu); | ||
525 | } | ||
526 | |||
527 | static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu) | ||
528 | { | ||
529 | struct kvmppc_vcore *vc = vcpu->arch.vcore; | ||
530 | |||
531 | spin_lock(&vc->lock); | ||
532 | vcpu->arch.state = KVMPPC_VCPU_BLOCKED; | ||
533 | ++vc->n_blocked; | ||
534 | if (vc->n_runnable > 0 && | ||
535 | vc->n_runnable + vc->n_blocked == vc->num_threads) { | ||
536 | vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu, | ||
537 | arch.run_list); | ||
538 | wake_up(&vcpu->arch.cpu_run); | ||
539 | } | ||
540 | spin_unlock(&vc->lock); | ||
541 | } | ||
542 | |||
543 | static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu) | ||
544 | { | ||
545 | struct kvmppc_vcore *vc = vcpu->arch.vcore; | ||
546 | |||
547 | spin_lock(&vc->lock); | ||
548 | vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; | ||
549 | --vc->n_blocked; | ||
550 | spin_unlock(&vc->lock); | ||
551 | } | ||
552 | |||
553 | extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); | ||
554 | extern void xics_wake_cpu(int cpu); | ||
555 | |||
556 | static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, | ||
557 | struct kvm_vcpu *vcpu) | ||
558 | { | ||
559 | struct kvm_vcpu *v; | ||
560 | |||
561 | if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) | ||
562 | return; | ||
563 | vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; | ||
564 | --vc->n_runnable; | ||
565 | /* decrement the physical thread id of each following vcpu */ | ||
566 | v = vcpu; | ||
567 | list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list) | ||
568 | --v->arch.ptid; | ||
569 | list_del(&vcpu->arch.run_list); | ||
570 | } | ||
571 | |||
572 | static void kvmppc_start_thread(struct kvm_vcpu *vcpu) | ||
573 | { | ||
574 | int cpu; | ||
575 | struct paca_struct *tpaca; | ||
576 | struct kvmppc_vcore *vc = vcpu->arch.vcore; | ||
577 | |||
578 | cpu = vc->pcpu + vcpu->arch.ptid; | ||
579 | tpaca = &paca[cpu]; | ||
580 | tpaca->kvm_hstate.kvm_vcpu = vcpu; | ||
581 | tpaca->kvm_hstate.kvm_vcore = vc; | ||
582 | smp_wmb(); | ||
583 | #ifdef CONFIG_PPC_ICP_NATIVE | ||
584 | if (vcpu->arch.ptid) { | ||
585 | tpaca->cpu_start = 0x80; | ||
586 | tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST; | ||
587 | wmb(); | ||
588 | xics_wake_cpu(cpu); | ||
589 | ++vc->n_woken; | ||
590 | } | ||
591 | #endif | ||
592 | } | ||
593 | |||
594 | static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc) | ||
595 | { | ||
596 | int i; | ||
597 | |||
598 | HMT_low(); | ||
599 | i = 0; | ||
600 | while (vc->nap_count < vc->n_woken) { | ||
601 | if (++i >= 1000000) { | ||
602 | pr_err("kvmppc_wait_for_nap timeout %d %d\n", | ||
603 | vc->nap_count, vc->n_woken); | ||
604 | break; | ||
605 | } | ||
606 | cpu_relax(); | ||
607 | } | ||
608 | HMT_medium(); | ||
609 | } | ||
610 | |||
611 | /* | ||
612 | * Check that we are on thread 0 and that any other threads in | ||
613 | * this core are off-line. | ||
614 | */ | ||
615 | static int on_primary_thread(void) | ||
616 | { | ||
617 | int cpu = smp_processor_id(); | ||
618 | int thr = cpu_thread_in_core(cpu); | ||
619 | |||
620 | if (thr) | ||
621 | return 0; | ||
622 | while (++thr < threads_per_core) | ||
623 | if (cpu_online(cpu + thr)) | ||
624 | return 0; | ||
625 | return 1; | ||
626 | } | ||
627 | |||
628 | /* | ||
629 | * Run a set of guest threads on a physical core. | ||
630 | * Called with vc->lock held. | ||
631 | */ | ||
632 | static int kvmppc_run_core(struct kvmppc_vcore *vc) | ||
633 | { | ||
634 | struct kvm_vcpu *vcpu, *vnext; | ||
635 | long ret; | ||
636 | u64 now; | ||
637 | |||
638 | /* don't start if any threads have a signal pending */ | ||
639 | list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) | ||
640 | if (signal_pending(vcpu->arch.run_task)) | ||
641 | return 0; | ||
642 | |||
643 | /* | ||
644 | * Make sure we are running on thread 0, and that | ||
645 | * secondary threads are offline. | ||
646 | * XXX we should also block attempts to bring any | ||
647 | * secondary threads online. | ||
648 | */ | ||
649 | if (threads_per_core > 1 && !on_primary_thread()) { | ||
650 | list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) | ||
651 | vcpu->arch.ret = -EBUSY; | ||
652 | goto out; | ||
653 | } | ||
654 | |||
655 | vc->n_woken = 0; | ||
656 | vc->nap_count = 0; | ||
657 | vc->entry_exit_count = 0; | ||
658 | vc->vcore_running = 1; | ||
659 | vc->in_guest = 0; | ||
660 | vc->pcpu = smp_processor_id(); | ||
661 | list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) | ||
662 | kvmppc_start_thread(vcpu); | ||
663 | vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu, | ||
664 | arch.run_list); | ||
665 | |||
666 | spin_unlock(&vc->lock); | ||
667 | |||
668 | preempt_disable(); | ||
669 | kvm_guest_enter(); | ||
670 | __kvmppc_vcore_entry(NULL, vcpu); | ||
671 | |||
672 | /* wait for secondary threads to finish writing their state to memory */ | ||
673 | spin_lock(&vc->lock); | ||
674 | if (vc->nap_count < vc->n_woken) | ||
675 | kvmppc_wait_for_nap(vc); | ||
676 | /* prevent other vcpu threads from doing kvmppc_start_thread() now */ | ||
677 | vc->vcore_running = 2; | ||
678 | spin_unlock(&vc->lock); | ||
679 | |||
680 | /* make sure updates to secondary vcpu structs are visible now */ | ||
681 | smp_mb(); | ||
682 | kvm_guest_exit(); | ||
683 | |||
684 | preempt_enable(); | ||
685 | kvm_resched(vcpu); | ||
686 | |||
687 | now = get_tb(); | ||
688 | list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { | ||
689 | /* cancel pending dec exception if dec is positive */ | ||
690 | if (now < vcpu->arch.dec_expires && | ||
691 | kvmppc_core_pending_dec(vcpu)) | ||
692 | kvmppc_core_dequeue_dec(vcpu); | ||
693 | if (!vcpu->arch.trap) { | ||
694 | if (signal_pending(vcpu->arch.run_task)) { | ||
695 | vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR; | ||
696 | vcpu->arch.ret = -EINTR; | ||
697 | } | ||
698 | continue; /* didn't get to run */ | ||
699 | } | ||
700 | ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu, | ||
701 | vcpu->arch.run_task); | ||
702 | vcpu->arch.ret = ret; | ||
703 | vcpu->arch.trap = 0; | ||
704 | } | ||
705 | |||
706 | spin_lock(&vc->lock); | ||
707 | out: | ||
708 | vc->vcore_running = 0; | ||
709 | list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, | ||
710 | arch.run_list) { | ||
711 | if (vcpu->arch.ret != RESUME_GUEST) { | ||
712 | kvmppc_remove_runnable(vc, vcpu); | ||
713 | wake_up(&vcpu->arch.cpu_run); | ||
714 | } | ||
715 | } | ||
716 | |||
717 | return 1; | ||
718 | } | ||
719 | |||
720 | static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | ||
721 | { | ||
722 | int ptid; | ||
723 | int wait_state; | ||
724 | struct kvmppc_vcore *vc; | ||
725 | DEFINE_WAIT(wait); | ||
726 | |||
727 | /* No need to go into the guest when all we would do is come right back out */ | ||
728 | if (signal_pending(current)) { | ||
729 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
730 | return -EINTR; | ||
731 | } | ||
732 | |||
733 | /* On PPC970, check that we have an RMA region */ | ||
734 | if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) | ||
735 | return -EPERM; | ||
736 | |||
737 | kvm_run->exit_reason = 0; | ||
738 | vcpu->arch.ret = RESUME_GUEST; | ||
739 | vcpu->arch.trap = 0; | ||
740 | |||
741 | flush_fp_to_thread(current); | ||
742 | flush_altivec_to_thread(current); | ||
743 | flush_vsx_to_thread(current); | ||
744 | |||
745 | /* | ||
746 | * Synchronize with other threads in this virtual core | ||
747 | */ | ||
748 | vc = vcpu->arch.vcore; | ||
749 | spin_lock(&vc->lock); | ||
750 | /* This happens the first time this is called for a vcpu */ | ||
751 | if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED) | ||
752 | --vc->n_blocked; | ||
753 | vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; | ||
754 | ptid = vc->n_runnable; | ||
755 | vcpu->arch.run_task = current; | ||
756 | vcpu->arch.kvm_run = kvm_run; | ||
757 | vcpu->arch.ptid = ptid; | ||
758 | list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); | ||
759 | ++vc->n_runnable; | ||
760 | |||
761 | wait_state = TASK_INTERRUPTIBLE; | ||
762 | while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { | ||
763 | if (signal_pending(current)) { | ||
764 | if (!vc->vcore_running) { | ||
765 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
766 | vcpu->arch.ret = -EINTR; | ||
767 | break; | ||
768 | } | ||
769 | /* have to wait for vcore to stop executing guest */ | ||
770 | wait_state = TASK_UNINTERRUPTIBLE; | ||
771 | smp_send_reschedule(vc->pcpu); | ||
772 | } | ||
773 | |||
774 | if (!vc->vcore_running && | ||
775 | vc->n_runnable + vc->n_blocked == vc->num_threads) { | ||
776 | /* we can run now */ | ||
777 | if (kvmppc_run_core(vc)) | ||
778 | continue; | ||
779 | } | ||
780 | |||
781 | if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0) | ||
782 | kvmppc_start_thread(vcpu); | ||
783 | |||
784 | /* wait for other threads to come in, or wait for vcore */ | ||
785 | prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); | ||
786 | spin_unlock(&vc->lock); | ||
787 | schedule(); | ||
788 | finish_wait(&vcpu->arch.cpu_run, &wait); | ||
789 | spin_lock(&vc->lock); | ||
790 | } | ||
791 | |||
792 | if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) | ||
793 | kvmppc_remove_runnable(vc, vcpu); | ||
794 | spin_unlock(&vc->lock); | ||
795 | |||
796 | return vcpu->arch.ret; | ||
797 | } | ||
798 | |||
799 | int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) | ||
800 | { | ||
801 | int r; | ||
802 | |||
803 | do { | ||
804 | r = kvmppc_run_vcpu(run, vcpu); | ||
805 | |||
806 | if (run->exit_reason == KVM_EXIT_PAPR_HCALL && | ||
807 | !(vcpu->arch.shregs.msr & MSR_PR)) { | ||
808 | r = kvmppc_pseries_do_hcall(vcpu); | ||
809 | kvmppc_core_deliver_interrupts(vcpu); | ||
810 | } | ||
811 | } while (r == RESUME_GUEST); | ||
812 | return r; | ||
813 | } | ||
814 | |||
815 | static long kvmppc_stt_npages(unsigned long window_size) | ||
816 | { | ||
817 | return ALIGN((window_size >> SPAPR_TCE_SHIFT) | ||
818 | * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; | ||
819 | } | ||
820 | |||
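Worked example of the sizing done by kvmppc_stt_npages(), assuming 4 KB host pages and SPAPR_TCE_SHIFT = 12 (one 64-bit TCE per 4 KB of the DMA window): for a 256 MB window,

    (256 MB >> 12) * sizeof(u64) = 65536 * 8 = 512 KB  ->  128 backing pages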
821 | static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) | ||
822 | { | ||
823 | struct kvm *kvm = stt->kvm; | ||
824 | int i; | ||
825 | |||
826 | mutex_lock(&kvm->lock); | ||
827 | list_del(&stt->list); | ||
828 | for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) | ||
829 | __free_page(stt->pages[i]); | ||
830 | kfree(stt); | ||
831 | mutex_unlock(&kvm->lock); | ||
832 | |||
833 | kvm_put_kvm(kvm); | ||
834 | } | ||
835 | |||
836 | static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
837 | { | ||
838 | struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; | ||
839 | struct page *page; | ||
840 | |||
841 | if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size)) | ||
842 | return VM_FAULT_SIGBUS; | ||
843 | |||
844 | page = stt->pages[vmf->pgoff]; | ||
845 | get_page(page); | ||
846 | vmf->page = page; | ||
847 | return 0; | ||
848 | } | ||
849 | |||
850 | static const struct vm_operations_struct kvm_spapr_tce_vm_ops = { | ||
851 | .fault = kvm_spapr_tce_fault, | ||
852 | }; | ||
853 | |||
854 | static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma) | ||
855 | { | ||
856 | vma->vm_ops = &kvm_spapr_tce_vm_ops; | ||
857 | return 0; | ||
858 | } | ||
859 | |||
860 | static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) | ||
861 | { | ||
862 | struct kvmppc_spapr_tce_table *stt = filp->private_data; | ||
863 | |||
864 | release_spapr_tce_table(stt); | ||
865 | return 0; | ||
866 | } | ||
867 | |||
868 | static struct file_operations kvm_spapr_tce_fops = { | ||
869 | .mmap = kvm_spapr_tce_mmap, | ||
870 | .release = kvm_spapr_tce_release, | ||
871 | }; | ||
872 | |||
873 | long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, | ||
874 | struct kvm_create_spapr_tce *args) | ||
875 | { | ||
876 | struct kvmppc_spapr_tce_table *stt = NULL; | ||
877 | long npages; | ||
878 | int ret = -ENOMEM; | ||
879 | int i; | ||
880 | |||
881 | /* Check this LIOBN hasn't been previously allocated */ | ||
882 | list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { | ||
883 | if (stt->liobn == args->liobn) | ||
884 | return -EBUSY; | ||
885 | } | ||
886 | |||
887 | npages = kvmppc_stt_npages(args->window_size); | ||
888 | |||
889 | stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), | ||
890 | GFP_KERNEL); | ||
891 | if (!stt) | ||
892 | goto fail; | ||
893 | |||
894 | stt->liobn = args->liobn; | ||
895 | stt->window_size = args->window_size; | ||
896 | stt->kvm = kvm; | ||
897 | |||
898 | for (i = 0; i < npages; i++) { | ||
899 | stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
900 | if (!stt->pages[i]) | ||
901 | goto fail; | ||
902 | } | ||
903 | |||
904 | kvm_get_kvm(kvm); | ||
905 | |||
906 | mutex_lock(&kvm->lock); | ||
907 | list_add(&stt->list, &kvm->arch.spapr_tce_tables); | ||
908 | |||
909 | mutex_unlock(&kvm->lock); | ||
910 | |||
911 | return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, | ||
912 | stt, O_RDWR); | ||
913 | |||
914 | fail: | ||
915 | if (stt) { | ||
916 | for (i = 0; i < npages; i++) | ||
917 | if (stt->pages[i]) | ||
918 | __free_page(stt->pages[i]); | ||
919 | |||
920 | kfree(stt); | ||
921 | } | ||
922 | return ret; | ||
923 | } | ||
924 | |||
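A rough userspace sketch of how this ioctl might be driven (illustrative only, not part of this patch; the struct and ioctl names follow the API documentation added by this series, and error handling is omitted):

    struct kvm_create_spapr_tce args = {
            .liobn       = 0x80000000,      /* example LIOBN */
            .window_size = 256 << 20,       /* 256 MB DMA window */
    };
    int tce_fd = ioctl(vm_fd, KVM_CREATE_SPAPR_TCE, &args);
    /* the returned fd exposes the TCE table pages allocated above */
    __u64 *tces = mmap(NULL, (args.window_size >> 12) * sizeof(__u64),
                       PROT_READ, MAP_SHARED, tce_fd, 0);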
925 | /* Work out RMLS (real mode limit selector) field value for a given RMA size. | ||
926 | Assumes POWER7 or PPC970. */ | ||
927 | static inline int lpcr_rmls(unsigned long rma_size) | ||
928 | { | ||
929 | switch (rma_size) { | ||
930 | case 32ul << 20: /* 32 MB */ | ||
931 | if (cpu_has_feature(CPU_FTR_ARCH_206)) | ||
932 | return 8; /* only supported on POWER7 */ | ||
933 | return -1; | ||
934 | case 64ul << 20: /* 64 MB */ | ||
935 | return 3; | ||
936 | case 128ul << 20: /* 128 MB */ | ||
937 | return 7; | ||
938 | case 256ul << 20: /* 256 MB */ | ||
939 | return 4; | ||
940 | case 1ul << 30: /* 1 GB */ | ||
941 | return 2; | ||
942 | case 16ul << 30: /* 16 GB */ | ||
943 | return 1; | ||
944 | case 256ul << 30: /* 256 GB */ | ||
945 | return 0; | ||
946 | default: | ||
947 | return -1; | ||
948 | } | ||
949 | } | ||
950 | |||
951 | static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
952 | { | ||
953 | struct kvmppc_rma_info *ri = vma->vm_file->private_data; | ||
954 | struct page *page; | ||
955 | |||
956 | if (vmf->pgoff >= ri->npages) | ||
957 | return VM_FAULT_SIGBUS; | ||
958 | |||
959 | page = pfn_to_page(ri->base_pfn + vmf->pgoff); | ||
960 | get_page(page); | ||
961 | vmf->page = page; | ||
962 | return 0; | ||
963 | } | ||
964 | |||
965 | static const struct vm_operations_struct kvm_rma_vm_ops = { | ||
966 | .fault = kvm_rma_fault, | ||
967 | }; | ||
968 | |||
969 | static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) | ||
970 | { | ||
971 | vma->vm_flags |= VM_RESERVED; | ||
972 | vma->vm_ops = &kvm_rma_vm_ops; | ||
973 | return 0; | ||
974 | } | ||
975 | |||
976 | static int kvm_rma_release(struct inode *inode, struct file *filp) | ||
977 | { | ||
978 | struct kvmppc_rma_info *ri = filp->private_data; | ||
979 | |||
980 | kvm_release_rma(ri); | ||
981 | return 0; | ||
982 | } | ||
983 | |||
984 | static struct file_operations kvm_rma_fops = { | ||
985 | .mmap = kvm_rma_mmap, | ||
986 | .release = kvm_rma_release, | ||
987 | }; | ||
988 | |||
989 | long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) | ||
990 | { | ||
991 | struct kvmppc_rma_info *ri; | ||
992 | long fd; | ||
993 | |||
994 | ri = kvm_alloc_rma(); | ||
995 | if (!ri) | ||
996 | return -ENOMEM; | ||
997 | |||
998 | fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR); | ||
999 | if (fd < 0) | ||
1000 | kvm_release_rma(ri); | ||
1001 | else | ||
1002 | ret->rma_size = ri->npages << PAGE_SHIFT; | ||
1003 | return fd; | ||
1004 | } | ||
1005 | |||
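A rough userspace sketch of the intended flow (illustrative only): allocate an RMA, mmap it, and register it as guest RAM at guest physical address 0, which is what kvmppc_core_prepare_memory_region() below looks for:

    struct kvm_allocate_rma rma;
    int rma_fd = ioctl(vm_fd, KVM_ALLOCATE_RMA, &rma);  /* fills rma.rma_size */
    void *host_va = mmap(NULL, rma.rma_size, PROT_READ | PROT_WRITE,
                         MAP_SHARED, rma_fd, 0);
    struct kvm_userspace_memory_region mem = {
            .slot            = 0,
            .guest_phys_addr = 0,
            .memory_size     = rma.rma_size,
            .userspace_addr  = (unsigned long)host_va,
    };
    ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);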
1006 | static struct page *hva_to_page(unsigned long addr) | ||
1007 | { | ||
1008 | struct page *page[1]; | ||
1009 | int npages; | ||
1010 | |||
1011 | might_sleep(); | ||
1012 | |||
1013 | npages = get_user_pages_fast(addr, 1, 1, page); | ||
1014 | |||
1015 | if (unlikely(npages != 1)) | ||
1016 | return NULL; | ||
1017 | |||
1018 | return page[0]; | ||
1019 | } | ||
1020 | |||
1021 | int kvmppc_core_prepare_memory_region(struct kvm *kvm, | ||
1022 | struct kvm_userspace_memory_region *mem) | ||
1023 | { | ||
1024 | unsigned long psize, porder; | ||
1025 | unsigned long i, npages, totalpages; | ||
1026 | unsigned long pg_ix; | ||
1027 | struct kvmppc_pginfo *pginfo; | ||
1028 | unsigned long hva; | ||
1029 | struct kvmppc_rma_info *ri = NULL; | ||
1030 | struct page *page; | ||
1031 | |||
1032 | /* For now, only allow 16MB pages */ | ||
1033 | porder = LARGE_PAGE_ORDER; | ||
1034 | psize = 1ul << porder; | ||
1035 | if ((mem->memory_size & (psize - 1)) || | ||
1036 | (mem->guest_phys_addr & (psize - 1))) { | ||
1037 | pr_err("bad memory_size=%llx @ %llx\n", | ||
1038 | mem->memory_size, mem->guest_phys_addr); | ||
1039 | return -EINVAL; | ||
1040 | } | ||
1041 | |||
1042 | npages = mem->memory_size >> porder; | ||
1043 | totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder; | ||
1044 | |||
1045 | /* More memory than we have space to track? */ | ||
1046 | if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER))) | ||
1047 | return -EINVAL; | ||
1048 | |||
1049 | /* Do we already have an RMA registered? */ | ||
1050 | if (mem->guest_phys_addr == 0 && kvm->arch.rma) | ||
1051 | return -EINVAL; | ||
1052 | |||
1053 | if (totalpages > kvm->arch.ram_npages) | ||
1054 | kvm->arch.ram_npages = totalpages; | ||
1055 | |||
1056 | /* Is this one of our preallocated RMAs? */ | ||
1057 | if (mem->guest_phys_addr == 0) { | ||
1058 | struct vm_area_struct *vma; | ||
1059 | |||
1060 | down_read(¤t->mm->mmap_sem); | ||
1061 | vma = find_vma(current->mm, mem->userspace_addr); | ||
1062 | if (vma && vma->vm_file && | ||
1063 | vma->vm_file->f_op == &kvm_rma_fops && | ||
1064 | mem->userspace_addr == vma->vm_start) | ||
1065 | ri = vma->vm_file->private_data; | ||
1066 | up_read(¤t->mm->mmap_sem); | ||
1067 | if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) { | ||
1068 | pr_err("CPU requires an RMO\n"); | ||
1069 | return -EINVAL; | ||
1070 | } | ||
1071 | } | ||
1072 | |||
1073 | if (ri) { | ||
1074 | unsigned long rma_size; | ||
1075 | unsigned long lpcr; | ||
1076 | long rmls; | ||
1077 | |||
1078 | rma_size = ri->npages << PAGE_SHIFT; | ||
1079 | if (rma_size > mem->memory_size) | ||
1080 | rma_size = mem->memory_size; | ||
1081 | rmls = lpcr_rmls(rma_size); | ||
1082 | if (rmls < 0) { | ||
1083 | pr_err("Can't use RMA of 0x%lx bytes\n", rma_size); | ||
1084 | return -EINVAL; | ||
1085 | } | ||
1086 | atomic_inc(&ri->use_count); | ||
1087 | kvm->arch.rma = ri; | ||
1088 | kvm->arch.n_rma_pages = rma_size >> porder; | ||
1089 | |||
1090 | /* Update LPCR and RMOR */ | ||
1091 | lpcr = kvm->arch.lpcr; | ||
1092 | if (cpu_has_feature(CPU_FTR_ARCH_201)) { | ||
1093 | /* PPC970; insert RMLS value (split field) in HID4 */ | ||
1094 | lpcr &= ~((1ul << HID4_RMLS0_SH) | | ||
1095 | (3ul << HID4_RMLS2_SH)); | ||
1096 | lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) | | ||
1097 | ((rmls & 3) << HID4_RMLS2_SH); | ||
1098 | /* RMOR is also in HID4 */ | ||
1099 | lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff) | ||
1100 | << HID4_RMOR_SH; | ||
1101 | } else { | ||
1102 | /* POWER7 */ | ||
1103 | lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L); | ||
1104 | lpcr |= rmls << LPCR_RMLS_SH; | ||
1105 | kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; | ||
1106 | } | ||
1107 | kvm->arch.lpcr = lpcr; | ||
1108 | pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n", | ||
1109 | ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); | ||
1110 | } | ||
1111 | |||
1112 | pg_ix = mem->guest_phys_addr >> porder; | ||
1113 | pginfo = kvm->arch.ram_pginfo + pg_ix; | ||
1114 | for (i = 0; i < npages; ++i, ++pg_ix) { | ||
1115 | if (ri && pg_ix < kvm->arch.n_rma_pages) { | ||
1116 | pginfo[i].pfn = ri->base_pfn + | ||
1117 | (pg_ix << (porder - PAGE_SHIFT)); | ||
1118 | continue; | ||
1119 | } | ||
1120 | hva = mem->userspace_addr + (i << porder); | ||
1121 | page = hva_to_page(hva); | ||
1122 | if (!page) { | ||
1123 | pr_err("oops, no pfn for hva %lx\n", hva); | ||
1124 | goto err; | ||
1125 | } | ||
1126 | /* Check it's a 16MB page */ | ||
1127 | if (!PageHead(page) || | ||
1128 | compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) { | ||
1129 | pr_err("page at %lx isn't 16MB (o=%d)\n", | ||
1130 | hva, compound_order(page)); | ||
1131 | goto err; | ||
1132 | } | ||
1133 | pginfo[i].pfn = page_to_pfn(page); | ||
1134 | } | ||
1135 | |||
1136 | return 0; | ||
1137 | |||
1138 | err: | ||
1139 | return -EINVAL; | ||
1140 | } | ||
1141 | |||
1142 | void kvmppc_core_commit_memory_region(struct kvm *kvm, | ||
1143 | struct kvm_userspace_memory_region *mem) | ||
1144 | { | ||
1145 | if (mem->guest_phys_addr == 0 && mem->memory_size != 0 && | ||
1146 | !kvm->arch.rma) | ||
1147 | kvmppc_map_vrma(kvm, mem); | ||
1148 | } | ||
1149 | |||
1150 | int kvmppc_core_init_vm(struct kvm *kvm) | ||
1151 | { | ||
1152 | long r; | ||
1153 | unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER); | ||
1154 | long err = -ENOMEM; | ||
1155 | unsigned long lpcr; | ||
1156 | |||
1157 | /* Allocate hashed page table */ | ||
1158 | r = kvmppc_alloc_hpt(kvm); | ||
1159 | if (r) | ||
1160 | return r; | ||
1161 | |||
1162 | INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); | ||
1163 | |||
1164 | kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), | ||
1165 | GFP_KERNEL); | ||
1166 | if (!kvm->arch.ram_pginfo) { | ||
1167 | pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n", | ||
1168 | npages * sizeof(struct kvmppc_pginfo)); | ||
1169 | goto out_free; | ||
1170 | } | ||
1171 | |||
1172 | kvm->arch.ram_npages = 0; | ||
1173 | kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER; | ||
1174 | kvm->arch.ram_porder = LARGE_PAGE_ORDER; | ||
1175 | kvm->arch.rma = NULL; | ||
1176 | kvm->arch.n_rma_pages = 0; | ||
1177 | |||
1178 | kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); | ||
1179 | |||
1180 | if (cpu_has_feature(CPU_FTR_ARCH_201)) { | ||
1181 | /* PPC970; HID4 is effectively the LPCR */ | ||
1182 | unsigned long lpid = kvm->arch.lpid; | ||
1183 | kvm->arch.host_lpid = 0; | ||
1184 | kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4); | ||
1185 | lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH)); | ||
1186 | lpcr |= ((lpid >> 4) << HID4_LPID1_SH) | | ||
1187 | ((lpid & 0xf) << HID4_LPID5_SH); | ||
1188 | } else { | ||
1189 | /* POWER7; init LPCR for virtual RMA mode */ | ||
1190 | kvm->arch.host_lpid = mfspr(SPRN_LPID); | ||
1191 | kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); | ||
1192 | lpcr &= LPCR_PECE | LPCR_LPES; | ||
1193 | lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | | ||
1194 | LPCR_VPM0 | LPCR_VRMA_L; | ||
1195 | } | ||
1196 | kvm->arch.lpcr = lpcr; | ||
1197 | |||
1198 | return 0; | ||
1199 | |||
1200 | out_free: | ||
1201 | kvmppc_free_hpt(kvm); | ||
1202 | return err; | ||
1203 | } | ||
1204 | |||
1205 | void kvmppc_core_destroy_vm(struct kvm *kvm) | ||
1206 | { | ||
1207 | struct kvmppc_pginfo *pginfo; | ||
1208 | unsigned long i; | ||
1209 | |||
1210 | if (kvm->arch.ram_pginfo) { | ||
1211 | pginfo = kvm->arch.ram_pginfo; | ||
1212 | kvm->arch.ram_pginfo = NULL; | ||
1213 | for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i) | ||
1214 | if (pginfo[i].pfn) | ||
1215 | put_page(pfn_to_page(pginfo[i].pfn)); | ||
1216 | kfree(pginfo); | ||
1217 | } | ||
1218 | if (kvm->arch.rma) { | ||
1219 | kvm_release_rma(kvm->arch.rma); | ||
1220 | kvm->arch.rma = NULL; | ||
1221 | } | ||
1222 | |||
1223 | kvmppc_free_hpt(kvm); | ||
1224 | WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); | ||
1225 | } | ||
1226 | |||
1227 | /* These are stubs for now */ | ||
1228 | void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) | ||
1229 | { | ||
1230 | } | ||
1231 | |||
1232 | /* We don't need to emulate any privileged instructions or dcbz */ | ||
1233 | int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, | ||
1234 | unsigned int inst, int *advance) | ||
1235 | { | ||
1236 | return EMULATE_FAIL; | ||
1237 | } | ||
1238 | |||
1239 | int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) | ||
1240 | { | ||
1241 | return EMULATE_FAIL; | ||
1242 | } | ||
1243 | |||
1244 | int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) | ||
1245 | { | ||
1246 | return EMULATE_FAIL; | ||
1247 | } | ||
1248 | |||
1249 | static int kvmppc_book3s_hv_init(void) | ||
1250 | { | ||
1251 | int r; | ||
1252 | |||
1253 | r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); | ||
1254 | |||
1255 | if (r) | ||
1256 | return r; | ||
1257 | |||
1258 | r = kvmppc_mmu_hv_init(); | ||
1259 | |||
1260 | return r; | ||
1261 | } | ||
1262 | |||
1263 | static void kvmppc_book3s_hv_exit(void) | ||
1264 | { | ||
1265 | kvm_exit(); | ||
1266 | } | ||
1267 | |||
1268 | module_init(kvmppc_book3s_hv_init); | ||
1269 | module_exit(kvmppc_book3s_hv_exit); | ||
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c new file mode 100644 index 000000000000..d43120355eec --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_builtin.c | |||
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License, version 2, as | ||
6 | * published by the Free Software Foundation. | ||
7 | */ | ||
8 | |||
9 | #include <linux/kvm_host.h> | ||
10 | #include <linux/preempt.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/spinlock.h> | ||
13 | #include <linux/bootmem.h> | ||
14 | #include <linux/init.h> | ||
15 | |||
16 | #include <asm/cputable.h> | ||
17 | #include <asm/kvm_ppc.h> | ||
18 | #include <asm/kvm_book3s.h> | ||
19 | |||
20 | /* | ||
21 | * This maintains a list of RMAs (real mode areas) for KVM guests to use. | ||
22 | * Each RMA has to be physically contiguous and of a size that the | ||
23 | * hardware supports. PPC970 and POWER7 support 64MB, 128MB and 256MB, | ||
24 | * and other larger sizes. Since we are unlikely to be able to allocate that | ||
25 | * much physically contiguous memory after the system is up and running, | ||
26 | * we preallocate a set of RMAs in early boot for KVM to use. | ||
27 | */ | ||
28 | static unsigned long kvm_rma_size = 64 << 20; /* 64MB */ | ||
29 | static unsigned long kvm_rma_count; | ||
30 | |||
31 | static int __init early_parse_rma_size(char *p) | ||
32 | { | ||
33 | if (!p) | ||
34 | return 1; | ||
35 | |||
36 | kvm_rma_size = memparse(p, &p); | ||
37 | |||
38 | return 0; | ||
39 | } | ||
40 | early_param("kvm_rma_size", early_parse_rma_size); | ||
41 | |||
42 | static int __init early_parse_rma_count(char *p) | ||
43 | { | ||
44 | if (!p) | ||
45 | return 1; | ||
46 | |||
47 | kvm_rma_count = simple_strtoul(p, NULL, 0); | ||
48 | |||
49 | return 0; | ||
50 | } | ||
51 | early_param("kvm_rma_count", early_parse_rma_count); | ||
52 | |||
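Both are ordinary kernel boot parameters; for example (assuming the size passes the lpcr_rmls() check below),

    kvm_rma_size=128M kvm_rma_count=4

would reserve four contiguous 128 MB regions at boot for guests to claim via KVM_ALLOCATE_RMA.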
53 | static struct kvmppc_rma_info *rma_info; | ||
54 | static LIST_HEAD(free_rmas); | ||
55 | static DEFINE_SPINLOCK(rma_lock); | ||
56 | |||
57 | /* Work out RMLS (real mode limit selector) field value for a given RMA size. | ||
58 | Assumes POWER7 or PPC970. */ | ||
59 | static inline int lpcr_rmls(unsigned long rma_size) | ||
60 | { | ||
61 | switch (rma_size) { | ||
62 | case 32ul << 20: /* 32 MB */ | ||
63 | if (cpu_has_feature(CPU_FTR_ARCH_206)) | ||
64 | return 8; /* only supported on POWER7 */ | ||
65 | return -1; | ||
66 | case 64ul << 20: /* 64 MB */ | ||
67 | return 3; | ||
68 | case 128ul << 20: /* 128 MB */ | ||
69 | return 7; | ||
70 | case 256ul << 20: /* 256 MB */ | ||
71 | return 4; | ||
72 | case 1ul << 30: /* 1 GB */ | ||
73 | return 2; | ||
74 | case 16ul << 30: /* 16 GB */ | ||
75 | return 1; | ||
76 | case 256ul << 30: /* 256 GB */ | ||
77 | return 0; | ||
78 | default: | ||
79 | return -1; | ||
80 | } | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Called at boot time while the bootmem allocator is active, | ||
85 | * to allocate contiguous physical memory for the real memory | ||
86 | * areas for guests. | ||
87 | */ | ||
88 | void kvm_rma_init(void) | ||
89 | { | ||
90 | unsigned long i; | ||
91 | unsigned long j, npages; | ||
92 | void *rma; | ||
93 | struct page *pg; | ||
94 | |||
95 | /* Only do this on PPC970 in HV mode */ | ||
96 | if (!cpu_has_feature(CPU_FTR_HVMODE) || | ||
97 | !cpu_has_feature(CPU_FTR_ARCH_201)) | ||
98 | return; | ||
99 | |||
100 | if (!kvm_rma_size || !kvm_rma_count) | ||
101 | return; | ||
102 | |||
103 | /* Check that the requested size is one supported in hardware */ | ||
104 | if (lpcr_rmls(kvm_rma_size) < 0) { | ||
105 | pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); | ||
106 | return; | ||
107 | } | ||
108 | |||
109 | npages = kvm_rma_size >> PAGE_SHIFT; | ||
110 | rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info)); | ||
111 | for (i = 0; i < kvm_rma_count; ++i) { | ||
112 | rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size); | ||
113 | pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma, | ||
114 | kvm_rma_size >> 20); | ||
115 | rma_info[i].base_virt = rma; | ||
116 | rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT; | ||
117 | rma_info[i].npages = npages; | ||
118 | list_add_tail(&rma_info[i].list, &free_rmas); | ||
119 | atomic_set(&rma_info[i].use_count, 0); | ||
120 | |||
121 | pg = pfn_to_page(rma_info[i].base_pfn); | ||
122 | for (j = 0; j < npages; ++j) { | ||
123 | atomic_inc(&pg->_count); | ||
124 | ++pg; | ||
125 | } | ||
126 | } | ||
127 | } | ||
128 | |||
129 | struct kvmppc_rma_info *kvm_alloc_rma(void) | ||
130 | { | ||
131 | struct kvmppc_rma_info *ri; | ||
132 | |||
133 | ri = NULL; | ||
134 | spin_lock(&rma_lock); | ||
135 | if (!list_empty(&free_rmas)) { | ||
136 | ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list); | ||
137 | list_del(&ri->list); | ||
138 | atomic_inc(&ri->use_count); | ||
139 | } | ||
140 | spin_unlock(&rma_lock); | ||
141 | return ri; | ||
142 | } | ||
143 | EXPORT_SYMBOL_GPL(kvm_alloc_rma); | ||
144 | |||
145 | void kvm_release_rma(struct kvmppc_rma_info *ri) | ||
146 | { | ||
147 | if (atomic_dec_and_test(&ri->use_count)) { | ||
148 | spin_lock(&rma_lock); | ||
149 | list_add_tail(&ri->list, &free_rmas); | ||
150 | spin_unlock(&rma_lock); | ||
151 | |||
152 | } | ||
153 | } | ||
154 | EXPORT_SYMBOL_GPL(kvm_release_rma); | ||
155 | |||
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S new file mode 100644 index 000000000000..3f7b674dd4bf --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S | |||
@@ -0,0 +1,166 @@ | |||
1 | /* | ||
2 | * This program is free software; you can redistribute it and/or modify | ||
3 | * it under the terms of the GNU General Public License, version 2, as | ||
4 | * published by the Free Software Foundation. | ||
5 | * | ||
6 | * This program is distributed in the hope that it will be useful, | ||
7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
9 | * GNU General Public License for more details. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * along with this program; if not, write to the Free Software | ||
13 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
14 | * | ||
15 | * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | ||
16 | * | ||
17 | * Derived from book3s_interrupts.S, which is: | ||
18 | * Copyright SUSE Linux Products GmbH 2009 | ||
19 | * | ||
20 | * Authors: Alexander Graf <agraf@suse.de> | ||
21 | */ | ||
22 | |||
23 | #include <asm/ppc_asm.h> | ||
24 | #include <asm/kvm_asm.h> | ||
25 | #include <asm/reg.h> | ||
26 | #include <asm/page.h> | ||
27 | #include <asm/asm-offsets.h> | ||
28 | #include <asm/exception-64s.h> | ||
29 | #include <asm/ppc-opcode.h> | ||
30 | |||
31 | /***************************************************************************** | ||
32 | * * | ||
33 | * Guest entry / exit code that is in kernel module memory (vmalloc) * | ||
34 | * * | ||
35 | ****************************************************************************/ | ||
36 | |||
37 | /* Registers: | ||
38 | * r4: vcpu pointer | ||
39 | */ | ||
40 | _GLOBAL(__kvmppc_vcore_entry) | ||
41 | |||
42 | /* Write correct stack frame */ | ||
43 | mflr r0 | ||
44 | std r0,PPC_LR_STKOFF(r1) | ||
45 | |||
46 | /* Save host state to the stack */ | ||
47 | stdu r1, -SWITCH_FRAME_SIZE(r1) | ||
48 | |||
49 | /* Save non-volatile registers (r14 - r31) */ | ||
50 | SAVE_NVGPRS(r1) | ||
51 | |||
52 | /* Save host DSCR */ | ||
53 | BEGIN_FTR_SECTION | ||
54 | mfspr r3, SPRN_DSCR | ||
55 | std r3, HSTATE_DSCR(r13) | ||
56 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) | ||
57 | |||
58 | /* Save host DABR */ | ||
59 | mfspr r3, SPRN_DABR | ||
60 | std r3, HSTATE_DABR(r13) | ||
61 | |||
62 | /* Hard-disable interrupts */ | ||
63 | mfmsr r10 | ||
64 | std r10, HSTATE_HOST_MSR(r13) | ||
65 | rldicl r10,r10,48,1 | ||
66 | rotldi r10,r10,16 | ||
67 | mtmsrd r10,1 | ||
68 | |||
69 | /* Save host PMU registers and load guest PMU registers */ | ||
70 | /* R4 is live here (vcpu pointer) but not r3 or r5 */ | ||
71 | li r3, 1 | ||
72 | sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ | ||
73 | mfspr r7, SPRN_MMCR0 /* save MMCR0 */ | ||
74 | mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */ | ||
75 | isync | ||
76 | ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ | ||
77 | lbz r5, LPPACA_PMCINUSE(r3) | ||
78 | cmpwi r5, 0 | ||
79 | beq 31f /* skip if not */ | ||
80 | mfspr r5, SPRN_MMCR1 | ||
81 | mfspr r6, SPRN_MMCRA | ||
82 | std r7, HSTATE_MMCR(r13) | ||
83 | std r5, HSTATE_MMCR + 8(r13) | ||
84 | std r6, HSTATE_MMCR + 16(r13) | ||
85 | mfspr r3, SPRN_PMC1 | ||
86 | mfspr r5, SPRN_PMC2 | ||
87 | mfspr r6, SPRN_PMC3 | ||
88 | mfspr r7, SPRN_PMC4 | ||
89 | mfspr r8, SPRN_PMC5 | ||
90 | mfspr r9, SPRN_PMC6 | ||
91 | BEGIN_FTR_SECTION | ||
92 | mfspr r10, SPRN_PMC7 | ||
93 | mfspr r11, SPRN_PMC8 | ||
94 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) | ||
95 | stw r3, HSTATE_PMC(r13) | ||
96 | stw r5, HSTATE_PMC + 4(r13) | ||
97 | stw r6, HSTATE_PMC + 8(r13) | ||
98 | stw r7, HSTATE_PMC + 12(r13) | ||
99 | stw r8, HSTATE_PMC + 16(r13) | ||
100 | stw r9, HSTATE_PMC + 20(r13) | ||
101 | BEGIN_FTR_SECTION | ||
102 | stw r10, HSTATE_PMC + 24(r13) | ||
103 | stw r11, HSTATE_PMC + 28(r13) | ||
104 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) | ||
105 | 31: | ||
106 | |||
107 | /* | ||
108 | * Put whatever is in the decrementer into the | ||
109 | * hypervisor decrementer. | ||
110 | */ | ||
111 | mfspr r8,SPRN_DEC | ||
112 | mftb r7 | ||
113 | mtspr SPRN_HDEC,r8 | ||
114 | extsw r8,r8 | ||
115 | add r8,r8,r7 | ||
116 | std r8,HSTATE_DECEXP(r13) | ||
117 | |||
118 | /* | ||
119 | * On PPC970, if the guest vcpu has an external interrupt pending, | ||
120 | * send ourselves an IPI so as to interrupt the guest once it | ||
121 | * enables interrupts. (It must have interrupts disabled, | ||
122 | * otherwise we would already have delivered the interrupt.) | ||
123 | */ | ||
124 | BEGIN_FTR_SECTION | ||
125 | ld r0, VCPU_PENDING_EXC(r4) | ||
126 | li r7, (1 << BOOK3S_IRQPRIO_EXTERNAL) | ||
127 | oris r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h | ||
128 | and. r0, r0, r7 | ||
129 | beq 32f | ||
130 | mr r31, r4 | ||
131 | lhz r3, PACAPACAINDEX(r13) | ||
132 | bl smp_send_reschedule | ||
133 | nop | ||
134 | mr r4, r31 | ||
135 | 32: | ||
136 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) | ||
137 | |||
138 | /* Jump to partition switch code */ | ||
139 | bl .kvmppc_hv_entry_trampoline | ||
140 | nop | ||
141 | |||
142 | /* | ||
143 | * We return here in virtual mode after the guest exits | ||
144 | * with something that we can't handle in real mode. | ||
145 | * Interrupts are enabled again at this point. | ||
146 | */ | ||
147 | |||
148 | .global kvmppc_handler_highmem | ||
149 | kvmppc_handler_highmem: | ||
150 | |||
151 | /* | ||
152 | * Register usage at this point: | ||
153 | * | ||
154 | * R1 = host R1 | ||
155 | * R2 = host R2 | ||
156 | * R12 = exit handler id | ||
157 | * R13 = PACA | ||
158 | */ | ||
159 | |||
160 | /* Restore non-volatile host registers (r14 - r31) */ | ||
161 | REST_NVGPRS(r1) | ||
162 | |||
163 | addi r1, r1, SWITCH_FRAME_SIZE | ||
164 | ld r0, PPC_LR_STKOFF(r1) | ||
165 | mtlr r0 | ||
166 | blr | ||
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c new file mode 100644 index 000000000000..fcfe6b055558 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c | |||
@@ -0,0 +1,370 @@ | |||
1 | /* | ||
2 | * This program is free software; you can redistribute it and/or modify | ||
3 | * it under the terms of the GNU General Public License, version 2, as | ||
4 | * published by the Free Software Foundation. | ||
5 | * | ||
6 | * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | ||
7 | */ | ||
8 | |||
9 | #include <linux/types.h> | ||
10 | #include <linux/string.h> | ||
11 | #include <linux/kvm.h> | ||
12 | #include <linux/kvm_host.h> | ||
13 | #include <linux/hugetlb.h> | ||
14 | |||
15 | #include <asm/tlbflush.h> | ||
16 | #include <asm/kvm_ppc.h> | ||
17 | #include <asm/kvm_book3s.h> | ||
18 | #include <asm/mmu-hash64.h> | ||
19 | #include <asm/hvcall.h> | ||
20 | #include <asm/synch.h> | ||
21 | #include <asm/ppc-opcode.h> | ||
22 | |||
23 | /* For now use fixed-size 16MB page table */ | ||
24 | #define HPT_ORDER 24 | ||
25 | #define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ | ||
26 | #define HPT_HASH_MASK (HPT_NPTEG - 1) | ||
27 | |||
28 | #define HPTE_V_HVLOCK 0x40UL | ||
29 | |||
30 | static inline long lock_hpte(unsigned long *hpte, unsigned long bits) | ||
31 | { | ||
32 | unsigned long tmp, old; | ||
33 | |||
34 | asm volatile(" ldarx %0,0,%2\n" | ||
35 | " and. %1,%0,%3\n" | ||
36 | " bne 2f\n" | ||
37 | " ori %0,%0,%4\n" | ||
38 | " stdcx. %0,0,%2\n" | ||
39 | " beq+ 2f\n" | ||
40 | " li %1,%3\n" | ||
41 | "2: isync" | ||
42 | : "=&r" (tmp), "=&r" (old) | ||
43 | : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) | ||
44 | : "cc", "memory"); | ||
45 | return old == 0; | ||
46 | } | ||
47 | |||
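Note on the locking convention: lock_hpte() is a single ldarx/stdcx. test-and-set over the first HPTE doubleword. It fails (returns 0) if any bit in `bits` is already set or if the store-conditional loses its reservation, so passing HPTE_V_HVLOCK | HPTE_V_VALID both claims the lock and confirms the slot is empty (which kvmppc_h_enter() relies on), while callers that pass only HPTE_V_HVLOCK spin with cpu_relax() until the lock is free.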
48 | long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, | ||
49 | long pte_index, unsigned long pteh, unsigned long ptel) | ||
50 | { | ||
51 | unsigned long porder; | ||
52 | struct kvm *kvm = vcpu->kvm; | ||
53 | unsigned long i, lpn, pa; | ||
54 | unsigned long *hpte; | ||
55 | |||
56 | /* only handle 4k, 64k and 16M pages for now */ | ||
57 | porder = 12; | ||
58 | if (pteh & HPTE_V_LARGE) { | ||
59 | if (cpu_has_feature(CPU_FTR_ARCH_206) && | ||
60 | (ptel & 0xf000) == 0x1000) { | ||
61 | /* 64k page */ | ||
62 | porder = 16; | ||
63 | } else if ((ptel & 0xff000) == 0) { | ||
64 | /* 16M page */ | ||
65 | porder = 24; | ||
66 | /* lowest AVA bit must be 0 for 16M pages */ | ||
67 | if (pteh & 0x80) | ||
68 | return H_PARAMETER; | ||
69 | } else | ||
70 | return H_PARAMETER; | ||
71 | } | ||
72 | lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder; | ||
73 | if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder) | ||
74 | return H_PARAMETER; | ||
75 | pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT; | ||
76 | if (!pa) | ||
77 | return H_PARAMETER; | ||
78 | /* Check WIMG */ | ||
79 | if ((ptel & HPTE_R_WIMG) != HPTE_R_M && | ||
80 | (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M)) | ||
81 | return H_PARAMETER; | ||
82 | pteh &= ~0x60UL; | ||
83 | ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize); | ||
84 | ptel |= pa; | ||
85 | if (pte_index >= (HPT_NPTEG << 3)) | ||
86 | return H_PARAMETER; | ||
87 | if (likely((flags & H_EXACT) == 0)) { | ||
88 | pte_index &= ~7UL; | ||
89 | hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); | ||
90 | for (i = 0; ; ++i) { | ||
91 | if (i == 8) | ||
92 | return H_PTEG_FULL; | ||
93 | if ((*hpte & HPTE_V_VALID) == 0 && | ||
94 | lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) | ||
95 | break; | ||
96 | hpte += 2; | ||
97 | } | ||
98 | } else { | ||
99 | i = 0; | ||
100 | hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); | ||
101 | if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) | ||
102 | return H_PTEG_FULL; | ||
103 | } | ||
104 | hpte[1] = ptel; | ||
105 | eieio(); | ||
106 | hpte[0] = pteh; | ||
107 | asm volatile("ptesync" : : : "memory"); | ||
108 | atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt); | ||
109 | vcpu->arch.gpr[4] = pte_index + i; | ||
110 | return H_SUCCESS; | ||
111 | } | ||
112 | |||
113 | static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, | ||
114 | unsigned long pte_index) | ||
115 | { | ||
116 | unsigned long rb, va_low; | ||
117 | |||
118 | rb = (v & ~0x7fUL) << 16; /* AVA field */ | ||
119 | va_low = pte_index >> 3; | ||
120 | if (v & HPTE_V_SECONDARY) | ||
121 | va_low = ~va_low; | ||
122 | /* xor vsid from AVA */ | ||
123 | if (!(v & HPTE_V_1TB_SEG)) | ||
124 | va_low ^= v >> 12; | ||
125 | else | ||
126 | va_low ^= v >> 24; | ||
127 | va_low &= 0x7ff; | ||
128 | if (v & HPTE_V_LARGE) { | ||
129 | rb |= 1; /* L field */ | ||
130 | if (cpu_has_feature(CPU_FTR_ARCH_206) && | ||
131 | (r & 0xff000)) { | ||
132 | /* non-16MB large page, must be 64k */ | ||
133 | /* (masks depend on page size) */ | ||
134 | rb |= 0x1000; /* page encoding in LP field */ | ||
135 | rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ | ||
136 | rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */ | ||
137 | } | ||
138 | } else { | ||
139 | /* 4kB page */ | ||
140 | rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */ | ||
141 | } | ||
142 | rb |= (v >> 54) & 0x300; /* B field */ | ||
143 | return rb; | ||
144 | } | ||
145 | |||
146 | #define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) | ||
147 | |||
148 | static inline int try_lock_tlbie(unsigned int *lock) | ||
149 | { | ||
150 | unsigned int tmp, old; | ||
151 | unsigned int token = LOCK_TOKEN; | ||
152 | |||
153 | asm volatile("1:lwarx %1,0,%2\n" | ||
154 | " cmpwi cr0,%1,0\n" | ||
155 | " bne 2f\n" | ||
156 | " stwcx. %3,0,%2\n" | ||
157 | " bne- 1b\n" | ||
158 | " isync\n" | ||
159 | "2:" | ||
160 | : "=&r" (tmp), "=&r" (old) | ||
161 | : "r" (lock), "r" (token) | ||
162 | : "cc", "memory"); | ||
163 | return old == 0; | ||
164 | } | ||
165 | |||
166 | long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, | ||
167 | unsigned long pte_index, unsigned long avpn, | ||
168 | unsigned long va) | ||
169 | { | ||
170 | struct kvm *kvm = vcpu->kvm; | ||
171 | unsigned long *hpte; | ||
172 | unsigned long v, r, rb; | ||
173 | |||
174 | if (pte_index >= (HPT_NPTEG << 3)) | ||
175 | return H_PARAMETER; | ||
176 | hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); | ||
177 | while (!lock_hpte(hpte, HPTE_V_HVLOCK)) | ||
178 | cpu_relax(); | ||
179 | if ((hpte[0] & HPTE_V_VALID) == 0 || | ||
180 | ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) || | ||
181 | ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) { | ||
182 | hpte[0] &= ~HPTE_V_HVLOCK; | ||
183 | return H_NOT_FOUND; | ||
184 | } | ||
185 | if (atomic_read(&kvm->online_vcpus) == 1) | ||
186 | flags |= H_LOCAL; | ||
187 | vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK; | ||
188 | vcpu->arch.gpr[5] = r = hpte[1]; | ||
189 | rb = compute_tlbie_rb(v, r, pte_index); | ||
190 | hpte[0] = 0; | ||
191 | if (!(flags & H_LOCAL)) { | ||
192 | while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) | ||
193 | cpu_relax(); | ||
194 | asm volatile("ptesync" : : : "memory"); | ||
195 | asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" | ||
196 | : : "r" (rb), "r" (kvm->arch.lpid)); | ||
197 | asm volatile("ptesync" : : : "memory"); | ||
198 | kvm->arch.tlbie_lock = 0; | ||
199 | } else { | ||
200 | asm volatile("ptesync" : : : "memory"); | ||
201 | asm volatile("tlbiel %0" : : "r" (rb)); | ||
202 | asm volatile("ptesync" : : : "memory"); | ||
203 | } | ||
204 | return H_SUCCESS; | ||
205 | } | ||
206 | |||
207 | long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) | ||
208 | { | ||
209 | struct kvm *kvm = vcpu->kvm; | ||
210 | unsigned long *args = &vcpu->arch.gpr[4]; | ||
211 | unsigned long *hp, tlbrb[4]; | ||
212 | long int i, found; | ||
213 | long int n_inval = 0; | ||
214 | unsigned long flags, req, pte_index; | ||
215 | long int local = 0; | ||
216 | long int ret = H_SUCCESS; | ||
217 | |||
218 | if (atomic_read(&kvm->online_vcpus) == 1) | ||
219 | local = 1; | ||
220 | for (i = 0; i < 4; ++i) { | ||
221 | pte_index = args[i * 2]; | ||
222 | flags = pte_index >> 56; | ||
223 | pte_index &= ((1ul << 56) - 1); | ||
224 | req = flags >> 6; | ||
225 | flags &= 3; | ||
226 | if (req == 3) | ||
227 | break; | ||
228 | if (req != 1 || flags == 3 || | ||
229 | pte_index >= (HPT_NPTEG << 3)) { | ||
230 | /* parameter error */ | ||
231 | args[i * 2] = ((0xa0 | flags) << 56) + pte_index; | ||
232 | ret = H_PARAMETER; | ||
233 | break; | ||
234 | } | ||
235 | hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); | ||
236 | while (!lock_hpte(hp, HPTE_V_HVLOCK)) | ||
237 | cpu_relax(); | ||
238 | found = 0; | ||
239 | if (hp[0] & HPTE_V_VALID) { | ||
240 | switch (flags & 3) { | ||
241 | case 0: /* absolute */ | ||
242 | found = 1; | ||
243 | break; | ||
244 | case 1: /* andcond */ | ||
245 | if (!(hp[0] & args[i * 2 + 1])) | ||
246 | found = 1; | ||
247 | break; | ||
248 | case 2: /* AVPN */ | ||
249 | if ((hp[0] & ~0x7fUL) == args[i * 2 + 1]) | ||
250 | found = 1; | ||
251 | break; | ||
252 | } | ||
253 | } | ||
254 | if (!found) { | ||
255 | hp[0] &= ~HPTE_V_HVLOCK; | ||
256 | args[i * 2] = ((0x90 | flags) << 56) + pte_index; | ||
257 | continue; | ||
258 | } | ||
259 | /* insert R and C bits from PTE */ | ||
260 | flags |= (hp[1] >> 5) & 0x0c; | ||
261 | args[i * 2] = ((0x80 | flags) << 56) + pte_index; | ||
262 | tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index); | ||
263 | hp[0] = 0; | ||
264 | } | ||
265 | if (n_inval == 0) | ||
266 | return ret; | ||
267 | |||
268 | if (!local) { | ||
269 | while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) | ||
270 | cpu_relax(); | ||
271 | asm volatile("ptesync" : : : "memory"); | ||
272 | for (i = 0; i < n_inval; ++i) | ||
273 | asm volatile(PPC_TLBIE(%1,%0) | ||
274 | : : "r" (tlbrb[i]), "r" (kvm->arch.lpid)); | ||
275 | asm volatile("eieio; tlbsync; ptesync" : : : "memory"); | ||
276 | kvm->arch.tlbie_lock = 0; | ||
277 | } else { | ||
278 | asm volatile("ptesync" : : : "memory"); | ||
279 | for (i = 0; i < n_inval; ++i) | ||
280 | asm volatile("tlbiel %0" : : "r" (tlbrb[i])); | ||
281 | asm volatile("ptesync" : : : "memory"); | ||
282 | } | ||
283 | return ret; | ||
284 | } | ||
285 | |||
286 | long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, | ||
287 | unsigned long pte_index, unsigned long avpn, | ||
288 | unsigned long va) | ||
289 | { | ||
290 | struct kvm *kvm = vcpu->kvm; | ||
291 | unsigned long *hpte; | ||
292 | unsigned long v, r, rb; | ||
293 | |||
294 | if (pte_index >= (HPT_NPTEG << 3)) | ||
295 | return H_PARAMETER; | ||
296 | hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); | ||
297 | while (!lock_hpte(hpte, HPTE_V_HVLOCK)) | ||
298 | cpu_relax(); | ||
299 | if ((hpte[0] & HPTE_V_VALID) == 0 || | ||
300 | ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) { | ||
301 | hpte[0] &= ~HPTE_V_HVLOCK; | ||
302 | return H_NOT_FOUND; | ||
303 | } | ||
304 | if (atomic_read(&kvm->online_vcpus) == 1) | ||
305 | flags |= H_LOCAL; | ||
306 | v = hpte[0]; | ||
307 | r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | | ||
308 | HPTE_R_KEY_HI | HPTE_R_KEY_LO); | ||
309 | r |= (flags << 55) & HPTE_R_PP0; | ||
310 | r |= (flags << 48) & HPTE_R_KEY_HI; | ||
311 | r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); | ||
312 | rb = compute_tlbie_rb(v, r, pte_index); | ||
313 | hpte[0] = v & ~HPTE_V_VALID; | ||
314 | if (!(flags & H_LOCAL)) { | ||
315 | while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) | ||
316 | cpu_relax(); | ||
317 | asm volatile("ptesync" : : : "memory"); | ||
318 | asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" | ||
319 | : : "r" (rb), "r" (kvm->arch.lpid)); | ||
320 | asm volatile("ptesync" : : : "memory"); | ||
321 | kvm->arch.tlbie_lock = 0; | ||
322 | } else { | ||
323 | asm volatile("ptesync" : : : "memory"); | ||
324 | asm volatile("tlbiel %0" : : "r" (rb)); | ||
325 | asm volatile("ptesync" : : : "memory"); | ||
326 | } | ||
327 | hpte[1] = r; | ||
328 | eieio(); | ||
329 | hpte[0] = v & ~HPTE_V_HVLOCK; | ||
330 | asm volatile("ptesync" : : : "memory"); | ||
331 | return H_SUCCESS; | ||
332 | } | ||
333 | |||
334 | static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr) | ||
335 | { | ||
336 | long int i; | ||
337 | unsigned long offset, rpn; | ||
338 | |||
339 | offset = realaddr & (kvm->arch.ram_psize - 1); | ||
340 | rpn = (realaddr - offset) >> PAGE_SHIFT; | ||
341 | for (i = 0; i < kvm->arch.ram_npages; ++i) | ||
342 | if (rpn == kvm->arch.ram_pginfo[i].pfn) | ||
343 | return (i << PAGE_SHIFT) + offset; | ||
344 | return HPTE_R_RPN; /* all 1s in the RPN field */ | ||
345 | } | ||
346 | |||
347 | long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, | ||
348 | unsigned long pte_index) | ||
349 | { | ||
350 | struct kvm *kvm = vcpu->kvm; | ||
351 | unsigned long *hpte, r; | ||
352 | int i, n = 1; | ||
353 | |||
354 | if (pte_index >= (HPT_NPTEG << 3)) | ||
355 | return H_PARAMETER; | ||
356 | if (flags & H_READ_4) { | ||
357 | pte_index &= ~3; | ||
358 | n = 4; | ||
359 | } | ||
360 | for (i = 0; i < n; ++i, ++pte_index) { | ||
361 | hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); | ||
362 | r = hpte[1]; | ||
363 | if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID)) | ||
364 | r = reverse_xlate(kvm, r & HPTE_R_RPN) | | ||
365 | (r & ~HPTE_R_RPN); | ||
366 | vcpu->arch.gpr[4 + i * 2] = hpte[0]; | ||
367 | vcpu->arch.gpr[5 + i * 2] = r; | ||
368 | } | ||
369 | return H_SUCCESS; | ||
370 | } | ||
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S new file mode 100644 index 000000000000..6dd33581a228 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S | |||
@@ -0,0 +1,1345 @@ | |||
1 | /* | ||
2 | * This program is free software; you can redistribute it and/or modify | ||
3 | * it under the terms of the GNU General Public License, version 2, as | ||
4 | * published by the Free Software Foundation. | ||
5 | * | ||
6 | * This program is distributed in the hope that it will be useful, | ||
7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
9 | * GNU General Public License for more details. | ||
10 | * | ||
11 | * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | ||
12 | * | ||
13 | * Derived from book3s_rmhandlers.S and other files, which are: | ||
14 | * | ||
15 | * Copyright SUSE Linux Products GmbH 2009 | ||
16 | * | ||
17 | * Authors: Alexander Graf <agraf@suse.de> | ||
18 | */ | ||
19 | |||
20 | #include <asm/ppc_asm.h> | ||
21 | #include <asm/kvm_asm.h> | ||
22 | #include <asm/reg.h> | ||
23 | #include <asm/page.h> | ||
24 | #include <asm/asm-offsets.h> | ||
25 | #include <asm/exception-64s.h> | ||
26 | |||
27 | /***************************************************************************** | ||
28 | * * | ||
29 | * Real Mode handlers that need to be in the linear mapping * | ||
30 | * * | ||
31 | ****************************************************************************/ | ||
32 | |||
33 | .globl kvmppc_skip_interrupt | ||
34 | kvmppc_skip_interrupt: | ||
35 | mfspr r13,SPRN_SRR0 | ||
36 | addi r13,r13,4 | ||
37 | mtspr SPRN_SRR0,r13 | ||
38 | GET_SCRATCH0(r13) | ||
39 | rfid | ||
40 | b . | ||
41 | |||
42 | .globl kvmppc_skip_Hinterrupt | ||
43 | kvmppc_skip_Hinterrupt: | ||
44 | mfspr r13,SPRN_HSRR0 | ||
45 | addi r13,r13,4 | ||
46 | mtspr SPRN_HSRR0,r13 | ||
47 | GET_SCRATCH0(r13) | ||
48 | hrfid | ||
49 | b . | ||
50 | |||
51 | /* | ||
52 | * Call kvmppc_handler_trampoline_enter in real mode. | ||
53 | * Must be called with interrupts hard-disabled. | ||
54 | * | ||
55 | * Input Registers: | ||
56 | * | ||
57 | * LR = return address to continue at after eventually re-enabling MMU | ||
58 | */ | ||
59 | _GLOBAL(kvmppc_hv_entry_trampoline) | ||
60 | mfmsr r10 | ||
61 | LOAD_REG_ADDR(r5, kvmppc_hv_entry) | ||
62 | li r0,MSR_RI | ||
63 | andc r0,r10,r0 | ||
64 | li r6,MSR_IR | MSR_DR | ||
65 | andc r6,r10,r6 | ||
66 | mtmsrd r0,1 /* clear RI in MSR */ | ||
67 | mtsrr0 r5 | ||
68 | mtsrr1 r6 | ||
69 | RFI | ||
70 | |||
71 | #define ULONG_SIZE 8 | ||
72 | #define VCPU_GPR(n) (VCPU_GPRS + (n * ULONG_SIZE)) | ||
73 | |||
74 | /****************************************************************************** | ||
75 | * * | ||
76 | * Entry code * | ||
77 | * * | ||
78 | *****************************************************************************/ | ||
79 | |||
80 | #define XICS_XIRR 4 | ||
81 | #define XICS_QIRR 0xc | ||
82 | |||
83 | /* | ||
84 | * We come in here when wakened from nap mode on a secondary hw thread. | ||
85 | * Relocation is off and most register values are lost. | ||
86 | * r13 points to the PACA. | ||
87 | */ | ||
88 | .globl kvm_start_guest | ||
89 | kvm_start_guest: | ||
90 | ld r1,PACAEMERGSP(r13) | ||
91 | subi r1,r1,STACK_FRAME_OVERHEAD | ||
92 | |||
93 | /* get vcpu pointer */ | ||
94 | ld r4, HSTATE_KVM_VCPU(r13) | ||
95 | |||
96 | /* We got here with an IPI; clear it */ | ||
97 | ld r5, HSTATE_XICS_PHYS(r13) | ||
98 | li r0, 0xff | ||
99 | li r6, XICS_QIRR | ||
100 | li r7, XICS_XIRR | ||
101 | lwzcix r8, r5, r7 /* ack the interrupt */ | ||
102 | sync | ||
103 | stbcix r0, r5, r6 /* clear it */ | ||
104 | stwcix r8, r5, r7 /* EOI it */ | ||
105 | |||
106 | .global kvmppc_hv_entry | ||
107 | kvmppc_hv_entry: | ||
108 | |||
109 | /* Required state: | ||
110 | * | ||
111 | * R4 = vcpu pointer | ||
112 | * MSR = ~IR|DR | ||
113 | * R13 = PACA | ||
114 | * R1 = host R1 | ||
115 | * all other volatile GPRS = free | ||
116 | */ | ||
117 | mflr r0 | ||
118 | std r0, HSTATE_VMHANDLER(r13) | ||
119 | |||
120 | ld r14, VCPU_GPR(r14)(r4) | ||
121 | ld r15, VCPU_GPR(r15)(r4) | ||
122 | ld r16, VCPU_GPR(r16)(r4) | ||
123 | ld r17, VCPU_GPR(r17)(r4) | ||
124 | ld r18, VCPU_GPR(r18)(r4) | ||
125 | ld r19, VCPU_GPR(r19)(r4) | ||
126 | ld r20, VCPU_GPR(r20)(r4) | ||
127 | ld r21, VCPU_GPR(r21)(r4) | ||
128 | ld r22, VCPU_GPR(r22)(r4) | ||
129 | ld r23, VCPU_GPR(r23)(r4) | ||
130 | ld r24, VCPU_GPR(r24)(r4) | ||
131 | ld r25, VCPU_GPR(r25)(r4) | ||
132 | ld r26, VCPU_GPR(r26)(r4) | ||
133 | ld r27, VCPU_GPR(r27)(r4) | ||
134 | ld r28, VCPU_GPR(r28)(r4) | ||
135 | ld r29, VCPU_GPR(r29)(r4) | ||
136 | ld r30, VCPU_GPR(r30)(r4) | ||
137 | ld r31, VCPU_GPR(r31)(r4) | ||
138 | |||
139 | /* Load guest PMU registers */ | ||
140 | /* R4 is live here (vcpu pointer) */ | ||
141 | li r3, 1 | ||
142 | sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ | ||
143 | mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ | ||
144 | isync | ||
145 | lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ | ||
146 | lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ | ||
147 | lwz r6, VCPU_PMC + 8(r4) | ||
148 | lwz r7, VCPU_PMC + 12(r4) | ||
149 | lwz r8, VCPU_PMC + 16(r4) | ||
150 | lwz r9, VCPU_PMC + 20(r4) | ||
151 | BEGIN_FTR_SECTION | ||
152 | lwz r10, VCPU_PMC + 24(r4) | ||
153 | lwz r11, VCPU_PMC + 28(r4) | ||
154 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) | ||
155 | mtspr SPRN_PMC1, r3 | ||
156 | mtspr SPRN_PMC2, r5 | ||
157 | mtspr SPRN_PMC3, r6 | ||
158 | mtspr SPRN_PMC4, r7 | ||
159 | mtspr SPRN_PMC5, r8 | ||
160 | mtspr SPRN_PMC6, r9 | ||
161 | BEGIN_FTR_SECTION | ||
162 | mtspr SPRN_PMC7, r10 | ||
163 | mtspr SPRN_PMC8, r11 | ||
164 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) | ||
165 | ld r3, VCPU_MMCR(r4) | ||
166 | ld r5, VCPU_MMCR + 8(r4) | ||
167 | ld r6, VCPU_MMCR + 16(r4) | ||
168 | mtspr SPRN_MMCR1, r5 | ||
169 | mtspr SPRN_MMCRA, r6 | ||
170 | mtspr SPRN_MMCR0, r3 | ||
171 | isync | ||
172 | |||
173 | /* Load up FP, VMX and VSX registers */ | ||
174 | bl kvmppc_load_fp | ||
175 | |||
176 | BEGIN_FTR_SECTION | ||
177 | /* Switch DSCR to guest value */ | ||
178 | ld r5, VCPU_DSCR(r4) | ||
179 | mtspr SPRN_DSCR, r5 | ||
180 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) | ||
181 | |||
182 | /* | ||
183 | * Set the decrementer to the guest decrementer. | ||
184 | */ | ||
185 | ld r8,VCPU_DEC_EXPIRES(r4) | ||
186 | mftb r7 | ||
187 | subf r3,r7,r8 | ||
188 | mtspr SPRN_DEC,r3 | ||
189 | stw r3,VCPU_DEC(r4) | ||
190 | |||
191 | ld r5, VCPU_SPRG0(r4) | ||
192 | ld r6, VCPU_SPRG1(r4) | ||
193 | ld r7, VCPU_SPRG2(r4) | ||
194 | ld r8, VCPU_SPRG3(r4) | ||
195 | mtspr SPRN_SPRG0, r5 | ||
196 | mtspr SPRN_SPRG1, r6 | ||
197 | mtspr SPRN_SPRG2, r7 | ||
198 | mtspr SPRN_SPRG3, r8 | ||
199 | |||
200 | /* Save R1 in the PACA */ | ||
201 | std r1, HSTATE_HOST_R1(r13) | ||
202 | |||
203 | /* Increment yield count if they have a VPA */ | ||
204 | ld r3, VCPU_VPA(r4) | ||
205 | cmpdi r3, 0 | ||
206 | beq 25f | ||
207 | lwz r5, LPPACA_YIELDCOUNT(r3) | ||
208 | addi r5, r5, 1 | ||
209 | stw r5, LPPACA_YIELDCOUNT(r3) | ||
210 | 25: | ||
211 | /* Load up DAR and DSISR */ | ||
212 | ld r5, VCPU_DAR(r4) | ||
213 | lwz r6, VCPU_DSISR(r4) | ||
214 | mtspr SPRN_DAR, r5 | ||
215 | mtspr SPRN_DSISR, r6 | ||
216 | |||
217 | /* Set partition DABR */ | ||
218 | li r5,3 | ||
219 | ld r6,VCPU_DABR(r4) | ||
220 | mtspr SPRN_DABRX,r5 | ||
221 | mtspr SPRN_DABR,r6 | ||
222 | |||
223 | BEGIN_FTR_SECTION | ||
224 | /* Restore AMR and UAMOR, set AMOR to all 1s */ | ||
225 | ld r5,VCPU_AMR(r4) | ||
226 | ld r6,VCPU_UAMOR(r4) | ||
227 | li r7,-1 | ||
228 | mtspr SPRN_AMR,r5 | ||
229 | mtspr SPRN_UAMOR,r6 | ||
230 | mtspr SPRN_AMOR,r7 | ||
231 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) | ||
232 | |||
233 | /* Clear out SLB */ | ||
234 | li r6,0 | ||
235 | slbmte r6,r6 | ||
236 | slbia | ||
237 | ptesync | ||
238 | |||
239 | BEGIN_FTR_SECTION | ||
240 | b 30f | ||
241 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) | ||
242 | /* | ||
243 | * POWER7 host -> guest partition switch code. | ||
244 | * We don't have to lock against concurrent tlbies, | ||
245 | * but we do have to coordinate across hardware threads. | ||
246 | */ | ||
247 | /* Increment entry count iff exit count is zero. */ | ||
248 | ld r5,HSTATE_KVM_VCORE(r13) | ||
249 | addi r9,r5,VCORE_ENTRY_EXIT | ||
250 | 21: lwarx r3,0,r9 | ||
251 | cmpwi r3,0x100 /* any threads starting to exit? */ | ||
252 | bge secondary_too_late /* if so we're too late to the party */ | ||
253 | addi r3,r3,1 | ||
254 | stwcx. r3,0,r9 | ||
255 | bne 21b | ||
256 | |||
257 | /* Primary thread switches to guest partition. */ | ||
258 | ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ | ||
259 | lwz r6,VCPU_PTID(r4) | ||
260 | cmpwi r6,0 | ||
261 | bne 20f | ||
262 | ld r6,KVM_SDR1(r9) | ||
263 | lwz r7,KVM_LPID(r9) | ||
264 | li r0,LPID_RSVD /* switch to reserved LPID */ | ||
265 | mtspr SPRN_LPID,r0 | ||
266 | ptesync | ||
267 | mtspr SPRN_SDR1,r6 /* switch to partition page table */ | ||
268 | mtspr SPRN_LPID,r7 | ||
269 | isync | ||
270 | li r0,1 | ||
271 | stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ | ||
272 | b 10f | ||
273 | |||
274 | /* Secondary threads wait for primary to have done partition switch */ | ||
275 | 20: lbz r0,VCORE_IN_GUEST(r5) | ||
276 | cmpwi r0,0 | ||
277 | beq 20b | ||
278 | |||
279 | /* Set LPCR. Set the MER bit if there is a pending external irq. */ | ||
280 | 10: ld r8,KVM_LPCR(r9) | ||
281 | ld r0,VCPU_PENDING_EXC(r4) | ||
282 | li r7,(1 << BOOK3S_IRQPRIO_EXTERNAL) | ||
283 | oris r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h | ||
284 | and. r0,r0,r7 | ||
285 | beq 11f | ||
286 | ori r8,r8,LPCR_MER | ||
287 | 11: mtspr SPRN_LPCR,r8 | ||
288 | ld r8,KVM_RMOR(r9) | ||
289 | mtspr SPRN_RMOR,r8 | ||
290 | isync | ||
291 | |||
292 | /* Check if HDEC expires soon */ | ||
293 | mfspr r3,SPRN_HDEC | ||
294 | cmpwi r3,10 | ||
295 | li r12,BOOK3S_INTERRUPT_HV_DECREMENTER | ||
296 | mr r9,r4 | ||
297 | blt hdec_soon | ||
298 | |||
299 | /* | ||
300 | * Invalidate the TLB if we could possibly have stale TLB | ||
301 | * entries for this partition on this core due to the use | ||
302 | * of tlbiel. | ||
303 | * XXX maybe only need this on primary thread? | ||
304 | */ | ||
305 | ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ | ||
306 | lwz r5,VCPU_VCPUID(r4) | ||
307 | lhz r6,PACAPACAINDEX(r13) | ||
308 | rldimi r6,r5,0,62 /* XXX map as if threads 1:1 p:v */ | ||
309 | lhz r8,VCPU_LAST_CPU(r4) | ||
310 | sldi r7,r6,1 /* see if this is the same vcpu */ | ||
311 | add r7,r7,r9 /* as last ran on this pcpu */ | ||
312 | lhz r0,KVM_LAST_VCPU(r7) | ||
313 | cmpw r6,r8 /* on the same cpu core as last time? */ | ||
314 | bne 3f | ||
315 | cmpw r0,r5 /* same vcpu as this core last ran? */ | ||
316 | beq 1f | ||
317 | 3: sth r6,VCPU_LAST_CPU(r4) /* if not, invalidate partition TLB */ | ||
318 | sth r5,KVM_LAST_VCPU(r7) | ||
319 | li r6,128 | ||
320 | mtctr r6 | ||
321 | li r7,0x800 /* IS field = 0b10 */ | ||
322 | ptesync | ||
323 | 2: tlbiel r7 | ||
324 | addi r7,r7,0x1000 | ||
325 | bdnz 2b | ||
326 | ptesync | ||
327 | 1: | ||
328 | |||
329 | /* Save purr/spurr */ | ||
330 | mfspr r5,SPRN_PURR | ||
331 | mfspr r6,SPRN_SPURR | ||
332 | std r5,HSTATE_PURR(r13) | ||
333 | std r6,HSTATE_SPURR(r13) | ||
334 | ld r7,VCPU_PURR(r4) | ||
335 | ld r8,VCPU_SPURR(r4) | ||
336 | mtspr SPRN_PURR,r7 | ||
337 | mtspr SPRN_SPURR,r8 | ||
338 | b 31f | ||
339 | |||
340 | /* | ||
341 | * PPC970 host -> guest partition switch code. | ||
342 | * We have to lock against concurrent tlbies, | ||
343 | * using native_tlbie_lock to lock against host tlbies | ||
344 | * and kvm->arch.tlbie_lock to lock against guest tlbies. | ||
345 | * We also have to invalidate the TLB since its | ||
346 | * entries aren't tagged with the LPID. | ||
347 | */ | ||
348 | 30: ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ | ||
349 | |||
350 | /* first take native_tlbie_lock */ | ||
351 | .section ".toc","aw" | ||
352 | toc_tlbie_lock: | ||
353 | .tc native_tlbie_lock[TC],native_tlbie_lock | ||
354 | .previous | ||
355 | ld r3,toc_tlbie_lock@toc(2) | ||
356 | lwz r8,PACA_LOCK_TOKEN(r13) | ||
357 | 24: lwarx r0,0,r3 | ||
358 | cmpwi r0,0 | ||
359 | bne 24b | ||
360 | stwcx. r8,0,r3 | ||
361 | bne 24b | ||
362 | isync | ||
363 | |||
364 | ld r7,KVM_LPCR(r9) /* use kvm->arch.lpcr to store HID4 */ | ||
365 | li r0,0x18f | ||
366 | rotldi r0,r0,HID4_LPID5_SH /* all lpid bits in HID4 = 1 */ | ||
367 | or r0,r7,r0 | ||
368 | ptesync | ||
369 | sync | ||
370 | mtspr SPRN_HID4,r0 /* switch to reserved LPID */ | ||
371 | isync | ||
372 | li r0,0 | ||
373 | stw r0,0(r3) /* drop native_tlbie_lock */ | ||
374 | |||
375 | /* invalidate the whole TLB */ | ||
376 | li r0,256 | ||
377 | mtctr r0 | ||
378 | li r6,0 | ||
379 | 25: tlbiel r6 | ||
380 | addi r6,r6,0x1000 | ||
381 | bdnz 25b | ||
382 | ptesync | ||
383 | |||
384 | /* Take the guest's tlbie_lock */ | ||
385 | addi r3,r9,KVM_TLBIE_LOCK | ||
386 | 24: lwarx r0,0,r3 | ||
387 | cmpwi r0,0 | ||
388 | bne 24b | ||
389 | stwcx. r8,0,r3 | ||
390 | bne 24b | ||
391 | isync | ||
392 | ld r6,KVM_SDR1(r9) | ||
393 | mtspr SPRN_SDR1,r6 /* switch to partition page table */ | ||
394 | |||
395 | /* Set up HID4 with the guest's LPID etc. */ | ||
396 | sync | ||
397 | mtspr SPRN_HID4,r7 | ||
398 | isync | ||
399 | |||
400 | /* drop the guest's tlbie_lock */ | ||
401 | li r0,0 | ||
402 | stw r0,0(r3) | ||
403 | |||
404 | /* Check if HDEC expires soon */ | ||
405 | mfspr r3,SPRN_HDEC | ||
406 | cmpwi r3,10 | ||
407 | li r12,BOOK3S_INTERRUPT_HV_DECREMENTER | ||
408 | mr r9,r4 | ||
409 | blt hdec_soon | ||
410 | |||
411 | /* Enable HDEC interrupts */ | ||
412 | mfspr r0,SPRN_HID0 | ||
413 | li r3,1 | ||
414 | rldimi r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1 | ||
415 | sync | ||
416 | mtspr SPRN_HID0,r0 | ||
417 | mfspr r0,SPRN_HID0 | ||
418 | mfspr r0,SPRN_HID0 | ||
419 | mfspr r0,SPRN_HID0 | ||
420 | mfspr r0,SPRN_HID0 | ||
421 | mfspr r0,SPRN_HID0 | ||
422 | mfspr r0,SPRN_HID0 | ||
423 | |||
424 | /* Load up guest SLB entries */ | ||
425 | 31: lwz r5,VCPU_SLB_MAX(r4) | ||
426 | cmpwi r5,0 | ||
427 | beq 9f | ||
428 | mtctr r5 | ||
429 | addi r6,r4,VCPU_SLB | ||
430 | 1: ld r8,VCPU_SLB_E(r6) | ||
431 | ld r9,VCPU_SLB_V(r6) | ||
432 | slbmte r9,r8 | ||
433 | addi r6,r6,VCPU_SLB_SIZE | ||
434 | bdnz 1b | ||
435 | 9: | ||
436 | |||
437 | /* Restore state of CTRL run bit; assume 1 on entry */ | ||
438 | lwz r5,VCPU_CTRL(r4) | ||
439 | andi. r5,r5,1 | ||
440 | bne 4f | ||
441 | mfspr r6,SPRN_CTRLF | ||
442 | clrrdi r6,r6,1 | ||
443 | mtspr SPRN_CTRLT,r6 | ||
444 | 4: | ||
445 | ld r6, VCPU_CTR(r4) | ||
446 | lwz r7, VCPU_XER(r4) | ||
447 | |||
448 | mtctr r6 | ||
449 | mtxer r7 | ||
450 | |||
451 | /* Move SRR0 and SRR1 into the respective regs */ | ||
452 | ld r6, VCPU_SRR0(r4) | ||
453 | ld r7, VCPU_SRR1(r4) | ||
454 | mtspr SPRN_SRR0, r6 | ||
455 | mtspr SPRN_SRR1, r7 | ||
456 | |||
457 | ld r10, VCPU_PC(r4) | ||
458 | |||
459 | ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */ | ||
460 | rldicl r11, r11, 63 - MSR_HV_LG, 1 | ||
461 | rotldi r11, r11, 1 + MSR_HV_LG | ||
462 | ori r11, r11, MSR_ME | ||
463 | |||
464 | fast_guest_return: | ||
465 | mtspr SPRN_HSRR0,r10 | ||
466 | mtspr SPRN_HSRR1,r11 | ||
467 | |||
468 | /* Activate guest mode, so faults get handled by KVM */ | ||
469 | li r9, KVM_GUEST_MODE_GUEST | ||
470 | stb r9, HSTATE_IN_GUEST(r13) | ||
471 | |||
472 | /* Enter guest */ | ||
473 | |||
474 | ld r5, VCPU_LR(r4) | ||
475 | lwz r6, VCPU_CR(r4) | ||
476 | mtlr r5 | ||
477 | mtcr r6 | ||
478 | |||
479 | ld r0, VCPU_GPR(r0)(r4) | ||
480 | ld r1, VCPU_GPR(r1)(r4) | ||
481 | ld r2, VCPU_GPR(r2)(r4) | ||
482 | ld r3, VCPU_GPR(r3)(r4) | ||
483 | ld r5, VCPU_GPR(r5)(r4) | ||
484 | ld r6, VCPU_GPR(r6)(r4) | ||
485 | ld r7, VCPU_GPR(r7)(r4) | ||
486 | ld r8, VCPU_GPR(r8)(r4) | ||
487 | ld r9, VCPU_GPR(r9)(r4) | ||
488 | ld r10, VCPU_GPR(r10)(r4) | ||
489 | ld r11, VCPU_GPR(r11)(r4) | ||
490 | ld r12, VCPU_GPR(r12)(r4) | ||
491 | ld r13, VCPU_GPR(r13)(r4) | ||
492 | |||
493 | ld r4, VCPU_GPR(r4)(r4) | ||
494 | |||
495 | hrfid | ||
496 | b . | ||
497 | |||
498 | /****************************************************************************** | ||
499 | * * | ||
500 | * Exit code * | ||
501 | * * | ||
502 | *****************************************************************************/ | ||
503 | |||
504 | /* | ||
505 | * We come here from the first-level interrupt handlers. | ||
506 | */ | ||
507 | .globl kvmppc_interrupt | ||
508 | kvmppc_interrupt: | ||
509 | /* | ||
510 | * Register contents: | ||
511 | * R12 = interrupt vector | ||
512 | * R13 = PACA | ||
513 | * guest CR, R12 saved in shadow VCPU SCRATCH1/0 | ||
514 | * guest R13 saved in SPRN_SCRATCH0 | ||
515 | */ | ||
516 | /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */ | ||
517 | std r9, HSTATE_HOST_R2(r13) | ||
518 | ld r9, HSTATE_KVM_VCPU(r13) | ||
519 | |||
520 | /* Save registers */ | ||
521 | |||
522 | std r0, VCPU_GPR(r0)(r9) | ||
523 | std r1, VCPU_GPR(r1)(r9) | ||
524 | std r2, VCPU_GPR(r2)(r9) | ||
525 | std r3, VCPU_GPR(r3)(r9) | ||
526 | std r4, VCPU_GPR(r4)(r9) | ||
527 | std r5, VCPU_GPR(r5)(r9) | ||
528 | std r6, VCPU_GPR(r6)(r9) | ||
529 | std r7, VCPU_GPR(r7)(r9) | ||
530 | std r8, VCPU_GPR(r8)(r9) | ||
531 | ld r0, HSTATE_HOST_R2(r13) | ||
532 | std r0, VCPU_GPR(r9)(r9) | ||
533 | std r10, VCPU_GPR(r10)(r9) | ||
534 | std r11, VCPU_GPR(r11)(r9) | ||
535 | ld r3, HSTATE_SCRATCH0(r13) | ||
536 | lwz r4, HSTATE_SCRATCH1(r13) | ||
537 | std r3, VCPU_GPR(r12)(r9) | ||
538 | stw r4, VCPU_CR(r9) | ||
539 | |||
540 | /* Restore R1/R2 so we can handle faults */ | ||
541 | ld r1, HSTATE_HOST_R1(r13) | ||
542 | ld r2, PACATOC(r13) | ||
543 | |||
544 | mfspr r10, SPRN_SRR0 | ||
545 | mfspr r11, SPRN_SRR1 | ||
546 | std r10, VCPU_SRR0(r9) | ||
547 | std r11, VCPU_SRR1(r9) | ||
548 | andi. r0, r12, 2 /* need to read HSRR0/1? */ | ||
549 | beq 1f | ||
550 | mfspr r10, SPRN_HSRR0 | ||
551 | mfspr r11, SPRN_HSRR1 | ||
552 | clrrdi r12, r12, 2 | ||
553 | 1: std r10, VCPU_PC(r9) | ||
554 | std r11, VCPU_MSR(r9) | ||
555 | |||
556 | GET_SCRATCH0(r3) | ||
557 | mflr r4 | ||
558 | std r3, VCPU_GPR(r13)(r9) | ||
559 | std r4, VCPU_LR(r9) | ||
560 | |||
561 | /* Unset guest mode */ | ||
562 | li r0, KVM_GUEST_MODE_NONE | ||
563 | stb r0, HSTATE_IN_GUEST(r13) | ||
564 | |||
565 | stw r12,VCPU_TRAP(r9) | ||
566 | |||
567 | /* See if this is a leftover HDEC interrupt */ | ||
568 | cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER | ||
569 | bne 2f | ||
570 | mfspr r3,SPRN_HDEC | ||
571 | cmpwi r3,0 | ||
572 | bge ignore_hdec | ||
573 | 2: | ||
574 | /* See if this is something we can handle in real mode */ | ||
575 | cmpwi r12,BOOK3S_INTERRUPT_SYSCALL | ||
576 | beq hcall_try_real_mode | ||
577 | hcall_real_cont: | ||
578 | |||
579 | /* Check for mediated interrupts (could be done earlier really ...) */ | ||
580 | BEGIN_FTR_SECTION | ||
581 | cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL | ||
582 | bne+ 1f | ||
583 | ld r5,VCPU_KVM(r9) | ||
584 | ld r5,KVM_LPCR(r5) | ||
585 | andi. r0,r11,MSR_EE | ||
586 | beq 1f | ||
587 | andi. r0,r5,LPCR_MER | ||
588 | bne bounce_ext_interrupt | ||
589 | 1: | ||
590 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) | ||
591 | |||
592 | /* Save DEC */ | ||
593 | mfspr r5,SPRN_DEC | ||
594 | mftb r6 | ||
595 | extsw r5,r5 | ||
596 | add r5,r5,r6 | ||
597 | std r5,VCPU_DEC_EXPIRES(r9) | ||
598 | |||
599 | /* Save HEIR (HV emulation assist reg) in last_inst | ||
600 | if this is an HEI (HV emulation interrupt, e40) */ | ||
601 | li r3,-1 | ||
602 | BEGIN_FTR_SECTION | ||
603 | cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST | ||
604 | bne 11f | ||
605 | mfspr r3,SPRN_HEIR | ||
606 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) | ||
607 | 11: stw r3,VCPU_LAST_INST(r9) | ||
608 | |||
609 | /* Save more register state */ | ||
610 | mfxer r5 | ||
611 | mfdar r6 | ||
612 | mfdsisr r7 | ||
613 | mfctr r8 | ||
614 | |||
615 | stw r5, VCPU_XER(r9) | ||
616 | std r6, VCPU_DAR(r9) | ||
617 | stw r7, VCPU_DSISR(r9) | ||
618 | std r8, VCPU_CTR(r9) | ||
619 | /* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */ | ||
620 | BEGIN_FTR_SECTION | ||
621 | cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE | ||
622 | beq 6f | ||
623 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) | ||
624 | 7: std r6, VCPU_FAULT_DAR(r9) | ||
625 | stw r7, VCPU_FAULT_DSISR(r9) | ||
626 | |||
627 | /* Save guest CTRL register, set runlatch to 1 */ | ||
628 | mfspr r6,SPRN_CTRLF | ||
629 | stw r6,VCPU_CTRL(r9) | ||
630 | andi. r0,r6,1 | ||
631 | bne 4f | ||
632 | ori r6,r6,1 | ||
633 | mtspr SPRN_CTRLT,r6 | ||
634 | 4: | ||
635 | /* Read the guest SLB and save it away */ | ||
636 | lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ | ||
637 | mtctr r0 | ||
638 | li r6,0 | ||
639 | addi r7,r9,VCPU_SLB | ||
640 | li r5,0 | ||
641 | 1: slbmfee r8,r6 | ||
642 | andis. r0,r8,SLB_ESID_V@h | ||
643 | beq 2f | ||
644 | add r8,r8,r6 /* put index in */ | ||
645 | slbmfev r3,r6 | ||
646 | std r8,VCPU_SLB_E(r7) | ||
647 | std r3,VCPU_SLB_V(r7) | ||
648 | addi r7,r7,VCPU_SLB_SIZE | ||
649 | addi r5,r5,1 | ||
650 | 2: addi r6,r6,1 | ||
651 | bdnz 1b | ||
652 | stw r5,VCPU_SLB_MAX(r9) | ||
653 | |||
654 | /* | ||
655 | * Save the guest PURR/SPURR | ||
656 | */ | ||
657 | BEGIN_FTR_SECTION | ||
658 | mfspr r5,SPRN_PURR | ||
659 | mfspr r6,SPRN_SPURR | ||
660 | ld r7,VCPU_PURR(r9) | ||
661 | ld r8,VCPU_SPURR(r9) | ||
662 | std r5,VCPU_PURR(r9) | ||
663 | std r6,VCPU_SPURR(r9) | ||
664 | subf r5,r7,r5 | ||
665 | subf r6,r8,r6 | ||
666 | |||
667 | /* | ||
668 | * Restore host PURR/SPURR and add guest times | ||
669 | * so that the time in the guest gets accounted. | ||
670 | */ | ||
671 | ld r3,HSTATE_PURR(r13) | ||
672 | ld r4,HSTATE_SPURR(r13) | ||
673 | add r3,r3,r5 | ||
674 | add r4,r4,r6 | ||
675 | mtspr SPRN_PURR,r3 | ||
676 | mtspr SPRN_SPURR,r4 | ||
677 | END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201) | ||
678 | |||
679 | /* Clear out SLB */ | ||
680 | li r5,0 | ||
681 | slbmte r5,r5 | ||
682 | slbia | ||
683 | ptesync | ||
684 | |||
685 | hdec_soon: | ||
686 | BEGIN_FTR_SECTION | ||
687 | b 32f | ||
688 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) | ||
689 | /* | ||
690 | * POWER7 guest -> host partition switch code. | ||
691 | * We don't have to lock against tlbies but we do | ||
692 | * have to coordinate the hardware threads. | ||
693 | */ | ||
694 | /* Increment the threads-exiting-guest count in the 0xff00 | ||
695 | bits of vcore->entry_exit_count */ | ||
696 | lwsync | ||
697 | ld r5,HSTATE_KVM_VCORE(r13) | ||
698 | addi r6,r5,VCORE_ENTRY_EXIT | ||
699 | 41: lwarx r3,0,r6 | ||
700 | addi r0,r3,0x100 | ||
701 | stwcx. r0,0,r6 | ||
702 | bne 41b | ||
703 | |||
704 | /* | ||
705 | * At this point we have an interrupt that we have to pass | ||
706 | * up to the kernel or qemu; we can't handle it in real mode. | ||
707 | * Thus we have to do a partition switch, so we have to | ||
708 | * collect the other threads, if we are the first thread | ||
709 | * to take an interrupt. To do this, we set the HDEC to 0, | ||
710 | * which causes an HDEC interrupt in all threads within 2ns | ||
711 | * because the HDEC register is shared between all 4 threads. | ||
712 | * However, we don't need to bother if this is an HDEC | ||
713 | * interrupt, since the other threads will already be on their | ||
714 | * way here in that case. | ||
715 | */ | ||
716 | cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER | ||
717 | beq 40f | ||
718 | cmpwi r3,0x100 /* Are we the first here? */ | ||
719 | bge 40f | ||
720 | cmpwi r3,1 | ||
721 | ble 40f | ||
722 | li r0,0 | ||
723 | mtspr SPRN_HDEC,r0 | ||
724 | 40: | ||
725 | |||
726 | /* Secondary threads wait for primary to do partition switch */ | ||
727 | ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ | ||
728 | ld r5,HSTATE_KVM_VCORE(r13) | ||
729 | lwz r3,VCPU_PTID(r9) | ||
730 | cmpwi r3,0 | ||
731 | beq 15f | ||
732 | HMT_LOW | ||
733 | 13: lbz r3,VCORE_IN_GUEST(r5) | ||
734 | cmpwi r3,0 | ||
735 | bne 13b | ||
736 | HMT_MEDIUM | ||
737 | b 16f | ||
738 | |||
739 | /* Primary thread waits for all the secondaries to exit guest */ | ||
740 | 15: lwz r3,VCORE_ENTRY_EXIT(r5) | ||
741 | srwi r0,r3,8 | ||
742 | clrldi r3,r3,56 | ||
743 | cmpw r3,r0 | ||
744 | bne 15b | ||
745 | isync | ||
746 | |||
747 | /* Primary thread switches back to host partition */ | ||
748 | ld r6,KVM_HOST_SDR1(r4) | ||
749 | lwz r7,KVM_HOST_LPID(r4) | ||
750 | li r8,LPID_RSVD /* switch to reserved LPID */ | ||
751 | mtspr SPRN_LPID,r8 | ||
752 | ptesync | ||
753 | mtspr SPRN_SDR1,r6 /* switch to partition page table */ | ||
754 | mtspr SPRN_LPID,r7 | ||
755 | isync | ||
756 | li r0,0 | ||
757 | stb r0,VCORE_IN_GUEST(r5) | ||
758 | lis r8,0x7fff /* MAX_INT@h */ | ||
759 | mtspr SPRN_HDEC,r8 | ||
760 | |||
761 | 16: ld r8,KVM_HOST_LPCR(r4) | ||
762 | mtspr SPRN_LPCR,r8 | ||
763 | isync | ||
764 | b 33f | ||
765 | |||
766 | /* | ||
767 | * PPC970 guest -> host partition switch code. | ||
768 | * We have to lock against concurrent tlbies, and | ||
769 | * we have to flush the whole TLB. | ||
770 | */ | ||
771 | 32: ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ | ||
772 | |||
773 | /* Take the guest's tlbie_lock */ | ||
774 | lwz r8,PACA_LOCK_TOKEN(r13) | ||
775 | addi r3,r4,KVM_TLBIE_LOCK | ||
776 | 24: lwarx r0,0,r3 | ||
777 | cmpwi r0,0 | ||
778 | bne 24b | ||
779 | stwcx. r8,0,r3 | ||
780 | bne 24b | ||
781 | isync | ||
782 | |||
783 | ld r7,KVM_HOST_LPCR(r4) /* use kvm->arch.host_lpcr for HID4 */ | ||
784 | li r0,0x18f | ||
785 | rotldi r0,r0,HID4_LPID5_SH /* all lpid bits in HID4 = 1 */ | ||
786 | or r0,r7,r0 | ||
787 | ptesync | ||
788 | sync | ||
789 | mtspr SPRN_HID4,r0 /* switch to reserved LPID */ | ||
790 | isync | ||
791 | li r0,0 | ||
792 | stw r0,0(r3) /* drop guest tlbie_lock */ | ||
793 | |||
794 | /* invalidate the whole TLB */ | ||
795 | li r0,256 | ||
796 | mtctr r0 | ||
797 | li r6,0 | ||
798 | 25: tlbiel r6 | ||
799 | addi r6,r6,0x1000 | ||
800 | bdnz 25b | ||
801 | ptesync | ||
802 | |||
803 | /* take native_tlbie_lock */ | ||
804 | ld r3,toc_tlbie_lock@toc(2) | ||
805 | 24: lwarx r0,0,r3 | ||
806 | cmpwi r0,0 | ||
807 | bne 24b | ||
808 | stwcx. r8,0,r3 | ||
809 | bne 24b | ||
810 | isync | ||
811 | |||
812 | ld r6,KVM_HOST_SDR1(r4) | ||
813 | mtspr SPRN_SDR1,r6 /* switch to host page table */ | ||
814 | |||
815 | /* Set up host HID4 value */ | ||
816 | sync | ||
817 | mtspr SPRN_HID4,r7 | ||
818 | isync | ||
819 | li r0,0 | ||
820 | stw r0,0(r3) /* drop native_tlbie_lock */ | ||
821 | |||
822 | lis r8,0x7fff /* MAX_INT@h */ | ||
823 | mtspr SPRN_HDEC,r8 | ||
824 | |||
825 | /* Disable HDEC interrupts */ | ||
826 | mfspr r0,SPRN_HID0 | ||
827 | li r3,0 | ||
828 | rldimi r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1 | ||
829 | sync | ||
830 | mtspr SPRN_HID0,r0 | ||
831 | mfspr r0,SPRN_HID0 | ||
832 | mfspr r0,SPRN_HID0 | ||
833 | mfspr r0,SPRN_HID0 | ||
834 | mfspr r0,SPRN_HID0 | ||
835 | mfspr r0,SPRN_HID0 | ||
836 | mfspr r0,SPRN_HID0 | ||
837 | |||
838 | /* load host SLB entries */ | ||
839 | 33: ld r8,PACA_SLBSHADOWPTR(r13) | ||
840 | |||
841 | .rept SLB_NUM_BOLTED | ||
842 | ld r5,SLBSHADOW_SAVEAREA(r8) | ||
843 | ld r6,SLBSHADOW_SAVEAREA+8(r8) | ||
844 | andis. r7,r5,SLB_ESID_V@h | ||
845 | beq 1f | ||
846 | slbmte r6,r5 | ||
847 | 1: addi r8,r8,16 | ||
848 | .endr | ||
849 | |||
850 | /* Save and reset AMR and UAMOR before turning on the MMU */ | ||
851 | BEGIN_FTR_SECTION | ||
852 | mfspr r5,SPRN_AMR | ||
853 | mfspr r6,SPRN_UAMOR | ||
854 | std r5,VCPU_AMR(r9) | ||
855 | std r6,VCPU_UAMOR(r9) | ||
856 | li r6,0 | ||
857 | mtspr SPRN_AMR,r6 | ||
858 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) | ||
859 | |||
860 | /* Restore host DABR and DABRX */ | ||
861 | ld r5,HSTATE_DABR(r13) | ||
862 | li r6,7 | ||
863 | mtspr SPRN_DABR,r5 | ||
864 | mtspr SPRN_DABRX,r6 | ||
865 | |||
866 | /* Switch DSCR back to host value */ | ||
867 | BEGIN_FTR_SECTION | ||
868 | mfspr r8, SPRN_DSCR | ||
869 | ld r7, HSTATE_DSCR(r13) | ||
870 | std r8, VCPU_DSCR(r7) | ||
871 | mtspr SPRN_DSCR, r7 | ||
872 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) | ||
873 | |||
874 | /* Save non-volatile GPRs */ | ||
875 | std r14, VCPU_GPR(r14)(r9) | ||
876 | std r15, VCPU_GPR(r15)(r9) | ||
877 | std r16, VCPU_GPR(r16)(r9) | ||
878 | std r17, VCPU_GPR(r17)(r9) | ||
879 | std r18, VCPU_GPR(r18)(r9) | ||
880 | std r19, VCPU_GPR(r19)(r9) | ||
881 | std r20, VCPU_GPR(r20)(r9) | ||
882 | std r21, VCPU_GPR(r21)(r9) | ||
883 | std r22, VCPU_GPR(r22)(r9) | ||
884 | std r23, VCPU_GPR(r23)(r9) | ||
885 | std r24, VCPU_GPR(r24)(r9) | ||
886 | std r25, VCPU_GPR(r25)(r9) | ||
887 | std r26, VCPU_GPR(r26)(r9) | ||
888 | std r27, VCPU_GPR(r27)(r9) | ||
889 | std r28, VCPU_GPR(r28)(r9) | ||
890 | std r29, VCPU_GPR(r29)(r9) | ||
891 | std r30, VCPU_GPR(r30)(r9) | ||
892 | std r31, VCPU_GPR(r31)(r9) | ||
893 | |||
894 | /* Save SPRGs */ | ||
895 | mfspr r3, SPRN_SPRG0 | ||
896 | mfspr r4, SPRN_SPRG1 | ||
897 | mfspr r5, SPRN_SPRG2 | ||
898 | mfspr r6, SPRN_SPRG3 | ||
899 | std r3, VCPU_SPRG0(r9) | ||
900 | std r4, VCPU_SPRG1(r9) | ||
901 | std r5, VCPU_SPRG2(r9) | ||
902 | std r6, VCPU_SPRG3(r9) | ||
903 | |||
904 | /* Increment yield count if they have a VPA */ | ||
905 | ld r8, VCPU_VPA(r9) /* do they have a VPA? */ | ||
906 | cmpdi r8, 0 | ||
907 | beq 25f | ||
908 | lwz r3, LPPACA_YIELDCOUNT(r8) | ||
909 | addi r3, r3, 1 | ||
910 | stw r3, LPPACA_YIELDCOUNT(r8) | ||
911 | 25: | ||
912 | /* Save PMU registers if requested */ | ||
913 | /* r8 and cr0.eq are live here */ | ||
914 | li r3, 1 | ||
915 | sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ | ||
916 | mfspr r4, SPRN_MMCR0 /* save MMCR0 */ | ||
917 | mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ | ||
918 | isync | ||
919 | beq 21f /* if no VPA, save PMU stuff anyway */ | ||
920 | lbz r7, LPPACA_PMCINUSE(r8) | ||
921 | cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */ | ||
922 | bne 21f | ||
923 | std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ | ||
924 | b 22f | ||
925 | 21: mfspr r5, SPRN_MMCR1 | ||
926 | mfspr r6, SPRN_MMCRA | ||
927 | std r4, VCPU_MMCR(r9) | ||
928 | std r5, VCPU_MMCR + 8(r9) | ||
929 | std r6, VCPU_MMCR + 16(r9) | ||
930 | mfspr r3, SPRN_PMC1 | ||
931 | mfspr r4, SPRN_PMC2 | ||
932 | mfspr r5, SPRN_PMC3 | ||
933 | mfspr r6, SPRN_PMC4 | ||
934 | mfspr r7, SPRN_PMC5 | ||
935 | mfspr r8, SPRN_PMC6 | ||
936 | BEGIN_FTR_SECTION | ||
937 | mfspr r10, SPRN_PMC7 | ||
938 | mfspr r11, SPRN_PMC8 | ||
939 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) | ||
940 | stw r3, VCPU_PMC(r9) | ||
941 | stw r4, VCPU_PMC + 4(r9) | ||
942 | stw r5, VCPU_PMC + 8(r9) | ||
943 | stw r6, VCPU_PMC + 12(r9) | ||
944 | stw r7, VCPU_PMC + 16(r9) | ||
945 | stw r8, VCPU_PMC + 20(r9) | ||
946 | BEGIN_FTR_SECTION | ||
947 | stw r10, VCPU_PMC + 24(r9) | ||
948 | stw r11, VCPU_PMC + 28(r9) | ||
949 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) | ||
950 | 22: | ||
951 | /* save FP state */ | ||
952 | mr r3, r9 | ||
953 | bl .kvmppc_save_fp | ||
954 | |||
955 | /* Secondary threads go off to take a nap on POWER7 */ | ||
956 | BEGIN_FTR_SECTION | ||
957 | lwz r0,VCPU_PTID(r3) | ||
958 | cmpwi r0,0 | ||
959 | bne secondary_nap | ||
960 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) | ||
961 | |||
962 | /* | ||
963 | * Reload DEC. HDEC interrupts were disabled when | ||
964 | * we reloaded the host's LPCR value. | ||
965 | */ | ||
966 | ld r3, HSTATE_DECEXP(r13) | ||
967 | mftb r4 | ||
968 | subf r4, r4, r3 | ||
969 | mtspr SPRN_DEC, r4 | ||
970 | |||
971 | /* Reload the host's PMU registers */ | ||
972 | ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ | ||
973 | lbz r4, LPPACA_PMCINUSE(r3) | ||
974 | cmpwi r4, 0 | ||
975 | beq 23f /* skip if not */ | ||
976 | lwz r3, HSTATE_PMC(r13) | ||
977 | lwz r4, HSTATE_PMC + 4(r13) | ||
978 | lwz r5, HSTATE_PMC + 8(r13) | ||
979 | lwz r6, HSTATE_PMC + 12(r13) | ||
980 | lwz r8, HSTATE_PMC + 16(r13) | ||
981 | lwz r9, HSTATE_PMC + 20(r13) | ||
982 | BEGIN_FTR_SECTION | ||
983 | lwz r10, HSTATE_PMC + 24(r13) | ||
984 | lwz r11, HSTATE_PMC + 28(r13) | ||
985 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) | ||
986 | mtspr SPRN_PMC1, r3 | ||
987 | mtspr SPRN_PMC2, r4 | ||
988 | mtspr SPRN_PMC3, r5 | ||
989 | mtspr SPRN_PMC4, r6 | ||
990 | mtspr SPRN_PMC5, r8 | ||
991 | mtspr SPRN_PMC6, r9 | ||
992 | BEGIN_FTR_SECTION | ||
993 | mtspr SPRN_PMC7, r10 | ||
994 | mtspr SPRN_PMC8, r11 | ||
995 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) | ||
996 | ld r3, HSTATE_MMCR(r13) | ||
997 | ld r4, HSTATE_MMCR + 8(r13) | ||
998 | ld r5, HSTATE_MMCR + 16(r13) | ||
999 | mtspr SPRN_MMCR1, r4 | ||
1000 | mtspr SPRN_MMCRA, r5 | ||
1001 | mtspr SPRN_MMCR0, r3 | ||
1002 | isync | ||
1003 | 23: | ||
1004 | /* | ||
1005 | * For external and machine check interrupts, we need | ||
1006 | * to call the Linux handler to process the interrupt. | ||
1007 | * We do that by jumping to the interrupt vector address | ||
1008 | * which we have in r12. The [h]rfid at the end of the | ||
1009 | * handler will return to the book3s_hv_interrupts.S code. | ||
1010 | * For other interrupts we do the rfid to get back | ||
1011 | * to the book3s_interrupts.S code here. | ||
1012 | */ | ||
1013 | ld r8, HSTATE_VMHANDLER(r13) | ||
1014 | ld r7, HSTATE_HOST_MSR(r13) | ||
1015 | |||
1016 | cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL | ||
1017 | beq 11f | ||
1018 | cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK | ||
1019 | |||
1020 | /* RFI into the highmem handler, or branch to interrupt handler */ | ||
1021 | 12: mfmsr r6 | ||
1022 | mtctr r12 | ||
1023 | li r0, MSR_RI | ||
1024 | andc r6, r6, r0 | ||
1025 | mtmsrd r6, 1 /* Clear RI in MSR */ | ||
1026 | mtsrr0 r8 | ||
1027 | mtsrr1 r7 | ||
1028 | beqctr | ||
1029 | RFI | ||
1030 | |||
1031 | 11: | ||
1032 | BEGIN_FTR_SECTION | ||
1033 | b 12b | ||
1034 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) | ||
1035 | mtspr SPRN_HSRR0, r8 | ||
1036 | mtspr SPRN_HSRR1, r7 | ||
1037 | ba 0x500 | ||
1038 | |||
1039 | 6: mfspr r6,SPRN_HDAR | ||
1040 | mfspr r7,SPRN_HDSISR | ||
1041 | b 7b | ||
1042 | |||
1043 | /* | ||
1044 | * Try to handle an hcall in real mode. | ||
1045 | * Returns to the guest if we handle it, or continues on up to | ||
1046 | * the kernel if we can't (i.e. if we don't have a handler for | ||
1047 | * it, or if the handler returns H_TOO_HARD). | ||
1048 | */ | ||
1049 | .globl hcall_try_real_mode | ||
1050 | hcall_try_real_mode: | ||
1051 | ld r3,VCPU_GPR(r3)(r9) | ||
1052 | andi. r0,r11,MSR_PR | ||
1053 | bne hcall_real_cont | ||
1054 | clrrdi r3,r3,2 | ||
1055 | cmpldi r3,hcall_real_table_end - hcall_real_table | ||
1056 | bge hcall_real_cont | ||
1057 | LOAD_REG_ADDR(r4, hcall_real_table) | ||
1058 | lwzx r3,r3,r4 | ||
1059 | cmpwi r3,0 | ||
1060 | beq hcall_real_cont | ||
1061 | add r3,r3,r4 | ||
1062 | mtctr r3 | ||
1063 | mr r3,r9 /* get vcpu pointer */ | ||
1064 | ld r4,VCPU_GPR(r4)(r9) | ||
1065 | bctrl | ||
1066 | cmpdi r3,H_TOO_HARD | ||
1067 | beq hcall_real_fallback | ||
1068 | ld r4,HSTATE_KVM_VCPU(r13) | ||
1069 | std r3,VCPU_GPR(r3)(r4) | ||
1070 | ld r10,VCPU_PC(r4) | ||
1071 | ld r11,VCPU_MSR(r4) | ||
1072 | b fast_guest_return | ||
1073 | |||
1074 | /* We attempted a real-mode hcall, but the handler punted it back | ||
1075 | * to userspace. We need to restore some clobbered volatiles | ||
1076 | * before resuming the pass-it-to-qemu path. */ | ||
1077 | hcall_real_fallback: | ||
1078 | li r12,BOOK3S_INTERRUPT_SYSCALL | ||
1079 | ld r9, HSTATE_KVM_VCPU(r13) | ||
1080 | ld r11, VCPU_MSR(r9) | ||
1081 | |||
1082 | b hcall_real_cont | ||
1083 | |||
1084 | .globl hcall_real_table | ||
1085 | hcall_real_table: | ||
1086 | .long 0 /* 0 - unused */ | ||
1087 | .long .kvmppc_h_remove - hcall_real_table | ||
1088 | .long .kvmppc_h_enter - hcall_real_table | ||
1089 | .long .kvmppc_h_read - hcall_real_table | ||
1090 | .long 0 /* 0x10 - H_CLEAR_MOD */ | ||
1091 | .long 0 /* 0x14 - H_CLEAR_REF */ | ||
1092 | .long .kvmppc_h_protect - hcall_real_table | ||
1093 | .long 0 /* 0x1c - H_GET_TCE */ | ||
1094 | .long .kvmppc_h_put_tce - hcall_real_table | ||
1095 | .long 0 /* 0x24 - H_SET_SPRG0 */ | ||
1096 | .long .kvmppc_h_set_dabr - hcall_real_table | ||
1097 | .long 0 /* 0x2c */ | ||
1098 | .long 0 /* 0x30 */ | ||
1099 | .long 0 /* 0x34 */ | ||
1100 | .long 0 /* 0x38 */ | ||
1101 | .long 0 /* 0x3c */ | ||
1102 | .long 0 /* 0x40 */ | ||
1103 | .long 0 /* 0x44 */ | ||
1104 | .long 0 /* 0x48 */ | ||
1105 | .long 0 /* 0x4c */ | ||
1106 | .long 0 /* 0x50 */ | ||
1107 | .long 0 /* 0x54 */ | ||
1108 | .long 0 /* 0x58 */ | ||
1109 | .long 0 /* 0x5c */ | ||
1110 | .long 0 /* 0x60 */ | ||
1111 | .long 0 /* 0x64 */ | ||
1112 | .long 0 /* 0x68 */ | ||
1113 | .long 0 /* 0x6c */ | ||
1114 | .long 0 /* 0x70 */ | ||
1115 | .long 0 /* 0x74 */ | ||
1116 | .long 0 /* 0x78 */ | ||
1117 | .long 0 /* 0x7c */ | ||
1118 | .long 0 /* 0x80 */ | ||
1119 | .long 0 /* 0x84 */ | ||
1120 | .long 0 /* 0x88 */ | ||
1121 | .long 0 /* 0x8c */ | ||
1122 | .long 0 /* 0x90 */ | ||
1123 | .long 0 /* 0x94 */ | ||
1124 | .long 0 /* 0x98 */ | ||
1125 | .long 0 /* 0x9c */ | ||
1126 | .long 0 /* 0xa0 */ | ||
1127 | .long 0 /* 0xa4 */ | ||
1128 | .long 0 /* 0xa8 */ | ||
1129 | .long 0 /* 0xac */ | ||
1130 | .long 0 /* 0xb0 */ | ||
1131 | .long 0 /* 0xb4 */ | ||
1132 | .long 0 /* 0xb8 */ | ||
1133 | .long 0 /* 0xbc */ | ||
1134 | .long 0 /* 0xc0 */ | ||
1135 | .long 0 /* 0xc4 */ | ||
1136 | .long 0 /* 0xc8 */ | ||
1137 | .long 0 /* 0xcc */ | ||
1138 | .long 0 /* 0xd0 */ | ||
1139 | .long 0 /* 0xd4 */ | ||
1140 | .long 0 /* 0xd8 */ | ||
1141 | .long 0 /* 0xdc */ | ||
1142 | .long 0 /* 0xe0 */ | ||
1143 | .long 0 /* 0xe4 */ | ||
1144 | .long 0 /* 0xe8 */ | ||
1145 | .long 0 /* 0xec */ | ||
1146 | .long 0 /* 0xf0 */ | ||
1147 | .long 0 /* 0xf4 */ | ||
1148 | .long 0 /* 0xf8 */ | ||
1149 | .long 0 /* 0xfc */ | ||
1150 | .long 0 /* 0x100 */ | ||
1151 | .long 0 /* 0x104 */ | ||
1152 | .long 0 /* 0x108 */ | ||
1153 | .long 0 /* 0x10c */ | ||
1154 | .long 0 /* 0x110 */ | ||
1155 | .long 0 /* 0x114 */ | ||
1156 | .long 0 /* 0x118 */ | ||
1157 | .long 0 /* 0x11c */ | ||
1158 | .long 0 /* 0x120 */ | ||
1159 | .long .kvmppc_h_bulk_remove - hcall_real_table | ||
1160 | hcall_real_table_end: | ||
1161 | |||
1162 | ignore_hdec: | ||
1163 | mr r4,r9 | ||
1164 | b fast_guest_return | ||
1165 | |||
1166 | bounce_ext_interrupt: | ||
1167 | mr r4,r9 | ||
1168 | mtspr SPRN_SRR0,r10 | ||
1169 | mtspr SPRN_SRR1,r11 | ||
1170 | li r10,BOOK3S_INTERRUPT_EXTERNAL | ||
1171 | LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME); | ||
1172 | b fast_guest_return | ||
1173 | |||
1174 | _GLOBAL(kvmppc_h_set_dabr) | ||
1175 | std r4,VCPU_DABR(r3) | ||
1176 | mtspr SPRN_DABR,r4 | ||
1177 | li r3,0 | ||
1178 | blr | ||
1179 | |||
1180 | secondary_too_late: | ||
1181 | ld r5,HSTATE_KVM_VCORE(r13) | ||
1182 | HMT_LOW | ||
1183 | 13: lbz r3,VCORE_IN_GUEST(r5) | ||
1184 | cmpwi r3,0 | ||
1185 | bne 13b | ||
1186 | HMT_MEDIUM | ||
1187 | ld r11,PACA_SLBSHADOWPTR(r13) | ||
1188 | |||
1189 | .rept SLB_NUM_BOLTED | ||
1190 | ld r5,SLBSHADOW_SAVEAREA(r11) | ||
1191 | ld r6,SLBSHADOW_SAVEAREA+8(r11) | ||
1192 | andis. r7,r5,SLB_ESID_V@h | ||
1193 | beq 1f | ||
1194 | slbmte r6,r5 | ||
1195 | 1: addi r11,r11,16 | ||
1196 | .endr | ||
1197 | b 50f | ||
1198 | |||
1199 | secondary_nap: | ||
1200 | /* Clear any pending IPI */ | ||
1201 | 50: ld r5, HSTATE_XICS_PHYS(r13) | ||
1202 | li r0, 0xff | ||
1203 | li r6, XICS_QIRR | ||
1204 | stbcix r0, r5, r6 | ||
1205 | |||
1206 | /* increment the nap count and then go to nap mode */ | ||
1207 | ld r4, HSTATE_KVM_VCORE(r13) | ||
1208 | addi r4, r4, VCORE_NAP_COUNT | ||
1209 | lwsync /* make previous updates visible */ | ||
1210 | 51: lwarx r3, 0, r4 | ||
1211 | addi r3, r3, 1 | ||
1212 | stwcx. r3, 0, r4 | ||
1213 | bne 51b | ||
1214 | isync | ||
1215 | |||
1216 | mfspr r4, SPRN_LPCR | ||
1217 | li r0, LPCR_PECE | ||
1218 | andc r4, r4, r0 | ||
1219 | ori r4, r4, LPCR_PECE0 /* exit nap on interrupt */ | ||
1220 | mtspr SPRN_LPCR, r4 | ||
1221 | li r0, 0 | ||
1222 | std r0, HSTATE_SCRATCH0(r13) | ||
1223 | ptesync | ||
1224 | ld r0, HSTATE_SCRATCH0(r13) | ||
1225 | 1: cmpd r0, r0 | ||
1226 | bne 1b | ||
1227 | nap | ||
1228 | b . | ||
1229 | |||
1230 | /* | ||
1231 | * Save away FP, VMX and VSX registers. | ||
1232 | * r3 = vcpu pointer | ||
1233 | */ | ||
1234 | _GLOBAL(kvmppc_save_fp) | ||
1235 | mfmsr r9 | ||
1236 | ori r8,r9,MSR_FP | ||
1237 | #ifdef CONFIG_ALTIVEC | ||
1238 | BEGIN_FTR_SECTION | ||
1239 | oris r8,r8,MSR_VEC@h | ||
1240 | END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) | ||
1241 | #endif | ||
1242 | #ifdef CONFIG_VSX | ||
1243 | BEGIN_FTR_SECTION | ||
1244 | oris r8,r8,MSR_VSX@h | ||
1245 | END_FTR_SECTION_IFSET(CPU_FTR_VSX) | ||
1246 | #endif | ||
1247 | mtmsrd r8 | ||
1248 | isync | ||
1249 | #ifdef CONFIG_VSX | ||
1250 | BEGIN_FTR_SECTION | ||
1251 | reg = 0 | ||
1252 | .rept 32 | ||
1253 | li r6,reg*16+VCPU_VSRS | ||
1254 | stxvd2x reg,r6,r3 | ||
1255 | reg = reg + 1 | ||
1256 | .endr | ||
1257 | FTR_SECTION_ELSE | ||
1258 | #endif | ||
1259 | reg = 0 | ||
1260 | .rept 32 | ||
1261 | stfd reg,reg*8+VCPU_FPRS(r3) | ||
1262 | reg = reg + 1 | ||
1263 | .endr | ||
1264 | #ifdef CONFIG_VSX | ||
1265 | ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX) | ||
1266 | #endif | ||
1267 | mffs fr0 | ||
1268 | stfd fr0,VCPU_FPSCR(r3) | ||
1269 | |||
1270 | #ifdef CONFIG_ALTIVEC | ||
1271 | BEGIN_FTR_SECTION | ||
1272 | reg = 0 | ||
1273 | .rept 32 | ||
1274 | li r6,reg*16+VCPU_VRS | ||
1275 | stvx reg,r6,r3 | ||
1276 | reg = reg + 1 | ||
1277 | .endr | ||
1278 | mfvscr vr0 | ||
1279 | li r6,VCPU_VSCR | ||
1280 | stvx vr0,r6,r3 | ||
1281 | END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) | ||
1282 | #endif | ||
1283 | mfspr r6,SPRN_VRSAVE | ||
1284 | stw r6,VCPU_VRSAVE(r3) | ||
1285 | mtmsrd r9 | ||
1286 | isync | ||
1287 | blr | ||
1288 | |||
1289 | /* | ||
1290 | * Load up FP, VMX and VSX registers | ||
1291 | * r4 = vcpu pointer | ||
1292 | */ | ||
1293 | .globl kvmppc_load_fp | ||
1294 | kvmppc_load_fp: | ||
1295 | mfmsr r9 | ||
1296 | ori r8,r9,MSR_FP | ||
1297 | #ifdef CONFIG_ALTIVEC | ||
1298 | BEGIN_FTR_SECTION | ||
1299 | oris r8,r8,MSR_VEC@h | ||
1300 | END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) | ||
1301 | #endif | ||
1302 | #ifdef CONFIG_VSX | ||
1303 | BEGIN_FTR_SECTION | ||
1304 | oris r8,r8,MSR_VSX@h | ||
1305 | END_FTR_SECTION_IFSET(CPU_FTR_VSX) | ||
1306 | #endif | ||
1307 | mtmsrd r8 | ||
1308 | isync | ||
1309 | lfd fr0,VCPU_FPSCR(r4) | ||
1310 | MTFSF_L(fr0) | ||
1311 | #ifdef CONFIG_VSX | ||
1312 | BEGIN_FTR_SECTION | ||
1313 | reg = 0 | ||
1314 | .rept 32 | ||
1315 | li r7,reg*16+VCPU_VSRS | ||
1316 | lxvd2x reg,r7,r4 | ||
1317 | reg = reg + 1 | ||
1318 | .endr | ||
1319 | FTR_SECTION_ELSE | ||
1320 | #endif | ||
1321 | reg = 0 | ||
1322 | .rept 32 | ||
1323 | lfd reg,reg*8+VCPU_FPRS(r4) | ||
1324 | reg = reg + 1 | ||
1325 | .endr | ||
1326 | #ifdef CONFIG_VSX | ||
1327 | ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX) | ||
1328 | #endif | ||
1329 | |||
1330 | #ifdef CONFIG_ALTIVEC | ||
1331 | BEGIN_FTR_SECTION | ||
1332 | li r7,VCPU_VSCR | ||
1333 | lvx vr0,r7,r4 | ||
1334 | mtvscr vr0 | ||
1335 | reg = 0 | ||
1336 | .rept 32 | ||
1337 | li r7,reg*16+VCPU_VRS | ||
1338 | lvx reg,r7,r4 | ||
1339 | reg = reg + 1 | ||
1340 | .endr | ||
1341 | END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) | ||
1342 | #endif | ||
1343 | lwz r7,VCPU_VRSAVE(r4) | ||
1344 | mtspr SPRN_VRSAVE,r7 | ||
1345 | blr | ||
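One way to read the hcall_try_real_mode / hcall_real_table pair above: the hcall number in the guest's r3 is rounded down to a multiple of 4, bounds-checked against the table length, and used as a byte index into a table of 32-bit offsets, where a zero entry means there is no real-mode handler and a handler returning H_TOO_HARD falls back to the virtual-mode path (the MSR_PR check that rejects problem-state callers is omitted here). A rough C model of that dispatch follows; the function-pointer table and handler signature are stand-ins, and the H_TOO_HARD value is an assumption:

#define H_TOO_HARD	9999	/* assumed: KVM-internal "punt to virtual mode" */
#define REAL_TABLE_LEN	0x128	/* hcall_real_table_end - hcall_real_table */

typedef long (*real_hcall_fn)(void *vcpu, unsigned long arg);

/* The assembly stores 32-bit offsets from the table base (0 = no real-mode
 * handler); a function-pointer array is the closest C analogue. */
static real_hcall_fn real_hcall_table[REAL_TABLE_LEN / 4];

/* Returns 0 and fills *ret if the hcall was handled in real mode,
 * nonzero if it must continue up to the kernel/qemu path. */
static int try_real_mode_hcall(void *vcpu, unsigned long nr,
			       unsigned long arg, long *ret)
{
	real_hcall_fn fn;

	nr &= ~3UL;			/* clrrdi r3,r3,2 */
	if (nr >= REAL_TABLE_LEN)	/* cmpldi / bge hcall_real_cont */
		return 1;
	fn = real_hcall_table[nr / 4];
	if (!fn)			/* zero entry: no real-mode handler */
		return 1;
	*ret = fn(vcpu, arg);		/* mtctr / bctrl */
	if (*ret == H_TOO_HARD)		/* handler punted */
		return 1;
	return 0;
}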
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index 2f0bc928b08a..c54b0e30cf3f 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S | |||
@@ -29,8 +29,7 @@ | |||
29 | #define ULONG_SIZE 8 | 29 | #define ULONG_SIZE 8 |
30 | #define FUNC(name) GLUE(.,name) | 30 | #define FUNC(name) GLUE(.,name) |
31 | 31 | ||
32 | #define GET_SHADOW_VCPU(reg) \ | 32 | #define GET_SHADOW_VCPU_R13 |
33 | addi reg, r13, PACA_KVM_SVCPU | ||
34 | 33 | ||
35 | #define DISABLE_INTERRUPTS \ | 34 | #define DISABLE_INTERRUPTS \ |
36 | mfmsr r0; \ | 35 | mfmsr r0; \ |
@@ -43,8 +42,8 @@ | |||
43 | #define ULONG_SIZE 4 | 42 | #define ULONG_SIZE 4 |
44 | #define FUNC(name) name | 43 | #define FUNC(name) name |
45 | 44 | ||
46 | #define GET_SHADOW_VCPU(reg) \ | 45 | #define GET_SHADOW_VCPU_R13 \ |
47 | lwz reg, (THREAD + THREAD_KVM_SVCPU)(r2) | 46 | lwz r13, (THREAD + THREAD_KVM_SVCPU)(r2) |
48 | 47 | ||
49 | #define DISABLE_INTERRUPTS \ | 48 | #define DISABLE_INTERRUPTS \ |
50 | mfmsr r0; \ | 49 | mfmsr r0; \ |
@@ -85,7 +84,7 @@ | |||
85 | * r3: kvm_run pointer | 84 | * r3: kvm_run pointer |
86 | * r4: vcpu pointer | 85 | * r4: vcpu pointer |
87 | */ | 86 | */ |
88 | _GLOBAL(__kvmppc_vcpu_entry) | 87 | _GLOBAL(__kvmppc_vcpu_run) |
89 | 88 | ||
90 | kvm_start_entry: | 89 | kvm_start_entry: |
91 | /* Write correct stack frame */ | 90 | /* Write correct stack frame */ |
@@ -107,17 +106,11 @@ kvm_start_entry: | |||
107 | /* Load non-volatile guest state from the vcpu */ | 106 | /* Load non-volatile guest state from the vcpu */ |
108 | VCPU_LOAD_NVGPRS(r4) | 107 | VCPU_LOAD_NVGPRS(r4) |
109 | 108 | ||
110 | GET_SHADOW_VCPU(r5) | 109 | kvm_start_lightweight: |
111 | |||
112 | /* Save R1/R2 in the PACA */ | ||
113 | PPC_STL r1, SVCPU_HOST_R1(r5) | ||
114 | PPC_STL r2, SVCPU_HOST_R2(r5) | ||
115 | 110 | ||
116 | /* XXX swap in/out on load? */ | 111 | GET_SHADOW_VCPU_R13 |
117 | PPC_LL r3, VCPU_HIGHMEM_HANDLER(r4) | 112 | PPC_LL r3, VCPU_HIGHMEM_HANDLER(r4) |
118 | PPC_STL r3, SVCPU_VMHANDLER(r5) | 113 | PPC_STL r3, HSTATE_VMHANDLER(r13) |
119 | |||
120 | kvm_start_lightweight: | ||
121 | 114 | ||
122 | PPC_LL r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */ | 115 | PPC_LL r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */ |
123 | 116 | ||
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c index 79751d8dd131..41cb0017e757 100644 --- a/arch/powerpc/kvm/book3s_mmu_hpte.c +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c | |||
@@ -21,7 +21,6 @@ | |||
21 | #include <linux/kvm_host.h> | 21 | #include <linux/kvm_host.h> |
22 | #include <linux/hash.h> | 22 | #include <linux/hash.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include "trace.h" | ||
25 | 24 | ||
26 | #include <asm/kvm_ppc.h> | 25 | #include <asm/kvm_ppc.h> |
27 | #include <asm/kvm_book3s.h> | 26 | #include <asm/kvm_book3s.h> |
@@ -29,6 +28,8 @@ | |||
29 | #include <asm/mmu_context.h> | 28 | #include <asm/mmu_context.h> |
30 | #include <asm/hw_irq.h> | 29 | #include <asm/hw_irq.h> |
31 | 30 | ||
31 | #include "trace.h" | ||
32 | |||
32 | #define PTE_SIZE 12 | 33 | #define PTE_SIZE 12 |
33 | 34 | ||
34 | static struct kmem_cache *hpte_cache; | 35 | static struct kmem_cache *hpte_cache; |
@@ -58,30 +59,31 @@ static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) | |||
58 | void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) | 59 | void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) |
59 | { | 60 | { |
60 | u64 index; | 61 | u64 index; |
62 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
61 | 63 | ||
62 | trace_kvm_book3s_mmu_map(pte); | 64 | trace_kvm_book3s_mmu_map(pte); |
63 | 65 | ||
64 | spin_lock(&vcpu->arch.mmu_lock); | 66 | spin_lock(&vcpu3s->mmu_lock); |
65 | 67 | ||
66 | /* Add to ePTE list */ | 68 | /* Add to ePTE list */ |
67 | index = kvmppc_mmu_hash_pte(pte->pte.eaddr); | 69 | index = kvmppc_mmu_hash_pte(pte->pte.eaddr); |
68 | hlist_add_head_rcu(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]); | 70 | hlist_add_head_rcu(&pte->list_pte, &vcpu3s->hpte_hash_pte[index]); |
69 | 71 | ||
70 | /* Add to ePTE_long list */ | 72 | /* Add to ePTE_long list */ |
71 | index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr); | 73 | index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr); |
72 | hlist_add_head_rcu(&pte->list_pte_long, | 74 | hlist_add_head_rcu(&pte->list_pte_long, |
73 | &vcpu->arch.hpte_hash_pte_long[index]); | 75 | &vcpu3s->hpte_hash_pte_long[index]); |
74 | 76 | ||
75 | /* Add to vPTE list */ | 77 | /* Add to vPTE list */ |
76 | index = kvmppc_mmu_hash_vpte(pte->pte.vpage); | 78 | index = kvmppc_mmu_hash_vpte(pte->pte.vpage); |
77 | hlist_add_head_rcu(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]); | 79 | hlist_add_head_rcu(&pte->list_vpte, &vcpu3s->hpte_hash_vpte[index]); |
78 | 80 | ||
79 | /* Add to vPTE_long list */ | 81 | /* Add to vPTE_long list */ |
80 | index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage); | 82 | index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage); |
81 | hlist_add_head_rcu(&pte->list_vpte_long, | 83 | hlist_add_head_rcu(&pte->list_vpte_long, |
82 | &vcpu->arch.hpte_hash_vpte_long[index]); | 84 | &vcpu3s->hpte_hash_vpte_long[index]); |
83 | 85 | ||
84 | spin_unlock(&vcpu->arch.mmu_lock); | 86 | spin_unlock(&vcpu3s->mmu_lock); |
85 | } | 87 | } |
86 | 88 | ||
87 | static void free_pte_rcu(struct rcu_head *head) | 89 | static void free_pte_rcu(struct rcu_head *head) |
@@ -92,16 +94,18 @@ static void free_pte_rcu(struct rcu_head *head) | |||
92 | 94 | ||
93 | static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) | 95 | static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) |
94 | { | 96 | { |
97 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
98 | |||
95 | trace_kvm_book3s_mmu_invalidate(pte); | 99 | trace_kvm_book3s_mmu_invalidate(pte); |
96 | 100 | ||
97 | /* Different for 32 and 64 bit */ | 101 | /* Different for 32 and 64 bit */ |
98 | kvmppc_mmu_invalidate_pte(vcpu, pte); | 102 | kvmppc_mmu_invalidate_pte(vcpu, pte); |
99 | 103 | ||
100 | spin_lock(&vcpu->arch.mmu_lock); | 104 | spin_lock(&vcpu3s->mmu_lock); |
101 | 105 | ||
102 | /* pte already invalidated in between? */ | 106 | /* pte already invalidated in between? */ |
103 | if (hlist_unhashed(&pte->list_pte)) { | 107 | if (hlist_unhashed(&pte->list_pte)) { |
104 | spin_unlock(&vcpu->arch.mmu_lock); | 108 | spin_unlock(&vcpu3s->mmu_lock); |
105 | return; | 109 | return; |
106 | } | 110 | } |
107 | 111 | ||
@@ -115,14 +119,15 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) | |||
115 | else | 119 | else |
116 | kvm_release_pfn_clean(pte->pfn); | 120 | kvm_release_pfn_clean(pte->pfn); |
117 | 121 | ||
118 | spin_unlock(&vcpu->arch.mmu_lock); | 122 | spin_unlock(&vcpu3s->mmu_lock); |
119 | 123 | ||
120 | vcpu->arch.hpte_cache_count--; | 124 | vcpu3s->hpte_cache_count--; |
121 | call_rcu(&pte->rcu_head, free_pte_rcu); | 125 | call_rcu(&pte->rcu_head, free_pte_rcu); |
122 | } | 126 | } |
123 | 127 | ||
124 | static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) | 128 | static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) |
125 | { | 129 | { |
130 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
126 | struct hpte_cache *pte; | 131 | struct hpte_cache *pte; |
127 | struct hlist_node *node; | 132 | struct hlist_node *node; |
128 | int i; | 133 | int i; |
@@ -130,7 +135,7 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) | |||
130 | rcu_read_lock(); | 135 | rcu_read_lock(); |
131 | 136 | ||
132 | for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { | 137 | for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { |
133 | struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; | 138 | struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i]; |
134 | 139 | ||
135 | hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) | 140 | hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) |
136 | invalidate_pte(vcpu, pte); | 141 | invalidate_pte(vcpu, pte); |
@@ -141,12 +146,13 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) | |||
141 | 146 | ||
142 | static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) | 147 | static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) |
143 | { | 148 | { |
149 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
144 | struct hlist_head *list; | 150 | struct hlist_head *list; |
145 | struct hlist_node *node; | 151 | struct hlist_node *node; |
146 | struct hpte_cache *pte; | 152 | struct hpte_cache *pte; |
147 | 153 | ||
148 | /* Find the list of entries in the map */ | 154 | /* Find the list of entries in the map */ |
149 | list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)]; | 155 | list = &vcpu3s->hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)]; |
150 | 156 | ||
151 | rcu_read_lock(); | 157 | rcu_read_lock(); |
152 | 158 | ||
@@ -160,12 +166,13 @@ static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) | |||
160 | 166 | ||
161 | static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea) | 167 | static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea) |
162 | { | 168 | { |
169 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
163 | struct hlist_head *list; | 170 | struct hlist_head *list; |
164 | struct hlist_node *node; | 171 | struct hlist_node *node; |
165 | struct hpte_cache *pte; | 172 | struct hpte_cache *pte; |
166 | 173 | ||
167 | /* Find the list of entries in the map */ | 174 | /* Find the list of entries in the map */ |
168 | list = &vcpu->arch.hpte_hash_pte_long[ | 175 | list = &vcpu3s->hpte_hash_pte_long[ |
169 | kvmppc_mmu_hash_pte_long(guest_ea)]; | 176 | kvmppc_mmu_hash_pte_long(guest_ea)]; |
170 | 177 | ||
171 | rcu_read_lock(); | 178 | rcu_read_lock(); |
@@ -203,12 +210,13 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask) | |||
203 | /* Flush with mask 0xfffffffff */ | 210 | /* Flush with mask 0xfffffffff */ |
204 | static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) | 211 | static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) |
205 | { | 212 | { |
213 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
206 | struct hlist_head *list; | 214 | struct hlist_head *list; |
207 | struct hlist_node *node; | 215 | struct hlist_node *node; |
208 | struct hpte_cache *pte; | 216 | struct hpte_cache *pte; |
209 | u64 vp_mask = 0xfffffffffULL; | 217 | u64 vp_mask = 0xfffffffffULL; |
210 | 218 | ||
211 | list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)]; | 219 | list = &vcpu3s->hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)]; |
212 | 220 | ||
213 | rcu_read_lock(); | 221 | rcu_read_lock(); |
214 | 222 | ||
@@ -223,12 +231,13 @@ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) | |||
223 | /* Flush with mask 0xffffff000 */ | 231 | /* Flush with mask 0xffffff000 */ |
224 | static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) | 232 | static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) |
225 | { | 233 | { |
234 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
226 | struct hlist_head *list; | 235 | struct hlist_head *list; |
227 | struct hlist_node *node; | 236 | struct hlist_node *node; |
228 | struct hpte_cache *pte; | 237 | struct hpte_cache *pte; |
229 | u64 vp_mask = 0xffffff000ULL; | 238 | u64 vp_mask = 0xffffff000ULL; |
230 | 239 | ||
231 | list = &vcpu->arch.hpte_hash_vpte_long[ | 240 | list = &vcpu3s->hpte_hash_vpte_long[ |
232 | kvmppc_mmu_hash_vpte_long(guest_vp)]; | 241 | kvmppc_mmu_hash_vpte_long(guest_vp)]; |
233 | 242 | ||
234 | rcu_read_lock(); | 243 | rcu_read_lock(); |
@@ -261,6 +270,7 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) | |||
261 | 270 | ||
262 | void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) | 271 | void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) |
263 | { | 272 | { |
273 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
264 | struct hlist_node *node; | 274 | struct hlist_node *node; |
265 | struct hpte_cache *pte; | 275 | struct hpte_cache *pte; |
266 | int i; | 276 | int i; |
@@ -270,7 +280,7 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) | |||
270 | rcu_read_lock(); | 280 | rcu_read_lock(); |
271 | 281 | ||
272 | for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { | 282 | for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { |
273 | struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; | 283 | struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i]; |
274 | 284 | ||
275 | hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) | 285 | hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) |
276 | if ((pte->pte.raddr >= pa_start) && | 286 | if ((pte->pte.raddr >= pa_start) && |
@@ -283,12 +293,13 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) | |||
283 | 293 | ||
284 | struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) | 294 | struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) |
285 | { | 295 | { |
296 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
286 | struct hpte_cache *pte; | 297 | struct hpte_cache *pte; |
287 | 298 | ||
288 | pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); | 299 | pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); |
289 | vcpu->arch.hpte_cache_count++; | 300 | vcpu3s->hpte_cache_count++; |
290 | 301 | ||
291 | if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM) | 302 | if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM) |
292 | kvmppc_mmu_pte_flush_all(vcpu); | 303 | kvmppc_mmu_pte_flush_all(vcpu); |
293 | 304 | ||
294 | return pte; | 305 | return pte; |
@@ -309,17 +320,19 @@ static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len) | |||
309 | 320 | ||
310 | int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu) | 321 | int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu) |
311 | { | 322 | { |
323 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
324 | |||
312 | /* init hpte lookup hashes */ | 325 | /* init hpte lookup hashes */ |
313 | kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte, | 326 | kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte, |
314 | ARRAY_SIZE(vcpu->arch.hpte_hash_pte)); | 327 | ARRAY_SIZE(vcpu3s->hpte_hash_pte)); |
315 | kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte_long, | 328 | kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte_long, |
316 | ARRAY_SIZE(vcpu->arch.hpte_hash_pte_long)); | 329 | ARRAY_SIZE(vcpu3s->hpte_hash_pte_long)); |
317 | kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte, | 330 | kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte, |
318 | ARRAY_SIZE(vcpu->arch.hpte_hash_vpte)); | 331 | ARRAY_SIZE(vcpu3s->hpte_hash_vpte)); |
319 | kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long, | 332 | kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long, |
320 | ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long)); | 333 | ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long)); |
321 | 334 | ||
322 | spin_lock_init(&vcpu->arch.mmu_lock); | 335 | spin_lock_init(&vcpu3s->mmu_lock); |
323 | 336 | ||
324 | return 0; | 337 | return 0; |
325 | } | 338 | } |
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c new file mode 100644 index 000000000000..0c0d3f274437 --- /dev/null +++ b/arch/powerpc/kvm/book3s_pr.c | |||
@@ -0,0 +1,1029 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. | ||
3 | * | ||
4 | * Authors: | ||
5 | * Alexander Graf <agraf@suse.de> | ||
6 | * Kevin Wolf <mail@kevin-wolf.de> | ||
7 | * Paul Mackerras <paulus@samba.org> | ||
8 | * | ||
9 | * Description: | ||
10 | * Functions relating to running KVM on Book 3S processors where | ||
11 | * we don't have access to hypervisor mode, and we run the guest | ||
12 | * in problem state (user mode). | ||
13 | * | ||
14 | * This file is derived from arch/powerpc/kvm/44x.c, | ||
15 | * by Hollis Blanchard <hollisb@us.ibm.com>. | ||
16 | * | ||
17 | * This program is free software; you can redistribute it and/or modify | ||
18 | * it under the terms of the GNU General Public License, version 2, as | ||
19 | * published by the Free Software Foundation. | ||
20 | */ | ||
21 | |||
22 | #include <linux/kvm_host.h> | ||
23 | #include <linux/err.h> | ||
24 | #include <linux/slab.h> | ||
25 | |||
26 | #include <asm/reg.h> | ||
27 | #include <asm/cputable.h> | ||
28 | #include <asm/cacheflush.h> | ||
29 | #include <asm/tlbflush.h> | ||
30 | #include <asm/uaccess.h> | ||
31 | #include <asm/io.h> | ||
32 | #include <asm/kvm_ppc.h> | ||
33 | #include <asm/kvm_book3s.h> | ||
34 | #include <asm/mmu_context.h> | ||
35 | #include <linux/gfp.h> | ||
36 | #include <linux/sched.h> | ||
37 | #include <linux/vmalloc.h> | ||
38 | #include <linux/highmem.h> | ||
39 | |||
40 | #include "trace.h" | ||
41 | |||
42 | /* #define EXIT_DEBUG */ | ||
43 | /* #define DEBUG_EXT */ | ||
44 | |||
45 | static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, | ||
46 | ulong msr); | ||
47 | |||
48 | /* Some compatibility defines */ | ||
49 | #ifdef CONFIG_PPC_BOOK3S_32 | ||
50 | #define MSR_USER32 MSR_USER | ||
51 | #define MSR_USER64 MSR_USER | ||
52 | #define HW_PAGE_SIZE PAGE_SIZE | ||
53 | #endif | ||
54 | |||
55 | void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | ||
56 | { | ||
57 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
58 | memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb)); | ||
59 | memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu, | ||
60 | sizeof(get_paca()->shadow_vcpu)); | ||
61 | to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max; | ||
62 | #endif | ||
63 | |||
64 | #ifdef CONFIG_PPC_BOOK3S_32 | ||
65 | current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; | ||
66 | #endif | ||
67 | } | ||
68 | |||
69 | void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) | ||
70 | { | ||
71 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
72 | memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb)); | ||
73 | memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, | ||
74 | sizeof(get_paca()->shadow_vcpu)); | ||
75 | to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max; | ||
76 | #endif | ||
77 | |||
78 | kvmppc_giveup_ext(vcpu, MSR_FP); | ||
79 | kvmppc_giveup_ext(vcpu, MSR_VEC); | ||
80 | kvmppc_giveup_ext(vcpu, MSR_VSX); | ||
81 | } | ||
82 | |||
83 | static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) | ||
84 | { | ||
85 | ulong smsr = vcpu->arch.shared->msr; | ||
86 | |||
87 | /* Guest MSR values */ | ||
88 | smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE; | ||
89 | /* Process MSR values */ | ||
90 | smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; | ||
91 | /* External-provider bits (FPU/Altivec/VSX) the guest currently owns */ | ||
92 | smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext); | ||
93 | /* 64-bit Process MSR values */ | ||
94 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
95 | smsr |= MSR_ISF | MSR_HV; | ||
96 | #endif | ||
97 | vcpu->arch.shadow_msr = smsr; | ||
98 | } | ||
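The shadow MSR computed above combines three bit classes: bits the guest may control directly, bits the host always forces on while the guest runs, and external-provider bits (FP/VEC/VSX) the guest currently owns. The following minimal, standalone C sketch illustrates that partition; the X_MSR_* constants are illustrative placeholders, not the real asm/reg.h values.

	#include <stdint.h>
	#include <stdio.h>

	/* Illustrative placeholder bits -- NOT the real asm/reg.h MSR layout. */
	#define X_MSR_FE0	(1u << 0)	/* guest-controlled: FP exception mode 0 */
	#define X_MSR_SE	(1u << 1)	/* guest-controlled: single step */
	#define X_MSR_EE	(1u << 2)	/* host-forced: external interrupts enabled */
	#define X_MSR_PR	(1u << 3)	/* host-forced: problem state */
	#define X_MSR_FP	(1u << 4)	/* external provider: FPU */
	#define X_MSR_VEC	(1u << 5)	/* external provider: Altivec */

	static uint32_t recalc_shadow_msr(uint32_t guest_msr, uint32_t guest_owned_ext)
	{
		uint32_t smsr = guest_msr;

		smsr &= X_MSR_FE0 | X_MSR_SE;		/* keep guest-controlled bits */
		smsr |= X_MSR_EE | X_MSR_PR;		/* force host-mandated bits */
		smsr |= guest_msr & guest_owned_ext;	/* add owned external providers */
		return smsr;
	}

	int main(void)
	{
		/* guest enabled FP, Altivec and single-step, but only owns the FPU */
		uint32_t guest_msr = X_MSR_FP | X_MSR_VEC | X_MSR_SE;

		printf("shadow msr = 0x%x\n", recalc_shadow_msr(guest_msr, X_MSR_FP));
		return 0;
	}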
99 | |||
100 | void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) | ||
101 | { | ||
102 | ulong old_msr = vcpu->arch.shared->msr; | ||
103 | |||
104 | #ifdef EXIT_DEBUG | ||
105 | printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); | ||
106 | #endif | ||
107 | |||
108 | msr &= to_book3s(vcpu)->msr_mask; | ||
109 | vcpu->arch.shared->msr = msr; | ||
110 | kvmppc_recalc_shadow_msr(vcpu); | ||
111 | |||
112 | if (msr & MSR_POW) { | ||
113 | if (!vcpu->arch.pending_exceptions) { | ||
114 | kvm_vcpu_block(vcpu); | ||
115 | vcpu->stat.halt_wakeup++; | ||
116 | |||
117 | /* Unset POW bit after we woke up */ | ||
118 | msr &= ~MSR_POW; | ||
119 | vcpu->arch.shared->msr = msr; | ||
120 | } | ||
121 | } | ||
122 | |||
123 | if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) != | ||
124 | (old_msr & (MSR_PR|MSR_IR|MSR_DR))) { | ||
125 | kvmppc_mmu_flush_segments(vcpu); | ||
126 | kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); | ||
127 | |||
128 | /* Preload magic page segment when in kernel mode */ | ||
129 | if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) { | ||
130 | struct kvm_vcpu_arch *a = &vcpu->arch; | ||
131 | |||
132 | if (msr & MSR_DR) | ||
133 | kvmppc_mmu_map_segment(vcpu, a->magic_page_ea); | ||
134 | else | ||
135 | kvmppc_mmu_map_segment(vcpu, a->magic_page_pa); | ||
136 | } | ||
137 | } | ||
138 | |||
139 | /* Preload FPU if it's enabled */ | ||
140 | if (vcpu->arch.shared->msr & MSR_FP) | ||
141 | kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); | ||
142 | } | ||
143 | |||
144 | void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) | ||
145 | { | ||
146 | u32 host_pvr; | ||
147 | |||
148 | vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB; | ||
149 | vcpu->arch.pvr = pvr; | ||
150 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
151 | if ((pvr >= 0x330000) && (pvr < 0x70330000)) { | ||
152 | kvmppc_mmu_book3s_64_init(vcpu); | ||
153 | to_book3s(vcpu)->hior = 0xfff00000; | ||
154 | to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; | ||
155 | } else | ||
156 | #endif | ||
157 | { | ||
158 | kvmppc_mmu_book3s_32_init(vcpu); | ||
159 | to_book3s(vcpu)->hior = 0; | ||
160 | to_book3s(vcpu)->msr_mask = 0xffffffffULL; | ||
161 | } | ||
162 | |||
163 | /* If we are at hypervisor level on a 970, we can tell the CPU to | ||
164 | * treat DCBZ as a 32-byte store */ | ||
165 | vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32; | ||
166 | if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) && | ||
167 | !strcmp(cur_cpu_spec->platform, "ppc970")) | ||
168 | vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; | ||
169 | |||
170 | /* Cell performs badly if MSR_FEx are set. So let's hope nobody | ||
171 | really needs them in a VM on Cell and force-disable them. */ | ||
172 | if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be")) | ||
173 | to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1); | ||
174 | |||
175 | #ifdef CONFIG_PPC_BOOK3S_32 | ||
176 | /* 32 bit Book3S always has 32 byte dcbz */ | ||
177 | vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; | ||
178 | #endif | ||
179 | |||
180 | /* On some CPUs we can execute paired single operations natively */ | ||
181 | asm ( "mfpvr %0" : "=r"(host_pvr)); | ||
182 | switch (host_pvr) { | ||
183 | case 0x00080200: /* lonestar 2.0 */ | ||
184 | case 0x00088202: /* lonestar 2.2 */ | ||
185 | case 0x70000100: /* gekko 1.0 */ | ||
186 | case 0x00080100: /* gekko 2.0 */ | ||
187 | case 0x00083203: /* gekko 2.3a */ | ||
188 | case 0x00083213: /* gekko 2.3b */ | ||
189 | case 0x00083204: /* gekko 2.4 */ | ||
190 | case 0x00083214: /* gekko 2.4e (8SE) - retail HW2 */ | ||
191 | case 0x00087200: /* broadway */ | ||
192 | vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS; | ||
193 | /* Enable HID2.PSE - in case we need it later */ | ||
194 | mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29)); | ||
195 | } | ||
196 | } | ||
197 | |||
198 | /* Book3s_32 CPUs always have a 32-byte cache line size, which Linux assumes. To | ||
199 | * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to | ||
200 | * emulate a 32-byte dcbz length. | ||
201 | * | ||
202 | * The Book3s_64 inventors also realized this case and implemented a special bit | ||
203 | * in the HID5 register, which is a hypervisor resource. Thus we can't use it. | ||
204 | * | ||
205 | * My approach here is to patch the dcbz instruction on executing pages. | ||
206 | */ | ||
207 | static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) | ||
208 | { | ||
209 | struct page *hpage; | ||
210 | u64 hpage_offset; | ||
211 | u32 *page; | ||
212 | int i; | ||
213 | |||
214 | hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); | ||
215 | if (is_error_page(hpage)) { | ||
216 | kvm_release_page_clean(hpage); | ||
217 | return; | ||
218 | } | ||
219 | |||
220 | hpage_offset = pte->raddr & ~PAGE_MASK; | ||
221 | hpage_offset &= ~0xFFFULL; | ||
222 | hpage_offset /= 4; | ||
223 | |||
224 | get_page(hpage); | ||
225 | page = kmap_atomic(hpage, KM_USER0); | ||
226 | |||
227 | /* patch dcbz into reserved instruction, so we trap */ | ||
228 | for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++) | ||
229 | if ((page[i] & 0xff0007ff) == INS_DCBZ) | ||
230 | page[i] &= 0xfffffff7; | ||
231 | |||
232 | kunmap_atomic(page, KM_USER0); | ||
233 | put_page(hpage); | ||
234 | } | ||
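The patch loop above matches candidate words against INS_DCBZ with the register fields masked off, then clears one opcode bit so the word no longer decodes as dcbz and instead traps, letting KVM emulate the 32-byte clear. Below is a minimal sketch of that test-and-clear on single instruction words; the INS_DCBZ value is assumed to match the kernel's definition and is used here only for illustration.

	#include <stdint.h>
	#include <stdio.h>

	/* Assumed to match the kernel's INS_DCBZ (dcbz with the register
	 * fields zeroed); treat the exact value as illustrative. */
	#define INS_DCBZ 0x7c0007ecu

	/* Mirror of the patch loop's test-and-clear on one instruction word. */
	static uint32_t patch_word(uint32_t insn)
	{
		/* mask out the rA/rB register fields before comparing */
		if ((insn & 0xff0007ff) == INS_DCBZ)
			insn &= 0xfffffff7;	/* no longer decodes as dcbz -> traps */
		return insn;
	}

	int main(void)
	{
		uint32_t dcbz_r3_r4 = INS_DCBZ | (3 << 16) | (4 << 11); /* dcbz r3,r4 */
		uint32_t add_r3_r3_r4 = 0x7c632214;	/* unrelated instruction, left alone */

		printf("0x%08x -> 0x%08x\n", dcbz_r3_r4, patch_word(dcbz_r3_r4));
		printf("0x%08x -> 0x%08x\n", add_r3_r3_r4, patch_word(add_r3_r3_r4));
		return 0;
	}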
235 | |||
236 | static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
237 | { | ||
238 | ulong mp_pa = vcpu->arch.magic_page_pa; | ||
239 | |||
240 | if (unlikely(mp_pa) && | ||
241 | unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) { | ||
242 | return 1; | ||
243 | } | ||
244 | |||
245 | return kvm_is_visible_gfn(vcpu->kvm, gfn); | ||
246 | } | ||
247 | |||
248 | int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, | ||
249 | ulong eaddr, int vec) | ||
250 | { | ||
251 | bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE); | ||
252 | int r = RESUME_GUEST; | ||
253 | int relocated; | ||
254 | int page_found = 0; | ||
255 | struct kvmppc_pte pte; | ||
256 | bool is_mmio = false; | ||
257 | bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false; | ||
258 | bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false; | ||
259 | u64 vsid; | ||
260 | |||
261 | relocated = data ? dr : ir; | ||
262 | |||
263 | /* Resolve real address if translation turned on */ | ||
264 | if (relocated) { | ||
265 | page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data); | ||
266 | } else { | ||
267 | pte.may_execute = true; | ||
268 | pte.may_read = true; | ||
269 | pte.may_write = true; | ||
270 | pte.raddr = eaddr & KVM_PAM; | ||
271 | pte.eaddr = eaddr; | ||
272 | pte.vpage = eaddr >> 12; | ||
273 | } | ||
274 | |||
275 | switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { | ||
276 | case 0: | ||
277 | pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12)); | ||
278 | break; | ||
279 | case MSR_DR: | ||
280 | case MSR_IR: | ||
281 | vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); | ||
282 | |||
283 | if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR) | ||
284 | pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12)); | ||
285 | else | ||
286 | pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12)); | ||
287 | pte.vpage |= vsid; | ||
288 | |||
289 | if (vsid == -1) | ||
290 | page_found = -EINVAL; | ||
291 | break; | ||
292 | } | ||
293 | |||
294 | if (vcpu->arch.mmu.is_dcbz32(vcpu) && | ||
295 | (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { | ||
296 | /* | ||
297 | * If we do the dcbz hack, we have to NX on every execution, | ||
298 | * so we can patch the executing code. This renders our guest | ||
299 | * NX-less. | ||
300 | */ | ||
301 | pte.may_execute = !data; | ||
302 | } | ||
303 | |||
304 | if (page_found == -ENOENT) { | ||
305 | /* Page not found in guest PTE entries */ | ||
306 | vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); | ||
307 | vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; | ||
308 | vcpu->arch.shared->msr |= | ||
309 | (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); | ||
310 | kvmppc_book3s_queue_irqprio(vcpu, vec); | ||
311 | } else if (page_found == -EPERM) { | ||
312 | /* Storage protection */ | ||
313 | vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); | ||
314 | vcpu->arch.shared->dsisr = | ||
315 | to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE; | ||
316 | vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; | ||
317 | vcpu->arch.shared->msr |= | ||
318 | (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); | ||
319 | kvmppc_book3s_queue_irqprio(vcpu, vec); | ||
320 | } else if (page_found == -EINVAL) { | ||
321 | /* Page not found in guest SLB */ | ||
322 | vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); | ||
323 | kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); | ||
324 | } else if (!is_mmio && | ||
325 | kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { | ||
326 | /* The guest's PTE is not mapped yet. Map on the host */ | ||
327 | kvmppc_mmu_map_page(vcpu, &pte); | ||
328 | if (data) | ||
329 | vcpu->stat.sp_storage++; | ||
330 | else if (vcpu->arch.mmu.is_dcbz32(vcpu) && | ||
331 | (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) | ||
332 | kvmppc_patch_dcbz(vcpu, &pte); | ||
333 | } else { | ||
334 | /* MMIO */ | ||
335 | vcpu->stat.mmio_exits++; | ||
336 | vcpu->arch.paddr_accessed = pte.raddr; | ||
337 | r = kvmppc_emulate_mmio(run, vcpu); | ||
338 | if ( r == RESUME_HOST_NV ) | ||
339 | r = RESUME_HOST; | ||
340 | } | ||
341 | |||
342 | return r; | ||
343 | } | ||
344 | |||
345 | static inline int get_fpr_index(int i) | ||
346 | { | ||
347 | #ifdef CONFIG_VSX | ||
348 | i *= 2; | ||
349 | #endif | ||
350 | return i; | ||
351 | } | ||
352 | |||
353 | /* Give up external provider (FPU, Altivec, VSX) */ | ||
354 | void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) | ||
355 | { | ||
356 | struct thread_struct *t = ¤t->thread; | ||
357 | u64 *vcpu_fpr = vcpu->arch.fpr; | ||
358 | #ifdef CONFIG_VSX | ||
359 | u64 *vcpu_vsx = vcpu->arch.vsr; | ||
360 | #endif | ||
361 | u64 *thread_fpr = (u64*)t->fpr; | ||
362 | int i; | ||
363 | |||
364 | if (!(vcpu->arch.guest_owned_ext & msr)) | ||
365 | return; | ||
366 | |||
367 | #ifdef DEBUG_EXT | ||
368 | printk(KERN_INFO "Giving up ext 0x%lx\n", msr); | ||
369 | #endif | ||
370 | |||
371 | switch (msr) { | ||
372 | case MSR_FP: | ||
373 | giveup_fpu(current); | ||
374 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) | ||
375 | vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; | ||
376 | |||
377 | vcpu->arch.fpscr = t->fpscr.val; | ||
378 | break; | ||
379 | case MSR_VEC: | ||
380 | #ifdef CONFIG_ALTIVEC | ||
381 | giveup_altivec(current); | ||
382 | memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); | ||
383 | vcpu->arch.vscr = t->vscr; | ||
384 | #endif | ||
385 | break; | ||
386 | case MSR_VSX: | ||
387 | #ifdef CONFIG_VSX | ||
388 | __giveup_vsx(current); | ||
389 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) | ||
390 | vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; | ||
391 | #endif | ||
392 | break; | ||
393 | default: | ||
394 | BUG(); | ||
395 | } | ||
396 | |||
397 | vcpu->arch.guest_owned_ext &= ~msr; | ||
398 | current->thread.regs->msr &= ~msr; | ||
399 | kvmppc_recalc_shadow_msr(vcpu); | ||
400 | } | ||
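Both kvmppc_giveup_ext() above and kvmppc_handle_ext() below rely on get_fpr_index(): when VSX is configured, each save slot in the thread struct is a 128-bit VSR, so the FP value of register i sits at flat doubleword index 2*i and the VSX second half at 2*i+1. A small self-contained sketch of that indexing, assuming a [32][2] save layout like the one CONFIG_VSX implies:

	#include <stdint.h>
	#include <stdio.h>

	#define HAVE_VSX 1			/* stand-in for CONFIG_VSX in this sketch */

	static int get_fpr_index(int i)
	{
	#if HAVE_VSX
		i *= 2;				/* each slot is a 128-bit VSR: two doublewords */
	#endif
		return i;
	}

	int main(void)
	{
		uint64_t thread_fpr[32][2];	/* save area with VSX: 32 x 2 doublewords */
		uint64_t *flat = (uint64_t *)thread_fpr;
		uint64_t vcpu_fpr[32], vcpu_vsx[32];
		int i;

		for (i = 0; i < 32; i++) {
			thread_fpr[i][0] = 0x1000 + i;	/* FP doubleword of register i */
			thread_fpr[i][1] = 0x2000 + i;	/* VSX second doubleword */
		}

		/* same indexing the giveup/handle paths use */
		for (i = 0; i < 32; i++) {
			vcpu_fpr[i] = flat[get_fpr_index(i)];
			vcpu_vsx[i] = flat[get_fpr_index(i) + 1];
		}

		printf("fpr[5]=0x%llx vsx[5]=0x%llx\n",
		       (unsigned long long)vcpu_fpr[5],
		       (unsigned long long)vcpu_vsx[5]);
		return 0;
	}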
401 | |||
402 | static int kvmppc_read_inst(struct kvm_vcpu *vcpu) | ||
403 | { | ||
404 | ulong srr0 = kvmppc_get_pc(vcpu); | ||
405 | u32 last_inst = kvmppc_get_last_inst(vcpu); | ||
406 | int ret; | ||
407 | |||
408 | ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); | ||
409 | if (ret == -ENOENT) { | ||
410 | ulong msr = vcpu->arch.shared->msr; | ||
411 | |||
412 | msr = kvmppc_set_field(msr, 33, 33, 1); | ||
413 | msr = kvmppc_set_field(msr, 34, 36, 0); | ||
414 | vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0); | ||
415 | kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); | ||
416 | return EMULATE_AGAIN; | ||
417 | } | ||
418 | |||
419 | return EMULATE_DONE; | ||
420 | } | ||
421 | |||
422 | static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr) | ||
423 | { | ||
424 | |||
425 | /* Need to do paired single emulation? */ | ||
426 | if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)) | ||
427 | return EMULATE_DONE; | ||
428 | |||
429 | /* Read out the instruction */ | ||
430 | if (kvmppc_read_inst(vcpu) == EMULATE_DONE) | ||
431 | /* Need to emulate */ | ||
432 | return EMULATE_FAIL; | ||
433 | |||
434 | return EMULATE_AGAIN; | ||
435 | } | ||
436 | |||
437 | /* Handle external providers (FPU, Altivec, VSX) */ | ||
438 | static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, | ||
439 | ulong msr) | ||
440 | { | ||
441 | struct thread_struct *t = ¤t->thread; | ||
442 | u64 *vcpu_fpr = vcpu->arch.fpr; | ||
443 | #ifdef CONFIG_VSX | ||
444 | u64 *vcpu_vsx = vcpu->arch.vsr; | ||
445 | #endif | ||
446 | u64 *thread_fpr = (u64*)t->fpr; | ||
447 | int i; | ||
448 | |||
449 | /* When we have paired singles, we emulate in software */ | ||
450 | if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) | ||
451 | return RESUME_GUEST; | ||
452 | |||
453 | if (!(vcpu->arch.shared->msr & msr)) { | ||
454 | kvmppc_book3s_queue_irqprio(vcpu, exit_nr); | ||
455 | return RESUME_GUEST; | ||
456 | } | ||
457 | |||
458 | /* We already own the ext */ | ||
459 | if (vcpu->arch.guest_owned_ext & msr) { | ||
460 | return RESUME_GUEST; | ||
461 | } | ||
462 | |||
463 | #ifdef DEBUG_EXT | ||
464 | printk(KERN_INFO "Loading up ext 0x%lx\n", msr); | ||
465 | #endif | ||
466 | |||
467 | current->thread.regs->msr |= msr; | ||
468 | |||
469 | switch (msr) { | ||
470 | case MSR_FP: | ||
471 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) | ||
472 | thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; | ||
473 | |||
474 | t->fpscr.val = vcpu->arch.fpscr; | ||
475 | t->fpexc_mode = 0; | ||
476 | kvmppc_load_up_fpu(); | ||
477 | break; | ||
478 | case MSR_VEC: | ||
479 | #ifdef CONFIG_ALTIVEC | ||
480 | memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); | ||
481 | t->vscr = vcpu->arch.vscr; | ||
482 | t->vrsave = -1; | ||
483 | kvmppc_load_up_altivec(); | ||
484 | #endif | ||
485 | break; | ||
486 | case MSR_VSX: | ||
487 | #ifdef CONFIG_VSX | ||
488 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) | ||
489 | thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; | ||
490 | kvmppc_load_up_vsx(); | ||
491 | #endif | ||
492 | break; | ||
493 | default: | ||
494 | BUG(); | ||
495 | } | ||
496 | |||
497 | vcpu->arch.guest_owned_ext |= msr; | ||
498 | |||
499 | kvmppc_recalc_shadow_msr(vcpu); | ||
500 | |||
501 | return RESUME_GUEST; | ||
502 | } | ||
503 | |||
504 | int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, | ||
505 | unsigned int exit_nr) | ||
506 | { | ||
507 | int r = RESUME_HOST; | ||
508 | |||
509 | vcpu->stat.sum_exits++; | ||
510 | |||
511 | run->exit_reason = KVM_EXIT_UNKNOWN; | ||
512 | run->ready_for_interrupt_injection = 1; | ||
513 | |||
514 | trace_kvm_book3s_exit(exit_nr, vcpu); | ||
515 | kvm_resched(vcpu); | ||
516 | switch (exit_nr) { | ||
517 | case BOOK3S_INTERRUPT_INST_STORAGE: | ||
518 | vcpu->stat.pf_instruc++; | ||
519 | |||
520 | #ifdef CONFIG_PPC_BOOK3S_32 | ||
521 | /* We mark segments as unused when invalidating them, so treat | ||
522 | * the respective fault as a segment fault. */ | ||
523 | if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] | ||
524 | == SR_INVALID) { | ||
525 | kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); | ||
526 | r = RESUME_GUEST; | ||
527 | break; | ||
528 | } | ||
529 | #endif | ||
530 | |||
531 | /* only care about PTEG not found errors, but leave NX alone */ | ||
532 | if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) { | ||
533 | r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); | ||
534 | vcpu->stat.sp_instruc++; | ||
535 | } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && | ||
536 | (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { | ||
537 | /* | ||
538 | * XXX If we do the dcbz hack we use the NX bit to flush&patch the page, | ||
539 | * so we can't use the NX bit inside the guest. Let's cross our fingers | ||
540 | * that no guest that needs the dcbz hack relies on NX. | ||
541 | */ | ||
542 | kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); | ||
543 | r = RESUME_GUEST; | ||
544 | } else { | ||
545 | vcpu->arch.shared->msr |= | ||
546 | to_svcpu(vcpu)->shadow_srr1 & 0x58000000; | ||
547 | kvmppc_book3s_queue_irqprio(vcpu, exit_nr); | ||
548 | r = RESUME_GUEST; | ||
549 | } | ||
550 | break; | ||
551 | case BOOK3S_INTERRUPT_DATA_STORAGE: | ||
552 | { | ||
553 | ulong dar = kvmppc_get_fault_dar(vcpu); | ||
554 | vcpu->stat.pf_storage++; | ||
555 | |||
556 | #ifdef CONFIG_PPC_BOOK3S_32 | ||
557 | /* We mark segments as unused when invalidating them, so treat | ||
558 | * the respective fault as a segment fault. */ | ||
559 | if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) { | ||
560 | kvmppc_mmu_map_segment(vcpu, dar); | ||
561 | r = RESUME_GUEST; | ||
562 | break; | ||
563 | } | ||
564 | #endif | ||
565 | |||
566 | /* The only case we need to handle is missing shadow PTEs */ | ||
567 | if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) { | ||
568 | r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); | ||
569 | } else { | ||
570 | vcpu->arch.shared->dar = dar; | ||
571 | vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; | ||
572 | kvmppc_book3s_queue_irqprio(vcpu, exit_nr); | ||
573 | r = RESUME_GUEST; | ||
574 | } | ||
575 | break; | ||
576 | } | ||
577 | case BOOK3S_INTERRUPT_DATA_SEGMENT: | ||
578 | if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) { | ||
579 | vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); | ||
580 | kvmppc_book3s_queue_irqprio(vcpu, | ||
581 | BOOK3S_INTERRUPT_DATA_SEGMENT); | ||
582 | } | ||
583 | r = RESUME_GUEST; | ||
584 | break; | ||
585 | case BOOK3S_INTERRUPT_INST_SEGMENT: | ||
586 | if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) { | ||
587 | kvmppc_book3s_queue_irqprio(vcpu, | ||
588 | BOOK3S_INTERRUPT_INST_SEGMENT); | ||
589 | } | ||
590 | r = RESUME_GUEST; | ||
591 | break; | ||
592 | /* We're good on these - the host merely wanted to get our attention */ | ||
593 | case BOOK3S_INTERRUPT_DECREMENTER: | ||
594 | vcpu->stat.dec_exits++; | ||
595 | r = RESUME_GUEST; | ||
596 | break; | ||
597 | case BOOK3S_INTERRUPT_EXTERNAL: | ||
598 | vcpu->stat.ext_intr_exits++; | ||
599 | r = RESUME_GUEST; | ||
600 | break; | ||
601 | case BOOK3S_INTERRUPT_PERFMON: | ||
602 | r = RESUME_GUEST; | ||
603 | break; | ||
604 | case BOOK3S_INTERRUPT_PROGRAM: | ||
605 | { | ||
606 | enum emulation_result er; | ||
607 | ulong flags; | ||
608 | |||
609 | program_interrupt: | ||
610 | flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull; | ||
611 | |||
612 | if (vcpu->arch.shared->msr & MSR_PR) { | ||
613 | #ifdef EXIT_DEBUG | ||
614 | printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); | ||
615 | #endif | ||
616 | if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) != | ||
617 | (INS_DCBZ & 0xfffffff7)) { | ||
618 | kvmppc_core_queue_program(vcpu, flags); | ||
619 | r = RESUME_GUEST; | ||
620 | break; | ||
621 | } | ||
622 | } | ||
623 | |||
624 | vcpu->stat.emulated_inst_exits++; | ||
625 | er = kvmppc_emulate_instruction(run, vcpu); | ||
626 | switch (er) { | ||
627 | case EMULATE_DONE: | ||
628 | r = RESUME_GUEST_NV; | ||
629 | break; | ||
630 | case EMULATE_AGAIN: | ||
631 | r = RESUME_GUEST; | ||
632 | break; | ||
633 | case EMULATE_FAIL: | ||
634 | printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", | ||
635 | __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); | ||
636 | kvmppc_core_queue_program(vcpu, flags); | ||
637 | r = RESUME_GUEST; | ||
638 | break; | ||
639 | case EMULATE_DO_MMIO: | ||
640 | run->exit_reason = KVM_EXIT_MMIO; | ||
641 | r = RESUME_HOST_NV; | ||
642 | break; | ||
643 | default: | ||
644 | BUG(); | ||
645 | } | ||
646 | break; | ||
647 | } | ||
648 | case BOOK3S_INTERRUPT_SYSCALL: | ||
649 | if (vcpu->arch.osi_enabled && | ||
650 | (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) && | ||
651 | (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) { | ||
652 | /* MOL hypercalls */ | ||
653 | u64 *gprs = run->osi.gprs; | ||
654 | int i; | ||
655 | |||
656 | run->exit_reason = KVM_EXIT_OSI; | ||
657 | for (i = 0; i < 32; i++) | ||
658 | gprs[i] = kvmppc_get_gpr(vcpu, i); | ||
659 | vcpu->arch.osi_needed = 1; | ||
660 | r = RESUME_HOST_NV; | ||
661 | } else if (!(vcpu->arch.shared->msr & MSR_PR) && | ||
662 | (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { | ||
663 | /* KVM PV hypercalls */ | ||
664 | kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); | ||
665 | r = RESUME_GUEST; | ||
666 | } else { | ||
667 | /* Guest syscalls */ | ||
668 | vcpu->stat.syscall_exits++; | ||
669 | kvmppc_book3s_queue_irqprio(vcpu, exit_nr); | ||
670 | r = RESUME_GUEST; | ||
671 | } | ||
672 | break; | ||
673 | case BOOK3S_INTERRUPT_FP_UNAVAIL: | ||
674 | case BOOK3S_INTERRUPT_ALTIVEC: | ||
675 | case BOOK3S_INTERRUPT_VSX: | ||
676 | { | ||
677 | int ext_msr = 0; | ||
678 | |||
679 | switch (exit_nr) { | ||
680 | case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP; break; | ||
681 | case BOOK3S_INTERRUPT_ALTIVEC: ext_msr = MSR_VEC; break; | ||
682 | case BOOK3S_INTERRUPT_VSX: ext_msr = MSR_VSX; break; | ||
683 | } | ||
684 | |||
685 | switch (kvmppc_check_ext(vcpu, exit_nr)) { | ||
686 | case EMULATE_DONE: | ||
687 | /* everything ok - let's enable the ext */ | ||
688 | r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr); | ||
689 | break; | ||
690 | case EMULATE_FAIL: | ||
691 | /* we need to emulate this instruction */ | ||
692 | goto program_interrupt; | ||
693 | break; | ||
694 | default: | ||
695 | /* nothing to worry about - go again */ | ||
696 | break; | ||
697 | } | ||
698 | break; | ||
699 | } | ||
700 | case BOOK3S_INTERRUPT_ALIGNMENT: | ||
701 | if (kvmppc_read_inst(vcpu) == EMULATE_DONE) { | ||
702 | vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu, | ||
703 | kvmppc_get_last_inst(vcpu)); | ||
704 | vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu, | ||
705 | kvmppc_get_last_inst(vcpu)); | ||
706 | kvmppc_book3s_queue_irqprio(vcpu, exit_nr); | ||
707 | } | ||
708 | r = RESUME_GUEST; | ||
709 | break; | ||
710 | case BOOK3S_INTERRUPT_MACHINE_CHECK: | ||
711 | case BOOK3S_INTERRUPT_TRACE: | ||
712 | kvmppc_book3s_queue_irqprio(vcpu, exit_nr); | ||
713 | r = RESUME_GUEST; | ||
714 | break; | ||
715 | default: | ||
716 | /* Ugh - bork here! What did we get? */ | ||
717 | printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", | ||
718 | exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1); | ||
719 | r = RESUME_HOST; | ||
720 | BUG(); | ||
721 | break; | ||
722 | } | ||
723 | |||
724 | |||
725 | if (!(r & RESUME_HOST)) { | ||
726 | /* To avoid clobbering exit_reason, only check for signals if | ||
727 | * we aren't already exiting to userspace for some other | ||
728 | * reason. */ | ||
729 | if (signal_pending(current)) { | ||
730 | #ifdef EXIT_DEBUG | ||
731 | printk(KERN_EMERG "KVM: Going back to host\n"); | ||
732 | #endif | ||
733 | vcpu->stat.signal_exits++; | ||
734 | run->exit_reason = KVM_EXIT_INTR; | ||
735 | r = -EINTR; | ||
736 | } else { | ||
737 | /* In case an interrupt came in that was triggered | ||
738 | * from userspace (like DEC), we need to check what | ||
739 | * to inject now! */ | ||
740 | kvmppc_core_deliver_interrupts(vcpu); | ||
741 | } | ||
742 | } | ||
743 | |||
744 | trace_kvm_book3s_reenter(r, vcpu); | ||
745 | |||
746 | return r; | ||
747 | } | ||
748 | |||
749 | int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | ||
750 | struct kvm_sregs *sregs) | ||
751 | { | ||
752 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
753 | int i; | ||
754 | |||
755 | sregs->pvr = vcpu->arch.pvr; | ||
756 | |||
757 | sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1; | ||
758 | if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { | ||
759 | for (i = 0; i < 64; i++) { | ||
760 | sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i; | ||
761 | sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; | ||
762 | } | ||
763 | } else { | ||
764 | for (i = 0; i < 16; i++) | ||
765 | sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i]; | ||
766 | |||
767 | for (i = 0; i < 8; i++) { | ||
768 | sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw; | ||
769 | sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw; | ||
770 | } | ||
771 | } | ||
772 | |||
773 | return 0; | ||
774 | } | ||
775 | |||
776 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | ||
777 | struct kvm_sregs *sregs) | ||
778 | { | ||
779 | struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); | ||
780 | int i; | ||
781 | |||
782 | kvmppc_set_pvr(vcpu, sregs->pvr); | ||
783 | |||
784 | vcpu3s->sdr1 = sregs->u.s.sdr1; | ||
785 | if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { | ||
786 | for (i = 0; i < 64; i++) { | ||
787 | vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv, | ||
788 | sregs->u.s.ppc64.slb[i].slbe); | ||
789 | } | ||
790 | } else { | ||
791 | for (i = 0; i < 16; i++) { | ||
792 | vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]); | ||
793 | } | ||
794 | for (i = 0; i < 8; i++) { | ||
795 | kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false, | ||
796 | (u32)sregs->u.s.ppc32.ibat[i]); | ||
797 | kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true, | ||
798 | (u32)(sregs->u.s.ppc32.ibat[i] >> 32)); | ||
799 | kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false, | ||
800 | (u32)sregs->u.s.ppc32.dbat[i]); | ||
801 | kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true, | ||
802 | (u32)(sregs->u.s.ppc32.dbat[i] >> 32)); | ||
803 | } | ||
804 | } | ||
805 | |||
806 | /* Flush the MMU after messing with the segments */ | ||
807 | kvmppc_mmu_pte_flush(vcpu, 0, 0); | ||
808 | |||
809 | return 0; | ||
810 | } | ||
811 | |||
812 | int kvmppc_core_check_processor_compat(void) | ||
813 | { | ||
814 | return 0; | ||
815 | } | ||
816 | |||
817 | struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) | ||
818 | { | ||
819 | struct kvmppc_vcpu_book3s *vcpu_book3s; | ||
820 | struct kvm_vcpu *vcpu; | ||
821 | int err = -ENOMEM; | ||
822 | unsigned long p; | ||
823 | |||
824 | vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); | ||
825 | if (!vcpu_book3s) | ||
826 | goto out; | ||
827 | |||
828 | vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *) | ||
829 | kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL); | ||
830 | if (!vcpu_book3s->shadow_vcpu) | ||
831 | goto free_vcpu; | ||
832 | |||
833 | vcpu = &vcpu_book3s->vcpu; | ||
834 | err = kvm_vcpu_init(vcpu, kvm, id); | ||
835 | if (err) | ||
836 | goto free_shadow_vcpu; | ||
837 | |||
838 | p = __get_free_page(GFP_KERNEL|__GFP_ZERO); | ||
839 | /* the real shared page fills the last 4k of our page */ | ||
840 | vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096); | ||
841 | if (!p) | ||
842 | goto uninit_vcpu; | ||
843 | |||
844 | vcpu->arch.host_retip = kvm_return_point; | ||
845 | vcpu->arch.host_msr = mfmsr(); | ||
846 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
847 | /* default to book3s_64 (970fx) */ | ||
848 | vcpu->arch.pvr = 0x3C0301; | ||
849 | #else | ||
850 | /* default to book3s_32 (750) */ | ||
851 | vcpu->arch.pvr = 0x84202; | ||
852 | #endif | ||
853 | kvmppc_set_pvr(vcpu, vcpu->arch.pvr); | ||
854 | vcpu->arch.slb_nr = 64; | ||
855 | |||
856 | /* remember where some real-mode handlers are */ | ||
857 | vcpu->arch.trampoline_lowmem = __pa(kvmppc_handler_lowmem_trampoline); | ||
858 | vcpu->arch.trampoline_enter = __pa(kvmppc_handler_trampoline_enter); | ||
859 | vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem; | ||
860 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
861 | vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall; | ||
862 | #else | ||
863 | vcpu->arch.rmcall = (ulong)kvmppc_rmcall; | ||
864 | #endif | ||
865 | |||
866 | vcpu->arch.shadow_msr = MSR_USER64; | ||
867 | |||
868 | err = kvmppc_mmu_init(vcpu); | ||
869 | if (err < 0) | ||
870 | goto uninit_vcpu; | ||
871 | |||
872 | return vcpu; | ||
873 | |||
874 | uninit_vcpu: | ||
875 | kvm_vcpu_uninit(vcpu); | ||
876 | free_shadow_vcpu: | ||
877 | kfree(vcpu_book3s->shadow_vcpu); | ||
878 | free_vcpu: | ||
879 | vfree(vcpu_book3s); | ||
880 | out: | ||
881 | return ERR_PTR(err); | ||
882 | } | ||
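The allocation in kvmppc_core_vcpu_create() above places the magic shared area in the final 4 KiB of the page returned by __get_free_page(), so on a 4 KiB-page kernel the shared area is simply the whole page, while on a 64 KiB-page kernel it is the last 4 KiB. A tiny sketch of that offset arithmetic with sample page sizes (the sizes are just example configurations):

	#include <stdio.h>

	/* offset of the 4 KiB shared area inside one allocated page */
	static unsigned long shared_offset(unsigned long page_size)
	{
		return page_size - 4096;
	}

	int main(void)
	{
		unsigned long sizes[] = { 4096, 65536 };	/* 4 KiB and 64 KiB pages */
		int i;

		for (i = 0; i < 2; i++)
			printf("PAGE_SIZE=%lu -> shared area starts at offset %lu\n",
			       sizes[i], shared_offset(sizes[i]));
		return 0;
	}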
883 | |||
884 | void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) | ||
885 | { | ||
886 | struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); | ||
887 | |||
888 | free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); | ||
889 | kvm_vcpu_uninit(vcpu); | ||
890 | kfree(vcpu_book3s->shadow_vcpu); | ||
891 | vfree(vcpu_book3s); | ||
892 | } | ||
893 | |||
894 | int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | ||
895 | { | ||
896 | int ret; | ||
897 | double fpr[32][TS_FPRWIDTH]; | ||
898 | unsigned int fpscr; | ||
899 | int fpexc_mode; | ||
900 | #ifdef CONFIG_ALTIVEC | ||
901 | vector128 vr[32]; | ||
902 | vector128 vscr; | ||
903 | unsigned long uninitialized_var(vrsave); | ||
904 | int used_vr; | ||
905 | #endif | ||
906 | #ifdef CONFIG_VSX | ||
907 | int used_vsr; | ||
908 | #endif | ||
909 | ulong ext_msr; | ||
910 | |||
911 | /* No need to go into the guest when all we would do is go right back out */ | ||
912 | if (signal_pending(current)) { | ||
913 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
914 | return -EINTR; | ||
915 | } | ||
916 | |||
917 | /* Save FPU state in stack */ | ||
918 | if (current->thread.regs->msr & MSR_FP) | ||
919 | giveup_fpu(current); | ||
920 | memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr)); | ||
921 | fpscr = current->thread.fpscr.val; | ||
922 | fpexc_mode = current->thread.fpexc_mode; | ||
923 | |||
924 | #ifdef CONFIG_ALTIVEC | ||
925 | /* Save Altivec state in stack */ | ||
926 | used_vr = current->thread.used_vr; | ||
927 | if (used_vr) { | ||
928 | if (current->thread.regs->msr & MSR_VEC) | ||
929 | giveup_altivec(current); | ||
930 | memcpy(vr, current->thread.vr, sizeof(current->thread.vr)); | ||
931 | vscr = current->thread.vscr; | ||
932 | vrsave = current->thread.vrsave; | ||
933 | } | ||
934 | #endif | ||
935 | |||
936 | #ifdef CONFIG_VSX | ||
937 | /* Save VSX state in stack */ | ||
938 | used_vsr = current->thread.used_vsr; | ||
939 | if (used_vsr && (current->thread.regs->msr & MSR_VSX)) | ||
940 | __giveup_vsx(current); | ||
941 | #endif | ||
942 | |||
943 | /* Remember the MSR with disabled extensions */ | ||
944 | ext_msr = current->thread.regs->msr; | ||
945 | |||
946 | /* Preload FPU if it's enabled */ | ||
947 | if (vcpu->arch.shared->msr & MSR_FP) | ||
948 | kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); | ||
949 | |||
950 | kvm_guest_enter(); | ||
951 | |||
952 | ret = __kvmppc_vcpu_run(kvm_run, vcpu); | ||
953 | |||
954 | kvm_guest_exit(); | ||
955 | |||
956 | local_irq_disable(); | ||
957 | |||
958 | current->thread.regs->msr = ext_msr; | ||
959 | |||
960 | /* Make sure we save the guest FPU/Altivec/VSX state */ | ||
961 | kvmppc_giveup_ext(vcpu, MSR_FP); | ||
962 | kvmppc_giveup_ext(vcpu, MSR_VEC); | ||
963 | kvmppc_giveup_ext(vcpu, MSR_VSX); | ||
964 | |||
965 | /* Restore FPU state from stack */ | ||
966 | memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); | ||
967 | current->thread.fpscr.val = fpscr; | ||
968 | current->thread.fpexc_mode = fpexc_mode; | ||
969 | |||
970 | #ifdef CONFIG_ALTIVEC | ||
971 | /* Restore Altivec state from stack */ | ||
972 | if (used_vr && current->thread.used_vr) { | ||
973 | memcpy(current->thread.vr, vr, sizeof(current->thread.vr)); | ||
974 | current->thread.vscr = vscr; | ||
975 | current->thread.vrsave = vrsave; | ||
976 | } | ||
977 | current->thread.used_vr = used_vr; | ||
978 | #endif | ||
979 | |||
980 | #ifdef CONFIG_VSX | ||
981 | current->thread.used_vsr = used_vsr; | ||
982 | #endif | ||
983 | |||
984 | return ret; | ||
985 | } | ||
986 | |||
987 | int kvmppc_core_prepare_memory_region(struct kvm *kvm, | ||
988 | struct kvm_userspace_memory_region *mem) | ||
989 | { | ||
990 | return 0; | ||
991 | } | ||
992 | |||
993 | void kvmppc_core_commit_memory_region(struct kvm *kvm, | ||
994 | struct kvm_userspace_memory_region *mem) | ||
995 | { | ||
996 | } | ||
997 | |||
998 | int kvmppc_core_init_vm(struct kvm *kvm) | ||
999 | { | ||
1000 | return 0; | ||
1001 | } | ||
1002 | |||
1003 | void kvmppc_core_destroy_vm(struct kvm *kvm) | ||
1004 | { | ||
1005 | } | ||
1006 | |||
1007 | static int kvmppc_book3s_init(void) | ||
1008 | { | ||
1009 | int r; | ||
1010 | |||
1011 | r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0, | ||
1012 | THIS_MODULE); | ||
1013 | |||
1014 | if (r) | ||
1015 | return r; | ||
1016 | |||
1017 | r = kvmppc_mmu_hpte_sysinit(); | ||
1018 | |||
1019 | return r; | ||
1020 | } | ||
1021 | |||
1022 | static void kvmppc_book3s_exit(void) | ||
1023 | { | ||
1024 | kvmppc_mmu_hpte_sysexit(); | ||
1025 | kvm_exit(); | ||
1026 | } | ||
1027 | |||
1028 | module_init(kvmppc_book3s_init); | ||
1029 | module_exit(kvmppc_book3s_exit); | ||
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 1a1b34487e71..c1f877c4a884 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S | |||
@@ -36,41 +36,44 @@ | |||
36 | #if defined(CONFIG_PPC_BOOK3S_64) | 36 | #if defined(CONFIG_PPC_BOOK3S_64) |
37 | 37 | ||
38 | #define LOAD_SHADOW_VCPU(reg) GET_PACA(reg) | 38 | #define LOAD_SHADOW_VCPU(reg) GET_PACA(reg) |
39 | #define SHADOW_VCPU_OFF PACA_KVM_SVCPU | ||
40 | #define MSR_NOIRQ MSR_KERNEL & ~(MSR_IR | MSR_DR) | 39 | #define MSR_NOIRQ MSR_KERNEL & ~(MSR_IR | MSR_DR) |
41 | #define FUNC(name) GLUE(.,name) | 40 | #define FUNC(name) GLUE(.,name) |
42 | 41 | ||
42 | kvmppc_skip_interrupt: | ||
43 | /* | ||
44 | * Here all GPRs are unchanged from when the interrupt happened | ||
45 | * except for r13, which is saved in SPRG_SCRATCH0. | ||
46 | */ | ||
47 | mfspr r13, SPRN_SRR0 | ||
48 | addi r13, r13, 4 | ||
49 | mtspr SPRN_SRR0, r13 | ||
50 | GET_SCRATCH0(r13) | ||
51 | rfid | ||
52 | b . | ||
53 | |||
54 | kvmppc_skip_Hinterrupt: | ||
55 | /* | ||
56 | * Here all GPRs are unchanged from when the interrupt happened | ||
57 | * except for r13, which is saved in SPRG_SCRATCH0. | ||
58 | */ | ||
59 | mfspr r13, SPRN_HSRR0 | ||
60 | addi r13, r13, 4 | ||
61 | mtspr SPRN_HSRR0, r13 | ||
62 | GET_SCRATCH0(r13) | ||
63 | hrfid | ||
64 | b . | ||
65 | |||
43 | #elif defined(CONFIG_PPC_BOOK3S_32) | 66 | #elif defined(CONFIG_PPC_BOOK3S_32) |
44 | 67 | ||
45 | #define LOAD_SHADOW_VCPU(reg) \ | ||
46 | mfspr reg, SPRN_SPRG_THREAD; \ | ||
47 | lwz reg, THREAD_KVM_SVCPU(reg); \ | ||
48 | /* PPC32 can have a NULL pointer - let's check for that */ \ | ||
49 | mtspr SPRN_SPRG_SCRATCH1, r12; /* Save r12 */ \ | ||
50 | mfcr r12; \ | ||
51 | cmpwi reg, 0; \ | ||
52 | bne 1f; \ | ||
53 | mfspr reg, SPRN_SPRG_SCRATCH0; \ | ||
54 | mtcr r12; \ | ||
55 | mfspr r12, SPRN_SPRG_SCRATCH1; \ | ||
56 | b kvmppc_resume_\intno; \ | ||
57 | 1:; \ | ||
58 | mtcr r12; \ | ||
59 | mfspr r12, SPRN_SPRG_SCRATCH1; \ | ||
60 | tophys(reg, reg) | ||
61 | |||
62 | #define SHADOW_VCPU_OFF 0 | ||
63 | #define MSR_NOIRQ MSR_KERNEL | 68 | #define MSR_NOIRQ MSR_KERNEL |
64 | #define FUNC(name) name | 69 | #define FUNC(name) name |
65 | 70 | ||
66 | #endif | ||
67 | |||
68 | .macro INTERRUPT_TRAMPOLINE intno | 71 | .macro INTERRUPT_TRAMPOLINE intno |
69 | 72 | ||
70 | .global kvmppc_trampoline_\intno | 73 | .global kvmppc_trampoline_\intno |
71 | kvmppc_trampoline_\intno: | 74 | kvmppc_trampoline_\intno: |
72 | 75 | ||
73 | SET_SCRATCH0(r13) /* Save r13 */ | 76 | mtspr SPRN_SPRG_SCRATCH0, r13 /* Save r13 */ |
74 | 77 | ||
75 | /* | 78 | /* |
76 | * First thing to do is to find out if we're coming | 79 | * First thing to do is to find out if we're coming |
@@ -78,19 +81,28 @@ kvmppc_trampoline_\intno: | |||
78 | * | 81 | * |
79 | * To distinguish, we check a magic byte in the PACA/current | 82 | * To distinguish, we check a magic byte in the PACA/current |
80 | */ | 83 | */ |
81 | LOAD_SHADOW_VCPU(r13) | 84 | mfspr r13, SPRN_SPRG_THREAD |
82 | PPC_STL r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) | 85 | lwz r13, THREAD_KVM_SVCPU(r13) |
86 | /* PPC32 can have a NULL pointer - let's check for that */ | ||
87 | mtspr SPRN_SPRG_SCRATCH1, r12 /* Save r12 */ | ||
83 | mfcr r12 | 88 | mfcr r12 |
84 | stw r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) | 89 | cmpwi r13, 0 |
85 | lbz r12, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13) | 90 | bne 1f |
91 | 2: mtcr r12 | ||
92 | mfspr r12, SPRN_SPRG_SCRATCH1 | ||
93 | mfspr r13, SPRN_SPRG_SCRATCH0 /* r13 = original r13 */ | ||
94 | b kvmppc_resume_\intno /* Get back original handler */ | ||
95 | |||
96 | 1: tophys(r13, r13) | ||
97 | stw r12, HSTATE_SCRATCH1(r13) | ||
98 | mfspr r12, SPRN_SPRG_SCRATCH1 | ||
99 | stw r12, HSTATE_SCRATCH0(r13) | ||
100 | lbz r12, HSTATE_IN_GUEST(r13) | ||
86 | cmpwi r12, KVM_GUEST_MODE_NONE | 101 | cmpwi r12, KVM_GUEST_MODE_NONE |
87 | bne ..kvmppc_handler_hasmagic_\intno | 102 | bne ..kvmppc_handler_hasmagic_\intno |
88 | /* No KVM guest? Then jump back to the Linux handler! */ | 103 | /* No KVM guest? Then jump back to the Linux handler! */ |
89 | lwz r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) | 104 | lwz r12, HSTATE_SCRATCH1(r13) |
90 | mtcr r12 | 105 | b 2b |
91 | PPC_LL r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) | ||
92 | GET_SCRATCH0(r13) /* r13 = original r13 */ | ||
93 | b kvmppc_resume_\intno /* Get back original handler */ | ||
94 | 106 | ||
95 | /* Now we know we're handling a KVM guest */ | 107 | /* Now we know we're handling a KVM guest */ |
96 | ..kvmppc_handler_hasmagic_\intno: | 108 | ..kvmppc_handler_hasmagic_\intno: |
@@ -112,9 +124,6 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_MACHINE_CHECK | |||
112 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_STORAGE | 124 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_STORAGE |
113 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_STORAGE | 125 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_STORAGE |
114 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_EXTERNAL | 126 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_EXTERNAL |
115 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
116 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_EXTERNAL_HV | ||
117 | #endif | ||
118 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALIGNMENT | 127 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALIGNMENT |
119 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PROGRAM | 128 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PROGRAM |
120 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_FP_UNAVAIL | 129 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_FP_UNAVAIL |
@@ -124,14 +133,6 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_TRACE | |||
124 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PERFMON | 133 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PERFMON |
125 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALTIVEC | 134 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALTIVEC |
126 | 135 | ||
127 | /* Those are only available on 64 bit machines */ | ||
128 | |||
129 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
130 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_SEGMENT | ||
131 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_SEGMENT | ||
132 | INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX | ||
133 | #endif | ||
134 | |||
135 | /* | 136 | /* |
136 | * Bring us back to the faulting code, but skip the | 137 | * Bring us back to the faulting code, but skip the |
137 | * faulting instruction. | 138 | * faulting instruction. |
@@ -143,8 +144,8 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX | |||
143 | * | 144 | * |
144 | * R12 = free | 145 | * R12 = free |
145 | * R13 = Shadow VCPU (PACA) | 146 | * R13 = Shadow VCPU (PACA) |
146 | * SVCPU.SCRATCH0 = guest R12 | 147 | * HSTATE.SCRATCH0 = guest R12 |
147 | * SVCPU.SCRATCH1 = guest CR | 148 | * HSTATE.SCRATCH1 = guest CR |
148 | * SPRG_SCRATCH0 = guest R13 | 149 | * SPRG_SCRATCH0 = guest R13 |
149 | * | 150 | * |
150 | */ | 151 | */ |
@@ -156,13 +157,14 @@ kvmppc_handler_skip_ins: | |||
156 | mtsrr0 r12 | 157 | mtsrr0 r12 |
157 | 158 | ||
158 | /* Clean up all state */ | 159 | /* Clean up all state */ |
159 | lwz r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) | 160 | lwz r12, HSTATE_SCRATCH1(r13) |
160 | mtcr r12 | 161 | mtcr r12 |
161 | PPC_LL r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) | 162 | PPC_LL r12, HSTATE_SCRATCH0(r13) |
162 | GET_SCRATCH0(r13) | 163 | GET_SCRATCH0(r13) |
163 | 164 | ||
164 | /* And get back into the code */ | 165 | /* And get back into the code */ |
165 | RFI | 166 | RFI |
167 | #endif | ||
166 | 168 | ||
167 | /* | 169 | /* |
168 | * This trampoline brings us back to a real mode handler | 170 | * This trampoline brings us back to a real mode handler |
@@ -251,12 +253,4 @@ define_load_up(altivec) | |||
251 | define_load_up(vsx) | 253 | define_load_up(vsx) |
252 | #endif | 254 | #endif |
253 | 255 | ||
254 | .global kvmppc_trampoline_lowmem | ||
255 | kvmppc_trampoline_lowmem: | ||
256 | PPC_LONG kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START | ||
257 | |||
258 | .global kvmppc_trampoline_enter | ||
259 | kvmppc_trampoline_enter: | ||
260 | PPC_LONG kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START | ||
261 | |||
262 | #include "book3s_segment.S" | 256 | #include "book3s_segment.S" |
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 451264274b8c..aed32e517212 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S | |||
@@ -22,7 +22,7 @@ | |||
22 | #if defined(CONFIG_PPC_BOOK3S_64) | 22 | #if defined(CONFIG_PPC_BOOK3S_64) |
23 | 23 | ||
24 | #define GET_SHADOW_VCPU(reg) \ | 24 | #define GET_SHADOW_VCPU(reg) \ |
25 | addi reg, r13, PACA_KVM_SVCPU | 25 | mr reg, r13 |
26 | 26 | ||
27 | #elif defined(CONFIG_PPC_BOOK3S_32) | 27 | #elif defined(CONFIG_PPC_BOOK3S_32) |
28 | 28 | ||
@@ -71,6 +71,10 @@ kvmppc_handler_trampoline_enter: | |||
71 | /* r3 = shadow vcpu */ | 71 | /* r3 = shadow vcpu */ |
72 | GET_SHADOW_VCPU(r3) | 72 | GET_SHADOW_VCPU(r3) |
73 | 73 | ||
74 | /* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */ | ||
75 | PPC_STL r1, HSTATE_HOST_R1(r3) | ||
76 | PPC_STL r2, HSTATE_HOST_R2(r3) | ||
77 | |||
74 | /* Move SRR0 and SRR1 into the respective regs */ | 78 | /* Move SRR0 and SRR1 into the respective regs */ |
75 | PPC_LL r9, SVCPU_PC(r3) | 79 | PPC_LL r9, SVCPU_PC(r3) |
76 | mtsrr0 r9 | 80 | mtsrr0 r9 |
@@ -78,36 +82,36 @@ kvmppc_handler_trampoline_enter: | |||
78 | 82 | ||
79 | /* Activate guest mode, so faults get handled by KVM */ | 83 | /* Activate guest mode, so faults get handled by KVM */ |
80 | li r11, KVM_GUEST_MODE_GUEST | 84 | li r11, KVM_GUEST_MODE_GUEST |
81 | stb r11, SVCPU_IN_GUEST(r3) | 85 | stb r11, HSTATE_IN_GUEST(r3) |
82 | 86 | ||
83 | /* Switch to guest segment. This is subarch specific. */ | 87 | /* Switch to guest segment. This is subarch specific. */ |
84 | LOAD_GUEST_SEGMENTS | 88 | LOAD_GUEST_SEGMENTS |
85 | 89 | ||
86 | /* Enter guest */ | 90 | /* Enter guest */ |
87 | 91 | ||
88 | PPC_LL r4, (SVCPU_CTR)(r3) | 92 | PPC_LL r4, SVCPU_CTR(r3) |
89 | PPC_LL r5, (SVCPU_LR)(r3) | 93 | PPC_LL r5, SVCPU_LR(r3) |
90 | lwz r6, (SVCPU_CR)(r3) | 94 | lwz r6, SVCPU_CR(r3) |
91 | lwz r7, (SVCPU_XER)(r3) | 95 | lwz r7, SVCPU_XER(r3) |
92 | 96 | ||
93 | mtctr r4 | 97 | mtctr r4 |
94 | mtlr r5 | 98 | mtlr r5 |
95 | mtcr r6 | 99 | mtcr r6 |
96 | mtxer r7 | 100 | mtxer r7 |
97 | 101 | ||
98 | PPC_LL r0, (SVCPU_R0)(r3) | 102 | PPC_LL r0, SVCPU_R0(r3) |
99 | PPC_LL r1, (SVCPU_R1)(r3) | 103 | PPC_LL r1, SVCPU_R1(r3) |
100 | PPC_LL r2, (SVCPU_R2)(r3) | 104 | PPC_LL r2, SVCPU_R2(r3) |
101 | PPC_LL r4, (SVCPU_R4)(r3) | 105 | PPC_LL r4, SVCPU_R4(r3) |
102 | PPC_LL r5, (SVCPU_R5)(r3) | 106 | PPC_LL r5, SVCPU_R5(r3) |
103 | PPC_LL r6, (SVCPU_R6)(r3) | 107 | PPC_LL r6, SVCPU_R6(r3) |
104 | PPC_LL r7, (SVCPU_R7)(r3) | 108 | PPC_LL r7, SVCPU_R7(r3) |
105 | PPC_LL r8, (SVCPU_R8)(r3) | 109 | PPC_LL r8, SVCPU_R8(r3) |
106 | PPC_LL r9, (SVCPU_R9)(r3) | 110 | PPC_LL r9, SVCPU_R9(r3) |
107 | PPC_LL r10, (SVCPU_R10)(r3) | 111 | PPC_LL r10, SVCPU_R10(r3) |
108 | PPC_LL r11, (SVCPU_R11)(r3) | 112 | PPC_LL r11, SVCPU_R11(r3) |
109 | PPC_LL r12, (SVCPU_R12)(r3) | 113 | PPC_LL r12, SVCPU_R12(r3) |
110 | PPC_LL r13, (SVCPU_R13)(r3) | 114 | PPC_LL r13, SVCPU_R13(r3) |
111 | 115 | ||
112 | PPC_LL r3, (SVCPU_R3)(r3) | 116 | PPC_LL r3, (SVCPU_R3)(r3) |
113 | 117 | ||
@@ -125,56 +129,63 @@ kvmppc_handler_trampoline_enter_end: | |||
125 | .global kvmppc_handler_trampoline_exit | 129 | .global kvmppc_handler_trampoline_exit |
126 | kvmppc_handler_trampoline_exit: | 130 | kvmppc_handler_trampoline_exit: |
127 | 131 | ||
132 | .global kvmppc_interrupt | ||
133 | kvmppc_interrupt: | ||
134 | |||
128 | /* Register usage at this point: | 135 | /* Register usage at this point: |
129 | * | 136 | * |
130 | * SPRG_SCRATCH0 = guest R13 | 137 | * SPRG_SCRATCH0 = guest R13 |
131 | * R12 = exit handler id | 138 | * R12 = exit handler id |
132 | * R13 = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64] | 139 | * R13 = shadow vcpu (32-bit) or PACA (64-bit) |
133 | * SVCPU.SCRATCH0 = guest R12 | 140 | * HSTATE.SCRATCH0 = guest R12 |
134 | * SVCPU.SCRATCH1 = guest CR | 141 | * HSTATE.SCRATCH1 = guest CR |
135 | * | 142 | * |
136 | */ | 143 | */ |
137 | 144 | ||
138 | /* Save registers */ | 145 | /* Save registers */ |
139 | 146 | ||
140 | PPC_STL r0, (SHADOW_VCPU_OFF + SVCPU_R0)(r13) | 147 | PPC_STL r0, SVCPU_R0(r13) |
141 | PPC_STL r1, (SHADOW_VCPU_OFF + SVCPU_R1)(r13) | 148 | PPC_STL r1, SVCPU_R1(r13) |
142 | PPC_STL r2, (SHADOW_VCPU_OFF + SVCPU_R2)(r13) | 149 | PPC_STL r2, SVCPU_R2(r13) |
143 | PPC_STL r3, (SHADOW_VCPU_OFF + SVCPU_R3)(r13) | 150 | PPC_STL r3, SVCPU_R3(r13) |
144 | PPC_STL r4, (SHADOW_VCPU_OFF + SVCPU_R4)(r13) | 151 | PPC_STL r4, SVCPU_R4(r13) |
145 | PPC_STL r5, (SHADOW_VCPU_OFF + SVCPU_R5)(r13) | 152 | PPC_STL r5, SVCPU_R5(r13) |
146 | PPC_STL r6, (SHADOW_VCPU_OFF + SVCPU_R6)(r13) | 153 | PPC_STL r6, SVCPU_R6(r13) |
147 | PPC_STL r7, (SHADOW_VCPU_OFF + SVCPU_R7)(r13) | 154 | PPC_STL r7, SVCPU_R7(r13) |
148 | PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_R8)(r13) | 155 | PPC_STL r8, SVCPU_R8(r13) |
149 | PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_R9)(r13) | 156 | PPC_STL r9, SVCPU_R9(r13) |
150 | PPC_STL r10, (SHADOW_VCPU_OFF + SVCPU_R10)(r13) | 157 | PPC_STL r10, SVCPU_R10(r13) |
151 | PPC_STL r11, (SHADOW_VCPU_OFF + SVCPU_R11)(r13) | 158 | PPC_STL r11, SVCPU_R11(r13) |
152 | 159 | ||
153 | /* Restore R1/R2 so we can handle faults */ | 160 | /* Restore R1/R2 so we can handle faults */ |
154 | PPC_LL r1, (SHADOW_VCPU_OFF + SVCPU_HOST_R1)(r13) | 161 | PPC_LL r1, HSTATE_HOST_R1(r13) |
155 | PPC_LL r2, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13) | 162 | PPC_LL r2, HSTATE_HOST_R2(r13) |
156 | 163 | ||
157 | /* Save guest PC and MSR */ | 164 | /* Save guest PC and MSR */ |
165 | #ifdef CONFIG_PPC64 | ||
166 | BEGIN_FTR_SECTION | ||
158 | andi. r0,r12,0x2 | 167 | andi. r0,r12,0x2 |
159 | beq 1f | 168 | beq 1f |
160 | mfspr r3,SPRN_HSRR0 | 169 | mfspr r3,SPRN_HSRR0 |
161 | mfspr r4,SPRN_HSRR1 | 170 | mfspr r4,SPRN_HSRR1 |
162 | andi. r12,r12,0x3ffd | 171 | andi. r12,r12,0x3ffd |
163 | b 2f | 172 | b 2f |
173 | END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) | ||
174 | #endif | ||
164 | 1: mfsrr0 r3 | 175 | 1: mfsrr0 r3 |
165 | mfsrr1 r4 | 176 | mfsrr1 r4 |
166 | 2: | 177 | 2: |
167 | PPC_STL r3, (SHADOW_VCPU_OFF + SVCPU_PC)(r13) | 178 | PPC_STL r3, SVCPU_PC(r13) |
168 | PPC_STL r4, (SHADOW_VCPU_OFF + SVCPU_SHADOW_SRR1)(r13) | 179 | PPC_STL r4, SVCPU_SHADOW_SRR1(r13) |
169 | 180 | ||
170 | /* Get scratch'ed off registers */ | 181 | /* Get scratch'ed off registers */ |
171 | GET_SCRATCH0(r9) | 182 | GET_SCRATCH0(r9) |
172 | PPC_LL r8, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) | 183 | PPC_LL r8, HSTATE_SCRATCH0(r13) |
173 | lwz r7, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) | 184 | lwz r7, HSTATE_SCRATCH1(r13) |
174 | 185 | ||
175 | PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_R13)(r13) | 186 | PPC_STL r9, SVCPU_R13(r13) |
176 | PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_R12)(r13) | 187 | PPC_STL r8, SVCPU_R12(r13) |
177 | stw r7, (SHADOW_VCPU_OFF + SVCPU_CR)(r13) | 188 | stw r7, SVCPU_CR(r13) |
178 | 189 | ||
179 | /* Save more register state */ | 190 | /* Save more register state */ |
180 | 191 | ||
@@ -184,11 +195,11 @@ kvmppc_handler_trampoline_exit: | |||
184 | mfctr r8 | 195 | mfctr r8 |
185 | mflr r9 | 196 | mflr r9 |
186 | 197 | ||
187 | stw r5, (SHADOW_VCPU_OFF + SVCPU_XER)(r13) | 198 | stw r5, SVCPU_XER(r13) |
188 | PPC_STL r6, (SHADOW_VCPU_OFF + SVCPU_FAULT_DAR)(r13) | 199 | PPC_STL r6, SVCPU_FAULT_DAR(r13) |
189 | stw r7, (SHADOW_VCPU_OFF + SVCPU_FAULT_DSISR)(r13) | 200 | stw r7, SVCPU_FAULT_DSISR(r13) |
190 | PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_CTR)(r13) | 201 | PPC_STL r8, SVCPU_CTR(r13) |
191 | PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_LR)(r13) | 202 | PPC_STL r9, SVCPU_LR(r13) |
192 | 203 | ||
193 | /* | 204 | /* |
194 | * In order for us to easily get the last instruction, | 205 | * In order for us to easily get the last instruction, |
@@ -218,7 +229,7 @@ ld_last_inst: | |||
218 | /* Set guest mode to 'jump over instruction' so if lwz faults | 229 | /* Set guest mode to 'jump over instruction' so if lwz faults |
219 | * we'll just continue at the next IP. */ | 230 | * we'll just continue at the next IP. */ |
220 | li r9, KVM_GUEST_MODE_SKIP | 231 | li r9, KVM_GUEST_MODE_SKIP |
221 | stb r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13) | 232 | stb r9, HSTATE_IN_GUEST(r13) |
222 | 233 | ||
223 | /* 1) enable paging for data */ | 234 | /* 1) enable paging for data */ |
224 | mfmsr r9 | 235 | mfmsr r9 |
@@ -232,13 +243,13 @@ ld_last_inst: | |||
232 | sync | 243 | sync |
233 | 244 | ||
234 | #endif | 245 | #endif |
235 | stw r0, (SHADOW_VCPU_OFF + SVCPU_LAST_INST)(r13) | 246 | stw r0, SVCPU_LAST_INST(r13) |
236 | 247 | ||
237 | no_ld_last_inst: | 248 | no_ld_last_inst: |
238 | 249 | ||
239 | /* Unset guest mode */ | 250 | /* Unset guest mode */ |
240 | li r9, KVM_GUEST_MODE_NONE | 251 | li r9, KVM_GUEST_MODE_NONE |
241 | stb r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13) | 252 | stb r9, HSTATE_IN_GUEST(r13) |
242 | 253 | ||
243 | /* Switch back to host MMU */ | 254 | /* Switch back to host MMU */ |
244 | LOAD_HOST_SEGMENTS | 255 | LOAD_HOST_SEGMENTS |
@@ -248,7 +259,7 @@ no_ld_last_inst: | |||
248 | * R1 = host R1 | 259 | * R1 = host R1 |
249 | * R2 = host R2 | 260 | * R2 = host R2 |
250 | * R12 = exit handler id | 261 | * R12 = exit handler id |
251 | * R13 = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64] | 262 | * R13 = shadow vcpu (32-bit) or PACA (64-bit) |
252 | * SVCPU.* = guest * | 263 | * SVCPU.* = guest * |
253 | * | 264 | * |
254 | */ | 265 | */ |
@@ -258,7 +269,7 @@ no_ld_last_inst: | |||
258 | ori r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME /* Enable paging */ | 269 | ori r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME /* Enable paging */ |
259 | mtsrr1 r7 | 270 | mtsrr1 r7 |
260 | /* Load highmem handler address */ | 271 | /* Load highmem handler address */ |
261 | PPC_LL r8, (SHADOW_VCPU_OFF + SVCPU_VMHANDLER)(r13) | 272 | PPC_LL r8, HSTATE_VMHANDLER(r13) |
262 | mtsrr0 r8 | 273 | mtsrr0 r8 |
263 | 274 | ||
264 | RFI | 275 | RFI |
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 8462b3a1c1c7..ee45fa01220e 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c | |||
@@ -13,6 +13,7 @@ | |||
13 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | 13 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
14 | * | 14 | * |
15 | * Copyright IBM Corp. 2007 | 15 | * Copyright IBM Corp. 2007 |
16 | * Copyright 2010-2011 Freescale Semiconductor, Inc. | ||
16 | * | 17 | * |
17 | * Authors: Hollis Blanchard <hollisb@us.ibm.com> | 18 | * Authors: Hollis Blanchard <hollisb@us.ibm.com> |
18 | * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> | 19 | * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> |
@@ -78,6 +79,60 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu) | |||
78 | } | 79 | } |
79 | } | 80 | } |
80 | 81 | ||
82 | #ifdef CONFIG_SPE | ||
83 | void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu) | ||
84 | { | ||
85 | preempt_disable(); | ||
86 | enable_kernel_spe(); | ||
87 | kvmppc_save_guest_spe(vcpu); | ||
88 | vcpu->arch.shadow_msr &= ~MSR_SPE; | ||
89 | preempt_enable(); | ||
90 | } | ||
91 | |||
92 | static void kvmppc_vcpu_enable_spe(struct kvm_vcpu *vcpu) | ||
93 | { | ||
94 | preempt_disable(); | ||
95 | enable_kernel_spe(); | ||
96 | kvmppc_load_guest_spe(vcpu); | ||
97 | vcpu->arch.shadow_msr |= MSR_SPE; | ||
98 | preempt_enable(); | ||
99 | } | ||
100 | |||
101 | static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu) | ||
102 | { | ||
103 | if (vcpu->arch.shared->msr & MSR_SPE) { | ||
104 | if (!(vcpu->arch.shadow_msr & MSR_SPE)) | ||
105 | kvmppc_vcpu_enable_spe(vcpu); | ||
106 | } else if (vcpu->arch.shadow_msr & MSR_SPE) { | ||
107 | kvmppc_vcpu_disable_spe(vcpu); | ||
108 | } | ||
109 | } | ||
110 | #else | ||
111 | static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu) | ||
112 | { | ||
113 | } | ||
114 | #endif | ||
115 | |||
116 | /* | ||
117 | * Helper function for "full" MSR writes. No need to call this if only | ||
118 | * EE/CE/ME/DE/RI are changing. | ||
119 | */ | ||
120 | void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) | ||
121 | { | ||
122 | u32 old_msr = vcpu->arch.shared->msr; | ||
123 | |||
124 | vcpu->arch.shared->msr = new_msr; | ||
125 | |||
126 | kvmppc_mmu_msr_notify(vcpu, old_msr); | ||
127 | |||
128 | if (vcpu->arch.shared->msr & MSR_WE) { | ||
129 | kvm_vcpu_block(vcpu); | ||
130 | kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); | ||
131 | }; | ||
132 | |||
133 | kvmppc_vcpu_sync_spe(vcpu); | ||
134 | } | ||
135 | |||
81 | static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, | 136 | static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, |
82 | unsigned int priority) | 137 | unsigned int priority) |
83 | { | 138 | { |
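
The helpers added above implement lazy SPE state switching: the guest-visible MSR (shared->msr) and the MSR the guest actually runs with (shadow_msr) are only reconciled when a "full" MSR write flips MSR[SPE]. A toy, self-contained model of that idea — the types and the surrounding call flow are simplified stand-ins, not the kernel structures:

#include <stdbool.h>
#include <stdio.h>

#define MSR_SPE (1u << 25)        /* Book E MSR[SPE] bit, used here only for illustration */

struct toy_vcpu {
        unsigned int guest_msr;   /* what the guest thinks MSR is */
        unsigned int shadow_msr;  /* what we really run the guest with */
};

static void sync_spe(struct toy_vcpu *v)
{
        bool guest_wants = v->guest_msr & MSR_SPE;
        bool hw_enabled  = v->shadow_msr & MSR_SPE;

        if (guest_wants && !hw_enabled) {
                /* the real code would load the guest SPE registers here */
                v->shadow_msr |= MSR_SPE;
        } else if (!guest_wants && hw_enabled) {
                /* the real code would save the guest SPE registers here */
                v->shadow_msr &= ~MSR_SPE;
        }
}

static void set_msr(struct toy_vcpu *v, unsigned int new_msr)
{
        v->guest_msr = new_msr;
        sync_spe(v);
}

int main(void)
{
        struct toy_vcpu v = { 0, 0 };

        set_msr(&v, MSR_SPE);   /* guest enables SPE -> state loaded lazily */
        printf("shadow SPE: %d\n", !!(v.shadow_msr & MSR_SPE));
        set_msr(&v, 0);         /* guest disables SPE -> state saved */
        printf("shadow SPE: %d\n", !!(v.shadow_msr & MSR_SPE));
        return 0;
}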
@@ -257,6 +312,19 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) | |||
257 | vcpu->arch.shared->int_pending = 0; | 312 | vcpu->arch.shared->int_pending = 0; |
258 | } | 313 | } |
259 | 314 | ||
315 | int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | ||
316 | { | ||
317 | int ret; | ||
318 | |||
319 | local_irq_disable(); | ||
320 | kvm_guest_enter(); | ||
321 | ret = __kvmppc_vcpu_run(kvm_run, vcpu); | ||
322 | kvm_guest_exit(); | ||
323 | local_irq_enable(); | ||
324 | |||
325 | return ret; | ||
326 | } | ||
327 | |||
260 | /** | 328 | /** |
261 | * kvmppc_handle_exit | 329 | * kvmppc_handle_exit |
262 | * | 330 | * |
@@ -344,10 +412,16 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, | |||
344 | r = RESUME_GUEST; | 412 | r = RESUME_GUEST; |
345 | break; | 413 | break; |
346 | 414 | ||
347 | case BOOKE_INTERRUPT_SPE_UNAVAIL: | 415 | #ifdef CONFIG_SPE |
348 | kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_UNAVAIL); | 416 | case BOOKE_INTERRUPT_SPE_UNAVAIL: { |
417 | if (vcpu->arch.shared->msr & MSR_SPE) | ||
418 | kvmppc_vcpu_enable_spe(vcpu); | ||
419 | else | ||
420 | kvmppc_booke_queue_irqprio(vcpu, | ||
421 | BOOKE_IRQPRIO_SPE_UNAVAIL); | ||
349 | r = RESUME_GUEST; | 422 | r = RESUME_GUEST; |
350 | break; | 423 | break; |
424 | } | ||
351 | 425 | ||
352 | case BOOKE_INTERRUPT_SPE_FP_DATA: | 426 | case BOOKE_INTERRUPT_SPE_FP_DATA: |
353 | kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA); | 427 | kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA); |
@@ -358,6 +432,28 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, | |||
358 | kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND); | 432 | kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND); |
359 | r = RESUME_GUEST; | 433 | r = RESUME_GUEST; |
360 | break; | 434 | break; |
435 | #else | ||
436 | case BOOKE_INTERRUPT_SPE_UNAVAIL: | ||
437 | /* | ||
438 | * Guest wants SPE, but host kernel doesn't support it. Send | ||
439 | * an "unimplemented operation" program check to the guest. | ||
440 | */ | ||
441 | kvmppc_core_queue_program(vcpu, ESR_PUO | ESR_SPV); | ||
442 | r = RESUME_GUEST; | ||
443 | break; | ||
444 | |||
445 | /* | ||
446 | * These really should never happen without CONFIG_SPE, | ||
447 | * as we should never enable the real MSR[SPE] in the guest. | ||
448 | */ | ||
449 | case BOOKE_INTERRUPT_SPE_FP_DATA: | ||
450 | case BOOKE_INTERRUPT_SPE_FP_ROUND: | ||
451 | printk(KERN_CRIT "%s: unexpected SPE interrupt %u at %08lx\n", | ||
452 | __func__, exit_nr, vcpu->arch.pc); | ||
453 | run->hw.hardware_exit_reason = exit_nr; | ||
454 | r = RESUME_HOST; | ||
455 | break; | ||
456 | #endif | ||
361 | 457 | ||
362 | case BOOKE_INTERRUPT_DATA_STORAGE: | 458 | case BOOKE_INTERRUPT_DATA_STORAGE: |
363 | kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear, | 459 | kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear, |
@@ -392,6 +488,17 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, | |||
392 | gpa_t gpaddr; | 488 | gpa_t gpaddr; |
393 | gfn_t gfn; | 489 | gfn_t gfn; |
394 | 490 | ||
491 | #ifdef CONFIG_KVM_E500 | ||
492 | if (!(vcpu->arch.shared->msr & MSR_PR) && | ||
493 | (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) { | ||
494 | kvmppc_map_magic(vcpu); | ||
495 | kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS); | ||
496 | r = RESUME_GUEST; | ||
497 | |||
498 | break; | ||
499 | } | ||
500 | #endif | ||
501 | |||
395 | /* Check the guest TLB. */ | 502 | /* Check the guest TLB. */ |
396 | gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr); | 503 | gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr); |
397 | if (gtlb_index < 0) { | 504 | if (gtlb_index < 0) { |
@@ -514,6 +621,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | |||
514 | 621 | ||
515 | vcpu->arch.pc = 0; | 622 | vcpu->arch.pc = 0; |
516 | vcpu->arch.shared->msr = 0; | 623 | vcpu->arch.shared->msr = 0; |
624 | vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; | ||
517 | kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ | 625 | kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ |
518 | 626 | ||
519 | vcpu->arch.shadow_pid = 1; | 627 | vcpu->arch.shadow_pid = 1; |
@@ -770,6 +878,26 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) | |||
770 | return -ENOTSUPP; | 878 | return -ENOTSUPP; |
771 | } | 879 | } |
772 | 880 | ||
881 | int kvmppc_core_prepare_memory_region(struct kvm *kvm, | ||
882 | struct kvm_userspace_memory_region *mem) | ||
883 | { | ||
884 | return 0; | ||
885 | } | ||
886 | |||
887 | void kvmppc_core_commit_memory_region(struct kvm *kvm, | ||
888 | struct kvm_userspace_memory_region *mem) | ||
889 | { | ||
890 | } | ||
891 | |||
892 | int kvmppc_core_init_vm(struct kvm *kvm) | ||
893 | { | ||
894 | return 0; | ||
895 | } | ||
896 | |||
897 | void kvmppc_core_destroy_vm(struct kvm *kvm) | ||
898 | { | ||
899 | } | ||
900 | |||
773 | int __init kvmppc_booke_init(void) | 901 | int __init kvmppc_booke_init(void) |
774 | { | 902 | { |
775 | unsigned long ivor[16]; | 903 | unsigned long ivor[16]; |
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index 492bb7030358..8e1fe33d64e5 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h | |||
@@ -52,24 +52,19 @@ | |||
52 | 52 | ||
53 | extern unsigned long kvmppc_booke_handlers; | 53 | extern unsigned long kvmppc_booke_handlers; |
54 | 54 | ||
55 | /* Helper function for "full" MSR writes. No need to call this if only EE is | 55 | void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); |
56 | * changing. */ | 56 | void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); |
57 | static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) | ||
58 | { | ||
59 | if ((new_msr & MSR_PR) != (vcpu->arch.shared->msr & MSR_PR)) | ||
60 | kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR); | ||
61 | |||
62 | vcpu->arch.shared->msr = new_msr; | ||
63 | |||
64 | if (vcpu->arch.shared->msr & MSR_WE) { | ||
65 | kvm_vcpu_block(vcpu); | ||
66 | kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); | ||
67 | }; | ||
68 | } | ||
69 | 57 | ||
70 | int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, | 58 | int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, |
71 | unsigned int inst, int *advance); | 59 | unsigned int inst, int *advance); |
72 | int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); | 60 | int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); |
73 | int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs); | 61 | int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs); |
74 | 62 | ||
63 | /* low-level asm code to transfer guest state */ | ||
64 | void kvmppc_load_guest_spe(struct kvm_vcpu *vcpu); | ||
65 | void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu); | ||
66 | |||
67 | /* high-level function, manages flags, host state */ | ||
68 | void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu); | ||
69 | |||
75 | #endif /* __KVM_BOOKE_H__ */ | 70 | #endif /* __KVM_BOOKE_H__ */ |
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index b58ccae95904..42f2fb1f66e9 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S | |||
@@ -13,6 +13,7 @@ | |||
13 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | 13 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
14 | * | 14 | * |
15 | * Copyright IBM Corp. 2007 | 15 | * Copyright IBM Corp. 2007 |
16 | * Copyright 2011 Freescale Semiconductor, Inc. | ||
16 | * | 17 | * |
17 | * Authors: Hollis Blanchard <hollisb@us.ibm.com> | 18 | * Authors: Hollis Blanchard <hollisb@us.ibm.com> |
18 | */ | 19 | */ |
@@ -24,8 +25,6 @@ | |||
24 | #include <asm/page.h> | 25 | #include <asm/page.h> |
25 | #include <asm/asm-offsets.h> | 26 | #include <asm/asm-offsets.h> |
26 | 27 | ||
27 | #define KVMPPC_MSR_MASK (MSR_CE|MSR_EE|MSR_PR|MSR_DE|MSR_ME|MSR_IS|MSR_DS) | ||
28 | |||
29 | #define VCPU_GPR(n) (VCPU_GPRS + (n * 4)) | 28 | #define VCPU_GPR(n) (VCPU_GPRS + (n * 4)) |
30 | 29 | ||
31 | /* The host stack layout: */ | 30 | /* The host stack layout: */ |
@@ -192,6 +191,12 @@ _GLOBAL(kvmppc_resume_host) | |||
192 | lwz r3, VCPU_HOST_PID(r4) | 191 | lwz r3, VCPU_HOST_PID(r4) |
193 | mtspr SPRN_PID, r3 | 192 | mtspr SPRN_PID, r3 |
194 | 193 | ||
194 | #ifdef CONFIG_FSL_BOOKE | ||
195 | /* we cheat and know that Linux doesn't use PID1 which is always 0 */ | ||
196 | lis r3, 0 | ||
197 | mtspr SPRN_PID1, r3 | ||
198 | #endif | ||
199 | |||
195 | /* Restore host IVPR before re-enabling interrupts. We cheat and know | 200 | /* Restore host IVPR before re-enabling interrupts. We cheat and know |
196 | * that Linux IVPR is always 0xc0000000. */ | 201 | * that Linux IVPR is always 0xc0000000. */ |
197 | lis r3, 0xc000 | 202 | lis r3, 0xc000 |
@@ -241,6 +246,14 @@ _GLOBAL(kvmppc_resume_host) | |||
241 | heavyweight_exit: | 246 | heavyweight_exit: |
242 | /* Not returning to guest. */ | 247 | /* Not returning to guest. */ |
243 | 248 | ||
249 | #ifdef CONFIG_SPE | ||
250 | /* save guest SPEFSCR and load host SPEFSCR */ | ||
251 | mfspr r9, SPRN_SPEFSCR | ||
252 | stw r9, VCPU_SPEFSCR(r4) | ||
253 | lwz r9, VCPU_HOST_SPEFSCR(r4) | ||
254 | mtspr SPRN_SPEFSCR, r9 | ||
255 | #endif | ||
256 | |||
244 | /* We already saved guest volatile register state; now save the | 257 | /* We already saved guest volatile register state; now save the |
245 | * non-volatiles. */ | 258 | * non-volatiles. */ |
246 | stw r15, VCPU_GPR(r15)(r4) | 259 | stw r15, VCPU_GPR(r15)(r4) |
@@ -342,6 +355,14 @@ _GLOBAL(__kvmppc_vcpu_run) | |||
342 | lwz r30, VCPU_GPR(r30)(r4) | 355 | lwz r30, VCPU_GPR(r30)(r4) |
343 | lwz r31, VCPU_GPR(r31)(r4) | 356 | lwz r31, VCPU_GPR(r31)(r4) |
344 | 357 | ||
358 | #ifdef CONFIG_SPE | ||
359 | /* save host SPEFSCR and load guest SPEFSCR */ | ||
360 | mfspr r3, SPRN_SPEFSCR | ||
361 | stw r3, VCPU_HOST_SPEFSCR(r4) | ||
362 | lwz r3, VCPU_SPEFSCR(r4) | ||
363 | mtspr SPRN_SPEFSCR, r3 | ||
364 | #endif | ||
365 | |||
345 | lightweight_exit: | 366 | lightweight_exit: |
346 | stw r2, HOST_R2(r1) | 367 | stw r2, HOST_R2(r1) |
347 | 368 | ||
@@ -350,6 +371,11 @@ lightweight_exit: | |||
350 | lwz r3, VCPU_SHADOW_PID(r4) | 371 | lwz r3, VCPU_SHADOW_PID(r4) |
351 | mtspr SPRN_PID, r3 | 372 | mtspr SPRN_PID, r3 |
352 | 373 | ||
374 | #ifdef CONFIG_FSL_BOOKE | ||
375 | lwz r3, VCPU_SHADOW_PID1(r4) | ||
376 | mtspr SPRN_PID1, r3 | ||
377 | #endif | ||
378 | |||
353 | #ifdef CONFIG_44x | 379 | #ifdef CONFIG_44x |
354 | iccci 0, 0 /* XXX hack */ | 380 | iccci 0, 0 /* XXX hack */ |
355 | #endif | 381 | #endif |
@@ -405,20 +431,17 @@ lightweight_exit: | |||
405 | 431 | ||
406 | /* Finish loading guest volatiles and jump to guest. */ | 432 | /* Finish loading guest volatiles and jump to guest. */ |
407 | lwz r3, VCPU_CTR(r4) | 433 | lwz r3, VCPU_CTR(r4) |
434 | lwz r5, VCPU_CR(r4) | ||
435 | lwz r6, VCPU_PC(r4) | ||
436 | lwz r7, VCPU_SHADOW_MSR(r4) | ||
408 | mtctr r3 | 437 | mtctr r3 |
409 | lwz r3, VCPU_CR(r4) | 438 | mtcr r5 |
410 | mtcr r3 | 439 | mtsrr0 r6 |
440 | mtsrr1 r7 | ||
411 | lwz r5, VCPU_GPR(r5)(r4) | 441 | lwz r5, VCPU_GPR(r5)(r4) |
412 | lwz r6, VCPU_GPR(r6)(r4) | 442 | lwz r6, VCPU_GPR(r6)(r4) |
413 | lwz r7, VCPU_GPR(r7)(r4) | 443 | lwz r7, VCPU_GPR(r7)(r4) |
414 | lwz r8, VCPU_GPR(r8)(r4) | 444 | lwz r8, VCPU_GPR(r8)(r4) |
415 | lwz r3, VCPU_PC(r4) | ||
416 | mtsrr0 r3 | ||
417 | lwz r3, VCPU_SHARED(r4) | ||
418 | lwz r3, (VCPU_SHARED_MSR + 4)(r3) | ||
419 | oris r3, r3, KVMPPC_MSR_MASK@h | ||
420 | ori r3, r3, KVMPPC_MSR_MASK@l | ||
421 | mtsrr1 r3 | ||
422 | 445 | ||
423 | /* Clear any debug events which occurred since we disabled MSR[DE]. | 446 | /* Clear any debug events which occurred since we disabled MSR[DE]. |
424 | * XXX This gives us a 3-instruction window in which a breakpoint | 447 | * XXX This gives us a 3-instruction window in which a breakpoint |
@@ -430,3 +453,24 @@ lightweight_exit: | |||
430 | lwz r3, VCPU_GPR(r3)(r4) | 453 | lwz r3, VCPU_GPR(r3)(r4) |
431 | lwz r4, VCPU_GPR(r4)(r4) | 454 | lwz r4, VCPU_GPR(r4)(r4) |
432 | rfi | 455 | rfi |
456 | |||
457 | #ifdef CONFIG_SPE | ||
458 | _GLOBAL(kvmppc_save_guest_spe) | ||
459 | cmpi 0,r3,0 | ||
460 | beqlr- | ||
461 | SAVE_32EVRS(0, r4, r3, VCPU_EVR) | ||
462 | evxor evr6, evr6, evr6 | ||
463 | evmwumiaa evr6, evr6, evr6 | ||
464 | li r4,VCPU_ACC | ||
465 | evstddx evr6, r4, r3 /* save acc */ | ||
466 | blr | ||
467 | |||
468 | _GLOBAL(kvmppc_load_guest_spe) | ||
469 | cmpi 0,r3,0 | ||
470 | beqlr- | ||
471 | li r4,VCPU_ACC | ||
472 | evlddx evr6,r4,r3 | ||
473 | evmra evr6,evr6 /* load acc */ | ||
474 | REST_32EVRS(0, r4, r3, VCPU_EVR) | ||
475 | blr | ||
476 | #endif | ||
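
The SPE save/restore paths above reference VCPU_EVR, VCPU_ACC, VCPU_SPEFSCR and VCPU_HOST_SPEFSCR offsets. A sketch of the per-vcpu state they presumably back — field names and widths are an assumption for illustration, not taken from the kernel headers:

#include <stdint.h>
#include <stdio.h>

struct spe_state_sketch {
        uint32_t evr[32];       /* upper 32 bits of the 64-bit GPRs (SAVE_32EVRS) */
        uint64_t acc;           /* SPE accumulator, saved via evstddx above */
        uint32_t spefscr;       /* guest SPEFSCR */
        uint32_t host_spefscr;  /* host SPEFSCR, restored on heavyweight exit */
};

int main(void)
{
        printf("sketch state size: %zu bytes\n", sizeof(struct spe_state_sketch));
        return 0;
}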
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 318dbc61ba44..797a7447c268 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. | 2 | * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. |
3 | * | 3 | * |
4 | * Author: Yu Liu, <yu.liu@freescale.com> | 4 | * Author: Yu Liu, <yu.liu@freescale.com> |
5 | * | 5 | * |
@@ -41,6 +41,11 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
41 | void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) | 41 | void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) |
42 | { | 42 | { |
43 | kvmppc_e500_tlb_put(vcpu); | 43 | kvmppc_e500_tlb_put(vcpu); |
44 | |||
45 | #ifdef CONFIG_SPE | ||
46 | if (vcpu->arch.shadow_msr & MSR_SPE) | ||
47 | kvmppc_vcpu_disable_spe(vcpu); | ||
48 | #endif | ||
44 | } | 49 | } |
45 | 50 | ||
46 | int kvmppc_core_check_processor_compat(void) | 51 | int kvmppc_core_check_processor_compat(void) |
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 69cd665a0caf..d48ae396f41e 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c | |||
@@ -81,8 +81,12 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) | |||
81 | kvmppc_set_pid(vcpu, spr_val); | 81 | kvmppc_set_pid(vcpu, spr_val); |
82 | break; | 82 | break; |
83 | case SPRN_PID1: | 83 | case SPRN_PID1: |
84 | if (spr_val != 0) | ||
85 | return EMULATE_FAIL; | ||
84 | vcpu_e500->pid[1] = spr_val; break; | 86 | vcpu_e500->pid[1] = spr_val; break; |
85 | case SPRN_PID2: | 87 | case SPRN_PID2: |
88 | if (spr_val != 0) | ||
89 | return EMULATE_FAIL; | ||
86 | vcpu_e500->pid[2] = spr_val; break; | 90 | vcpu_e500->pid[2] = spr_val; break; |
87 | case SPRN_MAS0: | 91 | case SPRN_MAS0: |
88 | vcpu_e500->mas0 = spr_val; break; | 92 | vcpu_e500->mas0 = spr_val; break; |
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index b18fe353397d..13c432ea2fa8 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c | |||
@@ -28,8 +28,196 @@ | |||
28 | 28 | ||
29 | #define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) | 29 | #define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) |
30 | 30 | ||
31 | struct id { | ||
32 | unsigned long val; | ||
33 | struct id **pentry; | ||
34 | }; | ||
35 | |||
36 | #define NUM_TIDS 256 | ||
37 | |||
38 | /* | ||
39 | * This table provides mappings from: | ||
40 | * (guestAS,guestTID,guestPR) --> ID of physical cpu | ||
41 | * guestAS [0..1] | ||
42 | * guestTID [0..255] | ||
43 | * guestPR [0..1] | ||
44 | * ID [1..255] | ||
45 | * Each vcpu keeps one vcpu_id_table. | ||
46 | */ | ||
47 | struct vcpu_id_table { | ||
48 | struct id id[2][NUM_TIDS][2]; | ||
49 | }; | ||
50 | |||
51 | /* | ||
52 | * This table provides the reverse mapping of vcpu_id_table: | ||
53 | * ID --> address of vcpu_id_table item. | ||
54 | * Each physical core has one pcpu_id_table. | ||
55 | */ | ||
56 | struct pcpu_id_table { | ||
57 | struct id *entry[NUM_TIDS]; | ||
58 | }; | ||
59 | |||
60 | static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids); | ||
61 | |||
62 | /* This variable keeps the last used shadow ID on the local core. | ||
63 | * The valid range of shadow IDs is [1..255]. */ | ||
64 | static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid); | ||
65 | |||
31 | static unsigned int tlb1_entry_num; | 66 | static unsigned int tlb1_entry_num; |
32 | 67 | ||
68 | /* | ||
69 | * Allocate a free shadow id and set up a valid sid mapping in the given entry. | ||
70 | * A mapping is only valid when vcpu_id_table and pcpu_id_table match. | ||
71 | * | ||
72 | * The caller must have preemption disabled, and keep it that way until | ||
73 | * it has finished with the returned shadow id (either written into the | ||
74 | * TLB or arch.shadow_pid, or discarded). | ||
75 | */ | ||
76 | static inline int local_sid_setup_one(struct id *entry) | ||
77 | { | ||
78 | unsigned long sid; | ||
79 | int ret = -1; | ||
80 | |||
81 | sid = ++(__get_cpu_var(pcpu_last_used_sid)); | ||
82 | if (sid < NUM_TIDS) { | ||
83 | __get_cpu_var(pcpu_sids).entry[sid] = entry; | ||
84 | entry->val = sid; | ||
85 | entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid]; | ||
86 | ret = sid; | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * If sid == NUM_TIDS, we've run out of sids. We return -1, and | ||
91 | * the caller will invalidate everything and start over. | ||
92 | * | ||
93 | * sid > NUM_TIDS indicates a race, which we disable preemption to | ||
94 | * avoid. | ||
95 | */ | ||
96 | WARN_ON(sid > NUM_TIDS); | ||
97 | |||
98 | return ret; | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * Check whether the given entry contains a valid shadow id mapping. | ||
103 | * An ID mapping is considered valid only if | ||
104 | * both vcpu and pcpu know this mapping. | ||
105 | * | ||
106 | * The caller must have preemption disabled, and keep it that way until | ||
107 | * it has finished with the returned shadow id (either written into the | ||
108 | * TLB or arch.shadow_pid, or discarded). | ||
109 | */ | ||
110 | static inline int local_sid_lookup(struct id *entry) | ||
111 | { | ||
112 | if (entry && entry->val != 0 && | ||
113 | __get_cpu_var(pcpu_sids).entry[entry->val] == entry && | ||
114 | entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val]) | ||
115 | return entry->val; | ||
116 | return -1; | ||
117 | } | ||
118 | |||
119 | /* Invalidate all id mappings on local core */ | ||
120 | static inline void local_sid_destroy_all(void) | ||
121 | { | ||
122 | preempt_disable(); | ||
123 | __get_cpu_var(pcpu_last_used_sid) = 0; | ||
124 | memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids))); | ||
125 | preempt_enable(); | ||
126 | } | ||
127 | |||
128 | static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500) | ||
129 | { | ||
130 | vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL); | ||
131 | return vcpu_e500->idt; | ||
132 | } | ||
133 | |||
134 | static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500) | ||
135 | { | ||
136 | kfree(vcpu_e500->idt); | ||
137 | } | ||
138 | |||
139 | /* Invalidate all mappings on vcpu */ | ||
140 | static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500) | ||
141 | { | ||
142 | memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table)); | ||
143 | |||
144 | /* Update shadow pid when mappings are changed */ | ||
145 | kvmppc_e500_recalc_shadow_pid(vcpu_e500); | ||
146 | } | ||
147 | |||
148 | /* Invalidate one ID mapping on vcpu */ | ||
149 | static inline void kvmppc_e500_id_table_reset_one( | ||
150 | struct kvmppc_vcpu_e500 *vcpu_e500, | ||
151 | int as, int pid, int pr) | ||
152 | { | ||
153 | struct vcpu_id_table *idt = vcpu_e500->idt; | ||
154 | |||
155 | BUG_ON(as >= 2); | ||
156 | BUG_ON(pid >= NUM_TIDS); | ||
157 | BUG_ON(pr >= 2); | ||
158 | |||
159 | idt->id[as][pid][pr].val = 0; | ||
160 | idt->id[as][pid][pr].pentry = NULL; | ||
161 | |||
162 | /* Update shadow pid when mappings are changed */ | ||
163 | kvmppc_e500_recalc_shadow_pid(vcpu_e500); | ||
164 | } | ||
165 | |||
166 | /* | ||
167 | * Map guest (vcpu,AS,ID,PR) to physical core shadow id. | ||
168 | * This function first looks up whether a valid mapping exists; | ||
169 | * if not, it creates a new one. | ||
170 | * | ||
171 | * The caller must have preemption disabled, and keep it that way until | ||
172 | * it has finished with the returned shadow id (either written into the | ||
173 | * TLB or arch.shadow_pid, or discarded). | ||
174 | */ | ||
175 | static unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, | ||
176 | unsigned int as, unsigned int gid, | ||
177 | unsigned int pr, int avoid_recursion) | ||
178 | { | ||
179 | struct vcpu_id_table *idt = vcpu_e500->idt; | ||
180 | int sid; | ||
181 | |||
182 | BUG_ON(as >= 2); | ||
183 | BUG_ON(gid >= NUM_TIDS); | ||
184 | BUG_ON(pr >= 2); | ||
185 | |||
186 | sid = local_sid_lookup(&idt->id[as][gid][pr]); | ||
187 | |||
188 | while (sid <= 0) { | ||
189 | /* No mapping yet */ | ||
190 | sid = local_sid_setup_one(&idt->id[as][gid][pr]); | ||
191 | if (sid <= 0) { | ||
192 | _tlbil_all(); | ||
193 | local_sid_destroy_all(); | ||
194 | } | ||
195 | |||
196 | /* Update shadow pid when mappings are changed */ | ||
197 | if (!avoid_recursion) | ||
198 | kvmppc_e500_recalc_shadow_pid(vcpu_e500); | ||
199 | } | ||
200 | |||
201 | return sid; | ||
202 | } | ||
203 | |||
204 | /* Map guest pid to shadow. | ||
205 | * We use PID to keep the shadow of the current guest's non-zero PID, | ||
206 | * and use PID1 to keep the shadow of guest PID 0, | ||
207 | * so that guest tlbes with TID=0 can be accessed at any time. */ | ||
208 | void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500) | ||
209 | { | ||
210 | preempt_disable(); | ||
211 | vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500, | ||
212 | get_cur_as(&vcpu_e500->vcpu), | ||
213 | get_cur_pid(&vcpu_e500->vcpu), | ||
214 | get_cur_pr(&vcpu_e500->vcpu), 1); | ||
215 | vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500, | ||
216 | get_cur_as(&vcpu_e500->vcpu), 0, | ||
217 | get_cur_pr(&vcpu_e500->vcpu), 1); | ||
218 | preempt_enable(); | ||
219 | } | ||
220 | |||
33 | void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) | 221 | void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) |
34 | { | 222 | { |
35 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); | 223 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); |
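
The comments above describe the shadow-ID scheme: each vcpu maps (AS, TID, PR) to a per-cpu shadow ID, and the per-cpu table holds the reverse pointers, so a mapping counts as valid only while both sides still point at each other. A minimal single-CPU model of that idea, with preemption and the per-cpu machinery stripped out — a sketch of the scheme, not the kernel code:

#include <stdio.h>
#include <string.h>

#define NUM_TIDS 256

struct id {
        unsigned long val;
        struct id **pentry;
};

static struct id *pcpu_entry[NUM_TIDS];
static unsigned long pcpu_last_used_sid;

static int sid_lookup(struct id *e)
{
        if (e->val != 0 && pcpu_entry[e->val] == e &&
            e->pentry == &pcpu_entry[e->val])
                return (int)e->val;
        return -1;
}

static int sid_setup_one(struct id *e)
{
        unsigned long sid = ++pcpu_last_used_sid;

        if (sid >= NUM_TIDS)
                return -1;              /* out of sids: caller must flush and retry */
        pcpu_entry[sid] = e;
        e->val = sid;
        e->pentry = &pcpu_entry[sid];
        return (int)sid;
}

static void sid_destroy_all(void)
{
        pcpu_last_used_sid = 0;
        memset(pcpu_entry, 0, sizeof(pcpu_entry));
}

static int get_sid(struct id *e)
{
        int sid = sid_lookup(e);

        while (sid <= 0) {
                sid = sid_setup_one(e);
                if (sid <= 0)
                        sid_destroy_all();  /* models _tlbil_all() + table reset */
        }
        return sid;
}

int main(void)
{
        struct id guest_pid_3 = { 0, NULL };

        printf("first sid: %d\n", get_sid(&guest_pid_3));    /* allocates 1 */
        printf("cached sid: %d\n", get_sid(&guest_pid_3));   /* lookup hit */
        sid_destroy_all();                                   /* host-side flush */
        printf("re-allocated: %d\n", get_sid(&guest_pid_3)); /* stale entry re-mapped */
        return 0;
}

Because validity requires agreement between both tables, flushing the per-cpu side invalidates every vcpu's mappings at once without walking any vcpu-side table.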
@@ -41,25 +229,14 @@ void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) | |||
41 | 229 | ||
42 | for (tlbsel = 0; tlbsel < 2; tlbsel++) { | 230 | for (tlbsel = 0; tlbsel < 2; tlbsel++) { |
43 | printk("Guest TLB%d:\n", tlbsel); | 231 | printk("Guest TLB%d:\n", tlbsel); |
44 | for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) { | 232 | for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) { |
45 | tlbe = &vcpu_e500->guest_tlb[tlbsel][i]; | 233 | tlbe = &vcpu_e500->gtlb_arch[tlbsel][i]; |
46 | if (tlbe->mas1 & MAS1_VALID) | 234 | if (tlbe->mas1 & MAS1_VALID) |
47 | printk(" G[%d][%3d] | %08X | %08X | %08X | %08X |\n", | 235 | printk(" G[%d][%3d] | %08X | %08X | %08X | %08X |\n", |
48 | tlbsel, i, tlbe->mas1, tlbe->mas2, | 236 | tlbsel, i, tlbe->mas1, tlbe->mas2, |
49 | tlbe->mas3, tlbe->mas7); | 237 | tlbe->mas3, tlbe->mas7); |
50 | } | 238 | } |
51 | } | 239 | } |
52 | |||
53 | for (tlbsel = 0; tlbsel < 2; tlbsel++) { | ||
54 | printk("Shadow TLB%d:\n", tlbsel); | ||
55 | for (i = 0; i < vcpu_e500->shadow_tlb_size[tlbsel]; i++) { | ||
56 | tlbe = &vcpu_e500->shadow_tlb[tlbsel][i]; | ||
57 | if (tlbe->mas1 & MAS1_VALID) | ||
58 | printk(" S[%d][%3d] | %08X | %08X | %08X | %08X |\n", | ||
59 | tlbsel, i, tlbe->mas1, tlbe->mas2, | ||
60 | tlbe->mas3, tlbe->mas7); | ||
61 | } | ||
62 | } | ||
63 | } | 240 | } |
64 | 241 | ||
65 | static inline unsigned int tlb0_get_next_victim( | 242 | static inline unsigned int tlb0_get_next_victim( |
@@ -67,16 +244,17 @@ static inline unsigned int tlb0_get_next_victim( | |||
67 | { | 244 | { |
68 | unsigned int victim; | 245 | unsigned int victim; |
69 | 246 | ||
70 | victim = vcpu_e500->guest_tlb_nv[0]++; | 247 | victim = vcpu_e500->gtlb_nv[0]++; |
71 | if (unlikely(vcpu_e500->guest_tlb_nv[0] >= KVM_E500_TLB0_WAY_NUM)) | 248 | if (unlikely(vcpu_e500->gtlb_nv[0] >= KVM_E500_TLB0_WAY_NUM)) |
72 | vcpu_e500->guest_tlb_nv[0] = 0; | 249 | vcpu_e500->gtlb_nv[0] = 0; |
73 | 250 | ||
74 | return victim; | 251 | return victim; |
75 | } | 252 | } |
76 | 253 | ||
77 | static inline unsigned int tlb1_max_shadow_size(void) | 254 | static inline unsigned int tlb1_max_shadow_size(void) |
78 | { | 255 | { |
79 | return tlb1_entry_num - tlbcam_index; | 256 | /* reserve one entry for magic page */ |
257 | return tlb1_entry_num - tlbcam_index - 1; | ||
80 | } | 258 | } |
81 | 259 | ||
82 | static inline int tlbe_is_writable(struct tlbe *tlbe) | 260 | static inline int tlbe_is_writable(struct tlbe *tlbe) |
@@ -112,72 +290,149 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) | |||
112 | /* | 290 | /* |
113 | * writing shadow tlb entry to host TLB | 291 | * writing shadow tlb entry to host TLB |
114 | */ | 292 | */ |
115 | static inline void __write_host_tlbe(struct tlbe *stlbe) | 293 | static inline void __write_host_tlbe(struct tlbe *stlbe, uint32_t mas0) |
116 | { | 294 | { |
295 | unsigned long flags; | ||
296 | |||
297 | local_irq_save(flags); | ||
298 | mtspr(SPRN_MAS0, mas0); | ||
117 | mtspr(SPRN_MAS1, stlbe->mas1); | 299 | mtspr(SPRN_MAS1, stlbe->mas1); |
118 | mtspr(SPRN_MAS2, stlbe->mas2); | 300 | mtspr(SPRN_MAS2, stlbe->mas2); |
119 | mtspr(SPRN_MAS3, stlbe->mas3); | 301 | mtspr(SPRN_MAS3, stlbe->mas3); |
120 | mtspr(SPRN_MAS7, stlbe->mas7); | 302 | mtspr(SPRN_MAS7, stlbe->mas7); |
121 | __asm__ __volatile__ ("tlbwe\n" : : ); | 303 | asm volatile("isync; tlbwe" : : : "memory"); |
304 | local_irq_restore(flags); | ||
122 | } | 305 | } |
123 | 306 | ||
124 | static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, | 307 | static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, |
125 | int tlbsel, int esel) | 308 | int tlbsel, int esel, struct tlbe *stlbe) |
126 | { | 309 | { |
127 | struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; | ||
128 | |||
129 | local_irq_disable(); | ||
130 | if (tlbsel == 0) { | 310 | if (tlbsel == 0) { |
131 | __write_host_tlbe(stlbe); | 311 | __write_host_tlbe(stlbe, |
312 | MAS0_TLBSEL(0) | | ||
313 | MAS0_ESEL(esel & (KVM_E500_TLB0_WAY_NUM - 1))); | ||
132 | } else { | 314 | } else { |
133 | unsigned register mas0; | 315 | __write_host_tlbe(stlbe, |
134 | 316 | MAS0_TLBSEL(1) | | |
135 | mas0 = mfspr(SPRN_MAS0); | 317 | MAS0_ESEL(to_htlb1_esel(esel))); |
136 | |||
137 | mtspr(SPRN_MAS0, MAS0_TLBSEL(1) | MAS0_ESEL(to_htlb1_esel(esel))); | ||
138 | __write_host_tlbe(stlbe); | ||
139 | |||
140 | mtspr(SPRN_MAS0, mas0); | ||
141 | } | 318 | } |
142 | local_irq_enable(); | 319 | trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, |
320 | stlbe->mas3, stlbe->mas7); | ||
321 | } | ||
322 | |||
323 | void kvmppc_map_magic(struct kvm_vcpu *vcpu) | ||
324 | { | ||
325 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); | ||
326 | struct tlbe magic; | ||
327 | ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; | ||
328 | unsigned int stid; | ||
329 | pfn_t pfn; | ||
330 | |||
331 | pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT; | ||
332 | get_page(pfn_to_page(pfn)); | ||
333 | |||
334 | preempt_disable(); | ||
335 | stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0); | ||
336 | |||
337 | magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) | | ||
338 | MAS1_TSIZE(BOOK3E_PAGESZ_4K); | ||
339 | magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M; | ||
340 | magic.mas3 = (pfn << PAGE_SHIFT) | | ||
341 | MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; | ||
342 | magic.mas7 = pfn >> (32 - PAGE_SHIFT); | ||
343 | |||
344 | __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); | ||
345 | preempt_enable(); | ||
143 | } | 346 | } |
144 | 347 | ||
145 | void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu) | 348 | void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu) |
146 | { | 349 | { |
147 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); | 350 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); |
148 | int i; | 351 | |
149 | unsigned register mas0; | 352 | /* Shadow PID may be expired on local core */ |
150 | 353 | kvmppc_e500_recalc_shadow_pid(vcpu_e500); | |
151 | /* Load all valid TLB1 entries to reduce guest tlb miss fault */ | ||
152 | local_irq_disable(); | ||
153 | mas0 = mfspr(SPRN_MAS0); | ||
154 | for (i = 0; i < tlb1_max_shadow_size(); i++) { | ||
155 | struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i]; | ||
156 | |||
157 | if (get_tlb_v(stlbe)) { | ||
158 | mtspr(SPRN_MAS0, MAS0_TLBSEL(1) | ||
159 | | MAS0_ESEL(to_htlb1_esel(i))); | ||
160 | __write_host_tlbe(stlbe); | ||
161 | } | ||
162 | } | ||
163 | mtspr(SPRN_MAS0, mas0); | ||
164 | local_irq_enable(); | ||
165 | } | 354 | } |
166 | 355 | ||
167 | void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu) | 356 | void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu) |
168 | { | 357 | { |
169 | _tlbil_all(); | 358 | } |
359 | |||
360 | static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, | ||
361 | int tlbsel, int esel) | ||
362 | { | ||
363 | struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; | ||
364 | struct vcpu_id_table *idt = vcpu_e500->idt; | ||
365 | unsigned int pr, tid, ts, pid; | ||
366 | u32 val, eaddr; | ||
367 | unsigned long flags; | ||
368 | |||
369 | ts = get_tlb_ts(gtlbe); | ||
370 | tid = get_tlb_tid(gtlbe); | ||
371 | |||
372 | preempt_disable(); | ||
373 | |||
374 | /* One guest ID may be mapped to two shadow IDs */ | ||
375 | for (pr = 0; pr < 2; pr++) { | ||
376 | /* | ||
377 | * The shadow PID can have a valid mapping on at most one | ||
378 | * host CPU. In the common case, it will be valid on this | ||
379 | * CPU, in which case (for TLB0) we do a local invalidation | ||
380 | * of the specific address. | ||
381 | * | ||
382 | * If the shadow PID is not valid on the current host CPU, or | ||
383 | * if we're invalidating a TLB1 entry, we invalidate the | ||
384 | * entire shadow PID. | ||
385 | */ | ||
386 | if (tlbsel == 1 || | ||
387 | (pid = local_sid_lookup(&idt->id[ts][tid][pr])) <= 0) { | ||
388 | kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr); | ||
389 | continue; | ||
390 | } | ||
391 | |||
392 | /* | ||
393 | * The guest is invalidating a TLB0 entry which is in a PID | ||
394 | * that has a valid shadow mapping on this host CPU. We | ||
395 | * search host TLB0 to invalidate its shadow TLB entry, | ||
396 | * similar to __tlbil_va except that we need to look in AS1. | ||
397 | */ | ||
398 | val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS; | ||
399 | eaddr = get_tlb_eaddr(gtlbe); | ||
400 | |||
401 | local_irq_save(flags); | ||
402 | |||
403 | mtspr(SPRN_MAS6, val); | ||
404 | asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr)); | ||
405 | val = mfspr(SPRN_MAS1); | ||
406 | if (val & MAS1_VALID) { | ||
407 | mtspr(SPRN_MAS1, val & ~MAS1_VALID); | ||
408 | asm volatile("tlbwe"); | ||
409 | } | ||
410 | |||
411 | local_irq_restore(flags); | ||
412 | } | ||
413 | |||
414 | preempt_enable(); | ||
170 | } | 415 | } |
171 | 416 | ||
172 | /* Search the guest TLB for a matching entry. */ | 417 | /* Search the guest TLB for a matching entry. */ |
173 | static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, | 418 | static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, |
174 | gva_t eaddr, int tlbsel, unsigned int pid, int as) | 419 | gva_t eaddr, int tlbsel, unsigned int pid, int as) |
175 | { | 420 | { |
421 | int size = vcpu_e500->gtlb_size[tlbsel]; | ||
422 | int set_base; | ||
176 | int i; | 423 | int i; |
177 | 424 | ||
178 | /* XXX Replace loop with fancy data structures. */ | 425 | if (tlbsel == 0) { |
179 | for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) { | 426 | int mask = size / KVM_E500_TLB0_WAY_NUM - 1; |
180 | struct tlbe *tlbe = &vcpu_e500->guest_tlb[tlbsel][i]; | 427 | set_base = (eaddr >> PAGE_SHIFT) & mask; |
428 | set_base *= KVM_E500_TLB0_WAY_NUM; | ||
429 | size = KVM_E500_TLB0_WAY_NUM; | ||
430 | } else { | ||
431 | set_base = 0; | ||
432 | } | ||
433 | |||
434 | for (i = 0; i < size; i++) { | ||
435 | struct tlbe *tlbe = &vcpu_e500->gtlb_arch[tlbsel][set_base + i]; | ||
181 | unsigned int tid; | 436 | unsigned int tid; |
182 | 437 | ||
183 | if (eaddr < get_tlb_eaddr(tlbe)) | 438 | if (eaddr < get_tlb_eaddr(tlbe)) |
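
With the gtlb_arch rename, guest TLB0 is also searched set-associatively: the effective address selects one set and only KVM_E500_TLB0_WAY_NUM ways are scanned instead of the whole array. A small sketch of the index computation, with illustrative sizes rather than the KVM defaults:

#include <stdio.h>

#define PAGE_SHIFT  12
#define TLB0_WAYS   2             /* stands in for KVM_E500_TLB0_WAY_NUM */
#define TLB0_SIZE   128           /* total entries; a multiple of the way count */

static int tlb0_set_base(unsigned long eaddr)
{
        int sets = TLB0_SIZE / TLB0_WAYS;
        int set  = (eaddr >> PAGE_SHIFT) & (sets - 1);

        return set * TLB0_WAYS;   /* first entry of the set; scan TLB0_WAYS entries */
}

int main(void)
{
        unsigned long ea = 0x10003000;

        printf("eaddr 0x%lx -> entries [%d..%d)\n", ea,
               tlb0_set_base(ea), tlb0_set_base(ea) + TLB0_WAYS);
        return 0;
}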
@@ -196,66 +451,32 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, | |||
196 | if (get_tlb_ts(tlbe) != as && as != -1) | 451 | if (get_tlb_ts(tlbe) != as && as != -1) |
197 | continue; | 452 | continue; |
198 | 453 | ||
199 | return i; | 454 | return set_base + i; |
200 | } | 455 | } |
201 | 456 | ||
202 | return -1; | 457 | return -1; |
203 | } | 458 | } |
204 | 459 | ||
205 | static void kvmppc_e500_shadow_release(struct kvmppc_vcpu_e500 *vcpu_e500, | 460 | static inline void kvmppc_e500_priv_setup(struct tlbe_priv *priv, |
206 | int tlbsel, int esel) | 461 | struct tlbe *gtlbe, |
207 | { | 462 | pfn_t pfn) |
208 | struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; | ||
209 | struct page *page = vcpu_e500->shadow_pages[tlbsel][esel]; | ||
210 | |||
211 | if (page) { | ||
212 | vcpu_e500->shadow_pages[tlbsel][esel] = NULL; | ||
213 | |||
214 | if (get_tlb_v(stlbe)) { | ||
215 | if (tlbe_is_writable(stlbe)) | ||
216 | kvm_release_page_dirty(page); | ||
217 | else | ||
218 | kvm_release_page_clean(page); | ||
219 | } | ||
220 | } | ||
221 | } | ||
222 | |||
223 | static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, | ||
224 | int tlbsel, int esel) | ||
225 | { | 463 | { |
226 | struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; | 464 | priv->pfn = pfn; |
465 | priv->flags = E500_TLB_VALID; | ||
227 | 466 | ||
228 | kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); | 467 | if (tlbe_is_writable(gtlbe)) |
229 | stlbe->mas1 = 0; | 468 | priv->flags |= E500_TLB_DIRTY; |
230 | trace_kvm_stlb_inval(index_of(tlbsel, esel)); | ||
231 | } | 469 | } |
232 | 470 | ||
233 | static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, | 471 | static inline void kvmppc_e500_priv_release(struct tlbe_priv *priv) |
234 | gva_t eaddr, gva_t eend, u32 tid) | ||
235 | { | 472 | { |
236 | unsigned int pid = tid & 0xff; | 473 | if (priv->flags & E500_TLB_VALID) { |
237 | unsigned int i; | 474 | if (priv->flags & E500_TLB_DIRTY) |
238 | 475 | kvm_release_pfn_dirty(priv->pfn); | |
239 | /* XXX Replace loop with fancy data structures. */ | 476 | else |
240 | for (i = 0; i < vcpu_e500->guest_tlb_size[1]; i++) { | 477 | kvm_release_pfn_clean(priv->pfn); |
241 | struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i]; | ||
242 | unsigned int tid; | ||
243 | |||
244 | if (!get_tlb_v(stlbe)) | ||
245 | continue; | ||
246 | |||
247 | if (eend < get_tlb_eaddr(stlbe)) | ||
248 | continue; | ||
249 | 478 | ||
250 | if (eaddr > get_tlb_end(stlbe)) | 479 | priv->flags = 0; |
251 | continue; | ||
252 | |||
253 | tid = get_tlb_tid(stlbe); | ||
254 | if (tid && (tid != pid)) | ||
255 | continue; | ||
256 | |||
257 | kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i); | ||
258 | write_host_tlbe(vcpu_e500, 1, i); | ||
259 | } | 480 | } |
260 | } | 481 | } |
261 | 482 | ||
@@ -273,7 +494,7 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, | |||
273 | tsized = (vcpu_e500->mas4 >> 7) & 0x1f; | 494 | tsized = (vcpu_e500->mas4 >> 7) & 0x1f; |
274 | 495 | ||
275 | vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) | 496 | vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) |
276 | | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); | 497 | | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); |
277 | vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) | 498 | vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) |
278 | | MAS1_TID(vcpu_e500->pid[pidsel]) | 499 | | MAS1_TID(vcpu_e500->pid[pidsel]) |
279 | | MAS1_TSIZE(tsized); | 500 | | MAS1_TSIZE(tsized); |
@@ -286,56 +507,154 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, | |||
286 | vcpu_e500->mas7 = 0; | 507 | vcpu_e500->mas7 = 0; |
287 | } | 508 | } |
288 | 509 | ||
289 | static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, | 510 | static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, |
290 | u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel) | 511 | struct tlbe *gtlbe, int tsize, |
512 | struct tlbe_priv *priv, | ||
513 | u64 gvaddr, struct tlbe *stlbe) | ||
291 | { | 514 | { |
292 | struct page *new_page; | 515 | pfn_t pfn = priv->pfn; |
293 | struct tlbe *stlbe; | 516 | unsigned int stid; |
294 | hpa_t hpaddr; | ||
295 | |||
296 | stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; | ||
297 | |||
298 | /* Get reference to new page. */ | ||
299 | new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn); | ||
300 | if (is_error_page(new_page)) { | ||
301 | printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", | ||
302 | (long)gfn); | ||
303 | kvm_release_page_clean(new_page); | ||
304 | return; | ||
305 | } | ||
306 | hpaddr = page_to_phys(new_page); | ||
307 | |||
308 | /* Drop reference to old page. */ | ||
309 | kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); | ||
310 | 517 | ||
311 | vcpu_e500->shadow_pages[tlbsel][esel] = new_page; | 518 | stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe), |
519 | get_tlb_tid(gtlbe), | ||
520 | get_cur_pr(&vcpu_e500->vcpu), 0); | ||
312 | 521 | ||
313 | /* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */ | 522 | /* Force TS=1 IPROT=0 for all guest mappings. */ |
314 | stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K) | 523 | stlbe->mas1 = MAS1_TSIZE(tsize) |
315 | | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID; | 524 | | MAS1_TID(stid) | MAS1_TS | MAS1_VALID; |
316 | stlbe->mas2 = (gvaddr & MAS2_EPN) | 525 | stlbe->mas2 = (gvaddr & MAS2_EPN) |
317 | | e500_shadow_mas2_attrib(gtlbe->mas2, | 526 | | e500_shadow_mas2_attrib(gtlbe->mas2, |
318 | vcpu_e500->vcpu.arch.shared->msr & MSR_PR); | 527 | vcpu_e500->vcpu.arch.shared->msr & MSR_PR); |
319 | stlbe->mas3 = (hpaddr & MAS3_RPN) | 528 | stlbe->mas3 = ((pfn << PAGE_SHIFT) & MAS3_RPN) |
320 | | e500_shadow_mas3_attrib(gtlbe->mas3, | 529 | | e500_shadow_mas3_attrib(gtlbe->mas3, |
321 | vcpu_e500->vcpu.arch.shared->msr & MSR_PR); | 530 | vcpu_e500->vcpu.arch.shared->msr & MSR_PR); |
322 | stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN; | 531 | stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN; |
532 | } | ||
323 | 533 | ||
324 | trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, | 534 | |
325 | stlbe->mas3, stlbe->mas7); | 535 | static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, |
536 | u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel, | ||
537 | struct tlbe *stlbe) | ||
538 | { | ||
539 | struct kvm_memory_slot *slot; | ||
540 | unsigned long pfn, hva; | ||
541 | int pfnmap = 0; | ||
542 | int tsize = BOOK3E_PAGESZ_4K; | ||
543 | struct tlbe_priv *priv; | ||
544 | |||
545 | /* | ||
546 | * Translate guest physical to true physical, acquiring | ||
547 | * a page reference if it is normal, non-reserved memory. | ||
548 | * | ||
549 | * gfn_to_memslot() must succeed because otherwise we wouldn't | ||
550 | * have gotten this far. Eventually we should just pass the slot | ||
551 | * pointer through from the first lookup. | ||
552 | */ | ||
553 | slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn); | ||
554 | hva = gfn_to_hva_memslot(slot, gfn); | ||
555 | |||
556 | if (tlbsel == 1) { | ||
557 | struct vm_area_struct *vma; | ||
558 | down_read(¤t->mm->mmap_sem); | ||
559 | |||
560 | vma = find_vma(current->mm, hva); | ||
561 | if (vma && hva >= vma->vm_start && | ||
562 | (vma->vm_flags & VM_PFNMAP)) { | ||
563 | /* | ||
564 | * This VMA is a physically contiguous region (e.g. | ||
565 | * /dev/mem) that bypasses normal Linux page | ||
566 | * management. Find the overlap between the | ||
567 | * vma and the memslot. | ||
568 | */ | ||
569 | |||
570 | unsigned long start, end; | ||
571 | unsigned long slot_start, slot_end; | ||
572 | |||
573 | pfnmap = 1; | ||
574 | |||
575 | start = vma->vm_pgoff; | ||
576 | end = start + | ||
577 | ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); | ||
578 | |||
579 | pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); | ||
580 | |||
581 | slot_start = pfn - (gfn - slot->base_gfn); | ||
582 | slot_end = slot_start + slot->npages; | ||
583 | |||
584 | if (start < slot_start) | ||
585 | start = slot_start; | ||
586 | if (end > slot_end) | ||
587 | end = slot_end; | ||
588 | |||
589 | tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> | ||
590 | MAS1_TSIZE_SHIFT; | ||
591 | |||
592 | /* | ||
593 | * e500 doesn't implement the lowest tsize bit, | ||
594 | * or 1K pages. | ||
595 | */ | ||
596 | tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); | ||
597 | |||
598 | /* | ||
599 | * Now find the largest tsize (up to what the guest | ||
600 | * requested) that will cover gfn, stay within the | ||
601 | * range, and for which gfn and pfn are mutually | ||
602 | * aligned. | ||
603 | */ | ||
604 | |||
605 | for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { | ||
606 | unsigned long gfn_start, gfn_end, tsize_pages; | ||
607 | tsize_pages = 1 << (tsize - 2); | ||
608 | |||
609 | gfn_start = gfn & ~(tsize_pages - 1); | ||
610 | gfn_end = gfn_start + tsize_pages; | ||
611 | |||
612 | if (gfn_start + pfn - gfn < start) | ||
613 | continue; | ||
614 | if (gfn_end + pfn - gfn > end) | ||
615 | continue; | ||
616 | if ((gfn & (tsize_pages - 1)) != | ||
617 | (pfn & (tsize_pages - 1))) | ||
618 | continue; | ||
619 | |||
620 | gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); | ||
621 | pfn &= ~(tsize_pages - 1); | ||
622 | break; | ||
623 | } | ||
624 | } | ||
625 | |||
626 | up_read(¤t->mm->mmap_sem); | ||
627 | } | ||
628 | |||
629 | if (likely(!pfnmap)) { | ||
630 | pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn); | ||
631 | if (is_error_pfn(pfn)) { | ||
632 | printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", | ||
633 | (long)gfn); | ||
634 | kvm_release_pfn_clean(pfn); | ||
635 | return; | ||
636 | } | ||
637 | } | ||
638 | |||
639 | /* Drop old priv and setup new one. */ | ||
640 | priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; | ||
641 | kvmppc_e500_priv_release(priv); | ||
642 | kvmppc_e500_priv_setup(priv, gtlbe, pfn); | ||
643 | |||
644 | kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, priv, gvaddr, stlbe); | ||
326 | } | 645 | } |
327 | 646 | ||
328 | /* XXX only map the one-one case, for now use TLB0 */ | 647 | /* XXX only map the one-one case, for now use TLB0 */ |
329 | static int kvmppc_e500_stlbe_map(struct kvmppc_vcpu_e500 *vcpu_e500, | 648 | static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, |
330 | int tlbsel, int esel) | 649 | int esel, struct tlbe *stlbe) |
331 | { | 650 | { |
332 | struct tlbe *gtlbe; | 651 | struct tlbe *gtlbe; |
333 | 652 | ||
334 | gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; | 653 | gtlbe = &vcpu_e500->gtlb_arch[0][esel]; |
335 | 654 | ||
336 | kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), | 655 | kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), |
337 | get_tlb_raddr(gtlbe) >> PAGE_SHIFT, | 656 | get_tlb_raddr(gtlbe) >> PAGE_SHIFT, |
338 | gtlbe, tlbsel, esel); | 657 | gtlbe, 0, esel, stlbe); |
339 | 658 | ||
340 | return esel; | 659 | return esel; |
341 | } | 660 | } |
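
The VM_PFNMAP branch above picks the largest TLB1 page size such that the mapping still lies inside the contiguous pfn range and gfn and pfn are aligned identically within the block. A standalone sketch of that search, assuming the patch's tsize convention (a tsize of t covers 1 << (t - 2) four-KiB pages, even tsizes only); the numbers in main() are made up for the demonstration:

#include <stdio.h>

typedef unsigned long gfn_t;
typedef unsigned long pfn_t;

static int pick_tsize(gfn_t gfn, pfn_t pfn, pfn_t start, pfn_t end, int max_tsize)
{
        int tsize;

        for (tsize = max_tsize & ~1; tsize > 2; tsize -= 2) {
                unsigned long tsize_pages = 1UL << (tsize - 2);
                gfn_t gfn_start = gfn & ~(tsize_pages - 1);
                gfn_t gfn_end = gfn_start + tsize_pages;

                /* the mapping must stay inside [start, end) of the pfnmap range */
                if (gfn_start + pfn - gfn < start)
                        continue;
                if (gfn_end + pfn - gfn > end)
                        continue;
                /* gfn and pfn must be aligned the same way within the block */
                if ((gfn & (tsize_pages - 1)) != (pfn & (tsize_pages - 1)))
                        continue;
                return tsize;
        }
        return 2;       /* fall back to a single 4 KiB page */
}

int main(void)
{
        /* guest frame 0x1000 backed by host frame 0x41000, inside a
         * physically contiguous range of 64 frames starting at 0x41000 */
        int tsize = pick_tsize(0x1000, 0x41000, 0x41000, 0x41000 + 64, 8);

        printf("chosen tsize: %d (%lu KiB)\n", tsize, 4UL << (tsize - 2));
        return 0;
}

The real code additionally rounds gvaddr and pfn down to the chosen block boundary before building the shadow entry; the sketch only selects the size.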
@@ -344,53 +663,37 @@ static int kvmppc_e500_stlbe_map(struct kvmppc_vcpu_e500 *vcpu_e500, | |||
344 | * the shadow TLB. */ | 663 | * the shadow TLB. */ |
345 | /* XXX for both one-one and one-to-many , for now use TLB1 */ | 664 | /* XXX for both one-one and one-to-many , for now use TLB1 */ |
346 | static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, | 665 | static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, |
347 | u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe) | 666 | u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, struct tlbe *stlbe) |
348 | { | 667 | { |
349 | unsigned int victim; | 668 | unsigned int victim; |
350 | 669 | ||
351 | victim = vcpu_e500->guest_tlb_nv[1]++; | 670 | victim = vcpu_e500->gtlb_nv[1]++; |
352 | 671 | ||
353 | if (unlikely(vcpu_e500->guest_tlb_nv[1] >= tlb1_max_shadow_size())) | 672 | if (unlikely(vcpu_e500->gtlb_nv[1] >= tlb1_max_shadow_size())) |
354 | vcpu_e500->guest_tlb_nv[1] = 0; | 673 | vcpu_e500->gtlb_nv[1] = 0; |
355 | 674 | ||
356 | kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim); | 675 | kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim, stlbe); |
357 | 676 | ||
358 | return victim; | 677 | return victim; |
359 | } | 678 | } |
360 | 679 | ||
361 | /* Invalidate all guest kernel mappings when enter usermode, | 680 | void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) |
362 | * so that when they fault back in they will get the | ||
363 | * proper permission bits. */ | ||
364 | void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode) | ||
365 | { | 681 | { |
366 | if (usermode) { | 682 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); |
367 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); | ||
368 | int i; | ||
369 | |||
370 | /* XXX Replace loop with fancy data structures. */ | ||
371 | for (i = 0; i < tlb1_max_shadow_size(); i++) | ||
372 | kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i); | ||
373 | 683 | ||
374 | _tlbil_all(); | 684 | /* Recalc shadow pid since MSR changes */ |
375 | } | 685 | kvmppc_e500_recalc_shadow_pid(vcpu_e500); |
376 | } | 686 | } |
377 | 687 | ||
378 | static int kvmppc_e500_gtlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, | 688 | static inline int kvmppc_e500_gtlbe_invalidate( |
379 | int tlbsel, int esel) | 689 | struct kvmppc_vcpu_e500 *vcpu_e500, |
690 | int tlbsel, int esel) | ||
380 | { | 691 | { |
381 | struct tlbe *gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; | 692 | struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; |
382 | 693 | ||
383 | if (unlikely(get_tlb_iprot(gtlbe))) | 694 | if (unlikely(get_tlb_iprot(gtlbe))) |
384 | return -1; | 695 | return -1; |
385 | 696 | ||
386 | if (tlbsel == 1) { | ||
387 | kvmppc_e500_tlb1_invalidate(vcpu_e500, get_tlb_eaddr(gtlbe), | ||
388 | get_tlb_end(gtlbe), | ||
389 | get_tlb_tid(gtlbe)); | ||
390 | } else { | ||
391 | kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel); | ||
392 | } | ||
393 | |||
394 | gtlbe->mas1 = 0; | 697 | gtlbe->mas1 = 0; |
395 | 698 | ||
396 | return 0; | 699 | return 0; |
@@ -401,13 +704,14 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) | |||
401 | int esel; | 704 | int esel; |
402 | 705 | ||
403 | if (value & MMUCSR0_TLB0FI) | 706 | if (value & MMUCSR0_TLB0FI) |
404 | for (esel = 0; esel < vcpu_e500->guest_tlb_size[0]; esel++) | 707 | for (esel = 0; esel < vcpu_e500->gtlb_size[0]; esel++) |
405 | kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); | 708 | kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); |
406 | if (value & MMUCSR0_TLB1FI) | 709 | if (value & MMUCSR0_TLB1FI) |
407 | for (esel = 0; esel < vcpu_e500->guest_tlb_size[1]; esel++) | 710 | for (esel = 0; esel < vcpu_e500->gtlb_size[1]; esel++) |
408 | kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); | 711 | kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); |
409 | 712 | ||
410 | _tlbil_all(); | 713 | /* Invalidate all vcpu id mappings */ |
714 | kvmppc_e500_id_table_reset_all(vcpu_e500); | ||
411 | 715 | ||
412 | return EMULATE_DONE; | 716 | return EMULATE_DONE; |
413 | } | 717 | } |
@@ -428,7 +732,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) | |||
428 | 732 | ||
429 | if (ia) { | 733 | if (ia) { |
430 | /* invalidate all entries */ | 734 | /* invalidate all entries */ |
431 | for (esel = 0; esel < vcpu_e500->guest_tlb_size[tlbsel]; esel++) | 735 | for (esel = 0; esel < vcpu_e500->gtlb_size[tlbsel]; esel++) |
432 | kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); | 736 | kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); |
433 | } else { | 737 | } else { |
434 | ea &= 0xfffff000; | 738 | ea &= 0xfffff000; |
@@ -438,7 +742,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) | |||
438 | kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); | 742 | kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); |
439 | } | 743 | } |
440 | 744 | ||
441 | _tlbil_all(); | 745 | /* Invalidate all vcpu id mappings */ |
746 | kvmppc_e500_id_table_reset_all(vcpu_e500); | ||
442 | 747 | ||
443 | return EMULATE_DONE; | 748 | return EMULATE_DONE; |
444 | } | 749 | } |
@@ -452,9 +757,9 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) | |||
452 | tlbsel = get_tlb_tlbsel(vcpu_e500); | 757 | tlbsel = get_tlb_tlbsel(vcpu_e500); |
453 | esel = get_tlb_esel(vcpu_e500, tlbsel); | 758 | esel = get_tlb_esel(vcpu_e500, tlbsel); |
454 | 759 | ||
455 | gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; | 760 | gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; |
456 | vcpu_e500->mas0 &= ~MAS0_NV(~0); | 761 | vcpu_e500->mas0 &= ~MAS0_NV(~0); |
457 | vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); | 762 | vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); |
458 | vcpu_e500->mas1 = gtlbe->mas1; | 763 | vcpu_e500->mas1 = gtlbe->mas1; |
459 | vcpu_e500->mas2 = gtlbe->mas2; | 764 | vcpu_e500->mas2 = gtlbe->mas2; |
460 | vcpu_e500->mas3 = gtlbe->mas3; | 765 | vcpu_e500->mas3 = gtlbe->mas3; |
@@ -477,14 +782,14 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) | |||
477 | for (tlbsel = 0; tlbsel < 2; tlbsel++) { | 782 | for (tlbsel = 0; tlbsel < 2; tlbsel++) { |
478 | esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); | 783 | esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); |
479 | if (esel >= 0) { | 784 | if (esel >= 0) { |
480 | gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; | 785 | gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; |
481 | break; | 786 | break; |
482 | } | 787 | } |
483 | } | 788 | } |
484 | 789 | ||
485 | if (gtlbe) { | 790 | if (gtlbe) { |
486 | vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) | 791 | vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) |
487 | | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); | 792 | | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); |
488 | vcpu_e500->mas1 = gtlbe->mas1; | 793 | vcpu_e500->mas1 = gtlbe->mas1; |
489 | vcpu_e500->mas2 = gtlbe->mas2; | 794 | vcpu_e500->mas2 = gtlbe->mas2; |
490 | vcpu_e500->mas3 = gtlbe->mas3; | 795 | vcpu_e500->mas3 = gtlbe->mas3; |
@@ -497,7 +802,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) | |||
497 | victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; | 802 | victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; |
498 | 803 | ||
499 | vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) | 804 | vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) |
500 | | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); | 805 | | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); |
501 | vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0) | 806 | vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0) |
502 | | (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0)) | 807 | | (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0)) |
503 | | (vcpu_e500->mas4 & MAS4_TSIZED(~0)); | 808 | | (vcpu_e500->mas4 & MAS4_TSIZED(~0)); |
@@ -514,23 +819,16 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) | |||
514 | int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) | 819 | int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) |
515 | { | 820 | { |
516 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); | 821 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); |
517 | u64 eaddr; | ||
518 | u64 raddr; | ||
519 | u32 tid; | ||
520 | struct tlbe *gtlbe; | 822 | struct tlbe *gtlbe; |
521 | int tlbsel, esel, stlbsel, sesel; | 823 | int tlbsel, esel; |
522 | 824 | ||
523 | tlbsel = get_tlb_tlbsel(vcpu_e500); | 825 | tlbsel = get_tlb_tlbsel(vcpu_e500); |
524 | esel = get_tlb_esel(vcpu_e500, tlbsel); | 826 | esel = get_tlb_esel(vcpu_e500, tlbsel); |
525 | 827 | ||
526 | gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; | 828 | gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; |
527 | 829 | ||
528 | if (get_tlb_v(gtlbe) && tlbsel == 1) { | 830 | if (get_tlb_v(gtlbe)) |
529 | eaddr = get_tlb_eaddr(gtlbe); | 831 | kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel); |
530 | tid = get_tlb_tid(gtlbe); | ||
531 | kvmppc_e500_tlb1_invalidate(vcpu_e500, eaddr, | ||
532 | get_tlb_end(gtlbe), tid); | ||
533 | } | ||
534 | 832 | ||
535 | gtlbe->mas1 = vcpu_e500->mas1; | 833 | gtlbe->mas1 = vcpu_e500->mas1; |
536 | gtlbe->mas2 = vcpu_e500->mas2; | 834 | gtlbe->mas2 = vcpu_e500->mas2; |
@@ -542,6 +840,12 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) | |||
542 | 840 | ||
543 | /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ | 841 | /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ |
544 | if (tlbe_is_host_safe(vcpu, gtlbe)) { | 842 | if (tlbe_is_host_safe(vcpu, gtlbe)) { |
843 | struct tlbe stlbe; | ||
844 | int stlbsel, sesel; | ||
845 | u64 eaddr; | ||
846 | u64 raddr; | ||
847 | |||
848 | preempt_disable(); | ||
545 | switch (tlbsel) { | 849 | switch (tlbsel) { |
546 | case 0: | 850 | case 0: |
547 | /* TLB0 */ | 851 | /* TLB0 */ |
@@ -549,7 +853,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) | |||
549 | gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); | 853 | gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); |
550 | 854 | ||
551 | stlbsel = 0; | 855 | stlbsel = 0; |
552 | sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel); | 856 | sesel = kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); |
553 | 857 | ||
554 | break; | 858 | break; |
555 | 859 | ||
@@ -564,13 +868,14 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) | |||
564 | * are mapped on the fly. */ | 868 | * are mapped on the fly. */ |
565 | stlbsel = 1; | 869 | stlbsel = 1; |
566 | sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, | 870 | sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, |
567 | raddr >> PAGE_SHIFT, gtlbe); | 871 | raddr >> PAGE_SHIFT, gtlbe, &stlbe); |
568 | break; | 872 | break; |
569 | 873 | ||
570 | default: | 874 | default: |
571 | BUG(); | 875 | BUG(); |
572 | } | 876 | } |
573 | write_host_tlbe(vcpu_e500, stlbsel, sesel); | 877 | write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); |
878 | preempt_enable(); | ||
574 | } | 879 | } |
575 | 880 | ||
576 | kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); | 881 | kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); |
@@ -610,7 +915,7 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, | |||
610 | { | 915 | { |
611 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); | 916 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); |
612 | struct tlbe *gtlbe = | 917 | struct tlbe *gtlbe = |
613 | &vcpu_e500->guest_tlb[tlbsel_of(index)][esel_of(index)]; | 918 | &vcpu_e500->gtlb_arch[tlbsel_of(index)][esel_of(index)]; |
614 | u64 pgmask = get_tlb_bytes(gtlbe) - 1; | 919 | u64 pgmask = get_tlb_bytes(gtlbe) - 1; |
615 | 920 | ||
616 | return get_tlb_raddr(gtlbe) | (eaddr & pgmask); | 921 | return get_tlb_raddr(gtlbe) | (eaddr & pgmask); |
@@ -618,38 +923,37 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, | |||
618 | 923 | ||
619 | void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) | 924 | void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) |
620 | { | 925 | { |
621 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); | ||
622 | int tlbsel, i; | ||
623 | |||
624 | for (tlbsel = 0; tlbsel < 2; tlbsel++) | ||
625 | for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) | ||
626 | kvmppc_e500_shadow_release(vcpu_e500, tlbsel, i); | ||
627 | |||
628 | /* discard all guest mapping */ | ||
629 | _tlbil_all(); | ||
630 | } | 926 | } |
631 | 927 | ||
632 | void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, | 928 | void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, |
633 | unsigned int index) | 929 | unsigned int index) |
634 | { | 930 | { |
635 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); | 931 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); |
932 | struct tlbe_priv *priv; | ||
933 | struct tlbe *gtlbe, stlbe; | ||
636 | int tlbsel = tlbsel_of(index); | 934 | int tlbsel = tlbsel_of(index); |
637 | int esel = esel_of(index); | 935 | int esel = esel_of(index); |
638 | int stlbsel, sesel; | 936 | int stlbsel, sesel; |
639 | 937 | ||
938 | gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; | ||
939 | |||
940 | preempt_disable(); | ||
640 | switch (tlbsel) { | 941 | switch (tlbsel) { |
641 | case 0: | 942 | case 0: |
642 | stlbsel = 0; | 943 | stlbsel = 0; |
643 | sesel = esel; | 944 | sesel = esel; |
945 | priv = &vcpu_e500->gtlb_priv[stlbsel][sesel]; | ||
946 | |||
947 | kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K, | ||
948 | priv, eaddr, &stlbe); | ||
644 | break; | 949 | break; |
645 | 950 | ||
646 | case 1: { | 951 | case 1: { |
647 | gfn_t gfn = gpaddr >> PAGE_SHIFT; | 952 | gfn_t gfn = gpaddr >> PAGE_SHIFT; |
648 | struct tlbe *gtlbe | ||
649 | = &vcpu_e500->guest_tlb[tlbsel][esel]; | ||
650 | 953 | ||
651 | stlbsel = 1; | 954 | stlbsel = 1; |
652 | sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe); | 955 | sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, |
956 | gtlbe, &stlbe); | ||
653 | break; | 957 | break; |
654 | } | 958 | } |
655 | 959 | ||
@@ -657,7 +961,9 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, | |||
657 | BUG(); | 961 | BUG(); |
658 | break; | 962 | break; |
659 | } | 963 | } |
660 | write_host_tlbe(vcpu_e500, stlbsel, sesel); | 964 | |
965 | write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); | ||
966 | preempt_enable(); | ||
661 | } | 967 | } |
662 | 968 | ||
663 | int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, | 969 | int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, |
@@ -679,8 +985,10 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) | |||
679 | { | 985 | { |
680 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); | 986 | struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); |
681 | 987 | ||
682 | vcpu_e500->pid[0] = vcpu->arch.shadow_pid = | 988 | if (vcpu->arch.pid != pid) { |
683 | vcpu->arch.pid = pid; | 989 | vcpu_e500->pid[0] = vcpu->arch.pid = pid; |
990 | kvmppc_e500_recalc_shadow_pid(vcpu_e500); | ||
991 | } | ||
684 | } | 992 | } |
685 | 993 | ||
686 | void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) | 994 | void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) |
@@ -688,14 +996,14 @@ void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) | |||
688 | struct tlbe *tlbe; | 996 | struct tlbe *tlbe; |
689 | 997 | ||
690 | /* Insert large initial mapping for guest. */ | 998 | /* Insert large initial mapping for guest. */ |
691 | tlbe = &vcpu_e500->guest_tlb[1][0]; | 999 | tlbe = &vcpu_e500->gtlb_arch[1][0]; |
692 | tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); | 1000 | tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); |
693 | tlbe->mas2 = 0; | 1001 | tlbe->mas2 = 0; |
694 | tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; | 1002 | tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; |
695 | tlbe->mas7 = 0; | 1003 | tlbe->mas7 = 0; |
696 | 1004 | ||
697 | /* 4K map for serial output. Used by kernel wrapper. */ | 1005 | /* 4K map for serial output. Used by kernel wrapper. */ |
698 | tlbe = &vcpu_e500->guest_tlb[1][1]; | 1006 | tlbe = &vcpu_e500->gtlb_arch[1][1]; |
699 | tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); | 1007 | tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); |
700 | tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; | 1008 | tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; |
701 | tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; | 1009 | tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; |
@@ -706,68 +1014,64 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) | |||
706 | { | 1014 | { |
707 | tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF; | 1015 | tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF; |
708 | 1016 | ||
709 | vcpu_e500->guest_tlb_size[0] = KVM_E500_TLB0_SIZE; | 1017 | vcpu_e500->gtlb_size[0] = KVM_E500_TLB0_SIZE; |
710 | vcpu_e500->guest_tlb[0] = | 1018 | vcpu_e500->gtlb_arch[0] = |
711 | kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); | 1019 | kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); |
712 | if (vcpu_e500->guest_tlb[0] == NULL) | 1020 | if (vcpu_e500->gtlb_arch[0] == NULL) |
713 | goto err_out; | 1021 | goto err_out; |
714 | 1022 | ||
715 | vcpu_e500->shadow_tlb_size[0] = KVM_E500_TLB0_SIZE; | 1023 | vcpu_e500->gtlb_size[1] = KVM_E500_TLB1_SIZE; |
716 | vcpu_e500->shadow_tlb[0] = | 1024 | vcpu_e500->gtlb_arch[1] = |
717 | kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); | ||
718 | if (vcpu_e500->shadow_tlb[0] == NULL) | ||
719 | goto err_out_guest0; | ||
720 | |||
721 | vcpu_e500->guest_tlb_size[1] = KVM_E500_TLB1_SIZE; | ||
722 | vcpu_e500->guest_tlb[1] = | ||
723 | kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL); | 1025 | kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL); |
724 | if (vcpu_e500->guest_tlb[1] == NULL) | 1026 | if (vcpu_e500->gtlb_arch[1] == NULL) |
725 | goto err_out_shadow0; | 1027 | goto err_out_guest0; |
726 | 1028 | ||
727 | vcpu_e500->shadow_tlb_size[1] = tlb1_entry_num; | 1029 | vcpu_e500->gtlb_priv[0] = (struct tlbe_priv *) |
728 | vcpu_e500->shadow_tlb[1] = | 1030 | kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB0_SIZE, GFP_KERNEL); |
729 | kzalloc(sizeof(struct tlbe) * tlb1_entry_num, GFP_KERNEL); | 1031 | if (vcpu_e500->gtlb_priv[0] == NULL) |
730 | if (vcpu_e500->shadow_tlb[1] == NULL) | ||
731 | goto err_out_guest1; | 1032 | goto err_out_guest1; |
1033 | vcpu_e500->gtlb_priv[1] = (struct tlbe_priv *) | ||
1034 | kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB1_SIZE, GFP_KERNEL); | ||
732 | 1035 | ||
733 | vcpu_e500->shadow_pages[0] = (struct page **) | 1036 | if (vcpu_e500->gtlb_priv[1] == NULL) |
734 | kzalloc(sizeof(struct page *) * KVM_E500_TLB0_SIZE, GFP_KERNEL); | 1037 | goto err_out_priv0; |
735 | if (vcpu_e500->shadow_pages[0] == NULL) | ||
736 | goto err_out_shadow1; | ||
737 | 1038 | ||
738 | vcpu_e500->shadow_pages[1] = (struct page **) | 1039 | if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) |
739 | kzalloc(sizeof(struct page *) * tlb1_entry_num, GFP_KERNEL); | 1040 | goto err_out_priv1; |
740 | if (vcpu_e500->shadow_pages[1] == NULL) | ||
741 | goto err_out_page0; | ||
742 | 1041 | ||
743 | /* Init TLB configuration register */ | 1042 | /* Init TLB configuration register */ |
744 | vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; | 1043 | vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; |
745 | vcpu_e500->tlb0cfg |= vcpu_e500->guest_tlb_size[0]; | 1044 | vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_size[0]; |
746 | vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; | 1045 | vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; |
747 | vcpu_e500->tlb1cfg |= vcpu_e500->guest_tlb_size[1]; | 1046 | vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_size[1]; |
748 | 1047 | ||
749 | return 0; | 1048 | return 0; |
750 | 1049 | ||
751 | err_out_page0: | 1050 | err_out_priv1: |
752 | kfree(vcpu_e500->shadow_pages[0]); | 1051 | kfree(vcpu_e500->gtlb_priv[1]); |
753 | err_out_shadow1: | 1052 | err_out_priv0: |
754 | kfree(vcpu_e500->shadow_tlb[1]); | 1053 | kfree(vcpu_e500->gtlb_priv[0]); |
755 | err_out_guest1: | 1054 | err_out_guest1: |
756 | kfree(vcpu_e500->guest_tlb[1]); | 1055 | kfree(vcpu_e500->gtlb_arch[1]); |
757 | err_out_shadow0: | ||
758 | kfree(vcpu_e500->shadow_tlb[0]); | ||
759 | err_out_guest0: | 1056 | err_out_guest0: |
760 | kfree(vcpu_e500->guest_tlb[0]); | 1057 | kfree(vcpu_e500->gtlb_arch[0]); |
761 | err_out: | 1058 | err_out: |
762 | return -1; | 1059 | return -1; |
763 | } | 1060 | } |
764 | 1061 | ||
765 | void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) | 1062 | void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) |
766 | { | 1063 | { |
767 | kfree(vcpu_e500->shadow_pages[1]); | 1064 | int stlbsel, i; |
768 | kfree(vcpu_e500->shadow_pages[0]); | 1065 | |
769 | kfree(vcpu_e500->shadow_tlb[1]); | 1066 | /* release all privs */ |
770 | kfree(vcpu_e500->guest_tlb[1]); | 1067 | for (stlbsel = 0; stlbsel < 2; stlbsel++) |
771 | kfree(vcpu_e500->shadow_tlb[0]); | 1068 | for (i = 0; i < vcpu_e500->gtlb_size[stlbsel]; i++) { |
772 | kfree(vcpu_e500->guest_tlb[0]); | 1069 | struct tlbe_priv *priv = |
1070 | &vcpu_e500->gtlb_priv[stlbsel][i]; | ||
1071 | kvmppc_e500_priv_release(priv); | ||
1072 | } | ||
1073 | |||
1074 | kvmppc_e500_id_table_free(vcpu_e500); | ||
1075 | kfree(vcpu_e500->gtlb_arch[1]); | ||
1076 | kfree(vcpu_e500->gtlb_arch[0]); | ||
773 | } | 1077 | } |
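
Editor's note: the reworked kvmppc_e500_tlb_init() above allocates the gtlb_arch and gtlb_priv arrays one after another and unwinds with a ladder of goto labels on failure. A minimal userspace sketch of that allocate-then-unwind pattern follows; the names and sizes are illustrative only, not the real e500 structures.

```c
/* Sketch of the goto-based unwind pattern used in kvmppc_e500_tlb_init():
 * each allocation gets a label that frees everything allocated before it.
 * Names and element sizes here are invented for illustration. */
#include <stdlib.h>

struct demo_tlb {
	void *arch[2];
	void *priv[2];
};

static int demo_tlb_init(struct demo_tlb *t, size_t n0, size_t n1)
{
	t->arch[0] = calloc(n0, 64);
	if (!t->arch[0])
		goto err_out;

	t->arch[1] = calloc(n1, 64);
	if (!t->arch[1])
		goto err_arch0;

	t->priv[0] = calloc(n0, 32);
	if (!t->priv[0])
		goto err_arch1;

	t->priv[1] = calloc(n1, 32);
	if (!t->priv[1])
		goto err_priv0;

	return 0;

err_priv0:
	free(t->priv[0]);
err_arch1:
	free(t->arch[1]);
err_arch0:
	free(t->arch[0]);
err_out:
	return -1;
}

int main(void)
{
	struct demo_tlb t;

	/* Demo only: a real caller would pair this with a matching uninit. */
	return demo_tlb_init(&t, 128, 16) ? 1 : 0;
}
```

The label order mirrors the allocation order in reverse, so a failure at any step frees exactly the earlier allocations, which is the same shape the new err_out_priv1/err_out_priv0/err_out_guest1/err_out_guest0 ladder takes.
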
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h index 458946b4775d..59b88e99a235 100644 --- a/arch/powerpc/kvm/e500_tlb.h +++ b/arch/powerpc/kvm/e500_tlb.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. | 2 | * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. |
3 | * | 3 | * |
4 | * Author: Yu Liu, yu.liu@freescale.com | 4 | * Author: Yu Liu, yu.liu@freescale.com |
5 | * | 5 | * |
@@ -55,6 +55,7 @@ extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int); | |||
55 | extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *); | 55 | extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *); |
56 | extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *); | 56 | extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *); |
57 | extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *); | 57 | extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *); |
58 | extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *); | ||
58 | 59 | ||
59 | /* TLB helper functions */ | 60 | /* TLB helper functions */ |
60 | static inline unsigned int get_tlb_size(const struct tlbe *tlbe) | 61 | static inline unsigned int get_tlb_size(const struct tlbe *tlbe) |
@@ -110,6 +111,16 @@ static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu) | |||
110 | return vcpu->arch.pid & 0xff; | 111 | return vcpu->arch.pid & 0xff; |
111 | } | 112 | } |
112 | 113 | ||
114 | static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu) | ||
115 | { | ||
116 | return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS)); | ||
117 | } | ||
118 | |||
119 | static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu) | ||
120 | { | ||
121 | return !!(vcpu->arch.shared->msr & MSR_PR); | ||
122 | } | ||
123 | |||
113 | static inline unsigned int get_cur_spid( | 124 | static inline unsigned int get_cur_spid( |
114 | const struct kvmppc_vcpu_e500 *vcpu_e500) | 125 | const struct kvmppc_vcpu_e500 *vcpu_e500) |
115 | { | 126 | { |
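
Editor's note: the new get_cur_as()/get_cur_pr() helpers rely on the common C idiom of double negation to collapse a masked bit test into exactly 0 or 1. A tiny standalone illustration (the MSR value used is made up for the demo, though 0x4000 is the conventional MSR_PR bit on Book E/Book3S):

```c
/* Double negation collapses any non-zero masked value to exactly 1. */
#include <assert.h>
#include <stdio.h>

#define MSR_PR 0x4000u   /* problem-state bit; value shown for illustration */

int main(void)
{
	unsigned int msr = 0x6000u;          /* some bits set, including MSR_PR */
	unsigned int pr  = !!(msr & MSR_PR); /* yields 1, not 0x4000 */

	assert(pr == 1);
	printf("pr=%u\n", pr);
	return 0;
}
```
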
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 616dd516ca1f..a107c9be0fb1 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
31 | #include <asm/kvm_ppc.h> | 31 | #include <asm/kvm_ppc.h> |
32 | #include <asm/tlbflush.h> | 32 | #include <asm/tlbflush.h> |
33 | #include <asm/cputhreads.h> | ||
33 | #include "timing.h" | 34 | #include "timing.h" |
34 | #include "../mm/mmu_decl.h" | 35 | #include "../mm/mmu_decl.h" |
35 | 36 | ||
@@ -38,8 +39,12 @@ | |||
38 | 39 | ||
39 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) | 40 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) |
40 | { | 41 | { |
42 | #ifndef CONFIG_KVM_BOOK3S_64_HV | ||
41 | return !(v->arch.shared->msr & MSR_WE) || | 43 | return !(v->arch.shared->msr & MSR_WE) || |
42 | !!(v->arch.pending_exceptions); | 44 | !!(v->arch.pending_exceptions); |
45 | #else | ||
46 | return !(v->arch.ceded) || !!(v->arch.pending_exceptions); | ||
47 | #endif | ||
43 | } | 48 | } |
44 | 49 | ||
45 | int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) | 50 | int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) |
@@ -73,7 +78,8 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) | |||
73 | } | 78 | } |
74 | case HC_VENDOR_KVM | KVM_HC_FEATURES: | 79 | case HC_VENDOR_KVM | KVM_HC_FEATURES: |
75 | r = HC_EV_SUCCESS; | 80 | r = HC_EV_SUCCESS; |
76 | #if defined(CONFIG_PPC_BOOK3S) /* XXX Missing magic page on BookE */ | 81 | #if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500) |
82 | /* XXX Missing magic page on 44x */ | ||
77 | r2 |= (1 << KVM_FEATURE_MAGIC_PAGE); | 83 | r2 |= (1 << KVM_FEATURE_MAGIC_PAGE); |
78 | #endif | 84 | #endif |
79 | 85 | ||
@@ -147,7 +153,7 @@ void kvm_arch_check_processor_compat(void *rtn) | |||
147 | 153 | ||
148 | int kvm_arch_init_vm(struct kvm *kvm) | 154 | int kvm_arch_init_vm(struct kvm *kvm) |
149 | { | 155 | { |
150 | return 0; | 156 | return kvmppc_core_init_vm(kvm); |
151 | } | 157 | } |
152 | 158 | ||
153 | void kvm_arch_destroy_vm(struct kvm *kvm) | 159 | void kvm_arch_destroy_vm(struct kvm *kvm) |
@@ -163,6 +169,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) | |||
163 | kvm->vcpus[i] = NULL; | 169 | kvm->vcpus[i] = NULL; |
164 | 170 | ||
165 | atomic_set(&kvm->online_vcpus, 0); | 171 | atomic_set(&kvm->online_vcpus, 0); |
172 | |||
173 | kvmppc_core_destroy_vm(kvm); | ||
174 | |||
166 | mutex_unlock(&kvm->lock); | 175 | mutex_unlock(&kvm->lock); |
167 | } | 176 | } |
168 | 177 | ||
@@ -180,10 +189,13 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
180 | #else | 189 | #else |
181 | case KVM_CAP_PPC_SEGSTATE: | 190 | case KVM_CAP_PPC_SEGSTATE: |
182 | #endif | 191 | #endif |
183 | case KVM_CAP_PPC_PAIRED_SINGLES: | ||
184 | case KVM_CAP_PPC_UNSET_IRQ: | 192 | case KVM_CAP_PPC_UNSET_IRQ: |
185 | case KVM_CAP_PPC_IRQ_LEVEL: | 193 | case KVM_CAP_PPC_IRQ_LEVEL: |
186 | case KVM_CAP_ENABLE_CAP: | 194 | case KVM_CAP_ENABLE_CAP: |
195 | r = 1; | ||
196 | break; | ||
197 | #ifndef CONFIG_KVM_BOOK3S_64_HV | ||
198 | case KVM_CAP_PPC_PAIRED_SINGLES: | ||
187 | case KVM_CAP_PPC_OSI: | 199 | case KVM_CAP_PPC_OSI: |
188 | case KVM_CAP_PPC_GET_PVINFO: | 200 | case KVM_CAP_PPC_GET_PVINFO: |
189 | r = 1; | 201 | r = 1; |
@@ -191,6 +203,21 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
191 | case KVM_CAP_COALESCED_MMIO: | 203 | case KVM_CAP_COALESCED_MMIO: |
192 | r = KVM_COALESCED_MMIO_PAGE_OFFSET; | 204 | r = KVM_COALESCED_MMIO_PAGE_OFFSET; |
193 | break; | 205 | break; |
206 | #endif | ||
207 | #ifdef CONFIG_KVM_BOOK3S_64_HV | ||
208 | case KVM_CAP_SPAPR_TCE: | ||
209 | r = 1; | ||
210 | break; | ||
211 | case KVM_CAP_PPC_SMT: | ||
212 | r = threads_per_core; | ||
213 | break; | ||
214 | case KVM_CAP_PPC_RMA: | ||
215 | r = 1; | ||
216 | /* PPC970 requires an RMA */ | ||
217 | if (cpu_has_feature(CPU_FTR_ARCH_201)) | ||
218 | r = 2; | ||
219 | break; | ||
220 | #endif | ||
194 | default: | 221 | default: |
195 | r = 0; | 222 | r = 0; |
196 | break; | 223 | break; |
@@ -211,7 +238,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, | |||
211 | struct kvm_userspace_memory_region *mem, | 238 | struct kvm_userspace_memory_region *mem, |
212 | int user_alloc) | 239 | int user_alloc) |
213 | { | 240 | { |
214 | return 0; | 241 | return kvmppc_core_prepare_memory_region(kvm, mem); |
215 | } | 242 | } |
216 | 243 | ||
217 | void kvm_arch_commit_memory_region(struct kvm *kvm, | 244 | void kvm_arch_commit_memory_region(struct kvm *kvm, |
@@ -219,7 +246,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
219 | struct kvm_memory_slot old, | 246 | struct kvm_memory_slot old, |
220 | int user_alloc) | 247 | int user_alloc) |
221 | { | 248 | { |
222 | return; | 249 | kvmppc_core_commit_memory_region(kvm, mem); |
223 | } | 250 | } |
224 | 251 | ||
225 | 252 | ||
@@ -287,6 +314,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
287 | hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); | 314 | hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); |
288 | tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); | 315 | tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); |
289 | vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; | 316 | vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; |
317 | vcpu->arch.dec_expires = ~(u64)0; | ||
290 | 318 | ||
291 | #ifdef CONFIG_KVM_EXIT_TIMING | 319 | #ifdef CONFIG_KVM_EXIT_TIMING |
292 | mutex_init(&vcpu->arch.exit_timing_lock); | 320 | mutex_init(&vcpu->arch.exit_timing_lock); |
@@ -313,6 +341,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
313 | mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); | 341 | mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); |
314 | #endif | 342 | #endif |
315 | kvmppc_core_vcpu_load(vcpu, cpu); | 343 | kvmppc_core_vcpu_load(vcpu, cpu); |
344 | vcpu->cpu = smp_processor_id(); | ||
316 | } | 345 | } |
317 | 346 | ||
318 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) | 347 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) |
@@ -321,6 +350,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) | |||
321 | #ifdef CONFIG_BOOKE | 350 | #ifdef CONFIG_BOOKE |
322 | vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); | 351 | vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); |
323 | #endif | 352 | #endif |
353 | vcpu->cpu = -1; | ||
324 | } | 354 | } |
325 | 355 | ||
326 | int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, | 356 | int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, |
@@ -492,15 +522,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) | |||
492 | for (i = 0; i < 32; i++) | 522 | for (i = 0; i < 32; i++) |
493 | kvmppc_set_gpr(vcpu, i, gprs[i]); | 523 | kvmppc_set_gpr(vcpu, i, gprs[i]); |
494 | vcpu->arch.osi_needed = 0; | 524 | vcpu->arch.osi_needed = 0; |
525 | } else if (vcpu->arch.hcall_needed) { | ||
526 | int i; | ||
527 | |||
528 | kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret); | ||
529 | for (i = 0; i < 9; ++i) | ||
530 | kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]); | ||
531 | vcpu->arch.hcall_needed = 0; | ||
495 | } | 532 | } |
496 | 533 | ||
497 | kvmppc_core_deliver_interrupts(vcpu); | 534 | kvmppc_core_deliver_interrupts(vcpu); |
498 | 535 | ||
499 | local_irq_disable(); | 536 | r = kvmppc_vcpu_run(run, vcpu); |
500 | kvm_guest_enter(); | ||
501 | r = __kvmppc_vcpu_run(run, vcpu); | ||
502 | kvm_guest_exit(); | ||
503 | local_irq_enable(); | ||
504 | 537 | ||
505 | if (vcpu->sigset_active) | 538 | if (vcpu->sigset_active) |
506 | sigprocmask(SIG_SETMASK, &sigsaved, NULL); | 539 | sigprocmask(SIG_SETMASK, &sigsaved, NULL); |
@@ -518,6 +551,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) | |||
518 | if (waitqueue_active(&vcpu->wq)) { | 551 | if (waitqueue_active(&vcpu->wq)) { |
519 | wake_up_interruptible(&vcpu->wq); | 552 | wake_up_interruptible(&vcpu->wq); |
520 | vcpu->stat.halt_wakeup++; | 553 | vcpu->stat.halt_wakeup++; |
554 | } else if (vcpu->cpu != -1) { | ||
555 | smp_send_reschedule(vcpu->cpu); | ||
521 | } | 556 | } |
522 | 557 | ||
523 | return 0; | 558 | return 0; |
@@ -633,6 +668,29 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
633 | 668 | ||
634 | break; | 669 | break; |
635 | } | 670 | } |
671 | #ifdef CONFIG_KVM_BOOK3S_64_HV | ||
672 | case KVM_CREATE_SPAPR_TCE: { | ||
673 | struct kvm_create_spapr_tce create_tce; | ||
674 | struct kvm *kvm = filp->private_data; | ||
675 | |||
676 | r = -EFAULT; | ||
677 | if (copy_from_user(&create_tce, argp, sizeof(create_tce))) | ||
678 | goto out; | ||
679 | r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); | ||
680 | goto out; | ||
681 | } | ||
682 | |||
683 | case KVM_ALLOCATE_RMA: { | ||
684 | struct kvm *kvm = filp->private_data; | ||
685 | struct kvm_allocate_rma rma; | ||
686 | |||
687 | r = kvm_vm_ioctl_allocate_rma(kvm, &rma); | ||
688 | if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) | ||
689 | r = -EFAULT; | ||
690 | break; | ||
691 | } | ||
692 | #endif /* CONFIG_KVM_BOOK3S_64_HV */ | ||
693 | |||
636 | default: | 694 | default: |
637 | r = -ENOTTY; | 695 | r = -ENOTTY; |
638 | } | 696 | } |
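
Editor's note: the powerpc.c hunks above extend kvm_dev_ioctl_check_extension() with the Book3S HV capabilities (KVM_CAP_SPAPR_TCE, KVM_CAP_PPC_SMT, KVM_CAP_PPC_RMA). Userspace discovers these through the standard KVM_CHECK_EXTENSION ioctl; a hedged sketch, assuming a <linux/kvm.h> new enough to define these KVM_CAP_* values:

```c
/* Sketch: query the new PPC capabilities from userspace.  Return value
 * semantics follow the switch above: 0 = absent, KVM_CAP_PPC_SMT returns
 * threads per core, KVM_CAP_PPC_RMA returns 1 (or 2 on PPC970, which
 * requires an RMA). */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}

	printf("SPAPR_TCE: %d\n", ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_SPAPR_TCE));
	printf("PPC_SMT:   %d\n", ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_SMT));
	printf("PPC_RMA:   %d\n", ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_RMA));
	return 0;
}
```
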
diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c index 319177df9587..07b6110a4bb7 100644 --- a/arch/powerpc/kvm/timing.c +++ b/arch/powerpc/kvm/timing.c | |||
@@ -56,15 +56,6 @@ static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type) | |||
56 | { | 56 | { |
57 | u64 old; | 57 | u64 old; |
58 | 58 | ||
59 | do_div(duration, tb_ticks_per_usec); | ||
60 | if (unlikely(duration > 0xFFFFFFFF)) { | ||
61 | printk(KERN_ERR"%s - duration too big -> overflow" | ||
62 | " duration %lld type %d exit #%d\n", | ||
63 | __func__, duration, type, | ||
64 | vcpu->arch.timing_count_type[type]); | ||
65 | return; | ||
66 | } | ||
67 | |||
68 | mutex_lock(&vcpu->arch.exit_timing_lock); | 59 | mutex_lock(&vcpu->arch.exit_timing_lock); |
69 | 60 | ||
70 | vcpu->arch.timing_count_type[type]++; | 61 | vcpu->arch.timing_count_type[type]++; |
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index 3aca1b042b8c..b135d3d397db 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h | |||
@@ -103,7 +103,7 @@ TRACE_EVENT(kvm_gtlb_write, | |||
103 | * Book3S trace points * | 103 | * Book3S trace points * |
104 | *************************************************************************/ | 104 | *************************************************************************/ |
105 | 105 | ||
106 | #ifdef CONFIG_PPC_BOOK3S | 106 | #ifdef CONFIG_KVM_BOOK3S_PR |
107 | 107 | ||
108 | TRACE_EVENT(kvm_book3s_exit, | 108 | TRACE_EVENT(kvm_book3s_exit, |
109 | TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), | 109 | TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), |
@@ -252,7 +252,7 @@ TRACE_EVENT(kvm_book3s_mmu_flush, | |||
252 | ), | 252 | ), |
253 | 253 | ||
254 | TP_fast_assign( | 254 | TP_fast_assign( |
255 | __entry->count = vcpu->arch.hpte_cache_count; | 255 | __entry->count = to_book3s(vcpu)->hpte_cache_count; |
256 | __entry->p1 = p1; | 256 | __entry->p1 = p1; |
257 | __entry->p2 = p2; | 257 | __entry->p2 = p2; |
258 | __entry->type = type; | 258 | __entry->type = type; |
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index dfd764896db0..90039bc64119 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c | |||
@@ -37,7 +37,7 @@ | |||
37 | 37 | ||
38 | #define HPTE_LOCK_BIT 3 | 38 | #define HPTE_LOCK_BIT 3 |
39 | 39 | ||
40 | static DEFINE_RAW_SPINLOCK(native_tlbie_lock); | 40 | DEFINE_RAW_SPINLOCK(native_tlbie_lock); |
41 | 41 | ||
42 | static inline void __tlbie(unsigned long va, int psize, int ssize) | 42 | static inline void __tlbie(unsigned long va, int psize, int ssize) |
43 | { | 43 | { |
@@ -51,7 +51,7 @@ static inline void __tlbie(unsigned long va, int psize, int ssize) | |||
51 | va &= ~0xffful; | 51 | va &= ~0xffful; |
52 | va |= ssize << 8; | 52 | va |= ssize << 8; |
53 | asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) | 53 | asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) |
54 | : : "r" (va), "r"(0), "i" (CPU_FTR_HVMODE_206) | 54 | : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) |
55 | : "memory"); | 55 | : "memory"); |
56 | break; | 56 | break; |
57 | default: | 57 | default: |
@@ -61,7 +61,7 @@ static inline void __tlbie(unsigned long va, int psize, int ssize) | |||
61 | va |= ssize << 8; | 61 | va |= ssize << 8; |
62 | va |= 1; /* L */ | 62 | va |= 1; /* L */ |
63 | asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2) | 63 | asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2) |
64 | : : "r" (va), "r"(0), "i" (CPU_FTR_HVMODE_206) | 64 | : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) |
65 | : "memory"); | 65 | : "memory"); |
66 | break; | 66 | break; |
67 | } | 67 | } |
diff --git a/arch/powerpc/platforms/iseries/exception.S b/arch/powerpc/platforms/iseries/exception.S index 29c02f36b32f..f519ee17ff7d 100644 --- a/arch/powerpc/platforms/iseries/exception.S +++ b/arch/powerpc/platforms/iseries/exception.S | |||
@@ -167,7 +167,7 @@ BEGIN_FTR_SECTION | |||
167 | std r12,PACA_EXGEN+EX_R13(r13) | 167 | std r12,PACA_EXGEN+EX_R13(r13) |
168 | EXCEPTION_PROLOG_ISERIES_1 | 168 | EXCEPTION_PROLOG_ISERIES_1 |
169 | FTR_SECTION_ELSE | 169 | FTR_SECTION_ELSE |
170 | EXCEPTION_PROLOG_1(PACA_EXGEN) | 170 | EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0) |
171 | EXCEPTION_PROLOG_ISERIES_1 | 171 | EXCEPTION_PROLOG_ISERIES_1 |
172 | ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_SLB) | 172 | ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_SLB) |
173 | b data_access_common | 173 | b data_access_common |
diff --git a/arch/powerpc/platforms/iseries/exception.h b/arch/powerpc/platforms/iseries/exception.h index bae3fba5ad8e..50271b550a99 100644 --- a/arch/powerpc/platforms/iseries/exception.h +++ b/arch/powerpc/platforms/iseries/exception.h | |||
@@ -39,7 +39,7 @@ | |||
39 | label##_iSeries: \ | 39 | label##_iSeries: \ |
40 | HMT_MEDIUM; \ | 40 | HMT_MEDIUM; \ |
41 | mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ | 41 | mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ |
42 | EXCEPTION_PROLOG_1(area); \ | 42 | EXCEPTION_PROLOG_1(area, NOTEST, 0); \ |
43 | EXCEPTION_PROLOG_ISERIES_1; \ | 43 | EXCEPTION_PROLOG_ISERIES_1; \ |
44 | b label##_common | 44 | b label##_common |
45 | 45 | ||
@@ -48,7 +48,7 @@ label##_iSeries: \ | |||
48 | label##_iSeries: \ | 48 | label##_iSeries: \ |
49 | HMT_MEDIUM; \ | 49 | HMT_MEDIUM; \ |
50 | mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ | 50 | mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ |
51 | EXCEPTION_PROLOG_1(PACA_EXGEN); \ | 51 | EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0); \ |
52 | lbz r10,PACASOFTIRQEN(r13); \ | 52 | lbz r10,PACASOFTIRQEN(r13); \ |
53 | cmpwi 0,r10,0; \ | 53 | cmpwi 0,r10,0; \ |
54 | beq- label##_iSeries_masked; \ | 54 | beq- label##_iSeries_masked; \ |
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index 1f15ad436140..ba382b59b926 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ b/arch/powerpc/sysdev/xics/icp-native.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/cpu.h> | 17 | #include <linux/cpu.h> |
18 | #include <linux/of.h> | 18 | #include <linux/of.h> |
19 | #include <linux/spinlock.h> | 19 | #include <linux/spinlock.h> |
20 | #include <linux/module.h> | ||
20 | 21 | ||
21 | #include <asm/prom.h> | 22 | #include <asm/prom.h> |
22 | #include <asm/io.h> | 23 | #include <asm/io.h> |
@@ -24,6 +25,7 @@ | |||
24 | #include <asm/irq.h> | 25 | #include <asm/irq.h> |
25 | #include <asm/errno.h> | 26 | #include <asm/errno.h> |
26 | #include <asm/xics.h> | 27 | #include <asm/xics.h> |
28 | #include <asm/kvm_ppc.h> | ||
27 | 29 | ||
28 | struct icp_ipl { | 30 | struct icp_ipl { |
29 | union { | 31 | union { |
@@ -139,6 +141,12 @@ static void icp_native_cause_ipi(int cpu, unsigned long data) | |||
139 | icp_native_set_qirr(cpu, IPI_PRIORITY); | 141 | icp_native_set_qirr(cpu, IPI_PRIORITY); |
140 | } | 142 | } |
141 | 143 | ||
144 | void xics_wake_cpu(int cpu) | ||
145 | { | ||
146 | icp_native_set_qirr(cpu, IPI_PRIORITY); | ||
147 | } | ||
148 | EXPORT_SYMBOL_GPL(xics_wake_cpu); | ||
149 | |||
142 | static irqreturn_t icp_native_ipi_action(int irq, void *dev_id) | 150 | static irqreturn_t icp_native_ipi_action(int irq, void *dev_id) |
143 | { | 151 | { |
144 | int cpu = smp_processor_id(); | 152 | int cpu = smp_processor_id(); |
@@ -185,6 +193,7 @@ static int __init icp_native_map_one_cpu(int hw_id, unsigned long addr, | |||
185 | } | 193 | } |
186 | 194 | ||
187 | icp_native_regs[cpu] = ioremap(addr, size); | 195 | icp_native_regs[cpu] = ioremap(addr, size); |
196 | kvmppc_set_xics_phys(cpu, addr); | ||
188 | if (!icp_native_regs[cpu]) { | 197 | if (!icp_native_regs[cpu]) { |
189 | pr_warning("icp_native: Failed ioremap for CPU %d, " | 198 | pr_warning("icp_native: Failed ioremap for CPU %d, " |
190 | "interrupt server #0x%x, addr %#lx\n", | 199 | "interrupt server #0x%x, addr %#lx\n", |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b2127544fbe7..a67e014e4e44 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -529,6 +529,18 @@ menuconfig PARAVIRT_GUEST | |||
529 | 529 | ||
530 | if PARAVIRT_GUEST | 530 | if PARAVIRT_GUEST |
531 | 531 | ||
532 | config PARAVIRT_TIME_ACCOUNTING | ||
533 | bool "Paravirtual steal time accounting" | ||
534 | select PARAVIRT | ||
535 | default n | ||
536 | ---help--- | ||
537 | Select this option to enable fine granularity task steal time | ||
538 | accounting. Time spent executing other tasks in parallel with | ||
539 | the current vCPU is discounted from the vCPU power. To account for | ||
540 | that, there can be a small performance impact. | ||
541 | |||
542 | If in doubt, say N here. | ||
543 | |||
532 | source "arch/x86/xen/Kconfig" | 544 | source "arch/x86/xen/Kconfig" |
533 | 545 | ||
534 | config KVM_CLOCK | 546 | config KVM_CLOCK |
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0049211959c0..6040d115ef51 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h | |||
@@ -229,7 +229,26 @@ struct read_cache { | |||
229 | unsigned long end; | 229 | unsigned long end; |
230 | }; | 230 | }; |
231 | 231 | ||
232 | struct decode_cache { | 232 | struct x86_emulate_ctxt { |
233 | struct x86_emulate_ops *ops; | ||
234 | |||
235 | /* Register state before/after emulation. */ | ||
236 | unsigned long eflags; | ||
237 | unsigned long eip; /* eip before instruction emulation */ | ||
238 | /* Emulated execution mode, represented by an X86EMUL_MODE value. */ | ||
239 | int mode; | ||
240 | |||
241 | /* interruptibility state, as a result of execution of STI or MOV SS */ | ||
242 | int interruptibility; | ||
243 | |||
244 | bool guest_mode; /* guest running a nested guest */ | ||
245 | bool perm_ok; /* do not check permissions if true */ | ||
246 | bool only_vendor_specific_insn; | ||
247 | |||
248 | bool have_exception; | ||
249 | struct x86_exception exception; | ||
250 | |||
251 | /* decode cache */ | ||
233 | u8 twobyte; | 252 | u8 twobyte; |
234 | u8 b; | 253 | u8 b; |
235 | u8 intercept; | 254 | u8 intercept; |
@@ -246,8 +265,6 @@ struct decode_cache { | |||
246 | unsigned int d; | 265 | unsigned int d; |
247 | int (*execute)(struct x86_emulate_ctxt *ctxt); | 266 | int (*execute)(struct x86_emulate_ctxt *ctxt); |
248 | int (*check_perm)(struct x86_emulate_ctxt *ctxt); | 267 | int (*check_perm)(struct x86_emulate_ctxt *ctxt); |
249 | unsigned long regs[NR_VCPU_REGS]; | ||
250 | unsigned long eip; | ||
251 | /* modrm */ | 268 | /* modrm */ |
252 | u8 modrm; | 269 | u8 modrm; |
253 | u8 modrm_mod; | 270 | u8 modrm_mod; |
@@ -255,34 +272,14 @@ struct decode_cache { | |||
255 | u8 modrm_rm; | 272 | u8 modrm_rm; |
256 | u8 modrm_seg; | 273 | u8 modrm_seg; |
257 | bool rip_relative; | 274 | bool rip_relative; |
275 | unsigned long _eip; | ||
276 | /* Fields above regs are cleared together. */ | ||
277 | unsigned long regs[NR_VCPU_REGS]; | ||
258 | struct fetch_cache fetch; | 278 | struct fetch_cache fetch; |
259 | struct read_cache io_read; | 279 | struct read_cache io_read; |
260 | struct read_cache mem_read; | 280 | struct read_cache mem_read; |
261 | }; | 281 | }; |
262 | 282 | ||
263 | struct x86_emulate_ctxt { | ||
264 | struct x86_emulate_ops *ops; | ||
265 | |||
266 | /* Register state before/after emulation. */ | ||
267 | unsigned long eflags; | ||
268 | unsigned long eip; /* eip before instruction emulation */ | ||
269 | /* Emulated execution mode, represented by an X86EMUL_MODE value. */ | ||
270 | int mode; | ||
271 | |||
272 | /* interruptibility state, as a result of execution of STI or MOV SS */ | ||
273 | int interruptibility; | ||
274 | |||
275 | bool guest_mode; /* guest running a nested guest */ | ||
276 | bool perm_ok; /* do not check permissions if true */ | ||
277 | bool only_vendor_specific_insn; | ||
278 | |||
279 | bool have_exception; | ||
280 | struct x86_exception exception; | ||
281 | |||
282 | /* decode cache */ | ||
283 | struct decode_cache decode; | ||
284 | }; | ||
285 | |||
286 | /* Repeat String Operation Prefix */ | 283 | /* Repeat String Operation Prefix */ |
287 | #define REPE_PREFIX 0xf3 | 284 | #define REPE_PREFIX 0xf3 |
288 | #define REPNE_PREFIX 0xf2 | 285 | #define REPNE_PREFIX 0xf2 |
@@ -373,6 +370,5 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); | |||
373 | int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | 370 | int emulator_task_switch(struct x86_emulate_ctxt *ctxt, |
374 | u16 tss_selector, int reason, | 371 | u16 tss_selector, int reason, |
375 | bool has_error_code, u32 error_code); | 372 | bool has_error_code, u32 error_code); |
376 | int emulate_int_real(struct x86_emulate_ctxt *ctxt, | 373 | int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); |
377 | struct x86_emulate_ops *ops, int irq); | ||
378 | #endif /* _ASM_X86_KVM_X86_EMULATE_H */ | 374 | #endif /* _ASM_X86_KVM_X86_EMULATE_H */ |
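
Editor's note: the hunk above folds struct decode_cache into x86_emulate_ctxt and marks a span with "Fields above regs are cleared together", which suggests the decode state is wiped with a single memset bounded by the offset of 'regs' before each instruction. A hedged sketch of that offsetof-bounded clear; the struct layout below is invented for illustration and is not the real x86_emulate_ctxt:

```c
/* Clear everything from the first decode field up to (but not including)
 * the 'regs' marker member, leaving earlier and later fields intact. */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct demo_ctxt {
	unsigned long eflags;     /* preserved across decodes */
	/* --- decode cache: cleared before every instruction --- */
	unsigned char twobyte;
	unsigned char b;
	unsigned long _eip;
	/* --- end of cleared region --- */
	unsigned long regs[16];   /* preserved */
};

static void reset_decode_cache(struct demo_ctxt *ctxt)
{
	memset(&ctxt->twobyte, 0,
	       offsetof(struct demo_ctxt, regs) -
	       offsetof(struct demo_ctxt, twobyte));
}

int main(void)
{
	struct demo_ctxt c = { .eflags = 0x2, .twobyte = 0x0f, .regs = { 7 } };

	reset_decode_cache(&c);
	printf("eflags=%lx twobyte=%u regs[0]=%lu\n",
	       c.eflags, (unsigned)c.twobyte, c.regs[0]);
	return 0;
}
```
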
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d2ac8e2ee897..dd51c83aa5de 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h | |||
@@ -48,7 +48,7 @@ | |||
48 | (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | 48 | (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ |
49 | | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | 49 | | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ |
50 | | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ | 50 | | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ |
51 | | X86_CR4_OSXSAVE \ | 51 | | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ |
52 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) | 52 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) |
53 | 53 | ||
54 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) | 54 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) |
@@ -205,6 +205,7 @@ union kvm_mmu_page_role { | |||
205 | unsigned invalid:1; | 205 | unsigned invalid:1; |
206 | unsigned nxe:1; | 206 | unsigned nxe:1; |
207 | unsigned cr0_wp:1; | 207 | unsigned cr0_wp:1; |
208 | unsigned smep_andnot_wp:1; | ||
208 | }; | 209 | }; |
209 | }; | 210 | }; |
210 | 211 | ||
@@ -227,15 +228,17 @@ struct kvm_mmu_page { | |||
227 | * in this shadow page. | 228 | * in this shadow page. |
228 | */ | 229 | */ |
229 | DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); | 230 | DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); |
230 | bool multimapped; /* More than one parent_pte? */ | ||
231 | bool unsync; | 231 | bool unsync; |
232 | int root_count; /* Currently serving as active root */ | 232 | int root_count; /* Currently serving as active root */ |
233 | unsigned int unsync_children; | 233 | unsigned int unsync_children; |
234 | union { | 234 | unsigned long parent_ptes; /* Reverse mapping for parent_pte */ |
235 | u64 *parent_pte; /* !multimapped */ | ||
236 | struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ | ||
237 | }; | ||
238 | DECLARE_BITMAP(unsync_child_bitmap, 512); | 235 | DECLARE_BITMAP(unsync_child_bitmap, 512); |
236 | |||
237 | #ifdef CONFIG_X86_32 | ||
238 | int clear_spte_count; | ||
239 | #endif | ||
240 | |||
241 | struct rcu_head rcu; | ||
239 | }; | 242 | }; |
240 | 243 | ||
241 | struct kvm_pv_mmu_op_buffer { | 244 | struct kvm_pv_mmu_op_buffer { |
@@ -269,8 +272,6 @@ struct kvm_mmu { | |||
269 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, | 272 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, |
270 | struct x86_exception *exception); | 273 | struct x86_exception *exception); |
271 | gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); | 274 | gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); |
272 | void (*prefetch_page)(struct kvm_vcpu *vcpu, | ||
273 | struct kvm_mmu_page *page); | ||
274 | int (*sync_page)(struct kvm_vcpu *vcpu, | 275 | int (*sync_page)(struct kvm_vcpu *vcpu, |
275 | struct kvm_mmu_page *sp); | 276 | struct kvm_mmu_page *sp); |
276 | void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); | 277 | void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); |
@@ -346,8 +347,7 @@ struct kvm_vcpu_arch { | |||
346 | * put it here to avoid allocation */ | 347 | * put it here to avoid allocation */ |
347 | struct kvm_pv_mmu_op_buffer mmu_op_buffer; | 348 | struct kvm_pv_mmu_op_buffer mmu_op_buffer; |
348 | 349 | ||
349 | struct kvm_mmu_memory_cache mmu_pte_chain_cache; | 350 | struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; |
350 | struct kvm_mmu_memory_cache mmu_rmap_desc_cache; | ||
351 | struct kvm_mmu_memory_cache mmu_page_cache; | 351 | struct kvm_mmu_memory_cache mmu_page_cache; |
352 | struct kvm_mmu_memory_cache mmu_page_header_cache; | 352 | struct kvm_mmu_memory_cache mmu_page_header_cache; |
353 | 353 | ||
@@ -393,6 +393,15 @@ struct kvm_vcpu_arch { | |||
393 | unsigned int hw_tsc_khz; | 393 | unsigned int hw_tsc_khz; |
394 | unsigned int time_offset; | 394 | unsigned int time_offset; |
395 | struct page *time_page; | 395 | struct page *time_page; |
396 | |||
397 | struct { | ||
398 | u64 msr_val; | ||
399 | u64 last_steal; | ||
400 | u64 accum_steal; | ||
401 | struct gfn_to_hva_cache stime; | ||
402 | struct kvm_steal_time steal; | ||
403 | } st; | ||
404 | |||
396 | u64 last_guest_tsc; | 405 | u64 last_guest_tsc; |
397 | u64 last_kernel_ns; | 406 | u64 last_kernel_ns; |
398 | u64 last_tsc_nsec; | 407 | u64 last_tsc_nsec; |
@@ -419,6 +428,11 @@ struct kvm_vcpu_arch { | |||
419 | u64 mcg_ctl; | 428 | u64 mcg_ctl; |
420 | u64 *mce_banks; | 429 | u64 *mce_banks; |
421 | 430 | ||
431 | /* Cache MMIO info */ | ||
432 | u64 mmio_gva; | ||
433 | unsigned access; | ||
434 | gfn_t mmio_gfn; | ||
435 | |||
422 | /* used for guest single stepping over the given code position */ | 436 | /* used for guest single stepping over the given code position */ |
423 | unsigned long singlestep_rip; | 437 | unsigned long singlestep_rip; |
424 | 438 | ||
@@ -441,6 +455,7 @@ struct kvm_arch { | |||
441 | unsigned int n_used_mmu_pages; | 455 | unsigned int n_used_mmu_pages; |
442 | unsigned int n_requested_mmu_pages; | 456 | unsigned int n_requested_mmu_pages; |
443 | unsigned int n_max_mmu_pages; | 457 | unsigned int n_max_mmu_pages; |
458 | unsigned int indirect_shadow_pages; | ||
444 | atomic_t invlpg_counter; | 459 | atomic_t invlpg_counter; |
445 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; | 460 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; |
446 | /* | 461 | /* |
@@ -477,6 +492,8 @@ struct kvm_arch { | |||
477 | u64 hv_guest_os_id; | 492 | u64 hv_guest_os_id; |
478 | u64 hv_hypercall; | 493 | u64 hv_hypercall; |
479 | 494 | ||
495 | atomic_t reader_counter; | ||
496 | |||
480 | #ifdef CONFIG_KVM_MMU_AUDIT | 497 | #ifdef CONFIG_KVM_MMU_AUDIT |
481 | int audit_point; | 498 | int audit_point; |
482 | #endif | 499 | #endif |
@@ -559,7 +576,7 @@ struct kvm_x86_ops { | |||
559 | void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); | 576 | void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); |
560 | void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); | 577 | void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); |
561 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); | 578 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); |
562 | void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); | 579 | int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); |
563 | void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); | 580 | void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); |
564 | void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); | 581 | void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); |
565 | void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); | 582 | void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); |
@@ -636,7 +653,6 @@ void kvm_mmu_module_exit(void); | |||
636 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu); | 653 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu); |
637 | int kvm_mmu_create(struct kvm_vcpu *vcpu); | 654 | int kvm_mmu_create(struct kvm_vcpu *vcpu); |
638 | int kvm_mmu_setup(struct kvm_vcpu *vcpu); | 655 | int kvm_mmu_setup(struct kvm_vcpu *vcpu); |
639 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); | ||
640 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | 656 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, |
641 | u64 dirty_mask, u64 nx_mask, u64 x_mask); | 657 | u64 dirty_mask, u64 nx_mask, u64 x_mask); |
642 | 658 | ||
@@ -830,11 +846,12 @@ enum { | |||
830 | asmlinkage void kvm_spurious_fault(void); | 846 | asmlinkage void kvm_spurious_fault(void); |
831 | extern bool kvm_rebooting; | 847 | extern bool kvm_rebooting; |
832 | 848 | ||
833 | #define __kvm_handle_fault_on_reboot(insn) \ | 849 | #define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ |
834 | "666: " insn "\n\t" \ | 850 | "666: " insn "\n\t" \ |
835 | "668: \n\t" \ | 851 | "668: \n\t" \ |
836 | ".pushsection .fixup, \"ax\" \n" \ | 852 | ".pushsection .fixup, \"ax\" \n" \ |
837 | "667: \n\t" \ | 853 | "667: \n\t" \ |
854 | cleanup_insn "\n\t" \ | ||
838 | "cmpb $0, kvm_rebooting \n\t" \ | 855 | "cmpb $0, kvm_rebooting \n\t" \ |
839 | "jne 668b \n\t" \ | 856 | "jne 668b \n\t" \ |
840 | __ASM_SIZE(push) " $666b \n\t" \ | 857 | __ASM_SIZE(push) " $666b \n\t" \ |
@@ -844,6 +861,9 @@ extern bool kvm_rebooting; | |||
844 | _ASM_PTR " 666b, 667b \n\t" \ | 861 | _ASM_PTR " 666b, 667b \n\t" \ |
845 | ".popsection" | 862 | ".popsection" |
846 | 863 | ||
864 | #define __kvm_handle_fault_on_reboot(insn) \ | ||
865 | ____kvm_handle_fault_on_reboot(insn, "") | ||
866 | |||
847 | #define KVM_ARCH_WANT_MMU_NOTIFIER | 867 | #define KVM_ARCH_WANT_MMU_NOTIFIER |
848 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); | 868 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); |
849 | int kvm_age_hva(struct kvm *kvm, unsigned long hva); | 869 | int kvm_age_hva(struct kvm *kvm, unsigned long hva); |
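
Editor's note: in the kvm_host.h hunk above, the union of a single parent_pte pointer versus an hlist of kvm_pte_chain entries is replaced by one 'unsigned long parent_ptes' reverse mapping. A single word standing in for "one pointer or a list" usually means a low-bit-tagged pointer to an overflow descriptor; the sketch below shows that encoding under that assumption, with a descriptor layout invented for the demo (the real pte_list code in mmu.c may differ in detail).

```c
/* Low-bit-tagged "one value or a descriptor list" encoding.  Tag clear:
 * the word is a single spte pointer.  Tag set: the word (minus the tag)
 * points at a descriptor holding several sptes. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MANY_TAG 1UL

struct many_desc {
	uint64_t *sptes[3];
	struct many_desc *more;
};

static void rmap_add(unsigned long *rmap, uint64_t *spte)
{
	if (!*rmap) {
		*rmap = (unsigned long)spte;          /* single entry, tag clear */
	} else if (!(*rmap & MANY_TAG)) {
		struct many_desc *d = calloc(1, sizeof(*d));

		d->sptes[0] = (uint64_t *)*rmap;      /* promote old single entry */
		d->sptes[1] = spte;
		*rmap = (unsigned long)d | MANY_TAG;  /* descriptor, tag set */
	} else {
		struct many_desc *d = (struct many_desc *)(*rmap & ~MANY_TAG);

		for (int i = 0; i < 3; i++)
			if (!d->sptes[i]) {
				d->sptes[i] = spte;
				return;
			}
		/* chaining via d->more omitted in this sketch */
		assert(0 && "descriptor full");
	}
}

int main(void)
{
	unsigned long rmap = 0;
	uint64_t a = 1, b = 2;

	rmap_add(&rmap, &a);
	assert(!(rmap & MANY_TAG));                  /* still the single form */
	rmap_add(&rmap, &b);
	assert(rmap & MANY_TAG);                     /* promoted to descriptor */
	printf("rmap uses %s form\n", (rmap & MANY_TAG) ? "descriptor" : "single");
	return 0;
}
```

The trick relies on pointers being at least 2-byte aligned so bit 0 is free to carry the tag.
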
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index a427bf77a93d..734c3767cfac 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h | |||
@@ -21,6 +21,7 @@ | |||
21 | */ | 21 | */ |
22 | #define KVM_FEATURE_CLOCKSOURCE2 3 | 22 | #define KVM_FEATURE_CLOCKSOURCE2 3 |
23 | #define KVM_FEATURE_ASYNC_PF 4 | 23 | #define KVM_FEATURE_ASYNC_PF 4 |
24 | #define KVM_FEATURE_STEAL_TIME 5 | ||
24 | 25 | ||
25 | /* The last 8 bits are used to indicate how to interpret the flags field | 26 | /* The last 8 bits are used to indicate how to interpret the flags field |
26 | * in pvclock structure. If no bits are set, all flags are ignored. | 27 | * in pvclock structure. If no bits are set, all flags are ignored. |
@@ -30,10 +31,23 @@ | |||
30 | #define MSR_KVM_WALL_CLOCK 0x11 | 31 | #define MSR_KVM_WALL_CLOCK 0x11 |
31 | #define MSR_KVM_SYSTEM_TIME 0x12 | 32 | #define MSR_KVM_SYSTEM_TIME 0x12 |
32 | 33 | ||
34 | #define KVM_MSR_ENABLED 1 | ||
33 | /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ | 35 | /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ |
34 | #define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 | 36 | #define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 |
35 | #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 | 37 | #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 |
36 | #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 | 38 | #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 |
39 | #define MSR_KVM_STEAL_TIME 0x4b564d03 | ||
40 | |||
41 | struct kvm_steal_time { | ||
42 | __u64 steal; | ||
43 | __u32 version; | ||
44 | __u32 flags; | ||
45 | __u32 pad[12]; | ||
46 | }; | ||
47 | |||
48 | #define KVM_STEAL_ALIGNMENT_BITS 5 | ||
49 | #define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) | ||
50 | #define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) | ||
37 | 51 | ||
38 | #define KVM_MAX_MMU_OP_BATCH 32 | 52 | #define KVM_MAX_MMU_OP_BATCH 32 |
39 | 53 | ||
@@ -178,6 +192,7 @@ void __init kvm_guest_init(void); | |||
178 | void kvm_async_pf_task_wait(u32 token); | 192 | void kvm_async_pf_task_wait(u32 token); |
179 | void kvm_async_pf_task_wake(u32 token); | 193 | void kvm_async_pf_task_wake(u32 token); |
180 | u32 kvm_read_and_reset_pf_reason(void); | 194 | u32 kvm_read_and_reset_pf_reason(void); |
195 | extern void kvm_disable_steal_time(void); | ||
181 | #else | 196 | #else |
182 | #define kvm_guest_init() do { } while (0) | 197 | #define kvm_guest_init() do { } while (0) |
183 | #define kvm_async_pf_task_wait(T) do {} while(0) | 198 | #define kvm_async_pf_task_wait(T) do {} while(0) |
@@ -186,6 +201,11 @@ static inline u32 kvm_read_and_reset_pf_reason(void) | |||
186 | { | 201 | { |
187 | return 0; | 202 | return 0; |
188 | } | 203 | } |
204 | |||
205 | static inline void kvm_disable_steal_time(void) | ||
206 | { | ||
207 | return; | ||
208 | } | ||
189 | #endif | 209 | #endif |
190 | 210 | ||
191 | #endif /* __KERNEL__ */ | 211 | #endif /* __KERNEL__ */ |
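
Editor's note: MSR_KVM_STEAL_TIME takes the guest-physical address of a kvm_steal_time area with bit 0 as the enable flag. With KVM_STEAL_ALIGNMENT_BITS = 5, bits 1..5 are reserved, so the area must be 64-byte aligned, which matches the __aligned(64) per-cpu buffer registered in arch/x86/kernel/kvm.c further down. A small sketch that composes and sanity-checks such an MSR value; the guest-physical address used is made up for the demo:

```c
/* Compose an MSR_KVM_STEAL_TIME value the way kvm_register_steal_time()
 * does (address | KVM_MSR_ENABLED) and check the alignment constraint. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define KVM_MSR_ENABLED          1ULL
#define KVM_STEAL_ALIGNMENT_BITS 5
#define KVM_STEAL_RESERVED_MASK  (((1ULL << KVM_STEAL_ALIGNMENT_BITS) - 1) << 1)

int main(void)
{
	uint64_t st_gpa = 0x0000000012345000ULL;   /* hypothetical, 64-byte aligned */
	uint64_t msr_val;

	/* Bits 1..5 are reserved, so the area needs 2^(5+1) = 64-byte alignment. */
	assert((st_gpa & ((1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)) - 1)) == 0);

	msr_val = st_gpa | KVM_MSR_ENABLED;
	assert((msr_val & KVM_STEAL_RESERVED_MASK) == 0);

	printf("MSR_KVM_STEAL_TIME <- %#llx\n", (unsigned long long)msr_val);
	return 0;
}
```
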
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d96bdb25ca3d..d52609aeeab8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h | |||
@@ -441,6 +441,18 @@ | |||
441 | #define MSR_IA32_VMX_VMCS_ENUM 0x0000048a | 441 | #define MSR_IA32_VMX_VMCS_ENUM 0x0000048a |
442 | #define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b | 442 | #define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b |
443 | #define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c | 443 | #define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c |
444 | #define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x0000048d | ||
445 | #define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e | ||
446 | #define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f | ||
447 | #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 | ||
448 | |||
449 | /* VMX_BASIC bits and bitmasks */ | ||
450 | #define VMX_BASIC_VMCS_SIZE_SHIFT 32 | ||
451 | #define VMX_BASIC_64 0x0001000000000000LLU | ||
452 | #define VMX_BASIC_MEM_TYPE_SHIFT 50 | ||
453 | #define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU | ||
454 | #define VMX_BASIC_MEM_TYPE_WB 6LLU | ||
455 | #define VMX_BASIC_INOUT 0x0040000000000000LLU | ||
444 | 456 | ||
445 | /* AMD-V MSRs */ | 457 | /* AMD-V MSRs */ |
446 | 458 | ||
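
Editor's note: the new VMX_BASIC_* constants describe fields packed into the IA32_VMX_BASIC MSR. A sketch decoding the VMCS region size (bits 44:32) and memory type (bits 53:50) from a sample value; the sample is fabricated, and the 13-bit width of the size field comes from the SDM rather than from the defines in this patch:

```c
/* Decode VMCS region size and memory type from an IA32_VMX_BASIC value
 * using the masks added above. */
#include <stdint.h>
#include <stdio.h>

#define VMX_BASIC_VMCS_SIZE_SHIFT 32
#define VMX_BASIC_MEM_TYPE_SHIFT  50
#define VMX_BASIC_MEM_TYPE_MASK   0x003c000000000000ULL
#define VMX_BASIC_MEM_TYPE_WB     6ULL

int main(void)
{
	uint64_t vmx_basic = 0x00d8100000000010ULL;   /* fabricated example */

	uint64_t size  = (vmx_basic >> VMX_BASIC_VMCS_SIZE_SHIFT) & 0x1fff;
	uint64_t mtype = (vmx_basic & VMX_BASIC_MEM_TYPE_MASK) >> VMX_BASIC_MEM_TYPE_SHIFT;

	printf("VMCS region size: %llu bytes\n", (unsigned long long)size);
	printf("memory type: %llu (%s)\n", (unsigned long long)mtype,
	       mtype == VMX_BASIC_MEM_TYPE_WB ? "write-back" : "other");
	return 0;
}
```
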
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ebbc4d8ab170..a7d2db9a74fb 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h | |||
@@ -230,6 +230,15 @@ static inline unsigned long long paravirt_sched_clock(void) | |||
230 | return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); | 230 | return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); |
231 | } | 231 | } |
232 | 232 | ||
233 | struct jump_label_key; | ||
234 | extern struct jump_label_key paravirt_steal_enabled; | ||
235 | extern struct jump_label_key paravirt_steal_rq_enabled; | ||
236 | |||
237 | static inline u64 paravirt_steal_clock(int cpu) | ||
238 | { | ||
239 | return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu); | ||
240 | } | ||
241 | |||
233 | static inline unsigned long long paravirt_read_pmc(int counter) | 242 | static inline unsigned long long paravirt_read_pmc(int counter) |
234 | { | 243 | { |
235 | return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter); | 244 | return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter); |
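
Editor's note: paravirt_steal_clock() above dispatches through the new pv_time_ops.steal_clock hook via the PVOP_CALL machinery. A plain-C sketch of the same ops-table pattern: a native default that a paravirt guest overrides at init, mirroring how kvm_guest_init() later sets pv_time_ops.steal_clock = kvm_steal_clock. Names and the constant returned by the KVM stand-in are illustrative only.

```c
/* Ops-table dispatch: default "native" implementation, replaceable at
 * guest init time. */
#include <stdint.h>
#include <stdio.h>

struct pv_time_ops_demo {
	uint64_t (*steal_clock)(int cpu);
};

static uint64_t native_steal_clock_demo(int cpu)
{
	(void)cpu;
	return 0;                      /* bare metal: no time is ever stolen */
}

static uint64_t kvm_steal_clock_demo(int cpu)
{
	(void)cpu;
	return 123456789;              /* stand-in for reading the shared area */
}

static struct pv_time_ops_demo pv_time_ops_demo = {
	.steal_clock = native_steal_clock_demo,
};

int main(void)
{
	printf("native: %llu ns\n",
	       (unsigned long long)pv_time_ops_demo.steal_clock(0));

	/* What kvm_guest_init() does when KVM_FEATURE_STEAL_TIME is present. */
	pv_time_ops_demo.steal_clock = kvm_steal_clock_demo;

	printf("kvm:    %llu ns\n",
	       (unsigned long long)pv_time_ops_demo.steal_clock(0));
	return 0;
}
```
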
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 82885099c869..2c7652163111 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h | |||
@@ -89,6 +89,7 @@ struct pv_lazy_ops { | |||
89 | 89 | ||
90 | struct pv_time_ops { | 90 | struct pv_time_ops { |
91 | unsigned long long (*sched_clock)(void); | 91 | unsigned long long (*sched_clock)(void); |
92 | unsigned long long (*steal_clock)(int cpu); | ||
92 | unsigned long (*get_tsc_khz)(void); | 93 | unsigned long (*get_tsc_khz)(void); |
93 | }; | 94 | }; |
94 | 95 | ||
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 59ab4dffa377..2dddb317bb39 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h | |||
@@ -59,6 +59,7 @@ | |||
59 | #define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */ | 59 | #define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */ |
60 | #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ | 60 | #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ |
61 | #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ | 61 | #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ |
62 | #define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ | ||
62 | #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ | 63 | #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ |
63 | #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ | 64 | #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ |
64 | 65 | ||
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 84471b810460..2caf290e9895 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h | |||
@@ -132,6 +132,8 @@ enum vmcs_field { | |||
132 | GUEST_IA32_PAT_HIGH = 0x00002805, | 132 | GUEST_IA32_PAT_HIGH = 0x00002805, |
133 | GUEST_IA32_EFER = 0x00002806, | 133 | GUEST_IA32_EFER = 0x00002806, |
134 | GUEST_IA32_EFER_HIGH = 0x00002807, | 134 | GUEST_IA32_EFER_HIGH = 0x00002807, |
135 | GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, | ||
136 | GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809, | ||
135 | GUEST_PDPTR0 = 0x0000280a, | 137 | GUEST_PDPTR0 = 0x0000280a, |
136 | GUEST_PDPTR0_HIGH = 0x0000280b, | 138 | GUEST_PDPTR0_HIGH = 0x0000280b, |
137 | GUEST_PDPTR1 = 0x0000280c, | 139 | GUEST_PDPTR1 = 0x0000280c, |
@@ -144,6 +146,8 @@ enum vmcs_field { | |||
144 | HOST_IA32_PAT_HIGH = 0x00002c01, | 146 | HOST_IA32_PAT_HIGH = 0x00002c01, |
145 | HOST_IA32_EFER = 0x00002c02, | 147 | HOST_IA32_EFER = 0x00002c02, |
146 | HOST_IA32_EFER_HIGH = 0x00002c03, | 148 | HOST_IA32_EFER_HIGH = 0x00002c03, |
149 | HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, | ||
150 | HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05, | ||
147 | PIN_BASED_VM_EXEC_CONTROL = 0x00004000, | 151 | PIN_BASED_VM_EXEC_CONTROL = 0x00004000, |
148 | CPU_BASED_VM_EXEC_CONTROL = 0x00004002, | 152 | CPU_BASED_VM_EXEC_CONTROL = 0x00004002, |
149 | EXCEPTION_BITMAP = 0x00004004, | 153 | EXCEPTION_BITMAP = 0x00004004, |
@@ -426,4 +430,43 @@ struct vmx_msr_entry { | |||
426 | u64 value; | 430 | u64 value; |
427 | } __aligned(16); | 431 | } __aligned(16); |
428 | 432 | ||
433 | /* | ||
434 | * Exit Qualifications for entry failure during or after loading guest state | ||
435 | */ | ||
436 | #define ENTRY_FAIL_DEFAULT 0 | ||
437 | #define ENTRY_FAIL_PDPTE 2 | ||
438 | #define ENTRY_FAIL_NMI 3 | ||
439 | #define ENTRY_FAIL_VMCS_LINK_PTR 4 | ||
440 | |||
441 | /* | ||
442 | * VM-instruction error numbers | ||
443 | */ | ||
444 | enum vm_instruction_error_number { | ||
445 | VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1, | ||
446 | VMXERR_VMCLEAR_INVALID_ADDRESS = 2, | ||
447 | VMXERR_VMCLEAR_VMXON_POINTER = 3, | ||
448 | VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4, | ||
449 | VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5, | ||
450 | VMXERR_VMRESUME_AFTER_VMXOFF = 6, | ||
451 | VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7, | ||
452 | VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8, | ||
453 | VMXERR_VMPTRLD_INVALID_ADDRESS = 9, | ||
454 | VMXERR_VMPTRLD_VMXON_POINTER = 10, | ||
455 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11, | ||
456 | VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12, | ||
457 | VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13, | ||
458 | VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15, | ||
459 | VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16, | ||
460 | VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17, | ||
461 | VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18, | ||
462 | VMXERR_VMCALL_NONCLEAR_VMCS = 19, | ||
463 | VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20, | ||
464 | VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22, | ||
465 | VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23, | ||
466 | VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24, | ||
467 | VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25, | ||
468 | VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26, | ||
469 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28, | ||
470 | }; | ||
471 | |||
429 | #endif | 472 | #endif |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 33c07b0b122e..a9c2116001d6 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg) | |||
51 | 51 | ||
52 | early_param("no-kvmapf", parse_no_kvmapf); | 52 | early_param("no-kvmapf", parse_no_kvmapf); |
53 | 53 | ||
54 | static int steal_acc = 1; | ||
55 | static int parse_no_stealacc(char *arg) | ||
56 | { | ||
57 | steal_acc = 0; | ||
58 | return 0; | ||
59 | } | ||
60 | |||
61 | early_param("no-steal-acc", parse_no_stealacc); | ||
62 | |||
54 | struct kvm_para_state { | 63 | struct kvm_para_state { |
55 | u8 mmu_queue[MMU_QUEUE_SIZE]; | 64 | u8 mmu_queue[MMU_QUEUE_SIZE]; |
56 | int mmu_queue_len; | 65 | int mmu_queue_len; |
@@ -58,6 +67,8 @@ struct kvm_para_state { | |||
58 | 67 | ||
59 | static DEFINE_PER_CPU(struct kvm_para_state, para_state); | 68 | static DEFINE_PER_CPU(struct kvm_para_state, para_state); |
60 | static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); | 69 | static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); |
70 | static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); | ||
71 | static int has_steal_clock = 0; | ||
61 | 72 | ||
62 | static struct kvm_para_state *kvm_para_state(void) | 73 | static struct kvm_para_state *kvm_para_state(void) |
63 | { | 74 | { |
@@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void) | |||
441 | #endif | 452 | #endif |
442 | } | 453 | } |
443 | 454 | ||
455 | static void kvm_register_steal_time(void) | ||
456 | { | ||
457 | int cpu = smp_processor_id(); | ||
458 | struct kvm_steal_time *st = &per_cpu(steal_time, cpu); | ||
459 | |||
460 | if (!has_steal_clock) | ||
461 | return; | ||
462 | |||
463 | memset(st, 0, sizeof(*st)); | ||
464 | |||
465 | wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED)); | ||
466 | printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n", | ||
467 | cpu, __pa(st)); | ||
468 | } | ||
469 | |||
444 | void __cpuinit kvm_guest_cpu_init(void) | 470 | void __cpuinit kvm_guest_cpu_init(void) |
445 | { | 471 | { |
446 | if (!kvm_para_available()) | 472 | if (!kvm_para_available()) |
@@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void) | |||
457 | printk(KERN_INFO"KVM setup async PF for cpu %d\n", | 483 | printk(KERN_INFO"KVM setup async PF for cpu %d\n", |
458 | smp_processor_id()); | 484 | smp_processor_id()); |
459 | } | 485 | } |
486 | |||
487 | if (has_steal_clock) | ||
488 | kvm_register_steal_time(); | ||
460 | } | 489 | } |
461 | 490 | ||
462 | static void kvm_pv_disable_apf(void *unused) | 491 | static void kvm_pv_disable_apf(void *unused) |
@@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = { | |||
483 | .notifier_call = kvm_pv_reboot_notify, | 512 | .notifier_call = kvm_pv_reboot_notify, |
484 | }; | 513 | }; |
485 | 514 | ||
515 | static u64 kvm_steal_clock(int cpu) | ||
516 | { | ||
517 | u64 steal; | ||
518 | struct kvm_steal_time *src; | ||
519 | int version; | ||
520 | |||
521 | src = &per_cpu(steal_time, cpu); | ||
522 | do { | ||
523 | version = src->version; | ||
524 | rmb(); | ||
525 | steal = src->steal; | ||
526 | rmb(); | ||
527 | } while ((version & 1) || (version != src->version)); | ||
528 | |||
529 | return steal; | ||
530 | } | ||
531 | |||
532 | void kvm_disable_steal_time(void) | ||
533 | { | ||
534 | if (!has_steal_clock) | ||
535 | return; | ||
536 | |||
537 | wrmsr(MSR_KVM_STEAL_TIME, 0, 0); | ||
538 | } | ||
539 | |||
486 | #ifdef CONFIG_SMP | 540 | #ifdef CONFIG_SMP |
487 | static void __init kvm_smp_prepare_boot_cpu(void) | 541 | static void __init kvm_smp_prepare_boot_cpu(void) |
488 | { | 542 | { |
@@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy) | |||
500 | 554 | ||
501 | static void kvm_guest_cpu_offline(void *dummy) | 555 | static void kvm_guest_cpu_offline(void *dummy) |
502 | { | 556 | { |
557 | kvm_disable_steal_time(); | ||
503 | kvm_pv_disable_apf(NULL); | 558 | kvm_pv_disable_apf(NULL); |
504 | apf_task_wake_all(); | 559 | apf_task_wake_all(); |
505 | } | 560 | } |
@@ -548,6 +603,11 @@ void __init kvm_guest_init(void) | |||
548 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) | 603 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) |
549 | x86_init.irqs.trap_init = kvm_apf_trap_init; | 604 | x86_init.irqs.trap_init = kvm_apf_trap_init; |
550 | 605 | ||
606 | if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { | ||
607 | has_steal_clock = 1; | ||
608 | pv_time_ops.steal_clock = kvm_steal_clock; | ||
609 | } | ||
610 | |||
551 | #ifdef CONFIG_SMP | 611 | #ifdef CONFIG_SMP |
552 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | 612 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; |
553 | register_cpu_notifier(&kvm_cpu_notifier); | 613 | register_cpu_notifier(&kvm_cpu_notifier); |
@@ -555,3 +615,15 @@ void __init kvm_guest_init(void) | |||
555 | kvm_guest_cpu_init(); | 615 | kvm_guest_cpu_init(); |
556 | #endif | 616 | #endif |
557 | } | 617 | } |
618 | |||
619 | static __init int activate_jump_labels(void) | ||
620 | { | ||
621 | if (has_steal_clock) { | ||
622 | jump_label_inc(¶virt_steal_enabled); | ||
623 | if (steal_acc) | ||
624 | jump_label_inc(¶virt_steal_rq_enabled); | ||
625 | } | ||
626 | |||
627 | return 0; | ||
628 | } | ||
629 | arch_initcall(activate_jump_labels); | ||
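The kvm_steal_clock() helper added above reads the shared per-cpu steal_time record with an even/odd version counter: the host bumps the version to an odd value before it updates the record and back to an even value afterwards, so the guest retries whenever it sees an odd or a changed version. A minimal userspace sketch of that retry protocol follows (hypothetical names; C11 sequentially-consistent atomics stand in for the kernel's rmb() barriers):

/*
 * Sketch of the even/odd version protocol used by kvm_steal_clock().
 * Illustration only, not the kernel or KVM API: publish_steal() plays
 * the host's role of updating the shared record.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct steal_time_sketch {
    _Atomic uint32_t version;   /* odd while an update is in flight */
    _Atomic uint64_t steal;     /* accumulated stolen time, in ns */
};

static uint64_t read_steal(struct steal_time_sketch *st)
{
    uint32_t v;
    uint64_t steal;

    do {
        v = atomic_load(&st->version);
        steal = atomic_load(&st->steal);
        /* retry if an update was in flight or completed meanwhile */
    } while ((v & 1) || v != atomic_load(&st->version));

    return steal;
}

static void publish_steal(struct steal_time_sketch *st, uint64_t steal)
{
    atomic_fetch_add(&st->version, 1);  /* version becomes odd */
    atomic_store(&st->steal, steal);
    atomic_fetch_add(&st->version, 1);  /* version becomes even again */
}

int main(void)
{
    struct steal_time_sketch st = { 0 };

    publish_steal(&st, 123456789);
    printf("steal = %llu ns\n", (unsigned long long)read_steal(&st));
    return 0;
}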
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 6389a6bca11b..c1a0188e29ae 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -160,6 +160,7 @@ static void __cpuinit kvm_setup_secondary_clock(void) | |||
160 | static void kvm_crash_shutdown(struct pt_regs *regs) | 160 | static void kvm_crash_shutdown(struct pt_regs *regs) |
161 | { | 161 | { |
162 | native_write_msr(msr_kvm_system_time, 0, 0); | 162 | native_write_msr(msr_kvm_system_time, 0, 0); |
163 | kvm_disable_steal_time(); | ||
163 | native_machine_crash_shutdown(regs); | 164 | native_machine_crash_shutdown(regs); |
164 | } | 165 | } |
165 | #endif | 166 | #endif |
@@ -167,6 +168,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs) | |||
167 | static void kvm_shutdown(void) | 168 | static void kvm_shutdown(void) |
168 | { | 169 | { |
169 | native_write_msr(msr_kvm_system_time, 0, 0); | 170 | native_write_msr(msr_kvm_system_time, 0, 0); |
171 | kvm_disable_steal_time(); | ||
170 | native_machine_shutdown(); | 172 | native_machine_shutdown(); |
171 | } | 173 | } |
172 | 174 | ||
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 869e1aeeb71b..613a7931ecc1 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -202,6 +202,14 @@ static void native_flush_tlb_single(unsigned long addr) | |||
202 | __native_flush_tlb_single(addr); | 202 | __native_flush_tlb_single(addr); |
203 | } | 203 | } |
204 | 204 | ||
205 | struct jump_label_key paravirt_steal_enabled; | ||
206 | struct jump_label_key paravirt_steal_rq_enabled; | ||
207 | |||
208 | static u64 native_steal_clock(int cpu) | ||
209 | { | ||
210 | return 0; | ||
211 | } | ||
212 | |||
205 | /* These are in entry.S */ | 213 | /* These are in entry.S */ |
206 | extern void native_iret(void); | 214 | extern void native_iret(void); |
207 | extern void native_irq_enable_sysexit(void); | 215 | extern void native_irq_enable_sysexit(void); |
@@ -307,6 +315,7 @@ struct pv_init_ops pv_init_ops = { | |||
307 | 315 | ||
308 | struct pv_time_ops pv_time_ops = { | 316 | struct pv_time_ops pv_time_ops = { |
309 | .sched_clock = native_sched_clock, | 317 | .sched_clock = native_sched_clock, |
318 | .steal_clock = native_steal_clock, | ||
310 | }; | 319 | }; |
311 | 320 | ||
312 | struct pv_irq_ops pv_irq_ops = { | 321 | struct pv_irq_ops pv_irq_ops = { |
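The paravirt.c change above follows the usual pv-ops pattern: pv_time_ops ships with a native default, native_steal_clock(), that simply reports zero, and kvm_guest_init() swaps in kvm_steal_clock only when the host advertises KVM_FEATURE_STEAL_TIME. A hypothetical, self-contained sketch of that function-pointer-table pattern (invented names, plain C, not the real paravirt interface):

#include <stdint.h>
#include <stdio.h>

struct pv_time_ops_sketch {
    uint64_t (*steal_clock)(int cpu);
};

static uint64_t native_steal_clock_sketch(int cpu)
{
    (void)cpu;
    return 0;           /* bare metal: no time is ever stolen */
}

static uint64_t guest_steal_clock_sketch(int cpu)
{
    (void)cpu;
    return 1000000;     /* stand-in for reading the per-cpu steal_time area */
}

static struct pv_time_ops_sketch pv_time_ops_sketch = {
    .steal_clock = native_steal_clock_sketch,
};

int main(void)
{
    int feature_detected = 1;   /* pretend KVM_FEATURE_STEAL_TIME is set */

    if (feature_detected)
        pv_time_ops_sketch.steal_clock = guest_steal_clock_sketch;

    printf("steal on cpu 0: %llu ns\n",
           (unsigned long long)pv_time_ops_sketch.steal_clock(0));
    return 0;
}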
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 65cf8233d25c..988724b236b6 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig | |||
@@ -31,6 +31,7 @@ config KVM | |||
31 | select KVM_ASYNC_PF | 31 | select KVM_ASYNC_PF |
32 | select USER_RETURN_NOTIFIER | 32 | select USER_RETURN_NOTIFIER |
33 | select KVM_MMIO | 33 | select KVM_MMIO |
34 | select TASK_DELAY_ACCT | ||
34 | ---help--- | 35 | ---help--- |
35 | Support hosting fully virtualized guest machines using hardware | 36 | Support hosting fully virtualized guest machines using hardware |
36 | virtualization extensions. You will need a fairly recent | 37 | virtualization extensions. You will need a fairly recent |
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index adc98675cda0..6f08bc940fa8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c | |||
@@ -407,76 +407,59 @@ struct gprefix { | |||
407 | } \ | 407 | } \ |
408 | } while (0) | 408 | } while (0) |
409 | 409 | ||
410 | /* Fetch next part of the instruction being emulated. */ | ||
411 | #define insn_fetch(_type, _size, _eip) \ | ||
412 | ({ unsigned long _x; \ | ||
413 | rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ | ||
414 | if (rc != X86EMUL_CONTINUE) \ | ||
415 | goto done; \ | ||
416 | (_eip) += (_size); \ | ||
417 | (_type)_x; \ | ||
418 | }) | ||
419 | |||
420 | #define insn_fetch_arr(_arr, _size, _eip) \ | ||
421 | ({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ | ||
422 | if (rc != X86EMUL_CONTINUE) \ | ||
423 | goto done; \ | ||
424 | (_eip) += (_size); \ | ||
425 | }) | ||
426 | |||
427 | static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, | 410 | static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, |
428 | enum x86_intercept intercept, | 411 | enum x86_intercept intercept, |
429 | enum x86_intercept_stage stage) | 412 | enum x86_intercept_stage stage) |
430 | { | 413 | { |
431 | struct x86_instruction_info info = { | 414 | struct x86_instruction_info info = { |
432 | .intercept = intercept, | 415 | .intercept = intercept, |
433 | .rep_prefix = ctxt->decode.rep_prefix, | 416 | .rep_prefix = ctxt->rep_prefix, |
434 | .modrm_mod = ctxt->decode.modrm_mod, | 417 | .modrm_mod = ctxt->modrm_mod, |
435 | .modrm_reg = ctxt->decode.modrm_reg, | 418 | .modrm_reg = ctxt->modrm_reg, |
436 | .modrm_rm = ctxt->decode.modrm_rm, | 419 | .modrm_rm = ctxt->modrm_rm, |
437 | .src_val = ctxt->decode.src.val64, | 420 | .src_val = ctxt->src.val64, |
438 | .src_bytes = ctxt->decode.src.bytes, | 421 | .src_bytes = ctxt->src.bytes, |
439 | .dst_bytes = ctxt->decode.dst.bytes, | 422 | .dst_bytes = ctxt->dst.bytes, |
440 | .ad_bytes = ctxt->decode.ad_bytes, | 423 | .ad_bytes = ctxt->ad_bytes, |
441 | .next_rip = ctxt->eip, | 424 | .next_rip = ctxt->eip, |
442 | }; | 425 | }; |
443 | 426 | ||
444 | return ctxt->ops->intercept(ctxt, &info, stage); | 427 | return ctxt->ops->intercept(ctxt, &info, stage); |
445 | } | 428 | } |
446 | 429 | ||
447 | static inline unsigned long ad_mask(struct decode_cache *c) | 430 | static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) |
448 | { | 431 | { |
449 | return (1UL << (c->ad_bytes << 3)) - 1; | 432 | return (1UL << (ctxt->ad_bytes << 3)) - 1; |
450 | } | 433 | } |
451 | 434 | ||
452 | /* Access/update address held in a register, based on addressing mode. */ | 435 | /* Access/update address held in a register, based on addressing mode. */ |
453 | static inline unsigned long | 436 | static inline unsigned long |
454 | address_mask(struct decode_cache *c, unsigned long reg) | 437 | address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) |
455 | { | 438 | { |
456 | if (c->ad_bytes == sizeof(unsigned long)) | 439 | if (ctxt->ad_bytes == sizeof(unsigned long)) |
457 | return reg; | 440 | return reg; |
458 | else | 441 | else |
459 | return reg & ad_mask(c); | 442 | return reg & ad_mask(ctxt); |
460 | } | 443 | } |
461 | 444 | ||
462 | static inline unsigned long | 445 | static inline unsigned long |
463 | register_address(struct decode_cache *c, unsigned long reg) | 446 | register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg) |
464 | { | 447 | { |
465 | return address_mask(c, reg); | 448 | return address_mask(ctxt, reg); |
466 | } | 449 | } |
467 | 450 | ||
468 | static inline void | 451 | static inline void |
469 | register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) | 452 | register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc) |
470 | { | 453 | { |
471 | if (c->ad_bytes == sizeof(unsigned long)) | 454 | if (ctxt->ad_bytes == sizeof(unsigned long)) |
472 | *reg += inc; | 455 | *reg += inc; |
473 | else | 456 | else |
474 | *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c)); | 457 | *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt)); |
475 | } | 458 | } |
476 | 459 | ||
477 | static inline void jmp_rel(struct decode_cache *c, int rel) | 460 | static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) |
478 | { | 461 | { |
479 | register_address_increment(c, &c->eip, rel); | 462 | register_address_increment(ctxt, &ctxt->_eip, rel); |
480 | } | 463 | } |
481 | 464 | ||
482 | static u32 desc_limit_scaled(struct desc_struct *desc) | 465 | static u32 desc_limit_scaled(struct desc_struct *desc) |
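The emulate.c hunks in this file all apply one refactoring: fields that used to live in the nested struct decode_cache (reached as ctxt->decode.xxx) are folded into struct x86_emulate_ctxt itself, and helpers drop their separate ops argument because the context already carries ctxt->ops. A hypothetical sketch of the resulting shape (invented names, not the real emulator types):

#include <stdio.h>

struct emu_ops_sketch {
    int (*read_byte)(unsigned long addr);
};

struct emu_ctxt_sketch {
    struct emu_ops_sketch *ops;
    /* formerly: struct decode_cache { int ad_bytes; ... } decode; */
    int ad_bytes;
    unsigned long eip;
};

static int fake_read_byte(unsigned long addr)
{
    return (int)(addr & 0xff);  /* stand-in for a real memory fetch */
}

/* After the refactoring only the context is passed around. */
static int fetch_byte(struct emu_ctxt_sketch *ctxt)
{
    return ctxt->ops->read_byte(ctxt->eip++);
}

int main(void)
{
    struct emu_ops_sketch ops = { .read_byte = fake_read_byte };
    struct emu_ctxt_sketch ctxt = { .ops = &ops, .ad_bytes = 4, .eip = 0x100 };

    printf("fetched: %#x\n", fetch_byte(&ctxt));
    return 0;
}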
@@ -486,28 +469,26 @@ static u32 desc_limit_scaled(struct desc_struct *desc) | |||
486 | return desc->g ? (limit << 12) | 0xfff : limit; | 469 | return desc->g ? (limit << 12) | 0xfff : limit; |
487 | } | 470 | } |
488 | 471 | ||
489 | static void set_seg_override(struct decode_cache *c, int seg) | 472 | static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg) |
490 | { | 473 | { |
491 | c->has_seg_override = true; | 474 | ctxt->has_seg_override = true; |
492 | c->seg_override = seg; | 475 | ctxt->seg_override = seg; |
493 | } | 476 | } |
494 | 477 | ||
495 | static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, | 478 | static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) |
496 | struct x86_emulate_ops *ops, int seg) | ||
497 | { | 479 | { |
498 | if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) | 480 | if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) |
499 | return 0; | 481 | return 0; |
500 | 482 | ||
501 | return ops->get_cached_segment_base(ctxt, seg); | 483 | return ctxt->ops->get_cached_segment_base(ctxt, seg); |
502 | } | 484 | } |
503 | 485 | ||
504 | static unsigned seg_override(struct x86_emulate_ctxt *ctxt, | 486 | static unsigned seg_override(struct x86_emulate_ctxt *ctxt) |
505 | struct decode_cache *c) | ||
506 | { | 487 | { |
507 | if (!c->has_seg_override) | 488 | if (!ctxt->has_seg_override) |
508 | return 0; | 489 | return 0; |
509 | 490 | ||
510 | return c->seg_override; | 491 | return ctxt->seg_override; |
511 | } | 492 | } |
512 | 493 | ||
513 | static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, | 494 | static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, |
@@ -579,7 +560,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, | |||
579 | unsigned size, bool write, bool fetch, | 560 | unsigned size, bool write, bool fetch, |
580 | ulong *linear) | 561 | ulong *linear) |
581 | { | 562 | { |
582 | struct decode_cache *c = &ctxt->decode; | ||
583 | struct desc_struct desc; | 563 | struct desc_struct desc; |
584 | bool usable; | 564 | bool usable; |
585 | ulong la; | 565 | ulong la; |
@@ -587,7 +567,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, | |||
587 | u16 sel; | 567 | u16 sel; |
588 | unsigned cpl, rpl; | 568 | unsigned cpl, rpl; |
589 | 569 | ||
590 | la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; | 570 | la = seg_base(ctxt, addr.seg) + addr.ea; |
591 | switch (ctxt->mode) { | 571 | switch (ctxt->mode) { |
592 | case X86EMUL_MODE_REAL: | 572 | case X86EMUL_MODE_REAL: |
593 | break; | 573 | break; |
@@ -637,7 +617,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, | |||
637 | } | 617 | } |
638 | break; | 618 | break; |
639 | } | 619 | } |
640 | if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8) | 620 | if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8) |
641 | la &= (u32)-1; | 621 | la &= (u32)-1; |
642 | *linear = la; | 622 | *linear = la; |
643 | return X86EMUL_CONTINUE; | 623 | return X86EMUL_CONTINUE; |
@@ -671,11 +651,10 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, | |||
671 | return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); | 651 | return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); |
672 | } | 652 | } |
673 | 653 | ||
674 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | 654 | static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, |
675 | struct x86_emulate_ops *ops, | ||
676 | unsigned long eip, u8 *dest) | 655 | unsigned long eip, u8 *dest) |
677 | { | 656 | { |
678 | struct fetch_cache *fc = &ctxt->decode.fetch; | 657 | struct fetch_cache *fc = &ctxt->fetch; |
679 | int rc; | 658 | int rc; |
680 | int size, cur_size; | 659 | int size, cur_size; |
681 | 660 | ||
@@ -687,8 +666,8 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | |||
687 | rc = __linearize(ctxt, addr, size, false, true, &linear); | 666 | rc = __linearize(ctxt, addr, size, false, true, &linear); |
688 | if (rc != X86EMUL_CONTINUE) | 667 | if (rc != X86EMUL_CONTINUE) |
689 | return rc; | 668 | return rc; |
690 | rc = ops->fetch(ctxt, linear, fc->data + cur_size, | 669 | rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, |
691 | size, &ctxt->exception); | 670 | size, &ctxt->exception); |
692 | if (rc != X86EMUL_CONTINUE) | 671 | if (rc != X86EMUL_CONTINUE) |
693 | return rc; | 672 | return rc; |
694 | fc->end += size; | 673 | fc->end += size; |
@@ -698,7 +677,6 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | |||
698 | } | 677 | } |
699 | 678 | ||
700 | static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, | 679 | static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, |
701 | struct x86_emulate_ops *ops, | ||
702 | unsigned long eip, void *dest, unsigned size) | 680 | unsigned long eip, void *dest, unsigned size) |
703 | { | 681 | { |
704 | int rc; | 682 | int rc; |
@@ -707,13 +685,30 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, | |||
707 | if (eip + size - ctxt->eip > 15) | 685 | if (eip + size - ctxt->eip > 15) |
708 | return X86EMUL_UNHANDLEABLE; | 686 | return X86EMUL_UNHANDLEABLE; |
709 | while (size--) { | 687 | while (size--) { |
710 | rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); | 688 | rc = do_insn_fetch_byte(ctxt, eip++, dest++); |
711 | if (rc != X86EMUL_CONTINUE) | 689 | if (rc != X86EMUL_CONTINUE) |
712 | return rc; | 690 | return rc; |
713 | } | 691 | } |
714 | return X86EMUL_CONTINUE; | 692 | return X86EMUL_CONTINUE; |
715 | } | 693 | } |
716 | 694 | ||
695 | /* Fetch next part of the instruction being emulated. */ | ||
696 | #define insn_fetch(_type, _size, _eip) \ | ||
697 | ({ unsigned long _x; \ | ||
698 | rc = do_insn_fetch(ctxt, (_eip), &_x, (_size)); \ | ||
699 | if (rc != X86EMUL_CONTINUE) \ | ||
700 | goto done; \ | ||
701 | (_eip) += (_size); \ | ||
702 | (_type)_x; \ | ||
703 | }) | ||
704 | |||
705 | #define insn_fetch_arr(_arr, _size, _eip) \ | ||
706 | ({ rc = do_insn_fetch(ctxt, (_eip), _arr, (_size)); \ | ||
707 | if (rc != X86EMUL_CONTINUE) \ | ||
708 | goto done; \ | ||
709 | (_eip) += (_size); \ | ||
710 | }) | ||
711 | |||
717 | /* | 712 | /* |
718 | * Given the 'reg' portion of a ModRM byte, and a register block, return a | 713 | * Given the 'reg' portion of a ModRM byte, and a register block, return a |
719 | * pointer into the block that addresses the relevant register. | 714 | * pointer into the block that addresses the relevant register. |
@@ -857,16 +852,15 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, | |||
857 | 852 | ||
858 | static void decode_register_operand(struct x86_emulate_ctxt *ctxt, | 853 | static void decode_register_operand(struct x86_emulate_ctxt *ctxt, |
859 | struct operand *op, | 854 | struct operand *op, |
860 | struct decode_cache *c, | ||
861 | int inhibit_bytereg) | 855 | int inhibit_bytereg) |
862 | { | 856 | { |
863 | unsigned reg = c->modrm_reg; | 857 | unsigned reg = ctxt->modrm_reg; |
864 | int highbyte_regs = c->rex_prefix == 0; | 858 | int highbyte_regs = ctxt->rex_prefix == 0; |
865 | 859 | ||
866 | if (!(c->d & ModRM)) | 860 | if (!(ctxt->d & ModRM)) |
867 | reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); | 861 | reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); |
868 | 862 | ||
869 | if (c->d & Sse) { | 863 | if (ctxt->d & Sse) { |
870 | op->type = OP_XMM; | 864 | op->type = OP_XMM; |
871 | op->bytes = 16; | 865 | op->bytes = 16; |
872 | op->addr.xmm = reg; | 866 | op->addr.xmm = reg; |
@@ -875,49 +869,47 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, | |||
875 | } | 869 | } |
876 | 870 | ||
877 | op->type = OP_REG; | 871 | op->type = OP_REG; |
878 | if ((c->d & ByteOp) && !inhibit_bytereg) { | 872 | if ((ctxt->d & ByteOp) && !inhibit_bytereg) { |
879 | op->addr.reg = decode_register(reg, c->regs, highbyte_regs); | 873 | op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs); |
880 | op->bytes = 1; | 874 | op->bytes = 1; |
881 | } else { | 875 | } else { |
882 | op->addr.reg = decode_register(reg, c->regs, 0); | 876 | op->addr.reg = decode_register(reg, ctxt->regs, 0); |
883 | op->bytes = c->op_bytes; | 877 | op->bytes = ctxt->op_bytes; |
884 | } | 878 | } |
885 | fetch_register_operand(op); | 879 | fetch_register_operand(op); |
886 | op->orig_val = op->val; | 880 | op->orig_val = op->val; |
887 | } | 881 | } |
888 | 882 | ||
889 | static int decode_modrm(struct x86_emulate_ctxt *ctxt, | 883 | static int decode_modrm(struct x86_emulate_ctxt *ctxt, |
890 | struct x86_emulate_ops *ops, | ||
891 | struct operand *op) | 884 | struct operand *op) |
892 | { | 885 | { |
893 | struct decode_cache *c = &ctxt->decode; | ||
894 | u8 sib; | 886 | u8 sib; |
895 | int index_reg = 0, base_reg = 0, scale; | 887 | int index_reg = 0, base_reg = 0, scale; |
896 | int rc = X86EMUL_CONTINUE; | 888 | int rc = X86EMUL_CONTINUE; |
897 | ulong modrm_ea = 0; | 889 | ulong modrm_ea = 0; |
898 | 890 | ||
899 | if (c->rex_prefix) { | 891 | if (ctxt->rex_prefix) { |
900 | c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ | 892 | ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */ |
901 | index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ | 893 | index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */ |
902 | c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ | 894 | ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ |
903 | } | 895 | } |
904 | 896 | ||
905 | c->modrm = insn_fetch(u8, 1, c->eip); | 897 | ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); |
906 | c->modrm_mod |= (c->modrm & 0xc0) >> 6; | 898 | ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; |
907 | c->modrm_reg |= (c->modrm & 0x38) >> 3; | 899 | ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; |
908 | c->modrm_rm |= (c->modrm & 0x07); | 900 | ctxt->modrm_rm |= (ctxt->modrm & 0x07); |
909 | c->modrm_seg = VCPU_SREG_DS; | 901 | ctxt->modrm_seg = VCPU_SREG_DS; |
910 | 902 | ||
911 | if (c->modrm_mod == 3) { | 903 | if (ctxt->modrm_mod == 3) { |
912 | op->type = OP_REG; | 904 | op->type = OP_REG; |
913 | op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 905 | op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
914 | op->addr.reg = decode_register(c->modrm_rm, | 906 | op->addr.reg = decode_register(ctxt->modrm_rm, |
915 | c->regs, c->d & ByteOp); | 907 | ctxt->regs, ctxt->d & ByteOp); |
916 | if (c->d & Sse) { | 908 | if (ctxt->d & Sse) { |
917 | op->type = OP_XMM; | 909 | op->type = OP_XMM; |
918 | op->bytes = 16; | 910 | op->bytes = 16; |
919 | op->addr.xmm = c->modrm_rm; | 911 | op->addr.xmm = ctxt->modrm_rm; |
920 | read_sse_reg(ctxt, &op->vec_val, c->modrm_rm); | 912 | read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); |
921 | return rc; | 913 | return rc; |
922 | } | 914 | } |
923 | fetch_register_operand(op); | 915 | fetch_register_operand(op); |
@@ -926,26 +918,26 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
926 | 918 | ||
927 | op->type = OP_MEM; | 919 | op->type = OP_MEM; |
928 | 920 | ||
929 | if (c->ad_bytes == 2) { | 921 | if (ctxt->ad_bytes == 2) { |
930 | unsigned bx = c->regs[VCPU_REGS_RBX]; | 922 | unsigned bx = ctxt->regs[VCPU_REGS_RBX]; |
931 | unsigned bp = c->regs[VCPU_REGS_RBP]; | 923 | unsigned bp = ctxt->regs[VCPU_REGS_RBP]; |
932 | unsigned si = c->regs[VCPU_REGS_RSI]; | 924 | unsigned si = ctxt->regs[VCPU_REGS_RSI]; |
933 | unsigned di = c->regs[VCPU_REGS_RDI]; | 925 | unsigned di = ctxt->regs[VCPU_REGS_RDI]; |
934 | 926 | ||
935 | /* 16-bit ModR/M decode. */ | 927 | /* 16-bit ModR/M decode. */ |
936 | switch (c->modrm_mod) { | 928 | switch (ctxt->modrm_mod) { |
937 | case 0: | 929 | case 0: |
938 | if (c->modrm_rm == 6) | 930 | if (ctxt->modrm_rm == 6) |
939 | modrm_ea += insn_fetch(u16, 2, c->eip); | 931 | modrm_ea += insn_fetch(u16, 2, ctxt->_eip); |
940 | break; | 932 | break; |
941 | case 1: | 933 | case 1: |
942 | modrm_ea += insn_fetch(s8, 1, c->eip); | 934 | modrm_ea += insn_fetch(s8, 1, ctxt->_eip); |
943 | break; | 935 | break; |
944 | case 2: | 936 | case 2: |
945 | modrm_ea += insn_fetch(u16, 2, c->eip); | 937 | modrm_ea += insn_fetch(u16, 2, ctxt->_eip); |
946 | break; | 938 | break; |
947 | } | 939 | } |
948 | switch (c->modrm_rm) { | 940 | switch (ctxt->modrm_rm) { |
949 | case 0: | 941 | case 0: |
950 | modrm_ea += bx + si; | 942 | modrm_ea += bx + si; |
951 | break; | 943 | break; |
@@ -965,46 +957,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
965 | modrm_ea += di; | 957 | modrm_ea += di; |
966 | break; | 958 | break; |
967 | case 6: | 959 | case 6: |
968 | if (c->modrm_mod != 0) | 960 | if (ctxt->modrm_mod != 0) |
969 | modrm_ea += bp; | 961 | modrm_ea += bp; |
970 | break; | 962 | break; |
971 | case 7: | 963 | case 7: |
972 | modrm_ea += bx; | 964 | modrm_ea += bx; |
973 | break; | 965 | break; |
974 | } | 966 | } |
975 | if (c->modrm_rm == 2 || c->modrm_rm == 3 || | 967 | if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 || |
976 | (c->modrm_rm == 6 && c->modrm_mod != 0)) | 968 | (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0)) |
977 | c->modrm_seg = VCPU_SREG_SS; | 969 | ctxt->modrm_seg = VCPU_SREG_SS; |
978 | modrm_ea = (u16)modrm_ea; | 970 | modrm_ea = (u16)modrm_ea; |
979 | } else { | 971 | } else { |
980 | /* 32/64-bit ModR/M decode. */ | 972 | /* 32/64-bit ModR/M decode. */ |
981 | if ((c->modrm_rm & 7) == 4) { | 973 | if ((ctxt->modrm_rm & 7) == 4) { |
982 | sib = insn_fetch(u8, 1, c->eip); | 974 | sib = insn_fetch(u8, 1, ctxt->_eip); |
983 | index_reg |= (sib >> 3) & 7; | 975 | index_reg |= (sib >> 3) & 7; |
984 | base_reg |= sib & 7; | 976 | base_reg |= sib & 7; |
985 | scale = sib >> 6; | 977 | scale = sib >> 6; |
986 | 978 | ||
987 | if ((base_reg & 7) == 5 && c->modrm_mod == 0) | 979 | if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) |
988 | modrm_ea += insn_fetch(s32, 4, c->eip); | 980 | modrm_ea += insn_fetch(s32, 4, ctxt->_eip); |
989 | else | 981 | else |
990 | modrm_ea += c->regs[base_reg]; | 982 | modrm_ea += ctxt->regs[base_reg]; |
991 | if (index_reg != 4) | 983 | if (index_reg != 4) |
992 | modrm_ea += c->regs[index_reg] << scale; | 984 | modrm_ea += ctxt->regs[index_reg] << scale; |
993 | } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { | 985 | } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { |
994 | if (ctxt->mode == X86EMUL_MODE_PROT64) | 986 | if (ctxt->mode == X86EMUL_MODE_PROT64) |
995 | c->rip_relative = 1; | 987 | ctxt->rip_relative = 1; |
996 | } else | 988 | } else |
997 | modrm_ea += c->regs[c->modrm_rm]; | 989 | modrm_ea += ctxt->regs[ctxt->modrm_rm]; |
998 | switch (c->modrm_mod) { | 990 | switch (ctxt->modrm_mod) { |
999 | case 0: | 991 | case 0: |
1000 | if (c->modrm_rm == 5) | 992 | if (ctxt->modrm_rm == 5) |
1001 | modrm_ea += insn_fetch(s32, 4, c->eip); | 993 | modrm_ea += insn_fetch(s32, 4, ctxt->_eip); |
1002 | break; | 994 | break; |
1003 | case 1: | 995 | case 1: |
1004 | modrm_ea += insn_fetch(s8, 1, c->eip); | 996 | modrm_ea += insn_fetch(s8, 1, ctxt->_eip); |
1005 | break; | 997 | break; |
1006 | case 2: | 998 | case 2: |
1007 | modrm_ea += insn_fetch(s32, 4, c->eip); | 999 | modrm_ea += insn_fetch(s32, 4, ctxt->_eip); |
1008 | break; | 1000 | break; |
1009 | } | 1001 | } |
1010 | } | 1002 | } |
@@ -1014,53 +1006,50 @@ done: | |||
1014 | } | 1006 | } |
1015 | 1007 | ||
1016 | static int decode_abs(struct x86_emulate_ctxt *ctxt, | 1008 | static int decode_abs(struct x86_emulate_ctxt *ctxt, |
1017 | struct x86_emulate_ops *ops, | ||
1018 | struct operand *op) | 1009 | struct operand *op) |
1019 | { | 1010 | { |
1020 | struct decode_cache *c = &ctxt->decode; | ||
1021 | int rc = X86EMUL_CONTINUE; | 1011 | int rc = X86EMUL_CONTINUE; |
1022 | 1012 | ||
1023 | op->type = OP_MEM; | 1013 | op->type = OP_MEM; |
1024 | switch (c->ad_bytes) { | 1014 | switch (ctxt->ad_bytes) { |
1025 | case 2: | 1015 | case 2: |
1026 | op->addr.mem.ea = insn_fetch(u16, 2, c->eip); | 1016 | op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip); |
1027 | break; | 1017 | break; |
1028 | case 4: | 1018 | case 4: |
1029 | op->addr.mem.ea = insn_fetch(u32, 4, c->eip); | 1019 | op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip); |
1030 | break; | 1020 | break; |
1031 | case 8: | 1021 | case 8: |
1032 | op->addr.mem.ea = insn_fetch(u64, 8, c->eip); | 1022 | op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip); |
1033 | break; | 1023 | break; |
1034 | } | 1024 | } |
1035 | done: | 1025 | done: |
1036 | return rc; | 1026 | return rc; |
1037 | } | 1027 | } |
1038 | 1028 | ||
1039 | static void fetch_bit_operand(struct decode_cache *c) | 1029 | static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt) |
1040 | { | 1030 | { |
1041 | long sv = 0, mask; | 1031 | long sv = 0, mask; |
1042 | 1032 | ||
1043 | if (c->dst.type == OP_MEM && c->src.type == OP_REG) { | 1033 | if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) { |
1044 | mask = ~(c->dst.bytes * 8 - 1); | 1034 | mask = ~(ctxt->dst.bytes * 8 - 1); |
1045 | 1035 | ||
1046 | if (c->src.bytes == 2) | 1036 | if (ctxt->src.bytes == 2) |
1047 | sv = (s16)c->src.val & (s16)mask; | 1037 | sv = (s16)ctxt->src.val & (s16)mask; |
1048 | else if (c->src.bytes == 4) | 1038 | else if (ctxt->src.bytes == 4) |
1049 | sv = (s32)c->src.val & (s32)mask; | 1039 | sv = (s32)ctxt->src.val & (s32)mask; |
1050 | 1040 | ||
1051 | c->dst.addr.mem.ea += (sv >> 3); | 1041 | ctxt->dst.addr.mem.ea += (sv >> 3); |
1052 | } | 1042 | } |
1053 | 1043 | ||
1054 | /* only subword offset */ | 1044 | /* only subword offset */ |
1055 | c->src.val &= (c->dst.bytes << 3) - 1; | 1045 | ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; |
1056 | } | 1046 | } |
1057 | 1047 | ||
1058 | static int read_emulated(struct x86_emulate_ctxt *ctxt, | 1048 | static int read_emulated(struct x86_emulate_ctxt *ctxt, |
1059 | struct x86_emulate_ops *ops, | ||
1060 | unsigned long addr, void *dest, unsigned size) | 1049 | unsigned long addr, void *dest, unsigned size) |
1061 | { | 1050 | { |
1062 | int rc; | 1051 | int rc; |
1063 | struct read_cache *mc = &ctxt->decode.mem_read; | 1052 | struct read_cache *mc = &ctxt->mem_read; |
1064 | 1053 | ||
1065 | while (size) { | 1054 | while (size) { |
1066 | int n = min(size, 8u); | 1055 | int n = min(size, 8u); |
@@ -1068,8 +1057,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, | |||
1068 | if (mc->pos < mc->end) | 1057 | if (mc->pos < mc->end) |
1069 | goto read_cached; | 1058 | goto read_cached; |
1070 | 1059 | ||
1071 | rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n, | 1060 | rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n, |
1072 | &ctxt->exception); | 1061 | &ctxt->exception); |
1073 | if (rc != X86EMUL_CONTINUE) | 1062 | if (rc != X86EMUL_CONTINUE) |
1074 | return rc; | 1063 | return rc; |
1075 | mc->end += n; | 1064 | mc->end += n; |
@@ -1094,7 +1083,7 @@ static int segmented_read(struct x86_emulate_ctxt *ctxt, | |||
1094 | rc = linearize(ctxt, addr, size, false, &linear); | 1083 | rc = linearize(ctxt, addr, size, false, &linear); |
1095 | if (rc != X86EMUL_CONTINUE) | 1084 | if (rc != X86EMUL_CONTINUE) |
1096 | return rc; | 1085 | return rc; |
1097 | return read_emulated(ctxt, ctxt->ops, linear, data, size); | 1086 | return read_emulated(ctxt, linear, data, size); |
1098 | } | 1087 | } |
1099 | 1088 | ||
1100 | static int segmented_write(struct x86_emulate_ctxt *ctxt, | 1089 | static int segmented_write(struct x86_emulate_ctxt *ctxt, |
@@ -1128,26 +1117,24 @@ static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, | |||
1128 | } | 1117 | } |
1129 | 1118 | ||
1130 | static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, | 1119 | static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, |
1131 | struct x86_emulate_ops *ops, | ||
1132 | unsigned int size, unsigned short port, | 1120 | unsigned int size, unsigned short port, |
1133 | void *dest) | 1121 | void *dest) |
1134 | { | 1122 | { |
1135 | struct read_cache *rc = &ctxt->decode.io_read; | 1123 | struct read_cache *rc = &ctxt->io_read; |
1136 | 1124 | ||
1137 | if (rc->pos == rc->end) { /* refill pio read ahead */ | 1125 | if (rc->pos == rc->end) { /* refill pio read ahead */ |
1138 | struct decode_cache *c = &ctxt->decode; | ||
1139 | unsigned int in_page, n; | 1126 | unsigned int in_page, n; |
1140 | unsigned int count = c->rep_prefix ? | 1127 | unsigned int count = ctxt->rep_prefix ? |
1141 | address_mask(c, c->regs[VCPU_REGS_RCX]) : 1; | 1128 | address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1; |
1142 | in_page = (ctxt->eflags & EFLG_DF) ? | 1129 | in_page = (ctxt->eflags & EFLG_DF) ? |
1143 | offset_in_page(c->regs[VCPU_REGS_RDI]) : | 1130 | offset_in_page(ctxt->regs[VCPU_REGS_RDI]) : |
1144 | PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]); | 1131 | PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]); |
1145 | n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, | 1132 | n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, |
1146 | count); | 1133 | count); |
1147 | if (n == 0) | 1134 | if (n == 0) |
1148 | n = 1; | 1135 | n = 1; |
1149 | rc->pos = rc->end = 0; | 1136 | rc->pos = rc->end = 0; |
1150 | if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n)) | 1137 | if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n)) |
1151 | return 0; | 1138 | return 0; |
1152 | rc->end = n * size; | 1139 | rc->end = n * size; |
1153 | } | 1140 | } |
@@ -1158,9 +1145,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, | |||
1158 | } | 1145 | } |
1159 | 1146 | ||
1160 | static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, | 1147 | static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, |
1161 | struct x86_emulate_ops *ops, | ||
1162 | u16 selector, struct desc_ptr *dt) | 1148 | u16 selector, struct desc_ptr *dt) |
1163 | { | 1149 | { |
1150 | struct x86_emulate_ops *ops = ctxt->ops; | ||
1151 | |||
1164 | if (selector & 1 << 2) { | 1152 | if (selector & 1 << 2) { |
1165 | struct desc_struct desc; | 1153 | struct desc_struct desc; |
1166 | u16 sel; | 1154 | u16 sel; |
@@ -1177,48 +1165,42 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, | |||
1177 | 1165 | ||
1178 | /* allowed just for 8 bytes segments */ | 1166 | /* allowed just for 8 bytes segments */ |
1179 | static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | 1167 | static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, |
1180 | struct x86_emulate_ops *ops, | ||
1181 | u16 selector, struct desc_struct *desc) | 1168 | u16 selector, struct desc_struct *desc) |
1182 | { | 1169 | { |
1183 | struct desc_ptr dt; | 1170 | struct desc_ptr dt; |
1184 | u16 index = selector >> 3; | 1171 | u16 index = selector >> 3; |
1185 | int ret; | ||
1186 | ulong addr; | 1172 | ulong addr; |
1187 | 1173 | ||
1188 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); | 1174 | get_descriptor_table_ptr(ctxt, selector, &dt); |
1189 | 1175 | ||
1190 | if (dt.size < index * 8 + 7) | 1176 | if (dt.size < index * 8 + 7) |
1191 | return emulate_gp(ctxt, selector & 0xfffc); | 1177 | return emulate_gp(ctxt, selector & 0xfffc); |
1192 | addr = dt.address + index * 8; | ||
1193 | ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); | ||
1194 | 1178 | ||
1195 | return ret; | 1179 | addr = dt.address + index * 8; |
1180 | return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, | ||
1181 | &ctxt->exception); | ||
1196 | } | 1182 | } |
1197 | 1183 | ||
1198 | /* allowed just for 8 bytes segments */ | 1184 | /* allowed just for 8 bytes segments */ |
1199 | static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, | 1185 | static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, |
1200 | struct x86_emulate_ops *ops, | ||
1201 | u16 selector, struct desc_struct *desc) | 1186 | u16 selector, struct desc_struct *desc) |
1202 | { | 1187 | { |
1203 | struct desc_ptr dt; | 1188 | struct desc_ptr dt; |
1204 | u16 index = selector >> 3; | 1189 | u16 index = selector >> 3; |
1205 | ulong addr; | 1190 | ulong addr; |
1206 | int ret; | ||
1207 | 1191 | ||
1208 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); | 1192 | get_descriptor_table_ptr(ctxt, selector, &dt); |
1209 | 1193 | ||
1210 | if (dt.size < index * 8 + 7) | 1194 | if (dt.size < index * 8 + 7) |
1211 | return emulate_gp(ctxt, selector & 0xfffc); | 1195 | return emulate_gp(ctxt, selector & 0xfffc); |
1212 | 1196 | ||
1213 | addr = dt.address + index * 8; | 1197 | addr = dt.address + index * 8; |
1214 | ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); | 1198 | return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc, |
1215 | 1199 | &ctxt->exception); | |
1216 | return ret; | ||
1217 | } | 1200 | } |
1218 | 1201 | ||
1219 | /* Does not support long mode */ | 1202 | /* Does not support long mode */ |
1220 | static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | 1203 | static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, |
1221 | struct x86_emulate_ops *ops, | ||
1222 | u16 selector, int seg) | 1204 | u16 selector, int seg) |
1223 | { | 1205 | { |
1224 | struct desc_struct seg_desc; | 1206 | struct desc_struct seg_desc; |
@@ -1253,7 +1235,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1253 | if (null_selector) /* for NULL selector skip all following checks */ | 1235 | if (null_selector) /* for NULL selector skip all following checks */ |
1254 | goto load; | 1236 | goto load; |
1255 | 1237 | ||
1256 | ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc); | 1238 | ret = read_segment_descriptor(ctxt, selector, &seg_desc); |
1257 | if (ret != X86EMUL_CONTINUE) | 1239 | if (ret != X86EMUL_CONTINUE) |
1258 | return ret; | 1240 | return ret; |
1259 | 1241 | ||
@@ -1271,7 +1253,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1271 | 1253 | ||
1272 | rpl = selector & 3; | 1254 | rpl = selector & 3; |
1273 | dpl = seg_desc.dpl; | 1255 | dpl = seg_desc.dpl; |
1274 | cpl = ops->cpl(ctxt); | 1256 | cpl = ctxt->ops->cpl(ctxt); |
1275 | 1257 | ||
1276 | switch (seg) { | 1258 | switch (seg) { |
1277 | case VCPU_SREG_SS: | 1259 | case VCPU_SREG_SS: |
@@ -1322,12 +1304,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1322 | if (seg_desc.s) { | 1304 | if (seg_desc.s) { |
1323 | /* mark segment as accessed */ | 1305 | /* mark segment as accessed */ |
1324 | seg_desc.type |= 1; | 1306 | seg_desc.type |= 1; |
1325 | ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc); | 1307 | ret = write_segment_descriptor(ctxt, selector, &seg_desc); |
1326 | if (ret != X86EMUL_CONTINUE) | 1308 | if (ret != X86EMUL_CONTINUE) |
1327 | return ret; | 1309 | return ret; |
1328 | } | 1310 | } |
1329 | load: | 1311 | load: |
1330 | ops->set_segment(ctxt, selector, &seg_desc, 0, seg); | 1312 | ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg); |
1331 | return X86EMUL_CONTINUE; | 1313 | return X86EMUL_CONTINUE; |
1332 | exception: | 1314 | exception: |
1333 | emulate_exception(ctxt, err_vec, err_code, true); | 1315 | emulate_exception(ctxt, err_vec, err_code, true); |
@@ -1356,29 +1338,28 @@ static void write_register_operand(struct operand *op) | |||
1356 | static int writeback(struct x86_emulate_ctxt *ctxt) | 1338 | static int writeback(struct x86_emulate_ctxt *ctxt) |
1357 | { | 1339 | { |
1358 | int rc; | 1340 | int rc; |
1359 | struct decode_cache *c = &ctxt->decode; | ||
1360 | 1341 | ||
1361 | switch (c->dst.type) { | 1342 | switch (ctxt->dst.type) { |
1362 | case OP_REG: | 1343 | case OP_REG: |
1363 | write_register_operand(&c->dst); | 1344 | write_register_operand(&ctxt->dst); |
1364 | break; | 1345 | break; |
1365 | case OP_MEM: | 1346 | case OP_MEM: |
1366 | if (c->lock_prefix) | 1347 | if (ctxt->lock_prefix) |
1367 | rc = segmented_cmpxchg(ctxt, | 1348 | rc = segmented_cmpxchg(ctxt, |
1368 | c->dst.addr.mem, | 1349 | ctxt->dst.addr.mem, |
1369 | &c->dst.orig_val, | 1350 | &ctxt->dst.orig_val, |
1370 | &c->dst.val, | 1351 | &ctxt->dst.val, |
1371 | c->dst.bytes); | 1352 | ctxt->dst.bytes); |
1372 | else | 1353 | else |
1373 | rc = segmented_write(ctxt, | 1354 | rc = segmented_write(ctxt, |
1374 | c->dst.addr.mem, | 1355 | ctxt->dst.addr.mem, |
1375 | &c->dst.val, | 1356 | &ctxt->dst.val, |
1376 | c->dst.bytes); | 1357 | ctxt->dst.bytes); |
1377 | if (rc != X86EMUL_CONTINUE) | 1358 | if (rc != X86EMUL_CONTINUE) |
1378 | return rc; | 1359 | return rc; |
1379 | break; | 1360 | break; |
1380 | case OP_XMM: | 1361 | case OP_XMM: |
1381 | write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm); | 1362 | write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); |
1382 | break; | 1363 | break; |
1383 | case OP_NONE: | 1364 | case OP_NONE: |
1384 | /* no writeback */ | 1365 | /* no writeback */ |
@@ -1391,50 +1372,45 @@ static int writeback(struct x86_emulate_ctxt *ctxt) | |||
1391 | 1372 | ||
1392 | static int em_push(struct x86_emulate_ctxt *ctxt) | 1373 | static int em_push(struct x86_emulate_ctxt *ctxt) |
1393 | { | 1374 | { |
1394 | struct decode_cache *c = &ctxt->decode; | ||
1395 | struct segmented_address addr; | 1375 | struct segmented_address addr; |
1396 | 1376 | ||
1397 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); | 1377 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); |
1398 | addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); | 1378 | addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); |
1399 | addr.seg = VCPU_SREG_SS; | 1379 | addr.seg = VCPU_SREG_SS; |
1400 | 1380 | ||
1401 | /* Disable writeback. */ | 1381 | /* Disable writeback. */ |
1402 | c->dst.type = OP_NONE; | 1382 | ctxt->dst.type = OP_NONE; |
1403 | return segmented_write(ctxt, addr, &c->src.val, c->op_bytes); | 1383 | return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); |
1404 | } | 1384 | } |
1405 | 1385 | ||
1406 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, | 1386 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, |
1407 | void *dest, int len) | 1387 | void *dest, int len) |
1408 | { | 1388 | { |
1409 | struct decode_cache *c = &ctxt->decode; | ||
1410 | int rc; | 1389 | int rc; |
1411 | struct segmented_address addr; | 1390 | struct segmented_address addr; |
1412 | 1391 | ||
1413 | addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); | 1392 | addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); |
1414 | addr.seg = VCPU_SREG_SS; | 1393 | addr.seg = VCPU_SREG_SS; |
1415 | rc = segmented_read(ctxt, addr, dest, len); | 1394 | rc = segmented_read(ctxt, addr, dest, len); |
1416 | if (rc != X86EMUL_CONTINUE) | 1395 | if (rc != X86EMUL_CONTINUE) |
1417 | return rc; | 1396 | return rc; |
1418 | 1397 | ||
1419 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); | 1398 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len); |
1420 | return rc; | 1399 | return rc; |
1421 | } | 1400 | } |
1422 | 1401 | ||
1423 | static int em_pop(struct x86_emulate_ctxt *ctxt) | 1402 | static int em_pop(struct x86_emulate_ctxt *ctxt) |
1424 | { | 1403 | { |
1425 | struct decode_cache *c = &ctxt->decode; | 1404 | return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); |
1426 | |||
1427 | return emulate_pop(ctxt, &c->dst.val, c->op_bytes); | ||
1428 | } | 1405 | } |
1429 | 1406 | ||
1430 | static int emulate_popf(struct x86_emulate_ctxt *ctxt, | 1407 | static int emulate_popf(struct x86_emulate_ctxt *ctxt, |
1431 | struct x86_emulate_ops *ops, | 1408 | void *dest, int len) |
1432 | void *dest, int len) | ||
1433 | { | 1409 | { |
1434 | int rc; | 1410 | int rc; |
1435 | unsigned long val, change_mask; | 1411 | unsigned long val, change_mask; |
1436 | int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | 1412 | int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; |
1437 | int cpl = ops->cpl(ctxt); | 1413 | int cpl = ctxt->ops->cpl(ctxt); |
1438 | 1414 | ||
1439 | rc = emulate_pop(ctxt, &val, len); | 1415 | rc = emulate_pop(ctxt, &val, len); |
1440 | if (rc != X86EMUL_CONTINUE) | 1416 | if (rc != X86EMUL_CONTINUE) |
@@ -1470,49 +1446,41 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, | |||
1470 | 1446 | ||
1471 | static int em_popf(struct x86_emulate_ctxt *ctxt) | 1447 | static int em_popf(struct x86_emulate_ctxt *ctxt) |
1472 | { | 1448 | { |
1473 | struct decode_cache *c = &ctxt->decode; | 1449 | ctxt->dst.type = OP_REG; |
1474 | 1450 | ctxt->dst.addr.reg = &ctxt->eflags; | |
1475 | c->dst.type = OP_REG; | 1451 | ctxt->dst.bytes = ctxt->op_bytes; |
1476 | c->dst.addr.reg = &ctxt->eflags; | 1452 | return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); |
1477 | c->dst.bytes = c->op_bytes; | ||
1478 | return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); | ||
1479 | } | 1453 | } |
1480 | 1454 | ||
1481 | static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, | 1455 | static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) |
1482 | struct x86_emulate_ops *ops, int seg) | ||
1483 | { | 1456 | { |
1484 | struct decode_cache *c = &ctxt->decode; | 1457 | ctxt->src.val = get_segment_selector(ctxt, seg); |
1485 | |||
1486 | c->src.val = get_segment_selector(ctxt, seg); | ||
1487 | 1458 | ||
1488 | return em_push(ctxt); | 1459 | return em_push(ctxt); |
1489 | } | 1460 | } |
1490 | 1461 | ||
1491 | static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, | 1462 | static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg) |
1492 | struct x86_emulate_ops *ops, int seg) | ||
1493 | { | 1463 | { |
1494 | struct decode_cache *c = &ctxt->decode; | ||
1495 | unsigned long selector; | 1464 | unsigned long selector; |
1496 | int rc; | 1465 | int rc; |
1497 | 1466 | ||
1498 | rc = emulate_pop(ctxt, &selector, c->op_bytes); | 1467 | rc = emulate_pop(ctxt, &selector, ctxt->op_bytes); |
1499 | if (rc != X86EMUL_CONTINUE) | 1468 | if (rc != X86EMUL_CONTINUE) |
1500 | return rc; | 1469 | return rc; |
1501 | 1470 | ||
1502 | rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg); | 1471 | rc = load_segment_descriptor(ctxt, (u16)selector, seg); |
1503 | return rc; | 1472 | return rc; |
1504 | } | 1473 | } |
1505 | 1474 | ||
1506 | static int em_pusha(struct x86_emulate_ctxt *ctxt) | 1475 | static int em_pusha(struct x86_emulate_ctxt *ctxt) |
1507 | { | 1476 | { |
1508 | struct decode_cache *c = &ctxt->decode; | 1477 | unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP]; |
1509 | unsigned long old_esp = c->regs[VCPU_REGS_RSP]; | ||
1510 | int rc = X86EMUL_CONTINUE; | 1478 | int rc = X86EMUL_CONTINUE; |
1511 | int reg = VCPU_REGS_RAX; | 1479 | int reg = VCPU_REGS_RAX; |
1512 | 1480 | ||
1513 | while (reg <= VCPU_REGS_RDI) { | 1481 | while (reg <= VCPU_REGS_RDI) { |
1514 | (reg == VCPU_REGS_RSP) ? | 1482 | (reg == VCPU_REGS_RSP) ? |
1515 | (c->src.val = old_esp) : (c->src.val = c->regs[reg]); | 1483 | (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]); |
1516 | 1484 | ||
1517 | rc = em_push(ctxt); | 1485 | rc = em_push(ctxt); |
1518 | if (rc != X86EMUL_CONTINUE) | 1486 | if (rc != X86EMUL_CONTINUE) |
@@ -1526,26 +1494,23 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt) | |||
1526 | 1494 | ||
1527 | static int em_pushf(struct x86_emulate_ctxt *ctxt) | 1495 | static int em_pushf(struct x86_emulate_ctxt *ctxt) |
1528 | { | 1496 | { |
1529 | struct decode_cache *c = &ctxt->decode; | 1497 | ctxt->src.val = (unsigned long)ctxt->eflags; |
1530 | |||
1531 | c->src.val = (unsigned long)ctxt->eflags; | ||
1532 | return em_push(ctxt); | 1498 | return em_push(ctxt); |
1533 | } | 1499 | } |
1534 | 1500 | ||
1535 | static int em_popa(struct x86_emulate_ctxt *ctxt) | 1501 | static int em_popa(struct x86_emulate_ctxt *ctxt) |
1536 | { | 1502 | { |
1537 | struct decode_cache *c = &ctxt->decode; | ||
1538 | int rc = X86EMUL_CONTINUE; | 1503 | int rc = X86EMUL_CONTINUE; |
1539 | int reg = VCPU_REGS_RDI; | 1504 | int reg = VCPU_REGS_RDI; |
1540 | 1505 | ||
1541 | while (reg >= VCPU_REGS_RAX) { | 1506 | while (reg >= VCPU_REGS_RAX) { |
1542 | if (reg == VCPU_REGS_RSP) { | 1507 | if (reg == VCPU_REGS_RSP) { |
1543 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], | 1508 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], |
1544 | c->op_bytes); | 1509 | ctxt->op_bytes); |
1545 | --reg; | 1510 | --reg; |
1546 | } | 1511 | } |
1547 | 1512 | ||
1548 | rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes); | 1513 | rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes); |
1549 | if (rc != X86EMUL_CONTINUE) | 1514 | if (rc != X86EMUL_CONTINUE) |
1550 | break; | 1515 | break; |
1551 | --reg; | 1516 | --reg; |
@@ -1553,10 +1518,9 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) | |||
1553 | return rc; | 1518 | return rc; |
1554 | } | 1519 | } |
1555 | 1520 | ||
1556 | int emulate_int_real(struct x86_emulate_ctxt *ctxt, | 1521 | int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) |
1557 | struct x86_emulate_ops *ops, int irq) | ||
1558 | { | 1522 | { |
1559 | struct decode_cache *c = &ctxt->decode; | 1523 | struct x86_emulate_ops *ops = ctxt->ops; |
1560 | int rc; | 1524 | int rc; |
1561 | struct desc_ptr dt; | 1525 | struct desc_ptr dt; |
1562 | gva_t cs_addr; | 1526 | gva_t cs_addr; |
@@ -1564,19 +1528,19 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, | |||
1564 | u16 cs, eip; | 1528 | u16 cs, eip; |
1565 | 1529 | ||
1566 | /* TODO: Add limit checks */ | 1530 | /* TODO: Add limit checks */ |
1567 | c->src.val = ctxt->eflags; | 1531 | ctxt->src.val = ctxt->eflags; |
1568 | rc = em_push(ctxt); | 1532 | rc = em_push(ctxt); |
1569 | if (rc != X86EMUL_CONTINUE) | 1533 | if (rc != X86EMUL_CONTINUE) |
1570 | return rc; | 1534 | return rc; |
1571 | 1535 | ||
1572 | ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); | 1536 | ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); |
1573 | 1537 | ||
1574 | c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); | 1538 | ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); |
1575 | rc = em_push(ctxt); | 1539 | rc = em_push(ctxt); |
1576 | if (rc != X86EMUL_CONTINUE) | 1540 | if (rc != X86EMUL_CONTINUE) |
1577 | return rc; | 1541 | return rc; |
1578 | 1542 | ||
1579 | c->src.val = c->eip; | 1543 | ctxt->src.val = ctxt->_eip; |
1580 | rc = em_push(ctxt); | 1544 | rc = em_push(ctxt); |
1581 | if (rc != X86EMUL_CONTINUE) | 1545 | if (rc != X86EMUL_CONTINUE) |
1582 | return rc; | 1546 | return rc; |
@@ -1594,21 +1558,20 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, | |||
1594 | if (rc != X86EMUL_CONTINUE) | 1558 | if (rc != X86EMUL_CONTINUE) |
1595 | return rc; | 1559 | return rc; |
1596 | 1560 | ||
1597 | rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS); | 1561 | rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS); |
1598 | if (rc != X86EMUL_CONTINUE) | 1562 | if (rc != X86EMUL_CONTINUE) |
1599 | return rc; | 1563 | return rc; |
1600 | 1564 | ||
1601 | c->eip = eip; | 1565 | ctxt->_eip = eip; |
1602 | 1566 | ||
1603 | return rc; | 1567 | return rc; |
1604 | } | 1568 | } |
1605 | 1569 | ||
1606 | static int emulate_int(struct x86_emulate_ctxt *ctxt, | 1570 | static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) |
1607 | struct x86_emulate_ops *ops, int irq) | ||
1608 | { | 1571 | { |
1609 | switch(ctxt->mode) { | 1572 | switch(ctxt->mode) { |
1610 | case X86EMUL_MODE_REAL: | 1573 | case X86EMUL_MODE_REAL: |
1611 | return emulate_int_real(ctxt, ops, irq); | 1574 | return emulate_int_real(ctxt, irq); |
1612 | case X86EMUL_MODE_VM86: | 1575 | case X86EMUL_MODE_VM86: |
1613 | case X86EMUL_MODE_PROT16: | 1576 | case X86EMUL_MODE_PROT16: |
1614 | case X86EMUL_MODE_PROT32: | 1577 | case X86EMUL_MODE_PROT32: |
@@ -1619,10 +1582,8 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt, | |||
1619 | } | 1582 | } |
1620 | } | 1583 | } |
1621 | 1584 | ||
1622 | static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, | 1585 | static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) |
1623 | struct x86_emulate_ops *ops) | ||
1624 | { | 1586 | { |
1625 | struct decode_cache *c = &ctxt->decode; | ||
1626 | int rc = X86EMUL_CONTINUE; | 1587 | int rc = X86EMUL_CONTINUE; |
1627 | unsigned long temp_eip = 0; | 1588 | unsigned long temp_eip = 0; |
1628 | unsigned long temp_eflags = 0; | 1589 | unsigned long temp_eflags = 0; |
@@ -1634,7 +1595,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, | |||
1634 | 1595 | ||
1635 | /* TODO: Add stack limit check */ | 1596 | /* TODO: Add stack limit check */ |
1636 | 1597 | ||
1637 | rc = emulate_pop(ctxt, &temp_eip, c->op_bytes); | 1598 | rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes); |
1638 | 1599 | ||
1639 | if (rc != X86EMUL_CONTINUE) | 1600 | if (rc != X86EMUL_CONTINUE) |
1640 | return rc; | 1601 | return rc; |
@@ -1642,27 +1603,27 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, | |||
1642 | if (temp_eip & ~0xffff) | 1603 | if (temp_eip & ~0xffff) |
1643 | return emulate_gp(ctxt, 0); | 1604 | return emulate_gp(ctxt, 0); |
1644 | 1605 | ||
1645 | rc = emulate_pop(ctxt, &cs, c->op_bytes); | 1606 | rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); |
1646 | 1607 | ||
1647 | if (rc != X86EMUL_CONTINUE) | 1608 | if (rc != X86EMUL_CONTINUE) |
1648 | return rc; | 1609 | return rc; |
1649 | 1610 | ||
1650 | rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes); | 1611 | rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes); |
1651 | 1612 | ||
1652 | if (rc != X86EMUL_CONTINUE) | 1613 | if (rc != X86EMUL_CONTINUE) |
1653 | return rc; | 1614 | return rc; |
1654 | 1615 | ||
1655 | rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); | 1616 | rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); |
1656 | 1617 | ||
1657 | if (rc != X86EMUL_CONTINUE) | 1618 | if (rc != X86EMUL_CONTINUE) |
1658 | return rc; | 1619 | return rc; |
1659 | 1620 | ||
1660 | c->eip = temp_eip; | 1621 | ctxt->_eip = temp_eip; |
1661 | 1622 | ||
1662 | 1623 | ||
1663 | if (c->op_bytes == 4) | 1624 | if (ctxt->op_bytes == 4) |
1664 | ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); | 1625 | ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); |
1665 | else if (c->op_bytes == 2) { | 1626 | else if (ctxt->op_bytes == 2) { |
1666 | ctxt->eflags &= ~0xffff; | 1627 | ctxt->eflags &= ~0xffff; |
1667 | ctxt->eflags |= temp_eflags; | 1628 | ctxt->eflags |= temp_eflags; |
1668 | } | 1629 | } |
@@ -1673,12 +1634,11 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, | |||
1673 | return rc; | 1634 | return rc; |
1674 | } | 1635 | } |
1675 | 1636 | ||
1676 | static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, | 1637 | static int em_iret(struct x86_emulate_ctxt *ctxt) |
1677 | struct x86_emulate_ops* ops) | ||
1678 | { | 1638 | { |
1679 | switch(ctxt->mode) { | 1639 | switch(ctxt->mode) { |
1680 | case X86EMUL_MODE_REAL: | 1640 | case X86EMUL_MODE_REAL: |
1681 | return emulate_iret_real(ctxt, ops); | 1641 | return emulate_iret_real(ctxt); |
1682 | case X86EMUL_MODE_VM86: | 1642 | case X86EMUL_MODE_VM86: |
1683 | case X86EMUL_MODE_PROT16: | 1643 | case X86EMUL_MODE_PROT16: |
1684 | case X86EMUL_MODE_PROT32: | 1644 | case X86EMUL_MODE_PROT32: |
@@ -1691,53 +1651,49 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, | |||
1691 | 1651 | ||
1692 | static int em_jmp_far(struct x86_emulate_ctxt *ctxt) | 1652 | static int em_jmp_far(struct x86_emulate_ctxt *ctxt) |
1693 | { | 1653 | { |
1694 | struct decode_cache *c = &ctxt->decode; | ||
1695 | int rc; | 1654 | int rc; |
1696 | unsigned short sel; | 1655 | unsigned short sel; |
1697 | 1656 | ||
1698 | memcpy(&sel, c->src.valptr + c->op_bytes, 2); | 1657 | memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); |
1699 | 1658 | ||
1700 | rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS); | 1659 | rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS); |
1701 | if (rc != X86EMUL_CONTINUE) | 1660 | if (rc != X86EMUL_CONTINUE) |
1702 | return rc; | 1661 | return rc; |
1703 | 1662 | ||
1704 | c->eip = 0; | 1663 | ctxt->_eip = 0; |
1705 | memcpy(&c->eip, c->src.valptr, c->op_bytes); | 1664 | memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); |
1706 | return X86EMUL_CONTINUE; | 1665 | return X86EMUL_CONTINUE; |
1707 | } | 1666 | } |
1708 | 1667 | ||
1709 | static int em_grp1a(struct x86_emulate_ctxt *ctxt) | 1668 | static int em_grp1a(struct x86_emulate_ctxt *ctxt) |
1710 | { | 1669 | { |
1711 | struct decode_cache *c = &ctxt->decode; | 1670 | return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes); |
1712 | |||
1713 | return emulate_pop(ctxt, &c->dst.val, c->dst.bytes); | ||
1714 | } | 1671 | } |
1715 | 1672 | ||
1716 | static int em_grp2(struct x86_emulate_ctxt *ctxt) | 1673 | static int em_grp2(struct x86_emulate_ctxt *ctxt) |
1717 | { | 1674 | { |
1718 | struct decode_cache *c = &ctxt->decode; | 1675 | switch (ctxt->modrm_reg) { |
1719 | switch (c->modrm_reg) { | ||
1720 | case 0: /* rol */ | 1676 | case 0: /* rol */ |
1721 | emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); | 1677 | emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags); |
1722 | break; | 1678 | break; |
1723 | case 1: /* ror */ | 1679 | case 1: /* ror */ |
1724 | emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); | 1680 | emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags); |
1725 | break; | 1681 | break; |
1726 | case 2: /* rcl */ | 1682 | case 2: /* rcl */ |
1727 | emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); | 1683 | emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags); |
1728 | break; | 1684 | break; |
1729 | case 3: /* rcr */ | 1685 | case 3: /* rcr */ |
1730 | emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); | 1686 | emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags); |
1731 | break; | 1687 | break; |
1732 | case 4: /* sal/shl */ | 1688 | case 4: /* sal/shl */ |
1733 | case 6: /* sal/shl */ | 1689 | case 6: /* sal/shl */ |
1734 | emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); | 1690 | emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags); |
1735 | break; | 1691 | break; |
1736 | case 5: /* shr */ | 1692 | case 5: /* shr */ |
1737 | emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); | 1693 | emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags); |
1738 | break; | 1694 | break; |
1739 | case 7: /* sar */ | 1695 | case 7: /* sar */ |
1740 | emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); | 1696 | emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags); |
1741 | break; | 1697 | break; |
1742 | } | 1698 | } |
1743 | return X86EMUL_CONTINUE; | 1699 | return X86EMUL_CONTINUE; |
@@ -1745,33 +1701,32 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt) | |||
1745 | 1701 | ||
1746 | static int em_grp3(struct x86_emulate_ctxt *ctxt) | 1702 | static int em_grp3(struct x86_emulate_ctxt *ctxt) |
1747 | { | 1703 | { |
1748 | struct decode_cache *c = &ctxt->decode; | 1704 | unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX]; |
1749 | unsigned long *rax = &c->regs[VCPU_REGS_RAX]; | 1705 | unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX]; |
1750 | unsigned long *rdx = &c->regs[VCPU_REGS_RDX]; | ||
1751 | u8 de = 0; | 1706 | u8 de = 0; |
1752 | 1707 | ||
1753 | switch (c->modrm_reg) { | 1708 | switch (ctxt->modrm_reg) { |
1754 | case 0 ... 1: /* test */ | 1709 | case 0 ... 1: /* test */ |
1755 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); | 1710 | emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); |
1756 | break; | 1711 | break; |
1757 | case 2: /* not */ | 1712 | case 2: /* not */ |
1758 | c->dst.val = ~c->dst.val; | 1713 | ctxt->dst.val = ~ctxt->dst.val; |
1759 | break; | 1714 | break; |
1760 | case 3: /* neg */ | 1715 | case 3: /* neg */ |
1761 | emulate_1op("neg", c->dst, ctxt->eflags); | 1716 | emulate_1op("neg", ctxt->dst, ctxt->eflags); |
1762 | break; | 1717 | break; |
1763 | case 4: /* mul */ | 1718 | case 4: /* mul */ |
1764 | emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags); | 1719 | emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags); |
1765 | break; | 1720 | break; |
1766 | case 5: /* imul */ | 1721 | case 5: /* imul */ |
1767 | emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); | 1722 | emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags); |
1768 | break; | 1723 | break; |
1769 | case 6: /* div */ | 1724 | case 6: /* div */ |
1770 | emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx, | 1725 | emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx, |
1771 | ctxt->eflags, de); | 1726 | ctxt->eflags, de); |
1772 | break; | 1727 | break; |
1773 | case 7: /* idiv */ | 1728 | case 7: /* idiv */ |
1774 | emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx, | 1729 | emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx, |
1775 | ctxt->eflags, de); | 1730 | ctxt->eflags, de); |
1776 | break; | 1731 | break; |
1777 | default: | 1732 | default: |
@@ -1784,26 +1739,25 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt) | |||
1784 | 1739 | ||
1785 | static int em_grp45(struct x86_emulate_ctxt *ctxt) | 1740 | static int em_grp45(struct x86_emulate_ctxt *ctxt) |
1786 | { | 1741 | { |
1787 | struct decode_cache *c = &ctxt->decode; | ||
1788 | int rc = X86EMUL_CONTINUE; | 1742 | int rc = X86EMUL_CONTINUE; |
1789 | 1743 | ||
1790 | switch (c->modrm_reg) { | 1744 | switch (ctxt->modrm_reg) { |
1791 | case 0: /* inc */ | 1745 | case 0: /* inc */ |
1792 | emulate_1op("inc", c->dst, ctxt->eflags); | 1746 | emulate_1op("inc", ctxt->dst, ctxt->eflags); |
1793 | break; | 1747 | break; |
1794 | case 1: /* dec */ | 1748 | case 1: /* dec */ |
1795 | emulate_1op("dec", c->dst, ctxt->eflags); | 1749 | emulate_1op("dec", ctxt->dst, ctxt->eflags); |
1796 | break; | 1750 | break; |
1797 | case 2: /* call near abs */ { | 1751 | case 2: /* call near abs */ { |
1798 | long int old_eip; | 1752 | long int old_eip; |
1799 | old_eip = c->eip; | 1753 | old_eip = ctxt->_eip; |
1800 | c->eip = c->src.val; | 1754 | ctxt->_eip = ctxt->src.val; |
1801 | c->src.val = old_eip; | 1755 | ctxt->src.val = old_eip; |
1802 | rc = em_push(ctxt); | 1756 | rc = em_push(ctxt); |
1803 | break; | 1757 | break; |
1804 | } | 1758 | } |
1805 | case 4: /* jmp abs */ | 1759 | case 4: /* jmp abs */ |
1806 | c->eip = c->src.val; | 1760 | ctxt->_eip = ctxt->src.val; |
1807 | break; | 1761 | break; |
1808 | case 5: /* jmp far */ | 1762 | case 5: /* jmp far */ |
1809 | rc = em_jmp_far(ctxt); | 1763 | rc = em_jmp_far(ctxt); |
@@ -1817,68 +1771,70 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) | |||
1817 | 1771 | ||
1818 | static int em_grp9(struct x86_emulate_ctxt *ctxt) | 1772 | static int em_grp9(struct x86_emulate_ctxt *ctxt) |
1819 | { | 1773 | { |
1820 | struct decode_cache *c = &ctxt->decode; | 1774 | u64 old = ctxt->dst.orig_val64; |
1821 | u64 old = c->dst.orig_val64; | ||
1822 | 1775 | ||
1823 | if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || | 1776 | if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) || |
1824 | ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { | 1777 | ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) { |
1825 | c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); | 1778 | ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0); |
1826 | c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); | 1779 | ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32); |
1827 | ctxt->eflags &= ~EFLG_ZF; | 1780 | ctxt->eflags &= ~EFLG_ZF; |
1828 | } else { | 1781 | } else { |
1829 | c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) | | 1782 | ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) | |
1830 | (u32) c->regs[VCPU_REGS_RBX]; | 1783 | (u32) ctxt->regs[VCPU_REGS_RBX]; |
1831 | 1784 | ||
1832 | ctxt->eflags |= EFLG_ZF; | 1785 | ctxt->eflags |= EFLG_ZF; |
1833 | } | 1786 | } |
1834 | return X86EMUL_CONTINUE; | 1787 | return X86EMUL_CONTINUE; |
1835 | } | 1788 | } |
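
em_grp9 above is the compare half of CMPXCHG8B: the 64-bit value previously read from the destination is compared with EDX:EAX, and either EDX:EAX is reloaded and ZF cleared, or ECX:EBX is written back to the destination and ZF set. A stand-alone sketch of that logic follows; the function name, the plain pointer parameters and the FLAG_ZF constant are simplifications for illustration, not the emulator's operand machinery.

#include <stdint.h>
#include <stdio.h>

#define FLAG_ZF (1u << 6)

/* Compare step of CMPXCHG8B, as performed by em_grp9 above. */
static void cmpxchg8b_compare(uint64_t old, uint32_t *eax, uint32_t *edx,
                              uint32_t ebx, uint32_t ecx,
                              uint64_t *dst, uint32_t *eflags)
{
        if ((uint32_t)old != *eax || (uint32_t)(old >> 32) != *edx) {
                /* mismatch: publish the memory value in EDX:EAX, clear ZF */
                *eax = (uint32_t)old;
                *edx = (uint32_t)(old >> 32);
                *eflags &= ~FLAG_ZF;
        } else {
                /* match: write ECX:EBX back to the destination, set ZF */
                *dst = ((uint64_t)ecx << 32) | ebx;
                *eflags |= FLAG_ZF;
        }
}

int main(void)
{
        uint32_t eax = 1, edx = 0, eflags = 0;
        uint64_t mem = 1, dst = 0;

        cmpxchg8b_compare(mem, &eax, &edx, 3, 4, &dst, &eflags);
        printf("dst=%#llx zf=%d\n", (unsigned long long)dst, !!(eflags & FLAG_ZF));
        return 0;
}
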
1836 | 1789 | ||
1837 | static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, | 1790 | static int em_ret(struct x86_emulate_ctxt *ctxt) |
1838 | struct x86_emulate_ops *ops) | 1791 | { |
1792 | ctxt->dst.type = OP_REG; | ||
1793 | ctxt->dst.addr.reg = &ctxt->_eip; | ||
1794 | ctxt->dst.bytes = ctxt->op_bytes; | ||
1795 | return em_pop(ctxt); | ||
1796 | } | ||
1797 | |||
1798 | static int em_ret_far(struct x86_emulate_ctxt *ctxt) | ||
1839 | { | 1799 | { |
1840 | struct decode_cache *c = &ctxt->decode; | ||
1841 | int rc; | 1800 | int rc; |
1842 | unsigned long cs; | 1801 | unsigned long cs; |
1843 | 1802 | ||
1844 | rc = emulate_pop(ctxt, &c->eip, c->op_bytes); | 1803 | rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes); |
1845 | if (rc != X86EMUL_CONTINUE) | 1804 | if (rc != X86EMUL_CONTINUE) |
1846 | return rc; | 1805 | return rc; |
1847 | if (c->op_bytes == 4) | 1806 | if (ctxt->op_bytes == 4) |
1848 | c->eip = (u32)c->eip; | 1807 | ctxt->_eip = (u32)ctxt->_eip; |
1849 | rc = emulate_pop(ctxt, &cs, c->op_bytes); | 1808 | rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); |
1850 | if (rc != X86EMUL_CONTINUE) | 1809 | if (rc != X86EMUL_CONTINUE) |
1851 | return rc; | 1810 | return rc; |
1852 | rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); | 1811 | rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); |
1853 | return rc; | 1812 | return rc; |
1854 | } | 1813 | } |
1855 | 1814 | ||
1856 | static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, | 1815 | static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg) |
1857 | struct x86_emulate_ops *ops, int seg) | ||
1858 | { | 1816 | { |
1859 | struct decode_cache *c = &ctxt->decode; | ||
1860 | unsigned short sel; | 1817 | unsigned short sel; |
1861 | int rc; | 1818 | int rc; |
1862 | 1819 | ||
1863 | memcpy(&sel, c->src.valptr + c->op_bytes, 2); | 1820 | memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); |
1864 | 1821 | ||
1865 | rc = load_segment_descriptor(ctxt, ops, sel, seg); | 1822 | rc = load_segment_descriptor(ctxt, sel, seg); |
1866 | if (rc != X86EMUL_CONTINUE) | 1823 | if (rc != X86EMUL_CONTINUE) |
1867 | return rc; | 1824 | return rc; |
1868 | 1825 | ||
1869 | c->dst.val = c->src.val; | 1826 | ctxt->dst.val = ctxt->src.val; |
1870 | return rc; | 1827 | return rc; |
1871 | } | 1828 | } |
1872 | 1829 | ||
1873 | static inline void | 1830 | static void |
1874 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, | 1831 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, |
1875 | struct x86_emulate_ops *ops, struct desc_struct *cs, | 1832 | struct desc_struct *cs, struct desc_struct *ss) |
1876 | struct desc_struct *ss) | ||
1877 | { | 1833 | { |
1878 | u16 selector; | 1834 | u16 selector; |
1879 | 1835 | ||
1880 | memset(cs, 0, sizeof(struct desc_struct)); | 1836 | memset(cs, 0, sizeof(struct desc_struct)); |
1881 | ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); | 1837 | ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); |
1882 | memset(ss, 0, sizeof(struct desc_struct)); | 1838 | memset(ss, 0, sizeof(struct desc_struct)); |
1883 | 1839 | ||
1884 | cs->l = 0; /* will be adjusted later */ | 1840 | cs->l = 0; /* will be adjusted later */ |
@@ -1901,10 +1857,9 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, | |||
1901 | ss->p = 1; | 1857 | ss->p = 1; |
1902 | } | 1858 | } |
1903 | 1859 | ||
1904 | static int | 1860 | static int em_syscall(struct x86_emulate_ctxt *ctxt) |
1905 | emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
1906 | { | 1861 | { |
1907 | struct decode_cache *c = &ctxt->decode; | 1862 | struct x86_emulate_ops *ops = ctxt->ops; |
1908 | struct desc_struct cs, ss; | 1863 | struct desc_struct cs, ss; |
1909 | u64 msr_data; | 1864 | u64 msr_data; |
1910 | u16 cs_sel, ss_sel; | 1865 | u16 cs_sel, ss_sel; |
@@ -1916,7 +1871,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1916 | return emulate_ud(ctxt); | 1871 | return emulate_ud(ctxt); |
1917 | 1872 | ||
1918 | ops->get_msr(ctxt, MSR_EFER, &efer); | 1873 | ops->get_msr(ctxt, MSR_EFER, &efer); |
1919 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1874 | setup_syscalls_segments(ctxt, &cs, &ss); |
1920 | 1875 | ||
1921 | ops->get_msr(ctxt, MSR_STAR, &msr_data); | 1876 | ops->get_msr(ctxt, MSR_STAR, &msr_data); |
1922 | msr_data >>= 32; | 1877 | msr_data >>= 32; |
@@ -1930,15 +1885,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1930 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); | 1885 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); |
1931 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); | 1886 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); |
1932 | 1887 | ||
1933 | c->regs[VCPU_REGS_RCX] = c->eip; | 1888 | ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip; |
1934 | if (efer & EFER_LMA) { | 1889 | if (efer & EFER_LMA) { |
1935 | #ifdef CONFIG_X86_64 | 1890 | #ifdef CONFIG_X86_64 |
1936 | c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; | 1891 | ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; |
1937 | 1892 | ||
1938 | ops->get_msr(ctxt, | 1893 | ops->get_msr(ctxt, |
1939 | ctxt->mode == X86EMUL_MODE_PROT64 ? | 1894 | ctxt->mode == X86EMUL_MODE_PROT64 ? |
1940 | MSR_LSTAR : MSR_CSTAR, &msr_data); | 1895 | MSR_LSTAR : MSR_CSTAR, &msr_data); |
1941 | c->eip = msr_data; | 1896 | ctxt->_eip = msr_data; |
1942 | 1897 | ||
1943 | ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); | 1898 | ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); |
1944 | ctxt->eflags &= ~(msr_data | EFLG_RF); | 1899 | ctxt->eflags &= ~(msr_data | EFLG_RF); |
@@ -1946,7 +1901,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1946 | } else { | 1901 | } else { |
1947 | /* legacy mode */ | 1902 | /* legacy mode */ |
1948 | ops->get_msr(ctxt, MSR_STAR, &msr_data); | 1903 | ops->get_msr(ctxt, MSR_STAR, &msr_data); |
1949 | c->eip = (u32)msr_data; | 1904 | ctxt->_eip = (u32)msr_data; |
1950 | 1905 | ||
1951 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); | 1906 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); |
1952 | } | 1907 | } |
@@ -1954,16 +1909,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1954 | return X86EMUL_CONTINUE; | 1909 | return X86EMUL_CONTINUE; |
1955 | } | 1910 | } |
1956 | 1911 | ||
1957 | static int | 1912 | static int em_sysenter(struct x86_emulate_ctxt *ctxt) |
1958 | emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
1959 | { | 1913 | { |
1960 | struct decode_cache *c = &ctxt->decode; | 1914 | struct x86_emulate_ops *ops = ctxt->ops; |
1961 | struct desc_struct cs, ss; | 1915 | struct desc_struct cs, ss; |
1962 | u64 msr_data; | 1916 | u64 msr_data; |
1963 | u16 cs_sel, ss_sel; | 1917 | u16 cs_sel, ss_sel; |
1964 | u64 efer = 0; | 1918 | u64 efer = 0; |
1965 | 1919 | ||
1966 | ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); | 1920 | ops->get_msr(ctxt, MSR_EFER, &efer); |
1967 | /* inject #GP if in real mode */ | 1921 | /* inject #GP if in real mode */ |
1968 | if (ctxt->mode == X86EMUL_MODE_REAL) | 1922 | if (ctxt->mode == X86EMUL_MODE_REAL) |
1969 | return emulate_gp(ctxt, 0); | 1923 | return emulate_gp(ctxt, 0); |
@@ -1974,7 +1928,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1974 | if (ctxt->mode == X86EMUL_MODE_PROT64) | 1928 | if (ctxt->mode == X86EMUL_MODE_PROT64) |
1975 | return emulate_ud(ctxt); | 1929 | return emulate_ud(ctxt); |
1976 | 1930 | ||
1977 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1931 | setup_syscalls_segments(ctxt, &cs, &ss); |
1978 | 1932 | ||
1979 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); | 1933 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); |
1980 | switch (ctxt->mode) { | 1934 | switch (ctxt->mode) { |
@@ -2002,31 +1956,30 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2002 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); | 1956 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); |
2003 | 1957 | ||
2004 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); | 1958 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); |
2005 | c->eip = msr_data; | 1959 | ctxt->_eip = msr_data; |
2006 | 1960 | ||
2007 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); | 1961 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); |
2008 | c->regs[VCPU_REGS_RSP] = msr_data; | 1962 | ctxt->regs[VCPU_REGS_RSP] = msr_data; |
2009 | 1963 | ||
2010 | return X86EMUL_CONTINUE; | 1964 | return X86EMUL_CONTINUE; |
2011 | } | 1965 | } |
2012 | 1966 | ||
2013 | static int | 1967 | static int em_sysexit(struct x86_emulate_ctxt *ctxt) |
2014 | emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
2015 | { | 1968 | { |
2016 | struct decode_cache *c = &ctxt->decode; | 1969 | struct x86_emulate_ops *ops = ctxt->ops; |
2017 | struct desc_struct cs, ss; | 1970 | struct desc_struct cs, ss; |
2018 | u64 msr_data; | 1971 | u64 msr_data; |
2019 | int usermode; | 1972 | int usermode; |
2020 | u16 cs_sel, ss_sel; | 1973 | u16 cs_sel = 0, ss_sel = 0; |
2021 | 1974 | ||
2022 | /* inject #GP if in real mode or Virtual 8086 mode */ | 1975 | /* inject #GP if in real mode or Virtual 8086 mode */ |
2023 | if (ctxt->mode == X86EMUL_MODE_REAL || | 1976 | if (ctxt->mode == X86EMUL_MODE_REAL || |
2024 | ctxt->mode == X86EMUL_MODE_VM86) | 1977 | ctxt->mode == X86EMUL_MODE_VM86) |
2025 | return emulate_gp(ctxt, 0); | 1978 | return emulate_gp(ctxt, 0); |
2026 | 1979 | ||
2027 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1980 | setup_syscalls_segments(ctxt, &cs, &ss); |
2028 | 1981 | ||
2029 | if ((c->rex_prefix & 0x8) != 0x0) | 1982 | if ((ctxt->rex_prefix & 0x8) != 0x0) |
2030 | usermode = X86EMUL_MODE_PROT64; | 1983 | usermode = X86EMUL_MODE_PROT64; |
2031 | else | 1984 | else |
2032 | usermode = X86EMUL_MODE_PROT32; | 1985 | usermode = X86EMUL_MODE_PROT32; |
@@ -2056,14 +2009,13 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2056 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); | 2009 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); |
2057 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); | 2010 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); |
2058 | 2011 | ||
2059 | c->eip = c->regs[VCPU_REGS_RDX]; | 2012 | ctxt->_eip = ctxt->regs[VCPU_REGS_RDX]; |
2060 | c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; | 2013 | ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX]; |
2061 | 2014 | ||
2062 | return X86EMUL_CONTINUE; | 2015 | return X86EMUL_CONTINUE; |
2063 | } | 2016 | } |
2064 | 2017 | ||
2065 | static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, | 2018 | static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) |
2066 | struct x86_emulate_ops *ops) | ||
2067 | { | 2019 | { |
2068 | int iopl; | 2020 | int iopl; |
2069 | if (ctxt->mode == X86EMUL_MODE_REAL) | 2021 | if (ctxt->mode == X86EMUL_MODE_REAL) |
@@ -2071,13 +2023,13 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, | |||
2071 | if (ctxt->mode == X86EMUL_MODE_VM86) | 2023 | if (ctxt->mode == X86EMUL_MODE_VM86) |
2072 | return true; | 2024 | return true; |
2073 | iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | 2025 | iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; |
2074 | return ops->cpl(ctxt) > iopl; | 2026 | return ctxt->ops->cpl(ctxt) > iopl; |
2075 | } | 2027 | } |
2076 | 2028 | ||
2077 | static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, | 2029 | static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, |
2078 | struct x86_emulate_ops *ops, | ||
2079 | u16 port, u16 len) | 2030 | u16 port, u16 len) |
2080 | { | 2031 | { |
2032 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2081 | struct desc_struct tr_seg; | 2033 | struct desc_struct tr_seg; |
2082 | u32 base3; | 2034 | u32 base3; |
2083 | int r; | 2035 | int r; |
@@ -2108,14 +2060,13 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, | |||
2108 | } | 2060 | } |
2109 | 2061 | ||
2110 | static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, | 2062 | static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, |
2111 | struct x86_emulate_ops *ops, | ||
2112 | u16 port, u16 len) | 2063 | u16 port, u16 len) |
2113 | { | 2064 | { |
2114 | if (ctxt->perm_ok) | 2065 | if (ctxt->perm_ok) |
2115 | return true; | 2066 | return true; |
2116 | 2067 | ||
2117 | if (emulator_bad_iopl(ctxt, ops)) | 2068 | if (emulator_bad_iopl(ctxt)) |
2118 | if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) | 2069 | if (!emulator_io_port_access_allowed(ctxt, port, len)) |
2119 | return false; | 2070 | return false; |
2120 | 2071 | ||
2121 | ctxt->perm_ok = true; | 2072 | ctxt->perm_ok = true; |
@@ -2124,21 +2075,18 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, | |||
2124 | } | 2075 | } |
2125 | 2076 | ||
2126 | static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, | 2077 | static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, |
2127 | struct x86_emulate_ops *ops, | ||
2128 | struct tss_segment_16 *tss) | 2078 | struct tss_segment_16 *tss) |
2129 | { | 2079 | { |
2130 | struct decode_cache *c = &ctxt->decode; | 2080 | tss->ip = ctxt->_eip; |
2131 | |||
2132 | tss->ip = c->eip; | ||
2133 | tss->flag = ctxt->eflags; | 2081 | tss->flag = ctxt->eflags; |
2134 | tss->ax = c->regs[VCPU_REGS_RAX]; | 2082 | tss->ax = ctxt->regs[VCPU_REGS_RAX]; |
2135 | tss->cx = c->regs[VCPU_REGS_RCX]; | 2083 | tss->cx = ctxt->regs[VCPU_REGS_RCX]; |
2136 | tss->dx = c->regs[VCPU_REGS_RDX]; | 2084 | tss->dx = ctxt->regs[VCPU_REGS_RDX]; |
2137 | tss->bx = c->regs[VCPU_REGS_RBX]; | 2085 | tss->bx = ctxt->regs[VCPU_REGS_RBX]; |
2138 | tss->sp = c->regs[VCPU_REGS_RSP]; | 2086 | tss->sp = ctxt->regs[VCPU_REGS_RSP]; |
2139 | tss->bp = c->regs[VCPU_REGS_RBP]; | 2087 | tss->bp = ctxt->regs[VCPU_REGS_RBP]; |
2140 | tss->si = c->regs[VCPU_REGS_RSI]; | 2088 | tss->si = ctxt->regs[VCPU_REGS_RSI]; |
2141 | tss->di = c->regs[VCPU_REGS_RDI]; | 2089 | tss->di = ctxt->regs[VCPU_REGS_RDI]; |
2142 | 2090 | ||
2143 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); | 2091 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); |
2144 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); | 2092 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); |
@@ -2148,22 +2096,20 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, | |||
2148 | } | 2096 | } |
2149 | 2097 | ||
2150 | static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, | 2098 | static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, |
2151 | struct x86_emulate_ops *ops, | ||
2152 | struct tss_segment_16 *tss) | 2099 | struct tss_segment_16 *tss) |
2153 | { | 2100 | { |
2154 | struct decode_cache *c = &ctxt->decode; | ||
2155 | int ret; | 2101 | int ret; |
2156 | 2102 | ||
2157 | c->eip = tss->ip; | 2103 | ctxt->_eip = tss->ip; |
2158 | ctxt->eflags = tss->flag | 2; | 2104 | ctxt->eflags = tss->flag | 2; |
2159 | c->regs[VCPU_REGS_RAX] = tss->ax; | 2105 | ctxt->regs[VCPU_REGS_RAX] = tss->ax; |
2160 | c->regs[VCPU_REGS_RCX] = tss->cx; | 2106 | ctxt->regs[VCPU_REGS_RCX] = tss->cx; |
2161 | c->regs[VCPU_REGS_RDX] = tss->dx; | 2107 | ctxt->regs[VCPU_REGS_RDX] = tss->dx; |
2162 | c->regs[VCPU_REGS_RBX] = tss->bx; | 2108 | ctxt->regs[VCPU_REGS_RBX] = tss->bx; |
2163 | c->regs[VCPU_REGS_RSP] = tss->sp; | 2109 | ctxt->regs[VCPU_REGS_RSP] = tss->sp; |
2164 | c->regs[VCPU_REGS_RBP] = tss->bp; | 2110 | ctxt->regs[VCPU_REGS_RBP] = tss->bp; |
2165 | c->regs[VCPU_REGS_RSI] = tss->si; | 2111 | ctxt->regs[VCPU_REGS_RSI] = tss->si; |
2166 | c->regs[VCPU_REGS_RDI] = tss->di; | 2112 | ctxt->regs[VCPU_REGS_RDI] = tss->di; |
2167 | 2113 | ||
2168 | /* | 2114 | /* |
2169 | * SDM says that segment selectors are loaded before segment | 2115 | * SDM says that segment selectors are loaded before segment |
@@ -2179,19 +2125,19 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, | |||
2179 | * Now load segment descriptors. If a fault happens at this stage | 2125 | * Now load segment descriptors. If a fault happens at this stage |
2180 | * it is handled in the context of the new task | 2126 | * it is handled in the context of the new task |
2181 | */ | 2127 | */ |
2182 | ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR); | 2128 | ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); |
2183 | if (ret != X86EMUL_CONTINUE) | 2129 | if (ret != X86EMUL_CONTINUE) |
2184 | return ret; | 2130 | return ret; |
2185 | ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); | 2131 | ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); |
2186 | if (ret != X86EMUL_CONTINUE) | 2132 | if (ret != X86EMUL_CONTINUE) |
2187 | return ret; | 2133 | return ret; |
2188 | ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); | 2134 | ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); |
2189 | if (ret != X86EMUL_CONTINUE) | 2135 | if (ret != X86EMUL_CONTINUE) |
2190 | return ret; | 2136 | return ret; |
2191 | ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); | 2137 | ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); |
2192 | if (ret != X86EMUL_CONTINUE) | 2138 | if (ret != X86EMUL_CONTINUE) |
2193 | return ret; | 2139 | return ret; |
2194 | ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); | 2140 | ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); |
2195 | if (ret != X86EMUL_CONTINUE) | 2141 | if (ret != X86EMUL_CONTINUE) |
2196 | return ret; | 2142 | return ret; |
2197 | 2143 | ||
@@ -2199,10 +2145,10 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, | |||
2199 | } | 2145 | } |
2200 | 2146 | ||
2201 | static int task_switch_16(struct x86_emulate_ctxt *ctxt, | 2147 | static int task_switch_16(struct x86_emulate_ctxt *ctxt, |
2202 | struct x86_emulate_ops *ops, | ||
2203 | u16 tss_selector, u16 old_tss_sel, | 2148 | u16 tss_selector, u16 old_tss_sel, |
2204 | ulong old_tss_base, struct desc_struct *new_desc) | 2149 | ulong old_tss_base, struct desc_struct *new_desc) |
2205 | { | 2150 | { |
2151 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2206 | struct tss_segment_16 tss_seg; | 2152 | struct tss_segment_16 tss_seg; |
2207 | int ret; | 2153 | int ret; |
2208 | u32 new_tss_base = get_desc_base(new_desc); | 2154 | u32 new_tss_base = get_desc_base(new_desc); |
@@ -2213,7 +2159,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2213 | /* FIXME: need to provide precise fault address */ | 2159 | /* FIXME: need to provide precise fault address */ |
2214 | return ret; | 2160 | return ret; |
2215 | 2161 | ||
2216 | save_state_to_tss16(ctxt, ops, &tss_seg); | 2162 | save_state_to_tss16(ctxt, &tss_seg); |
2217 | 2163 | ||
2218 | ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, | 2164 | ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, |
2219 | &ctxt->exception); | 2165 | &ctxt->exception); |
@@ -2239,26 +2185,23 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2239 | return ret; | 2185 | return ret; |
2240 | } | 2186 | } |
2241 | 2187 | ||
2242 | return load_state_from_tss16(ctxt, ops, &tss_seg); | 2188 | return load_state_from_tss16(ctxt, &tss_seg); |
2243 | } | 2189 | } |
2244 | 2190 | ||
2245 | static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, | 2191 | static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, |
2246 | struct x86_emulate_ops *ops, | ||
2247 | struct tss_segment_32 *tss) | 2192 | struct tss_segment_32 *tss) |
2248 | { | 2193 | { |
2249 | struct decode_cache *c = &ctxt->decode; | 2194 | tss->cr3 = ctxt->ops->get_cr(ctxt, 3); |
2250 | 2195 | tss->eip = ctxt->_eip; | |
2251 | tss->cr3 = ops->get_cr(ctxt, 3); | ||
2252 | tss->eip = c->eip; | ||
2253 | tss->eflags = ctxt->eflags; | 2196 | tss->eflags = ctxt->eflags; |
2254 | tss->eax = c->regs[VCPU_REGS_RAX]; | 2197 | tss->eax = ctxt->regs[VCPU_REGS_RAX]; |
2255 | tss->ecx = c->regs[VCPU_REGS_RCX]; | 2198 | tss->ecx = ctxt->regs[VCPU_REGS_RCX]; |
2256 | tss->edx = c->regs[VCPU_REGS_RDX]; | 2199 | tss->edx = ctxt->regs[VCPU_REGS_RDX]; |
2257 | tss->ebx = c->regs[VCPU_REGS_RBX]; | 2200 | tss->ebx = ctxt->regs[VCPU_REGS_RBX]; |
2258 | tss->esp = c->regs[VCPU_REGS_RSP]; | 2201 | tss->esp = ctxt->regs[VCPU_REGS_RSP]; |
2259 | tss->ebp = c->regs[VCPU_REGS_RBP]; | 2202 | tss->ebp = ctxt->regs[VCPU_REGS_RBP]; |
2260 | tss->esi = c->regs[VCPU_REGS_RSI]; | 2203 | tss->esi = ctxt->regs[VCPU_REGS_RSI]; |
2261 | tss->edi = c->regs[VCPU_REGS_RDI]; | 2204 | tss->edi = ctxt->regs[VCPU_REGS_RDI]; |
2262 | 2205 | ||
2263 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); | 2206 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); |
2264 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); | 2207 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); |
@@ -2270,24 +2213,22 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, | |||
2270 | } | 2213 | } |
2271 | 2214 | ||
2272 | static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | 2215 | static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, |
2273 | struct x86_emulate_ops *ops, | ||
2274 | struct tss_segment_32 *tss) | 2216 | struct tss_segment_32 *tss) |
2275 | { | 2217 | { |
2276 | struct decode_cache *c = &ctxt->decode; | ||
2277 | int ret; | 2218 | int ret; |
2278 | 2219 | ||
2279 | if (ops->set_cr(ctxt, 3, tss->cr3)) | 2220 | if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) |
2280 | return emulate_gp(ctxt, 0); | 2221 | return emulate_gp(ctxt, 0); |
2281 | c->eip = tss->eip; | 2222 | ctxt->_eip = tss->eip; |
2282 | ctxt->eflags = tss->eflags | 2; | 2223 | ctxt->eflags = tss->eflags | 2; |
2283 | c->regs[VCPU_REGS_RAX] = tss->eax; | 2224 | ctxt->regs[VCPU_REGS_RAX] = tss->eax; |
2284 | c->regs[VCPU_REGS_RCX] = tss->ecx; | 2225 | ctxt->regs[VCPU_REGS_RCX] = tss->ecx; |
2285 | c->regs[VCPU_REGS_RDX] = tss->edx; | 2226 | ctxt->regs[VCPU_REGS_RDX] = tss->edx; |
2286 | c->regs[VCPU_REGS_RBX] = tss->ebx; | 2227 | ctxt->regs[VCPU_REGS_RBX] = tss->ebx; |
2287 | c->regs[VCPU_REGS_RSP] = tss->esp; | 2228 | ctxt->regs[VCPU_REGS_RSP] = tss->esp; |
2288 | c->regs[VCPU_REGS_RBP] = tss->ebp; | 2229 | ctxt->regs[VCPU_REGS_RBP] = tss->ebp; |
2289 | c->regs[VCPU_REGS_RSI] = tss->esi; | 2230 | ctxt->regs[VCPU_REGS_RSI] = tss->esi; |
2290 | c->regs[VCPU_REGS_RDI] = tss->edi; | 2231 | ctxt->regs[VCPU_REGS_RDI] = tss->edi; |
2291 | 2232 | ||
2292 | /* | 2233 | /* |
2293 | * SDM says that segment selectors are loaded before segment | 2234 | * SDM says that segment selectors are loaded before segment |
@@ -2305,25 +2246,25 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | |||
2305 | * Now load segment descriptors. If a fault happens at this stage | 2246 | * Now load segment descriptors. If a fault happens at this stage |
2306 | * it is handled in the context of the new task | 2247 | * it is handled in the context of the new task |
2307 | */ | 2248 | */ |
2308 | ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR); | 2249 | ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); |
2309 | if (ret != X86EMUL_CONTINUE) | 2250 | if (ret != X86EMUL_CONTINUE) |
2310 | return ret; | 2251 | return ret; |
2311 | ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); | 2252 | ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); |
2312 | if (ret != X86EMUL_CONTINUE) | 2253 | if (ret != X86EMUL_CONTINUE) |
2313 | return ret; | 2254 | return ret; |
2314 | ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); | 2255 | ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); |
2315 | if (ret != X86EMUL_CONTINUE) | 2256 | if (ret != X86EMUL_CONTINUE) |
2316 | return ret; | 2257 | return ret; |
2317 | ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); | 2258 | ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); |
2318 | if (ret != X86EMUL_CONTINUE) | 2259 | if (ret != X86EMUL_CONTINUE) |
2319 | return ret; | 2260 | return ret; |
2320 | ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); | 2261 | ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); |
2321 | if (ret != X86EMUL_CONTINUE) | 2262 | if (ret != X86EMUL_CONTINUE) |
2322 | return ret; | 2263 | return ret; |
2323 | ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS); | 2264 | ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS); |
2324 | if (ret != X86EMUL_CONTINUE) | 2265 | if (ret != X86EMUL_CONTINUE) |
2325 | return ret; | 2266 | return ret; |
2326 | ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS); | 2267 | ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS); |
2327 | if (ret != X86EMUL_CONTINUE) | 2268 | if (ret != X86EMUL_CONTINUE) |
2328 | return ret; | 2269 | return ret; |
2329 | 2270 | ||
@@ -2331,10 +2272,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | |||
2331 | } | 2272 | } |
2332 | 2273 | ||
2333 | static int task_switch_32(struct x86_emulate_ctxt *ctxt, | 2274 | static int task_switch_32(struct x86_emulate_ctxt *ctxt, |
2334 | struct x86_emulate_ops *ops, | ||
2335 | u16 tss_selector, u16 old_tss_sel, | 2275 | u16 tss_selector, u16 old_tss_sel, |
2336 | ulong old_tss_base, struct desc_struct *new_desc) | 2276 | ulong old_tss_base, struct desc_struct *new_desc) |
2337 | { | 2277 | { |
2278 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2338 | struct tss_segment_32 tss_seg; | 2279 | struct tss_segment_32 tss_seg; |
2339 | int ret; | 2280 | int ret; |
2340 | u32 new_tss_base = get_desc_base(new_desc); | 2281 | u32 new_tss_base = get_desc_base(new_desc); |
@@ -2345,7 +2286,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2345 | /* FIXME: need to provide precise fault address */ | 2286 | /* FIXME: need to provide precise fault address */ |
2346 | return ret; | 2287 | return ret; |
2347 | 2288 | ||
2348 | save_state_to_tss32(ctxt, ops, &tss_seg); | 2289 | save_state_to_tss32(ctxt, &tss_seg); |
2349 | 2290 | ||
2350 | ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, | 2291 | ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, |
2351 | &ctxt->exception); | 2292 | &ctxt->exception); |
@@ -2371,14 +2312,14 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2371 | return ret; | 2312 | return ret; |
2372 | } | 2313 | } |
2373 | 2314 | ||
2374 | return load_state_from_tss32(ctxt, ops, &tss_seg); | 2315 | return load_state_from_tss32(ctxt, &tss_seg); |
2375 | } | 2316 | } |
2376 | 2317 | ||
2377 | static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | 2318 | static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, |
2378 | struct x86_emulate_ops *ops, | ||
2379 | u16 tss_selector, int reason, | 2319 | u16 tss_selector, int reason, |
2380 | bool has_error_code, u32 error_code) | 2320 | bool has_error_code, u32 error_code) |
2381 | { | 2321 | { |
2322 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2382 | struct desc_struct curr_tss_desc, next_tss_desc; | 2323 | struct desc_struct curr_tss_desc, next_tss_desc; |
2383 | int ret; | 2324 | int ret; |
2384 | u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); | 2325 | u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); |
@@ -2388,10 +2329,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2388 | 2329 | ||
2389 | /* FIXME: old_tss_base == ~0 ? */ | 2330 | /* FIXME: old_tss_base == ~0 ? */ |
2390 | 2331 | ||
2391 | ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc); | 2332 | ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); |
2392 | if (ret != X86EMUL_CONTINUE) | 2333 | if (ret != X86EMUL_CONTINUE) |
2393 | return ret; | 2334 | return ret; |
2394 | ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc); | 2335 | ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); |
2395 | if (ret != X86EMUL_CONTINUE) | 2336 | if (ret != X86EMUL_CONTINUE) |
2396 | return ret; | 2337 | return ret; |
2397 | 2338 | ||
@@ -2413,8 +2354,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2413 | 2354 | ||
2414 | if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { | 2355 | if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { |
2415 | curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ | 2356 | curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ |
2416 | write_segment_descriptor(ctxt, ops, old_tss_sel, | 2357 | write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); |
2417 | &curr_tss_desc); | ||
2418 | } | 2358 | } |
2419 | 2359 | ||
2420 | if (reason == TASK_SWITCH_IRET) | 2360 | if (reason == TASK_SWITCH_IRET) |
@@ -2426,10 +2366,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2426 | old_tss_sel = 0xffff; | 2366 | old_tss_sel = 0xffff; |
2427 | 2367 | ||
2428 | if (next_tss_desc.type & 8) | 2368 | if (next_tss_desc.type & 8) |
2429 | ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel, | 2369 | ret = task_switch_32(ctxt, tss_selector, old_tss_sel, |
2430 | old_tss_base, &next_tss_desc); | 2370 | old_tss_base, &next_tss_desc); |
2431 | else | 2371 | else |
2432 | ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel, | 2372 | ret = task_switch_16(ctxt, tss_selector, old_tss_sel, |
2433 | old_tss_base, &next_tss_desc); | 2373 | old_tss_base, &next_tss_desc); |
2434 | if (ret != X86EMUL_CONTINUE) | 2374 | if (ret != X86EMUL_CONTINUE) |
2435 | return ret; | 2375 | return ret; |
@@ -2439,19 +2379,16 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2439 | 2379 | ||
2440 | if (reason != TASK_SWITCH_IRET) { | 2380 | if (reason != TASK_SWITCH_IRET) { |
2441 | next_tss_desc.type |= (1 << 1); /* set busy flag */ | 2381 | next_tss_desc.type |= (1 << 1); /* set busy flag */ |
2442 | write_segment_descriptor(ctxt, ops, tss_selector, | 2382 | write_segment_descriptor(ctxt, tss_selector, &next_tss_desc); |
2443 | &next_tss_desc); | ||
2444 | } | 2383 | } |
2445 | 2384 | ||
2446 | ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); | 2385 | ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); |
2447 | ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); | 2386 | ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); |
2448 | 2387 | ||
2449 | if (has_error_code) { | 2388 | if (has_error_code) { |
2450 | struct decode_cache *c = &ctxt->decode; | 2389 | ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; |
2451 | 2390 | ctxt->lock_prefix = 0; | |
2452 | c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; | 2391 | ctxt->src.val = (unsigned long) error_code; |
2453 | c->lock_prefix = 0; | ||
2454 | c->src.val = (unsigned long) error_code; | ||
2455 | ret = em_push(ctxt); | 2392 | ret = em_push(ctxt); |
2456 | } | 2393 | } |
2457 | 2394 | ||
@@ -2462,18 +2399,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2462 | u16 tss_selector, int reason, | 2399 | u16 tss_selector, int reason, |
2463 | bool has_error_code, u32 error_code) | 2400 | bool has_error_code, u32 error_code) |
2464 | { | 2401 | { |
2465 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2466 | struct decode_cache *c = &ctxt->decode; | ||
2467 | int rc; | 2402 | int rc; |
2468 | 2403 | ||
2469 | c->eip = ctxt->eip; | 2404 | ctxt->_eip = ctxt->eip; |
2470 | c->dst.type = OP_NONE; | 2405 | ctxt->dst.type = OP_NONE; |
2471 | 2406 | ||
2472 | rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, | 2407 | rc = emulator_do_task_switch(ctxt, tss_selector, reason, |
2473 | has_error_code, error_code); | 2408 | has_error_code, error_code); |
2474 | 2409 | ||
2475 | if (rc == X86EMUL_CONTINUE) | 2410 | if (rc == X86EMUL_CONTINUE) |
2476 | ctxt->eip = c->eip; | 2411 | ctxt->eip = ctxt->_eip; |
2477 | 2412 | ||
2478 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; | 2413 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; |
2479 | } | 2414 | } |
@@ -2481,22 +2416,20 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2481 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, | 2416 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, |
2482 | int reg, struct operand *op) | 2417 | int reg, struct operand *op) |
2483 | { | 2418 | { |
2484 | struct decode_cache *c = &ctxt->decode; | ||
2485 | int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; | 2419 | int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; |
2486 | 2420 | ||
2487 | register_address_increment(c, &c->regs[reg], df * op->bytes); | 2421 | register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes); |
2488 | op->addr.mem.ea = register_address(c, c->regs[reg]); | 2422 | op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]); |
2489 | op->addr.mem.seg = seg; | 2423 | op->addr.mem.seg = seg; |
2490 | } | 2424 | } |
2491 | 2425 | ||
2492 | static int em_das(struct x86_emulate_ctxt *ctxt) | 2426 | static int em_das(struct x86_emulate_ctxt *ctxt) |
2493 | { | 2427 | { |
2494 | struct decode_cache *c = &ctxt->decode; | ||
2495 | u8 al, old_al; | 2428 | u8 al, old_al; |
2496 | bool af, cf, old_cf; | 2429 | bool af, cf, old_cf; |
2497 | 2430 | ||
2498 | cf = ctxt->eflags & X86_EFLAGS_CF; | 2431 | cf = ctxt->eflags & X86_EFLAGS_CF; |
2499 | al = c->dst.val; | 2432 | al = ctxt->dst.val; |
2500 | 2433 | ||
2501 | old_al = al; | 2434 | old_al = al; |
2502 | old_cf = cf; | 2435 | old_cf = cf; |
@@ -2514,12 +2447,12 @@ static int em_das(struct x86_emulate_ctxt *ctxt) | |||
2514 | cf = true; | 2447 | cf = true; |
2515 | } | 2448 | } |
2516 | 2449 | ||
2517 | c->dst.val = al; | 2450 | ctxt->dst.val = al; |
2518 | /* Set PF, ZF, SF */ | 2451 | /* Set PF, ZF, SF */ |
2519 | c->src.type = OP_IMM; | 2452 | ctxt->src.type = OP_IMM; |
2520 | c->src.val = 0; | 2453 | ctxt->src.val = 0; |
2521 | c->src.bytes = 1; | 2454 | ctxt->src.bytes = 1; |
2522 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); | 2455 | emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); |
2523 | ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); | 2456 | ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); |
2524 | if (cf) | 2457 | if (cf) |
2525 | ctxt->eflags |= X86_EFLAGS_CF; | 2458 | ctxt->eflags |= X86_EFLAGS_CF; |
@@ -2530,175 +2463,189 @@ static int em_das(struct x86_emulate_ctxt *ctxt) | |||
2530 | 2463 | ||
2531 | static int em_call_far(struct x86_emulate_ctxt *ctxt) | 2464 | static int em_call_far(struct x86_emulate_ctxt *ctxt) |
2532 | { | 2465 | { |
2533 | struct decode_cache *c = &ctxt->decode; | ||
2534 | u16 sel, old_cs; | 2466 | u16 sel, old_cs; |
2535 | ulong old_eip; | 2467 | ulong old_eip; |
2536 | int rc; | 2468 | int rc; |
2537 | 2469 | ||
2538 | old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); | 2470 | old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); |
2539 | old_eip = c->eip; | 2471 | old_eip = ctxt->_eip; |
2540 | 2472 | ||
2541 | memcpy(&sel, c->src.valptr + c->op_bytes, 2); | 2473 | memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); |
2542 | if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) | 2474 | if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS)) |
2543 | return X86EMUL_CONTINUE; | 2475 | return X86EMUL_CONTINUE; |
2544 | 2476 | ||
2545 | c->eip = 0; | 2477 | ctxt->_eip = 0; |
2546 | memcpy(&c->eip, c->src.valptr, c->op_bytes); | 2478 | memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); |
2547 | 2479 | ||
2548 | c->src.val = old_cs; | 2480 | ctxt->src.val = old_cs; |
2549 | rc = em_push(ctxt); | 2481 | rc = em_push(ctxt); |
2550 | if (rc != X86EMUL_CONTINUE) | 2482 | if (rc != X86EMUL_CONTINUE) |
2551 | return rc; | 2483 | return rc; |
2552 | 2484 | ||
2553 | c->src.val = old_eip; | 2485 | ctxt->src.val = old_eip; |
2554 | return em_push(ctxt); | 2486 | return em_push(ctxt); |
2555 | } | 2487 | } |
2556 | 2488 | ||
2557 | static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) | 2489 | static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) |
2558 | { | 2490 | { |
2559 | struct decode_cache *c = &ctxt->decode; | ||
2560 | int rc; | 2491 | int rc; |
2561 | 2492 | ||
2562 | c->dst.type = OP_REG; | 2493 | ctxt->dst.type = OP_REG; |
2563 | c->dst.addr.reg = &c->eip; | 2494 | ctxt->dst.addr.reg = &ctxt->_eip; |
2564 | c->dst.bytes = c->op_bytes; | 2495 | ctxt->dst.bytes = ctxt->op_bytes; |
2565 | rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes); | 2496 | rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); |
2566 | if (rc != X86EMUL_CONTINUE) | 2497 | if (rc != X86EMUL_CONTINUE) |
2567 | return rc; | 2498 | return rc; |
2568 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); | 2499 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val); |
2569 | return X86EMUL_CONTINUE; | 2500 | return X86EMUL_CONTINUE; |
2570 | } | 2501 | } |
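
em_ret_near_imm above implements RET imm16: pop the return address into the instruction pointer, then advance the stack pointer by the immediate so the callee discards its arguments. A toy, self-contained version of the same sequence; toy_cpu, pop32 and ret_near_imm are invented names for illustration, not emulator code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_cpu {
        uint32_t eip;
        uint32_t esp;
        uint8_t  stack[64];
};

static uint32_t pop32(struct toy_cpu *c)
{
        uint32_t v;
        memcpy(&v, &c->stack[c->esp], sizeof(v));
        c->esp += sizeof(v);
        return v;
}

static void ret_near_imm(struct toy_cpu *c, uint16_t imm)
{
        c->eip  = pop32(c);   /* return address */
        c->esp += imm;        /* drop the callee-cleaned arguments */
}

int main(void)
{
        struct toy_cpu c = { .esp = 16 };
        uint32_t ret = 0x1234;

        memcpy(&c.stack[c.esp], &ret, sizeof(ret));
        ret_near_imm(&c, 8);
        printf("eip=%#x esp=%u\n", (unsigned)c.eip, (unsigned)c.esp);
        return 0;
}
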
2571 | 2502 | ||
2572 | static int em_add(struct x86_emulate_ctxt *ctxt) | 2503 | static int em_add(struct x86_emulate_ctxt *ctxt) |
2573 | { | 2504 | { |
2574 | struct decode_cache *c = &ctxt->decode; | 2505 | emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); |
2575 | |||
2576 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); | ||
2577 | return X86EMUL_CONTINUE; | 2506 | return X86EMUL_CONTINUE; |
2578 | } | 2507 | } |
2579 | 2508 | ||
2580 | static int em_or(struct x86_emulate_ctxt *ctxt) | 2509 | static int em_or(struct x86_emulate_ctxt *ctxt) |
2581 | { | 2510 | { |
2582 | struct decode_cache *c = &ctxt->decode; | 2511 | emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); |
2583 | |||
2584 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); | ||
2585 | return X86EMUL_CONTINUE; | 2512 | return X86EMUL_CONTINUE; |
2586 | } | 2513 | } |
2587 | 2514 | ||
2588 | static int em_adc(struct x86_emulate_ctxt *ctxt) | 2515 | static int em_adc(struct x86_emulate_ctxt *ctxt) |
2589 | { | 2516 | { |
2590 | struct decode_cache *c = &ctxt->decode; | 2517 | emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags); |
2591 | |||
2592 | emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); | ||
2593 | return X86EMUL_CONTINUE; | 2518 | return X86EMUL_CONTINUE; |
2594 | } | 2519 | } |
2595 | 2520 | ||
2596 | static int em_sbb(struct x86_emulate_ctxt *ctxt) | 2521 | static int em_sbb(struct x86_emulate_ctxt *ctxt) |
2597 | { | 2522 | { |
2598 | struct decode_cache *c = &ctxt->decode; | 2523 | emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags); |
2599 | |||
2600 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); | ||
2601 | return X86EMUL_CONTINUE; | 2524 | return X86EMUL_CONTINUE; |
2602 | } | 2525 | } |
2603 | 2526 | ||
2604 | static int em_and(struct x86_emulate_ctxt *ctxt) | 2527 | static int em_and(struct x86_emulate_ctxt *ctxt) |
2605 | { | 2528 | { |
2606 | struct decode_cache *c = &ctxt->decode; | 2529 | emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags); |
2607 | |||
2608 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); | ||
2609 | return X86EMUL_CONTINUE; | 2530 | return X86EMUL_CONTINUE; |
2610 | } | 2531 | } |
2611 | 2532 | ||
2612 | static int em_sub(struct x86_emulate_ctxt *ctxt) | 2533 | static int em_sub(struct x86_emulate_ctxt *ctxt) |
2613 | { | 2534 | { |
2614 | struct decode_cache *c = &ctxt->decode; | 2535 | emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags); |
2615 | |||
2616 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); | ||
2617 | return X86EMUL_CONTINUE; | 2536 | return X86EMUL_CONTINUE; |
2618 | } | 2537 | } |
2619 | 2538 | ||
2620 | static int em_xor(struct x86_emulate_ctxt *ctxt) | 2539 | static int em_xor(struct x86_emulate_ctxt *ctxt) |
2621 | { | 2540 | { |
2622 | struct decode_cache *c = &ctxt->decode; | 2541 | emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags); |
2623 | |||
2624 | emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); | ||
2625 | return X86EMUL_CONTINUE; | 2542 | return X86EMUL_CONTINUE; |
2626 | } | 2543 | } |
2627 | 2544 | ||
2628 | static int em_cmp(struct x86_emulate_ctxt *ctxt) | 2545 | static int em_cmp(struct x86_emulate_ctxt *ctxt) |
2629 | { | 2546 | { |
2630 | struct decode_cache *c = &ctxt->decode; | 2547 | emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); |
2631 | |||
2632 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
2633 | /* Disable writeback. */ | 2548 | /* Disable writeback. */ |
2634 | c->dst.type = OP_NONE; | 2549 | ctxt->dst.type = OP_NONE; |
2635 | return X86EMUL_CONTINUE; | 2550 | return X86EMUL_CONTINUE; |
2636 | } | 2551 | } |
2637 | 2552 | ||
2638 | static int em_imul(struct x86_emulate_ctxt *ctxt) | 2553 | static int em_test(struct x86_emulate_ctxt *ctxt) |
2554 | { | ||
2555 | emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); | ||
2556 | return X86EMUL_CONTINUE; | ||
2557 | } | ||
2558 | |||
2559 | static int em_xchg(struct x86_emulate_ctxt *ctxt) | ||
2639 | { | 2560 | { |
2640 | struct decode_cache *c = &ctxt->decode; | 2561 | /* Write back the register source. */ |
2562 | ctxt->src.val = ctxt->dst.val; | ||
2563 | write_register_operand(&ctxt->src); | ||
2641 | 2564 | ||
2642 | emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); | 2565 | /* Write back the memory destination with implicit LOCK prefix. */ |
2566 | ctxt->dst.val = ctxt->src.orig_val; | ||
2567 | ctxt->lock_prefix = 1; | ||
2643 | return X86EMUL_CONTINUE; | 2568 | return X86EMUL_CONTINUE; |
2644 | } | 2569 | } |
2645 | 2570 | ||
2646 | static int em_imul_3op(struct x86_emulate_ctxt *ctxt) | 2571 | static int em_imul(struct x86_emulate_ctxt *ctxt) |
2647 | { | 2572 | { |
2648 | struct decode_cache *c = &ctxt->decode; | 2573 | emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags); |
2574 | return X86EMUL_CONTINUE; | ||
2575 | } | ||
2649 | 2576 | ||
2650 | c->dst.val = c->src2.val; | 2577 | static int em_imul_3op(struct x86_emulate_ctxt *ctxt) |
2578 | { | ||
2579 | ctxt->dst.val = ctxt->src2.val; | ||
2651 | return em_imul(ctxt); | 2580 | return em_imul(ctxt); |
2652 | } | 2581 | } |
2653 | 2582 | ||
2654 | static int em_cwd(struct x86_emulate_ctxt *ctxt) | 2583 | static int em_cwd(struct x86_emulate_ctxt *ctxt) |
2655 | { | 2584 | { |
2656 | struct decode_cache *c = &ctxt->decode; | 2585 | ctxt->dst.type = OP_REG; |
2657 | 2586 | ctxt->dst.bytes = ctxt->src.bytes; | |
2658 | c->dst.type = OP_REG; | 2587 | ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; |
2659 | c->dst.bytes = c->src.bytes; | 2588 | ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1); |
2660 | c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; | ||
2661 | c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1); | ||
2662 | 2589 | ||
2663 | return X86EMUL_CONTINUE; | 2590 | return X86EMUL_CONTINUE; |
2664 | } | 2591 | } |
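
The expression in em_cwd replicates the operand's sign bit across the whole destination, which is what CWD/CDQ/CQO store in DX/EDX/RDX. Here is the trick in isolation, assuming the value only occupies its low "bytes" bytes; sign_fill is an invented helper name, not part of the emulator.

#include <stdio.h>

/* Replicate the sign bit of an n-byte value across a full register. */
static unsigned long sign_fill(unsigned long val, unsigned int bytes)
{
        /* top bit set:   (1 - 1) = 0,     ~0      = all ones
         * top bit clear: (0 - 1) = ~0UL,  ~(~0UL) = 0          */
        return ~((val >> (bytes * 8 - 1)) - 1);
}

int main(void)
{
        printf("%lx\n", sign_fill(0x8000, 2));  /* negative 16-bit -> all ones */
        printf("%lx\n", sign_fill(0x7fff, 2));  /* positive 16-bit -> 0 */
        return 0;
}
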
2665 | 2592 | ||
2666 | static int em_rdtsc(struct x86_emulate_ctxt *ctxt) | 2593 | static int em_rdtsc(struct x86_emulate_ctxt *ctxt) |
2667 | { | 2594 | { |
2668 | struct decode_cache *c = &ctxt->decode; | ||
2669 | u64 tsc = 0; | 2595 | u64 tsc = 0; |
2670 | 2596 | ||
2671 | ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); | 2597 | ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); |
2672 | c->regs[VCPU_REGS_RAX] = (u32)tsc; | 2598 | ctxt->regs[VCPU_REGS_RAX] = (u32)tsc; |
2673 | c->regs[VCPU_REGS_RDX] = tsc >> 32; | 2599 | ctxt->regs[VCPU_REGS_RDX] = tsc >> 32; |
2674 | return X86EMUL_CONTINUE; | 2600 | return X86EMUL_CONTINUE; |
2675 | } | 2601 | } |
2676 | 2602 | ||
2677 | static int em_mov(struct x86_emulate_ctxt *ctxt) | 2603 | static int em_mov(struct x86_emulate_ctxt *ctxt) |
2678 | { | 2604 | { |
2679 | struct decode_cache *c = &ctxt->decode; | 2605 | ctxt->dst.val = ctxt->src.val; |
2680 | c->dst.val = c->src.val; | ||
2681 | return X86EMUL_CONTINUE; | 2606 | return X86EMUL_CONTINUE; |
2682 | } | 2607 | } |
2683 | 2608 | ||
2609 | static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) | ||
2610 | { | ||
2611 | if (ctxt->modrm_reg > VCPU_SREG_GS) | ||
2612 | return emulate_ud(ctxt); | ||
2613 | |||
2614 | ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg); | ||
2615 | return X86EMUL_CONTINUE; | ||
2616 | } | ||
2617 | |||
2618 | static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) | ||
2619 | { | ||
2620 | u16 sel = ctxt->src.val; | ||
2621 | |||
2622 | if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS) | ||
2623 | return emulate_ud(ctxt); | ||
2624 | |||
2625 | if (ctxt->modrm_reg == VCPU_SREG_SS) | ||
2626 | ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; | ||
2627 | |||
2628 | /* Disable writeback. */ | ||
2629 | ctxt->dst.type = OP_NONE; | ||
2630 | return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); | ||
2631 | } | ||
2632 | |||
2684 | static int em_movdqu(struct x86_emulate_ctxt *ctxt) | 2633 | static int em_movdqu(struct x86_emulate_ctxt *ctxt) |
2685 | { | 2634 | { |
2686 | struct decode_cache *c = &ctxt->decode; | 2635 | memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes); |
2687 | memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes); | ||
2688 | return X86EMUL_CONTINUE; | 2636 | return X86EMUL_CONTINUE; |
2689 | } | 2637 | } |
2690 | 2638 | ||
2691 | static int em_invlpg(struct x86_emulate_ctxt *ctxt) | 2639 | static int em_invlpg(struct x86_emulate_ctxt *ctxt) |
2692 | { | 2640 | { |
2693 | struct decode_cache *c = &ctxt->decode; | ||
2694 | int rc; | 2641 | int rc; |
2695 | ulong linear; | 2642 | ulong linear; |
2696 | 2643 | ||
2697 | rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear); | 2644 | rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear); |
2698 | if (rc == X86EMUL_CONTINUE) | 2645 | if (rc == X86EMUL_CONTINUE) |
2699 | ctxt->ops->invlpg(ctxt, linear); | 2646 | ctxt->ops->invlpg(ctxt, linear); |
2700 | /* Disable writeback. */ | 2647 | /* Disable writeback. */ |
2701 | c->dst.type = OP_NONE; | 2648 | ctxt->dst.type = OP_NONE; |
2702 | return X86EMUL_CONTINUE; | 2649 | return X86EMUL_CONTINUE; |
2703 | } | 2650 | } |
2704 | 2651 | ||
@@ -2714,10 +2661,9 @@ static int em_clts(struct x86_emulate_ctxt *ctxt) | |||
2714 | 2661 | ||
2715 | static int em_vmcall(struct x86_emulate_ctxt *ctxt) | 2662 | static int em_vmcall(struct x86_emulate_ctxt *ctxt) |
2716 | { | 2663 | { |
2717 | struct decode_cache *c = &ctxt->decode; | ||
2718 | int rc; | 2664 | int rc; |
2719 | 2665 | ||
2720 | if (c->modrm_mod != 3 || c->modrm_rm != 1) | 2666 | if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1) |
2721 | return X86EMUL_UNHANDLEABLE; | 2667 | return X86EMUL_UNHANDLEABLE; |
2722 | 2668 | ||
2723 | rc = ctxt->ops->fix_hypercall(ctxt); | 2669 | rc = ctxt->ops->fix_hypercall(ctxt); |
@@ -2725,73 +2671,104 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt) | |||
2725 | return rc; | 2671 | return rc; |
2726 | 2672 | ||
2727 | /* Let the processor re-execute the fixed hypercall */ | 2673 | /* Let the processor re-execute the fixed hypercall */ |
2728 | c->eip = ctxt->eip; | 2674 | ctxt->_eip = ctxt->eip; |
2729 | /* Disable writeback. */ | 2675 | /* Disable writeback. */ |
2730 | c->dst.type = OP_NONE; | 2676 | ctxt->dst.type = OP_NONE; |
2731 | return X86EMUL_CONTINUE; | 2677 | return X86EMUL_CONTINUE; |
2732 | } | 2678 | } |
2733 | 2679 | ||
2734 | static int em_lgdt(struct x86_emulate_ctxt *ctxt) | 2680 | static int em_lgdt(struct x86_emulate_ctxt *ctxt) |
2735 | { | 2681 | { |
2736 | struct decode_cache *c = &ctxt->decode; | ||
2737 | struct desc_ptr desc_ptr; | 2682 | struct desc_ptr desc_ptr; |
2738 | int rc; | 2683 | int rc; |
2739 | 2684 | ||
2740 | rc = read_descriptor(ctxt, c->src.addr.mem, | 2685 | rc = read_descriptor(ctxt, ctxt->src.addr.mem, |
2741 | &desc_ptr.size, &desc_ptr.address, | 2686 | &desc_ptr.size, &desc_ptr.address, |
2742 | c->op_bytes); | 2687 | ctxt->op_bytes); |
2743 | if (rc != X86EMUL_CONTINUE) | 2688 | if (rc != X86EMUL_CONTINUE) |
2744 | return rc; | 2689 | return rc; |
2745 | ctxt->ops->set_gdt(ctxt, &desc_ptr); | 2690 | ctxt->ops->set_gdt(ctxt, &desc_ptr); |
2746 | /* Disable writeback. */ | 2691 | /* Disable writeback. */ |
2747 | c->dst.type = OP_NONE; | 2692 | ctxt->dst.type = OP_NONE; |
2748 | return X86EMUL_CONTINUE; | 2693 | return X86EMUL_CONTINUE; |
2749 | } | 2694 | } |
2750 | 2695 | ||
2751 | static int em_vmmcall(struct x86_emulate_ctxt *ctxt) | 2696 | static int em_vmmcall(struct x86_emulate_ctxt *ctxt) |
2752 | { | 2697 | { |
2753 | struct decode_cache *c = &ctxt->decode; | ||
2754 | int rc; | 2698 | int rc; |
2755 | 2699 | ||
2756 | rc = ctxt->ops->fix_hypercall(ctxt); | 2700 | rc = ctxt->ops->fix_hypercall(ctxt); |
2757 | 2701 | ||
2758 | /* Disable writeback. */ | 2702 | /* Disable writeback. */ |
2759 | c->dst.type = OP_NONE; | 2703 | ctxt->dst.type = OP_NONE; |
2760 | return rc; | 2704 | return rc; |
2761 | } | 2705 | } |
2762 | 2706 | ||
2763 | static int em_lidt(struct x86_emulate_ctxt *ctxt) | 2707 | static int em_lidt(struct x86_emulate_ctxt *ctxt) |
2764 | { | 2708 | { |
2765 | struct decode_cache *c = &ctxt->decode; | ||
2766 | struct desc_ptr desc_ptr; | 2709 | struct desc_ptr desc_ptr; |
2767 | int rc; | 2710 | int rc; |
2768 | 2711 | ||
2769 | rc = read_descriptor(ctxt, c->src.addr.mem, | 2712 | rc = read_descriptor(ctxt, ctxt->src.addr.mem, |
2770 | &desc_ptr.size, &desc_ptr.address, | 2713 | &desc_ptr.size, &desc_ptr.address, |
2771 | c->op_bytes); | 2714 | ctxt->op_bytes); |
2772 | if (rc != X86EMUL_CONTINUE) | 2715 | if (rc != X86EMUL_CONTINUE) |
2773 | return rc; | 2716 | return rc; |
2774 | ctxt->ops->set_idt(ctxt, &desc_ptr); | 2717 | ctxt->ops->set_idt(ctxt, &desc_ptr); |
2775 | /* Disable writeback. */ | 2718 | /* Disable writeback. */ |
2776 | c->dst.type = OP_NONE; | 2719 | ctxt->dst.type = OP_NONE; |
2777 | return X86EMUL_CONTINUE; | 2720 | return X86EMUL_CONTINUE; |
2778 | } | 2721 | } |
2779 | 2722 | ||
2780 | static int em_smsw(struct x86_emulate_ctxt *ctxt) | 2723 | static int em_smsw(struct x86_emulate_ctxt *ctxt) |
2781 | { | 2724 | { |
2782 | struct decode_cache *c = &ctxt->decode; | 2725 | ctxt->dst.bytes = 2; |
2783 | 2726 | ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0); | |
2784 | c->dst.bytes = 2; | ||
2785 | c->dst.val = ctxt->ops->get_cr(ctxt, 0); | ||
2786 | return X86EMUL_CONTINUE; | 2727 | return X86EMUL_CONTINUE; |
2787 | } | 2728 | } |
2788 | 2729 | ||
2789 | static int em_lmsw(struct x86_emulate_ctxt *ctxt) | 2730 | static int em_lmsw(struct x86_emulate_ctxt *ctxt) |
2790 | { | 2731 | { |
2791 | struct decode_cache *c = &ctxt->decode; | ||
2792 | ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) | 2732 | ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) |
2793 | | (c->src.val & 0x0f)); | 2733 | | (ctxt->src.val & 0x0f)); |
2794 | c->dst.type = OP_NONE; | 2734 | ctxt->dst.type = OP_NONE; |
2735 | return X86EMUL_CONTINUE; | ||
2736 | } | ||
2737 | |||
2738 | static int em_loop(struct x86_emulate_ctxt *ctxt) | ||
2739 | { | ||
2740 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); | ||
2741 | if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) && | ||
2742 | (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) | ||
2743 | jmp_rel(ctxt, ctxt->src.val); | ||
2744 | |||
2745 | return X86EMUL_CONTINUE; | ||
2746 | } | ||
2747 | |||
2748 | static int em_jcxz(struct x86_emulate_ctxt *ctxt) | ||
2749 | { | ||
2750 | if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) | ||
2751 | jmp_rel(ctxt, ctxt->src.val); | ||
2752 | |||
2753 | return X86EMUL_CONTINUE; | ||
2754 | } | ||
2755 | |||
2756 | static int em_cli(struct x86_emulate_ctxt *ctxt) | ||
2757 | { | ||
2758 | if (emulator_bad_iopl(ctxt)) | ||
2759 | return emulate_gp(ctxt, 0); | ||
2760 | |||
2761 | ctxt->eflags &= ~X86_EFLAGS_IF; | ||
2762 | return X86EMUL_CONTINUE; | ||
2763 | } | ||
2764 | |||
2765 | static int em_sti(struct x86_emulate_ctxt *ctxt) | ||
2766 | { | ||
2767 | if (emulator_bad_iopl(ctxt)) | ||
2768 | return emulate_gp(ctxt, 0); | ||
2769 | |||
2770 | ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; | ||
2771 | ctxt->eflags |= X86_EFLAGS_IF; | ||
2795 | return X86EMUL_CONTINUE; | 2772 | return X86EMUL_CONTINUE; |
2796 | } | 2773 | } |
2797 | 2774 | ||
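
The new em_cli/em_sti handlers reuse emulator_bad_iopl() for the privilege rule that IF may only be toggled when CPL does not exceed IOPL (real mode always passes the check, VM86 never does). A reduced stand-alone version of just the protected-mode CPL-versus-IOPL comparison is sketched below; the helper name and constants are illustrative, not the emulator's.

#include <stdbool.h>
#include <stdio.h>

#define EFLAGS_IOPL_SHIFT 12
#define EFLAGS_IOPL_MASK  (3u << EFLAGS_IOPL_SHIFT)
#define EFLAGS_IF         (1u << 9)

/* CLI/STI are permitted in protected mode only when CPL <= IOPL. */
static bool cli_sti_allowed(unsigned int eflags, unsigned int cpl)
{
        unsigned int iopl = (eflags & EFLAGS_IOPL_MASK) >> EFLAGS_IOPL_SHIFT;
        return cpl <= iopl;
}

int main(void)
{
        unsigned int eflags = EFLAGS_IF;                     /* IOPL == 0 */
        printf("ring0: %d\n", cli_sti_allowed(eflags, 0));   /* allowed */
        printf("ring3: %d\n", cli_sti_allowed(eflags, 3));   /* would #GP(0) */
        return 0;
}
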
@@ -2809,9 +2786,7 @@ static bool valid_cr(int nr) | |||
2809 | 2786 | ||
2810 | static int check_cr_read(struct x86_emulate_ctxt *ctxt) | 2787 | static int check_cr_read(struct x86_emulate_ctxt *ctxt) |
2811 | { | 2788 | { |
2812 | struct decode_cache *c = &ctxt->decode; | 2789 | if (!valid_cr(ctxt->modrm_reg)) |
2813 | |||
2814 | if (!valid_cr(c->modrm_reg)) | ||
2815 | return emulate_ud(ctxt); | 2790 | return emulate_ud(ctxt); |
2816 | 2791 | ||
2817 | return X86EMUL_CONTINUE; | 2792 | return X86EMUL_CONTINUE; |
@@ -2819,9 +2794,8 @@ static int check_cr_read(struct x86_emulate_ctxt *ctxt) | |||
2819 | 2794 | ||
2820 | static int check_cr_write(struct x86_emulate_ctxt *ctxt) | 2795 | static int check_cr_write(struct x86_emulate_ctxt *ctxt) |
2821 | { | 2796 | { |
2822 | struct decode_cache *c = &ctxt->decode; | 2797 | u64 new_val = ctxt->src.val64; |
2823 | u64 new_val = c->src.val64; | 2798 | int cr = ctxt->modrm_reg; |
2824 | int cr = c->modrm_reg; | ||
2825 | u64 efer = 0; | 2799 | u64 efer = 0; |
2826 | 2800 | ||
2827 | static u64 cr_reserved_bits[] = { | 2801 | static u64 cr_reserved_bits[] = { |
@@ -2898,8 +2872,7 @@ static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) | |||
2898 | 2872 | ||
2899 | static int check_dr_read(struct x86_emulate_ctxt *ctxt) | 2873 | static int check_dr_read(struct x86_emulate_ctxt *ctxt) |
2900 | { | 2874 | { |
2901 | struct decode_cache *c = &ctxt->decode; | 2875 | int dr = ctxt->modrm_reg; |
2902 | int dr = c->modrm_reg; | ||
2903 | u64 cr4; | 2876 | u64 cr4; |
2904 | 2877 | ||
2905 | if (dr > 7) | 2878 | if (dr > 7) |
@@ -2917,9 +2890,8 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt) | |||
2917 | 2890 | ||
2918 | static int check_dr_write(struct x86_emulate_ctxt *ctxt) | 2891 | static int check_dr_write(struct x86_emulate_ctxt *ctxt) |
2919 | { | 2892 | { |
2920 | struct decode_cache *c = &ctxt->decode; | 2893 | u64 new_val = ctxt->src.val64; |
2921 | u64 new_val = c->src.val64; | 2894 | int dr = ctxt->modrm_reg; |
2922 | int dr = c->modrm_reg; | ||
2923 | 2895 | ||
2924 | if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) | 2896 | if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) |
2925 | return emulate_gp(ctxt, 0); | 2897 | return emulate_gp(ctxt, 0); |
@@ -2941,7 +2913,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt) | |||
2941 | 2913 | ||
2942 | static int check_svme_pa(struct x86_emulate_ctxt *ctxt) | 2914 | static int check_svme_pa(struct x86_emulate_ctxt *ctxt) |
2943 | { | 2915 | { |
2944 | u64 rax = ctxt->decode.regs[VCPU_REGS_RAX]; | 2916 | u64 rax = ctxt->regs[VCPU_REGS_RAX]; |
2945 | 2917 | ||
2946 | /* Valid physical address? */ | 2918 | /* Valid physical address? */ |
2947 | if (rax & 0xffff000000000000ULL) | 2919 | if (rax & 0xffff000000000000ULL) |
@@ -2963,7 +2935,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt) | |||
2963 | static int check_rdpmc(struct x86_emulate_ctxt *ctxt) | 2935 | static int check_rdpmc(struct x86_emulate_ctxt *ctxt) |
2964 | { | 2936 | { |
2965 | u64 cr4 = ctxt->ops->get_cr(ctxt, 4); | 2937 | u64 cr4 = ctxt->ops->get_cr(ctxt, 4); |
2966 | u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX]; | 2938 | u64 rcx = ctxt->regs[VCPU_REGS_RCX]; |
2967 | 2939 | ||
2968 | if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || | 2940 | if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || |
2969 | (rcx > 3)) | 2941 | (rcx > 3)) |
@@ -2974,10 +2946,8 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) | |||
2974 | 2946 | ||
2975 | static int check_perm_in(struct x86_emulate_ctxt *ctxt) | 2947 | static int check_perm_in(struct x86_emulate_ctxt *ctxt) |
2976 | { | 2948 | { |
2977 | struct decode_cache *c = &ctxt->decode; | 2949 | ctxt->dst.bytes = min(ctxt->dst.bytes, 4u); |
2978 | 2950 | if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes)) | |
2979 | c->dst.bytes = min(c->dst.bytes, 4u); | ||
2980 | if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes)) | ||
2981 | return emulate_gp(ctxt, 0); | 2951 | return emulate_gp(ctxt, 0); |
2982 | 2952 | ||
2983 | return X86EMUL_CONTINUE; | 2953 | return X86EMUL_CONTINUE; |
@@ -2985,10 +2955,8 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt) | |||
2985 | 2955 | ||
2986 | static int check_perm_out(struct x86_emulate_ctxt *ctxt) | 2956 | static int check_perm_out(struct x86_emulate_ctxt *ctxt) |
2987 | { | 2957 | { |
2988 | struct decode_cache *c = &ctxt->decode; | 2958 | ctxt->src.bytes = min(ctxt->src.bytes, 4u); |
2989 | 2959 | if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes)) | |
2990 | c->src.bytes = min(c->src.bytes, 4u); | ||
2991 | if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes)) | ||
2992 | return emulate_gp(ctxt, 0); | 2960 | return emulate_gp(ctxt, 0); |
2993 | 2961 | ||
2994 | return X86EMUL_CONTINUE; | 2962 | return X86EMUL_CONTINUE; |
@@ -3165,12 +3133,15 @@ static struct opcode opcode_table[256] = { | |||
3165 | G(DstMem | SrcImm | ModRM | Group, group1), | 3133 | G(DstMem | SrcImm | ModRM | Group, group1), |
3166 | G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), | 3134 | G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), |
3167 | G(DstMem | SrcImmByte | ModRM | Group, group1), | 3135 | G(DstMem | SrcImmByte | ModRM | Group, group1), |
3168 | D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), | 3136 | I2bv(DstMem | SrcReg | ModRM, em_test), |
3137 | I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg), | ||
3169 | /* 0x88 - 0x8F */ | 3138 | /* 0x88 - 0x8F */ |
3170 | I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), | 3139 | I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), |
3171 | I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), | 3140 | I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), |
3172 | D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), | 3141 | I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg), |
3173 | D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), | 3142 | D(ModRM | SrcMem | NoAccess | DstReg), |
3143 | I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm), | ||
3144 | G(0, group1A), | ||
3174 | /* 0x90 - 0x97 */ | 3145 | /* 0x90 - 0x97 */ |
3175 | DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), | 3146 | DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), |
3176 | /* 0x98 - 0x9F */ | 3147 | /* 0x98 - 0x9F */ |
@@ -3184,7 +3155,7 @@ static struct opcode opcode_table[256] = { | |||
3184 | I2bv(SrcSI | DstDI | Mov | String, em_mov), | 3155 | I2bv(SrcSI | DstDI | Mov | String, em_mov), |
3185 | I2bv(SrcSI | DstDI | String, em_cmp), | 3156 | I2bv(SrcSI | DstDI | String, em_cmp), |
3186 | /* 0xA8 - 0xAF */ | 3157 | /* 0xA8 - 0xAF */ |
3187 | D2bv(DstAcc | SrcImm), | 3158 | I2bv(DstAcc | SrcImm, em_test), |
3188 | I2bv(SrcAcc | DstDI | Mov | String, em_mov), | 3159 | I2bv(SrcAcc | DstDI | Mov | String, em_mov), |
3189 | I2bv(SrcSI | DstAcc | Mov | String, em_mov), | 3160 | I2bv(SrcSI | DstAcc | Mov | String, em_mov), |
3190 | I2bv(SrcAcc | DstDI | String, em_cmp), | 3161 | I2bv(SrcAcc | DstDI | String, em_cmp), |
@@ -3195,25 +3166,26 @@ static struct opcode opcode_table[256] = { | |||
3195 | /* 0xC0 - 0xC7 */ | 3166 | /* 0xC0 - 0xC7 */ |
3196 | D2bv(DstMem | SrcImmByte | ModRM), | 3167 | D2bv(DstMem | SrcImmByte | ModRM), |
3197 | I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), | 3168 | I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), |
3198 | D(ImplicitOps | Stack), | 3169 | I(ImplicitOps | Stack, em_ret), |
3199 | D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), | 3170 | D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), |
3200 | G(ByteOp, group11), G(0, group11), | 3171 | G(ByteOp, group11), G(0, group11), |
3201 | /* 0xC8 - 0xCF */ | 3172 | /* 0xC8 - 0xCF */ |
3202 | N, N, N, D(ImplicitOps | Stack), | 3173 | N, N, N, I(ImplicitOps | Stack, em_ret_far), |
3203 | D(ImplicitOps), DI(SrcImmByte, intn), | 3174 | D(ImplicitOps), DI(SrcImmByte, intn), |
3204 | D(ImplicitOps | No64), DI(ImplicitOps, iret), | 3175 | D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), |
3205 | /* 0xD0 - 0xD7 */ | 3176 | /* 0xD0 - 0xD7 */ |
3206 | D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), | 3177 | D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), |
3207 | N, N, N, N, | 3178 | N, N, N, N, |
3208 | /* 0xD8 - 0xDF */ | 3179 | /* 0xD8 - 0xDF */ |
3209 | N, N, N, N, N, N, N, N, | 3180 | N, N, N, N, N, N, N, N, |
3210 | /* 0xE0 - 0xE7 */ | 3181 | /* 0xE0 - 0xE7 */ |
3211 | X4(D(SrcImmByte)), | 3182 | X3(I(SrcImmByte, em_loop)), |
3183 | I(SrcImmByte, em_jcxz), | ||
3212 | D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), | 3184 | D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), |
3213 | D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), | 3185 | D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), |
3214 | /* 0xE8 - 0xEF */ | 3186 | /* 0xE8 - 0xEF */ |
3215 | D(SrcImm | Stack), D(SrcImm | ImplicitOps), | 3187 | D(SrcImm | Stack), D(SrcImm | ImplicitOps), |
3216 | D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), | 3188 | I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps), |
3217 | D2bvIP(SrcDX | DstAcc, in, check_perm_in), | 3189 | D2bvIP(SrcDX | DstAcc, in, check_perm_in), |
3218 | D2bvIP(SrcAcc | DstDX, out, check_perm_out), | 3190 | D2bvIP(SrcAcc | DstDX, out, check_perm_out), |
3219 | /* 0xF0 - 0xF7 */ | 3191 | /* 0xF0 - 0xF7 */ |
@@ -3221,14 +3193,16 @@ static struct opcode opcode_table[256] = { | |||
3221 | DI(ImplicitOps | Priv, hlt), D(ImplicitOps), | 3193 | DI(ImplicitOps | Priv, hlt), D(ImplicitOps), |
3222 | G(ByteOp, group3), G(0, group3), | 3194 | G(ByteOp, group3), G(0, group3), |
3223 | /* 0xF8 - 0xFF */ | 3195 | /* 0xF8 - 0xFF */ |
3224 | D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), | 3196 | D(ImplicitOps), D(ImplicitOps), |
3197 | I(ImplicitOps, em_cli), I(ImplicitOps, em_sti), | ||
3225 | D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), | 3198 | D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), |
3226 | }; | 3199 | }; |
3227 | 3200 | ||
3228 | static struct opcode twobyte_table[256] = { | 3201 | static struct opcode twobyte_table[256] = { |
3229 | /* 0x00 - 0x0F */ | 3202 | /* 0x00 - 0x0F */ |
3230 | G(0, group6), GD(0, &group7), N, N, | 3203 | G(0, group6), GD(0, &group7), N, N, |
3231 | N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N, | 3204 | N, I(ImplicitOps | VendorSpecific, em_syscall), |
3205 | II(ImplicitOps | Priv, em_clts, clts), N, | ||
3232 | DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, | 3206 | DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, |
3233 | N, D(ImplicitOps | ModRM), N, N, | 3207 | N, D(ImplicitOps | ModRM), N, N, |
3234 | /* 0x10 - 0x1F */ | 3208 | /* 0x10 - 0x1F */ |
@@ -3245,7 +3219,8 @@ static struct opcode twobyte_table[256] = { | |||
3245 | IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), | 3219 | IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), |
3246 | DI(ImplicitOps | Priv, rdmsr), | 3220 | DI(ImplicitOps | Priv, rdmsr), |
3247 | DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), | 3221 | DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), |
3248 | D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), | 3222 | I(ImplicitOps | VendorSpecific, em_sysenter), |
3223 | I(ImplicitOps | Priv | VendorSpecific, em_sysexit), | ||
3249 | N, N, | 3224 | N, N, |
3250 | N, N, N, N, N, N, N, N, | 3225 | N, N, N, N, N, N, N, N, |
3251 | /* 0x40 - 0x4F */ | 3226 | /* 0x40 - 0x4F */ |
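Across the opcode-table hunks above, D()-style entries (decoded here, executed in the big switch of x86_emulate_insn) are replaced by I() and II() entries that bind an execute callback (and, for II(), an intercept id as well): test, xchg, ret, ret far, iret, loop, jcxz, cli, sti, syscall, sysenter and sysexit now dispatch through em_*() handlers. A minimal sketch of that table-driven dispatch, with invented names and only a token fallback switch:

/*
 * Minimal sketch of the dispatch pattern, with invented names: opcodes
 * either carry an execute() callback (the I()/II()-style entries) or fall
 * back to a switch on the opcode byte (the remaining D()-style entries).
 */
#include <stdio.h>

struct sketch_ctxt { unsigned char b; int acc; int src; };
typedef int (*exec_fn)(struct sketch_ctxt *);

static int em_add(struct sketch_ctxt *c) { c->acc += c->src; return 0; }
static int em_inc(struct sketch_ctxt *c) { c->acc += 1; return 0; }

static const exec_fn table[256] = {
	[0x01] = em_add,	/* I(...)-style: bound handler */
	[0x40] = em_inc,
	/* [0x90] left NULL: D(...)-style, handled in the switch below */
};

static int emulate_one(struct sketch_ctxt *c)
{
	if (table[c->b])
		return table[c->b](c);

	switch (c->b) {
	case 0x90:		/* nop */
		return 0;
	default:
		return -1;	/* cannot emulate */
	}
}

int main(void)
{
	struct sketch_ctxt c = { .b = 0x01, .acc = 1, .src = 2 };

	emulate_one(&c);
	printf("acc = %d\n", c.acc);	/* prints 3 */
	return 0;
}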
@@ -3313,11 +3288,11 @@ static struct opcode twobyte_table[256] = { | |||
3313 | #undef I2bv | 3288 | #undef I2bv |
3314 | #undef I6ALU | 3289 | #undef I6ALU |
3315 | 3290 | ||
3316 | static unsigned imm_size(struct decode_cache *c) | 3291 | static unsigned imm_size(struct x86_emulate_ctxt *ctxt) |
3317 | { | 3292 | { |
3318 | unsigned size; | 3293 | unsigned size; |
3319 | 3294 | ||
3320 | size = (c->d & ByteOp) ? 1 : c->op_bytes; | 3295 | size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3321 | if (size == 8) | 3296 | if (size == 8) |
3322 | size = 4; | 3297 | size = 4; |
3323 | return size; | 3298 | return size; |
@@ -3326,23 +3301,21 @@ static unsigned imm_size(struct decode_cache *c) | |||
3326 | static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, | 3301 | static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, |
3327 | unsigned size, bool sign_extension) | 3302 | unsigned size, bool sign_extension) |
3328 | { | 3303 | { |
3329 | struct decode_cache *c = &ctxt->decode; | ||
3330 | struct x86_emulate_ops *ops = ctxt->ops; | ||
3331 | int rc = X86EMUL_CONTINUE; | 3304 | int rc = X86EMUL_CONTINUE; |
3332 | 3305 | ||
3333 | op->type = OP_IMM; | 3306 | op->type = OP_IMM; |
3334 | op->bytes = size; | 3307 | op->bytes = size; |
3335 | op->addr.mem.ea = c->eip; | 3308 | op->addr.mem.ea = ctxt->_eip; |
3336 | /* NB. Immediates are sign-extended as necessary. */ | 3309 | /* NB. Immediates are sign-extended as necessary. */ |
3337 | switch (op->bytes) { | 3310 | switch (op->bytes) { |
3338 | case 1: | 3311 | case 1: |
3339 | op->val = insn_fetch(s8, 1, c->eip); | 3312 | op->val = insn_fetch(s8, 1, ctxt->_eip); |
3340 | break; | 3313 | break; |
3341 | case 2: | 3314 | case 2: |
3342 | op->val = insn_fetch(s16, 2, c->eip); | 3315 | op->val = insn_fetch(s16, 2, ctxt->_eip); |
3343 | break; | 3316 | break; |
3344 | case 4: | 3317 | case 4: |
3345 | op->val = insn_fetch(s32, 4, c->eip); | 3318 | op->val = insn_fetch(s32, 4, ctxt->_eip); |
3346 | break; | 3319 | break; |
3347 | } | 3320 | } |
3348 | if (!sign_extension) { | 3321 | if (!sign_extension) { |
@@ -3362,11 +3335,8 @@ done: | |||
3362 | return rc; | 3335 | return rc; |
3363 | } | 3336 | } |
3364 | 3337 | ||
3365 | int | 3338 | int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) |
3366 | x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) | ||
3367 | { | 3339 | { |
3368 | struct x86_emulate_ops *ops = ctxt->ops; | ||
3369 | struct decode_cache *c = &ctxt->decode; | ||
3370 | int rc = X86EMUL_CONTINUE; | 3340 | int rc = X86EMUL_CONTINUE; |
3371 | int mode = ctxt->mode; | 3341 | int mode = ctxt->mode; |
3372 | int def_op_bytes, def_ad_bytes, goffset, simd_prefix; | 3342 | int def_op_bytes, def_ad_bytes, goffset, simd_prefix; |
@@ -3374,11 +3344,11 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) | |||
3374 | struct opcode opcode; | 3344 | struct opcode opcode; |
3375 | struct operand memop = { .type = OP_NONE }, *memopp = NULL; | 3345 | struct operand memop = { .type = OP_NONE }, *memopp = NULL; |
3376 | 3346 | ||
3377 | c->eip = ctxt->eip; | 3347 | ctxt->_eip = ctxt->eip; |
3378 | c->fetch.start = c->eip; | 3348 | ctxt->fetch.start = ctxt->_eip; |
3379 | c->fetch.end = c->fetch.start + insn_len; | 3349 | ctxt->fetch.end = ctxt->fetch.start + insn_len; |
3380 | if (insn_len > 0) | 3350 | if (insn_len > 0) |
3381 | memcpy(c->fetch.data, insn, insn_len); | 3351 | memcpy(ctxt->fetch.data, insn, insn_len); |
3382 | 3352 | ||
3383 | switch (mode) { | 3353 | switch (mode) { |
3384 | case X86EMUL_MODE_REAL: | 3354 | case X86EMUL_MODE_REAL: |
@@ -3399,46 +3369,46 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) | |||
3399 | return -1; | 3369 | return -1; |
3400 | } | 3370 | } |
3401 | 3371 | ||
3402 | c->op_bytes = def_op_bytes; | 3372 | ctxt->op_bytes = def_op_bytes; |
3403 | c->ad_bytes = def_ad_bytes; | 3373 | ctxt->ad_bytes = def_ad_bytes; |
3404 | 3374 | ||
3405 | /* Legacy prefixes. */ | 3375 | /* Legacy prefixes. */ |
3406 | for (;;) { | 3376 | for (;;) { |
3407 | switch (c->b = insn_fetch(u8, 1, c->eip)) { | 3377 | switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) { |
3408 | case 0x66: /* operand-size override */ | 3378 | case 0x66: /* operand-size override */ |
3409 | op_prefix = true; | 3379 | op_prefix = true; |
3410 | /* switch between 2/4 bytes */ | 3380 | /* switch between 2/4 bytes */ |
3411 | c->op_bytes = def_op_bytes ^ 6; | 3381 | ctxt->op_bytes = def_op_bytes ^ 6; |
3412 | break; | 3382 | break; |
3413 | case 0x67: /* address-size override */ | 3383 | case 0x67: /* address-size override */ |
3414 | if (mode == X86EMUL_MODE_PROT64) | 3384 | if (mode == X86EMUL_MODE_PROT64) |
3415 | /* switch between 4/8 bytes */ | 3385 | /* switch between 4/8 bytes */ |
3416 | c->ad_bytes = def_ad_bytes ^ 12; | 3386 | ctxt->ad_bytes = def_ad_bytes ^ 12; |
3417 | else | 3387 | else |
3418 | /* switch between 2/4 bytes */ | 3388 | /* switch between 2/4 bytes */ |
3419 | c->ad_bytes = def_ad_bytes ^ 6; | 3389 | ctxt->ad_bytes = def_ad_bytes ^ 6; |
3420 | break; | 3390 | break; |
3421 | case 0x26: /* ES override */ | 3391 | case 0x26: /* ES override */ |
3422 | case 0x2e: /* CS override */ | 3392 | case 0x2e: /* CS override */ |
3423 | case 0x36: /* SS override */ | 3393 | case 0x36: /* SS override */ |
3424 | case 0x3e: /* DS override */ | 3394 | case 0x3e: /* DS override */ |
3425 | set_seg_override(c, (c->b >> 3) & 3); | 3395 | set_seg_override(ctxt, (ctxt->b >> 3) & 3); |
3426 | break; | 3396 | break; |
3427 | case 0x64: /* FS override */ | 3397 | case 0x64: /* FS override */ |
3428 | case 0x65: /* GS override */ | 3398 | case 0x65: /* GS override */ |
3429 | set_seg_override(c, c->b & 7); | 3399 | set_seg_override(ctxt, ctxt->b & 7); |
3430 | break; | 3400 | break; |
3431 | case 0x40 ... 0x4f: /* REX */ | 3401 | case 0x40 ... 0x4f: /* REX */ |
3432 | if (mode != X86EMUL_MODE_PROT64) | 3402 | if (mode != X86EMUL_MODE_PROT64) |
3433 | goto done_prefixes; | 3403 | goto done_prefixes; |
3434 | c->rex_prefix = c->b; | 3404 | ctxt->rex_prefix = ctxt->b; |
3435 | continue; | 3405 | continue; |
3436 | case 0xf0: /* LOCK */ | 3406 | case 0xf0: /* LOCK */ |
3437 | c->lock_prefix = 1; | 3407 | ctxt->lock_prefix = 1; |
3438 | break; | 3408 | break; |
3439 | case 0xf2: /* REPNE/REPNZ */ | 3409 | case 0xf2: /* REPNE/REPNZ */ |
3440 | case 0xf3: /* REP/REPE/REPZ */ | 3410 | case 0xf3: /* REP/REPE/REPZ */ |
3441 | c->rep_prefix = c->b; | 3411 | ctxt->rep_prefix = ctxt->b; |
3442 | break; | 3412 | break; |
3443 | default: | 3413 | default: |
3444 | goto done_prefixes; | 3414 | goto done_prefixes; |
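In the legacy-prefix loop above, the 0x66 and 0x67 prefixes flip the default width with an XOR rather than a conditional: 2 ^ 6 == 4 and 4 ^ 6 == 2 for the operand size, and 4 ^ 12 == 8 and 8 ^ 12 == 4 for the long-mode address size, so one expression toggles between the two legal widths. A tiny standalone check of those toggles:

/* Tiny check of the XOR toggles used for the 0x66/0x67 prefixes above. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* operand-size override: toggles 2 <-> 4 byte operands */
	assert((2 ^ 6) == 4 && (4 ^ 6) == 2);
	/* address-size override in long mode: toggles 4 <-> 8 byte addresses */
	assert((4 ^ 12) == 8 && (8 ^ 12) == 4);
	printf("prefix toggles behave as expected\n");
	return 0;
}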
@@ -3446,50 +3416,50 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) | |||
3446 | 3416 | ||
3447 | /* Any legacy prefix after a REX prefix nullifies its effect. */ | 3417 | /* Any legacy prefix after a REX prefix nullifies its effect. */ |
3448 | 3418 | ||
3449 | c->rex_prefix = 0; | 3419 | ctxt->rex_prefix = 0; |
3450 | } | 3420 | } |
3451 | 3421 | ||
3452 | done_prefixes: | 3422 | done_prefixes: |
3453 | 3423 | ||
3454 | /* REX prefix. */ | 3424 | /* REX prefix. */ |
3455 | if (c->rex_prefix & 8) | 3425 | if (ctxt->rex_prefix & 8) |
3456 | c->op_bytes = 8; /* REX.W */ | 3426 | ctxt->op_bytes = 8; /* REX.W */ |
3457 | 3427 | ||
3458 | /* Opcode byte(s). */ | 3428 | /* Opcode byte(s). */ |
3459 | opcode = opcode_table[c->b]; | 3429 | opcode = opcode_table[ctxt->b]; |
3460 | /* Two-byte opcode? */ | 3430 | /* Two-byte opcode? */ |
3461 | if (c->b == 0x0f) { | 3431 | if (ctxt->b == 0x0f) { |
3462 | c->twobyte = 1; | 3432 | ctxt->twobyte = 1; |
3463 | c->b = insn_fetch(u8, 1, c->eip); | 3433 | ctxt->b = insn_fetch(u8, 1, ctxt->_eip); |
3464 | opcode = twobyte_table[c->b]; | 3434 | opcode = twobyte_table[ctxt->b]; |
3465 | } | 3435 | } |
3466 | c->d = opcode.flags; | 3436 | ctxt->d = opcode.flags; |
3467 | 3437 | ||
3468 | while (c->d & GroupMask) { | 3438 | while (ctxt->d & GroupMask) { |
3469 | switch (c->d & GroupMask) { | 3439 | switch (ctxt->d & GroupMask) { |
3470 | case Group: | 3440 | case Group: |
3471 | c->modrm = insn_fetch(u8, 1, c->eip); | 3441 | ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); |
3472 | --c->eip; | 3442 | --ctxt->_eip; |
3473 | goffset = (c->modrm >> 3) & 7; | 3443 | goffset = (ctxt->modrm >> 3) & 7; |
3474 | opcode = opcode.u.group[goffset]; | 3444 | opcode = opcode.u.group[goffset]; |
3475 | break; | 3445 | break; |
3476 | case GroupDual: | 3446 | case GroupDual: |
3477 | c->modrm = insn_fetch(u8, 1, c->eip); | 3447 | ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); |
3478 | --c->eip; | 3448 | --ctxt->_eip; |
3479 | goffset = (c->modrm >> 3) & 7; | 3449 | goffset = (ctxt->modrm >> 3) & 7; |
3480 | if ((c->modrm >> 6) == 3) | 3450 | if ((ctxt->modrm >> 6) == 3) |
3481 | opcode = opcode.u.gdual->mod3[goffset]; | 3451 | opcode = opcode.u.gdual->mod3[goffset]; |
3482 | else | 3452 | else |
3483 | opcode = opcode.u.gdual->mod012[goffset]; | 3453 | opcode = opcode.u.gdual->mod012[goffset]; |
3484 | break; | 3454 | break; |
3485 | case RMExt: | 3455 | case RMExt: |
3486 | goffset = c->modrm & 7; | 3456 | goffset = ctxt->modrm & 7; |
3487 | opcode = opcode.u.group[goffset]; | 3457 | opcode = opcode.u.group[goffset]; |
3488 | break; | 3458 | break; |
3489 | case Prefix: | 3459 | case Prefix: |
3490 | if (c->rep_prefix && op_prefix) | 3460 | if (ctxt->rep_prefix && op_prefix) |
3491 | return X86EMUL_UNHANDLEABLE; | 3461 | return X86EMUL_UNHANDLEABLE; |
3492 | simd_prefix = op_prefix ? 0x66 : c->rep_prefix; | 3462 | simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix; |
3493 | switch (simd_prefix) { | 3463 | switch (simd_prefix) { |
3494 | case 0x00: opcode = opcode.u.gprefix->pfx_no; break; | 3464 | case 0x00: opcode = opcode.u.gprefix->pfx_no; break; |
3495 | case 0x66: opcode = opcode.u.gprefix->pfx_66; break; | 3465 | case 0x66: opcode = opcode.u.gprefix->pfx_66; break; |
@@ -3501,61 +3471,61 @@ done_prefixes: | |||
3501 | return X86EMUL_UNHANDLEABLE; | 3471 | return X86EMUL_UNHANDLEABLE; |
3502 | } | 3472 | } |
3503 | 3473 | ||
3504 | c->d &= ~GroupMask; | 3474 | ctxt->d &= ~GroupMask; |
3505 | c->d |= opcode.flags; | 3475 | ctxt->d |= opcode.flags; |
3506 | } | 3476 | } |
3507 | 3477 | ||
3508 | c->execute = opcode.u.execute; | 3478 | ctxt->execute = opcode.u.execute; |
3509 | c->check_perm = opcode.check_perm; | 3479 | ctxt->check_perm = opcode.check_perm; |
3510 | c->intercept = opcode.intercept; | 3480 | ctxt->intercept = opcode.intercept; |
3511 | 3481 | ||
3512 | /* Unrecognised? */ | 3482 | /* Unrecognised? */ |
3513 | if (c->d == 0 || (c->d & Undefined)) | 3483 | if (ctxt->d == 0 || (ctxt->d & Undefined)) |
3514 | return -1; | 3484 | return -1; |
3515 | 3485 | ||
3516 | if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn) | 3486 | if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) |
3517 | return -1; | 3487 | return -1; |
3518 | 3488 | ||
3519 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) | 3489 | if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) |
3520 | c->op_bytes = 8; | 3490 | ctxt->op_bytes = 8; |
3521 | 3491 | ||
3522 | if (c->d & Op3264) { | 3492 | if (ctxt->d & Op3264) { |
3523 | if (mode == X86EMUL_MODE_PROT64) | 3493 | if (mode == X86EMUL_MODE_PROT64) |
3524 | c->op_bytes = 8; | 3494 | ctxt->op_bytes = 8; |
3525 | else | 3495 | else |
3526 | c->op_bytes = 4; | 3496 | ctxt->op_bytes = 4; |
3527 | } | 3497 | } |
3528 | 3498 | ||
3529 | if (c->d & Sse) | 3499 | if (ctxt->d & Sse) |
3530 | c->op_bytes = 16; | 3500 | ctxt->op_bytes = 16; |
3531 | 3501 | ||
3532 | /* ModRM and SIB bytes. */ | 3502 | /* ModRM and SIB bytes. */ |
3533 | if (c->d & ModRM) { | 3503 | if (ctxt->d & ModRM) { |
3534 | rc = decode_modrm(ctxt, ops, &memop); | 3504 | rc = decode_modrm(ctxt, &memop); |
3535 | if (!c->has_seg_override) | 3505 | if (!ctxt->has_seg_override) |
3536 | set_seg_override(c, c->modrm_seg); | 3506 | set_seg_override(ctxt, ctxt->modrm_seg); |
3537 | } else if (c->d & MemAbs) | 3507 | } else if (ctxt->d & MemAbs) |
3538 | rc = decode_abs(ctxt, ops, &memop); | 3508 | rc = decode_abs(ctxt, &memop); |
3539 | if (rc != X86EMUL_CONTINUE) | 3509 | if (rc != X86EMUL_CONTINUE) |
3540 | goto done; | 3510 | goto done; |
3541 | 3511 | ||
3542 | if (!c->has_seg_override) | 3512 | if (!ctxt->has_seg_override) |
3543 | set_seg_override(c, VCPU_SREG_DS); | 3513 | set_seg_override(ctxt, VCPU_SREG_DS); |
3544 | 3514 | ||
3545 | memop.addr.mem.seg = seg_override(ctxt, c); | 3515 | memop.addr.mem.seg = seg_override(ctxt); |
3546 | 3516 | ||
3547 | if (memop.type == OP_MEM && c->ad_bytes != 8) | 3517 | if (memop.type == OP_MEM && ctxt->ad_bytes != 8) |
3548 | memop.addr.mem.ea = (u32)memop.addr.mem.ea; | 3518 | memop.addr.mem.ea = (u32)memop.addr.mem.ea; |
3549 | 3519 | ||
3550 | /* | 3520 | /* |
3551 | * Decode and fetch the source operand: register, memory | 3521 | * Decode and fetch the source operand: register, memory |
3552 | * or immediate. | 3522 | * or immediate. |
3553 | */ | 3523 | */ |
3554 | switch (c->d & SrcMask) { | 3524 | switch (ctxt->d & SrcMask) { |
3555 | case SrcNone: | 3525 | case SrcNone: |
3556 | break; | 3526 | break; |
3557 | case SrcReg: | 3527 | case SrcReg: |
3558 | decode_register_operand(ctxt, &c->src, c, 0); | 3528 | decode_register_operand(ctxt, &ctxt->src, 0); |
3559 | break; | 3529 | break; |
3560 | case SrcMem16: | 3530 | case SrcMem16: |
3561 | memop.bytes = 2; | 3531 | memop.bytes = 2; |
@@ -3564,60 +3534,60 @@ done_prefixes: | |||
3564 | memop.bytes = 4; | 3534 | memop.bytes = 4; |
3565 | goto srcmem_common; | 3535 | goto srcmem_common; |
3566 | case SrcMem: | 3536 | case SrcMem: |
3567 | memop.bytes = (c->d & ByteOp) ? 1 : | 3537 | memop.bytes = (ctxt->d & ByteOp) ? 1 : |
3568 | c->op_bytes; | 3538 | ctxt->op_bytes; |
3569 | srcmem_common: | 3539 | srcmem_common: |
3570 | c->src = memop; | 3540 | ctxt->src = memop; |
3571 | memopp = &c->src; | 3541 | memopp = &ctxt->src; |
3572 | break; | 3542 | break; |
3573 | case SrcImmU16: | 3543 | case SrcImmU16: |
3574 | rc = decode_imm(ctxt, &c->src, 2, false); | 3544 | rc = decode_imm(ctxt, &ctxt->src, 2, false); |
3575 | break; | 3545 | break; |
3576 | case SrcImm: | 3546 | case SrcImm: |
3577 | rc = decode_imm(ctxt, &c->src, imm_size(c), true); | 3547 | rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true); |
3578 | break; | 3548 | break; |
3579 | case SrcImmU: | 3549 | case SrcImmU: |
3580 | rc = decode_imm(ctxt, &c->src, imm_size(c), false); | 3550 | rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false); |
3581 | break; | 3551 | break; |
3582 | case SrcImmByte: | 3552 | case SrcImmByte: |
3583 | rc = decode_imm(ctxt, &c->src, 1, true); | 3553 | rc = decode_imm(ctxt, &ctxt->src, 1, true); |
3584 | break; | 3554 | break; |
3585 | case SrcImmUByte: | 3555 | case SrcImmUByte: |
3586 | rc = decode_imm(ctxt, &c->src, 1, false); | 3556 | rc = decode_imm(ctxt, &ctxt->src, 1, false); |
3587 | break; | 3557 | break; |
3588 | case SrcAcc: | 3558 | case SrcAcc: |
3589 | c->src.type = OP_REG; | 3559 | ctxt->src.type = OP_REG; |
3590 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 3560 | ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3591 | c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; | 3561 | ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX]; |
3592 | fetch_register_operand(&c->src); | 3562 | fetch_register_operand(&ctxt->src); |
3593 | break; | 3563 | break; |
3594 | case SrcOne: | 3564 | case SrcOne: |
3595 | c->src.bytes = 1; | 3565 | ctxt->src.bytes = 1; |
3596 | c->src.val = 1; | 3566 | ctxt->src.val = 1; |
3597 | break; | 3567 | break; |
3598 | case SrcSI: | 3568 | case SrcSI: |
3599 | c->src.type = OP_MEM; | 3569 | ctxt->src.type = OP_MEM; |
3600 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 3570 | ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3601 | c->src.addr.mem.ea = | 3571 | ctxt->src.addr.mem.ea = |
3602 | register_address(c, c->regs[VCPU_REGS_RSI]); | 3572 | register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); |
3603 | c->src.addr.mem.seg = seg_override(ctxt, c); | 3573 | ctxt->src.addr.mem.seg = seg_override(ctxt); |
3604 | c->src.val = 0; | 3574 | ctxt->src.val = 0; |
3605 | break; | 3575 | break; |
3606 | case SrcImmFAddr: | 3576 | case SrcImmFAddr: |
3607 | c->src.type = OP_IMM; | 3577 | ctxt->src.type = OP_IMM; |
3608 | c->src.addr.mem.ea = c->eip; | 3578 | ctxt->src.addr.mem.ea = ctxt->_eip; |
3609 | c->src.bytes = c->op_bytes + 2; | 3579 | ctxt->src.bytes = ctxt->op_bytes + 2; |
3610 | insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); | 3580 | insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip); |
3611 | break; | 3581 | break; |
3612 | case SrcMemFAddr: | 3582 | case SrcMemFAddr: |
3613 | memop.bytes = c->op_bytes + 2; | 3583 | memop.bytes = ctxt->op_bytes + 2; |
3614 | goto srcmem_common; | 3584 | goto srcmem_common; |
3615 | break; | 3585 | break; |
3616 | case SrcDX: | 3586 | case SrcDX: |
3617 | c->src.type = OP_REG; | 3587 | ctxt->src.type = OP_REG; |
3618 | c->src.bytes = 2; | 3588 | ctxt->src.bytes = 2; |
3619 | c->src.addr.reg = &c->regs[VCPU_REGS_RDX]; | 3589 | ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; |
3620 | fetch_register_operand(&c->src); | 3590 | fetch_register_operand(&ctxt->src); |
3621 | break; | 3591 | break; |
3622 | } | 3592 | } |
3623 | 3593 | ||
@@ -3628,22 +3598,22 @@ done_prefixes: | |||
3628 | * Decode and fetch the second source operand: register, memory | 3598 | * Decode and fetch the second source operand: register, memory |
3629 | * or immediate. | 3599 | * or immediate. |
3630 | */ | 3600 | */ |
3631 | switch (c->d & Src2Mask) { | 3601 | switch (ctxt->d & Src2Mask) { |
3632 | case Src2None: | 3602 | case Src2None: |
3633 | break; | 3603 | break; |
3634 | case Src2CL: | 3604 | case Src2CL: |
3635 | c->src2.bytes = 1; | 3605 | ctxt->src2.bytes = 1; |
3636 | c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; | 3606 | ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0x8; |
3637 | break; | 3607 | break; |
3638 | case Src2ImmByte: | 3608 | case Src2ImmByte: |
3639 | rc = decode_imm(ctxt, &c->src2, 1, true); | 3609 | rc = decode_imm(ctxt, &ctxt->src2, 1, true); |
3640 | break; | 3610 | break; |
3641 | case Src2One: | 3611 | case Src2One: |
3642 | c->src2.bytes = 1; | 3612 | ctxt->src2.bytes = 1; |
3643 | c->src2.val = 1; | 3613 | ctxt->src2.val = 1; |
3644 | break; | 3614 | break; |
3645 | case Src2Imm: | 3615 | case Src2Imm: |
3646 | rc = decode_imm(ctxt, &c->src2, imm_size(c), true); | 3616 | rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true); |
3647 | break; | 3617 | break; |
3648 | } | 3618 | } |
3649 | 3619 | ||
@@ -3651,68 +3621,66 @@ done_prefixes: | |||
3651 | goto done; | 3621 | goto done; |
3652 | 3622 | ||
3653 | /* Decode and fetch the destination operand: register or memory. */ | 3623 | /* Decode and fetch the destination operand: register or memory. */ |
3654 | switch (c->d & DstMask) { | 3624 | switch (ctxt->d & DstMask) { |
3655 | case DstReg: | 3625 | case DstReg: |
3656 | decode_register_operand(ctxt, &c->dst, c, | 3626 | decode_register_operand(ctxt, &ctxt->dst, |
3657 | c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); | 3627 | ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); |
3658 | break; | 3628 | break; |
3659 | case DstImmUByte: | 3629 | case DstImmUByte: |
3660 | c->dst.type = OP_IMM; | 3630 | ctxt->dst.type = OP_IMM; |
3661 | c->dst.addr.mem.ea = c->eip; | 3631 | ctxt->dst.addr.mem.ea = ctxt->_eip; |
3662 | c->dst.bytes = 1; | 3632 | ctxt->dst.bytes = 1; |
3663 | c->dst.val = insn_fetch(u8, 1, c->eip); | 3633 | ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip); |
3664 | break; | 3634 | break; |
3665 | case DstMem: | 3635 | case DstMem: |
3666 | case DstMem64: | 3636 | case DstMem64: |
3667 | c->dst = memop; | 3637 | ctxt->dst = memop; |
3668 | memopp = &c->dst; | 3638 | memopp = &ctxt->dst; |
3669 | if ((c->d & DstMask) == DstMem64) | 3639 | if ((ctxt->d & DstMask) == DstMem64) |
3670 | c->dst.bytes = 8; | 3640 | ctxt->dst.bytes = 8; |
3671 | else | 3641 | else |
3672 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 3642 | ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3673 | if (c->d & BitOp) | 3643 | if (ctxt->d & BitOp) |
3674 | fetch_bit_operand(c); | 3644 | fetch_bit_operand(ctxt); |
3675 | c->dst.orig_val = c->dst.val; | 3645 | ctxt->dst.orig_val = ctxt->dst.val; |
3676 | break; | 3646 | break; |
3677 | case DstAcc: | 3647 | case DstAcc: |
3678 | c->dst.type = OP_REG; | 3648 | ctxt->dst.type = OP_REG; |
3679 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 3649 | ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3680 | c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; | 3650 | ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX]; |
3681 | fetch_register_operand(&c->dst); | 3651 | fetch_register_operand(&ctxt->dst); |
3682 | c->dst.orig_val = c->dst.val; | 3652 | ctxt->dst.orig_val = ctxt->dst.val; |
3683 | break; | 3653 | break; |
3684 | case DstDI: | 3654 | case DstDI: |
3685 | c->dst.type = OP_MEM; | 3655 | ctxt->dst.type = OP_MEM; |
3686 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 3656 | ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3687 | c->dst.addr.mem.ea = | 3657 | ctxt->dst.addr.mem.ea = |
3688 | register_address(c, c->regs[VCPU_REGS_RDI]); | 3658 | register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); |
3689 | c->dst.addr.mem.seg = VCPU_SREG_ES; | 3659 | ctxt->dst.addr.mem.seg = VCPU_SREG_ES; |
3690 | c->dst.val = 0; | 3660 | ctxt->dst.val = 0; |
3691 | break; | 3661 | break; |
3692 | case DstDX: | 3662 | case DstDX: |
3693 | c->dst.type = OP_REG; | 3663 | ctxt->dst.type = OP_REG; |
3694 | c->dst.bytes = 2; | 3664 | ctxt->dst.bytes = 2; |
3695 | c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; | 3665 | ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; |
3696 | fetch_register_operand(&c->dst); | 3666 | fetch_register_operand(&ctxt->dst); |
3697 | break; | 3667 | break; |
3698 | case ImplicitOps: | 3668 | case ImplicitOps: |
3699 | /* Special instructions do their own operand decoding. */ | 3669 | /* Special instructions do their own operand decoding. */ |
3700 | default: | 3670 | default: |
3701 | c->dst.type = OP_NONE; /* Disable writeback. */ | 3671 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ |
3702 | break; | 3672 | break; |
3703 | } | 3673 | } |
3704 | 3674 | ||
3705 | done: | 3675 | done: |
3706 | if (memopp && memopp->type == OP_MEM && c->rip_relative) | 3676 | if (memopp && memopp->type == OP_MEM && ctxt->rip_relative) |
3707 | memopp->addr.mem.ea += c->eip; | 3677 | memopp->addr.mem.ea += ctxt->_eip; |
3708 | 3678 | ||
3709 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; | 3679 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; |
3710 | } | 3680 | } |
3711 | 3681 | ||
3712 | static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) | 3682 | static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) |
3713 | { | 3683 | { |
3714 | struct decode_cache *c = &ctxt->decode; | ||
3715 | |||
3716 | /* The second termination condition only applies for REPE | 3684 | /* The second termination condition only applies for REPE |
3717 | * and REPNE. Test if the repeat string operation prefix is | 3685 | * and REPNE. Test if the repeat string operation prefix is |
3718 | * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the | 3686 | * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the |
@@ -3720,304 +3688,232 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) | |||
3720 | * - if REPE/REPZ and ZF = 0 then done | 3688 | * - if REPE/REPZ and ZF = 0 then done |
3721 | * - if REPNE/REPNZ and ZF = 1 then done | 3689 | * - if REPNE/REPNZ and ZF = 1 then done |
3722 | */ | 3690 | */ |
3723 | if (((c->b == 0xa6) || (c->b == 0xa7) || | 3691 | if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) || |
3724 | (c->b == 0xae) || (c->b == 0xaf)) | 3692 | (ctxt->b == 0xae) || (ctxt->b == 0xaf)) |
3725 | && (((c->rep_prefix == REPE_PREFIX) && | 3693 | && (((ctxt->rep_prefix == REPE_PREFIX) && |
3726 | ((ctxt->eflags & EFLG_ZF) == 0)) | 3694 | ((ctxt->eflags & EFLG_ZF) == 0)) |
3727 | || ((c->rep_prefix == REPNE_PREFIX) && | 3695 | || ((ctxt->rep_prefix == REPNE_PREFIX) && |
3728 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) | 3696 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) |
3729 | return true; | 3697 | return true; |
3730 | 3698 | ||
3731 | return false; | 3699 | return false; |
3732 | } | 3700 | } |
3733 | 3701 | ||
3734 | int | 3702 | int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) |
3735 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | ||
3736 | { | 3703 | { |
3737 | struct x86_emulate_ops *ops = ctxt->ops; | 3704 | struct x86_emulate_ops *ops = ctxt->ops; |
3738 | u64 msr_data; | 3705 | u64 msr_data; |
3739 | struct decode_cache *c = &ctxt->decode; | ||
3740 | int rc = X86EMUL_CONTINUE; | 3706 | int rc = X86EMUL_CONTINUE; |
3741 | int saved_dst_type = c->dst.type; | 3707 | int saved_dst_type = ctxt->dst.type; |
3742 | int irq; /* Used for int 3, int, and into */ | ||
3743 | 3708 | ||
3744 | ctxt->decode.mem_read.pos = 0; | 3709 | ctxt->mem_read.pos = 0; |
3745 | 3710 | ||
3746 | if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { | 3711 | if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) { |
3747 | rc = emulate_ud(ctxt); | 3712 | rc = emulate_ud(ctxt); |
3748 | goto done; | 3713 | goto done; |
3749 | } | 3714 | } |
3750 | 3715 | ||
3751 | /* LOCK prefix is allowed only with some instructions */ | 3716 | /* LOCK prefix is allowed only with some instructions */ |
3752 | if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { | 3717 | if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) { |
3753 | rc = emulate_ud(ctxt); | 3718 | rc = emulate_ud(ctxt); |
3754 | goto done; | 3719 | goto done; |
3755 | } | 3720 | } |
3756 | 3721 | ||
3757 | if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { | 3722 | if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) { |
3758 | rc = emulate_ud(ctxt); | 3723 | rc = emulate_ud(ctxt); |
3759 | goto done; | 3724 | goto done; |
3760 | } | 3725 | } |
3761 | 3726 | ||
3762 | if ((c->d & Sse) | 3727 | if ((ctxt->d & Sse) |
3763 | && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) | 3728 | && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) |
3764 | || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { | 3729 | || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { |
3765 | rc = emulate_ud(ctxt); | 3730 | rc = emulate_ud(ctxt); |
3766 | goto done; | 3731 | goto done; |
3767 | } | 3732 | } |
3768 | 3733 | ||
3769 | if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { | 3734 | if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { |
3770 | rc = emulate_nm(ctxt); | 3735 | rc = emulate_nm(ctxt); |
3771 | goto done; | 3736 | goto done; |
3772 | } | 3737 | } |
3773 | 3738 | ||
3774 | if (unlikely(ctxt->guest_mode) && c->intercept) { | 3739 | if (unlikely(ctxt->guest_mode) && ctxt->intercept) { |
3775 | rc = emulator_check_intercept(ctxt, c->intercept, | 3740 | rc = emulator_check_intercept(ctxt, ctxt->intercept, |
3776 | X86_ICPT_PRE_EXCEPT); | 3741 | X86_ICPT_PRE_EXCEPT); |
3777 | if (rc != X86EMUL_CONTINUE) | 3742 | if (rc != X86EMUL_CONTINUE) |
3778 | goto done; | 3743 | goto done; |
3779 | } | 3744 | } |
3780 | 3745 | ||
3781 | /* Privileged instruction can be executed only in CPL=0 */ | 3746 | /* Privileged instruction can be executed only in CPL=0 */ |
3782 | if ((c->d & Priv) && ops->cpl(ctxt)) { | 3747 | if ((ctxt->d & Priv) && ops->cpl(ctxt)) { |
3783 | rc = emulate_gp(ctxt, 0); | 3748 | rc = emulate_gp(ctxt, 0); |
3784 | goto done; | 3749 | goto done; |
3785 | } | 3750 | } |
3786 | 3751 | ||
3787 | /* Instruction can only be executed in protected mode */ | 3752 | /* Instruction can only be executed in protected mode */ |
3788 | if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { | 3753 | if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { |
3789 | rc = emulate_ud(ctxt); | 3754 | rc = emulate_ud(ctxt); |
3790 | goto done; | 3755 | goto done; |
3791 | } | 3756 | } |
3792 | 3757 | ||
3793 | /* Do instruction specific permission checks */ | 3758 | /* Do instruction specific permission checks */ |
3794 | if (c->check_perm) { | 3759 | if (ctxt->check_perm) { |
3795 | rc = c->check_perm(ctxt); | 3760 | rc = ctxt->check_perm(ctxt); |
3796 | if (rc != X86EMUL_CONTINUE) | 3761 | if (rc != X86EMUL_CONTINUE) |
3797 | goto done; | 3762 | goto done; |
3798 | } | 3763 | } |
3799 | 3764 | ||
3800 | if (unlikely(ctxt->guest_mode) && c->intercept) { | 3765 | if (unlikely(ctxt->guest_mode) && ctxt->intercept) { |
3801 | rc = emulator_check_intercept(ctxt, c->intercept, | 3766 | rc = emulator_check_intercept(ctxt, ctxt->intercept, |
3802 | X86_ICPT_POST_EXCEPT); | 3767 | X86_ICPT_POST_EXCEPT); |
3803 | if (rc != X86EMUL_CONTINUE) | 3768 | if (rc != X86EMUL_CONTINUE) |
3804 | goto done; | 3769 | goto done; |
3805 | } | 3770 | } |
3806 | 3771 | ||
3807 | if (c->rep_prefix && (c->d & String)) { | 3772 | if (ctxt->rep_prefix && (ctxt->d & String)) { |
3808 | /* All REP prefixes have the same first termination condition */ | 3773 | /* All REP prefixes have the same first termination condition */ |
3809 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { | 3774 | if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) { |
3810 | ctxt->eip = c->eip; | 3775 | ctxt->eip = ctxt->_eip; |
3811 | goto done; | 3776 | goto done; |
3812 | } | 3777 | } |
3813 | } | 3778 | } |
3814 | 3779 | ||
3815 | if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { | 3780 | if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) { |
3816 | rc = segmented_read(ctxt, c->src.addr.mem, | 3781 | rc = segmented_read(ctxt, ctxt->src.addr.mem, |
3817 | c->src.valptr, c->src.bytes); | 3782 | ctxt->src.valptr, ctxt->src.bytes); |
3818 | if (rc != X86EMUL_CONTINUE) | 3783 | if (rc != X86EMUL_CONTINUE) |
3819 | goto done; | 3784 | goto done; |
3820 | c->src.orig_val64 = c->src.val64; | 3785 | ctxt->src.orig_val64 = ctxt->src.val64; |
3821 | } | 3786 | } |
3822 | 3787 | ||
3823 | if (c->src2.type == OP_MEM) { | 3788 | if (ctxt->src2.type == OP_MEM) { |
3824 | rc = segmented_read(ctxt, c->src2.addr.mem, | 3789 | rc = segmented_read(ctxt, ctxt->src2.addr.mem, |
3825 | &c->src2.val, c->src2.bytes); | 3790 | &ctxt->src2.val, ctxt->src2.bytes); |
3826 | if (rc != X86EMUL_CONTINUE) | 3791 | if (rc != X86EMUL_CONTINUE) |
3827 | goto done; | 3792 | goto done; |
3828 | } | 3793 | } |
3829 | 3794 | ||
3830 | if ((c->d & DstMask) == ImplicitOps) | 3795 | if ((ctxt->d & DstMask) == ImplicitOps) |
3831 | goto special_insn; | 3796 | goto special_insn; |
3832 | 3797 | ||
3833 | 3798 | ||
3834 | if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { | 3799 | if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) { |
3835 | /* optimisation - avoid slow emulated read if Mov */ | 3800 | /* optimisation - avoid slow emulated read if Mov */ |
3836 | rc = segmented_read(ctxt, c->dst.addr.mem, | 3801 | rc = segmented_read(ctxt, ctxt->dst.addr.mem, |
3837 | &c->dst.val, c->dst.bytes); | 3802 | &ctxt->dst.val, ctxt->dst.bytes); |
3838 | if (rc != X86EMUL_CONTINUE) | 3803 | if (rc != X86EMUL_CONTINUE) |
3839 | goto done; | 3804 | goto done; |
3840 | } | 3805 | } |
3841 | c->dst.orig_val = c->dst.val; | 3806 | ctxt->dst.orig_val = ctxt->dst.val; |
3842 | 3807 | ||
3843 | special_insn: | 3808 | special_insn: |
3844 | 3809 | ||
3845 | if (unlikely(ctxt->guest_mode) && c->intercept) { | 3810 | if (unlikely(ctxt->guest_mode) && ctxt->intercept) { |
3846 | rc = emulator_check_intercept(ctxt, c->intercept, | 3811 | rc = emulator_check_intercept(ctxt, ctxt->intercept, |
3847 | X86_ICPT_POST_MEMACCESS); | 3812 | X86_ICPT_POST_MEMACCESS); |
3848 | if (rc != X86EMUL_CONTINUE) | 3813 | if (rc != X86EMUL_CONTINUE) |
3849 | goto done; | 3814 | goto done; |
3850 | } | 3815 | } |
3851 | 3816 | ||
3852 | if (c->execute) { | 3817 | if (ctxt->execute) { |
3853 | rc = c->execute(ctxt); | 3818 | rc = ctxt->execute(ctxt); |
3854 | if (rc != X86EMUL_CONTINUE) | 3819 | if (rc != X86EMUL_CONTINUE) |
3855 | goto done; | 3820 | goto done; |
3856 | goto writeback; | 3821 | goto writeback; |
3857 | } | 3822 | } |
3858 | 3823 | ||
3859 | if (c->twobyte) | 3824 | if (ctxt->twobyte) |
3860 | goto twobyte_insn; | 3825 | goto twobyte_insn; |
3861 | 3826 | ||
3862 | switch (c->b) { | 3827 | switch (ctxt->b) { |
3863 | case 0x06: /* push es */ | 3828 | case 0x06: /* push es */ |
3864 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); | 3829 | rc = emulate_push_sreg(ctxt, VCPU_SREG_ES); |
3865 | break; | 3830 | break; |
3866 | case 0x07: /* pop es */ | 3831 | case 0x07: /* pop es */ |
3867 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); | 3832 | rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES); |
3868 | break; | 3833 | break; |
3869 | case 0x0e: /* push cs */ | 3834 | case 0x0e: /* push cs */ |
3870 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); | 3835 | rc = emulate_push_sreg(ctxt, VCPU_SREG_CS); |
3871 | break; | 3836 | break; |
3872 | case 0x16: /* push ss */ | 3837 | case 0x16: /* push ss */ |
3873 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); | 3838 | rc = emulate_push_sreg(ctxt, VCPU_SREG_SS); |
3874 | break; | 3839 | break; |
3875 | case 0x17: /* pop ss */ | 3840 | case 0x17: /* pop ss */ |
3876 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); | 3841 | rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS); |
3877 | break; | 3842 | break; |
3878 | case 0x1e: /* push ds */ | 3843 | case 0x1e: /* push ds */ |
3879 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); | 3844 | rc = emulate_push_sreg(ctxt, VCPU_SREG_DS); |
3880 | break; | 3845 | break; |
3881 | case 0x1f: /* pop ds */ | 3846 | case 0x1f: /* pop ds */ |
3882 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); | 3847 | rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS); |
3883 | break; | 3848 | break; |
3884 | case 0x40 ... 0x47: /* inc r16/r32 */ | 3849 | case 0x40 ... 0x47: /* inc r16/r32 */ |
3885 | emulate_1op("inc", c->dst, ctxt->eflags); | 3850 | emulate_1op("inc", ctxt->dst, ctxt->eflags); |
3886 | break; | 3851 | break; |
3887 | case 0x48 ... 0x4f: /* dec r16/r32 */ | 3852 | case 0x48 ... 0x4f: /* dec r16/r32 */ |
3888 | emulate_1op("dec", c->dst, ctxt->eflags); | 3853 | emulate_1op("dec", ctxt->dst, ctxt->eflags); |
3889 | break; | 3854 | break; |
3890 | case 0x63: /* movsxd */ | 3855 | case 0x63: /* movsxd */ |
3891 | if (ctxt->mode != X86EMUL_MODE_PROT64) | 3856 | if (ctxt->mode != X86EMUL_MODE_PROT64) |
3892 | goto cannot_emulate; | 3857 | goto cannot_emulate; |
3893 | c->dst.val = (s32) c->src.val; | 3858 | ctxt->dst.val = (s32) ctxt->src.val; |
3894 | break; | 3859 | break; |
3895 | case 0x6c: /* insb */ | 3860 | case 0x6c: /* insb */ |
3896 | case 0x6d: /* insw/insd */ | 3861 | case 0x6d: /* insw/insd */ |
3897 | c->src.val = c->regs[VCPU_REGS_RDX]; | 3862 | ctxt->src.val = ctxt->regs[VCPU_REGS_RDX]; |
3898 | goto do_io_in; | 3863 | goto do_io_in; |
3899 | case 0x6e: /* outsb */ | 3864 | case 0x6e: /* outsb */ |
3900 | case 0x6f: /* outsw/outsd */ | 3865 | case 0x6f: /* outsw/outsd */ |
3901 | c->dst.val = c->regs[VCPU_REGS_RDX]; | 3866 | ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX]; |
3902 | goto do_io_out; | 3867 | goto do_io_out; |
3903 | break; | 3868 | break; |
3904 | case 0x70 ... 0x7f: /* jcc (short) */ | 3869 | case 0x70 ... 0x7f: /* jcc (short) */ |
3905 | if (test_cc(c->b, ctxt->eflags)) | 3870 | if (test_cc(ctxt->b, ctxt->eflags)) |
3906 | jmp_rel(c, c->src.val); | 3871 | jmp_rel(ctxt, ctxt->src.val); |
3907 | break; | ||
3908 | case 0x84 ... 0x85: | ||
3909 | test: | ||
3910 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); | ||
3911 | break; | ||
3912 | case 0x86 ... 0x87: /* xchg */ | ||
3913 | xchg: | ||
3914 | /* Write back the register source. */ | ||
3915 | c->src.val = c->dst.val; | ||
3916 | write_register_operand(&c->src); | ||
3917 | /* | ||
3918 | * Write back the memory destination with implicit LOCK | ||
3919 | * prefix. | ||
3920 | */ | ||
3921 | c->dst.val = c->src.orig_val; | ||
3922 | c->lock_prefix = 1; | ||
3923 | break; | ||
3924 | case 0x8c: /* mov r/m, sreg */ | ||
3925 | if (c->modrm_reg > VCPU_SREG_GS) { | ||
3926 | rc = emulate_ud(ctxt); | ||
3927 | goto done; | ||
3928 | } | ||
3929 | c->dst.val = get_segment_selector(ctxt, c->modrm_reg); | ||
3930 | break; | 3872 | break; |
3931 | case 0x8d: /* lea r16/r32, m */ | 3873 | case 0x8d: /* lea r16/r32, m */ |
3932 | c->dst.val = c->src.addr.mem.ea; | 3874 | ctxt->dst.val = ctxt->src.addr.mem.ea; |
3933 | break; | 3875 | break; |
3934 | case 0x8e: { /* mov seg, r/m16 */ | ||
3935 | uint16_t sel; | ||
3936 | |||
3937 | sel = c->src.val; | ||
3938 | |||
3939 | if (c->modrm_reg == VCPU_SREG_CS || | ||
3940 | c->modrm_reg > VCPU_SREG_GS) { | ||
3941 | rc = emulate_ud(ctxt); | ||
3942 | goto done; | ||
3943 | } | ||
3944 | |||
3945 | if (c->modrm_reg == VCPU_SREG_SS) | ||
3946 | ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; | ||
3947 | |||
3948 | rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); | ||
3949 | |||
3950 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
3951 | break; | ||
3952 | } | ||
3953 | case 0x8f: /* pop (sole member of Grp1a) */ | 3876 | case 0x8f: /* pop (sole member of Grp1a) */ |
3954 | rc = em_grp1a(ctxt); | 3877 | rc = em_grp1a(ctxt); |
3955 | break; | 3878 | break; |
3956 | case 0x90 ... 0x97: /* nop / xchg reg, rax */ | 3879 | case 0x90 ... 0x97: /* nop / xchg reg, rax */ |
3957 | if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) | 3880 | if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) |
3958 | break; | 3881 | break; |
3959 | goto xchg; | 3882 | rc = em_xchg(ctxt); |
3883 | break; | ||
3960 | case 0x98: /* cbw/cwde/cdqe */ | 3884 | case 0x98: /* cbw/cwde/cdqe */ |
3961 | switch (c->op_bytes) { | 3885 | switch (ctxt->op_bytes) { |
3962 | case 2: c->dst.val = (s8)c->dst.val; break; | 3886 | case 2: ctxt->dst.val = (s8)ctxt->dst.val; break; |
3963 | case 4: c->dst.val = (s16)c->dst.val; break; | 3887 | case 4: ctxt->dst.val = (s16)ctxt->dst.val; break; |
3964 | case 8: c->dst.val = (s32)c->dst.val; break; | 3888 | case 8: ctxt->dst.val = (s32)ctxt->dst.val; break; |
3965 | } | 3889 | } |
3966 | break; | 3890 | break; |
3967 | case 0xa8 ... 0xa9: /* test ax, imm */ | ||
3968 | goto test; | ||
3969 | case 0xc0 ... 0xc1: | 3891 | case 0xc0 ... 0xc1: |
3970 | rc = em_grp2(ctxt); | 3892 | rc = em_grp2(ctxt); |
3971 | break; | 3893 | break; |
3972 | case 0xc3: /* ret */ | ||
3973 | c->dst.type = OP_REG; | ||
3974 | c->dst.addr.reg = &c->eip; | ||
3975 | c->dst.bytes = c->op_bytes; | ||
3976 | rc = em_pop(ctxt); | ||
3977 | break; | ||
3978 | case 0xc4: /* les */ | 3894 | case 0xc4: /* les */ |
3979 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); | 3895 | rc = emulate_load_segment(ctxt, VCPU_SREG_ES); |
3980 | break; | 3896 | break; |
3981 | case 0xc5: /* lds */ | 3897 | case 0xc5: /* lds */ |
3982 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); | 3898 | rc = emulate_load_segment(ctxt, VCPU_SREG_DS); |
3983 | break; | ||
3984 | case 0xcb: /* ret far */ | ||
3985 | rc = emulate_ret_far(ctxt, ops); | ||
3986 | break; | 3899 | break; |
3987 | case 0xcc: /* int3 */ | 3900 | case 0xcc: /* int3 */ |
3988 | irq = 3; | 3901 | rc = emulate_int(ctxt, 3); |
3989 | goto do_interrupt; | 3902 | break; |
3990 | case 0xcd: /* int n */ | 3903 | case 0xcd: /* int n */ |
3991 | irq = c->src.val; | 3904 | rc = emulate_int(ctxt, ctxt->src.val); |
3992 | do_interrupt: | ||
3993 | rc = emulate_int(ctxt, ops, irq); | ||
3994 | break; | 3905 | break; |
3995 | case 0xce: /* into */ | 3906 | case 0xce: /* into */ |
3996 | if (ctxt->eflags & EFLG_OF) { | 3907 | if (ctxt->eflags & EFLG_OF) |
3997 | irq = 4; | 3908 | rc = emulate_int(ctxt, 4); |
3998 | goto do_interrupt; | ||
3999 | } | ||
4000 | break; | ||
4001 | case 0xcf: /* iret */ | ||
4002 | rc = emulate_iret(ctxt, ops); | ||
4003 | break; | 3909 | break; |
4004 | case 0xd0 ... 0xd1: /* Grp2 */ | 3910 | case 0xd0 ... 0xd1: /* Grp2 */ |
4005 | rc = em_grp2(ctxt); | 3911 | rc = em_grp2(ctxt); |
4006 | break; | 3912 | break; |
4007 | case 0xd2 ... 0xd3: /* Grp2 */ | 3913 | case 0xd2 ... 0xd3: /* Grp2 */ |
4008 | c->src.val = c->regs[VCPU_REGS_RCX]; | 3914 | ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; |
4009 | rc = em_grp2(ctxt); | 3915 | rc = em_grp2(ctxt); |
4010 | break; | 3916 | break; |
4011 | case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ | ||
4012 | register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); | ||
4013 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 && | ||
4014 | (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) | ||
4015 | jmp_rel(c, c->src.val); | ||
4016 | break; | ||
4017 | case 0xe3: /* jcxz/jecxz/jrcxz */ | ||
4018 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) | ||
4019 | jmp_rel(c, c->src.val); | ||
4020 | break; | ||
4021 | case 0xe4: /* inb */ | 3917 | case 0xe4: /* inb */ |
4022 | case 0xe5: /* in */ | 3918 | case 0xe5: /* in */ |
4023 | goto do_io_in; | 3919 | goto do_io_in; |
@@ -4025,35 +3921,30 @@ special_insn: | |||
4025 | case 0xe7: /* out */ | 3921 | case 0xe7: /* out */ |
4026 | goto do_io_out; | 3922 | goto do_io_out; |
4027 | case 0xe8: /* call (near) */ { | 3923 | case 0xe8: /* call (near) */ { |
4028 | long int rel = c->src.val; | 3924 | long int rel = ctxt->src.val; |
4029 | c->src.val = (unsigned long) c->eip; | 3925 | ctxt->src.val = (unsigned long) ctxt->_eip; |
4030 | jmp_rel(c, rel); | 3926 | jmp_rel(ctxt, rel); |
4031 | rc = em_push(ctxt); | 3927 | rc = em_push(ctxt); |
4032 | break; | 3928 | break; |
4033 | } | 3929 | } |
4034 | case 0xe9: /* jmp rel */ | 3930 | case 0xe9: /* jmp rel */ |
4035 | goto jmp; | 3931 | case 0xeb: /* jmp rel short */ |
4036 | case 0xea: /* jmp far */ | 3932 | jmp_rel(ctxt, ctxt->src.val); |
4037 | rc = em_jmp_far(ctxt); | 3933 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ |
4038 | break; | ||
4039 | case 0xeb: | ||
4040 | jmp: /* jmp rel short */ | ||
4041 | jmp_rel(c, c->src.val); | ||
4042 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
4043 | break; | 3934 | break; |
4044 | case 0xec: /* in al,dx */ | 3935 | case 0xec: /* in al,dx */ |
4045 | case 0xed: /* in (e/r)ax,dx */ | 3936 | case 0xed: /* in (e/r)ax,dx */ |
4046 | do_io_in: | 3937 | do_io_in: |
4047 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, | 3938 | if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val, |
4048 | &c->dst.val)) | 3939 | &ctxt->dst.val)) |
4049 | goto done; /* IO is needed */ | 3940 | goto done; /* IO is needed */ |
4050 | break; | 3941 | break; |
4051 | case 0xee: /* out dx,al */ | 3942 | case 0xee: /* out dx,al */ |
4052 | case 0xef: /* out dx,(e/r)ax */ | 3943 | case 0xef: /* out dx,(e/r)ax */ |
4053 | do_io_out: | 3944 | do_io_out: |
4054 | ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val, | 3945 | ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val, |
4055 | &c->src.val, 1); | 3946 | &ctxt->src.val, 1); |
4056 | c->dst.type = OP_NONE; /* Disable writeback. */ | 3947 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ |
4057 | break; | 3948 | break; |
4058 | case 0xf4: /* hlt */ | 3949 | case 0xf4: /* hlt */ |
4059 | ctxt->ops->halt(ctxt); | 3950 | ctxt->ops->halt(ctxt); |
@@ -4071,22 +3962,6 @@ special_insn: | |||
4071 | case 0xf9: /* stc */ | 3962 | case 0xf9: /* stc */ |
4072 | ctxt->eflags |= EFLG_CF; | 3963 | ctxt->eflags |= EFLG_CF; |
4073 | break; | 3964 | break; |
4074 | case 0xfa: /* cli */ | ||
4075 | if (emulator_bad_iopl(ctxt, ops)) { | ||
4076 | rc = emulate_gp(ctxt, 0); | ||
4077 | goto done; | ||
4078 | } else | ||
4079 | ctxt->eflags &= ~X86_EFLAGS_IF; | ||
4080 | break; | ||
4081 | case 0xfb: /* sti */ | ||
4082 | if (emulator_bad_iopl(ctxt, ops)) { | ||
4083 | rc = emulate_gp(ctxt, 0); | ||
4084 | goto done; | ||
4085 | } else { | ||
4086 | ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; | ||
4087 | ctxt->eflags |= X86_EFLAGS_IF; | ||
4088 | } | ||
4089 | break; | ||
4090 | case 0xfc: /* cld */ | 3965 | case 0xfc: /* cld */ |
4091 | ctxt->eflags &= ~EFLG_DF; | 3966 | ctxt->eflags &= ~EFLG_DF; |
4092 | break; | 3967 | break; |
@@ -4115,40 +3990,40 @@ writeback: | |||
4115 | * restore dst type in case the decoding will be reused | 3990 | * restore dst type in case the decoding will be reused |
4116 | * (happens for string instruction ) | 3991 | * (happens for string instruction ) |
4117 | */ | 3992 | */ |
4118 | c->dst.type = saved_dst_type; | 3993 | ctxt->dst.type = saved_dst_type; |
4119 | 3994 | ||
4120 | if ((c->d & SrcMask) == SrcSI) | 3995 | if ((ctxt->d & SrcMask) == SrcSI) |
4121 | string_addr_inc(ctxt, seg_override(ctxt, c), | 3996 | string_addr_inc(ctxt, seg_override(ctxt), |
4122 | VCPU_REGS_RSI, &c->src); | 3997 | VCPU_REGS_RSI, &ctxt->src); |
4123 | 3998 | ||
4124 | if ((c->d & DstMask) == DstDI) | 3999 | if ((ctxt->d & DstMask) == DstDI) |
4125 | string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, | 4000 | string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, |
4126 | &c->dst); | 4001 | &ctxt->dst); |
4127 | 4002 | ||
4128 | if (c->rep_prefix && (c->d & String)) { | 4003 | if (ctxt->rep_prefix && (ctxt->d & String)) { |
4129 | struct read_cache *r = &ctxt->decode.io_read; | 4004 | struct read_cache *r = &ctxt->io_read; |
4130 | register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); | 4005 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); |
4131 | 4006 | ||
4132 | if (!string_insn_completed(ctxt)) { | 4007 | if (!string_insn_completed(ctxt)) { |
4133 | /* | 4008 | /* |
4134 | * Re-enter guest when pio read ahead buffer is empty | 4009 | * Re-enter guest when pio read ahead buffer is empty |
4135 | * or, if it is not used, after each 1024 iteration. | 4010 | * or, if it is not used, after each 1024 iteration. |
4136 | */ | 4011 | */ |
4137 | if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && | 4012 | if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) && |
4138 | (r->end == 0 || r->end != r->pos)) { | 4013 | (r->end == 0 || r->end != r->pos)) { |
4139 | /* | 4014 | /* |
4140 | * Reset read cache. Usually happens before | 4015 | * Reset read cache. Usually happens before |
4141 | * decode, but since instruction is restarted | 4016 | * decode, but since instruction is restarted |
4142 | * we have to do it here. | 4017 | * we have to do it here. |
4143 | */ | 4018 | */ |
4144 | ctxt->decode.mem_read.end = 0; | 4019 | ctxt->mem_read.end = 0; |
4145 | return EMULATION_RESTART; | 4020 | return EMULATION_RESTART; |
4146 | } | 4021 | } |
4147 | goto done; /* skip rip writeback */ | 4022 | goto done; /* skip rip writeback */ |
4148 | } | 4023 | } |
4149 | } | 4024 | } |
4150 | 4025 | ||
4151 | ctxt->eip = c->eip; | 4026 | ctxt->eip = ctxt->_eip; |
4152 | 4027 | ||
4153 | done: | 4028 | done: |
4154 | if (rc == X86EMUL_PROPAGATE_FAULT) | 4029 | if (rc == X86EMUL_PROPAGATE_FAULT) |
@@ -4159,13 +4034,7 @@ done: | |||
4159 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; | 4034 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; |
4160 | 4035 | ||
4161 | twobyte_insn: | 4036 | twobyte_insn: |
4162 | switch (c->b) { | 4037 | switch (ctxt->b) { |
4163 | case 0x05: /* syscall */ | ||
4164 | rc = emulate_syscall(ctxt, ops); | ||
4165 | break; | ||
4166 | case 0x06: | ||
4167 | rc = em_clts(ctxt); | ||
4168 | break; | ||
4169 | case 0x09: /* wbinvd */ | 4038 | case 0x09: /* wbinvd */ |
4170 | (ctxt->ops->wbinvd)(ctxt); | 4039 | (ctxt->ops->wbinvd)(ctxt); |
4171 | break; | 4040 | break; |
@@ -4174,21 +4043,21 @@ twobyte_insn: | |||
4174 | case 0x18: /* Grp16 (prefetch/nop) */ | 4043 | case 0x18: /* Grp16 (prefetch/nop) */ |
4175 | break; | 4044 | break; |
4176 | case 0x20: /* mov cr, reg */ | 4045 | case 0x20: /* mov cr, reg */ |
4177 | c->dst.val = ops->get_cr(ctxt, c->modrm_reg); | 4046 | ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg); |
4178 | break; | 4047 | break; |
4179 | case 0x21: /* mov from dr to reg */ | 4048 | case 0x21: /* mov from dr to reg */ |
4180 | ops->get_dr(ctxt, c->modrm_reg, &c->dst.val); | 4049 | ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); |
4181 | break; | 4050 | break; |
4182 | case 0x22: /* mov reg, cr */ | 4051 | case 0x22: /* mov reg, cr */ |
4183 | if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) { | 4052 | if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) { |
4184 | emulate_gp(ctxt, 0); | 4053 | emulate_gp(ctxt, 0); |
4185 | rc = X86EMUL_PROPAGATE_FAULT; | 4054 | rc = X86EMUL_PROPAGATE_FAULT; |
4186 | goto done; | 4055 | goto done; |
4187 | } | 4056 | } |
4188 | c->dst.type = OP_NONE; | 4057 | ctxt->dst.type = OP_NONE; |
4189 | break; | 4058 | break; |
4190 | case 0x23: /* mov from reg to dr */ | 4059 | case 0x23: /* mov from reg to dr */ |
4191 | if (ops->set_dr(ctxt, c->modrm_reg, c->src.val & | 4060 | if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val & |
4192 | ((ctxt->mode == X86EMUL_MODE_PROT64) ? | 4061 | ((ctxt->mode == X86EMUL_MODE_PROT64) ? |
4193 | ~0ULL : ~0U)) < 0) { | 4062 | ~0ULL : ~0U)) < 0) { |
4194 | /* #UD condition is already handled by the code above */ | 4063 | /* #UD condition is already handled by the code above */ |
@@ -4197,13 +4066,13 @@ twobyte_insn: | |||
4197 | goto done; | 4066 | goto done; |
4198 | } | 4067 | } |
4199 | 4068 | ||
4200 | c->dst.type = OP_NONE; /* no writeback */ | 4069 | ctxt->dst.type = OP_NONE; /* no writeback */ |
4201 | break; | 4070 | break; |
4202 | case 0x30: | 4071 | case 0x30: |
4203 | /* wrmsr */ | 4072 | /* wrmsr */ |
4204 | msr_data = (u32)c->regs[VCPU_REGS_RAX] | 4073 | msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] |
4205 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); | 4074 | | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); |
4206 | if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) { | 4075 | if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) { |
4207 | emulate_gp(ctxt, 0); | 4076 | emulate_gp(ctxt, 0); |
4208 | rc = X86EMUL_PROPAGATE_FAULT; | 4077 | rc = X86EMUL_PROPAGATE_FAULT; |
4209 | goto done; | 4078 | goto done; |
@@ -4212,64 +4081,58 @@ twobyte_insn: | |||
4212 | break; | 4081 | break; |
4213 | case 0x32: | 4082 | case 0x32: |
4214 | /* rdmsr */ | 4083 | /* rdmsr */ |
4215 | if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) { | 4084 | if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) { |
4216 | emulate_gp(ctxt, 0); | 4085 | emulate_gp(ctxt, 0); |
4217 | rc = X86EMUL_PROPAGATE_FAULT; | 4086 | rc = X86EMUL_PROPAGATE_FAULT; |
4218 | goto done; | 4087 | goto done; |
4219 | } else { | 4088 | } else { |
4220 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; | 4089 | ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; |
4221 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; | 4090 | ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; |
4222 | } | 4091 | } |
4223 | rc = X86EMUL_CONTINUE; | 4092 | rc = X86EMUL_CONTINUE; |
4224 | break; | 4093 | break; |
4225 | case 0x34: /* sysenter */ | ||
4226 | rc = emulate_sysenter(ctxt, ops); | ||
4227 | break; | ||
4228 | case 0x35: /* sysexit */ | ||
4229 | rc = emulate_sysexit(ctxt, ops); | ||
4230 | break; | ||
4231 | case 0x40 ... 0x4f: /* cmov */ | 4094 | case 0x40 ... 0x4f: /* cmov */ |
4232 | c->dst.val = c->dst.orig_val = c->src.val; | 4095 | ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val; |
4233 | if (!test_cc(c->b, ctxt->eflags)) | 4096 | if (!test_cc(ctxt->b, ctxt->eflags)) |
4234 | c->dst.type = OP_NONE; /* no writeback */ | 4097 | ctxt->dst.type = OP_NONE; /* no writeback */ |
4235 | break; | 4098 | break; |
4236 | case 0x80 ... 0x8f: /* jnz rel, etc*/ | 4099 | case 0x80 ... 0x8f: /* jnz rel, etc*/ |
4237 | if (test_cc(c->b, ctxt->eflags)) | 4100 | if (test_cc(ctxt->b, ctxt->eflags)) |
4238 | jmp_rel(c, c->src.val); | 4101 | jmp_rel(ctxt, ctxt->src.val); |
4239 | break; | 4102 | break; |
4240 | case 0x90 ... 0x9f: /* setcc r/m8 */ | 4103 | case 0x90 ... 0x9f: /* setcc r/m8 */ |
4241 | c->dst.val = test_cc(c->b, ctxt->eflags); | 4104 | ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); |
4242 | break; | 4105 | break; |
4243 | case 0xa0: /* push fs */ | 4106 | case 0xa0: /* push fs */ |
4244 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); | 4107 | rc = emulate_push_sreg(ctxt, VCPU_SREG_FS); |
4245 | break; | 4108 | break; |
4246 | case 0xa1: /* pop fs */ | 4109 | case 0xa1: /* pop fs */ |
4247 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); | 4110 | rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS); |
4248 | break; | 4111 | break; |
4249 | case 0xa3: | 4112 | case 0xa3: |
4250 | bt: /* bt */ | 4113 | bt: /* bt */ |
4251 | c->dst.type = OP_NONE; | 4114 | ctxt->dst.type = OP_NONE; |
4252 | /* only subword offset */ | 4115 | /* only subword offset */ |
4253 | c->src.val &= (c->dst.bytes << 3) - 1; | 4116 | ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; |
4254 | emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); | 4117 | emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags); |
4255 | break; | 4118 | break; |
4256 | case 0xa4: /* shld imm8, r, r/m */ | 4119 | case 0xa4: /* shld imm8, r, r/m */ |
4257 | case 0xa5: /* shld cl, r, r/m */ | 4120 | case 0xa5: /* shld cl, r, r/m */ |
4258 | emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); | 4121 | emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); |
4259 | break; | 4122 | break; |
4260 | case 0xa8: /* push gs */ | 4123 | case 0xa8: /* push gs */ |
4261 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); | 4124 | rc = emulate_push_sreg(ctxt, VCPU_SREG_GS); |
4262 | break; | 4125 | break; |
4263 | case 0xa9: /* pop gs */ | 4126 | case 0xa9: /* pop gs */ |
4264 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); | 4127 | rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS); |
4265 | break; | 4128 | break; |
4266 | case 0xab: | 4129 | case 0xab: |
4267 | bts: /* bts */ | 4130 | bts: /* bts */ |
4268 | emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); | 4131 | emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags); |
4269 | break; | 4132 | break; |
4270 | case 0xac: /* shrd imm8, r, r/m */ | 4133 | case 0xac: /* shrd imm8, r, r/m */ |
4271 | case 0xad: /* shrd cl, r, r/m */ | 4134 | case 0xad: /* shrd cl, r, r/m */ |
4272 | emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); | 4135 | emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); |
4273 | break; | 4136 | break; |
4274 | case 0xae: /* clflush */ | 4137 | case 0xae: /* clflush */ |
4275 | break; | 4138 | break; |
@@ -4278,38 +4141,38 @@ twobyte_insn: | |||
4278 | * Save real source value, then compare EAX against | 4141 | * Save real source value, then compare EAX against |
4279 | * destination. | 4142 | * destination. |
4280 | */ | 4143 | */ |
4281 | c->src.orig_val = c->src.val; | 4144 | ctxt->src.orig_val = ctxt->src.val; |
4282 | c->src.val = c->regs[VCPU_REGS_RAX]; | 4145 | ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; |
4283 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | 4146 | emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); |
4284 | if (ctxt->eflags & EFLG_ZF) { | 4147 | if (ctxt->eflags & EFLG_ZF) { |
4285 | /* Success: write back to memory. */ | 4148 | /* Success: write back to memory. */ |
4286 | c->dst.val = c->src.orig_val; | 4149 | ctxt->dst.val = ctxt->src.orig_val; |
4287 | } else { | 4150 | } else { |
4288 | /* Failure: write the value we saw to EAX. */ | 4151 | /* Failure: write the value we saw to EAX. */ |
4289 | c->dst.type = OP_REG; | 4152 | ctxt->dst.type = OP_REG; |
4290 | c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | 4153 | ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; |
4291 | } | 4154 | } |
4292 | break; | 4155 | break; |
4293 | case 0xb2: /* lss */ | 4156 | case 0xb2: /* lss */ |
4294 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); | 4157 | rc = emulate_load_segment(ctxt, VCPU_SREG_SS); |
4295 | break; | 4158 | break; |
4296 | case 0xb3: | 4159 | case 0xb3: |
4297 | btr: /* btr */ | 4160 | btr: /* btr */ |
4298 | emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); | 4161 | emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags); |
4299 | break; | 4162 | break; |
4300 | case 0xb4: /* lfs */ | 4163 | case 0xb4: /* lfs */ |
4301 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); | 4164 | rc = emulate_load_segment(ctxt, VCPU_SREG_FS); |
4302 | break; | 4165 | break; |
4303 | case 0xb5: /* lgs */ | 4166 | case 0xb5: /* lgs */ |
4304 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); | 4167 | rc = emulate_load_segment(ctxt, VCPU_SREG_GS); |
4305 | break; | 4168 | break; |
4306 | case 0xb6 ... 0xb7: /* movzx */ | 4169 | case 0xb6 ... 0xb7: /* movzx */ |
4307 | c->dst.bytes = c->op_bytes; | 4170 | ctxt->dst.bytes = ctxt->op_bytes; |
4308 | c->dst.val = (c->d & ByteOp) ? (u8) c->src.val | 4171 | ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val |
4309 | : (u16) c->src.val; | 4172 | : (u16) ctxt->src.val; |
4310 | break; | 4173 | break; |
4311 | case 0xba: /* Grp8 */ | 4174 | case 0xba: /* Grp8 */ |
4312 | switch (c->modrm_reg & 3) { | 4175 | switch (ctxt->modrm_reg & 3) { |
4313 | case 0: | 4176 | case 0: |
4314 | goto bt; | 4177 | goto bt; |
4315 | case 1: | 4178 | case 1: |
@@ -4322,47 +4185,47 @@ twobyte_insn: | |||
4322 | break; | 4185 | break; |
4323 | case 0xbb: | 4186 | case 0xbb: |
4324 | btc: /* btc */ | 4187 | btc: /* btc */ |
4325 | emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); | 4188 | emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags); |
4326 | break; | 4189 | break; |
4327 | case 0xbc: { /* bsf */ | 4190 | case 0xbc: { /* bsf */ |
4328 | u8 zf; | 4191 | u8 zf; |
4329 | __asm__ ("bsf %2, %0; setz %1" | 4192 | __asm__ ("bsf %2, %0; setz %1" |
4330 | : "=r"(c->dst.val), "=q"(zf) | 4193 | : "=r"(ctxt->dst.val), "=q"(zf) |
4331 | : "r"(c->src.val)); | 4194 | : "r"(ctxt->src.val)); |
4332 | ctxt->eflags &= ~X86_EFLAGS_ZF; | 4195 | ctxt->eflags &= ~X86_EFLAGS_ZF; |
4333 | if (zf) { | 4196 | if (zf) { |
4334 | ctxt->eflags |= X86_EFLAGS_ZF; | 4197 | ctxt->eflags |= X86_EFLAGS_ZF; |
4335 | c->dst.type = OP_NONE; /* Disable writeback. */ | 4198 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ |
4336 | } | 4199 | } |
4337 | break; | 4200 | break; |
4338 | } | 4201 | } |
4339 | case 0xbd: { /* bsr */ | 4202 | case 0xbd: { /* bsr */ |
4340 | u8 zf; | 4203 | u8 zf; |
4341 | __asm__ ("bsr %2, %0; setz %1" | 4204 | __asm__ ("bsr %2, %0; setz %1" |
4342 | : "=r"(c->dst.val), "=q"(zf) | 4205 | : "=r"(ctxt->dst.val), "=q"(zf) |
4343 | : "r"(c->src.val)); | 4206 | : "r"(ctxt->src.val)); |
4344 | ctxt->eflags &= ~X86_EFLAGS_ZF; | 4207 | ctxt->eflags &= ~X86_EFLAGS_ZF; |
4345 | if (zf) { | 4208 | if (zf) { |
4346 | ctxt->eflags |= X86_EFLAGS_ZF; | 4209 | ctxt->eflags |= X86_EFLAGS_ZF; |
4347 | c->dst.type = OP_NONE; /* Disable writeback. */ | 4210 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ |
4348 | } | 4211 | } |
4349 | break; | 4212 | break; |
4350 | } | 4213 | } |
4351 | case 0xbe ... 0xbf: /* movsx */ | 4214 | case 0xbe ... 0xbf: /* movsx */ |
4352 | c->dst.bytes = c->op_bytes; | 4215 | ctxt->dst.bytes = ctxt->op_bytes; |
4353 | c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : | 4216 | ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : |
4354 | (s16) c->src.val; | 4217 | (s16) ctxt->src.val; |
4355 | break; | 4218 | break; |
4356 | case 0xc0 ... 0xc1: /* xadd */ | 4219 | case 0xc0 ... 0xc1: /* xadd */ |
4357 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); | 4220 | emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); |
4358 | /* Write back the register source. */ | 4221 | /* Write back the register source. */ |
4359 | c->src.val = c->dst.orig_val; | 4222 | ctxt->src.val = ctxt->dst.orig_val; |
4360 | write_register_operand(&c->src); | 4223 | write_register_operand(&ctxt->src); |
4361 | break; | 4224 | break; |
4362 | case 0xc3: /* movnti */ | 4225 | case 0xc3: /* movnti */ |
4363 | c->dst.bytes = c->op_bytes; | 4226 | ctxt->dst.bytes = ctxt->op_bytes; |
4364 | c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : | 4227 | ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : |
4365 | (u64) c->src.val; | 4228 | (u64) ctxt->src.val; |
4366 | break; | 4229 | break; |
4367 | case 0xc7: /* Grp9 (cmpxchg8b) */ | 4230 | case 0xc7: /* Grp9 (cmpxchg8b) */ |
4368 | rc = em_grp9(ctxt); | 4231 | rc = em_grp9(ctxt); |
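For readers following the cmpxchg arm above (the one that "compares EAX against destination"), here is a minimal user-space sketch of the semantics the emulator reproduces. The operand struct and names below are illustrative, not the emulator's:

#include <stdint.h>
#include <stdio.h>

/* Illustrative operand type; not the emulator's struct operand. */
struct operand { uint64_t val; };

/*
 * cmpxchg rule as emulated above: compare RAX with the destination;
 * on match write the source to memory, otherwise load the observed
 * destination value into RAX and suppress the memory writeback.
 */
static void emulate_cmpxchg(uint64_t *rax, const struct operand *src,
			    struct operand *dst, int *writeback)
{
	if (*rax == dst->val) {
		dst->val = src->val;	/* success: memory takes src */
		*writeback = 1;
	} else {
		*rax = dst->val;	/* failure: RAX sees the old value */
		*writeback = 0;
	}
}

int main(void)
{
	uint64_t rax = 5;
	struct operand src = { 9 }, dst = { 5 };
	int wb;

	emulate_cmpxchg(&rax, &src, &dst, &wb);
	printf("dst=%llu rax=%llu writeback=%d\n",
	       (unsigned long long)dst.val, (unsigned long long)rax, wb);
	return 0;
}

On failure the real emulator redirects the writeback from memory to RAX (OP_REG pointing at VCPU_REGS_RAX); the writeback flag above is only a stand-in for that redirection.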
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aee38623b768..9335e1bf72ad 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -148,7 +148,7 @@ module_param(oos_shadow, bool, 0644); | |||
148 | #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | 148 | #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ |
149 | | PT64_NX_MASK) | 149 | | PT64_NX_MASK) |
150 | 150 | ||
151 | #define RMAP_EXT 4 | 151 | #define PTE_LIST_EXT 4 |
152 | 152 | ||
153 | #define ACC_EXEC_MASK 1 | 153 | #define ACC_EXEC_MASK 1 |
154 | #define ACC_WRITE_MASK PT_WRITABLE_MASK | 154 | #define ACC_WRITE_MASK PT_WRITABLE_MASK |
@@ -164,16 +164,16 @@ module_param(oos_shadow, bool, 0644); | |||
164 | 164 | ||
165 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | 165 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
166 | 166 | ||
167 | struct kvm_rmap_desc { | 167 | struct pte_list_desc { |
168 | u64 *sptes[RMAP_EXT]; | 168 | u64 *sptes[PTE_LIST_EXT]; |
169 | struct kvm_rmap_desc *more; | 169 | struct pte_list_desc *more; |
170 | }; | 170 | }; |
171 | 171 | ||
172 | struct kvm_shadow_walk_iterator { | 172 | struct kvm_shadow_walk_iterator { |
173 | u64 addr; | 173 | u64 addr; |
174 | hpa_t shadow_addr; | 174 | hpa_t shadow_addr; |
175 | int level; | ||
176 | u64 *sptep; | 175 | u64 *sptep; |
176 | int level; | ||
177 | unsigned index; | 177 | unsigned index; |
178 | }; | 178 | }; |
179 | 179 | ||
@@ -182,32 +182,68 @@ struct kvm_shadow_walk_iterator { | |||
182 | shadow_walk_okay(&(_walker)); \ | 182 | shadow_walk_okay(&(_walker)); \ |
183 | shadow_walk_next(&(_walker))) | 183 | shadow_walk_next(&(_walker))) |
184 | 184 | ||
185 | typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); | 185 | #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \ |
186 | for (shadow_walk_init(&(_walker), _vcpu, _addr); \ | ||
187 | shadow_walk_okay(&(_walker)) && \ | ||
188 | ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \ | ||
189 | __shadow_walk_next(&(_walker), spte)) | ||
186 | 190 | ||
187 | static struct kmem_cache *pte_chain_cache; | 191 | static struct kmem_cache *pte_list_desc_cache; |
188 | static struct kmem_cache *rmap_desc_cache; | ||
189 | static struct kmem_cache *mmu_page_header_cache; | 192 | static struct kmem_cache *mmu_page_header_cache; |
190 | static struct percpu_counter kvm_total_used_mmu_pages; | 193 | static struct percpu_counter kvm_total_used_mmu_pages; |
191 | 194 | ||
192 | static u64 __read_mostly shadow_trap_nonpresent_pte; | ||
193 | static u64 __read_mostly shadow_notrap_nonpresent_pte; | ||
194 | static u64 __read_mostly shadow_nx_mask; | 195 | static u64 __read_mostly shadow_nx_mask; |
195 | static u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */ | 196 | static u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */ |
196 | static u64 __read_mostly shadow_user_mask; | 197 | static u64 __read_mostly shadow_user_mask; |
197 | static u64 __read_mostly shadow_accessed_mask; | 198 | static u64 __read_mostly shadow_accessed_mask; |
198 | static u64 __read_mostly shadow_dirty_mask; | 199 | static u64 __read_mostly shadow_dirty_mask; |
200 | static u64 __read_mostly shadow_mmio_mask; | ||
199 | 201 | ||
200 | static inline u64 rsvd_bits(int s, int e) | 202 | static void mmu_spte_set(u64 *sptep, u64 spte); |
203 | |||
204 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) | ||
201 | { | 205 | { |
202 | return ((1ULL << (e - s + 1)) - 1) << s; | 206 | shadow_mmio_mask = mmio_mask; |
207 | } | ||
208 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); | ||
209 | |||
210 | static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) | ||
211 | { | ||
212 | access &= ACC_WRITE_MASK | ACC_USER_MASK; | ||
213 | |||
214 | trace_mark_mmio_spte(sptep, gfn, access); | ||
215 | mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); | ||
203 | } | 216 | } |
204 | 217 | ||
205 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) | 218 | static bool is_mmio_spte(u64 spte) |
206 | { | 219 | { |
207 | shadow_trap_nonpresent_pte = trap_pte; | 220 | return (spte & shadow_mmio_mask) == shadow_mmio_mask; |
208 | shadow_notrap_nonpresent_pte = notrap_pte; | 221 | } |
222 | |||
223 | static gfn_t get_mmio_spte_gfn(u64 spte) | ||
224 | { | ||
225 | return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; | ||
226 | } | ||
227 | |||
228 | static unsigned get_mmio_spte_access(u64 spte) | ||
229 | { | ||
230 | return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; | ||
231 | } | ||
232 | |||
233 | static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) | ||
234 | { | ||
235 | if (unlikely(is_noslot_pfn(pfn))) { | ||
236 | mark_mmio_spte(sptep, gfn, access); | ||
237 | return true; | ||
238 | } | ||
239 | |||
240 | return false; | ||
241 | } | ||
242 | |||
243 | static inline u64 rsvd_bits(int s, int e) | ||
244 | { | ||
245 | return ((1ULL << (e - s + 1)) - 1) << s; | ||
209 | } | 246 | } |
210 | EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); | ||
211 | 247 | ||
212 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | 248 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, |
213 | u64 dirty_mask, u64 nx_mask, u64 x_mask) | 249 | u64 dirty_mask, u64 nx_mask, u64 x_mask) |
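The new MMIO-spte helpers above pack the gfn and the (write/user) access bits into a spte tagged with shadow_mmio_mask. Below is a standalone sketch of that encoding and its round-trip decoding; the mask value is made up for the example (the real value is configured through kvm_mmu_set_mmio_spte_mask() rather than hard-coded), and the access bits passed in main() are illustrative:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1ULL << PAGE_SHIFT) - 1))

/* Made-up mask for the sketch; KVM sets the real one at runtime. */
static const uint64_t mmio_mask = 0xffULL << 49;

static uint64_t mark_mmio_spte(uint64_t gfn, unsigned int access)
{
	return mmio_mask | access | (gfn << PAGE_SHIFT);
}

static int is_mmio_spte(uint64_t spte)
{
	return (spte & mmio_mask) == mmio_mask;
}

static uint64_t mmio_spte_gfn(uint64_t spte)
{
	return (spte & ~mmio_mask) >> PAGE_SHIFT;
}

static unsigned int mmio_spte_access(uint64_t spte)
{
	return (spte & ~mmio_mask) & ~PAGE_MASK;
}

int main(void)
{
	uint64_t spte = mark_mmio_spte(0x1234, 0x6);	/* access bits illustrative */

	printf("mmio=%d gfn=%#llx access=%#x\n", is_mmio_spte(spte),
	       (unsigned long long)mmio_spte_gfn(spte), mmio_spte_access(spte));
	return 0;
}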
@@ -220,11 +256,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | |||
220 | } | 256 | } |
221 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); | 257 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); |
222 | 258 | ||
223 | static bool is_write_protection(struct kvm_vcpu *vcpu) | ||
224 | { | ||
225 | return kvm_read_cr0_bits(vcpu, X86_CR0_WP); | ||
226 | } | ||
227 | |||
228 | static int is_cpuid_PSE36(void) | 259 | static int is_cpuid_PSE36(void) |
229 | { | 260 | { |
230 | return 1; | 261 | return 1; |
@@ -237,8 +268,7 @@ static int is_nx(struct kvm_vcpu *vcpu) | |||
237 | 268 | ||
238 | static int is_shadow_present_pte(u64 pte) | 269 | static int is_shadow_present_pte(u64 pte) |
239 | { | 270 | { |
240 | return pte != shadow_trap_nonpresent_pte | 271 | return pte & PT_PRESENT_MASK && !is_mmio_spte(pte); |
241 | && pte != shadow_notrap_nonpresent_pte; | ||
242 | } | 272 | } |
243 | 273 | ||
244 | static int is_large_pte(u64 pte) | 274 | static int is_large_pte(u64 pte) |
@@ -246,11 +276,6 @@ static int is_large_pte(u64 pte) | |||
246 | return pte & PT_PAGE_SIZE_MASK; | 276 | return pte & PT_PAGE_SIZE_MASK; |
247 | } | 277 | } |
248 | 278 | ||
249 | static int is_writable_pte(unsigned long pte) | ||
250 | { | ||
251 | return pte & PT_WRITABLE_MASK; | ||
252 | } | ||
253 | |||
254 | static int is_dirty_gpte(unsigned long pte) | 279 | static int is_dirty_gpte(unsigned long pte) |
255 | { | 280 | { |
256 | return pte & PT_DIRTY_MASK; | 281 | return pte & PT_DIRTY_MASK; |
@@ -282,26 +307,154 @@ static gfn_t pse36_gfn_delta(u32 gpte) | |||
282 | return (gpte & PT32_DIR_PSE36_MASK) << shift; | 307 | return (gpte & PT32_DIR_PSE36_MASK) << shift; |
283 | } | 308 | } |
284 | 309 | ||
310 | #ifdef CONFIG_X86_64 | ||
285 | static void __set_spte(u64 *sptep, u64 spte) | 311 | static void __set_spte(u64 *sptep, u64 spte) |
286 | { | 312 | { |
287 | set_64bit(sptep, spte); | 313 | *sptep = spte; |
288 | } | 314 | } |
289 | 315 | ||
290 | static u64 __xchg_spte(u64 *sptep, u64 new_spte) | 316 | static void __update_clear_spte_fast(u64 *sptep, u64 spte) |
291 | { | 317 | { |
292 | #ifdef CONFIG_X86_64 | 318 | *sptep = spte; |
293 | return xchg(sptep, new_spte); | 319 | } |
320 | |||
321 | static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) | ||
322 | { | ||
323 | return xchg(sptep, spte); | ||
324 | } | ||
325 | |||
326 | static u64 __get_spte_lockless(u64 *sptep) | ||
327 | { | ||
328 | return ACCESS_ONCE(*sptep); | ||
329 | } | ||
330 | |||
331 | static bool __check_direct_spte_mmio_pf(u64 spte) | ||
332 | { | ||
333 | /* It is valid if the spte is zapped. */ | ||
334 | return spte == 0ull; | ||
335 | } | ||
294 | #else | 336 | #else |
295 | u64 old_spte; | 337 | union split_spte { |
338 | struct { | ||
339 | u32 spte_low; | ||
340 | u32 spte_high; | ||
341 | }; | ||
342 | u64 spte; | ||
343 | }; | ||
296 | 344 | ||
297 | do { | 345 | static void count_spte_clear(u64 *sptep, u64 spte) |
298 | old_spte = *sptep; | 346 | { |
299 | } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); | 347 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); |
300 | 348 | ||
301 | return old_spte; | 349 | if (is_shadow_present_pte(spte)) |
302 | #endif | 350 | return; |
351 | |||
352 | /* Ensure the spte is completely set before we increase the count */ | ||
353 | smp_wmb(); | ||
354 | sp->clear_spte_count++; | ||
355 | } | ||
356 | |||
357 | static void __set_spte(u64 *sptep, u64 spte) | ||
358 | { | ||
359 | union split_spte *ssptep, sspte; | ||
360 | |||
361 | ssptep = (union split_spte *)sptep; | ||
362 | sspte = (union split_spte)spte; | ||
363 | |||
364 | ssptep->spte_high = sspte.spte_high; | ||
365 | |||
366 | /* | ||
367 | * If we map the spte from nonpresent to present, we should store | ||
368 | * the high bits first, then set the present bit, so the CPU cannot | ||
369 | * fetch this spte while we are setting it. | ||
370 | */ | ||
371 | smp_wmb(); | ||
372 | |||
373 | ssptep->spte_low = sspte.spte_low; | ||
303 | } | 374 | } |
304 | 375 | ||
376 | static void __update_clear_spte_fast(u64 *sptep, u64 spte) | ||
377 | { | ||
378 | union split_spte *ssptep, sspte; | ||
379 | |||
380 | ssptep = (union split_spte *)sptep; | ||
381 | sspte = (union split_spte)spte; | ||
382 | |||
383 | ssptep->spte_low = sspte.spte_low; | ||
384 | |||
385 | /* | ||
386 | * If we map the spte from present to nonpresent, we should clear | ||
387 | * the present bit first so the vcpu cannot fetch the stale high bits. | ||
388 | */ | ||
389 | smp_wmb(); | ||
390 | |||
391 | ssptep->spte_high = sspte.spte_high; | ||
392 | count_spte_clear(sptep, spte); | ||
393 | } | ||
394 | |||
395 | static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) | ||
396 | { | ||
397 | union split_spte *ssptep, sspte, orig; | ||
398 | |||
399 | ssptep = (union split_spte *)sptep; | ||
400 | sspte = (union split_spte)spte; | ||
401 | |||
402 | /* xchg acts as a barrier before the setting of the high bits */ | ||
403 | orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); | ||
404 | orig.spte_high = ssptep->spte_high = sspte.spte_high; | ||
405 | count_spte_clear(sptep, spte); | ||
406 | |||
407 | return orig.spte; | ||
408 | } | ||
409 | |||
410 | /* | ||
411 | * The idea of this lightweight way to read an spte on x86_32 comes | ||
412 | * from gup_get_pte(arch/x86/mm/gup.c). | ||
413 | * The difference is that we cannot catch the spte TLB flush once we | ||
414 | * leave guest mode, so we emulate it by incrementing clear_spte_count | ||
415 | * whenever an spte is cleared. | ||
416 | */ | ||
417 | static u64 __get_spte_lockless(u64 *sptep) | ||
418 | { | ||
419 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | ||
420 | union split_spte spte, *orig = (union split_spte *)sptep; | ||
421 | int count; | ||
422 | |||
423 | retry: | ||
424 | count = sp->clear_spte_count; | ||
425 | smp_rmb(); | ||
426 | |||
427 | spte.spte_low = orig->spte_low; | ||
428 | smp_rmb(); | ||
429 | |||
430 | spte.spte_high = orig->spte_high; | ||
431 | smp_rmb(); | ||
432 | |||
433 | if (unlikely(spte.spte_low != orig->spte_low || | ||
434 | count != sp->clear_spte_count)) | ||
435 | goto retry; | ||
436 | |||
437 | return spte.spte; | ||
438 | } | ||
439 | |||
440 | static bool __check_direct_spte_mmio_pf(u64 spte) | ||
441 | { | ||
442 | union split_spte sspte = (union split_spte)spte; | ||
443 | u32 high_mmio_mask = shadow_mmio_mask >> 32; | ||
444 | |||
445 | /* It is valid if the spte is zapped. */ | ||
446 | if (spte == 0ull) | ||
447 | return true; | ||
448 | |||
449 | /* It is valid if the spte is being zapped. */ | ||
450 | if (sspte.spte_low == 0ull && | ||
451 | (sspte.spte_high & high_mmio_mask) == high_mmio_mask) | ||
452 | return true; | ||
453 | |||
454 | return false; | ||
455 | } | ||
456 | #endif | ||
457 | |||
305 | static bool spte_has_volatile_bits(u64 spte) | 458 | static bool spte_has_volatile_bits(u64 spte) |
306 | { | 459 | { |
307 | if (!shadow_accessed_mask) | 460 | if (!shadow_accessed_mask) |
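The CONFIG_X86_64=n branch above splits each 64-bit spte into two 32-bit halves, orders the half-word writes so a walking CPU never sees a torn present spte, and pairs that with a per-page clear_spte_count read seqcount-style. Here is a compilable user-space sketch of the same idea, simplified (the count is bumped unconditionally, and GCC atomic fences stand in for smp_wmb()/smp_rmb()); types and names are illustrative, not KVM's:

#include <stdint.h>
#include <stdio.h>

/* Sketch types; not KVM's. */
union split_spte {
	struct {
		uint32_t spte_low;
		uint32_t spte_high;
	};
	uint64_t spte;
};

struct shadow_page {
	unsigned int clear_spte_count;
};

#define wmb()	__atomic_thread_fence(__ATOMIC_RELEASE)	/* stand-in for smp_wmb() */
#define rmb()	__atomic_thread_fence(__ATOMIC_ACQUIRE)	/* stand-in for smp_rmb() */

/* Present -> nonpresent: clear the low word (present bit) first. */
static void clear_spte(struct shadow_page *sp, union split_spte *s)
{
	s->spte_low = 0;
	wmb();
	s->spte_high = 0;
	wmb();
	sp->clear_spte_count++;	/* lets lockless readers detect the zap */
}

/* Seqcount-style read, retried if a concurrent clear raced with us. */
static uint64_t get_spte_lockless(struct shadow_page *sp, union split_spte *s)
{
	union split_spte spte;
	unsigned int count;

retry:
	count = sp->clear_spte_count;
	rmb();
	spte.spte_low = s->spte_low;
	rmb();
	spte.spte_high = s->spte_high;
	rmb();
	if (spte.spte_low != s->spte_low || count != sp->clear_spte_count)
		goto retry;
	return spte.spte;
}

int main(void)
{
	struct shadow_page sp = { 0 };
	union split_spte s = { .spte = 0x123456789abcdef1ULL };

	printf("%#llx\n", (unsigned long long)get_spte_lockless(&sp, &s));
	clear_spte(&sp, &s);
	printf("%#llx\n", (unsigned long long)get_spte_lockless(&sp, &s));
	return 0;
}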
@@ -322,12 +475,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) | |||
322 | return (old_spte & bit_mask) && !(new_spte & bit_mask); | 475 | return (old_spte & bit_mask) && !(new_spte & bit_mask); |
323 | } | 476 | } |
324 | 477 | ||
325 | static void update_spte(u64 *sptep, u64 new_spte) | 478 | /* Rules for using mmu_spte_set: |
479 | * Set the sptep from nonpresent to present. | ||
480 | * Note: the sptep being assigned *must* be either not present | ||
481 | * or in a state where the hardware will not attempt to update | ||
482 | * the spte. | ||
483 | */ | ||
484 | static void mmu_spte_set(u64 *sptep, u64 new_spte) | ||
485 | { | ||
486 | WARN_ON(is_shadow_present_pte(*sptep)); | ||
487 | __set_spte(sptep, new_spte); | ||
488 | } | ||
489 | |||
490 | /* Rules for using mmu_spte_update: | ||
491 | * Update the state bits; the mapped pfn is not changed. | ||
492 | */ | ||
493 | static void mmu_spte_update(u64 *sptep, u64 new_spte) | ||
326 | { | 494 | { |
327 | u64 mask, old_spte = *sptep; | 495 | u64 mask, old_spte = *sptep; |
328 | 496 | ||
329 | WARN_ON(!is_rmap_spte(new_spte)); | 497 | WARN_ON(!is_rmap_spte(new_spte)); |
330 | 498 | ||
499 | if (!is_shadow_present_pte(old_spte)) | ||
500 | return mmu_spte_set(sptep, new_spte); | ||
501 | |||
331 | new_spte |= old_spte & shadow_dirty_mask; | 502 | new_spte |= old_spte & shadow_dirty_mask; |
332 | 503 | ||
333 | mask = shadow_accessed_mask; | 504 | mask = shadow_accessed_mask; |
@@ -335,9 +506,9 @@ static void update_spte(u64 *sptep, u64 new_spte) | |||
335 | mask |= shadow_dirty_mask; | 506 | mask |= shadow_dirty_mask; |
336 | 507 | ||
337 | if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) | 508 | if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) |
338 | __set_spte(sptep, new_spte); | 509 | __update_clear_spte_fast(sptep, new_spte); |
339 | else | 510 | else |
340 | old_spte = __xchg_spte(sptep, new_spte); | 511 | old_spte = __update_clear_spte_slow(sptep, new_spte); |
341 | 512 | ||
342 | if (!shadow_accessed_mask) | 513 | if (!shadow_accessed_mask) |
343 | return; | 514 | return; |
@@ -348,6 +519,64 @@ static void update_spte(u64 *sptep, u64 new_spte) | |||
348 | kvm_set_pfn_dirty(spte_to_pfn(old_spte)); | 519 | kvm_set_pfn_dirty(spte_to_pfn(old_spte)); |
349 | } | 520 | } |
350 | 521 | ||
522 | /* | ||
523 | * Rules for using mmu_spte_clear_track_bits: | ||
524 | * It sets the sptep from present to nonpresent, tracks the | ||
525 | * state bits, and is used to clear a last-level sptep. | ||
526 | */ | ||
527 | static int mmu_spte_clear_track_bits(u64 *sptep) | ||
528 | { | ||
529 | pfn_t pfn; | ||
530 | u64 old_spte = *sptep; | ||
531 | |||
532 | if (!spte_has_volatile_bits(old_spte)) | ||
533 | __update_clear_spte_fast(sptep, 0ull); | ||
534 | else | ||
535 | old_spte = __update_clear_spte_slow(sptep, 0ull); | ||
536 | |||
537 | if (!is_rmap_spte(old_spte)) | ||
538 | return 0; | ||
539 | |||
540 | pfn = spte_to_pfn(old_spte); | ||
541 | if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) | ||
542 | kvm_set_pfn_accessed(pfn); | ||
543 | if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) | ||
544 | kvm_set_pfn_dirty(pfn); | ||
545 | return 1; | ||
546 | } | ||
547 | |||
548 | /* | ||
549 | * Rules for using mmu_spte_clear_no_track: | ||
550 | * Directly clear the spte without tracking the state bits of the sptep; | ||
551 | * it is used to set an upper-level spte. | ||
552 | */ | ||
553 | static void mmu_spte_clear_no_track(u64 *sptep) | ||
554 | { | ||
555 | __update_clear_spte_fast(sptep, 0ull); | ||
556 | } | ||
557 | |||
558 | static u64 mmu_spte_get_lockless(u64 *sptep) | ||
559 | { | ||
560 | return __get_spte_lockless(sptep); | ||
561 | } | ||
562 | |||
563 | static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) | ||
564 | { | ||
565 | rcu_read_lock(); | ||
566 | atomic_inc(&vcpu->kvm->arch.reader_counter); | ||
567 | |||
568 | /* Increase the counter before walking the shadow page table */ | ||
569 | smp_mb__after_atomic_inc(); | ||
570 | } | ||
571 | |||
572 | static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) | ||
573 | { | ||
574 | /* Decrease the counter after the shadow page table walk has finished */ | ||
575 | smp_mb__before_atomic_dec(); | ||
576 | atomic_dec(&vcpu->kvm->arch.reader_counter); | ||
577 | rcu_read_unlock(); | ||
578 | } | ||
579 | |||
351 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | 580 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, |
352 | struct kmem_cache *base_cache, int min) | 581 | struct kmem_cache *base_cache, int min) |
353 | { | 582 | { |
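mmu_spte_clear_track_bits() above takes the old spte value and then propagates its accessed/dirty state to the backing pfn before the mapping is dropped. A simplified sketch of that rule follows; the bit layout is invented for illustration, only the atomic (slow) path is shown, and none of the names are KVM's:

#include <stdint.h>
#include <stdio.h>

/* Invented bit layout for the sketch; not KVM's spte format. */
#define PRESENT_BIT	(1ULL << 0)
#define ACCESSED_BIT	(1ULL << 5)
#define DIRTY_BIT	(1ULL << 6)

/*
 * Sketch of the mmu_spte_clear_track_bits() rule: take the old value
 * atomically, then report whether its accessed/dirty state needs to be
 * propagated to the backing page.  (The real code also has a cheaper
 * non-atomic path when the old spte has no volatile bits.)
 */
static int clear_track_bits(uint64_t *sptep, int *accessed, int *dirty)
{
	uint64_t old = __atomic_exchange_n(sptep, 0, __ATOMIC_SEQ_CST);

	if (!(old & PRESENT_BIT))
		return 0;		/* nothing was mapped */
	*accessed = !!(old & ACCESSED_BIT);
	*dirty = !!(old & DIRTY_BIT);
	return 1;
}

int main(void)
{
	uint64_t spte = PRESENT_BIT | DIRTY_BIT;
	int a = 0, d = 0;

	if (clear_track_bits(&spte, &a, &d))
		printf("was mapped: accessed=%d dirty=%d\n", a, d);
	return 0;
}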
@@ -397,12 +626,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) | |||
397 | { | 626 | { |
398 | int r; | 627 | int r; |
399 | 628 | ||
400 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, | 629 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, |
401 | pte_chain_cache, 4); | 630 | pte_list_desc_cache, 8 + PTE_PREFETCH_NUM); |
402 | if (r) | ||
403 | goto out; | ||
404 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, | ||
405 | rmap_desc_cache, 4 + PTE_PREFETCH_NUM); | ||
406 | if (r) | 631 | if (r) |
407 | goto out; | 632 | goto out; |
408 | r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); | 633 | r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); |
@@ -416,8 +641,8 @@ out: | |||
416 | 641 | ||
417 | static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) | 642 | static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) |
418 | { | 643 | { |
419 | mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); | 644 | mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, |
420 | mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); | 645 | pte_list_desc_cache); |
421 | mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); | 646 | mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); |
422 | mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, | 647 | mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, |
423 | mmu_page_header_cache); | 648 | mmu_page_header_cache); |
@@ -433,26 +658,15 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, | |||
433 | return p; | 658 | return p; |
434 | } | 659 | } |
435 | 660 | ||
436 | static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) | 661 | static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) |
437 | { | ||
438 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, | ||
439 | sizeof(struct kvm_pte_chain)); | ||
440 | } | ||
441 | |||
442 | static void mmu_free_pte_chain(struct kvm_pte_chain *pc) | ||
443 | { | 662 | { |
444 | kmem_cache_free(pte_chain_cache, pc); | 663 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, |
664 | sizeof(struct pte_list_desc)); | ||
445 | } | 665 | } |
446 | 666 | ||
447 | static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) | 667 | static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) |
448 | { | 668 | { |
449 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, | 669 | kmem_cache_free(pte_list_desc_cache, pte_list_desc); |
450 | sizeof(struct kvm_rmap_desc)); | ||
451 | } | ||
452 | |||
453 | static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) | ||
454 | { | ||
455 | kmem_cache_free(rmap_desc_cache, rd); | ||
456 | } | 670 | } |
457 | 671 | ||
458 | static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) | 672 | static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) |
@@ -498,6 +712,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) | |||
498 | linfo = lpage_info_slot(gfn, slot, i); | 712 | linfo = lpage_info_slot(gfn, slot, i); |
499 | linfo->write_count += 1; | 713 | linfo->write_count += 1; |
500 | } | 714 | } |
715 | kvm->arch.indirect_shadow_pages++; | ||
501 | } | 716 | } |
502 | 717 | ||
503 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) | 718 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) |
@@ -513,6 +728,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) | |||
513 | linfo->write_count -= 1; | 728 | linfo->write_count -= 1; |
514 | WARN_ON(linfo->write_count < 0); | 729 | WARN_ON(linfo->write_count < 0); |
515 | } | 730 | } |
731 | kvm->arch.indirect_shadow_pages--; | ||
516 | } | 732 | } |
517 | 733 | ||
518 | static int has_wrprotected_page(struct kvm *kvm, | 734 | static int has_wrprotected_page(struct kvm *kvm, |
@@ -588,67 +804,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | |||
588 | } | 804 | } |
589 | 805 | ||
590 | /* | 806 | /* |
591 | * Take gfn and return the reverse mapping to it. | 807 | * Pte mapping structures: |
592 | */ | ||
593 | |||
594 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) | ||
595 | { | ||
596 | struct kvm_memory_slot *slot; | ||
597 | struct kvm_lpage_info *linfo; | ||
598 | |||
599 | slot = gfn_to_memslot(kvm, gfn); | ||
600 | if (likely(level == PT_PAGE_TABLE_LEVEL)) | ||
601 | return &slot->rmap[gfn - slot->base_gfn]; | ||
602 | |||
603 | linfo = lpage_info_slot(gfn, slot, level); | ||
604 | |||
605 | return &linfo->rmap_pde; | ||
606 | } | ||
607 | |||
608 | /* | ||
609 | * Reverse mapping data structures: | ||
610 | * | 808 | * |
611 | * If rmapp bit zero is zero, then rmapp points to the shadow page table entry | 809 | * If pte_list bit zero is zero, then pte_list points to the spte. |
612 | * that points to page_address(page). | ||
613 | * | 810 | * |
614 | * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc | 811 | * If pte_list bit zero is one, then (pte_list & ~1) points to a struct |
615 | * containing more mappings. | 812 | * pte_list_desc containing more mappings. |
616 | * | 813 | * |
617 | * Returns the number of rmap entries before the spte was added or zero if | 814 | * Returns the number of pte entries before the spte was added or zero if |
618 | * the spte was not added. | 815 | * the spte was not added. |
619 | * | 816 | * |
620 | */ | 817 | */ |
621 | static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | 818 | static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, |
819 | unsigned long *pte_list) | ||
622 | { | 820 | { |
623 | struct kvm_mmu_page *sp; | 821 | struct pte_list_desc *desc; |
624 | struct kvm_rmap_desc *desc; | ||
625 | unsigned long *rmapp; | ||
626 | int i, count = 0; | 822 | int i, count = 0; |
627 | 823 | ||
628 | if (!is_rmap_spte(*spte)) | 824 | if (!*pte_list) { |
629 | return count; | 825 | rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); |
630 | sp = page_header(__pa(spte)); | 826 | *pte_list = (unsigned long)spte; |
631 | kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); | 827 | } else if (!(*pte_list & 1)) { |
632 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); | 828 | rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); |
633 | if (!*rmapp) { | 829 | desc = mmu_alloc_pte_list_desc(vcpu); |
634 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); | 830 | desc->sptes[0] = (u64 *)*pte_list; |
635 | *rmapp = (unsigned long)spte; | ||
636 | } else if (!(*rmapp & 1)) { | ||
637 | rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); | ||
638 | desc = mmu_alloc_rmap_desc(vcpu); | ||
639 | desc->sptes[0] = (u64 *)*rmapp; | ||
640 | desc->sptes[1] = spte; | 831 | desc->sptes[1] = spte; |
641 | *rmapp = (unsigned long)desc | 1; | 832 | *pte_list = (unsigned long)desc | 1; |
642 | ++count; | 833 | ++count; |
643 | } else { | 834 | } else { |
644 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | 835 | rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); |
645 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 836 | desc = (struct pte_list_desc *)(*pte_list & ~1ul); |
646 | while (desc->sptes[RMAP_EXT-1] && desc->more) { | 837 | while (desc->sptes[PTE_LIST_EXT-1] && desc->more) { |
647 | desc = desc->more; | 838 | desc = desc->more; |
648 | count += RMAP_EXT; | 839 | count += PTE_LIST_EXT; |
649 | } | 840 | } |
650 | if (desc->sptes[RMAP_EXT-1]) { | 841 | if (desc->sptes[PTE_LIST_EXT-1]) { |
651 | desc->more = mmu_alloc_rmap_desc(vcpu); | 842 | desc->more = mmu_alloc_pte_list_desc(vcpu); |
652 | desc = desc->more; | 843 | desc = desc->more; |
653 | } | 844 | } |
654 | for (i = 0; desc->sptes[i]; ++i) | 845 | for (i = 0; desc->sptes[i]; ++i) |
@@ -658,59 +849,78 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | |||
658 | return count; | 849 | return count; |
659 | } | 850 | } |
660 | 851 | ||
661 | static void rmap_desc_remove_entry(unsigned long *rmapp, | 852 | static u64 *pte_list_next(unsigned long *pte_list, u64 *spte) |
662 | struct kvm_rmap_desc *desc, | 853 | { |
663 | int i, | 854 | struct pte_list_desc *desc; |
664 | struct kvm_rmap_desc *prev_desc) | 855 | u64 *prev_spte; |
856 | int i; | ||
857 | |||
858 | if (!*pte_list) | ||
859 | return NULL; | ||
860 | else if (!(*pte_list & 1)) { | ||
861 | if (!spte) | ||
862 | return (u64 *)*pte_list; | ||
863 | return NULL; | ||
864 | } | ||
865 | desc = (struct pte_list_desc *)(*pte_list & ~1ul); | ||
866 | prev_spte = NULL; | ||
867 | while (desc) { | ||
868 | for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { | ||
869 | if (prev_spte == spte) | ||
870 | return desc->sptes[i]; | ||
871 | prev_spte = desc->sptes[i]; | ||
872 | } | ||
873 | desc = desc->more; | ||
874 | } | ||
875 | return NULL; | ||
876 | } | ||
877 | |||
878 | static void | ||
879 | pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, | ||
880 | int i, struct pte_list_desc *prev_desc) | ||
665 | { | 881 | { |
666 | int j; | 882 | int j; |
667 | 883 | ||
668 | for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) | 884 | for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j) |
669 | ; | 885 | ; |
670 | desc->sptes[i] = desc->sptes[j]; | 886 | desc->sptes[i] = desc->sptes[j]; |
671 | desc->sptes[j] = NULL; | 887 | desc->sptes[j] = NULL; |
672 | if (j != 0) | 888 | if (j != 0) |
673 | return; | 889 | return; |
674 | if (!prev_desc && !desc->more) | 890 | if (!prev_desc && !desc->more) |
675 | *rmapp = (unsigned long)desc->sptes[0]; | 891 | *pte_list = (unsigned long)desc->sptes[0]; |
676 | else | 892 | else |
677 | if (prev_desc) | 893 | if (prev_desc) |
678 | prev_desc->more = desc->more; | 894 | prev_desc->more = desc->more; |
679 | else | 895 | else |
680 | *rmapp = (unsigned long)desc->more | 1; | 896 | *pte_list = (unsigned long)desc->more | 1; |
681 | mmu_free_rmap_desc(desc); | 897 | mmu_free_pte_list_desc(desc); |
682 | } | 898 | } |
683 | 899 | ||
684 | static void rmap_remove(struct kvm *kvm, u64 *spte) | 900 | static void pte_list_remove(u64 *spte, unsigned long *pte_list) |
685 | { | 901 | { |
686 | struct kvm_rmap_desc *desc; | 902 | struct pte_list_desc *desc; |
687 | struct kvm_rmap_desc *prev_desc; | 903 | struct pte_list_desc *prev_desc; |
688 | struct kvm_mmu_page *sp; | ||
689 | gfn_t gfn; | ||
690 | unsigned long *rmapp; | ||
691 | int i; | 904 | int i; |
692 | 905 | ||
693 | sp = page_header(__pa(spte)); | 906 | if (!*pte_list) { |
694 | gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); | 907 | printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte); |
695 | rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); | ||
696 | if (!*rmapp) { | ||
697 | printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte); | ||
698 | BUG(); | 908 | BUG(); |
699 | } else if (!(*rmapp & 1)) { | 909 | } else if (!(*pte_list & 1)) { |
700 | rmap_printk("rmap_remove: %p 1->0\n", spte); | 910 | rmap_printk("pte_list_remove: %p 1->0\n", spte); |
701 | if ((u64 *)*rmapp != spte) { | 911 | if ((u64 *)*pte_list != spte) { |
702 | printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); | 912 | printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte); |
703 | BUG(); | 913 | BUG(); |
704 | } | 914 | } |
705 | *rmapp = 0; | 915 | *pte_list = 0; |
706 | } else { | 916 | } else { |
707 | rmap_printk("rmap_remove: %p many->many\n", spte); | 917 | rmap_printk("pte_list_remove: %p many->many\n", spte); |
708 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 918 | desc = (struct pte_list_desc *)(*pte_list & ~1ul); |
709 | prev_desc = NULL; | 919 | prev_desc = NULL; |
710 | while (desc) { | 920 | while (desc) { |
711 | for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) | 921 | for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) |
712 | if (desc->sptes[i] == spte) { | 922 | if (desc->sptes[i] == spte) { |
713 | rmap_desc_remove_entry(rmapp, | 923 | pte_list_desc_remove_entry(pte_list, |
714 | desc, i, | 924 | desc, i, |
715 | prev_desc); | 925 | prev_desc); |
716 | return; | 926 | return; |
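The pte_list helpers above encode three cases in a single unsigned long: zero for an empty list, a plain pointer (bit 0 clear) for exactly one spte, and a bit-0-tagged pointer to a chain of pte_list_desc blocks for many sptes. The following self-contained sketch reproduces that tagged-pointer scheme; calloc stands in for the MMU memory caches, and the names are not KVM's:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define LIST_EXT	4	/* mirrors PTE_LIST_EXT: slots per descriptor */

/* Simplified analogue of struct pte_list_desc. */
struct desc {
	uint64_t *sptes[LIST_EXT];
	struct desc *more;
};

/*
 * head encodes three cases:
 *   0                    -> empty
 *   pointer with bit0==0 -> exactly one spte
 *   pointer | 1          -> a struct desc chain holding many sptes
 */
static void list_add(unsigned long *head, uint64_t *spte)
{
	struct desc *d;
	int i;

	if (!*head) {					/* 0 -> 1 */
		*head = (unsigned long)spte;
	} else if (!(*head & 1)) {			/* 1 -> many */
		d = calloc(1, sizeof(*d));
		d->sptes[0] = (uint64_t *)*head;
		d->sptes[1] = spte;
		*head = (unsigned long)d | 1;
	} else {					/* many -> many */
		d = (struct desc *)(*head & ~1ul);
		while (d->sptes[LIST_EXT - 1] && d->more)
			d = d->more;
		if (d->sptes[LIST_EXT - 1]) {
			d->more = calloc(1, sizeof(*d));
			d = d->more;
		}
		for (i = 0; d->sptes[i]; i++)
			;
		d->sptes[i] = spte;
	}
}

static void list_walk(unsigned long *head, void (*fn)(uint64_t *))
{
	struct desc *d;
	int i;

	if (!*head)
		return;
	if (!(*head & 1)) {
		fn((uint64_t *)*head);
		return;
	}
	for (d = (struct desc *)(*head & ~1ul); d; d = d->more)
		for (i = 0; i < LIST_EXT && d->sptes[i]; i++)
			fn(d->sptes[i]);
}

static void print_spte(uint64_t *spte)
{
	printf("%llu\n", (unsigned long long)*spte);
}

int main(void)
{
	unsigned long head = 0;
	uint64_t sptes[6] = { 1, 2, 3, 4, 5, 6 };
	int i;

	for (i = 0; i < 6; i++)
		list_add(&head, &sptes[i]);
	list_walk(&head, print_spte);
	return 0;
}

The single-entry fast path is the design point: most gfns have exactly one spte mapping them, so no descriptor is allocated until a second mapping appears.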
@@ -718,62 +928,80 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
718 | prev_desc = desc; | 928 | prev_desc = desc; |
719 | desc = desc->more; | 929 | desc = desc->more; |
720 | } | 930 | } |
721 | pr_err("rmap_remove: %p many->many\n", spte); | 931 | pr_err("pte_list_remove: %p many->many\n", spte); |
722 | BUG(); | 932 | BUG(); |
723 | } | 933 | } |
724 | } | 934 | } |
725 | 935 | ||
726 | static int set_spte_track_bits(u64 *sptep, u64 new_spte) | 936 | typedef void (*pte_list_walk_fn) (u64 *spte); |
937 | static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) | ||
727 | { | 938 | { |
728 | pfn_t pfn; | 939 | struct pte_list_desc *desc; |
729 | u64 old_spte = *sptep; | 940 | int i; |
730 | 941 | ||
731 | if (!spte_has_volatile_bits(old_spte)) | 942 | if (!*pte_list) |
732 | __set_spte(sptep, new_spte); | 943 | return; |
733 | else | ||
734 | old_spte = __xchg_spte(sptep, new_spte); | ||
735 | 944 | ||
736 | if (!is_rmap_spte(old_spte)) | 945 | if (!(*pte_list & 1)) |
737 | return 0; | 946 | return fn((u64 *)*pte_list); |
738 | 947 | ||
739 | pfn = spte_to_pfn(old_spte); | 948 | desc = (struct pte_list_desc *)(*pte_list & ~1ul); |
740 | if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) | 949 | while (desc) { |
741 | kvm_set_pfn_accessed(pfn); | 950 | for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) |
742 | if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) | 951 | fn(desc->sptes[i]); |
743 | kvm_set_pfn_dirty(pfn); | 952 | desc = desc->more; |
744 | return 1; | 953 | } |
745 | } | 954 | } |
746 | 955 | ||
747 | static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) | 956 | /* |
957 | * Take gfn and return the reverse mapping to it. | ||
958 | */ | ||
959 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) | ||
748 | { | 960 | { |
749 | if (set_spte_track_bits(sptep, new_spte)) | 961 | struct kvm_memory_slot *slot; |
750 | rmap_remove(kvm, sptep); | 962 | struct kvm_lpage_info *linfo; |
963 | |||
964 | slot = gfn_to_memslot(kvm, gfn); | ||
965 | if (likely(level == PT_PAGE_TABLE_LEVEL)) | ||
966 | return &slot->rmap[gfn - slot->base_gfn]; | ||
967 | |||
968 | linfo = lpage_info_slot(gfn, slot, level); | ||
969 | |||
970 | return &linfo->rmap_pde; | ||
971 | } | ||
972 | |||
973 | static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | ||
974 | { | ||
975 | struct kvm_mmu_page *sp; | ||
976 | unsigned long *rmapp; | ||
977 | |||
978 | sp = page_header(__pa(spte)); | ||
979 | kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); | ||
980 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); | ||
981 | return pte_list_add(vcpu, spte, rmapp); | ||
751 | } | 982 | } |
752 | 983 | ||
753 | static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) | 984 | static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) |
754 | { | 985 | { |
755 | struct kvm_rmap_desc *desc; | 986 | return pte_list_next(rmapp, spte); |
756 | u64 *prev_spte; | 987 | } |
757 | int i; | ||
758 | 988 | ||
759 | if (!*rmapp) | 989 | static void rmap_remove(struct kvm *kvm, u64 *spte) |
760 | return NULL; | 990 | { |
761 | else if (!(*rmapp & 1)) { | 991 | struct kvm_mmu_page *sp; |
762 | if (!spte) | 992 | gfn_t gfn; |
763 | return (u64 *)*rmapp; | 993 | unsigned long *rmapp; |
764 | return NULL; | 994 | |
765 | } | 995 | sp = page_header(__pa(spte)); |
766 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 996 | gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); |
767 | prev_spte = NULL; | 997 | rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); |
768 | while (desc) { | 998 | pte_list_remove(spte, rmapp); |
769 | for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { | 999 | } |
770 | if (prev_spte == spte) | 1000 | |
771 | return desc->sptes[i]; | 1001 | static void drop_spte(struct kvm *kvm, u64 *sptep) |
772 | prev_spte = desc->sptes[i]; | 1002 | { |
773 | } | 1003 | if (mmu_spte_clear_track_bits(sptep)) |
774 | desc = desc->more; | 1004 | rmap_remove(kvm, sptep); |
775 | } | ||
776 | return NULL; | ||
777 | } | 1005 | } |
778 | 1006 | ||
779 | static int rmap_write_protect(struct kvm *kvm, u64 gfn) | 1007 | static int rmap_write_protect(struct kvm *kvm, u64 gfn) |
@@ -790,7 +1018,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
790 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 1018 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
791 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); | 1019 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); |
792 | if (is_writable_pte(*spte)) { | 1020 | if (is_writable_pte(*spte)) { |
793 | update_spte(spte, *spte & ~PT_WRITABLE_MASK); | 1021 | mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); |
794 | write_protected = 1; | 1022 | write_protected = 1; |
795 | } | 1023 | } |
796 | spte = rmap_next(kvm, rmapp, spte); | 1024 | spte = rmap_next(kvm, rmapp, spte); |
@@ -807,8 +1035,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
807 | BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); | 1035 | BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); |
808 | pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); | 1036 | pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); |
809 | if (is_writable_pte(*spte)) { | 1037 | if (is_writable_pte(*spte)) { |
810 | drop_spte(kvm, spte, | 1038 | drop_spte(kvm, spte); |
811 | shadow_trap_nonpresent_pte); | ||
812 | --kvm->stat.lpages; | 1039 | --kvm->stat.lpages; |
813 | spte = NULL; | 1040 | spte = NULL; |
814 | write_protected = 1; | 1041 | write_protected = 1; |
@@ -829,7 +1056,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
829 | while ((spte = rmap_next(kvm, rmapp, NULL))) { | 1056 | while ((spte = rmap_next(kvm, rmapp, NULL))) { |
830 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 1057 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
831 | rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); | 1058 | rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); |
832 | drop_spte(kvm, spte, shadow_trap_nonpresent_pte); | 1059 | drop_spte(kvm, spte); |
833 | need_tlb_flush = 1; | 1060 | need_tlb_flush = 1; |
834 | } | 1061 | } |
835 | return need_tlb_flush; | 1062 | return need_tlb_flush; |
@@ -851,7 +1078,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
851 | rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); | 1078 | rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); |
852 | need_flush = 1; | 1079 | need_flush = 1; |
853 | if (pte_write(*ptep)) { | 1080 | if (pte_write(*ptep)) { |
854 | drop_spte(kvm, spte, shadow_trap_nonpresent_pte); | 1081 | drop_spte(kvm, spte); |
855 | spte = rmap_next(kvm, rmapp, NULL); | 1082 | spte = rmap_next(kvm, rmapp, NULL); |
856 | } else { | 1083 | } else { |
857 | new_spte = *spte &~ (PT64_BASE_ADDR_MASK); | 1084 | new_spte = *spte &~ (PT64_BASE_ADDR_MASK); |
@@ -860,7 +1087,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
860 | new_spte &= ~PT_WRITABLE_MASK; | 1087 | new_spte &= ~PT_WRITABLE_MASK; |
861 | new_spte &= ~SPTE_HOST_WRITEABLE; | 1088 | new_spte &= ~SPTE_HOST_WRITEABLE; |
862 | new_spte &= ~shadow_accessed_mask; | 1089 | new_spte &= ~shadow_accessed_mask; |
863 | set_spte_track_bits(spte, new_spte); | 1090 | mmu_spte_clear_track_bits(spte); |
1091 | mmu_spte_set(spte, new_spte); | ||
864 | spte = rmap_next(kvm, rmapp, spte); | 1092 | spte = rmap_next(kvm, rmapp, spte); |
865 | } | 1093 | } |
866 | } | 1094 | } |
@@ -1032,151 +1260,89 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) | |||
1032 | percpu_counter_add(&kvm_total_used_mmu_pages, nr); | 1260 | percpu_counter_add(&kvm_total_used_mmu_pages, nr); |
1033 | } | 1261 | } |
1034 | 1262 | ||
1035 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1263 | /* |
1264 | * Remove the sp from the shadow page cache; after this call the | ||
1265 | * sp can no longer be found in the cache, but its shadow page | ||
1266 | * table is still valid. | ||
1267 | * It should be called under the protection of the mmu lock. | ||
1268 | */ | ||
1269 | static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp) | ||
1036 | { | 1270 | { |
1037 | ASSERT(is_empty_shadow_page(sp->spt)); | 1271 | ASSERT(is_empty_shadow_page(sp->spt)); |
1038 | hlist_del(&sp->hash_link); | 1272 | hlist_del(&sp->hash_link); |
1039 | list_del(&sp->link); | ||
1040 | free_page((unsigned long)sp->spt); | ||
1041 | if (!sp->role.direct) | 1273 | if (!sp->role.direct) |
1042 | free_page((unsigned long)sp->gfns); | 1274 | free_page((unsigned long)sp->gfns); |
1043 | kmem_cache_free(mmu_page_header_cache, sp); | ||
1044 | kvm_mod_used_mmu_pages(kvm, -1); | ||
1045 | } | 1275 | } |
1046 | 1276 | ||
1047 | static unsigned kvm_page_table_hashfn(gfn_t gfn) | 1277 | /* |
1278 | * Free the shadow page table and the sp; this can be done | ||
1279 | * outside the protection of the mmu lock. | ||
1280 | */ | ||
1281 | static void kvm_mmu_free_page(struct kvm_mmu_page *sp) | ||
1048 | { | 1282 | { |
1049 | return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); | 1283 | list_del(&sp->link); |
1284 | free_page((unsigned long)sp->spt); | ||
1285 | kmem_cache_free(mmu_page_header_cache, sp); | ||
1050 | } | 1286 | } |
1051 | 1287 | ||
1052 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | 1288 | static unsigned kvm_page_table_hashfn(gfn_t gfn) |
1053 | u64 *parent_pte, int direct) | ||
1054 | { | 1289 | { |
1055 | struct kvm_mmu_page *sp; | 1290 | return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); |
1056 | |||
1057 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); | ||
1058 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
1059 | if (!direct) | ||
1060 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, | ||
1061 | PAGE_SIZE); | ||
1062 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | ||
1063 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | ||
1064 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); | ||
1065 | sp->multimapped = 0; | ||
1066 | sp->parent_pte = parent_pte; | ||
1067 | kvm_mod_used_mmu_pages(vcpu->kvm, +1); | ||
1068 | return sp; | ||
1069 | } | 1291 | } |
1070 | 1292 | ||
1071 | static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, | 1293 | static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, |
1072 | struct kvm_mmu_page *sp, u64 *parent_pte) | 1294 | struct kvm_mmu_page *sp, u64 *parent_pte) |
1073 | { | 1295 | { |
1074 | struct kvm_pte_chain *pte_chain; | ||
1075 | struct hlist_node *node; | ||
1076 | int i; | ||
1077 | |||
1078 | if (!parent_pte) | 1296 | if (!parent_pte) |
1079 | return; | 1297 | return; |
1080 | if (!sp->multimapped) { | ||
1081 | u64 *old = sp->parent_pte; | ||
1082 | 1298 | ||
1083 | if (!old) { | 1299 | pte_list_add(vcpu, parent_pte, &sp->parent_ptes); |
1084 | sp->parent_pte = parent_pte; | ||
1085 | return; | ||
1086 | } | ||
1087 | sp->multimapped = 1; | ||
1088 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
1089 | INIT_HLIST_HEAD(&sp->parent_ptes); | ||
1090 | hlist_add_head(&pte_chain->link, &sp->parent_ptes); | ||
1091 | pte_chain->parent_ptes[0] = old; | ||
1092 | } | ||
1093 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { | ||
1094 | if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) | ||
1095 | continue; | ||
1096 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) | ||
1097 | if (!pte_chain->parent_ptes[i]) { | ||
1098 | pte_chain->parent_ptes[i] = parent_pte; | ||
1099 | return; | ||
1100 | } | ||
1101 | } | ||
1102 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
1103 | BUG_ON(!pte_chain); | ||
1104 | hlist_add_head(&pte_chain->link, &sp->parent_ptes); | ||
1105 | pte_chain->parent_ptes[0] = parent_pte; | ||
1106 | } | 1300 | } |
1107 | 1301 | ||
1108 | static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, | 1302 | static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, |
1109 | u64 *parent_pte) | 1303 | u64 *parent_pte) |
1110 | { | 1304 | { |
1111 | struct kvm_pte_chain *pte_chain; | 1305 | pte_list_remove(parent_pte, &sp->parent_ptes); |
1112 | struct hlist_node *node; | ||
1113 | int i; | ||
1114 | |||
1115 | if (!sp->multimapped) { | ||
1116 | BUG_ON(sp->parent_pte != parent_pte); | ||
1117 | sp->parent_pte = NULL; | ||
1118 | return; | ||
1119 | } | ||
1120 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
1121 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
1122 | if (!pte_chain->parent_ptes[i]) | ||
1123 | break; | ||
1124 | if (pte_chain->parent_ptes[i] != parent_pte) | ||
1125 | continue; | ||
1126 | while (i + 1 < NR_PTE_CHAIN_ENTRIES | ||
1127 | && pte_chain->parent_ptes[i + 1]) { | ||
1128 | pte_chain->parent_ptes[i] | ||
1129 | = pte_chain->parent_ptes[i + 1]; | ||
1130 | ++i; | ||
1131 | } | ||
1132 | pte_chain->parent_ptes[i] = NULL; | ||
1133 | if (i == 0) { | ||
1134 | hlist_del(&pte_chain->link); | ||
1135 | mmu_free_pte_chain(pte_chain); | ||
1136 | if (hlist_empty(&sp->parent_ptes)) { | ||
1137 | sp->multimapped = 0; | ||
1138 | sp->parent_pte = NULL; | ||
1139 | } | ||
1140 | } | ||
1141 | return; | ||
1142 | } | ||
1143 | BUG(); | ||
1144 | } | 1306 | } |
1145 | 1307 | ||
1146 | static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) | 1308 | static void drop_parent_pte(struct kvm_mmu_page *sp, |
1309 | u64 *parent_pte) | ||
1147 | { | 1310 | { |
1148 | struct kvm_pte_chain *pte_chain; | 1311 | mmu_page_remove_parent_pte(sp, parent_pte); |
1149 | struct hlist_node *node; | 1312 | mmu_spte_clear_no_track(parent_pte); |
1150 | struct kvm_mmu_page *parent_sp; | 1313 | } |
1151 | int i; | ||
1152 | |||
1153 | if (!sp->multimapped && sp->parent_pte) { | ||
1154 | parent_sp = page_header(__pa(sp->parent_pte)); | ||
1155 | fn(parent_sp, sp->parent_pte); | ||
1156 | return; | ||
1157 | } | ||
1158 | |||
1159 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
1160 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
1161 | u64 *spte = pte_chain->parent_ptes[i]; | ||
1162 | 1314 | ||
1163 | if (!spte) | 1315 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, |
1164 | break; | 1316 | u64 *parent_pte, int direct) |
1165 | parent_sp = page_header(__pa(spte)); | 1317 | { |
1166 | fn(parent_sp, spte); | 1318 | struct kvm_mmu_page *sp; |
1167 | } | 1319 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, |
1320 | sizeof *sp); | ||
1321 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
1322 | if (!direct) | ||
1323 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, | ||
1324 | PAGE_SIZE); | ||
1325 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | ||
1326 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | ||
1327 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); | ||
1328 | sp->parent_ptes = 0; | ||
1329 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | ||
1330 | kvm_mod_used_mmu_pages(vcpu->kvm, +1); | ||
1331 | return sp; | ||
1168 | } | 1332 | } |
1169 | 1333 | ||
1170 | static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); | 1334 | static void mark_unsync(u64 *spte); |
1171 | static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) | 1335 | static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) |
1172 | { | 1336 | { |
1173 | mmu_parent_walk(sp, mark_unsync); | 1337 | pte_list_walk(&sp->parent_ptes, mark_unsync); |
1174 | } | 1338 | } |
1175 | 1339 | ||
1176 | static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) | 1340 | static void mark_unsync(u64 *spte) |
1177 | { | 1341 | { |
1342 | struct kvm_mmu_page *sp; | ||
1178 | unsigned int index; | 1343 | unsigned int index; |
1179 | 1344 | ||
1345 | sp = page_header(__pa(spte)); | ||
1180 | index = spte - sp->spt; | 1346 | index = spte - sp->spt; |
1181 | if (__test_and_set_bit(index, sp->unsync_child_bitmap)) | 1347 | if (__test_and_set_bit(index, sp->unsync_child_bitmap)) |
1182 | return; | 1348 | return; |
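The rewritten mmu_page_add_parent_pte()/mmu_page_remove_parent_pte() above simply delegate to the generic pte_list helpers, and the old open-coded kvm_pte_chain bookkeeping disappears. Those helpers are not part of this hunk, so the standalone sketch below only illustrates the general shape such a reverse-mapping list usually takes — one entry stored inline, further entries spilling into small descriptor blocks. The encoding here is a guess for illustration, not KVM's actual pte_list_desc layout.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Illustrative "inline first entry, then descriptor blocks" list; the
 * kernel's pte_list encoding may differ. */
#define DESC_ENTRIES 4

struct desc {
	uint64_t *ptes[DESC_ENTRIES];
	struct desc *more;
};

struct pte_list {
	uint64_t *single;	/* used while the list has exactly one entry */
	struct desc *descs;	/* used once it has more than one            */
};

static void list_add(struct pte_list *l, uint64_t *pte)
{
	struct desc *d;
	int i;

	if (!l->single && !l->descs) {		/* first entry: store inline */
		l->single = pte;
		return;
	}
	if (l->single) {			/* second entry: spill over  */
		d = calloc(1, sizeof(*d));
		d->ptes[0] = l->single;
		d->more = l->descs;
		l->descs = d;
		l->single = NULL;
	}
	for (d = l->descs; d; d = d->more)
		for (i = 0; i < DESC_ENTRIES; i++)
			if (!d->ptes[i]) {
				d->ptes[i] = pte;
				return;
			}
	d = calloc(1, sizeof(*d));		/* all blocks full: chain one */
	d->ptes[0] = pte;
	d->more = l->descs;
	l->descs = d;
}

int main(void)
{
	static uint64_t sptes[6];
	struct pte_list parents = { 0 };
	int i;

	for (i = 0; i < 6; i++)
		list_add(&parents, &sptes[i]);
	printf("inline=%p descs=%p\n", (void *)parents.single, (void *)parents.descs);
	return 0;
}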
@@ -1185,15 +1351,6 @@ static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) | |||
1185 | kvm_mmu_mark_parents_unsync(sp); | 1351 | kvm_mmu_mark_parents_unsync(sp); |
1186 | } | 1352 | } |
1187 | 1353 | ||
1188 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | ||
1189 | struct kvm_mmu_page *sp) | ||
1190 | { | ||
1191 | int i; | ||
1192 | |||
1193 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
1194 | sp->spt[i] = shadow_trap_nonpresent_pte; | ||
1195 | } | ||
1196 | |||
1197 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, | 1354 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, |
1198 | struct kvm_mmu_page *sp) | 1355 | struct kvm_mmu_page *sp) |
1199 | { | 1356 | { |
@@ -1475,6 +1632,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, | |||
1475 | } | 1632 | } |
1476 | } | 1633 | } |
1477 | 1634 | ||
1635 | static void init_shadow_page_table(struct kvm_mmu_page *sp) | ||
1636 | { | ||
1637 | int i; | ||
1638 | |||
1639 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
1640 | sp->spt[i] = 0ull; | ||
1641 | } | ||
1642 | |||
1478 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | 1643 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, |
1479 | gfn_t gfn, | 1644 | gfn_t gfn, |
1480 | gva_t gaddr, | 1645 | gva_t gaddr, |
@@ -1537,10 +1702,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1537 | 1702 | ||
1538 | account_shadowed(vcpu->kvm, gfn); | 1703 | account_shadowed(vcpu->kvm, gfn); |
1539 | } | 1704 | } |
1540 | if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) | 1705 | init_shadow_page_table(sp); |
1541 | vcpu->arch.mmu.prefetch_page(vcpu, sp); | ||
1542 | else | ||
1543 | nonpaging_prefetch_page(vcpu, sp); | ||
1544 | trace_kvm_mmu_get_page(sp, true); | 1706 | trace_kvm_mmu_get_page(sp, true); |
1545 | return sp; | 1707 | return sp; |
1546 | } | 1708 | } |
@@ -1572,21 +1734,28 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) | |||
1572 | if (iterator->level < PT_PAGE_TABLE_LEVEL) | 1734 | if (iterator->level < PT_PAGE_TABLE_LEVEL) |
1573 | return false; | 1735 | return false; |
1574 | 1736 | ||
1575 | if (iterator->level == PT_PAGE_TABLE_LEVEL) | ||
1576 | if (is_large_pte(*iterator->sptep)) | ||
1577 | return false; | ||
1578 | |||
1579 | iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); | 1737 | iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); |
1580 | iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; | 1738 | iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; |
1581 | return true; | 1739 | return true; |
1582 | } | 1740 | } |
1583 | 1741 | ||
1584 | static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) | 1742 | static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, |
1743 | u64 spte) | ||
1585 | { | 1744 | { |
1586 | iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; | 1745 | if (is_last_spte(spte, iterator->level)) { |
1746 | iterator->level = 0; | ||
1747 | return; | ||
1748 | } | ||
1749 | |||
1750 | iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; | ||
1587 | --iterator->level; | 1751 | --iterator->level; |
1588 | } | 1752 | } |
1589 | 1753 | ||
1754 | static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) | ||
1755 | { | ||
1756 | return __shadow_walk_next(iterator, *iterator->sptep); | ||
1757 | } | ||
1758 | |||
1590 | static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) | 1759 | static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) |
1591 | { | 1760 | { |
1592 | u64 spte; | 1761 | u64 spte; |
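In the hunk above, shadow_walk_next() becomes a thin wrapper around __shadow_walk_next(), which takes the spte value as an argument instead of rereading *iterator->sptep. That lets a lockless walker (see walk_shadow_page_get_mmio_spte() further down) read each entry once and advance from that snapshot. Below is a generic, KVM-free sketch of the read-once iteration pattern; all names are made up for illustration.

#include <stdio.h>

/* Read-once iteration: the caller samples the current entry a single
 * time and advances from that snapshot, never rereading the (possibly
 * concurrently modified) slot.  Purely illustrative; no KVM types. */
struct node { struct node *next; int val; };
struct iter { struct node *cur; };

static int iter_okay(const struct iter *it)           { return it->cur != NULL; }
static struct node *iter_read(const struct iter *it)  { return it->cur; }

static void iter_advance(struct iter *it, struct node *snapshot)
{
	it->cur = snapshot->next;	/* advance from the snapshot, not from *it */
}

int main(void)
{
	struct node c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
	struct iter it = { &a };
	struct node *snap;

	for (; iter_okay(&it) && (snap = iter_read(&it)) != NULL;
	     iter_advance(&it, snap))
		printf("%d\n", snap->val);
	return 0;
}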
@@ -1594,13 +1763,13 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) | |||
1594 | spte = __pa(sp->spt) | 1763 | spte = __pa(sp->spt) |
1595 | | PT_PRESENT_MASK | PT_ACCESSED_MASK | 1764 | | PT_PRESENT_MASK | PT_ACCESSED_MASK |
1596 | | PT_WRITABLE_MASK | PT_USER_MASK; | 1765 | | PT_WRITABLE_MASK | PT_USER_MASK; |
1597 | __set_spte(sptep, spte); | 1766 | mmu_spte_set(sptep, spte); |
1598 | } | 1767 | } |
1599 | 1768 | ||
1600 | static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) | 1769 | static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) |
1601 | { | 1770 | { |
1602 | if (is_large_pte(*sptep)) { | 1771 | if (is_large_pte(*sptep)) { |
1603 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); | 1772 | drop_spte(vcpu->kvm, sptep); |
1604 | kvm_flush_remote_tlbs(vcpu->kvm); | 1773 | kvm_flush_remote_tlbs(vcpu->kvm); |
1605 | } | 1774 | } |
1606 | } | 1775 | } |
@@ -1622,38 +1791,39 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1622 | if (child->role.access == direct_access) | 1791 | if (child->role.access == direct_access) |
1623 | return; | 1792 | return; |
1624 | 1793 | ||
1625 | mmu_page_remove_parent_pte(child, sptep); | 1794 | drop_parent_pte(child, sptep); |
1626 | __set_spte(sptep, shadow_trap_nonpresent_pte); | ||
1627 | kvm_flush_remote_tlbs(vcpu->kvm); | 1795 | kvm_flush_remote_tlbs(vcpu->kvm); |
1628 | } | 1796 | } |
1629 | } | 1797 | } |
1630 | 1798 | ||
1799 | static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, | ||
1800 | u64 *spte) | ||
1801 | { | ||
1802 | u64 pte; | ||
1803 | struct kvm_mmu_page *child; | ||
1804 | |||
1805 | pte = *spte; | ||
1806 | if (is_shadow_present_pte(pte)) { | ||
1807 | if (is_last_spte(pte, sp->role.level)) | ||
1808 | drop_spte(kvm, spte); | ||
1809 | else { | ||
1810 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
1811 | drop_parent_pte(child, spte); | ||
1812 | } | ||
1813 | } else if (is_mmio_spte(pte)) | ||
1814 | mmu_spte_clear_no_track(spte); | ||
1815 | |||
1816 | if (is_large_pte(pte)) | ||
1817 | --kvm->stat.lpages; | ||
1818 | } | ||
1819 | |||
1631 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | 1820 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, |
1632 | struct kvm_mmu_page *sp) | 1821 | struct kvm_mmu_page *sp) |
1633 | { | 1822 | { |
1634 | unsigned i; | 1823 | unsigned i; |
1635 | u64 *pt; | 1824 | |
1636 | u64 ent; | 1825 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) |
1637 | 1826 | mmu_page_zap_pte(kvm, sp, sp->spt + i); | |
1638 | pt = sp->spt; | ||
1639 | |||
1640 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
1641 | ent = pt[i]; | ||
1642 | |||
1643 | if (is_shadow_present_pte(ent)) { | ||
1644 | if (!is_last_spte(ent, sp->role.level)) { | ||
1645 | ent &= PT64_BASE_ADDR_MASK; | ||
1646 | mmu_page_remove_parent_pte(page_header(ent), | ||
1647 | &pt[i]); | ||
1648 | } else { | ||
1649 | if (is_large_pte(ent)) | ||
1650 | --kvm->stat.lpages; | ||
1651 | drop_spte(kvm, &pt[i], | ||
1652 | shadow_trap_nonpresent_pte); | ||
1653 | } | ||
1654 | } | ||
1655 | pt[i] = shadow_trap_nonpresent_pte; | ||
1656 | } | ||
1657 | } | 1827 | } |
1658 | 1828 | ||
1659 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) | 1829 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) |
@@ -1674,20 +1844,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
1674 | { | 1844 | { |
1675 | u64 *parent_pte; | 1845 | u64 *parent_pte; |
1676 | 1846 | ||
1677 | while (sp->multimapped || sp->parent_pte) { | 1847 | while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) |
1678 | if (!sp->multimapped) | 1848 | drop_parent_pte(sp, parent_pte); |
1679 | parent_pte = sp->parent_pte; | ||
1680 | else { | ||
1681 | struct kvm_pte_chain *chain; | ||
1682 | |||
1683 | chain = container_of(sp->parent_ptes.first, | ||
1684 | struct kvm_pte_chain, link); | ||
1685 | parent_pte = chain->parent_ptes[0]; | ||
1686 | } | ||
1687 | BUG_ON(!parent_pte); | ||
1688 | kvm_mmu_put_page(sp, parent_pte); | ||
1689 | __set_spte(parent_pte, shadow_trap_nonpresent_pte); | ||
1690 | } | ||
1691 | } | 1849 | } |
1692 | 1850 | ||
1693 | static int mmu_zap_unsync_children(struct kvm *kvm, | 1851 | static int mmu_zap_unsync_children(struct kvm *kvm, |
@@ -1734,6 +1892,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | |||
1734 | /* Count self */ | 1892 | /* Count self */ |
1735 | ret++; | 1893 | ret++; |
1736 | list_move(&sp->link, invalid_list); | 1894 | list_move(&sp->link, invalid_list); |
1895 | kvm_mod_used_mmu_pages(kvm, -1); | ||
1737 | } else { | 1896 | } else { |
1738 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | 1897 | list_move(&sp->link, &kvm->arch.active_mmu_pages); |
1739 | kvm_reload_remote_mmus(kvm); | 1898 | kvm_reload_remote_mmus(kvm); |
@@ -1744,6 +1903,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | |||
1744 | return ret; | 1903 | return ret; |
1745 | } | 1904 | } |
1746 | 1905 | ||
1906 | static void kvm_mmu_isolate_pages(struct list_head *invalid_list) | ||
1907 | { | ||
1908 | struct kvm_mmu_page *sp; | ||
1909 | |||
1910 | list_for_each_entry(sp, invalid_list, link) | ||
1911 | kvm_mmu_isolate_page(sp); | ||
1912 | } | ||
1913 | |||
1914 | static void free_pages_rcu(struct rcu_head *head) | ||
1915 | { | ||
1916 | struct kvm_mmu_page *next, *sp; | ||
1917 | |||
1918 | sp = container_of(head, struct kvm_mmu_page, rcu); | ||
1919 | while (sp) { | ||
1920 | if (!list_empty(&sp->link)) | ||
1921 | next = list_first_entry(&sp->link, | ||
1922 | struct kvm_mmu_page, link); | ||
1923 | else | ||
1924 | next = NULL; | ||
1925 | kvm_mmu_free_page(sp); | ||
1926 | sp = next; | ||
1927 | } | ||
1928 | } | ||
1929 | |||
1747 | static void kvm_mmu_commit_zap_page(struct kvm *kvm, | 1930 | static void kvm_mmu_commit_zap_page(struct kvm *kvm, |
1748 | struct list_head *invalid_list) | 1931 | struct list_head *invalid_list) |
1749 | { | 1932 | { |
@@ -1754,10 +1937,21 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, | |||
1754 | 1937 | ||
1755 | kvm_flush_remote_tlbs(kvm); | 1938 | kvm_flush_remote_tlbs(kvm); |
1756 | 1939 | ||
1940 | if (atomic_read(&kvm->arch.reader_counter)) { | ||
1941 | kvm_mmu_isolate_pages(invalid_list); | ||
1942 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); | ||
1943 | list_del_init(invalid_list); | ||
1944 | |||
1945 | trace_kvm_mmu_delay_free_pages(sp); | ||
1946 | call_rcu(&sp->rcu, free_pages_rcu); | ||
1947 | return; | ||
1948 | } | ||
1949 | |||
1757 | do { | 1950 | do { |
1758 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); | 1951 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); |
1759 | WARN_ON(!sp->role.invalid || sp->root_count); | 1952 | WARN_ON(!sp->role.invalid || sp->root_count); |
1760 | kvm_mmu_free_page(kvm, sp); | 1953 | kvm_mmu_isolate_page(sp); |
1954 | kvm_mmu_free_page(sp); | ||
1761 | } while (!list_empty(invalid_list)); | 1955 | } while (!list_empty(invalid_list)); |
1762 | 1956 | ||
1763 | } | 1957 | } |
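In the commit path above, when arch.reader_counter shows that lockless walkers may still be traversing the shadow pages, the zapped pages stay linked together and are handed to call_rcu() instead of being freed at once. The userspace model below illustrates the same defer-while-readers-exist idea with a plain counter and a deferred list; it is only an analogy, not the kernel's RCU mechanism, and every name in it is invented.

#include <stdio.h>
#include <stdlib.h>

/* Free immediately when no lockless readers exist, otherwise park the
 * object on a deferred list that is drained once the last reader
 * leaves (a stand-in for the call_rcu() grace period). */
struct page { struct page *next; int id; };

static int reader_count;	/* models kvm->arch.reader_counter */
static struct page *deferred;	/* models the RCU callback list    */

static void reclaim(struct page *p)
{
	if (reader_count) {	/* a walker may still see the page */
		p->next = deferred;
		deferred = p;
		return;
	}
	printf("freeing page %d immediately\n", p->id);
	free(p);
}

static void reader_exit(void)
{
	if (--reader_count)
		return;
	while (deferred) {	/* "grace period" over: drain the list */
		struct page *p = deferred;
		deferred = p->next;
		printf("freeing deferred page %d\n", p->id);
		free(p);
	}
}

int main(void)
{
	struct page *a = malloc(sizeof(*a)), *b = malloc(sizeof(*b));
	a->id = 1; b->id = 2;

	reader_count++;		/* lockless walk in progress */
	reclaim(a);		/* deferred                  */
	reader_exit();		/* drains: frees page 1      */
	reclaim(b);		/* no readers: freed at once */
	return 0;
}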
@@ -1783,8 +1977,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) | |||
1783 | page = container_of(kvm->arch.active_mmu_pages.prev, | 1977 | page = container_of(kvm->arch.active_mmu_pages.prev, |
1784 | struct kvm_mmu_page, link); | 1978 | struct kvm_mmu_page, link); |
1785 | kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); | 1979 | kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); |
1786 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | ||
1787 | } | 1980 | } |
1981 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | ||
1788 | goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; | 1982 | goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; |
1789 | } | 1983 | } |
1790 | 1984 | ||
@@ -1833,20 +2027,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | |||
1833 | __set_bit(slot, sp->slot_bitmap); | 2027 | __set_bit(slot, sp->slot_bitmap); |
1834 | } | 2028 | } |
1835 | 2029 | ||
1836 | static void mmu_convert_notrap(struct kvm_mmu_page *sp) | ||
1837 | { | ||
1838 | int i; | ||
1839 | u64 *pt = sp->spt; | ||
1840 | |||
1841 | if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) | ||
1842 | return; | ||
1843 | |||
1844 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
1845 | if (pt[i] == shadow_notrap_nonpresent_pte) | ||
1846 | __set_spte(&pt[i], shadow_trap_nonpresent_pte); | ||
1847 | } | ||
1848 | } | ||
1849 | |||
1850 | /* | 2030 | /* |
1851 | * The function is based on mtrr_type_lookup() in | 2031 | * The function is based on mtrr_type_lookup() in |
1852 | * arch/x86/kernel/cpu/mtrr/generic.c | 2032 | * arch/x86/kernel/cpu/mtrr/generic.c |
@@ -1959,7 +2139,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
1959 | sp->unsync = 1; | 2139 | sp->unsync = 1; |
1960 | 2140 | ||
1961 | kvm_mmu_mark_parents_unsync(sp); | 2141 | kvm_mmu_mark_parents_unsync(sp); |
1962 | mmu_convert_notrap(sp); | ||
1963 | } | 2142 | } |
1964 | 2143 | ||
1965 | static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) | 2144 | static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) |
@@ -2002,13 +2181,16 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
2002 | 2181 | ||
2003 | static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | 2182 | static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
2004 | unsigned pte_access, int user_fault, | 2183 | unsigned pte_access, int user_fault, |
2005 | int write_fault, int dirty, int level, | 2184 | int write_fault, int level, |
2006 | gfn_t gfn, pfn_t pfn, bool speculative, | 2185 | gfn_t gfn, pfn_t pfn, bool speculative, |
2007 | bool can_unsync, bool host_writable) | 2186 | bool can_unsync, bool host_writable) |
2008 | { | 2187 | { |
2009 | u64 spte, entry = *sptep; | 2188 | u64 spte, entry = *sptep; |
2010 | int ret = 0; | 2189 | int ret = 0; |
2011 | 2190 | ||
2191 | if (set_mmio_spte(sptep, gfn, pfn, pte_access)) | ||
2192 | return 0; | ||
2193 | |||
2012 | /* | 2194 | /* |
2013 | * We don't set the accessed bit, since we sometimes want to see | 2195 | * We don't set the accessed bit, since we sometimes want to see |
2014 | * whether the guest actually used the pte (in order to detect | 2196 | * whether the guest actually used the pte (in order to detect |
@@ -2017,8 +2199,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2017 | spte = PT_PRESENT_MASK; | 2199 | spte = PT_PRESENT_MASK; |
2018 | if (!speculative) | 2200 | if (!speculative) |
2019 | spte |= shadow_accessed_mask; | 2201 | spte |= shadow_accessed_mask; |
2020 | if (!dirty) | 2202 | |
2021 | pte_access &= ~ACC_WRITE_MASK; | ||
2022 | if (pte_access & ACC_EXEC_MASK) | 2203 | if (pte_access & ACC_EXEC_MASK) |
2023 | spte |= shadow_x_mask; | 2204 | spte |= shadow_x_mask; |
2024 | else | 2205 | else |
@@ -2045,15 +2226,24 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2045 | if (level > PT_PAGE_TABLE_LEVEL && | 2226 | if (level > PT_PAGE_TABLE_LEVEL && |
2046 | has_wrprotected_page(vcpu->kvm, gfn, level)) { | 2227 | has_wrprotected_page(vcpu->kvm, gfn, level)) { |
2047 | ret = 1; | 2228 | ret = 1; |
2048 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); | 2229 | drop_spte(vcpu->kvm, sptep); |
2049 | goto done; | 2230 | goto done; |
2050 | } | 2231 | } |
2051 | 2232 | ||
2052 | spte |= PT_WRITABLE_MASK; | 2233 | spte |= PT_WRITABLE_MASK; |
2053 | 2234 | ||
2054 | if (!vcpu->arch.mmu.direct_map | 2235 | if (!vcpu->arch.mmu.direct_map |
2055 | && !(pte_access & ACC_WRITE_MASK)) | 2236 | && !(pte_access & ACC_WRITE_MASK)) { |
2056 | spte &= ~PT_USER_MASK; | 2237 | spte &= ~PT_USER_MASK; |
2238 | /* | ||
2239 | * If we converted a user page to a kernel page | ||
2240 | * so that the kernel can write to it when cr0.wp=0, | ||
2241 | * we should prevent the kernel from executing it | ||
2242 | * if SMEP is enabled. | ||
2243 | */ | ||
2244 | if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) | ||
2245 | spte |= PT64_NX_MASK; | ||
2246 | } | ||
2057 | 2247 | ||
2058 | /* | 2248 | /* |
2059 | * Optimization: for pte sync, if spte was writable the hash | 2249 | * Optimization: for pte sync, if spte was writable the hash |
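The SMEP branch added above handles the cr0.wp=0 case: the shadow pte stays writable for the guest kernel but loses its user bit, and since the guest still expects SMEP to block kernel execution of what it considers a user page, the NX bit is set to emulate that. Here is a small standalone sketch of the resulting flag computation, using simplified stand-in constants rather than KVM's definitions.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins, not the kernel's mask definitions. */
#define PT_PRESENT_MASK  (1ull << 0)
#define PT_WRITABLE_MASK (1ull << 1)
#define PT_USER_MASK     (1ull << 2)
#define PT64_NX_MASK     (1ull << 63)

/* Illustrative only: the cr0.wp=0 special case from set_spte(). */
static uint64_t make_spte(int gpte_grants_write, int smep_enabled)
{
	uint64_t spte = PT_PRESENT_MASK | PT_USER_MASK | PT_WRITABLE_MASK;

	if (!gpte_grants_write) {
		/* The gpte is read-only, but with cr0.wp=0 the guest kernel
		 * must still be able to write: keep the spte writable and
		 * drop the user bit so user-mode writes keep faulting. */
		spte &= ~PT_USER_MASK;
		/* The guest sees a user page and relies on SMEP to stop its
		 * kernel from executing it; with the user bit gone, emulate
		 * that protection by setting NX. */
		if (smep_enabled)
			spte |= PT64_NX_MASK;
	}
	return spte;
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)make_spte(0, 1));
	printf("%#llx\n", (unsigned long long)make_spte(1, 1));
	return 0;
}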
@@ -2078,7 +2268,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2078 | mark_page_dirty(vcpu->kvm, gfn); | 2268 | mark_page_dirty(vcpu->kvm, gfn); |
2079 | 2269 | ||
2080 | set_pte: | 2270 | set_pte: |
2081 | update_spte(sptep, spte); | 2271 | mmu_spte_update(sptep, spte); |
2082 | /* | 2272 | /* |
2083 | * If we overwrite a writable spte with a read-only one we | 2273 | * If we overwrite a writable spte with a read-only one we |
2084 | * should flush remote TLBs. Otherwise rmap_write_protect | 2274 | * should flush remote TLBs. Otherwise rmap_write_protect |
@@ -2093,8 +2283,8 @@ done: | |||
2093 | 2283 | ||
2094 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | 2284 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
2095 | unsigned pt_access, unsigned pte_access, | 2285 | unsigned pt_access, unsigned pte_access, |
2096 | int user_fault, int write_fault, int dirty, | 2286 | int user_fault, int write_fault, |
2097 | int *ptwrite, int level, gfn_t gfn, | 2287 | int *emulate, int level, gfn_t gfn, |
2098 | pfn_t pfn, bool speculative, | 2288 | pfn_t pfn, bool speculative, |
2099 | bool host_writable) | 2289 | bool host_writable) |
2100 | { | 2290 | { |
@@ -2117,26 +2307,28 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2117 | u64 pte = *sptep; | 2307 | u64 pte = *sptep; |
2118 | 2308 | ||
2119 | child = page_header(pte & PT64_BASE_ADDR_MASK); | 2309 | child = page_header(pte & PT64_BASE_ADDR_MASK); |
2120 | mmu_page_remove_parent_pte(child, sptep); | 2310 | drop_parent_pte(child, sptep); |
2121 | __set_spte(sptep, shadow_trap_nonpresent_pte); | ||
2122 | kvm_flush_remote_tlbs(vcpu->kvm); | 2311 | kvm_flush_remote_tlbs(vcpu->kvm); |
2123 | } else if (pfn != spte_to_pfn(*sptep)) { | 2312 | } else if (pfn != spte_to_pfn(*sptep)) { |
2124 | pgprintk("hfn old %llx new %llx\n", | 2313 | pgprintk("hfn old %llx new %llx\n", |
2125 | spte_to_pfn(*sptep), pfn); | 2314 | spte_to_pfn(*sptep), pfn); |
2126 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); | 2315 | drop_spte(vcpu->kvm, sptep); |
2127 | kvm_flush_remote_tlbs(vcpu->kvm); | 2316 | kvm_flush_remote_tlbs(vcpu->kvm); |
2128 | } else | 2317 | } else |
2129 | was_rmapped = 1; | 2318 | was_rmapped = 1; |
2130 | } | 2319 | } |
2131 | 2320 | ||
2132 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, | 2321 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, |
2133 | dirty, level, gfn, pfn, speculative, true, | 2322 | level, gfn, pfn, speculative, true, |
2134 | host_writable)) { | 2323 | host_writable)) { |
2135 | if (write_fault) | 2324 | if (write_fault) |
2136 | *ptwrite = 1; | 2325 | *emulate = 1; |
2137 | kvm_mmu_flush_tlb(vcpu); | 2326 | kvm_mmu_flush_tlb(vcpu); |
2138 | } | 2327 | } |
2139 | 2328 | ||
2329 | if (unlikely(is_mmio_spte(*sptep) && emulate)) | ||
2330 | *emulate = 1; | ||
2331 | |||
2140 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); | 2332 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); |
2141 | pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", | 2333 | pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", |
2142 | is_large_pte(*sptep)? "2MB" : "4kB", | 2334 | is_large_pte(*sptep)? "2MB" : "4kB", |
@@ -2145,11 +2337,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2145 | if (!was_rmapped && is_large_pte(*sptep)) | 2337 | if (!was_rmapped && is_large_pte(*sptep)) |
2146 | ++vcpu->kvm->stat.lpages; | 2338 | ++vcpu->kvm->stat.lpages; |
2147 | 2339 | ||
2148 | page_header_update_slot(vcpu->kvm, sptep, gfn); | 2340 | if (is_shadow_present_pte(*sptep)) { |
2149 | if (!was_rmapped) { | 2341 | page_header_update_slot(vcpu->kvm, sptep, gfn); |
2150 | rmap_count = rmap_add(vcpu, sptep, gfn); | 2342 | if (!was_rmapped) { |
2151 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) | 2343 | rmap_count = rmap_add(vcpu, sptep, gfn); |
2152 | rmap_recycle(vcpu, sptep, gfn); | 2344 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) |
2345 | rmap_recycle(vcpu, sptep, gfn); | ||
2346 | } | ||
2153 | } | 2347 | } |
2154 | kvm_release_pfn_clean(pfn); | 2348 | kvm_release_pfn_clean(pfn); |
2155 | if (speculative) { | 2349 | if (speculative) { |
@@ -2170,8 +2364,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
2170 | 2364 | ||
2171 | slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); | 2365 | slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); |
2172 | if (!slot) { | 2366 | if (!slot) { |
2173 | get_page(bad_page); | 2367 | get_page(fault_page); |
2174 | return page_to_pfn(bad_page); | 2368 | return page_to_pfn(fault_page); |
2175 | } | 2369 | } |
2176 | 2370 | ||
2177 | hva = gfn_to_hva_memslot(slot, gfn); | 2371 | hva = gfn_to_hva_memslot(slot, gfn); |
@@ -2198,7 +2392,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, | |||
2198 | 2392 | ||
2199 | for (i = 0; i < ret; i++, gfn++, start++) | 2393 | for (i = 0; i < ret; i++, gfn++, start++) |
2200 | mmu_set_spte(vcpu, start, ACC_ALL, | 2394 | mmu_set_spte(vcpu, start, ACC_ALL, |
2201 | access, 0, 0, 1, NULL, | 2395 | access, 0, 0, NULL, |
2202 | sp->role.level, gfn, | 2396 | sp->role.level, gfn, |
2203 | page_to_pfn(pages[i]), true, true); | 2397 | page_to_pfn(pages[i]), true, true); |
2204 | 2398 | ||
@@ -2217,7 +2411,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, | |||
2217 | spte = sp->spt + i; | 2411 | spte = sp->spt + i; |
2218 | 2412 | ||
2219 | for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { | 2413 | for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { |
2220 | if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { | 2414 | if (is_shadow_present_pte(*spte) || spte == sptep) { |
2221 | if (!start) | 2415 | if (!start) |
2222 | continue; | 2416 | continue; |
2223 | if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) | 2417 | if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) |
@@ -2254,7 +2448,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2254 | { | 2448 | { |
2255 | struct kvm_shadow_walk_iterator iterator; | 2449 | struct kvm_shadow_walk_iterator iterator; |
2256 | struct kvm_mmu_page *sp; | 2450 | struct kvm_mmu_page *sp; |
2257 | int pt_write = 0; | 2451 | int emulate = 0; |
2258 | gfn_t pseudo_gfn; | 2452 | gfn_t pseudo_gfn; |
2259 | 2453 | ||
2260 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { | 2454 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { |
@@ -2262,14 +2456,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2262 | unsigned pte_access = ACC_ALL; | 2456 | unsigned pte_access = ACC_ALL; |
2263 | 2457 | ||
2264 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, | 2458 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, |
2265 | 0, write, 1, &pt_write, | 2459 | 0, write, &emulate, |
2266 | level, gfn, pfn, prefault, map_writable); | 2460 | level, gfn, pfn, prefault, map_writable); |
2267 | direct_pte_prefetch(vcpu, iterator.sptep); | 2461 | direct_pte_prefetch(vcpu, iterator.sptep); |
2268 | ++vcpu->stat.pf_fixed; | 2462 | ++vcpu->stat.pf_fixed; |
2269 | break; | 2463 | break; |
2270 | } | 2464 | } |
2271 | 2465 | ||
2272 | if (*iterator.sptep == shadow_trap_nonpresent_pte) { | 2466 | if (!is_shadow_present_pte(*iterator.sptep)) { |
2273 | u64 base_addr = iterator.addr; | 2467 | u64 base_addr = iterator.addr; |
2274 | 2468 | ||
2275 | base_addr &= PT64_LVL_ADDR_MASK(iterator.level); | 2469 | base_addr &= PT64_LVL_ADDR_MASK(iterator.level); |
@@ -2283,14 +2477,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2283 | return -ENOMEM; | 2477 | return -ENOMEM; |
2284 | } | 2478 | } |
2285 | 2479 | ||
2286 | __set_spte(iterator.sptep, | 2480 | mmu_spte_set(iterator.sptep, |
2287 | __pa(sp->spt) | 2481 | __pa(sp->spt) |
2288 | | PT_PRESENT_MASK | PT_WRITABLE_MASK | 2482 | | PT_PRESENT_MASK | PT_WRITABLE_MASK |
2289 | | shadow_user_mask | shadow_x_mask | 2483 | | shadow_user_mask | shadow_x_mask |
2290 | | shadow_accessed_mask); | 2484 | | shadow_accessed_mask); |
2291 | } | 2485 | } |
2292 | } | 2486 | } |
2293 | return pt_write; | 2487 | return emulate; |
2294 | } | 2488 | } |
2295 | 2489 | ||
2296 | static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) | 2490 | static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) |
@@ -2306,16 +2500,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * | |||
2306 | send_sig_info(SIGBUS, &info, tsk); | 2500 | send_sig_info(SIGBUS, &info, tsk); |
2307 | } | 2501 | } |
2308 | 2502 | ||
2309 | static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) | 2503 | static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) |
2310 | { | 2504 | { |
2311 | kvm_release_pfn_clean(pfn); | 2505 | kvm_release_pfn_clean(pfn); |
2312 | if (is_hwpoison_pfn(pfn)) { | 2506 | if (is_hwpoison_pfn(pfn)) { |
2313 | kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); | 2507 | kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); |
2314 | return 0; | 2508 | return 0; |
2315 | } else if (is_fault_pfn(pfn)) | 2509 | } |
2316 | return -EFAULT; | ||
2317 | 2510 | ||
2318 | return 1; | 2511 | return -EFAULT; |
2319 | } | 2512 | } |
2320 | 2513 | ||
2321 | static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, | 2514 | static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, |
@@ -2360,6 +2553,30 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, | |||
2360 | } | 2553 | } |
2361 | } | 2554 | } |
2362 | 2555 | ||
2556 | static bool mmu_invalid_pfn(pfn_t pfn) | ||
2557 | { | ||
2558 | return unlikely(is_invalid_pfn(pfn)); | ||
2559 | } | ||
2560 | |||
2561 | static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, | ||
2562 | pfn_t pfn, unsigned access, int *ret_val) | ||
2563 | { | ||
2564 | bool ret = true; | ||
2565 | |||
2566 | /* The pfn is invalid, report the error! */ | ||
2567 | if (unlikely(is_invalid_pfn(pfn))) { | ||
2568 | *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); | ||
2569 | goto exit; | ||
2570 | } | ||
2571 | |||
2572 | if (unlikely(is_noslot_pfn(pfn))) | ||
2573 | vcpu_cache_mmio_info(vcpu, gva, gfn, access); | ||
2574 | |||
2575 | ret = false; | ||
2576 | exit: | ||
2577 | return ret; | ||
2578 | } | ||
2579 | |||
2363 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | 2580 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, |
2364 | gva_t gva, pfn_t *pfn, bool write, bool *writable); | 2581 | gva_t gva, pfn_t *pfn, bool write, bool *writable); |
2365 | 2582 | ||
@@ -2394,9 +2611,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, | |||
2394 | if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) | 2611 | if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) |
2395 | return 0; | 2612 | return 0; |
2396 | 2613 | ||
2397 | /* mmio */ | 2614 | if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) |
2398 | if (is_error_pfn(pfn)) | 2615 | return r; |
2399 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); | ||
2400 | 2616 | ||
2401 | spin_lock(&vcpu->kvm->mmu_lock); | 2617 | spin_lock(&vcpu->kvm->mmu_lock); |
2402 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 2618 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
@@ -2623,6 +2839,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2623 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | 2839 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) |
2624 | return; | 2840 | return; |
2625 | 2841 | ||
2842 | vcpu_clear_mmio_info(vcpu, ~0ul); | ||
2626 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); | 2843 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); |
2627 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | 2844 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { |
2628 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2845 | hpa_t root = vcpu->arch.mmu.root_hpa; |
@@ -2667,6 +2884,94 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, | |||
2667 | return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); | 2884 | return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); |
2668 | } | 2885 | } |
2669 | 2886 | ||
2887 | static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) | ||
2888 | { | ||
2889 | if (direct) | ||
2890 | return vcpu_match_mmio_gpa(vcpu, addr); | ||
2891 | |||
2892 | return vcpu_match_mmio_gva(vcpu, addr); | ||
2893 | } | ||
2894 | |||
2895 | |||
2896 | /* | ||
2897 | * On direct hosts, the last spte only allows two states | ||
2898 | * for an mmio page fault: | ||
2899 | * - It is the mmio spte | ||
2900 | * - It is zapped or it is being zapped. | ||
2901 | * | ||
2902 | * This function completely checks the spte when the last spte | ||
2903 | * is not the mmio spte. | ||
2904 | */ | ||
2905 | static bool check_direct_spte_mmio_pf(u64 spte) | ||
2906 | { | ||
2907 | return __check_direct_spte_mmio_pf(spte); | ||
2908 | } | ||
2909 | |||
2910 | static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) | ||
2911 | { | ||
2912 | struct kvm_shadow_walk_iterator iterator; | ||
2913 | u64 spte = 0ull; | ||
2914 | |||
2915 | walk_shadow_page_lockless_begin(vcpu); | ||
2916 | for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) | ||
2917 | if (!is_shadow_present_pte(spte)) | ||
2918 | break; | ||
2919 | walk_shadow_page_lockless_end(vcpu); | ||
2920 | |||
2921 | return spte; | ||
2922 | } | ||
2923 | |||
2924 | /* | ||
2925 | * If it is a real mmio page fault, return 1 and emulate the instruction | ||
2926 | * directly, return 0 to let the CPU fault again on the address; -1 is | ||
2927 | * returned if a bug is detected. | ||
2928 | */ | ||
2929 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) | ||
2930 | { | ||
2931 | u64 spte; | ||
2932 | |||
2933 | if (quickly_check_mmio_pf(vcpu, addr, direct)) | ||
2934 | return 1; | ||
2935 | |||
2936 | spte = walk_shadow_page_get_mmio_spte(vcpu, addr); | ||
2937 | |||
2938 | if (is_mmio_spte(spte)) { | ||
2939 | gfn_t gfn = get_mmio_spte_gfn(spte); | ||
2940 | unsigned access = get_mmio_spte_access(spte); | ||
2941 | |||
2942 | if (direct) | ||
2943 | addr = 0; | ||
2944 | |||
2945 | trace_handle_mmio_page_fault(addr, gfn, access); | ||
2946 | vcpu_cache_mmio_info(vcpu, addr, gfn, access); | ||
2947 | return 1; | ||
2948 | } | ||
2949 | |||
2950 | /* | ||
2951 | * It's ok if the gva is remapped by other cpus on a shadow guest, | ||
2952 | * but it's a BUG if the gfn is not an mmio page. | ||
2953 | */ | ||
2954 | if (direct && !check_direct_spte_mmio_pf(spte)) | ||
2955 | return -1; | ||
2956 | |||
2957 | /* | ||
2958 | * If the page table is zapped by other cpus, let the CPU fault again on | ||
2959 | * the address. | ||
2960 | */ | ||
2961 | return 0; | ||
2962 | } | ||
2963 | EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); | ||
2964 | |||
2965 | static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, | ||
2966 | u32 error_code, bool direct) | ||
2967 | { | ||
2968 | int ret; | ||
2969 | |||
2970 | ret = handle_mmio_page_fault_common(vcpu, addr, direct); | ||
2971 | WARN_ON(ret < 0); | ||
2972 | return ret; | ||
2973 | } | ||
2974 | |||
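The comment above handle_mmio_page_fault_common() defines a three-way contract: 1 means a genuine mmio access that should be emulated, 0 means the fault should simply be retried, and a negative value indicates a bug. The standalone sketch below mirrors how a caller might act on that contract; every name in it is invented for illustration and is not part of this patch.

#include <stdio.h>

/* Illustrative model of the 1 / 0 / -1 return convention; none of
 * these names are KVM's, they only mirror the contract. */
enum { FAULT_RETRY = 0, FAULT_EMULATE = 1 };

static int fake_mmio_fault(unsigned long addr)
{
	/* Pretend addresses below 0x1000 hit a real mmio spte. */
	return addr < 0x1000 ? FAULT_EMULATE : FAULT_RETRY;
}

static int dispatch(unsigned long addr)
{
	int ret = fake_mmio_fault(addr);

	if (ret < 0) {				/* inconsistent spte: bug     */
		fprintf(stderr, "mmio fault bug at %#lx\n", addr);
		return -1;
	}
	if (ret == FAULT_EMULATE) {		/* real mmio: emulate access  */
		printf("emulate access at %#lx\n", addr);
		return 1;
	}
	printf("retry fault at %#lx\n", addr);	/* table changed: refault     */
	return 0;
}

int main(void)
{
	dispatch(0x0800);
	dispatch(0x2000);
	return 0;
}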
2670 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | 2975 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, |
2671 | u32 error_code, bool prefault) | 2976 | u32 error_code, bool prefault) |
2672 | { | 2977 | { |
@@ -2674,6 +2979,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |||
2674 | int r; | 2979 | int r; |
2675 | 2980 | ||
2676 | pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); | 2981 | pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); |
2982 | |||
2983 | if (unlikely(error_code & PFERR_RSVD_MASK)) | ||
2984 | return handle_mmio_page_fault(vcpu, gva, error_code, true); | ||
2985 | |||
2677 | r = mmu_topup_memory_caches(vcpu); | 2986 | r = mmu_topup_memory_caches(vcpu); |
2678 | if (r) | 2987 | if (r) |
2679 | return r; | 2988 | return r; |
@@ -2750,6 +3059,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | |||
2750 | ASSERT(vcpu); | 3059 | ASSERT(vcpu); |
2751 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 3060 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
2752 | 3061 | ||
3062 | if (unlikely(error_code & PFERR_RSVD_MASK)) | ||
3063 | return handle_mmio_page_fault(vcpu, gpa, error_code, true); | ||
3064 | |||
2753 | r = mmu_topup_memory_caches(vcpu); | 3065 | r = mmu_topup_memory_caches(vcpu); |
2754 | if (r) | 3066 | if (r) |
2755 | return r; | 3067 | return r; |
@@ -2767,9 +3079,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | |||
2767 | if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) | 3079 | if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) |
2768 | return 0; | 3080 | return 0; |
2769 | 3081 | ||
2770 | /* mmio */ | 3082 | if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) |
2771 | if (is_error_pfn(pfn)) | 3083 | return r; |
2772 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); | 3084 | |
2773 | spin_lock(&vcpu->kvm->mmu_lock); | 3085 | spin_lock(&vcpu->kvm->mmu_lock); |
2774 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 3086 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
2775 | goto out_unlock; | 3087 | goto out_unlock; |
@@ -2800,7 +3112,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, | |||
2800 | context->page_fault = nonpaging_page_fault; | 3112 | context->page_fault = nonpaging_page_fault; |
2801 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 3113 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
2802 | context->free = nonpaging_free; | 3114 | context->free = nonpaging_free; |
2803 | context->prefetch_page = nonpaging_prefetch_page; | ||
2804 | context->sync_page = nonpaging_sync_page; | 3115 | context->sync_page = nonpaging_sync_page; |
2805 | context->invlpg = nonpaging_invlpg; | 3116 | context->invlpg = nonpaging_invlpg; |
2806 | context->update_pte = nonpaging_update_pte; | 3117 | context->update_pte = nonpaging_update_pte; |
@@ -2848,6 +3159,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) | |||
2848 | return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; | 3159 | return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; |
2849 | } | 3160 | } |
2850 | 3161 | ||
3162 | static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, | ||
3163 | int *nr_present) | ||
3164 | { | ||
3165 | if (unlikely(is_mmio_spte(*sptep))) { | ||
3166 | if (gfn != get_mmio_spte_gfn(*sptep)) { | ||
3167 | mmu_spte_clear_no_track(sptep); | ||
3168 | return true; | ||
3169 | } | ||
3170 | |||
3171 | (*nr_present)++; | ||
3172 | mark_mmio_spte(sptep, gfn, access); | ||
3173 | return true; | ||
3174 | } | ||
3175 | |||
3176 | return false; | ||
3177 | } | ||
3178 | |||
2851 | #define PTTYPE 64 | 3179 | #define PTTYPE 64 |
2852 | #include "paging_tmpl.h" | 3180 | #include "paging_tmpl.h" |
2853 | #undef PTTYPE | 3181 | #undef PTTYPE |
@@ -2930,7 +3258,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, | |||
2930 | context->new_cr3 = paging_new_cr3; | 3258 | context->new_cr3 = paging_new_cr3; |
2931 | context->page_fault = paging64_page_fault; | 3259 | context->page_fault = paging64_page_fault; |
2932 | context->gva_to_gpa = paging64_gva_to_gpa; | 3260 | context->gva_to_gpa = paging64_gva_to_gpa; |
2933 | context->prefetch_page = paging64_prefetch_page; | ||
2934 | context->sync_page = paging64_sync_page; | 3261 | context->sync_page = paging64_sync_page; |
2935 | context->invlpg = paging64_invlpg; | 3262 | context->invlpg = paging64_invlpg; |
2936 | context->update_pte = paging64_update_pte; | 3263 | context->update_pte = paging64_update_pte; |
@@ -2959,7 +3286,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, | |||
2959 | context->page_fault = paging32_page_fault; | 3286 | context->page_fault = paging32_page_fault; |
2960 | context->gva_to_gpa = paging32_gva_to_gpa; | 3287 | context->gva_to_gpa = paging32_gva_to_gpa; |
2961 | context->free = paging_free; | 3288 | context->free = paging_free; |
2962 | context->prefetch_page = paging32_prefetch_page; | ||
2963 | context->sync_page = paging32_sync_page; | 3289 | context->sync_page = paging32_sync_page; |
2964 | context->invlpg = paging32_invlpg; | 3290 | context->invlpg = paging32_invlpg; |
2965 | context->update_pte = paging32_update_pte; | 3291 | context->update_pte = paging32_update_pte; |
@@ -2984,7 +3310,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
2984 | context->new_cr3 = nonpaging_new_cr3; | 3310 | context->new_cr3 = nonpaging_new_cr3; |
2985 | context->page_fault = tdp_page_fault; | 3311 | context->page_fault = tdp_page_fault; |
2986 | context->free = nonpaging_free; | 3312 | context->free = nonpaging_free; |
2987 | context->prefetch_page = nonpaging_prefetch_page; | ||
2988 | context->sync_page = nonpaging_sync_page; | 3313 | context->sync_page = nonpaging_sync_page; |
2989 | context->invlpg = nonpaging_invlpg; | 3314 | context->invlpg = nonpaging_invlpg; |
2990 | context->update_pte = nonpaging_update_pte; | 3315 | context->update_pte = nonpaging_update_pte; |
@@ -3023,6 +3348,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
3023 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) | 3348 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) |
3024 | { | 3349 | { |
3025 | int r; | 3350 | int r; |
3351 | bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); | ||
3026 | ASSERT(vcpu); | 3352 | ASSERT(vcpu); |
3027 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 3353 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
3028 | 3354 | ||
@@ -3037,6 +3363,8 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) | |||
3037 | 3363 | ||
3038 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); | 3364 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); |
3039 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); | 3365 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); |
3366 | vcpu->arch.mmu.base_role.smep_andnot_wp | ||
3367 | = smep && !is_write_protection(vcpu); | ||
3040 | 3368 | ||
3041 | return r; | 3369 | return r; |
3042 | } | 3370 | } |
@@ -3141,27 +3469,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) | |||
3141 | } | 3469 | } |
3142 | EXPORT_SYMBOL_GPL(kvm_mmu_unload); | 3470 | EXPORT_SYMBOL_GPL(kvm_mmu_unload); |
3143 | 3471 | ||
3144 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | ||
3145 | struct kvm_mmu_page *sp, | ||
3146 | u64 *spte) | ||
3147 | { | ||
3148 | u64 pte; | ||
3149 | struct kvm_mmu_page *child; | ||
3150 | |||
3151 | pte = *spte; | ||
3152 | if (is_shadow_present_pte(pte)) { | ||
3153 | if (is_last_spte(pte, sp->role.level)) | ||
3154 | drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); | ||
3155 | else { | ||
3156 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
3157 | mmu_page_remove_parent_pte(child, spte); | ||
3158 | } | ||
3159 | } | ||
3160 | __set_spte(spte, shadow_trap_nonpresent_pte); | ||
3161 | if (is_large_pte(pte)) | ||
3162 | --vcpu->kvm->stat.lpages; | ||
3163 | } | ||
3164 | |||
3165 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | 3472 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, |
3166 | struct kvm_mmu_page *sp, u64 *spte, | 3473 | struct kvm_mmu_page *sp, u64 *spte, |
3167 | const void *new) | 3474 | const void *new) |
@@ -3233,6 +3540,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
3233 | int level, npte, invlpg_counter, r, flooded = 0; | 3540 | int level, npte, invlpg_counter, r, flooded = 0; |
3234 | bool remote_flush, local_flush, zap_page; | 3541 | bool remote_flush, local_flush, zap_page; |
3235 | 3542 | ||
3543 | /* | ||
3544 | * If we don't have indirect shadow pages, it means no page is | ||
3545 | * write-protected, so we can simply exit. | ||
3546 | */ | ||
3547 | if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) | ||
3548 | return; | ||
3549 | |||
3236 | zap_page = remote_flush = local_flush = false; | 3550 | zap_page = remote_flush = local_flush = false; |
3237 | offset = offset_in_page(gpa); | 3551 | offset = offset_in_page(gpa); |
3238 | 3552 | ||
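The early return added to kvm_mmu_pte_write() above is a lock-free fast path: a single read of indirect_shadow_pages is enough to skip all write-protection bookkeeping when no indirect shadow pages exist. Below is a generic sketch of that check-a-counter-then-bail pattern; the counter and functions are placeholders, not KVM code.

#include <stdio.h>

static unsigned long indirect_pages;	/* bumped when a page is shadowed */

static void slow_path(unsigned long gpa)
{
	printf("write-protect bookkeeping for %#lx\n", gpa);
}

static void on_guest_pte_write(unsigned long gpa)
{
	if (!indirect_pages)	/* nothing is write-protected: exit simply */
		return;
	slow_path(gpa);
}

int main(void)
{
	on_guest_pte_write(0x1000);	/* skipped: counter is zero */
	indirect_pages = 1;
	on_guest_pte_write(0x1000);	/* takes the slow path      */
	return 0;
}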
@@ -3336,7 +3650,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
3336 | spte = &sp->spt[page_offset / sizeof(*spte)]; | 3650 | spte = &sp->spt[page_offset / sizeof(*spte)]; |
3337 | while (npte--) { | 3651 | while (npte--) { |
3338 | entry = *spte; | 3652 | entry = *spte; |
3339 | mmu_pte_write_zap_pte(vcpu, sp, spte); | 3653 | mmu_page_zap_pte(vcpu->kvm, sp, spte); |
3340 | if (gentry && | 3654 | if (gentry && |
3341 | !((sp->role.word ^ vcpu->arch.mmu.base_role.word) | 3655 | !((sp->role.word ^ vcpu->arch.mmu.base_role.word) |
3342 | & mask.word)) | 3656 | & mask.word)) |
@@ -3380,9 +3694,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | |||
3380 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, | 3694 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, |
3381 | struct kvm_mmu_page, link); | 3695 | struct kvm_mmu_page, link); |
3382 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); | 3696 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); |
3383 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
3384 | ++vcpu->kvm->stat.mmu_recycled; | 3697 | ++vcpu->kvm->stat.mmu_recycled; |
3385 | } | 3698 | } |
3699 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
3386 | } | 3700 | } |
3387 | 3701 | ||
3388 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, | 3702 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, |
@@ -3506,15 +3820,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
3506 | continue; | 3820 | continue; |
3507 | 3821 | ||
3508 | if (is_large_pte(pt[i])) { | 3822 | if (is_large_pte(pt[i])) { |
3509 | drop_spte(kvm, &pt[i], | 3823 | drop_spte(kvm, &pt[i]); |
3510 | shadow_trap_nonpresent_pte); | ||
3511 | --kvm->stat.lpages; | 3824 | --kvm->stat.lpages; |
3512 | continue; | 3825 | continue; |
3513 | } | 3826 | } |
3514 | 3827 | ||
3515 | /* avoid RMW */ | 3828 | /* avoid RMW */ |
3516 | if (is_writable_pte(pt[i])) | 3829 | if (is_writable_pte(pt[i])) |
3517 | update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); | 3830 | mmu_spte_update(&pt[i], |
3831 | pt[i] & ~PT_WRITABLE_MASK); | ||
3518 | } | 3832 | } |
3519 | } | 3833 | } |
3520 | kvm_flush_remote_tlbs(kvm); | 3834 | kvm_flush_remote_tlbs(kvm); |
@@ -3590,25 +3904,18 @@ static struct shrinker mmu_shrinker = { | |||
3590 | 3904 | ||
3591 | static void mmu_destroy_caches(void) | 3905 | static void mmu_destroy_caches(void) |
3592 | { | 3906 | { |
3593 | if (pte_chain_cache) | 3907 | if (pte_list_desc_cache) |
3594 | kmem_cache_destroy(pte_chain_cache); | 3908 | kmem_cache_destroy(pte_list_desc_cache); |
3595 | if (rmap_desc_cache) | ||
3596 | kmem_cache_destroy(rmap_desc_cache); | ||
3597 | if (mmu_page_header_cache) | 3909 | if (mmu_page_header_cache) |
3598 | kmem_cache_destroy(mmu_page_header_cache); | 3910 | kmem_cache_destroy(mmu_page_header_cache); |
3599 | } | 3911 | } |
3600 | 3912 | ||
3601 | int kvm_mmu_module_init(void) | 3913 | int kvm_mmu_module_init(void) |
3602 | { | 3914 | { |
3603 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | 3915 | pte_list_desc_cache = kmem_cache_create("pte_list_desc", |
3604 | sizeof(struct kvm_pte_chain), | 3916 | sizeof(struct pte_list_desc), |
3605 | 0, 0, NULL); | ||
3606 | if (!pte_chain_cache) | ||
3607 | goto nomem; | ||
3608 | rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", | ||
3609 | sizeof(struct kvm_rmap_desc), | ||
3610 | 0, 0, NULL); | 3917 | 0, 0, NULL); |
3611 | if (!rmap_desc_cache) | 3918 | if (!pte_list_desc_cache) |
3612 | goto nomem; | 3919 | goto nomem; |
3613 | 3920 | ||
3614 | mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", | 3921 | mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", |
@@ -3775,16 +4082,17 @@ out: | |||
3775 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) | 4082 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) |
3776 | { | 4083 | { |
3777 | struct kvm_shadow_walk_iterator iterator; | 4084 | struct kvm_shadow_walk_iterator iterator; |
4085 | u64 spte; | ||
3778 | int nr_sptes = 0; | 4086 | int nr_sptes = 0; |
3779 | 4087 | ||
3780 | spin_lock(&vcpu->kvm->mmu_lock); | 4088 | walk_shadow_page_lockless_begin(vcpu); |
3781 | for_each_shadow_entry(vcpu, addr, iterator) { | 4089 | for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { |
3782 | sptes[iterator.level-1] = *iterator.sptep; | 4090 | sptes[iterator.level-1] = spte; |
3783 | nr_sptes++; | 4091 | nr_sptes++; |
3784 | if (!is_shadow_present_pte(*iterator.sptep)) | 4092 | if (!is_shadow_present_pte(spte)) |
3785 | break; | 4093 | break; |
3786 | } | 4094 | } |
3787 | spin_unlock(&vcpu->kvm->mmu_lock); | 4095 | walk_shadow_page_lockless_end(vcpu); |
3788 | 4096 | ||
3789 | return nr_sptes; | 4097 | return nr_sptes; |
3790 | } | 4098 | } |
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 7086ca85d3e7..e374db9af021 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h | |||
@@ -49,6 +49,8 @@ | |||
49 | #define PFERR_FETCH_MASK (1U << 4) | 49 | #define PFERR_FETCH_MASK (1U << 4) |
50 | 50 | ||
51 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); | 51 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); |
52 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); | ||
53 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); | ||
52 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); | 54 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); |
53 | 55 | ||
54 | static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) | 56 | static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) |
@@ -76,4 +78,27 @@ static inline int is_present_gpte(unsigned long pte) | |||
76 | return pte & PT_PRESENT_MASK; | 78 | return pte & PT_PRESENT_MASK; |
77 | } | 79 | } |
78 | 80 | ||
81 | static inline int is_writable_pte(unsigned long pte) | ||
82 | { | ||
83 | return pte & PT_WRITABLE_MASK; | ||
84 | } | ||
85 | |||
86 | static inline bool is_write_protection(struct kvm_vcpu *vcpu) | ||
87 | { | ||
88 | return kvm_read_cr0_bits(vcpu, X86_CR0_WP); | ||
89 | } | ||
90 | |||
91 | static inline bool check_write_user_access(struct kvm_vcpu *vcpu, | ||
92 | bool write_fault, bool user_fault, | ||
93 | unsigned long pte) | ||
94 | { | ||
95 | if (unlikely(write_fault && !is_writable_pte(pte) | ||
96 | && (user_fault || is_write_protection(vcpu)))) | ||
97 | return false; | ||
98 | |||
99 | if (unlikely(user_fault && !(pte & PT_USER_MASK))) | ||
100 | return false; | ||
101 | |||
102 | return true; | ||
103 | } | ||
79 | #endif | 104 | #endif |
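The new check_write_user_access() helper centralizes the walker's permission test: a write is refused when the pte is read-only and either the access came from user mode or CR0.WP is set, and any user access is refused when the pte lacks the user bit. Here is a standalone model of the same predicate with a few worked cases; the masks are simplified stand-ins, not the kernel's headers.

#include <stdbool.h>
#include <stdio.h>

#define PT_WRITABLE_MASK (1ul << 1)	/* simplified stand-ins */
#define PT_USER_MASK     (1ul << 2)

/* Mirrors the logic of check_write_user_access() above. */
static bool access_allowed(bool cr0_wp, bool write_fault, bool user_fault,
			   unsigned long pte)
{
	if (write_fault && !(pte & PT_WRITABLE_MASK) && (user_fault || cr0_wp))
		return false;
	if (user_fault && !(pte & PT_USER_MASK))
		return false;
	return true;
}

int main(void)
{
	/* Kernel write to a read-only user pte with CR0.WP=0: allowed. */
	printf("%d\n", access_allowed(false, true, false, PT_USER_MASK));
	/* User write to the same pte: denied. */
	printf("%d\n", access_allowed(false, true, true, PT_USER_MASK));
	/* User access to a supervisor-only pte: denied. */
	printf("%d\n", access_allowed(true, false, true, PT_WRITABLE_MASK));
	return 0;
}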
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 5f6223b8bcf7..2460a265be23 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c | |||
@@ -99,18 +99,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) | |||
99 | "level = %d\n", sp, level); | 99 | "level = %d\n", sp, level); |
100 | return; | 100 | return; |
101 | } | 101 | } |
102 | |||
103 | if (*sptep == shadow_notrap_nonpresent_pte) { | ||
104 | audit_printk(vcpu->kvm, "notrap spte in unsync " | ||
105 | "sp: %p\n", sp); | ||
106 | return; | ||
107 | } | ||
108 | } | ||
109 | |||
110 | if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { | ||
111 | audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n", | ||
112 | sp); | ||
113 | return; | ||
114 | } | 102 | } |
115 | 103 | ||
116 | if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) | 104 | if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) |
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index b60b4fdb3eda..eed67f34146d 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h | |||
@@ -196,6 +196,54 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, | |||
196 | TP_ARGS(sp) | 196 | TP_ARGS(sp) |
197 | ); | 197 | ); |
198 | 198 | ||
199 | DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages, | ||
200 | TP_PROTO(struct kvm_mmu_page *sp), | ||
201 | |||
202 | TP_ARGS(sp) | ||
203 | ); | ||
204 | |||
205 | TRACE_EVENT( | ||
206 | mark_mmio_spte, | ||
207 | TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), | ||
208 | TP_ARGS(sptep, gfn, access), | ||
209 | |||
210 | TP_STRUCT__entry( | ||
211 | __field(void *, sptep) | ||
212 | __field(gfn_t, gfn) | ||
213 | __field(unsigned, access) | ||
214 | ), | ||
215 | |||
216 | TP_fast_assign( | ||
217 | __entry->sptep = sptep; | ||
218 | __entry->gfn = gfn; | ||
219 | __entry->access = access; | ||
220 | ), | ||
221 | |||
222 | TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn, | ||
223 | __entry->access) | ||
224 | ); | ||
225 | |||
226 | TRACE_EVENT( | ||
227 | handle_mmio_page_fault, | ||
228 | TP_PROTO(u64 addr, gfn_t gfn, unsigned access), | ||
229 | TP_ARGS(addr, gfn, access), | ||
230 | |||
231 | TP_STRUCT__entry( | ||
232 | __field(u64, addr) | ||
233 | __field(gfn_t, gfn) | ||
234 | __field(unsigned, access) | ||
235 | ), | ||
236 | |||
237 | TP_fast_assign( | ||
238 | __entry->addr = addr; | ||
239 | __entry->gfn = gfn; | ||
240 | __entry->access = access; | ||
241 | ), | ||
242 | |||
243 | TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, | ||
244 | __entry->access) | ||
245 | ); | ||
246 | |||
199 | TRACE_EVENT( | 247 | TRACE_EVENT( |
200 | kvm_mmu_audit, | 248 | kvm_mmu_audit, |
201 | TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), | 249 | TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9d03ad4dd5ec..507e2b844cfa 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -101,11 +101,15 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | |||
101 | return (ret != orig_pte); | 101 | return (ret != orig_pte); |
102 | } | 102 | } |
103 | 103 | ||
104 | static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) | 104 | static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte, |
105 | bool last) | ||
105 | { | 106 | { |
106 | unsigned access; | 107 | unsigned access; |
107 | 108 | ||
108 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; | 109 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; |
110 | if (last && !is_dirty_gpte(gpte)) | ||
111 | access &= ~ACC_WRITE_MASK; | ||
112 | |||
109 | #if PTTYPE == 64 | 113 | #if PTTYPE == 64 |
110 | if (vcpu->arch.mmu.nx) | 114 | if (vcpu->arch.mmu.nx) |
111 | access &= ~(gpte >> PT64_NX_SHIFT); | 115 | access &= ~(gpte >> PT64_NX_SHIFT); |
@@ -113,6 +117,24 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) | |||
113 | return access; | 117 | return access; |
114 | } | 118 | } |
115 | 119 | ||
120 | static bool FNAME(is_last_gpte)(struct guest_walker *walker, | ||
121 | struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | ||
122 | pt_element_t gpte) | ||
123 | { | ||
124 | if (walker->level == PT_PAGE_TABLE_LEVEL) | ||
125 | return true; | ||
126 | |||
127 | if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) && | ||
128 | (PTTYPE == 64 || is_pse(vcpu))) | ||
129 | return true; | ||
130 | |||
131 | if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) && | ||
132 | (mmu->root_level == PT64_ROOT_LEVEL)) | ||
133 | return true; | ||
134 | |||
135 | return false; | ||
136 | } | ||
137 | |||
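With the new `last` argument, FNAME(gpte_access)() strips write permission only from a leaf gpte whose dirty bit is still clear, so a clean page is initially mapped read-only and the first guest write faults into KVM, which can then record the dirty bit; FNAME(is_last_gpte)() decides when an entry counts as a leaf. The simplified model below reproduces just that access computation; exec/NX handling is omitted and the constants are stand-ins, not the kernel's.

#include <stdio.h>

#define PT_WRITABLE_MASK (1u << 1)	/* simplified stand-ins */
#define PT_USER_MASK     (1u << 2)
#define PT_DIRTY_MASK    (1u << 6)
#define ACC_WRITE_MASK   PT_WRITABLE_MASK

/* Mirrors the shape of FNAME(gpte_access)(vcpu, gpte, last): only a
 * not-yet-dirty *leaf* entry loses write permission, so the resulting
 * write fault gives KVM a chance to set the dirty bit. */
static unsigned gpte_access(unsigned gpte, int last)
{
	unsigned access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK);

	if (last && !(gpte & PT_DIRTY_MASK))
		access &= ~ACC_WRITE_MASK;
	return access;
}

int main(void)
{
	unsigned gpte = PT_WRITABLE_MASK | PT_USER_MASK;

	printf("clean leaf: %#x\n", gpte_access(gpte, 1));		/* write stripped */
	printf("dirty leaf: %#x\n", gpte_access(gpte | PT_DIRTY_MASK, 1));
	printf("non-leaf:   %#x\n", gpte_access(gpte, 0));		/* kept writable  */
	return 0;
}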
116 | /* | 138 | /* |
117 | * Fetch a guest pte for a guest virtual address | 139 | * Fetch a guest pte for a guest virtual address |
118 | */ | 140 | */ |
@@ -125,18 +147,17 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, | |||
125 | gfn_t table_gfn; | 147 | gfn_t table_gfn; |
126 | unsigned index, pt_access, uninitialized_var(pte_access); | 148 | unsigned index, pt_access, uninitialized_var(pte_access); |
127 | gpa_t pte_gpa; | 149 | gpa_t pte_gpa; |
128 | bool eperm, present, rsvd_fault; | 150 | bool eperm; |
129 | int offset, write_fault, user_fault, fetch_fault; | 151 | int offset; |
130 | 152 | const int write_fault = access & PFERR_WRITE_MASK; | |
131 | write_fault = access & PFERR_WRITE_MASK; | 153 | const int user_fault = access & PFERR_USER_MASK; |
132 | user_fault = access & PFERR_USER_MASK; | 154 | const int fetch_fault = access & PFERR_FETCH_MASK; |
133 | fetch_fault = access & PFERR_FETCH_MASK; | 155 | u16 errcode = 0; |
134 | 156 | ||
135 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, | 157 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, |
136 | fetch_fault); | 158 | fetch_fault); |
137 | walk: | 159 | retry_walk: |
138 | present = true; | 160 | eperm = false; |
139 | eperm = rsvd_fault = false; | ||
140 | walker->level = mmu->root_level; | 161 | walker->level = mmu->root_level; |
141 | pte = mmu->get_cr3(vcpu); | 162 | pte = mmu->get_cr3(vcpu); |
142 | 163 | ||
@@ -144,10 +165,8 @@ walk: | |||
144 | if (walker->level == PT32E_ROOT_LEVEL) { | 165 | if (walker->level == PT32E_ROOT_LEVEL) { |
145 | pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); | 166 | pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); |
146 | trace_kvm_mmu_paging_element(pte, walker->level); | 167 | trace_kvm_mmu_paging_element(pte, walker->level); |
147 | if (!is_present_gpte(pte)) { | 168 | if (!is_present_gpte(pte)) |
148 | present = false; | ||
149 | goto error; | 169 | goto error; |
150 | } | ||
151 | --walker->level; | 170 | --walker->level; |
152 | } | 171 | } |
153 | #endif | 172 | #endif |
@@ -170,42 +189,31 @@ walk: | |||
170 | 189 | ||
171 | real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), | 190 | real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), |
172 | PFERR_USER_MASK|PFERR_WRITE_MASK); | 191 | PFERR_USER_MASK|PFERR_WRITE_MASK); |
173 | if (unlikely(real_gfn == UNMAPPED_GVA)) { | 192 | if (unlikely(real_gfn == UNMAPPED_GVA)) |
174 | present = false; | 193 | goto error; |
175 | break; | ||
176 | } | ||
177 | real_gfn = gpa_to_gfn(real_gfn); | 194 | real_gfn = gpa_to_gfn(real_gfn); |
178 | 195 | ||
179 | host_addr = gfn_to_hva(vcpu->kvm, real_gfn); | 196 | host_addr = gfn_to_hva(vcpu->kvm, real_gfn); |
180 | if (unlikely(kvm_is_error_hva(host_addr))) { | 197 | if (unlikely(kvm_is_error_hva(host_addr))) |
181 | present = false; | 198 | goto error; |
182 | break; | ||
183 | } | ||
184 | 199 | ||
185 | ptep_user = (pt_element_t __user *)((void *)host_addr + offset); | 200 | ptep_user = (pt_element_t __user *)((void *)host_addr + offset); |
186 | if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) { | 201 | if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) |
187 | present = false; | 202 | goto error; |
188 | break; | ||
189 | } | ||
190 | 203 | ||
191 | trace_kvm_mmu_paging_element(pte, walker->level); | 204 | trace_kvm_mmu_paging_element(pte, walker->level); |
192 | 205 | ||
193 | if (unlikely(!is_present_gpte(pte))) { | 206 | if (unlikely(!is_present_gpte(pte))) |
194 | present = false; | 207 | goto error; |
195 | break; | ||
196 | } | ||
197 | 208 | ||
198 | if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, | 209 | if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, |
199 | walker->level))) { | 210 | walker->level))) { |
200 | rsvd_fault = true; | 211 | errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; |
201 | break; | 212 | goto error; |
202 | } | 213 | } |
203 | 214 | ||
204 | if (unlikely(write_fault && !is_writable_pte(pte) | 215 | if (!check_write_user_access(vcpu, write_fault, user_fault, |
205 | && (user_fault || is_write_protection(vcpu)))) | 216 | pte)) |
206 | eperm = true; | ||
207 | |||
208 | if (unlikely(user_fault && !(pte & PT_USER_MASK))) | ||
209 | eperm = true; | 217 | eperm = true; |
210 | 218 | ||
211 | #if PTTYPE == 64 | 219 | #if PTTYPE == 64 |
@@ -213,39 +221,35 @@ walk: | |||
213 | eperm = true; | 221 | eperm = true; |
214 | #endif | 222 | #endif |
215 | 223 | ||
216 | if (!eperm && !rsvd_fault | 224 | if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { |
217 | && unlikely(!(pte & PT_ACCESSED_MASK))) { | ||
218 | int ret; | 225 | int ret; |
219 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, | 226 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, |
220 | sizeof(pte)); | 227 | sizeof(pte)); |
221 | ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, | 228 | ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, |
222 | pte, pte|PT_ACCESSED_MASK); | 229 | pte, pte|PT_ACCESSED_MASK); |
223 | if (unlikely(ret < 0)) { | 230 | if (unlikely(ret < 0)) |
224 | present = false; | 231 | goto error; |
225 | break; | 232 | else if (ret) |
226 | } else if (ret) | 233 | goto retry_walk; |
227 | goto walk; | ||
228 | 234 | ||
229 | mark_page_dirty(vcpu->kvm, table_gfn); | 235 | mark_page_dirty(vcpu->kvm, table_gfn); |
230 | pte |= PT_ACCESSED_MASK; | 236 | pte |= PT_ACCESSED_MASK; |
231 | } | 237 | } |
232 | 238 | ||
233 | pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); | ||
234 | |||
235 | walker->ptes[walker->level - 1] = pte; | 239 | walker->ptes[walker->level - 1] = pte; |
236 | 240 | ||
237 | if ((walker->level == PT_PAGE_TABLE_LEVEL) || | 241 | if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) { |
238 | ((walker->level == PT_DIRECTORY_LEVEL) && | ||
239 | is_large_pte(pte) && | ||
240 | (PTTYPE == 64 || is_pse(vcpu))) || | ||
241 | ((walker->level == PT_PDPE_LEVEL) && | ||
242 | is_large_pte(pte) && | ||
243 | mmu->root_level == PT64_ROOT_LEVEL)) { | ||
244 | int lvl = walker->level; | 242 | int lvl = walker->level; |
245 | gpa_t real_gpa; | 243 | gpa_t real_gpa; |
246 | gfn_t gfn; | 244 | gfn_t gfn; |
247 | u32 ac; | 245 | u32 ac; |
248 | 246 | ||
247 | /* check if the kernel is fetching from a user page */ | ||
248 | if (unlikely(pte_access & PT_USER_MASK) && | ||
249 | kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) | ||
250 | if (fetch_fault && !user_fault) | ||
251 | eperm = true; | ||
252 | |||
249 | gfn = gpte_to_gfn_lvl(pte, lvl); | 253 | gfn = gpte_to_gfn_lvl(pte, lvl); |
250 | gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; | 254 | gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; |
251 | 255 | ||
@@ -266,12 +270,14 @@ walk: | |||
266 | break; | 270 | break; |
267 | } | 271 | } |
268 | 272 | ||
269 | pt_access = pte_access; | 273 | pt_access &= FNAME(gpte_access)(vcpu, pte, false); |
270 | --walker->level; | 274 | --walker->level; |
271 | } | 275 | } |
272 | 276 | ||
273 | if (unlikely(!present || eperm || rsvd_fault)) | 277 | if (unlikely(eperm)) { |
278 | errcode |= PFERR_PRESENT_MASK; | ||
274 | goto error; | 279 | goto error; |
280 | } | ||
275 | 281 | ||
276 | if (write_fault && unlikely(!is_dirty_gpte(pte))) { | 282 | if (write_fault && unlikely(!is_dirty_gpte(pte))) { |
277 | int ret; | 283 | int ret; |
@@ -279,17 +285,17 @@ walk: | |||
279 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); | 285 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); |
280 | ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, | 286 | ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, |
281 | pte, pte|PT_DIRTY_MASK); | 287 | pte, pte|PT_DIRTY_MASK); |
282 | if (unlikely(ret < 0)) { | 288 | if (unlikely(ret < 0)) |
283 | present = false; | ||
284 | goto error; | 289 | goto error; |
285 | } else if (ret) | 290 | else if (ret) |
286 | goto walk; | 291 | goto retry_walk; |
287 | 292 | ||
288 | mark_page_dirty(vcpu->kvm, table_gfn); | 293 | mark_page_dirty(vcpu->kvm, table_gfn); |
289 | pte |= PT_DIRTY_MASK; | 294 | pte |= PT_DIRTY_MASK; |
290 | walker->ptes[walker->level - 1] = pte; | 295 | walker->ptes[walker->level - 1] = pte; |
291 | } | 296 | } |
292 | 297 | ||
298 | pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true); | ||
293 | walker->pt_access = pt_access; | 299 | walker->pt_access = pt_access; |
294 | walker->pte_access = pte_access; | 300 | walker->pte_access = pte_access; |
295 | pgprintk("%s: pte %llx pte_access %x pt_access %x\n", | 301 | pgprintk("%s: pte %llx pte_access %x pt_access %x\n", |
@@ -297,19 +303,14 @@ walk: | |||
297 | return 1; | 303 | return 1; |
298 | 304 | ||
299 | error: | 305 | error: |
306 | errcode |= write_fault | user_fault; | ||
307 | if (fetch_fault && (mmu->nx || | ||
308 | kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))) | ||
309 | errcode |= PFERR_FETCH_MASK; | ||
310 | |||
300 | walker->fault.vector = PF_VECTOR; | 311 | walker->fault.vector = PF_VECTOR; |
301 | walker->fault.error_code_valid = true; | 312 | walker->fault.error_code_valid = true; |
302 | walker->fault.error_code = 0; | 313 | walker->fault.error_code = errcode; |
303 | if (present) | ||
304 | walker->fault.error_code |= PFERR_PRESENT_MASK; | ||
305 | |||
306 | walker->fault.error_code |= write_fault | user_fault; | ||
307 | |||
308 | if (fetch_fault && mmu->nx) | ||
309 | walker->fault.error_code |= PFERR_FETCH_MASK; | ||
310 | if (rsvd_fault) | ||
311 | walker->fault.error_code |= PFERR_RSVD_MASK; | ||
312 | |||
313 | walker->fault.address = addr; | 314 | walker->fault.address = addr; |
314 | walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; | 315 | walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; |
315 | 316 | ||
@@ -336,16 +337,11 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, | |||
336 | struct kvm_mmu_page *sp, u64 *spte, | 337 | struct kvm_mmu_page *sp, u64 *spte, |
337 | pt_element_t gpte) | 338 | pt_element_t gpte) |
338 | { | 339 | { |
339 | u64 nonpresent = shadow_trap_nonpresent_pte; | ||
340 | |||
341 | if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) | 340 | if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) |
342 | goto no_present; | 341 | goto no_present; |
343 | 342 | ||
344 | if (!is_present_gpte(gpte)) { | 343 | if (!is_present_gpte(gpte)) |
345 | if (!sp->unsync) | ||
346 | nonpresent = shadow_notrap_nonpresent_pte; | ||
347 | goto no_present; | 344 | goto no_present; |
348 | } | ||
349 | 345 | ||
350 | if (!(gpte & PT_ACCESSED_MASK)) | 346 | if (!(gpte & PT_ACCESSED_MASK)) |
351 | goto no_present; | 347 | goto no_present; |
@@ -353,7 +349,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, | |||
353 | return false; | 349 | return false; |
354 | 350 | ||
355 | no_present: | 351 | no_present: |
356 | drop_spte(vcpu->kvm, spte, nonpresent); | 352 | drop_spte(vcpu->kvm, spte); |
357 | return true; | 353 | return true; |
358 | } | 354 | } |
359 | 355 | ||
@@ -369,9 +365,9 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
369 | return; | 365 | return; |
370 | 366 | ||
371 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); | 367 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); |
372 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 368 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); |
373 | pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); | 369 | pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); |
374 | if (is_error_pfn(pfn)) { | 370 | if (mmu_invalid_pfn(pfn)) { |
375 | kvm_release_pfn_clean(pfn); | 371 | kvm_release_pfn_clean(pfn); |
376 | return; | 372 | return; |
377 | } | 373 | } |
@@ -381,7 +377,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
381 | * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). | 377 | * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). |
382 | */ | 378 | */ |
383 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, | 379 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, |
384 | is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, | 380 | NULL, PT_PAGE_TABLE_LEVEL, |
385 | gpte_to_gfn(gpte), pfn, true, true); | 381 | gpte_to_gfn(gpte), pfn, true, true); |
386 | } | 382 | } |
387 | 383 | ||
@@ -432,12 +428,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
432 | unsigned pte_access; | 428 | unsigned pte_access; |
433 | gfn_t gfn; | 429 | gfn_t gfn; |
434 | pfn_t pfn; | 430 | pfn_t pfn; |
435 | bool dirty; | ||
436 | 431 | ||
437 | if (spte == sptep) | 432 | if (spte == sptep) |
438 | continue; | 433 | continue; |
439 | 434 | ||
440 | if (*spte != shadow_trap_nonpresent_pte) | 435 | if (is_shadow_present_pte(*spte)) |
441 | continue; | 436 | continue; |
442 | 437 | ||
443 | gpte = gptep[i]; | 438 | gpte = gptep[i]; |
@@ -445,18 +440,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
445 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) | 440 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) |
446 | continue; | 441 | continue; |
447 | 442 | ||
448 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 443 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, |
444 | true); | ||
449 | gfn = gpte_to_gfn(gpte); | 445 | gfn = gpte_to_gfn(gpte); |
450 | dirty = is_dirty_gpte(gpte); | ||
451 | pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, | 446 | pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, |
452 | (pte_access & ACC_WRITE_MASK) && dirty); | 447 | pte_access & ACC_WRITE_MASK); |
453 | if (is_error_pfn(pfn)) { | 448 | if (mmu_invalid_pfn(pfn)) { |
454 | kvm_release_pfn_clean(pfn); | 449 | kvm_release_pfn_clean(pfn); |
455 | break; | 450 | break; |
456 | } | 451 | } |
457 | 452 | ||
458 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, | 453 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, |
459 | dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, | 454 | NULL, PT_PAGE_TABLE_LEVEL, gfn, |
460 | pfn, true, true); | 455 | pfn, true, true); |
461 | } | 456 | } |
462 | } | 457 | } |
@@ -467,12 +462,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
467 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 462 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, |
468 | struct guest_walker *gw, | 463 | struct guest_walker *gw, |
469 | int user_fault, int write_fault, int hlevel, | 464 | int user_fault, int write_fault, int hlevel, |
470 | int *ptwrite, pfn_t pfn, bool map_writable, | 465 | int *emulate, pfn_t pfn, bool map_writable, |
471 | bool prefault) | 466 | bool prefault) |
472 | { | 467 | { |
473 | unsigned access = gw->pt_access; | 468 | unsigned access = gw->pt_access; |
474 | struct kvm_mmu_page *sp = NULL; | 469 | struct kvm_mmu_page *sp = NULL; |
475 | bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); | ||
476 | int top_level; | 470 | int top_level; |
477 | unsigned direct_access; | 471 | unsigned direct_access; |
478 | struct kvm_shadow_walk_iterator it; | 472 | struct kvm_shadow_walk_iterator it; |
@@ -480,9 +474,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
480 | if (!is_present_gpte(gw->ptes[gw->level - 1])) | 474 | if (!is_present_gpte(gw->ptes[gw->level - 1])) |
481 | return NULL; | 475 | return NULL; |
482 | 476 | ||
483 | direct_access = gw->pt_access & gw->pte_access; | 477 | direct_access = gw->pte_access; |
484 | if (!dirty) | ||
485 | direct_access &= ~ACC_WRITE_MASK; | ||
486 | 478 | ||
487 | top_level = vcpu->arch.mmu.root_level; | 479 | top_level = vcpu->arch.mmu.root_level; |
488 | if (top_level == PT32E_ROOT_LEVEL) | 480 | if (top_level == PT32E_ROOT_LEVEL) |
@@ -540,8 +532,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
540 | link_shadow_page(it.sptep, sp); | 532 | link_shadow_page(it.sptep, sp); |
541 | } | 533 | } |
542 | 534 | ||
543 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, | 535 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, |
544 | user_fault, write_fault, dirty, ptwrite, it.level, | 536 | user_fault, write_fault, emulate, it.level, |
545 | gw->gfn, pfn, prefault, map_writable); | 537 | gw->gfn, pfn, prefault, map_writable); |
546 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); | 538 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); |
547 | 539 | ||
@@ -575,7 +567,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
575 | int user_fault = error_code & PFERR_USER_MASK; | 567 | int user_fault = error_code & PFERR_USER_MASK; |
576 | struct guest_walker walker; | 568 | struct guest_walker walker; |
577 | u64 *sptep; | 569 | u64 *sptep; |
578 | int write_pt = 0; | 570 | int emulate = 0; |
579 | int r; | 571 | int r; |
580 | pfn_t pfn; | 572 | pfn_t pfn; |
581 | int level = PT_PAGE_TABLE_LEVEL; | 573 | int level = PT_PAGE_TABLE_LEVEL; |
@@ -585,6 +577,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
585 | 577 | ||
586 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); | 578 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); |
587 | 579 | ||
580 | if (unlikely(error_code & PFERR_RSVD_MASK)) | ||
581 | return handle_mmio_page_fault(vcpu, addr, error_code, | ||
582 | mmu_is_nested(vcpu)); | ||
583 | |||
588 | r = mmu_topup_memory_caches(vcpu); | 584 | r = mmu_topup_memory_caches(vcpu); |
589 | if (r) | 585 | if (r) |
590 | return r; | 586 | return r; |
@@ -623,9 +619,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
623 | &map_writable)) | 619 | &map_writable)) |
624 | return 0; | 620 | return 0; |
625 | 621 | ||
626 | /* mmio */ | 622 | if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr, |
627 | if (is_error_pfn(pfn)) | 623 | walker.gfn, pfn, walker.pte_access, &r)) |
628 | return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); | 624 | return r; |
629 | 625 | ||
630 | spin_lock(&vcpu->kvm->mmu_lock); | 626 | spin_lock(&vcpu->kvm->mmu_lock); |
631 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 627 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
@@ -636,19 +632,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
636 | if (!force_pt_level) | 632 | if (!force_pt_level) |
637 | transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); | 633 | transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); |
638 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 634 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, |
639 | level, &write_pt, pfn, map_writable, prefault); | 635 | level, &emulate, pfn, map_writable, prefault); |
640 | (void)sptep; | 636 | (void)sptep; |
641 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, | 637 | pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, |
642 | sptep, *sptep, write_pt); | 638 | sptep, *sptep, emulate); |
643 | 639 | ||
644 | if (!write_pt) | 640 | if (!emulate) |
645 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | 641 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ |
646 | 642 | ||
647 | ++vcpu->stat.pf_fixed; | 643 | ++vcpu->stat.pf_fixed; |
648 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); | 644 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); |
649 | spin_unlock(&vcpu->kvm->mmu_lock); | 645 | spin_unlock(&vcpu->kvm->mmu_lock); |
650 | 646 | ||
651 | return write_pt; | 647 | return emulate; |
652 | 648 | ||
653 | out_unlock: | 649 | out_unlock: |
654 | spin_unlock(&vcpu->kvm->mmu_lock); | 650 | spin_unlock(&vcpu->kvm->mmu_lock); |
@@ -665,6 +661,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
665 | u64 *sptep; | 661 | u64 *sptep; |
666 | int need_flush = 0; | 662 | int need_flush = 0; |
667 | 663 | ||
664 | vcpu_clear_mmio_info(vcpu, gva); | ||
665 | |||
668 | spin_lock(&vcpu->kvm->mmu_lock); | 666 | spin_lock(&vcpu->kvm->mmu_lock); |
669 | 667 | ||
670 | for_each_shadow_entry(vcpu, gva, iterator) { | 668 | for_each_shadow_entry(vcpu, gva, iterator) { |
@@ -688,11 +686,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
688 | if (is_shadow_present_pte(*sptep)) { | 686 | if (is_shadow_present_pte(*sptep)) { |
689 | if (is_large_pte(*sptep)) | 687 | if (is_large_pte(*sptep)) |
690 | --vcpu->kvm->stat.lpages; | 688 | --vcpu->kvm->stat.lpages; |
691 | drop_spte(vcpu->kvm, sptep, | 689 | drop_spte(vcpu->kvm, sptep); |
692 | shadow_trap_nonpresent_pte); | ||
693 | need_flush = 1; | 690 | need_flush = 1; |
694 | } else | 691 | } else if (is_mmio_spte(*sptep)) |
695 | __set_spte(sptep, shadow_trap_nonpresent_pte); | 692 | mmu_spte_clear_no_track(sptep); |
693 | |||
696 | break; | 694 | break; |
697 | } | 695 | } |
698 | 696 | ||
@@ -752,36 +750,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, | |||
752 | return gpa; | 750 | return gpa; |
753 | } | 751 | } |
754 | 752 | ||
755 | static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | ||
756 | struct kvm_mmu_page *sp) | ||
757 | { | ||
758 | int i, j, offset, r; | ||
759 | pt_element_t pt[256 / sizeof(pt_element_t)]; | ||
760 | gpa_t pte_gpa; | ||
761 | |||
762 | if (sp->role.direct | ||
763 | || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { | ||
764 | nonpaging_prefetch_page(vcpu, sp); | ||
765 | return; | ||
766 | } | ||
767 | |||
768 | pte_gpa = gfn_to_gpa(sp->gfn); | ||
769 | if (PTTYPE == 32) { | ||
770 | offset = sp->role.quadrant << PT64_LEVEL_BITS; | ||
771 | pte_gpa += offset * sizeof(pt_element_t); | ||
772 | } | ||
773 | |||
774 | for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) { | ||
775 | r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); | ||
776 | pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); | ||
777 | for (j = 0; j < ARRAY_SIZE(pt); ++j) | ||
778 | if (r || is_present_gpte(pt[j])) | ||
779 | sp->spt[i+j] = shadow_trap_nonpresent_pte; | ||
780 | else | ||
781 | sp->spt[i+j] = shadow_notrap_nonpresent_pte; | ||
782 | } | ||
783 | } | ||
784 | |||
785 | /* | 753 | /* |
786 | * Using the cached information from sp->gfns is safe because: | 754 | * Using the cached information from sp->gfns is safe because: |
787 | * - The spte has a reference to the struct page, so the pfn for a given gfn | 755 | * - The spte has a reference to the struct page, so the pfn for a given gfn |
@@ -817,7 +785,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
817 | gpa_t pte_gpa; | 785 | gpa_t pte_gpa; |
818 | gfn_t gfn; | 786 | gfn_t gfn; |
819 | 787 | ||
820 | if (!is_shadow_present_pte(sp->spt[i])) | 788 | if (!sp->spt[i]) |
821 | continue; | 789 | continue; |
822 | 790 | ||
823 | pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); | 791 | pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); |
@@ -826,26 +794,30 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
826 | sizeof(pt_element_t))) | 794 | sizeof(pt_element_t))) |
827 | return -EINVAL; | 795 | return -EINVAL; |
828 | 796 | ||
829 | gfn = gpte_to_gfn(gpte); | ||
830 | |||
831 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { | 797 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { |
832 | vcpu->kvm->tlbs_dirty++; | 798 | vcpu->kvm->tlbs_dirty++; |
833 | continue; | 799 | continue; |
834 | } | 800 | } |
835 | 801 | ||
802 | gfn = gpte_to_gfn(gpte); | ||
803 | pte_access = sp->role.access; | ||
804 | pte_access &= FNAME(gpte_access)(vcpu, gpte, true); | ||
805 | |||
806 | if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) | ||
807 | continue; | ||
808 | |||
836 | if (gfn != sp->gfns[i]) { | 809 | if (gfn != sp->gfns[i]) { |
837 | drop_spte(vcpu->kvm, &sp->spt[i], | 810 | drop_spte(vcpu->kvm, &sp->spt[i]); |
838 | shadow_trap_nonpresent_pte); | ||
839 | vcpu->kvm->tlbs_dirty++; | 811 | vcpu->kvm->tlbs_dirty++; |
840 | continue; | 812 | continue; |
841 | } | 813 | } |
842 | 814 | ||
843 | nr_present++; | 815 | nr_present++; |
844 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 816 | |
845 | host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; | 817 | host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; |
846 | 818 | ||
847 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, | 819 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, |
848 | is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, | 820 | PT_PAGE_TABLE_LEVEL, gfn, |
849 | spte_to_pfn(sp->spt[i]), true, false, | 821 | spte_to_pfn(sp->spt[i]), true, false, |
850 | host_writable); | 822 | host_writable); |
851 | } | 823 | } |
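The reworked walk above stops tracking present/rsvd_fault/eperm separately and instead accumulates the final x86 page-fault error code in a single errcode value at the error label. Below is a standalone sketch of that bit composition, using the architectural PFERR_* bit positions (restated here as assumptions, not taken from this hunk) and invented helper names:

#include <stdbool.h>
#include <stdio.h>

/* x86 page-fault error-code bits (architectural values, restated here as
 * assumptions for this standalone model). */
#define PFERR_PRESENT_MASK  (1u << 0)
#define PFERR_WRITE_MASK    (1u << 1)
#define PFERR_USER_MASK     (1u << 2)
#define PFERR_RSVD_MASK     (1u << 3)
#define PFERR_FETCH_MASK    (1u << 4)

/* Mirror the walk's idea: start from zero, OR in the reason the walk
 * failed, then fold in the original write/user/fetch access bits. */
static unsigned build_errcode(unsigned access, bool rsvd, bool eperm,
			      bool nx_or_smep_active)
{
	unsigned errcode = 0;

	if (rsvd)
		errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
	if (eperm)
		errcode |= PFERR_PRESENT_MASK;

	errcode |= access & (PFERR_WRITE_MASK | PFERR_USER_MASK);
	if ((access & PFERR_FETCH_MASK) && nx_or_smep_active)
		errcode |= PFERR_FETCH_MASK;

	return errcode;
}

int main(void)
{
	/* User-mode write to a present but write-protected page. */
	printf("errcode = %#x\n",
	       build_errcode(PFERR_WRITE_MASK | PFERR_USER_MASK,
			     false, true, false));
	return 0;
}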
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 506e4fe23adc..475d1c948501 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -1496,11 +1496,14 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1496 | update_cr0_intercept(svm); | 1496 | update_cr0_intercept(svm); |
1497 | } | 1497 | } |
1498 | 1498 | ||
1499 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 1499 | static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
1500 | { | 1500 | { |
1501 | unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; | 1501 | unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; |
1502 | unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; | 1502 | unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; |
1503 | 1503 | ||
1504 | if (cr4 & X86_CR4_VMXE) | ||
1505 | return 1; | ||
1506 | |||
1504 | if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) | 1507 | if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) |
1505 | svm_flush_tlb(vcpu); | 1508 | svm_flush_tlb(vcpu); |
1506 | 1509 | ||
@@ -1510,6 +1513,7 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
1510 | cr4 |= host_cr4_mce; | 1513 | cr4 |= host_cr4_mce; |
1511 | to_svm(vcpu)->vmcb->save.cr4 = cr4; | 1514 | to_svm(vcpu)->vmcb->save.cr4 = cr4; |
1512 | mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); | 1515 | mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); |
1516 | return 0; | ||
1513 | } | 1517 | } |
1514 | 1518 | ||
1515 | static void svm_set_segment(struct kvm_vcpu *vcpu, | 1519 | static void svm_set_segment(struct kvm_vcpu *vcpu, |
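svm_set_cr4() now returns an int so the common x86 code can turn a rejected CR4 write (here, setting CR4.VMXE on AMD hardware, which has no VMX) into an injected #GP. A hedged userspace model of that "setter returns nonzero, caller raises the fault" pattern, with made-up vcpu plumbing:

#include <stdbool.h>
#include <stdio.h>

#define X86_CR4_VMXE (1UL << 13)	/* architectural bit position */

/* Hypothetical stand-ins for the real vcpu and fault plumbing. */
struct toy_vcpu {
	unsigned long cr4;
	bool pending_gp;
};

static int toy_set_cr4(struct toy_vcpu *vcpu, unsigned long cr4)
{
	if (cr4 & X86_CR4_VMXE)
		return 1;		/* feature not supported: reject */
	vcpu->cr4 = cr4;
	return 0;
}

static void toy_handle_cr4_write(struct toy_vcpu *vcpu, unsigned long cr4)
{
	if (toy_set_cr4(vcpu, cr4))
		vcpu->pending_gp = true;	/* caller injects #GP */
}

int main(void)
{
	struct toy_vcpu vcpu = { 0, false };

	toy_handle_cr4_write(&vcpu, X86_CR4_VMXE);
	printf("pending #GP: %d\n", vcpu.pending_gp);
	return 0;
}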
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index db932760ea82..3ff898c104f7 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h | |||
@@ -675,12 +675,12 @@ TRACE_EVENT(kvm_emulate_insn, | |||
675 | ), | 675 | ), |
676 | 676 | ||
677 | TP_fast_assign( | 677 | TP_fast_assign( |
678 | __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start; | 678 | __entry->rip = vcpu->arch.emulate_ctxt.fetch.start; |
679 | __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); | 679 | __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); |
680 | __entry->len = vcpu->arch.emulate_ctxt.decode.eip | 680 | __entry->len = vcpu->arch.emulate_ctxt._eip |
681 | - vcpu->arch.emulate_ctxt.decode.fetch.start; | 681 | - vcpu->arch.emulate_ctxt.fetch.start; |
682 | memcpy(__entry->insn, | 682 | memcpy(__entry->insn, |
683 | vcpu->arch.emulate_ctxt.decode.fetch.data, | 683 | vcpu->arch.emulate_ctxt.fetch.data, |
684 | 15); | 684 | 15); |
685 | __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); | 685 | __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); |
686 | __entry->failed = failed; | 686 | __entry->failed = failed; |
@@ -698,6 +698,29 @@ TRACE_EVENT(kvm_emulate_insn, | |||
698 | #define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) | 698 | #define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) |
699 | #define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) | 699 | #define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) |
700 | 700 | ||
701 | TRACE_EVENT( | ||
702 | vcpu_match_mmio, | ||
703 | TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match), | ||
704 | TP_ARGS(gva, gpa, write, gpa_match), | ||
705 | |||
706 | TP_STRUCT__entry( | ||
707 | __field(gva_t, gva) | ||
708 | __field(gpa_t, gpa) | ||
709 | __field(bool, write) | ||
710 | __field(bool, gpa_match) | ||
711 | ), | ||
712 | |||
713 | TP_fast_assign( | ||
714 | __entry->gva = gva; | ||
715 | __entry->gpa = gpa; | ||
716 | __entry->write = write; | ||
717 | __entry->gpa_match = gpa_match | ||
718 | ), | ||
719 | |||
720 | TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa, | ||
721 | __entry->write ? "Write" : "Read", | ||
722 | __entry->gpa_match ? "GPA" : "GVA") | ||
723 | ); | ||
701 | #endif /* _TRACE_KVM_H */ | 724 | #endif /* _TRACE_KVM_H */ |
702 | 725 | ||
703 | #undef TRACE_INCLUDE_PATH | 726 | #undef TRACE_INCLUDE_PATH |
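The vcpu_match_mmio tracepoint reports hits in the new per-vcpu MMIO translation cache: the last guest virtual/physical address pair that turned out to be MMIO is remembered so a repeated access to the same page can skip the full page-table walk. A toy model of such a cache, with field and function names that are illustrative rather than the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_MASK (~0xfffULL)

/* Toy per-vcpu MMIO translation cache: remember the last page that turned
 * out to be MMIO. */
struct toy_mmio_cache {
	uint64_t gva;		/* page-aligned guest virtual address, 0 = empty */
	uint64_t gpa;		/* matching guest physical address */
	unsigned access;
};

static void toy_cache_mmio_info(struct toy_mmio_cache *c, uint64_t gva,
				uint64_t gpa, unsigned access)
{
	c->gva = gva & TOY_PAGE_MASK;
	c->gpa = gpa & TOY_PAGE_MASK;
	c->access = access;
}

static bool toy_match_mmio_gva(const struct toy_mmio_cache *c, uint64_t gva)
{
	return c->gva && c->gva == (gva & TOY_PAGE_MASK);
}

int main(void)
{
	struct toy_mmio_cache c = { 0, 0, 0 };

	toy_cache_mmio_info(&c, 0xfee00030ULL, 0xfee00030ULL, 0x3);
	/* Another access within the same page hits the cache. */
	printf("match: %d\n", toy_match_mmio_gva(&c, 0xfee00ff0ULL));
	return 0;
}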
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d48ec60ea421..e65a158dee64 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -43,13 +43,12 @@ | |||
43 | #include "trace.h" | 43 | #include "trace.h" |
44 | 44 | ||
45 | #define __ex(x) __kvm_handle_fault_on_reboot(x) | 45 | #define __ex(x) __kvm_handle_fault_on_reboot(x) |
46 | #define __ex_clear(x, reg) \ | ||
47 | ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg) | ||
46 | 48 | ||
47 | MODULE_AUTHOR("Qumranet"); | 49 | MODULE_AUTHOR("Qumranet"); |
48 | MODULE_LICENSE("GPL"); | 50 | MODULE_LICENSE("GPL"); |
49 | 51 | ||
50 | static int __read_mostly bypass_guest_pf = 1; | ||
51 | module_param(bypass_guest_pf, bool, S_IRUGO); | ||
52 | |||
53 | static int __read_mostly enable_vpid = 1; | 52 | static int __read_mostly enable_vpid = 1; |
54 | module_param_named(vpid, enable_vpid, bool, 0444); | 53 | module_param_named(vpid, enable_vpid, bool, 0444); |
55 | 54 | ||
@@ -72,6 +71,14 @@ module_param(vmm_exclusive, bool, S_IRUGO); | |||
72 | static int __read_mostly yield_on_hlt = 1; | 71 | static int __read_mostly yield_on_hlt = 1; |
73 | module_param(yield_on_hlt, bool, S_IRUGO); | 72 | module_param(yield_on_hlt, bool, S_IRUGO); |
74 | 73 | ||
74 | /* | ||
75 | * If nested=1, nested virtualization is supported, i.e., guests may use | ||
76 | * VMX and be a hypervisor for their own guests. If nested=0, guests may not | ||
77 | * use VMX instructions. | ||
78 | */ | ||
79 | static int __read_mostly nested = 0; | ||
80 | module_param(nested, bool, S_IRUGO); | ||
81 | |||
75 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ | 82 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ |
76 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) | 83 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) |
77 | #define KVM_GUEST_CR0_MASK \ | 84 | #define KVM_GUEST_CR0_MASK \ |
@@ -109,6 +116,7 @@ static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; | |||
109 | module_param(ple_window, int, S_IRUGO); | 116 | module_param(ple_window, int, S_IRUGO); |
110 | 117 | ||
111 | #define NR_AUTOLOAD_MSRS 1 | 118 | #define NR_AUTOLOAD_MSRS 1 |
119 | #define VMCS02_POOL_SIZE 1 | ||
112 | 120 | ||
113 | struct vmcs { | 121 | struct vmcs { |
114 | u32 revision_id; | 122 | u32 revision_id; |
@@ -116,17 +124,237 @@ struct vmcs { | |||
116 | char data[0]; | 124 | char data[0]; |
117 | }; | 125 | }; |
118 | 126 | ||
127 | /* | ||
128 | * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also | ||
129 | * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs | ||
130 | * loaded on this CPU (so we can clear them if the CPU goes down). | ||
131 | */ | ||
132 | struct loaded_vmcs { | ||
133 | struct vmcs *vmcs; | ||
134 | int cpu; | ||
135 | int launched; | ||
136 | struct list_head loaded_vmcss_on_cpu_link; | ||
137 | }; | ||
138 | |||
119 | struct shared_msr_entry { | 139 | struct shared_msr_entry { |
120 | unsigned index; | 140 | unsigned index; |
121 | u64 data; | 141 | u64 data; |
122 | u64 mask; | 142 | u64 mask; |
123 | }; | 143 | }; |
124 | 144 | ||
145 | /* | ||
146 | * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a | ||
147 | * single nested guest (L2), hence the name vmcs12. Any VMX implementation has | ||
148 | * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is | ||
149 | * stored in guest memory specified by VMPTRLD, but is opaque to the guest, | ||
150 | * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. | ||
151 | * More than one of these structures may exist, if L1 runs multiple L2 guests. | ||
152 | * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the | ||
153 | * underlying hardware which will be used to run L2. | ||
154 | * This structure is packed to ensure that its layout is identical across | ||
155 | * machines (necessary for live migration). | ||
156 | * If there are changes in this struct, VMCS12_REVISION must be changed. | ||
157 | */ | ||
158 | typedef u64 natural_width; | ||
159 | struct __packed vmcs12 { | ||
160 | /* According to the Intel spec, a VMCS region must start with the | ||
161 | * following two fields. Then follow implementation-specific data. | ||
162 | */ | ||
163 | u32 revision_id; | ||
164 | u32 abort; | ||
165 | |||
166 | u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ | ||
167 | u32 padding[7]; /* room for future expansion */ | ||
168 | |||
169 | u64 io_bitmap_a; | ||
170 | u64 io_bitmap_b; | ||
171 | u64 msr_bitmap; | ||
172 | u64 vm_exit_msr_store_addr; | ||
173 | u64 vm_exit_msr_load_addr; | ||
174 | u64 vm_entry_msr_load_addr; | ||
175 | u64 tsc_offset; | ||
176 | u64 virtual_apic_page_addr; | ||
177 | u64 apic_access_addr; | ||
178 | u64 ept_pointer; | ||
179 | u64 guest_physical_address; | ||
180 | u64 vmcs_link_pointer; | ||
181 | u64 guest_ia32_debugctl; | ||
182 | u64 guest_ia32_pat; | ||
183 | u64 guest_ia32_efer; | ||
184 | u64 guest_ia32_perf_global_ctrl; | ||
185 | u64 guest_pdptr0; | ||
186 | u64 guest_pdptr1; | ||
187 | u64 guest_pdptr2; | ||
188 | u64 guest_pdptr3; | ||
189 | u64 host_ia32_pat; | ||
190 | u64 host_ia32_efer; | ||
191 | u64 host_ia32_perf_global_ctrl; | ||
192 | u64 padding64[8]; /* room for future expansion */ | ||
193 | /* | ||
194 | * To allow migration of L1 (complete with its L2 guests) between | ||
195 | * machines of different natural widths (32 or 64 bit), we cannot have | ||
196 | * unsigned long fields with no explicit size. We use u64 (aliased | ||
197 | * natural_width) instead. Luckily, x86 is little-endian. | ||
198 | */ | ||
199 | natural_width cr0_guest_host_mask; | ||
200 | natural_width cr4_guest_host_mask; | ||
201 | natural_width cr0_read_shadow; | ||
202 | natural_width cr4_read_shadow; | ||
203 | natural_width cr3_target_value0; | ||
204 | natural_width cr3_target_value1; | ||
205 | natural_width cr3_target_value2; | ||
206 | natural_width cr3_target_value3; | ||
207 | natural_width exit_qualification; | ||
208 | natural_width guest_linear_address; | ||
209 | natural_width guest_cr0; | ||
210 | natural_width guest_cr3; | ||
211 | natural_width guest_cr4; | ||
212 | natural_width guest_es_base; | ||
213 | natural_width guest_cs_base; | ||
214 | natural_width guest_ss_base; | ||
215 | natural_width guest_ds_base; | ||
216 | natural_width guest_fs_base; | ||
217 | natural_width guest_gs_base; | ||
218 | natural_width guest_ldtr_base; | ||
219 | natural_width guest_tr_base; | ||
220 | natural_width guest_gdtr_base; | ||
221 | natural_width guest_idtr_base; | ||
222 | natural_width guest_dr7; | ||
223 | natural_width guest_rsp; | ||
224 | natural_width guest_rip; | ||
225 | natural_width guest_rflags; | ||
226 | natural_width guest_pending_dbg_exceptions; | ||
227 | natural_width guest_sysenter_esp; | ||
228 | natural_width guest_sysenter_eip; | ||
229 | natural_width host_cr0; | ||
230 | natural_width host_cr3; | ||
231 | natural_width host_cr4; | ||
232 | natural_width host_fs_base; | ||
233 | natural_width host_gs_base; | ||
234 | natural_width host_tr_base; | ||
235 | natural_width host_gdtr_base; | ||
236 | natural_width host_idtr_base; | ||
237 | natural_width host_ia32_sysenter_esp; | ||
238 | natural_width host_ia32_sysenter_eip; | ||
239 | natural_width host_rsp; | ||
240 | natural_width host_rip; | ||
241 | natural_width paddingl[8]; /* room for future expansion */ | ||
242 | u32 pin_based_vm_exec_control; | ||
243 | u32 cpu_based_vm_exec_control; | ||
244 | u32 exception_bitmap; | ||
245 | u32 page_fault_error_code_mask; | ||
246 | u32 page_fault_error_code_match; | ||
247 | u32 cr3_target_count; | ||
248 | u32 vm_exit_controls; | ||
249 | u32 vm_exit_msr_store_count; | ||
250 | u32 vm_exit_msr_load_count; | ||
251 | u32 vm_entry_controls; | ||
252 | u32 vm_entry_msr_load_count; | ||
253 | u32 vm_entry_intr_info_field; | ||
254 | u32 vm_entry_exception_error_code; | ||
255 | u32 vm_entry_instruction_len; | ||
256 | u32 tpr_threshold; | ||
257 | u32 secondary_vm_exec_control; | ||
258 | u32 vm_instruction_error; | ||
259 | u32 vm_exit_reason; | ||
260 | u32 vm_exit_intr_info; | ||
261 | u32 vm_exit_intr_error_code; | ||
262 | u32 idt_vectoring_info_field; | ||
263 | u32 idt_vectoring_error_code; | ||
264 | u32 vm_exit_instruction_len; | ||
265 | u32 vmx_instruction_info; | ||
266 | u32 guest_es_limit; | ||
267 | u32 guest_cs_limit; | ||
268 | u32 guest_ss_limit; | ||
269 | u32 guest_ds_limit; | ||
270 | u32 guest_fs_limit; | ||
271 | u32 guest_gs_limit; | ||
272 | u32 guest_ldtr_limit; | ||
273 | u32 guest_tr_limit; | ||
274 | u32 guest_gdtr_limit; | ||
275 | u32 guest_idtr_limit; | ||
276 | u32 guest_es_ar_bytes; | ||
277 | u32 guest_cs_ar_bytes; | ||
278 | u32 guest_ss_ar_bytes; | ||
279 | u32 guest_ds_ar_bytes; | ||
280 | u32 guest_fs_ar_bytes; | ||
281 | u32 guest_gs_ar_bytes; | ||
282 | u32 guest_ldtr_ar_bytes; | ||
283 | u32 guest_tr_ar_bytes; | ||
284 | u32 guest_interruptibility_info; | ||
285 | u32 guest_activity_state; | ||
286 | u32 guest_sysenter_cs; | ||
287 | u32 host_ia32_sysenter_cs; | ||
288 | u32 padding32[8]; /* room for future expansion */ | ||
289 | u16 virtual_processor_id; | ||
290 | u16 guest_es_selector; | ||
291 | u16 guest_cs_selector; | ||
292 | u16 guest_ss_selector; | ||
293 | u16 guest_ds_selector; | ||
294 | u16 guest_fs_selector; | ||
295 | u16 guest_gs_selector; | ||
296 | u16 guest_ldtr_selector; | ||
297 | u16 guest_tr_selector; | ||
298 | u16 host_es_selector; | ||
299 | u16 host_cs_selector; | ||
300 | u16 host_ss_selector; | ||
301 | u16 host_ds_selector; | ||
302 | u16 host_fs_selector; | ||
303 | u16 host_gs_selector; | ||
304 | u16 host_tr_selector; | ||
305 | }; | ||
306 | |||
307 | /* | ||
308 | * VMCS12_REVISION is an arbitrary id that should be changed if the content or | ||
309 | * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and | ||
310 | * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. | ||
311 | */ | ||
312 | #define VMCS12_REVISION 0x11e57ed0 | ||
313 | |||
314 | /* | ||
315 | * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region | ||
316 | * and any VMCS region. Although only sizeof(struct vmcs12) are used by the | ||
317 | * current implementation, 4K are reserved to avoid future complications. | ||
318 | */ | ||
319 | #define VMCS12_SIZE 0x1000 | ||
320 | |||
321 | /* Used to remember the last vmcs02 used for some recently used vmcs12s */ | ||
322 | struct vmcs02_list { | ||
323 | struct list_head list; | ||
324 | gpa_t vmptr; | ||
325 | struct loaded_vmcs vmcs02; | ||
326 | }; | ||
327 | |||
328 | /* | ||
329 | * The nested_vmx structure is part of vcpu_vmx, and holds information we need | ||
330 | * for correct emulation of VMX (i.e., nested VMX) on this vcpu. | ||
331 | */ | ||
332 | struct nested_vmx { | ||
333 | /* Has the level1 guest done vmxon? */ | ||
334 | bool vmxon; | ||
335 | |||
336 | /* The guest-physical address of the current VMCS L1 keeps for L2 */ | ||
337 | gpa_t current_vmptr; | ||
338 | /* The host-usable pointer to the above */ | ||
339 | struct page *current_vmcs12_page; | ||
340 | struct vmcs12 *current_vmcs12; | ||
341 | |||
342 | /* vmcs02_list cache of VMCSs recently used to run L2 guests */ | ||
343 | struct list_head vmcs02_pool; | ||
344 | int vmcs02_num; | ||
345 | u64 vmcs01_tsc_offset; | ||
346 | /* L2 must run next, and mustn't decide to exit to L1. */ | ||
347 | bool nested_run_pending; | ||
348 | /* | ||
349 | * Guest pages referred to in vmcs02 with host-physical pointers, so | ||
350 | * we must keep them pinned while L2 runs. | ||
351 | */ | ||
352 | struct page *apic_access_page; | ||
353 | }; | ||
354 | |||
125 | struct vcpu_vmx { | 355 | struct vcpu_vmx { |
126 | struct kvm_vcpu vcpu; | 356 | struct kvm_vcpu vcpu; |
127 | struct list_head local_vcpus_link; | ||
128 | unsigned long host_rsp; | 357 | unsigned long host_rsp; |
129 | int launched; | ||
130 | u8 fail; | 358 | u8 fail; |
131 | u8 cpl; | 359 | u8 cpl; |
132 | bool nmi_known_unmasked; | 360 | bool nmi_known_unmasked; |
@@ -140,7 +368,14 @@ struct vcpu_vmx { | |||
140 | u64 msr_host_kernel_gs_base; | 368 | u64 msr_host_kernel_gs_base; |
141 | u64 msr_guest_kernel_gs_base; | 369 | u64 msr_guest_kernel_gs_base; |
142 | #endif | 370 | #endif |
143 | struct vmcs *vmcs; | 371 | /* |
372 | * loaded_vmcs points to the VMCS currently used in this vcpu. For a | ||
373 | * non-nested (L1) guest, it always points to vmcs01. For a nested | ||
374 | * guest (L2), it points to a different VMCS. | ||
375 | */ | ||
376 | struct loaded_vmcs vmcs01; | ||
377 | struct loaded_vmcs *loaded_vmcs; | ||
378 | bool __launched; /* temporary, used in vmx_vcpu_run */ | ||
144 | struct msr_autoload { | 379 | struct msr_autoload { |
145 | unsigned nr; | 380 | unsigned nr; |
146 | struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; | 381 | struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; |
@@ -176,6 +411,9 @@ struct vcpu_vmx { | |||
176 | u32 exit_reason; | 411 | u32 exit_reason; |
177 | 412 | ||
178 | bool rdtscp_enabled; | 413 | bool rdtscp_enabled; |
414 | |||
415 | /* Support for a guest hypervisor (nested VMX) */ | ||
416 | struct nested_vmx nested; | ||
179 | }; | 417 | }; |
180 | 418 | ||
181 | enum segment_cache_field { | 419 | enum segment_cache_field { |
@@ -192,6 +430,174 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | |||
192 | return container_of(vcpu, struct vcpu_vmx, vcpu); | 430 | return container_of(vcpu, struct vcpu_vmx, vcpu); |
193 | } | 431 | } |
194 | 432 | ||
433 | #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) | ||
434 | #define FIELD(number, name) [number] = VMCS12_OFFSET(name) | ||
435 | #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ | ||
436 | [number##_HIGH] = VMCS12_OFFSET(name)+4 | ||
437 | |||
438 | static unsigned short vmcs_field_to_offset_table[] = { | ||
439 | FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), | ||
440 | FIELD(GUEST_ES_SELECTOR, guest_es_selector), | ||
441 | FIELD(GUEST_CS_SELECTOR, guest_cs_selector), | ||
442 | FIELD(GUEST_SS_SELECTOR, guest_ss_selector), | ||
443 | FIELD(GUEST_DS_SELECTOR, guest_ds_selector), | ||
444 | FIELD(GUEST_FS_SELECTOR, guest_fs_selector), | ||
445 | FIELD(GUEST_GS_SELECTOR, guest_gs_selector), | ||
446 | FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), | ||
447 | FIELD(GUEST_TR_SELECTOR, guest_tr_selector), | ||
448 | FIELD(HOST_ES_SELECTOR, host_es_selector), | ||
449 | FIELD(HOST_CS_SELECTOR, host_cs_selector), | ||
450 | FIELD(HOST_SS_SELECTOR, host_ss_selector), | ||
451 | FIELD(HOST_DS_SELECTOR, host_ds_selector), | ||
452 | FIELD(HOST_FS_SELECTOR, host_fs_selector), | ||
453 | FIELD(HOST_GS_SELECTOR, host_gs_selector), | ||
454 | FIELD(HOST_TR_SELECTOR, host_tr_selector), | ||
455 | FIELD64(IO_BITMAP_A, io_bitmap_a), | ||
456 | FIELD64(IO_BITMAP_B, io_bitmap_b), | ||
457 | FIELD64(MSR_BITMAP, msr_bitmap), | ||
458 | FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), | ||
459 | FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), | ||
460 | FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), | ||
461 | FIELD64(TSC_OFFSET, tsc_offset), | ||
462 | FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), | ||
463 | FIELD64(APIC_ACCESS_ADDR, apic_access_addr), | ||
464 | FIELD64(EPT_POINTER, ept_pointer), | ||
465 | FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), | ||
466 | FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), | ||
467 | FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), | ||
468 | FIELD64(GUEST_IA32_PAT, guest_ia32_pat), | ||
469 | FIELD64(GUEST_IA32_EFER, guest_ia32_efer), | ||
470 | FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), | ||
471 | FIELD64(GUEST_PDPTR0, guest_pdptr0), | ||
472 | FIELD64(GUEST_PDPTR1, guest_pdptr1), | ||
473 | FIELD64(GUEST_PDPTR2, guest_pdptr2), | ||
474 | FIELD64(GUEST_PDPTR3, guest_pdptr3), | ||
475 | FIELD64(HOST_IA32_PAT, host_ia32_pat), | ||
476 | FIELD64(HOST_IA32_EFER, host_ia32_efer), | ||
477 | FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), | ||
478 | FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), | ||
479 | FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), | ||
480 | FIELD(EXCEPTION_BITMAP, exception_bitmap), | ||
481 | FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), | ||
482 | FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), | ||
483 | FIELD(CR3_TARGET_COUNT, cr3_target_count), | ||
484 | FIELD(VM_EXIT_CONTROLS, vm_exit_controls), | ||
485 | FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), | ||
486 | FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), | ||
487 | FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), | ||
488 | FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), | ||
489 | FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), | ||
490 | FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), | ||
491 | FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), | ||
492 | FIELD(TPR_THRESHOLD, tpr_threshold), | ||
493 | FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), | ||
494 | FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), | ||
495 | FIELD(VM_EXIT_REASON, vm_exit_reason), | ||
496 | FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), | ||
497 | FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), | ||
498 | FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), | ||
499 | FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), | ||
500 | FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), | ||
501 | FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), | ||
502 | FIELD(GUEST_ES_LIMIT, guest_es_limit), | ||
503 | FIELD(GUEST_CS_LIMIT, guest_cs_limit), | ||
504 | FIELD(GUEST_SS_LIMIT, guest_ss_limit), | ||
505 | FIELD(GUEST_DS_LIMIT, guest_ds_limit), | ||
506 | FIELD(GUEST_FS_LIMIT, guest_fs_limit), | ||
507 | FIELD(GUEST_GS_LIMIT, guest_gs_limit), | ||
508 | FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), | ||
509 | FIELD(GUEST_TR_LIMIT, guest_tr_limit), | ||
510 | FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), | ||
511 | FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), | ||
512 | FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), | ||
513 | FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), | ||
514 | FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), | ||
515 | FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), | ||
516 | FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), | ||
517 | FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), | ||
518 | FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), | ||
519 | FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), | ||
520 | FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), | ||
521 | FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), | ||
522 | FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), | ||
523 | FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), | ||
524 | FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), | ||
525 | FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), | ||
526 | FIELD(CR0_READ_SHADOW, cr0_read_shadow), | ||
527 | FIELD(CR4_READ_SHADOW, cr4_read_shadow), | ||
528 | FIELD(CR3_TARGET_VALUE0, cr3_target_value0), | ||
529 | FIELD(CR3_TARGET_VALUE1, cr3_target_value1), | ||
530 | FIELD(CR3_TARGET_VALUE2, cr3_target_value2), | ||
531 | FIELD(CR3_TARGET_VALUE3, cr3_target_value3), | ||
532 | FIELD(EXIT_QUALIFICATION, exit_qualification), | ||
533 | FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), | ||
534 | FIELD(GUEST_CR0, guest_cr0), | ||
535 | FIELD(GUEST_CR3, guest_cr3), | ||
536 | FIELD(GUEST_CR4, guest_cr4), | ||
537 | FIELD(GUEST_ES_BASE, guest_es_base), | ||
538 | FIELD(GUEST_CS_BASE, guest_cs_base), | ||
539 | FIELD(GUEST_SS_BASE, guest_ss_base), | ||
540 | FIELD(GUEST_DS_BASE, guest_ds_base), | ||
541 | FIELD(GUEST_FS_BASE, guest_fs_base), | ||
542 | FIELD(GUEST_GS_BASE, guest_gs_base), | ||
543 | FIELD(GUEST_LDTR_BASE, guest_ldtr_base), | ||
544 | FIELD(GUEST_TR_BASE, guest_tr_base), | ||
545 | FIELD(GUEST_GDTR_BASE, guest_gdtr_base), | ||
546 | FIELD(GUEST_IDTR_BASE, guest_idtr_base), | ||
547 | FIELD(GUEST_DR7, guest_dr7), | ||
548 | FIELD(GUEST_RSP, guest_rsp), | ||
549 | FIELD(GUEST_RIP, guest_rip), | ||
550 | FIELD(GUEST_RFLAGS, guest_rflags), | ||
551 | FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), | ||
552 | FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), | ||
553 | FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), | ||
554 | FIELD(HOST_CR0, host_cr0), | ||
555 | FIELD(HOST_CR3, host_cr3), | ||
556 | FIELD(HOST_CR4, host_cr4), | ||
557 | FIELD(HOST_FS_BASE, host_fs_base), | ||
558 | FIELD(HOST_GS_BASE, host_gs_base), | ||
559 | FIELD(HOST_TR_BASE, host_tr_base), | ||
560 | FIELD(HOST_GDTR_BASE, host_gdtr_base), | ||
561 | FIELD(HOST_IDTR_BASE, host_idtr_base), | ||
562 | FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), | ||
563 | FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), | ||
564 | FIELD(HOST_RSP, host_rsp), | ||
565 | FIELD(HOST_RIP, host_rip), | ||
566 | }; | ||
567 | static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table); | ||
568 | |||
569 | static inline short vmcs_field_to_offset(unsigned long field) | ||
570 | { | ||
571 | if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0) | ||
572 | return -1; | ||
573 | return vmcs_field_to_offset_table[field]; | ||
574 | } | ||
575 | |||
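vmcs_field_to_offset() turns VMREAD/VMWRITE emulation into a bounds-checked table lookup followed by a load or store at the resulting offset inside struct vmcs12. A reduced model of the same offsetof-table pattern, with invented field encodings and a toy struct:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Invented struct and field encodings, standing in for vmcs12 and the
 * VMCS field numbers. */
struct toy_vmcs12 {
	uint32_t revision_id;
	uint64_t guest_cr0;
	uint64_t guest_rip;
};

enum { TOY_GUEST_CR0 = 1, TOY_GUEST_RIP = 2, TOY_NR_FIELDS };

#define TOY_OFFSET(name) offsetof(struct toy_vmcs12, name)

static const unsigned short toy_field_to_offset[TOY_NR_FIELDS] = {
	[TOY_GUEST_CR0] = TOY_OFFSET(guest_cr0),
	[TOY_GUEST_RIP] = TOY_OFFSET(guest_rip),
};

/* Bounds-check the field number, then read the value at its offset --
 * the same shape as emulating VMREAD against an in-memory vmcs12. */
static int toy_read_field(const struct toy_vmcs12 *v, unsigned field,
			  uint64_t *value)
{
	if (field >= TOY_NR_FIELDS || toy_field_to_offset[field] == 0)
		return -1;	/* unknown field */
	memcpy(value, (const char *)v + toy_field_to_offset[field],
	       sizeof(*value));
	return 0;
}

int main(void)
{
	struct toy_vmcs12 v = { 1, 0x80050033ULL, 0xffffffff81000000ULL };
	uint64_t rip;

	if (toy_read_field(&v, TOY_GUEST_RIP, &rip) == 0)
		printf("guest rip: %#llx\n", (unsigned long long)rip);
	return 0;
}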
576 | static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) | ||
577 | { | ||
578 | return to_vmx(vcpu)->nested.current_vmcs12; | ||
579 | } | ||
580 | |||
581 | static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) | ||
582 | { | ||
583 | struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); | ||
584 | if (is_error_page(page)) { | ||
585 | kvm_release_page_clean(page); | ||
586 | return NULL; | ||
587 | } | ||
588 | return page; | ||
589 | } | ||
590 | |||
591 | static void nested_release_page(struct page *page) | ||
592 | { | ||
593 | kvm_release_page_dirty(page); | ||
594 | } | ||
595 | |||
596 | static void nested_release_page_clean(struct page *page) | ||
597 | { | ||
598 | kvm_release_page_clean(page); | ||
599 | } | ||
600 | |||
195 | static u64 construct_eptp(unsigned long root_hpa); | 601 | static u64 construct_eptp(unsigned long root_hpa); |
196 | static void kvm_cpu_vmxon(u64 addr); | 602 | static void kvm_cpu_vmxon(u64 addr); |
197 | static void kvm_cpu_vmxoff(void); | 603 | static void kvm_cpu_vmxoff(void); |
@@ -200,7 +606,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); | |||
200 | 606 | ||
201 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | 607 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); |
202 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | 608 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); |
203 | static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); | 609 | /* |
610 | * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed | ||
611 | * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. | ||
612 | */ | ||
613 | static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); | ||
204 | static DEFINE_PER_CPU(struct desc_ptr, host_gdt); | 614 | static DEFINE_PER_CPU(struct desc_ptr, host_gdt); |
205 | 615 | ||
206 | static unsigned long *vmx_io_bitmap_a; | 616 | static unsigned long *vmx_io_bitmap_a; |
@@ -442,6 +852,35 @@ static inline bool report_flexpriority(void) | |||
442 | return flexpriority_enabled; | 852 | return flexpriority_enabled; |
443 | } | 853 | } |
444 | 854 | ||
855 | static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) | ||
856 | { | ||
857 | return vmcs12->cpu_based_vm_exec_control & bit; | ||
858 | } | ||
859 | |||
860 | static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) | ||
861 | { | ||
862 | return (vmcs12->cpu_based_vm_exec_control & | ||
863 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && | ||
864 | (vmcs12->secondary_vm_exec_control & bit); | ||
865 | } | ||
866 | |||
867 | static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, | ||
868 | struct kvm_vcpu *vcpu) | ||
869 | { | ||
870 | return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; | ||
871 | } | ||
872 | |||
873 | static inline bool is_exception(u32 intr_info) | ||
874 | { | ||
875 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | ||
876 | == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); | ||
877 | } | ||
878 | |||
879 | static void nested_vmx_vmexit(struct kvm_vcpu *vcpu); | ||
880 | static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, | ||
881 | struct vmcs12 *vmcs12, | ||
882 | u32 reason, unsigned long qualification); | ||
883 | |||
445 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) | 884 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) |
446 | { | 885 | { |
447 | int i; | 886 | int i; |
@@ -501,6 +940,13 @@ static void vmcs_clear(struct vmcs *vmcs) | |||
501 | vmcs, phys_addr); | 940 | vmcs, phys_addr); |
502 | } | 941 | } |
503 | 942 | ||
943 | static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) | ||
944 | { | ||
945 | vmcs_clear(loaded_vmcs->vmcs); | ||
946 | loaded_vmcs->cpu = -1; | ||
947 | loaded_vmcs->launched = 0; | ||
948 | } | ||
949 | |||
504 | static void vmcs_load(struct vmcs *vmcs) | 950 | static void vmcs_load(struct vmcs *vmcs) |
505 | { | 951 | { |
506 | u64 phys_addr = __pa(vmcs); | 952 | u64 phys_addr = __pa(vmcs); |
@@ -510,29 +956,28 @@ static void vmcs_load(struct vmcs *vmcs) | |||
510 | : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) | 956 | : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) |
511 | : "cc", "memory"); | 957 | : "cc", "memory"); |
512 | if (error) | 958 | if (error) |
513 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", | 959 | printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", |
514 | vmcs, phys_addr); | 960 | vmcs, phys_addr); |
515 | } | 961 | } |
516 | 962 | ||
517 | static void __vcpu_clear(void *arg) | 963 | static void __loaded_vmcs_clear(void *arg) |
518 | { | 964 | { |
519 | struct vcpu_vmx *vmx = arg; | 965 | struct loaded_vmcs *loaded_vmcs = arg; |
520 | int cpu = raw_smp_processor_id(); | 966 | int cpu = raw_smp_processor_id(); |
521 | 967 | ||
522 | if (vmx->vcpu.cpu == cpu) | 968 | if (loaded_vmcs->cpu != cpu) |
523 | vmcs_clear(vmx->vmcs); | 969 | return; /* vcpu migration can race with cpu offline */ |
524 | if (per_cpu(current_vmcs, cpu) == vmx->vmcs) | 970 | if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) |
525 | per_cpu(current_vmcs, cpu) = NULL; | 971 | per_cpu(current_vmcs, cpu) = NULL; |
526 | list_del(&vmx->local_vcpus_link); | 972 | list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); |
527 | vmx->vcpu.cpu = -1; | 973 | loaded_vmcs_init(loaded_vmcs); |
528 | vmx->launched = 0; | ||
529 | } | 974 | } |
530 | 975 | ||
531 | static void vcpu_clear(struct vcpu_vmx *vmx) | 976 | static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) |
532 | { | 977 | { |
533 | if (vmx->vcpu.cpu == -1) | 978 | if (loaded_vmcs->cpu != -1) |
534 | return; | 979 | smp_call_function_single( |
535 | smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); | 980 | loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); |
536 | } | 981 | } |
537 | 982 | ||
538 | static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) | 983 | static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) |
@@ -585,26 +1030,26 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) | |||
585 | } | 1030 | } |
586 | } | 1031 | } |
587 | 1032 | ||
588 | static unsigned long vmcs_readl(unsigned long field) | 1033 | static __always_inline unsigned long vmcs_readl(unsigned long field) |
589 | { | 1034 | { |
590 | unsigned long value = 0; | 1035 | unsigned long value; |
591 | 1036 | ||
592 | asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) | 1037 | asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0") |
593 | : "+a"(value) : "d"(field) : "cc"); | 1038 | : "=a"(value) : "d"(field) : "cc"); |
594 | return value; | 1039 | return value; |
595 | } | 1040 | } |
596 | 1041 | ||
597 | static u16 vmcs_read16(unsigned long field) | 1042 | static __always_inline u16 vmcs_read16(unsigned long field) |
598 | { | 1043 | { |
599 | return vmcs_readl(field); | 1044 | return vmcs_readl(field); |
600 | } | 1045 | } |
601 | 1046 | ||
602 | static u32 vmcs_read32(unsigned long field) | 1047 | static __always_inline u32 vmcs_read32(unsigned long field) |
603 | { | 1048 | { |
604 | return vmcs_readl(field); | 1049 | return vmcs_readl(field); |
605 | } | 1050 | } |
606 | 1051 | ||
607 | static u64 vmcs_read64(unsigned long field) | 1052 | static __always_inline u64 vmcs_read64(unsigned long field) |
608 | { | 1053 | { |
609 | #ifdef CONFIG_X86_64 | 1054 | #ifdef CONFIG_X86_64 |
610 | return vmcs_readl(field); | 1055 | return vmcs_readl(field); |
@@ -731,6 +1176,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) | |||
731 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ | 1176 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ |
732 | if (vcpu->fpu_active) | 1177 | if (vcpu->fpu_active) |
733 | eb &= ~(1u << NM_VECTOR); | 1178 | eb &= ~(1u << NM_VECTOR); |
1179 | |||
1180 | /* When we are running a nested L2 guest and L1 specified for it a | ||
1181 | * certain exception bitmap, we must trap the same exceptions and pass | ||
1182 | * them to L1. When running L2, we will only handle the exceptions | ||
1183 | * specified above if L1 did not want them. | ||
1184 | */ | ||
1185 | if (is_guest_mode(vcpu)) | ||
1186 | eb |= get_vmcs12(vcpu)->exception_bitmap; | ||
1187 | |||
734 | vmcs_write32(EXCEPTION_BITMAP, eb); | 1188 | vmcs_write32(EXCEPTION_BITMAP, eb); |
735 | } | 1189 | } |
736 | 1190 | ||
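When the vcpu is in guest mode, the exception bitmap written to the hardware VMCS is effectively the union of the exceptions L0 itself wants to intercept and those L1 asked for in vmcs12. A minimal model of that merge (vector numbers are architectural assumptions):

#include <stdint.h>
#include <stdio.h>

/* Architectural vector numbers, restated here as assumptions. */
#define DB_VECTOR 1
#define NM_VECTOR 7
#define PF_VECTOR 14

/* eb |= l1_bits is the whole trick: in guest mode, trap everything L0
 * needs plus everything L1 asked for, and sort out ownership on exit. */
static uint32_t merge_exception_bitmap(uint32_t l0_bits, int guest_mode,
				       uint32_t l1_bits)
{
	uint32_t eb = l0_bits;

	if (guest_mode)
		eb |= l1_bits;
	return eb;
}

int main(void)
{
	uint32_t eb = merge_exception_bitmap(1u << PF_VECTOR, 1,
					     (1u << DB_VECTOR) | (1u << NM_VECTOR));

	printf("exception bitmap: %#x\n", eb);
	return 0;
}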
@@ -971,22 +1425,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
971 | 1425 | ||
972 | if (!vmm_exclusive) | 1426 | if (!vmm_exclusive) |
973 | kvm_cpu_vmxon(phys_addr); | 1427 | kvm_cpu_vmxon(phys_addr); |
974 | else if (vcpu->cpu != cpu) | 1428 | else if (vmx->loaded_vmcs->cpu != cpu) |
975 | vcpu_clear(vmx); | 1429 | loaded_vmcs_clear(vmx->loaded_vmcs); |
976 | 1430 | ||
977 | if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { | 1431 | if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { |
978 | per_cpu(current_vmcs, cpu) = vmx->vmcs; | 1432 | per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; |
979 | vmcs_load(vmx->vmcs); | 1433 | vmcs_load(vmx->loaded_vmcs->vmcs); |
980 | } | 1434 | } |
981 | 1435 | ||
982 | if (vcpu->cpu != cpu) { | 1436 | if (vmx->loaded_vmcs->cpu != cpu) { |
983 | struct desc_ptr *gdt = &__get_cpu_var(host_gdt); | 1437 | struct desc_ptr *gdt = &__get_cpu_var(host_gdt); |
984 | unsigned long sysenter_esp; | 1438 | unsigned long sysenter_esp; |
985 | 1439 | ||
986 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | 1440 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); |
987 | local_irq_disable(); | 1441 | local_irq_disable(); |
988 | list_add(&vmx->local_vcpus_link, | 1442 | list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, |
989 | &per_cpu(vcpus_on_cpu, cpu)); | 1443 | &per_cpu(loaded_vmcss_on_cpu, cpu)); |
990 | local_irq_enable(); | 1444 | local_irq_enable(); |
991 | 1445 | ||
992 | /* | 1446 | /* |
@@ -998,6 +1452,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
998 | 1452 | ||
999 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); | 1453 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); |
1000 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ | 1454 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ |
1455 | vmx->loaded_vmcs->cpu = cpu; | ||
1001 | } | 1456 | } |
1002 | } | 1457 | } |
1003 | 1458 | ||
@@ -1005,7 +1460,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) | |||
1005 | { | 1460 | { |
1006 | __vmx_load_host_state(to_vmx(vcpu)); | 1461 | __vmx_load_host_state(to_vmx(vcpu)); |
1007 | if (!vmm_exclusive) { | 1462 | if (!vmm_exclusive) { |
1008 | __vcpu_clear(to_vmx(vcpu)); | 1463 | __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); |
1464 | vcpu->cpu = -1; | ||
1009 | kvm_cpu_vmxoff(); | 1465 | kvm_cpu_vmxoff(); |
1010 | } | 1466 | } |
1011 | } | 1467 | } |
@@ -1023,19 +1479,55 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) | |||
1023 | vmcs_writel(GUEST_CR0, cr0); | 1479 | vmcs_writel(GUEST_CR0, cr0); |
1024 | update_exception_bitmap(vcpu); | 1480 | update_exception_bitmap(vcpu); |
1025 | vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; | 1481 | vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; |
1482 | if (is_guest_mode(vcpu)) | ||
1483 | vcpu->arch.cr0_guest_owned_bits &= | ||
1484 | ~get_vmcs12(vcpu)->cr0_guest_host_mask; | ||
1026 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | 1485 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); |
1027 | } | 1486 | } |
1028 | 1487 | ||
1029 | static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); | 1488 | static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); |
1030 | 1489 | ||
1490 | /* | ||
1491 | * Return the cr0 value that a nested guest would read. This is a combination | ||
1492 | * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by | ||
1493 | * its hypervisor (cr0_read_shadow). | ||
1494 | */ | ||
1495 | static inline unsigned long nested_read_cr0(struct vmcs12 *fields) | ||
1496 | { | ||
1497 | return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | | ||
1498 | (fields->cr0_read_shadow & fields->cr0_guest_host_mask); | ||
1499 | } | ||
1500 | static inline unsigned long nested_read_cr4(struct vmcs12 *fields) | ||
1501 | { | ||
1502 | return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | | ||
1503 | (fields->cr4_read_shadow & fields->cr4_guest_host_mask); | ||
1504 | } | ||
1505 | |||
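As an aside, the guest/host mask combination that nested_read_cr0()/nested_read_cr4() implement can be tried in isolation. A minimal user-space sketch with invented register values (not taken from this patch):

#include <stdio.h>
#include <stdint.h>

/* For each bit set in the guest/host mask, L1 shadows that bit, so the nested
 * guest reads it from the read shadow; every other bit comes from guest_cr. */
static uint64_t nested_read_cr(uint64_t guest_cr, uint64_t read_shadow, uint64_t mask)
{
	return (guest_cr & ~mask) | (read_shadow & mask);
}

int main(void)
{
	/* Illustrative values only: the CPU runs L2 with TS (bit 3) forced on,
	 * but L1 shadows TS and wants L2 to see it off. */
	uint64_t guest_cr0   = 0x8000003b;	/* what the hardware actually uses */
	uint64_t read_shadow = 0x80000033;	/* what L1 wants L2 to observe */
	uint64_t mask        = 0x00000008;	/* only X86_CR0_TS is shadowed */

	printf("L2 reads cr0 = %#llx\n",	/* prints 0x80000033 */
	       (unsigned long long)nested_read_cr(guest_cr0, read_shadow, mask));
	return 0;
}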
1031 | static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) | 1506 | static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) |
1032 | { | 1507 | { |
1508 | /* Note that there is no vcpu->fpu_active = 0 here. The caller must | ||
1509 | * set this *before* calling this function. | ||
1510 | */ | ||
1033 | vmx_decache_cr0_guest_bits(vcpu); | 1511 | vmx_decache_cr0_guest_bits(vcpu); |
1034 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); | 1512 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); |
1035 | update_exception_bitmap(vcpu); | 1513 | update_exception_bitmap(vcpu); |
1036 | vcpu->arch.cr0_guest_owned_bits = 0; | 1514 | vcpu->arch.cr0_guest_owned_bits = 0; |
1037 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | 1515 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); |
1038 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); | 1516 | if (is_guest_mode(vcpu)) { |
1517 | /* | ||
1518 | * L1's specified read shadow might not contain the TS bit, | ||
1519 | * so now that we turned on shadowing of this bit, we need to | ||
1520 | * set this bit of the shadow. Like in nested_vmx_run we need | ||
1521 | * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet | ||
1522 | * up-to-date here because we just decached cr0.TS (and we'll | ||
1523 | * only update vmcs12->guest_cr0 on nested exit). | ||
1524 | */ | ||
1525 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
1526 | vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | | ||
1527 | (vcpu->arch.cr0 & X86_CR0_TS); | ||
1528 | vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); | ||
1529 | } else | ||
1530 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); | ||
1039 | } | 1531 | } |
1040 | 1532 | ||
1041 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | 1533 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) |
@@ -1119,6 +1611,25 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu) | |||
1119 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); | 1611 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); |
1120 | } | 1612 | } |
1121 | 1613 | ||
1614 | /* | ||
1615 | * KVM wants to inject page-faults which it received into the guest. This function | ||
1616 | * checks whether, in a nested guest, they need to be injected into L1 or L2. | ||
1617 | * This function assumes it is called with the exit reason in vmcs02 being | ||
1618 | * a #PF exception (this is the only case in which KVM injects a #PF when L2 | ||
1619 | * is running). | ||
1620 | */ | ||
1621 | static int nested_pf_handled(struct kvm_vcpu *vcpu) | ||
1622 | { | ||
1623 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
1624 | |||
1625 | /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ | ||
1626 | if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR))) | ||
1627 | return 0; | ||
1628 | |||
1629 | nested_vmx_vmexit(vcpu); | ||
1630 | return 1; | ||
1631 | } | ||
1632 | |||
1122 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | 1633 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
1123 | bool has_error_code, u32 error_code, | 1634 | bool has_error_code, u32 error_code, |
1124 | bool reinject) | 1635 | bool reinject) |
@@ -1126,6 +1637,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
1126 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1637 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1127 | u32 intr_info = nr | INTR_INFO_VALID_MASK; | 1638 | u32 intr_info = nr | INTR_INFO_VALID_MASK; |
1128 | 1639 | ||
1640 | if (nr == PF_VECTOR && is_guest_mode(vcpu) && | ||
1641 | nested_pf_handled(vcpu)) | ||
1642 | return; | ||
1643 | |||
1129 | if (has_error_code) { | 1644 | if (has_error_code) { |
1130 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | 1645 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); |
1131 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; | 1646 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; |
@@ -1248,12 +1763,24 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) | |||
1248 | static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | 1763 | static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) |
1249 | { | 1764 | { |
1250 | vmcs_write64(TSC_OFFSET, offset); | 1765 | vmcs_write64(TSC_OFFSET, offset); |
1766 | if (is_guest_mode(vcpu)) | ||
1767 | /* | ||
1768 | * We're here if L1 chose not to trap the TSC MSR. Since | ||
1769 | * prepare_vmcs12() does not copy tsc_offset, we need to also | ||
1770 | * set the vmcs12 field here. | ||
1771 | */ | ||
1772 | get_vmcs12(vcpu)->tsc_offset = offset - | ||
1773 | to_vmx(vcpu)->nested.vmcs01_tsc_offset; | ||
1251 | } | 1774 | } |
1252 | 1775 | ||
1253 | static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) | 1776 | static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) |
1254 | { | 1777 | { |
1255 | u64 offset = vmcs_read64(TSC_OFFSET); | 1778 | u64 offset = vmcs_read64(TSC_OFFSET); |
1256 | vmcs_write64(TSC_OFFSET, offset + adjustment); | 1779 | vmcs_write64(TSC_OFFSET, offset + adjustment); |
1780 | if (is_guest_mode(vcpu)) { | ||
1781 | /* Even when running L2, the adjustment needs to apply to L1 */ | ||
1782 | to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; | ||
1783 | } | ||
1257 | } | 1784 | } |
1258 | 1785 | ||
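A small sketch of the TSC-offset bookkeeping above, with invented numbers: while L2 runs, the hardware offset is the sum of L0's offset for L1 (vmcs01_tsc_offset) and L1's requested offset for L2, so writes and adjustments have to be split back into the right field.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Invented values: L0's TSC offset for L1, and L1's offset for L2. */
	int64_t vmcs01_tsc_offset = -1000000;	/* what vmcs01 would carry */
	int64_t vmcs12_tsc_offset =   250000;	/* what L1 asked for L2 */

	/* While L2 runs, the active VMCS (vmcs02) carries the sum. */
	int64_t hw_offset = vmcs01_tsc_offset + vmcs12_tsc_offset;
	printf("vmcs02 offset = %lld\n", (long long)hw_offset);

	/* vmx_write_tsc_offset() with L2 active: vmcs12's share is recovered by
	 * subtracting the saved vmcs01 offset from the new hardware value. */
	hw_offset = -900000;
	vmcs12_tsc_offset = hw_offset - vmcs01_tsc_offset;
	printf("vmcs12 offset = %lld\n", (long long)vmcs12_tsc_offset);

	/* vmx_adjust_tsc_offset(): the delta must also land in L1's offset. */
	hw_offset += 5000;
	vmcs01_tsc_offset += 5000;
	printf("vmcs01 offset = %lld\n", (long long)vmcs01_tsc_offset);
	return 0;
}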
1259 | static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) | 1786 | static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) |
@@ -1261,6 +1788,236 @@ static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) | |||
1261 | return target_tsc - native_read_tsc(); | 1788 | return target_tsc - native_read_tsc(); |
1262 | } | 1789 | } |
1263 | 1790 | ||
1791 | static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) | ||
1792 | { | ||
1793 | struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); | ||
1794 | return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31))); | ||
1795 | } | ||
1796 | |||
1797 | /* | ||
1798 | * nested_vmx_allowed() checks whether a guest should be allowed to use VMX | ||
1799 | * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for | ||
1800 | * all guests if the "nested" module option is off, and can also be disabled | ||
1801 | * for a single guest by disabling its VMX cpuid bit. | ||
1802 | */ | ||
1803 | static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) | ||
1804 | { | ||
1805 | return nested && guest_cpuid_has_vmx(vcpu); | ||
1806 | } | ||
1807 | |||
1808 | /* | ||
1809 | * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be | ||
1810 | * returned for the various VMX controls MSRs when nested VMX is enabled. | ||
1811 | * The same values should also be used to verify that vmcs12 control fields are | ||
1812 | * valid during nested entry from L1 to L2. | ||
1813 | * Each of these control msrs has a low and high 32-bit half: A low bit is on | ||
1814 | * if the corresponding bit in the (32-bit) control field *must* be on, and a | ||
1815 | * bit in the high half is on if the corresponding bit in the control field | ||
1816 | * may be on. See also vmx_control_verify(). | ||
1817 | * TODO: allow these variables to be modified (downgraded) by module options | ||
1818 | * or other means. | ||
1819 | */ | ||
1820 | static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; | ||
1821 | static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; | ||
1822 | static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; | ||
1823 | static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; | ||
1824 | static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; | ||
1825 | static __init void nested_vmx_setup_ctls_msrs(void) | ||
1826 | { | ||
1827 | /* | ||
1828 | * Note that as a general rule, the high half of the MSRs (bits in | ||
1829 | * the control fields which may be 1) should be initialized by the | ||
1830 | * intersection of the underlying hardware's MSR (i.e., features which | ||
1831 | * can be supported) and the list of features we want to expose - | ||
1832 | * because they are known to be properly supported in our code. | ||
1833 | * Also, usually, the low half of the MSRs (bits which must be 1) can | ||
1834 | * be set to 0, meaning that L1 may turn off any of these bits. The | ||
1835 | * reason is that if one of these bits is necessary, it will appear | ||
1836 | * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control | ||
1837 | * fields of vmcs01 and vmcs02, will turn these bits off - and | ||
1838 | * nested_vmx_exit_handled() will not pass related exits to L1. | ||
1839 | * These rules have exceptions below. | ||
1840 | */ | ||
1841 | |||
1842 | /* pin-based controls */ | ||
1843 | /* | ||
1844 | * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is | ||
1845 | * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. | ||
1846 | */ | ||
1847 | nested_vmx_pinbased_ctls_low = 0x16; | ||
1848 | nested_vmx_pinbased_ctls_high = 0x16 | | ||
1849 | PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | | ||
1850 | PIN_BASED_VIRTUAL_NMIS; | ||
1851 | |||
1852 | /* exit controls */ | ||
1853 | nested_vmx_exit_ctls_low = 0; | ||
1854 | /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ | ||
1855 | #ifdef CONFIG_X86_64 | ||
1856 | nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; | ||
1857 | #else | ||
1858 | nested_vmx_exit_ctls_high = 0; | ||
1859 | #endif | ||
1860 | |||
1861 | /* entry controls */ | ||
1862 | rdmsr(MSR_IA32_VMX_ENTRY_CTLS, | ||
1863 | nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); | ||
1864 | nested_vmx_entry_ctls_low = 0; | ||
1865 | nested_vmx_entry_ctls_high &= | ||
1866 | VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; | ||
1867 | |||
1868 | /* cpu-based controls */ | ||
1869 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, | ||
1870 | nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); | ||
1871 | nested_vmx_procbased_ctls_low = 0; | ||
1872 | nested_vmx_procbased_ctls_high &= | ||
1873 | CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING | | ||
1874 | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | | ||
1875 | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | | ||
1876 | CPU_BASED_CR3_STORE_EXITING | | ||
1877 | #ifdef CONFIG_X86_64 | ||
1878 | CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | | ||
1879 | #endif | ||
1880 | CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | | ||
1881 | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | | ||
1882 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | ||
1883 | /* | ||
1884 | * We can allow some features even when not supported by the | ||
1885 | * hardware. For example, L1 can specify an MSR bitmap - and we | ||
1886 | * can use it to avoid exits to L1 - even when L0 runs L2 | ||
1887 | * without MSR bitmaps. | ||
1888 | */ | ||
1889 | nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS; | ||
1890 | |||
1891 | /* secondary cpu-based controls */ | ||
1892 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, | ||
1893 | nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); | ||
1894 | nested_vmx_secondary_ctls_low = 0; | ||
1895 | nested_vmx_secondary_ctls_high &= | ||
1896 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
1897 | } | ||
1898 | |||
1899 | static inline bool vmx_control_verify(u32 control, u32 low, u32 high) | ||
1900 | { | ||
1901 | /* | ||
1902 | * Bits 0 in high must be 0, and bits 1 in low must be 1. | ||
1903 | */ | ||
1904 | return ((control & high) | low) == control; | ||
1905 | } | ||
1906 | |||
1907 | static inline u64 vmx_control_msr(u32 low, u32 high) | ||
1908 | { | ||
1909 | return low | ((u64)high << 32); | ||
1910 | } | ||
1911 | |||
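The low/high capability-MSR convention can be exercised on its own. A minimal sketch mirroring vmx_control_verify() and vmx_control_msr(), using arbitrary control values rather than real hardware bits:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Bits set in 'low' must be 1 in the control; bits clear in 'high' must be 0. */
static bool vmx_control_verify(uint32_t control, uint32_t low, uint32_t high)
{
	return ((control & high) | low) == control;
}

/* The capability MSR reports 'low' in bits 31:0 and 'high' in bits 63:32. */
static uint64_t vmx_control_msr(uint32_t low, uint32_t high)
{
	return low | ((uint64_t)high << 32);
}

int main(void)
{
	uint32_t low = 0x16, high = 0x3f;	/* arbitrary example values */

	printf("msr = %#llx\n", (unsigned long long)vmx_control_msr(low, high));
	printf("0x16 ok? %d\n", vmx_control_verify(0x16, low, high)); /* 1 */
	printf("0x10 ok? %d\n", vmx_control_verify(0x10, low, high)); /* 0: must-be-1 bits missing */
	printf("0x56 ok? %d\n", vmx_control_verify(0x56, low, high)); /* 0: bit 6 not allowed */
	return 0;
}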
1912 | /* | ||
1913 | * If we allow our guest to use VMX instructions (i.e., nested VMX), we should | ||
1914 | * also let it use VMX-specific MSRs. | ||
1915 | * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a | ||
1916 | * VMX-specific MSR, or 0 when we haven't (and the caller should handle it | ||
1917 | * like all other MSRs). | ||
1918 | */ | ||
1919 | static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | ||
1920 | { | ||
1921 | if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC && | ||
1922 | msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) { | ||
1923 | /* | ||
1924 | * According to the spec, processors which do not support VMX | ||
1925 | * should throw a #GP(0) when VMX capability MSRs are read. | ||
1926 | */ | ||
1927 | kvm_queue_exception_e(vcpu, GP_VECTOR, 0); | ||
1928 | return 1; | ||
1929 | } | ||
1930 | |||
1931 | switch (msr_index) { | ||
1932 | case MSR_IA32_FEATURE_CONTROL: | ||
1933 | *pdata = 0; | ||
1934 | break; | ||
1935 | case MSR_IA32_VMX_BASIC: | ||
1936 | /* | ||
1937 | * This MSR reports some information about VMX support. We | ||
1938 | * should return information about the VMX we emulate for the | ||
1939 | * guest, and the VMCS structure we give it - not about the | ||
1940 | * VMX support of the underlying hardware. | ||
1941 | */ | ||
1942 | *pdata = VMCS12_REVISION | | ||
1943 | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | | ||
1944 | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); | ||
1945 | break; | ||
1946 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | ||
1947 | case MSR_IA32_VMX_PINBASED_CTLS: | ||
1948 | *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, | ||
1949 | nested_vmx_pinbased_ctls_high); | ||
1950 | break; | ||
1951 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | ||
1952 | case MSR_IA32_VMX_PROCBASED_CTLS: | ||
1953 | *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, | ||
1954 | nested_vmx_procbased_ctls_high); | ||
1955 | break; | ||
1956 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | ||
1957 | case MSR_IA32_VMX_EXIT_CTLS: | ||
1958 | *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, | ||
1959 | nested_vmx_exit_ctls_high); | ||
1960 | break; | ||
1961 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | ||
1962 | case MSR_IA32_VMX_ENTRY_CTLS: | ||
1963 | *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, | ||
1964 | nested_vmx_entry_ctls_high); | ||
1965 | break; | ||
1966 | case MSR_IA32_VMX_MISC: | ||
1967 | *pdata = 0; | ||
1968 | break; | ||
1969 | /* | ||
1970 | * These MSRs specify bits which the guest must keep fixed (on or off) | ||
1971 | * while L1 is in VMXON mode (in L1's root mode, or running an L2). | ||
1972 | * We picked the standard core2 setting. | ||
1973 | */ | ||
1974 | #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) | ||
1975 | #define VMXON_CR4_ALWAYSON X86_CR4_VMXE | ||
1976 | case MSR_IA32_VMX_CR0_FIXED0: | ||
1977 | *pdata = VMXON_CR0_ALWAYSON; | ||
1978 | break; | ||
1979 | case MSR_IA32_VMX_CR0_FIXED1: | ||
1980 | *pdata = -1ULL; | ||
1981 | break; | ||
1982 | case MSR_IA32_VMX_CR4_FIXED0: | ||
1983 | *pdata = VMXON_CR4_ALWAYSON; | ||
1984 | break; | ||
1985 | case MSR_IA32_VMX_CR4_FIXED1: | ||
1986 | *pdata = -1ULL; | ||
1987 | break; | ||
1988 | case MSR_IA32_VMX_VMCS_ENUM: | ||
1989 | *pdata = 0x1f; | ||
1990 | break; | ||
1991 | case MSR_IA32_VMX_PROCBASED_CTLS2: | ||
1992 | *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, | ||
1993 | nested_vmx_secondary_ctls_high); | ||
1994 | break; | ||
1995 | case MSR_IA32_VMX_EPT_VPID_CAP: | ||
1996 | /* Currently, no nested ept or nested vpid */ | ||
1997 | *pdata = 0; | ||
1998 | break; | ||
1999 | default: | ||
2000 | return 0; | ||
2001 | } | ||
2002 | |||
2003 | return 1; | ||
2004 | } | ||
2005 | |||
2006 | static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
2007 | { | ||
2008 | if (!nested_vmx_allowed(vcpu)) | ||
2009 | return 0; | ||
2010 | |||
2011 | if (msr_index == MSR_IA32_FEATURE_CONTROL) | ||
2012 | /* TODO: the right thing. */ | ||
2013 | return 1; | ||
2014 | /* | ||
2015 | * No need to treat VMX capability MSRs specially: If we don't handle | ||
2016 | * them, handle_wrmsr will #GP(0), which is correct (they are readonly) | ||
2017 | */ | ||
2018 | return 0; | ||
2019 | } | ||
2020 | |||
1264 | /* | 2021 | /* |
1265 | * Reads an msr value (of 'msr_index') into 'pdata'. | 2022 | * Reads an msr value (of 'msr_index') into 'pdata'. |
1266 | * Returns 0 on success, non-0 otherwise. | 2023 | * Returns 0 on success, non-0 otherwise. |
@@ -1309,6 +2066,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
1309 | /* Otherwise falls through */ | 2066 | /* Otherwise falls through */ |
1310 | default: | 2067 | default: |
1311 | vmx_load_host_state(to_vmx(vcpu)); | 2068 | vmx_load_host_state(to_vmx(vcpu)); |
2069 | if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) | ||
2070 | return 0; | ||
1312 | msr = find_msr_entry(to_vmx(vcpu), msr_index); | 2071 | msr = find_msr_entry(to_vmx(vcpu), msr_index); |
1313 | if (msr) { | 2072 | if (msr) { |
1314 | vmx_load_host_state(to_vmx(vcpu)); | 2073 | vmx_load_host_state(to_vmx(vcpu)); |
@@ -1380,6 +2139,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
1380 | return 1; | 2139 | return 1; |
1381 | /* Otherwise falls through */ | 2140 | /* Otherwise falls through */ |
1382 | default: | 2141 | default: |
2142 | if (vmx_set_vmx_msr(vcpu, msr_index, data)) | ||
2143 | break; | ||
1383 | msr = find_msr_entry(vmx, msr_index); | 2144 | msr = find_msr_entry(vmx, msr_index); |
1384 | if (msr) { | 2145 | if (msr) { |
1385 | vmx_load_host_state(vmx); | 2146 | vmx_load_host_state(vmx); |
@@ -1469,7 +2230,7 @@ static int hardware_enable(void *garbage) | |||
1469 | if (read_cr4() & X86_CR4_VMXE) | 2230 | if (read_cr4() & X86_CR4_VMXE) |
1470 | return -EBUSY; | 2231 | return -EBUSY; |
1471 | 2232 | ||
1472 | INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); | 2233 | INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); |
1473 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); | 2234 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); |
1474 | 2235 | ||
1475 | test_bits = FEATURE_CONTROL_LOCKED; | 2236 | test_bits = FEATURE_CONTROL_LOCKED; |
@@ -1493,14 +2254,14 @@ static int hardware_enable(void *garbage) | |||
1493 | return 0; | 2254 | return 0; |
1494 | } | 2255 | } |
1495 | 2256 | ||
1496 | static void vmclear_local_vcpus(void) | 2257 | static void vmclear_local_loaded_vmcss(void) |
1497 | { | 2258 | { |
1498 | int cpu = raw_smp_processor_id(); | 2259 | int cpu = raw_smp_processor_id(); |
1499 | struct vcpu_vmx *vmx, *n; | 2260 | struct loaded_vmcs *v, *n; |
1500 | 2261 | ||
1501 | list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), | 2262 | list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), |
1502 | local_vcpus_link) | 2263 | loaded_vmcss_on_cpu_link) |
1503 | __vcpu_clear(vmx); | 2264 | __loaded_vmcs_clear(v); |
1504 | } | 2265 | } |
1505 | 2266 | ||
1506 | 2267 | ||
@@ -1515,7 +2276,7 @@ static void kvm_cpu_vmxoff(void) | |||
1515 | static void hardware_disable(void *garbage) | 2276 | static void hardware_disable(void *garbage) |
1516 | { | 2277 | { |
1517 | if (vmm_exclusive) { | 2278 | if (vmm_exclusive) { |
1518 | vmclear_local_vcpus(); | 2279 | vmclear_local_loaded_vmcss(); |
1519 | kvm_cpu_vmxoff(); | 2280 | kvm_cpu_vmxoff(); |
1520 | } | 2281 | } |
1521 | write_cr4(read_cr4() & ~X86_CR4_VMXE); | 2282 | write_cr4(read_cr4() & ~X86_CR4_VMXE); |
@@ -1696,6 +2457,18 @@ static void free_vmcs(struct vmcs *vmcs) | |||
1696 | free_pages((unsigned long)vmcs, vmcs_config.order); | 2457 | free_pages((unsigned long)vmcs, vmcs_config.order); |
1697 | } | 2458 | } |
1698 | 2459 | ||
2460 | /* | ||
2461 | * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded | ||
2462 | */ | ||
2463 | static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) | ||
2464 | { | ||
2465 | if (!loaded_vmcs->vmcs) | ||
2466 | return; | ||
2467 | loaded_vmcs_clear(loaded_vmcs); | ||
2468 | free_vmcs(loaded_vmcs->vmcs); | ||
2469 | loaded_vmcs->vmcs = NULL; | ||
2470 | } | ||
2471 | |||
1699 | static void free_kvm_area(void) | 2472 | static void free_kvm_area(void) |
1700 | { | 2473 | { |
1701 | int cpu; | 2474 | int cpu; |
@@ -1756,6 +2529,9 @@ static __init int hardware_setup(void) | |||
1756 | if (!cpu_has_vmx_ple()) | 2529 | if (!cpu_has_vmx_ple()) |
1757 | ple_gap = 0; | 2530 | ple_gap = 0; |
1758 | 2531 | ||
2532 | if (nested) | ||
2533 | nested_vmx_setup_ctls_msrs(); | ||
2534 | |||
1759 | return alloc_kvm_area(); | 2535 | return alloc_kvm_area(); |
1760 | } | 2536 | } |
1761 | 2537 | ||
@@ -2041,7 +2817,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) | |||
2041 | (unsigned long *)&vcpu->arch.regs_dirty); | 2817 | (unsigned long *)&vcpu->arch.regs_dirty); |
2042 | } | 2818 | } |
2043 | 2819 | ||
2044 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); | 2820 | static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); |
2045 | 2821 | ||
2046 | static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | 2822 | static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, |
2047 | unsigned long cr0, | 2823 | unsigned long cr0, |
@@ -2139,11 +2915,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
2139 | vmcs_writel(GUEST_CR3, guest_cr3); | 2915 | vmcs_writel(GUEST_CR3, guest_cr3); |
2140 | } | 2916 | } |
2141 | 2917 | ||
2142 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 2918 | static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
2143 | { | 2919 | { |
2144 | unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? | 2920 | unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? |
2145 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); | 2921 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); |
2146 | 2922 | ||
2923 | if (cr4 & X86_CR4_VMXE) { | ||
2924 | /* | ||
2925 | * To use VMXON (and later other VMX instructions), a guest | ||
2926 | * must first be able to turn on cr4.VMXE (see handle_vmon()). | ||
2927 | * So basically the check on whether to allow nested VMX | ||
2928 | * is here. | ||
2929 | */ | ||
2930 | if (!nested_vmx_allowed(vcpu)) | ||
2931 | return 1; | ||
2932 | } else if (to_vmx(vcpu)->nested.vmxon) | ||
2933 | return 1; | ||
2934 | |||
2147 | vcpu->arch.cr4 = cr4; | 2935 | vcpu->arch.cr4 = cr4; |
2148 | if (enable_ept) { | 2936 | if (enable_ept) { |
2149 | if (!is_paging(vcpu)) { | 2937 | if (!is_paging(vcpu)) { |
@@ -2156,6 +2944,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
2156 | 2944 | ||
2157 | vmcs_writel(CR4_READ_SHADOW, cr4); | 2945 | vmcs_writel(CR4_READ_SHADOW, cr4); |
2158 | vmcs_writel(GUEST_CR4, hw_cr4); | 2946 | vmcs_writel(GUEST_CR4, hw_cr4); |
2947 | return 0; | ||
2159 | } | 2948 | } |
2160 | 2949 | ||
2161 | static void vmx_get_segment(struct kvm_vcpu *vcpu, | 2950 | static void vmx_get_segment(struct kvm_vcpu *vcpu, |
@@ -2721,18 +3510,110 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) | |||
2721 | } | 3510 | } |
2722 | 3511 | ||
2723 | /* | 3512 | /* |
3513 | * Set up the vmcs's constant host-state fields, i.e., host-state fields that | ||
3514 | * will not change in the lifetime of the guest. | ||
3515 | * Note that host-state that does change is set elsewhere. E.g., host-state | ||
3516 | * that is set differently for each CPU is set in vmx_vcpu_load(), not here. | ||
3517 | */ | ||
3518 | static void vmx_set_constant_host_state(void) | ||
3519 | { | ||
3520 | u32 low32, high32; | ||
3521 | unsigned long tmpl; | ||
3522 | struct desc_ptr dt; | ||
3523 | |||
3524 | vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ | ||
3525 | vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ | ||
3526 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ | ||
3527 | |||
3528 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ | ||
3529 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
3530 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
3531 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
3532 | vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ | ||
3533 | |||
3534 | native_store_idt(&dt); | ||
3535 | vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ | ||
3536 | |||
3537 | asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl)); | ||
3538 | vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */ | ||
3539 | |||
3540 | rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); | ||
3541 | vmcs_write32(HOST_IA32_SYSENTER_CS, low32); | ||
3542 | rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); | ||
3543 | vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ | ||
3544 | |||
3545 | if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { | ||
3546 | rdmsr(MSR_IA32_CR_PAT, low32, high32); | ||
3547 | vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); | ||
3548 | } | ||
3549 | } | ||
3550 | |||
3551 | static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) | ||
3552 | { | ||
3553 | vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; | ||
3554 | if (enable_ept) | ||
3555 | vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; | ||
3556 | if (is_guest_mode(&vmx->vcpu)) | ||
3557 | vmx->vcpu.arch.cr4_guest_owned_bits &= | ||
3558 | ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; | ||
3559 | vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); | ||
3560 | } | ||
3561 | |||
3562 | static u32 vmx_exec_control(struct vcpu_vmx *vmx) | ||
3563 | { | ||
3564 | u32 exec_control = vmcs_config.cpu_based_exec_ctrl; | ||
3565 | if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { | ||
3566 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
3567 | #ifdef CONFIG_X86_64 | ||
3568 | exec_control |= CPU_BASED_CR8_STORE_EXITING | | ||
3569 | CPU_BASED_CR8_LOAD_EXITING; | ||
3570 | #endif | ||
3571 | } | ||
3572 | if (!enable_ept) | ||
3573 | exec_control |= CPU_BASED_CR3_STORE_EXITING | | ||
3574 | CPU_BASED_CR3_LOAD_EXITING | | ||
3575 | CPU_BASED_INVLPG_EXITING; | ||
3576 | return exec_control; | ||
3577 | } | ||
3578 | |||
3579 | static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) | ||
3580 | { | ||
3581 | u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; | ||
3582 | if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) | ||
3583 | exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
3584 | if (vmx->vpid == 0) | ||
3585 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; | ||
3586 | if (!enable_ept) { | ||
3587 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | ||
3588 | enable_unrestricted_guest = 0; | ||
3589 | } | ||
3590 | if (!enable_unrestricted_guest) | ||
3591 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
3592 | if (!ple_gap) | ||
3593 | exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; | ||
3594 | return exec_control; | ||
3595 | } | ||
3596 | |||
3597 | static void ept_set_mmio_spte_mask(void) | ||
3598 | { | ||
3599 | /* | ||
3600 | * EPT Misconfigurations can be generated if the value of bits 2:0 | ||
3601 | * of an EPT paging-structure entry is 110b (write/execute). | ||
3602 | * Also, magic bits (0xffull << 49) is set to quickly identify mmio | ||
3603 | * spte. | ||
3604 | */ | ||
3605 | kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); | ||
3606 | } | ||
3607 | |||
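A sketch of what this mask provides: an spte carrying these bits has write/execute-only permissions (bits 2:0 = 110b), which guarantees an EPT misconfiguration exit, while the magic high bits make it recognizable as an MMIO marker. The is_mmio_spte() helper below is assumed for illustration; the real check lives in the MMU code, not in vmx.c.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define MMIO_SPTE_MASK ((0xffull << 49) | 0x6ull)

/* Hypothetical helper: an spte is an MMIO marker iff all mask bits are set. */
static bool is_mmio_spte(uint64_t spte)
{
	return (spte & MMIO_SPTE_MASK) == MMIO_SPTE_MASK;
}

int main(void)
{
	uint64_t gfn = 0x12345;				/* invented guest frame number */
	uint64_t mmio_spte = MMIO_SPTE_MASK | (gfn << 12);
	uint64_t normal_spte = (gfn << 12) | 0x7;	/* read/write/execute */

	printf("mask        = %#llx\n", (unsigned long long)MMIO_SPTE_MASK);
	printf("mmio spte   -> %d\n", is_mmio_spte(mmio_spte));		/* 1 */
	printf("normal spte -> %d\n", is_mmio_spte(normal_spte));	/* 0 */
	return 0;
}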
3608 | /* | ||
2724 | * Sets up the vmcs for emulated real mode. | 3609 | * Sets up the vmcs for emulated real mode. |
2725 | */ | 3610 | */ |
2726 | static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | 3611 | static int vmx_vcpu_setup(struct vcpu_vmx *vmx) |
2727 | { | 3612 | { |
2728 | u32 host_sysenter_cs, msr_low, msr_high; | 3613 | #ifdef CONFIG_X86_64 |
2729 | u32 junk; | ||
2730 | u64 host_pat; | ||
2731 | unsigned long a; | 3614 | unsigned long a; |
2732 | struct desc_ptr dt; | 3615 | #endif |
2733 | int i; | 3616 | int i; |
2734 | unsigned long kvm_vmx_return; | ||
2735 | u32 exec_control; | ||
2736 | 3617 | ||
2737 | /* I/O */ | 3618 | /* I/O */ |
2738 | vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); | 3619 | vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); |
@@ -2747,36 +3628,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2747 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, | 3628 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, |
2748 | vmcs_config.pin_based_exec_ctrl); | 3629 | vmcs_config.pin_based_exec_ctrl); |
2749 | 3630 | ||
2750 | exec_control = vmcs_config.cpu_based_exec_ctrl; | 3631 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); |
2751 | if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { | ||
2752 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
2753 | #ifdef CONFIG_X86_64 | ||
2754 | exec_control |= CPU_BASED_CR8_STORE_EXITING | | ||
2755 | CPU_BASED_CR8_LOAD_EXITING; | ||
2756 | #endif | ||
2757 | } | ||
2758 | if (!enable_ept) | ||
2759 | exec_control |= CPU_BASED_CR3_STORE_EXITING | | ||
2760 | CPU_BASED_CR3_LOAD_EXITING | | ||
2761 | CPU_BASED_INVLPG_EXITING; | ||
2762 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | ||
2763 | 3632 | ||
2764 | if (cpu_has_secondary_exec_ctrls()) { | 3633 | if (cpu_has_secondary_exec_ctrls()) { |
2765 | exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; | 3634 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, |
2766 | if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) | 3635 | vmx_secondary_exec_control(vmx)); |
2767 | exec_control &= | ||
2768 | ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
2769 | if (vmx->vpid == 0) | ||
2770 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; | ||
2771 | if (!enable_ept) { | ||
2772 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | ||
2773 | enable_unrestricted_guest = 0; | ||
2774 | } | ||
2775 | if (!enable_unrestricted_guest) | ||
2776 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
2777 | if (!ple_gap) | ||
2778 | exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; | ||
2779 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | ||
2780 | } | 3636 | } |
2781 | 3637 | ||
2782 | if (ple_gap) { | 3638 | if (ple_gap) { |
@@ -2784,20 +3640,13 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2784 | vmcs_write32(PLE_WINDOW, ple_window); | 3640 | vmcs_write32(PLE_WINDOW, ple_window); |
2785 | } | 3641 | } |
2786 | 3642 | ||
2787 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); | 3643 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); |
2788 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); | 3644 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); |
2789 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ | 3645 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ |
2790 | 3646 | ||
2791 | vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ | ||
2792 | vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ | ||
2793 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ | ||
2794 | |||
2795 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ | ||
2796 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
2797 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
2798 | vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ | 3647 | vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ |
2799 | vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ | 3648 | vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ |
2800 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | 3649 | vmx_set_constant_host_state(); |
2801 | #ifdef CONFIG_X86_64 | 3650 | #ifdef CONFIG_X86_64 |
2802 | rdmsrl(MSR_FS_BASE, a); | 3651 | rdmsrl(MSR_FS_BASE, a); |
2803 | vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ | 3652 | vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ |
@@ -2808,32 +3657,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2808 | vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ | 3657 | vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ |
2809 | #endif | 3658 | #endif |
2810 | 3659 | ||
2811 | vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ | ||
2812 | |||
2813 | native_store_idt(&dt); | ||
2814 | vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ | ||
2815 | |||
2816 | asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); | ||
2817 | vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ | ||
2818 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); | 3660 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); |
2819 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | 3661 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); |
2820 | vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); | 3662 | vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); |
2821 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); | 3663 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); |
2822 | vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); | 3664 | vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); |
2823 | 3665 | ||
2824 | rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); | ||
2825 | vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); | ||
2826 | rdmsrl(MSR_IA32_SYSENTER_ESP, a); | ||
2827 | vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ | ||
2828 | rdmsrl(MSR_IA32_SYSENTER_EIP, a); | ||
2829 | vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ | ||
2830 | |||
2831 | if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { | ||
2832 | rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); | ||
2833 | host_pat = msr_low | ((u64) msr_high << 32); | ||
2834 | vmcs_write64(HOST_IA32_PAT, host_pat); | ||
2835 | } | ||
2836 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | 3666 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { |
3667 | u32 msr_low, msr_high; | ||
3668 | u64 host_pat; | ||
2837 | rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); | 3669 | rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); |
2838 | host_pat = msr_low | ((u64) msr_high << 32); | 3670 | host_pat = msr_low | ((u64) msr_high << 32); |
2839 | /* Write the default value following host PAT */ | 3671 | /* Write the default value following host PAT */ |
@@ -2863,10 +3695,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2863 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); | 3695 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); |
2864 | 3696 | ||
2865 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); | 3697 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); |
2866 | vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; | 3698 | set_cr4_guest_host_mask(vmx); |
2867 | if (enable_ept) | ||
2868 | vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; | ||
2869 | vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); | ||
2870 | 3699 | ||
2871 | kvm_write_tsc(&vmx->vcpu, 0); | 3700 | kvm_write_tsc(&vmx->vcpu, 0); |
2872 | 3701 | ||
@@ -2990,9 +3819,25 @@ out: | |||
2990 | return ret; | 3819 | return ret; |
2991 | } | 3820 | } |
2992 | 3821 | ||
3822 | /* | ||
3823 | * In nested virtualization, check if L1 asked to exit on external interrupts. | ||
3824 | * For most existing hypervisors, this will always return true. | ||
3825 | */ | ||
3826 | static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) | ||
3827 | { | ||
3828 | return get_vmcs12(vcpu)->pin_based_vm_exec_control & | ||
3829 | PIN_BASED_EXT_INTR_MASK; | ||
3830 | } | ||
3831 | |||
2993 | static void enable_irq_window(struct kvm_vcpu *vcpu) | 3832 | static void enable_irq_window(struct kvm_vcpu *vcpu) |
2994 | { | 3833 | { |
2995 | u32 cpu_based_vm_exec_control; | 3834 | u32 cpu_based_vm_exec_control; |
3835 | if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) | ||
3836 | /* We can get here when nested_run_pending caused | ||
3837 | * vmx_interrupt_allowed() to return false. In this case, do | ||
3838 | * nothing - the interrupt will be injected later. | ||
3839 | */ | ||
3840 | return; | ||
2996 | 3841 | ||
2997 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 3842 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
2998 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; | 3843 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; |
@@ -3049,6 +3894,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
3049 | { | 3894 | { |
3050 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3895 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3051 | 3896 | ||
3897 | if (is_guest_mode(vcpu)) | ||
3898 | return; | ||
3899 | |||
3052 | if (!cpu_has_virtual_nmis()) { | 3900 | if (!cpu_has_virtual_nmis()) { |
3053 | /* | 3901 | /* |
3054 | * Tracking the NMI-blocked state in software is built upon | 3902 | * Tracking the NMI-blocked state in software is built upon |
@@ -3115,6 +3963,17 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) | |||
3115 | 3963 | ||
3116 | static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) | 3964 | static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) |
3117 | { | 3965 | { |
3966 | if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { | ||
3967 | struct vmcs12 *vmcs12; | ||
3968 | if (to_vmx(vcpu)->nested.nested_run_pending) | ||
3969 | return 0; | ||
3970 | nested_vmx_vmexit(vcpu); | ||
3971 | vmcs12 = get_vmcs12(vcpu); | ||
3972 | vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; | ||
3973 | vmcs12->vm_exit_intr_info = 0; | ||
3974 | /* fall through to normal code, but now in L1, not L2 */ | ||
3975 | } | ||
3976 | |||
3118 | return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | 3977 | return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && |
3119 | !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | 3978 | !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & |
3120 | (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); | 3979 | (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); |
@@ -3356,6 +4215,58 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
3356 | hypercall[2] = 0xc1; | 4215 | hypercall[2] = 0xc1; |
3357 | } | 4216 | } |
3358 | 4217 | ||
4218 | /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ | ||
4219 | static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) | ||
4220 | { | ||
4221 | if (to_vmx(vcpu)->nested.vmxon && | ||
4222 | ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) | ||
4223 | return 1; | ||
4224 | |||
4225 | if (is_guest_mode(vcpu)) { | ||
4226 | /* | ||
4227 | * We get here when L2 changed cr0 in a way that did not change | ||
4228 | * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), | ||
4229 | * but did change L0 shadowed bits. This can currently happen | ||
4230 | * with the TS bit: L0 may want to leave TS on (for lazy fpu | ||
4231 | * loading) while pretending to allow the guest to change it. | ||
4232 | */ | ||
4233 | if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) | | ||
4234 | (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits))) | ||
4235 | return 1; | ||
4236 | vmcs_writel(CR0_READ_SHADOW, val); | ||
4237 | return 0; | ||
4238 | } else | ||
4239 | return kvm_set_cr0(vcpu, val); | ||
4240 | } | ||
4241 | |||
4242 | static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) | ||
4243 | { | ||
4244 | if (is_guest_mode(vcpu)) { | ||
4245 | if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) | | ||
4246 | (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits))) | ||
4247 | return 1; | ||
4248 | vmcs_writel(CR4_READ_SHADOW, val); | ||
4249 | return 0; | ||
4250 | } else | ||
4251 | return kvm_set_cr4(vcpu, val); | ||
4252 | } | ||
4253 | |||
4254 | /* called to set cr0 as appropriate for clts instruction exit. */ | ||
4255 | static void handle_clts(struct kvm_vcpu *vcpu) | ||
4256 | { | ||
4257 | if (is_guest_mode(vcpu)) { | ||
4258 | /* | ||
4259 | * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS | ||
4260 | * but we did (!fpu_active). We need to keep GUEST_CR0.TS on, | ||
4261 | * just pretend it's off (also in arch.cr0 for fpu_activate). | ||
4262 | */ | ||
4263 | vmcs_writel(CR0_READ_SHADOW, | ||
4264 | vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); | ||
4265 | vcpu->arch.cr0 &= ~X86_CR0_TS; | ||
4266 | } else | ||
4267 | vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); | ||
4268 | } | ||
4269 | |||
3359 | static int handle_cr(struct kvm_vcpu *vcpu) | 4270 | static int handle_cr(struct kvm_vcpu *vcpu) |
3360 | { | 4271 | { |
3361 | unsigned long exit_qualification, val; | 4272 | unsigned long exit_qualification, val; |
@@ -3372,7 +4283,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3372 | trace_kvm_cr_write(cr, val); | 4283 | trace_kvm_cr_write(cr, val); |
3373 | switch (cr) { | 4284 | switch (cr) { |
3374 | case 0: | 4285 | case 0: |
3375 | err = kvm_set_cr0(vcpu, val); | 4286 | err = handle_set_cr0(vcpu, val); |
3376 | kvm_complete_insn_gp(vcpu, err); | 4287 | kvm_complete_insn_gp(vcpu, err); |
3377 | return 1; | 4288 | return 1; |
3378 | case 3: | 4289 | case 3: |
@@ -3380,7 +4291,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3380 | kvm_complete_insn_gp(vcpu, err); | 4291 | kvm_complete_insn_gp(vcpu, err); |
3381 | return 1; | 4292 | return 1; |
3382 | case 4: | 4293 | case 4: |
3383 | err = kvm_set_cr4(vcpu, val); | 4294 | err = handle_set_cr4(vcpu, val); |
3384 | kvm_complete_insn_gp(vcpu, err); | 4295 | kvm_complete_insn_gp(vcpu, err); |
3385 | return 1; | 4296 | return 1; |
3386 | case 8: { | 4297 | case 8: { |
@@ -3398,7 +4309,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3398 | }; | 4309 | }; |
3399 | break; | 4310 | break; |
3400 | case 2: /* clts */ | 4311 | case 2: /* clts */ |
3401 | vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); | 4312 | handle_clts(vcpu); |
3402 | trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); | 4313 | trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); |
3403 | skip_emulated_instruction(vcpu); | 4314 | skip_emulated_instruction(vcpu); |
3404 | vmx_fpu_activate(vcpu); | 4315 | vmx_fpu_activate(vcpu); |
@@ -3574,12 +4485,6 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) | |||
3574 | return 1; | 4485 | return 1; |
3575 | } | 4486 | } |
3576 | 4487 | ||
3577 | static int handle_vmx_insn(struct kvm_vcpu *vcpu) | ||
3578 | { | ||
3579 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
3580 | return 1; | ||
3581 | } | ||
3582 | |||
3583 | static int handle_invd(struct kvm_vcpu *vcpu) | 4488 | static int handle_invd(struct kvm_vcpu *vcpu) |
3584 | { | 4489 | { |
3585 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; | 4490 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; |
@@ -3777,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, | |||
3777 | static int handle_ept_misconfig(struct kvm_vcpu *vcpu) | 4682 | static int handle_ept_misconfig(struct kvm_vcpu *vcpu) |
3778 | { | 4683 | { |
3779 | u64 sptes[4]; | 4684 | u64 sptes[4]; |
3780 | int nr_sptes, i; | 4685 | int nr_sptes, i, ret; |
3781 | gpa_t gpa; | 4686 | gpa_t gpa; |
3782 | 4687 | ||
3783 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | 4688 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); |
3784 | 4689 | ||
4690 | ret = handle_mmio_page_fault_common(vcpu, gpa, true); | ||
4691 | if (likely(ret == 1)) | ||
4692 | return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == | ||
4693 | EMULATE_DONE; | ||
4694 | if (unlikely(!ret)) | ||
4695 | return 1; | ||
4696 | |||
4697 | /* It is the real ept misconfig */ | ||
3785 | printk(KERN_ERR "EPT: Misconfiguration.\n"); | 4698 | printk(KERN_ERR "EPT: Misconfiguration.\n"); |
3786 | printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); | 4699 | printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); |
3787 | 4700 | ||
@@ -3866,6 +4779,639 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu) | |||
3866 | } | 4779 | } |
3867 | 4780 | ||
3868 | /* | 4781 | /* |
4782 | * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. | ||
4783 | * We could reuse a single VMCS for all the L2 guests, but we also want the | ||
4784 | * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this | ||
4785 | * allows keeping them loaded on the processor, and in the future will allow | ||
4786 | * optimizations where prepare_vmcs02 doesn't need to set all the fields on | ||
4787 | * every entry if they never change. | ||
4788 | * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE | ||
4789 | * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. | ||
4790 | * | ||
4791 | * The following functions allocate and free a vmcs02 in this pool. | ||
4792 | */ | ||
4793 | |||
4794 | /* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ | ||
4795 | static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) | ||
4796 | { | ||
4797 | struct vmcs02_list *item; | ||
4798 | list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) | ||
4799 | if (item->vmptr == vmx->nested.current_vmptr) { | ||
4800 | list_move(&item->list, &vmx->nested.vmcs02_pool); | ||
4801 | return &item->vmcs02; | ||
4802 | } | ||
4803 | |||
4804 | if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { | ||
4805 | /* Recycle the least recently used VMCS. */ | ||
4806 | item = list_entry(vmx->nested.vmcs02_pool.prev, | ||
4807 | struct vmcs02_list, list); | ||
4808 | item->vmptr = vmx->nested.current_vmptr; | ||
4809 | list_move(&item->list, &vmx->nested.vmcs02_pool); | ||
4810 | return &item->vmcs02; | ||
4811 | } | ||
4812 | |||
4813 | /* Create a new VMCS */ | ||
4814 | item = (struct vmcs02_list *) | ||
4815 | kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); | ||
4816 | if (!item) | ||
4817 | return NULL; | ||
4818 | item->vmcs02.vmcs = alloc_vmcs(); | ||
4819 | if (!item->vmcs02.vmcs) { | ||
4820 | kfree(item); | ||
4821 | return NULL; | ||
4822 | } | ||
4823 | loaded_vmcs_init(&item->vmcs02); | ||
4824 | item->vmptr = vmx->nested.current_vmptr; | ||
4825 | list_add(&(item->list), &(vmx->nested.vmcs02_pool)); | ||
4826 | vmx->nested.vmcs02_num++; | ||
4827 | return &item->vmcs02; | ||
4828 | } | ||
4829 | |||
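A hedged user-space sketch of the pool policy described above: a most-recently-used list keyed by vmptr that recycles the tail entry once the pool is full. It imitates the list_move() behaviour with a flat array and is not the kernel's data structure.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define POOL_SIZE 3	/* stands in for VMCS02_POOL_SIZE */

/* pool[0] is the most recently used entry, pool[POOL_SIZE-1] the least. */
static uint64_t pool[POOL_SIZE];
static int pool_num;

static uint64_t get_vmcs02(uint64_t vmptr)
{
	int i;

	for (i = 0; i < pool_num; i++)
		if (pool[i] == vmptr)
			break;			/* cache hit */
	if (i == pool_num) {			/* miss */
		if (pool_num < POOL_SIZE)
			pool_num++;		/* grow the pool */
		i = pool_num - 1;		/* otherwise recycle the LRU slot */
	}
	/* Move the chosen entry to the front, as list_move() does. */
	memmove(&pool[1], &pool[0], i * sizeof(pool[0]));
	pool[0] = vmptr;
	return vmptr;
}

int main(void)
{
	uint64_t ptrs[] = { 0x1000, 0x2000, 0x3000, 0x1000, 0x4000 };
	int i;

	for (i = 0; i < (int)(sizeof(ptrs) / sizeof(ptrs[0])); i++)
		get_vmcs02(ptrs[i]);
	for (i = 0; i < pool_num; i++)
		printf("pool[%d] = %#llx\n", i, (unsigned long long)pool[i]);
	return 0;	/* 0x2000 was evicted as least recently used */
}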
4830 | /* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ | ||
4831 | static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) | ||
4832 | { | ||
4833 | struct vmcs02_list *item; | ||
4834 | list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) | ||
4835 | if (item->vmptr == vmptr) { | ||
4836 | free_loaded_vmcs(&item->vmcs02); | ||
4837 | list_del(&item->list); | ||
4838 | kfree(item); | ||
4839 | vmx->nested.vmcs02_num--; | ||
4840 | return; | ||
4841 | } | ||
4842 | } | ||
4843 | |||
4844 | /* | ||
4845 | * Free all VMCSs saved for this vcpu, except the one pointed by | ||
4846 | * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one | ||
4847 | * currently used, if running L2), and vmcs01 when running L2. | ||
4848 | */ | ||
4849 | static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) | ||
4850 | { | ||
4851 | struct vmcs02_list *item, *n; | ||
4852 | list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { | ||
4853 | if (vmx->loaded_vmcs != &item->vmcs02) | ||
4854 | free_loaded_vmcs(&item->vmcs02); | ||
4855 | list_del(&item->list); | ||
4856 | kfree(item); | ||
4857 | } | ||
4858 | vmx->nested.vmcs02_num = 0; | ||
4859 | |||
4860 | if (vmx->loaded_vmcs != &vmx->vmcs01) | ||
4861 | free_loaded_vmcs(&vmx->vmcs01); | ||
4862 | } | ||
4863 | |||
4864 | /* | ||
4865 | * Emulate the VMXON instruction. | ||
4866 | * Currently, we just remember that VMX is active, and do not save or even | ||
4867 | * inspect the argument to VMXON (the so-called "VMXON pointer") because we | ||
4868 | * do not currently need to store anything in that guest-allocated memory | ||
4870 | * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their | ||
4870 | * argument is different from the VMXON pointer (which the spec says they do). | ||
4871 | */ | ||
4872 | static int handle_vmon(struct kvm_vcpu *vcpu) | ||
4873 | { | ||
4874 | struct kvm_segment cs; | ||
4875 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
4876 | |||
4877 | /* The Intel VMX Instruction Reference lists a bunch of bits that | ||
4878 | * are prerequisite to running VMXON, most notably cr4.VMXE must be | ||
4879 | * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). | ||
4880 | * Otherwise, we should fail with #UD. We test these now: | ||
4881 | */ | ||
4882 | if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || | ||
4883 | !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || | ||
4884 | (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { | ||
4885 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
4886 | return 1; | ||
4887 | } | ||
4888 | |||
4889 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
4890 | if (is_long_mode(vcpu) && !cs.l) { | ||
4891 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
4892 | return 1; | ||
4893 | } | ||
4894 | |||
4895 | if (vmx_get_cpl(vcpu)) { | ||
4896 | kvm_inject_gp(vcpu, 0); | ||
4897 | return 1; | ||
4898 | } | ||
4899 | |||
4900 | INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); | ||
4901 | vmx->nested.vmcs02_num = 0; | ||
4902 | |||
4903 | vmx->nested.vmxon = true; | ||
4904 | |||
4905 | skip_emulated_instruction(vcpu); | ||
4906 | return 1; | ||
4907 | } | ||
4908 | |||
4909 | /* | ||
4910 | * Intel's VMX Instruction Reference specifies a common set of prerequisites | ||
4911 | * for running VMX instructions (except VMXON, whose prerequisites are | ||
4912 | * slightly different). It also specifies what exception to inject otherwise. | ||
4913 | */ | ||
4914 | static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) | ||
4915 | { | ||
4916 | struct kvm_segment cs; | ||
4917 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
4918 | |||
4919 | if (!vmx->nested.vmxon) { | ||
4920 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
4921 | return 0; | ||
4922 | } | ||
4923 | |||
4924 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
4925 | if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || | ||
4926 | (is_long_mode(vcpu) && !cs.l)) { | ||
4927 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
4928 | return 0; | ||
4929 | } | ||
4930 | |||
4931 | if (vmx_get_cpl(vcpu)) { | ||
4932 | kvm_inject_gp(vcpu, 0); | ||
4933 | return 0; | ||
4934 | } | ||
4935 | |||
4936 | return 1; | ||
4937 | } | ||
4938 | |||
4939 | /* | ||
4940 | * Free whatever needs to be freed from vmx->nested when L1 goes down, or | ||
4941 | * just stops using VMX. | ||
4942 | */ | ||
4943 | static void free_nested(struct vcpu_vmx *vmx) | ||
4944 | { | ||
4945 | if (!vmx->nested.vmxon) | ||
4946 | return; | ||
4947 | vmx->nested.vmxon = false; | ||
4948 | if (vmx->nested.current_vmptr != -1ull) { | ||
4949 | kunmap(vmx->nested.current_vmcs12_page); | ||
4950 | nested_release_page(vmx->nested.current_vmcs12_page); | ||
4951 | vmx->nested.current_vmptr = -1ull; | ||
4952 | vmx->nested.current_vmcs12 = NULL; | ||
4953 | } | ||
4954 | /* Unpin physical memory we referred to in current vmcs02 */ | ||
4955 | if (vmx->nested.apic_access_page) { | ||
4956 | nested_release_page(vmx->nested.apic_access_page); | ||
4957 | vmx->nested.apic_access_page = 0; | ||
4958 | } | ||
4959 | |||
4960 | nested_free_all_saved_vmcss(vmx); | ||
4961 | } | ||
4962 | |||
4963 | /* Emulate the VMXOFF instruction */ | ||
4964 | static int handle_vmoff(struct kvm_vcpu *vcpu) | ||
4965 | { | ||
4966 | if (!nested_vmx_check_permission(vcpu)) | ||
4967 | return 1; | ||
4968 | free_nested(to_vmx(vcpu)); | ||
4969 | skip_emulated_instruction(vcpu); | ||
4970 | return 1; | ||
4971 | } | ||
4972 | |||
4973 | /* | ||
4974 | * Decode the memory-address operand of a vmx instruction, as recorded on an | ||
4975 | * exit caused by such an instruction (run by a guest hypervisor). | ||
4976 | * On success, returns 0. When the operand is invalid, returns 1 and throws | ||
4977 | * #UD or #GP. | ||
4978 | */ | ||
4979 | static int get_vmx_mem_address(struct kvm_vcpu *vcpu, | ||
4980 | unsigned long exit_qualification, | ||
4981 | u32 vmx_instruction_info, gva_t *ret) | ||
4982 | { | ||
4983 | /* | ||
4984 | * According to Vol. 3B, "Information for VM Exits Due to Instruction | ||
4985 | * Execution", on an exit, vmx_instruction_info holds most of the | ||
4986 | * addressing components of the operand. Only the displacement part | ||
4987 | * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). | ||
4988 | * For how an actual address is calculated from all these components, | ||
4989 | * refer to Vol. 1, "Operand Addressing". | ||
4990 | */ | ||
4991 | int scaling = vmx_instruction_info & 3; | ||
4992 | int addr_size = (vmx_instruction_info >> 7) & 7; | ||
4993 | bool is_reg = vmx_instruction_info & (1u << 10); | ||
4994 | int seg_reg = (vmx_instruction_info >> 15) & 7; | ||
4995 | int index_reg = (vmx_instruction_info >> 18) & 0xf; | ||
4996 | bool index_is_valid = !(vmx_instruction_info & (1u << 22)); | ||
4997 | int base_reg = (vmx_instruction_info >> 23) & 0xf; | ||
4998 | bool base_is_valid = !(vmx_instruction_info & (1u << 27)); | ||
4999 | |||
5000 | if (is_reg) { | ||
5001 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
5002 | return 1; | ||
5003 | } | ||
5004 | |||
5005 | /* Addr = segment_base + offset */ | ||
5006 | /* offset = base + [index * scale] + displacement */ | ||
5007 | *ret = vmx_get_segment_base(vcpu, seg_reg); | ||
5008 | if (base_is_valid) | ||
5009 | *ret += kvm_register_read(vcpu, base_reg); | ||
5010 | if (index_is_valid) | ||
5011 | *ret += kvm_register_read(vcpu, index_reg)<<scaling; | ||
5012 | *ret += exit_qualification; /* holds the displacement */ | ||
5013 | |||
5014 | if (addr_size == 1) /* 32 bit */ | ||
5015 | *ret &= 0xffffffff; | ||
5016 | |||
5017 | /* | ||
5018 | * TODO: throw #GP (and return 1) in various cases that the VM* | ||
5019 | * instructions require it - e.g., offset beyond segment limit, | ||
5020 | * unusable or unreadable/unwritable segment, non-canonical 64-bit | ||
5021 | * address, and so on. Currently these are not checked. | ||
5022 | */ | ||
5023 | return 0; | ||
5024 | } | ||
5025 | |||
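For illustration, the same vmx_instruction_info bit layout decoded on a made-up value; the field positions below simply mirror the code above.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

int main(void)
{
	uint32_t info = 0x12c5e9a;	/* invented encoding, for illustration only */

	int  scaling        = info & 3;
	int  addr_size      = (info >> 7) & 7;	/* 0 = 16-bit, 1 = 32-bit, 2 = 64-bit */
	bool is_reg         = info & (1u << 10);
	int  seg_reg        = (info >> 15) & 7;
	int  index_reg      = (info >> 18) & 0xf;
	bool index_is_valid = !(info & (1u << 22));
	int  base_reg       = (info >> 23) & 0xf;
	bool base_is_valid  = !(info & (1u << 27));

	printf("scaling=%d addr_size=%d is_reg=%d seg=%d\n",
	       scaling, addr_size, is_reg, seg_reg);
	printf("index=%d (valid=%d) base=%d (valid=%d)\n",
	       index_reg, index_is_valid, base_reg, base_is_valid);
	return 0;
}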
5026 | /* | ||
5027 | * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), | ||
5028 | * set the success or error code of an emulated VMX instruction, as specified | ||
5029 | * by Vol 2B, VMX Instruction Reference, "Conventions". | ||
5030 | */ | ||
5031 | static void nested_vmx_succeed(struct kvm_vcpu *vcpu) | ||
5032 | { | ||
5033 | vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) | ||
5034 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
5035 | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); | ||
5036 | } | ||
5037 | |||
5038 | static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) | ||
5039 | { | ||
5040 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
5041 | & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | | ||
5042 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
5043 | | X86_EFLAGS_CF); | ||
5044 | } | ||
5045 | |||
5046 | static void nested_vmx_failValid(struct kvm_vcpu *vcpu, | ||
5047 | u32 vm_instruction_error) | ||
5048 | { | ||
5049 | if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { | ||
5050 | /* | ||
5051 | * failValid writes the error number to the current VMCS, which | ||
5052 | * can't be done when there isn't a current VMCS. | ||
5053 | */ | ||
5054 | nested_vmx_failInvalid(vcpu); | ||
5055 | return; | ||
5056 | } | ||
5057 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
5058 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
5059 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
5060 | | X86_EFLAGS_ZF); | ||
5061 | get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; | ||
5062 | } | ||
5063 | |||
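As a quick illustration of the "Conventions" the comment refers to, the three outcomes differ only in which arithmetic flags end up set. The sketch below restates them in user space with the architectural EFLAGS bit positions; the starting rflags value is arbitrary and, unlike failValid above, no error number is recorded anywhere.

/* User-space sketch of the VMX success/failure flag convention used above;
 * "rflags" is just a local variable, not real CPU state. */
#include <stdint.h>
#include <stdio.h>

#define X86_EFLAGS_CF 0x0001
#define X86_EFLAGS_PF 0x0004
#define X86_EFLAGS_AF 0x0010
#define X86_EFLAGS_ZF 0x0040
#define X86_EFLAGS_SF 0x0080
#define X86_EFLAGS_OF 0x0800

#define VMX_ARITH_FLAGS (X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | \
			 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)

static uint64_t vmsucceed(uint64_t rflags)
{
	return rflags & ~VMX_ARITH_FLAGS;			/* all six cleared */
}

static uint64_t vmfail_invalid(uint64_t rflags)
{
	return (rflags & ~VMX_ARITH_FLAGS) | X86_EFLAGS_CF;	/* only CF set */
}

static uint64_t vmfail_valid(uint64_t rflags)
{
	return (rflags & ~VMX_ARITH_FLAGS) | X86_EFLAGS_ZF;	/* only ZF set */
}

int main(void)
{
	uint64_t rflags = 0x246;	/* arbitrary starting value */

	printf("succeed:      %#llx\n", (unsigned long long)vmsucceed(rflags));
	printf("fail invalid: %#llx\n", (unsigned long long)vmfail_invalid(rflags));
	printf("fail valid:   %#llx\n", (unsigned long long)vmfail_valid(rflags));
	return 0;
}
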
5064 | /* Emulate the VMCLEAR instruction */ | ||
5065 | static int handle_vmclear(struct kvm_vcpu *vcpu) | ||
5066 | { | ||
5067 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
5068 | gva_t gva; | ||
5069 | gpa_t vmptr; | ||
5070 | struct vmcs12 *vmcs12; | ||
5071 | struct page *page; | ||
5072 | struct x86_exception e; | ||
5073 | |||
5074 | if (!nested_vmx_check_permission(vcpu)) | ||
5075 | return 1; | ||
5076 | |||
5077 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
5078 | vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) | ||
5079 | return 1; | ||
5080 | |||
5081 | if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, | ||
5082 | sizeof(vmptr), &e)) { | ||
5083 | kvm_inject_page_fault(vcpu, &e); | ||
5084 | return 1; | ||
5085 | } | ||
5086 | |||
5087 | if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { | ||
5088 | nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); | ||
5089 | skip_emulated_instruction(vcpu); | ||
5090 | return 1; | ||
5091 | } | ||
5092 | |||
5093 | if (vmptr == vmx->nested.current_vmptr) { | ||
5094 | kunmap(vmx->nested.current_vmcs12_page); | ||
5095 | nested_release_page(vmx->nested.current_vmcs12_page); | ||
5096 | vmx->nested.current_vmptr = -1ull; | ||
5097 | vmx->nested.current_vmcs12 = NULL; | ||
5098 | } | ||
5099 | |||
5100 | page = nested_get_page(vcpu, vmptr); | ||
5101 | if (page == NULL) { | ||
5102 | /* | ||
5103 | * For accurate processor emulation, VMCLEAR beyond available | ||
5104 | * physical memory should do nothing at all. However, it is | ||
5105 | * possible that a nested vmx bug, not a guest hypervisor bug, | ||
5106 | * resulted in this case, so let's shut down before doing any | ||
5107 | * more damage: | ||
5108 | */ | ||
5109 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | ||
5110 | return 1; | ||
5111 | } | ||
5112 | vmcs12 = kmap(page); | ||
5113 | vmcs12->launch_state = 0; | ||
5114 | kunmap(page); | ||
5115 | nested_release_page(page); | ||
5116 | |||
5117 | nested_free_vmcs02(vmx, vmptr); | ||
5118 | |||
5119 | skip_emulated_instruction(vcpu); | ||
5120 | nested_vmx_succeed(vcpu); | ||
5121 | return 1; | ||
5122 | } | ||
5123 | |||
5124 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); | ||
5125 | |||
5126 | /* Emulate the VMLAUNCH instruction */ | ||
5127 | static int handle_vmlaunch(struct kvm_vcpu *vcpu) | ||
5128 | { | ||
5129 | return nested_vmx_run(vcpu, true); | ||
5130 | } | ||
5131 | |||
5132 | /* Emulate the VMRESUME instruction */ | ||
5133 | static int handle_vmresume(struct kvm_vcpu *vcpu) | ||
5134 | { | ||
5135 | |||
5136 | return nested_vmx_run(vcpu, false); | ||
5137 | } | ||
5138 | |||
5139 | enum vmcs_field_type { | ||
5140 | VMCS_FIELD_TYPE_U16 = 0, | ||
5141 | VMCS_FIELD_TYPE_U64 = 1, | ||
5142 | VMCS_FIELD_TYPE_U32 = 2, | ||
5143 | VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 | ||
5144 | }; | ||
5145 | |||
5146 | static inline int vmcs_field_type(unsigned long field) | ||
5147 | { | ||
5148 | if (0x1 & field) /* the *_HIGH fields are all 32 bit */ | ||
5149 | return VMCS_FIELD_TYPE_U32; | ||
5150 | return (field >> 13) & 0x3; | ||
5151 | } | ||
5152 | |||
5153 | static inline int vmcs_field_readonly(unsigned long field) | ||
5154 | { | ||
5155 | return (((field >> 10) & 0x3) == 1); | ||
5156 | } | ||
5157 | |||
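These two helpers rely on the standard VMCS field-encoding layout: bit 0 marks the high half of a 64-bit field, bits 10:11 carry the field type (type 1 being read-only data) and bits 13:14 the width. The sketch below applies the same checks to two well-known encodings, GUEST_RIP (0x681e) and VM_EXIT_REASON (0x4402).

/* Sketch of the VMCS field-encoding checks above, applied to two well-known
 * encodings taken from the architectural field tables. */
#include <stdio.h>

enum vmcs_field_type {
	VMCS_FIELD_TYPE_U16 = 0,
	VMCS_FIELD_TYPE_U64 = 1,
	VMCS_FIELD_TYPE_U32 = 2,
	VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
};

static int vmcs_field_type(unsigned long field)
{
	if (field & 0x1)		/* *_HIGH halves are 32 bit */
		return VMCS_FIELD_TYPE_U32;
	return (field >> 13) & 0x3;	/* width lives in bits 13:14 */
}

static int vmcs_field_readonly(unsigned long field)
{
	return ((field >> 10) & 0x3) == 1;	/* type 1 = read-only data */
}

int main(void)
{
	unsigned long guest_rip = 0x681e, exit_reason = 0x4402;

	printf("GUEST_RIP:      type=%d readonly=%d\n",
	       vmcs_field_type(guest_rip), vmcs_field_readonly(guest_rip));
	printf("VM_EXIT_REASON: type=%d readonly=%d\n",
	       vmcs_field_type(exit_reason), vmcs_field_readonly(exit_reason));
	return 0;
}
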
5158 | /* | ||
5159 | * Read a vmcs12 field. Since these can have varying lengths and we return | ||
5160 | * one type, we chose the biggest type (u64) and zero-extend the return value | ||
5161 | * to that size. Note that the caller, handle_vmread, might need to use only | ||
5162 | * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of | ||
5163 | * 64-bit fields are to be returned). | ||
5164 | */ | ||
5165 | static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu, | ||
5166 | unsigned long field, u64 *ret) | ||
5167 | { | ||
5168 | short offset = vmcs_field_to_offset(field); | ||
5169 | char *p; | ||
5170 | |||
5171 | if (offset < 0) | ||
5172 | return 0; | ||
5173 | |||
5174 | p = ((char *)(get_vmcs12(vcpu))) + offset; | ||
5175 | |||
5176 | switch (vmcs_field_type(field)) { | ||
5177 | case VMCS_FIELD_TYPE_NATURAL_WIDTH: | ||
5178 | *ret = *((natural_width *)p); | ||
5179 | return 1; | ||
5180 | case VMCS_FIELD_TYPE_U16: | ||
5181 | *ret = *((u16 *)p); | ||
5182 | return 1; | ||
5183 | case VMCS_FIELD_TYPE_U32: | ||
5184 | *ret = *((u32 *)p); | ||
5185 | return 1; | ||
5186 | case VMCS_FIELD_TYPE_U64: | ||
5187 | *ret = *((u64 *)p); | ||
5188 | return 1; | ||
5189 | default: | ||
5190 | return 0; /* can never happen. */ | ||
5191 | } | ||
5192 | } | ||
5193 | |||
5194 | /* | ||
5195 | * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was | ||
5196 | * used before) all generate the same failure when it is missing. | ||
5197 | */ | ||
5198 | static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) | ||
5199 | { | ||
5200 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
5201 | if (vmx->nested.current_vmptr == -1ull) { | ||
5202 | nested_vmx_failInvalid(vcpu); | ||
5203 | skip_emulated_instruction(vcpu); | ||
5204 | return 0; | ||
5205 | } | ||
5206 | return 1; | ||
5207 | } | ||
5208 | |||
5209 | static int handle_vmread(struct kvm_vcpu *vcpu) | ||
5210 | { | ||
5211 | unsigned long field; | ||
5212 | u64 field_value; | ||
5213 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
5214 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
5215 | gva_t gva = 0; | ||
5216 | |||
5217 | if (!nested_vmx_check_permission(vcpu) || | ||
5218 | !nested_vmx_check_vmcs12(vcpu)) | ||
5219 | return 1; | ||
5220 | |||
5221 | /* Decode instruction info and find the field to read */ | ||
5222 | field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | ||
5223 | /* Read the field, zero-extended to a u64 field_value */ | ||
5224 | if (!vmcs12_read_any(vcpu, field, &field_value)) { | ||
5225 | nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); | ||
5226 | skip_emulated_instruction(vcpu); | ||
5227 | return 1; | ||
5228 | } | ||
5229 | /* | ||
5230 | * Now copy part of this value to register or memory, as requested. | ||
5231 | * Note that the number of bits actually copied is 32 or 64 depending | ||
5232 | * on the guest's mode (32 or 64 bit), not on the given field's length. | ||
5233 | */ | ||
5234 | if (vmx_instruction_info & (1u << 10)) { | ||
5235 | kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf), | ||
5236 | field_value); | ||
5237 | } else { | ||
5238 | if (get_vmx_mem_address(vcpu, exit_qualification, | ||
5239 | vmx_instruction_info, &gva)) | ||
5240 | return 1; | ||
5241 | /* _system ok, as nested_vmx_check_permission verified cpl=0 */ | ||
5242 | kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, | ||
5243 | &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL); | ||
5244 | } | ||
5245 | |||
5246 | nested_vmx_succeed(vcpu); | ||
5247 | skip_emulated_instruction(vcpu); | ||
5248 | return 1; | ||
5249 | } | ||
5250 | |||
5251 | |||
5252 | static int handle_vmwrite(struct kvm_vcpu *vcpu) | ||
5253 | { | ||
5254 | unsigned long field; | ||
5255 | gva_t gva; | ||
5256 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
5257 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
5258 | char *p; | ||
5259 | short offset; | ||
5260 | /* The value to write might be 32 or 64 bits, depending on L1's long | ||
5261 | * mode, and eventually we need to write that into a field of several | ||
5262 | * possible lengths. The code below first zero-extends the value to 64 | ||
5263 | * bit (field_value), and then copies only the appropriate number of | ||
5264 | * bits into the vmcs12 field. | ||
5265 | */ | ||
5266 | u64 field_value = 0; | ||
5267 | struct x86_exception e; | ||
5268 | |||
5269 | if (!nested_vmx_check_permission(vcpu) || | ||
5270 | !nested_vmx_check_vmcs12(vcpu)) | ||
5271 | return 1; | ||
5272 | |||
5273 | if (vmx_instruction_info & (1u << 10)) | ||
5274 | field_value = kvm_register_read(vcpu, | ||
5275 | (((vmx_instruction_info) >> 3) & 0xf)); | ||
5276 | else { | ||
5277 | if (get_vmx_mem_address(vcpu, exit_qualification, | ||
5278 | vmx_instruction_info, &gva)) | ||
5279 | return 1; | ||
5280 | if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, | ||
5281 | &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) { | ||
5282 | kvm_inject_page_fault(vcpu, &e); | ||
5283 | return 1; | ||
5284 | } | ||
5285 | } | ||
5286 | |||
5287 | |||
5288 | field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | ||
5289 | if (vmcs_field_readonly(field)) { | ||
5290 | nested_vmx_failValid(vcpu, | ||
5291 | VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); | ||
5292 | skip_emulated_instruction(vcpu); | ||
5293 | return 1; | ||
5294 | } | ||
5295 | |||
5296 | offset = vmcs_field_to_offset(field); | ||
5297 | if (offset < 0) { | ||
5298 | nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); | ||
5299 | skip_emulated_instruction(vcpu); | ||
5300 | return 1; | ||
5301 | } | ||
5302 | p = ((char *) get_vmcs12(vcpu)) + offset; | ||
5303 | |||
5304 | switch (vmcs_field_type(field)) { | ||
5305 | case VMCS_FIELD_TYPE_U16: | ||
5306 | *(u16 *)p = field_value; | ||
5307 | break; | ||
5308 | case VMCS_FIELD_TYPE_U32: | ||
5309 | *(u32 *)p = field_value; | ||
5310 | break; | ||
5311 | case VMCS_FIELD_TYPE_U64: | ||
5312 | *(u64 *)p = field_value; | ||
5313 | break; | ||
5314 | case VMCS_FIELD_TYPE_NATURAL_WIDTH: | ||
5315 | *(natural_width *)p = field_value; | ||
5316 | break; | ||
5317 | default: | ||
5318 | nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); | ||
5319 | skip_emulated_instruction(vcpu); | ||
5320 | return 1; | ||
5321 | } | ||
5322 | |||
5323 | nested_vmx_succeed(vcpu); | ||
5324 | skip_emulated_instruction(vcpu); | ||
5325 | return 1; | ||
5326 | } | ||
5327 | |||
5328 | /* Emulate the VMPTRLD instruction */ | ||
5329 | static int handle_vmptrld(struct kvm_vcpu *vcpu) | ||
5330 | { | ||
5331 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
5332 | gva_t gva; | ||
5333 | gpa_t vmptr; | ||
5334 | struct x86_exception e; | ||
5335 | |||
5336 | if (!nested_vmx_check_permission(vcpu)) | ||
5337 | return 1; | ||
5338 | |||
5339 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
5340 | vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) | ||
5341 | return 1; | ||
5342 | |||
5343 | if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, | ||
5344 | sizeof(vmptr), &e)) { | ||
5345 | kvm_inject_page_fault(vcpu, &e); | ||
5346 | return 1; | ||
5347 | } | ||
5348 | |||
5349 | if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { | ||
5350 | nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); | ||
5351 | skip_emulated_instruction(vcpu); | ||
5352 | return 1; | ||
5353 | } | ||
5354 | |||
5355 | if (vmx->nested.current_vmptr != vmptr) { | ||
5356 | struct vmcs12 *new_vmcs12; | ||
5357 | struct page *page; | ||
5358 | page = nested_get_page(vcpu, vmptr); | ||
5359 | if (page == NULL) { | ||
5360 | nested_vmx_failInvalid(vcpu); | ||
5361 | skip_emulated_instruction(vcpu); | ||
5362 | return 1; | ||
5363 | } | ||
5364 | new_vmcs12 = kmap(page); | ||
5365 | if (new_vmcs12->revision_id != VMCS12_REVISION) { | ||
5366 | kunmap(page); | ||
5367 | nested_release_page_clean(page); | ||
5368 | nested_vmx_failValid(vcpu, | ||
5369 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); | ||
5370 | skip_emulated_instruction(vcpu); | ||
5371 | return 1; | ||
5372 | } | ||
5373 | if (vmx->nested.current_vmptr != -1ull) { | ||
5374 | kunmap(vmx->nested.current_vmcs12_page); | ||
5375 | nested_release_page(vmx->nested.current_vmcs12_page); | ||
5376 | } | ||
5377 | |||
5378 | vmx->nested.current_vmptr = vmptr; | ||
5379 | vmx->nested.current_vmcs12 = new_vmcs12; | ||
5380 | vmx->nested.current_vmcs12_page = page; | ||
5381 | } | ||
5382 | |||
5383 | nested_vmx_succeed(vcpu); | ||
5384 | skip_emulated_instruction(vcpu); | ||
5385 | return 1; | ||
5386 | } | ||
5387 | |||
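Before adopting a new current vmcs12, handle_vmptrld() only needs two cheap checks to pass: the pointer must be page aligned and the first dword of the region must carry the expected revision id. A user-space sketch of just those checks follows; REVISION_ID is a placeholder, not necessarily the value the driver uses for VMCS12_REVISION.

/* Sketch of the validity checks handle_vmptrld() performs before adopting a
 * new current vmcs12. REVISION_ID is a placeholder value. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096
#define REVISION_ID	0x12345678u	/* stand-in for VMCS12_REVISION */

struct fake_vmcs12 {
	uint32_t revision_id;		/* first field, as in vmcs12 */
	/* ... rest of the guest-visible layout ... */
};

static bool vmptrld_ok(uint64_t vmptr, const struct fake_vmcs12 *region)
{
	if (vmptr & (PAGE_SIZE - 1))		/* VMXERR_VMPTRLD_INVALID_ADDRESS */
		return false;
	if (region->revision_id != REVISION_ID)	/* ..._INCORRECT_VMCS_REVISION_ID */
		return false;
	return true;
}

int main(void)
{
	struct fake_vmcs12 good = { .revision_id = REVISION_ID };
	struct fake_vmcs12 bad  = { .revision_id = 0 };

	printf("%d %d %d\n",
	       vmptrld_ok(0x5000, &good),	/* 1: aligned, right revision */
	       vmptrld_ok(0x5008, &good),	/* 0: not page aligned        */
	       vmptrld_ok(0x6000, &bad));	/* 0: wrong revision id       */
	return 0;
}
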
5388 | /* Emulate the VMPTRST instruction */ | ||
5389 | static int handle_vmptrst(struct kvm_vcpu *vcpu) | ||
5390 | { | ||
5391 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
5392 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
5393 | gva_t vmcs_gva; | ||
5394 | struct x86_exception e; | ||
5395 | |||
5396 | if (!nested_vmx_check_permission(vcpu)) | ||
5397 | return 1; | ||
5398 | |||
5399 | if (get_vmx_mem_address(vcpu, exit_qualification, | ||
5400 | vmx_instruction_info, &vmcs_gva)) | ||
5401 | return 1; | ||
5402 | /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ | ||
5403 | if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, | ||
5404 | (void *)&to_vmx(vcpu)->nested.current_vmptr, | ||
5405 | sizeof(u64), &e)) { | ||
5406 | kvm_inject_page_fault(vcpu, &e); | ||
5407 | return 1; | ||
5408 | } | ||
5409 | nested_vmx_succeed(vcpu); | ||
5410 | skip_emulated_instruction(vcpu); | ||
5411 | return 1; | ||
5412 | } | ||
5413 | |||
5414 | /* | ||
3869 | * The exit handlers return 1 if the exit was handled fully and guest execution | 5415 | * The exit handlers return 1 if the exit was handled fully and guest execution |
3870 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | 5416 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs |
3871 | * to be done to userspace and return 0. | 5417 | * to be done to userspace and return 0. |
@@ -3886,15 +5432,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3886 | [EXIT_REASON_INVD] = handle_invd, | 5432 | [EXIT_REASON_INVD] = handle_invd, |
3887 | [EXIT_REASON_INVLPG] = handle_invlpg, | 5433 | [EXIT_REASON_INVLPG] = handle_invlpg, |
3888 | [EXIT_REASON_VMCALL] = handle_vmcall, | 5434 | [EXIT_REASON_VMCALL] = handle_vmcall, |
3889 | [EXIT_REASON_VMCLEAR] = handle_vmx_insn, | 5435 | [EXIT_REASON_VMCLEAR] = handle_vmclear, |
3890 | [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, | 5436 | [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, |
3891 | [EXIT_REASON_VMPTRLD] = handle_vmx_insn, | 5437 | [EXIT_REASON_VMPTRLD] = handle_vmptrld, |
3892 | [EXIT_REASON_VMPTRST] = handle_vmx_insn, | 5438 | [EXIT_REASON_VMPTRST] = handle_vmptrst, |
3893 | [EXIT_REASON_VMREAD] = handle_vmx_insn, | 5439 | [EXIT_REASON_VMREAD] = handle_vmread, |
3894 | [EXIT_REASON_VMRESUME] = handle_vmx_insn, | 5440 | [EXIT_REASON_VMRESUME] = handle_vmresume, |
3895 | [EXIT_REASON_VMWRITE] = handle_vmx_insn, | 5441 | [EXIT_REASON_VMWRITE] = handle_vmwrite, |
3896 | [EXIT_REASON_VMOFF] = handle_vmx_insn, | 5442 | [EXIT_REASON_VMOFF] = handle_vmoff, |
3897 | [EXIT_REASON_VMON] = handle_vmx_insn, | 5443 | [EXIT_REASON_VMON] = handle_vmon, |
3898 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, | 5444 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, |
3899 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | 5445 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, |
3900 | [EXIT_REASON_WBINVD] = handle_wbinvd, | 5446 | [EXIT_REASON_WBINVD] = handle_wbinvd, |
@@ -3911,6 +5457,229 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3911 | static const int kvm_vmx_max_exit_handlers = | 5457 | static const int kvm_vmx_max_exit_handlers = |
3912 | ARRAY_SIZE(kvm_vmx_exit_handlers); | 5458 | ARRAY_SIZE(kvm_vmx_exit_handlers); |
3913 | 5459 | ||
5460 | /* | ||
5461 | * Return 1 if we should exit from L2 to L1 to handle an MSR access, | ||
5462 | * rather than handle it ourselves in L0. I.e., check whether L1 expressed | ||
5463 | * disinterest in the current event (read or write a specific MSR) by using an | ||
5464 | * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. | ||
5465 | */ | ||
5466 | static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, | ||
5467 | struct vmcs12 *vmcs12, u32 exit_reason) | ||
5468 | { | ||
5469 | u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
5470 | gpa_t bitmap; | ||
5471 | |||
5472 | if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS)) | ||
5473 | return 1; | ||
5474 | |||
5475 | /* | ||
5476 | * The MSR_BITMAP page is divided into four 1024-byte bitmaps, | ||
5477 | * for the four combinations of read/write and low/high MSR numbers. | ||
5478 | * First we need to figure out which of the four to use: | ||
5479 | */ | ||
5480 | bitmap = vmcs12->msr_bitmap; | ||
5481 | if (exit_reason == EXIT_REASON_MSR_WRITE) | ||
5482 | bitmap += 2048; | ||
5483 | if (msr_index >= 0xc0000000) { | ||
5484 | msr_index -= 0xc0000000; | ||
5485 | bitmap += 1024; | ||
5486 | } | ||
5487 | |||
5488 | /* Then read the msr_index'th bit from this bitmap: */ | ||
5489 | if (msr_index < 1024*8) { | ||
5490 | unsigned char b; | ||
5491 | kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1); | ||
5492 | return 1 & (b >> (msr_index & 7)); | ||
5493 | } else | ||
5494 | return 1; /* let L1 handle the wrong parameter */ | ||
5495 | } | ||
5496 | |||
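The bitmap lookup above can be reproduced in user space. The sketch below keeps the same layout (four 1024-byte bitmaps: read-low, read-high, write-low, write-high) and the same out-of-range behaviour, but operates on a local buffer rather than guest memory and skips the CPU_BASED_USE_MSR_BITMAPS check.

/* User-space sketch of the MSR-bitmap lookup performed above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static bool msr_intercepted(const uint8_t bitmap[4096], uint32_t msr, bool write)
{
	uint32_t base = write ? 2048 : 0;	/* write bitmaps start at 2048 */

	if (msr >= 0xc0000000) {		/* "high" MSR range */
		msr -= 0xc0000000;
		base += 1024;
	}
	if (msr >= 1024 * 8)			/* out of range: always exit to L1 */
		return true;
	return (bitmap[base + msr / 8] >> (msr & 7)) & 1;
}

int main(void)
{
	uint8_t bitmap[4096];

	memset(bitmap, 0, sizeof(bitmap));
	bitmap[2048 + 0x174 / 8] |= 1 << (0x174 & 7);	/* intercept WRMSR to 0x174 */

	printf("RDMSR 0x174 intercepted: %d\n", msr_intercepted(bitmap, 0x174, false));
	printf("WRMSR 0x174 intercepted: %d\n", msr_intercepted(bitmap, 0x174, true));
	return 0;
}
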
5497 | /* | ||
5498 | * Return 1 if we should exit from L2 to L1 to handle a CR access exit, | ||
5499 | * rather than handle it ourselves in L0. I.e., check if L1 wanted to | ||
5500 | * intercept (via guest_host_mask etc.) the current event. | ||
5501 | */ | ||
5502 | static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, | ||
5503 | struct vmcs12 *vmcs12) | ||
5504 | { | ||
5505 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
5506 | int cr = exit_qualification & 15; | ||
5507 | int reg = (exit_qualification >> 8) & 15; | ||
5508 | unsigned long val = kvm_register_read(vcpu, reg); | ||
5509 | |||
5510 | switch ((exit_qualification >> 4) & 3) { | ||
5511 | case 0: /* mov to cr */ | ||
5512 | switch (cr) { | ||
5513 | case 0: | ||
5514 | if (vmcs12->cr0_guest_host_mask & | ||
5515 | (val ^ vmcs12->cr0_read_shadow)) | ||
5516 | return 1; | ||
5517 | break; | ||
5518 | case 3: | ||
5519 | if ((vmcs12->cr3_target_count >= 1 && | ||
5520 | vmcs12->cr3_target_value0 == val) || | ||
5521 | (vmcs12->cr3_target_count >= 2 && | ||
5522 | vmcs12->cr3_target_value1 == val) || | ||
5523 | (vmcs12->cr3_target_count >= 3 && | ||
5524 | vmcs12->cr3_target_value2 == val) || | ||
5525 | (vmcs12->cr3_target_count >= 4 && | ||
5526 | vmcs12->cr3_target_value3 == val)) | ||
5527 | return 0; | ||
5528 | if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) | ||
5529 | return 1; | ||
5530 | break; | ||
5531 | case 4: | ||
5532 | if (vmcs12->cr4_guest_host_mask & | ||
5533 | (vmcs12->cr4_read_shadow ^ val)) | ||
5534 | return 1; | ||
5535 | break; | ||
5536 | case 8: | ||
5537 | if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) | ||
5538 | return 1; | ||
5539 | break; | ||
5540 | } | ||
5541 | break; | ||
5542 | case 2: /* clts */ | ||
5543 | if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && | ||
5544 | (vmcs12->cr0_read_shadow & X86_CR0_TS)) | ||
5545 | return 1; | ||
5546 | break; | ||
5547 | case 1: /* mov from cr */ | ||
5548 | switch (cr) { | ||
5549 | case 3: | ||
5550 | if (vmcs12->cpu_based_vm_exec_control & | ||
5551 | CPU_BASED_CR3_STORE_EXITING) | ||
5552 | return 1; | ||
5553 | break; | ||
5554 | case 8: | ||
5555 | if (vmcs12->cpu_based_vm_exec_control & | ||
5556 | CPU_BASED_CR8_STORE_EXITING) | ||
5557 | return 1; | ||
5558 | break; | ||
5559 | } | ||
5560 | break; | ||
5561 | case 3: /* lmsw */ | ||
5562 | /* | ||
5563 | * lmsw can change bits 1..3 of cr0, and only set bit 0 of | ||
5564 | * cr0. Other attempted changes are ignored, with no exit. | ||
5565 | */ | ||
5566 | if (vmcs12->cr0_guest_host_mask & 0xe & | ||
5567 | (val ^ vmcs12->cr0_read_shadow)) | ||
5568 | return 1; | ||
5569 | if ((vmcs12->cr0_guest_host_mask & 0x1) && | ||
5570 | !(vmcs12->cr0_read_shadow & 0x1) && | ||
5571 | (val & 0x1)) | ||
5572 | return 1; | ||
5573 | break; | ||
5574 | } | ||
5575 | return 0; | ||
5576 | } | ||
5577 | |||
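For reference, the fields pulled out of the exit qualification above sit at fixed positions: CR number in bits 3:0, access type in bits 5:4 (0 = MOV to CR, 1 = MOV from CR, 2 = CLTS, 3 = LMSW) and the general-purpose register in bits 11:8. A tiny sketch with an invented qualification value:

/* Sketch of the exit-qualification decoding above; the sample value would
 * correspond to a "mov %rax, %cr3". */
#include <stdio.h>

int main(void)
{
	unsigned long exit_qualification = (0 << 8) | (0 << 4) | 3;

	int cr          = exit_qualification & 15;
	int access_type = (exit_qualification >> 4) & 3;
	int reg         = (exit_qualification >> 8) & 15;

	printf("cr%d, access type %d, register %d\n", cr, access_type, reg);
	return 0;
}
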
5578 | /* | ||
5579 | * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we | ||
5580 | * should handle it ourselves in L0 (and then continue L2). Only call this | ||
5581 | * when in is_guest_mode (L2). | ||
5582 | */ | ||
5583 | static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | ||
5584 | { | ||
5585 | u32 exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
5586 | u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
5587 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
5588 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
5589 | |||
5590 | if (vmx->nested.nested_run_pending) | ||
5591 | return 0; | ||
5592 | |||
5593 | if (unlikely(vmx->fail)) { | ||
5594 | printk(KERN_INFO "%s failed vm entry %x\n", | ||
5595 | __func__, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
5596 | return 1; | ||
5597 | } | ||
5598 | |||
5599 | switch (exit_reason) { | ||
5600 | case EXIT_REASON_EXCEPTION_NMI: | ||
5601 | if (!is_exception(intr_info)) | ||
5602 | return 0; | ||
5603 | else if (is_page_fault(intr_info)) | ||
5604 | return enable_ept; | ||
5605 | return vmcs12->exception_bitmap & | ||
5606 | (1u << (intr_info & INTR_INFO_VECTOR_MASK)); | ||
5607 | case EXIT_REASON_EXTERNAL_INTERRUPT: | ||
5608 | return 0; | ||
5609 | case EXIT_REASON_TRIPLE_FAULT: | ||
5610 | return 1; | ||
5611 | case EXIT_REASON_PENDING_INTERRUPT: | ||
5612 | case EXIT_REASON_NMI_WINDOW: | ||
5613 | /* | ||
5614 | * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit | ||
5615 | * (aka Interrupt Window Exiting) only when L1 turned it on, | ||
5616 | * so if we got a PENDING_INTERRUPT exit, this must be for L1. | ||
5617 | * Same for NMI Window Exiting. | ||
5618 | */ | ||
5619 | return 1; | ||
5620 | case EXIT_REASON_TASK_SWITCH: | ||
5621 | return 1; | ||
5622 | case EXIT_REASON_CPUID: | ||
5623 | return 1; | ||
5624 | case EXIT_REASON_HLT: | ||
5625 | return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); | ||
5626 | case EXIT_REASON_INVD: | ||
5627 | return 1; | ||
5628 | case EXIT_REASON_INVLPG: | ||
5629 | return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); | ||
5630 | case EXIT_REASON_RDPMC: | ||
5631 | return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); | ||
5632 | case EXIT_REASON_RDTSC: | ||
5633 | return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); | ||
5634 | case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: | ||
5635 | case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: | ||
5636 | case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: | ||
5637 | case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: | ||
5638 | case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: | ||
5639 | /* | ||
5640 | * VMX instructions trap unconditionally. This allows L1 to | ||
5641 | * emulate them for its L2 guest, i.e., allows 3-level nesting! | ||
5642 | */ | ||
5643 | return 1; | ||
5644 | case EXIT_REASON_CR_ACCESS: | ||
5645 | return nested_vmx_exit_handled_cr(vcpu, vmcs12); | ||
5646 | case EXIT_REASON_DR_ACCESS: | ||
5647 | return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); | ||
5648 | case EXIT_REASON_IO_INSTRUCTION: | ||
5649 | /* TODO: support IO bitmaps */ | ||
5650 | return 1; | ||
5651 | case EXIT_REASON_MSR_READ: | ||
5652 | case EXIT_REASON_MSR_WRITE: | ||
5653 | return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); | ||
5654 | case EXIT_REASON_INVALID_STATE: | ||
5655 | return 1; | ||
5656 | case EXIT_REASON_MWAIT_INSTRUCTION: | ||
5657 | return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); | ||
5658 | case EXIT_REASON_MONITOR_INSTRUCTION: | ||
5659 | return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); | ||
5660 | case EXIT_REASON_PAUSE_INSTRUCTION: | ||
5661 | return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || | ||
5662 | nested_cpu_has2(vmcs12, | ||
5663 | SECONDARY_EXEC_PAUSE_LOOP_EXITING); | ||
5664 | case EXIT_REASON_MCE_DURING_VMENTRY: | ||
5665 | return 0; | ||
5666 | case EXIT_REASON_TPR_BELOW_THRESHOLD: | ||
5667 | return 1; | ||
5668 | case EXIT_REASON_APIC_ACCESS: | ||
5669 | return nested_cpu_has2(vmcs12, | ||
5670 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | ||
5671 | case EXIT_REASON_EPT_VIOLATION: | ||
5672 | case EXIT_REASON_EPT_MISCONFIG: | ||
5673 | return 0; | ||
5674 | case EXIT_REASON_WBINVD: | ||
5675 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); | ||
5676 | case EXIT_REASON_XSETBV: | ||
5677 | return 1; | ||
5678 | default: | ||
5679 | return 1; | ||
5680 | } | ||
5681 | } | ||
5682 | |||
3914 | static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) | 5683 | static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) |
3915 | { | 5684 | { |
3916 | *info1 = vmcs_readl(EXIT_QUALIFICATION); | 5685 | *info1 = vmcs_readl(EXIT_QUALIFICATION); |
@@ -3933,6 +5702,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
3933 | if (vmx->emulation_required && emulate_invalid_guest_state) | 5702 | if (vmx->emulation_required && emulate_invalid_guest_state) |
3934 | return handle_invalid_guest_state(vcpu); | 5703 | return handle_invalid_guest_state(vcpu); |
3935 | 5704 | ||
5705 | /* | ||
5706 | * the KVM_REQ_EVENT optimization bit is only on for one entry, and if | ||
5707 | * we did not inject a still-pending event to L1 now because of | ||
5708 | * nested_run_pending, we need to re-enable this bit. | ||
5709 | */ | ||
5710 | if (vmx->nested.nested_run_pending) | ||
5711 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5712 | |||
5713 | if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH || | ||
5714 | exit_reason == EXIT_REASON_VMRESUME)) | ||
5715 | vmx->nested.nested_run_pending = 1; | ||
5716 | else | ||
5717 | vmx->nested.nested_run_pending = 0; | ||
5718 | |||
5719 | if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { | ||
5720 | nested_vmx_vmexit(vcpu); | ||
5721 | return 1; | ||
5722 | } | ||
5723 | |||
3936 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { | 5724 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { |
3937 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 5725 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
3938 | vcpu->run->fail_entry.hardware_entry_failure_reason | 5726 | vcpu->run->fail_entry.hardware_entry_failure_reason |
@@ -3955,7 +5743,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
3955 | "(0x%x) and exit reason is 0x%x\n", | 5743 | "(0x%x) and exit reason is 0x%x\n", |
3956 | __func__, vectoring_info, exit_reason); | 5744 | __func__, vectoring_info, exit_reason); |
3957 | 5745 | ||
3958 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { | 5746 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && |
5747 | !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( | ||
5748 | get_vmcs12(vcpu), vcpu)))) { | ||
3959 | if (vmx_interrupt_allowed(vcpu)) { | 5749 | if (vmx_interrupt_allowed(vcpu)) { |
3960 | vmx->soft_vnmi_blocked = 0; | 5750 | vmx->soft_vnmi_blocked = 0; |
3961 | } else if (vmx->vnmi_blocked_time > 1000000000LL && | 5751 | } else if (vmx->vnmi_blocked_time > 1000000000LL && |
@@ -4118,6 +5908,8 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, | |||
4118 | 5908 | ||
4119 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | 5909 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) |
4120 | { | 5910 | { |
5911 | if (is_guest_mode(&vmx->vcpu)) | ||
5912 | return; | ||
4121 | __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, | 5913 | __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, |
4122 | VM_EXIT_INSTRUCTION_LEN, | 5914 | VM_EXIT_INSTRUCTION_LEN, |
4123 | IDT_VECTORING_ERROR_CODE); | 5915 | IDT_VECTORING_ERROR_CODE); |
@@ -4125,6 +5917,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
4125 | 5917 | ||
4126 | static void vmx_cancel_injection(struct kvm_vcpu *vcpu) | 5918 | static void vmx_cancel_injection(struct kvm_vcpu *vcpu) |
4127 | { | 5919 | { |
5920 | if (is_guest_mode(vcpu)) | ||
5921 | return; | ||
4128 | __vmx_complete_interrupts(to_vmx(vcpu), | 5922 | __vmx_complete_interrupts(to_vmx(vcpu), |
4129 | vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), | 5923 | vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), |
4130 | VM_ENTRY_INSTRUCTION_LEN, | 5924 | VM_ENTRY_INSTRUCTION_LEN, |
@@ -4145,6 +5939,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4145 | { | 5939 | { |
4146 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 5940 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4147 | 5941 | ||
5942 | if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { | ||
5943 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
5944 | if (vmcs12->idt_vectoring_info_field & | ||
5945 | VECTORING_INFO_VALID_MASK) { | ||
5946 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
5947 | vmcs12->idt_vectoring_info_field); | ||
5948 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
5949 | vmcs12->vm_exit_instruction_len); | ||
5950 | if (vmcs12->idt_vectoring_info_field & | ||
5951 | VECTORING_INFO_DELIVER_CODE_MASK) | ||
5952 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | ||
5953 | vmcs12->idt_vectoring_error_code); | ||
5954 | } | ||
5955 | } | ||
5956 | |||
4148 | /* Record the guest's net vcpu time for enforced NMI injections. */ | 5957 | /* Record the guest's net vcpu time for enforced NMI injections. */ |
4149 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) | 5958 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) |
4150 | vmx->entry_time = ktime_get(); | 5959 | vmx->entry_time = ktime_get(); |
@@ -4167,6 +5976,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4167 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | 5976 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) |
4168 | vmx_set_interrupt_shadow(vcpu, 0); | 5977 | vmx_set_interrupt_shadow(vcpu, 0); |
4169 | 5978 | ||
5979 | vmx->__launched = vmx->loaded_vmcs->launched; | ||
4170 | asm( | 5980 | asm( |
4171 | /* Store host registers */ | 5981 | /* Store host registers */ |
4172 | "push %%"R"dx; push %%"R"bp;" | 5982 | "push %%"R"dx; push %%"R"bp;" |
@@ -4237,7 +6047,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4237 | "pop %%"R"bp; pop %%"R"dx \n\t" | 6047 | "pop %%"R"bp; pop %%"R"dx \n\t" |
4238 | "setbe %c[fail](%0) \n\t" | 6048 | "setbe %c[fail](%0) \n\t" |
4239 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), | 6049 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), |
4240 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), | 6050 | [launched]"i"(offsetof(struct vcpu_vmx, __launched)), |
4241 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | 6051 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), |
4242 | [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), | 6052 | [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), |
4243 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), | 6053 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), |
@@ -4276,8 +6086,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4276 | 6086 | ||
4277 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 6087 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
4278 | 6088 | ||
6089 | if (is_guest_mode(vcpu)) { | ||
6090 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
6091 | vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info; | ||
6092 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { | ||
6093 | vmcs12->idt_vectoring_error_code = | ||
6094 | vmcs_read32(IDT_VECTORING_ERROR_CODE); | ||
6095 | vmcs12->vm_exit_instruction_len = | ||
6096 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
6097 | } | ||
6098 | } | ||
6099 | |||
4279 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | 6100 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); |
4280 | vmx->launched = 1; | 6101 | vmx->loaded_vmcs->launched = 1; |
4281 | 6102 | ||
4282 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); | 6103 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); |
4283 | 6104 | ||
@@ -4289,41 +6110,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4289 | #undef R | 6110 | #undef R |
4290 | #undef Q | 6111 | #undef Q |
4291 | 6112 | ||
4292 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) | ||
4293 | { | ||
4294 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
4295 | |||
4296 | if (vmx->vmcs) { | ||
4297 | vcpu_clear(vmx); | ||
4298 | free_vmcs(vmx->vmcs); | ||
4299 | vmx->vmcs = NULL; | ||
4300 | } | ||
4301 | } | ||
4302 | |||
4303 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) | 6113 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) |
4304 | { | 6114 | { |
4305 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 6115 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4306 | 6116 | ||
4307 | free_vpid(vmx); | 6117 | free_vpid(vmx); |
4308 | vmx_free_vmcs(vcpu); | 6118 | free_nested(vmx); |
6119 | free_loaded_vmcs(vmx->loaded_vmcs); | ||
4309 | kfree(vmx->guest_msrs); | 6120 | kfree(vmx->guest_msrs); |
4310 | kvm_vcpu_uninit(vcpu); | 6121 | kvm_vcpu_uninit(vcpu); |
4311 | kmem_cache_free(kvm_vcpu_cache, vmx); | 6122 | kmem_cache_free(kvm_vcpu_cache, vmx); |
4312 | } | 6123 | } |
4313 | 6124 | ||
4314 | static inline void vmcs_init(struct vmcs *vmcs) | ||
4315 | { | ||
4316 | u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); | ||
4317 | |||
4318 | if (!vmm_exclusive) | ||
4319 | kvm_cpu_vmxon(phys_addr); | ||
4320 | |||
4321 | vmcs_clear(vmcs); | ||
4322 | |||
4323 | if (!vmm_exclusive) | ||
4324 | kvm_cpu_vmxoff(); | ||
4325 | } | ||
4326 | |||
4327 | static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | 6125 | static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) |
4328 | { | 6126 | { |
4329 | int err; | 6127 | int err; |
@@ -4345,11 +6143,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
4345 | goto uninit_vcpu; | 6143 | goto uninit_vcpu; |
4346 | } | 6144 | } |
4347 | 6145 | ||
4348 | vmx->vmcs = alloc_vmcs(); | 6146 | vmx->loaded_vmcs = &vmx->vmcs01; |
4349 | if (!vmx->vmcs) | 6147 | vmx->loaded_vmcs->vmcs = alloc_vmcs(); |
6148 | if (!vmx->loaded_vmcs->vmcs) | ||
4350 | goto free_msrs; | 6149 | goto free_msrs; |
4351 | 6150 | if (!vmm_exclusive) | |
4352 | vmcs_init(vmx->vmcs); | 6151 | kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); |
6152 | loaded_vmcs_init(vmx->loaded_vmcs); | ||
6153 | if (!vmm_exclusive) | ||
6154 | kvm_cpu_vmxoff(); | ||
4353 | 6155 | ||
4354 | cpu = get_cpu(); | 6156 | cpu = get_cpu(); |
4355 | vmx_vcpu_load(&vmx->vcpu, cpu); | 6157 | vmx_vcpu_load(&vmx->vcpu, cpu); |
@@ -4375,10 +6177,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
4375 | goto free_vmcs; | 6177 | goto free_vmcs; |
4376 | } | 6178 | } |
4377 | 6179 | ||
6180 | vmx->nested.current_vmptr = -1ull; | ||
6181 | vmx->nested.current_vmcs12 = NULL; | ||
6182 | |||
4378 | return &vmx->vcpu; | 6183 | return &vmx->vcpu; |
4379 | 6184 | ||
4380 | free_vmcs: | 6185 | free_vmcs: |
4381 | free_vmcs(vmx->vmcs); | 6186 | free_vmcs(vmx->loaded_vmcs->vmcs); |
4382 | free_msrs: | 6187 | free_msrs: |
4383 | kfree(vmx->guest_msrs); | 6188 | kfree(vmx->guest_msrs); |
4384 | uninit_vcpu: | 6189 | uninit_vcpu: |
@@ -4512,6 +6317,650 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) | |||
4512 | 6317 | ||
4513 | static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | 6318 | static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) |
4514 | { | 6319 | { |
6320 | if (func == 1 && nested) | ||
6321 | entry->ecx |= bit(X86_FEATURE_VMX); | ||
6322 | } | ||
6323 | |||
6324 | /* | ||
6325 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested | ||
6326 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it | ||
6327 | * with L0's requirements for its guest (a.k.a. vmsc01), so we can run the L2 | ||
6328 | * guest in a way that will both be appropriate to L1's requests, and our | ||
6329 | * needs. In addition to modifying the active vmcs (which is vmcs02), this | ||
6330 | * function also has additional necessary side-effects, like setting various | ||
6331 | * vcpu->arch fields. | ||
6332 | */ | ||
6333 | static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6334 | { | ||
6335 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
6336 | u32 exec_control; | ||
6337 | |||
6338 | vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); | ||
6339 | vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); | ||
6340 | vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); | ||
6341 | vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); | ||
6342 | vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); | ||
6343 | vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); | ||
6344 | vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); | ||
6345 | vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); | ||
6346 | vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); | ||
6347 | vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); | ||
6348 | vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); | ||
6349 | vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); | ||
6350 | vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); | ||
6351 | vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); | ||
6352 | vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); | ||
6353 | vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); | ||
6354 | vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); | ||
6355 | vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); | ||
6356 | vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); | ||
6357 | vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); | ||
6358 | vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); | ||
6359 | vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); | ||
6360 | vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); | ||
6361 | vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); | ||
6362 | vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); | ||
6363 | vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); | ||
6364 | vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); | ||
6365 | vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); | ||
6366 | vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); | ||
6367 | vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); | ||
6368 | vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); | ||
6369 | vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); | ||
6370 | vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); | ||
6371 | vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); | ||
6372 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); | ||
6373 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); | ||
6374 | |||
6375 | vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); | ||
6376 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
6377 | vmcs12->vm_entry_intr_info_field); | ||
6378 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | ||
6379 | vmcs12->vm_entry_exception_error_code); | ||
6380 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
6381 | vmcs12->vm_entry_instruction_len); | ||
6382 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | ||
6383 | vmcs12->guest_interruptibility_info); | ||
6384 | vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state); | ||
6385 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); | ||
6386 | vmcs_writel(GUEST_DR7, vmcs12->guest_dr7); | ||
6387 | vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); | ||
6388 | vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, | ||
6389 | vmcs12->guest_pending_dbg_exceptions); | ||
6390 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); | ||
6391 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); | ||
6392 | |||
6393 | vmcs_write64(VMCS_LINK_POINTER, -1ull); | ||
6394 | |||
6395 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, | ||
6396 | (vmcs_config.pin_based_exec_ctrl | | ||
6397 | vmcs12->pin_based_vm_exec_control)); | ||
6398 | |||
6399 | /* | ||
6400 | * Whether page-faults are trapped is determined by a combination of | ||
6401 | * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. | ||
6402 | * If enable_ept, L0 doesn't care about page faults and we should | ||
6403 | * set all of these to L1's desires. However, if !enable_ept, L0 does | ||
6404 | * care about (at least some) page faults, and because it is not easy | ||
6405 | * (if at all possible?) to merge L0 and L1's desires, we simply ask | ||
6406 | * to exit on each and every L2 page fault. This is done by setting | ||
6407 | * MASK=MATCH=0 and (see below) EB.PF=1. | ||
6408 | * Note that below we don't need special code to set EB.PF beyond the | ||
6409 | * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, | ||
6410 | * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when | ||
6411 | * !enable_ept, EB.PF is 1, so the "or" will always be 1. | ||
6412 | * | ||
6413 | * A problem with this approach (when !enable_ept) is that L1 may be | ||
6414 | * injected with more page faults than it asked for. This could have | ||
6415 | * caused problems, but in practice existing hypervisors don't care. | ||
6416 | * To fix this, we will need to emulate the PFEC checking (on the L1 | ||
6417 | * page tables), using walk_addr(), when injecting PFs to L1. | ||
6418 | */ | ||
6419 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, | ||
6420 | enable_ept ? vmcs12->page_fault_error_code_mask : 0); | ||
6421 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, | ||
6422 | enable_ept ? vmcs12->page_fault_error_code_match : 0); | ||
6423 | |||
6424 | if (cpu_has_secondary_exec_ctrls()) { | ||
6425 | u32 exec_control = vmx_secondary_exec_control(vmx); | ||
6426 | if (!vmx->rdtscp_enabled) | ||
6427 | exec_control &= ~SECONDARY_EXEC_RDTSCP; | ||
6428 | /* Take the following fields only from vmcs12 */ | ||
6429 | exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
6430 | if (nested_cpu_has(vmcs12, | ||
6431 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) | ||
6432 | exec_control |= vmcs12->secondary_vm_exec_control; | ||
6433 | |||
6434 | if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { | ||
6435 | /* | ||
6436 | * Translate L1 physical address to host physical | ||
6437 | * address for vmcs02. Keep the page pinned, so this | ||
6438 | * physical address remains valid. We keep a reference | ||
6439 | * to it so we can release it later. | ||
6440 | */ | ||
6441 | if (vmx->nested.apic_access_page) /* shouldn't happen */ | ||
6442 | nested_release_page(vmx->nested.apic_access_page); | ||
6443 | vmx->nested.apic_access_page = | ||
6444 | nested_get_page(vcpu, vmcs12->apic_access_addr); | ||
6445 | /* | ||
6446 | * If translation failed, no matter: This feature asks | ||
6447 | * to exit when accessing the given address, and if it | ||
6448 | * can never be accessed, this feature won't do | ||
6449 | * anything anyway. | ||
6450 | */ | ||
6451 | if (!vmx->nested.apic_access_page) | ||
6452 | exec_control &= | ||
6453 | ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
6454 | else | ||
6455 | vmcs_write64(APIC_ACCESS_ADDR, | ||
6456 | page_to_phys(vmx->nested.apic_access_page)); | ||
6457 | } | ||
6458 | |||
6459 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | ||
6460 | } | ||
6461 | |||
6462 | |||
6463 | /* | ||
6464 | * Set host-state according to L0's settings (vmcs12 is irrelevant here) | ||
6465 | * Some constant fields are set here by vmx_set_constant_host_state(). | ||
6466 | * Other fields are different per CPU, and will be set later when | ||
6467 | * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. | ||
6468 | */ | ||
6469 | vmx_set_constant_host_state(); | ||
6470 | |||
6471 | /* | ||
6472 | * HOST_RSP is normally set correctly in vmx_vcpu_run() just before | ||
6473 | * entry, but only if the current (host) sp changed from the value | ||
6474 | * we wrote last (vmx->host_rsp). This cache is no longer relevant | ||
6475 | * if we switch vmcs, and rather than hold a separate cache per vmcs, | ||
6476 | * here we just force the write to happen on entry. | ||
6477 | */ | ||
6478 | vmx->host_rsp = 0; | ||
6479 | |||
6480 | exec_control = vmx_exec_control(vmx); /* L0's desires */ | ||
6481 | exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | ||
6482 | exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; | ||
6483 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
6484 | exec_control |= vmcs12->cpu_based_vm_exec_control; | ||
6485 | /* | ||
6486 | * Merging of IO and MSR bitmaps not currently supported. | ||
6487 | * Rather, exit every time. | ||
6488 | */ | ||
6489 | exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; | ||
6490 | exec_control &= ~CPU_BASED_USE_IO_BITMAPS; | ||
6491 | exec_control |= CPU_BASED_UNCOND_IO_EXITING; | ||
6492 | |||
6493 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | ||
6494 | |||
6495 | /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the | ||
6496 | * bitwise-or of what L1 wants to trap for L2, and what we want to | ||
6497 | * trap. Note that CR0.TS also needs updating - we do this later. | ||
6498 | */ | ||
6499 | update_exception_bitmap(vcpu); | ||
6500 | vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; | ||
6501 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | ||
6502 | |||
6503 | /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ | ||
6504 | vmcs_write32(VM_EXIT_CONTROLS, | ||
6505 | vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); | ||
6506 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | | ||
6507 | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); | ||
6508 | |||
6509 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) | ||
6510 | vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); | ||
6511 | else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) | ||
6512 | vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); | ||
6513 | |||
6514 | |||
6515 | set_cr4_guest_host_mask(vmx); | ||
6516 | |||
6517 | vmcs_write64(TSC_OFFSET, | ||
6518 | vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); | ||
6519 | |||
6520 | if (enable_vpid) { | ||
6521 | /* | ||
6522 | * Trivially support vpid by letting L2s share their parent | ||
6523 | * L1's vpid. TODO: move to a more elaborate solution, giving | ||
6524 | * each L2 its own vpid and exposing the vpid feature to L1. | ||
6525 | */ | ||
6526 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); | ||
6527 | vmx_flush_tlb(vcpu); | ||
6528 | } | ||
6529 | |||
6530 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) | ||
6531 | vcpu->arch.efer = vmcs12->guest_ia32_efer; | ||
6532 | if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) | ||
6533 | vcpu->arch.efer |= (EFER_LMA | EFER_LME); | ||
6534 | else | ||
6535 | vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); | ||
6536 | /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ | ||
6537 | vmx_set_efer(vcpu, vcpu->arch.efer); | ||
6538 | |||
6539 | /* | ||
6540 | * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified | ||
6541 | * TS bit (for lazy fpu) and bits which we consider mandatory enabled. | ||
6542 | * The CR0_READ_SHADOW is what L2 should have expected to read given | ||
6543 | * the specifications by L1; It's not enough to take | ||
6544 | * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may | ||
6545 | * cover more bits than L1 expected. | ||
6546 | */ | ||
6547 | vmx_set_cr0(vcpu, vmcs12->guest_cr0); | ||
6548 | vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); | ||
6549 | |||
6550 | vmx_set_cr4(vcpu, vmcs12->guest_cr4); | ||
6551 | vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); | ||
6552 | |||
6553 | /* shadow page tables on either EPT or shadow page tables */ | ||
6554 | kvm_set_cr3(vcpu, vmcs12->guest_cr3); | ||
6555 | kvm_mmu_reset_context(vcpu); | ||
6556 | |||
6557 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); | ||
6558 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); | ||
6559 | } | ||
6560 | |||
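One detail worth isolating from prepare_vmcs02() is how the primary processor-based controls are merged: start from L0's own settings, drop the window-exiting and TPR-shadow bits, OR in everything vmcs12 requests, then withdraw MSR and I/O bitmap usage (their merging is not implemented) in favour of unconditional I/O exiting. A small numeric sketch; the macro values follow the architectural CPU_BASED_* bit positions but the chosen L0/L1 settings are invented.

/* Numeric sketch of the exec-control merge in prepare_vmcs02(). */
#include <stdio.h>

#define VIRTUAL_INTR_PENDING	(1u << 2)
#define HLT_EXITING		(1u << 7)
#define TPR_SHADOW		(1u << 21)
#define VIRTUAL_NMI_PENDING	(1u << 22)
#define UNCOND_IO_EXITING	(1u << 24)
#define USE_IO_BITMAPS		(1u << 25)
#define USE_MSR_BITMAPS		(1u << 28)

int main(void)
{
	unsigned int l0_wants = VIRTUAL_INTR_PENDING | TPR_SHADOW | USE_MSR_BITMAPS;
	unsigned int l1_wants = HLT_EXITING | USE_IO_BITMAPS;

	unsigned int exec = l0_wants;
	exec &= ~(VIRTUAL_INTR_PENDING | VIRTUAL_NMI_PENDING | TPR_SHADOW);
	exec |= l1_wants;				/* everything L1 asked for    */
	exec &= ~(USE_MSR_BITMAPS | USE_IO_BITMAPS);	/* bitmap merging unsupported */
	exec |= UNCOND_IO_EXITING;			/* so exit on every I/O access */

	printf("vmcs02 primary exec controls: %#x\n", exec);
	return 0;
}
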
6561 | /* | ||
6562 | * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 | ||
6563 | * for running an L2 nested guest. | ||
6564 | */ | ||
6565 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | ||
6566 | { | ||
6567 | struct vmcs12 *vmcs12; | ||
6568 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
6569 | int cpu; | ||
6570 | struct loaded_vmcs *vmcs02; | ||
6571 | |||
6572 | if (!nested_vmx_check_permission(vcpu) || | ||
6573 | !nested_vmx_check_vmcs12(vcpu)) | ||
6574 | return 1; | ||
6575 | |||
6576 | skip_emulated_instruction(vcpu); | ||
6577 | vmcs12 = get_vmcs12(vcpu); | ||
6578 | |||
6579 | /* | ||
6580 | * The nested entry process starts with enforcing various prerequisites | ||
6581 | * on vmcs12 as required by the Intel SDM, and acting appropriately when | ||
6582 | * they fail: As the SDM explains, some conditions should cause the | ||
6583 | * instruction to fail, while others will cause the instruction to seem | ||
6584 | * to succeed, but return an EXIT_REASON_INVALID_STATE. | ||
6585 | * To speed up the normal (success) code path, we should avoid checking | ||
6586 | * for misconfigurations which will anyway be caught by the processor | ||
6587 | * when using the merged vmcs02. | ||
6588 | */ | ||
6589 | if (vmcs12->launch_state == launch) { | ||
6590 | nested_vmx_failValid(vcpu, | ||
6591 | launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS | ||
6592 | : VMXERR_VMRESUME_NONLAUNCHED_VMCS); | ||
6593 | return 1; | ||
6594 | } | ||
6595 | |||
6596 | if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && | ||
6597 | !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) { | ||
6598 | /*TODO: Also verify bits beyond physical address width are 0*/ | ||
6599 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
6600 | return 1; | ||
6601 | } | ||
6602 | |||
6603 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && | ||
6604 | !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) { | ||
6605 | /*TODO: Also verify bits beyond physical address width are 0*/ | ||
6606 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
6607 | return 1; | ||
6608 | } | ||
6609 | |||
6610 | if (vmcs12->vm_entry_msr_load_count > 0 || | ||
6611 | vmcs12->vm_exit_msr_load_count > 0 || | ||
6612 | vmcs12->vm_exit_msr_store_count > 0) { | ||
6613 | if (printk_ratelimit()) | ||
6614 | printk(KERN_WARNING | ||
6615 | "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__); | ||
6616 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
6617 | return 1; | ||
6618 | } | ||
6619 | |||
6620 | if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, | ||
6621 | nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) || | ||
6622 | !vmx_control_verify(vmcs12->secondary_vm_exec_control, | ||
6623 | nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || | ||
6624 | !vmx_control_verify(vmcs12->pin_based_vm_exec_control, | ||
6625 | nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || | ||
6626 | !vmx_control_verify(vmcs12->vm_exit_controls, | ||
6627 | nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) || | ||
6628 | !vmx_control_verify(vmcs12->vm_entry_controls, | ||
6629 | nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high)) | ||
6630 | { | ||
6631 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
6632 | return 1; | ||
6633 | } | ||
6634 | |||
6635 | if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || | ||
6636 | ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { | ||
6637 | nested_vmx_failValid(vcpu, | ||
6638 | VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); | ||
6639 | return 1; | ||
6640 | } | ||
6641 | |||
6642 | if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || | ||
6643 | ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { | ||
6644 | nested_vmx_entry_failure(vcpu, vmcs12, | ||
6645 | EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); | ||
6646 | return 1; | ||
6647 | } | ||
6648 | if (vmcs12->vmcs_link_pointer != -1ull) { | ||
6649 | nested_vmx_entry_failure(vcpu, vmcs12, | ||
6650 | EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); | ||
6651 | return 1; | ||
6652 | } | ||
6653 | |||
6654 | /* | ||
6655 | * We're finally done with prerequisite checking, and can start with | ||
6656 | * the nested entry. | ||
6657 | */ | ||
6658 | |||
6659 | vmcs02 = nested_get_current_vmcs02(vmx); | ||
6660 | if (!vmcs02) | ||
6661 | return -ENOMEM; | ||
6662 | |||
6663 | enter_guest_mode(vcpu); | ||
6664 | |||
6665 | vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); | ||
6666 | |||
6667 | cpu = get_cpu(); | ||
6668 | vmx->loaded_vmcs = vmcs02; | ||
6669 | vmx_vcpu_put(vcpu); | ||
6670 | vmx_vcpu_load(vcpu, cpu); | ||
6671 | vcpu->cpu = cpu; | ||
6672 | put_cpu(); | ||
6673 | |||
6674 | vmcs12->launch_state = 1; | ||
6675 | |||
6676 | prepare_vmcs02(vcpu, vmcs12); | ||
6677 | |||
6678 | /* | ||
6679 | * Note no nested_vmx_succeed or nested_vmx_fail here. At this point | ||
6680 | * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet | ||
6681 | * returned as far as L1 is concerned. It will only return (and set | ||
6682 | * the success flag) when L2 exits (see nested_vmx_vmexit()). | ||
6683 | */ | ||
6684 | return 1; | ||
6685 | } | ||
6686 | |||
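The five vmx_control_verify() calls gate the entry on the nested capability ranges advertised to L1. The helper itself is defined earlier in the file and is not shown here; the sketch below merely assumes the usual allowed-0/allowed-1 semantics of the VMX capability MSR pairs (every bit required by the low mask must be set, and nothing outside the high mask may be set).

/* Sketch of an allowed-0/allowed-1 control check; this assumes, rather than
 * shows, what vmx_control_verify() does. */
#include <stdbool.h>
#include <stdio.h>

static bool control_verify(unsigned int control, unsigned int low, unsigned int high)
{
	return ((control & low) == low) &&	/* all required bits present */
	       ((control | high) == high);	/* no unsupported bits set   */
}

int main(void)
{
	unsigned int low = 0x0000000f, high = 0x00ff00ff;

	printf("%d\n", control_verify(0x000f000f, low, high));	/* 1: within range          */
	printf("%d\n", control_verify(0x1000000f, low, high));	/* 0: bit 28 not allowed    */
	printf("%d\n", control_verify(0x00000003, low, high));	/* 0: required bits missing */
	return 0;
}
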
6687 | /* | ||
6688 | * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date | ||
6689 | * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). | ||
6690 | * This function returns the new value we should put in vmcs12.guest_cr0. | ||
6691 | * It's not enough to just return the vmcs02 GUEST_CR0. Rather, | ||
6692 | * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now | ||
6693 | * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 | ||
6694 | * didn't trap the bit, because if L1 did, so would L0). | ||
6695 | * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have | ||
6696 | * been modified by L2, and L1 knows it. So just leave the old value of | ||
6697 | * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 | ||
6698 | * isn't relevant, because if L0 traps this bit it can set it to anything. | ||
6699 | * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have | ||
6700 | * changed these bits, and therefore they need to be updated, but L0 | ||
6701 | * didn't necessarily allow them to be changed in GUEST_CR0 - and rather | ||
6702 | * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. | ||
6703 | */ | ||
6704 | static inline unsigned long | ||
6705 | vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6706 | { | ||
6707 | return | ||
6708 | /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | | ||
6709 | /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | | ||
6710 | /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | | ||
6711 | vcpu->arch.cr0_guest_owned_bits)); | ||
6712 | } | ||
6713 | |||
6714 | static inline unsigned long | ||
6715 | vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6716 | { | ||
6717 | return | ||
6718 | /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | | ||
6719 | /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | | ||
6720 | /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | | ||
6721 | vcpu->arch.cr4_guest_owned_bits)); | ||
6722 | } | ||
6723 | |||
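The three cases in the comment translate into a three-way bit merge. In the sketch below every value is invented; only the masking expression mirrors vmcs12_guest_cr0().

/* Tiny numeric sketch of the three-way merge performed by vmcs12_guest_cr0();
 * all values are invented. */
#include <stdio.h>

int main(void)
{
	unsigned long guest_cr0_vmcs02  = 0x80050033;	/* what the CPU holds    */
	unsigned long cr0_read_shadow02 = 0x80050031;	/* what L0 shows L2      */
	unsigned long vmcs12_guest_cr0  = 0x80050013;	/* L1's last known view  */
	unsigned long l1_mask           = 0x00000010;	/* cr0_guest_host_mask   */
	unsigned long l0_owned          = 0x00000008;	/* cr0_guest_owned_bits  */

	unsigned long merged =
		(guest_cr0_vmcs02  &  l0_owned) |		/* 1: bits L2 owned      */
		(vmcs12_guest_cr0  &  l1_mask)  |		/* 2: bits L1 trapped    */
		(cr0_read_shadow02 & ~(l1_mask | l0_owned));	/* 3: bits only L0 trapped */

	printf("vmcs12.guest_cr0 = %#lx\n", merged);
	return 0;
}
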
6724 | /* | ||
6725 | * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits | ||
6726 | * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), | ||
6727 | * and this function updates it to reflect the changes to the guest state while | ||
6728 | * L2 was running (and perhaps made some exits which were handled directly by L0 | ||
6729 | * without going back to L1), and to reflect the exit reason. | ||
6730 | * Note that we do not have to copy all VMCS fields here, just those that | ||
6731 | * could have been changed by the L2 guest or the exit - i.e., the guest-state and | ||
6732 | * exit-information fields only. Other fields are modified by L1 with VMWRITE, | ||
6733 | * which already writes to vmcs12 directly. | ||
6734 | */ | ||
6735 | void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6736 | { | ||
6737 | /* update guest state fields: */ | ||
6738 | vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); | ||
6739 | vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); | ||
6740 | |||
6741 | kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); | ||
6742 | vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); | ||
6743 | vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); | ||
6744 | vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); | ||
6745 | |||
6746 | vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); | ||
6747 | vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); | ||
6748 | vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); | ||
6749 | vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); | ||
6750 | vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); | ||
6751 | vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); | ||
6752 | vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); | ||
6753 | vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); | ||
6754 | vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); | ||
6755 | vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); | ||
6756 | vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); | ||
6757 | vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); | ||
6758 | vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); | ||
6759 | vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); | ||
6760 | vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); | ||
6761 | vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); | ||
6762 | vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); | ||
6763 | vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); | ||
6764 | vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); | ||
6765 | vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); | ||
6766 | vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); | ||
6767 | vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); | ||
6768 | vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); | ||
6769 | vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); | ||
6770 | vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); | ||
6771 | vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); | ||
6772 | vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); | ||
6773 | vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); | ||
6774 | vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); | ||
6775 | vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); | ||
6776 | vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); | ||
6777 | vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); | ||
6778 | vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); | ||
6779 | vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); | ||
6780 | vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); | ||
6781 | vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); | ||
6782 | |||
6783 | vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE); | ||
6784 | vmcs12->guest_interruptibility_info = | ||
6785 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
6786 | vmcs12->guest_pending_dbg_exceptions = | ||
6787 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); | ||
6788 | |||
6789 | /* TODO: These cannot have changed unless we have MSR bitmaps and | ||
6790 | * the relevant bit asks not to trap the change */ | ||
6791 | vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); | ||
6792 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) | ||
6793 | vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); | ||
6794 | vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); | ||
6795 | vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); | ||
6796 | vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); | ||
6797 | |||
6798 | /* update exit information fields: */ | ||
6799 | |||
6800 | vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
6801 | vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
6802 | |||
6803 | vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
6804 | vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | ||
6805 | vmcs12->idt_vectoring_info_field = | ||
6806 | vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
6807 | vmcs12->idt_vectoring_error_code = | ||
6808 | vmcs_read32(IDT_VECTORING_ERROR_CODE); | ||
6809 | vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
6810 | vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
6811 | |||
6812 | /* clear vm-entry fields which are to be cleared on exit */ | ||
6813 | if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) | ||
6814 | vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; | ||
6815 | } | ||
6816 | |||
6817 | /* | ||
6818 | * A part of what we need to do when the nested L2 guest exits and we want to | ||
6819 | * run its L1 parent is to reset L1's guest state to the host state specified | ||
6820 | * in vmcs12. | ||
6821 | * This function is to be called not only on normal nested exit, but also on | ||
6822 | * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry | ||
6823 | * Failures During or After Loading Guest State"). | ||
6824 | * This function should be called when the active VMCS is L1's (vmcs01). | ||
6825 | */ | ||
6826 | void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6827 | { | ||
6828 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) | ||
6829 | vcpu->arch.efer = vmcs12->host_ia32_efer; | ||
6830 | if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) | ||
6831 | vcpu->arch.efer |= (EFER_LMA | EFER_LME); | ||
6832 | else | ||
6833 | vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); | ||
6834 | vmx_set_efer(vcpu, vcpu->arch.efer); | ||
6835 | |||
6836 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); | ||
6837 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); | ||
6838 | /* | ||
6839 | * Note that calling vmx_set_cr0 is important, even if cr0 hasn't | ||
6840 | * actually changed, because it depends on the current state of | ||
6841 | * fpu_active (which may have changed). | ||
6842 | * Note that vmx_set_cr0 refers to efer set above. | ||
6843 | */ | ||
6844 | kvm_set_cr0(vcpu, vmcs12->host_cr0); | ||
6845 | /* | ||
6846 | * If we did fpu_activate()/fpu_deactivate() during L2's run, we need | ||
6847 | * to apply the same changes to L1's vmcs. We just set cr0 correctly, | ||
6848 | * but we also need to update cr0_guest_host_mask and exception_bitmap. | ||
6849 | */ | ||
6850 | update_exception_bitmap(vcpu); | ||
6851 | vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); | ||
6852 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | ||
6853 | |||
6854 | /* | ||
6855 | * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 | ||
6856 | * (KVM doesn't change it) - no reason to call set_cr4_guest_host_mask(); | ||
6857 | */ | ||
6858 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); | ||
6859 | kvm_set_cr4(vcpu, vmcs12->host_cr4); | ||
6860 | |||
6861 | /* Switch back to L1's page tables, with either EPT or shadow paging */ | ||
6862 | kvm_set_cr3(vcpu, vmcs12->host_cr3); | ||
6863 | kvm_mmu_reset_context(vcpu); | ||
6864 | |||
6865 | if (enable_vpid) { | ||
6866 | /* | ||
6867 | * Trivially support vpid by letting L2s share their parent | ||
6868 | * L1's vpid. TODO: move to a more elaborate solution, giving | ||
6869 | * each L2 its own vpid and exposing the vpid feature to L1. | ||
6870 | */ | ||
6871 | vmx_flush_tlb(vcpu); | ||
6872 | } | ||
6873 | |||
6874 | |||
6875 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); | ||
6876 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); | ||
6877 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); | ||
6878 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); | ||
6879 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); | ||
6880 | vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); | ||
6881 | vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); | ||
6882 | vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); | ||
6883 | vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); | ||
6884 | vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); | ||
6885 | vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); | ||
6886 | vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); | ||
6887 | vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); | ||
6888 | vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); | ||
6889 | vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); | ||
6890 | |||
6891 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) | ||
6892 | vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); | ||
6893 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
6894 | vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, | ||
6895 | vmcs12->host_ia32_perf_global_ctrl); | ||
6896 | } | ||
6897 | |||
6898 | /* | ||
6899 | * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 | ||
6900 | * and modify vmcs12 to make it see what it would expect to see there if | ||
6901 | * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) | ||
6902 | */ | ||
6903 | static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) | ||
6904 | { | ||
6905 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
6906 | int cpu; | ||
6907 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
6908 | |||
6909 | leave_guest_mode(vcpu); | ||
6910 | prepare_vmcs12(vcpu, vmcs12); | ||
6911 | |||
6912 | cpu = get_cpu(); | ||
6913 | vmx->loaded_vmcs = &vmx->vmcs01; | ||
6914 | vmx_vcpu_put(vcpu); | ||
6915 | vmx_vcpu_load(vcpu, cpu); | ||
6916 | vcpu->cpu = cpu; | ||
6917 | put_cpu(); | ||
6918 | |||
6919 | /* if no vmcs02 cache requested, remove the one we used */ | ||
6920 | if (VMCS02_POOL_SIZE == 0) | ||
6921 | nested_free_vmcs02(vmx, vmx->nested.current_vmptr); | ||
6922 | |||
6923 | load_vmcs12_host_state(vcpu, vmcs12); | ||
6924 | |||
6925 | /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */ | ||
6926 | vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); | ||
6927 | |||
6928 | /* This is needed for same reason as it was needed in prepare_vmcs02 */ | ||
6929 | vmx->host_rsp = 0; | ||
6930 | |||
6931 | /* Unpin physical memory we referred to in vmcs02 */ | ||
6932 | if (vmx->nested.apic_access_page) { | ||
6933 | nested_release_page(vmx->nested.apic_access_page); | ||
6934 | vmx->nested.apic_access_page = 0; | ||
6935 | } | ||
6936 | |||
6937 | /* | ||
6938 | * Exiting from L2 to L1, we're now back to L1 which thinks it just | ||
6939 | * finished a VMLAUNCH or VMRESUME instruction, so we need to set the | ||
6940 | * success or failure flag accordingly. | ||
6941 | */ | ||
6942 | if (unlikely(vmx->fail)) { | ||
6943 | vmx->fail = 0; | ||
6944 | nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
6945 | } else | ||
6946 | nested_vmx_succeed(vcpu); | ||
6947 | } | ||
6948 | |||
6949 | /* | ||
6950 | * L1's failure to enter L2 is a subset of a normal exit, as explained in | ||
6951 | * 23.7 "VM-entry failures during or after loading guest state" (this also | ||
6952 | * lists the acceptable exit-reason and exit-qualification parameters). | ||
6953 | * It should only be called before L2 has actually started to run, and when | ||
6954 | * vmcs01 is current (it doesn't leave_guest_mode() or switch VMCSs). | ||
6955 | */ | ||
6956 | static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, | ||
6957 | struct vmcs12 *vmcs12, | ||
6958 | u32 reason, unsigned long qualification) | ||
6959 | { | ||
6960 | load_vmcs12_host_state(vcpu, vmcs12); | ||
6961 | vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; | ||
6962 | vmcs12->exit_qualification = qualification; | ||
6963 | nested_vmx_succeed(vcpu); | ||
4515 | } | 6964 | } |
4516 | 6965 | ||
4517 | static int vmx_check_intercept(struct kvm_vcpu *vcpu, | 6966 | static int vmx_check_intercept(struct kvm_vcpu *vcpu, |
@@ -4670,16 +7119,13 @@ static int __init vmx_init(void) | |||
4670 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); | 7119 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); |
4671 | 7120 | ||
4672 | if (enable_ept) { | 7121 | if (enable_ept) { |
4673 | bypass_guest_pf = 0; | ||
4674 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, | 7122 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, |
4675 | VMX_EPT_EXECUTABLE_MASK); | 7123 | VMX_EPT_EXECUTABLE_MASK); |
7124 | ept_set_mmio_spte_mask(); | ||
4676 | kvm_enable_tdp(); | 7125 | kvm_enable_tdp(); |
4677 | } else | 7126 | } else |
4678 | kvm_disable_tdp(); | 7127 | kvm_disable_tdp(); |
4679 | 7128 | ||
4680 | if (bypass_guest_pf) | ||
4681 | kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); | ||
4682 | |||
4683 | return 0; | 7129 | return 0; |
4684 | 7130 | ||
4685 | out3: | 7131 | out3: |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 77c9d8673dc4..84a28ea45fa4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -347,6 +347,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) | |||
347 | vcpu->arch.cr2 = fault->address; | 347 | vcpu->arch.cr2 = fault->address; |
348 | kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); | 348 | kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); |
349 | } | 349 | } |
350 | EXPORT_SYMBOL_GPL(kvm_inject_page_fault); | ||
350 | 351 | ||
351 | void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) | 352 | void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) |
352 | { | 353 | { |
@@ -579,6 +580,22 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) | |||
579 | return best && (best->ecx & bit(X86_FEATURE_XSAVE)); | 580 | return best && (best->ecx & bit(X86_FEATURE_XSAVE)); |
580 | } | 581 | } |
581 | 582 | ||
583 | static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) | ||
584 | { | ||
585 | struct kvm_cpuid_entry2 *best; | ||
586 | |||
587 | best = kvm_find_cpuid_entry(vcpu, 7, 0); | ||
588 | return best && (best->ebx & bit(X86_FEATURE_SMEP)); | ||
589 | } | ||
590 | |||
591 | static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) | ||
592 | { | ||
593 | struct kvm_cpuid_entry2 *best; | ||
594 | |||
595 | best = kvm_find_cpuid_entry(vcpu, 7, 0); | ||
596 | return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); | ||
597 | } | ||
598 | |||
582 | static void update_cpuid(struct kvm_vcpu *vcpu) | 599 | static void update_cpuid(struct kvm_vcpu *vcpu) |
583 | { | 600 | { |
584 | struct kvm_cpuid_entry2 *best; | 601 | struct kvm_cpuid_entry2 *best; |
@@ -598,14 +615,20 @@ static void update_cpuid(struct kvm_vcpu *vcpu) | |||
598 | int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 615 | int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
599 | { | 616 | { |
600 | unsigned long old_cr4 = kvm_read_cr4(vcpu); | 617 | unsigned long old_cr4 = kvm_read_cr4(vcpu); |
601 | unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; | 618 | unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | |
602 | 619 | X86_CR4_PAE | X86_CR4_SMEP; | |
603 | if (cr4 & CR4_RESERVED_BITS) | 620 | if (cr4 & CR4_RESERVED_BITS) |
604 | return 1; | 621 | return 1; |
605 | 622 | ||
606 | if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) | 623 | if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) |
607 | return 1; | 624 | return 1; |
608 | 625 | ||
626 | if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) | ||
627 | return 1; | ||
628 | |||
629 | if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS)) | ||
630 | return 1; | ||
631 | |||
609 | if (is_long_mode(vcpu)) { | 632 | if (is_long_mode(vcpu)) { |
610 | if (!(cr4 & X86_CR4_PAE)) | 633 | if (!(cr4 & X86_CR4_PAE)) |
611 | return 1; | 634 | return 1; |
@@ -615,11 +638,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
615 | kvm_read_cr3(vcpu))) | 638 | kvm_read_cr3(vcpu))) |
616 | return 1; | 639 | return 1; |
617 | 640 | ||
618 | if (cr4 & X86_CR4_VMXE) | 641 | if (kvm_x86_ops->set_cr4(vcpu, cr4)) |
619 | return 1; | 642 | return 1; |
620 | 643 | ||
621 | kvm_x86_ops->set_cr4(vcpu, cr4); | ||
622 | |||
623 | if ((cr4 ^ old_cr4) & pdptr_bits) | 644 | if ((cr4 ^ old_cr4) & pdptr_bits) |
624 | kvm_mmu_reset_context(vcpu); | 645 | kvm_mmu_reset_context(vcpu); |
625 | 646 | ||
@@ -787,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); | |||
787 | * kvm-specific. Those are put in the beginning of the list. | 808 | * kvm-specific. Those are put in the beginning of the list. |
788 | */ | 809 | */ |
789 | 810 | ||
790 | #define KVM_SAVE_MSRS_BEGIN 8 | 811 | #define KVM_SAVE_MSRS_BEGIN 9 |
791 | static u32 msrs_to_save[] = { | 812 | static u32 msrs_to_save[] = { |
792 | MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, | 813 | MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, |
793 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, | 814 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, |
794 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, | 815 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, |
795 | HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, | 816 | HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, |
796 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | 817 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, |
797 | MSR_STAR, | 818 | MSR_STAR, |
798 | #ifdef CONFIG_X86_64 | 819 | #ifdef CONFIG_X86_64 |
@@ -1388,7 +1409,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1388 | return 1; | 1409 | return 1; |
1389 | kvm_x86_ops->patch_hypercall(vcpu, instructions); | 1410 | kvm_x86_ops->patch_hypercall(vcpu, instructions); |
1390 | ((unsigned char *)instructions)[3] = 0xc3; /* ret */ | 1411 | ((unsigned char *)instructions)[3] = 0xc3; /* ret */ |
1391 | if (copy_to_user((void __user *)addr, instructions, 4)) | 1412 | if (__copy_to_user((void __user *)addr, instructions, 4)) |
1392 | return 1; | 1413 | return 1; |
1393 | kvm->arch.hv_hypercall = data; | 1414 | kvm->arch.hv_hypercall = data; |
1394 | break; | 1415 | break; |
@@ -1415,7 +1436,7 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1415 | HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); | 1436 | HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); |
1416 | if (kvm_is_error_hva(addr)) | 1437 | if (kvm_is_error_hva(addr)) |
1417 | return 1; | 1438 | return 1; |
1418 | if (clear_user((void __user *)addr, PAGE_SIZE)) | 1439 | if (__clear_user((void __user *)addr, PAGE_SIZE)) |
1419 | return 1; | 1440 | return 1; |
1420 | vcpu->arch.hv_vapic = data; | 1441 | vcpu->arch.hv_vapic = data; |
1421 | break; | 1442 | break; |
@@ -1467,6 +1488,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) | |||
1467 | } | 1488 | } |
1468 | } | 1489 | } |
1469 | 1490 | ||
1491 | static void accumulate_steal_time(struct kvm_vcpu *vcpu) | ||
1492 | { | ||
1493 | u64 delta; | ||
1494 | |||
1495 | if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) | ||
1496 | return; | ||
1497 | |||
1498 | delta = current->sched_info.run_delay - vcpu->arch.st.last_steal; | ||
1499 | vcpu->arch.st.last_steal = current->sched_info.run_delay; | ||
1500 | vcpu->arch.st.accum_steal = delta; | ||
1501 | } | ||
1502 | |||
1503 | static void record_steal_time(struct kvm_vcpu *vcpu) | ||
1504 | { | ||
1505 | if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) | ||
1506 | return; | ||
1507 | |||
1508 | if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, | ||
1509 | &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) | ||
1510 | return; | ||
1511 | |||
1512 | vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal; | ||
1513 | vcpu->arch.st.steal.version += 2; | ||
1514 | vcpu->arch.st.accum_steal = 0; | ||
1515 | |||
1516 | kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, | ||
1517 | &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); | ||
1518 | } | ||
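As a guest-side counterpart, here is a minimal sketch of how a paravirtualized guest might consume the kvm_steal_time area updated above. The struct and field names are the real ones; the retry loop and barrier placement are assumptions about the consumer, not code from this patch.

	static u64 sketch_read_steal(volatile struct kvm_steal_time *st)
	{
		u32 version;
		u64 steal;

		do {
			version = st->version;
			rmb();          /* read version before the payload */
			steal = st->steal;
			rmb();          /* read payload before the re-check */
		} while ((version & 1) || version != st->version);

		/* nanoseconds this vcpu spent runnable but not running */
		return steal;
	}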
1519 | |||
1470 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | 1520 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) |
1471 | { | 1521 | { |
1472 | switch (msr) { | 1522 | switch (msr) { |
@@ -1549,6 +1599,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1549 | if (kvm_pv_enable_async_pf(vcpu, data)) | 1599 | if (kvm_pv_enable_async_pf(vcpu, data)) |
1550 | return 1; | 1600 | return 1; |
1551 | break; | 1601 | break; |
1602 | case MSR_KVM_STEAL_TIME: | ||
1603 | |||
1604 | if (unlikely(!sched_info_on())) | ||
1605 | return 1; | ||
1606 | |||
1607 | if (data & KVM_STEAL_RESERVED_MASK) | ||
1608 | return 1; | ||
1609 | |||
1610 | if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, | ||
1611 | data & KVM_STEAL_VALID_BITS)) | ||
1612 | return 1; | ||
1613 | |||
1614 | vcpu->arch.st.msr_val = data; | ||
1615 | |||
1616 | if (!(data & KVM_MSR_ENABLED)) | ||
1617 | break; | ||
1618 | |||
1619 | vcpu->arch.st.last_steal = current->sched_info.run_delay; | ||
1620 | |||
1621 | preempt_disable(); | ||
1622 | accumulate_steal_time(vcpu); | ||
1623 | preempt_enable(); | ||
1624 | |||
1625 | kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); | ||
1626 | |||
1627 | break; | ||
1628 | |||
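On the guest side, enabling this MSR would look roughly like the sketch below (an assumption for illustration; the guest registration code is not part of this hunk). The guest hands the host the physical address of a per-cpu kvm_steal_time buffer with the enable bit set in the low bits:

	static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);

	static void sketch_register_steal_time(void)
	{
		u64 pa = __pa(&__get_cpu_var(steal_time)) | KVM_MSR_ENABLED;

		/* host starts accounting steal time into this vcpu's buffer */
		wrmsrl(MSR_KVM_STEAL_TIME, pa);
	}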
1552 | case MSR_IA32_MCG_CTL: | 1629 | case MSR_IA32_MCG_CTL: |
1553 | case MSR_IA32_MCG_STATUS: | 1630 | case MSR_IA32_MCG_STATUS: |
1554 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: | 1631 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: |
@@ -1834,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1834 | case MSR_KVM_ASYNC_PF_EN: | 1911 | case MSR_KVM_ASYNC_PF_EN: |
1835 | data = vcpu->arch.apf.msr_val; | 1912 | data = vcpu->arch.apf.msr_val; |
1836 | break; | 1913 | break; |
1914 | case MSR_KVM_STEAL_TIME: | ||
1915 | data = vcpu->arch.st.msr_val; | ||
1916 | break; | ||
1837 | case MSR_IA32_P5_MC_ADDR: | 1917 | case MSR_IA32_P5_MC_ADDR: |
1838 | case MSR_IA32_P5_MC_TYPE: | 1918 | case MSR_IA32_P5_MC_TYPE: |
1839 | case MSR_IA32_MCG_CAP: | 1919 | case MSR_IA32_MCG_CAP: |
@@ -2145,6 +2225,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
2145 | kvm_migrate_timers(vcpu); | 2225 | kvm_migrate_timers(vcpu); |
2146 | vcpu->cpu = cpu; | 2226 | vcpu->cpu = cpu; |
2147 | } | 2227 | } |
2228 | |||
2229 | accumulate_steal_time(vcpu); | ||
2230 | kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); | ||
2148 | } | 2231 | } |
2149 | 2232 | ||
2150 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) | 2233 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) |
@@ -2283,6 +2366,13 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2283 | entry->flags = 0; | 2366 | entry->flags = 0; |
2284 | } | 2367 | } |
2285 | 2368 | ||
2369 | static bool supported_xcr0_bit(unsigned bit) | ||
2370 | { | ||
2371 | u64 mask = ((u64)1 << bit); | ||
2372 | |||
2373 | return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; | ||
2374 | } | ||
2375 | |||
2286 | #define F(x) bit(X86_FEATURE_##x) | 2376 | #define F(x) bit(X86_FEATURE_##x) |
2287 | 2377 | ||
2288 | static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | 2378 | static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, |
@@ -2328,7 +2418,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2328 | 0 /* Reserved, DCA */ | F(XMM4_1) | | 2418 | 0 /* Reserved, DCA */ | F(XMM4_1) | |
2329 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | | 2419 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | |
2330 | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | | 2420 | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | |
2331 | F(F16C); | 2421 | F(F16C) | F(RDRAND); |
2332 | /* cpuid 0x80000001.ecx */ | 2422 | /* cpuid 0x80000001.ecx */ |
2333 | const u32 kvm_supported_word6_x86_features = | 2423 | const u32 kvm_supported_word6_x86_features = |
2334 | F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | | 2424 | F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | |
@@ -2342,6 +2432,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2342 | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | | 2432 | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | |
2343 | F(PMM) | F(PMM_EN); | 2433 | F(PMM) | F(PMM_EN); |
2344 | 2434 | ||
2435 | /* cpuid 7.0.ebx */ | ||
2436 | const u32 kvm_supported_word9_x86_features = | ||
2437 | F(SMEP) | F(FSGSBASE) | F(ERMS); | ||
2438 | |||
2345 | /* all calls to cpuid_count() should be made on the same cpu */ | 2439 | /* all calls to cpuid_count() should be made on the same cpu */ |
2346 | get_cpu(); | 2440 | get_cpu(); |
2347 | do_cpuid_1_ent(entry, function, index); | 2441 | do_cpuid_1_ent(entry, function, index); |
@@ -2376,7 +2470,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2376 | } | 2470 | } |
2377 | break; | 2471 | break; |
2378 | } | 2472 | } |
2379 | /* function 4 and 0xb have additional index. */ | 2473 | /* function 4 has additional index. */ |
2380 | case 4: { | 2474 | case 4: { |
2381 | int i, cache_type; | 2475 | int i, cache_type; |
2382 | 2476 | ||
@@ -2393,6 +2487,22 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2393 | } | 2487 | } |
2394 | break; | 2488 | break; |
2395 | } | 2489 | } |
2490 | case 7: { | ||
2491 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
2492 | /* Mask ebx against host capability word 9 */ | ||
2493 | if (index == 0) { | ||
2494 | entry->ebx &= kvm_supported_word9_x86_features; | ||
2495 | cpuid_mask(&entry->ebx, 9); | ||
2496 | } else | ||
2497 | entry->ebx = 0; | ||
2498 | entry->eax = 0; | ||
2499 | entry->ecx = 0; | ||
2500 | entry->edx = 0; | ||
2501 | break; | ||
2502 | } | ||
2503 | case 9: | ||
2504 | break; | ||
2505 | /* function 0xb has additional index. */ | ||
2396 | case 0xb: { | 2506 | case 0xb: { |
2397 | int i, level_type; | 2507 | int i, level_type; |
2398 | 2508 | ||
@@ -2410,16 +2520,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2410 | break; | 2520 | break; |
2411 | } | 2521 | } |
2412 | case 0xd: { | 2522 | case 0xd: { |
2413 | int i; | 2523 | int idx, i; |
2414 | 2524 | ||
2415 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | 2525 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; |
2416 | for (i = 1; *nent < maxnent && i < 64; ++i) { | 2526 | for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) { |
2417 | if (entry[i].eax == 0) | 2527 | do_cpuid_1_ent(&entry[i], function, idx); |
2528 | if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) | ||
2418 | continue; | 2529 | continue; |
2419 | do_cpuid_1_ent(&entry[i], function, i); | ||
2420 | entry[i].flags |= | 2530 | entry[i].flags |= |
2421 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | 2531 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; |
2422 | ++*nent; | 2532 | ++*nent; |
2533 | ++i; | ||
2423 | } | 2534 | } |
2424 | break; | 2535 | break; |
2425 | } | 2536 | } |
@@ -2438,6 +2549,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2438 | (1 << KVM_FEATURE_CLOCKSOURCE2) | | 2549 | (1 << KVM_FEATURE_CLOCKSOURCE2) | |
2439 | (1 << KVM_FEATURE_ASYNC_PF) | | 2550 | (1 << KVM_FEATURE_ASYNC_PF) | |
2440 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); | 2551 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); |
2552 | |||
2553 | if (sched_info_on()) | ||
2554 | entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); | ||
2555 | |||
2441 | entry->ebx = 0; | 2556 | entry->ebx = 0; |
2442 | entry->ecx = 0; | 2557 | entry->ecx = 0; |
2443 | entry->edx = 0; | 2558 | entry->edx = 0; |
@@ -2451,6 +2566,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2451 | entry->ecx &= kvm_supported_word6_x86_features; | 2566 | entry->ecx &= kvm_supported_word6_x86_features; |
2452 | cpuid_mask(&entry->ecx, 6); | 2567 | cpuid_mask(&entry->ecx, 6); |
2453 | break; | 2568 | break; |
2569 | case 0x80000008: { | ||
2570 | unsigned g_phys_as = (entry->eax >> 16) & 0xff; | ||
2571 | unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); | ||
2572 | unsigned phys_as = entry->eax & 0xff; | ||
2573 | |||
2574 | if (!g_phys_as) | ||
2575 | g_phys_as = phys_as; | ||
2576 | entry->eax = g_phys_as | (virt_as << 8); | ||
2577 | entry->ebx = entry->edx = 0; | ||
2578 | break; | ||
2579 | } | ||
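A worked instance of the 0x80000008 adjustment above, with a hypothetical host value, since the bit layout is easy to misread:

	/*
	 * host reports eax = 0x00003028:  phys_as = 40, virt_as = 48, g_phys_as = 0
	 * g_phys_as == 0, so fall back to phys_as:  g_phys_as = 40
	 * guest sees   eax = 40 | (48 << 8) = 0x3028,  ebx = edx = 0
	 */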
2580 | case 0x80000019: | ||
2581 | entry->ecx = entry->edx = 0; | ||
2582 | break; | ||
2583 | case 0x8000001a: | ||
2584 | break; | ||
2585 | case 0x8000001d: | ||
2586 | break; | ||
2454 | /*Add support for Centaur's CPUID instruction*/ | 2587 | /*Add support for Centaur's CPUID instruction*/ |
2455 | case 0xC0000000: | 2588 | case 0xC0000000: |
2456 | /*Just support up to 0xC0000004 now*/ | 2589 | /*Just support up to 0xC0000004 now*/ |
@@ -2460,10 +2593,16 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2460 | entry->edx &= kvm_supported_word5_x86_features; | 2593 | entry->edx &= kvm_supported_word5_x86_features; |
2461 | cpuid_mask(&entry->edx, 5); | 2594 | cpuid_mask(&entry->edx, 5); |
2462 | break; | 2595 | break; |
2596 | case 3: /* Processor serial number */ | ||
2597 | case 5: /* MONITOR/MWAIT */ | ||
2598 | case 6: /* Thermal management */ | ||
2599 | case 0xA: /* Architectural Performance Monitoring */ | ||
2600 | case 0x80000007: /* Advanced power management */ | ||
2463 | case 0xC0000002: | 2601 | case 0xC0000002: |
2464 | case 0xC0000003: | 2602 | case 0xC0000003: |
2465 | case 0xC0000004: | 2603 | case 0xC0000004: |
2466 | /*Now nothing to do, reserved for the future*/ | 2604 | default: |
2605 | entry->eax = entry->ebx = entry->ecx = entry->edx = 0; | ||
2467 | break; | 2606 | break; |
2468 | } | 2607 | } |
2469 | 2608 | ||
@@ -3817,7 +3956,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, | |||
3817 | exception); | 3956 | exception); |
3818 | } | 3957 | } |
3819 | 3958 | ||
3820 | static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, | 3959 | int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, |
3821 | gva_t addr, void *val, unsigned int bytes, | 3960 | gva_t addr, void *val, unsigned int bytes, |
3822 | struct x86_exception *exception) | 3961 | struct x86_exception *exception) |
3823 | { | 3962 | { |
@@ -3827,6 +3966,7 @@ static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, | |||
3827 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, | 3966 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, |
3828 | exception); | 3967 | exception); |
3829 | } | 3968 | } |
3969 | EXPORT_SYMBOL_GPL(kvm_read_guest_virt); | ||
3830 | 3970 | ||
3831 | static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, | 3971 | static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, |
3832 | gva_t addr, void *val, unsigned int bytes, | 3972 | gva_t addr, void *val, unsigned int bytes, |
@@ -3836,7 +3976,7 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, | |||
3836 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); | 3976 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); |
3837 | } | 3977 | } |
3838 | 3978 | ||
3839 | static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, | 3979 | int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, |
3840 | gva_t addr, void *val, | 3980 | gva_t addr, void *val, |
3841 | unsigned int bytes, | 3981 | unsigned int bytes, |
3842 | struct x86_exception *exception) | 3982 | struct x86_exception *exception) |
@@ -3868,6 +4008,42 @@ static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, | |||
3868 | out: | 4008 | out: |
3869 | return r; | 4009 | return r; |
3870 | } | 4010 | } |
4011 | EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); | ||
4012 | |||
4013 | static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, | ||
4014 | gpa_t *gpa, struct x86_exception *exception, | ||
4015 | bool write) | ||
4016 | { | ||
4017 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | ||
4018 | |||
4019 | if (vcpu_match_mmio_gva(vcpu, gva) && | ||
4020 | check_write_user_access(vcpu, write, access, | ||
4021 | vcpu->arch.access)) { | ||
4022 | *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | | ||
4023 | (gva & (PAGE_SIZE - 1)); | ||
4024 | trace_vcpu_match_mmio(gva, *gpa, write, false); | ||
4025 | return 1; | ||
4026 | } | ||
4027 | |||
4028 | if (write) | ||
4029 | access |= PFERR_WRITE_MASK; | ||
4030 | |||
4031 | *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); | ||
4032 | |||
4033 | if (*gpa == UNMAPPED_GVA) | ||
4034 | return -1; | ||
4035 | |||
4036 | /* For APIC access vmexit */ | ||
4037 | if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | ||
4038 | return 1; | ||
4039 | |||
4040 | if (vcpu_match_mmio_gpa(vcpu, *gpa)) { | ||
4041 | trace_vcpu_match_mmio(gva, *gpa, write, true); | ||
4042 | return 1; | ||
4043 | } | ||
4044 | |||
4045 | return 0; | ||
4046 | } | ||
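For readers following the two callers below, the return convention of vcpu_mmio_gva_to_gpa() can be summarized as:

	/*
	 *  -1  the gva does not translate: propagate a fault to the guest
	 *   1  the access is MMIO (APIC access page, or it matched the cached
	 *      mmio gva/gpa): take the mmio path
	 *   0  ordinary guest memory: access it directly
	 */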
3871 | 4047 | ||
3872 | static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, | 4048 | static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, |
3873 | unsigned long addr, | 4049 | unsigned long addr, |
@@ -3876,8 +4052,8 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, | |||
3876 | struct x86_exception *exception) | 4052 | struct x86_exception *exception) |
3877 | { | 4053 | { |
3878 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | 4054 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
3879 | gpa_t gpa; | 4055 | gpa_t gpa; |
3880 | int handled; | 4056 | int handled, ret; |
3881 | 4057 | ||
3882 | if (vcpu->mmio_read_completed) { | 4058 | if (vcpu->mmio_read_completed) { |
3883 | memcpy(val, vcpu->mmio_data, bytes); | 4059 | memcpy(val, vcpu->mmio_data, bytes); |
@@ -3887,13 +4063,12 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, | |||
3887 | return X86EMUL_CONTINUE; | 4063 | return X86EMUL_CONTINUE; |
3888 | } | 4064 | } |
3889 | 4065 | ||
3890 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); | 4066 | ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false); |
3891 | 4067 | ||
3892 | if (gpa == UNMAPPED_GVA) | 4068 | if (ret < 0) |
3893 | return X86EMUL_PROPAGATE_FAULT; | 4069 | return X86EMUL_PROPAGATE_FAULT; |
3894 | 4070 | ||
3895 | /* For APIC access vmexit */ | 4071 | if (ret) |
3896 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | ||
3897 | goto mmio; | 4072 | goto mmio; |
3898 | 4073 | ||
3899 | if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) | 4074 | if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) |
@@ -3944,16 +4119,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, | |||
3944 | struct x86_exception *exception, | 4119 | struct x86_exception *exception, |
3945 | struct kvm_vcpu *vcpu) | 4120 | struct kvm_vcpu *vcpu) |
3946 | { | 4121 | { |
3947 | gpa_t gpa; | 4122 | gpa_t gpa; |
3948 | int handled; | 4123 | int handled, ret; |
3949 | 4124 | ||
3950 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); | 4125 | ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true); |
3951 | 4126 | ||
3952 | if (gpa == UNMAPPED_GVA) | 4127 | if (ret < 0) |
3953 | return X86EMUL_PROPAGATE_FAULT; | 4128 | return X86EMUL_PROPAGATE_FAULT; |
3954 | 4129 | ||
3955 | /* For APIC access vmexit */ | 4130 | /* For APIC access vmexit */ |
3956 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | 4131 | if (ret) |
3957 | goto mmio; | 4132 | goto mmio; |
3958 | 4133 | ||
3959 | if (emulator_write_phys(vcpu, gpa, val, bytes)) | 4134 | if (emulator_write_phys(vcpu, gpa, val, bytes)) |
@@ -4473,9 +4648,24 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) | |||
4473 | kvm_queue_exception(vcpu, ctxt->exception.vector); | 4648 | kvm_queue_exception(vcpu, ctxt->exception.vector); |
4474 | } | 4649 | } |
4475 | 4650 | ||
4651 | static void init_decode_cache(struct x86_emulate_ctxt *ctxt, | ||
4652 | const unsigned long *regs) | ||
4653 | { | ||
4654 | memset(&ctxt->twobyte, 0, | ||
4655 | (void *)&ctxt->regs - (void *)&ctxt->twobyte); | ||
4656 | memcpy(ctxt->regs, regs, sizeof(ctxt->regs)); | ||
4657 | |||
4658 | ctxt->fetch.start = 0; | ||
4659 | ctxt->fetch.end = 0; | ||
4660 | ctxt->io_read.pos = 0; | ||
4661 | ctxt->io_read.end = 0; | ||
4662 | ctxt->mem_read.pos = 0; | ||
4663 | ctxt->mem_read.end = 0; | ||
4664 | } | ||
4665 | |||
4476 | static void init_emulate_ctxt(struct kvm_vcpu *vcpu) | 4666 | static void init_emulate_ctxt(struct kvm_vcpu *vcpu) |
4477 | { | 4667 | { |
4478 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 4668 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
4479 | int cs_db, cs_l; | 4669 | int cs_db, cs_l; |
4480 | 4670 | ||
4481 | /* | 4671 | /* |
@@ -4488,40 +4678,38 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) | |||
4488 | 4678 | ||
4489 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | 4679 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); |
4490 | 4680 | ||
4491 | vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); | 4681 | ctxt->eflags = kvm_get_rflags(vcpu); |
4492 | vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); | 4682 | ctxt->eip = kvm_rip_read(vcpu); |
4493 | vcpu->arch.emulate_ctxt.mode = | 4683 | ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : |
4494 | (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : | 4684 | (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : |
4495 | (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) | 4685 | cs_l ? X86EMUL_MODE_PROT64 : |
4496 | ? X86EMUL_MODE_VM86 : cs_l | 4686 | cs_db ? X86EMUL_MODE_PROT32 : |
4497 | ? X86EMUL_MODE_PROT64 : cs_db | 4687 | X86EMUL_MODE_PROT16; |
4498 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | 4688 | ctxt->guest_mode = is_guest_mode(vcpu); |
4499 | vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); | 4689 | |
4500 | memset(c, 0, sizeof(struct decode_cache)); | 4690 | init_decode_cache(ctxt, vcpu->arch.regs); |
4501 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | ||
4502 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; | 4691 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; |
4503 | } | 4692 | } |
4504 | 4693 | ||
4505 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) | 4694 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) |
4506 | { | 4695 | { |
4507 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 4696 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
4508 | int ret; | 4697 | int ret; |
4509 | 4698 | ||
4510 | init_emulate_ctxt(vcpu); | 4699 | init_emulate_ctxt(vcpu); |
4511 | 4700 | ||
4512 | vcpu->arch.emulate_ctxt.decode.op_bytes = 2; | 4701 | ctxt->op_bytes = 2; |
4513 | vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; | 4702 | ctxt->ad_bytes = 2; |
4514 | vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + | 4703 | ctxt->_eip = ctxt->eip + inc_eip; |
4515 | inc_eip; | 4704 | ret = emulate_int_real(ctxt, irq); |
4516 | ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); | ||
4517 | 4705 | ||
4518 | if (ret != X86EMUL_CONTINUE) | 4706 | if (ret != X86EMUL_CONTINUE) |
4519 | return EMULATE_FAIL; | 4707 | return EMULATE_FAIL; |
4520 | 4708 | ||
4521 | vcpu->arch.emulate_ctxt.eip = c->eip; | 4709 | ctxt->eip = ctxt->_eip; |
4522 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | 4710 | memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); |
4523 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | 4711 | kvm_rip_write(vcpu, ctxt->eip); |
4524 | kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | 4712 | kvm_set_rflags(vcpu, ctxt->eflags); |
4525 | 4713 | ||
4526 | if (irq == NMI_VECTOR) | 4714 | if (irq == NMI_VECTOR) |
4527 | vcpu->arch.nmi_pending = false; | 4715 | vcpu->arch.nmi_pending = false; |
@@ -4582,21 +4770,21 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, | |||
4582 | int insn_len) | 4770 | int insn_len) |
4583 | { | 4771 | { |
4584 | int r; | 4772 | int r; |
4585 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 4773 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
4586 | bool writeback = true; | 4774 | bool writeback = true; |
4587 | 4775 | ||
4588 | kvm_clear_exception_queue(vcpu); | 4776 | kvm_clear_exception_queue(vcpu); |
4589 | 4777 | ||
4590 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { | 4778 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { |
4591 | init_emulate_ctxt(vcpu); | 4779 | init_emulate_ctxt(vcpu); |
4592 | vcpu->arch.emulate_ctxt.interruptibility = 0; | 4780 | ctxt->interruptibility = 0; |
4593 | vcpu->arch.emulate_ctxt.have_exception = false; | 4781 | ctxt->have_exception = false; |
4594 | vcpu->arch.emulate_ctxt.perm_ok = false; | 4782 | ctxt->perm_ok = false; |
4595 | 4783 | ||
4596 | vcpu->arch.emulate_ctxt.only_vendor_specific_insn | 4784 | ctxt->only_vendor_specific_insn |
4597 | = emulation_type & EMULTYPE_TRAP_UD; | 4785 | = emulation_type & EMULTYPE_TRAP_UD; |
4598 | 4786 | ||
4599 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); | 4787 | r = x86_decode_insn(ctxt, insn, insn_len); |
4600 | 4788 | ||
4601 | trace_kvm_emulate_insn_start(vcpu); | 4789 | trace_kvm_emulate_insn_start(vcpu); |
4602 | ++vcpu->stat.insn_emulation; | 4790 | ++vcpu->stat.insn_emulation; |
@@ -4612,7 +4800,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, | |||
4612 | } | 4800 | } |
4613 | 4801 | ||
4614 | if (emulation_type & EMULTYPE_SKIP) { | 4802 | if (emulation_type & EMULTYPE_SKIP) { |
4615 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); | 4803 | kvm_rip_write(vcpu, ctxt->_eip); |
4616 | return EMULATE_DONE; | 4804 | return EMULATE_DONE; |
4617 | } | 4805 | } |
4618 | 4806 | ||
@@ -4620,11 +4808,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, | |||
4620 | changes registers values during IO operation */ | 4808 | changes registers values during IO operation */ |
4621 | if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { | 4809 | if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { |
4622 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; | 4810 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; |
4623 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | 4811 | memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs); |
4624 | } | 4812 | } |
4625 | 4813 | ||
4626 | restart: | 4814 | restart: |
4627 | r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); | 4815 | r = x86_emulate_insn(ctxt); |
4628 | 4816 | ||
4629 | if (r == EMULATION_INTERCEPTED) | 4817 | if (r == EMULATION_INTERCEPTED) |
4630 | return EMULATE_DONE; | 4818 | return EMULATE_DONE; |
@@ -4636,7 +4824,7 @@ restart: | |||
4636 | return handle_emulation_failure(vcpu); | 4824 | return handle_emulation_failure(vcpu); |
4637 | } | 4825 | } |
4638 | 4826 | ||
4639 | if (vcpu->arch.emulate_ctxt.have_exception) { | 4827 | if (ctxt->have_exception) { |
4640 | inject_emulated_exception(vcpu); | 4828 | inject_emulated_exception(vcpu); |
4641 | r = EMULATE_DONE; | 4829 | r = EMULATE_DONE; |
4642 | } else if (vcpu->arch.pio.count) { | 4830 | } else if (vcpu->arch.pio.count) { |
@@ -4655,13 +4843,12 @@ restart: | |||
4655 | r = EMULATE_DONE; | 4843 | r = EMULATE_DONE; |
4656 | 4844 | ||
4657 | if (writeback) { | 4845 | if (writeback) { |
4658 | toggle_interruptibility(vcpu, | 4846 | toggle_interruptibility(vcpu, ctxt->interruptibility); |
4659 | vcpu->arch.emulate_ctxt.interruptibility); | 4847 | kvm_set_rflags(vcpu, ctxt->eflags); |
4660 | kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | ||
4661 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 4848 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
4662 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | 4849 | memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); |
4663 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; | 4850 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; |
4664 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | 4851 | kvm_rip_write(vcpu, ctxt->eip); |
4665 | } else | 4852 | } else |
4666 | vcpu->arch.emulate_regs_need_sync_to_vcpu = true; | 4853 | vcpu->arch.emulate_regs_need_sync_to_vcpu = true; |
4667 | 4854 | ||
@@ -4878,6 +5065,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) | |||
4878 | } | 5065 | } |
4879 | EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); | 5066 | EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); |
4880 | 5067 | ||
5068 | static void kvm_set_mmio_spte_mask(void) | ||
5069 | { | ||
5070 | u64 mask; | ||
5071 | int maxphyaddr = boot_cpu_data.x86_phys_bits; | ||
5072 | |||
5073 | /* | ||
5074 | * Set the reserved bits and the present bit of a paging-structure | ||
5075 | * entry to generate a page fault with PFERR.RSVD = 1. | ||
5076 | */ | ||
5077 | mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; | ||
5078 | mask |= 1ull; | ||
5079 | |||
5080 | #ifdef CONFIG_X86_64 | ||
5081 | /* | ||
5082 | * If reserved bit is not supported, clear the present bit to disable | ||
5083 | * mmio page fault. | ||
5084 | */ | ||
5085 | if (maxphyaddr == 52) | ||
5086 | mask &= ~1ull; | ||
5087 | #endif | ||
5088 | |||
5089 | kvm_mmu_set_mmio_spte_mask(mask); | ||
5090 | } | ||
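A worked instance of the mask computed above, for a hypothetical CPU with 40 physical address bits:

	/*
	 * maxphyaddr = 40:
	 *   ((1ull << (62 - 40 + 1)) - 1) << 40  =  0x7fffff0000000000   (bits 40..62)
	 *   mask |= 1ull                         =  0x7fffff0000000001   (present bit)
	 *
	 * An spte with these reserved bits set faults with the RSVD error-code
	 * bit, which is what the new mmio page fault path keys on.
	 */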
5091 | |||
4881 | int kvm_arch_init(void *opaque) | 5092 | int kvm_arch_init(void *opaque) |
4882 | { | 5093 | { |
4883 | int r; | 5094 | int r; |
@@ -4904,10 +5115,10 @@ int kvm_arch_init(void *opaque) | |||
4904 | if (r) | 5115 | if (r) |
4905 | goto out; | 5116 | goto out; |
4906 | 5117 | ||
5118 | kvm_set_mmio_spte_mask(); | ||
4907 | kvm_init_msr_list(); | 5119 | kvm_init_msr_list(); |
4908 | 5120 | ||
4909 | kvm_x86_ops = ops; | 5121 | kvm_x86_ops = ops; |
4910 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); | ||
4911 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, | 5122 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, |
4912 | PT_DIRTY_MASK, PT64_NX_MASK, 0); | 5123 | PT_DIRTY_MASK, PT64_NX_MASK, 0); |
4913 | 5124 | ||
@@ -5082,8 +5293,7 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) | |||
5082 | 5293 | ||
5083 | kvm_x86_ops->patch_hypercall(vcpu, instruction); | 5294 | kvm_x86_ops->patch_hypercall(vcpu, instruction); |
5084 | 5295 | ||
5085 | return emulator_write_emulated(&vcpu->arch.emulate_ctxt, | 5296 | return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); |
5086 | rip, instruction, 3, NULL); | ||
5087 | } | 5297 | } |
5088 | 5298 | ||
5089 | static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) | 5299 | static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) |
@@ -5384,6 +5594,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5384 | r = 1; | 5594 | r = 1; |
5385 | goto out; | 5595 | goto out; |
5386 | } | 5596 | } |
5597 | if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) | ||
5598 | record_steal_time(vcpu); | ||
5599 | |||
5387 | } | 5600 | } |
5388 | 5601 | ||
5389 | r = kvm_mmu_reload(vcpu); | 5602 | r = kvm_mmu_reload(vcpu); |
@@ -5671,8 +5884,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
5671 | * that usually, but some badly designed PV devices (vmware | 5884 | * that usually, but some badly designed PV devices (vmware |
5672 | * backdoor interface) need this to work | 5885 | * backdoor interface) need this to work |
5673 | */ | 5886 | */ |
5674 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 5887 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
5675 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | 5888 | memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); |
5676 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; | 5889 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; |
5677 | } | 5890 | } |
5678 | regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); | 5891 | regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
@@ -5801,21 +6014,20 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, | |||
5801 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | 6014 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, |
5802 | bool has_error_code, u32 error_code) | 6015 | bool has_error_code, u32 error_code) |
5803 | { | 6016 | { |
5804 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 6017 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
5805 | int ret; | 6018 | int ret; |
5806 | 6019 | ||
5807 | init_emulate_ctxt(vcpu); | 6020 | init_emulate_ctxt(vcpu); |
5808 | 6021 | ||
5809 | ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, | 6022 | ret = emulator_task_switch(ctxt, tss_selector, reason, |
5810 | tss_selector, reason, has_error_code, | 6023 | has_error_code, error_code); |
5811 | error_code); | ||
5812 | 6024 | ||
5813 | if (ret) | 6025 | if (ret) |
5814 | return EMULATE_FAIL; | 6026 | return EMULATE_FAIL; |
5815 | 6027 | ||
5816 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | 6028 | memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); |
5817 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | 6029 | kvm_rip_write(vcpu, ctxt->eip); |
5818 | kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | 6030 | kvm_set_rflags(vcpu, ctxt->eflags); |
5819 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 6031 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
5820 | return EMULATE_DONE; | 6032 | return EMULATE_DONE; |
5821 | } | 6033 | } |
@@ -6093,12 +6305,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | |||
6093 | if (r == 0) | 6305 | if (r == 0) |
6094 | r = kvm_mmu_setup(vcpu); | 6306 | r = kvm_mmu_setup(vcpu); |
6095 | vcpu_put(vcpu); | 6307 | vcpu_put(vcpu); |
6096 | if (r < 0) | ||
6097 | goto free_vcpu; | ||
6098 | 6308 | ||
6099 | return 0; | ||
6100 | free_vcpu: | ||
6101 | kvm_x86_ops->vcpu_free(vcpu); | ||
6102 | return r; | 6309 | return r; |
6103 | } | 6310 | } |
6104 | 6311 | ||
@@ -6126,6 +6333,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) | |||
6126 | 6333 | ||
6127 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 6334 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
6128 | vcpu->arch.apf.msr_val = 0; | 6335 | vcpu->arch.apf.msr_val = 0; |
6336 | vcpu->arch.st.msr_val = 0; | ||
6129 | 6337 | ||
6130 | kvmclock_reset(vcpu); | 6338 | kvmclock_reset(vcpu); |
6131 | 6339 | ||
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e407ed3df817..d36fe237c665 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -75,10 +75,54 @@ static inline u32 bit(int bitno) | |||
75 | return 1 << (bitno & 31); | 75 | return 1 << (bitno & 31); |
76 | } | 76 | } |
77 | 77 | ||
78 | static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, | ||
79 | gva_t gva, gfn_t gfn, unsigned access) | ||
80 | { | ||
81 | vcpu->arch.mmio_gva = gva & PAGE_MASK; | ||
82 | vcpu->arch.access = access; | ||
83 | vcpu->arch.mmio_gfn = gfn; | ||
84 | } | ||
85 | |||
86 | /* | ||
87 | * Clear the mmio cache info for the given gva. | ||
88 | * In particular, if gva is ~0ul, we clear all mmio cache info. | ||
89 | */ | ||
90 | static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva) | ||
91 | { | ||
92 | if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK)) | ||
93 | return; | ||
94 | |||
95 | vcpu->arch.mmio_gva = 0; | ||
96 | } | ||
97 | |||
98 | static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva) | ||
99 | { | ||
100 | if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK)) | ||
101 | return true; | ||
102 | |||
103 | return false; | ||
104 | } | ||
105 | |||
106 | static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) | ||
107 | { | ||
108 | if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT) | ||
109 | return true; | ||
110 | |||
111 | return false; | ||
112 | } | ||
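A usage sketch for these cache helpers (an assumption, not shown in this header): the fault path that identifies an MMIO access fills the cache, and any path that invalidates the translation clears it. is_mmio_access() is a hypothetical placeholder.

	/* on discovering an MMIO access at (gva, gpa): */
	if (is_mmio_access(gpa))
		vcpu_cache_mmio_info(vcpu, gva, gpa >> PAGE_SHIFT, access);

	/* when the mapping for gva changes: */
	vcpu_clear_mmio_info(vcpu, gva);

	/* to drop the whole cache: */
	vcpu_clear_mmio_info(vcpu, ~0ul);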
113 | |||
78 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); | 114 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); |
79 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); | 115 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); |
80 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); | 116 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); |
81 | 117 | ||
82 | void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); | 118 | void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); |
83 | 119 | ||
120 | int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, | ||
121 | gva_t addr, void *val, unsigned int bytes, | ||
122 | struct x86_exception *exception); | ||
123 | |||
124 | int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, | ||
125 | gva_t addr, void *val, unsigned int bytes, | ||
126 | struct x86_exception *exception); | ||
127 | |||
84 | #endif | 128 | #endif |
diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 55ef181521ff..2c366b52f505 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h | |||
@@ -161,6 +161,7 @@ struct kvm_pit_config { | |||
161 | #define KVM_EXIT_NMI 16 | 161 | #define KVM_EXIT_NMI 16 |
162 | #define KVM_EXIT_INTERNAL_ERROR 17 | 162 | #define KVM_EXIT_INTERNAL_ERROR 17 |
163 | #define KVM_EXIT_OSI 18 | 163 | #define KVM_EXIT_OSI 18 |
164 | #define KVM_EXIT_PAPR_HCALL 19 | ||
164 | 165 | ||
165 | /* For KVM_EXIT_INTERNAL_ERROR */ | 166 | /* For KVM_EXIT_INTERNAL_ERROR */ |
166 | #define KVM_INTERNAL_ERROR_EMULATION 1 | 167 | #define KVM_INTERNAL_ERROR_EMULATION 1 |
@@ -264,6 +265,11 @@ struct kvm_run { | |||
264 | struct { | 265 | struct { |
265 | __u64 gprs[32]; | 266 | __u64 gprs[32]; |
266 | } osi; | 267 | } osi; |
268 | struct { | ||
269 | __u64 nr; | ||
270 | __u64 ret; | ||
271 | __u64 args[9]; | ||
272 | } papr_hcall; | ||
267 | /* Fix the size of the union. */ | 273 | /* Fix the size of the union. */ |
268 | char padding[256]; | 274 | char padding[256]; |
269 | }; | 275 | }; |
@@ -544,6 +550,9 @@ struct kvm_ppc_pvinfo { | |||
544 | #define KVM_CAP_TSC_CONTROL 60 | 550 | #define KVM_CAP_TSC_CONTROL 60 |
545 | #define KVM_CAP_GET_TSC_KHZ 61 | 551 | #define KVM_CAP_GET_TSC_KHZ 61 |
546 | #define KVM_CAP_PPC_BOOKE_SREGS 62 | 552 | #define KVM_CAP_PPC_BOOKE_SREGS 62 |
553 | #define KVM_CAP_SPAPR_TCE 63 | ||
554 | #define KVM_CAP_PPC_SMT 64 | ||
555 | #define KVM_CAP_PPC_RMA 65 | ||
547 | 556 | ||
548 | #ifdef KVM_CAP_IRQ_ROUTING | 557 | #ifdef KVM_CAP_IRQ_ROUTING |
549 | 558 | ||
@@ -746,6 +755,9 @@ struct kvm_clock_data { | |||
746 | /* Available with KVM_CAP_XCRS */ | 755 | /* Available with KVM_CAP_XCRS */ |
747 | #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) | 756 | #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) |
748 | #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) | 757 | #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) |
758 | #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) | ||
759 | /* Available with KVM_CAP_RMA */ | ||
760 | #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) | ||
749 | 761 | ||
750 | #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) | 762 | #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) |
751 | 763 | ||
@@ -773,20 +785,14 @@ struct kvm_assigned_pci_dev { | |||
773 | 785 | ||
774 | struct kvm_assigned_irq { | 786 | struct kvm_assigned_irq { |
775 | __u32 assigned_dev_id; | 787 | __u32 assigned_dev_id; |
776 | __u32 host_irq; | 788 | __u32 host_irq; /* ignored (legacy field) */ |
777 | __u32 guest_irq; | 789 | __u32 guest_irq; |
778 | __u32 flags; | 790 | __u32 flags; |
779 | union { | 791 | union { |
780 | struct { | ||
781 | __u32 addr_lo; | ||
782 | __u32 addr_hi; | ||
783 | __u32 data; | ||
784 | } guest_msi; | ||
785 | __u32 reserved[12]; | 792 | __u32 reserved[12]; |
786 | }; | 793 | }; |
787 | }; | 794 | }; |
788 | 795 | ||
789 | |||
790 | struct kvm_assigned_msix_nr { | 796 | struct kvm_assigned_msix_nr { |
791 | __u32 assigned_dev_id; | 797 | __u32 assigned_dev_id; |
792 | __u16 entry_nr; | 798 | __u16 entry_nr; |
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 31ebb59cbd2f..eabb21a30c34 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
@@ -47,6 +47,7 @@ | |||
47 | #define KVM_REQ_DEACTIVATE_FPU 10 | 47 | #define KVM_REQ_DEACTIVATE_FPU 10 |
48 | #define KVM_REQ_EVENT 11 | 48 | #define KVM_REQ_EVENT 11 |
49 | #define KVM_REQ_APF_HALT 12 | 49 | #define KVM_REQ_APF_HALT 12 |
50 | #define KVM_REQ_STEAL_UPDATE 13 | ||
50 | 51 | ||
51 | #define KVM_USERSPACE_IRQ_SOURCE_ID 0 | 52 | #define KVM_USERSPACE_IRQ_SOURCE_ID 0 |
52 | 53 | ||
@@ -326,12 +327,17 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) | |||
326 | static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } | 327 | static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } |
327 | 328 | ||
328 | extern struct page *bad_page; | 329 | extern struct page *bad_page; |
330 | extern struct page *fault_page; | ||
331 | |||
329 | extern pfn_t bad_pfn; | 332 | extern pfn_t bad_pfn; |
333 | extern pfn_t fault_pfn; | ||
330 | 334 | ||
331 | int is_error_page(struct page *page); | 335 | int is_error_page(struct page *page); |
332 | int is_error_pfn(pfn_t pfn); | 336 | int is_error_pfn(pfn_t pfn); |
333 | int is_hwpoison_pfn(pfn_t pfn); | 337 | int is_hwpoison_pfn(pfn_t pfn); |
334 | int is_fault_pfn(pfn_t pfn); | 338 | int is_fault_pfn(pfn_t pfn); |
339 | int is_noslot_pfn(pfn_t pfn); | ||
340 | int is_invalid_pfn(pfn_t pfn); | ||
335 | int kvm_is_error_hva(unsigned long addr); | 341 | int kvm_is_error_hva(unsigned long addr); |
336 | int kvm_set_memory_region(struct kvm *kvm, | 342 | int kvm_set_memory_region(struct kvm *kvm, |
337 | struct kvm_userspace_memory_region *mem, | 343 | struct kvm_userspace_memory_region *mem, |
@@ -381,6 +387,8 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, | |||
381 | int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, | 387 | int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, |
382 | unsigned long len); | 388 | unsigned long len); |
383 | int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); | 389 | int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); |
390 | int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | ||
391 | void *data, unsigned long len); | ||
384 | int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, | 392 | int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, |
385 | int offset, int len); | 393 | int offset, int len); |
386 | int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, | 394 | int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, |
diff --git a/kernel/compat.c b/kernel/compat.c index fc9eb093acd5..18197ae2d465 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -890,6 +890,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat) | |||
890 | case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); | 890 | case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); |
891 | } | 891 | } |
892 | } | 892 | } |
893 | EXPORT_SYMBOL_GPL(sigset_from_compat); | ||
893 | 894 | ||
894 | asmlinkage long | 895 | asmlinkage long |
895 | compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | 896 | compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ead9b610aa71..418b3f7053aa 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c | |||
@@ -19,8 +19,10 @@ | |||
19 | #include <linux/time.h> | 19 | #include <linux/time.h> |
20 | #include <linux/sysctl.h> | 20 | #include <linux/sysctl.h> |
21 | #include <linux/delayacct.h> | 21 | #include <linux/delayacct.h> |
22 | #include <linux/module.h> | ||
22 | 23 | ||
23 | int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ | 24 | int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ |
25 | EXPORT_SYMBOL_GPL(delayacct_on); | ||
24 | struct kmem_cache *delayacct_cache; | 26 | struct kmem_cache *delayacct_cache; |
25 | 27 | ||
26 | static int __init delayacct_setup_disable(char *str) | 28 | static int __init delayacct_setup_disable(char *str) |
diff --git a/kernel/sched.c b/kernel/sched.c index 9aaf567c5da5..751a7cc6a5cd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -75,6 +75,9 @@ | |||
75 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
76 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
77 | #include <asm/mutex.h> | 77 | #include <asm/mutex.h> |
78 | #ifdef CONFIG_PARAVIRT | ||
79 | #include <asm/paravirt.h> | ||
80 | #endif | ||
78 | 81 | ||
79 | #include "sched_cpupri.h" | 82 | #include "sched_cpupri.h" |
80 | #include "workqueue_sched.h" | 83 | #include "workqueue_sched.h" |
@@ -528,6 +531,12 @@ struct rq { | |||
528 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 531 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
529 | u64 prev_irq_time; | 532 | u64 prev_irq_time; |
530 | #endif | 533 | #endif |
534 | #ifdef CONFIG_PARAVIRT | ||
535 | u64 prev_steal_time; | ||
536 | #endif | ||
537 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
538 | u64 prev_steal_time_rq; | ||
539 | #endif | ||
531 | 540 | ||
532 | /* calc_load related fields */ | 541 | /* calc_load related fields */ |
533 | unsigned long calc_load_update; | 542 | unsigned long calc_load_update; |
@@ -1921,10 +1930,28 @@ void account_system_vtime(struct task_struct *curr) | |||
1921 | } | 1930 | } |
1922 | EXPORT_SYMBOL_GPL(account_system_vtime); | 1931 | EXPORT_SYMBOL_GPL(account_system_vtime); |
1923 | 1932 | ||
1924 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 1933 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
1934 | |||
1935 | #ifdef CONFIG_PARAVIRT | ||
1936 | static inline u64 steal_ticks(u64 steal) | ||
1925 | { | 1937 | { |
1926 | s64 irq_delta; | 1938 | if (unlikely(steal > NSEC_PER_SEC)) |
1939 | return div_u64(steal, TICK_NSEC); | ||
1927 | 1940 | ||
1941 | return __iter_div_u64_rem(steal, TICK_NSEC, &steal); | ||
1942 | } | ||
1943 | #endif | ||
1944 | |||
1945 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
1946 | { | ||
1947 | /* | ||
1948 | * In theory, the compile should just see 0 here, and optimize out the call | ||
1949 | * to sched_rt_avg_update. But I don't trust it... | ||
1950 | */ | ||
1951 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
1952 | s64 steal = 0, irq_delta = 0; | ||
1953 | #endif | ||
1954 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
1928 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | 1955 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; |
1929 | 1956 | ||
1930 | /* | 1957 | /* |
@@ -1947,12 +1974,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
1947 | 1974 | ||
1948 | rq->prev_irq_time += irq_delta; | 1975 | rq->prev_irq_time += irq_delta; |
1949 | delta -= irq_delta; | 1976 | delta -= irq_delta; |
1977 | #endif | ||
1978 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
1979 | if (static_branch((¶virt_steal_rq_enabled))) { | ||
1980 | u64 st; | ||
1981 | |||
1982 | steal = paravirt_steal_clock(cpu_of(rq)); | ||
1983 | steal -= rq->prev_steal_time_rq; | ||
1984 | |||
1985 | if (unlikely(steal > delta)) | ||
1986 | steal = delta; | ||
1987 | |||
1988 | st = steal_ticks(steal); | ||
1989 | steal = st * TICK_NSEC; | ||
1990 | |||
1991 | rq->prev_steal_time_rq += steal; | ||
1992 | |||
1993 | delta -= steal; | ||
1994 | } | ||
1995 | #endif | ||
1996 | |||
1950 | rq->clock_task += delta; | 1997 | rq->clock_task += delta; |
1951 | 1998 | ||
1952 | if (irq_delta && sched_feat(NONIRQ_POWER)) | 1999 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) |
1953 | sched_rt_avg_update(rq, irq_delta); | 2000 | if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) |
2001 | sched_rt_avg_update(rq, irq_delta + steal); | ||
2002 | #endif | ||
1954 | } | 2003 | } |
1955 | 2004 | ||
2005 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
1956 | static int irqtime_account_hi_update(void) | 2006 | static int irqtime_account_hi_update(void) |
1957 | { | 2007 | { |
1958 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2008 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
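As a worked illustration of the new bookkeeping in update_rq_clock_task() (numbers are illustrative): if delta is 4,000,000 ns while irq_delta is 1,000,000 ns and the clamped, tick-rounded steal is 1,000,000 ns, rq->clock_task advances by only 2,000,000 ns, and the combined 2,000,000 ns of non-task time is fed to sched_rt_avg_update(). That is also why the scheduler feature is renamed from NONIRQ_POWER to NONTASK_POWER: CPU power is now decremented for stolen time as well as for IRQ time.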
@@ -1987,12 +2037,7 @@ static int irqtime_account_si_update(void) | |||
1987 | 2037 | ||
1988 | #define sched_clock_irqtime (0) | 2038 | #define sched_clock_irqtime (0) |
1989 | 2039 | ||
1990 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 2040 | #endif |
1991 | { | ||
1992 | rq->clock_task += delta; | ||
1993 | } | ||
1994 | |||
1995 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
1996 | 2041 | ||
1997 | #include "sched_idletask.c" | 2042 | #include "sched_idletask.c" |
1998 | #include "sched_fair.c" | 2043 | #include "sched_fair.c" |
@@ -3845,6 +3890,25 @@ void account_idle_time(cputime_t cputime) | |||
3845 | cpustat->idle = cputime64_add(cpustat->idle, cputime64); | 3890 | cpustat->idle = cputime64_add(cpustat->idle, cputime64); |
3846 | } | 3891 | } |
3847 | 3892 | ||
3893 | static __always_inline bool steal_account_process_tick(void) | ||
3894 | { | ||
3895 | #ifdef CONFIG_PARAVIRT | ||
3896 | if (static_branch(¶virt_steal_enabled)) { | ||
3897 | u64 steal, st = 0; | ||
3898 | |||
3899 | steal = paravirt_steal_clock(smp_processor_id()); | ||
3900 | steal -= this_rq()->prev_steal_time; | ||
3901 | |||
3902 | st = steal_ticks(steal); | ||
3903 | this_rq()->prev_steal_time += st * TICK_NSEC; | ||
3904 | |||
3905 | account_steal_time(st); | ||
3906 | return st; | ||
3907 | } | ||
3908 | #endif | ||
3909 | return false; | ||
3910 | } | ||
3911 | |||
3848 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 3912 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
3849 | 3913 | ||
3850 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 3914 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
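To make the tick conversion in steal_account_process_tick() concrete, here is a small standalone sketch of the same arithmetic, using plain division in place of the kernel's __iter_div_u64_rem and assuming HZ=1000 (so TICK_NSEC is 1,000,000): a steal delta of 2,500,000 ns accounts 2 ticks and leaves 500,000 ns pending, because prev_steal_time only advances by whole ticks.

    /* Standalone illustration of the steal-time tick accounting above.
     * The TICK_NSEC value assumes HZ=1000; the kernel uses __iter_div_u64_rem. */
    #include <stdint.h>
    #include <stdio.h>

    #define TICK_NSEC 1000000ULL            /* assumption: HZ=1000 */

    int main(void)
    {
            uint64_t prev_steal_time = 0;       /* rq->prev_steal_time analogue */
            uint64_t steal_clock = 2500000;     /* paravirt_steal_clock() sample */

            uint64_t steal = steal_clock - prev_steal_time;
            uint64_t ticks = steal / TICK_NSEC; /* steal_ticks() for deltas < 1s */

            prev_steal_time += ticks * TICK_NSEC;   /* only whole ticks consumed */
            printf("accounted %llu tick(s), %llu ns still pending\n",
                   (unsigned long long)ticks,
                   (unsigned long long)(steal_clock - prev_steal_time));
            return 0;
    }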
@@ -3876,6 +3940,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
3876 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | 3940 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); |
3877 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3941 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3878 | 3942 | ||
3943 | if (steal_account_process_tick()) | ||
3944 | return; | ||
3945 | |||
3879 | if (irqtime_account_hi_update()) { | 3946 | if (irqtime_account_hi_update()) { |
3880 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3947 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
3881 | } else if (irqtime_account_si_update()) { | 3948 | } else if (irqtime_account_si_update()) { |
@@ -3929,6 +3996,9 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
3929 | return; | 3996 | return; |
3930 | } | 3997 | } |
3931 | 3998 | ||
3999 | if (steal_account_process_tick()) | ||
4000 | return; | ||
4001 | |||
3932 | if (user_tick) | 4002 | if (user_tick) |
3933 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 4003 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
3934 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 4004 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 1e7066d76c26..2e74677cb040 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -61,9 +61,9 @@ SCHED_FEAT(LB_BIAS, 1) | |||
61 | SCHED_FEAT(OWNER_SPIN, 1) | 61 | SCHED_FEAT(OWNER_SPIN, 1) |
62 | 62 | ||
63 | /* | 63 | /* |
64 | * Decrement CPU power based on irq activity | 64 | * Decrement CPU power based on time not spent running tasks |
65 | */ | 65 | */ |
66 | SCHED_FEAT(NONIRQ_POWER, 1) | 66 | SCHED_FEAT(NONTASK_POWER, 1) |
67 | 67 | ||
68 | /* | 68 | /* |
69 | * Queue remote wakeups on the target CPU and process them | 69 | * Queue remote wakeups on the target CPU and process them |
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 6cc4b97ec458..4e9eaeb518c7 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c | |||
@@ -617,7 +617,7 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, | |||
617 | if (adev->entries_nr == 0) { | 617 | if (adev->entries_nr == 0) { |
618 | adev->entries_nr = entry_nr->entry_nr; | 618 | adev->entries_nr = entry_nr->entry_nr; |
619 | if (adev->entries_nr == 0 || | 619 | if (adev->entries_nr == 0 || |
620 | adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { | 620 | adev->entries_nr > KVM_MAX_MSIX_PER_DEV) { |
621 | r = -EINVAL; | 621 | r = -EINVAL; |
622 | goto msix_nr_out; | 622 | goto msix_nr_out; |
623 | } | 623 | } |
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 62a9caf0563c..78c80f67f535 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c | |||
@@ -30,6 +30,12 @@ | |||
30 | #include <linux/iommu.h> | 30 | #include <linux/iommu.h> |
31 | #include <linux/intel-iommu.h> | 31 | #include <linux/intel-iommu.h> |
32 | 32 | ||
33 | static int allow_unsafe_assigned_interrupts; | ||
34 | module_param_named(allow_unsafe_assigned_interrupts, | ||
35 | allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); | ||
36 | MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, | ||
37 | "Enable device assignment on platforms without interrupt remapping support."); | ||
38 | |||
33 | static int kvm_iommu_unmap_memslots(struct kvm *kvm); | 39 | static int kvm_iommu_unmap_memslots(struct kvm *kvm); |
34 | static void kvm_iommu_put_pages(struct kvm *kvm, | 40 | static void kvm_iommu_put_pages(struct kvm *kvm, |
35 | gfn_t base_gfn, unsigned long npages); | 41 | gfn_t base_gfn, unsigned long npages); |
@@ -231,6 +237,18 @@ int kvm_iommu_map_guest(struct kvm *kvm) | |||
231 | if (!kvm->arch.iommu_domain) | 237 | if (!kvm->arch.iommu_domain) |
232 | return -ENOMEM; | 238 | return -ENOMEM; |
233 | 239 | ||
240 | if (!allow_unsafe_assigned_interrupts && | ||
241 | !iommu_domain_has_cap(kvm->arch.iommu_domain, | ||
242 | IOMMU_CAP_INTR_REMAP)) { | ||
243 | printk(KERN_WARNING "%s: No interrupt remapping support," | ||
244 | " disallowing device assignment." | ||
245 | " Re-enble with \"allow_unsafe_assigned_interrupts=1\"" | ||
246 | " module option.\n", __func__); | ||
247 | iommu_domain_free(kvm->arch.iommu_domain); | ||
248 | kvm->arch.iommu_domain = NULL; | ||
249 | return -EPERM; | ||
250 | } | ||
251 | |||
234 | r = kvm_iommu_map_memslots(kvm); | 252 | r = kvm_iommu_map_memslots(kvm); |
235 | if (r) | 253 | if (r) |
236 | goto out_unmap; | 254 | goto out_unmap; |
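In practice the new knob is exposed as a module parameter of whichever module this file is linked into (kvm.ko on x86); assuming that build, device assignment on hosts without interrupt remapping can be re-enabled with kvm.allow_unsafe_assigned_interrupts=1 on the kernel command line, or at runtime by writing 1 to the parameter under /sys/module/, since S_IWUSR is set above.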
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 96ebc0679415..aefdda390f5e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
@@ -84,6 +84,10 @@ struct dentry *kvm_debugfs_dir; | |||
84 | 84 | ||
85 | static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, | 85 | static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, |
86 | unsigned long arg); | 86 | unsigned long arg); |
87 | #ifdef CONFIG_COMPAT | ||
88 | static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, | ||
89 | unsigned long arg); | ||
90 | #endif | ||
87 | static int hardware_enable_all(void); | 91 | static int hardware_enable_all(void); |
88 | static void hardware_disable_all(void); | 92 | static void hardware_disable_all(void); |
89 | 93 | ||
@@ -97,8 +101,8 @@ static bool largepages_enabled = true; | |||
97 | static struct page *hwpoison_page; | 101 | static struct page *hwpoison_page; |
98 | static pfn_t hwpoison_pfn; | 102 | static pfn_t hwpoison_pfn; |
99 | 103 | ||
100 | static struct page *fault_page; | 104 | struct page *fault_page; |
101 | static pfn_t fault_pfn; | 105 | pfn_t fault_pfn; |
102 | 106 | ||
103 | inline int kvm_is_mmio_pfn(pfn_t pfn) | 107 | inline int kvm_is_mmio_pfn(pfn_t pfn) |
104 | { | 108 | { |
@@ -827,6 +831,13 @@ skip_lpage: | |||
827 | 831 | ||
828 | kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); | 832 | kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); |
829 | 833 | ||
834 | /* | ||
835 | * If the new memory slot is created, we need to clear all | ||
836 | * mmio sptes. | ||
837 | */ | ||
838 | if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) | ||
839 | kvm_arch_flush_shadow(kvm); | ||
840 | |||
830 | kvm_free_physmem_slot(&old, &new); | 841 | kvm_free_physmem_slot(&old, &new); |
831 | kfree(old_memslots); | 842 | kfree(old_memslots); |
832 | 843 | ||
@@ -927,6 +938,18 @@ int is_fault_pfn(pfn_t pfn) | |||
927 | } | 938 | } |
928 | EXPORT_SYMBOL_GPL(is_fault_pfn); | 939 | EXPORT_SYMBOL_GPL(is_fault_pfn); |
929 | 940 | ||
941 | int is_noslot_pfn(pfn_t pfn) | ||
942 | { | ||
943 | return pfn == bad_pfn; | ||
944 | } | ||
945 | EXPORT_SYMBOL_GPL(is_noslot_pfn); | ||
946 | |||
947 | int is_invalid_pfn(pfn_t pfn) | ||
948 | { | ||
949 | return pfn == hwpoison_pfn || pfn == fault_pfn; | ||
950 | } | ||
951 | EXPORT_SYMBOL_GPL(is_invalid_pfn); | ||
952 | |||
930 | static inline unsigned long bad_hva(void) | 953 | static inline unsigned long bad_hva(void) |
931 | { | 954 | { |
932 | return PAGE_OFFSET; | 955 | return PAGE_OFFSET; |
@@ -1345,7 +1368,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, | |||
1345 | addr = gfn_to_hva(kvm, gfn); | 1368 | addr = gfn_to_hva(kvm, gfn); |
1346 | if (kvm_is_error_hva(addr)) | 1369 | if (kvm_is_error_hva(addr)) |
1347 | return -EFAULT; | 1370 | return -EFAULT; |
1348 | r = copy_to_user((void __user *)addr + offset, data, len); | 1371 | r = __copy_to_user((void __user *)addr + offset, data, len); |
1349 | if (r) | 1372 | if (r) |
1350 | return -EFAULT; | 1373 | return -EFAULT; |
1351 | mark_page_dirty(kvm, gfn); | 1374 | mark_page_dirty(kvm, gfn); |
@@ -1405,7 +1428,7 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | |||
1405 | if (kvm_is_error_hva(ghc->hva)) | 1428 | if (kvm_is_error_hva(ghc->hva)) |
1406 | return -EFAULT; | 1429 | return -EFAULT; |
1407 | 1430 | ||
1408 | r = copy_to_user((void __user *)ghc->hva, data, len); | 1431 | r = __copy_to_user((void __user *)ghc->hva, data, len); |
1409 | if (r) | 1432 | if (r) |
1410 | return -EFAULT; | 1433 | return -EFAULT; |
1411 | mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); | 1434 | mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); |
@@ -1414,6 +1437,26 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | |||
1414 | } | 1437 | } |
1415 | EXPORT_SYMBOL_GPL(kvm_write_guest_cached); | 1438 | EXPORT_SYMBOL_GPL(kvm_write_guest_cached); |
1416 | 1439 | ||
1440 | int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, | ||
1441 | void *data, unsigned long len) | ||
1442 | { | ||
1443 | struct kvm_memslots *slots = kvm_memslots(kvm); | ||
1444 | int r; | ||
1445 | |||
1446 | if (slots->generation != ghc->generation) | ||
1447 | kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); | ||
1448 | |||
1449 | if (kvm_is_error_hva(ghc->hva)) | ||
1450 | return -EFAULT; | ||
1451 | |||
1452 | r = __copy_from_user(data, (void __user *)ghc->hva, len); | ||
1453 | if (r) | ||
1454 | return -EFAULT; | ||
1455 | |||
1456 | return 0; | ||
1457 | } | ||
1458 | EXPORT_SYMBOL_GPL(kvm_read_guest_cached); | ||
1459 | |||
1417 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) | 1460 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) |
1418 | { | 1461 | { |
1419 | return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, | 1462 | return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, |
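A hedged kernel-side sketch of how the new read half pairs with the existing cache setup: the cache is initialized once against a guest physical address and then read cheaply on hot paths, with revalidation against slots->generation handled inside kvm_read_guest_cached(). Only kvm_gfn_to_hva_cache_init() and kvm_read_guest_cached() come from this tree; the surrounding function and the my_cache variable are hypothetical.

    /* Hypothetical kernel-context sketch; only the two kvm_*_cached calls
     * are from this merge. */
    #include <linux/kvm_host.h>

    static struct gfn_to_hva_cache my_cache;    /* hypothetical cache */

    static int my_feature_read_u32(struct kvm *kvm, gpa_t gpa, u32 *val)
    {
            int r;

            r = kvm_gfn_to_hva_cache_init(kvm, &my_cache, gpa);     /* set up once */
            if (r)
                    return r;

            /* Hot path: the cache is revalidated if the memslots changed. */
            return kvm_read_guest_cached(kvm, &my_cache, val, sizeof(*val));
    }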
@@ -1586,7 +1629,9 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp) | |||
1586 | static struct file_operations kvm_vcpu_fops = { | 1629 | static struct file_operations kvm_vcpu_fops = { |
1587 | .release = kvm_vcpu_release, | 1630 | .release = kvm_vcpu_release, |
1588 | .unlocked_ioctl = kvm_vcpu_ioctl, | 1631 | .unlocked_ioctl = kvm_vcpu_ioctl, |
1589 | .compat_ioctl = kvm_vcpu_ioctl, | 1632 | #ifdef CONFIG_COMPAT |
1633 | .compat_ioctl = kvm_vcpu_compat_ioctl, | ||
1634 | #endif | ||
1590 | .mmap = kvm_vcpu_mmap, | 1635 | .mmap = kvm_vcpu_mmap, |
1591 | .llseek = noop_llseek, | 1636 | .llseek = noop_llseek, |
1592 | }; | 1637 | }; |
@@ -1615,18 +1660,18 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) | |||
1615 | 1660 | ||
1616 | r = kvm_arch_vcpu_setup(vcpu); | 1661 | r = kvm_arch_vcpu_setup(vcpu); |
1617 | if (r) | 1662 | if (r) |
1618 | return r; | 1663 | goto vcpu_destroy; |
1619 | 1664 | ||
1620 | mutex_lock(&kvm->lock); | 1665 | mutex_lock(&kvm->lock); |
1621 | if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { | 1666 | if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { |
1622 | r = -EINVAL; | 1667 | r = -EINVAL; |
1623 | goto vcpu_destroy; | 1668 | goto unlock_vcpu_destroy; |
1624 | } | 1669 | } |
1625 | 1670 | ||
1626 | kvm_for_each_vcpu(r, v, kvm) | 1671 | kvm_for_each_vcpu(r, v, kvm) |
1627 | if (v->vcpu_id == id) { | 1672 | if (v->vcpu_id == id) { |
1628 | r = -EEXIST; | 1673 | r = -EEXIST; |
1629 | goto vcpu_destroy; | 1674 | goto unlock_vcpu_destroy; |
1630 | } | 1675 | } |
1631 | 1676 | ||
1632 | BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); | 1677 | BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); |
@@ -1636,7 +1681,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) | |||
1636 | r = create_vcpu_fd(vcpu); | 1681 | r = create_vcpu_fd(vcpu); |
1637 | if (r < 0) { | 1682 | if (r < 0) { |
1638 | kvm_put_kvm(kvm); | 1683 | kvm_put_kvm(kvm); |
1639 | goto vcpu_destroy; | 1684 | goto unlock_vcpu_destroy; |
1640 | } | 1685 | } |
1641 | 1686 | ||
1642 | kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; | 1687 | kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; |
@@ -1650,8 +1695,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) | |||
1650 | mutex_unlock(&kvm->lock); | 1695 | mutex_unlock(&kvm->lock); |
1651 | return r; | 1696 | return r; |
1652 | 1697 | ||
1653 | vcpu_destroy: | 1698 | unlock_vcpu_destroy: |
1654 | mutex_unlock(&kvm->lock); | 1699 | mutex_unlock(&kvm->lock); |
1700 | vcpu_destroy: | ||
1655 | kvm_arch_vcpu_destroy(vcpu); | 1701 | kvm_arch_vcpu_destroy(vcpu); |
1656 | return r; | 1702 | return r; |
1657 | } | 1703 | } |
@@ -1874,6 +1920,50 @@ out: | |||
1874 | return r; | 1920 | return r; |
1875 | } | 1921 | } |
1876 | 1922 | ||
1923 | #ifdef CONFIG_COMPAT | ||
1924 | static long kvm_vcpu_compat_ioctl(struct file *filp, | ||
1925 | unsigned int ioctl, unsigned long arg) | ||
1926 | { | ||
1927 | struct kvm_vcpu *vcpu = filp->private_data; | ||
1928 | void __user *argp = compat_ptr(arg); | ||
1929 | int r; | ||
1930 | |||
1931 | if (vcpu->kvm->mm != current->mm) | ||
1932 | return -EIO; | ||
1933 | |||
1934 | switch (ioctl) { | ||
1935 | case KVM_SET_SIGNAL_MASK: { | ||
1936 | struct kvm_signal_mask __user *sigmask_arg = argp; | ||
1937 | struct kvm_signal_mask kvm_sigmask; | ||
1938 | compat_sigset_t csigset; | ||
1939 | sigset_t sigset; | ||
1940 | |||
1941 | if (argp) { | ||
1942 | r = -EFAULT; | ||
1943 | if (copy_from_user(&kvm_sigmask, argp, | ||
1944 | sizeof kvm_sigmask)) | ||
1945 | goto out; | ||
1946 | r = -EINVAL; | ||
1947 | if (kvm_sigmask.len != sizeof csigset) | ||
1948 | goto out; | ||
1949 | r = -EFAULT; | ||
1950 | if (copy_from_user(&csigset, sigmask_arg->sigset, | ||
1951 | sizeof csigset)) | ||
1952 | goto out; | ||
1953 | } | ||
1954 | sigset_from_compat(&sigset, &csigset); | ||
1955 | r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); | ||
1956 | break; | ||
1957 | } | ||
1958 | default: | ||
1959 | r = kvm_vcpu_ioctl(filp, ioctl, arg); | ||
1960 | } | ||
1961 | |||
1962 | out: | ||
1963 | return r; | ||
1964 | } | ||
1965 | #endif | ||
1966 | |||
1877 | static long kvm_vm_ioctl(struct file *filp, | 1967 | static long kvm_vm_ioctl(struct file *filp, |
1878 | unsigned int ioctl, unsigned long arg) | 1968 | unsigned int ioctl, unsigned long arg) |
1879 | { | 1969 | { |
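For completeness, only a 32-bit process on a 64-bit kernel reaches the compat path above; such a caller passes an 8-byte sigset so that the "kvm_sigmask.len != sizeof csigset" check is satisfied. A minimal, hypothetical userspace sketch follows; the 8-byte compat sigset size and the use of only the low 64 signal bits are assumptions made for illustration.

    /* Hypothetical 32-bit userspace sketch for KVM_SET_SIGNAL_MASK. */
    #include <linux/kvm.h>
    #include <signal.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>

    static int set_vcpu_sigmask(int vcpu_fd, const sigset_t *mask)
    {
            struct kvm_signal_mask *kmask;
            int r;

            kmask = malloc(sizeof(*kmask) + 8); /* 8 == assumed compat sigset size */
            if (!kmask)
                    return -1;
            kmask->len = 8;
            memcpy(kmask->sigset, mask, 8);     /* low 64 signal bits */

            r = ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, kmask);
            free(kmask);
            return r;
    }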