author     Linus Torvalds <torvalds@linux-foundation.org>  2011-07-24 12:07:03 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2011-07-24 12:07:03 -0400
commit     5fabc487c96819dd12ddb9414835d170fd9cd6d5
tree       01532d492e5074b0d3add29bf92ebf9a9d161e9e
parent     c61264f98c1a974ee6f545f61a4ab33b141d6bda
parent     3f68b0318bbbd61bf08478ab99a149f0d9e5156e
Merge branch 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits)
  KVM: IOMMU: Disable device assignment without interrupt remapping
  KVM: MMU: trace mmio page fault
  KVM: MMU: mmio page fault support
  KVM: MMU: reorganize struct kvm_shadow_walk_iterator
  KVM: MMU: lockless walking shadow page table
  KVM: MMU: do not need atomicly to set/clear spte
  KVM: MMU: introduce the rules to modify shadow page table
  KVM: MMU: abstract some functions to handle fault pfn
  KVM: MMU: filter out the mmio pfn from the fault pfn
  KVM: MMU: remove bypass_guest_pf
  KVM: MMU: split kvm_mmu_free_page
  KVM: MMU: count used shadow pages on prepareing path
  KVM: MMU: rename 'pt_write' to 'emulate'
  KVM: MMU: cleanup for FNAME(fetch)
  KVM: MMU: optimize to handle dirty bit
  KVM: MMU: cache mmio info on page fault path
  KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup the code
  KVM: MMU: do not update slot bitmap if spte is nonpresent
  KVM: MMU: fix walking shadow page table
  KVM guest: KVM Steal time registration
  ...
-rw-r--r--  Documentation/kernel-parameters.txt | 8
-rw-r--r--  Documentation/virtual/kvm/api.txt | 172
-rw-r--r--  Documentation/virtual/kvm/mmu.txt | 18
-rw-r--r--  Documentation/virtual/kvm/msr.txt | 34
-rw-r--r--  Documentation/virtual/kvm/nested-vmx.txt | 251
-rw-r--r--  Documentation/virtual/kvm/ppc-pv.txt | 8
-rw-r--r--  arch/ia64/include/asm/paravirt.h | 4
-rw-r--r--  arch/ia64/kernel/paravirt.c | 2
-rw-r--r--  arch/powerpc/include/asm/cputable.h | 14
-rw-r--r--  arch/powerpc/include/asm/exception-64s.h | 136
-rw-r--r--  arch/powerpc/include/asm/hvcall.h | 5
-rw-r--r--  arch/powerpc/include/asm/kvm.h | 15
-rw-r--r--  arch/powerpc/include/asm/kvm_asm.h | 4
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 196
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h | 4
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_asm.h | 41
-rw-r--r--  arch/powerpc/include/asm/kvm_booke.h | 4
-rw-r--r--  arch/powerpc/include/asm/kvm_e500.h | 30
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 169
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 41
-rw-r--r--  arch/powerpc/include/asm/mmu-hash64.h | 10
-rw-r--r--  arch/powerpc/include/asm/paca.h | 3
-rw-r--r--  arch/powerpc/include/asm/ppc_asm.h | 28
-rw-r--r--  arch/powerpc/include/asm/reg.h | 25
-rw-r--r--  arch/powerpc/include/asm/reg_booke.h | 1
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 190
-rw-r--r--  arch/powerpc/kernel/cpu_setup_power7.S | 22
-rw-r--r--  arch/powerpc/kernel/cpu_setup_ppc970.S | 26
-rw-r--r--  arch/powerpc/kernel/exceptions-64s.S | 228
-rw-r--r--  arch/powerpc/kernel/head_fsl_booke.S | 8
-rw-r--r--  arch/powerpc/kernel/idle_power7.S | 2
-rw-r--r--  arch/powerpc/kernel/paca.c | 2
-rw-r--r--  arch/powerpc/kernel/process.c | 4
-rw-r--r--  arch/powerpc/kernel/setup-common.c | 3
-rw-r--r--  arch/powerpc/kernel/setup_64.c | 3
-rw-r--r--  arch/powerpc/kernel/smp.c | 1
-rw-r--r--  arch/powerpc/kernel/traps.c | 5
-rw-r--r--  arch/powerpc/kvm/44x_tlb.c | 4
-rw-r--r--  arch/powerpc/kvm/Kconfig | 34
-rw-r--r--  arch/powerpc/kvm/Makefile | 27
-rw-r--r--  arch/powerpc/kvm/book3s.c | 1007
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu.c | 54
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 180
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio_hv.c | 73
-rw-r--r--  arch/powerpc/kvm/book3s_exports.c | 9
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 1269
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c | 155
-rw-r--r--  arch/powerpc/kvm/book3s_hv_interrupts.S | 166
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 370
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 1345
-rw-r--r--  arch/powerpc/kvm/book3s_interrupts.S | 21
-rw-r--r--  arch/powerpc/kvm/book3s_mmu_hpte.c | 71
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 1029
-rw-r--r--  arch/powerpc/kvm/book3s_rmhandlers.S | 102
-rw-r--r--  arch/powerpc/kvm/book3s_segment.S | 117
-rw-r--r--  arch/powerpc/kvm/booke.c | 132
-rw-r--r--  arch/powerpc/kvm/booke.h | 23
-rw-r--r--  arch/powerpc/kvm/booke_interrupts.S | 66
-rw-r--r--  arch/powerpc/kvm/e500.c | 7
-rw-r--r--  arch/powerpc/kvm/e500_emulate.c | 4
-rw-r--r--  arch/powerpc/kvm/e500_tlb.c | 800
-rw-r--r--  arch/powerpc/kvm/e500_tlb.h | 13
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 78
-rw-r--r--  arch/powerpc/kvm/timing.c | 9
-rw-r--r--  arch/powerpc/kvm/trace.h | 4
-rw-r--r--  arch/powerpc/mm/hash_native_64.c | 6
-rw-r--r--  arch/powerpc/platforms/iseries/exception.S | 2
-rw-r--r--  arch/powerpc/platforms/iseries/exception.h | 4
-rw-r--r--  arch/powerpc/sysdev/xics/icp-native.c | 9
-rw-r--r--  arch/x86/Kconfig | 12
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 52
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 46
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 20
-rw-r--r--  arch/x86/include/asm/msr-index.h | 12
-rw-r--r--  arch/x86/include/asm/paravirt.h | 9
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 1
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 1
-rw-r--r--  arch/x86/include/asm/vmx.h | 43
-rw-r--r--  arch/x86/kernel/kvm.c | 72
-rw-r--r--  arch/x86/kernel/kvmclock.c | 2
-rw-r--r--  arch/x86/kernel/paravirt.c | 9
-rw-r--r--  arch/x86/kvm/Kconfig | 1
-rw-r--r--  arch/x86/kvm/emulate.c | 1749
-rw-r--r--  arch/x86/kvm/mmu.c | 1226
-rw-r--r--  arch/x86/kvm/mmu.h | 25
-rw-r--r--  arch/x86/kvm/mmu_audit.c | 12
-rw-r--r--  arch/x86/kvm/mmutrace.h | 48
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 258
-rw-r--r--  arch/x86/kvm/svm.c | 6
-rw-r--r--  arch/x86/kvm/trace.h | 31
-rw-r--r--  arch/x86/kvm/vmx.c | 2784
-rw-r--r--  arch/x86/kvm/x86.c | 374
-rw-r--r--  arch/x86/kvm/x86.h | 44
-rw-r--r--  include/linux/kvm.h | 20
-rw-r--r--  include/linux/kvm_host.h | 8
-rw-r--r--  kernel/compat.c | 1
-rw-r--r--  kernel/delayacct.c | 2
-rw-r--r--  kernel/sched.c | 90
-rw-r--r--  kernel/sched_features.h | 4
-rw-r--r--  virt/kvm/assigned-dev.c | 2
-rw-r--r--  virt/kvm/iommu.c | 18
-rw-r--r--  virt/kvm/kvm_main.c | 110
102 files changed, 12320 insertions(+), 3679 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index aa47be71df4c..40cc653984ee 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1159,10 +1159,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			for all guests.
 			Default is 1 (enabled) if in 64bit or 32bit-PAE mode
 
-	kvm-intel.bypass_guest_pf=
-			[KVM,Intel] Disables bypassing of guest page faults
-			on Intel chips. Default is 1 (enabled)
-
 	kvm-intel.ept=	[KVM,Intel] Disable extended page tables
 			(virtualized MMU) support on capable Intel chips.
 			Default is 1 (enabled)
@@ -1737,6 +1733,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	no-kvmapf	[X86,KVM] Disable paravirtualized asynchronous page
 			fault handling.
 
+	no-steal-acc	[X86,KVM] Disable paravirtualized steal time accounting.
+			steal time is computed, but won't influence scheduler
+			behaviour
+
 	nolapic		[X86-32,APIC] Do not enable or use the local APIC.
 
 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 42542eb802ca..b0e4b9cd6a66 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -180,6 +180,19 @@ KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time.
 If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4
 cpus max.
 
+On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
+threads in one or more virtual CPU cores. (This is because the
+hardware requires all the hardware threads in a CPU core to be in the
+same partition.) The KVM_CAP_PPC_SMT capability indicates the number
+of vcpus per virtual core (vcore). The vcore id is obtained by
+dividing the vcpu id by the number of vcpus per vcore. The vcpus in a
+given vcore will always be in the same physical core as each other
+(though that might be a different physical core from time to time).
+Userspace can control the threading (SMT) mode of the guest by its
+allocation of vcpu ids. For example, if userspace wants
+single-threaded guest vcpus, it should make all vcpu ids be a multiple
+of the number of vcpus per vcore.
+
 4.8 KVM_GET_DIRTY_LOG (vm ioctl)
 
 Capability: basic
@@ -1143,15 +1156,10 @@ Assigns an IRQ to a passed-through device.
 
 struct kvm_assigned_irq {
 	__u32 assigned_dev_id;
-	__u32 host_irq;
+	__u32 host_irq; /* ignored (legacy field) */
 	__u32 guest_irq;
 	__u32 flags;
 	union {
-		struct {
-			__u32 addr_lo;
-			__u32 addr_hi;
-			__u32 data;
-		} guest_msi;
 		__u32 reserved[12];
 	};
 };
@@ -1239,8 +1247,10 @@ Type: vm ioctl
 Parameters: struct kvm_assigned_msix_nr (in)
 Returns: 0 on success, -1 on error
 
-Set the number of MSI-X interrupts for an assigned device. This service can
-only be called once in the lifetime of an assigned device.
+Set the number of MSI-X interrupts for an assigned device. The number is
+reset again by terminating the MSI-X assignment of the device via
+KVM_DEASSIGN_DEV_IRQ. Calling this service more than once at any earlier
+point will fail.
 
 struct kvm_assigned_msix_nr {
 	__u32 assigned_dev_id;
@@ -1291,6 +1301,135 @@ Returns the tsc frequency of the guest. The unit of the return value is
 KHz. If the host has unstable tsc this ioctl returns -EIO instead as an
 error.
 
+4.56 KVM_GET_LAPIC
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_lapic_state (out)
+Returns: 0 on success, -1 on error
+
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+	char regs[KVM_APIC_REG_SIZE];
+};
+
+Reads the Local APIC registers and copies them into the input argument. The
+data format and layout are the same as documented in the architecture manual.
+
+4.57 KVM_SET_LAPIC
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_lapic_state (in)
+Returns: 0 on success, -1 on error
+
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+	char regs[KVM_APIC_REG_SIZE];
+};
+
+Copies the input argument into the Local APIC registers. The data format
+and layout are the same as documented in the architecture manual.
+
+4.58 KVM_IOEVENTFD
+
+Capability: KVM_CAP_IOEVENTFD
+Architectures: all
+Type: vm ioctl
+Parameters: struct kvm_ioeventfd (in)
+Returns: 0 on success, !0 on error
+
+This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
+within the guest. A guest write in the registered address will signal the
+provided event instead of triggering an exit.
+
+struct kvm_ioeventfd {
+	__u64 datamatch;
+	__u64 addr;        /* legal pio/mmio address */
+	__u32 len;         /* 1, 2, 4, or 8 bytes    */
+	__s32 fd;
+	__u32 flags;
+	__u8  pad[36];
+};
+
+The following flags are defined:
+
+#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
+#define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
+#define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
+
+If datamatch flag is set, the event will be signaled only if the written value
+to the registered address is equal to datamatch in struct kvm_ioeventfd.
+
+4.62 KVM_CREATE_SPAPR_TCE
+
+Capability: KVM_CAP_SPAPR_TCE
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce (in)
+Returns: file descriptor for manipulating the created TCE table
+
+This creates a virtual TCE (translation control entry) table, which
+is an IOMMU for PAPR-style virtual I/O. It is used to translate
+logical addresses used in virtual I/O into guest physical addresses,
+and provides a scatter/gather capability for PAPR virtual I/O.
+
+/* for KVM_CAP_SPAPR_TCE */
+struct kvm_create_spapr_tce {
+	__u64 liobn;
+	__u32 window_size;
+};
+
+The liobn field gives the logical IO bus number for which to create a
+TCE table. The window_size field specifies the size of the DMA window
+which this TCE table will translate - the table will contain one 64
+bit TCE entry for every 4kiB of the DMA window.
+
+When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE
+table has been created using this ioctl(), the kernel will handle it
+in real mode, updating the TCE table. H_PUT_TCE calls for other
+liobns will cause a vm exit and must be handled by userspace.
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the created TCE table into userspace. This lets userspace read
+the entries written by kernel-handled H_PUT_TCE calls, and also lets
+userspace update the TCE table directly which is useful in some
+circumstances.
+
+4.63 KVM_ALLOCATE_RMA
+
+Capability: KVM_CAP_PPC_RMA
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_allocate_rma (out)
+Returns: file descriptor for mapping the allocated RMA
+
+This allocates a Real Mode Area (RMA) from the pool allocated at boot
+time by the kernel. An RMA is a physically-contiguous, aligned region
+of memory used on older POWER processors to provide the memory which
+will be accessed by real-mode (MMU off) accesses in a KVM guest.
+POWER processors support a set of sizes for the RMA that usually
+includes 64MB, 128MB, 256MB and some larger powers of two.
+
+/* for KVM_ALLOCATE_RMA */
+struct kvm_allocate_rma {
+	__u64 rma_size;
+};
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the allocated RMA into userspace. The mapped area can then be
+passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the
+RMA for a virtual machine. The size of the RMA in bytes (which is
+fixed at host kernel boot time) is returned in the rma_size field of
+the argument structure.
+
+The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl
+is supported; 2 if the processor requires all virtual machines to have
+an RMA, or 1 if the processor can use an RMA but doesn't require it,
+because it supports the Virtual RMA (VRMA) facility.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
@@ -1473,6 +1612,23 @@ Userspace can now handle the hypercall and when it's done modify the gprs as
 necessary. Upon guest entry all guest GPRs will then be replaced by the values
 in this struct.
 
+		/* KVM_EXIT_PAPR_HCALL */
+		struct {
+			__u64 nr;
+			__u64 ret;
+			__u64 args[9];
+		} papr_hcall;
+
+This is used on 64-bit PowerPC when emulating a pSeries partition,
+e.g. with the 'pseries' machine type in qemu. It occurs when the
+guest does a hypercall using the 'sc 1' instruction. The 'nr' field
+contains the hypercall number (from the guest R3), and 'args' contains
+the arguments (from the guest R4 - R12). Userspace should put the
+return code in 'ret' and any extra returned values in args[].
+The possible hypercalls are defined in the Power Architecture Platform
+Requirements (PAPR) document available from www.power.org (free
+developer registration required to access it).
+
 		/* Fix the size of the union. */
 		char padding[256];
 	};
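
To make the KVM_IOEVENTFD description above concrete, the following is a
minimal userspace sketch (not part of this patch): it registers an eventfd
that fires on 1-byte guest writes of the value 1 to I/O port 0xf4. It assumes
a Linux host exposing /dev/kvm and the flag constants from <linux/kvm.h>;
error handling is trimmed to the essentials.

    /* Illustrative only; a real VMM would also create vcpus, guest memory,
     * and a thread that waits on the eventfd. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/eventfd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            int vm, efd;
            struct kvm_ioeventfd ioev = { 0 };

            if (kvm < 0) {
                    perror("/dev/kvm");
                    return 1;
            }
            vm  = ioctl(kvm, KVM_CREATE_VM, 0);
            efd = eventfd(0, 0);

            ioev.addr      = 0xf4;          /* legal pio address */
            ioev.len       = 1;             /* 1-byte writes */
            ioev.fd        = efd;
            ioev.datamatch = 1;             /* only fire when 1 is written */
            ioev.flags     = KVM_IOEVENTFD_FLAG_PIO |
                             KVM_IOEVENTFD_FLAG_DATAMATCH;

            if (ioctl(vm, KVM_IOEVENTFD, &ioev) < 0)
                    perror("KVM_IOEVENTFD");
            return 0;
    }

The point of the interface is that such guest writes are consumed entirely in
the kernel and merely signal the eventfd, instead of forcing a heavyweight
exit to userspace.
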
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index f46aa58389ca..5dc972c09b55 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -165,6 +165,10 @@ Shadow pages contain the following information:
   Contains the value of efer.nxe for which the page is valid.
   role.cr0_wp:
     Contains the value of cr0.wp for which the page is valid.
+  role.smep_andnot_wp:
+    Contains the value of cr4.smep && !cr0.wp for which the page is valid
+    (pages for which this is true are different from other pages; see the
+    treatment of cr0.wp=0 below).
   gfn:
     Either the guest page table containing the translations shadowed by this
     page, or the base page frame for linear translations. See role.direct.
@@ -317,6 +321,20 @@ on fault type:
 
 (user write faults generate a #PF)
 
+In the first case there is an additional complication if CR4.SMEP is
+enabled: since we've turned the page into a kernel page, the kernel may now
+execute it. We handle this by also setting spte.nx. If we get a user
+fetch or read fault, we'll change spte.u=1 and spte.nx=gpte.nx back.
+
+To prevent an spte that was converted into a kernel page with cr0.wp=0
+from being written by the kernel after cr0.wp has changed to 1, we make
+the value of cr0.wp part of the page role. This means that an spte created
+with one value of cr0.wp cannot be used when cr0.wp has a different value -
+it will simply be missed by the shadow page lookup code. A similar issue
+exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after
+changing cr4.smep to 1. To avoid this, the value of !cr0.wp && cr4.smep
+is also made a part of the page role.
+
 Large pages
 ===========
 
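
As a toy illustration of the role-bit argument above (these are not KVM's
real structures; the field names are made up for the example), the following
self-contained program shows how folding cr0.wp and "smep_andnot_wp" into the
page role makes a shadow page built under one setting simply miss in the
lookup once the control bits change:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy "page role": a cached shadow page may only be reused when every
     * role bit matches the current mode. */
    struct toy_role {
            unsigned level          : 4;
            unsigned cr0_wp         : 1;
            unsigned smep_andnot_wp : 1;    /* cr4.smep && !cr0.wp */
    };

    static bool role_equal(struct toy_role a, struct toy_role b)
    {
            return a.level == b.level &&
                   a.cr0_wp == b.cr0_wp &&
                   a.smep_andnot_wp == b.smep_andnot_wp;
    }

    int main(void)
    {
            /* Shadow page created while cr0.wp=0 and cr4.smep=1 ... */
            struct toy_role cached  = { .level = 1, .cr0_wp = 0,
                                        .smep_andnot_wp = 1 };
            /* ... guest then sets cr0.wp=1: the lookup role differs, so the
             * cached page is not found and a fresh one is built instead. */
            struct toy_role current = { .level = 1, .cr0_wp = 1,
                                        .smep_andnot_wp = 0 };

            printf("reuse cached shadow page? %s\n",
                   role_equal(cached, current) ? "yes" : "no");
            return 0;
    }
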
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt
index d079aed27e03..50317809113d 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -185,3 +185,37 @@ MSR_KVM_ASYNC_PF_EN: 0x4b564d02
 
 	Currently type 2 APF will be always delivered on the same vcpu as
 	type 1 was, but guest should not rely on that.
+
+MSR_KVM_STEAL_TIME: 0x4b564d03
+
+	data: 64-byte aligned physical address of a memory area which must be
+	in guest RAM, plus an enable bit in bit 0. This memory is expected to
+	hold a copy of the following structure:
+
+	struct kvm_steal_time {
+		__u64 steal;
+		__u32 version;
+		__u32 flags;
+		__u32 pad[12];
+	}
+
+	whose data will be filled in by the hypervisor periodically. Only one
+	write, or registration, is needed for each VCPU. The interval between
+	updates of this structure is arbitrary and implementation-dependent.
+	The hypervisor may update this structure at any time it sees fit until
+	anything with bit0 == 0 is written to it. Guest is required to make sure
+	this structure is initialized to zero.
+
+	Fields have the following meanings:
+
+	version: a sequence counter. In other words, guest has to check
+	this field before and after grabbing time information and make
+	sure they are both equal and even. An odd version indicates an
+	in-progress update.
+
+	flags: At this point, always zero. May be used to indicate
+	changes in this structure in the future.
+
+	steal: the amount of time in which this vCPU did not run, in
+	nanoseconds. Time during which the vcpu is idle, will not be
+	reported as steal time.
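
A small self-contained sketch of the version protocol described above: sample
'version' before and after reading 'steal' and retry while it is odd or has
changed. The structure layout follows the text; the "hypervisor update" is
simulated here so the program runs anywhere, and the MSR registration in the
comment is only meaningful inside a guest kernel.

    #include <stdint.h>
    #include <stdio.h>

    struct kvm_steal_time {
            uint64_t steal;     /* ns the vcpu was runnable but not running */
            uint32_t version;   /* even = stable, odd = update in progress */
            uint32_t flags;
            uint32_t pad[12];
    };

    static uint64_t read_steal(volatile struct kvm_steal_time *st)
    {
            uint32_t v1, v2;
            uint64_t steal;

            do {
                    v1 = st->version;
                    steal = st->steal;
                    v2 = st->version;
            } while ((v1 & 1) || v1 != v2);   /* odd or changed: retry */
            return steal;
    }

    int main(void)
    {
            /* A guest kernel would register the 64-byte aligned guest
             * physical address of this area, with bit 0 set, by writing
             * it to MSR_KVM_STEAL_TIME (0x4b564d03). The area must start
             * out zeroed, as a static object does. */
            static struct kvm_steal_time st;

            st.steal = 1500000;     /* pretend 1.5 ms of steal time */
            st.version = 2;         /* even: a stable, published value */
            printf("steal = %llu ns\n",
                   (unsigned long long)read_steal(&st));
            return 0;
    }
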
diff --git a/Documentation/virtual/kvm/nested-vmx.txt b/Documentation/virtual/kvm/nested-vmx.txt
new file mode 100644
index 000000000000..8ed937de1163
--- /dev/null
+++ b/Documentation/virtual/kvm/nested-vmx.txt
@@ -0,0 +1,251 @@
1Nested VMX
2==========
3
4Overview
5---------
6
7On Intel processors, KVM uses Intel's VMX (Virtual-Machine eXtensions)
8to easily and efficiently run guest operating systems. Normally, these guests
9*cannot* themselves be hypervisors running their own guests, because in VMX,
10guests cannot use VMX instructions.
11
12The "Nested VMX" feature adds this missing capability - of running guest
13hypervisors (which use VMX) with their own nested guests. It does so by
14allowing a guest to use VMX instructions, and correctly and efficiently
15emulating them using the single level of VMX available in the hardware.
16
17We describe in much greater detail the theory behind the nested VMX feature,
18its implementation and its performance characteristics, in the OSDI 2010 paper
19"The Turtles Project: Design and Implementation of Nested Virtualization",
20available at:
21
22 http://www.usenix.org/events/osdi10/tech/full_papers/Ben-Yehuda.pdf
23
24
25Terminology
26-----------
27
28Single-level virtualization has two levels - the host (KVM) and the guests.
29In nested virtualization, we have three levels: The host (KVM), which we call
30L0, the guest hypervisor, which we call L1, and its nested guest, which we
31call L2.
32
33
34Known limitations
35-----------------
36
37The current code supports running Linux guests under KVM guests.
38Only 64-bit guest hypervisors are supported.
39
40Additional patches for running Windows under guest KVM, and Linux under
41guest VMware server, and support for nested EPT, are currently running in
42the lab, and will be sent as follow-on patchsets.
43
44
45Running nested VMX
46------------------
47
48The nested VMX feature is disabled by default. It can be enabled by giving
49the "nested=1" option to the kvm-intel module.
50
51No modifications are required to user space (qemu). However, qemu's default
52emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be
53explicitly enabled, by giving qemu one of the following options:
54
55 -cpu host (emulated CPU has all features of the real CPU)
56
57 -cpu qemu64,+vmx (add just the vmx feature to a named CPU type)
58
59
60ABIs
61----
62
63Nested VMX aims to present a standard and (eventually) fully-functional VMX
64implementation for the a guest hypervisor to use. As such, the official
65specification of the ABI that it provides is Intel's VMX specification,
66namely volume 3B of their "Intel 64 and IA-32 Architectures Software
67Developer's Manual". Not all of VMX's features are currently fully supported,
68but the goal is to eventually support them all, starting with the VMX features
69which are used in practice by popular hypervisors (KVM and others).
70
71As a VMX implementation, nested VMX presents a VMCS structure to L1.
72As mandated by the spec, other than the two fields revision_id and abort,
73this structure is *opaque* to its user, who is not supposed to know or care
74about its internal structure. Rather, the structure is accessed through the
75VMREAD and VMWRITE instructions.
76Still, for debugging purposes, KVM developers might be interested to know the
77internals of this structure; This is struct vmcs12 from arch/x86/kvm/vmx.c.
78
79The name "vmcs12" refers to the VMCS that L1 builds for L2. In the code we
80also have "vmcs01", the VMCS that L0 built for L1, and "vmcs02" is the VMCS
81which L0 builds to actually run L2 - how this is done is explained in the
82aforementioned paper.
83
84For convenience, we repeat the content of struct vmcs12 here. If the internals
85of this structure changes, this can break live migration across KVM versions.
86VMCS12_REVISION (from vmx.c) should be changed if struct vmcs12 or its inner
87struct shadow_vmcs is ever changed.
88
89 typedef u64 natural_width;
90 struct __packed vmcs12 {
91 /* According to the Intel spec, a VMCS region must start with
92 * these two user-visible fields */
93 u32 revision_id;
94 u32 abort;
95
96 u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
97 u32 padding[7]; /* room for future expansion */
98
99 u64 io_bitmap_a;
100 u64 io_bitmap_b;
101 u64 msr_bitmap;
102 u64 vm_exit_msr_store_addr;
103 u64 vm_exit_msr_load_addr;
104 u64 vm_entry_msr_load_addr;
105 u64 tsc_offset;
106 u64 virtual_apic_page_addr;
107 u64 apic_access_addr;
108 u64 ept_pointer;
109 u64 guest_physical_address;
110 u64 vmcs_link_pointer;
111 u64 guest_ia32_debugctl;
112 u64 guest_ia32_pat;
113 u64 guest_ia32_efer;
114 u64 guest_pdptr0;
115 u64 guest_pdptr1;
116 u64 guest_pdptr2;
117 u64 guest_pdptr3;
118 u64 host_ia32_pat;
119 u64 host_ia32_efer;
120 u64 padding64[8]; /* room for future expansion */
121 natural_width cr0_guest_host_mask;
122 natural_width cr4_guest_host_mask;
123 natural_width cr0_read_shadow;
124 natural_width cr4_read_shadow;
125 natural_width cr3_target_value0;
126 natural_width cr3_target_value1;
127 natural_width cr3_target_value2;
128 natural_width cr3_target_value3;
129 natural_width exit_qualification;
130 natural_width guest_linear_address;
131 natural_width guest_cr0;
132 natural_width guest_cr3;
133 natural_width guest_cr4;
134 natural_width guest_es_base;
135 natural_width guest_cs_base;
136 natural_width guest_ss_base;
137 natural_width guest_ds_base;
138 natural_width guest_fs_base;
139 natural_width guest_gs_base;
140 natural_width guest_ldtr_base;
141 natural_width guest_tr_base;
142 natural_width guest_gdtr_base;
143 natural_width guest_idtr_base;
144 natural_width guest_dr7;
145 natural_width guest_rsp;
146 natural_width guest_rip;
147 natural_width guest_rflags;
148 natural_width guest_pending_dbg_exceptions;
149 natural_width guest_sysenter_esp;
150 natural_width guest_sysenter_eip;
151 natural_width host_cr0;
152 natural_width host_cr3;
153 natural_width host_cr4;
154 natural_width host_fs_base;
155 natural_width host_gs_base;
156 natural_width host_tr_base;
157 natural_width host_gdtr_base;
158 natural_width host_idtr_base;
159 natural_width host_ia32_sysenter_esp;
160 natural_width host_ia32_sysenter_eip;
161 natural_width host_rsp;
162 natural_width host_rip;
163 natural_width paddingl[8]; /* room for future expansion */
164 u32 pin_based_vm_exec_control;
165 u32 cpu_based_vm_exec_control;
166 u32 exception_bitmap;
167 u32 page_fault_error_code_mask;
168 u32 page_fault_error_code_match;
169 u32 cr3_target_count;
170 u32 vm_exit_controls;
171 u32 vm_exit_msr_store_count;
172 u32 vm_exit_msr_load_count;
173 u32 vm_entry_controls;
174 u32 vm_entry_msr_load_count;
175 u32 vm_entry_intr_info_field;
176 u32 vm_entry_exception_error_code;
177 u32 vm_entry_instruction_len;
178 u32 tpr_threshold;
179 u32 secondary_vm_exec_control;
180 u32 vm_instruction_error;
181 u32 vm_exit_reason;
182 u32 vm_exit_intr_info;
183 u32 vm_exit_intr_error_code;
184 u32 idt_vectoring_info_field;
185 u32 idt_vectoring_error_code;
186 u32 vm_exit_instruction_len;
187 u32 vmx_instruction_info;
188 u32 guest_es_limit;
189 u32 guest_cs_limit;
190 u32 guest_ss_limit;
191 u32 guest_ds_limit;
192 u32 guest_fs_limit;
193 u32 guest_gs_limit;
194 u32 guest_ldtr_limit;
195 u32 guest_tr_limit;
196 u32 guest_gdtr_limit;
197 u32 guest_idtr_limit;
198 u32 guest_es_ar_bytes;
199 u32 guest_cs_ar_bytes;
200 u32 guest_ss_ar_bytes;
201 u32 guest_ds_ar_bytes;
202 u32 guest_fs_ar_bytes;
203 u32 guest_gs_ar_bytes;
204 u32 guest_ldtr_ar_bytes;
205 u32 guest_tr_ar_bytes;
206 u32 guest_interruptibility_info;
207 u32 guest_activity_state;
208 u32 guest_sysenter_cs;
209 u32 host_ia32_sysenter_cs;
210 u32 padding32[8]; /* room for future expansion */
211 u16 virtual_processor_id;
212 u16 guest_es_selector;
213 u16 guest_cs_selector;
214 u16 guest_ss_selector;
215 u16 guest_ds_selector;
216 u16 guest_fs_selector;
217 u16 guest_gs_selector;
218 u16 guest_ldtr_selector;
219 u16 guest_tr_selector;
220 u16 host_es_selector;
221 u16 host_cs_selector;
222 u16 host_ss_selector;
223 u16 host_ds_selector;
224 u16 host_fs_selector;
225 u16 host_gs_selector;
226 u16 host_tr_selector;
227 };
228
229
230Authors
231-------
232
233These patches were written by:
234 Abel Gordon, abelg <at> il.ibm.com
235 Nadav Har'El, nyh <at> il.ibm.com
236 Orit Wasserman, oritw <at> il.ibm.com
237 Ben-Ami Yassor, benami <at> il.ibm.com
238 Muli Ben-Yehuda, muli <at> il.ibm.com
239
240With contributions by:
241 Anthony Liguori, aliguori <at> us.ibm.com
242 Mike Day, mdday <at> us.ibm.com
243 Michael Factor, factor <at> il.ibm.com
244 Zvi Dubitzky, dubi <at> il.ibm.com
245
246And valuable reviews by:
247 Avi Kivity, avi <at> redhat.com
248 Gleb Natapov, gleb <at> redhat.com
249 Marcelo Tosatti, mtosatti <at> redhat.com
250 Kevin Tian, kevin.tian <at> intel.com
251 and others.
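
Since nested VMX is only usable when the L1 guest actually sees the VMX CPUID
bit (hence the "-cpu host" / "-cpu qemu64,+vmx" options above), a quick way to
verify the guest-visible CPU is the standard CPUID check: leaf 1, ECX bit 5.
A minimal x86-only check using the compiler's <cpuid.h> helper (illustrative,
not part of this patch):

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
                    fprintf(stderr, "CPUID leaf 1 not supported\n");
                    return 1;
            }
            /* CPUID.1:ECX bit 5 is the VMX feature flag. */
            printf("VMX %savailable\n", (ecx & (1u << 5)) ? "" : "not ");
            return 0;
    }
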
diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt
index 3ab969c59046..2b7ce190cde4 100644
--- a/Documentation/virtual/kvm/ppc-pv.txt
+++ b/Documentation/virtual/kvm/ppc-pv.txt
@@ -68,9 +68,11 @@ page that contains parts of supervisor visible register state. The guest can
 map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE.
 
 With this hypercall issued the guest always gets the magic page mapped at the
-desired location in effective and physical address space. For now, we always
-map the page to -4096. This way we can access it using absolute load and store
-functions. The following instruction reads the first field of the magic page:
+desired location. The first parameter indicates the effective address when the
+MMU is enabled. The second parameter indicates the address in real mode, if
+applicable to the target. For now, we always map the page to -4096. This way we
+can access it using absolute load and store functions. The following
+instruction reads the first field of the magic page:
 
 	ld	rX, -4096(0)
 
diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h
index 2eb0a981a09a..32551d304cd7 100644
--- a/arch/ia64/include/asm/paravirt.h
+++ b/arch/ia64/include/asm/paravirt.h
@@ -281,6 +281,10 @@ paravirt_init_missing_ticks_accounting(int cpu)
281 pv_time_ops.init_missing_ticks_accounting(cpu); 281 pv_time_ops.init_missing_ticks_accounting(cpu);
282} 282}
283 283
284struct jump_label_key;
285extern struct jump_label_key paravirt_steal_enabled;
286extern struct jump_label_key paravirt_steal_rq_enabled;
287
284static inline int 288static inline int
285paravirt_do_steal_accounting(unsigned long *new_itm) 289paravirt_do_steal_accounting(unsigned long *new_itm)
286{ 290{
diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c
index a21d7bb9c69c..100868216c55 100644
--- a/arch/ia64/kernel/paravirt.c
+++ b/arch/ia64/kernel/paravirt.c
@@ -634,6 +634,8 @@ struct pv_irq_ops pv_irq_ops = {
634 * pv_time_ops 634 * pv_time_ops
635 * time operations 635 * time operations
636 */ 636 */
637struct jump_label_key paravirt_steal_enabled;
638struct jump_label_key paravirt_steal_rq_enabled;
637 639
638static int 640static int
639ia64_native_do_steal_accounting(unsigned long *new_itm) 641ia64_native_do_steal_accounting(unsigned long *new_itm)
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index c0d842cfd012..e30442c539ce 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -179,8 +179,9 @@ extern const char *powerpc_base_platform;
179#define LONG_ASM_CONST(x) 0 179#define LONG_ASM_CONST(x) 0
180#endif 180#endif
181 181
182 182#define CPU_FTR_HVMODE LONG_ASM_CONST(0x0000000200000000)
183#define CPU_FTR_HVMODE_206 LONG_ASM_CONST(0x0000000800000000) 183#define CPU_FTR_ARCH_201 LONG_ASM_CONST(0x0000000400000000)
184#define CPU_FTR_ARCH_206 LONG_ASM_CONST(0x0000000800000000)
184#define CPU_FTR_CFAR LONG_ASM_CONST(0x0000001000000000) 185#define CPU_FTR_CFAR LONG_ASM_CONST(0x0000001000000000)
185#define CPU_FTR_IABR LONG_ASM_CONST(0x0000002000000000) 186#define CPU_FTR_IABR LONG_ASM_CONST(0x0000002000000000)
186#define CPU_FTR_MMCRA LONG_ASM_CONST(0x0000004000000000) 187#define CPU_FTR_MMCRA LONG_ASM_CONST(0x0000004000000000)
@@ -401,9 +402,10 @@ extern const char *powerpc_base_platform;
401 CPU_FTR_MMCRA | CPU_FTR_CP_USE_DCBTZ | \ 402 CPU_FTR_MMCRA | CPU_FTR_CP_USE_DCBTZ | \
402 CPU_FTR_STCX_CHECKS_ADDRESS) 403 CPU_FTR_STCX_CHECKS_ADDRESS)
403#define CPU_FTRS_PPC970 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ 404#define CPU_FTRS_PPC970 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
404 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ 405 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_201 | \
405 CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA | \ 406 CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA | \
406 CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS) 407 CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS | \
408 CPU_FTR_HVMODE)
407#define CPU_FTRS_POWER5 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ 409#define CPU_FTRS_POWER5 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
408 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ 410 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
409 CPU_FTR_MMCRA | CPU_FTR_SMT | \ 411 CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -417,13 +419,13 @@ extern const char *powerpc_base_platform;
417 CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \ 419 CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \
418 CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_CFAR) 420 CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_CFAR)
419#define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ 421#define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
420 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_HVMODE_206 |\ 422 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\
421 CPU_FTR_MMCRA | CPU_FTR_SMT | \ 423 CPU_FTR_MMCRA | CPU_FTR_SMT | \
422 CPU_FTR_COHERENT_ICACHE | \ 424 CPU_FTR_COHERENT_ICACHE | \
423 CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ 425 CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
424 CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \ 426 CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \
425 CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ 427 CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
426 CPU_FTR_ICSWX | CPU_FTR_CFAR) 428 CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE)
427#define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ 429#define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
428 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ 430 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
429 CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ 431 CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index f5dfe3411f64..8057f4f6980f 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -61,19 +61,22 @@
61#define EXC_HV H 61#define EXC_HV H
62#define EXC_STD 62#define EXC_STD
63 63
64#define EXCEPTION_PROLOG_1(area) \ 64#define __EXCEPTION_PROLOG_1(area, extra, vec) \
65 GET_PACA(r13); \ 65 GET_PACA(r13); \
66 std r9,area+EX_R9(r13); /* save r9 - r12 */ \ 66 std r9,area+EX_R9(r13); /* save r9 - r12 */ \
67 std r10,area+EX_R10(r13); \ 67 std r10,area+EX_R10(r13); \
68 std r11,area+EX_R11(r13); \
69 std r12,area+EX_R12(r13); \
70 BEGIN_FTR_SECTION_NESTED(66); \ 68 BEGIN_FTR_SECTION_NESTED(66); \
71 mfspr r10,SPRN_CFAR; \ 69 mfspr r10,SPRN_CFAR; \
72 std r10,area+EX_CFAR(r13); \ 70 std r10,area+EX_CFAR(r13); \
73 END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ 71 END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \
74 GET_SCRATCH0(r9); \ 72 mfcr r9; \
75 std r9,area+EX_R13(r13); \ 73 extra(vec); \
76 mfcr r9 74 std r11,area+EX_R11(r13); \
75 std r12,area+EX_R12(r13); \
76 GET_SCRATCH0(r10); \
77 std r10,area+EX_R13(r13)
78#define EXCEPTION_PROLOG_1(area, extra, vec) \
79 __EXCEPTION_PROLOG_1(area, extra, vec)
77 80
78#define __EXCEPTION_PROLOG_PSERIES_1(label, h) \ 81#define __EXCEPTION_PROLOG_PSERIES_1(label, h) \
79 ld r12,PACAKBASE(r13); /* get high part of &label */ \ 82 ld r12,PACAKBASE(r13); /* get high part of &label */ \
@@ -85,13 +88,65 @@
85 mtspr SPRN_##h##SRR1,r10; \ 88 mtspr SPRN_##h##SRR1,r10; \
86 h##rfid; \ 89 h##rfid; \
87 b . /* prevent speculative execution */ 90 b . /* prevent speculative execution */
88#define EXCEPTION_PROLOG_PSERIES_1(label, h) \ 91#define EXCEPTION_PROLOG_PSERIES_1(label, h) \
89 __EXCEPTION_PROLOG_PSERIES_1(label, h) 92 __EXCEPTION_PROLOG_PSERIES_1(label, h)
90 93
91#define EXCEPTION_PROLOG_PSERIES(area, label, h) \ 94#define EXCEPTION_PROLOG_PSERIES(area, label, h, extra, vec) \
92 EXCEPTION_PROLOG_1(area); \ 95 EXCEPTION_PROLOG_1(area, extra, vec); \
93 EXCEPTION_PROLOG_PSERIES_1(label, h); 96 EXCEPTION_PROLOG_PSERIES_1(label, h);
94 97
98#define __KVMTEST(n) \
99 lbz r10,HSTATE_IN_GUEST(r13); \
100 cmpwi r10,0; \
101 bne do_kvm_##n
102
103#define __KVM_HANDLER(area, h, n) \
104do_kvm_##n: \
105 ld r10,area+EX_R10(r13); \
106 stw r9,HSTATE_SCRATCH1(r13); \
107 ld r9,area+EX_R9(r13); \
108 std r12,HSTATE_SCRATCH0(r13); \
109 li r12,n; \
110 b kvmppc_interrupt
111
112#define __KVM_HANDLER_SKIP(area, h, n) \
113do_kvm_##n: \
114 cmpwi r10,KVM_GUEST_MODE_SKIP; \
115 ld r10,area+EX_R10(r13); \
116 beq 89f; \
117 stw r9,HSTATE_SCRATCH1(r13); \
118 ld r9,area+EX_R9(r13); \
119 std r12,HSTATE_SCRATCH0(r13); \
120 li r12,n; \
121 b kvmppc_interrupt; \
12289: mtocrf 0x80,r9; \
123 ld r9,area+EX_R9(r13); \
124 b kvmppc_skip_##h##interrupt
125
126#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
127#define KVMTEST(n) __KVMTEST(n)
128#define KVM_HANDLER(area, h, n) __KVM_HANDLER(area, h, n)
129#define KVM_HANDLER_SKIP(area, h, n) __KVM_HANDLER_SKIP(area, h, n)
130
131#else
132#define KVMTEST(n)
133#define KVM_HANDLER(area, h, n)
134#define KVM_HANDLER_SKIP(area, h, n)
135#endif
136
137#ifdef CONFIG_KVM_BOOK3S_PR
138#define KVMTEST_PR(n) __KVMTEST(n)
139#define KVM_HANDLER_PR(area, h, n) __KVM_HANDLER(area, h, n)
140#define KVM_HANDLER_PR_SKIP(area, h, n) __KVM_HANDLER_SKIP(area, h, n)
141
142#else
143#define KVMTEST_PR(n)
144#define KVM_HANDLER_PR(area, h, n)
145#define KVM_HANDLER_PR_SKIP(area, h, n)
146#endif
147
148#define NOTEST(n)
149
95/* 150/*
96 * The common exception prolog is used for all except a few exceptions 151 * The common exception prolog is used for all except a few exceptions
97 * such as a segment miss on a kernel address. We have to be prepared 152 * such as a segment miss on a kernel address. We have to be prepared
@@ -164,57 +219,58 @@
164 .globl label##_pSeries; \ 219 .globl label##_pSeries; \
165label##_pSeries: \ 220label##_pSeries: \
166 HMT_MEDIUM; \ 221 HMT_MEDIUM; \
167 DO_KVM vec; \
168 SET_SCRATCH0(r13); /* save r13 */ \ 222 SET_SCRATCH0(r13); /* save r13 */ \
169 EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, EXC_STD) 223 EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
224 EXC_STD, KVMTEST_PR, vec)
170 225
171#define STD_EXCEPTION_HV(loc, vec, label) \ 226#define STD_EXCEPTION_HV(loc, vec, label) \
172 . = loc; \ 227 . = loc; \
173 .globl label##_hv; \ 228 .globl label##_hv; \
174label##_hv: \ 229label##_hv: \
175 HMT_MEDIUM; \ 230 HMT_MEDIUM; \
176 DO_KVM vec; \ 231 SET_SCRATCH0(r13); /* save r13 */ \
177 SET_SCRATCH0(r13); /* save r13 */ \ 232 EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
178 EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, EXC_HV) 233 EXC_HV, KVMTEST, vec)
179 234
180#define __MASKABLE_EXCEPTION_PSERIES(vec, label, h) \ 235#define __SOFTEN_TEST(h) \
181 HMT_MEDIUM; \
182 DO_KVM vec; \
183 SET_SCRATCH0(r13); /* save r13 */ \
184 GET_PACA(r13); \
185 std r9,PACA_EXGEN+EX_R9(r13); /* save r9, r10 */ \
186 std r10,PACA_EXGEN+EX_R10(r13); \
187 lbz r10,PACASOFTIRQEN(r13); \ 236 lbz r10,PACASOFTIRQEN(r13); \
188 mfcr r9; \
189 cmpwi r10,0; \ 237 cmpwi r10,0; \
190 beq masked_##h##interrupt; \ 238 beq masked_##h##interrupt
191 GET_SCRATCH0(r10); \ 239#define _SOFTEN_TEST(h) __SOFTEN_TEST(h)
192 std r10,PACA_EXGEN+EX_R13(r13); \ 240
193 std r11,PACA_EXGEN+EX_R11(r13); \ 241#define SOFTEN_TEST_PR(vec) \
194 std r12,PACA_EXGEN+EX_R12(r13); \ 242 KVMTEST_PR(vec); \
195 ld r12,PACAKBASE(r13); /* get high part of &label */ \ 243 _SOFTEN_TEST(EXC_STD)
196 ld r10,PACAKMSR(r13); /* get MSR value for kernel */ \ 244
197 mfspr r11,SPRN_##h##SRR0; /* save SRR0 */ \ 245#define SOFTEN_TEST_HV(vec) \
198 LOAD_HANDLER(r12,label##_common) \ 246 KVMTEST(vec); \
199 mtspr SPRN_##h##SRR0,r12; \ 247 _SOFTEN_TEST(EXC_HV)
200 mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \ 248
201 mtspr SPRN_##h##SRR1,r10; \ 249#define SOFTEN_TEST_HV_201(vec) \
202 h##rfid; \ 250 KVMTEST(vec); \
203 b . /* prevent speculative execution */ 251 _SOFTEN_TEST(EXC_STD)
204#define _MASKABLE_EXCEPTION_PSERIES(vec, label, h) \ 252
205 __MASKABLE_EXCEPTION_PSERIES(vec, label, h) 253#define __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra) \
254 HMT_MEDIUM; \
255 SET_SCRATCH0(r13); /* save r13 */ \
256 __EXCEPTION_PROLOG_1(PACA_EXGEN, extra, vec); \
257 EXCEPTION_PROLOG_PSERIES_1(label##_common, h);
258#define _MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra) \
259 __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)
206 260
207#define MASKABLE_EXCEPTION_PSERIES(loc, vec, label) \ 261#define MASKABLE_EXCEPTION_PSERIES(loc, vec, label) \
208 . = loc; \ 262 . = loc; \
209 .globl label##_pSeries; \ 263 .globl label##_pSeries; \
210label##_pSeries: \ 264label##_pSeries: \
211 _MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_STD) 265 _MASKABLE_EXCEPTION_PSERIES(vec, label, \
266 EXC_STD, SOFTEN_TEST_PR)
212 267
213#define MASKABLE_EXCEPTION_HV(loc, vec, label) \ 268#define MASKABLE_EXCEPTION_HV(loc, vec, label) \
214 . = loc; \ 269 . = loc; \
215 .globl label##_hv; \ 270 .globl label##_hv; \
216label##_hv: \ 271label##_hv: \
217 _MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_HV) 272 _MASKABLE_EXCEPTION_PSERIES(vec, label, \
273 EXC_HV, SOFTEN_TEST_HV)
218 274
219#ifdef CONFIG_PPC_ISERIES 275#ifdef CONFIG_PPC_ISERIES
220#define DISABLE_INTS \ 276#define DISABLE_INTS \
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index fd8201dddd4b..1c324ff55ea8 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -29,6 +29,10 @@
29#define H_LONG_BUSY_ORDER_100_SEC 9905 /* Long busy, hint that 100sec \ 29#define H_LONG_BUSY_ORDER_100_SEC 9905 /* Long busy, hint that 100sec \
30 is a good time to retry */ 30 is a good time to retry */
31#define H_LONG_BUSY_END_RANGE 9905 /* End of long busy range */ 31#define H_LONG_BUSY_END_RANGE 9905 /* End of long busy range */
32
33/* Internal value used in book3s_hv kvm support; not returned to guests */
34#define H_TOO_HARD 9999
35
32#define H_HARDWARE -1 /* Hardware error */ 36#define H_HARDWARE -1 /* Hardware error */
33#define H_FUNCTION -2 /* Function not supported */ 37#define H_FUNCTION -2 /* Function not supported */
34#define H_PRIVILEGE -3 /* Caller not privileged */ 38#define H_PRIVILEGE -3 /* Caller not privileged */
@@ -100,6 +104,7 @@
100#define H_PAGE_SET_ACTIVE H_PAGE_STATE_CHANGE 104#define H_PAGE_SET_ACTIVE H_PAGE_STATE_CHANGE
101#define H_AVPN (1UL<<(63-32)) /* An avpn is provided as a sanity test */ 105#define H_AVPN (1UL<<(63-32)) /* An avpn is provided as a sanity test */
102#define H_ANDCOND (1UL<<(63-33)) 106#define H_ANDCOND (1UL<<(63-33))
107#define H_LOCAL (1UL<<(63-35))
103#define H_ICACHE_INVALIDATE (1UL<<(63-40)) /* icbi, etc. (ignored for IO pages) */ 108#define H_ICACHE_INVALIDATE (1UL<<(63-40)) /* icbi, etc. (ignored for IO pages) */
104#define H_ICACHE_SYNCHRONIZE (1UL<<(63-41)) /* dcbst, icbi, etc (ignored for IO pages */ 109#define H_ICACHE_SYNCHRONIZE (1UL<<(63-41)) /* dcbst, icbi, etc (ignored for IO pages */
105#define H_COALESCE_CAND (1UL<<(63-42)) /* page is a good candidate for coalescing */ 110#define H_COALESCE_CAND (1UL<<(63-42)) /* page is a good candidate for coalescing */
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index d2ca5ed3877b..a4f6c85431f8 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -22,6 +22,10 @@
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24 24
25/* Select powerpc specific features in <linux/kvm.h> */
26#define __KVM_HAVE_SPAPR_TCE
27#define __KVM_HAVE_PPC_SMT
28
25struct kvm_regs { 29struct kvm_regs {
26 __u64 pc; 30 __u64 pc;
27 __u64 cr; 31 __u64 cr;
@@ -272,4 +276,15 @@ struct kvm_guest_debug_arch {
272#define KVM_INTERRUPT_UNSET -2U 276#define KVM_INTERRUPT_UNSET -2U
273#define KVM_INTERRUPT_SET_LEVEL -3U 277#define KVM_INTERRUPT_SET_LEVEL -3U
274 278
279/* for KVM_CAP_SPAPR_TCE */
280struct kvm_create_spapr_tce {
281 __u64 liobn;
282 __u32 window_size;
283};
284
285/* for KVM_ALLOCATE_RMA */
286struct kvm_allocate_rma {
287 __u64 rma_size;
288};
289
275#endif /* __LINUX_KVM_POWERPC_H */ 290#endif /* __LINUX_KVM_POWERPC_H */
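
The two structures added here back the KVM_CREATE_SPAPR_TCE and
KVM_ALLOCATE_RMA ioctls documented in api.txt above; both return a file
descriptor that is meant to be mmap()ed. A hedged userspace sketch follows
(only meaningful on a POWER book3s_hv host; the liobn and window size are
placeholder values and error handling is minimal):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            int vm  = ioctl(kvm, KVM_CREATE_VM, 0);

            /* One 64-bit TCE per 4 KiB of DMA window, so a 16 MiB window
             * needs a 4096-entry (32 KiB) table. */
            struct kvm_create_spapr_tce tce = {
                    .liobn       = 0x12345678,      /* placeholder liobn */
                    .window_size = 16 << 20,
            };
            int tce_fd = ioctl(vm, KVM_CREATE_SPAPR_TCE, &tce);
            if (tce_fd >= 0) {
                    void *table = mmap(NULL, (tce.window_size >> 12) * 8,
                                       PROT_READ | PROT_WRITE, MAP_SHARED,
                                       tce_fd, 0);
                    printf("TCE table mapped at %p\n", table);
            }

            /* The RMA size is fixed at host boot and reported back. */
            struct kvm_allocate_rma rma = { 0 };
            int rma_fd = ioctl(vm, KVM_ALLOCATE_RMA, &rma);
            if (rma_fd >= 0) {
                    void *base = mmap(NULL, rma.rma_size,
                                      PROT_READ | PROT_WRITE, MAP_SHARED,
                                      rma_fd, 0);
                    printf("RMA of %llu bytes mapped at %p\n",
                           (unsigned long long)rma.rma_size, base);
            }
            return 0;
    }
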
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 0951b17f4eb5..7b1f0e0fc653 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -64,8 +64,12 @@
64#define BOOK3S_INTERRUPT_PROGRAM 0x700 64#define BOOK3S_INTERRUPT_PROGRAM 0x700
65#define BOOK3S_INTERRUPT_FP_UNAVAIL 0x800 65#define BOOK3S_INTERRUPT_FP_UNAVAIL 0x800
66#define BOOK3S_INTERRUPT_DECREMENTER 0x900 66#define BOOK3S_INTERRUPT_DECREMENTER 0x900
67#define BOOK3S_INTERRUPT_HV_DECREMENTER 0x980
67#define BOOK3S_INTERRUPT_SYSCALL 0xc00 68#define BOOK3S_INTERRUPT_SYSCALL 0xc00
68#define BOOK3S_INTERRUPT_TRACE 0xd00 69#define BOOK3S_INTERRUPT_TRACE 0xd00
70#define BOOK3S_INTERRUPT_H_DATA_STORAGE 0xe00
71#define BOOK3S_INTERRUPT_H_INST_STORAGE 0xe20
72#define BOOK3S_INTERRUPT_H_EMUL_ASSIST 0xe40
69#define BOOK3S_INTERRUPT_PERFMON 0xf00 73#define BOOK3S_INTERRUPT_PERFMON 0xf00
70#define BOOK3S_INTERRUPT_ALTIVEC 0xf20 74#define BOOK3S_INTERRUPT_ALTIVEC 0xf20
71#define BOOK3S_INTERRUPT_VSX 0xf40 75#define BOOK3S_INTERRUPT_VSX 0xf40
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index d62e703f1214..98da010252a3 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -24,20 +24,6 @@
24#include <linux/kvm_host.h> 24#include <linux/kvm_host.h>
25#include <asm/kvm_book3s_asm.h> 25#include <asm/kvm_book3s_asm.h>
26 26
27struct kvmppc_slb {
28 u64 esid;
29 u64 vsid;
30 u64 orige;
31 u64 origv;
32 bool valid : 1;
33 bool Ks : 1;
34 bool Kp : 1;
35 bool nx : 1;
36 bool large : 1; /* PTEs are 16MB */
37 bool tb : 1; /* 1TB segment */
38 bool class : 1;
39};
40
41struct kvmppc_bat { 27struct kvmppc_bat {
42 u64 raw; 28 u64 raw;
43 u32 bepi; 29 u32 bepi;
@@ -67,11 +53,22 @@ struct kvmppc_sid_map {
67#define VSID_POOL_SIZE (SID_CONTEXTS * 16) 53#define VSID_POOL_SIZE (SID_CONTEXTS * 16)
68#endif 54#endif
69 55
56struct hpte_cache {
57 struct hlist_node list_pte;
58 struct hlist_node list_pte_long;
59 struct hlist_node list_vpte;
60 struct hlist_node list_vpte_long;
61 struct rcu_head rcu_head;
62 u64 host_va;
63 u64 pfn;
64 ulong slot;
65 struct kvmppc_pte pte;
66};
67
70struct kvmppc_vcpu_book3s { 68struct kvmppc_vcpu_book3s {
71 struct kvm_vcpu vcpu; 69 struct kvm_vcpu vcpu;
72 struct kvmppc_book3s_shadow_vcpu *shadow_vcpu; 70 struct kvmppc_book3s_shadow_vcpu *shadow_vcpu;
73 struct kvmppc_sid_map sid_map[SID_MAP_NUM]; 71 struct kvmppc_sid_map sid_map[SID_MAP_NUM];
74 struct kvmppc_slb slb[64];
75 struct { 72 struct {
76 u64 esid; 73 u64 esid;
77 u64 vsid; 74 u64 vsid;
@@ -81,7 +78,6 @@ struct kvmppc_vcpu_book3s {
81 struct kvmppc_bat dbat[8]; 78 struct kvmppc_bat dbat[8];
82 u64 hid[6]; 79 u64 hid[6];
83 u64 gqr[8]; 80 u64 gqr[8];
84 int slb_nr;
85 u64 sdr1; 81 u64 sdr1;
86 u64 hior; 82 u64 hior;
87 u64 msr_mask; 83 u64 msr_mask;
@@ -93,7 +89,13 @@ struct kvmppc_vcpu_book3s {
93 u64 vsid_max; 89 u64 vsid_max;
94#endif 90#endif
95 int context_id[SID_CONTEXTS]; 91 int context_id[SID_CONTEXTS];
96 ulong prog_flags; /* flags to inject when giving a 700 trap */ 92
93 struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
94 struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
95 struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
96 struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
97 int hpte_cache_count;
98 spinlock_t mmu_lock;
97}; 99};
98 100
99#define CONTEXT_HOST 0 101#define CONTEXT_HOST 0
@@ -110,8 +112,10 @@ extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong ea, ulong ea_mask)
110extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask); 112extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask);
111extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end); 113extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end);
112extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr); 114extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
115extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr);
113extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu); 116extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
114extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu); 117extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
118extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
115extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); 119extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
116extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr); 120extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
117extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu); 121extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
@@ -123,19 +127,22 @@ extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
123extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte); 127extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
124extern int kvmppc_mmu_hpte_sysinit(void); 128extern int kvmppc_mmu_hpte_sysinit(void);
125extern void kvmppc_mmu_hpte_sysexit(void); 129extern void kvmppc_mmu_hpte_sysexit(void);
130extern int kvmppc_mmu_hv_init(void);
126 131
127extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); 132extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
128extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); 133extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
129extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec); 134extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
135extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
130extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, 136extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
131 bool upper, u32 val); 137 bool upper, u32 val);
132extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); 138extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
133extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); 139extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
134extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); 140extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
135 141
136extern ulong kvmppc_trampoline_lowmem; 142extern void kvmppc_handler_lowmem_trampoline(void);
137extern ulong kvmppc_trampoline_enter; 143extern void kvmppc_handler_trampoline_enter(void);
138extern void kvmppc_rmcall(ulong srr0, ulong srr1); 144extern void kvmppc_rmcall(ulong srr0, ulong srr1);
145extern void kvmppc_hv_entry_trampoline(void);
139extern void kvmppc_load_up_fpu(void); 146extern void kvmppc_load_up_fpu(void);
140extern void kvmppc_load_up_altivec(void); 147extern void kvmppc_load_up_altivec(void);
141extern void kvmppc_load_up_vsx(void); 148extern void kvmppc_load_up_vsx(void);
@@ -147,15 +154,32 @@ static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
147 return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu); 154 return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
148} 155}
149 156
150static inline ulong dsisr(void) 157extern void kvm_return_point(void);
158
159/* Also add subarch specific defines */
160
161#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
162#include <asm/kvm_book3s_32.h>
163#endif
164#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
165#include <asm/kvm_book3s_64.h>
166#endif
167
168#ifdef CONFIG_KVM_BOOK3S_PR
169
170static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
151{ 171{
152 ulong r; 172 return to_book3s(vcpu)->hior;
153 asm ( "mfdsisr %0 " : "=r" (r) );
154 return r;
155} 173}
156 174
157extern void kvm_return_point(void); 175static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
158static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu); 176 unsigned long pending_now, unsigned long old_pending)
177{
178 if (pending_now)
179 vcpu->arch.shared->int_pending = 1;
180 else if (old_pending)
181 vcpu->arch.shared->int_pending = 0;
182}
159 183
160static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) 184static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
161{ 185{
@@ -244,6 +268,120 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
244 return to_svcpu(vcpu)->fault_dar; 268 return to_svcpu(vcpu)->fault_dar;
245} 269}
246 270
271static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
272{
273 ulong crit_raw = vcpu->arch.shared->critical;
274 ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
275 bool crit;
276
277 /* Truncate crit indicators in 32 bit mode */
278 if (!(vcpu->arch.shared->msr & MSR_SF)) {
279 crit_raw &= 0xffffffff;
280 crit_r1 &= 0xffffffff;
281 }
282
283 /* Critical section when crit == r1 */
284 crit = (crit_raw == crit_r1);
285 /* ... and we're in supervisor mode */
286 crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
287
288 return crit;
289}
290#else /* CONFIG_KVM_BOOK3S_PR */
291
292static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
293{
294 return 0;
295}
296
297static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
298 unsigned long pending_now, unsigned long old_pending)
299{
300}
301
302static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
303{
304 vcpu->arch.gpr[num] = val;
305}
306
307static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
308{
309 return vcpu->arch.gpr[num];
310}
311
312static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
313{
314 vcpu->arch.cr = val;
315}
316
317static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
318{
319 return vcpu->arch.cr;
320}
321
322static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
323{
324 vcpu->arch.xer = val;
325}
326
327static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
328{
329 return vcpu->arch.xer;
330}
331
332static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
333{
334 vcpu->arch.ctr = val;
335}
336
337static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
338{
339 return vcpu->arch.ctr;
340}
341
342static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
343{
344 vcpu->arch.lr = val;
345}
346
347static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
348{
349 return vcpu->arch.lr;
350}
351
352static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
353{
354 vcpu->arch.pc = val;
355}
356
357static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
358{
359 return vcpu->arch.pc;
360}
361
362static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
363{
364 ulong pc = kvmppc_get_pc(vcpu);
365
366 /* Load the instruction manually if it failed to do so in the
367 * exit path */
368 if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
369 kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
370
371 return vcpu->arch.last_inst;
372}
373
374static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
375{
376 return vcpu->arch.fault_dar;
377}
378
379static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
380{
381 return false;
382}
383#endif
384
247/* Magic register values loaded into r3 and r4 before the 'sc' assembly 385/* Magic register values loaded into r3 and r4 before the 'sc' assembly
248 * instruction for the OSI hypercalls */ 386 * instruction for the OSI hypercalls */
249#define OSI_SC_MAGIC_R3 0x113724FA 387#define OSI_SC_MAGIC_R3 0x113724FA
@@ -251,12 +389,4 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
251 389
252#define INS_DCBZ 0x7c0007ec 390#define INS_DCBZ 0x7c0007ec
253 391
254/* Also add subarch specific defines */
255
256#ifdef CONFIG_PPC_BOOK3S_32
257#include <asm/kvm_book3s_32.h>
258#else
259#include <asm/kvm_book3s_64.h>
260#endif
261
262#endif /* __ASM_KVM_BOOK3S_H__ */ 392#endif /* __ASM_KVM_BOOK3S_H__ */
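
Before the next file: the CONFIG_KVM_BOOK3S_PR block above adds kvmppc_interrupt_offset(), kvmppc_update_int_pending() and kvmppc_critical_section(). The stand-alone sketch below models how a delivery path might combine them, deferring injection while the guest appears to be in a critical section. The struct, the deliver() helper and the main() driver are assumptions made for this example, not the kernel's code; the actual callers are elsewhere in the KVM Book3S code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MSR_SF	(1ULL << 63)	/* 64-bit mode */
#define MSR_PR	(1ULL << 14)	/* problem (user) state */

struct shared_page {		/* stand-in for kvm_vcpu_arch_shared */
	uint64_t msr;
	uint64_t critical;	/* compared against the guest's r1 */
	uint64_t int_pending;
};

/* Same check as kvmppc_critical_section() above, on the stand-in type. */
static bool in_critical_section(const struct shared_page *s, uint64_t r1)
{
	uint64_t crit = s->critical;

	if (!(s->msr & MSR_SF)) {	/* truncate in 32-bit mode */
		crit &= 0xffffffff;
		r1 &= 0xffffffff;
	}
	return crit == r1 && !(s->msr & MSR_PR);
}

/* Hypothetical delivery step: defer while critical, else inject. */
static void deliver(struct shared_page *s, uint64_t r1, unsigned int vec)
{
	if (in_critical_section(s, r1)) {
		s->int_pending = 1;
		return;
	}
	s->int_pending = 0;
	printf("inject interrupt at vector 0x%x\n", vec);
}

int main(void)
{
	struct shared_page s = {
		.msr = MSR_SF,
		.critical = 0xc000000000001000ULL,
	};

	deliver(&s, 0xc000000000001000ULL, 0x500);	/* deferred */
	deliver(&s, 0xc000000000002000ULL, 0x500);	/* injected */
	printf("int_pending = %llu\n", (unsigned long long)s.int_pending);
	return 0;
}
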
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 4cadd612d575..e43fe42b9875 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -20,9 +20,13 @@
20#ifndef __ASM_KVM_BOOK3S_64_H__ 20#ifndef __ASM_KVM_BOOK3S_64_H__
21#define __ASM_KVM_BOOK3S_64_H__ 21#define __ASM_KVM_BOOK3S_64_H__
22 22
23#ifdef CONFIG_KVM_BOOK3S_PR
23static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) 24static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
24{ 25{
25 return &get_paca()->shadow_vcpu; 26 return &get_paca()->shadow_vcpu;
26} 27}
28#endif
29
30#define SPAPR_TCE_SHIFT 12
27 31
28#endif /* __ASM_KVM_BOOK3S_64_H__ */ 32#endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index d5a8a3861635..ef7b3688c3b6 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -60,6 +60,36 @@ kvmppc_resume_\intno:
60 60
61#else /*__ASSEMBLY__ */ 61#else /*__ASSEMBLY__ */
62 62
63/*
64 * This struct goes in the PACA on 64-bit processors. It is used
65 * to store host state that needs to be saved when we enter a guest
66 * and restored when we exit, but isn't specific to any particular
67 * guest or vcpu. It also has some scratch fields used by the guest
68 * exit code.
69 */
70struct kvmppc_host_state {
71 ulong host_r1;
72 ulong host_r2;
73 ulong host_msr;
74 ulong vmhandler;
75 ulong scratch0;
76 ulong scratch1;
77 u8 in_guest;
78
79#ifdef CONFIG_KVM_BOOK3S_64_HV
80 struct kvm_vcpu *kvm_vcpu;
81 struct kvmppc_vcore *kvm_vcore;
82 unsigned long xics_phys;
83 u64 dabr;
84 u64 host_mmcr[3];
85 u32 host_pmc[8];
86 u64 host_purr;
87 u64 host_spurr;
88 u64 host_dscr;
89 u64 dec_expires;
90#endif
91};
92
63struct kvmppc_book3s_shadow_vcpu { 93struct kvmppc_book3s_shadow_vcpu {
64 ulong gpr[14]; 94 ulong gpr[14];
65 u32 cr; 95 u32 cr;
@@ -73,17 +103,12 @@ struct kvmppc_book3s_shadow_vcpu {
73 ulong shadow_srr1; 103 ulong shadow_srr1;
74 ulong fault_dar; 104 ulong fault_dar;
75 105
76 ulong host_r1;
77 ulong host_r2;
78 ulong handler;
79 ulong scratch0;
80 ulong scratch1;
81 ulong vmhandler;
82 u8 in_guest;
83
84#ifdef CONFIG_PPC_BOOK3S_32 106#ifdef CONFIG_PPC_BOOK3S_32
85 u32 sr[16]; /* Guest SRs */ 107 u32 sr[16]; /* Guest SRs */
108
109 struct kvmppc_host_state hstate;
86#endif 110#endif
111
87#ifdef CONFIG_PPC_BOOK3S_64 112#ifdef CONFIG_PPC_BOOK3S_64
88 u8 slb_max; /* highest used guest slb entry */ 113 u8 slb_max; /* highest used guest slb entry */
89 struct { 114 struct {
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index 9c9ba3d59b1b..a90e09188777 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -93,4 +93,8 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
93 return vcpu->arch.fault_dear; 93 return vcpu->arch.fault_dear;
94} 94}
95 95
96static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
97{
98 return vcpu->arch.shared->msr;
99}
96#endif /* __ASM_KVM_BOOKE_H__ */ 100#endif /* __ASM_KVM_BOOKE_H__ */
diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h
index 7a2a565f88c4..adbfca9dd100 100644
--- a/arch/powerpc/include/asm/kvm_e500.h
+++ b/arch/powerpc/include/asm/kvm_e500.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. 2 * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
3 * 3 *
4 * Author: Yu Liu, <yu.liu@freescale.com> 4 * Author: Yu Liu, <yu.liu@freescale.com>
5 * 5 *
@@ -29,17 +29,25 @@ struct tlbe{
29 u32 mas7; 29 u32 mas7;
30}; 30};
31 31
32#define E500_TLB_VALID 1
33#define E500_TLB_DIRTY 2
34
35struct tlbe_priv {
36 pfn_t pfn;
37 unsigned int flags; /* E500_TLB_* */
38};
39
40struct vcpu_id_table;
41
32struct kvmppc_vcpu_e500 { 42struct kvmppc_vcpu_e500 {
33 /* Unmodified copy of the guest's TLB. */ 43 /* Unmodified copy of the guest's TLB. */
34 struct tlbe *guest_tlb[E500_TLB_NUM]; 44 struct tlbe *gtlb_arch[E500_TLB_NUM];
35 /* TLB that's actually used when the guest is running. */
36 struct tlbe *shadow_tlb[E500_TLB_NUM];
37 /* Pages which are referenced in the shadow TLB. */
38 struct page **shadow_pages[E500_TLB_NUM];
39 45
40 unsigned int guest_tlb_size[E500_TLB_NUM]; 46 /* KVM internal information associated with each guest TLB entry */
41 unsigned int shadow_tlb_size[E500_TLB_NUM]; 47 struct tlbe_priv *gtlb_priv[E500_TLB_NUM];
42 unsigned int guest_tlb_nv[E500_TLB_NUM]; 48
49 unsigned int gtlb_size[E500_TLB_NUM];
50 unsigned int gtlb_nv[E500_TLB_NUM];
43 51
44 u32 host_pid[E500_PID_NUM]; 52 u32 host_pid[E500_PID_NUM];
45 u32 pid[E500_PID_NUM]; 53 u32 pid[E500_PID_NUM];
@@ -53,6 +61,10 @@ struct kvmppc_vcpu_e500 {
53 u32 mas5; 61 u32 mas5;
54 u32 mas6; 62 u32 mas6;
55 u32 mas7; 63 u32 mas7;
64
65 /* vcpu id table */
66 struct vcpu_id_table *idt;
67
56 u32 l1csr0; 68 u32 l1csr0;
57 u32 l1csr1; 69 u32 l1csr1;
58 u32 hid0; 70 u32 hid0;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 186f150b9b89..cc22b282d755 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -25,15 +25,23 @@
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/types.h> 26#include <linux/types.h>
27#include <linux/kvm_types.h> 27#include <linux/kvm_types.h>
28#include <linux/threads.h>
29#include <linux/spinlock.h>
28#include <linux/kvm_para.h> 30#include <linux/kvm_para.h>
31#include <linux/list.h>
32#include <linux/atomic.h>
29#include <asm/kvm_asm.h> 33#include <asm/kvm_asm.h>
34#include <asm/processor.h>
30 35
31#define KVM_MAX_VCPUS 1 36#define KVM_MAX_VCPUS NR_CPUS
37#define KVM_MAX_VCORES NR_CPUS
32#define KVM_MEMORY_SLOTS 32 38#define KVM_MEMORY_SLOTS 32
33/* memory slots that are not exposed to userspace */ 39/* memory slots that are not exposed to userspace */
34#define KVM_PRIVATE_MEM_SLOTS 4 40#define KVM_PRIVATE_MEM_SLOTS 4
35 41
42#ifdef CONFIG_KVM_MMIO
36#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 43#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
44#endif
37 45
38/* We don't currently support large pages. */ 46/* We don't currently support large pages. */
39#define KVM_HPAGE_GFN_SHIFT(x) 0 47#define KVM_HPAGE_GFN_SHIFT(x) 0
@@ -57,6 +65,10 @@ struct kvm;
57struct kvm_run; 65struct kvm_run;
58struct kvm_vcpu; 66struct kvm_vcpu;
59 67
68struct lppaca;
69struct slb_shadow;
70struct dtl;
71
60struct kvm_vm_stat { 72struct kvm_vm_stat {
61 u32 remote_tlb_flush; 73 u32 remote_tlb_flush;
62}; 74};
@@ -133,9 +145,74 @@ struct kvmppc_exit_timing {
133 }; 145 };
134}; 146};
135 147
148struct kvmppc_pginfo {
149 unsigned long pfn;
150 atomic_t refcnt;
151};
152
153struct kvmppc_spapr_tce_table {
154 struct list_head list;
155 struct kvm *kvm;
156 u64 liobn;
157 u32 window_size;
158 struct page *pages[0];
159};
160
161struct kvmppc_rma_info {
162 void *base_virt;
163 unsigned long base_pfn;
164 unsigned long npages;
165 struct list_head list;
166 atomic_t use_count;
167};
168
136struct kvm_arch { 169struct kvm_arch {
170#ifdef CONFIG_KVM_BOOK3S_64_HV
171 unsigned long hpt_virt;
172 unsigned long ram_npages;
173 unsigned long ram_psize;
174 unsigned long ram_porder;
175 struct kvmppc_pginfo *ram_pginfo;
176 unsigned int lpid;
177 unsigned int host_lpid;
178 unsigned long host_lpcr;
179 unsigned long sdr1;
180 unsigned long host_sdr1;
181 int tlbie_lock;
182 int n_rma_pages;
183 unsigned long lpcr;
184 unsigned long rmor;
185 struct kvmppc_rma_info *rma;
186 struct list_head spapr_tce_tables;
187 unsigned short last_vcpu[NR_CPUS];
188 struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
189#endif /* CONFIG_KVM_BOOK3S_64_HV */
137}; 190};
138 191
192/*
193 * Struct for a virtual core.
194 * Note: entry_exit_count combines an entry count in the bottom 8 bits
195 * and an exit count in the next 8 bits. This is so that we can
196 * atomically increment the entry count iff the exit count is 0
197 * without taking the lock.
198 */
199struct kvmppc_vcore {
200 int n_runnable;
201 int n_blocked;
202 int num_threads;
203 int entry_exit_count;
204 int n_woken;
205 int nap_count;
206 u16 pcpu;
207 u8 vcore_running;
208 u8 in_guest;
209 struct list_head runnable_threads;
210 spinlock_t lock;
211};
212
213#define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff)
214#define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8)
215
139struct kvmppc_pte { 216struct kvmppc_pte {
140 ulong eaddr; 217 ulong eaddr;
141 u64 vpage; 218 u64 vpage;
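
The entry/exit packing described in the comment on struct kvmppc_vcore above can be modelled in plain C. The sketch below is illustrative only (the counter, the helper names and the compare-and-swap loop are assumptions, not kernel code); it shows how an entry can be counted atomically only while the exit count in bits 8-15 is still zero, without taking the vcore lock.

#include <stdatomic.h>
#include <stdio.h>

#define ENTRY_COUNT(v)	((v) & 0xff)		/* bottom 8 bits */
#define EXIT_COUNT(v)	(((v) >> 8) & 0xff)	/* next 8 bits */

static _Atomic int entry_exit_count;

/* Count one more entry, but only while no exit has started yet. */
static int try_enter(void)
{
	int old = atomic_load(&entry_exit_count);

	do {
		if (EXIT_COUNT(old))
			return 0;	/* an exit is in progress, give up */
	} while (!atomic_compare_exchange_weak(&entry_exit_count,
					       &old, old + 1));
	return 1;
}

static void note_exit(void)
{
	atomic_fetch_add(&entry_exit_count, 1 << 8);
}

int main(void)
{
	printf("enter: %d\n", try_enter());	/* 1: exit count still zero */
	note_exit();
	printf("enter: %d\n", try_enter());	/* 0: exit count is now 1 */
	printf("entries=%d exits=%d\n",
	       ENTRY_COUNT(atomic_load(&entry_exit_count)),
	       EXIT_COUNT(atomic_load(&entry_exit_count)));
	return 0;
}
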
@@ -163,16 +240,18 @@ struct kvmppc_mmu {
163 bool (*is_dcbz32)(struct kvm_vcpu *vcpu); 240 bool (*is_dcbz32)(struct kvm_vcpu *vcpu);
164}; 241};
165 242
166struct hpte_cache { 243struct kvmppc_slb {
167 struct hlist_node list_pte; 244 u64 esid;
168 struct hlist_node list_pte_long; 245 u64 vsid;
169 struct hlist_node list_vpte; 246 u64 orige;
170 struct hlist_node list_vpte_long; 247 u64 origv;
171 struct rcu_head rcu_head; 248 bool valid : 1;
172 u64 host_va; 249 bool Ks : 1;
173 u64 pfn; 250 bool Kp : 1;
174 ulong slot; 251 bool nx : 1;
175 struct kvmppc_pte pte; 252 bool large : 1; /* PTEs are 16MB */
253 bool tb : 1; /* 1TB segment */
254 bool class : 1;
176}; 255};
177 256
178struct kvm_vcpu_arch { 257struct kvm_vcpu_arch {
@@ -187,6 +266,9 @@ struct kvm_vcpu_arch {
187 ulong highmem_handler; 266 ulong highmem_handler;
188 ulong rmcall; 267 ulong rmcall;
189 ulong host_paca_phys; 268 ulong host_paca_phys;
269 struct kvmppc_slb slb[64];
270 int slb_max; /* 1 + index of last valid entry in slb[] */
271 int slb_nr; /* total number of entries in SLB */
190 struct kvmppc_mmu mmu; 272 struct kvmppc_mmu mmu;
191#endif 273#endif
192 274
@@ -195,13 +277,19 @@ struct kvm_vcpu_arch {
195 u64 fpr[32]; 277 u64 fpr[32];
196 u64 fpscr; 278 u64 fpscr;
197 279
280#ifdef CONFIG_SPE
281 ulong evr[32];
282 ulong spefscr;
283 ulong host_spefscr;
284 u64 acc;
285#endif
198#ifdef CONFIG_ALTIVEC 286#ifdef CONFIG_ALTIVEC
199 vector128 vr[32]; 287 vector128 vr[32];
200 vector128 vscr; 288 vector128 vscr;
201#endif 289#endif
202 290
203#ifdef CONFIG_VSX 291#ifdef CONFIG_VSX
204 u64 vsr[32]; 292 u64 vsr[64];
205#endif 293#endif
206 294
207#ifdef CONFIG_PPC_BOOK3S 295#ifdef CONFIG_PPC_BOOK3S
@@ -209,22 +297,27 @@ struct kvm_vcpu_arch {
209 u32 qpr[32]; 297 u32 qpr[32];
210#endif 298#endif
211 299
212#ifdef CONFIG_BOOKE
213 ulong pc; 300 ulong pc;
214 ulong ctr; 301 ulong ctr;
215 ulong lr; 302 ulong lr;
216 303
217 ulong xer; 304 ulong xer;
218 u32 cr; 305 u32 cr;
219#endif
220 306
221#ifdef CONFIG_PPC_BOOK3S 307#ifdef CONFIG_PPC_BOOK3S
222 ulong shadow_msr;
223 ulong hflags; 308 ulong hflags;
224 ulong guest_owned_ext; 309 ulong guest_owned_ext;
310 ulong purr;
311 ulong spurr;
312 ulong dscr;
313 ulong amr;
314 ulong uamor;
315 u32 ctrl;
316 ulong dabr;
225#endif 317#endif
226 u32 vrsave; /* also USPRG0 */ 318 u32 vrsave; /* also USPRG0 */
227 u32 mmucr; 319 u32 mmucr;
320 ulong shadow_msr;
228 ulong sprg4; 321 ulong sprg4;
229 ulong sprg5; 322 ulong sprg5;
230 ulong sprg6; 323 ulong sprg6;
@@ -249,6 +342,7 @@ struct kvm_vcpu_arch {
249 u32 pvr; 342 u32 pvr;
250 343
251 u32 shadow_pid; 344 u32 shadow_pid;
345 u32 shadow_pid1;
252 u32 pid; 346 u32 pid;
253 u32 swap_pid; 347 u32 swap_pid;
254 348
@@ -258,6 +352,9 @@ struct kvm_vcpu_arch {
258 u32 dbcr1; 352 u32 dbcr1;
259 u32 dbsr; 353 u32 dbsr;
260 354
355 u64 mmcr[3];
356 u32 pmc[8];
357
261#ifdef CONFIG_KVM_EXIT_TIMING 358#ifdef CONFIG_KVM_EXIT_TIMING
262 struct mutex exit_timing_lock; 359 struct mutex exit_timing_lock;
263 struct kvmppc_exit_timing timing_exit; 360 struct kvmppc_exit_timing timing_exit;
@@ -272,8 +369,12 @@ struct kvm_vcpu_arch {
272 struct dentry *debugfs_exit_timing; 369 struct dentry *debugfs_exit_timing;
273#endif 370#endif
274 371
372#ifdef CONFIG_PPC_BOOK3S
373 ulong fault_dar;
374 u32 fault_dsisr;
375#endif
376
275#ifdef CONFIG_BOOKE 377#ifdef CONFIG_BOOKE
276 u32 last_inst;
277 ulong fault_dear; 378 ulong fault_dear;
278 ulong fault_esr; 379 ulong fault_esr;
279 ulong queued_dear; 380 ulong queued_dear;
@@ -288,25 +389,47 @@ struct kvm_vcpu_arch {
288 u8 dcr_is_write; 389 u8 dcr_is_write;
289 u8 osi_needed; 390 u8 osi_needed;
290 u8 osi_enabled; 391 u8 osi_enabled;
392 u8 hcall_needed;
291 393
292 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ 394 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
293 395
294 struct hrtimer dec_timer; 396 struct hrtimer dec_timer;
295 struct tasklet_struct tasklet; 397 struct tasklet_struct tasklet;
296 u64 dec_jiffies; 398 u64 dec_jiffies;
399 u64 dec_expires;
297 unsigned long pending_exceptions; 400 unsigned long pending_exceptions;
401 u16 last_cpu;
402 u8 ceded;
403 u8 prodded;
404 u32 last_inst;
405
406 struct lppaca *vpa;
407 struct slb_shadow *slb_shadow;
408 struct dtl *dtl;
409 struct dtl *dtl_end;
410
411 struct kvmppc_vcore *vcore;
412 int ret;
413 int trap;
414 int state;
415 int ptid;
416 wait_queue_head_t cpu_run;
417
298 struct kvm_vcpu_arch_shared *shared; 418 struct kvm_vcpu_arch_shared *shared;
299 unsigned long magic_page_pa; /* phys addr to map the magic page to */ 419 unsigned long magic_page_pa; /* phys addr to map the magic page to */
300 unsigned long magic_page_ea; /* effect. addr to map the magic page to */ 420 unsigned long magic_page_ea; /* effect. addr to map the magic page to */
301 421
302#ifdef CONFIG_PPC_BOOK3S 422#ifdef CONFIG_KVM_BOOK3S_64_HV
303 struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; 423 struct kvm_vcpu_arch_shared shregs;
304 struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; 424
305 struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; 425 struct list_head run_list;
306 struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG]; 426 struct task_struct *run_task;
307 int hpte_cache_count; 427 struct kvm_run *kvm_run;
308 spinlock_t mmu_lock;
309#endif 428#endif
310}; 429};
311 430
431#define KVMPPC_VCPU_BUSY_IN_HOST 0
432#define KVMPPC_VCPU_BLOCKED 1
433#define KVMPPC_VCPU_RUNNABLE 2
434
312#endif /* __POWERPC_KVM_HOST_H__ */ 435#endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 9345238edecf..d121f49d62b8 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -33,6 +33,9 @@
33#else 33#else
34#include <asm/kvm_booke.h> 34#include <asm/kvm_booke.h>
35#endif 35#endif
36#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
37#include <asm/paca.h>
38#endif
36 39
37enum emulation_result { 40enum emulation_result {
38 EMULATE_DONE, /* no further processing */ 41 EMULATE_DONE, /* no further processing */
@@ -42,6 +45,7 @@ enum emulation_result {
42 EMULATE_AGAIN, /* something went wrong. go again */ 45 EMULATE_AGAIN, /* something went wrong. go again */
43}; 46};
44 47
48extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
45extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); 49extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
46extern char kvmppc_handlers_start[]; 50extern char kvmppc_handlers_start[];
47extern unsigned long kvmppc_handler_len; 51extern unsigned long kvmppc_handler_len;
@@ -109,6 +113,27 @@ extern void kvmppc_booke_exit(void);
109 113
110extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); 114extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
111extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); 115extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
116extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
117
118extern long kvmppc_alloc_hpt(struct kvm *kvm);
119extern void kvmppc_free_hpt(struct kvm *kvm);
120extern long kvmppc_prepare_vrma(struct kvm *kvm,
121 struct kvm_userspace_memory_region *mem);
122extern void kvmppc_map_vrma(struct kvm *kvm,
123 struct kvm_userspace_memory_region *mem);
124extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
125extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
126 struct kvm_create_spapr_tce *args);
127extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
128 struct kvm_allocate_rma *rma);
129extern struct kvmppc_rma_info *kvm_alloc_rma(void);
130extern void kvm_release_rma(struct kvmppc_rma_info *ri);
131extern int kvmppc_core_init_vm(struct kvm *kvm);
132extern void kvmppc_core_destroy_vm(struct kvm *kvm);
133extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
134 struct kvm_userspace_memory_region *mem);
135extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
136 struct kvm_userspace_memory_region *mem);
112 137
113/* 138/*
114 * Cuts out inst bits with ordering according to spec. 139 * Cuts out inst bits with ordering according to spec.
@@ -151,4 +176,20 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
151 176
152void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); 177void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
153 178
179#ifdef CONFIG_KVM_BOOK3S_64_HV
180static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
181{
182 paca[cpu].kvm_hstate.xics_phys = addr;
183}
184
185extern void kvm_rma_init(void);
186
187#else
188static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
189{}
190
191static inline void kvm_rma_init(void)
192{}
193#endif
194
154#endif /* __POWERPC_KVM_PPC_H__ */ 195#endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index d865bd909c7d..b445e0af4c2b 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -90,13 +90,19 @@ extern char initial_stab[];
90 90
91#define HPTE_R_PP0 ASM_CONST(0x8000000000000000) 91#define HPTE_R_PP0 ASM_CONST(0x8000000000000000)
92#define HPTE_R_TS ASM_CONST(0x4000000000000000) 92#define HPTE_R_TS ASM_CONST(0x4000000000000000)
93#define HPTE_R_KEY_HI ASM_CONST(0x3000000000000000)
93#define HPTE_R_RPN_SHIFT 12 94#define HPTE_R_RPN_SHIFT 12
94#define HPTE_R_RPN ASM_CONST(0x3ffffffffffff000) 95#define HPTE_R_RPN ASM_CONST(0x0ffffffffffff000)
95#define HPTE_R_FLAGS ASM_CONST(0x00000000000003ff)
96#define HPTE_R_PP ASM_CONST(0x0000000000000003) 96#define HPTE_R_PP ASM_CONST(0x0000000000000003)
97#define HPTE_R_N ASM_CONST(0x0000000000000004) 97#define HPTE_R_N ASM_CONST(0x0000000000000004)
98#define HPTE_R_G ASM_CONST(0x0000000000000008)
99#define HPTE_R_M ASM_CONST(0x0000000000000010)
100#define HPTE_R_I ASM_CONST(0x0000000000000020)
101#define HPTE_R_W ASM_CONST(0x0000000000000040)
102#define HPTE_R_WIMG ASM_CONST(0x0000000000000078)
98#define HPTE_R_C ASM_CONST(0x0000000000000080) 103#define HPTE_R_C ASM_CONST(0x0000000000000080)
99#define HPTE_R_R ASM_CONST(0x0000000000000100) 104#define HPTE_R_R ASM_CONST(0x0000000000000100)
105#define HPTE_R_KEY_LO ASM_CONST(0x0000000000000e00)
100 106
101#define HPTE_V_1TB_SEG ASM_CONST(0x4000000000000000) 107#define HPTE_V_1TB_SEG ASM_CONST(0x4000000000000000)
102#define HPTE_V_VRMA_MASK ASM_CONST(0x4001ffffff000000) 108#define HPTE_V_VRMA_MASK ASM_CONST(0x4001ffffff000000)
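
The HPTE_R_* masks added above describe how the second doubleword of a hashed page table entry is laid out. A small user-space decode, using the mask values copied from this hunk; the sample value and the program itself are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define HPTE_R_RPN_SHIFT 12
#define HPTE_R_RPN	0x0ffffffffffff000ULL
#define HPTE_R_WIMG	0x0000000000000078ULL
#define HPTE_R_PP	0x0000000000000003ULL
#define HPTE_R_C	0x0000000000000080ULL
#define HPTE_R_R	0x0000000000000100ULL

int main(void)
{
	uint64_t r = 0x00000000dead5082ULL;	/* arbitrary example value */

	printf("real page number: 0x%llx\n",
	       (unsigned long long)((r & HPTE_R_RPN) >> HPTE_R_RPN_SHIFT));
	printf("WIMG bits:        0x%llx\n",
	       (unsigned long long)((r & HPTE_R_WIMG) >> 3));
	printf("page protection:  %llu\n", (unsigned long long)(r & HPTE_R_PP));
	printf("referenced=%d changed=%d\n",
	       !!(r & HPTE_R_R), !!(r & HPTE_R_C));
	return 0;
}
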
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 74126765106a..a6da12859959 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -147,9 +147,12 @@ struct paca_struct {
147 struct dtl_entry *dtl_curr; /* pointer corresponding to dtl_ridx */ 147 struct dtl_entry *dtl_curr; /* pointer corresponding to dtl_ridx */
148 148
149#ifdef CONFIG_KVM_BOOK3S_HANDLER 149#ifdef CONFIG_KVM_BOOK3S_HANDLER
150#ifdef CONFIG_KVM_BOOK3S_PR
150 /* We use this to store guest state in */ 151 /* We use this to store guest state in */
151 struct kvmppc_book3s_shadow_vcpu shadow_vcpu; 152 struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
152#endif 153#endif
154 struct kvmppc_host_state kvm_hstate;
155#endif
153}; 156};
154 157
155extern struct paca_struct *paca; 158extern struct paca_struct *paca;
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 1b422381fc16..368f72f79808 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -150,18 +150,22 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
150#define REST_16VSRSU(n,b,base) REST_8VSRSU(n,b,base); REST_8VSRSU(n+8,b,base) 150#define REST_16VSRSU(n,b,base) REST_8VSRSU(n,b,base); REST_8VSRSU(n+8,b,base)
151#define REST_32VSRSU(n,b,base) REST_16VSRSU(n,b,base); REST_16VSRSU(n+16,b,base) 151#define REST_32VSRSU(n,b,base) REST_16VSRSU(n,b,base); REST_16VSRSU(n+16,b,base)
152 152
153#define SAVE_EVR(n,s,base) evmergehi s,s,n; stw s,THREAD_EVR0+4*(n)(base) 153/*
154#define SAVE_2EVRS(n,s,base) SAVE_EVR(n,s,base); SAVE_EVR(n+1,s,base) 154 * b = base register for addressing, o = base offset from register of 1st EVR
155#define SAVE_4EVRS(n,s,base) SAVE_2EVRS(n,s,base); SAVE_2EVRS(n+2,s,base) 155 * n = first EVR, s = scratch
156#define SAVE_8EVRS(n,s,base) SAVE_4EVRS(n,s,base); SAVE_4EVRS(n+4,s,base) 156 */
157#define SAVE_16EVRS(n,s,base) SAVE_8EVRS(n,s,base); SAVE_8EVRS(n+8,s,base) 157#define SAVE_EVR(n,s,b,o) evmergehi s,s,n; stw s,o+4*(n)(b)
158#define SAVE_32EVRS(n,s,base) SAVE_16EVRS(n,s,base); SAVE_16EVRS(n+16,s,base) 158#define SAVE_2EVRS(n,s,b,o) SAVE_EVR(n,s,b,o); SAVE_EVR(n+1,s,b,o)
159#define REST_EVR(n,s,base) lwz s,THREAD_EVR0+4*(n)(base); evmergelo n,s,n 159#define SAVE_4EVRS(n,s,b,o) SAVE_2EVRS(n,s,b,o); SAVE_2EVRS(n+2,s,b,o)
160#define REST_2EVRS(n,s,base) REST_EVR(n,s,base); REST_EVR(n+1,s,base) 160#define SAVE_8EVRS(n,s,b,o) SAVE_4EVRS(n,s,b,o); SAVE_4EVRS(n+4,s,b,o)
161#define REST_4EVRS(n,s,base) REST_2EVRS(n,s,base); REST_2EVRS(n+2,s,base) 161#define SAVE_16EVRS(n,s,b,o) SAVE_8EVRS(n,s,b,o); SAVE_8EVRS(n+8,s,b,o)
162#define REST_8EVRS(n,s,base) REST_4EVRS(n,s,base); REST_4EVRS(n+4,s,base) 162#define SAVE_32EVRS(n,s,b,o) SAVE_16EVRS(n,s,b,o); SAVE_16EVRS(n+16,s,b,o)
163#define REST_16EVRS(n,s,base) REST_8EVRS(n,s,base); REST_8EVRS(n+8,s,base) 163#define REST_EVR(n,s,b,o) lwz s,o+4*(n)(b); evmergelo n,s,n
164#define REST_32EVRS(n,s,base) REST_16EVRS(n,s,base); REST_16EVRS(n+16,s,base) 164#define REST_2EVRS(n,s,b,o) REST_EVR(n,s,b,o); REST_EVR(n+1,s,b,o)
165#define REST_4EVRS(n,s,b,o) REST_2EVRS(n,s,b,o); REST_2EVRS(n+2,s,b,o)
166#define REST_8EVRS(n,s,b,o) REST_4EVRS(n,s,b,o); REST_4EVRS(n+4,s,b,o)
167#define REST_16EVRS(n,s,b,o) REST_8EVRS(n,s,b,o); REST_8EVRS(n+8,s,b,o)
168#define REST_32EVRS(n,s,b,o) REST_16EVRS(n,s,b,o); REST_16EVRS(n+16,s,b,o)
165 169
166/* Macros to adjust thread priority for hardware multithreading */ 170/* Macros to adjust thread priority for hardware multithreading */
167#define HMT_VERY_LOW or 31,31,31 # very low priority 171#define HMT_VERY_LOW or 31,31,31 # very low priority
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index c5cae0dd176c..ddbe57ae8584 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -189,6 +189,9 @@
189#define SPRN_CTR 0x009 /* Count Register */ 189#define SPRN_CTR 0x009 /* Count Register */
190#define SPRN_DSCR 0x11 190#define SPRN_DSCR 0x11
191#define SPRN_CFAR 0x1c /* Come From Address Register */ 191#define SPRN_CFAR 0x1c /* Come From Address Register */
192#define SPRN_AMR 0x1d /* Authority Mask Register */
193#define SPRN_UAMOR 0x9d /* User Authority Mask Override Register */
194#define SPRN_AMOR 0x15d /* Authority Mask Override Register */
192#define SPRN_ACOP 0x1F /* Available Coprocessor Register */ 195#define SPRN_ACOP 0x1F /* Available Coprocessor Register */
193#define SPRN_CTRLF 0x088 196#define SPRN_CTRLF 0x088
194#define SPRN_CTRLT 0x098 197#define SPRN_CTRLT 0x098
@@ -232,22 +235,28 @@
232#define LPCR_VPM0 (1ul << (63-0)) 235#define LPCR_VPM0 (1ul << (63-0))
233#define LPCR_VPM1 (1ul << (63-1)) 236#define LPCR_VPM1 (1ul << (63-1))
234#define LPCR_ISL (1ul << (63-2)) 237#define LPCR_ISL (1ul << (63-2))
238#define LPCR_VC_SH (63-2)
235#define LPCR_DPFD_SH (63-11) 239#define LPCR_DPFD_SH (63-11)
236#define LPCR_VRMA_L (1ul << (63-12)) 240#define LPCR_VRMA_L (1ul << (63-12))
237#define LPCR_VRMA_LP0 (1ul << (63-15)) 241#define LPCR_VRMA_LP0 (1ul << (63-15))
238#define LPCR_VRMA_LP1 (1ul << (63-16)) 242#define LPCR_VRMA_LP1 (1ul << (63-16))
243#define LPCR_VRMASD_SH (63-16)
239#define LPCR_RMLS 0x1C000000 /* impl dependent rmo limit sel */ 244#define LPCR_RMLS 0x1C000000 /* impl dependent rmo limit sel */
245#define LPCR_RMLS_SH (63-37)
240#define LPCR_ILE 0x02000000 /* !HV irqs set MSR:LE */ 246#define LPCR_ILE 0x02000000 /* !HV irqs set MSR:LE */
241#define LPCR_PECE 0x00007000 /* powersave exit cause enable */ 247#define LPCR_PECE 0x00007000 /* powersave exit cause enable */
242#define LPCR_PECE0 0x00004000 /* ext. exceptions can cause exit */ 248#define LPCR_PECE0 0x00004000 /* ext. exceptions can cause exit */
243#define LPCR_PECE1 0x00002000 /* decrementer can cause exit */ 249#define LPCR_PECE1 0x00002000 /* decrementer can cause exit */
244#define LPCR_PECE2 0x00001000 /* machine check etc can cause exit */ 250#define LPCR_PECE2 0x00001000 /* machine check etc can cause exit */
245#define LPCR_MER 0x00000800 /* Mediated External Exception */ 251#define LPCR_MER 0x00000800 /* Mediated External Exception */
252#define LPCR_LPES 0x0000000c
246#define LPCR_LPES0 0x00000008 /* LPAR Env selector 0 */ 253#define LPCR_LPES0 0x00000008 /* LPAR Env selector 0 */
247#define LPCR_LPES1 0x00000004 /* LPAR Env selector 1 */ 254#define LPCR_LPES1 0x00000004 /* LPAR Env selector 1 */
255#define LPCR_LPES_SH 2
248#define LPCR_RMI 0x00000002 /* real mode is cache inhibit */ 256#define LPCR_RMI 0x00000002 /* real mode is cache inhibit */
249#define LPCR_HDICE 0x00000001 /* Hyp Decr enable (HV,PR,EE) */ 257#define LPCR_HDICE 0x00000001 /* Hyp Decr enable (HV,PR,EE) */
250#define SPRN_LPID 0x13F /* Logical Partition Identifier */ 258#define SPRN_LPID 0x13F /* Logical Partition Identifier */
259#define LPID_RSVD 0x3ff /* Reserved LPID for partn switching */
251#define SPRN_HMER 0x150 /* Hardware m? error recovery */ 260#define SPRN_HMER 0x150 /* Hardware m? error recovery */
252#define SPRN_HMEER 0x151 /* Hardware m? enable error recovery */ 261#define SPRN_HMEER 0x151 /* Hardware m? enable error recovery */
253#define SPRN_HEIR 0x153 /* Hypervisor Emulated Instruction Register */ 262#define SPRN_HEIR 0x153 /* Hypervisor Emulated Instruction Register */
@@ -298,6 +307,7 @@
298#define SPRN_HASH1 0x3D2 /* Primary Hash Address Register */ 307#define SPRN_HASH1 0x3D2 /* Primary Hash Address Register */
299#define SPRN_HASH2 0x3D3 /* Secondary Hash Address Register */ 308#define SPRN_HASH2 0x3D3 /* Secondary Hash Address Register */
300#define SPRN_HID0 0x3F0 /* Hardware Implementation Register 0 */ 309#define SPRN_HID0 0x3F0 /* Hardware Implementation Register 0 */
310#define HID0_HDICE_SH (63 - 23) /* 970 HDEC interrupt enable */
301#define HID0_EMCP (1<<31) /* Enable Machine Check pin */ 311#define HID0_EMCP (1<<31) /* Enable Machine Check pin */
302#define HID0_EBA (1<<29) /* Enable Bus Address Parity */ 312#define HID0_EBA (1<<29) /* Enable Bus Address Parity */
303#define HID0_EBD (1<<28) /* Enable Bus Data Parity */ 313#define HID0_EBD (1<<28) /* Enable Bus Data Parity */
@@ -353,6 +363,13 @@
353#define SPRN_IABR2 0x3FA /* 83xx */ 363#define SPRN_IABR2 0x3FA /* 83xx */
354#define SPRN_IBCR 0x135 /* 83xx Insn Breakpoint Control Reg */ 364#define SPRN_IBCR 0x135 /* 83xx Insn Breakpoint Control Reg */
355#define SPRN_HID4 0x3F4 /* 970 HID4 */ 365#define SPRN_HID4 0x3F4 /* 970 HID4 */
366#define HID4_LPES0 (1ul << (63-0)) /* LPAR env. sel. bit 0 */
367#define HID4_RMLS2_SH (63 - 2) /* Real mode limit bottom 2 bits */
368#define HID4_LPID5_SH (63 - 6) /* partition ID bottom 4 bits */
369#define HID4_RMOR_SH (63 - 22) /* real mode offset (16 bits) */
370#define HID4_LPES1 (1 << (63-57)) /* LPAR env. sel. bit 1 */
371#define HID4_RMLS0_SH (63 - 58) /* Real mode limit top bit */
372#define HID4_LPID1_SH 0 /* partition ID top 2 bits */
356#define SPRN_HID4_GEKKO 0x3F3 /* Gekko HID4 */ 373#define SPRN_HID4_GEKKO 0x3F3 /* Gekko HID4 */
357#define SPRN_HID5 0x3F6 /* 970 HID5 */ 374#define SPRN_HID5 0x3F6 /* 970 HID5 */
358#define SPRN_HID6 0x3F9 /* BE HID 6 */ 375#define SPRN_HID6 0x3F9 /* BE HID 6 */
@@ -802,28 +819,28 @@
802 mfspr rX,SPRN_SPRG_PACA; \ 819 mfspr rX,SPRN_SPRG_PACA; \
803 FTR_SECTION_ELSE_NESTED(66); \ 820 FTR_SECTION_ELSE_NESTED(66); \
804 mfspr rX,SPRN_SPRG_HPACA; \ 821 mfspr rX,SPRN_SPRG_HPACA; \
805 ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66) 822 ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
806 823
807#define SET_PACA(rX) \ 824#define SET_PACA(rX) \
808 BEGIN_FTR_SECTION_NESTED(66); \ 825 BEGIN_FTR_SECTION_NESTED(66); \
809 mtspr SPRN_SPRG_PACA,rX; \ 826 mtspr SPRN_SPRG_PACA,rX; \
810 FTR_SECTION_ELSE_NESTED(66); \ 827 FTR_SECTION_ELSE_NESTED(66); \
811 mtspr SPRN_SPRG_HPACA,rX; \ 828 mtspr SPRN_SPRG_HPACA,rX; \
812 ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66) 829 ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
813 830
814#define GET_SCRATCH0(rX) \ 831#define GET_SCRATCH0(rX) \
815 BEGIN_FTR_SECTION_NESTED(66); \ 832 BEGIN_FTR_SECTION_NESTED(66); \
816 mfspr rX,SPRN_SPRG_SCRATCH0; \ 833 mfspr rX,SPRN_SPRG_SCRATCH0; \
817 FTR_SECTION_ELSE_NESTED(66); \ 834 FTR_SECTION_ELSE_NESTED(66); \
818 mfspr rX,SPRN_SPRG_HSCRATCH0; \ 835 mfspr rX,SPRN_SPRG_HSCRATCH0; \
819 ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66) 836 ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
820 837
821#define SET_SCRATCH0(rX) \ 838#define SET_SCRATCH0(rX) \
822 BEGIN_FTR_SECTION_NESTED(66); \ 839 BEGIN_FTR_SECTION_NESTED(66); \
823 mtspr SPRN_SPRG_SCRATCH0,rX; \ 840 mtspr SPRN_SPRG_SCRATCH0,rX; \
824 FTR_SECTION_ELSE_NESTED(66); \ 841 FTR_SECTION_ELSE_NESTED(66); \
825 mtspr SPRN_SPRG_HSCRATCH0,rX; \ 842 mtspr SPRN_SPRG_HSCRATCH0,rX; \
826 ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66) 843 ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
827 844
828#else /* CONFIG_PPC_BOOK3S_64 */ 845#else /* CONFIG_PPC_BOOK3S_64 */
829#define GET_SCRATCH0(rX) mfspr rX,SPRN_SPRG_SCRATCH0 846#define GET_SCRATCH0(rX) mfspr rX,SPRN_SPRG_SCRATCH0
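
One detail worth spelling out from the reg.h hunk above: the new HID4_* comments describe fields that are split across the register (the real-mode limit selector keeps its bottom 2 bits at HID4_RMLS2_SH and its top bit at HID4_RMLS0_SH; the partition ID keeps 4 bits at HID4_LPID5_SH and 2 bits at HID4_LPID1_SH). The packing helper below is a guess at how such a value could be assembled and is not taken from the patch.

#include <stdint.h>
#include <stdio.h>

#define HID4_RMLS2_SH	(63 - 2)	/* real mode limit, bits 1:0 */
#define HID4_LPID5_SH	(63 - 6)	/* partition ID, bits 3:0 */
#define HID4_RMLS0_SH	(63 - 58)	/* real mode limit, bit 2 */
#define HID4_LPID1_SH	0		/* partition ID, bits 5:4 */

/* Illustrative only: scatter a 3-bit RMLS and a 6-bit LPID into HID4. */
static uint64_t hid4_pack(unsigned int rmls, unsigned int lpid)
{
	uint64_t v = 0;

	v |= (uint64_t)(rmls & 0x3) << HID4_RMLS2_SH;
	v |= (uint64_t)(rmls >> 2) << HID4_RMLS0_SH;
	v |= (uint64_t)(lpid & 0xf) << HID4_LPID5_SH;
	v |= (uint64_t)(lpid >> 4) << HID4_LPID1_SH;
	return v;
}

int main(void)
{
	printf("HID4 = 0x%016llx\n", (unsigned long long)hid4_pack(5, 0x3f));
	return 0;
}
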
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 0f0ad9fa01c1..9ec0b39f9ddc 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -318,6 +318,7 @@
318#define ESR_ILK 0x00100000 /* Instr. Cache Locking */ 318#define ESR_ILK 0x00100000 /* Instr. Cache Locking */
319#define ESR_PUO 0x00040000 /* Unimplemented Operation exception */ 319#define ESR_PUO 0x00040000 /* Unimplemented Operation exception */
320#define ESR_BO 0x00020000 /* Byte Ordering */ 320#define ESR_BO 0x00020000 /* Byte Ordering */
321#define ESR_SPV 0x00000080 /* Signal Processing operation */
321 322
322/* Bit definitions related to the DBCR0. */ 323/* Bit definitions related to the DBCR0. */
323#if defined(CONFIG_40x) 324#if defined(CONFIG_40x)
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 36e1c8a29be8..54b935f2f5de 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -128,6 +128,7 @@ int main(void)
128 DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page)); 128 DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page));
129 /* paca */ 129 /* paca */
130 DEFINE(PACA_SIZE, sizeof(struct paca_struct)); 130 DEFINE(PACA_SIZE, sizeof(struct paca_struct));
131 DEFINE(PACA_LOCK_TOKEN, offsetof(struct paca_struct, lock_token));
131 DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index)); 132 DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index));
132 DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start)); 133 DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start));
133 DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack)); 134 DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack));
@@ -187,7 +188,9 @@ int main(void)
187 DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1)); 188 DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1));
188 DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int)); 189 DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int));
189 DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int)); 190 DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
191 DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
190 DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx)); 192 DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
193 DEFINE(LPPACA_YIELDCOUNT, offsetof(struct lppaca, yield_count));
191 DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx)); 194 DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
192#endif /* CONFIG_PPC_STD_MMU_64 */ 195#endif /* CONFIG_PPC_STD_MMU_64 */
193 DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp)); 196 DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp));
@@ -198,11 +201,6 @@ int main(void)
198 DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time)); 201 DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time));
199 DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time)); 202 DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time));
200 DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save)); 203 DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
201#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
202 DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
203 DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb));
204 DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max));
205#endif
206#endif /* CONFIG_PPC64 */ 204#endif /* CONFIG_PPC64 */
207 205
208 /* RTAS */ 206 /* RTAS */
@@ -397,67 +395,160 @@ int main(void)
397 DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); 395 DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
398 DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); 396 DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
399 DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave)); 397 DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
398 DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
399 DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr));
400#ifdef CONFIG_ALTIVEC
401 DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr));
402 DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr));
403#endif
404#ifdef CONFIG_VSX
405 DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr));
406#endif
407 DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
408 DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
409 DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
410 DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
411 DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
412#ifdef CONFIG_KVM_BOOK3S_64_HV
413 DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr));
414 DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0));
415 DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1));
416 DEFINE(VCPU_SPRG0, offsetof(struct kvm_vcpu, arch.shregs.sprg0));
417 DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.shregs.sprg1));
418 DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
419 DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
420#endif
400 DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4)); 421 DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
401 DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5)); 422 DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
402 DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6)); 423 DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
403 DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7)); 424 DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
404 DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid)); 425 DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
426 DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1));
405 DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared)); 427 DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
406 DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr)); 428 DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
429 DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
407 430
408 /* book3s */ 431 /* book3s */
432#ifdef CONFIG_KVM_BOOK3S_64_HV
433 DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
434 DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
435 DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
436 DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
437 DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
438 DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
439 DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
440 DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
441 DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
442 DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
443 DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
444 DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
445#endif
409#ifdef CONFIG_PPC_BOOK3S 446#ifdef CONFIG_PPC_BOOK3S
447 DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
448 DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
410 DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip)); 449 DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
411 DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr)); 450 DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
412 DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr)); 451 DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
452 DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
453 DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
454 DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
455 DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
456 DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
457 DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
413 DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem)); 458 DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
414 DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter)); 459 DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
415 DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler)); 460 DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
416 DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall)); 461 DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
417 DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); 462 DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
463 DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
464 DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
465 DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
466 DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
467 DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
468 DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
469 DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
470 DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
471 DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
472 DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
473 DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
474 DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
475 DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
476 DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
477 DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
478 DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
479 DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
480 DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
418 DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) - 481 DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
419 offsetof(struct kvmppc_vcpu_book3s, vcpu)); 482 offsetof(struct kvmppc_vcpu_book3s, vcpu));
420 DEFINE(SVCPU_CR, offsetof(struct kvmppc_book3s_shadow_vcpu, cr)); 483 DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
421 DEFINE(SVCPU_XER, offsetof(struct kvmppc_book3s_shadow_vcpu, xer)); 484 DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
422 DEFINE(SVCPU_CTR, offsetof(struct kvmppc_book3s_shadow_vcpu, ctr)); 485 DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
423 DEFINE(SVCPU_LR, offsetof(struct kvmppc_book3s_shadow_vcpu, lr)); 486
424 DEFINE(SVCPU_PC, offsetof(struct kvmppc_book3s_shadow_vcpu, pc)); 487#ifdef CONFIG_PPC_BOOK3S_64
425 DEFINE(SVCPU_R0, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[0])); 488#ifdef CONFIG_KVM_BOOK3S_PR
426 DEFINE(SVCPU_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[1])); 489# define SVCPU_FIELD(x, f) DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f))
427 DEFINE(SVCPU_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[2])); 490#else
428 DEFINE(SVCPU_R3, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[3])); 491# define SVCPU_FIELD(x, f)
429 DEFINE(SVCPU_R4, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[4])); 492#endif
430 DEFINE(SVCPU_R5, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[5])); 493# define HSTATE_FIELD(x, f) DEFINE(x, offsetof(struct paca_struct, kvm_hstate.f))
431 DEFINE(SVCPU_R6, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[6])); 494#else /* 32-bit */
432 DEFINE(SVCPU_R7, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[7])); 495# define SVCPU_FIELD(x, f) DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, f))
433 DEFINE(SVCPU_R8, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[8])); 496# define HSTATE_FIELD(x, f) DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, hstate.f))
434 DEFINE(SVCPU_R9, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[9])); 497#endif
435 DEFINE(SVCPU_R10, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[10])); 498
436 DEFINE(SVCPU_R11, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[11])); 499 SVCPU_FIELD(SVCPU_CR, cr);
437 DEFINE(SVCPU_R12, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[12])); 500 SVCPU_FIELD(SVCPU_XER, xer);
438 DEFINE(SVCPU_R13, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[13])); 501 SVCPU_FIELD(SVCPU_CTR, ctr);
439 DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1)); 502 SVCPU_FIELD(SVCPU_LR, lr);
440 DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2)); 503 SVCPU_FIELD(SVCPU_PC, pc);
441 DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu, 504 SVCPU_FIELD(SVCPU_R0, gpr[0]);
442 vmhandler)); 505 SVCPU_FIELD(SVCPU_R1, gpr[1]);
443 DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu, 506 SVCPU_FIELD(SVCPU_R2, gpr[2]);
444 scratch0)); 507 SVCPU_FIELD(SVCPU_R3, gpr[3]);
445 DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu, 508 SVCPU_FIELD(SVCPU_R4, gpr[4]);
446 scratch1)); 509 SVCPU_FIELD(SVCPU_R5, gpr[5]);
447 DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu, 510 SVCPU_FIELD(SVCPU_R6, gpr[6]);
448 in_guest)); 511 SVCPU_FIELD(SVCPU_R7, gpr[7]);
449 DEFINE(SVCPU_FAULT_DSISR, offsetof(struct kvmppc_book3s_shadow_vcpu, 512 SVCPU_FIELD(SVCPU_R8, gpr[8]);
450 fault_dsisr)); 513 SVCPU_FIELD(SVCPU_R9, gpr[9]);
451 DEFINE(SVCPU_FAULT_DAR, offsetof(struct kvmppc_book3s_shadow_vcpu, 514 SVCPU_FIELD(SVCPU_R10, gpr[10]);
452 fault_dar)); 515 SVCPU_FIELD(SVCPU_R11, gpr[11]);
453 DEFINE(SVCPU_LAST_INST, offsetof(struct kvmppc_book3s_shadow_vcpu, 516 SVCPU_FIELD(SVCPU_R12, gpr[12]);
454 last_inst)); 517 SVCPU_FIELD(SVCPU_R13, gpr[13]);
455 DEFINE(SVCPU_SHADOW_SRR1, offsetof(struct kvmppc_book3s_shadow_vcpu, 518 SVCPU_FIELD(SVCPU_FAULT_DSISR, fault_dsisr);
456 shadow_srr1)); 519 SVCPU_FIELD(SVCPU_FAULT_DAR, fault_dar);
520 SVCPU_FIELD(SVCPU_LAST_INST, last_inst);
521 SVCPU_FIELD(SVCPU_SHADOW_SRR1, shadow_srr1);
457#ifdef CONFIG_PPC_BOOK3S_32 522#ifdef CONFIG_PPC_BOOK3S_32
458 DEFINE(SVCPU_SR, offsetof(struct kvmppc_book3s_shadow_vcpu, sr)); 523 SVCPU_FIELD(SVCPU_SR, sr);
459#endif 524#endif
460#else 525#ifdef CONFIG_PPC64
526 SVCPU_FIELD(SVCPU_SLB, slb);
527 SVCPU_FIELD(SVCPU_SLB_MAX, slb_max);
528#endif
529
530 HSTATE_FIELD(HSTATE_HOST_R1, host_r1);
531 HSTATE_FIELD(HSTATE_HOST_R2, host_r2);
532 HSTATE_FIELD(HSTATE_HOST_MSR, host_msr);
533 HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler);
534 HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
535 HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
536 HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
537
538#ifdef CONFIG_KVM_BOOK3S_64_HV
539 HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
540 HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
541 HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
542 HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
543 HSTATE_FIELD(HSTATE_PMC, host_pmc);
544 HSTATE_FIELD(HSTATE_PURR, host_purr);
545 HSTATE_FIELD(HSTATE_SPURR, host_spurr);
546 HSTATE_FIELD(HSTATE_DSCR, host_dscr);
547 HSTATE_FIELD(HSTATE_DABR, dabr);
548 HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
549#endif /* CONFIG_KVM_BOOK3S_64_HV */
550
551#else /* CONFIG_PPC_BOOK3S */
461 DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); 552 DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
462 DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); 553 DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
463 DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); 554 DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
@@ -467,7 +558,7 @@ int main(void)
467 DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); 558 DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
468 DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); 559 DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
469#endif /* CONFIG_PPC_BOOK3S */ 560#endif /* CONFIG_PPC_BOOK3S */
470#endif 561#endif /* CONFIG_KVM */
471 562
472#ifdef CONFIG_KVM_GUEST 563#ifdef CONFIG_KVM_GUEST
473 DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared, 564 DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared,
@@ -497,6 +588,13 @@ int main(void)
497 DEFINE(TLBCAM_MAS7, offsetof(struct tlbcam, MAS7)); 588 DEFINE(TLBCAM_MAS7, offsetof(struct tlbcam, MAS7));
498#endif 589#endif
499 590
591#if defined(CONFIG_KVM) && defined(CONFIG_SPE)
592 DEFINE(VCPU_EVR, offsetof(struct kvm_vcpu, arch.evr[0]));
593 DEFINE(VCPU_ACC, offsetof(struct kvm_vcpu, arch.acc));
594 DEFINE(VCPU_SPEFSCR, offsetof(struct kvm_vcpu, arch.spefscr));
595 DEFINE(VCPU_HOST_SPEFSCR, offsetof(struct kvm_vcpu, arch.host_spefscr));
596#endif
597
500#ifdef CONFIG_KVM_EXIT_TIMING 598#ifdef CONFIG_KVM_EXIT_TIMING
501 DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu, 599 DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu,
502 arch.timing_exit.tv32.tbu)); 600 arch.timing_exit.tv32.tbu));
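
The SVCPU_FIELD/HSTATE_FIELD wrappers above are a parameterised form of the usual asm-offsets pattern: the compiler evaluates offsetof() at build time and DEFINE() emits the number as a marker in the generated assembly, which the build then turns into #defines usable from .S files. A minimal stand-alone illustration of that pattern follows; the struct and symbol names are invented for the example.

#include <stddef.h>

struct example_hstate {		/* stand-in for kvmppc_host_state */
	unsigned long host_r1;
	unsigned long host_r2;
	unsigned char in_guest;
};

/* Same trick as include/linux/kbuild.h: emit "->SYM value" markers. */
#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

int main(void)
{
	DEFINE(EX_HSTATE_HOST_R1, offsetof(struct example_hstate, host_r1));
	DEFINE(EX_HSTATE_HOST_R2, offsetof(struct example_hstate, host_r2));
	DEFINE(EX_HSTATE_IN_GUEST, offsetof(struct example_hstate, in_guest));
	return 0;
}
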
diff --git a/arch/powerpc/kernel/cpu_setup_power7.S b/arch/powerpc/kernel/cpu_setup_power7.S
index 4f9a93fcfe07..76797c5105d6 100644
--- a/arch/powerpc/kernel/cpu_setup_power7.S
+++ b/arch/powerpc/kernel/cpu_setup_power7.S
@@ -45,12 +45,12 @@ _GLOBAL(__restore_cpu_power7)
45 blr 45 blr
46 46
47__init_hvmode_206: 47__init_hvmode_206:
48 /* Disable CPU_FTR_HVMODE_206 and exit if MSR:HV is not set */ 48 /* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */
49 mfmsr r3 49 mfmsr r3
50 rldicl. r0,r3,4,63 50 rldicl. r0,r3,4,63
51 bnelr 51 bnelr
52 ld r5,CPU_SPEC_FEATURES(r4) 52 ld r5,CPU_SPEC_FEATURES(r4)
53 LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE_206) 53 LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
54 xor r5,r5,r6 54 xor r5,r5,r6
55 std r5,CPU_SPEC_FEATURES(r4) 55 std r5,CPU_SPEC_FEATURES(r4)
56 blr 56 blr
@@ -61,19 +61,23 @@ __init_LPCR:
61 * LPES = 0b01 (HSRR0/1 used for 0x500) 61 * LPES = 0b01 (HSRR0/1 used for 0x500)
62 * PECE = 0b111 62 * PECE = 0b111
63 * DPFD = 4 63 * DPFD = 4
64 * HDICE = 0
65 * VC = 0b100 (VPM0=1, VPM1=0, ISL=0)
66 * VRMASD = 0b10000 (L=1, LP=00)
64 * 67 *
65 * Other bits untouched for now 68 * Other bits untouched for now
66 */ 69 */
67 mfspr r3,SPRN_LPCR 70 mfspr r3,SPRN_LPCR
68 ori r3,r3,(LPCR_LPES0|LPCR_LPES1) 71 li r5,1
69 xori r3,r3, LPCR_LPES0 72 rldimi r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
70 ori r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2) 73 ori r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
71 li r5,7
72 sldi r5,r5,LPCR_DPFD_SH
73 andc r3,r3,r5
74 li r5,4 74 li r5,4
75 sldi r5,r5,LPCR_DPFD_SH 75 rldimi r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3
76 or r3,r3,r5 76 clrrdi r3,r3,1 /* clear HDICE */
77 li r5,4
78 rldimi r3,r5, LPCR_VC_SH, 0
79 li r5,0x10
80 rldimi r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
77 mtspr SPRN_LPCR,r3 81 mtspr SPRN_LPCR,r3
78 isync 82 isync
79 blr 83 blr
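
The rldimi sequence above packs several fields into LPCR at once, which is easier to follow in C. The sketch below reproduces the same settings (LPES=0b01, PECE=0b111, DPFD=4, HDICE=0, VC=0b100, VRMASD=0b10000) using the LPCR_*_SH constants this series adds to reg.h; the set_field() helper, the field widths and the stand-alone program are assumptions made for illustration.

#include <stdint.h>
#include <stdio.h>

#define LPCR_VC_SH	(63 - 2)	/* VPM0/VPM1/ISL, 3 bits */
#define LPCR_DPFD_SH	(63 - 11)	/* default prefetch depth, 3 bits */
#define LPCR_VRMASD_SH	(63 - 16)	/* VRMA segment descriptor, 5 bits */
#define LPCR_PECE	0x00007000ULL
#define LPCR_LPES_SH	2		/* LPES, 2 bits */
#define LPCR_HDICE	0x00000001ULL

/* Insert an n-bit field at the given shift, as rldimi does under a mask. */
static uint64_t set_field(uint64_t reg, unsigned int shift,
			  unsigned int width, uint64_t val)
{
	uint64_t mask = ((1ULL << width) - 1) << shift;

	return (reg & ~mask) | ((val << shift) & mask);
}

int main(void)
{
	uint64_t lpcr = 0;		/* pretend value read from SPRN_LPCR */

	lpcr = set_field(lpcr, LPCR_LPES_SH, 2, 1);	/* LPES = 0b01 */
	lpcr |= LPCR_PECE;				/* PECE = 0b111 */
	lpcr = set_field(lpcr, LPCR_DPFD_SH, 3, 4);	/* DPFD = 4 */
	lpcr &= ~LPCR_HDICE;				/* HDICE = 0 */
	lpcr = set_field(lpcr, LPCR_VC_SH, 3, 4);	/* VC = 0b100 */
	lpcr = set_field(lpcr, LPCR_VRMASD_SH, 5, 0x10);/* VRMASD = 0b10000 */

	printf("LPCR = 0x%016llx\n", (unsigned long long)lpcr);
	return 0;
}
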
diff --git a/arch/powerpc/kernel/cpu_setup_ppc970.S b/arch/powerpc/kernel/cpu_setup_ppc970.S
index 27f2507279d8..12fac8df01c5 100644
--- a/arch/powerpc/kernel/cpu_setup_ppc970.S
+++ b/arch/powerpc/kernel/cpu_setup_ppc970.S
@@ -76,7 +76,7 @@ _GLOBAL(__setup_cpu_ppc970)
76 /* Do nothing if not running in HV mode */ 76 /* Do nothing if not running in HV mode */
77 mfmsr r0 77 mfmsr r0
78 rldicl. r0,r0,4,63 78 rldicl. r0,r0,4,63
79 beqlr 79 beq no_hv_mode
80 80
81 mfspr r0,SPRN_HID0 81 mfspr r0,SPRN_HID0
82 li r11,5 /* clear DOZE and SLEEP */ 82 li r11,5 /* clear DOZE and SLEEP */
@@ -90,7 +90,7 @@ _GLOBAL(__setup_cpu_ppc970MP)
90 /* Do nothing if not running in HV mode */ 90 /* Do nothing if not running in HV mode */
91 mfmsr r0 91 mfmsr r0
92 rldicl. r0,r0,4,63 92 rldicl. r0,r0,4,63
93 beqlr 93 beq no_hv_mode
94 94
95 mfspr r0,SPRN_HID0 95 mfspr r0,SPRN_HID0
96 li r11,0x15 /* clear DOZE and SLEEP */ 96 li r11,0x15 /* clear DOZE and SLEEP */
@@ -109,6 +109,14 @@ load_hids:
109 sync 109 sync
110 isync 110 isync
111 111
112 /* Try to set LPES = 01 in HID4 */
113 mfspr r0,SPRN_HID4
114 clrldi r0,r0,1 /* clear LPES0 */
115 ori r0,r0,HID4_LPES1 /* set LPES1 */
116 sync
117 mtspr SPRN_HID4,r0
118 isync
119
112 /* Save away cpu state */ 120 /* Save away cpu state */
113 LOAD_REG_ADDR(r5,cpu_state_storage) 121 LOAD_REG_ADDR(r5,cpu_state_storage)
114 122
@@ -117,11 +125,21 @@ load_hids:
117 std r3,CS_HID0(r5) 125 std r3,CS_HID0(r5)
118 mfspr r3,SPRN_HID1 126 mfspr r3,SPRN_HID1
119 std r3,CS_HID1(r5) 127 std r3,CS_HID1(r5)
120 mfspr r3,SPRN_HID4 128 mfspr r4,SPRN_HID4
121 std r3,CS_HID4(r5) 129 std r4,CS_HID4(r5)
122 mfspr r3,SPRN_HID5 130 mfspr r3,SPRN_HID5
123 std r3,CS_HID5(r5) 131 std r3,CS_HID5(r5)
124 132
133 /* See if we successfully set LPES1 to 1; if not we are in Apple mode */
134 andi. r4,r4,HID4_LPES1
135 bnelr
136
137no_hv_mode:
138 /* Disable CPU_FTR_HVMODE and exit, since we don't have HV mode */
139 ld r5,CPU_SPEC_FEATURES(r4)
140 LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
141 andc r5,r5,r6
142 std r5,CPU_SPEC_FEATURES(r4)
125 blr 143 blr
126 144
127/* Called with no MMU context (typically MSR:IR/DR off) to 145/* Called with no MMU context (typically MSR:IR/DR off) to
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index a85f4874cba7..41b02c792aa3 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -40,7 +40,6 @@ __start_interrupts:
40 .globl system_reset_pSeries; 40 .globl system_reset_pSeries;
41system_reset_pSeries: 41system_reset_pSeries:
42 HMT_MEDIUM; 42 HMT_MEDIUM;
43 DO_KVM 0x100;
44 SET_SCRATCH0(r13) 43 SET_SCRATCH0(r13)
45#ifdef CONFIG_PPC_P7_NAP 44#ifdef CONFIG_PPC_P7_NAP
46BEGIN_FTR_SECTION 45BEGIN_FTR_SECTION
@@ -50,82 +49,73 @@ BEGIN_FTR_SECTION
50 * state loss at this time. 49 * state loss at this time.
51 */ 50 */
52 mfspr r13,SPRN_SRR1 51 mfspr r13,SPRN_SRR1
53 rlwinm r13,r13,47-31,30,31 52 rlwinm. r13,r13,47-31,30,31
54 cmpwi cr0,r13,1 53 beq 9f
55 bne 1f 54
56 b .power7_wakeup_noloss 55 /* waking up from powersave (nap) state */
571: cmpwi cr0,r13,2 56 cmpwi cr1,r13,2
58 bne 1f
59 b .power7_wakeup_loss
60 /* Total loss of HV state is fatal, we could try to use the 57 /* Total loss of HV state is fatal, we could try to use the
61 * PIR to locate a PACA, then use an emergency stack etc... 58 * PIR to locate a PACA, then use an emergency stack etc...
62 * but for now, let's just stay stuck here 59 * but for now, let's just stay stuck here
63 */ 60 */
641: cmpwi cr0,r13,3 61 bgt cr1,.
65 beq . 62 GET_PACA(r13)
66END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206) 63
64#ifdef CONFIG_KVM_BOOK3S_64_HV
65 lbz r0,PACAPROCSTART(r13)
66 cmpwi r0,0x80
67 bne 1f
68 li r0,0
69 stb r0,PACAPROCSTART(r13)
70 b kvm_start_guest
711:
72#endif
73
74 beq cr1,2f
75 b .power7_wakeup_noloss
762: b .power7_wakeup_loss
779:
78END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
67#endif /* CONFIG_PPC_P7_NAP */ 79#endif /* CONFIG_PPC_P7_NAP */
68 EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD) 80 EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
81 NOTEST, 0x100)
69 82
70 . = 0x200 83 . = 0x200
71_machine_check_pSeries: 84machine_check_pSeries_1:
72 HMT_MEDIUM 85 /* This is moved out of line as it can be patched by FW, but
73 DO_KVM 0x200 86 * some code path might still want to branch into the original
74 SET_SCRATCH0(r13) 87 * vector
75 EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, EXC_STD) 88 */
89 b machine_check_pSeries
76 90
77 . = 0x300 91 . = 0x300
78 .globl data_access_pSeries 92 .globl data_access_pSeries
79data_access_pSeries: 93data_access_pSeries:
80 HMT_MEDIUM 94 HMT_MEDIUM
81 DO_KVM 0x300
82 SET_SCRATCH0(r13) 95 SET_SCRATCH0(r13)
96#ifndef CONFIG_POWER4_ONLY
83BEGIN_FTR_SECTION 97BEGIN_FTR_SECTION
84 GET_PACA(r13) 98 b data_access_check_stab
85 std r9,PACA_EXSLB+EX_R9(r13) 99data_access_not_stab:
86 std r10,PACA_EXSLB+EX_R10(r13) 100END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
87 mfspr r10,SPRN_DAR 101#endif
88 mfspr r9,SPRN_DSISR 102 EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
89 srdi r10,r10,60 103 KVMTEST_PR, 0x300)
90 rlwimi r10,r9,16,0x20
91 mfcr r9
92 cmpwi r10,0x2c
93 beq do_stab_bolted_pSeries
94 ld r10,PACA_EXSLB+EX_R10(r13)
95 std r11,PACA_EXGEN+EX_R11(r13)
96 ld r11,PACA_EXSLB+EX_R9(r13)
97 std r12,PACA_EXGEN+EX_R12(r13)
98 GET_SCRATCH0(r12)
99 std r10,PACA_EXGEN+EX_R10(r13)
100 std r11,PACA_EXGEN+EX_R9(r13)
101 std r12,PACA_EXGEN+EX_R13(r13)
102 EXCEPTION_PROLOG_PSERIES_1(data_access_common, EXC_STD)
103FTR_SECTION_ELSE
104 EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD)
105ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_SLB)
106 104
107 . = 0x380 105 . = 0x380
108 .globl data_access_slb_pSeries 106 .globl data_access_slb_pSeries
109data_access_slb_pSeries: 107data_access_slb_pSeries:
110 HMT_MEDIUM 108 HMT_MEDIUM
111 DO_KVM 0x380
112 SET_SCRATCH0(r13) 109 SET_SCRATCH0(r13)
113 GET_PACA(r13) 110 EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
114 std r3,PACA_EXSLB+EX_R3(r13) 111 std r3,PACA_EXSLB+EX_R3(r13)
115 mfspr r3,SPRN_DAR 112 mfspr r3,SPRN_DAR
116 std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */
117 mfcr r9
118#ifdef __DISABLED__ 113#ifdef __DISABLED__
119 /* Keep that around for when we re-implement dynamic VSIDs */ 114 /* Keep that around for when we re-implement dynamic VSIDs */
120 cmpdi r3,0 115 cmpdi r3,0
121 bge slb_miss_user_pseries 116 bge slb_miss_user_pseries
122#endif /* __DISABLED__ */ 117#endif /* __DISABLED__ */
123 std r10,PACA_EXSLB+EX_R10(r13) 118 mfspr r12,SPRN_SRR1
124 std r11,PACA_EXSLB+EX_R11(r13)
125 std r12,PACA_EXSLB+EX_R12(r13)
126 GET_SCRATCH0(r10)
127 std r10,PACA_EXSLB+EX_R13(r13)
128 mfspr r12,SPRN_SRR1 /* and SRR1 */
129#ifndef CONFIG_RELOCATABLE 119#ifndef CONFIG_RELOCATABLE
130 b .slb_miss_realmode 120 b .slb_miss_realmode
131#else 121#else
@@ -147,24 +137,16 @@ data_access_slb_pSeries:
147 .globl instruction_access_slb_pSeries 137 .globl instruction_access_slb_pSeries
148instruction_access_slb_pSeries: 138instruction_access_slb_pSeries:
149 HMT_MEDIUM 139 HMT_MEDIUM
150 DO_KVM 0x480
151 SET_SCRATCH0(r13) 140 SET_SCRATCH0(r13)
152 GET_PACA(r13) 141 EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
153 std r3,PACA_EXSLB+EX_R3(r13) 142 std r3,PACA_EXSLB+EX_R3(r13)
154 mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ 143 mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */
155 std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */
156 mfcr r9
157#ifdef __DISABLED__ 144#ifdef __DISABLED__
158 /* Keep that around for when we re-implement dynamic VSIDs */ 145 /* Keep that around for when we re-implement dynamic VSIDs */
159 cmpdi r3,0 146 cmpdi r3,0
160 bge slb_miss_user_pseries 147 bge slb_miss_user_pseries
161#endif /* __DISABLED__ */ 148#endif /* __DISABLED__ */
162 std r10,PACA_EXSLB+EX_R10(r13) 149 mfspr r12,SPRN_SRR1
163 std r11,PACA_EXSLB+EX_R11(r13)
164 std r12,PACA_EXSLB+EX_R12(r13)
165 GET_SCRATCH0(r10)
166 std r10,PACA_EXSLB+EX_R13(r13)
167 mfspr r12,SPRN_SRR1 /* and SRR1 */
168#ifndef CONFIG_RELOCATABLE 150#ifndef CONFIG_RELOCATABLE
169 b .slb_miss_realmode 151 b .slb_miss_realmode
170#else 152#else
@@ -184,26 +166,46 @@ instruction_access_slb_pSeries:
184hardware_interrupt_pSeries: 166hardware_interrupt_pSeries:
185hardware_interrupt_hv: 167hardware_interrupt_hv:
186 BEGIN_FTR_SECTION 168 BEGIN_FTR_SECTION
187 _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt, EXC_STD) 169 _MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
170 EXC_HV, SOFTEN_TEST_HV)
171 KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
188 FTR_SECTION_ELSE 172 FTR_SECTION_ELSE
189 _MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt, EXC_HV) 173 _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
190 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_HVMODE_206) 174 EXC_STD, SOFTEN_TEST_HV_201)
175 KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
176 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
191 177
192 STD_EXCEPTION_PSERIES(0x600, 0x600, alignment) 178 STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
179 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x600)
180
193 STD_EXCEPTION_PSERIES(0x700, 0x700, program_check) 181 STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
182 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x700)
183
194 STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable) 184 STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
185 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800)
195 186
196 MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer) 187 MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
197 MASKABLE_EXCEPTION_HV(0x980, 0x980, decrementer) 188 MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer)
198 189
199 STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a) 190 STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a)
191 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00)
192
200 STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b) 193 STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
194 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xb00)
201 195
202 . = 0xc00 196 . = 0xc00
203 .globl system_call_pSeries 197 .globl system_call_pSeries
204system_call_pSeries: 198system_call_pSeries:
205 HMT_MEDIUM 199 HMT_MEDIUM
206 DO_KVM 0xc00 200#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
201 SET_SCRATCH0(r13)
202 GET_PACA(r13)
203 std r9,PACA_EXGEN+EX_R9(r13)
204 std r10,PACA_EXGEN+EX_R10(r13)
205 mfcr r9
206 KVMTEST(0xc00)
207 GET_SCRATCH0(r13)
208#endif
207BEGIN_FTR_SECTION 209BEGIN_FTR_SECTION
208 cmpdi r0,0x1ebe 210 cmpdi r0,0x1ebe
209 beq- 1f 211 beq- 1f
@@ -220,6 +222,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
220 rfid 222 rfid
221 b . /* prevent speculative execution */ 223 b . /* prevent speculative execution */
222 224
225 KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
226
223/* Fast LE/BE switch system call */ 227/* Fast LE/BE switch system call */
2241: mfspr r12,SPRN_SRR1 2281: mfspr r12,SPRN_SRR1
225 xori r12,r12,MSR_LE 229 xori r12,r12,MSR_LE
@@ -228,6 +232,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
228 b . 232 b .
229 233
230 STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step) 234 STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
235 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xd00)
231 236
232 /* At 0xe??? we have a bunch of hypervisor exceptions, we branch 237 /* At 0xe??? we have a bunch of hypervisor exceptions, we branch
233 * out of line to handle them 238 * out of line to handle them
@@ -262,30 +267,93 @@ vsx_unavailable_pSeries_1:
262 267
263#ifdef CONFIG_CBE_RAS 268#ifdef CONFIG_CBE_RAS
264 STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error) 269 STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
270 KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
265#endif /* CONFIG_CBE_RAS */ 271#endif /* CONFIG_CBE_RAS */
272
266 STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint) 273 STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
274 KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
275
267#ifdef CONFIG_CBE_RAS 276#ifdef CONFIG_CBE_RAS
268 STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance) 277 STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
278 KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
269#endif /* CONFIG_CBE_RAS */ 279#endif /* CONFIG_CBE_RAS */
280
270 STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist) 281 STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
282 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x1700)
283
271#ifdef CONFIG_CBE_RAS 284#ifdef CONFIG_CBE_RAS
272 STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal) 285 STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
286 KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
273#endif /* CONFIG_CBE_RAS */ 287#endif /* CONFIG_CBE_RAS */
274 288
275 . = 0x3000 289 . = 0x3000
276 290
277/*** Out of line interrupts support ***/ 291/*** Out of line interrupts support ***/
278 292
293 /* moved from 0x200 */
294machine_check_pSeries:
295 .globl machine_check_fwnmi
296machine_check_fwnmi:
297 HMT_MEDIUM
298 SET_SCRATCH0(r13) /* save r13 */
299 EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common,
300 EXC_STD, KVMTEST, 0x200)
301 KVM_HANDLER_SKIP(PACA_EXMC, EXC_STD, 0x200)
302
303#ifndef CONFIG_POWER4_ONLY
304 /* moved from 0x300 */
305data_access_check_stab:
306 GET_PACA(r13)
307 std r9,PACA_EXSLB+EX_R9(r13)
308 std r10,PACA_EXSLB+EX_R10(r13)
309 mfspr r10,SPRN_DAR
310 mfspr r9,SPRN_DSISR
311 srdi r10,r10,60
312 rlwimi r10,r9,16,0x20
313#ifdef CONFIG_KVM_BOOK3S_PR
314 lbz r9,HSTATE_IN_GUEST(r13)
315 rlwimi r10,r9,8,0x300
316#endif
317 mfcr r9
318 cmpwi r10,0x2c
319 beq do_stab_bolted_pSeries
320 mtcrf 0x80,r9
321 ld r9,PACA_EXSLB+EX_R9(r13)
322 ld r10,PACA_EXSLB+EX_R10(r13)
323 b data_access_not_stab
324do_stab_bolted_pSeries:
325 std r11,PACA_EXSLB+EX_R11(r13)
326 std r12,PACA_EXSLB+EX_R12(r13)
327 GET_SCRATCH0(r10)
328 std r10,PACA_EXSLB+EX_R13(r13)
329 EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
330#endif /* CONFIG_POWER4_ONLY */
331
332 KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300)
333 KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380)
334 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
335 KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
336 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)
337 KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
338
339 .align 7
279 /* moved from 0xe00 */ 340 /* moved from 0xe00 */
280 STD_EXCEPTION_HV(., 0xe00, h_data_storage) 341 STD_EXCEPTION_HV(., 0xe02, h_data_storage)
281 STD_EXCEPTION_HV(., 0xe20, h_instr_storage) 342 KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0xe02)
282 STD_EXCEPTION_HV(., 0xe40, emulation_assist) 343 STD_EXCEPTION_HV(., 0xe22, h_instr_storage)
283 STD_EXCEPTION_HV(., 0xe60, hmi_exception) /* need to flush cache ? */ 344 KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe22)
345 STD_EXCEPTION_HV(., 0xe42, emulation_assist)
346 KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe42)
347 STD_EXCEPTION_HV(., 0xe62, hmi_exception) /* need to flush cache ? */
348 KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62)
284 349
285 /* moved from 0xf00 */ 350 /* moved from 0xf00 */
286 STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor) 351 STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor)
352 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf00)
287 STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable) 353 STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable)
354 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20)
288 STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable) 355 STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable)
356 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
289 357
290/* 358/*
291 * An interrupt came in while soft-disabled; clear EE in SRR1, 359 * An interrupt came in while soft-disabled; clear EE in SRR1,
@@ -317,14 +385,6 @@ masked_Hinterrupt:
317 hrfid 385 hrfid
318 b . 386 b .
319 387
320 .align 7
321do_stab_bolted_pSeries:
322 std r11,PACA_EXSLB+EX_R11(r13)
323 std r12,PACA_EXSLB+EX_R12(r13)
324 GET_SCRATCH0(r10)
325 std r10,PACA_EXSLB+EX_R13(r13)
326 EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
327
328#ifdef CONFIG_PPC_PSERIES 388#ifdef CONFIG_PPC_PSERIES
329/* 389/*
330 * Vectors for the FWNMI option. Share common code. 390 * Vectors for the FWNMI option. Share common code.
@@ -334,14 +394,8 @@ do_stab_bolted_pSeries:
334system_reset_fwnmi: 394system_reset_fwnmi:
335 HMT_MEDIUM 395 HMT_MEDIUM
336 SET_SCRATCH0(r13) /* save r13 */ 396 SET_SCRATCH0(r13) /* save r13 */
337 EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD) 397 EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
338 398 NOTEST, 0x100)
339 .globl machine_check_fwnmi
340 .align 7
341machine_check_fwnmi:
342 HMT_MEDIUM
343 SET_SCRATCH0(r13) /* save r13 */
344 EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, EXC_STD)
345 399
346#endif /* CONFIG_PPC_PSERIES */ 400#endif /* CONFIG_PPC_PSERIES */
347 401
@@ -376,7 +430,11 @@ slb_miss_user_pseries:
376/* KVM's trampoline code needs to be close to the interrupt handlers */ 430/* KVM's trampoline code needs to be close to the interrupt handlers */
377 431
378#ifdef CONFIG_KVM_BOOK3S_64_HANDLER 432#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
433#ifdef CONFIG_KVM_BOOK3S_PR
379#include "../kvm/book3s_rmhandlers.S" 434#include "../kvm/book3s_rmhandlers.S"
435#else
436#include "../kvm/book3s_hv_rmhandlers.S"
437#endif
380#endif 438#endif
381 439
382 .align 7 440 .align 7
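Note on the KVMTEST*/KVM_HANDLER* lines added throughout this file: they all follow one pattern. While still in the real-mode prolog, each first-level vector checks whether the interrupt arrived while a KVM guest was running and, if so, branches into the KVM real-mode handlers pulled in at the bottom of the file (book3s_rmhandlers.S for PR, book3s_hv_rmhandlers.S for HV). A rough C rendering of that test, with invented function names, might look like:

/*
 * Conceptual sketch only -- the real code is the KVMTEST/KVM_HANDLER asm
 * macros in exception-64s.h; first_level_vector(), kvm_guest_exit() and
 * linux_exception_prolog() are invented names for illustration.
 */
static void first_level_vector(unsigned int trap)
{
	/* HSTATE_IN_GUEST in the asm: a per-CPU flag kept in the PACA */
	if (get_paca()->kvm_hstate.in_guest)
		kvm_guest_exit(trap);		/* save guest state, exit to KVM */
	else
		linux_exception_prolog(trap);	/* normal pSeries handling */
}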
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 5ecf54cfa7d4..fe37dd0dfd17 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -656,7 +656,7 @@ load_up_spe:
656 cmpi 0,r4,0 656 cmpi 0,r4,0
657 beq 1f 657 beq 1f
658 addi r4,r4,THREAD /* want THREAD of last_task_used_spe */ 658 addi r4,r4,THREAD /* want THREAD of last_task_used_spe */
659 SAVE_32EVRS(0,r10,r4) 659 SAVE_32EVRS(0,r10,r4,THREAD_EVR0)
660 evxor evr10, evr10, evr10 /* clear out evr10 */ 660 evxor evr10, evr10, evr10 /* clear out evr10 */
661 evmwumiaa evr10, evr10, evr10 /* evr10 <- ACC = 0 * 0 + ACC */ 661 evmwumiaa evr10, evr10, evr10 /* evr10 <- ACC = 0 * 0 + ACC */
662 li r5,THREAD_ACC 662 li r5,THREAD_ACC
@@ -676,7 +676,7 @@ load_up_spe:
676 stw r4,THREAD_USED_SPE(r5) 676 stw r4,THREAD_USED_SPE(r5)
677 evlddx evr4,r10,r5 677 evlddx evr4,r10,r5
678 evmra evr4,evr4 678 evmra evr4,evr4
679 REST_32EVRS(0,r10,r5) 679 REST_32EVRS(0,r10,r5,THREAD_EVR0)
680#ifndef CONFIG_SMP 680#ifndef CONFIG_SMP
681 subi r4,r5,THREAD 681 subi r4,r5,THREAD
682 stw r4,last_task_used_spe@l(r3) 682 stw r4,last_task_used_spe@l(r3)
@@ -787,13 +787,11 @@ _GLOBAL(giveup_spe)
787 addi r3,r3,THREAD /* want THREAD of task */ 787 addi r3,r3,THREAD /* want THREAD of task */
788 lwz r5,PT_REGS(r3) 788 lwz r5,PT_REGS(r3)
789 cmpi 0,r5,0 789 cmpi 0,r5,0
790 SAVE_32EVRS(0, r4, r3) 790 SAVE_32EVRS(0, r4, r3, THREAD_EVR0)
791 evxor evr6, evr6, evr6 /* clear out evr6 */ 791 evxor evr6, evr6, evr6 /* clear out evr6 */
792 evmwumiaa evr6, evr6, evr6 /* evr6 <- ACC = 0 * 0 + ACC */ 792 evmwumiaa evr6, evr6, evr6 /* evr6 <- ACC = 0 * 0 + ACC */
793 li r4,THREAD_ACC 793 li r4,THREAD_ACC
794 evstddx evr6, r4, r3 /* save off accumulator */ 794 evstddx evr6, r4, r3 /* save off accumulator */
795 mfspr r6,SPRN_SPEFSCR
796 stw r6,THREAD_SPEFSCR(r3) /* save spefscr register value */
797 beq 1f 795 beq 1f
798 lwz r4,_MSR-STACK_FRAME_OVERHEAD(r5) 796 lwz r4,_MSR-STACK_FRAME_OVERHEAD(r5)
799 lis r3,MSR_SPE@h 797 lis r3,MSR_SPE@h
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index f8f0bc7f1d4f..3a70845a51c7 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -73,7 +73,6 @@ _GLOBAL(power7_idle)
73 b . 73 b .
74 74
75_GLOBAL(power7_wakeup_loss) 75_GLOBAL(power7_wakeup_loss)
76 GET_PACA(r13)
77 ld r1,PACAR1(r13) 76 ld r1,PACAR1(r13)
78 REST_NVGPRS(r1) 77 REST_NVGPRS(r1)
79 REST_GPR(2, r1) 78 REST_GPR(2, r1)
@@ -87,7 +86,6 @@ _GLOBAL(power7_wakeup_loss)
87 rfid 86 rfid
88 87
89_GLOBAL(power7_wakeup_noloss) 88_GLOBAL(power7_wakeup_noloss)
90 GET_PACA(r13)
91 ld r1,PACAR1(r13) 89 ld r1,PACAR1(r13)
92 ld r4,_MSR(r1) 90 ld r4,_MSR(r1)
93 ld r5,_NIP(r1) 91 ld r5,_NIP(r1)
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index efeb88184182..0a5a899846bb 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -167,7 +167,7 @@ void setup_paca(struct paca_struct *new_paca)
167 * if we do a GET_PACA() before the feature fixups have been 167 * if we do a GET_PACA() before the feature fixups have been
168 * applied 168 * applied
169 */ 169 */
170 if (cpu_has_feature(CPU_FTR_HVMODE_206)) 170 if (cpu_has_feature(CPU_FTR_HVMODE))
171 mtspr(SPRN_SPRG_HPACA, local_paca); 171 mtspr(SPRN_SPRG_HPACA, local_paca);
172#endif 172#endif
173 mtspr(SPRN_SPRG_PACA, local_paca); 173 mtspr(SPRN_SPRG_PACA, local_paca);
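Related to this hunk: the old combined CPU_FTR_HVMODE_206 bit is split in this series into CPU_FTR_HVMODE and CPU_FTR_ARCH_206 (visible in the END_FTR_SECTION_IFSET changes earlier in the diff), so C code that needs both properties now tests them separately. A minimal sketch, assuming the usual cpu_has_feature() helper; the wrapper name is hypothetical:

/* Sketch: hv_mode_and_arch_206() is a hypothetical helper name. */
static inline bool hv_mode_and_arch_206(void)
{
	return cpu_has_feature(CPU_FTR_HVMODE) &&
	       cpu_has_feature(CPU_FTR_ARCH_206);
}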
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 91e52df3d81d..ec2d0edeb134 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -96,6 +96,7 @@ void flush_fp_to_thread(struct task_struct *tsk)
96 preempt_enable(); 96 preempt_enable();
97 } 97 }
98} 98}
99EXPORT_SYMBOL_GPL(flush_fp_to_thread);
99 100
100void enable_kernel_fp(void) 101void enable_kernel_fp(void)
101{ 102{
@@ -145,6 +146,7 @@ void flush_altivec_to_thread(struct task_struct *tsk)
145 preempt_enable(); 146 preempt_enable();
146 } 147 }
147} 148}
149EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
148#endif /* CONFIG_ALTIVEC */ 150#endif /* CONFIG_ALTIVEC */
149 151
150#ifdef CONFIG_VSX 152#ifdef CONFIG_VSX
@@ -186,6 +188,7 @@ void flush_vsx_to_thread(struct task_struct *tsk)
186 preempt_enable(); 188 preempt_enable();
187 } 189 }
188} 190}
191EXPORT_SYMBOL_GPL(flush_vsx_to_thread);
189#endif /* CONFIG_VSX */ 192#endif /* CONFIG_VSX */
190 193
191#ifdef CONFIG_SPE 194#ifdef CONFIG_SPE
@@ -213,6 +216,7 @@ void flush_spe_to_thread(struct task_struct *tsk)
213#ifdef CONFIG_SMP 216#ifdef CONFIG_SMP
214 BUG_ON(tsk != current); 217 BUG_ON(tsk != current);
215#endif 218#endif
219 tsk->thread.spefscr = mfspr(SPRN_SPEFSCR);
216 giveup_spe(tsk); 220 giveup_spe(tsk);
217 } 221 }
218 preempt_enable(); 222 preempt_enable();
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 79fca2651b65..22051ef04bd9 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -375,6 +375,9 @@ void __init check_for_initrd(void)
375 375
376int threads_per_core, threads_shift; 376int threads_per_core, threads_shift;
377cpumask_t threads_core_mask; 377cpumask_t threads_core_mask;
378EXPORT_SYMBOL_GPL(threads_per_core);
379EXPORT_SYMBOL_GPL(threads_shift);
380EXPORT_SYMBOL_GPL(threads_core_mask);
378 381
379static void __init cpu_init_thread_core_maps(int tpc) 382static void __init cpu_init_thread_core_maps(int tpc)
380{ 383{
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index a88bf2713d41..532054f24ecb 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -63,6 +63,7 @@
63#include <asm/kexec.h> 63#include <asm/kexec.h>
64#include <asm/mmu_context.h> 64#include <asm/mmu_context.h>
65#include <asm/code-patching.h> 65#include <asm/code-patching.h>
66#include <asm/kvm_ppc.h>
66 67
67#include "setup.h" 68#include "setup.h"
68 69
@@ -580,6 +581,8 @@ void __init setup_arch(char **cmdline_p)
580 /* Initialize the MMU context management stuff */ 581 /* Initialize the MMU context management stuff */
581 mmu_context_init(); 582 mmu_context_init();
582 583
584 kvm_rma_init();
585
583 ppc64_boot_msg(0x15, "Setup Done"); 586 ppc64_boot_msg(0x15, "Setup Done");
584} 587}
585 588
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8ebc6700b98d..09a85a9045d6 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -243,6 +243,7 @@ void smp_send_reschedule(int cpu)
243 if (likely(smp_ops)) 243 if (likely(smp_ops))
244 smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE); 244 smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE);
245} 245}
246EXPORT_SYMBOL_GPL(smp_send_reschedule);
246 247
247void arch_send_call_function_single_ipi(int cpu) 248void arch_send_call_function_single_ipi(int cpu)
248{ 249{
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 1a0141426cda..f19d9777d3c1 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1387,10 +1387,7 @@ void SPEFloatingPointException(struct pt_regs *regs)
1387 int code = 0; 1387 int code = 0;
1388 int err; 1388 int err;
1389 1389
1390 preempt_disable(); 1390 flush_spe_to_thread(current);
1391 if (regs->msr & MSR_SPE)
1392 giveup_spe(current);
1393 preempt_enable();
1394 1391
1395 spefscr = current->thread.spefscr; 1392 spefscr = current->thread.spefscr;
1396 fpexc_mode = current->thread.fpexc_mode; 1393 fpexc_mode = current->thread.fpexc_mode;
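Together with the process.c hunk earlier in this diff, this change means the SPE exception path now relies on flush_spe_to_thread() to save SPEFSCR along with the SPE register state. Pieced together from the hunks shown here (the full function body is not part of this diff), the helper presumably ends up roughly as:

/* Approximate shape of flush_spe_to_thread() after this change; the
 * surrounding body is reconstructed from diff context, so treat it as
 * a sketch rather than the exact source. */
void flush_spe_to_thread(struct task_struct *tsk)
{
	if (tsk->thread.regs) {
		preempt_disable();
		if (tsk->thread.regs->msr & MSR_SPE) {
#ifdef CONFIG_SMP
			BUG_ON(tsk != current);
#endif
			/* new: snapshot SPEFSCR before giving up the SPE unit */
			tsk->thread.spefscr = mfspr(SPRN_SPEFSCR);
			giveup_spe(tsk);
		}
		preempt_enable();
	}
}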
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 5f3cff83e089..33aa715dab28 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -387,8 +387,10 @@ static void kvmppc_44x_invalidate(struct kvm_vcpu *vcpu,
387 } 387 }
388} 388}
389 389
390void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode) 390void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
391{ 391{
392 int usermode = vcpu->arch.shared->msr & MSR_PR;
393
392 vcpu->arch.shadow_pid = !usermode; 394 vcpu->arch.shadow_pid = !usermode;
393} 395}
394 396
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 105b6918b23e..78133deb4b64 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,6 @@ config KVM
20 bool 20 bool
21 select PREEMPT_NOTIFIERS 21 select PREEMPT_NOTIFIERS
22 select ANON_INODES 22 select ANON_INODES
23 select KVM_MMIO
24 23
25config KVM_BOOK3S_HANDLER 24config KVM_BOOK3S_HANDLER
26 bool 25 bool
@@ -28,16 +27,22 @@ config KVM_BOOK3S_HANDLER
28config KVM_BOOK3S_32_HANDLER 27config KVM_BOOK3S_32_HANDLER
29 bool 28 bool
30 select KVM_BOOK3S_HANDLER 29 select KVM_BOOK3S_HANDLER
30 select KVM_MMIO
31 31
32config KVM_BOOK3S_64_HANDLER 32config KVM_BOOK3S_64_HANDLER
33 bool 33 bool
34 select KVM_BOOK3S_HANDLER 34 select KVM_BOOK3S_HANDLER
35 35
36config KVM_BOOK3S_PR
37 bool
38 select KVM_MMIO
39
36config KVM_BOOK3S_32 40config KVM_BOOK3S_32
37 tristate "KVM support for PowerPC book3s_32 processors" 41 tristate "KVM support for PowerPC book3s_32 processors"
38 depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT 42 depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
39 select KVM 43 select KVM
40 select KVM_BOOK3S_32_HANDLER 44 select KVM_BOOK3S_32_HANDLER
45 select KVM_BOOK3S_PR
41 ---help--- 46 ---help---
42 Support running unmodified book3s_32 guest kernels 47 Support running unmodified book3s_32 guest kernels
43 in virtual machines on book3s_32 host processors. 48 in virtual machines on book3s_32 host processors.
@@ -50,8 +55,8 @@ config KVM_BOOK3S_32
50config KVM_BOOK3S_64 55config KVM_BOOK3S_64
51 tristate "KVM support for PowerPC book3s_64 processors" 56 tristate "KVM support for PowerPC book3s_64 processors"
52 depends on EXPERIMENTAL && PPC_BOOK3S_64 57 depends on EXPERIMENTAL && PPC_BOOK3S_64
53 select KVM
54 select KVM_BOOK3S_64_HANDLER 58 select KVM_BOOK3S_64_HANDLER
59 select KVM
55 ---help--- 60 ---help---
56 Support running unmodified book3s_64 and book3s_32 guest kernels 61 Support running unmodified book3s_64 and book3s_32 guest kernels
57 in virtual machines on book3s_64 host processors. 62 in virtual machines on book3s_64 host processors.
@@ -61,10 +66,34 @@ config KVM_BOOK3S_64
61 66
62 If unsure, say N. 67 If unsure, say N.
63 68
69config KVM_BOOK3S_64_HV
70 bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
71 depends on KVM_BOOK3S_64
72 ---help---
73 Support running unmodified book3s_64 guest kernels in
74 virtual machines on POWER7 and PPC970 processors that have
75 hypervisor mode available to the host.
76
77 If you say Y here, KVM will use the hardware virtualization
78 facilities of POWER7 (and later) processors, meaning that
79 guest operating systems will run at full hardware speed
80 using supervisor and user modes. However, this also means
81 that KVM is not usable under PowerVM (pHyp), is only usable
82 on POWER7 (or later) processors and PPC970-family processors,
83 and cannot emulate a different processor from the host processor.
84
85 If unsure, say N.
86
87config KVM_BOOK3S_64_PR
88 def_bool y
89 depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV
90 select KVM_BOOK3S_PR
91
64config KVM_440 92config KVM_440
65 bool "KVM support for PowerPC 440 processors" 93 bool "KVM support for PowerPC 440 processors"
66 depends on EXPERIMENTAL && 44x 94 depends on EXPERIMENTAL && 44x
67 select KVM 95 select KVM
96 select KVM_MMIO
68 ---help--- 97 ---help---
69 Support running unmodified 440 guest kernels in virtual machines on 98 Support running unmodified 440 guest kernels in virtual machines on
70 440 host processors. 99 440 host processors.
@@ -89,6 +118,7 @@ config KVM_E500
89 bool "KVM support for PowerPC E500 processors" 118 bool "KVM support for PowerPC E500 processors"
90 depends on EXPERIMENTAL && E500 119 depends on EXPERIMENTAL && E500
91 select KVM 120 select KVM
121 select KVM_MMIO
92 ---help--- 122 ---help---
93 Support running unmodified E500 guest kernels in virtual machines on 123 Support running unmodified E500 guest kernels in virtual machines on
94 E500 host processors. 124 E500 host processors.
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 4d6863823f69..08428e2c188d 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -38,24 +38,42 @@ kvm-e500-objs := \
38 e500_emulate.o 38 e500_emulate.o
39kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs) 39kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
40 40
41kvm-book3s_64-objs := \ 41kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
42 $(common-objs-y) \ 42 ../../../virt/kvm/coalesced_mmio.o \
43 fpu.o \ 43 fpu.o \
44 book3s_paired_singles.o \ 44 book3s_paired_singles.o \
45 book3s.o \ 45 book3s_pr.o \
46 book3s_emulate.o \ 46 book3s_emulate.o \
47 book3s_interrupts.o \ 47 book3s_interrupts.o \
48 book3s_mmu_hpte.o \ 48 book3s_mmu_hpte.o \
49 book3s_64_mmu_host.o \ 49 book3s_64_mmu_host.o \
50 book3s_64_mmu.o \ 50 book3s_64_mmu.o \
51 book3s_32_mmu.o 51 book3s_32_mmu.o
52kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs) 52
53kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
54 book3s_hv.o \
55 book3s_hv_interrupts.o \
56 book3s_64_mmu_hv.o
57kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
58 book3s_hv_rm_mmu.o \
59 book3s_64_vio_hv.o \
60 book3s_hv_builtin.o
61
62kvm-book3s_64-module-objs := \
63 ../../../virt/kvm/kvm_main.o \
64 powerpc.o \
65 emulate.o \
66 book3s.o \
67 $(kvm-book3s_64-objs-y)
68
69kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
53 70
54kvm-book3s_32-objs := \ 71kvm-book3s_32-objs := \
55 $(common-objs-y) \ 72 $(common-objs-y) \
56 fpu.o \ 73 fpu.o \
57 book3s_paired_singles.o \ 74 book3s_paired_singles.o \
58 book3s.o \ 75 book3s.o \
76 book3s_pr.o \
59 book3s_emulate.o \ 77 book3s_emulate.o \
60 book3s_interrupts.o \ 78 book3s_interrupts.o \
61 book3s_mmu_hpte.o \ 79 book3s_mmu_hpte.o \
@@ -70,3 +88,4 @@ obj-$(CONFIG_KVM_E500) += kvm.o
70obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o 88obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o
71obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o 89obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o
72 90
91obj-y += $(kvm-book3s_64-builtin-objs-y)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 0f95b5cce033..f68a34d16035 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -17,7 +17,6 @@
17#include <linux/kvm_host.h> 17#include <linux/kvm_host.h>
18#include <linux/err.h> 18#include <linux/err.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include "trace.h"
21 20
22#include <asm/reg.h> 21#include <asm/reg.h>
23#include <asm/cputable.h> 22#include <asm/cputable.h>
@@ -28,25 +27,17 @@
28#include <asm/kvm_ppc.h> 27#include <asm/kvm_ppc.h>
29#include <asm/kvm_book3s.h> 28#include <asm/kvm_book3s.h>
30#include <asm/mmu_context.h> 29#include <asm/mmu_context.h>
30#include <asm/page.h>
31#include <linux/gfp.h> 31#include <linux/gfp.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/highmem.h> 34#include <linux/highmem.h>
35 35
36#include "trace.h"
37
36#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 38#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
37 39
38/* #define EXIT_DEBUG */ 40/* #define EXIT_DEBUG */
39/* #define DEBUG_EXT */
40
41static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
42 ulong msr);
43
44/* Some compatibility defines */
45#ifdef CONFIG_PPC_BOOK3S_32
46#define MSR_USER32 MSR_USER
47#define MSR_USER64 MSR_USER
48#define HW_PAGE_SIZE PAGE_SIZE
49#endif
50 41
51struct kvm_stats_debugfs_item debugfs_entries[] = { 42struct kvm_stats_debugfs_item debugfs_entries[] = {
52 { "exits", VCPU_STAT(sum_exits) }, 43 { "exits", VCPU_STAT(sum_exits) },
@@ -77,100 +68,11 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
77{ 68{
78} 69}
79 70
80void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
81{
82#ifdef CONFIG_PPC_BOOK3S_64
83 memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb));
84 memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
85 sizeof(get_paca()->shadow_vcpu));
86 to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max;
87#endif
88
89#ifdef CONFIG_PPC_BOOK3S_32
90 current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
91#endif
92}
93
94void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
95{
96#ifdef CONFIG_PPC_BOOK3S_64
97 memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb));
98 memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
99 sizeof(get_paca()->shadow_vcpu));
100 to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max;
101#endif
102
103 kvmppc_giveup_ext(vcpu, MSR_FP);
104 kvmppc_giveup_ext(vcpu, MSR_VEC);
105 kvmppc_giveup_ext(vcpu, MSR_VSX);
106}
107
108static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
109{
110 ulong smsr = vcpu->arch.shared->msr;
111
112 /* Guest MSR values */
113 smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
114 /* Process MSR values */
115 smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
116 /* External providers the guest reserved */
117 smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext);
118 /* 64-bit Process MSR values */
119#ifdef CONFIG_PPC_BOOK3S_64
120 smsr |= MSR_ISF | MSR_HV;
121#endif
122 vcpu->arch.shadow_msr = smsr;
123}
124
125void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
126{
127 ulong old_msr = vcpu->arch.shared->msr;
128
129#ifdef EXIT_DEBUG
130 printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
131#endif
132
133 msr &= to_book3s(vcpu)->msr_mask;
134 vcpu->arch.shared->msr = msr;
135 kvmppc_recalc_shadow_msr(vcpu);
136
137 if (msr & MSR_POW) {
138 if (!vcpu->arch.pending_exceptions) {
139 kvm_vcpu_block(vcpu);
140 vcpu->stat.halt_wakeup++;
141
142 /* Unset POW bit after we woke up */
143 msr &= ~MSR_POW;
144 vcpu->arch.shared->msr = msr;
145 }
146 }
147
148 if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) !=
149 (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
150 kvmppc_mmu_flush_segments(vcpu);
151 kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
152
153 /* Preload magic page segment when in kernel mode */
154 if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
155 struct kvm_vcpu_arch *a = &vcpu->arch;
156
157 if (msr & MSR_DR)
158 kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
159 else
160 kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
161 }
162 }
163
164 /* Preload FPU if it's enabled */
165 if (vcpu->arch.shared->msr & MSR_FP)
166 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
167}
168
169void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) 71void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
170{ 72{
171 vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu); 73 vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu);
172 vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags; 74 vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags;
173 kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec); 75 kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec);
174 vcpu->arch.mmu.reset_msr(vcpu); 76 vcpu->arch.mmu.reset_msr(vcpu);
175} 77}
176 78
@@ -204,11 +106,13 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
204static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, 106static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
205 unsigned int vec) 107 unsigned int vec)
206{ 108{
109 unsigned long old_pending = vcpu->arch.pending_exceptions;
110
207 clear_bit(kvmppc_book3s_vec2irqprio(vec), 111 clear_bit(kvmppc_book3s_vec2irqprio(vec),
208 &vcpu->arch.pending_exceptions); 112 &vcpu->arch.pending_exceptions);
209 113
210 if (!vcpu->arch.pending_exceptions) 114 kvmppc_update_int_pending(vcpu, vcpu->arch.pending_exceptions,
211 vcpu->arch.shared->int_pending = 0; 115 old_pending);
212} 116}
213 117
214void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) 118void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
@@ -225,8 +129,8 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
225 129
226void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) 130void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
227{ 131{
228 to_book3s(vcpu)->prog_flags = flags; 132 /* might as well deliver this straight away */
229 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM); 133 kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags);
230} 134}
231 135
232void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) 136void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
@@ -266,21 +170,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
266{ 170{
267 int deliver = 1; 171 int deliver = 1;
268 int vec = 0; 172 int vec = 0;
269 ulong flags = 0ULL; 173 bool crit = kvmppc_critical_section(vcpu);
270 ulong crit_raw = vcpu->arch.shared->critical;
271 ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
272 bool crit;
273
274 /* Truncate crit indicators in 32 bit mode */
275 if (!(vcpu->arch.shared->msr & MSR_SF)) {
276 crit_raw &= 0xffffffff;
277 crit_r1 &= 0xffffffff;
278 }
279
280 /* Critical section when crit == r1 */
281 crit = (crit_raw == crit_r1);
282 /* ... and we're in supervisor mode */
283 crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
284 174
285 switch (priority) { 175 switch (priority) {
286 case BOOK3S_IRQPRIO_DECREMENTER: 176 case BOOK3S_IRQPRIO_DECREMENTER:
@@ -315,7 +205,6 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
315 break; 205 break;
316 case BOOK3S_IRQPRIO_PROGRAM: 206 case BOOK3S_IRQPRIO_PROGRAM:
317 vec = BOOK3S_INTERRUPT_PROGRAM; 207 vec = BOOK3S_INTERRUPT_PROGRAM;
318 flags = to_book3s(vcpu)->prog_flags;
319 break; 208 break;
320 case BOOK3S_IRQPRIO_VSX: 209 case BOOK3S_IRQPRIO_VSX:
321 vec = BOOK3S_INTERRUPT_VSX; 210 vec = BOOK3S_INTERRUPT_VSX;
@@ -346,7 +235,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
346#endif 235#endif
347 236
348 if (deliver) 237 if (deliver)
349 kvmppc_inject_interrupt(vcpu, vec, flags); 238 kvmppc_inject_interrupt(vcpu, vec, 0);
350 239
351 return deliver; 240 return deliver;
352} 241}
@@ -392,64 +281,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
392 } 281 }
393 282
394 /* Tell the guest about our interrupt status */ 283 /* Tell the guest about our interrupt status */
395 if (*pending) 284 kvmppc_update_int_pending(vcpu, *pending, old_pending);
396 vcpu->arch.shared->int_pending = 1;
397 else if (old_pending)
398 vcpu->arch.shared->int_pending = 0;
399}
400
401void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
402{
403 u32 host_pvr;
404
405 vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB;
406 vcpu->arch.pvr = pvr;
407#ifdef CONFIG_PPC_BOOK3S_64
408 if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
409 kvmppc_mmu_book3s_64_init(vcpu);
410 to_book3s(vcpu)->hior = 0xfff00000;
411 to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
412 } else
413#endif
414 {
415 kvmppc_mmu_book3s_32_init(vcpu);
416 to_book3s(vcpu)->hior = 0;
417 to_book3s(vcpu)->msr_mask = 0xffffffffULL;
418 }
419
420 /* If we are in hypervisor level on 970, we can tell the CPU to
421 * treat DCBZ as 32 bytes store */
422 vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
423 if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) &&
424 !strcmp(cur_cpu_spec->platform, "ppc970"))
425 vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
426
427 /* Cell performs badly if MSR_FEx are set. So let's hope nobody
428 really needs them in a VM on Cell and force disable them. */
429 if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be"))
430 to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1);
431
432#ifdef CONFIG_PPC_BOOK3S_32
433 /* 32 bit Book3S always has 32 byte dcbz */
434 vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
435#endif
436
437 /* On some CPUs we can execute paired single operations natively */
438 asm ( "mfpvr %0" : "=r"(host_pvr));
439 switch (host_pvr) {
440 case 0x00080200: /* lonestar 2.0 */
441 case 0x00088202: /* lonestar 2.2 */
442 case 0x70000100: /* gekko 1.0 */
443 case 0x00080100: /* gekko 2.0 */
444 case 0x00083203: /* gekko 2.3a */
445 case 0x00083213: /* gekko 2.3b */
446 case 0x00083204: /* gekko 2.4 */
447 case 0x00083214: /* gekko 2.4e (8SE) - retail HW2 */
448 case 0x00087200: /* broadway */
449 vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS;
450 /* Enable HID2.PSE - in case we need it later */
451 mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29));
452 }
453} 285}
454 286
455pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 287pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -471,44 +303,6 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
471 return gfn_to_pfn(vcpu->kvm, gfn); 303 return gfn_to_pfn(vcpu->kvm, gfn);
472} 304}
473 305
474/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To
475 * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to
476 * emulate 32 bytes dcbz length.
477 *
478 * The Book3s_64 inventors also realized this case and implemented a special bit
479 * in the HID5 register, which is a hypervisor ressource. Thus we can't use it.
480 *
481 * My approach here is to patch the dcbz instruction on executing pages.
482 */
483static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
484{
485 struct page *hpage;
486 u64 hpage_offset;
487 u32 *page;
488 int i;
489
490 hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
491 if (is_error_page(hpage)) {
492 kvm_release_page_clean(hpage);
493 return;
494 }
495
496 hpage_offset = pte->raddr & ~PAGE_MASK;
497 hpage_offset &= ~0xFFFULL;
498 hpage_offset /= 4;
499
500 get_page(hpage);
501 page = kmap_atomic(hpage, KM_USER0);
502
503 /* patch dcbz into reserved instruction, so we trap */
504 for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
505 if ((page[i] & 0xff0007ff) == INS_DCBZ)
506 page[i] &= 0xfffffff7;
507
508 kunmap_atomic(page, KM_USER0);
509 put_page(hpage);
510}
511
512static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data, 306static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
513 struct kvmppc_pte *pte) 307 struct kvmppc_pte *pte)
514{ 308{
@@ -606,519 +400,6 @@ mmio:
606 return EMULATE_DO_MMIO; 400 return EMULATE_DO_MMIO;
607} 401}
608 402
609static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
610{
611 ulong mp_pa = vcpu->arch.magic_page_pa;
612
613 if (unlikely(mp_pa) &&
614 unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) {
615 return 1;
616 }
617
618 return kvm_is_visible_gfn(vcpu->kvm, gfn);
619}
620
621int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
622 ulong eaddr, int vec)
623{
624 bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE);
625 int r = RESUME_GUEST;
626 int relocated;
627 int page_found = 0;
628 struct kvmppc_pte pte;
629 bool is_mmio = false;
630 bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false;
631 bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false;
632 u64 vsid;
633
634 relocated = data ? dr : ir;
635
636 /* Resolve real address if translation turned on */
637 if (relocated) {
638 page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data);
639 } else {
640 pte.may_execute = true;
641 pte.may_read = true;
642 pte.may_write = true;
643 pte.raddr = eaddr & KVM_PAM;
644 pte.eaddr = eaddr;
645 pte.vpage = eaddr >> 12;
646 }
647
648 switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
649 case 0:
650 pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
651 break;
652 case MSR_DR:
653 case MSR_IR:
654 vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
655
656 if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR)
657 pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
658 else
659 pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
660 pte.vpage |= vsid;
661
662 if (vsid == -1)
663 page_found = -EINVAL;
664 break;
665 }
666
667 if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
668 (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
669 /*
670 * If we do the dcbz hack, we have to NX on every execution,
671 * so we can patch the executing code. This renders our guest
672 * NX-less.
673 */
674 pte.may_execute = !data;
675 }
676
677 if (page_found == -ENOENT) {
678 /* Page not found in guest PTE entries */
679 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
680 vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
681 vcpu->arch.shared->msr |=
682 (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
683 kvmppc_book3s_queue_irqprio(vcpu, vec);
684 } else if (page_found == -EPERM) {
685 /* Storage protection */
686 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
687 vcpu->arch.shared->dsisr =
688 to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
689 vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
690 vcpu->arch.shared->msr |=
691 (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
692 kvmppc_book3s_queue_irqprio(vcpu, vec);
693 } else if (page_found == -EINVAL) {
694 /* Page not found in guest SLB */
695 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
696 kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
697 } else if (!is_mmio &&
698 kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
699 /* The guest's PTE is not mapped yet. Map on the host */
700 kvmppc_mmu_map_page(vcpu, &pte);
701 if (data)
702 vcpu->stat.sp_storage++;
703 else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
704 (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
705 kvmppc_patch_dcbz(vcpu, &pte);
706 } else {
707 /* MMIO */
708 vcpu->stat.mmio_exits++;
709 vcpu->arch.paddr_accessed = pte.raddr;
710 r = kvmppc_emulate_mmio(run, vcpu);
711 if ( r == RESUME_HOST_NV )
712 r = RESUME_HOST;
713 }
714
715 return r;
716}
717
718static inline int get_fpr_index(int i)
719{
720#ifdef CONFIG_VSX
721 i *= 2;
722#endif
723 return i;
724}
725
726/* Give up external provider (FPU, Altivec, VSX) */
727void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
728{
729 struct thread_struct *t = &current->thread;
730 u64 *vcpu_fpr = vcpu->arch.fpr;
731#ifdef CONFIG_VSX
732 u64 *vcpu_vsx = vcpu->arch.vsr;
733#endif
734 u64 *thread_fpr = (u64*)t->fpr;
735 int i;
736
737 if (!(vcpu->arch.guest_owned_ext & msr))
738 return;
739
740#ifdef DEBUG_EXT
741 printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
742#endif
743
744 switch (msr) {
745 case MSR_FP:
746 giveup_fpu(current);
747 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
748 vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
749
750 vcpu->arch.fpscr = t->fpscr.val;
751 break;
752 case MSR_VEC:
753#ifdef CONFIG_ALTIVEC
754 giveup_altivec(current);
755 memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
756 vcpu->arch.vscr = t->vscr;
757#endif
758 break;
759 case MSR_VSX:
760#ifdef CONFIG_VSX
761 __giveup_vsx(current);
762 for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
763 vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
764#endif
765 break;
766 default:
767 BUG();
768 }
769
770 vcpu->arch.guest_owned_ext &= ~msr;
771 current->thread.regs->msr &= ~msr;
772 kvmppc_recalc_shadow_msr(vcpu);
773}
774
775static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
776{
777 ulong srr0 = kvmppc_get_pc(vcpu);
778 u32 last_inst = kvmppc_get_last_inst(vcpu);
779 int ret;
780
781 ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
782 if (ret == -ENOENT) {
783 ulong msr = vcpu->arch.shared->msr;
784
785 msr = kvmppc_set_field(msr, 33, 33, 1);
786 msr = kvmppc_set_field(msr, 34, 36, 0);
787 vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0);
788 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
789 return EMULATE_AGAIN;
790 }
791
792 return EMULATE_DONE;
793}
794
795static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr)
796{
797
798 /* Need to do paired single emulation? */
799 if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
800 return EMULATE_DONE;
801
802 /* Read out the instruction */
803 if (kvmppc_read_inst(vcpu) == EMULATE_DONE)
804 /* Need to emulate */
805 return EMULATE_FAIL;
806
807 return EMULATE_AGAIN;
808}
809
810/* Handle external providers (FPU, Altivec, VSX) */
811static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
812 ulong msr)
813{
814 struct thread_struct *t = &current->thread;
815 u64 *vcpu_fpr = vcpu->arch.fpr;
816#ifdef CONFIG_VSX
817 u64 *vcpu_vsx = vcpu->arch.vsr;
818#endif
819 u64 *thread_fpr = (u64*)t->fpr;
820 int i;
821
822 /* When we have paired singles, we emulate in software */
823 if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
824 return RESUME_GUEST;
825
826 if (!(vcpu->arch.shared->msr & msr)) {
827 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
828 return RESUME_GUEST;
829 }
830
831 /* We already own the ext */
832 if (vcpu->arch.guest_owned_ext & msr) {
833 return RESUME_GUEST;
834 }
835
836#ifdef DEBUG_EXT
837 printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
838#endif
839
840 current->thread.regs->msr |= msr;
841
842 switch (msr) {
843 case MSR_FP:
844 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
845 thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
846
847 t->fpscr.val = vcpu->arch.fpscr;
848 t->fpexc_mode = 0;
849 kvmppc_load_up_fpu();
850 break;
851 case MSR_VEC:
852#ifdef CONFIG_ALTIVEC
853 memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
854 t->vscr = vcpu->arch.vscr;
855 t->vrsave = -1;
856 kvmppc_load_up_altivec();
857#endif
858 break;
859 case MSR_VSX:
860#ifdef CONFIG_VSX
861 for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
862 thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
863 kvmppc_load_up_vsx();
864#endif
865 break;
866 default:
867 BUG();
868 }
869
870 vcpu->arch.guest_owned_ext |= msr;
871
872 kvmppc_recalc_shadow_msr(vcpu);
873
874 return RESUME_GUEST;
875}
876
877int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
878 unsigned int exit_nr)
879{
880 int r = RESUME_HOST;
881
882 vcpu->stat.sum_exits++;
883
884 run->exit_reason = KVM_EXIT_UNKNOWN;
885 run->ready_for_interrupt_injection = 1;
886
887 trace_kvm_book3s_exit(exit_nr, vcpu);
888 kvm_resched(vcpu);
889 switch (exit_nr) {
890 case BOOK3S_INTERRUPT_INST_STORAGE:
891 vcpu->stat.pf_instruc++;
892
893#ifdef CONFIG_PPC_BOOK3S_32
894 /* We set segments as unused segments when invalidating them. So
895 * treat the respective fault as segment fault. */
896 if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]
897 == SR_INVALID) {
898 kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
899 r = RESUME_GUEST;
900 break;
901 }
902#endif
903
904 /* only care about PTEG not found errors, but leave NX alone */
905 if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) {
906 r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
907 vcpu->stat.sp_instruc++;
908 } else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
909 (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
910 /*
911 * XXX If we do the dcbz hack we use the NX bit to flush&patch the page,
912 * so we can't use the NX bit inside the guest. Let's cross our fingers,
913 * that no guest that needs the dcbz hack does NX.
914 */
915 kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
916 r = RESUME_GUEST;
917 } else {
918 vcpu->arch.shared->msr |=
919 to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
920 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
921 r = RESUME_GUEST;
922 }
923 break;
924 case BOOK3S_INTERRUPT_DATA_STORAGE:
925 {
926 ulong dar = kvmppc_get_fault_dar(vcpu);
927 vcpu->stat.pf_storage++;
928
929#ifdef CONFIG_PPC_BOOK3S_32
930 /* We set segments as unused segments when invalidating them. So
931 * treat the respective fault as segment fault. */
932 if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) {
933 kvmppc_mmu_map_segment(vcpu, dar);
934 r = RESUME_GUEST;
935 break;
936 }
937#endif
938
939 /* The only case we need to handle is missing shadow PTEs */
940 if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
941 r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
942 } else {
943 vcpu->arch.shared->dar = dar;
944 vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
945 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
946 r = RESUME_GUEST;
947 }
948 break;
949 }
950 case BOOK3S_INTERRUPT_DATA_SEGMENT:
951 if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
952 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
953 kvmppc_book3s_queue_irqprio(vcpu,
954 BOOK3S_INTERRUPT_DATA_SEGMENT);
955 }
956 r = RESUME_GUEST;
957 break;
958 case BOOK3S_INTERRUPT_INST_SEGMENT:
959 if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) {
960 kvmppc_book3s_queue_irqprio(vcpu,
961 BOOK3S_INTERRUPT_INST_SEGMENT);
962 }
963 r = RESUME_GUEST;
964 break;
965 /* We're good on these - the host merely wanted to get our attention */
966 case BOOK3S_INTERRUPT_DECREMENTER:
967 vcpu->stat.dec_exits++;
968 r = RESUME_GUEST;
969 break;
970 case BOOK3S_INTERRUPT_EXTERNAL:
971 vcpu->stat.ext_intr_exits++;
972 r = RESUME_GUEST;
973 break;
974 case BOOK3S_INTERRUPT_PERFMON:
975 r = RESUME_GUEST;
976 break;
977 case BOOK3S_INTERRUPT_PROGRAM:
978 {
979 enum emulation_result er;
980 ulong flags;
981
982program_interrupt:
983 flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
984
985 if (vcpu->arch.shared->msr & MSR_PR) {
986#ifdef EXIT_DEBUG
987 printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
988#endif
989 if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) !=
990 (INS_DCBZ & 0xfffffff7)) {
991 kvmppc_core_queue_program(vcpu, flags);
992 r = RESUME_GUEST;
993 break;
994 }
995 }
996
997 vcpu->stat.emulated_inst_exits++;
998 er = kvmppc_emulate_instruction(run, vcpu);
999 switch (er) {
1000 case EMULATE_DONE:
1001 r = RESUME_GUEST_NV;
1002 break;
1003 case EMULATE_AGAIN:
1004 r = RESUME_GUEST;
1005 break;
1006 case EMULATE_FAIL:
1007 printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
1008 __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
1009 kvmppc_core_queue_program(vcpu, flags);
1010 r = RESUME_GUEST;
1011 break;
1012 case EMULATE_DO_MMIO:
1013 run->exit_reason = KVM_EXIT_MMIO;
1014 r = RESUME_HOST_NV;
1015 break;
1016 default:
1017 BUG();
1018 }
1019 break;
1020 }
1021 case BOOK3S_INTERRUPT_SYSCALL:
1022 if (vcpu->arch.osi_enabled &&
1023 (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
1024 (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
1025 /* MOL hypercalls */
1026 u64 *gprs = run->osi.gprs;
1027 int i;
1028
1029 run->exit_reason = KVM_EXIT_OSI;
1030 for (i = 0; i < 32; i++)
1031 gprs[i] = kvmppc_get_gpr(vcpu, i);
1032 vcpu->arch.osi_needed = 1;
1033 r = RESUME_HOST_NV;
1034 } else if (!(vcpu->arch.shared->msr & MSR_PR) &&
1035 (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
1036 /* KVM PV hypercalls */
1037 kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
1038 r = RESUME_GUEST;
1039 } else {
1040 /* Guest syscalls */
1041 vcpu->stat.syscall_exits++;
1042 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
1043 r = RESUME_GUEST;
1044 }
1045 break;
1046 case BOOK3S_INTERRUPT_FP_UNAVAIL:
1047 case BOOK3S_INTERRUPT_ALTIVEC:
1048 case BOOK3S_INTERRUPT_VSX:
1049 {
1050 int ext_msr = 0;
1051
1052 switch (exit_nr) {
1053 case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP; break;
1054 case BOOK3S_INTERRUPT_ALTIVEC: ext_msr = MSR_VEC; break;
1055 case BOOK3S_INTERRUPT_VSX: ext_msr = MSR_VSX; break;
1056 }
1057
1058 switch (kvmppc_check_ext(vcpu, exit_nr)) {
1059 case EMULATE_DONE:
1060 /* everything ok - let's enable the ext */
1061 r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr);
1062 break;
1063 case EMULATE_FAIL:
1064 /* we need to emulate this instruction */
1065 goto program_interrupt;
1066 break;
1067 default:
1068 /* nothing to worry about - go again */
1069 break;
1070 }
1071 break;
1072 }
1073 case BOOK3S_INTERRUPT_ALIGNMENT:
1074 if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
1075 vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
1076 kvmppc_get_last_inst(vcpu));
1077 vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu,
1078 kvmppc_get_last_inst(vcpu));
1079 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
1080 }
1081 r = RESUME_GUEST;
1082 break;
1083 case BOOK3S_INTERRUPT_MACHINE_CHECK:
1084 case BOOK3S_INTERRUPT_TRACE:
1085 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
1086 r = RESUME_GUEST;
1087 break;
1088 default:
1089 /* Ugh - bork here! What did we get? */
1090 printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
1091 exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1);
1092 r = RESUME_HOST;
1093 BUG();
1094 break;
1095 }
1096
1097
1098 if (!(r & RESUME_HOST)) {
1099 /* To avoid clobbering exit_reason, only check for signals if
1100 * we aren't already exiting to userspace for some other
1101 * reason. */
1102 if (signal_pending(current)) {
1103#ifdef EXIT_DEBUG
1104 printk(KERN_EMERG "KVM: Going back to host\n");
1105#endif
1106 vcpu->stat.signal_exits++;
1107 run->exit_reason = KVM_EXIT_INTR;
1108 r = -EINTR;
1109 } else {
1110 /* In case an interrupt came in that was triggered
1111 * from userspace (like DEC), we need to check what
1112 * to inject now! */
1113 kvmppc_core_deliver_interrupts(vcpu);
1114 }
1115 }
1116
1117 trace_kvm_book3s_reenter(r, vcpu);
1118
1119 return r;
1120}
1121
1122int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 403int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
1123{ 404{
1124 return 0; 405 return 0;
@@ -1179,69 +460,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
1179 return 0; 460 return 0;
1180} 461}
1181 462
1182int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
1183 struct kvm_sregs *sregs)
1184{
1185 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
1186 int i;
1187
1188 sregs->pvr = vcpu->arch.pvr;
1189
1190 sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
1191 if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
1192 for (i = 0; i < 64; i++) {
1193 sregs->u.s.ppc64.slb[i].slbe = vcpu3s->slb[i].orige | i;
1194 sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv;
1195 }
1196 } else {
1197 for (i = 0; i < 16; i++)
1198 sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i];
1199
1200 for (i = 0; i < 8; i++) {
1201 sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
1202 sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
1203 }
1204 }
1205
1206 return 0;
1207}
1208
1209int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
1210 struct kvm_sregs *sregs)
1211{
1212 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
1213 int i;
1214
1215 kvmppc_set_pvr(vcpu, sregs->pvr);
1216
1217 vcpu3s->sdr1 = sregs->u.s.sdr1;
1218 if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
1219 for (i = 0; i < 64; i++) {
1220 vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
1221 sregs->u.s.ppc64.slb[i].slbe);
1222 }
1223 } else {
1224 for (i = 0; i < 16; i++) {
1225 vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
1226 }
1227 for (i = 0; i < 8; i++) {
1228 kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false,
1229 (u32)sregs->u.s.ppc32.ibat[i]);
1230 kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true,
1231 (u32)(sregs->u.s.ppc32.ibat[i] >> 32));
1232 kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false,
1233 (u32)sregs->u.s.ppc32.dbat[i]);
1234 kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true,
1235 (u32)(sregs->u.s.ppc32.dbat[i] >> 32));
1236 }
1237 }
1238
1239 /* Flush the MMU after messing with the segments */
1240 kvmppc_mmu_pte_flush(vcpu, 0, 0);
1241
1242 return 0;
1243}
1244
1245int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 463int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
1246{ 464{
1247 return -ENOTSUPP; 465 return -ENOTSUPP;
@@ -1296,202 +514,3 @@ out:
1296 mutex_unlock(&kvm->slots_lock); 514 mutex_unlock(&kvm->slots_lock);
1297 return r; 515 return r;
1298} 516}
1299
1300int kvmppc_core_check_processor_compat(void)
1301{
1302 return 0;
1303}
1304
1305struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
1306{
1307 struct kvmppc_vcpu_book3s *vcpu_book3s;
1308 struct kvm_vcpu *vcpu;
1309 int err = -ENOMEM;
1310 unsigned long p;
1311
1312 vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
1313 if (!vcpu_book3s)
1314 goto out;
1315
1316 vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
1317 kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
1318 if (!vcpu_book3s->shadow_vcpu)
1319 goto free_vcpu;
1320
1321 vcpu = &vcpu_book3s->vcpu;
1322 err = kvm_vcpu_init(vcpu, kvm, id);
1323 if (err)
1324 goto free_shadow_vcpu;
1325
1326 p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
1327 /* the real shared page fills the last 4k of our page */
1328 vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096);
1329 if (!p)
1330 goto uninit_vcpu;
1331
1332 vcpu->arch.host_retip = kvm_return_point;
1333 vcpu->arch.host_msr = mfmsr();
1334#ifdef CONFIG_PPC_BOOK3S_64
1335 /* default to book3s_64 (970fx) */
1336 vcpu->arch.pvr = 0x3C0301;
1337#else
1338 /* default to book3s_32 (750) */
1339 vcpu->arch.pvr = 0x84202;
1340#endif
1341 kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
1342 vcpu_book3s->slb_nr = 64;
1343
1344 /* remember where some real-mode handlers are */
1345 vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
1346 vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
1347 vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
1348#ifdef CONFIG_PPC_BOOK3S_64
1349 vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
1350#else
1351 vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
1352#endif
1353
1354 vcpu->arch.shadow_msr = MSR_USER64;
1355
1356 err = kvmppc_mmu_init(vcpu);
1357 if (err < 0)
1358 goto uninit_vcpu;
1359
1360 return vcpu;
1361
1362uninit_vcpu:
1363 kvm_vcpu_uninit(vcpu);
1364free_shadow_vcpu:
1365 kfree(vcpu_book3s->shadow_vcpu);
1366free_vcpu:
1367 vfree(vcpu_book3s);
1368out:
1369 return ERR_PTR(err);
1370}
1371
1372void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
1373{
1374 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
1375
1376 free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
1377 kvm_vcpu_uninit(vcpu);
1378 kfree(vcpu_book3s->shadow_vcpu);
1379 vfree(vcpu_book3s);
1380}
1381
1382extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
1383int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1384{
1385 int ret;
1386 double fpr[32][TS_FPRWIDTH];
1387 unsigned int fpscr;
1388 int fpexc_mode;
1389#ifdef CONFIG_ALTIVEC
1390 vector128 vr[32];
1391 vector128 vscr;
1392 unsigned long uninitialized_var(vrsave);
1393 int used_vr;
1394#endif
1395#ifdef CONFIG_VSX
1396 int used_vsr;
1397#endif
1398 ulong ext_msr;
1399
1400 /* No need to go into the guest when all we do is going out */
1401 if (signal_pending(current)) {
1402 kvm_run->exit_reason = KVM_EXIT_INTR;
1403 return -EINTR;
1404 }
1405
1406 /* Save FPU state in stack */
1407 if (current->thread.regs->msr & MSR_FP)
1408 giveup_fpu(current);
1409 memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
1410 fpscr = current->thread.fpscr.val;
1411 fpexc_mode = current->thread.fpexc_mode;
1412
1413#ifdef CONFIG_ALTIVEC
1414 /* Save Altivec state in stack */
1415 used_vr = current->thread.used_vr;
1416 if (used_vr) {
1417 if (current->thread.regs->msr & MSR_VEC)
1418 giveup_altivec(current);
1419 memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
1420 vscr = current->thread.vscr;
1421 vrsave = current->thread.vrsave;
1422 }
1423#endif
1424
1425#ifdef CONFIG_VSX
1426 /* Save VSX state in stack */
1427 used_vsr = current->thread.used_vsr;
1428 if (used_vsr && (current->thread.regs->msr & MSR_VSX))
1429 __giveup_vsx(current);
1430#endif
1431
1432 /* Remember the MSR with disabled extensions */
1433 ext_msr = current->thread.regs->msr;
1434
1435 /* XXX we get called with irq disabled - change that! */
1436 local_irq_enable();
1437
1438 /* Preload FPU if it's enabled */
1439 if (vcpu->arch.shared->msr & MSR_FP)
1440 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
1441
1442 ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
1443
1444 local_irq_disable();
1445
1446 current->thread.regs->msr = ext_msr;
1447
1448 /* Make sure we save the guest FPU/Altivec/VSX state */
1449 kvmppc_giveup_ext(vcpu, MSR_FP);
1450 kvmppc_giveup_ext(vcpu, MSR_VEC);
1451 kvmppc_giveup_ext(vcpu, MSR_VSX);
1452
1453 /* Restore FPU state from stack */
1454 memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
1455 current->thread.fpscr.val = fpscr;
1456 current->thread.fpexc_mode = fpexc_mode;
1457
1458#ifdef CONFIG_ALTIVEC
1459 /* Restore Altivec state from stack */
1460 if (used_vr && current->thread.used_vr) {
1461 memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
1462 current->thread.vscr = vscr;
1463 current->thread.vrsave = vrsave;
1464 }
1465 current->thread.used_vr = used_vr;
1466#endif
1467
1468#ifdef CONFIG_VSX
1469 current->thread.used_vsr = used_vsr;
1470#endif
1471
1472 return ret;
1473}
1474
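__kvmppc_vcpu_run() above brackets guest entry with a save/restore of the host's FPU, Altivec and VSX state, so whatever the guest loads into those registers never leaks back into the host thread. A stand-alone sketch of that bracket pattern follows; the struct and helper names are invented for illustration and are not kernel APIs.

    #include <stdio.h>
    #include <string.h>

    /* Illustrative container for the host register state that the real code
     * parks on the stack (FPU, Altivec, VSX); the names are invented. */
    struct host_fp_state {
        double fpr[32];
        unsigned int fpscr;
    };

    /* The bracket pattern: snapshot host state, run the guest payload, put
     * the snapshot back so the host thread never sees guest values. */
    static int run_bracketed(struct host_fp_state *live, int (*enter_guest)(void))
    {
        struct host_fp_state saved;
        int ret;

        memcpy(&saved, live, sizeof(saved));   /* "save FPU state in stack" */
        ret = enter_guest();                   /* guest may clobber *live */
        memcpy(live, &saved, sizeof(saved));   /* restore after exit */
        return ret;
    }

    static int fake_guest(void) { return 42; }

    int main(void)
    {
        struct host_fp_state hs = { .fpscr = 0x82004000u };

        printf("guest returned %d, host fpscr still %#x\n",
               run_bracketed(&hs, fake_guest), hs.fpscr);
        return 0;
    }
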
1475static int kvmppc_book3s_init(void)
1476{
1477 int r;
1478
1479 r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
1480 THIS_MODULE);
1481
1482 if (r)
1483 return r;
1484
1485 r = kvmppc_mmu_hpte_sysinit();
1486
1487 return r;
1488}
1489
1490static void kvmppc_book3s_exit(void)
1491{
1492 kvmppc_mmu_hpte_sysexit();
1493 kvm_exit();
1494}
1495
1496module_init(kvmppc_book3s_init);
1497module_exit(kvmppc_book3s_exit);
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index d7889ef3211e..c6d3e194b6b4 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -41,36 +41,36 @@ static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
 }
 
 static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
-                                struct kvmppc_vcpu_book3s *vcpu_book3s,
+                                struct kvm_vcpu *vcpu,
                                 gva_t eaddr)
 {
         int i;
         u64 esid = GET_ESID(eaddr);
         u64 esid_1t = GET_ESID_1T(eaddr);
 
-        for (i = 0; i < vcpu_book3s->slb_nr; i++) {
+        for (i = 0; i < vcpu->arch.slb_nr; i++) {
                 u64 cmp_esid = esid;
 
-                if (!vcpu_book3s->slb[i].valid)
+                if (!vcpu->arch.slb[i].valid)
                         continue;
 
-                if (vcpu_book3s->slb[i].tb)
+                if (vcpu->arch.slb[i].tb)
                         cmp_esid = esid_1t;
 
-                if (vcpu_book3s->slb[i].esid == cmp_esid)
-                        return &vcpu_book3s->slb[i];
+                if (vcpu->arch.slb[i].esid == cmp_esid)
+                        return &vcpu->arch.slb[i];
         }
 
         dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n",
                 eaddr, esid, esid_1t);
-        for (i = 0; i < vcpu_book3s->slb_nr; i++) {
-            if (vcpu_book3s->slb[i].vsid)
+        for (i = 0; i < vcpu->arch.slb_nr; i++) {
+            if (vcpu->arch.slb[i].vsid)
                 dprintk("  %d: %c%c%c %llx %llx\n", i,
-                        vcpu_book3s->slb[i].valid ? 'v' : ' ',
-                        vcpu_book3s->slb[i].large ? 'l' : ' ',
-                        vcpu_book3s->slb[i].tb ? 't' : ' ',
-                        vcpu_book3s->slb[i].esid,
-                        vcpu_book3s->slb[i].vsid);
+                        vcpu->arch.slb[i].valid ? 'v' : ' ',
+                        vcpu->arch.slb[i].large ? 'l' : ' ',
+                        vcpu->arch.slb[i].tb ? 't' : ' ',
+                        vcpu->arch.slb[i].esid,
+                        vcpu->arch.slb[i].vsid);
         }
 
         return NULL;
@@ -81,7 +81,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
 {
         struct kvmppc_slb *slb;
 
-        slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), eaddr);
+        slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
         if (!slb)
                 return 0;
 
@@ -180,7 +180,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                 return 0;
         }
 
-        slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr);
+        slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
         if (!slbe)
                 goto no_seg_found;
 
@@ -320,10 +320,10 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
         esid_1t = GET_ESID_1T(rb);
         slb_nr = rb & 0xfff;
 
-        if (slb_nr > vcpu_book3s->slb_nr)
+        if (slb_nr > vcpu->arch.slb_nr)
                 return;
 
-        slbe = &vcpu_book3s->slb[slb_nr];
+        slbe = &vcpu->arch.slb[slb_nr];
 
         slbe->large = (rs & SLB_VSID_L) ? 1 : 0;
         slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0;
@@ -344,38 +344,35 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
 
 static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr)
 {
-        struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
         struct kvmppc_slb *slbe;
 
-        if (slb_nr > vcpu_book3s->slb_nr)
+        if (slb_nr > vcpu->arch.slb_nr)
                 return 0;
 
-        slbe = &vcpu_book3s->slb[slb_nr];
+        slbe = &vcpu->arch.slb[slb_nr];
 
         return slbe->orige;
 }
 
 static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr)
 {
-        struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
         struct kvmppc_slb *slbe;
 
-        if (slb_nr > vcpu_book3s->slb_nr)
+        if (slb_nr > vcpu->arch.slb_nr)
                 return 0;
 
-        slbe = &vcpu_book3s->slb[slb_nr];
+        slbe = &vcpu->arch.slb[slb_nr];
 
         return slbe->origv;
 }
 
 static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
 {
-        struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
         struct kvmppc_slb *slbe;
 
         dprintk("KVM MMU: slbie(0x%llx)\n", ea);
 
-        slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, ea);
+        slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
 
         if (!slbe)
                 return;
@@ -389,13 +386,12 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
 
 static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
 {
-        struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
         int i;
 
         dprintk("KVM MMU: slbia()\n");
 
-        for (i = 1; i < vcpu_book3s->slb_nr; i++)
-                vcpu_book3s->slb[i].valid = false;
+        for (i = 1; i < vcpu->arch.slb_nr; i++)
+                vcpu->arch.slb[i].valid = false;
 
         if (vcpu->arch.shared->msr & MSR_IR) {
                 kvmppc_mmu_flush_segments(vcpu);
@@ -464,7 +460,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
         ulong mp_ea = vcpu->arch.magic_page_ea;
 
         if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
-                slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea);
+                slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
                 if (slb)
                         gvsid = slb->vsid;
         }
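
These hunks move the guest SLB array from the book3s-private struct into vcpu->arch, but the lookup itself is unchanged: a linear scan that compares the effective address's ESID, using the 1TB ESID when the entry's tb bit is set. A stand-alone sketch of that lookup, with simplified types and the usual 256MB/1TB shift amounts assumed:

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>
    #include <inttypes.h>

    /* Simplified guest SLB entry; field names mirror the ones used above. */
    struct slb_entry {
        bool valid;
        bool tb;            /* entry maps a 1TB segment */
        uint64_t esid;
        uint64_t vsid;
    };

    /* Effective address -> ESID for 256MB (>> 28) and 1TB (>> 40) segments. */
    #define GET_ESID(ea)    ((uint64_t)(ea) >> 28)
    #define GET_ESID_1T(ea) ((uint64_t)(ea) >> 40)

    static struct slb_entry *find_slbe(struct slb_entry *slb, int nr, uint64_t ea)
    {
        uint64_t esid = GET_ESID(ea), esid_1t = GET_ESID_1T(ea);
        int i;

        for (i = 0; i < nr; i++) {
            if (!slb[i].valid)
                continue;
            /* compare the 1TB ESID if this entry maps a 1TB segment */
            if (slb[i].esid == (slb[i].tb ? esid_1t : esid))
                return &slb[i];
        }
        return NULL;
    }

    int main(void)
    {
        struct slb_entry slb[2] = {
            { .valid = true, .tb = false, .esid = GET_ESID(0x10000000), .vsid = 0x111 },
            { .valid = true, .tb = true,  .esid = GET_ESID_1T(0x12345678900ull), .vsid = 0x222 },
        };
        struct slb_entry *e = find_slbe(slb, 2, 0x12345678900ull);

        printf("vsid = %#" PRIx64 "\n", e ? e->vsid : 0);
        return 0;
    }
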
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
new file mode 100644
index 000000000000..bc3a2ea94217
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -0,0 +1,180 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
16 */
17
18#include <linux/types.h>
19#include <linux/string.h>
20#include <linux/kvm.h>
21#include <linux/kvm_host.h>
22#include <linux/highmem.h>
23#include <linux/gfp.h>
24#include <linux/slab.h>
25#include <linux/hugetlb.h>
26
27#include <asm/tlbflush.h>
28#include <asm/kvm_ppc.h>
29#include <asm/kvm_book3s.h>
30#include <asm/mmu-hash64.h>
31#include <asm/hvcall.h>
32#include <asm/synch.h>
33#include <asm/ppc-opcode.h>
34#include <asm/cputable.h>
35
36/* For now use fixed-size 16MB page table */
37#define HPT_ORDER 24
38#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */
39#define HPT_HASH_MASK (HPT_NPTEG - 1)
40
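For reference, the geometry these constants imply, restated as a small stand-alone calculation; the SDR1 line assumes the HTABSIZE encoding used further down in kvmppc_alloc_hpt().

    #include <stdio.h>

    /* Stand-alone restatement of the hash table geometry: a 2^HPT_ORDER byte
     * table made of 128-byte PTEGs, each holding 8 HPTEs of 16 bytes. */
    #define HPT_ORDER     24
    #define HPT_NPTEG     (1ul << (HPT_ORDER - 7))      /* 128B per PTEG */
    #define HPT_HASH_MASK (HPT_NPTEG - 1)

    int main(void)
    {
        printf("HPT size      : %lu MB\n", (1ul << HPT_ORDER) >> 20);
        printf("PTEG count    : %lu\n", HPT_NPTEG);
        printf("HPTE count    : %lu (8 per PTEG)\n", HPT_NPTEG * 8);
        printf("hash mask     : %#lx\n", HPT_HASH_MASK);
        /* kvmppc_alloc_hpt() encodes the size into SDR1 as HPT_ORDER - 18,
         * i.e. log2(table size / 256KB). */
        printf("SDR1 HTABSIZE : %d\n", HPT_ORDER - 18);
        return 0;
    }
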
41/* Pages in the VRMA are 16MB pages */
42#define VRMA_PAGE_ORDER 24
43#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */
44
45/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
46#define MAX_LPID_970 63
47#define NR_LPIDS (LPID_RSVD + 1)
48unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
49
50long kvmppc_alloc_hpt(struct kvm *kvm)
51{
52 unsigned long hpt;
53 unsigned long lpid;
54
55 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
56 HPT_ORDER - PAGE_SHIFT);
57 if (!hpt) {
58 pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
59 return -ENOMEM;
60 }
61 kvm->arch.hpt_virt = hpt;
62
63 do {
64 lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
65 if (lpid >= NR_LPIDS) {
66 pr_err("kvm_alloc_hpt: No LPIDs free\n");
67 free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
68 return -ENOMEM;
69 }
70 } while (test_and_set_bit(lpid, lpid_inuse));
71
72 kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
73 kvm->arch.lpid = lpid;
74
75 pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
76 return 0;
77}
78
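The LPID claim in kvmppc_alloc_hpt() is a find-first-zero-bit plus test_and_set_bit retry loop over a small bitmap. A non-atomic stand-alone sketch of the same idea; the kernel's test_and_set_bit() is atomic, which is what makes the do/while retry in the real code necessary.

    #include <stdio.h>
    #include <stdint.h>

    #define NR_LPIDS 64
    static uint64_t lpid_inuse[(NR_LPIDS + 63) / 64];

    /* Non-atomic sketch of the claim loop: find the first clear bit and set
     * it.  Another CPU claiming the same bit is what the kernel's atomic
     * test_and_set_bit() plus retry loop protects against. */
    static int alloc_lpid(void)
    {
        int lpid;

        for (lpid = 0; lpid < NR_LPIDS; lpid++) {
            uint64_t mask = 1ull << (lpid % 64);

            if (!(lpid_inuse[lpid / 64] & mask)) {
                lpid_inuse[lpid / 64] |= mask;
                return lpid;
            }
        }
        return -1;              /* no LPIDs free */
    }

    int main(void)
    {
        lpid_inuse[0] = 0x3;    /* pretend LPIDs 0 and 1 (host, reserved) are taken */
        printf("first guest LPID: %d\n", alloc_lpid());
        printf("next guest LPID : %d\n", alloc_lpid());
        return 0;
    }
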
79void kvmppc_free_hpt(struct kvm *kvm)
80{
81 clear_bit(kvm->arch.lpid, lpid_inuse);
82 free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
83}
84
85void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
86{
87 unsigned long i;
88 unsigned long npages = kvm->arch.ram_npages;
89 unsigned long pfn;
90 unsigned long *hpte;
91 unsigned long hash;
92 struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
93
94 if (!pginfo)
95 return;
96
97 /* VRMA can't be > 1TB */
98 if (npages > 1ul << (40 - kvm->arch.ram_porder))
99 npages = 1ul << (40 - kvm->arch.ram_porder);
100 /* Can't use more than 1 HPTE per HPTEG */
101 if (npages > HPT_NPTEG)
102 npages = HPT_NPTEG;
103
104 for (i = 0; i < npages; ++i) {
105 pfn = pginfo[i].pfn;
106 if (!pfn)
107 break;
108 /* can't use hpt_hash since va > 64 bits */
109 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
110 /*
111 * We assume that the hash table is empty and no
112 * vcpus are using it at this stage. Since we create
113 * at most one HPTE per HPTEG, we just assume entry 7
114 * is available and use it.
115 */
116 hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7));
117 hpte += 7 * 2;
118 /* HPTE low word - RPN, protection, etc. */
119 hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
120 HPTE_R_M | PP_RWXX;
121 wmb();
122 hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
123 (i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
124 HPTE_V_LARGE | HPTE_V_VALID;
125 }
126}
127
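The hash used for each VRMA page, and the byte offset of the single bolted HPTE the loop writes (slot 7 of a 128-byte PTEG), can be reproduced stand-alone; this is just the arithmetic, not a working mapping.

    #include <stdio.h>

    #define HPT_ORDER     24
    #define HPT_NPTEG     (1ul << (HPT_ORDER - 7))
    #define HPT_HASH_MASK (HPT_NPTEG - 1)
    #define VRMA_VSID     0x1ffffffUL

    /* Hash used by kvmppc_map_vrma() for VRMA page i; the single bolted HPTE
     * is written to slot 7 of the 128-byte PTEG, i.e. at offset 7 * 16. */
    static unsigned long vrma_hpte_offset(unsigned long i)
    {
        unsigned long hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;

        return (hash << 7) + 7 * 16;
    }

    int main(void)
    {
        unsigned long i;

        for (i = 0; i < 4; i++)
            printf("VRMA page %lu -> HPTE at HPT offset %#lx\n",
                   i, vrma_hpte_offset(i));
        return 0;
    }
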
128int kvmppc_mmu_hv_init(void)
129{
130 unsigned long host_lpid, rsvd_lpid;
131
132 if (!cpu_has_feature(CPU_FTR_HVMODE))
133 return -EINVAL;
134
135 memset(lpid_inuse, 0, sizeof(lpid_inuse));
136
137 if (cpu_has_feature(CPU_FTR_ARCH_206)) {
138 host_lpid = mfspr(SPRN_LPID); /* POWER7 */
139 rsvd_lpid = LPID_RSVD;
140 } else {
141 host_lpid = 0; /* PPC970 */
142 rsvd_lpid = MAX_LPID_970;
143 }
144
145 set_bit(host_lpid, lpid_inuse);
146 /* rsvd_lpid is reserved for use in partition switching */
147 set_bit(rsvd_lpid, lpid_inuse);
148
149 return 0;
150}
151
152void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
153{
154}
155
156static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
157{
158 kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
159}
160
161static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
162 struct kvmppc_pte *gpte, bool data)
163{
164 return -ENOENT;
165}
166
167void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
168{
169 struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
170
171 if (cpu_has_feature(CPU_FTR_ARCH_206))
172 vcpu->arch.slb_nr = 32; /* POWER7 */
173 else
174 vcpu->arch.slb_nr = 64;
175
176 mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
177 mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
178
179 vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
180}
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
new file mode 100644
index 000000000000..ea0f8c537c28
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -0,0 +1,73 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
16 * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
17 */
18
19#include <linux/types.h>
20#include <linux/string.h>
21#include <linux/kvm.h>
22#include <linux/kvm_host.h>
23#include <linux/highmem.h>
24#include <linux/gfp.h>
25#include <linux/slab.h>
26#include <linux/hugetlb.h>
27#include <linux/list.h>
28
29#include <asm/tlbflush.h>
30#include <asm/kvm_ppc.h>
31#include <asm/kvm_book3s.h>
32#include <asm/mmu-hash64.h>
33#include <asm/hvcall.h>
34#include <asm/synch.h>
35#include <asm/ppc-opcode.h>
36#include <asm/kvm_host.h>
37#include <asm/udbg.h>
38
39#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
40
41long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
42 unsigned long ioba, unsigned long tce)
43{
44 struct kvm *kvm = vcpu->kvm;
45 struct kvmppc_spapr_tce_table *stt;
46
47 /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
48 /* liobn, ioba, tce); */
49
50 list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
51 if (stt->liobn == liobn) {
52 unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
53 struct page *page;
54 u64 *tbl;
55
56 /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p window_size=0x%x\n", */
57 /* liobn, stt, stt->window_size); */
58 if (ioba >= stt->window_size)
59 return H_PARAMETER;
60
61 page = stt->pages[idx / TCES_PER_PAGE];
62 tbl = (u64 *)page_address(page);
63
64 /* FIXME: Need to validate the TCE itself */
65 /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
66 tbl[idx % TCES_PER_PAGE] = tce;
67 return H_SUCCESS;
68 }
69 }
70
71 /* Didn't find the liobn, punt it to userspace */
72 return H_TOO_HARD;
73}
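
The indexing in kvmppc_h_put_tce() is worth spelling out: the I/O bus address selects a TCE, which in turn selects a backing page and a slot within it. A stand-alone sketch, assuming 8-byte TCEs, 4K I/O pages and a 4K host page size; the kernel derives TCES_PER_PAGE from the real PAGE_SIZE.

    #include <stdio.h>
    #include <stdint.h>

    /* Assumptions for the example: 8-byte TCEs indexing 4K I/O pages
     * (SPAPR_TCE_SHIFT = 12) and a 4K host page size. */
    #define SPAPR_TCE_SHIFT 12
    #define HOST_PAGE_SIZE  4096
    #define TCES_PER_PAGE   (HOST_PAGE_SIZE / sizeof(uint64_t))

    int main(void)
    {
        uint64_t ioba = 0x00301000;          /* example I/O bus address */
        uint64_t idx  = ioba >> SPAPR_TCE_SHIFT;

        /* kvmppc_h_put_tce() walks stt->pages[idx / TCES_PER_PAGE] and then
         * stores the TCE at tbl[idx % TCES_PER_PAGE]. */
        printf("ioba %#llx -> TCE index %llu (backing page %llu, slot %llu)\n",
               (unsigned long long)ioba, (unsigned long long)idx,
               (unsigned long long)(idx / TCES_PER_PAGE),
               (unsigned long long)(idx % TCES_PER_PAGE));
        return 0;
    }
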
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
index 1dd5a1ddfd0d..88c8f26add02 100644
--- a/arch/powerpc/kvm/book3s_exports.c
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -20,8 +20,11 @@
 #include <linux/module.h>
 #include <asm/kvm_book3s.h>
 
-EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter);
-EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem);
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);
+#else
+EXPORT_SYMBOL_GPL(kvmppc_handler_trampoline_enter);
+EXPORT_SYMBOL_GPL(kvmppc_handler_lowmem_trampoline);
 EXPORT_SYMBOL_GPL(kvmppc_rmcall);
 EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
 #ifdef CONFIG_ALTIVEC
@@ -30,3 +33,5 @@ EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
 #ifdef CONFIG_VSX
 EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx);
 #endif
+#endif
+
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
new file mode 100644
index 000000000000..cc0d7f1b19ab
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -0,0 +1,1269 @@
1/*
2 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
3 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
4 *
5 * Authors:
6 * Paul Mackerras <paulus@au1.ibm.com>
7 * Alexander Graf <agraf@suse.de>
8 * Kevin Wolf <mail@kevin-wolf.de>
9 *
10 * Description: KVM functions specific to running on Book 3S
11 * processors in hypervisor mode (specifically POWER7 and later).
12 *
13 * This file is derived from arch/powerpc/kvm/book3s.c,
14 * by Alexander Graf <agraf@suse.de>.
15 *
16 * This program is free software; you can redistribute it and/or modify
17 * it under the terms of the GNU General Public License, version 2, as
18 * published by the Free Software Foundation.
19 */
20
21#include <linux/kvm_host.h>
22#include <linux/err.h>
23#include <linux/slab.h>
24#include <linux/preempt.h>
25#include <linux/sched.h>
26#include <linux/delay.h>
27#include <linux/fs.h>
28#include <linux/anon_inodes.h>
29#include <linux/cpumask.h>
30#include <linux/spinlock.h>
31#include <linux/page-flags.h>
32
33#include <asm/reg.h>
34#include <asm/cputable.h>
35#include <asm/cacheflush.h>
36#include <asm/tlbflush.h>
37#include <asm/uaccess.h>
38#include <asm/io.h>
39#include <asm/kvm_ppc.h>
40#include <asm/kvm_book3s.h>
41#include <asm/mmu_context.h>
42#include <asm/lppaca.h>
43#include <asm/processor.h>
44#include <asm/cputhreads.h>
45#include <asm/page.h>
46#include <linux/gfp.h>
47#include <linux/sched.h>
48#include <linux/vmalloc.h>
49#include <linux/highmem.h>
50
51/*
52 * For now, limit memory to 64GB and require it to be large pages.
53 * This value is chosen because it makes the ram_pginfo array be
54 * 64kB in size, which is about as large as we want to be trying
55 * to allocate with kmalloc.
56 */
57#define MAX_MEM_ORDER 36
58
59#define LARGE_PAGE_ORDER 24 /* 16MB pages */
60
61/* #define EXIT_DEBUG */
62/* #define EXIT_DEBUG_SIMPLE */
63/* #define EXIT_DEBUG_INT */
64
65void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
66{
67 local_paca->kvm_hstate.kvm_vcpu = vcpu;
68 local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore;
69}
70
71void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
72{
73}
74
75static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
76static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
77
78void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
79{
80 u64 now;
81 unsigned long dec_nsec;
82
83 now = get_tb();
84 if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
85 kvmppc_core_queue_dec(vcpu);
86 if (vcpu->arch.pending_exceptions)
87 return;
88 if (vcpu->arch.dec_expires != ~(u64)0) {
89 dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
90 tb_ticks_per_sec;
91 hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
92 HRTIMER_MODE_REL);
93 }
94
95 kvmppc_vcpu_blocked(vcpu);
96
97 kvm_vcpu_block(vcpu);
98 vcpu->stat.halt_wakeup++;
99
100 if (vcpu->arch.dec_expires != ~(u64)0)
101 hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
102
103 kvmppc_vcpu_unblocked(vcpu);
104}
105
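kvmppc_vcpu_block() converts the remaining decrementer ticks into nanoseconds for the hrtimer with a multiply-then-divide by the timebase frequency. A stand-alone sketch; the 512 MHz timebase is assumed only for the example.

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ull

    /* Timebase ticks left until the guest decrementer fires, converted to
     * nanoseconds for an hrtimer; the kernel uses tb_ticks_per_sec. */
    static uint64_t dec_ticks_to_ns(uint64_t dec_expires, uint64_t now,
                                    uint64_t tb_ticks_per_sec)
    {
        return (dec_expires - now) * NSEC_PER_SEC / tb_ticks_per_sec;
    }

    int main(void)
    {
        uint64_t tb_freq = 512000000ull;                        /* 512 MHz timebase */
        uint64_t now = 1000000, expires = now + tb_freq / 100;  /* ~10 ms away */

        printf("sleep for %llu ns\n",
               (unsigned long long)dec_ticks_to_ns(expires, now, tb_freq));
        return 0;
    }
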
106void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
107{
108 vcpu->arch.shregs.msr = msr;
109}
110
111void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
112{
113 vcpu->arch.pvr = pvr;
114}
115
116void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
117{
118 int r;
119
120 pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
121 pr_err("pc = %.16lx msr = %.16llx trap = %x\n",
122 vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
123 for (r = 0; r < 16; ++r)
124 pr_err("r%2d = %.16lx r%d = %.16lx\n",
125 r, kvmppc_get_gpr(vcpu, r),
126 r+16, kvmppc_get_gpr(vcpu, r+16));
127 pr_err("ctr = %.16lx lr = %.16lx\n",
128 vcpu->arch.ctr, vcpu->arch.lr);
129 pr_err("srr0 = %.16llx srr1 = %.16llx\n",
130 vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
131 pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
132 vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
133 pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
134 vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
135 pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n",
136 vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
137 pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
138 pr_err("fault dar = %.16lx dsisr = %.8x\n",
139 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
140 pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
141 for (r = 0; r < vcpu->arch.slb_max; ++r)
142 pr_err(" ESID = %.16llx VSID = %.16llx\n",
143 vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
144 pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
145 vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1,
146 vcpu->arch.last_inst);
147}
148
149struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
150{
151 int r;
152 struct kvm_vcpu *v, *ret = NULL;
153
154 mutex_lock(&kvm->lock);
155 kvm_for_each_vcpu(r, v, kvm) {
156 if (v->vcpu_id == id) {
157 ret = v;
158 break;
159 }
160 }
161 mutex_unlock(&kvm->lock);
162 return ret;
163}
164
165static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
166{
167 vpa->shared_proc = 1;
168 vpa->yield_count = 1;
169}
170
171static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
172 unsigned long flags,
173 unsigned long vcpuid, unsigned long vpa)
174{
175 struct kvm *kvm = vcpu->kvm;
176 unsigned long pg_index, ra, len;
177 unsigned long pg_offset;
178 void *va;
179 struct kvm_vcpu *tvcpu;
180
181 tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
182 if (!tvcpu)
183 return H_PARAMETER;
184
185 flags >>= 63 - 18;
186 flags &= 7;
187 if (flags == 0 || flags == 4)
188 return H_PARAMETER;
189 if (flags < 4) {
190 if (vpa & 0x7f)
191 return H_PARAMETER;
192 /* registering new area; convert logical addr to real */
193 pg_index = vpa >> kvm->arch.ram_porder;
194 pg_offset = vpa & (kvm->arch.ram_psize - 1);
195 if (pg_index >= kvm->arch.ram_npages)
196 return H_PARAMETER;
197 if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
198 return H_PARAMETER;
199 ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
200 ra |= pg_offset;
201 va = __va(ra);
202 if (flags <= 1)
203 len = *(unsigned short *)(va + 4);
204 else
205 len = *(unsigned int *)(va + 4);
206 if (pg_offset + len > kvm->arch.ram_psize)
207 return H_PARAMETER;
208 switch (flags) {
209 case 1: /* register VPA */
210 if (len < 640)
211 return H_PARAMETER;
212 tvcpu->arch.vpa = va;
213 init_vpa(vcpu, va);
214 break;
215 case 2: /* register DTL */
216 if (len < 48)
217 return H_PARAMETER;
218 if (!tvcpu->arch.vpa)
219 return H_RESOURCE;
220 len -= len % 48;
221 tvcpu->arch.dtl = va;
222 tvcpu->arch.dtl_end = va + len;
223 break;
224 case 3: /* register SLB shadow buffer */
225 if (len < 8)
226 return H_PARAMETER;
227 if (!tvcpu->arch.vpa)
228 return H_RESOURCE;
229 tvcpu->arch.slb_shadow = va;
230 len = (len - 16) / 16;
231 tvcpu->arch.slb_shadow = va;
232 break;
233 }
234 } else {
235 switch (flags) {
236 case 5: /* unregister VPA */
237 if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
238 return H_RESOURCE;
239 tvcpu->arch.vpa = NULL;
240 break;
241 case 6: /* unregister DTL */
242 tvcpu->arch.dtl = NULL;
243 break;
244 case 7: /* unregister SLB shadow buffer */
245 tvcpu->arch.slb_shadow = NULL;
246 break;
247 }
248 }
249 return H_SUCCESS;
250}
251
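The flags decode at the top of do_h_register_vpa() looks odd until you remember that PAPR numbers bits from the most-significant end: the 3-bit subfunction sits in bits 16-18 of the doubleword, hence the shift by 63 - 18 and the mask of 7. A stand-alone sketch of the same decode, with the subfunction names taken from the switch above:

    #include <stdio.h>
    #include <stdint.h>

    /* PAPR numbers bits from the most-significant end, so the 3-bit
     * subfunction in bits 16-18 of the flags doubleword is extracted by
     * shifting right by 63 - 18 and masking with 7. */
    static unsigned int vpa_subfunc(uint64_t flags)
    {
        return (unsigned int)((flags >> (63 - 18)) & 7);
    }

    static const char *subfunc_name[8] = {
        "reserved", "register VPA", "register DTL", "register SLB shadow",
        "reserved", "unregister VPA", "unregister DTL", "unregister SLB shadow",
    };

    int main(void)
    {
        uint64_t flags = (uint64_t)1 << (63 - 18);   /* subfunction 1 */

        printf("subfunction %u: %s\n",
               vpa_subfunc(flags), subfunc_name[vpa_subfunc(flags)]);
        return 0;
    }
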
252int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
253{
254 unsigned long req = kvmppc_get_gpr(vcpu, 3);
255 unsigned long target, ret = H_SUCCESS;
256 struct kvm_vcpu *tvcpu;
257
258 switch (req) {
259 case H_CEDE:
260 vcpu->arch.shregs.msr |= MSR_EE;
261 vcpu->arch.ceded = 1;
262 smp_mb();
263 if (!vcpu->arch.prodded)
264 kvmppc_vcpu_block(vcpu);
265 else
266 vcpu->arch.prodded = 0;
267 smp_mb();
268 vcpu->arch.ceded = 0;
269 break;
270 case H_PROD:
271 target = kvmppc_get_gpr(vcpu, 4);
272 tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
273 if (!tvcpu) {
274 ret = H_PARAMETER;
275 break;
276 }
277 tvcpu->arch.prodded = 1;
278 smp_mb();
279 if (vcpu->arch.ceded) {
280 if (waitqueue_active(&vcpu->wq)) {
281 wake_up_interruptible(&vcpu->wq);
282 vcpu->stat.halt_wakeup++;
283 }
284 }
285 break;
286 case H_CONFER:
287 break;
288 case H_REGISTER_VPA:
289 ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
290 kvmppc_get_gpr(vcpu, 5),
291 kvmppc_get_gpr(vcpu, 6));
292 break;
293 default:
294 return RESUME_HOST;
295 }
296 kvmppc_set_gpr(vcpu, 3, ret);
297 vcpu->arch.hcall_needed = 0;
298 return RESUME_GUEST;
299}
300
301static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
302 struct task_struct *tsk)
303{
304 int r = RESUME_HOST;
305
306 vcpu->stat.sum_exits++;
307
308 run->exit_reason = KVM_EXIT_UNKNOWN;
309 run->ready_for_interrupt_injection = 1;
310 switch (vcpu->arch.trap) {
311 /* We're good on these - the host merely wanted to get our attention */
312 case BOOK3S_INTERRUPT_HV_DECREMENTER:
313 vcpu->stat.dec_exits++;
314 r = RESUME_GUEST;
315 break;
316 case BOOK3S_INTERRUPT_EXTERNAL:
317 vcpu->stat.ext_intr_exits++;
318 r = RESUME_GUEST;
319 break;
320 case BOOK3S_INTERRUPT_PERFMON:
321 r = RESUME_GUEST;
322 break;
323 case BOOK3S_INTERRUPT_PROGRAM:
324 {
325 ulong flags;
326 /*
327 * Normally program interrupts are delivered directly
328 * to the guest by the hardware, but we can get here
329 * as a result of a hypervisor emulation interrupt
330 * (e40) getting turned into a 700 by BML RTAS.
331 */
332 flags = vcpu->arch.shregs.msr & 0x1f0000ull;
333 kvmppc_core_queue_program(vcpu, flags);
334 r = RESUME_GUEST;
335 break;
336 }
337 case BOOK3S_INTERRUPT_SYSCALL:
338 {
339 /* hcall - punt to userspace */
340 int i;
341
342 if (vcpu->arch.shregs.msr & MSR_PR) {
343 /* sc 1 from userspace - reflect to guest syscall */
344 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
345 r = RESUME_GUEST;
346 break;
347 }
348 run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
349 for (i = 0; i < 9; ++i)
350 run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
351 run->exit_reason = KVM_EXIT_PAPR_HCALL;
352 vcpu->arch.hcall_needed = 1;
353 r = RESUME_HOST;
354 break;
355 }
356 /*
357 * We get these next two if the guest does a bad real-mode access,
358 * as we have enabled VRMA (virtualized real mode area) mode in the
359 * LPCR. We just generate an appropriate DSI/ISI to the guest.
360 */
361 case BOOK3S_INTERRUPT_H_DATA_STORAGE:
362 vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
363 vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
364 kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
365 r = RESUME_GUEST;
366 break;
367 case BOOK3S_INTERRUPT_H_INST_STORAGE:
368 kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
369 0x08000000);
370 r = RESUME_GUEST;
371 break;
372 /*
373 * This occurs if the guest executes an illegal instruction.
374 * We just generate a program interrupt to the guest, since
375 * we don't emulate any guest instructions at this stage.
376 */
377 case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
378 kvmppc_core_queue_program(vcpu, 0x80000);
379 r = RESUME_GUEST;
380 break;
381 default:
382 kvmppc_dump_regs(vcpu);
383 printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
384 vcpu->arch.trap, kvmppc_get_pc(vcpu),
385 vcpu->arch.shregs.msr);
386 r = RESUME_HOST;
387 BUG();
388 break;
389 }
390
391
392 if (!(r & RESUME_HOST)) {
393 /* To avoid clobbering exit_reason, only check for signals if
394 * we aren't already exiting to userspace for some other
395 * reason. */
396 if (signal_pending(tsk)) {
397 vcpu->stat.signal_exits++;
398 run->exit_reason = KVM_EXIT_INTR;
399 r = -EINTR;
400 } else {
401 kvmppc_core_deliver_interrupts(vcpu);
402 }
403 }
404
405 return r;
406}
407
408int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
409 struct kvm_sregs *sregs)
410{
411 int i;
412
413 sregs->pvr = vcpu->arch.pvr;
414
415 memset(sregs, 0, sizeof(struct kvm_sregs));
416 for (i = 0; i < vcpu->arch.slb_max; i++) {
417 sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
418 sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
419 }
420
421 return 0;
422}
423
424int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
425 struct kvm_sregs *sregs)
426{
427 int i, j;
428
429 kvmppc_set_pvr(vcpu, sregs->pvr);
430
431 j = 0;
432 for (i = 0; i < vcpu->arch.slb_nr; i++) {
433 if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
434 vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
435 vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
436 ++j;
437 }
438 }
439 vcpu->arch.slb_max = j;
440
441 return 0;
442}
443
444int kvmppc_core_check_processor_compat(void)
445{
446 if (cpu_has_feature(CPU_FTR_HVMODE))
447 return 0;
448 return -EIO;
449}
450
451struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
452{
453 struct kvm_vcpu *vcpu;
454 int err = -EINVAL;
455 int core;
456 struct kvmppc_vcore *vcore;
457
458 core = id / threads_per_core;
459 if (core >= KVM_MAX_VCORES)
460 goto out;
461
462 err = -ENOMEM;
463 vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
464 if (!vcpu)
465 goto out;
466
467 err = kvm_vcpu_init(vcpu, kvm, id);
468 if (err)
469 goto free_vcpu;
470
471 vcpu->arch.shared = &vcpu->arch.shregs;
472 vcpu->arch.last_cpu = -1;
473 vcpu->arch.mmcr[0] = MMCR0_FC;
474 vcpu->arch.ctrl = CTRL_RUNLATCH;
475 /* default to host PVR, since we can't spoof it */
476 vcpu->arch.pvr = mfspr(SPRN_PVR);
477 kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
478
479 kvmppc_mmu_book3s_hv_init(vcpu);
480
481 /*
482 * Some vcpus may start out in stopped state. If we initialize
483 * them to busy-in-host state they will stop other vcpus in the
484 * vcore from running. Instead we initialize them to blocked
485 * state, effectively considering them to be stopped until we
486 * see the first run ioctl for them.
487 */
488 vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
489
490 init_waitqueue_head(&vcpu->arch.cpu_run);
491
492 mutex_lock(&kvm->lock);
493 vcore = kvm->arch.vcores[core];
494 if (!vcore) {
495 vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
496 if (vcore) {
497 INIT_LIST_HEAD(&vcore->runnable_threads);
498 spin_lock_init(&vcore->lock);
499 }
500 kvm->arch.vcores[core] = vcore;
501 }
502 mutex_unlock(&kvm->lock);
503
504 if (!vcore)
505 goto free_vcpu;
506
507 spin_lock(&vcore->lock);
508 ++vcore->num_threads;
509 ++vcore->n_blocked;
510 spin_unlock(&vcore->lock);
511 vcpu->arch.vcore = vcore;
512
513 return vcpu;
514
515free_vcpu:
516 kfree(vcpu);
517out:
518 return ERR_PTR(err);
519}
520
521void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
522{
523 kvm_vcpu_uninit(vcpu);
524 kfree(vcpu);
525}
526
527static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu)
528{
529 struct kvmppc_vcore *vc = vcpu->arch.vcore;
530
531 spin_lock(&vc->lock);
532 vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
533 ++vc->n_blocked;
534 if (vc->n_runnable > 0 &&
535 vc->n_runnable + vc->n_blocked == vc->num_threads) {
536 vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
537 arch.run_list);
538 wake_up(&vcpu->arch.cpu_run);
539 }
540 spin_unlock(&vc->lock);
541}
542
543static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu)
544{
545 struct kvmppc_vcore *vc = vcpu->arch.vcore;
546
547 spin_lock(&vc->lock);
548 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
549 --vc->n_blocked;
550 spin_unlock(&vc->lock);
551}
552
553extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
554extern void xics_wake_cpu(int cpu);
555
556static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
557 struct kvm_vcpu *vcpu)
558{
559 struct kvm_vcpu *v;
560
561 if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
562 return;
563 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
564 --vc->n_runnable;
565 /* decrement the physical thread id of each following vcpu */
566 v = vcpu;
567 list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
568 --v->arch.ptid;
569 list_del(&vcpu->arch.run_list);
570}
571
572static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
573{
574 int cpu;
575 struct paca_struct *tpaca;
576 struct kvmppc_vcore *vc = vcpu->arch.vcore;
577
578 cpu = vc->pcpu + vcpu->arch.ptid;
579 tpaca = &paca[cpu];
580 tpaca->kvm_hstate.kvm_vcpu = vcpu;
581 tpaca->kvm_hstate.kvm_vcore = vc;
582 smp_wmb();
583#ifdef CONFIG_PPC_ICP_NATIVE
584 if (vcpu->arch.ptid) {
585 tpaca->cpu_start = 0x80;
586 tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST;
587 wmb();
588 xics_wake_cpu(cpu);
589 ++vc->n_woken;
590 }
591#endif
592}
593
594static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
595{
596 int i;
597
598 HMT_low();
599 i = 0;
600 while (vc->nap_count < vc->n_woken) {
601 if (++i >= 1000000) {
602 pr_err("kvmppc_wait_for_nap timeout %d %d\n",
603 vc->nap_count, vc->n_woken);
604 break;
605 }
606 cpu_relax();
607 }
608 HMT_medium();
609}
610
611/*
612 * Check that we are on thread 0 and that any other threads in
613 * this core are off-line.
614 */
615static int on_primary_thread(void)
616{
617 int cpu = smp_processor_id();
618 int thr = cpu_thread_in_core(cpu);
619
620 if (thr)
621 return 0;
622 while (++thr < threads_per_core)
623 if (cpu_online(cpu + thr))
624 return 0;
625 return 1;
626}
627
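on_primary_thread() relies on cpu_thread_in_core(): with SMT threads numbered consecutively within a core this is just cpu modulo threads_per_core, and thread 0 is the primary. A stand-alone sketch under that assumption (POWER7-style 4 threads per core):

    #include <stdio.h>

    /* Assuming CPU ids number SMT threads consecutively within a core, the
     * in-core thread index is simply cpu modulo threads_per_core. */
    static int thread_in_core(int cpu, int threads_per_core)
    {
        return cpu % threads_per_core;
    }

    int main(void)
    {
        int threads_per_core = 4;
        int cpu;

        for (cpu = 4; cpu < 8; cpu++)
            printf("cpu %d: thread %d%s\n", cpu,
                   thread_in_core(cpu, threads_per_core),
                   thread_in_core(cpu, threads_per_core) == 0 ?
                           " (primary; may run a vcore if cpus 5-7 are offline)" : "");
        return 0;
    }
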
628/*
629 * Run a set of guest threads on a physical core.
630 * Called with vc->lock held.
631 */
632static int kvmppc_run_core(struct kvmppc_vcore *vc)
633{
634 struct kvm_vcpu *vcpu, *vnext;
635 long ret;
636 u64 now;
637
638 /* don't start if any threads have a signal pending */
639 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
640 if (signal_pending(vcpu->arch.run_task))
641 return 0;
642
643 /*
644 * Make sure we are running on thread 0, and that
645 * secondary threads are offline.
646 * XXX we should also block attempts to bring any
647 * secondary threads online.
648 */
649 if (threads_per_core > 1 && !on_primary_thread()) {
650 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
651 vcpu->arch.ret = -EBUSY;
652 goto out;
653 }
654
655 vc->n_woken = 0;
656 vc->nap_count = 0;
657 vc->entry_exit_count = 0;
658 vc->vcore_running = 1;
659 vc->in_guest = 0;
660 vc->pcpu = smp_processor_id();
661 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
662 kvmppc_start_thread(vcpu);
663 vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
664 arch.run_list);
665
666 spin_unlock(&vc->lock);
667
668 preempt_disable();
669 kvm_guest_enter();
670 __kvmppc_vcore_entry(NULL, vcpu);
671
672 /* wait for secondary threads to finish writing their state to memory */
673 spin_lock(&vc->lock);
674 if (vc->nap_count < vc->n_woken)
675 kvmppc_wait_for_nap(vc);
676 /* prevent other vcpu threads from doing kvmppc_start_thread() now */
677 vc->vcore_running = 2;
678 spin_unlock(&vc->lock);
679
680 /* make sure updates to secondary vcpu structs are visible now */
681 smp_mb();
682 kvm_guest_exit();
683
684 preempt_enable();
685 kvm_resched(vcpu);
686
687 now = get_tb();
688 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
689 /* cancel pending dec exception if dec is positive */
690 if (now < vcpu->arch.dec_expires &&
691 kvmppc_core_pending_dec(vcpu))
692 kvmppc_core_dequeue_dec(vcpu);
693 if (!vcpu->arch.trap) {
694 if (signal_pending(vcpu->arch.run_task)) {
695 vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
696 vcpu->arch.ret = -EINTR;
697 }
698 continue; /* didn't get to run */
699 }
700 ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
701 vcpu->arch.run_task);
702 vcpu->arch.ret = ret;
703 vcpu->arch.trap = 0;
704 }
705
706 spin_lock(&vc->lock);
707 out:
708 vc->vcore_running = 0;
709 list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
710 arch.run_list) {
711 if (vcpu->arch.ret != RESUME_GUEST) {
712 kvmppc_remove_runnable(vc, vcpu);
713 wake_up(&vcpu->arch.cpu_run);
714 }
715 }
716
717 return 1;
718}
719
720static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
721{
722 int ptid;
723 int wait_state;
724 struct kvmppc_vcore *vc;
725 DEFINE_WAIT(wait);
726
727 /* No need to go into the guest when all we do is going out */
728 if (signal_pending(current)) {
729 kvm_run->exit_reason = KVM_EXIT_INTR;
730 return -EINTR;
731 }
732
733 /* On PPC970, check that we have an RMA region */
734 if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
735 return -EPERM;
736
737 kvm_run->exit_reason = 0;
738 vcpu->arch.ret = RESUME_GUEST;
739 vcpu->arch.trap = 0;
740
741 flush_fp_to_thread(current);
742 flush_altivec_to_thread(current);
743 flush_vsx_to_thread(current);
744
745 /*
746 * Synchronize with other threads in this virtual core
747 */
748 vc = vcpu->arch.vcore;
749 spin_lock(&vc->lock);
750 /* This happens the first time this is called for a vcpu */
751 if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
752 --vc->n_blocked;
753 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
754 ptid = vc->n_runnable;
755 vcpu->arch.run_task = current;
756 vcpu->arch.kvm_run = kvm_run;
757 vcpu->arch.ptid = ptid;
758 list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
759 ++vc->n_runnable;
760
761 wait_state = TASK_INTERRUPTIBLE;
762 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
763 if (signal_pending(current)) {
764 if (!vc->vcore_running) {
765 kvm_run->exit_reason = KVM_EXIT_INTR;
766 vcpu->arch.ret = -EINTR;
767 break;
768 }
769 /* have to wait for vcore to stop executing guest */
770 wait_state = TASK_UNINTERRUPTIBLE;
771 smp_send_reschedule(vc->pcpu);
772 }
773
774 if (!vc->vcore_running &&
775 vc->n_runnable + vc->n_blocked == vc->num_threads) {
776 /* we can run now */
777 if (kvmppc_run_core(vc))
778 continue;
779 }
780
781 if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0)
782 kvmppc_start_thread(vcpu);
783
784 /* wait for other threads to come in, or wait for vcore */
785 prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
786 spin_unlock(&vc->lock);
787 schedule();
788 finish_wait(&vcpu->arch.cpu_run, &wait);
789 spin_lock(&vc->lock);
790 }
791
792 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
793 kvmppc_remove_runnable(vc, vcpu);
794 spin_unlock(&vc->lock);
795
796 return vcpu->arch.ret;
797}
798
799int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
800{
801 int r;
802
803 do {
804 r = kvmppc_run_vcpu(run, vcpu);
805
806 if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
807 !(vcpu->arch.shregs.msr & MSR_PR)) {
808 r = kvmppc_pseries_do_hcall(vcpu);
809 kvmppc_core_deliver_interrupts(vcpu);
810 }
811 } while (r == RESUME_GUEST);
812 return r;
813}
814
815static long kvmppc_stt_npages(unsigned long window_size)
816{
817 return ALIGN((window_size >> SPAPR_TCE_SHIFT)
818 * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
819}
820
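kvmppc_stt_npages() sizes the backing store for a TCE table: one 8-byte TCE per 4K of DMA window, rounded up to whole pages. A stand-alone sketch with a 4K page size assumed for the example:

    #include <stdio.h>

    /* Assumed for the example: 8-byte TCEs mapping 4K I/O pages and a 4K
     * host page size; the kernel uses the real PAGE_SIZE. */
    #define EX_PAGE_SIZE    4096ul
    #define SPAPR_TCE_SHIFT 12

    /* Mirrors kvmppc_stt_npages(): one TCE per 4K of DMA window, rounded up
     * to whole backing pages. */
    static unsigned long stt_npages(unsigned long window_size)
    {
        unsigned long bytes = (window_size >> SPAPR_TCE_SHIFT) * 8;

        return (bytes + EX_PAGE_SIZE - 1) / EX_PAGE_SIZE;
    }

    int main(void)
    {
        printf("256MB DMA window -> %lu table pages\n", stt_npages(256ul << 20));
        printf("1GB DMA window   -> %lu table pages\n", stt_npages(1ul << 30));
        return 0;
    }
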
821static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
822{
823 struct kvm *kvm = stt->kvm;
824 int i;
825
826 mutex_lock(&kvm->lock);
827 list_del(&stt->list);
828 for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
829 __free_page(stt->pages[i]);
830 kfree(stt);
831 mutex_unlock(&kvm->lock);
832
833 kvm_put_kvm(kvm);
834}
835
836static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
837{
838 struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
839 struct page *page;
840
841 if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
842 return VM_FAULT_SIGBUS;
843
844 page = stt->pages[vmf->pgoff];
845 get_page(page);
846 vmf->page = page;
847 return 0;
848}
849
850static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
851 .fault = kvm_spapr_tce_fault,
852};
853
854static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
855{
856 vma->vm_ops = &kvm_spapr_tce_vm_ops;
857 return 0;
858}
859
860static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
861{
862 struct kvmppc_spapr_tce_table *stt = filp->private_data;
863
864 release_spapr_tce_table(stt);
865 return 0;
866}
867
868static struct file_operations kvm_spapr_tce_fops = {
869 .mmap = kvm_spapr_tce_mmap,
870 .release = kvm_spapr_tce_release,
871};
872
873long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
874 struct kvm_create_spapr_tce *args)
875{
876 struct kvmppc_spapr_tce_table *stt = NULL;
877 long npages;
878 int ret = -ENOMEM;
879 int i;
880
881 /* Check this LIOBN hasn't been previously allocated */
882 list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
883 if (stt->liobn == args->liobn)
884 return -EBUSY;
885 }
886
887 npages = kvmppc_stt_npages(args->window_size);
888
889 stt = kzalloc(sizeof(*stt) + npages* sizeof(struct page *),
890 GFP_KERNEL);
891 if (!stt)
892 goto fail;
893
894 stt->liobn = args->liobn;
895 stt->window_size = args->window_size;
896 stt->kvm = kvm;
897
898 for (i = 0; i < npages; i++) {
899 stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
900 if (!stt->pages[i])
901 goto fail;
902 }
903
904 kvm_get_kvm(kvm);
905
906 mutex_lock(&kvm->lock);
907 list_add(&stt->list, &kvm->arch.spapr_tce_tables);
908
909 mutex_unlock(&kvm->lock);
910
911 return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
912 stt, O_RDWR);
913
914fail:
915 if (stt) {
916 for (i = 0; i < npages; i++)
917 if (stt->pages[i])
918 __free_page(stt->pages[i]);
919
920 kfree(stt);
921 }
922 return ret;
923}
924
925/* Work out RMLS (real mode limit selector) field value for a given RMA size.
926 Assumes POWER7 or PPC970. */
927static inline int lpcr_rmls(unsigned long rma_size)
928{
929 switch (rma_size) {
930 case 32ul << 20: /* 32 MB */
931 if (cpu_has_feature(CPU_FTR_ARCH_206))
932 return 8; /* only supported on POWER7 */
933 return -1;
934 case 64ul << 20: /* 64 MB */
935 return 3;
936 case 128ul << 20: /* 128 MB */
937 return 7;
938 case 256ul << 20: /* 256 MB */
939 return 4;
940 case 1ul << 30: /* 1 GB */
941 return 2;
942 case 16ul << 30: /* 16 GB */
943 return 1;
944 case 256ul << 30: /* 256 GB */
945 return 0;
946 default:
947 return -1;
948 }
949}
950
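The same size-to-RMLS mapping, restated as a table that a caller could also walk to pick the largest supported RMA not exceeding the memory it has; the 32MB entry is POWER7-only and its CPU-feature check is omitted in this sketch.

    #include <stdio.h>

    /* Size -> RMLS encodings as in lpcr_rmls() above, largest first. */
    static const struct {
        unsigned long long size;
        int rmls;
    } rma_sizes[] = {
        { 256ull << 30, 0 }, { 16ull << 30, 1 }, { 1ull << 30, 2 },
        { 256ull << 20, 4 }, { 128ull << 20, 7 }, { 64ull << 20, 3 },
        { 32ull << 20, 8 },
    };

    static int best_rmls(unsigned long long mem, unsigned long long *chosen)
    {
        unsigned int i;

        for (i = 0; i < sizeof(rma_sizes) / sizeof(rma_sizes[0]); i++)
            if (rma_sizes[i].size <= mem) {
                *chosen = rma_sizes[i].size;
                return rma_sizes[i].rmls;
            }
        return -1;
    }

    int main(void)
    {
        unsigned long long chosen;
        int rmls = best_rmls(200ull << 20, &chosen);    /* 200 MB of guest RAM */

        printf("use a %llu MB RMA, RMLS encoding %d\n", chosen >> 20, rmls);
        return 0;
    }
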
951static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
952{
953 struct kvmppc_rma_info *ri = vma->vm_file->private_data;
954 struct page *page;
955
956 if (vmf->pgoff >= ri->npages)
957 return VM_FAULT_SIGBUS;
958
959 page = pfn_to_page(ri->base_pfn + vmf->pgoff);
960 get_page(page);
961 vmf->page = page;
962 return 0;
963}
964
965static const struct vm_operations_struct kvm_rma_vm_ops = {
966 .fault = kvm_rma_fault,
967};
968
969static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
970{
971 vma->vm_flags |= VM_RESERVED;
972 vma->vm_ops = &kvm_rma_vm_ops;
973 return 0;
974}
975
976static int kvm_rma_release(struct inode *inode, struct file *filp)
977{
978 struct kvmppc_rma_info *ri = filp->private_data;
979
980 kvm_release_rma(ri);
981 return 0;
982}
983
984static struct file_operations kvm_rma_fops = {
985 .mmap = kvm_rma_mmap,
986 .release = kvm_rma_release,
987};
988
989long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
990{
991 struct kvmppc_rma_info *ri;
992 long fd;
993
994 ri = kvm_alloc_rma();
995 if (!ri)
996 return -ENOMEM;
997
998 fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
999 if (fd < 0)
1000 kvm_release_rma(ri);
1001
1002 ret->rma_size = ri->npages << PAGE_SHIFT;
1003 return fd;
1004}
1005
1006static struct page *hva_to_page(unsigned long addr)
1007{
1008 struct page *page[1];
1009 int npages;
1010
1011 might_sleep();
1012
1013 npages = get_user_pages_fast(addr, 1, 1, page);
1014
1015 if (unlikely(npages != 1))
1016 return 0;
1017
1018 return page[0];
1019}
1020
1021int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1022 struct kvm_userspace_memory_region *mem)
1023{
1024 unsigned long psize, porder;
1025 unsigned long i, npages, totalpages;
1026 unsigned long pg_ix;
1027 struct kvmppc_pginfo *pginfo;
1028 unsigned long hva;
1029 struct kvmppc_rma_info *ri = NULL;
1030 struct page *page;
1031
1032 /* For now, only allow 16MB pages */
1033 porder = LARGE_PAGE_ORDER;
1034 psize = 1ul << porder;
1035 if ((mem->memory_size & (psize - 1)) ||
1036 (mem->guest_phys_addr & (psize - 1))) {
1037 pr_err("bad memory_size=%llx @ %llx\n",
1038 mem->memory_size, mem->guest_phys_addr);
1039 return -EINVAL;
1040 }
1041
1042 npages = mem->memory_size >> porder;
1043 totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;
1044
1045 /* More memory than we have space to track? */
1046 if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
1047 return -EINVAL;
1048
1049 /* Do we already have an RMA registered? */
1050 if (mem->guest_phys_addr == 0 && kvm->arch.rma)
1051 return -EINVAL;
1052
1053 if (totalpages > kvm->arch.ram_npages)
1054 kvm->arch.ram_npages = totalpages;
1055
1056 /* Is this one of our preallocated RMAs? */
1057 if (mem->guest_phys_addr == 0) {
1058 struct vm_area_struct *vma;
1059
1060 down_read(&current->mm->mmap_sem);
1061 vma = find_vma(current->mm, mem->userspace_addr);
1062 if (vma && vma->vm_file &&
1063 vma->vm_file->f_op == &kvm_rma_fops &&
1064 mem->userspace_addr == vma->vm_start)
1065 ri = vma->vm_file->private_data;
1066 up_read(&current->mm->mmap_sem);
1067 if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) {
1068 pr_err("CPU requires an RMO\n");
1069 return -EINVAL;
1070 }
1071 }
1072
1073 if (ri) {
1074 unsigned long rma_size;
1075 unsigned long lpcr;
1076 long rmls;
1077
1078 rma_size = ri->npages << PAGE_SHIFT;
1079 if (rma_size > mem->memory_size)
1080 rma_size = mem->memory_size;
1081 rmls = lpcr_rmls(rma_size);
1082 if (rmls < 0) {
1083 pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
1084 return -EINVAL;
1085 }
1086 atomic_inc(&ri->use_count);
1087 kvm->arch.rma = ri;
1088 kvm->arch.n_rma_pages = rma_size >> porder;
1089
1090 /* Update LPCR and RMOR */
1091 lpcr = kvm->arch.lpcr;
1092 if (cpu_has_feature(CPU_FTR_ARCH_201)) {
1093 /* PPC970; insert RMLS value (split field) in HID4 */
1094 lpcr &= ~((1ul << HID4_RMLS0_SH) |
1095 (3ul << HID4_RMLS2_SH));
1096 lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) |
1097 ((rmls & 3) << HID4_RMLS2_SH);
1098 /* RMOR is also in HID4 */
1099 lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
1100 << HID4_RMOR_SH;
1101 } else {
1102 /* POWER7 */
1103 lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
1104 lpcr |= rmls << LPCR_RMLS_SH;
1105 kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
1106 }
1107 kvm->arch.lpcr = lpcr;
1108 pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
1109 ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
1110 }
1111
1112 pg_ix = mem->guest_phys_addr >> porder;
1113 pginfo = kvm->arch.ram_pginfo + pg_ix;
1114 for (i = 0; i < npages; ++i, ++pg_ix) {
1115 if (ri && pg_ix < kvm->arch.n_rma_pages) {
1116 pginfo[i].pfn = ri->base_pfn +
1117 (pg_ix << (porder - PAGE_SHIFT));
1118 continue;
1119 }
1120 hva = mem->userspace_addr + (i << porder);
1121 page = hva_to_page(hva);
1122 if (!page) {
1123 pr_err("oops, no pfn for hva %lx\n", hva);
1124 goto err;
1125 }
1126 /* Check it's a 16MB page */
1127 if (!PageHead(page) ||
1128 compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
1129 pr_err("page at %lx isn't 16MB (o=%d)\n",
1130 hva, compound_order(page));
1131 goto err;
1132 }
1133 pginfo[i].pfn = page_to_pfn(page);
1134 }
1135
1136 return 0;
1137
1138 err:
1139 return -EINVAL;
1140}
1141
1142void kvmppc_core_commit_memory_region(struct kvm *kvm,
1143 struct kvm_userspace_memory_region *mem)
1144{
1145 if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
1146 !kvm->arch.rma)
1147 kvmppc_map_vrma(kvm, mem);
1148}
1149
1150int kvmppc_core_init_vm(struct kvm *kvm)
1151{
1152 long r;
1153 unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
1154 long err = -ENOMEM;
1155 unsigned long lpcr;
1156
1157 /* Allocate hashed page table */
1158 r = kvmppc_alloc_hpt(kvm);
1159 if (r)
1160 return r;
1161
1162 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
1163
1164 kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
1165 GFP_KERNEL);
1166 if (!kvm->arch.ram_pginfo) {
1167 pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
1168 npages * sizeof(struct kvmppc_pginfo));
1169 goto out_free;
1170 }
1171
1172 kvm->arch.ram_npages = 0;
1173 kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
1174 kvm->arch.ram_porder = LARGE_PAGE_ORDER;
1175 kvm->arch.rma = NULL;
1176 kvm->arch.n_rma_pages = 0;
1177
1178 kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
1179
1180 if (cpu_has_feature(CPU_FTR_ARCH_201)) {
1181 /* PPC970; HID4 is effectively the LPCR */
1182 unsigned long lpid = kvm->arch.lpid;
1183 kvm->arch.host_lpid = 0;
1184 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
1185 lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
1186 lpcr |= ((lpid >> 4) << HID4_LPID1_SH) |
1187 ((lpid & 0xf) << HID4_LPID5_SH);
1188 } else {
1189 /* POWER7; init LPCR for virtual RMA mode */
1190 kvm->arch.host_lpid = mfspr(SPRN_LPID);
1191 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
1192 lpcr &= LPCR_PECE | LPCR_LPES;
1193 lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
1194 LPCR_VPM0 | LPCR_VRMA_L;
1195 }
1196 kvm->arch.lpcr = lpcr;
1197
1198 return 0;
1199
1200 out_free:
1201 kvmppc_free_hpt(kvm);
1202 return err;
1203}
1204
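On the PPC970 path above, the 6-bit LPID is split across two HID4 fields: the top two bits at HID4_LPID1_SH and the low four at HID4_LPID5_SH. Those shift values are not visible in this hunk, so the sketch below uses placeholder shifts purely to demonstrate the packing arithmetic.

    #include <stdio.h>
    #include <stdint.h>

    /* Placeholder shift amounts: the real HID4_LPID1_SH / HID4_LPID5_SH are
     * not shown here, so these exist only to illustrate the arithmetic. */
    #define FAKE_LPID1_SH 6
    #define FAKE_LPID5_SH 0

    /* Split a 6-bit LPID into a 2-bit high part and a 4-bit low part and
     * drop them into two separate register fields. */
    static uint64_t pack_lpid(uint64_t hid4, unsigned int lpid)
    {
        hid4 &= ~((3ull << FAKE_LPID1_SH) | (0xfull << FAKE_LPID5_SH));
        hid4 |= ((uint64_t)(lpid >> 4) << FAKE_LPID1_SH) |
                ((uint64_t)(lpid & 0xf) << FAKE_LPID5_SH);
        return hid4;
    }

    int main(void)
    {
        printf("LPID 0x2b packed: %#llx\n",
               (unsigned long long)pack_lpid(0, 0x2b));
        return 0;
    }
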
1205void kvmppc_core_destroy_vm(struct kvm *kvm)
1206{
1207 struct kvmppc_pginfo *pginfo;
1208 unsigned long i;
1209
1210 if (kvm->arch.ram_pginfo) {
1211 pginfo = kvm->arch.ram_pginfo;
1212 kvm->arch.ram_pginfo = NULL;
1213 for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
1214 if (pginfo[i].pfn)
1215 put_page(pfn_to_page(pginfo[i].pfn));
1216 kfree(pginfo);
1217 }
1218 if (kvm->arch.rma) {
1219 kvm_release_rma(kvm->arch.rma);
1220 kvm->arch.rma = NULL;
1221 }
1222
1223 kvmppc_free_hpt(kvm);
1224 WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
1225}
1226
1227/* These are stubs for now */
1228void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
1229{
1230}
1231
1232/* We don't need to emulate any privileged instructions or dcbz */
1233int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
1234 unsigned int inst, int *advance)
1235{
1236 return EMULATE_FAIL;
1237}
1238
1239int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
1240{
1241 return EMULATE_FAIL;
1242}
1243
1244int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
1245{
1246 return EMULATE_FAIL;
1247}
1248
1249static int kvmppc_book3s_hv_init(void)
1250{
1251 int r;
1252
1253 r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
1254
1255 if (r)
1256 return r;
1257
1258 r = kvmppc_mmu_hv_init();
1259
1260 return r;
1261}
1262
1263static void kvmppc_book3s_hv_exit(void)
1264{
1265 kvm_exit();
1266}
1267
1268module_init(kvmppc_book3s_hv_init);
1269module_exit(kvmppc_book3s_hv_exit);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
new file mode 100644
index 000000000000..d43120355eec
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -0,0 +1,155 @@
1/*
2 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License, version 2, as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/kvm_host.h>
10#include <linux/preempt.h>
11#include <linux/sched.h>
12#include <linux/spinlock.h>
13#include <linux/bootmem.h>
14#include <linux/init.h>
15
16#include <asm/cputable.h>
17#include <asm/kvm_ppc.h>
18#include <asm/kvm_book3s.h>
19
20/*
21 * This maintains a list of RMAs (real mode areas) for KVM guests to use.
22 * Each RMA has to be physically contiguous and of a size that the
23 * hardware supports. PPC970 and POWER7 support 64MB, 128MB and 256MB,
24 * and other larger sizes.  Since we are unlikely to be able to allocate that
25 * much physically contiguous memory after the system is up and running,
26 * we preallocate a set of RMAs in early boot for KVM to use.
27 */
28static unsigned long kvm_rma_size = 64 << 20; /* 64MB */
29static unsigned long kvm_rma_count;
30
31static int __init early_parse_rma_size(char *p)
32{
33 if (!p)
34 return 1;
35
36 kvm_rma_size = memparse(p, &p);
37
38 return 0;
39}
40early_param("kvm_rma_size", early_parse_rma_size);
41
42static int __init early_parse_rma_count(char *p)
43{
44 if (!p)
45 return 1;
46
47 kvm_rma_count = simple_strtoul(p, NULL, 0);
48
49 return 0;
50}
51early_param("kvm_rma_count", early_parse_rma_count);
52
53static struct kvmppc_rma_info *rma_info;
54static LIST_HEAD(free_rmas);
55static DEFINE_SPINLOCK(rma_lock);
56
57/* Work out RMLS (real mode limit selector) field value for a given RMA size.
58 Assumes POWER7 or PPC970. */
59static inline int lpcr_rmls(unsigned long rma_size)
60{
61 switch (rma_size) {
62 case 32ul << 20: /* 32 MB */
63 if (cpu_has_feature(CPU_FTR_ARCH_206))
64 return 8; /* only supported on POWER7 */
65 return -1;
66 case 64ul << 20: /* 64 MB */
67 return 3;
68 case 128ul << 20: /* 128 MB */
69 return 7;
70 case 256ul << 20: /* 256 MB */
71 return 4;
72 case 1ul << 30: /* 1 GB */
73 return 2;
74 case 16ul << 30: /* 16 GB */
75 return 1;
76 case 256ul << 30: /* 256 GB */
77 return 0;
78 default:
79 return -1;
80 }
81}
82
83/*
84 * Called at boot time while the bootmem allocator is active,
85 * to allocate contiguous physical memory for the real memory
86 * areas for guests.
87 */
88void kvm_rma_init(void)
89{
90 unsigned long i;
91 unsigned long j, npages;
92 void *rma;
93 struct page *pg;
94
95 /* Only do this on PPC970 in HV mode */
96 if (!cpu_has_feature(CPU_FTR_HVMODE) ||
97 !cpu_has_feature(CPU_FTR_ARCH_201))
98 return;
99
100 if (!kvm_rma_size || !kvm_rma_count)
101 return;
102
103 /* Check that the requested size is one supported in hardware */
104 if (lpcr_rmls(kvm_rma_size) < 0) {
105 pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
106 return;
107 }
108
109 npages = kvm_rma_size >> PAGE_SHIFT;
110 rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info));
111 for (i = 0; i < kvm_rma_count; ++i) {
112 rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size);
113 pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma,
114 kvm_rma_size >> 20);
115 rma_info[i].base_virt = rma;
116 rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT;
117 rma_info[i].npages = npages;
118 list_add_tail(&rma_info[i].list, &free_rmas);
119 atomic_set(&rma_info[i].use_count, 0);
120
121 pg = pfn_to_page(rma_info[i].base_pfn);
122 for (j = 0; j < npages; ++j) {
123 atomic_inc(&pg->_count);
124 ++pg;
125 }
126 }
127}
128
129struct kvmppc_rma_info *kvm_alloc_rma(void)
130{
131 struct kvmppc_rma_info *ri;
132
133 ri = NULL;
134 spin_lock(&rma_lock);
135 if (!list_empty(&free_rmas)) {
136 ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list);
137 list_del(&ri->list);
138 atomic_inc(&ri->use_count);
139 }
140 spin_unlock(&rma_lock);
141 return ri;
142}
143EXPORT_SYMBOL_GPL(kvm_alloc_rma);
144
145void kvm_release_rma(struct kvmppc_rma_info *ri)
146{
147 if (atomic_dec_and_test(&ri->use_count)) {
148 spin_lock(&rma_lock);
149 list_add_tail(&ri->list, &free_rmas);
150 spin_unlock(&rma_lock);
151
152 }
153}
154EXPORT_SYMBOL_GPL(kvm_release_rma);
155
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
new file mode 100644
index 000000000000..3f7b674dd4bf
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -0,0 +1,166 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
16 *
17 * Derived from book3s_interrupts.S, which is:
18 * Copyright SUSE Linux Products GmbH 2009
19 *
20 * Authors: Alexander Graf <agraf@suse.de>
21 */
22
23#include <asm/ppc_asm.h>
24#include <asm/kvm_asm.h>
25#include <asm/reg.h>
26#include <asm/page.h>
27#include <asm/asm-offsets.h>
28#include <asm/exception-64s.h>
29#include <asm/ppc-opcode.h>
30
31/*****************************************************************************
32 * *
33 * Guest entry / exit code that is in kernel module memory (vmalloc) *
34 * *
35 ****************************************************************************/
36
37/* Registers:
38 * r4: vcpu pointer
39 */
40_GLOBAL(__kvmppc_vcore_entry)
41
42 /* Write correct stack frame */
43 mflr r0
44 std r0,PPC_LR_STKOFF(r1)
45
46 /* Save host state to the stack */
47 stdu r1, -SWITCH_FRAME_SIZE(r1)
48
49 /* Save non-volatile registers (r14 - r31) */
50 SAVE_NVGPRS(r1)
51
52 /* Save host DSCR */
53BEGIN_FTR_SECTION
54 mfspr r3, SPRN_DSCR
55 std r3, HSTATE_DSCR(r13)
56END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
57
58 /* Save host DABR */
59 mfspr r3, SPRN_DABR
60 std r3, HSTATE_DABR(r13)
61
62 /* Hard-disable interrupts */
63 mfmsr r10
64 std r10, HSTATE_HOST_MSR(r13)
65 rldicl r10,r10,48,1
66 rotldi r10,r10,16
67 mtmsrd r10,1
68
69 /* Save host PMU registers and load guest PMU registers */
70 /* R4 is live here (vcpu pointer) but not r3 or r5 */
71 li r3, 1
72 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
73 mfspr r7, SPRN_MMCR0 /* save MMCR0 */
74 mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */
75 isync
76 ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */
77 lbz r5, LPPACA_PMCINUSE(r3)
78 cmpwi r5, 0
79 beq 31f /* skip if not */
80 mfspr r5, SPRN_MMCR1
81 mfspr r6, SPRN_MMCRA
82 std r7, HSTATE_MMCR(r13)
83 std r5, HSTATE_MMCR + 8(r13)
84 std r6, HSTATE_MMCR + 16(r13)
85 mfspr r3, SPRN_PMC1
86 mfspr r5, SPRN_PMC2
87 mfspr r6, SPRN_PMC3
88 mfspr r7, SPRN_PMC4
89 mfspr r8, SPRN_PMC5
90 mfspr r9, SPRN_PMC6
91BEGIN_FTR_SECTION
92 mfspr r10, SPRN_PMC7
93 mfspr r11, SPRN_PMC8
94END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
95 stw r3, HSTATE_PMC(r13)
96 stw r5, HSTATE_PMC + 4(r13)
97 stw r6, HSTATE_PMC + 8(r13)
98 stw r7, HSTATE_PMC + 12(r13)
99 stw r8, HSTATE_PMC + 16(r13)
100 stw r9, HSTATE_PMC + 20(r13)
101BEGIN_FTR_SECTION
102 stw r10, HSTATE_PMC + 24(r13)
103 stw r11, HSTATE_PMC + 28(r13)
104END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
10531:
106
107 /*
108 * Put whatever is in the decrementer into the
109 * hypervisor decrementer.
110 */
111 mfspr r8,SPRN_DEC
112 mftb r7
113 mtspr SPRN_HDEC,r8
114 extsw r8,r8
115 add r8,r8,r7
116 std r8,HSTATE_DECEXP(r13)
117
118 /*
119 * On PPC970, if the guest vcpu has an external interrupt pending,
120 * send ourselves an IPI so as to interrupt the guest once it
121 * enables interrupts. (It must have interrupts disabled,
122 * otherwise we would already have delivered the interrupt.)
123 */
124BEGIN_FTR_SECTION
125 ld r0, VCPU_PENDING_EXC(r4)
126 li r7, (1 << BOOK3S_IRQPRIO_EXTERNAL)
127 oris r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
128 and. r0, r0, r7
129 beq 32f
130 mr r31, r4
131 lhz r3, PACAPACAINDEX(r13)
132 bl smp_send_reschedule
133 nop
134 mr r4, r31
13532:
136END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
137
138 /* Jump to partition switch code */
139 bl .kvmppc_hv_entry_trampoline
140 nop
141
142/*
143 * We return here in virtual mode after the guest exits
144 * with something that we can't handle in real mode.
145 * Interrupts are enabled again at this point.
146 */
147
148.global kvmppc_handler_highmem
149kvmppc_handler_highmem:
150
151 /*
152 * Register usage at this point:
153 *
154 * R1 = host R1
155 * R2 = host R2
156 * R12 = exit handler id
157 * R13 = PACA
158 */
159
160 /* Restore non-volatile host registers (r14 - r31) */
161 REST_NVGPRS(r1)
162
163 addi r1, r1, SWITCH_FRAME_SIZE
164 ld r0, PPC_LR_STKOFF(r1)
165 mtlr r0
166 blr
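
One detail worth noting in the entry path above: before entering the guest, the host decrementer is copied into HDEC and its absolute expiry is recorded as the current timebase plus the sign-extended DEC value, so the exit path can later reconstruct the host DEC. In C, the arithmetic performed by the mfspr/mftb/extsw/add sequence is roughly the following (a sketch; the SPR accessor helpers are invented names, not kernel functions):

	/* Sketch of the DEC bookkeeping done around guest entry (invented helpers). */
	static inline u64 record_host_dec_expiry(void)
	{
		s32 dec = read_spr_dec();	/* hypothetical mfspr SPRN_DEC */
		u64 tb  = read_timebase();	/* hypothetical mftb */

		write_spr_hdec(dec);		/* guest timing runs against HDEC */

		/* extsw + add: expiry is "now" plus the signed 32-bit DEC value */
		return tb + (s64)dec;		/* the asm stores this in HSTATE_DECEXP */
	}

On the way back out (the "Reload DEC" block later in book3s_hv_rmhandlers.S), the host decrementer is restored as that saved expiry minus the current timebase.
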
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
new file mode 100644
index 000000000000..fcfe6b055558
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -0,0 +1,370 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
7 */
8
9#include <linux/types.h>
10#include <linux/string.h>
11#include <linux/kvm.h>
12#include <linux/kvm_host.h>
13#include <linux/hugetlb.h>
14
15#include <asm/tlbflush.h>
16#include <asm/kvm_ppc.h>
17#include <asm/kvm_book3s.h>
18#include <asm/mmu-hash64.h>
19#include <asm/hvcall.h>
20#include <asm/synch.h>
21#include <asm/ppc-opcode.h>
22
23/* For now use fixed-size 16MB page table */
24#define HPT_ORDER 24
25#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */
26#define HPT_HASH_MASK (HPT_NPTEG - 1)
27
28#define HPTE_V_HVLOCK 0x40UL
29
30static inline long lock_hpte(unsigned long *hpte, unsigned long bits)
31{
32 unsigned long tmp, old;
33
34 asm volatile(" ldarx %0,0,%2\n"
35 " and. %1,%0,%3\n"
36 " bne 2f\n"
37 " ori %0,%0,%4\n"
38 " stdcx. %0,0,%2\n"
39 " beq+ 2f\n"
40 " li %1,%3\n"
41 "2: isync"
42 : "=&r" (tmp), "=&r" (old)
43 : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
44 : "cc", "memory");
45 return old == 0;
46}
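
lock_hpte() above is a single try-lock attempt rather than a spinning lock: it fails either if any of the requested bits (HPTE_V_HVLOCK, optionally together with HPTE_V_VALID) are already set or if the stdcx. loses its reservation, and the callers below spin on it themselves with cpu_relax(). A loose C11-atomics analogue of the same idea — purely illustrative, since the real code must run in real mode and depends on the exact ldarx/stdcx./isync sequence shown:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>

	#define HVLOCK_BIT 0x40UL	/* mirrors HPTE_V_HVLOCK in the patch */

	/* Illustrative try-lock on bits of a 64-bit word; not the kernel's code. */
	static bool try_lock_bits(_Atomic uint64_t *word, uint64_t busy_bits)
	{
		uint64_t old = atomic_load_explicit(word, memory_order_relaxed);

		if (old & busy_bits)
			return false;		/* already locked (or valid, if requested) */
		return atomic_compare_exchange_strong_explicit(
			word, &old, old | HVLOCK_BIT,
			memory_order_acquire, memory_order_relaxed);
	}

	/* Callers in the patch spin: while (!lock_hpte(...)) cpu_relax(); */
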
47
48long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
49 long pte_index, unsigned long pteh, unsigned long ptel)
50{
51 unsigned long porder;
52 struct kvm *kvm = vcpu->kvm;
53 unsigned long i, lpn, pa;
54 unsigned long *hpte;
55
56 /* only handle 4k, 64k and 16M pages for now */
57 porder = 12;
58 if (pteh & HPTE_V_LARGE) {
59 if (cpu_has_feature(CPU_FTR_ARCH_206) &&
60 (ptel & 0xf000) == 0x1000) {
61 /* 64k page */
62 porder = 16;
63 } else if ((ptel & 0xff000) == 0) {
64 /* 16M page */
65 porder = 24;
66 /* lowest AVA bit must be 0 for 16M pages */
67 if (pteh & 0x80)
68 return H_PARAMETER;
69 } else
70 return H_PARAMETER;
71 }
72 lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
73 if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder)
74 return H_PARAMETER;
75 pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
76 if (!pa)
77 return H_PARAMETER;
78 /* Check WIMG */
79 if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
80 (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
81 return H_PARAMETER;
82 pteh &= ~0x60UL;
83 ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
84 ptel |= pa;
85 if (pte_index >= (HPT_NPTEG << 3))
86 return H_PARAMETER;
87 if (likely((flags & H_EXACT) == 0)) {
88 pte_index &= ~7UL;
89 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
90 for (i = 0; ; ++i) {
91 if (i == 8)
92 return H_PTEG_FULL;
93 if ((*hpte & HPTE_V_VALID) == 0 &&
94 lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
95 break;
96 hpte += 2;
97 }
98 } else {
99 i = 0;
100 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
101 if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
102 return H_PTEG_FULL;
103 }
104 hpte[1] = ptel;
105 eieio();
106 hpte[0] = pteh;
107 asm volatile("ptesync" : : : "memory");
108 atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
109 vcpu->arch.gpr[4] = pte_index + i;
110 return H_SUCCESS;
111}
112
113static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
114 unsigned long pte_index)
115{
116 unsigned long rb, va_low;
117
118 rb = (v & ~0x7fUL) << 16; /* AVA field */
119 va_low = pte_index >> 3;
120 if (v & HPTE_V_SECONDARY)
121 va_low = ~va_low;
122 /* xor vsid from AVA */
123 if (!(v & HPTE_V_1TB_SEG))
124 va_low ^= v >> 12;
125 else
126 va_low ^= v >> 24;
127 va_low &= 0x7ff;
128 if (v & HPTE_V_LARGE) {
129 rb |= 1; /* L field */
130 if (cpu_has_feature(CPU_FTR_ARCH_206) &&
131 (r & 0xff000)) {
132 /* non-16MB large page, must be 64k */
133 /* (masks depend on page size) */
134 rb |= 0x1000; /* page encoding in LP field */
135 rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
136 rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */
137 }
138 } else {
139 /* 4kB page */
140 rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */
141 }
142 rb |= (v >> 54) & 0x300; /* B field */
143 return rb;
144}
145
146#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
147
148static inline int try_lock_tlbie(unsigned int *lock)
149{
150 unsigned int tmp, old;
151 unsigned int token = LOCK_TOKEN;
152
153 asm volatile("1:lwarx %1,0,%2\n"
154 " cmpwi cr0,%1,0\n"
155 " bne 2f\n"
156 " stwcx. %3,0,%2\n"
157 " bne- 1b\n"
158 " isync\n"
159 "2:"
160 : "=&r" (tmp), "=&r" (old)
161 : "r" (lock), "r" (token)
162 : "cc", "memory");
163 return old == 0;
164}
165
166long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
167 unsigned long pte_index, unsigned long avpn,
168 unsigned long va)
169{
170 struct kvm *kvm = vcpu->kvm;
171 unsigned long *hpte;
172 unsigned long v, r, rb;
173
174 if (pte_index >= (HPT_NPTEG << 3))
175 return H_PARAMETER;
176 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
177 while (!lock_hpte(hpte, HPTE_V_HVLOCK))
178 cpu_relax();
179 if ((hpte[0] & HPTE_V_VALID) == 0 ||
180 ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
181 ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
182 hpte[0] &= ~HPTE_V_HVLOCK;
183 return H_NOT_FOUND;
184 }
185 if (atomic_read(&kvm->online_vcpus) == 1)
186 flags |= H_LOCAL;
187 vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK;
188 vcpu->arch.gpr[5] = r = hpte[1];
189 rb = compute_tlbie_rb(v, r, pte_index);
190 hpte[0] = 0;
191 if (!(flags & H_LOCAL)) {
192 while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
193 cpu_relax();
194 asm volatile("ptesync" : : : "memory");
195 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
196 : : "r" (rb), "r" (kvm->arch.lpid));
197 asm volatile("ptesync" : : : "memory");
198 kvm->arch.tlbie_lock = 0;
199 } else {
200 asm volatile("ptesync" : : : "memory");
201 asm volatile("tlbiel %0" : : "r" (rb));
202 asm volatile("ptesync" : : : "memory");
203 }
204 return H_SUCCESS;
205}
206
207long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
208{
209 struct kvm *kvm = vcpu->kvm;
210 unsigned long *args = &vcpu->arch.gpr[4];
211 unsigned long *hp, tlbrb[4];
212 long int i, found;
213 long int n_inval = 0;
214 unsigned long flags, req, pte_index;
215 long int local = 0;
216 long int ret = H_SUCCESS;
217
218 if (atomic_read(&kvm->online_vcpus) == 1)
219 local = 1;
220 for (i = 0; i < 4; ++i) {
221 pte_index = args[i * 2];
222 flags = pte_index >> 56;
223 pte_index &= ((1ul << 56) - 1);
224 req = flags >> 6;
225 flags &= 3;
226 if (req == 3)
227 break;
228 if (req != 1 || flags == 3 ||
229 pte_index >= (HPT_NPTEG << 3)) {
230 /* parameter error */
231 args[i * 2] = ((0xa0 | flags) << 56) + pte_index;
232 ret = H_PARAMETER;
233 break;
234 }
235 hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
236 while (!lock_hpte(hp, HPTE_V_HVLOCK))
237 cpu_relax();
238 found = 0;
239 if (hp[0] & HPTE_V_VALID) {
240 switch (flags & 3) {
241 case 0: /* absolute */
242 found = 1;
243 break;
244 case 1: /* andcond */
245 if (!(hp[0] & args[i * 2 + 1]))
246 found = 1;
247 break;
248 case 2: /* AVPN */
249 if ((hp[0] & ~0x7fUL) == args[i * 2 + 1])
250 found = 1;
251 break;
252 }
253 }
254 if (!found) {
255 hp[0] &= ~HPTE_V_HVLOCK;
256 args[i * 2] = ((0x90 | flags) << 56) + pte_index;
257 continue;
258 }
259 /* insert R and C bits from PTE */
260 flags |= (hp[1] >> 5) & 0x0c;
261 args[i * 2] = ((0x80 | flags) << 56) + pte_index;
262 tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index);
263 hp[0] = 0;
264 }
265 if (n_inval == 0)
266 return ret;
267
268 if (!local) {
269 while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
270 cpu_relax();
271 asm volatile("ptesync" : : : "memory");
272 for (i = 0; i < n_inval; ++i)
273 asm volatile(PPC_TLBIE(%1,%0)
274 : : "r" (tlbrb[i]), "r" (kvm->arch.lpid));
275 asm volatile("eieio; tlbsync; ptesync" : : : "memory");
276 kvm->arch.tlbie_lock = 0;
277 } else {
278 asm volatile("ptesync" : : : "memory");
279 for (i = 0; i < n_inval; ++i)
280 asm volatile("tlbiel %0" : : "r" (tlbrb[i]));
281 asm volatile("ptesync" : : : "memory");
282 }
283 return ret;
284}
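
The H_BULK_REMOVE handler above processes up to four request pairs from the guest's r4-r11. Each pair packs its control word as: flags in bits 63..56, pte_index in the low 56 bits, with the request type in the top two flag bits (1 = request, 3 = end of list) and the match type in the low two (0 = absolute, 1 = and-cond, 2 = AVPN). Written out as plain C, the decoding it performs is (sketch; the struct and helper are illustrative):

	/* Sketch of the H_BULK_REMOVE slot encoding handled above (per 8-byte pair). */
	struct bulk_remove_req {
		unsigned long ctrl;	/* args[i*2]:   flags in bits 63..56, index below */
		unsigned long match;	/* args[i*2+1]: AVPN or and-mask, per match type */
	};

	static void decode_slot(unsigned long ctrl, unsigned long *flags,
				unsigned long *req, unsigned long *pte_index)
	{
		*flags     = ctrl >> 56;
		*pte_index = ctrl & ((1ul << 56) - 1);
		*req       = *flags >> 6;	/* 1 = request, 3 = end of list */
		*flags    &= 3;			/* 0 = absolute, 1 = andcond, 2 = AVPN */
	}

	/* On return the handler rewrites the top byte of each control word:
	 * 0x80|flags = removed (with the HPTE's R/C bits folded in),
	 * 0x90|flags = not found, 0xa0|flags = parameter error. */
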
285
286long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
287 unsigned long pte_index, unsigned long avpn,
288 unsigned long va)
289{
290 struct kvm *kvm = vcpu->kvm;
291 unsigned long *hpte;
292 unsigned long v, r, rb;
293
294 if (pte_index >= (HPT_NPTEG << 3))
295 return H_PARAMETER;
296 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
297 while (!lock_hpte(hpte, HPTE_V_HVLOCK))
298 cpu_relax();
299 if ((hpte[0] & HPTE_V_VALID) == 0 ||
300 ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
301 hpte[0] &= ~HPTE_V_HVLOCK;
302 return H_NOT_FOUND;
303 }
304 if (atomic_read(&kvm->online_vcpus) == 1)
305 flags |= H_LOCAL;
306 v = hpte[0];
307 r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
308 HPTE_R_KEY_HI | HPTE_R_KEY_LO);
309 r |= (flags << 55) & HPTE_R_PP0;
310 r |= (flags << 48) & HPTE_R_KEY_HI;
311 r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
312 rb = compute_tlbie_rb(v, r, pte_index);
313 hpte[0] = v & ~HPTE_V_VALID;
314 if (!(flags & H_LOCAL)) {
315 while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
316 cpu_relax();
317 asm volatile("ptesync" : : : "memory");
318 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
319 : : "r" (rb), "r" (kvm->arch.lpid));
320 asm volatile("ptesync" : : : "memory");
321 kvm->arch.tlbie_lock = 0;
322 } else {
323 asm volatile("ptesync" : : : "memory");
324 asm volatile("tlbiel %0" : : "r" (rb));
325 asm volatile("ptesync" : : : "memory");
326 }
327 hpte[1] = r;
328 eieio();
329 hpte[0] = v & ~HPTE_V_HVLOCK;
330 asm volatile("ptesync" : : : "memory");
331 return H_SUCCESS;
332}
333
334static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
335{
336 long int i;
337 unsigned long offset, rpn;
338
339 offset = realaddr & (kvm->arch.ram_psize - 1);
340 rpn = (realaddr - offset) >> PAGE_SHIFT;
341 for (i = 0; i < kvm->arch.ram_npages; ++i)
342 if (rpn == kvm->arch.ram_pginfo[i].pfn)
343 return (i << PAGE_SHIFT) + offset;
344 return HPTE_R_RPN; /* all 1s in the RPN field */
345}
346
347long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
348 unsigned long pte_index)
349{
350 struct kvm *kvm = vcpu->kvm;
351 unsigned long *hpte, r;
352 int i, n = 1;
353
354 if (pte_index >= (HPT_NPTEG << 3))
355 return H_PARAMETER;
356 if (flags & H_READ_4) {
357 pte_index &= ~3;
358 n = 4;
359 }
360 for (i = 0; i < n; ++i, ++pte_index) {
361 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
362 r = hpte[1];
363 if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID))
364 r = reverse_xlate(kvm, r & HPTE_R_RPN) |
365 (r & ~HPTE_R_RPN);
366 vcpu->arch.gpr[4 + i * 2] = hpte[0];
367 vcpu->arch.gpr[5 + i * 2] = r;
368 }
369 return H_SUCCESS;
370}
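
All of the hcall handlers in this file index the hashed page table the same way: an HPTE is two doublewords (16 bytes), a PTEG is eight HPTEs (128 bytes), and with HPT_ORDER = 24 the 16 MB table holds 2^17 = 131072 PTEGs, i.e. HPT_NPTEG << 3 = 1048576 valid values of pte_index. The recurring address and bounds arithmetic, written out as plain C (a sketch; the helper names are illustrative):

	#define HPT_ORDER	24				/* 16 MB table, as in the patch */
	#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128 bytes per PTEG */

	/* Sketch of the indexing used by kvmppc_h_enter/h_remove/h_protect/h_read. */
	static inline unsigned long *hpte_addr(void *hpt_virt, unsigned long pte_index)
	{
		/* Each HPTE is 16 bytes: hpte[0] = V doubleword, hpte[1] = R doubleword. */
		return (unsigned long *)((char *)hpt_virt + (pte_index << 4));
	}

	static inline int pte_index_valid(unsigned long pte_index)
	{
		return pte_index < (HPT_NPTEG << 3);	/* 8 HPTEs per PTEG */
	}
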
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
new file mode 100644
index 000000000000..6dd33581a228
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -0,0 +1,1345 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
12 *
13 * Derived from book3s_rmhandlers.S and other files, which are:
14 *
15 * Copyright SUSE Linux Products GmbH 2009
16 *
17 * Authors: Alexander Graf <agraf@suse.de>
18 */
19
20#include <asm/ppc_asm.h>
21#include <asm/kvm_asm.h>
22#include <asm/reg.h>
23#include <asm/page.h>
24#include <asm/asm-offsets.h>
25#include <asm/exception-64s.h>
26
27/*****************************************************************************
28 * *
29 * Real Mode handlers that need to be in the linear mapping *
30 * *
31 ****************************************************************************/
32
33 .globl kvmppc_skip_interrupt
34kvmppc_skip_interrupt:
35 mfspr r13,SPRN_SRR0
36 addi r13,r13,4
37 mtspr SPRN_SRR0,r13
38 GET_SCRATCH0(r13)
39 rfid
40 b .
41
42 .globl kvmppc_skip_Hinterrupt
43kvmppc_skip_Hinterrupt:
44 mfspr r13,SPRN_HSRR0
45 addi r13,r13,4
46 mtspr SPRN_HSRR0,r13
47 GET_SCRATCH0(r13)
48 hrfid
49 b .
50
51/*
52 * Call kvmppc_handler_trampoline_enter in real mode.
53 * Must be called with interrupts hard-disabled.
54 *
55 * Input Registers:
56 *
57 * LR = return address to continue at after eventually re-enabling MMU
58 */
59_GLOBAL(kvmppc_hv_entry_trampoline)
60 mfmsr r10
61 LOAD_REG_ADDR(r5, kvmppc_hv_entry)
62 li r0,MSR_RI
63 andc r0,r10,r0
64 li r6,MSR_IR | MSR_DR
65 andc r6,r10,r6
66 mtmsrd r0,1 /* clear RI in MSR */
67 mtsrr0 r5
68 mtsrr1 r6
69 RFI
70
71#define ULONG_SIZE 8
72#define VCPU_GPR(n) (VCPU_GPRS + (n * ULONG_SIZE))
73
74/******************************************************************************
75 * *
76 * Entry code *
77 * *
78 *****************************************************************************/
79
80#define XICS_XIRR 4
81#define XICS_QIRR 0xc
82
83/*
84 * We come in here when wakened from nap mode on a secondary hw thread.
85 * Relocation is off and most register values are lost.
86 * r13 points to the PACA.
87 */
88 .globl kvm_start_guest
89kvm_start_guest:
90 ld r1,PACAEMERGSP(r13)
91 subi r1,r1,STACK_FRAME_OVERHEAD
92
93 /* get vcpu pointer */
94 ld r4, HSTATE_KVM_VCPU(r13)
95
96 /* We got here with an IPI; clear it */
97 ld r5, HSTATE_XICS_PHYS(r13)
98 li r0, 0xff
99 li r6, XICS_QIRR
100 li r7, XICS_XIRR
101 lwzcix r8, r5, r7 /* ack the interrupt */
102 sync
103 stbcix r0, r5, r6 /* clear it */
104 stwcix r8, r5, r7 /* EOI it */
105
106.global kvmppc_hv_entry
107kvmppc_hv_entry:
108
109 /* Required state:
110 *
111 * R4 = vcpu pointer
112 * MSR = ~IR|DR
113 * R13 = PACA
114 * R1 = host R1
115 * all other volatile GPRS = free
116 */
117 mflr r0
118 std r0, HSTATE_VMHANDLER(r13)
119
120 ld r14, VCPU_GPR(r14)(r4)
121 ld r15, VCPU_GPR(r15)(r4)
122 ld r16, VCPU_GPR(r16)(r4)
123 ld r17, VCPU_GPR(r17)(r4)
124 ld r18, VCPU_GPR(r18)(r4)
125 ld r19, VCPU_GPR(r19)(r4)
126 ld r20, VCPU_GPR(r20)(r4)
127 ld r21, VCPU_GPR(r21)(r4)
128 ld r22, VCPU_GPR(r22)(r4)
129 ld r23, VCPU_GPR(r23)(r4)
130 ld r24, VCPU_GPR(r24)(r4)
131 ld r25, VCPU_GPR(r25)(r4)
132 ld r26, VCPU_GPR(r26)(r4)
133 ld r27, VCPU_GPR(r27)(r4)
134 ld r28, VCPU_GPR(r28)(r4)
135 ld r29, VCPU_GPR(r29)(r4)
136 ld r30, VCPU_GPR(r30)(r4)
137 ld r31, VCPU_GPR(r31)(r4)
138
139 /* Load guest PMU registers */
140 /* R4 is live here (vcpu pointer) */
141 li r3, 1
142 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
143 mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
144 isync
145 lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */
146 lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */
147 lwz r6, VCPU_PMC + 8(r4)
148 lwz r7, VCPU_PMC + 12(r4)
149 lwz r8, VCPU_PMC + 16(r4)
150 lwz r9, VCPU_PMC + 20(r4)
151BEGIN_FTR_SECTION
152 lwz r10, VCPU_PMC + 24(r4)
153 lwz r11, VCPU_PMC + 28(r4)
154END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
155 mtspr SPRN_PMC1, r3
156 mtspr SPRN_PMC2, r5
157 mtspr SPRN_PMC3, r6
158 mtspr SPRN_PMC4, r7
159 mtspr SPRN_PMC5, r8
160 mtspr SPRN_PMC6, r9
161BEGIN_FTR_SECTION
162 mtspr SPRN_PMC7, r10
163 mtspr SPRN_PMC8, r11
164END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
165 ld r3, VCPU_MMCR(r4)
166 ld r5, VCPU_MMCR + 8(r4)
167 ld r6, VCPU_MMCR + 16(r4)
168 mtspr SPRN_MMCR1, r5
169 mtspr SPRN_MMCRA, r6
170 mtspr SPRN_MMCR0, r3
171 isync
172
173 /* Load up FP, VMX and VSX registers */
174 bl kvmppc_load_fp
175
176BEGIN_FTR_SECTION
177 /* Switch DSCR to guest value */
178 ld r5, VCPU_DSCR(r4)
179 mtspr SPRN_DSCR, r5
180END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
181
182 /*
183 * Set the decrementer to the guest decrementer.
184 */
185 ld r8,VCPU_DEC_EXPIRES(r4)
186 mftb r7
187 subf r3,r7,r8
188 mtspr SPRN_DEC,r3
189 stw r3,VCPU_DEC(r4)
190
191 ld r5, VCPU_SPRG0(r4)
192 ld r6, VCPU_SPRG1(r4)
193 ld r7, VCPU_SPRG2(r4)
194 ld r8, VCPU_SPRG3(r4)
195 mtspr SPRN_SPRG0, r5
196 mtspr SPRN_SPRG1, r6
197 mtspr SPRN_SPRG2, r7
198 mtspr SPRN_SPRG3, r8
199
200 /* Save R1 in the PACA */
201 std r1, HSTATE_HOST_R1(r13)
202
203 /* Increment yield count if they have a VPA */
204 ld r3, VCPU_VPA(r4)
205 cmpdi r3, 0
206 beq 25f
207 lwz r5, LPPACA_YIELDCOUNT(r3)
208 addi r5, r5, 1
209 stw r5, LPPACA_YIELDCOUNT(r3)
21025:
211 /* Load up DAR and DSISR */
212 ld r5, VCPU_DAR(r4)
213 lwz r6, VCPU_DSISR(r4)
214 mtspr SPRN_DAR, r5
215 mtspr SPRN_DSISR, r6
216
217 /* Set partition DABR */
218 li r5,3
219 ld r6,VCPU_DABR(r4)
220 mtspr SPRN_DABRX,r5
221 mtspr SPRN_DABR,r6
222
223BEGIN_FTR_SECTION
224 /* Restore AMR and UAMOR, set AMOR to all 1s */
225 ld r5,VCPU_AMR(r4)
226 ld r6,VCPU_UAMOR(r4)
227 li r7,-1
228 mtspr SPRN_AMR,r5
229 mtspr SPRN_UAMOR,r6
230 mtspr SPRN_AMOR,r7
231END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
232
233 /* Clear out SLB */
234 li r6,0
235 slbmte r6,r6
236 slbia
237 ptesync
238
239BEGIN_FTR_SECTION
240 b 30f
241END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
242 /*
243 * POWER7 host -> guest partition switch code.
244 * We don't have to lock against concurrent tlbies,
245 * but we do have to coordinate across hardware threads.
246 */
247 /* Increment entry count iff exit count is zero. */
248 ld r5,HSTATE_KVM_VCORE(r13)
249 addi r9,r5,VCORE_ENTRY_EXIT
25021: lwarx r3,0,r9
251 cmpwi r3,0x100 /* any threads starting to exit? */
252 bge secondary_too_late /* if so we're too late to the party */
253 addi r3,r3,1
254 stwcx. r3,0,r9
255 bne 21b
256
257 /* Primary thread switches to guest partition. */
258 ld r9,VCPU_KVM(r4) /* pointer to struct kvm */
259 lwz r6,VCPU_PTID(r4)
260 cmpwi r6,0
261 bne 20f
262 ld r6,KVM_SDR1(r9)
263 lwz r7,KVM_LPID(r9)
264 li r0,LPID_RSVD /* switch to reserved LPID */
265 mtspr SPRN_LPID,r0
266 ptesync
267 mtspr SPRN_SDR1,r6 /* switch to partition page table */
268 mtspr SPRN_LPID,r7
269 isync
270 li r0,1
271 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */
272 b 10f
273
274 /* Secondary threads wait for primary to have done partition switch */
27520: lbz r0,VCORE_IN_GUEST(r5)
276 cmpwi r0,0
277 beq 20b
278
279 /* Set LPCR. Set the MER bit if there is a pending external irq. */
28010: ld r8,KVM_LPCR(r9)
281 ld r0,VCPU_PENDING_EXC(r4)
282 li r7,(1 << BOOK3S_IRQPRIO_EXTERNAL)
283 oris r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
284 and. r0,r0,r7
285 beq 11f
286 ori r8,r8,LPCR_MER
28711: mtspr SPRN_LPCR,r8
288 ld r8,KVM_RMOR(r9)
289 mtspr SPRN_RMOR,r8
290 isync
291
292 /* Check if HDEC expires soon */
293 mfspr r3,SPRN_HDEC
294 cmpwi r3,10
295 li r12,BOOK3S_INTERRUPT_HV_DECREMENTER
296 mr r9,r4
297 blt hdec_soon
298
299 /*
300 * Invalidate the TLB if we could possibly have stale TLB
301 * entries for this partition on this core due to the use
302 * of tlbiel.
303 * XXX maybe only need this on primary thread?
304 */
305 ld r9,VCPU_KVM(r4) /* pointer to struct kvm */
306 lwz r5,VCPU_VCPUID(r4)
307 lhz r6,PACAPACAINDEX(r13)
308 rldimi r6,r5,0,62 /* XXX map as if threads 1:1 p:v */
309 lhz r8,VCPU_LAST_CPU(r4)
310 sldi r7,r6,1 /* see if this is the same vcpu */
311 add r7,r7,r9 /* as last ran on this pcpu */
312 lhz r0,KVM_LAST_VCPU(r7)
313 cmpw r6,r8 /* on the same cpu core as last time? */
314 bne 3f
315 cmpw r0,r5 /* same vcpu as this core last ran? */
316 beq 1f
3173: sth r6,VCPU_LAST_CPU(r4) /* if not, invalidate partition TLB */
318 sth r5,KVM_LAST_VCPU(r7)
319 li r6,128
320 mtctr r6
321 li r7,0x800 /* IS field = 0b10 */
322 ptesync
3232: tlbiel r7
324 addi r7,r7,0x1000
325 bdnz 2b
326 ptesync
3271:
328
329 /* Save purr/spurr */
330 mfspr r5,SPRN_PURR
331 mfspr r6,SPRN_SPURR
332 std r5,HSTATE_PURR(r13)
333 std r6,HSTATE_SPURR(r13)
334 ld r7,VCPU_PURR(r4)
335 ld r8,VCPU_SPURR(r4)
336 mtspr SPRN_PURR,r7
337 mtspr SPRN_SPURR,r8
338 b 31f
339
340 /*
341 * PPC970 host -> guest partition switch code.
342 * We have to lock against concurrent tlbies,
343 * using native_tlbie_lock to lock against host tlbies
344 * and kvm->arch.tlbie_lock to lock against guest tlbies.
345 * We also have to invalidate the TLB since its
346 * entries aren't tagged with the LPID.
347 */
34830: ld r9,VCPU_KVM(r4) /* pointer to struct kvm */
349
350 /* first take native_tlbie_lock */
351 .section ".toc","aw"
352toc_tlbie_lock:
353 .tc native_tlbie_lock[TC],native_tlbie_lock
354 .previous
355 ld r3,toc_tlbie_lock@toc(2)
356 lwz r8,PACA_LOCK_TOKEN(r13)
35724: lwarx r0,0,r3
358 cmpwi r0,0
359 bne 24b
360 stwcx. r8,0,r3
361 bne 24b
362 isync
363
364 ld r7,KVM_LPCR(r9) /* use kvm->arch.lpcr to store HID4 */
365 li r0,0x18f
366 rotldi r0,r0,HID4_LPID5_SH /* all lpid bits in HID4 = 1 */
367 or r0,r7,r0
368 ptesync
369 sync
370 mtspr SPRN_HID4,r0 /* switch to reserved LPID */
371 isync
372 li r0,0
373 stw r0,0(r3) /* drop native_tlbie_lock */
374
375 /* invalidate the whole TLB */
376 li r0,256
377 mtctr r0
378 li r6,0
37925: tlbiel r6
380 addi r6,r6,0x1000
381 bdnz 25b
382 ptesync
383
384 /* Take the guest's tlbie_lock */
385 addi r3,r9,KVM_TLBIE_LOCK
38624: lwarx r0,0,r3
387 cmpwi r0,0
388 bne 24b
389 stwcx. r8,0,r3
390 bne 24b
391 isync
392 ld r6,KVM_SDR1(r9)
393 mtspr SPRN_SDR1,r6 /* switch to partition page table */
394
395 /* Set up HID4 with the guest's LPID etc. */
396 sync
397 mtspr SPRN_HID4,r7
398 isync
399
400 /* drop the guest's tlbie_lock */
401 li r0,0
402 stw r0,0(r3)
403
404 /* Check if HDEC expires soon */
405 mfspr r3,SPRN_HDEC
406 cmpwi r3,10
407 li r12,BOOK3S_INTERRUPT_HV_DECREMENTER
408 mr r9,r4
409 blt hdec_soon
410
411 /* Enable HDEC interrupts */
412 mfspr r0,SPRN_HID0
413 li r3,1
414 rldimi r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1
415 sync
416 mtspr SPRN_HID0,r0
417 mfspr r0,SPRN_HID0
418 mfspr r0,SPRN_HID0
419 mfspr r0,SPRN_HID0
420 mfspr r0,SPRN_HID0
421 mfspr r0,SPRN_HID0
422 mfspr r0,SPRN_HID0
423
424 /* Load up guest SLB entries */
42531: lwz r5,VCPU_SLB_MAX(r4)
426 cmpwi r5,0
427 beq 9f
428 mtctr r5
429 addi r6,r4,VCPU_SLB
4301: ld r8,VCPU_SLB_E(r6)
431 ld r9,VCPU_SLB_V(r6)
432 slbmte r9,r8
433 addi r6,r6,VCPU_SLB_SIZE
434 bdnz 1b
4359:
436
437 /* Restore state of CTRL run bit; assume 1 on entry */
438 lwz r5,VCPU_CTRL(r4)
439 andi. r5,r5,1
440 bne 4f
441 mfspr r6,SPRN_CTRLF
442 clrrdi r6,r6,1
443 mtspr SPRN_CTRLT,r6
4444:
445 ld r6, VCPU_CTR(r4)
446 lwz r7, VCPU_XER(r4)
447
448 mtctr r6
449 mtxer r7
450
451 /* Move SRR0 and SRR1 into the respective regs */
452 ld r6, VCPU_SRR0(r4)
453 ld r7, VCPU_SRR1(r4)
454 mtspr SPRN_SRR0, r6
455 mtspr SPRN_SRR1, r7
456
457 ld r10, VCPU_PC(r4)
458
459 ld r11, VCPU_MSR(r4) /* r10 = vcpu->arch.msr & ~MSR_HV */
460 rldicl r11, r11, 63 - MSR_HV_LG, 1
461 rotldi r11, r11, 1 + MSR_HV_LG
462 ori r11, r11, MSR_ME
463
464fast_guest_return:
465 mtspr SPRN_HSRR0,r10
466 mtspr SPRN_HSRR1,r11
467
468 /* Activate guest mode, so faults get handled by KVM */
469 li r9, KVM_GUEST_MODE_GUEST
470 stb r9, HSTATE_IN_GUEST(r13)
471
472 /* Enter guest */
473
474 ld r5, VCPU_LR(r4)
475 lwz r6, VCPU_CR(r4)
476 mtlr r5
477 mtcr r6
478
479 ld r0, VCPU_GPR(r0)(r4)
480 ld r1, VCPU_GPR(r1)(r4)
481 ld r2, VCPU_GPR(r2)(r4)
482 ld r3, VCPU_GPR(r3)(r4)
483 ld r5, VCPU_GPR(r5)(r4)
484 ld r6, VCPU_GPR(r6)(r4)
485 ld r7, VCPU_GPR(r7)(r4)
486 ld r8, VCPU_GPR(r8)(r4)
487 ld r9, VCPU_GPR(r9)(r4)
488 ld r10, VCPU_GPR(r10)(r4)
489 ld r11, VCPU_GPR(r11)(r4)
490 ld r12, VCPU_GPR(r12)(r4)
491 ld r13, VCPU_GPR(r13)(r4)
492
493 ld r4, VCPU_GPR(r4)(r4)
494
495 hrfid
496 b .
497
498/******************************************************************************
499 * *
500 * Exit code *
501 * *
502 *****************************************************************************/
503
504/*
505 * We come here from the first-level interrupt handlers.
506 */
507 .globl kvmppc_interrupt
508kvmppc_interrupt:
509 /*
510 * Register contents:
511 * R12 = interrupt vector
512 * R13 = PACA
513 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
514 * guest R13 saved in SPRN_SCRATCH0
515 */
516 /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
517 std r9, HSTATE_HOST_R2(r13)
518 ld r9, HSTATE_KVM_VCPU(r13)
519
520 /* Save registers */
521
522 std r0, VCPU_GPR(r0)(r9)
523 std r1, VCPU_GPR(r1)(r9)
524 std r2, VCPU_GPR(r2)(r9)
525 std r3, VCPU_GPR(r3)(r9)
526 std r4, VCPU_GPR(r4)(r9)
527 std r5, VCPU_GPR(r5)(r9)
528 std r6, VCPU_GPR(r6)(r9)
529 std r7, VCPU_GPR(r7)(r9)
530 std r8, VCPU_GPR(r8)(r9)
531 ld r0, HSTATE_HOST_R2(r13)
532 std r0, VCPU_GPR(r9)(r9)
533 std r10, VCPU_GPR(r10)(r9)
534 std r11, VCPU_GPR(r11)(r9)
535 ld r3, HSTATE_SCRATCH0(r13)
536 lwz r4, HSTATE_SCRATCH1(r13)
537 std r3, VCPU_GPR(r12)(r9)
538 stw r4, VCPU_CR(r9)
539
540 /* Restore R1/R2 so we can handle faults */
541 ld r1, HSTATE_HOST_R1(r13)
542 ld r2, PACATOC(r13)
543
544 mfspr r10, SPRN_SRR0
545 mfspr r11, SPRN_SRR1
546 std r10, VCPU_SRR0(r9)
547 std r11, VCPU_SRR1(r9)
548 andi. r0, r12, 2 /* need to read HSRR0/1? */
549 beq 1f
550 mfspr r10, SPRN_HSRR0
551 mfspr r11, SPRN_HSRR1
552 clrrdi r12, r12, 2
5531: std r10, VCPU_PC(r9)
554 std r11, VCPU_MSR(r9)
555
556 GET_SCRATCH0(r3)
557 mflr r4
558 std r3, VCPU_GPR(r13)(r9)
559 std r4, VCPU_LR(r9)
560
561 /* Unset guest mode */
562 li r0, KVM_GUEST_MODE_NONE
563 stb r0, HSTATE_IN_GUEST(r13)
564
565 stw r12,VCPU_TRAP(r9)
566
567 /* See if this is a leftover HDEC interrupt */
568 cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER
569 bne 2f
570 mfspr r3,SPRN_HDEC
571 cmpwi r3,0
572 bge ignore_hdec
5732:
574 /* See if this is something we can handle in real mode */
575 cmpwi r12,BOOK3S_INTERRUPT_SYSCALL
576 beq hcall_try_real_mode
577hcall_real_cont:
578
579 /* Check for mediated interrupts (could be done earlier really ...) */
580BEGIN_FTR_SECTION
581 cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL
582 bne+ 1f
583 ld r5,VCPU_KVM(r9)
584 ld r5,KVM_LPCR(r5)
585 andi. r0,r11,MSR_EE
586 beq 1f
587 andi. r0,r5,LPCR_MER
588 bne bounce_ext_interrupt
5891:
590END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
591
592 /* Save DEC */
593 mfspr r5,SPRN_DEC
594 mftb r6
595 extsw r5,r5
596 add r5,r5,r6
597 std r5,VCPU_DEC_EXPIRES(r9)
598
599 /* Save HEIR (HV emulation assist reg) in last_inst
600 if this is an HEI (HV emulation interrupt, e40) */
601 li r3,-1
602BEGIN_FTR_SECTION
603 cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
604 bne 11f
605 mfspr r3,SPRN_HEIR
606END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
60711: stw r3,VCPU_LAST_INST(r9)
608
609 /* Save more register state */
610 mfxer r5
611 mfdar r6
612 mfdsisr r7
613 mfctr r8
614
615 stw r5, VCPU_XER(r9)
616 std r6, VCPU_DAR(r9)
617 stw r7, VCPU_DSISR(r9)
618 std r8, VCPU_CTR(r9)
619 /* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */
620BEGIN_FTR_SECTION
621 cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
622 beq 6f
623END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
6247: std r6, VCPU_FAULT_DAR(r9)
625 stw r7, VCPU_FAULT_DSISR(r9)
626
627 /* Save guest CTRL register, set runlatch to 1 */
628 mfspr r6,SPRN_CTRLF
629 stw r6,VCPU_CTRL(r9)
630 andi. r0,r6,1
631 bne 4f
632 ori r6,r6,1
633 mtspr SPRN_CTRLT,r6
6344:
635 /* Read the guest SLB and save it away */
636 lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */
637 mtctr r0
638 li r6,0
639 addi r7,r9,VCPU_SLB
640 li r5,0
6411: slbmfee r8,r6
642 andis. r0,r8,SLB_ESID_V@h
643 beq 2f
644 add r8,r8,r6 /* put index in */
645 slbmfev r3,r6
646 std r8,VCPU_SLB_E(r7)
647 std r3,VCPU_SLB_V(r7)
648 addi r7,r7,VCPU_SLB_SIZE
649 addi r5,r5,1
6502: addi r6,r6,1
651 bdnz 1b
652 stw r5,VCPU_SLB_MAX(r9)
653
654 /*
655 * Save the guest PURR/SPURR
656 */
657BEGIN_FTR_SECTION
658 mfspr r5,SPRN_PURR
659 mfspr r6,SPRN_SPURR
660 ld r7,VCPU_PURR(r9)
661 ld r8,VCPU_SPURR(r9)
662 std r5,VCPU_PURR(r9)
663 std r6,VCPU_SPURR(r9)
664 subf r5,r7,r5
665 subf r6,r8,r6
666
667 /*
668 * Restore host PURR/SPURR and add guest times
669 * so that the time in the guest gets accounted.
670 */
671 ld r3,HSTATE_PURR(r13)
672 ld r4,HSTATE_SPURR(r13)
673 add r3,r3,r5
674 add r4,r4,r6
675 mtspr SPRN_PURR,r3
676 mtspr SPRN_SPURR,r4
677END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
678
679 /* Clear out SLB */
680 li r5,0
681 slbmte r5,r5
682 slbia
683 ptesync
684
685hdec_soon:
686BEGIN_FTR_SECTION
687 b 32f
688END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
689 /*
690 * POWER7 guest -> host partition switch code.
691 * We don't have to lock against tlbies but we do
692 * have to coordinate the hardware threads.
693 */
694 /* Increment the threads-exiting-guest count in the 0xff00
695 bits of vcore->entry_exit_count */
696 lwsync
697 ld r5,HSTATE_KVM_VCORE(r13)
698 addi r6,r5,VCORE_ENTRY_EXIT
69941: lwarx r3,0,r6
700 addi r0,r3,0x100
701 stwcx. r0,0,r6
702 bne 41b
703
704 /*
705 * At this point we have an interrupt that we have to pass
706 * up to the kernel or qemu; we can't handle it in real mode.
707 * Thus we have to do a partition switch, so we have to
708 * collect the other threads, if we are the first thread
709 * to take an interrupt. To do this, we set the HDEC to 0,
710 * which causes an HDEC interrupt in all threads within 2ns
711 * because the HDEC register is shared between all 4 threads.
712 * However, we don't need to bother if this is an HDEC
713 * interrupt, since the other threads will already be on their
714 * way here in that case.
715 */
716 cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER
717 beq 40f
718 cmpwi r3,0x100 /* Are we the first here? */
719 bge 40f
720 cmpwi r3,1
721 ble 40f
722 li r0,0
723 mtspr SPRN_HDEC,r0
72440:
725
726 /* Secondary threads wait for primary to do partition switch */
727 ld r4,VCPU_KVM(r9) /* pointer to struct kvm */
728 ld r5,HSTATE_KVM_VCORE(r13)
729 lwz r3,VCPU_PTID(r9)
730 cmpwi r3,0
731 beq 15f
732 HMT_LOW
73313: lbz r3,VCORE_IN_GUEST(r5)
734 cmpwi r3,0
735 bne 13b
736 HMT_MEDIUM
737 b 16f
738
739 /* Primary thread waits for all the secondaries to exit guest */
74015: lwz r3,VCORE_ENTRY_EXIT(r5)
741 srwi r0,r3,8
742 clrldi r3,r3,56
743 cmpw r3,r0
744 bne 15b
745 isync
746
747 /* Primary thread switches back to host partition */
748 ld r6,KVM_HOST_SDR1(r4)
749 lwz r7,KVM_HOST_LPID(r4)
750 li r8,LPID_RSVD /* switch to reserved LPID */
751 mtspr SPRN_LPID,r8
752 ptesync
753 mtspr SPRN_SDR1,r6 /* switch to partition page table */
754 mtspr SPRN_LPID,r7
755 isync
756 li r0,0
757 stb r0,VCORE_IN_GUEST(r5)
758 lis r8,0x7fff /* MAX_INT@h */
759 mtspr SPRN_HDEC,r8
760
76116: ld r8,KVM_HOST_LPCR(r4)
762 mtspr SPRN_LPCR,r8
763 isync
764 b 33f
765
766 /*
767 * PPC970 guest -> host partition switch code.
768 * We have to lock against concurrent tlbies, and
769 * we have to flush the whole TLB.
770 */
77132: ld r4,VCPU_KVM(r9) /* pointer to struct kvm */
772
773 /* Take the guest's tlbie_lock */
774 lwz r8,PACA_LOCK_TOKEN(r13)
775 addi r3,r4,KVM_TLBIE_LOCK
77624: lwarx r0,0,r3
777 cmpwi r0,0
778 bne 24b
779 stwcx. r8,0,r3
780 bne 24b
781 isync
782
783 ld r7,KVM_HOST_LPCR(r4) /* use kvm->arch.host_lpcr for HID4 */
784 li r0,0x18f
785 rotldi r0,r0,HID4_LPID5_SH /* all lpid bits in HID4 = 1 */
786 or r0,r7,r0
787 ptesync
788 sync
789 mtspr SPRN_HID4,r0 /* switch to reserved LPID */
790 isync
791 li r0,0
792 stw r0,0(r3) /* drop guest tlbie_lock */
793
794 /* invalidate the whole TLB */
795 li r0,256
796 mtctr r0
797 li r6,0
79825: tlbiel r6
799 addi r6,r6,0x1000
800 bdnz 25b
801 ptesync
802
803 /* take native_tlbie_lock */
804 ld r3,toc_tlbie_lock@toc(2)
80524: lwarx r0,0,r3
806 cmpwi r0,0
807 bne 24b
808 stwcx. r8,0,r3
809 bne 24b
810 isync
811
812 ld r6,KVM_HOST_SDR1(r4)
813 mtspr SPRN_SDR1,r6 /* switch to host page table */
814
815 /* Set up host HID4 value */
816 sync
817 mtspr SPRN_HID4,r7
818 isync
819 li r0,0
820 stw r0,0(r3) /* drop native_tlbie_lock */
821
822 lis r8,0x7fff /* MAX_INT@h */
823 mtspr SPRN_HDEC,r8
824
825 /* Disable HDEC interrupts */
826 mfspr r0,SPRN_HID0
827 li r3,0
828 rldimi r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1
829 sync
830 mtspr SPRN_HID0,r0
831 mfspr r0,SPRN_HID0
832 mfspr r0,SPRN_HID0
833 mfspr r0,SPRN_HID0
834 mfspr r0,SPRN_HID0
835 mfspr r0,SPRN_HID0
836 mfspr r0,SPRN_HID0
837
838 /* load host SLB entries */
83933: ld r8,PACA_SLBSHADOWPTR(r13)
840
841 .rept SLB_NUM_BOLTED
842 ld r5,SLBSHADOW_SAVEAREA(r8)
843 ld r6,SLBSHADOW_SAVEAREA+8(r8)
844 andis. r7,r5,SLB_ESID_V@h
845 beq 1f
846 slbmte r6,r5
8471: addi r8,r8,16
848 .endr
849
850 /* Save and reset AMR and UAMOR before turning on the MMU */
851BEGIN_FTR_SECTION
852 mfspr r5,SPRN_AMR
853 mfspr r6,SPRN_UAMOR
854 std r5,VCPU_AMR(r9)
855 std r6,VCPU_UAMOR(r9)
856 li r6,0
857 mtspr SPRN_AMR,r6
858END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
859
860 /* Restore host DABR and DABRX */
861 ld r5,HSTATE_DABR(r13)
862 li r6,7
863 mtspr SPRN_DABR,r5
864 mtspr SPRN_DABRX,r6
865
866 /* Switch DSCR back to host value */
867BEGIN_FTR_SECTION
868 mfspr r8, SPRN_DSCR
869 ld r7, HSTATE_DSCR(r13)
870 std r8, VCPU_DSCR(r7)
871 mtspr SPRN_DSCR, r7
872END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
873
874 /* Save non-volatile GPRs */
875 std r14, VCPU_GPR(r14)(r9)
876 std r15, VCPU_GPR(r15)(r9)
877 std r16, VCPU_GPR(r16)(r9)
878 std r17, VCPU_GPR(r17)(r9)
879 std r18, VCPU_GPR(r18)(r9)
880 std r19, VCPU_GPR(r19)(r9)
881 std r20, VCPU_GPR(r20)(r9)
882 std r21, VCPU_GPR(r21)(r9)
883 std r22, VCPU_GPR(r22)(r9)
884 std r23, VCPU_GPR(r23)(r9)
885 std r24, VCPU_GPR(r24)(r9)
886 std r25, VCPU_GPR(r25)(r9)
887 std r26, VCPU_GPR(r26)(r9)
888 std r27, VCPU_GPR(r27)(r9)
889 std r28, VCPU_GPR(r28)(r9)
890 std r29, VCPU_GPR(r29)(r9)
891 std r30, VCPU_GPR(r30)(r9)
892 std r31, VCPU_GPR(r31)(r9)
893
894 /* Save SPRGs */
895 mfspr r3, SPRN_SPRG0
896 mfspr r4, SPRN_SPRG1
897 mfspr r5, SPRN_SPRG2
898 mfspr r6, SPRN_SPRG3
899 std r3, VCPU_SPRG0(r9)
900 std r4, VCPU_SPRG1(r9)
901 std r5, VCPU_SPRG2(r9)
902 std r6, VCPU_SPRG3(r9)
903
904 /* Increment yield count if they have a VPA */
905 ld r8, VCPU_VPA(r9) /* do they have a VPA? */
906 cmpdi r8, 0
907 beq 25f
908 lwz r3, LPPACA_YIELDCOUNT(r8)
909 addi r3, r3, 1
910 stw r3, LPPACA_YIELDCOUNT(r8)
91125:
912 /* Save PMU registers if requested */
913 /* r8 and cr0.eq are live here */
914 li r3, 1
915 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
916 mfspr r4, SPRN_MMCR0 /* save MMCR0 */
917 mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
918 isync
919 beq 21f /* if no VPA, save PMU stuff anyway */
920 lbz r7, LPPACA_PMCINUSE(r8)
921 cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */
922 bne 21f
923 std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */
924 b 22f
92521: mfspr r5, SPRN_MMCR1
926 mfspr r6, SPRN_MMCRA
927 std r4, VCPU_MMCR(r9)
928 std r5, VCPU_MMCR + 8(r9)
929 std r6, VCPU_MMCR + 16(r9)
930 mfspr r3, SPRN_PMC1
931 mfspr r4, SPRN_PMC2
932 mfspr r5, SPRN_PMC3
933 mfspr r6, SPRN_PMC4
934 mfspr r7, SPRN_PMC5
935 mfspr r8, SPRN_PMC6
936BEGIN_FTR_SECTION
937 mfspr r10, SPRN_PMC7
938 mfspr r11, SPRN_PMC8
939END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
940 stw r3, VCPU_PMC(r9)
941 stw r4, VCPU_PMC + 4(r9)
942 stw r5, VCPU_PMC + 8(r9)
943 stw r6, VCPU_PMC + 12(r9)
944 stw r7, VCPU_PMC + 16(r9)
945 stw r8, VCPU_PMC + 20(r9)
946BEGIN_FTR_SECTION
947 stw r10, VCPU_PMC + 24(r9)
948 stw r11, VCPU_PMC + 28(r9)
949END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
95022:
951 /* save FP state */
952 mr r3, r9
953 bl .kvmppc_save_fp
954
955 /* Secondary threads go off to take a nap on POWER7 */
956BEGIN_FTR_SECTION
957 lwz r0,VCPU_PTID(r3)
958 cmpwi r0,0
959 bne secondary_nap
960END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
961
962 /*
963 * Reload DEC. HDEC interrupts were disabled when
964 * we reloaded the host's LPCR value.
965 */
966 ld r3, HSTATE_DECEXP(r13)
967 mftb r4
968 subf r4, r4, r3
969 mtspr SPRN_DEC, r4
970
971 /* Reload the host's PMU registers */
972 ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */
973 lbz r4, LPPACA_PMCINUSE(r3)
974 cmpwi r4, 0
975 beq 23f /* skip if not */
976 lwz r3, HSTATE_PMC(r13)
977 lwz r4, HSTATE_PMC + 4(r13)
978 lwz r5, HSTATE_PMC + 8(r13)
979 lwz r6, HSTATE_PMC + 12(r13)
980 lwz r8, HSTATE_PMC + 16(r13)
981 lwz r9, HSTATE_PMC + 20(r13)
982BEGIN_FTR_SECTION
983 lwz r10, HSTATE_PMC + 24(r13)
984 lwz r11, HSTATE_PMC + 28(r13)
985END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
986 mtspr SPRN_PMC1, r3
987 mtspr SPRN_PMC2, r4
988 mtspr SPRN_PMC3, r5
989 mtspr SPRN_PMC4, r6
990 mtspr SPRN_PMC5, r8
991 mtspr SPRN_PMC6, r9
992BEGIN_FTR_SECTION
993 mtspr SPRN_PMC7, r10
994 mtspr SPRN_PMC8, r11
995END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
996 ld r3, HSTATE_MMCR(r13)
997 ld r4, HSTATE_MMCR + 8(r13)
998 ld r5, HSTATE_MMCR + 16(r13)
999 mtspr SPRN_MMCR1, r4
1000 mtspr SPRN_MMCRA, r5
1001 mtspr SPRN_MMCR0, r3
1002 isync
100323:
1004 /*
1005 * For external and machine check interrupts, we need
1006 * to call the Linux handler to process the interrupt.
1007 * We do that by jumping to the interrupt vector address
1008 * which we have in r12. The [h]rfid at the end of the
1009 * handler will return to the book3s_hv_interrupts.S code.
1010 * For other interrupts we do the rfid to get back
1011 * to the book3s_interrupts.S code here.
1012 */
1013 ld r8, HSTATE_VMHANDLER(r13)
1014 ld r7, HSTATE_HOST_MSR(r13)
1015
1016 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
1017 beq 11f
1018 cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK
1019
1020 /* RFI into the highmem handler, or branch to interrupt handler */
102112: mfmsr r6
1022 mtctr r12
1023 li r0, MSR_RI
1024 andc r6, r6, r0
1025 mtmsrd r6, 1 /* Clear RI in MSR */
1026 mtsrr0 r8
1027 mtsrr1 r7
1028 beqctr
1029 RFI
1030
103111:
1032BEGIN_FTR_SECTION
1033 b 12b
1034END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
1035 mtspr SPRN_HSRR0, r8
1036 mtspr SPRN_HSRR1, r7
1037 ba 0x500
1038
10396: mfspr r6,SPRN_HDAR
1040 mfspr r7,SPRN_HDSISR
1041 b 7b
1042
1043/*
1044 * Try to handle an hcall in real mode.
1045 * Returns to the guest if we handle it, or continues on up to
1046 * the kernel if we can't (i.e. if we don't have a handler for
1047 * it, or if the handler returns H_TOO_HARD).
1048 */
1049 .globl hcall_try_real_mode
1050hcall_try_real_mode:
1051 ld r3,VCPU_GPR(r3)(r9)
1052 andi. r0,r11,MSR_PR
1053 bne hcall_real_cont
1054 clrrdi r3,r3,2
1055 cmpldi r3,hcall_real_table_end - hcall_real_table
1056 bge hcall_real_cont
1057 LOAD_REG_ADDR(r4, hcall_real_table)
1058 lwzx r3,r3,r4
1059 cmpwi r3,0
1060 beq hcall_real_cont
1061 add r3,r3,r4
1062 mtctr r3
1063 mr r3,r9 /* get vcpu pointer */
1064 ld r4,VCPU_GPR(r4)(r9)
1065 bctrl
1066 cmpdi r3,H_TOO_HARD
1067 beq hcall_real_fallback
1068 ld r4,HSTATE_KVM_VCPU(r13)
1069 std r3,VCPU_GPR(r3)(r4)
1070 ld r10,VCPU_PC(r4)
1071 ld r11,VCPU_MSR(r4)
1072 b fast_guest_return
1073
1074 /* We've attempted a real mode hcall, but it's punted it back
1075 * to userspace. We need to restore some clobbered volatiles
1076 * before resuming the pass-it-to-qemu path */
1077hcall_real_fallback:
1078 li r12,BOOK3S_INTERRUPT_SYSCALL
1079 ld r9, HSTATE_KVM_VCPU(r13)
1080 ld r11, VCPU_MSR(r9)
1081
1082 b hcall_real_cont
1083
1084 .globl hcall_real_table
1085hcall_real_table:
1086 .long 0 /* 0 - unused */
1087 .long .kvmppc_h_remove - hcall_real_table
1088 .long .kvmppc_h_enter - hcall_real_table
1089 .long .kvmppc_h_read - hcall_real_table
1090 .long 0 /* 0x10 - H_CLEAR_MOD */
1091 .long 0 /* 0x14 - H_CLEAR_REF */
1092 .long .kvmppc_h_protect - hcall_real_table
1093 .long 0 /* 0x1c - H_GET_TCE */
1094 .long .kvmppc_h_put_tce - hcall_real_table
1095 .long 0 /* 0x24 - H_SET_SPRG0 */
1096 .long .kvmppc_h_set_dabr - hcall_real_table
1097 .long 0 /* 0x2c */
1098 .long 0 /* 0x30 */
1099 .long 0 /* 0x34 */
1100 .long 0 /* 0x38 */
1101 .long 0 /* 0x3c */
1102 .long 0 /* 0x40 */
1103 .long 0 /* 0x44 */
1104 .long 0 /* 0x48 */
1105 .long 0 /* 0x4c */
1106 .long 0 /* 0x50 */
1107 .long 0 /* 0x54 */
1108 .long 0 /* 0x58 */
1109 .long 0 /* 0x5c */
1110 .long 0 /* 0x60 */
1111 .long 0 /* 0x64 */
1112 .long 0 /* 0x68 */
1113 .long 0 /* 0x6c */
1114 .long 0 /* 0x70 */
1115 .long 0 /* 0x74 */
1116 .long 0 /* 0x78 */
1117 .long 0 /* 0x7c */
1118 .long 0 /* 0x80 */
1119 .long 0 /* 0x84 */
1120 .long 0 /* 0x88 */
1121 .long 0 /* 0x8c */
1122 .long 0 /* 0x90 */
1123 .long 0 /* 0x94 */
1124 .long 0 /* 0x98 */
1125 .long 0 /* 0x9c */
1126 .long 0 /* 0xa0 */
1127 .long 0 /* 0xa4 */
1128 .long 0 /* 0xa8 */
1129 .long 0 /* 0xac */
1130 .long 0 /* 0xb0 */
1131 .long 0 /* 0xb4 */
1132 .long 0 /* 0xb8 */
1133 .long 0 /* 0xbc */
1134 .long 0 /* 0xc0 */
1135 .long 0 /* 0xc4 */
1136 .long 0 /* 0xc8 */
1137 .long 0 /* 0xcc */
1138 .long 0 /* 0xd0 */
1139 .long 0 /* 0xd4 */
1140 .long 0 /* 0xd8 */
1141 .long 0 /* 0xdc */
1142 .long 0 /* 0xe0 */
1143 .long 0 /* 0xe4 */
1144 .long 0 /* 0xe8 */
1145 .long 0 /* 0xec */
1146 .long 0 /* 0xf0 */
1147 .long 0 /* 0xf4 */
1148 .long 0 /* 0xf8 */
1149 .long 0 /* 0xfc */
1150 .long 0 /* 0x100 */
1151 .long 0 /* 0x104 */
1152 .long 0 /* 0x108 */
1153 .long 0 /* 0x10c */
1154 .long 0 /* 0x110 */
1155 .long 0 /* 0x114 */
1156 .long 0 /* 0x118 */
1157 .long 0 /* 0x11c */
1158 .long 0 /* 0x120 */
1159 .long .kvmppc_h_bulk_remove - hcall_real_table
1160hcall_real_table_end:
1161
1162ignore_hdec:
1163 mr r4,r9
1164 b fast_guest_return
1165
1166bounce_ext_interrupt:
1167 mr r4,r9
1168 mtspr SPRN_SRR0,r10
1169 mtspr SPRN_SRR1,r11
1170 li r10,BOOK3S_INTERRUPT_EXTERNAL
1171 LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
1172 b fast_guest_return
1173
1174_GLOBAL(kvmppc_h_set_dabr)
1175 std r4,VCPU_DABR(r3)
1176 mtspr SPRN_DABR,r4
1177 li r3,0
1178 blr
1179
1180secondary_too_late:
1181 ld r5,HSTATE_KVM_VCORE(r13)
1182 HMT_LOW
118313: lbz r3,VCORE_IN_GUEST(r5)
1184 cmpwi r3,0
1185 bne 13b
1186 HMT_MEDIUM
1187 ld r11,PACA_SLBSHADOWPTR(r13)
1188
1189 .rept SLB_NUM_BOLTED
1190 ld r5,SLBSHADOW_SAVEAREA(r11)
1191 ld r6,SLBSHADOW_SAVEAREA+8(r11)
1192 andis. r7,r5,SLB_ESID_V@h
1193 beq 1f
1194 slbmte r6,r5
11951: addi r11,r11,16
1196 .endr
1197 b 50f
1198
1199secondary_nap:
1200 /* Clear any pending IPI */
120150: ld r5, HSTATE_XICS_PHYS(r13)
1202 li r0, 0xff
1203 li r6, XICS_QIRR
1204 stbcix r0, r5, r6
1205
1206 /* increment the nap count and then go to nap mode */
1207 ld r4, HSTATE_KVM_VCORE(r13)
1208 addi r4, r4, VCORE_NAP_COUNT
1209 lwsync /* make previous updates visible */
121051: lwarx r3, 0, r4
1211 addi r3, r3, 1
1212 stwcx. r3, 0, r4
1213 bne 51b
1214 isync
1215
1216 mfspr r4, SPRN_LPCR
1217 li r0, LPCR_PECE
1218 andc r4, r4, r0
1219 ori r4, r4, LPCR_PECE0 /* exit nap on interrupt */
1220 mtspr SPRN_LPCR, r4
1221 li r0, 0
1222 std r0, HSTATE_SCRATCH0(r13)
1223 ptesync
1224 ld r0, HSTATE_SCRATCH0(r13)
12251: cmpd r0, r0
1226 bne 1b
1227 nap
1228 b .
1229
1230/*
1231 * Save away FP, VMX and VSX registers.
1232 * r3 = vcpu pointer
1233 */
1234_GLOBAL(kvmppc_save_fp)
1235 mfmsr r9
1236 ori r8,r9,MSR_FP
1237#ifdef CONFIG_ALTIVEC
1238BEGIN_FTR_SECTION
1239 oris r8,r8,MSR_VEC@h
1240END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
1241#endif
1242#ifdef CONFIG_VSX
1243BEGIN_FTR_SECTION
1244 oris r8,r8,MSR_VSX@h
1245END_FTR_SECTION_IFSET(CPU_FTR_VSX)
1246#endif
1247 mtmsrd r8
1248 isync
1249#ifdef CONFIG_VSX
1250BEGIN_FTR_SECTION
1251 reg = 0
1252 .rept 32
1253 li r6,reg*16+VCPU_VSRS
1254 stxvd2x reg,r6,r3
1255 reg = reg + 1
1256 .endr
1257FTR_SECTION_ELSE
1258#endif
1259 reg = 0
1260 .rept 32
1261 stfd reg,reg*8+VCPU_FPRS(r3)
1262 reg = reg + 1
1263 .endr
1264#ifdef CONFIG_VSX
1265ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
1266#endif
1267 mffs fr0
1268 stfd fr0,VCPU_FPSCR(r3)
1269
1270#ifdef CONFIG_ALTIVEC
1271BEGIN_FTR_SECTION
1272 reg = 0
1273 .rept 32
1274 li r6,reg*16+VCPU_VRS
1275 stvx reg,r6,r3
1276 reg = reg + 1
1277 .endr
1278 mfvscr vr0
1279 li r6,VCPU_VSCR
1280 stvx vr0,r6,r3
1281END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
1282#endif
1283 mfspr r6,SPRN_VRSAVE
1284 stw r6,VCPU_VRSAVE(r3)
1285 mtmsrd r9
1286 isync
1287 blr
1288
1289/*
1290 * Load up FP, VMX and VSX registers
1291 * r4 = vcpu pointer
1292 */
1293 .globl kvmppc_load_fp
1294kvmppc_load_fp:
1295 mfmsr r9
1296 ori r8,r9,MSR_FP
1297#ifdef CONFIG_ALTIVEC
1298BEGIN_FTR_SECTION
1299 oris r8,r8,MSR_VEC@h
1300END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
1301#endif
1302#ifdef CONFIG_VSX
1303BEGIN_FTR_SECTION
1304 oris r8,r8,MSR_VSX@h
1305END_FTR_SECTION_IFSET(CPU_FTR_VSX)
1306#endif
1307 mtmsrd r8
1308 isync
1309 lfd fr0,VCPU_FPSCR(r4)
1310 MTFSF_L(fr0)
1311#ifdef CONFIG_VSX
1312BEGIN_FTR_SECTION
1313 reg = 0
1314 .rept 32
1315 li r7,reg*16+VCPU_VSRS
1316 lxvd2x reg,r7,r4
1317 reg = reg + 1
1318 .endr
1319FTR_SECTION_ELSE
1320#endif
1321 reg = 0
1322 .rept 32
1323 lfd reg,reg*8+VCPU_FPRS(r4)
1324 reg = reg + 1
1325 .endr
1326#ifdef CONFIG_VSX
1327ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
1328#endif
1329
1330#ifdef CONFIG_ALTIVEC
1331BEGIN_FTR_SECTION
1332 li r7,VCPU_VSCR
1333 lvx vr0,r7,r4
1334 mtvscr vr0
1335 reg = 0
1336 .rept 32
1337 li r7,reg*16+VCPU_VRS
1338 lvx reg,r7,r4
1339 reg = reg + 1
1340 .endr
1341END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
1342#endif
1343 lwz r7,VCPU_VRSAVE(r4)
1344 mtspr SPRN_VRSAVE,r7
1345 blr
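
The real-mode hcall dispatch in this file (hcall_try_real_mode and hcall_real_table above) is a compact jump table: the hcall number, always a multiple of 4, is used directly as a byte offset into hcall_real_table, whose 32-bit entries hold either 0 (not handled in real mode) or the handler's offset from the table base. A C rendering of that lookup, as a sketch — the entry layout matches the patch, but the function-pointer type is illustrative since the real handlers take varying argument lists:

	struct kvm_vcpu;
	typedef long (*hcall_fn)(struct kvm_vcpu *vcpu, unsigned long arg);

	extern const int hcall_real_table[];	/* 32-bit offsets, 0 = unhandled */
	extern const char hcall_real_table_end[];

	static hcall_fn lookup_real_mode_hcall(unsigned long req)
	{
		unsigned long off = req & ~3UL;		/* clrrdi r3,r3,2 */
		unsigned long size = (const char *)hcall_real_table_end -
				     (const char *)hcall_real_table;

		if (off >= size || !hcall_real_table[off / 4])
			return NULL;			/* punt to the kernel/QEMU path */
		return (hcall_fn)((const char *)hcall_real_table +
				  hcall_real_table[off / 4]);
	}

A handler that returns H_TOO_HARD is also punted back via hcall_real_fallback, so an empty table slot and an H_TOO_HARD return end up on the same slow path up to the kernel.
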
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
index 2f0bc928b08a..c54b0e30cf3f 100644
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -29,8 +29,7 @@
29#define ULONG_SIZE 8 29#define ULONG_SIZE 8
30#define FUNC(name) GLUE(.,name) 30#define FUNC(name) GLUE(.,name)
31 31
32#define GET_SHADOW_VCPU(reg) \ 32#define GET_SHADOW_VCPU_R13
33 addi reg, r13, PACA_KVM_SVCPU
34 33
35#define DISABLE_INTERRUPTS \ 34#define DISABLE_INTERRUPTS \
36 mfmsr r0; \ 35 mfmsr r0; \
@@ -43,8 +42,8 @@
43#define ULONG_SIZE 4 42#define ULONG_SIZE 4
44#define FUNC(name) name 43#define FUNC(name) name
45 44
46#define GET_SHADOW_VCPU(reg) \ 45#define GET_SHADOW_VCPU_R13 \
47 lwz reg, (THREAD + THREAD_KVM_SVCPU)(r2) 46 lwz r13, (THREAD + THREAD_KVM_SVCPU)(r2)
48 47
49#define DISABLE_INTERRUPTS \ 48#define DISABLE_INTERRUPTS \
50 mfmsr r0; \ 49 mfmsr r0; \
@@ -85,7 +84,7 @@
85 * r3: kvm_run pointer 84 * r3: kvm_run pointer
86 * r4: vcpu pointer 85 * r4: vcpu pointer
87 */ 86 */
88_GLOBAL(__kvmppc_vcpu_entry) 87_GLOBAL(__kvmppc_vcpu_run)
89 88
90kvm_start_entry: 89kvm_start_entry:
91 /* Write correct stack frame */ 90 /* Write correct stack frame */
@@ -107,17 +106,11 @@ kvm_start_entry:
107 /* Load non-volatile guest state from the vcpu */ 106 /* Load non-volatile guest state from the vcpu */
108 VCPU_LOAD_NVGPRS(r4) 107 VCPU_LOAD_NVGPRS(r4)
109 108
110 GET_SHADOW_VCPU(r5) 109kvm_start_lightweight:
111
112 /* Save R1/R2 in the PACA */
113 PPC_STL r1, SVCPU_HOST_R1(r5)
114 PPC_STL r2, SVCPU_HOST_R2(r5)
115 110
116 /* XXX swap in/out on load? */ 111 GET_SHADOW_VCPU_R13
117 PPC_LL r3, VCPU_HIGHMEM_HANDLER(r4) 112 PPC_LL r3, VCPU_HIGHMEM_HANDLER(r4)
118 PPC_STL r3, SVCPU_VMHANDLER(r5) 113 PPC_STL r3, HSTATE_VMHANDLER(r13)
119
120kvm_start_lightweight:
121 114
122 PPC_LL r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */ 115 PPC_LL r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */
123 116
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 79751d8dd131..41cb0017e757 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -21,7 +21,6 @@
21#include <linux/kvm_host.h> 21#include <linux/kvm_host.h>
22#include <linux/hash.h> 22#include <linux/hash.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include "trace.h"
25 24
26#include <asm/kvm_ppc.h> 25#include <asm/kvm_ppc.h>
27#include <asm/kvm_book3s.h> 26#include <asm/kvm_book3s.h>
@@ -29,6 +28,8 @@
29#include <asm/mmu_context.h> 28#include <asm/mmu_context.h>
30#include <asm/hw_irq.h> 29#include <asm/hw_irq.h>
31 30
31#include "trace.h"
32
32#define PTE_SIZE 12 33#define PTE_SIZE 12
33 34
34static struct kmem_cache *hpte_cache; 35static struct kmem_cache *hpte_cache;
@@ -58,30 +59,31 @@ static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)
58void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) 59void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
59{ 60{
60 u64 index; 61 u64 index;
62 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
61 63
62 trace_kvm_book3s_mmu_map(pte); 64 trace_kvm_book3s_mmu_map(pte);
63 65
64 spin_lock(&vcpu->arch.mmu_lock); 66 spin_lock(&vcpu3s->mmu_lock);
65 67
66 /* Add to ePTE list */ 68 /* Add to ePTE list */
67 index = kvmppc_mmu_hash_pte(pte->pte.eaddr); 69 index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
68 hlist_add_head_rcu(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]); 70 hlist_add_head_rcu(&pte->list_pte, &vcpu3s->hpte_hash_pte[index]);
69 71
70 /* Add to ePTE_long list */ 72 /* Add to ePTE_long list */
71 index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr); 73 index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr);
72 hlist_add_head_rcu(&pte->list_pte_long, 74 hlist_add_head_rcu(&pte->list_pte_long,
73 &vcpu->arch.hpte_hash_pte_long[index]); 75 &vcpu3s->hpte_hash_pte_long[index]);
74 76
75 /* Add to vPTE list */ 77 /* Add to vPTE list */
76 index = kvmppc_mmu_hash_vpte(pte->pte.vpage); 78 index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
77 hlist_add_head_rcu(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]); 79 hlist_add_head_rcu(&pte->list_vpte, &vcpu3s->hpte_hash_vpte[index]);
78 80
79 /* Add to vPTE_long list */ 81 /* Add to vPTE_long list */
80 index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage); 82 index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
81 hlist_add_head_rcu(&pte->list_vpte_long, 83 hlist_add_head_rcu(&pte->list_vpte_long,
82 &vcpu->arch.hpte_hash_vpte_long[index]); 84 &vcpu3s->hpte_hash_vpte_long[index]);
83 85
84 spin_unlock(&vcpu->arch.mmu_lock); 86 spin_unlock(&vcpu3s->mmu_lock);
85} 87}
86 88
87static void free_pte_rcu(struct rcu_head *head) 89static void free_pte_rcu(struct rcu_head *head)
@@ -92,16 +94,18 @@ static void free_pte_rcu(struct rcu_head *head)
92 94
93static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) 95static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
94{ 96{
97 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
98
95 trace_kvm_book3s_mmu_invalidate(pte); 99 trace_kvm_book3s_mmu_invalidate(pte);
96 100
97 /* Different for 32 and 64 bit */ 101 /* Different for 32 and 64 bit */
98 kvmppc_mmu_invalidate_pte(vcpu, pte); 102 kvmppc_mmu_invalidate_pte(vcpu, pte);
99 103
100 spin_lock(&vcpu->arch.mmu_lock); 104 spin_lock(&vcpu3s->mmu_lock);
101 105
102 /* pte already invalidated in between? */ 106 /* pte already invalidated in between? */
103 if (hlist_unhashed(&pte->list_pte)) { 107 if (hlist_unhashed(&pte->list_pte)) {
104 spin_unlock(&vcpu->arch.mmu_lock); 108 spin_unlock(&vcpu3s->mmu_lock);
105 return; 109 return;
106 } 110 }
107 111
@@ -115,14 +119,15 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
115 else 119 else
116 kvm_release_pfn_clean(pte->pfn); 120 kvm_release_pfn_clean(pte->pfn);
117 121
118 spin_unlock(&vcpu->arch.mmu_lock); 122 spin_unlock(&vcpu3s->mmu_lock);
119 123
120 vcpu->arch.hpte_cache_count--; 124 vcpu3s->hpte_cache_count--;
121 call_rcu(&pte->rcu_head, free_pte_rcu); 125 call_rcu(&pte->rcu_head, free_pte_rcu);
122} 126}
123 127
124static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) 128static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
125{ 129{
130 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
126 struct hpte_cache *pte; 131 struct hpte_cache *pte;
127 struct hlist_node *node; 132 struct hlist_node *node;
128 int i; 133 int i;
@@ -130,7 +135,7 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
130 rcu_read_lock(); 135 rcu_read_lock();
131 136
132 for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { 137 for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
133 struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; 138 struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i];
134 139
135 hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) 140 hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
136 invalidate_pte(vcpu, pte); 141 invalidate_pte(vcpu, pte);
@@ -141,12 +146,13 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
141 146
142static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) 147static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
143{ 148{
149 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
144 struct hlist_head *list; 150 struct hlist_head *list;
145 struct hlist_node *node; 151 struct hlist_node *node;
146 struct hpte_cache *pte; 152 struct hpte_cache *pte;
147 153
148 /* Find the list of entries in the map */ 154 /* Find the list of entries in the map */
149 list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)]; 155 list = &vcpu3s->hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
150 156
151 rcu_read_lock(); 157 rcu_read_lock();
152 158
@@ -160,12 +166,13 @@ static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
160 166
161static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea) 167static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea)
162{ 168{
169 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
163 struct hlist_head *list; 170 struct hlist_head *list;
164 struct hlist_node *node; 171 struct hlist_node *node;
165 struct hpte_cache *pte; 172 struct hpte_cache *pte;
166 173
167 /* Find the list of entries in the map */ 174 /* Find the list of entries in the map */
168 list = &vcpu->arch.hpte_hash_pte_long[ 175 list = &vcpu3s->hpte_hash_pte_long[
169 kvmppc_mmu_hash_pte_long(guest_ea)]; 176 kvmppc_mmu_hash_pte_long(guest_ea)];
170 177
171 rcu_read_lock(); 178 rcu_read_lock();
@@ -203,12 +210,13 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
203/* Flush with mask 0xfffffffff */ 210/* Flush with mask 0xfffffffff */
204static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) 211static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
205{ 212{
213 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
206 struct hlist_head *list; 214 struct hlist_head *list;
207 struct hlist_node *node; 215 struct hlist_node *node;
208 struct hpte_cache *pte; 216 struct hpte_cache *pte;
209 u64 vp_mask = 0xfffffffffULL; 217 u64 vp_mask = 0xfffffffffULL;
210 218
211 list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)]; 219 list = &vcpu3s->hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
212 220
213 rcu_read_lock(); 221 rcu_read_lock();
214 222
@@ -223,12 +231,13 @@ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
223/* Flush with mask 0xffffff000 */ 231/* Flush with mask 0xffffff000 */
224static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) 232static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
225{ 233{
234 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
226 struct hlist_head *list; 235 struct hlist_head *list;
227 struct hlist_node *node; 236 struct hlist_node *node;
228 struct hpte_cache *pte; 237 struct hpte_cache *pte;
229 u64 vp_mask = 0xffffff000ULL; 238 u64 vp_mask = 0xffffff000ULL;
230 239
231 list = &vcpu->arch.hpte_hash_vpte_long[ 240 list = &vcpu3s->hpte_hash_vpte_long[
232 kvmppc_mmu_hash_vpte_long(guest_vp)]; 241 kvmppc_mmu_hash_vpte_long(guest_vp)];
233 242
234 rcu_read_lock(); 243 rcu_read_lock();
@@ -261,6 +270,7 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
261 270
262void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) 271void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
263{ 272{
273 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
264 struct hlist_node *node; 274 struct hlist_node *node;
265 struct hpte_cache *pte; 275 struct hpte_cache *pte;
266 int i; 276 int i;
@@ -270,7 +280,7 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
270 rcu_read_lock(); 280 rcu_read_lock();
271 281
272 for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { 282 for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
273 struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; 283 struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i];
274 284
275 hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) 285 hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
276 if ((pte->pte.raddr >= pa_start) && 286 if ((pte->pte.raddr >= pa_start) &&
@@ -283,12 +293,13 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
283 293
284struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) 294struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
285{ 295{
296 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
286 struct hpte_cache *pte; 297 struct hpte_cache *pte;
287 298
288 pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); 299 pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
289 vcpu->arch.hpte_cache_count++; 300 vcpu3s->hpte_cache_count++;
290 301
291 if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM) 302 if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM)
292 kvmppc_mmu_pte_flush_all(vcpu); 303 kvmppc_mmu_pte_flush_all(vcpu);
293 304
294 return pte; 305 return pte;
@@ -309,17 +320,19 @@ static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len)
309 320
310int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu) 321int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
311{ 322{
323 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
324
312 /* init hpte lookup hashes */ 325 /* init hpte lookup hashes */
313 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte, 326 kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte,
314 ARRAY_SIZE(vcpu->arch.hpte_hash_pte)); 327 ARRAY_SIZE(vcpu3s->hpte_hash_pte));
315 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte_long, 328 kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte_long,
316 ARRAY_SIZE(vcpu->arch.hpte_hash_pte_long)); 329 ARRAY_SIZE(vcpu3s->hpte_hash_pte_long));
317 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte, 330 kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte,
318 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte)); 331 ARRAY_SIZE(vcpu3s->hpte_hash_vpte));
319 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long, 332 kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long,
320 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long)); 333 ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long));
321 334
322 spin_lock_init(&vcpu->arch.mmu_lock); 335 spin_lock_init(&vcpu3s->mmu_lock);
323 336
324 return 0; 337 return 0;
325} 338}
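
A minimal standalone C sketch of the pattern the book3s_mmu_hpte.c hunks above apply: the per-vcpu HPTE hash lists, cache counter and mmu_lock move from vcpu->arch into the Book3S container struct, reached through a container_of()-style to_book3s() accessor. This is not kernel code; the type and field names mirror the diff, while the bucket count, the hlist stand-in and main() are illustrative.

/*
 * Standalone sketch (not kernel code) of the refactoring pattern:
 * Book3S-only HPTE cache state hangs off the container struct that
 * embeds the generic vcpu, and is recovered via pointer arithmetic.
 */
#include <stddef.h>
#include <stdio.h>

#define HPTEG_HASH_NUM_PTE 16   /* illustrative, not the kernel's value */

struct hlist_head { void *first; };
struct kvm_vcpu { int id; /* generic arch state elided */ };

struct kvmppc_vcpu_book3s {
	struct kvm_vcpu vcpu;                           /* embedded base vcpu */
	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
	int hpte_cache_count;
};

/* mirrors to_book3s(): recover the container from the embedded vcpu */
static struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
{
	return (struct kvmppc_vcpu_book3s *)
		((char *)vcpu - offsetof(struct kvmppc_vcpu_book3s, vcpu));
}

int main(void)
{
	struct kvmppc_vcpu_book3s b3s = { .hpte_cache_count = 3 };
	struct kvm_vcpu *vcpu = &b3s.vcpu;

	/* all HPTE-cache state is now addressed through the container */
	printf("cached HPTEs: %d, buckets: %zu\n",
	       to_book3s(vcpu)->hpte_cache_count,
	       sizeof(to_book3s(vcpu)->hpte_hash_pte) /
	       sizeof(to_book3s(vcpu)->hpte_hash_pte[0]));
	return 0;
}
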
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
new file mode 100644
index 000000000000..0c0d3f274437
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -0,0 +1,1029 @@
1/*
2 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
3 *
4 * Authors:
5 * Alexander Graf <agraf@suse.de>
6 * Kevin Wolf <mail@kevin-wolf.de>
7 * Paul Mackerras <paulus@samba.org>
8 *
9 * Description:
10 * Functions relating to running KVM on Book 3S processors where
11 * we don't have access to hypervisor mode, and we run the guest
12 * in problem state (user mode).
13 *
14 * This file is derived from arch/powerpc/kvm/44x.c,
15 * by Hollis Blanchard <hollisb@us.ibm.com>.
16 *
17 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License, version 2, as
19 * published by the Free Software Foundation.
20 */
21
22#include <linux/kvm_host.h>
23#include <linux/err.h>
24#include <linux/slab.h>
25
26#include <asm/reg.h>
27#include <asm/cputable.h>
28#include <asm/cacheflush.h>
29#include <asm/tlbflush.h>
30#include <asm/uaccess.h>
31#include <asm/io.h>
32#include <asm/kvm_ppc.h>
33#include <asm/kvm_book3s.h>
34#include <asm/mmu_context.h>
35#include <linux/gfp.h>
36#include <linux/sched.h>
37#include <linux/vmalloc.h>
38#include <linux/highmem.h>
39
40#include "trace.h"
41
42/* #define EXIT_DEBUG */
43/* #define DEBUG_EXT */
44
45static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
46 ulong msr);
47
48/* Some compatibility defines */
49#ifdef CONFIG_PPC_BOOK3S_32
50#define MSR_USER32 MSR_USER
51#define MSR_USER64 MSR_USER
52#define HW_PAGE_SIZE PAGE_SIZE
53#endif
54
55void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
56{
57#ifdef CONFIG_PPC_BOOK3S_64
58 memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb));
59 memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
60 sizeof(get_paca()->shadow_vcpu));
61 to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max;
62#endif
63
64#ifdef CONFIG_PPC_BOOK3S_32
65 current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
66#endif
67}
68
69void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
70{
71#ifdef CONFIG_PPC_BOOK3S_64
72 memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb));
73 memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
74 sizeof(get_paca()->shadow_vcpu));
75 to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max;
76#endif
77
78 kvmppc_giveup_ext(vcpu, MSR_FP);
79 kvmppc_giveup_ext(vcpu, MSR_VEC);
80 kvmppc_giveup_ext(vcpu, MSR_VSX);
81}
82
83static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
84{
85 ulong smsr = vcpu->arch.shared->msr;
86
87 /* Guest MSR values */
88 smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
89 /* Process MSR values */
90 smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
91 /* External providers the guest reserved */
92 smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext);
93 /* 64-bit Process MSR values */
94#ifdef CONFIG_PPC_BOOK3S_64
95 smsr |= MSR_ISF | MSR_HV;
96#endif
97 vcpu->arch.shadow_msr = smsr;
98}
99
100void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
101{
102 ulong old_msr = vcpu->arch.shared->msr;
103
104#ifdef EXIT_DEBUG
105 printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
106#endif
107
108 msr &= to_book3s(vcpu)->msr_mask;
109 vcpu->arch.shared->msr = msr;
110 kvmppc_recalc_shadow_msr(vcpu);
111
112 if (msr & MSR_POW) {
113 if (!vcpu->arch.pending_exceptions) {
114 kvm_vcpu_block(vcpu);
115 vcpu->stat.halt_wakeup++;
116
117 /* Unset POW bit after we woke up */
118 msr &= ~MSR_POW;
119 vcpu->arch.shared->msr = msr;
120 }
121 }
122
123 if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) !=
124 (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
125 kvmppc_mmu_flush_segments(vcpu);
126 kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
127
128 /* Preload magic page segment when in kernel mode */
129 if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
130 struct kvm_vcpu_arch *a = &vcpu->arch;
131
132 if (msr & MSR_DR)
133 kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
134 else
135 kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
136 }
137 }
138
139 /* Preload FPU if it's enabled */
140 if (vcpu->arch.shared->msr & MSR_FP)
141 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
142}
143
144void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
145{
146 u32 host_pvr;
147
148 vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB;
149 vcpu->arch.pvr = pvr;
150#ifdef CONFIG_PPC_BOOK3S_64
151 if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
152 kvmppc_mmu_book3s_64_init(vcpu);
153 to_book3s(vcpu)->hior = 0xfff00000;
154 to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
155 } else
156#endif
157 {
158 kvmppc_mmu_book3s_32_init(vcpu);
159 to_book3s(vcpu)->hior = 0;
160 to_book3s(vcpu)->msr_mask = 0xffffffffULL;
161 }
162
 163 /* If we are in hypervisor level on 970, we can tell the CPU to
 164 * treat DCBZ as a 32-byte store */
165 vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
166 if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) &&
167 !strcmp(cur_cpu_spec->platform, "ppc970"))
168 vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
169
170 /* Cell performs badly if MSR_FEx are set. So let's hope nobody
171 really needs them in a VM on Cell and force disable them. */
172 if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be"))
173 to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1);
174
175#ifdef CONFIG_PPC_BOOK3S_32
176 /* 32 bit Book3S always has 32 byte dcbz */
177 vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
178#endif
179
180 /* On some CPUs we can execute paired single operations natively */
181 asm ( "mfpvr %0" : "=r"(host_pvr));
182 switch (host_pvr) {
183 case 0x00080200: /* lonestar 2.0 */
184 case 0x00088202: /* lonestar 2.2 */
185 case 0x70000100: /* gekko 1.0 */
186 case 0x00080100: /* gekko 2.0 */
187 case 0x00083203: /* gekko 2.3a */
188 case 0x00083213: /* gekko 2.3b */
189 case 0x00083204: /* gekko 2.4 */
190 case 0x00083214: /* gekko 2.4e (8SE) - retail HW2 */
191 case 0x00087200: /* broadway */
192 vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS;
193 /* Enable HID2.PSE - in case we need it later */
194 mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29));
195 }
196}
197
198/* Book3s_32 CPUs always have a 32-byte cache line size, which Linux assumes. To
 199 * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to
 200 * emulate a 32-byte dcbz length.
201 *
202 * The Book3s_64 inventors also realized this case and implemented a special bit
 203 * in the HID5 register, which is a hypervisor resource. Thus we can't use it.
204 *
205 * My approach here is to patch the dcbz instruction on executing pages.
206 */
207static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
208{
209 struct page *hpage;
210 u64 hpage_offset;
211 u32 *page;
212 int i;
213
214 hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
215 if (is_error_page(hpage)) {
216 kvm_release_page_clean(hpage);
217 return;
218 }
219
220 hpage_offset = pte->raddr & ~PAGE_MASK;
221 hpage_offset &= ~0xFFFULL;
222 hpage_offset /= 4;
223
224 get_page(hpage);
225 page = kmap_atomic(hpage, KM_USER0);
226
227 /* patch dcbz into reserved instruction, so we trap */
228 for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
229 if ((page[i] & 0xff0007ff) == INS_DCBZ)
230 page[i] &= 0xfffffff7;
231
232 kunmap_atomic(page, KM_USER0);
233 put_page(hpage);
234}
235
236static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
237{
238 ulong mp_pa = vcpu->arch.magic_page_pa;
239
240 if (unlikely(mp_pa) &&
241 unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) {
242 return 1;
243 }
244
245 return kvm_is_visible_gfn(vcpu->kvm, gfn);
246}
247
248int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
249 ulong eaddr, int vec)
250{
251 bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE);
252 int r = RESUME_GUEST;
253 int relocated;
254 int page_found = 0;
255 struct kvmppc_pte pte;
256 bool is_mmio = false;
257 bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false;
258 bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false;
259 u64 vsid;
260
261 relocated = data ? dr : ir;
262
263 /* Resolve real address if translation turned on */
264 if (relocated) {
265 page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data);
266 } else {
267 pte.may_execute = true;
268 pte.may_read = true;
269 pte.may_write = true;
270 pte.raddr = eaddr & KVM_PAM;
271 pte.eaddr = eaddr;
272 pte.vpage = eaddr >> 12;
273 }
274
275 switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
276 case 0:
277 pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
278 break;
279 case MSR_DR:
280 case MSR_IR:
281 vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
282
283 if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR)
284 pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
285 else
286 pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
287 pte.vpage |= vsid;
288
289 if (vsid == -1)
290 page_found = -EINVAL;
291 break;
292 }
293
294 if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
295 (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
296 /*
297 * If we do the dcbz hack, we have to NX on every execution,
298 * so we can patch the executing code. This renders our guest
299 * NX-less.
300 */
301 pte.may_execute = !data;
302 }
303
304 if (page_found == -ENOENT) {
305 /* Page not found in guest PTE entries */
306 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
307 vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
308 vcpu->arch.shared->msr |=
309 (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
310 kvmppc_book3s_queue_irqprio(vcpu, vec);
311 } else if (page_found == -EPERM) {
312 /* Storage protection */
313 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
314 vcpu->arch.shared->dsisr =
315 to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
316 vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
317 vcpu->arch.shared->msr |=
318 (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
319 kvmppc_book3s_queue_irqprio(vcpu, vec);
320 } else if (page_found == -EINVAL) {
321 /* Page not found in guest SLB */
322 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
323 kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
324 } else if (!is_mmio &&
325 kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
326 /* The guest's PTE is not mapped yet. Map on the host */
327 kvmppc_mmu_map_page(vcpu, &pte);
328 if (data)
329 vcpu->stat.sp_storage++;
330 else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
331 (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
332 kvmppc_patch_dcbz(vcpu, &pte);
333 } else {
334 /* MMIO */
335 vcpu->stat.mmio_exits++;
336 vcpu->arch.paddr_accessed = pte.raddr;
337 r = kvmppc_emulate_mmio(run, vcpu);
338 if ( r == RESUME_HOST_NV )
339 r = RESUME_HOST;
340 }
341
342 return r;
343}
344
345static inline int get_fpr_index(int i)
346{
347#ifdef CONFIG_VSX
348 i *= 2;
349#endif
350 return i;
351}
352
353/* Give up external provider (FPU, Altivec, VSX) */
354void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
355{
356 struct thread_struct *t = &current->thread;
357 u64 *vcpu_fpr = vcpu->arch.fpr;
358#ifdef CONFIG_VSX
359 u64 *vcpu_vsx = vcpu->arch.vsr;
360#endif
361 u64 *thread_fpr = (u64*)t->fpr;
362 int i;
363
364 if (!(vcpu->arch.guest_owned_ext & msr))
365 return;
366
367#ifdef DEBUG_EXT
368 printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
369#endif
370
371 switch (msr) {
372 case MSR_FP:
373 giveup_fpu(current);
374 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
375 vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
376
377 vcpu->arch.fpscr = t->fpscr.val;
378 break;
379 case MSR_VEC:
380#ifdef CONFIG_ALTIVEC
381 giveup_altivec(current);
382 memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
383 vcpu->arch.vscr = t->vscr;
384#endif
385 break;
386 case MSR_VSX:
387#ifdef CONFIG_VSX
388 __giveup_vsx(current);
389 for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
390 vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
391#endif
392 break;
393 default:
394 BUG();
395 }
396
397 vcpu->arch.guest_owned_ext &= ~msr;
398 current->thread.regs->msr &= ~msr;
399 kvmppc_recalc_shadow_msr(vcpu);
400}
401
402static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
403{
404 ulong srr0 = kvmppc_get_pc(vcpu);
405 u32 last_inst = kvmppc_get_last_inst(vcpu);
406 int ret;
407
408 ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
409 if (ret == -ENOENT) {
410 ulong msr = vcpu->arch.shared->msr;
411
412 msr = kvmppc_set_field(msr, 33, 33, 1);
413 msr = kvmppc_set_field(msr, 34, 36, 0);
414 vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0);
415 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
416 return EMULATE_AGAIN;
417 }
418
419 return EMULATE_DONE;
420}
421
422static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr)
423{
424
425 /* Need to do paired single emulation? */
426 if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
427 return EMULATE_DONE;
428
429 /* Read out the instruction */
430 if (kvmppc_read_inst(vcpu) == EMULATE_DONE)
431 /* Need to emulate */
432 return EMULATE_FAIL;
433
434 return EMULATE_AGAIN;
435}
436
437/* Handle external providers (FPU, Altivec, VSX) */
438static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
439 ulong msr)
440{
441 struct thread_struct *t = &current->thread;
442 u64 *vcpu_fpr = vcpu->arch.fpr;
443#ifdef CONFIG_VSX
444 u64 *vcpu_vsx = vcpu->arch.vsr;
445#endif
446 u64 *thread_fpr = (u64*)t->fpr;
447 int i;
448
449 /* When we have paired singles, we emulate in software */
450 if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
451 return RESUME_GUEST;
452
453 if (!(vcpu->arch.shared->msr & msr)) {
454 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
455 return RESUME_GUEST;
456 }
457
458 /* We already own the ext */
459 if (vcpu->arch.guest_owned_ext & msr) {
460 return RESUME_GUEST;
461 }
462
463#ifdef DEBUG_EXT
464 printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
465#endif
466
467 current->thread.regs->msr |= msr;
468
469 switch (msr) {
470 case MSR_FP:
471 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
472 thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
473
474 t->fpscr.val = vcpu->arch.fpscr;
475 t->fpexc_mode = 0;
476 kvmppc_load_up_fpu();
477 break;
478 case MSR_VEC:
479#ifdef CONFIG_ALTIVEC
480 memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
481 t->vscr = vcpu->arch.vscr;
482 t->vrsave = -1;
483 kvmppc_load_up_altivec();
484#endif
485 break;
486 case MSR_VSX:
487#ifdef CONFIG_VSX
488 for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
489 thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
490 kvmppc_load_up_vsx();
491#endif
492 break;
493 default:
494 BUG();
495 }
496
497 vcpu->arch.guest_owned_ext |= msr;
498
499 kvmppc_recalc_shadow_msr(vcpu);
500
501 return RESUME_GUEST;
502}
503
504int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
505 unsigned int exit_nr)
506{
507 int r = RESUME_HOST;
508
509 vcpu->stat.sum_exits++;
510
511 run->exit_reason = KVM_EXIT_UNKNOWN;
512 run->ready_for_interrupt_injection = 1;
513
514 trace_kvm_book3s_exit(exit_nr, vcpu);
515 kvm_resched(vcpu);
516 switch (exit_nr) {
517 case BOOK3S_INTERRUPT_INST_STORAGE:
518 vcpu->stat.pf_instruc++;
519
520#ifdef CONFIG_PPC_BOOK3S_32
521 /* We set segments as unused segments when invalidating them. So
522 * treat the respective fault as segment fault. */
523 if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]
524 == SR_INVALID) {
525 kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
526 r = RESUME_GUEST;
527 break;
528 }
529#endif
530
531 /* only care about PTEG not found errors, but leave NX alone */
532 if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) {
533 r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
534 vcpu->stat.sp_instruc++;
535 } else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
536 (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
537 /*
538 * XXX If we do the dcbz hack we use the NX bit to flush&patch the page,
 539 * so we can't use the NX bit inside the guest. Let's cross our fingers
540 * that no guest that needs the dcbz hack does NX.
541 */
542 kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
543 r = RESUME_GUEST;
544 } else {
545 vcpu->arch.shared->msr |=
546 to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
547 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
548 r = RESUME_GUEST;
549 }
550 break;
551 case BOOK3S_INTERRUPT_DATA_STORAGE:
552 {
553 ulong dar = kvmppc_get_fault_dar(vcpu);
554 vcpu->stat.pf_storage++;
555
556#ifdef CONFIG_PPC_BOOK3S_32
557 /* We set segments as unused segments when invalidating them. So
558 * treat the respective fault as segment fault. */
559 if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) {
560 kvmppc_mmu_map_segment(vcpu, dar);
561 r = RESUME_GUEST;
562 break;
563 }
564#endif
565
566 /* The only case we need to handle is missing shadow PTEs */
567 if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
568 r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
569 } else {
570 vcpu->arch.shared->dar = dar;
571 vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
572 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
573 r = RESUME_GUEST;
574 }
575 break;
576 }
577 case BOOK3S_INTERRUPT_DATA_SEGMENT:
578 if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
579 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
580 kvmppc_book3s_queue_irqprio(vcpu,
581 BOOK3S_INTERRUPT_DATA_SEGMENT);
582 }
583 r = RESUME_GUEST;
584 break;
585 case BOOK3S_INTERRUPT_INST_SEGMENT:
586 if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) {
587 kvmppc_book3s_queue_irqprio(vcpu,
588 BOOK3S_INTERRUPT_INST_SEGMENT);
589 }
590 r = RESUME_GUEST;
591 break;
592 /* We're good on these - the host merely wanted to get our attention */
593 case BOOK3S_INTERRUPT_DECREMENTER:
594 vcpu->stat.dec_exits++;
595 r = RESUME_GUEST;
596 break;
597 case BOOK3S_INTERRUPT_EXTERNAL:
598 vcpu->stat.ext_intr_exits++;
599 r = RESUME_GUEST;
600 break;
601 case BOOK3S_INTERRUPT_PERFMON:
602 r = RESUME_GUEST;
603 break;
604 case BOOK3S_INTERRUPT_PROGRAM:
605 {
606 enum emulation_result er;
607 ulong flags;
608
609program_interrupt:
610 flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
611
612 if (vcpu->arch.shared->msr & MSR_PR) {
613#ifdef EXIT_DEBUG
614 printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
615#endif
616 if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) !=
617 (INS_DCBZ & 0xfffffff7)) {
618 kvmppc_core_queue_program(vcpu, flags);
619 r = RESUME_GUEST;
620 break;
621 }
622 }
623
624 vcpu->stat.emulated_inst_exits++;
625 er = kvmppc_emulate_instruction(run, vcpu);
626 switch (er) {
627 case EMULATE_DONE:
628 r = RESUME_GUEST_NV;
629 break;
630 case EMULATE_AGAIN:
631 r = RESUME_GUEST;
632 break;
633 case EMULATE_FAIL:
634 printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
635 __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
636 kvmppc_core_queue_program(vcpu, flags);
637 r = RESUME_GUEST;
638 break;
639 case EMULATE_DO_MMIO:
640 run->exit_reason = KVM_EXIT_MMIO;
641 r = RESUME_HOST_NV;
642 break;
643 default:
644 BUG();
645 }
646 break;
647 }
648 case BOOK3S_INTERRUPT_SYSCALL:
649 if (vcpu->arch.osi_enabled &&
650 (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
651 (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
652 /* MOL hypercalls */
653 u64 *gprs = run->osi.gprs;
654 int i;
655
656 run->exit_reason = KVM_EXIT_OSI;
657 for (i = 0; i < 32; i++)
658 gprs[i] = kvmppc_get_gpr(vcpu, i);
659 vcpu->arch.osi_needed = 1;
660 r = RESUME_HOST_NV;
661 } else if (!(vcpu->arch.shared->msr & MSR_PR) &&
662 (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
663 /* KVM PV hypercalls */
664 kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
665 r = RESUME_GUEST;
666 } else {
667 /* Guest syscalls */
668 vcpu->stat.syscall_exits++;
669 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
670 r = RESUME_GUEST;
671 }
672 break;
673 case BOOK3S_INTERRUPT_FP_UNAVAIL:
674 case BOOK3S_INTERRUPT_ALTIVEC:
675 case BOOK3S_INTERRUPT_VSX:
676 {
677 int ext_msr = 0;
678
679 switch (exit_nr) {
680 case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP; break;
681 case BOOK3S_INTERRUPT_ALTIVEC: ext_msr = MSR_VEC; break;
682 case BOOK3S_INTERRUPT_VSX: ext_msr = MSR_VSX; break;
683 }
684
685 switch (kvmppc_check_ext(vcpu, exit_nr)) {
686 case EMULATE_DONE:
687 /* everything ok - let's enable the ext */
688 r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr);
689 break;
690 case EMULATE_FAIL:
691 /* we need to emulate this instruction */
692 goto program_interrupt;
693 break;
694 default:
695 /* nothing to worry about - go again */
696 break;
697 }
698 break;
699 }
700 case BOOK3S_INTERRUPT_ALIGNMENT:
701 if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
702 vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
703 kvmppc_get_last_inst(vcpu));
704 vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu,
705 kvmppc_get_last_inst(vcpu));
706 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
707 }
708 r = RESUME_GUEST;
709 break;
710 case BOOK3S_INTERRUPT_MACHINE_CHECK:
711 case BOOK3S_INTERRUPT_TRACE:
712 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
713 r = RESUME_GUEST;
714 break;
715 default:
716 /* Ugh - bork here! What did we get? */
717 printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
718 exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1);
719 r = RESUME_HOST;
720 BUG();
721 break;
722 }
723
724
725 if (!(r & RESUME_HOST)) {
726 /* To avoid clobbering exit_reason, only check for signals if
727 * we aren't already exiting to userspace for some other
728 * reason. */
729 if (signal_pending(current)) {
730#ifdef EXIT_DEBUG
731 printk(KERN_EMERG "KVM: Going back to host\n");
732#endif
733 vcpu->stat.signal_exits++;
734 run->exit_reason = KVM_EXIT_INTR;
735 r = -EINTR;
736 } else {
737 /* In case an interrupt came in that was triggered
738 * from userspace (like DEC), we need to check what
739 * to inject now! */
740 kvmppc_core_deliver_interrupts(vcpu);
741 }
742 }
743
744 trace_kvm_book3s_reenter(r, vcpu);
745
746 return r;
747}
748
749int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
750 struct kvm_sregs *sregs)
751{
752 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
753 int i;
754
755 sregs->pvr = vcpu->arch.pvr;
756
757 sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
758 if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
759 for (i = 0; i < 64; i++) {
760 sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i;
761 sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
762 }
763 } else {
764 for (i = 0; i < 16; i++)
765 sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i];
766
767 for (i = 0; i < 8; i++) {
768 sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
769 sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
770 }
771 }
772
773 return 0;
774}
775
776int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
777 struct kvm_sregs *sregs)
778{
779 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
780 int i;
781
782 kvmppc_set_pvr(vcpu, sregs->pvr);
783
784 vcpu3s->sdr1 = sregs->u.s.sdr1;
785 if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
786 for (i = 0; i < 64; i++) {
787 vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
788 sregs->u.s.ppc64.slb[i].slbe);
789 }
790 } else {
791 for (i = 0; i < 16; i++) {
792 vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
793 }
794 for (i = 0; i < 8; i++) {
795 kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false,
796 (u32)sregs->u.s.ppc32.ibat[i]);
797 kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true,
798 (u32)(sregs->u.s.ppc32.ibat[i] >> 32));
799 kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false,
800 (u32)sregs->u.s.ppc32.dbat[i]);
801 kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true,
802 (u32)(sregs->u.s.ppc32.dbat[i] >> 32));
803 }
804 }
805
806 /* Flush the MMU after messing with the segments */
807 kvmppc_mmu_pte_flush(vcpu, 0, 0);
808
809 return 0;
810}
811
812int kvmppc_core_check_processor_compat(void)
813{
814 return 0;
815}
816
817struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
818{
819 struct kvmppc_vcpu_book3s *vcpu_book3s;
820 struct kvm_vcpu *vcpu;
821 int err = -ENOMEM;
822 unsigned long p;
823
824 vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
825 if (!vcpu_book3s)
826 goto out;
827
828 vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
829 kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
830 if (!vcpu_book3s->shadow_vcpu)
831 goto free_vcpu;
832
833 vcpu = &vcpu_book3s->vcpu;
834 err = kvm_vcpu_init(vcpu, kvm, id);
835 if (err)
836 goto free_shadow_vcpu;
837
838 p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
839 /* the real shared page fills the last 4k of our page */
840 vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096);
841 if (!p)
842 goto uninit_vcpu;
843
844 vcpu->arch.host_retip = kvm_return_point;
845 vcpu->arch.host_msr = mfmsr();
846#ifdef CONFIG_PPC_BOOK3S_64
847 /* default to book3s_64 (970fx) */
848 vcpu->arch.pvr = 0x3C0301;
849#else
850 /* default to book3s_32 (750) */
851 vcpu->arch.pvr = 0x84202;
852#endif
853 kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
854 vcpu->arch.slb_nr = 64;
855
856 /* remember where some real-mode handlers are */
857 vcpu->arch.trampoline_lowmem = __pa(kvmppc_handler_lowmem_trampoline);
858 vcpu->arch.trampoline_enter = __pa(kvmppc_handler_trampoline_enter);
859 vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
860#ifdef CONFIG_PPC_BOOK3S_64
861 vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
862#else
863 vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
864#endif
865
866 vcpu->arch.shadow_msr = MSR_USER64;
867
868 err = kvmppc_mmu_init(vcpu);
869 if (err < 0)
870 goto uninit_vcpu;
871
872 return vcpu;
873
874uninit_vcpu:
875 kvm_vcpu_uninit(vcpu);
876free_shadow_vcpu:
877 kfree(vcpu_book3s->shadow_vcpu);
878free_vcpu:
879 vfree(vcpu_book3s);
880out:
881 return ERR_PTR(err);
882}
883
884void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
885{
886 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
887
888 free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
889 kvm_vcpu_uninit(vcpu);
890 kfree(vcpu_book3s->shadow_vcpu);
891 vfree(vcpu_book3s);
892}
893
894int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
895{
896 int ret;
897 double fpr[32][TS_FPRWIDTH];
898 unsigned int fpscr;
899 int fpexc_mode;
900#ifdef CONFIG_ALTIVEC
901 vector128 vr[32];
902 vector128 vscr;
903 unsigned long uninitialized_var(vrsave);
904 int used_vr;
905#endif
906#ifdef CONFIG_VSX
907 int used_vsr;
908#endif
909 ulong ext_msr;
910
911 /* No need to go into the guest when all we do is going out */
912 if (signal_pending(current)) {
913 kvm_run->exit_reason = KVM_EXIT_INTR;
914 return -EINTR;
915 }
916
917 /* Save FPU state in stack */
918 if (current->thread.regs->msr & MSR_FP)
919 giveup_fpu(current);
920 memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
921 fpscr = current->thread.fpscr.val;
922 fpexc_mode = current->thread.fpexc_mode;
923
924#ifdef CONFIG_ALTIVEC
925 /* Save Altivec state in stack */
926 used_vr = current->thread.used_vr;
927 if (used_vr) {
928 if (current->thread.regs->msr & MSR_VEC)
929 giveup_altivec(current);
930 memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
931 vscr = current->thread.vscr;
932 vrsave = current->thread.vrsave;
933 }
934#endif
935
936#ifdef CONFIG_VSX
937 /* Save VSX state in stack */
938 used_vsr = current->thread.used_vsr;
939 if (used_vsr && (current->thread.regs->msr & MSR_VSX))
940 __giveup_vsx(current);
941#endif
942
943 /* Remember the MSR with disabled extensions */
944 ext_msr = current->thread.regs->msr;
945
946 /* Preload FPU if it's enabled */
947 if (vcpu->arch.shared->msr & MSR_FP)
948 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
949
950 kvm_guest_enter();
951
952 ret = __kvmppc_vcpu_run(kvm_run, vcpu);
953
954 kvm_guest_exit();
955
956 local_irq_disable();
957
958 current->thread.regs->msr = ext_msr;
959
960 /* Make sure we save the guest FPU/Altivec/VSX state */
961 kvmppc_giveup_ext(vcpu, MSR_FP);
962 kvmppc_giveup_ext(vcpu, MSR_VEC);
963 kvmppc_giveup_ext(vcpu, MSR_VSX);
964
965 /* Restore FPU state from stack */
966 memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
967 current->thread.fpscr.val = fpscr;
968 current->thread.fpexc_mode = fpexc_mode;
969
970#ifdef CONFIG_ALTIVEC
971 /* Restore Altivec state from stack */
972 if (used_vr && current->thread.used_vr) {
973 memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
974 current->thread.vscr = vscr;
975 current->thread.vrsave = vrsave;
976 }
977 current->thread.used_vr = used_vr;
978#endif
979
980#ifdef CONFIG_VSX
981 current->thread.used_vsr = used_vsr;
982#endif
983
984 return ret;
985}
986
987int kvmppc_core_prepare_memory_region(struct kvm *kvm,
988 struct kvm_userspace_memory_region *mem)
989{
990 return 0;
991}
992
993void kvmppc_core_commit_memory_region(struct kvm *kvm,
994 struct kvm_userspace_memory_region *mem)
995{
996}
997
998int kvmppc_core_init_vm(struct kvm *kvm)
999{
1000 return 0;
1001}
1002
1003void kvmppc_core_destroy_vm(struct kvm *kvm)
1004{
1005}
1006
1007static int kvmppc_book3s_init(void)
1008{
1009 int r;
1010
1011 r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
1012 THIS_MODULE);
1013
1014 if (r)
1015 return r;
1016
1017 r = kvmppc_mmu_hpte_sysinit();
1018
1019 return r;
1020}
1021
1022static void kvmppc_book3s_exit(void)
1023{
1024 kvmppc_mmu_hpte_sysexit();
1025 kvm_exit();
1026}
1027
1028module_init(kvmppc_book3s_init);
1029module_exit(kvmppc_book3s_exit);
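
A standalone C sketch of the lazy "external provider" protocol that kvmppc_handle_ext() and kvmppc_giveup_ext() implement in book3s_pr.c above: a facility (FP/VEC/VSX) is loaded into the real registers only on first guest use, tracked in guest_owned_ext, and flushed back when the host reclaims it. Not kernel code; the MSR bit encodings and the demo structures are illustrative.

/*
 * Standalone sketch (not kernel code) of lazy facility ownership:
 * take ownership on first guest use, stay cheap on repeated use,
 * inject a fault if the guest never enabled the facility, and save
 * state back only when the host needs the registers again.
 */
#include <stdio.h>

#define MSR_FP  0x1
#define MSR_VEC 0x2
#define MSR_VSX 0x4            /* illustrative encodings */

struct demo_vcpu {
	unsigned long guest_msr;        /* what the guest thinks it enabled */
	unsigned long guest_owned_ext;  /* facilities currently live in HW */
};

/* guest touched a facility: either fault into the guest or take ownership */
static const char *handle_ext(struct demo_vcpu *v, unsigned long msr_bit)
{
	if (!(v->guest_msr & msr_bit))
		return "inject facility-unavailable into guest";
	if (v->guest_owned_ext & msr_bit)
		return "already owned, resume guest";
	v->guest_owned_ext |= msr_bit;          /* load guest regs into HW */
	return "loaded guest state, resume guest";
}

/* host needs the facility back (vcpu_put, heavyweight exit, ...) */
static void giveup_ext(struct demo_vcpu *v, unsigned long msr_bit)
{
	if (!(v->guest_owned_ext & msr_bit))
		return;                         /* nothing to save */
	v->guest_owned_ext &= ~msr_bit;         /* flush HW regs to vcpu */
}

int main(void)
{
	struct demo_vcpu v = { .guest_msr = MSR_FP };

	printf("%s\n", handle_ext(&v, MSR_FP));   /* first use: load */
	printf("%s\n", handle_ext(&v, MSR_FP));   /* second use: cheap */
	printf("%s\n", handle_ext(&v, MSR_VEC));  /* guest MSR[VEC]=0: fault */
	giveup_ext(&v, MSR_FP);
	printf("owned after giveup: %#lx\n", v.guest_owned_ext);
	return 0;
}
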
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 1a1b34487e71..c1f877c4a884 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -36,41 +36,44 @@
36#if defined(CONFIG_PPC_BOOK3S_64) 36#if defined(CONFIG_PPC_BOOK3S_64)
37 37
38#define LOAD_SHADOW_VCPU(reg) GET_PACA(reg) 38#define LOAD_SHADOW_VCPU(reg) GET_PACA(reg)
39#define SHADOW_VCPU_OFF PACA_KVM_SVCPU
40#define MSR_NOIRQ MSR_KERNEL & ~(MSR_IR | MSR_DR) 39#define MSR_NOIRQ MSR_KERNEL & ~(MSR_IR | MSR_DR)
41#define FUNC(name) GLUE(.,name) 40#define FUNC(name) GLUE(.,name)
42 41
42kvmppc_skip_interrupt:
43 /*
44 * Here all GPRs are unchanged from when the interrupt happened
45 * except for r13, which is saved in SPRG_SCRATCH0.
46 */
47 mfspr r13, SPRN_SRR0
48 addi r13, r13, 4
49 mtspr SPRN_SRR0, r13
50 GET_SCRATCH0(r13)
51 rfid
52 b .
53
54kvmppc_skip_Hinterrupt:
55 /*
56 * Here all GPRs are unchanged from when the interrupt happened
57 * except for r13, which is saved in SPRG_SCRATCH0.
58 */
59 mfspr r13, SPRN_HSRR0
60 addi r13, r13, 4
61 mtspr SPRN_HSRR0, r13
62 GET_SCRATCH0(r13)
63 hrfid
64 b .
65
43#elif defined(CONFIG_PPC_BOOK3S_32) 66#elif defined(CONFIG_PPC_BOOK3S_32)
44 67
45#define LOAD_SHADOW_VCPU(reg) \
46 mfspr reg, SPRN_SPRG_THREAD; \
47 lwz reg, THREAD_KVM_SVCPU(reg); \
48 /* PPC32 can have a NULL pointer - let's check for that */ \
49 mtspr SPRN_SPRG_SCRATCH1, r12; /* Save r12 */ \
50 mfcr r12; \
51 cmpwi reg, 0; \
52 bne 1f; \
53 mfspr reg, SPRN_SPRG_SCRATCH0; \
54 mtcr r12; \
55 mfspr r12, SPRN_SPRG_SCRATCH1; \
56 b kvmppc_resume_\intno; \
571:; \
58 mtcr r12; \
59 mfspr r12, SPRN_SPRG_SCRATCH1; \
60 tophys(reg, reg)
61
62#define SHADOW_VCPU_OFF 0
63#define MSR_NOIRQ MSR_KERNEL 68#define MSR_NOIRQ MSR_KERNEL
64#define FUNC(name) name 69#define FUNC(name) name
65 70
66#endif
67
68.macro INTERRUPT_TRAMPOLINE intno 71.macro INTERRUPT_TRAMPOLINE intno
69 72
70.global kvmppc_trampoline_\intno 73.global kvmppc_trampoline_\intno
71kvmppc_trampoline_\intno: 74kvmppc_trampoline_\intno:
72 75
73 SET_SCRATCH0(r13) /* Save r13 */ 76 mtspr SPRN_SPRG_SCRATCH0, r13 /* Save r13 */
74 77
75 /* 78 /*
76 * First thing to do is to find out if we're coming 79 * First thing to do is to find out if we're coming
@@ -78,19 +81,28 @@ kvmppc_trampoline_\intno:
78 * 81 *
79 * To distinguish, we check a magic byte in the PACA/current 82 * To distinguish, we check a magic byte in the PACA/current
80 */ 83 */
81 LOAD_SHADOW_VCPU(r13) 84 mfspr r13, SPRN_SPRG_THREAD
82 PPC_STL r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) 85 lwz r13, THREAD_KVM_SVCPU(r13)
86 /* PPC32 can have a NULL pointer - let's check for that */
87 mtspr SPRN_SPRG_SCRATCH1, r12 /* Save r12 */
83 mfcr r12 88 mfcr r12
84 stw r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) 89 cmpwi r13, 0
85 lbz r12, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13) 90 bne 1f
912: mtcr r12
92 mfspr r12, SPRN_SPRG_SCRATCH1
93 mfspr r13, SPRN_SPRG_SCRATCH0 /* r13 = original r13 */
94 b kvmppc_resume_\intno /* Get back original handler */
95
961: tophys(r13, r13)
97 stw r12, HSTATE_SCRATCH1(r13)
98 mfspr r12, SPRN_SPRG_SCRATCH1
99 stw r12, HSTATE_SCRATCH0(r13)
100 lbz r12, HSTATE_IN_GUEST(r13)
86 cmpwi r12, KVM_GUEST_MODE_NONE 101 cmpwi r12, KVM_GUEST_MODE_NONE
87 bne ..kvmppc_handler_hasmagic_\intno 102 bne ..kvmppc_handler_hasmagic_\intno
88 /* No KVM guest? Then jump back to the Linux handler! */ 103 /* No KVM guest? Then jump back to the Linux handler! */
89 lwz r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) 104 lwz r12, HSTATE_SCRATCH1(r13)
90 mtcr r12 105 b 2b
91 PPC_LL r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
92 GET_SCRATCH0(r13) /* r13 = original r13 */
93 b kvmppc_resume_\intno /* Get back original handler */
94 106
95 /* Now we know we're handling a KVM guest */ 107 /* Now we know we're handling a KVM guest */
96..kvmppc_handler_hasmagic_\intno: 108..kvmppc_handler_hasmagic_\intno:
@@ -112,9 +124,6 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_MACHINE_CHECK
112INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_STORAGE 124INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_STORAGE
113INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_STORAGE 125INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_STORAGE
114INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_EXTERNAL 126INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_EXTERNAL
115#ifdef CONFIG_PPC_BOOK3S_64
116INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_EXTERNAL_HV
117#endif
118INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALIGNMENT 127INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALIGNMENT
119INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PROGRAM 128INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PROGRAM
120INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_FP_UNAVAIL 129INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_FP_UNAVAIL
@@ -124,14 +133,6 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_TRACE
124INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PERFMON 133INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PERFMON
125INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALTIVEC 134INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALTIVEC
126 135
127/* Those are only available on 64 bit machines */
128
129#ifdef CONFIG_PPC_BOOK3S_64
130INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_SEGMENT
131INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_SEGMENT
132INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX
133#endif
134
135/* 136/*
136 * Bring us back to the faulting code, but skip the 137 * Bring us back to the faulting code, but skip the
137 * faulting instruction. 138 * faulting instruction.
@@ -143,8 +144,8 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX
143 * 144 *
144 * R12 = free 145 * R12 = free
145 * R13 = Shadow VCPU (PACA) 146 * R13 = Shadow VCPU (PACA)
146 * SVCPU.SCRATCH0 = guest R12 147 * HSTATE.SCRATCH0 = guest R12
147 * SVCPU.SCRATCH1 = guest CR 148 * HSTATE.SCRATCH1 = guest CR
148 * SPRG_SCRATCH0 = guest R13 149 * SPRG_SCRATCH0 = guest R13
149 * 150 *
150 */ 151 */
@@ -156,13 +157,14 @@ kvmppc_handler_skip_ins:
156 mtsrr0 r12 157 mtsrr0 r12
157 158
158 /* Clean up all state */ 159 /* Clean up all state */
159 lwz r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) 160 lwz r12, HSTATE_SCRATCH1(r13)
160 mtcr r12 161 mtcr r12
161 PPC_LL r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) 162 PPC_LL r12, HSTATE_SCRATCH0(r13)
162 GET_SCRATCH0(r13) 163 GET_SCRATCH0(r13)
163 164
164 /* And get back into the code */ 165 /* And get back into the code */
165 RFI 166 RFI
167#endif
166 168
167/* 169/*
168 * This trampoline brings us back to a real mode handler 170 * This trampoline brings us back to a real mode handler
@@ -251,12 +253,4 @@ define_load_up(altivec)
251define_load_up(vsx) 253define_load_up(vsx)
252#endif 254#endif
253 255
254.global kvmppc_trampoline_lowmem
255kvmppc_trampoline_lowmem:
256 PPC_LONG kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START
257
258.global kvmppc_trampoline_enter
259kvmppc_trampoline_enter:
260 PPC_LONG kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START
261
262#include "book3s_segment.S" 256#include "book3s_segment.S"
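
A standalone C rendering of the decision the rewritten 32-bit trampoline above makes in assembly: fetch the per-thread shadow-vcpu pointer, and divert to KVM only when it exists and is marked as in guest mode; otherwise restore the scratch registers and fall back to the regular Linux handler. The enum values and helper names below are illustrative stand-ins, not the real symbols.

/*
 * Standalone sketch (not kernel code) of the trampoline dispatch:
 * no shadow vcpu, or a shadow vcpu not in guest mode, means the
 * interrupt belongs to Linux; only a guest-mode shadow vcpu is
 * handed to the KVM exit path.
 */
#include <stdio.h>

enum guest_mode { GUEST_MODE_NONE, GUEST_MODE_GUEST, GUEST_MODE_SKIP };

struct shadow_vcpu { enum guest_mode in_guest; };

static const char *trampoline(struct shadow_vcpu *thread_svcpu)
{
	if (!thread_svcpu)                      /* PPC32 may have no svcpu */
		return "resume original Linux handler";
	if (thread_svcpu->in_guest == GUEST_MODE_NONE)
		return "no KVM guest: resume original Linux handler";
	return "KVM guest context: enter KVM exit handler";
}

int main(void)
{
	struct shadow_vcpu in_guest = { .in_guest = GUEST_MODE_GUEST };

	printf("%s\n", trampoline(NULL));
	printf("%s\n", trampoline(&in_guest));
	return 0;
}
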
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 451264274b8c..aed32e517212 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -22,7 +22,7 @@
22#if defined(CONFIG_PPC_BOOK3S_64) 22#if defined(CONFIG_PPC_BOOK3S_64)
23 23
24#define GET_SHADOW_VCPU(reg) \ 24#define GET_SHADOW_VCPU(reg) \
25 addi reg, r13, PACA_KVM_SVCPU 25 mr reg, r13
26 26
27#elif defined(CONFIG_PPC_BOOK3S_32) 27#elif defined(CONFIG_PPC_BOOK3S_32)
28 28
@@ -71,6 +71,10 @@ kvmppc_handler_trampoline_enter:
71 /* r3 = shadow vcpu */ 71 /* r3 = shadow vcpu */
72 GET_SHADOW_VCPU(r3) 72 GET_SHADOW_VCPU(r3)
73 73
74 /* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */
75 PPC_STL r1, HSTATE_HOST_R1(r3)
76 PPC_STL r2, HSTATE_HOST_R2(r3)
77
74 /* Move SRR0 and SRR1 into the respective regs */ 78 /* Move SRR0 and SRR1 into the respective regs */
75 PPC_LL r9, SVCPU_PC(r3) 79 PPC_LL r9, SVCPU_PC(r3)
76 mtsrr0 r9 80 mtsrr0 r9
@@ -78,36 +82,36 @@ kvmppc_handler_trampoline_enter:
78 82
79 /* Activate guest mode, so faults get handled by KVM */ 83 /* Activate guest mode, so faults get handled by KVM */
80 li r11, KVM_GUEST_MODE_GUEST 84 li r11, KVM_GUEST_MODE_GUEST
81 stb r11, SVCPU_IN_GUEST(r3) 85 stb r11, HSTATE_IN_GUEST(r3)
82 86
83 /* Switch to guest segment. This is subarch specific. */ 87 /* Switch to guest segment. This is subarch specific. */
84 LOAD_GUEST_SEGMENTS 88 LOAD_GUEST_SEGMENTS
85 89
86 /* Enter guest */ 90 /* Enter guest */
87 91
88 PPC_LL r4, (SVCPU_CTR)(r3) 92 PPC_LL r4, SVCPU_CTR(r3)
89 PPC_LL r5, (SVCPU_LR)(r3) 93 PPC_LL r5, SVCPU_LR(r3)
90 lwz r6, (SVCPU_CR)(r3) 94 lwz r6, SVCPU_CR(r3)
91 lwz r7, (SVCPU_XER)(r3) 95 lwz r7, SVCPU_XER(r3)
92 96
93 mtctr r4 97 mtctr r4
94 mtlr r5 98 mtlr r5
95 mtcr r6 99 mtcr r6
96 mtxer r7 100 mtxer r7
97 101
98 PPC_LL r0, (SVCPU_R0)(r3) 102 PPC_LL r0, SVCPU_R0(r3)
99 PPC_LL r1, (SVCPU_R1)(r3) 103 PPC_LL r1, SVCPU_R1(r3)
100 PPC_LL r2, (SVCPU_R2)(r3) 104 PPC_LL r2, SVCPU_R2(r3)
101 PPC_LL r4, (SVCPU_R4)(r3) 105 PPC_LL r4, SVCPU_R4(r3)
102 PPC_LL r5, (SVCPU_R5)(r3) 106 PPC_LL r5, SVCPU_R5(r3)
103 PPC_LL r6, (SVCPU_R6)(r3) 107 PPC_LL r6, SVCPU_R6(r3)
104 PPC_LL r7, (SVCPU_R7)(r3) 108 PPC_LL r7, SVCPU_R7(r3)
105 PPC_LL r8, (SVCPU_R8)(r3) 109 PPC_LL r8, SVCPU_R8(r3)
106 PPC_LL r9, (SVCPU_R9)(r3) 110 PPC_LL r9, SVCPU_R9(r3)
107 PPC_LL r10, (SVCPU_R10)(r3) 111 PPC_LL r10, SVCPU_R10(r3)
108 PPC_LL r11, (SVCPU_R11)(r3) 112 PPC_LL r11, SVCPU_R11(r3)
109 PPC_LL r12, (SVCPU_R12)(r3) 113 PPC_LL r12, SVCPU_R12(r3)
110 PPC_LL r13, (SVCPU_R13)(r3) 114 PPC_LL r13, SVCPU_R13(r3)
111 115
112 PPC_LL r3, (SVCPU_R3)(r3) 116 PPC_LL r3, (SVCPU_R3)(r3)
113 117
@@ -125,56 +129,63 @@ kvmppc_handler_trampoline_enter_end:
125.global kvmppc_handler_trampoline_exit 129.global kvmppc_handler_trampoline_exit
126kvmppc_handler_trampoline_exit: 130kvmppc_handler_trampoline_exit:
127 131
132.global kvmppc_interrupt
133kvmppc_interrupt:
134
128 /* Register usage at this point: 135 /* Register usage at this point:
129 * 136 *
130 * SPRG_SCRATCH0 = guest R13 137 * SPRG_SCRATCH0 = guest R13
131 * R12 = exit handler id 138 * R12 = exit handler id
132 * R13 = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64] 139 * R13 = shadow vcpu (32-bit) or PACA (64-bit)
133 * SVCPU.SCRATCH0 = guest R12 140 * HSTATE.SCRATCH0 = guest R12
134 * SVCPU.SCRATCH1 = guest CR 141 * HSTATE.SCRATCH1 = guest CR
135 * 142 *
136 */ 143 */
137 144
138 /* Save registers */ 145 /* Save registers */
139 146
140 PPC_STL r0, (SHADOW_VCPU_OFF + SVCPU_R0)(r13) 147 PPC_STL r0, SVCPU_R0(r13)
141 PPC_STL r1, (SHADOW_VCPU_OFF + SVCPU_R1)(r13) 148 PPC_STL r1, SVCPU_R1(r13)
142 PPC_STL r2, (SHADOW_VCPU_OFF + SVCPU_R2)(r13) 149 PPC_STL r2, SVCPU_R2(r13)
143 PPC_STL r3, (SHADOW_VCPU_OFF + SVCPU_R3)(r13) 150 PPC_STL r3, SVCPU_R3(r13)
144 PPC_STL r4, (SHADOW_VCPU_OFF + SVCPU_R4)(r13) 151 PPC_STL r4, SVCPU_R4(r13)
145 PPC_STL r5, (SHADOW_VCPU_OFF + SVCPU_R5)(r13) 152 PPC_STL r5, SVCPU_R5(r13)
146 PPC_STL r6, (SHADOW_VCPU_OFF + SVCPU_R6)(r13) 153 PPC_STL r6, SVCPU_R6(r13)
147 PPC_STL r7, (SHADOW_VCPU_OFF + SVCPU_R7)(r13) 154 PPC_STL r7, SVCPU_R7(r13)
148 PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_R8)(r13) 155 PPC_STL r8, SVCPU_R8(r13)
149 PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_R9)(r13) 156 PPC_STL r9, SVCPU_R9(r13)
150 PPC_STL r10, (SHADOW_VCPU_OFF + SVCPU_R10)(r13) 157 PPC_STL r10, SVCPU_R10(r13)
151 PPC_STL r11, (SHADOW_VCPU_OFF + SVCPU_R11)(r13) 158 PPC_STL r11, SVCPU_R11(r13)
152 159
153 /* Restore R1/R2 so we can handle faults */ 160 /* Restore R1/R2 so we can handle faults */
154 PPC_LL r1, (SHADOW_VCPU_OFF + SVCPU_HOST_R1)(r13) 161 PPC_LL r1, HSTATE_HOST_R1(r13)
155 PPC_LL r2, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13) 162 PPC_LL r2, HSTATE_HOST_R2(r13)
156 163
157 /* Save guest PC and MSR */ 164 /* Save guest PC and MSR */
165#ifdef CONFIG_PPC64
166BEGIN_FTR_SECTION
158 andi. r0,r12,0x2 167 andi. r0,r12,0x2
159 beq 1f 168 beq 1f
160 mfspr r3,SPRN_HSRR0 169 mfspr r3,SPRN_HSRR0
161 mfspr r4,SPRN_HSRR1 170 mfspr r4,SPRN_HSRR1
162 andi. r12,r12,0x3ffd 171 andi. r12,r12,0x3ffd
163 b 2f 172 b 2f
173END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
174#endif
1641: mfsrr0 r3 1751: mfsrr0 r3
165 mfsrr1 r4 176 mfsrr1 r4
1662: 1772:
167 PPC_STL r3, (SHADOW_VCPU_OFF + SVCPU_PC)(r13) 178 PPC_STL r3, SVCPU_PC(r13)
168 PPC_STL r4, (SHADOW_VCPU_OFF + SVCPU_SHADOW_SRR1)(r13) 179 PPC_STL r4, SVCPU_SHADOW_SRR1(r13)
169 180
170 /* Get scratch'ed off registers */ 181 /* Get scratch'ed off registers */
171 GET_SCRATCH0(r9) 182 GET_SCRATCH0(r9)
172 PPC_LL r8, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) 183 PPC_LL r8, HSTATE_SCRATCH0(r13)
173 lwz r7, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) 184 lwz r7, HSTATE_SCRATCH1(r13)
174 185
175 PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_R13)(r13) 186 PPC_STL r9, SVCPU_R13(r13)
176 PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_R12)(r13) 187 PPC_STL r8, SVCPU_R12(r13)
177 stw r7, (SHADOW_VCPU_OFF + SVCPU_CR)(r13) 188 stw r7, SVCPU_CR(r13)
178 189
179 /* Save more register state */ 190 /* Save more register state */
180 191
@@ -184,11 +195,11 @@ kvmppc_handler_trampoline_exit:
184 mfctr r8 195 mfctr r8
185 mflr r9 196 mflr r9
186 197
187 stw r5, (SHADOW_VCPU_OFF + SVCPU_XER)(r13) 198 stw r5, SVCPU_XER(r13)
188 PPC_STL r6, (SHADOW_VCPU_OFF + SVCPU_FAULT_DAR)(r13) 199 PPC_STL r6, SVCPU_FAULT_DAR(r13)
189 stw r7, (SHADOW_VCPU_OFF + SVCPU_FAULT_DSISR)(r13) 200 stw r7, SVCPU_FAULT_DSISR(r13)
190 PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_CTR)(r13) 201 PPC_STL r8, SVCPU_CTR(r13)
191 PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_LR)(r13) 202 PPC_STL r9, SVCPU_LR(r13)
192 203
193 /* 204 /*
194 * In order for us to easily get the last instruction, 205 * In order for us to easily get the last instruction,
@@ -218,7 +229,7 @@ ld_last_inst:
218 /* Set guest mode to 'jump over instruction' so if lwz faults 229 /* Set guest mode to 'jump over instruction' so if lwz faults
219 * we'll just continue at the next IP. */ 230 * we'll just continue at the next IP. */
220 li r9, KVM_GUEST_MODE_SKIP 231 li r9, KVM_GUEST_MODE_SKIP
221 stb r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13) 232 stb r9, HSTATE_IN_GUEST(r13)
222 233
223 /* 1) enable paging for data */ 234 /* 1) enable paging for data */
224 mfmsr r9 235 mfmsr r9
@@ -232,13 +243,13 @@ ld_last_inst:
232 sync 243 sync
233 244
234#endif 245#endif
235 stw r0, (SHADOW_VCPU_OFF + SVCPU_LAST_INST)(r13) 246 stw r0, SVCPU_LAST_INST(r13)
236 247
237no_ld_last_inst: 248no_ld_last_inst:
238 249
239 /* Unset guest mode */ 250 /* Unset guest mode */
240 li r9, KVM_GUEST_MODE_NONE 251 li r9, KVM_GUEST_MODE_NONE
241 stb r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13) 252 stb r9, HSTATE_IN_GUEST(r13)
242 253
243 /* Switch back to host MMU */ 254 /* Switch back to host MMU */
244 LOAD_HOST_SEGMENTS 255 LOAD_HOST_SEGMENTS
@@ -248,7 +259,7 @@ no_ld_last_inst:
248 * R1 = host R1 259 * R1 = host R1
249 * R2 = host R2 260 * R2 = host R2
250 * R12 = exit handler id 261 * R12 = exit handler id
251 * R13 = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64] 262 * R13 = shadow vcpu (32-bit) or PACA (64-bit)
252 * SVCPU.* = guest * 263 * SVCPU.* = guest *
253 * 264 *
254 */ 265 */
@@ -258,7 +269,7 @@ no_ld_last_inst:
258 ori r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME /* Enable paging */ 269 ori r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME /* Enable paging */
259 mtsrr1 r7 270 mtsrr1 r7
260 /* Load highmem handler address */ 271 /* Load highmem handler address */
261 PPC_LL r8, (SHADOW_VCPU_OFF + SVCPU_VMHANDLER)(r13) 272 PPC_LL r8, HSTATE_VMHANDLER(r13)
262 mtsrr0 r8 273 mtsrr0 r8
263 274
264 RFI 275 RFI
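
A standalone C sketch of the world switch book3s_segment.S performs in assembly: on entry the host R1/R2 go into the per-CPU host state (HSTATE) and the guest GPRs are loaded from the shadow vcpu; on exit the guest GPRs are written back and host R1/R2 restored before jumping to the highmem handler. The four-register machine below is purely illustrative.

/*
 * Standalone sketch (not kernel code) of the guest entry/exit
 * register swap: stash host stack/TOC, load guest GPRs, and undo
 * it in the other direction on the way out.
 */
#include <stdio.h>
#include <string.h>

#define NGPR 4                          /* toy register file */

struct hstate { unsigned long host_r1, host_r2; };
struct svcpu  { unsigned long gpr[NGPR]; };
struct cpu    { unsigned long gpr[NGPR]; };

static void enter_guest(struct cpu *c, struct hstate *h, struct svcpu *s)
{
	h->host_r1 = c->gpr[1];                 /* save host stack/toc */
	h->host_r2 = c->gpr[2];
	memcpy(c->gpr, s->gpr, sizeof(c->gpr)); /* load guest registers */
}

static void exit_guest(struct cpu *c, struct hstate *h, struct svcpu *s)
{
	memcpy(s->gpr, c->gpr, sizeof(s->gpr)); /* save guest registers */
	c->gpr[1] = h->host_r1;                 /* restore host R1/R2 */
	c->gpr[2] = h->host_r2;
}

int main(void)
{
	struct cpu cpu = { .gpr = { 0, 0x1000, 0x2000, 0 } };
	struct hstate hstate = { 0, 0 };
	struct svcpu svcpu = { .gpr = { 7, 8, 9, 10 } };

	enter_guest(&cpu, &hstate, &svcpu);
	cpu.gpr[3] = 42;                        /* guest runs, changes r3 */
	exit_guest(&cpu, &hstate, &svcpu);
	printf("guest r3 saved: %lu, host r1 back: %#lx\n",
	       svcpu.gpr[3], cpu.gpr[1]);
	return 0;
}
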
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 8462b3a1c1c7..ee45fa01220e 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -13,6 +13,7 @@
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 * 14 *
15 * Copyright IBM Corp. 2007 15 * Copyright IBM Corp. 2007
16 * Copyright 2010-2011 Freescale Semiconductor, Inc.
16 * 17 *
17 * Authors: Hollis Blanchard <hollisb@us.ibm.com> 18 * Authors: Hollis Blanchard <hollisb@us.ibm.com>
18 * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> 19 * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
@@ -78,6 +79,60 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
78 } 79 }
79} 80}
80 81
82#ifdef CONFIG_SPE
83void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu)
84{
85 preempt_disable();
86 enable_kernel_spe();
87 kvmppc_save_guest_spe(vcpu);
88 vcpu->arch.shadow_msr &= ~MSR_SPE;
89 preempt_enable();
90}
91
92static void kvmppc_vcpu_enable_spe(struct kvm_vcpu *vcpu)
93{
94 preempt_disable();
95 enable_kernel_spe();
96 kvmppc_load_guest_spe(vcpu);
97 vcpu->arch.shadow_msr |= MSR_SPE;
98 preempt_enable();
99}
100
101static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
102{
103 if (vcpu->arch.shared->msr & MSR_SPE) {
104 if (!(vcpu->arch.shadow_msr & MSR_SPE))
105 kvmppc_vcpu_enable_spe(vcpu);
106 } else if (vcpu->arch.shadow_msr & MSR_SPE) {
107 kvmppc_vcpu_disable_spe(vcpu);
108 }
109}
110#else
111static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
112{
113}
114#endif
115
116/*
117 * Helper function for "full" MSR writes. No need to call this if only
118 * EE/CE/ME/DE/RI are changing.
119 */
120void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
121{
122 u32 old_msr = vcpu->arch.shared->msr;
123
124 vcpu->arch.shared->msr = new_msr;
125
126 kvmppc_mmu_msr_notify(vcpu, old_msr);
127
128 if (vcpu->arch.shared->msr & MSR_WE) {
129 kvm_vcpu_block(vcpu);
130 kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
131 };
132
133 kvmppc_vcpu_sync_spe(vcpu);
134}
135
81static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, 136static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
82 unsigned int priority) 137 unsigned int priority)
83{ 138{
@@ -257,6 +312,19 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
257 vcpu->arch.shared->int_pending = 0; 312 vcpu->arch.shared->int_pending = 0;
258} 313}
259 314
315int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
316{
317 int ret;
318
319 local_irq_disable();
320 kvm_guest_enter();
321 ret = __kvmppc_vcpu_run(kvm_run, vcpu);
322 kvm_guest_exit();
323 local_irq_enable();
324
325 return ret;
326}
327
260/** 328/**
261 * kvmppc_handle_exit 329 * kvmppc_handle_exit
262 * 330 *
@@ -344,10 +412,16 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
344 r = RESUME_GUEST; 412 r = RESUME_GUEST;
345 break; 413 break;
346 414
347 case BOOKE_INTERRUPT_SPE_UNAVAIL: 415#ifdef CONFIG_SPE
348 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_UNAVAIL); 416 case BOOKE_INTERRUPT_SPE_UNAVAIL: {
417 if (vcpu->arch.shared->msr & MSR_SPE)
418 kvmppc_vcpu_enable_spe(vcpu);
419 else
420 kvmppc_booke_queue_irqprio(vcpu,
421 BOOKE_IRQPRIO_SPE_UNAVAIL);
349 r = RESUME_GUEST; 422 r = RESUME_GUEST;
350 break; 423 break;
424 }
351 425
352 case BOOKE_INTERRUPT_SPE_FP_DATA: 426 case BOOKE_INTERRUPT_SPE_FP_DATA:
353 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA); 427 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA);
@@ -358,6 +432,28 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
358 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND); 432 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND);
359 r = RESUME_GUEST; 433 r = RESUME_GUEST;
360 break; 434 break;
435#else
436 case BOOKE_INTERRUPT_SPE_UNAVAIL:
437 /*
438 * Guest wants SPE, but host kernel doesn't support it. Send
439 * an "unimplemented operation" program check to the guest.
440 */
441 kvmppc_core_queue_program(vcpu, ESR_PUO | ESR_SPV);
442 r = RESUME_GUEST;
443 break;
444
445 /*
446 * These really should never happen without CONFIG_SPE,
447 * as we should never enable the real MSR[SPE] in the guest.
448 */
449 case BOOKE_INTERRUPT_SPE_FP_DATA:
450 case BOOKE_INTERRUPT_SPE_FP_ROUND:
451 printk(KERN_CRIT "%s: unexpected SPE interrupt %u at %08lx\n",
452 __func__, exit_nr, vcpu->arch.pc);
453 run->hw.hardware_exit_reason = exit_nr;
454 r = RESUME_HOST;
455 break;
456#endif
361 457
362 case BOOKE_INTERRUPT_DATA_STORAGE: 458 case BOOKE_INTERRUPT_DATA_STORAGE:
363 kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear, 459 kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear,
@@ -392,6 +488,17 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
392 gpa_t gpaddr; 488 gpa_t gpaddr;
393 gfn_t gfn; 489 gfn_t gfn;
394 490
491#ifdef CONFIG_KVM_E500
492 if (!(vcpu->arch.shared->msr & MSR_PR) &&
493 (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) {
494 kvmppc_map_magic(vcpu);
495 kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
496 r = RESUME_GUEST;
497
498 break;
499 }
500#endif
501
395 /* Check the guest TLB. */ 502 /* Check the guest TLB. */
396 gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr); 503 gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr);
397 if (gtlb_index < 0) { 504 if (gtlb_index < 0) {
@@ -514,6 +621,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
514 621
515 vcpu->arch.pc = 0; 622 vcpu->arch.pc = 0;
516 vcpu->arch.shared->msr = 0; 623 vcpu->arch.shared->msr = 0;
624 vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS;
517 kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ 625 kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
518 626
519 vcpu->arch.shadow_pid = 1; 627 vcpu->arch.shadow_pid = 1;
@@ -770,6 +878,26 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
770 return -ENOTSUPP; 878 return -ENOTSUPP;
771} 879}
772 880
881int kvmppc_core_prepare_memory_region(struct kvm *kvm,
882 struct kvm_userspace_memory_region *mem)
883{
884 return 0;
885}
886
887void kvmppc_core_commit_memory_region(struct kvm *kvm,
888 struct kvm_userspace_memory_region *mem)
889{
890}
891
892int kvmppc_core_init_vm(struct kvm *kvm)
893{
894 return 0;
895}
896
897void kvmppc_core_destroy_vm(struct kvm *kvm)
898{
899}
900
773int __init kvmppc_booke_init(void) 901int __init kvmppc_booke_init(void)
774{ 902{
775 unsigned long ivor[16]; 903 unsigned long ivor[16];
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index 492bb7030358..8e1fe33d64e5 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -52,24 +52,19 @@
52 52
53extern unsigned long kvmppc_booke_handlers; 53extern unsigned long kvmppc_booke_handlers;
54 54
55/* Helper function for "full" MSR writes. No need to call this if only EE is 55void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
56 * changing. */ 56void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);
57static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
58{
59 if ((new_msr & MSR_PR) != (vcpu->arch.shared->msr & MSR_PR))
60 kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
61
62 vcpu->arch.shared->msr = new_msr;
63
64 if (vcpu->arch.shared->msr & MSR_WE) {
65 kvm_vcpu_block(vcpu);
66 kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
67 };
68}
69 57
70int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, 58int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
71 unsigned int inst, int *advance); 59 unsigned int inst, int *advance);
72int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); 60int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt);
73int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs); 61int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs);
74 62
63/* low-level asm code to transfer guest state */
64void kvmppc_load_guest_spe(struct kvm_vcpu *vcpu);
65void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu);
66
67/* high-level function, manages flags, host state */
68void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu);
69
75#endif /* __KVM_BOOKE_H__ */ 70#endif /* __KVM_BOOKE_H__ */
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index b58ccae95904..42f2fb1f66e9 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -13,6 +13,7 @@
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 * 14 *
15 * Copyright IBM Corp. 2007 15 * Copyright IBM Corp. 2007
16 * Copyright 2011 Freescale Semiconductor, Inc.
16 * 17 *
17 * Authors: Hollis Blanchard <hollisb@us.ibm.com> 18 * Authors: Hollis Blanchard <hollisb@us.ibm.com>
18 */ 19 */
@@ -24,8 +25,6 @@
24#include <asm/page.h> 25#include <asm/page.h>
25#include <asm/asm-offsets.h> 26#include <asm/asm-offsets.h>
26 27
27#define KVMPPC_MSR_MASK (MSR_CE|MSR_EE|MSR_PR|MSR_DE|MSR_ME|MSR_IS|MSR_DS)
28
29#define VCPU_GPR(n) (VCPU_GPRS + (n * 4)) 28#define VCPU_GPR(n) (VCPU_GPRS + (n * 4))
30 29
31/* The host stack layout: */ 30/* The host stack layout: */
@@ -192,6 +191,12 @@ _GLOBAL(kvmppc_resume_host)
192 lwz r3, VCPU_HOST_PID(r4) 191 lwz r3, VCPU_HOST_PID(r4)
193 mtspr SPRN_PID, r3 192 mtspr SPRN_PID, r3
194 193
194#ifdef CONFIG_FSL_BOOKE
 195	/* We cheat and know that Linux doesn't use PID1, which is always 0 */
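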
196 lis r3, 0
197 mtspr SPRN_PID1, r3
198#endif
199
195 /* Restore host IVPR before re-enabling interrupts. We cheat and know 200 /* Restore host IVPR before re-enabling interrupts. We cheat and know
196 * that Linux IVPR is always 0xc0000000. */ 201 * that Linux IVPR is always 0xc0000000. */
197 lis r3, 0xc000 202 lis r3, 0xc000
@@ -241,6 +246,14 @@ _GLOBAL(kvmppc_resume_host)
241heavyweight_exit: 246heavyweight_exit:
242 /* Not returning to guest. */ 247 /* Not returning to guest. */
243 248
249#ifdef CONFIG_SPE
250 /* save guest SPEFSCR and load host SPEFSCR */
251 mfspr r9, SPRN_SPEFSCR
252 stw r9, VCPU_SPEFSCR(r4)
253 lwz r9, VCPU_HOST_SPEFSCR(r4)
254 mtspr SPRN_SPEFSCR, r9
255#endif
256
244 /* We already saved guest volatile register state; now save the 257 /* We already saved guest volatile register state; now save the
245 * non-volatiles. */ 258 * non-volatiles. */
246 stw r15, VCPU_GPR(r15)(r4) 259 stw r15, VCPU_GPR(r15)(r4)
@@ -342,6 +355,14 @@ _GLOBAL(__kvmppc_vcpu_run)
342 lwz r30, VCPU_GPR(r30)(r4) 355 lwz r30, VCPU_GPR(r30)(r4)
343 lwz r31, VCPU_GPR(r31)(r4) 356 lwz r31, VCPU_GPR(r31)(r4)
344 357
358#ifdef CONFIG_SPE
359 /* save host SPEFSCR and load guest SPEFSCR */
360 mfspr r3, SPRN_SPEFSCR
361 stw r3, VCPU_HOST_SPEFSCR(r4)
362 lwz r3, VCPU_SPEFSCR(r4)
363 mtspr SPRN_SPEFSCR, r3
364#endif
365
345lightweight_exit: 366lightweight_exit:
346 stw r2, HOST_R2(r1) 367 stw r2, HOST_R2(r1)
347 368
@@ -350,6 +371,11 @@ lightweight_exit:
350 lwz r3, VCPU_SHADOW_PID(r4) 371 lwz r3, VCPU_SHADOW_PID(r4)
351 mtspr SPRN_PID, r3 372 mtspr SPRN_PID, r3
352 373
374#ifdef CONFIG_FSL_BOOKE
375 lwz r3, VCPU_SHADOW_PID1(r4)
376 mtspr SPRN_PID1, r3
377#endif
378
353#ifdef CONFIG_44x 379#ifdef CONFIG_44x
354 iccci 0, 0 /* XXX hack */ 380 iccci 0, 0 /* XXX hack */
355#endif 381#endif
@@ -405,20 +431,17 @@ lightweight_exit:
405 431
406 /* Finish loading guest volatiles and jump to guest. */ 432 /* Finish loading guest volatiles and jump to guest. */
407 lwz r3, VCPU_CTR(r4) 433 lwz r3, VCPU_CTR(r4)
434 lwz r5, VCPU_CR(r4)
435 lwz r6, VCPU_PC(r4)
436 lwz r7, VCPU_SHADOW_MSR(r4)
408 mtctr r3 437 mtctr r3
409 lwz r3, VCPU_CR(r4) 438 mtcr r5
410 mtcr r3 439 mtsrr0 r6
440 mtsrr1 r7
411 lwz r5, VCPU_GPR(r5)(r4) 441 lwz r5, VCPU_GPR(r5)(r4)
412 lwz r6, VCPU_GPR(r6)(r4) 442 lwz r6, VCPU_GPR(r6)(r4)
413 lwz r7, VCPU_GPR(r7)(r4) 443 lwz r7, VCPU_GPR(r7)(r4)
414 lwz r8, VCPU_GPR(r8)(r4) 444 lwz r8, VCPU_GPR(r8)(r4)
415 lwz r3, VCPU_PC(r4)
416 mtsrr0 r3
417 lwz r3, VCPU_SHARED(r4)
418 lwz r3, (VCPU_SHARED_MSR + 4)(r3)
419 oris r3, r3, KVMPPC_MSR_MASK@h
420 ori r3, r3, KVMPPC_MSR_MASK@l
421 mtsrr1 r3
422 445
423 /* Clear any debug events which occurred since we disabled MSR[DE]. 446 /* Clear any debug events which occurred since we disabled MSR[DE].
424 * XXX This gives us a 3-instruction window in which a breakpoint 447 * XXX This gives us a 3-instruction window in which a breakpoint
@@ -430,3 +453,24 @@ lightweight_exit:
430 lwz r3, VCPU_GPR(r3)(r4) 453 lwz r3, VCPU_GPR(r3)(r4)
431 lwz r4, VCPU_GPR(r4)(r4) 454 lwz r4, VCPU_GPR(r4)(r4)
432 rfi 455 rfi
456
457#ifdef CONFIG_SPE
458_GLOBAL(kvmppc_save_guest_spe)
459 cmpi 0,r3,0
460 beqlr-
461 SAVE_32EVRS(0, r4, r3, VCPU_EVR)
462 evxor evr6, evr6, evr6
463 evmwumiaa evr6, evr6, evr6
464 li r4,VCPU_ACC
465 evstddx evr6, r4, r3 /* save acc */
466 blr
467
468_GLOBAL(kvmppc_load_guest_spe)
469 cmpi 0,r3,0
470 beqlr-
471 li r4,VCPU_ACC
472 evlddx evr6,r4,r3
473 evmra evr6,evr6 /* load acc */
474 REST_32EVRS(0, r4, r3, VCPU_EVR)
475 blr
476#endif
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 318dbc61ba44..797a7447c268 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. 2 * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
3 * 3 *
4 * Author: Yu Liu, <yu.liu@freescale.com> 4 * Author: Yu Liu, <yu.liu@freescale.com>
5 * 5 *
@@ -41,6 +41,11 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
41void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) 41void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
42{ 42{
43 kvmppc_e500_tlb_put(vcpu); 43 kvmppc_e500_tlb_put(vcpu);
44
45#ifdef CONFIG_SPE
46 if (vcpu->arch.shadow_msr & MSR_SPE)
47 kvmppc_vcpu_disable_spe(vcpu);
48#endif
44} 49}
45 50
46int kvmppc_core_check_processor_compat(void) 51int kvmppc_core_check_processor_compat(void)
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 69cd665a0caf..d48ae396f41e 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -81,8 +81,12 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
81 kvmppc_set_pid(vcpu, spr_val); 81 kvmppc_set_pid(vcpu, spr_val);
82 break; 82 break;
83 case SPRN_PID1: 83 case SPRN_PID1:
84 if (spr_val != 0)
85 return EMULATE_FAIL;
84 vcpu_e500->pid[1] = spr_val; break; 86 vcpu_e500->pid[1] = spr_val; break;
85 case SPRN_PID2: 87 case SPRN_PID2:
88 if (spr_val != 0)
89 return EMULATE_FAIL;
86 vcpu_e500->pid[2] = spr_val; break; 90 vcpu_e500->pid[2] = spr_val; break;
87 case SPRN_MAS0: 91 case SPRN_MAS0:
88 vcpu_e500->mas0 = spr_val; break; 92 vcpu_e500->mas0 = spr_val; break;
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index b18fe353397d..13c432ea2fa8 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -28,8 +28,196 @@
28 28
29#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) 29#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1)
30 30
31struct id {
32 unsigned long val;
33 struct id **pentry;
34};
35
36#define NUM_TIDS 256
37
38/*
 39 * This table provides mappings from:
40 * (guestAS,guestTID,guestPR) --> ID of physical cpu
41 * guestAS [0..1]
42 * guestTID [0..255]
43 * guestPR [0..1]
44 * ID [1..255]
45 * Each vcpu keeps one vcpu_id_table.
46 */
47struct vcpu_id_table {
48 struct id id[2][NUM_TIDS][2];
49};
50
51/*
 52 * This table provides the reverse mapping of vcpu_id_table:
53 * ID --> address of vcpu_id_table item.
54 * Each physical core has one pcpu_id_table.
55 */
56struct pcpu_id_table {
57 struct id *entry[NUM_TIDS];
58};
59
60static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids);
61
 62/* This variable keeps the last used shadow ID on the local core.
 63 * The valid range of shadow IDs is [1..255]. */
64static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid);
65
31static unsigned int tlb1_entry_num; 66static unsigned int tlb1_entry_num;
32 67
68/*
 69 * Allocate a free shadow ID and set up a valid sid mapping in the given entry.
 70 * A mapping is valid only when the vcpu_id_table and pcpu_id_table entries match.
71 *
72 * The caller must have preemption disabled, and keep it that way until
73 * it has finished with the returned shadow id (either written into the
74 * TLB or arch.shadow_pid, or discarded).
75 */
76static inline int local_sid_setup_one(struct id *entry)
77{
78 unsigned long sid;
79 int ret = -1;
80
81 sid = ++(__get_cpu_var(pcpu_last_used_sid));
82 if (sid < NUM_TIDS) {
83 __get_cpu_var(pcpu_sids).entry[sid] = entry;
84 entry->val = sid;
85 entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid];
86 ret = sid;
87 }
88
89 /*
90 * If sid == NUM_TIDS, we've run out of sids. We return -1, and
91 * the caller will invalidate everything and start over.
92 *
 93 * sid > NUM_TIDS indicates a race, which disabling preemption
 94 * prevents.
95 */
96 WARN_ON(sid > NUM_TIDS);
97
98 return ret;
99}
100
101/*
 102 * Check whether the given entry contains a valid shadow ID mapping.
 103 * An ID mapping is considered valid only if
 104 * both the vcpu and pcpu tables know this mapping.
105 *
106 * The caller must have preemption disabled, and keep it that way until
107 * it has finished with the returned shadow id (either written into the
108 * TLB or arch.shadow_pid, or discarded).
109 */
110static inline int local_sid_lookup(struct id *entry)
111{
112 if (entry && entry->val != 0 &&
113 __get_cpu_var(pcpu_sids).entry[entry->val] == entry &&
114 entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val])
115 return entry->val;
116 return -1;
117}
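The pairing of vcpu_id_table (forward map) and pcpu_sids (reverse map) can be exercised outside the kernel. The sketch below (single-CPU user-space toy; names are hypothetical, and the flush-on-exhaustion is folded into the allocator for brevity, whereas above it is the caller's job) shows why a mapping counts as valid only while both tables still point at each other:

#include <stdio.h>
#include <string.h>

#define NUM_TIDS 256

struct id { unsigned long val; struct id **pentry; };

static struct id *reverse[NUM_TIDS];	/* plays the role of pcpu_sids */
static unsigned long last_used_sid;	/* plays the role of pcpu_last_used_sid */

static int sid_setup_one(struct id *entry)
{
	unsigned long sid = ++last_used_sid;

	if (sid >= NUM_TIDS) {			/* out of sids: flush and start over */
		memset(reverse, 0, sizeof(reverse));
		last_used_sid = 0;
		return -1;
	}
	reverse[sid] = entry;
	entry->val = sid;
	entry->pentry = &reverse[sid];
	return (int)sid;
}

static int sid_lookup(struct id *entry)
{
	if (entry->val && reverse[entry->val] == entry &&
	    entry->pentry == &reverse[entry->val])
		return (int)entry->val;
	return -1;				/* stale: the reverse table moved on */
}

int main(void)
{
	struct id guest = { 0, NULL };

	printf("%d\n", sid_lookup(&guest));	/* -1: no mapping yet */
	sid_setup_one(&guest);
	printf("%d\n", sid_lookup(&guest));	/*  1: both tables agree */
	memset(reverse, 0, sizeof(reverse));	/* "invalidate everything" */
	printf("%d\n", sid_lookup(&guest));	/* -1: lazily invalidated */
	return 0;
}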
118
119/* Invalidate all id mappings on local core */
120static inline void local_sid_destroy_all(void)
121{
122 preempt_disable();
123 __get_cpu_var(pcpu_last_used_sid) = 0;
124 memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids)));
125 preempt_enable();
126}
127
128static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500)
129{
130 vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL);
131 return vcpu_e500->idt;
132}
133
134static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500)
135{
136 kfree(vcpu_e500->idt);
137}
138
139/* Invalidate all mappings on vcpu */
140static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500)
141{
142 memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table));
143
144 /* Update shadow pid when mappings are changed */
145 kvmppc_e500_recalc_shadow_pid(vcpu_e500);
146}
147
148/* Invalidate one ID mapping on vcpu */
149static inline void kvmppc_e500_id_table_reset_one(
150 struct kvmppc_vcpu_e500 *vcpu_e500,
151 int as, int pid, int pr)
152{
153 struct vcpu_id_table *idt = vcpu_e500->idt;
154
155 BUG_ON(as >= 2);
156 BUG_ON(pid >= NUM_TIDS);
157 BUG_ON(pr >= 2);
158
159 idt->id[as][pid][pr].val = 0;
160 idt->id[as][pid][pr].pentry = NULL;
161
162 /* Update shadow pid when mappings are changed */
163 kvmppc_e500_recalc_shadow_pid(vcpu_e500);
164}
165
166/*
167 * Map guest (vcpu,AS,ID,PR) to physical core shadow id.
 168 * This function first looks up whether a valid mapping exists;
 169 * if not, it creates a new one.
170 *
171 * The caller must have preemption disabled, and keep it that way until
172 * it has finished with the returned shadow id (either written into the
173 * TLB or arch.shadow_pid, or discarded).
174 */
175static unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
176 unsigned int as, unsigned int gid,
177 unsigned int pr, int avoid_recursion)
178{
179 struct vcpu_id_table *idt = vcpu_e500->idt;
180 int sid;
181
182 BUG_ON(as >= 2);
183 BUG_ON(gid >= NUM_TIDS);
184 BUG_ON(pr >= 2);
185
186 sid = local_sid_lookup(&idt->id[as][gid][pr]);
187
188 while (sid <= 0) {
189 /* No mapping yet */
190 sid = local_sid_setup_one(&idt->id[as][gid][pr]);
191 if (sid <= 0) {
192 _tlbil_all();
193 local_sid_destroy_all();
194 }
195
196 /* Update shadow pid when mappings are changed */
197 if (!avoid_recursion)
198 kvmppc_e500_recalc_shadow_pid(vcpu_e500);
199 }
200
201 return sid;
202}
203
 204/* Map the guest PID to shadow IDs.
 205 * We use PID to hold the shadow of the current non-zero guest PID,
 206 * and PID1 to hold the shadow of guest PID 0,
 207 * so that guest TLB entries with TID=0 can be accessed at any time. */
208void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500)
209{
210 preempt_disable();
211 vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500,
212 get_cur_as(&vcpu_e500->vcpu),
213 get_cur_pid(&vcpu_e500->vcpu),
214 get_cur_pr(&vcpu_e500->vcpu), 1);
215 vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500,
216 get_cur_as(&vcpu_e500->vcpu), 0,
217 get_cur_pr(&vcpu_e500->vcpu), 1);
218 preempt_enable();
219}
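The reason two shadow values are maintained is the Book E TID match rule: a guest TLB entry with TID == 0 is global and must hit for every guest PID, so the current guest PID is shadowed in PID and guest PID 0 is shadowed in PID1. A one-function sketch of the match rule (illustrative, not kernel code):

/* An entry participates in translation when its TID is 0 (global) or
 * equals the currently running PID. */
static int gtlb_tid_matches(unsigned int tid, unsigned int cur_pid)
{
	return tid == 0 || tid == cur_pid;
}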
220
33void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) 221void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
34{ 222{
35 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 223 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
@@ -41,25 +229,14 @@ void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
41 229
42 for (tlbsel = 0; tlbsel < 2; tlbsel++) { 230 for (tlbsel = 0; tlbsel < 2; tlbsel++) {
43 printk("Guest TLB%d:\n", tlbsel); 231 printk("Guest TLB%d:\n", tlbsel);
44 for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) { 232 for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) {
45 tlbe = &vcpu_e500->guest_tlb[tlbsel][i]; 233 tlbe = &vcpu_e500->gtlb_arch[tlbsel][i];
46 if (tlbe->mas1 & MAS1_VALID) 234 if (tlbe->mas1 & MAS1_VALID)
47 printk(" G[%d][%3d] | %08X | %08X | %08X | %08X |\n", 235 printk(" G[%d][%3d] | %08X | %08X | %08X | %08X |\n",
48 tlbsel, i, tlbe->mas1, tlbe->mas2, 236 tlbsel, i, tlbe->mas1, tlbe->mas2,
49 tlbe->mas3, tlbe->mas7); 237 tlbe->mas3, tlbe->mas7);
50 } 238 }
51 } 239 }
52
53 for (tlbsel = 0; tlbsel < 2; tlbsel++) {
54 printk("Shadow TLB%d:\n", tlbsel);
55 for (i = 0; i < vcpu_e500->shadow_tlb_size[tlbsel]; i++) {
56 tlbe = &vcpu_e500->shadow_tlb[tlbsel][i];
57 if (tlbe->mas1 & MAS1_VALID)
58 printk(" S[%d][%3d] | %08X | %08X | %08X | %08X |\n",
59 tlbsel, i, tlbe->mas1, tlbe->mas2,
60 tlbe->mas3, tlbe->mas7);
61 }
62 }
63} 240}
64 241
65static inline unsigned int tlb0_get_next_victim( 242static inline unsigned int tlb0_get_next_victim(
@@ -67,16 +244,17 @@ static inline unsigned int tlb0_get_next_victim(
67{ 244{
68 unsigned int victim; 245 unsigned int victim;
69 246
70 victim = vcpu_e500->guest_tlb_nv[0]++; 247 victim = vcpu_e500->gtlb_nv[0]++;
71 if (unlikely(vcpu_e500->guest_tlb_nv[0] >= KVM_E500_TLB0_WAY_NUM)) 248 if (unlikely(vcpu_e500->gtlb_nv[0] >= KVM_E500_TLB0_WAY_NUM))
72 vcpu_e500->guest_tlb_nv[0] = 0; 249 vcpu_e500->gtlb_nv[0] = 0;
73 250
74 return victim; 251 return victim;
75} 252}
76 253
77static inline unsigned int tlb1_max_shadow_size(void) 254static inline unsigned int tlb1_max_shadow_size(void)
78{ 255{
79 return tlb1_entry_num - tlbcam_index; 256 /* reserve one entry for magic page */
257 return tlb1_entry_num - tlbcam_index - 1;
80} 258}
81 259
82static inline int tlbe_is_writable(struct tlbe *tlbe) 260static inline int tlbe_is_writable(struct tlbe *tlbe)
@@ -112,72 +290,149 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
112/* 290/*
113 * writing shadow tlb entry to host TLB 291 * writing shadow tlb entry to host TLB
114 */ 292 */
115static inline void __write_host_tlbe(struct tlbe *stlbe) 293static inline void __write_host_tlbe(struct tlbe *stlbe, uint32_t mas0)
116{ 294{
295 unsigned long flags;
296
297 local_irq_save(flags);
298 mtspr(SPRN_MAS0, mas0);
117 mtspr(SPRN_MAS1, stlbe->mas1); 299 mtspr(SPRN_MAS1, stlbe->mas1);
118 mtspr(SPRN_MAS2, stlbe->mas2); 300 mtspr(SPRN_MAS2, stlbe->mas2);
119 mtspr(SPRN_MAS3, stlbe->mas3); 301 mtspr(SPRN_MAS3, stlbe->mas3);
120 mtspr(SPRN_MAS7, stlbe->mas7); 302 mtspr(SPRN_MAS7, stlbe->mas7);
121 __asm__ __volatile__ ("tlbwe\n" : : ); 303 asm volatile("isync; tlbwe" : : : "memory");
304 local_irq_restore(flags);
122} 305}
123 306
124static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, 307static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
125 int tlbsel, int esel) 308 int tlbsel, int esel, struct tlbe *stlbe)
126{ 309{
127 struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
128
129 local_irq_disable();
130 if (tlbsel == 0) { 310 if (tlbsel == 0) {
131 __write_host_tlbe(stlbe); 311 __write_host_tlbe(stlbe,
312 MAS0_TLBSEL(0) |
313 MAS0_ESEL(esel & (KVM_E500_TLB0_WAY_NUM - 1)));
132 } else { 314 } else {
133 unsigned register mas0; 315 __write_host_tlbe(stlbe,
134 316 MAS0_TLBSEL(1) |
135 mas0 = mfspr(SPRN_MAS0); 317 MAS0_ESEL(to_htlb1_esel(esel)));
136
137 mtspr(SPRN_MAS0, MAS0_TLBSEL(1) | MAS0_ESEL(to_htlb1_esel(esel)));
138 __write_host_tlbe(stlbe);
139
140 mtspr(SPRN_MAS0, mas0);
141 } 318 }
142 local_irq_enable(); 319 trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
320 stlbe->mas3, stlbe->mas7);
321}
322
323void kvmppc_map_magic(struct kvm_vcpu *vcpu)
324{
325 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
326 struct tlbe magic;
327 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
328 unsigned int stid;
329 pfn_t pfn;
330
331 pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
332 get_page(pfn_to_page(pfn));
333
334 preempt_disable();
335 stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0);
336
337 magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
338 MAS1_TSIZE(BOOK3E_PAGESZ_4K);
339 magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
340 magic.mas3 = (pfn << PAGE_SHIFT) |
341 MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
342 magic.mas7 = pfn >> (32 - PAGE_SHIFT);
343
344 __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
345 preempt_enable();
143} 346}
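kvmppc_map_magic() above splits the shared page's real address across MAS3 (low 32 bits of the RPN, plus permission bits) and MAS7 (address bits above 32). A stand-alone sketch of just the address split, assuming PAGE_SHIFT == 12 and using a made-up frame number (permission bits omitted):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint64_t pfn  = 0x123456ULL;		/* example physical frame number */
	uint64_t addr = pfn << PAGE_SHIFT;	/* 36-bit real address 0x123456000 */

	uint32_t mas3_rpn = (uint32_t)addr & 0xfffff000u;	   /* RA[31:12] */
	uint32_t mas7     = (uint32_t)(pfn >> (32 - PAGE_SHIFT)); /* RA above bit 31 */

	printf("mas3 rpn %08x  mas7 %08x\n", mas3_rpn, mas7);	   /* 23456000  00000001 */
	return 0;
}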
144 347
145void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu) 348void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu)
146{ 349{
147 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 350 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
148 int i; 351
149 unsigned register mas0; 352 /* Shadow PID may be expired on local core */
150 353 kvmppc_e500_recalc_shadow_pid(vcpu_e500);
151 /* Load all valid TLB1 entries to reduce guest tlb miss fault */
152 local_irq_disable();
153 mas0 = mfspr(SPRN_MAS0);
154 for (i = 0; i < tlb1_max_shadow_size(); i++) {
155 struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i];
156
157 if (get_tlb_v(stlbe)) {
158 mtspr(SPRN_MAS0, MAS0_TLBSEL(1)
159 | MAS0_ESEL(to_htlb1_esel(i)));
160 __write_host_tlbe(stlbe);
161 }
162 }
163 mtspr(SPRN_MAS0, mas0);
164 local_irq_enable();
165} 354}
166 355
167void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu) 356void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu)
168{ 357{
169 _tlbil_all(); 358}
359
360static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
361 int tlbsel, int esel)
362{
363 struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
364 struct vcpu_id_table *idt = vcpu_e500->idt;
365 unsigned int pr, tid, ts, pid;
366 u32 val, eaddr;
367 unsigned long flags;
368
369 ts = get_tlb_ts(gtlbe);
370 tid = get_tlb_tid(gtlbe);
371
372 preempt_disable();
373
374 /* One guest ID may be mapped to two shadow IDs */
375 for (pr = 0; pr < 2; pr++) {
376 /*
377 * The shadow PID can have a valid mapping on at most one
378 * host CPU. In the common case, it will be valid on this
379 * CPU, in which case (for TLB0) we do a local invalidation
380 * of the specific address.
381 *
382 * If the shadow PID is not valid on the current host CPU, or
383 * if we're invalidating a TLB1 entry, we invalidate the
384 * entire shadow PID.
385 */
386 if (tlbsel == 1 ||
387 (pid = local_sid_lookup(&idt->id[ts][tid][pr])) <= 0) {
388 kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr);
389 continue;
390 }
391
392 /*
393 * The guest is invalidating a TLB0 entry which is in a PID
394 * that has a valid shadow mapping on this host CPU. We
 395 * search host TLB0 to invalidate its shadow TLB entry,
396 * similar to __tlbil_va except that we need to look in AS1.
397 */
398 val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS;
399 eaddr = get_tlb_eaddr(gtlbe);
400
401 local_irq_save(flags);
402
403 mtspr(SPRN_MAS6, val);
404 asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr));
405 val = mfspr(SPRN_MAS1);
406 if (val & MAS1_VALID) {
407 mtspr(SPRN_MAS1, val & ~MAS1_VALID);
408 asm volatile("tlbwe");
409 }
410
411 local_irq_restore(flags);
412 }
413
414 preempt_enable();
170} 415}
171 416
172/* Search the guest TLB for a matching entry. */ 417/* Search the guest TLB for a matching entry. */
173static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, 418static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
174 gva_t eaddr, int tlbsel, unsigned int pid, int as) 419 gva_t eaddr, int tlbsel, unsigned int pid, int as)
175{ 420{
421 int size = vcpu_e500->gtlb_size[tlbsel];
422 int set_base;
176 int i; 423 int i;
177 424
178 /* XXX Replace loop with fancy data structures. */ 425 if (tlbsel == 0) {
179 for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) { 426 int mask = size / KVM_E500_TLB0_WAY_NUM - 1;
180 struct tlbe *tlbe = &vcpu_e500->guest_tlb[tlbsel][i]; 427 set_base = (eaddr >> PAGE_SHIFT) & mask;
428 set_base *= KVM_E500_TLB0_WAY_NUM;
429 size = KVM_E500_TLB0_WAY_NUM;
430 } else {
431 set_base = 0;
432 }
433
434 for (i = 0; i < size; i++) {
435 struct tlbe *tlbe = &vcpu_e500->gtlb_arch[tlbsel][set_base + i];
181 unsigned int tid; 436 unsigned int tid;
182 437
183 if (eaddr < get_tlb_eaddr(tlbe)) 438 if (eaddr < get_tlb_eaddr(tlbe))
@@ -196,66 +451,32 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
196 if (get_tlb_ts(tlbe) != as && as != -1) 451 if (get_tlb_ts(tlbe) != as && as != -1)
197 continue; 452 continue;
198 453
199 return i; 454 return set_base + i;
200 } 455 }
201 456
202 return -1; 457 return -1;
203} 458}
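The TLB0 branch above turns the guest-TLB0 search set-associative: low bits of the faulting page number select a set and only that set's ways are scanned. A sketch with an assumed geometry of 256 entries in 2 ways (the real values come from KVM_E500_TLB0_SIZE and KVM_E500_TLB0_WAY_NUM, so treat these constants as placeholders):

#include <stdio.h>

#define GTLB0_SIZE  256		/* assumed */
#define GTLB0_WAYS  2		/* assumed */
#define PAGE_SHIFT  12

static int set_base_of(unsigned long eaddr)
{
	int mask = GTLB0_SIZE / GTLB0_WAYS - 1;	/* 127: selects one of 128 sets */
	int set  = (eaddr >> PAGE_SHIFT) & mask;

	return set * GTLB0_WAYS;		/* first entry of that set */
}

int main(void)
{
	/* 0x10002000 -> page 0x10002 -> set 2, so entries [4..5] are searched */
	printf("%d\n", set_base_of(0x10002000UL));
	return 0;
}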
204 459
205static void kvmppc_e500_shadow_release(struct kvmppc_vcpu_e500 *vcpu_e500, 460static inline void kvmppc_e500_priv_setup(struct tlbe_priv *priv,
206 int tlbsel, int esel) 461 struct tlbe *gtlbe,
207{ 462 pfn_t pfn)
208 struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
209 struct page *page = vcpu_e500->shadow_pages[tlbsel][esel];
210
211 if (page) {
212 vcpu_e500->shadow_pages[tlbsel][esel] = NULL;
213
214 if (get_tlb_v(stlbe)) {
215 if (tlbe_is_writable(stlbe))
216 kvm_release_page_dirty(page);
217 else
218 kvm_release_page_clean(page);
219 }
220 }
221}
222
223static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
224 int tlbsel, int esel)
225{ 463{
226 struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; 464 priv->pfn = pfn;
465 priv->flags = E500_TLB_VALID;
227 466
228 kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); 467 if (tlbe_is_writable(gtlbe))
229 stlbe->mas1 = 0; 468 priv->flags |= E500_TLB_DIRTY;
230 trace_kvm_stlb_inval(index_of(tlbsel, esel));
231} 469}
232 470
233static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, 471static inline void kvmppc_e500_priv_release(struct tlbe_priv *priv)
234 gva_t eaddr, gva_t eend, u32 tid)
235{ 472{
236 unsigned int pid = tid & 0xff; 473 if (priv->flags & E500_TLB_VALID) {
237 unsigned int i; 474 if (priv->flags & E500_TLB_DIRTY)
238 475 kvm_release_pfn_dirty(priv->pfn);
239 /* XXX Replace loop with fancy data structures. */ 476 else
240 for (i = 0; i < vcpu_e500->guest_tlb_size[1]; i++) { 477 kvm_release_pfn_clean(priv->pfn);
241 struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i];
242 unsigned int tid;
243
244 if (!get_tlb_v(stlbe))
245 continue;
246
247 if (eend < get_tlb_eaddr(stlbe))
248 continue;
249 478
250 if (eaddr > get_tlb_end(stlbe)) 479 priv->flags = 0;
251 continue;
252
253 tid = get_tlb_tid(stlbe);
254 if (tid && (tid != pid))
255 continue;
256
257 kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i);
258 write_host_tlbe(vcpu_e500, 1, i);
259 } 480 }
260} 481}
261 482
@@ -273,7 +494,7 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
273 tsized = (vcpu_e500->mas4 >> 7) & 0x1f; 494 tsized = (vcpu_e500->mas4 >> 7) & 0x1f;
274 495
275 vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) 496 vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
276 | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); 497 | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
277 vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) 498 vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0)
278 | MAS1_TID(vcpu_e500->pid[pidsel]) 499 | MAS1_TID(vcpu_e500->pid[pidsel])
279 | MAS1_TSIZE(tsized); 500 | MAS1_TSIZE(tsized);
@@ -286,56 +507,154 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
286 vcpu_e500->mas7 = 0; 507 vcpu_e500->mas7 = 0;
287} 508}
288 509
289static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, 510static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
290 u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel) 511 struct tlbe *gtlbe, int tsize,
512 struct tlbe_priv *priv,
513 u64 gvaddr, struct tlbe *stlbe)
291{ 514{
292 struct page *new_page; 515 pfn_t pfn = priv->pfn;
293 struct tlbe *stlbe; 516 unsigned int stid;
294 hpa_t hpaddr;
295
296 stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
297
298 /* Get reference to new page. */
299 new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn);
300 if (is_error_page(new_page)) {
301 printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n",
302 (long)gfn);
303 kvm_release_page_clean(new_page);
304 return;
305 }
306 hpaddr = page_to_phys(new_page);
307
308 /* Drop reference to old page. */
309 kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
310 517
311 vcpu_e500->shadow_pages[tlbsel][esel] = new_page; 518 stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe),
519 get_tlb_tid(gtlbe),
520 get_cur_pr(&vcpu_e500->vcpu), 0);
312 521
313 /* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */ 522 /* Force TS=1 IPROT=0 for all guest mappings. */
314 stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K) 523 stlbe->mas1 = MAS1_TSIZE(tsize)
315 | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID; 524 | MAS1_TID(stid) | MAS1_TS | MAS1_VALID;
316 stlbe->mas2 = (gvaddr & MAS2_EPN) 525 stlbe->mas2 = (gvaddr & MAS2_EPN)
317 | e500_shadow_mas2_attrib(gtlbe->mas2, 526 | e500_shadow_mas2_attrib(gtlbe->mas2,
318 vcpu_e500->vcpu.arch.shared->msr & MSR_PR); 527 vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
319 stlbe->mas3 = (hpaddr & MAS3_RPN) 528 stlbe->mas3 = ((pfn << PAGE_SHIFT) & MAS3_RPN)
320 | e500_shadow_mas3_attrib(gtlbe->mas3, 529 | e500_shadow_mas3_attrib(gtlbe->mas3,
321 vcpu_e500->vcpu.arch.shared->msr & MSR_PR); 530 vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
322 stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN; 531 stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN;
532}
323 533
324 trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, 534
325 stlbe->mas3, stlbe->mas7); 535static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
536 u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel,
537 struct tlbe *stlbe)
538{
539 struct kvm_memory_slot *slot;
540 unsigned long pfn, hva;
541 int pfnmap = 0;
542 int tsize = BOOK3E_PAGESZ_4K;
543 struct tlbe_priv *priv;
544
545 /*
546 * Translate guest physical to true physical, acquiring
547 * a page reference if it is normal, non-reserved memory.
548 *
549 * gfn_to_memslot() must succeed because otherwise we wouldn't
550 * have gotten this far. Eventually we should just pass the slot
551 * pointer through from the first lookup.
552 */
553 slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
554 hva = gfn_to_hva_memslot(slot, gfn);
555
556 if (tlbsel == 1) {
557 struct vm_area_struct *vma;
558 down_read(&current->mm->mmap_sem);
559
560 vma = find_vma(current->mm, hva);
561 if (vma && hva >= vma->vm_start &&
562 (vma->vm_flags & VM_PFNMAP)) {
563 /*
564 * This VMA is a physically contiguous region (e.g.
565 * /dev/mem) that bypasses normal Linux page
566 * management. Find the overlap between the
567 * vma and the memslot.
568 */
569
570 unsigned long start, end;
571 unsigned long slot_start, slot_end;
572
573 pfnmap = 1;
574
575 start = vma->vm_pgoff;
576 end = start +
577 ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
578
579 pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
580
581 slot_start = pfn - (gfn - slot->base_gfn);
582 slot_end = slot_start + slot->npages;
583
584 if (start < slot_start)
585 start = slot_start;
586 if (end > slot_end)
587 end = slot_end;
588
589 tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
590 MAS1_TSIZE_SHIFT;
591
592 /*
593 * e500 doesn't implement the lowest tsize bit,
594 * or 1K pages.
595 */
596 tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
597
598 /*
599 * Now find the largest tsize (up to what the guest
600 * requested) that will cover gfn, stay within the
601 * range, and for which gfn and pfn are mutually
602 * aligned.
603 */
604
605 for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
606 unsigned long gfn_start, gfn_end, tsize_pages;
607 tsize_pages = 1 << (tsize - 2);
608
609 gfn_start = gfn & ~(tsize_pages - 1);
610 gfn_end = gfn_start + tsize_pages;
611
612 if (gfn_start + pfn - gfn < start)
613 continue;
614 if (gfn_end + pfn - gfn > end)
615 continue;
616 if ((gfn & (tsize_pages - 1)) !=
617 (pfn & (tsize_pages - 1)))
618 continue;
619
620 gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
621 pfn &= ~(tsize_pages - 1);
622 break;
623 }
624 }
625
626 up_read(&current->mm->mmap_sem);
627 }
628
629 if (likely(!pfnmap)) {
630 pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn);
631 if (is_error_pfn(pfn)) {
632 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
633 (long)gfn);
634 kvm_release_pfn_clean(pfn);
635 return;
636 }
637 }
638
639 /* Drop old priv and setup new one. */
640 priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
641 kvmppc_e500_priv_release(priv);
642 kvmppc_e500_priv_setup(priv, gtlbe, pfn);
643
644 kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, priv, gvaddr, stlbe);
326} 645}
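In the pfnmap path above, a candidate mapping of 1 << (tsize - 2) 4 KiB pages (the 1 KiB << tsize encoding, under which BOOK3E_PAGESZ_4K is 2) is usable only if gfn and pfn agree in the bits below the mapping size. A stand-alone sketch of that mutual-alignment test with made-up frame numbers:

#include <stdio.h>

/* Mirrors the tsize_pages computation above: pages covered by a
 * mapping of 1 KiB << tsize, expressed in 4 KiB units. */
static int mutually_aligned(unsigned long gfn, unsigned long pfn, int tsize)
{
	unsigned long tsize_pages = 1UL << (tsize - 2);

	return (gfn & (tsize_pages - 1)) == (pfn & (tsize_pages - 1));
}

int main(void)
{
	/* gfn 0x100 and pfn 0x2300 line up for a 1 MiB mapping (tsize 10,
	 * 256 pages) but not for a 4 MiB one (tsize 12, 1024 pages). */
	printf("%d %d\n", mutually_aligned(0x100, 0x2300, 10),
			  mutually_aligned(0x100, 0x2300, 12));	/* 1 0 */
	return 0;
}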
327 646
328/* XXX only map the one-one case, for now use TLB0 */ 647/* XXX only map the one-one case, for now use TLB0 */
329static int kvmppc_e500_stlbe_map(struct kvmppc_vcpu_e500 *vcpu_e500, 648static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500,
330 int tlbsel, int esel) 649 int esel, struct tlbe *stlbe)
331{ 650{
332 struct tlbe *gtlbe; 651 struct tlbe *gtlbe;
333 652
334 gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; 653 gtlbe = &vcpu_e500->gtlb_arch[0][esel];
335 654
336 kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), 655 kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
337 get_tlb_raddr(gtlbe) >> PAGE_SHIFT, 656 get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
338 gtlbe, tlbsel, esel); 657 gtlbe, 0, esel, stlbe);
339 658
340 return esel; 659 return esel;
341} 660}
@@ -344,53 +663,37 @@ static int kvmppc_e500_stlbe_map(struct kvmppc_vcpu_e500 *vcpu_e500,
344 * the shadow TLB. */ 663 * the shadow TLB. */
345/* XXX for both one-one and one-to-many, for now use TLB1 */ 664/* XXX for both one-one and one-to-many, for now use TLB1 */
346static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, 665static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
347 u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe) 666 u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, struct tlbe *stlbe)
348{ 667{
349 unsigned int victim; 668 unsigned int victim;
350 669
351 victim = vcpu_e500->guest_tlb_nv[1]++; 670 victim = vcpu_e500->gtlb_nv[1]++;
352 671
353 if (unlikely(vcpu_e500->guest_tlb_nv[1] >= tlb1_max_shadow_size())) 672 if (unlikely(vcpu_e500->gtlb_nv[1] >= tlb1_max_shadow_size()))
354 vcpu_e500->guest_tlb_nv[1] = 0; 673 vcpu_e500->gtlb_nv[1] = 0;
355 674
356 kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim); 675 kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim, stlbe);
357 676
358 return victim; 677 return victim;
359} 678}
360 679
361/* Invalidate all guest kernel mappings when enter usermode, 680void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
362 * so that when they fault back in they will get the
363 * proper permission bits. */
364void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
365{ 681{
366 if (usermode) { 682 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
367 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
368 int i;
369
370 /* XXX Replace loop with fancy data structures. */
371 for (i = 0; i < tlb1_max_shadow_size(); i++)
372 kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i);
373 683
374 _tlbil_all(); 684 /* Recalc shadow pid since MSR changes */
375 } 685 kvmppc_e500_recalc_shadow_pid(vcpu_e500);
376} 686}
377 687
378static int kvmppc_e500_gtlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, 688static inline int kvmppc_e500_gtlbe_invalidate(
379 int tlbsel, int esel) 689 struct kvmppc_vcpu_e500 *vcpu_e500,
690 int tlbsel, int esel)
380{ 691{
381 struct tlbe *gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; 692 struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
382 693
383 if (unlikely(get_tlb_iprot(gtlbe))) 694 if (unlikely(get_tlb_iprot(gtlbe)))
384 return -1; 695 return -1;
385 696
386 if (tlbsel == 1) {
387 kvmppc_e500_tlb1_invalidate(vcpu_e500, get_tlb_eaddr(gtlbe),
388 get_tlb_end(gtlbe),
389 get_tlb_tid(gtlbe));
390 } else {
391 kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel);
392 }
393
394 gtlbe->mas1 = 0; 697 gtlbe->mas1 = 0;
395 698
396 return 0; 699 return 0;
@@ -401,13 +704,14 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
401 int esel; 704 int esel;
402 705
403 if (value & MMUCSR0_TLB0FI) 706 if (value & MMUCSR0_TLB0FI)
404 for (esel = 0; esel < vcpu_e500->guest_tlb_size[0]; esel++) 707 for (esel = 0; esel < vcpu_e500->gtlb_size[0]; esel++)
405 kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); 708 kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel);
406 if (value & MMUCSR0_TLB1FI) 709 if (value & MMUCSR0_TLB1FI)
407 for (esel = 0; esel < vcpu_e500->guest_tlb_size[1]; esel++) 710 for (esel = 0; esel < vcpu_e500->gtlb_size[1]; esel++)
408 kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); 711 kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
409 712
410 _tlbil_all(); 713 /* Invalidate all vcpu id mappings */
714 kvmppc_e500_id_table_reset_all(vcpu_e500);
411 715
412 return EMULATE_DONE; 716 return EMULATE_DONE;
413} 717}
@@ -428,7 +732,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
428 732
429 if (ia) { 733 if (ia) {
430 /* invalidate all entries */ 734 /* invalidate all entries */
431 for (esel = 0; esel < vcpu_e500->guest_tlb_size[tlbsel]; esel++) 735 for (esel = 0; esel < vcpu_e500->gtlb_size[tlbsel]; esel++)
432 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); 736 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
433 } else { 737 } else {
434 ea &= 0xfffff000; 738 ea &= 0xfffff000;
@@ -438,7 +742,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
438 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); 742 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
439 } 743 }
440 744
441 _tlbil_all(); 745 /* Invalidate all vcpu id mappings */
746 kvmppc_e500_id_table_reset_all(vcpu_e500);
442 747
443 return EMULATE_DONE; 748 return EMULATE_DONE;
444} 749}
@@ -452,9 +757,9 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
452 tlbsel = get_tlb_tlbsel(vcpu_e500); 757 tlbsel = get_tlb_tlbsel(vcpu_e500);
453 esel = get_tlb_esel(vcpu_e500, tlbsel); 758 esel = get_tlb_esel(vcpu_e500, tlbsel);
454 759
455 gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; 760 gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
456 vcpu_e500->mas0 &= ~MAS0_NV(~0); 761 vcpu_e500->mas0 &= ~MAS0_NV(~0);
457 vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); 762 vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
458 vcpu_e500->mas1 = gtlbe->mas1; 763 vcpu_e500->mas1 = gtlbe->mas1;
459 vcpu_e500->mas2 = gtlbe->mas2; 764 vcpu_e500->mas2 = gtlbe->mas2;
460 vcpu_e500->mas3 = gtlbe->mas3; 765 vcpu_e500->mas3 = gtlbe->mas3;
@@ -477,14 +782,14 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
477 for (tlbsel = 0; tlbsel < 2; tlbsel++) { 782 for (tlbsel = 0; tlbsel < 2; tlbsel++) {
478 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); 783 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
479 if (esel >= 0) { 784 if (esel >= 0) {
480 gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; 785 gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
481 break; 786 break;
482 } 787 }
483 } 788 }
484 789
485 if (gtlbe) { 790 if (gtlbe) {
486 vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) 791 vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel)
487 | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); 792 | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
488 vcpu_e500->mas1 = gtlbe->mas1; 793 vcpu_e500->mas1 = gtlbe->mas1;
489 vcpu_e500->mas2 = gtlbe->mas2; 794 vcpu_e500->mas2 = gtlbe->mas2;
490 vcpu_e500->mas3 = gtlbe->mas3; 795 vcpu_e500->mas3 = gtlbe->mas3;
@@ -497,7 +802,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
497 victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; 802 victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0;
498 803
499 vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) 804 vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
500 | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); 805 | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
501 vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0) 806 vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0)
502 | (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0)) 807 | (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0))
503 | (vcpu_e500->mas4 & MAS4_TSIZED(~0)); 808 | (vcpu_e500->mas4 & MAS4_TSIZED(~0));
@@ -514,23 +819,16 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
514int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) 819int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
515{ 820{
516 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 821 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
517 u64 eaddr;
518 u64 raddr;
519 u32 tid;
520 struct tlbe *gtlbe; 822 struct tlbe *gtlbe;
521 int tlbsel, esel, stlbsel, sesel; 823 int tlbsel, esel;
522 824
523 tlbsel = get_tlb_tlbsel(vcpu_e500); 825 tlbsel = get_tlb_tlbsel(vcpu_e500);
524 esel = get_tlb_esel(vcpu_e500, tlbsel); 826 esel = get_tlb_esel(vcpu_e500, tlbsel);
525 827
526 gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; 828 gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
527 829
528 if (get_tlb_v(gtlbe) && tlbsel == 1) { 830 if (get_tlb_v(gtlbe))
529 eaddr = get_tlb_eaddr(gtlbe); 831 kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel);
530 tid = get_tlb_tid(gtlbe);
531 kvmppc_e500_tlb1_invalidate(vcpu_e500, eaddr,
532 get_tlb_end(gtlbe), tid);
533 }
534 832
535 gtlbe->mas1 = vcpu_e500->mas1; 833 gtlbe->mas1 = vcpu_e500->mas1;
536 gtlbe->mas2 = vcpu_e500->mas2; 834 gtlbe->mas2 = vcpu_e500->mas2;
@@ -542,6 +840,12 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
542 840
543 /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ 841 /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
544 if (tlbe_is_host_safe(vcpu, gtlbe)) { 842 if (tlbe_is_host_safe(vcpu, gtlbe)) {
843 struct tlbe stlbe;
844 int stlbsel, sesel;
845 u64 eaddr;
846 u64 raddr;
847
848 preempt_disable();
545 switch (tlbsel) { 849 switch (tlbsel) {
546 case 0: 850 case 0:
547 /* TLB0 */ 851 /* TLB0 */
@@ -549,7 +853,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
549 gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); 853 gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
550 854
551 stlbsel = 0; 855 stlbsel = 0;
552 sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel); 856 sesel = kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
553 857
554 break; 858 break;
555 859
@@ -564,13 +868,14 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
564 * are mapped on the fly. */ 868 * are mapped on the fly. */
565 stlbsel = 1; 869 stlbsel = 1;
566 sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, 870 sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr,
567 raddr >> PAGE_SHIFT, gtlbe); 871 raddr >> PAGE_SHIFT, gtlbe, &stlbe);
568 break; 872 break;
569 873
570 default: 874 default:
571 BUG(); 875 BUG();
572 } 876 }
573 write_host_tlbe(vcpu_e500, stlbsel, sesel); 877 write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe);
878 preempt_enable();
574 } 879 }
575 880
576 kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); 881 kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
@@ -610,7 +915,7 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index,
610{ 915{
611 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 916 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
612 struct tlbe *gtlbe = 917 struct tlbe *gtlbe =
613 &vcpu_e500->guest_tlb[tlbsel_of(index)][esel_of(index)]; 918 &vcpu_e500->gtlb_arch[tlbsel_of(index)][esel_of(index)];
614 u64 pgmask = get_tlb_bytes(gtlbe) - 1; 919 u64 pgmask = get_tlb_bytes(gtlbe) - 1;
615 920
616 return get_tlb_raddr(gtlbe) | (eaddr & pgmask); 921 return get_tlb_raddr(gtlbe) | (eaddr & pgmask);
@@ -618,38 +923,37 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index,
618 923
619void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) 924void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
620{ 925{
621 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
622 int tlbsel, i;
623
624 for (tlbsel = 0; tlbsel < 2; tlbsel++)
625 for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++)
626 kvmppc_e500_shadow_release(vcpu_e500, tlbsel, i);
627
628 /* discard all guest mapping */
629 _tlbil_all();
630} 926}
631 927
632void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, 928void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
633 unsigned int index) 929 unsigned int index)
634{ 930{
635 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 931 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
932 struct tlbe_priv *priv;
933 struct tlbe *gtlbe, stlbe;
636 int tlbsel = tlbsel_of(index); 934 int tlbsel = tlbsel_of(index);
637 int esel = esel_of(index); 935 int esel = esel_of(index);
638 int stlbsel, sesel; 936 int stlbsel, sesel;
639 937
938 gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel];
939
940 preempt_disable();
640 switch (tlbsel) { 941 switch (tlbsel) {
641 case 0: 942 case 0:
642 stlbsel = 0; 943 stlbsel = 0;
643 sesel = esel; 944 sesel = esel;
945 priv = &vcpu_e500->gtlb_priv[stlbsel][sesel];
946
947 kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K,
948 priv, eaddr, &stlbe);
644 break; 949 break;
645 950
646 case 1: { 951 case 1: {
647 gfn_t gfn = gpaddr >> PAGE_SHIFT; 952 gfn_t gfn = gpaddr >> PAGE_SHIFT;
648 struct tlbe *gtlbe
649 = &vcpu_e500->guest_tlb[tlbsel][esel];
650 953
651 stlbsel = 1; 954 stlbsel = 1;
652 sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe); 955 sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn,
956 gtlbe, &stlbe);
653 break; 957 break;
654 } 958 }
655 959
@@ -657,7 +961,9 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
657 BUG(); 961 BUG();
658 break; 962 break;
659 } 963 }
660 write_host_tlbe(vcpu_e500, stlbsel, sesel); 964
965 write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe);
966 preempt_enable();
661} 967}
662 968
663int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, 969int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
@@ -679,8 +985,10 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
679{ 985{
680 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 986 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
681 987
682 vcpu_e500->pid[0] = vcpu->arch.shadow_pid = 988 if (vcpu->arch.pid != pid) {
683 vcpu->arch.pid = pid; 989 vcpu_e500->pid[0] = vcpu->arch.pid = pid;
990 kvmppc_e500_recalc_shadow_pid(vcpu_e500);
991 }
684} 992}
685 993
686void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) 994void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
@@ -688,14 +996,14 @@ void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
688 struct tlbe *tlbe; 996 struct tlbe *tlbe;
689 997
690 /* Insert large initial mapping for guest. */ 998 /* Insert large initial mapping for guest. */
691 tlbe = &vcpu_e500->guest_tlb[1][0]; 999 tlbe = &vcpu_e500->gtlb_arch[1][0];
692 tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); 1000 tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
693 tlbe->mas2 = 0; 1001 tlbe->mas2 = 0;
694 tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; 1002 tlbe->mas3 = E500_TLB_SUPER_PERM_MASK;
695 tlbe->mas7 = 0; 1003 tlbe->mas7 = 0;
696 1004
697 /* 4K map for serial output. Used by kernel wrapper. */ 1005 /* 4K map for serial output. Used by kernel wrapper. */
698 tlbe = &vcpu_e500->guest_tlb[1][1]; 1006 tlbe = &vcpu_e500->gtlb_arch[1][1];
699 tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); 1007 tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
700 tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; 1008 tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
701 tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; 1009 tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
@@ -706,68 +1014,64 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
706{ 1014{
707 tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF; 1015 tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF;
708 1016
709 vcpu_e500->guest_tlb_size[0] = KVM_E500_TLB0_SIZE; 1017 vcpu_e500->gtlb_size[0] = KVM_E500_TLB0_SIZE;
710 vcpu_e500->guest_tlb[0] = 1018 vcpu_e500->gtlb_arch[0] =
711 kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); 1019 kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
712 if (vcpu_e500->guest_tlb[0] == NULL) 1020 if (vcpu_e500->gtlb_arch[0] == NULL)
713 goto err_out; 1021 goto err_out;
714 1022
715 vcpu_e500->shadow_tlb_size[0] = KVM_E500_TLB0_SIZE; 1023 vcpu_e500->gtlb_size[1] = KVM_E500_TLB1_SIZE;
716 vcpu_e500->shadow_tlb[0] = 1024 vcpu_e500->gtlb_arch[1] =
717 kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
718 if (vcpu_e500->shadow_tlb[0] == NULL)
719 goto err_out_guest0;
720
721 vcpu_e500->guest_tlb_size[1] = KVM_E500_TLB1_SIZE;
722 vcpu_e500->guest_tlb[1] =
723 kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL); 1025 kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL);
724 if (vcpu_e500->guest_tlb[1] == NULL) 1026 if (vcpu_e500->gtlb_arch[1] == NULL)
725 goto err_out_shadow0; 1027 goto err_out_guest0;
726 1028
727 vcpu_e500->shadow_tlb_size[1] = tlb1_entry_num; 1029 vcpu_e500->gtlb_priv[0] = (struct tlbe_priv *)
728 vcpu_e500->shadow_tlb[1] = 1030 kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
729 kzalloc(sizeof(struct tlbe) * tlb1_entry_num, GFP_KERNEL); 1031 if (vcpu_e500->gtlb_priv[0] == NULL)
730 if (vcpu_e500->shadow_tlb[1] == NULL)
731 goto err_out_guest1; 1032 goto err_out_guest1;
1033 vcpu_e500->gtlb_priv[1] = (struct tlbe_priv *)
1034 kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB1_SIZE, GFP_KERNEL);
732 1035
733 vcpu_e500->shadow_pages[0] = (struct page **) 1036 if (vcpu_e500->gtlb_priv[1] == NULL)
734 kzalloc(sizeof(struct page *) * KVM_E500_TLB0_SIZE, GFP_KERNEL); 1037 goto err_out_priv0;
735 if (vcpu_e500->shadow_pages[0] == NULL)
736 goto err_out_shadow1;
737 1038
738 vcpu_e500->shadow_pages[1] = (struct page **) 1039 if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL)
739 kzalloc(sizeof(struct page *) * tlb1_entry_num, GFP_KERNEL); 1040 goto err_out_priv1;
740 if (vcpu_e500->shadow_pages[1] == NULL)
741 goto err_out_page0;
742 1041
743 /* Init TLB configuration register */ 1042 /* Init TLB configuration register */
744 vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; 1043 vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL;
745 vcpu_e500->tlb0cfg |= vcpu_e500->guest_tlb_size[0]; 1044 vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_size[0];
746 vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; 1045 vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL;
747 vcpu_e500->tlb1cfg |= vcpu_e500->guest_tlb_size[1]; 1046 vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_size[1];
748 1047
749 return 0; 1048 return 0;
750 1049
751err_out_page0: 1050err_out_priv1:
752 kfree(vcpu_e500->shadow_pages[0]); 1051 kfree(vcpu_e500->gtlb_priv[1]);
753err_out_shadow1: 1052err_out_priv0:
754 kfree(vcpu_e500->shadow_tlb[1]); 1053 kfree(vcpu_e500->gtlb_priv[0]);
755err_out_guest1: 1054err_out_guest1:
756 kfree(vcpu_e500->guest_tlb[1]); 1055 kfree(vcpu_e500->gtlb_arch[1]);
757err_out_shadow0:
758 kfree(vcpu_e500->shadow_tlb[0]);
759err_out_guest0: 1056err_out_guest0:
760 kfree(vcpu_e500->guest_tlb[0]); 1057 kfree(vcpu_e500->gtlb_arch[0]);
761err_out: 1058err_out:
762 return -1; 1059 return -1;
763} 1060}
764 1061
765void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) 1062void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
766{ 1063{
767 kfree(vcpu_e500->shadow_pages[1]); 1064 int stlbsel, i;
768 kfree(vcpu_e500->shadow_pages[0]); 1065
769 kfree(vcpu_e500->shadow_tlb[1]); 1066 /* release all privs */
770 kfree(vcpu_e500->guest_tlb[1]); 1067 for (stlbsel = 0; stlbsel < 2; stlbsel++)
771 kfree(vcpu_e500->shadow_tlb[0]); 1068 for (i = 0; i < vcpu_e500->gtlb_size[stlbsel]; i++) {
772 kfree(vcpu_e500->guest_tlb[0]); 1069 struct tlbe_priv *priv =
1070 &vcpu_e500->gtlb_priv[stlbsel][i];
1071 kvmppc_e500_priv_release(priv);
1072 }
1073
1074 kvmppc_e500_id_table_free(vcpu_e500);
1075 kfree(vcpu_e500->gtlb_arch[1]);
1076 kfree(vcpu_e500->gtlb_arch[0]);
773} 1077}
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h
index 458946b4775d..59b88e99a235 100644
--- a/arch/powerpc/kvm/e500_tlb.h
+++ b/arch/powerpc/kvm/e500_tlb.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. 2 * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
3 * 3 *
4 * Author: Yu Liu, yu.liu@freescale.com 4 * Author: Yu Liu, yu.liu@freescale.com
5 * 5 *
@@ -55,6 +55,7 @@ extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int);
55extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *); 55extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *);
56extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *); 56extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *);
57extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *); 57extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
58extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *);
58 59
59/* TLB helper functions */ 60/* TLB helper functions */
60static inline unsigned int get_tlb_size(const struct tlbe *tlbe) 61static inline unsigned int get_tlb_size(const struct tlbe *tlbe)
@@ -110,6 +111,16 @@ static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu)
110 return vcpu->arch.pid & 0xff; 111 return vcpu->arch.pid & 0xff;
111} 112}
112 113
114static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu)
115{
116 return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS));
117}
118
119static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu)
120{
121 return !!(vcpu->arch.shared->msr & MSR_PR);
122}
123
113static inline unsigned int get_cur_spid( 124static inline unsigned int get_cur_spid(
114 const struct kvmppc_vcpu_e500 *vcpu_e500) 125 const struct kvmppc_vcpu_e500 *vcpu_e500)
115{ 126{
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 616dd516ca1f..a107c9be0fb1 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -30,6 +30,7 @@
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/kvm_ppc.h> 31#include <asm/kvm_ppc.h>
32#include <asm/tlbflush.h> 32#include <asm/tlbflush.h>
33#include <asm/cputhreads.h>
33#include "timing.h" 34#include "timing.h"
34#include "../mm/mmu_decl.h" 35#include "../mm/mmu_decl.h"
35 36
@@ -38,8 +39,12 @@
38 39
39int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 40int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
40{ 41{
42#ifndef CONFIG_KVM_BOOK3S_64_HV
41 return !(v->arch.shared->msr & MSR_WE) || 43 return !(v->arch.shared->msr & MSR_WE) ||
42 !!(v->arch.pending_exceptions); 44 !!(v->arch.pending_exceptions);
45#else
46 return !(v->arch.ceded) || !!(v->arch.pending_exceptions);
47#endif
43} 48}
44 49
45int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) 50int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
@@ -73,7 +78,8 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
73 } 78 }
74 case HC_VENDOR_KVM | KVM_HC_FEATURES: 79 case HC_VENDOR_KVM | KVM_HC_FEATURES:
75 r = HC_EV_SUCCESS; 80 r = HC_EV_SUCCESS;
76#if defined(CONFIG_PPC_BOOK3S) /* XXX Missing magic page on BookE */ 81#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500)
82 /* XXX Missing magic page on 44x */
77 r2 |= (1 << KVM_FEATURE_MAGIC_PAGE); 83 r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
78#endif 84#endif
79 85
@@ -147,7 +153,7 @@ void kvm_arch_check_processor_compat(void *rtn)
147 153
148int kvm_arch_init_vm(struct kvm *kvm) 154int kvm_arch_init_vm(struct kvm *kvm)
149{ 155{
150 return 0; 156 return kvmppc_core_init_vm(kvm);
151} 157}
152 158
153void kvm_arch_destroy_vm(struct kvm *kvm) 159void kvm_arch_destroy_vm(struct kvm *kvm)
@@ -163,6 +169,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
163 kvm->vcpus[i] = NULL; 169 kvm->vcpus[i] = NULL;
164 170
165 atomic_set(&kvm->online_vcpus, 0); 171 atomic_set(&kvm->online_vcpus, 0);
172
173 kvmppc_core_destroy_vm(kvm);
174
166 mutex_unlock(&kvm->lock); 175 mutex_unlock(&kvm->lock);
167} 176}
168 177
@@ -180,10 +189,13 @@ int kvm_dev_ioctl_check_extension(long ext)
180#else 189#else
181 case KVM_CAP_PPC_SEGSTATE: 190 case KVM_CAP_PPC_SEGSTATE:
182#endif 191#endif
183 case KVM_CAP_PPC_PAIRED_SINGLES:
184 case KVM_CAP_PPC_UNSET_IRQ: 192 case KVM_CAP_PPC_UNSET_IRQ:
185 case KVM_CAP_PPC_IRQ_LEVEL: 193 case KVM_CAP_PPC_IRQ_LEVEL:
186 case KVM_CAP_ENABLE_CAP: 194 case KVM_CAP_ENABLE_CAP:
195 r = 1;
196 break;
197#ifndef CONFIG_KVM_BOOK3S_64_HV
198 case KVM_CAP_PPC_PAIRED_SINGLES:
187 case KVM_CAP_PPC_OSI: 199 case KVM_CAP_PPC_OSI:
188 case KVM_CAP_PPC_GET_PVINFO: 200 case KVM_CAP_PPC_GET_PVINFO:
189 r = 1; 201 r = 1;
@@ -191,6 +203,21 @@ int kvm_dev_ioctl_check_extension(long ext)
191 case KVM_CAP_COALESCED_MMIO: 203 case KVM_CAP_COALESCED_MMIO:
192 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 204 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
193 break; 205 break;
206#endif
207#ifdef CONFIG_KVM_BOOK3S_64_HV
208 case KVM_CAP_SPAPR_TCE:
209 r = 1;
210 break;
211 case KVM_CAP_PPC_SMT:
212 r = threads_per_core;
213 break;
214 case KVM_CAP_PPC_RMA:
215 r = 1;
216 /* PPC970 requires an RMA */
217 if (cpu_has_feature(CPU_FTR_ARCH_201))
218 r = 2;
219 break;
220#endif
194 default: 221 default:
195 r = 0; 222 r = 0;
196 break; 223 break;
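The capabilities added above are meant to be probed from user space with the standard KVM_CHECK_EXTENSION ioctl. A minimal sketch (error handling elided; it assumes kernel headers that already define KVM_CAP_PPC_SMT and KVM_CAP_PPC_RMA, and the return-value interpretation follows the comments in the hunk):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	if (kvm < 0)
		return 1;

	int smt = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_SMT);
	int rma = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_RMA);

	/* smt: host threads per core; rma: 1 = RMA supported,
	 * 2 = RMA required (PPC970), per the hunk above. */
	printf("PPC_SMT=%d PPC_RMA=%d\n", smt, rma);
	return 0;
}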
@@ -211,7 +238,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
211 struct kvm_userspace_memory_region *mem, 238 struct kvm_userspace_memory_region *mem,
212 int user_alloc) 239 int user_alloc)
213{ 240{
214 return 0; 241 return kvmppc_core_prepare_memory_region(kvm, mem);
215} 242}
216 243
217void kvm_arch_commit_memory_region(struct kvm *kvm, 244void kvm_arch_commit_memory_region(struct kvm *kvm,
@@ -219,7 +246,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
219 struct kvm_memory_slot old, 246 struct kvm_memory_slot old,
220 int user_alloc) 247 int user_alloc)
221{ 248{
222 return; 249 kvmppc_core_commit_memory_region(kvm, mem);
223} 250}
224 251
225 252
@@ -287,6 +314,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
287 hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 314 hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
288 tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); 315 tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
289 vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; 316 vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
317 vcpu->arch.dec_expires = ~(u64)0;
290 318
291#ifdef CONFIG_KVM_EXIT_TIMING 319#ifdef CONFIG_KVM_EXIT_TIMING
292 mutex_init(&vcpu->arch.exit_timing_lock); 320 mutex_init(&vcpu->arch.exit_timing_lock);
@@ -313,6 +341,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
313 mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); 341 mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
314#endif 342#endif
315 kvmppc_core_vcpu_load(vcpu, cpu); 343 kvmppc_core_vcpu_load(vcpu, cpu);
344 vcpu->cpu = smp_processor_id();
316} 345}
317 346
318void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 347void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -321,6 +350,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
321#ifdef CONFIG_BOOKE 350#ifdef CONFIG_BOOKE
322 vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); 351 vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
323#endif 352#endif
353 vcpu->cpu = -1;
324} 354}
325 355
326int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 356int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
@@ -492,15 +522,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
492 for (i = 0; i < 32; i++) 522 for (i = 0; i < 32; i++)
493 kvmppc_set_gpr(vcpu, i, gprs[i]); 523 kvmppc_set_gpr(vcpu, i, gprs[i]);
494 vcpu->arch.osi_needed = 0; 524 vcpu->arch.osi_needed = 0;
525 } else if (vcpu->arch.hcall_needed) {
526 int i;
527
528 kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret);
529 for (i = 0; i < 9; ++i)
530 kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
531 vcpu->arch.hcall_needed = 0;
495 } 532 }
496 533
497 kvmppc_core_deliver_interrupts(vcpu); 534 kvmppc_core_deliver_interrupts(vcpu);
498 535
499 local_irq_disable(); 536 r = kvmppc_vcpu_run(run, vcpu);
500 kvm_guest_enter();
501 r = __kvmppc_vcpu_run(run, vcpu);
502 kvm_guest_exit();
503 local_irq_enable();
504 537
505 if (vcpu->sigset_active) 538 if (vcpu->sigset_active)
506 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 539 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
@@ -518,6 +551,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
518 if (waitqueue_active(&vcpu->wq)) { 551 if (waitqueue_active(&vcpu->wq)) {
519 wake_up_interruptible(&vcpu->wq); 552 wake_up_interruptible(&vcpu->wq);
520 vcpu->stat.halt_wakeup++; 553 vcpu->stat.halt_wakeup++;
554 } else if (vcpu->cpu != -1) {
555 smp_send_reschedule(vcpu->cpu);
521 } 556 }
522 557
523 return 0; 558 return 0;
@@ -633,6 +668,29 @@ long kvm_arch_vm_ioctl(struct file *filp,
633 668
634 break; 669 break;
635 } 670 }
671#ifdef CONFIG_KVM_BOOK3S_64_HV
672 case KVM_CREATE_SPAPR_TCE: {
673 struct kvm_create_spapr_tce create_tce;
674 struct kvm *kvm = filp->private_data;
675
676 r = -EFAULT;
677 if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
678 goto out;
679 r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
680 goto out;
681 }
682
683 case KVM_ALLOCATE_RMA: {
684 struct kvm *kvm = filp->private_data;
685 struct kvm_allocate_rma rma;
686
687 r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
688 if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
689 r = -EFAULT;
690 break;
691 }
692#endif /* CONFIG_KVM_BOOK3S_64_HV */
693
636 default: 694 default:
637 r = -ENOTTY; 695 r = -ENOTTY;
638 } 696 }
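The new hcall_needed branch is the in-kernel half of a userspace hypercall exit: when the kernel cannot handle a PAPR hypercall it returns to userspace, and whatever userspace leaves in the papr_hcall fields is copied into GPR3-GPR12 on the next KVM_RUN, as the run-ioctl hunk above shows. A hedged userspace sketch; the KVM_EXIT_PAPR_HCALL exit reason and the papr_hcall.nr field are assumed from elsewhere in this series:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Complete a hypercall that the kernel punted to userspace. */
static void complete_papr_hcall(int vcpu_fd, struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_PAPR_HCALL)
		return;

	/* run->papr_hcall.nr identifies the hcall (assumed field name) */
	run->papr_hcall.ret = 0;	/* H_SUCCESS -> GPR3 on re-entry */
	run->papr_hcall.args[0] = 0;	/* GPR4..GPR12                   */
	ioctl(vcpu_fd, KVM_RUN, 0);	/* kernel writes them back       */
}
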
diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
index 319177df9587..07b6110a4bb7 100644
--- a/arch/powerpc/kvm/timing.c
+++ b/arch/powerpc/kvm/timing.c
@@ -56,15 +56,6 @@ static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type)
56{ 56{
57 u64 old; 57 u64 old;
58 58
59 do_div(duration, tb_ticks_per_usec);
60 if (unlikely(duration > 0xFFFFFFFF)) {
61 printk(KERN_ERR"%s - duration too big -> overflow"
62 " duration %lld type %d exit #%d\n",
63 __func__, duration, type,
64 vcpu->arch.timing_count_type[type]);
65 return;
66 }
67
68 mutex_lock(&vcpu->arch.exit_timing_lock); 59 mutex_lock(&vcpu->arch.exit_timing_lock);
69 60
70 vcpu->arch.timing_count_type[type]++; 61 vcpu->arch.timing_count_type[type]++;
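The lines dropped here converted the raw timebase delta to microseconds and bailed out on values that would overflow 32 bits. For reference, do_div() divides its 64-bit first argument in place and returns the remainder, so the removed conversion behaved like this sketch:

#include <asm/div64.h>

/* Convert a timebase-tick delta to microseconds, as the removed code did. */
static u64 tb_delta_to_usec(u64 delta, unsigned int tb_ticks_per_usec)
{
	do_div(delta, tb_ticks_per_usec);	/* quotient is left in 'delta' */
	return delta;
}
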
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index 3aca1b042b8c..b135d3d397db 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -103,7 +103,7 @@ TRACE_EVENT(kvm_gtlb_write,
103 * Book3S trace points * 103 * Book3S trace points *
104 *************************************************************************/ 104 *************************************************************************/
105 105
106#ifdef CONFIG_PPC_BOOK3S 106#ifdef CONFIG_KVM_BOOK3S_PR
107 107
108TRACE_EVENT(kvm_book3s_exit, 108TRACE_EVENT(kvm_book3s_exit,
109 TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), 109 TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
@@ -252,7 +252,7 @@ TRACE_EVENT(kvm_book3s_mmu_flush,
252 ), 252 ),
253 253
254 TP_fast_assign( 254 TP_fast_assign(
255 __entry->count = vcpu->arch.hpte_cache_count; 255 __entry->count = to_book3s(vcpu)->hpte_cache_count;
256 __entry->p1 = p1; 256 __entry->p1 = p1;
257 __entry->p2 = p2; 257 __entry->p2 = p2;
258 __entry->type = type; 258 __entry->type = type;
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index dfd764896db0..90039bc64119 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -37,7 +37,7 @@
37 37
38#define HPTE_LOCK_BIT 3 38#define HPTE_LOCK_BIT 3
39 39
40static DEFINE_RAW_SPINLOCK(native_tlbie_lock); 40DEFINE_RAW_SPINLOCK(native_tlbie_lock);
41 41
42static inline void __tlbie(unsigned long va, int psize, int ssize) 42static inline void __tlbie(unsigned long va, int psize, int ssize)
43{ 43{
@@ -51,7 +51,7 @@ static inline void __tlbie(unsigned long va, int psize, int ssize)
51 va &= ~0xffful; 51 va &= ~0xffful;
52 va |= ssize << 8; 52 va |= ssize << 8;
53 asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) 53 asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
54 : : "r" (va), "r"(0), "i" (CPU_FTR_HVMODE_206) 54 : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
55 : "memory"); 55 : "memory");
56 break; 56 break;
57 default: 57 default:
@@ -61,7 +61,7 @@ static inline void __tlbie(unsigned long va, int psize, int ssize)
61 va |= ssize << 8; 61 va |= ssize << 8;
62 va |= 1; /* L */ 62 va |= 1; /* L */
63 asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2) 63 asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
64 : : "r" (va), "r"(0), "i" (CPU_FTR_HVMODE_206) 64 : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
65 : "memory"); 65 : "memory");
66 break; 66 break;
67 } 67 }
diff --git a/arch/powerpc/platforms/iseries/exception.S b/arch/powerpc/platforms/iseries/exception.S
index 29c02f36b32f..f519ee17ff7d 100644
--- a/arch/powerpc/platforms/iseries/exception.S
+++ b/arch/powerpc/platforms/iseries/exception.S
@@ -167,7 +167,7 @@ BEGIN_FTR_SECTION
167 std r12,PACA_EXGEN+EX_R13(r13) 167 std r12,PACA_EXGEN+EX_R13(r13)
168 EXCEPTION_PROLOG_ISERIES_1 168 EXCEPTION_PROLOG_ISERIES_1
169FTR_SECTION_ELSE 169FTR_SECTION_ELSE
170 EXCEPTION_PROLOG_1(PACA_EXGEN) 170 EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0)
171 EXCEPTION_PROLOG_ISERIES_1 171 EXCEPTION_PROLOG_ISERIES_1
172ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_SLB) 172ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_SLB)
173 b data_access_common 173 b data_access_common
diff --git a/arch/powerpc/platforms/iseries/exception.h b/arch/powerpc/platforms/iseries/exception.h
index bae3fba5ad8e..50271b550a99 100644
--- a/arch/powerpc/platforms/iseries/exception.h
+++ b/arch/powerpc/platforms/iseries/exception.h
@@ -39,7 +39,7 @@
39label##_iSeries: \ 39label##_iSeries: \
40 HMT_MEDIUM; \ 40 HMT_MEDIUM; \
41 mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ 41 mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \
42 EXCEPTION_PROLOG_1(area); \ 42 EXCEPTION_PROLOG_1(area, NOTEST, 0); \
43 EXCEPTION_PROLOG_ISERIES_1; \ 43 EXCEPTION_PROLOG_ISERIES_1; \
44 b label##_common 44 b label##_common
45 45
@@ -48,7 +48,7 @@ label##_iSeries: \
48label##_iSeries: \ 48label##_iSeries: \
49 HMT_MEDIUM; \ 49 HMT_MEDIUM; \
50 mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ 50 mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \
51 EXCEPTION_PROLOG_1(PACA_EXGEN); \ 51 EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0); \
52 lbz r10,PACASOFTIRQEN(r13); \ 52 lbz r10,PACASOFTIRQEN(r13); \
53 cmpwi 0,r10,0; \ 53 cmpwi 0,r10,0; \
54 beq- label##_iSeries_masked; \ 54 beq- label##_iSeries_masked; \
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 1f15ad436140..ba382b59b926 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -17,6 +17,7 @@
17#include <linux/cpu.h> 17#include <linux/cpu.h>
18#include <linux/of.h> 18#include <linux/of.h>
19#include <linux/spinlock.h> 19#include <linux/spinlock.h>
20#include <linux/module.h>
20 21
21#include <asm/prom.h> 22#include <asm/prom.h>
22#include <asm/io.h> 23#include <asm/io.h>
@@ -24,6 +25,7 @@
24#include <asm/irq.h> 25#include <asm/irq.h>
25#include <asm/errno.h> 26#include <asm/errno.h>
26#include <asm/xics.h> 27#include <asm/xics.h>
28#include <asm/kvm_ppc.h>
27 29
28struct icp_ipl { 30struct icp_ipl {
29 union { 31 union {
@@ -139,6 +141,12 @@ static void icp_native_cause_ipi(int cpu, unsigned long data)
139 icp_native_set_qirr(cpu, IPI_PRIORITY); 141 icp_native_set_qirr(cpu, IPI_PRIORITY);
140} 142}
141 143
144void xics_wake_cpu(int cpu)
145{
146 icp_native_set_qirr(cpu, IPI_PRIORITY);
147}
148EXPORT_SYMBOL_GPL(xics_wake_cpu);
149
142static irqreturn_t icp_native_ipi_action(int irq, void *dev_id) 150static irqreturn_t icp_native_ipi_action(int irq, void *dev_id)
143{ 151{
144 int cpu = smp_processor_id(); 152 int cpu = smp_processor_id();
@@ -185,6 +193,7 @@ static int __init icp_native_map_one_cpu(int hw_id, unsigned long addr,
185 } 193 }
186 194
187 icp_native_regs[cpu] = ioremap(addr, size); 195 icp_native_regs[cpu] = ioremap(addr, size);
196 kvmppc_set_xics_phys(cpu, addr);
188 if (!icp_native_regs[cpu]) { 197 if (!icp_native_regs[cpu]) {
189 pr_warning("icp_native: Failed ioremap for CPU %d, " 198 pr_warning("icp_native: Failed ioremap for CPU %d, "
190 "interrupt server #0x%x, addr %#lx\n", 199 "interrupt server #0x%x, addr %#lx\n",
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b2127544fbe7..a67e014e4e44 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -529,6 +529,18 @@ menuconfig PARAVIRT_GUEST
529 529
530if PARAVIRT_GUEST 530if PARAVIRT_GUEST
531 531
532config PARAVIRT_TIME_ACCOUNTING
533 bool "Paravirtual steal time accounting"
534 select PARAVIRT
535 default n
536 ---help---
537 Select this option to enable fine granularity task steal time
538 accounting. Time spent executing other tasks in parallel with
539 the current vCPU is discounted from the vCPU power. To account for
540 that, there can be a small performance impact.
541
542 If in doubt, say N here.
543
532source "arch/x86/xen/Kconfig" 544source "arch/x86/xen/Kconfig"
533 545
534config KVM_CLOCK 546config KVM_CLOCK
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0049211959c0..6040d115ef51 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -229,7 +229,26 @@ struct read_cache {
229 unsigned long end; 229 unsigned long end;
230}; 230};
231 231
232struct decode_cache { 232struct x86_emulate_ctxt {
233 struct x86_emulate_ops *ops;
234
235 /* Register state before/after emulation. */
236 unsigned long eflags;
237 unsigned long eip; /* eip before instruction emulation */
238 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
239 int mode;
240
241 /* interruptibility state, as a result of execution of STI or MOV SS */
242 int interruptibility;
243
244 bool guest_mode; /* guest running a nested guest */
245 bool perm_ok; /* do not check permissions if true */
246 bool only_vendor_specific_insn;
247
248 bool have_exception;
249 struct x86_exception exception;
250
251 /* decode cache */
233 u8 twobyte; 252 u8 twobyte;
234 u8 b; 253 u8 b;
235 u8 intercept; 254 u8 intercept;
@@ -246,8 +265,6 @@ struct decode_cache {
246 unsigned int d; 265 unsigned int d;
247 int (*execute)(struct x86_emulate_ctxt *ctxt); 266 int (*execute)(struct x86_emulate_ctxt *ctxt);
248 int (*check_perm)(struct x86_emulate_ctxt *ctxt); 267 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
249 unsigned long regs[NR_VCPU_REGS];
250 unsigned long eip;
251 /* modrm */ 268 /* modrm */
252 u8 modrm; 269 u8 modrm;
253 u8 modrm_mod; 270 u8 modrm_mod;
@@ -255,34 +272,14 @@ struct decode_cache {
255 u8 modrm_rm; 272 u8 modrm_rm;
256 u8 modrm_seg; 273 u8 modrm_seg;
257 bool rip_relative; 274 bool rip_relative;
275 unsigned long _eip;
276 /* Fields above regs are cleared together. */
277 unsigned long regs[NR_VCPU_REGS];
258 struct fetch_cache fetch; 278 struct fetch_cache fetch;
259 struct read_cache io_read; 279 struct read_cache io_read;
260 struct read_cache mem_read; 280 struct read_cache mem_read;
261}; 281};
262 282
263struct x86_emulate_ctxt {
264 struct x86_emulate_ops *ops;
265
266 /* Register state before/after emulation. */
267 unsigned long eflags;
268 unsigned long eip; /* eip before instruction emulation */
269 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
270 int mode;
271
272 /* interruptibility state, as a result of execution of STI or MOV SS */
273 int interruptibility;
274
275 bool guest_mode; /* guest running a nested guest */
276 bool perm_ok; /* do not check permissions if true */
277 bool only_vendor_specific_insn;
278
279 bool have_exception;
280 struct x86_exception exception;
281
282 /* decode cache */
283 struct decode_cache decode;
284};
285
286/* Repeat String Operation Prefix */ 283/* Repeat String Operation Prefix */
287#define REPE_PREFIX 0xf3 284#define REPE_PREFIX 0xf3
288#define REPNE_PREFIX 0xf2 285#define REPNE_PREFIX 0xf2
@@ -373,6 +370,5 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
373int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 370int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
374 u16 tss_selector, int reason, 371 u16 tss_selector, int reason,
375 bool has_error_code, u32 error_code); 372 bool has_error_code, u32 error_code);
376int emulate_int_real(struct x86_emulate_ctxt *ctxt, 373int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
377 struct x86_emulate_ops *ops, int irq);
378#endif /* _ASM_X86_KVM_X86_EMULATE_H */ 374#endif /* _ASM_X86_KVM_X86_EMULATE_H */
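Folding struct decode_cache into x86_emulate_ctxt relies on the field ordering noted in the struct ("Fields above regs are cleared together"), which lets the per-instruction decode state be reset with a single memset. A plausible sketch of that reset; the helper name here is chosen for illustration and is not taken from this patch:

#include <linux/stddef.h>
#include <linux/string.h>
#include <asm/kvm_emulate.h>

/* Wipe everything from 'twobyte' up to (but not including) 'regs'. */
static void clear_decode_state(struct x86_emulate_ctxt *ctxt)
{
	size_t start = offsetof(struct x86_emulate_ctxt, twobyte);
	size_t end = offsetof(struct x86_emulate_ctxt, regs);

	memset((char *)ctxt + start, 0, end - start);
}
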
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d2ac8e2ee897..dd51c83aa5de 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,7 +48,7 @@
48 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 48 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
49 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 49 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
50 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 50 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
51 | X86_CR4_OSXSAVE \ 51 | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
52 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 52 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
53 53
54#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 54#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -205,6 +205,7 @@ union kvm_mmu_page_role {
205 unsigned invalid:1; 205 unsigned invalid:1;
206 unsigned nxe:1; 206 unsigned nxe:1;
207 unsigned cr0_wp:1; 207 unsigned cr0_wp:1;
208 unsigned smep_andnot_wp:1;
208 }; 209 };
209}; 210};
210 211
@@ -227,15 +228,17 @@ struct kvm_mmu_page {
227 * in this shadow page. 228 * in this shadow page.
228 */ 229 */
229 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 230 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
230 bool multimapped; /* More than one parent_pte? */
231 bool unsync; 231 bool unsync;
232 int root_count; /* Currently serving as active root */ 232 int root_count; /* Currently serving as active root */
233 unsigned int unsync_children; 233 unsigned int unsync_children;
234 union { 234 unsigned long parent_ptes; /* Reverse mapping for parent_pte */
235 u64 *parent_pte; /* !multimapped */
236 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
237 };
238 DECLARE_BITMAP(unsync_child_bitmap, 512); 235 DECLARE_BITMAP(unsync_child_bitmap, 512);
236
237#ifdef CONFIG_X86_32
238 int clear_spte_count;
239#endif
240
241 struct rcu_head rcu;
239}; 242};
240 243
241struct kvm_pv_mmu_op_buffer { 244struct kvm_pv_mmu_op_buffer {
@@ -269,8 +272,6 @@ struct kvm_mmu {
269 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, 272 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
270 struct x86_exception *exception); 273 struct x86_exception *exception);
271 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); 274 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
272 void (*prefetch_page)(struct kvm_vcpu *vcpu,
273 struct kvm_mmu_page *page);
274 int (*sync_page)(struct kvm_vcpu *vcpu, 275 int (*sync_page)(struct kvm_vcpu *vcpu,
275 struct kvm_mmu_page *sp); 276 struct kvm_mmu_page *sp);
276 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 277 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
@@ -346,8 +347,7 @@ struct kvm_vcpu_arch {
346 * put it here to avoid allocation */ 347 * put it here to avoid allocation */
347 struct kvm_pv_mmu_op_buffer mmu_op_buffer; 348 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
348 349
349 struct kvm_mmu_memory_cache mmu_pte_chain_cache; 350 struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
350 struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
351 struct kvm_mmu_memory_cache mmu_page_cache; 351 struct kvm_mmu_memory_cache mmu_page_cache;
352 struct kvm_mmu_memory_cache mmu_page_header_cache; 352 struct kvm_mmu_memory_cache mmu_page_header_cache;
353 353
@@ -393,6 +393,15 @@ struct kvm_vcpu_arch {
393 unsigned int hw_tsc_khz; 393 unsigned int hw_tsc_khz;
394 unsigned int time_offset; 394 unsigned int time_offset;
395 struct page *time_page; 395 struct page *time_page;
396
397 struct {
398 u64 msr_val;
399 u64 last_steal;
400 u64 accum_steal;
401 struct gfn_to_hva_cache stime;
402 struct kvm_steal_time steal;
403 } st;
404
396 u64 last_guest_tsc; 405 u64 last_guest_tsc;
397 u64 last_kernel_ns; 406 u64 last_kernel_ns;
398 u64 last_tsc_nsec; 407 u64 last_tsc_nsec;
@@ -419,6 +428,11 @@ struct kvm_vcpu_arch {
419 u64 mcg_ctl; 428 u64 mcg_ctl;
420 u64 *mce_banks; 429 u64 *mce_banks;
421 430
431 /* Cache MMIO info */
432 u64 mmio_gva;
433 unsigned access;
434 gfn_t mmio_gfn;
435
422 /* used for guest single stepping over the given code position */ 436 /* used for guest single stepping over the given code position */
423 unsigned long singlestep_rip; 437 unsigned long singlestep_rip;
424 438
@@ -441,6 +455,7 @@ struct kvm_arch {
441 unsigned int n_used_mmu_pages; 455 unsigned int n_used_mmu_pages;
442 unsigned int n_requested_mmu_pages; 456 unsigned int n_requested_mmu_pages;
443 unsigned int n_max_mmu_pages; 457 unsigned int n_max_mmu_pages;
458 unsigned int indirect_shadow_pages;
444 atomic_t invlpg_counter; 459 atomic_t invlpg_counter;
445 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 460 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
446 /* 461 /*
@@ -477,6 +492,8 @@ struct kvm_arch {
477 u64 hv_guest_os_id; 492 u64 hv_guest_os_id;
478 u64 hv_hypercall; 493 u64 hv_hypercall;
479 494
495 atomic_t reader_counter;
496
480 #ifdef CONFIG_KVM_MMU_AUDIT 497 #ifdef CONFIG_KVM_MMU_AUDIT
481 int audit_point; 498 int audit_point;
482 #endif 499 #endif
@@ -559,7 +576,7 @@ struct kvm_x86_ops {
559 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); 576 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
560 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); 577 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
561 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 578 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
562 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); 579 int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
563 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); 580 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
564 void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); 581 void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
565 void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); 582 void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
@@ -636,7 +653,6 @@ void kvm_mmu_module_exit(void);
636void kvm_mmu_destroy(struct kvm_vcpu *vcpu); 653void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
637int kvm_mmu_create(struct kvm_vcpu *vcpu); 654int kvm_mmu_create(struct kvm_vcpu *vcpu);
638int kvm_mmu_setup(struct kvm_vcpu *vcpu); 655int kvm_mmu_setup(struct kvm_vcpu *vcpu);
639void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
640void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 656void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
641 u64 dirty_mask, u64 nx_mask, u64 x_mask); 657 u64 dirty_mask, u64 nx_mask, u64 x_mask);
642 658
@@ -830,11 +846,12 @@ enum {
830asmlinkage void kvm_spurious_fault(void); 846asmlinkage void kvm_spurious_fault(void);
831extern bool kvm_rebooting; 847extern bool kvm_rebooting;
832 848
833#define __kvm_handle_fault_on_reboot(insn) \ 849#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \
834 "666: " insn "\n\t" \ 850 "666: " insn "\n\t" \
835 "668: \n\t" \ 851 "668: \n\t" \
836 ".pushsection .fixup, \"ax\" \n" \ 852 ".pushsection .fixup, \"ax\" \n" \
837 "667: \n\t" \ 853 "667: \n\t" \
854 cleanup_insn "\n\t" \
838 "cmpb $0, kvm_rebooting \n\t" \ 855 "cmpb $0, kvm_rebooting \n\t" \
839 "jne 668b \n\t" \ 856 "jne 668b \n\t" \
840 __ASM_SIZE(push) " $666b \n\t" \ 857 __ASM_SIZE(push) " $666b \n\t" \
@@ -844,6 +861,9 @@ extern bool kvm_rebooting;
844 _ASM_PTR " 666b, 667b \n\t" \ 861 _ASM_PTR " 666b, 667b \n\t" \
845 ".popsection" 862 ".popsection"
846 863
864#define __kvm_handle_fault_on_reboot(insn) \
865 ____kvm_handle_fault_on_reboot(insn, "")
866
847#define KVM_ARCH_WANT_MMU_NOTIFIER 867#define KVM_ARCH_WANT_MMU_NOTIFIER
848int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 868int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
849int kvm_age_hva(struct kvm *kvm, unsigned long hva); 869int kvm_age_hva(struct kvm *kvm, unsigned long hva);
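The two-argument ____kvm_handle_fault_on_reboot() lets a caller run a cleanup instruction on the fixup path before the reboot check, so a faulting instruction can scrub its output before execution resumes. A hedged sketch of a wrapper built on it; the wrapper name and the zeroing instruction are illustrative, not taken from this patch:

/* Run 'insn'; if it faults during reboot, clear the named register first
 * so the caller never consumes stale data. */
#define __ex_clear_output(insn, reg)	\
	____kvm_handle_fault_on_reboot(insn, "xor " reg ", " reg)

/* Hypothetical use inside an asm statement:
 *   asm volatile(__ex_clear_output("vmread %%rdx, %%rax", "%%rax") : ...);
 */
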
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index a427bf77a93d..734c3767cfac 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -21,6 +21,7 @@
21 */ 21 */
22#define KVM_FEATURE_CLOCKSOURCE2 3 22#define KVM_FEATURE_CLOCKSOURCE2 3
23#define KVM_FEATURE_ASYNC_PF 4 23#define KVM_FEATURE_ASYNC_PF 4
24#define KVM_FEATURE_STEAL_TIME 5
24 25
25/* The last 8 bits are used to indicate how to interpret the flags field 26/* The last 8 bits are used to indicate how to interpret the flags field
26 * in pvclock structure. If no bits are set, all flags are ignored. 27 * in pvclock structure. If no bits are set, all flags are ignored.
@@ -30,10 +31,23 @@
30#define MSR_KVM_WALL_CLOCK 0x11 31#define MSR_KVM_WALL_CLOCK 0x11
31#define MSR_KVM_SYSTEM_TIME 0x12 32#define MSR_KVM_SYSTEM_TIME 0x12
32 33
34#define KVM_MSR_ENABLED 1
33/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ 35/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
34#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 36#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
35#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 37#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
36#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 38#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
39#define MSR_KVM_STEAL_TIME 0x4b564d03
40
41struct kvm_steal_time {
42 __u64 steal;
43 __u32 version;
44 __u32 flags;
45 __u32 pad[12];
46};
47
48#define KVM_STEAL_ALIGNMENT_BITS 5
49#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
50#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
37 51
38#define KVM_MAX_MMU_OP_BATCH 32 52#define KVM_MAX_MMU_OP_BATCH 32
39 53
@@ -178,6 +192,7 @@ void __init kvm_guest_init(void);
178void kvm_async_pf_task_wait(u32 token); 192void kvm_async_pf_task_wait(u32 token);
179void kvm_async_pf_task_wake(u32 token); 193void kvm_async_pf_task_wake(u32 token);
180u32 kvm_read_and_reset_pf_reason(void); 194u32 kvm_read_and_reset_pf_reason(void);
195extern void kvm_disable_steal_time(void);
181#else 196#else
182#define kvm_guest_init() do { } while (0) 197#define kvm_guest_init() do { } while (0)
183#define kvm_async_pf_task_wait(T) do {} while(0) 198#define kvm_async_pf_task_wait(T) do {} while(0)
@@ -186,6 +201,11 @@ static inline u32 kvm_read_and_reset_pf_reason(void)
186{ 201{
187 return 0; 202 return 0;
188} 203}
204
205static inline void kvm_disable_steal_time(void)
206{
207 return;
208}
189#endif 209#endif
190 210
191#endif /* __KERNEL__ */ 211#endif /* __KERNEL__ */
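The MSR encoding above keeps bit 0 as the enable flag and reserves the next five bits, so the address a guest programs into MSR_KVM_STEAL_TIME must be 64-byte aligned; the padding also makes the structure exactly 64 bytes. A small sanity-check sketch under those assumptions:

#include <linux/kernel.h>
#include <asm/kvm_para.h>

static inline void kvm_steal_time_layout_checks(void)
{
	/* 8 (steal) + 4 (version) + 4 (flags) + 48 (pad) == 64 bytes */
	BUILD_BUG_ON(sizeof(struct kvm_steal_time) != 64);
	/* enable bit + 5 reserved bits => address bits 0-5 must be clear */
	BUILD_BUG_ON(KVM_STEAL_VALID_BITS != ~0x3fULL);
}
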
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d96bdb25ca3d..d52609aeeab8 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -441,6 +441,18 @@
441#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a 441#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a
442#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b 442#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
443#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c 443#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
444#define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x0000048d
445#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
446#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f
447#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490
448
449/* VMX_BASIC bits and bitmasks */
450#define VMX_BASIC_VMCS_SIZE_SHIFT 32
451#define VMX_BASIC_64 0x0001000000000000LLU
452#define VMX_BASIC_MEM_TYPE_SHIFT 50
453#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
454#define VMX_BASIC_MEM_TYPE_WB 6LLU
455#define VMX_BASIC_INOUT 0x0040000000000000LLU
444 456
445/* AMD-V MSRs */ 457/* AMD-V MSRs */
446 458
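The new VMX_BASIC masks apply to the value read from the IA32_VMX_BASIC MSR (bits 44:32 carry the VMCS region size, bits 53:50 the memory type, bit 54 the INS/OUTS reporting flag). A minimal decoding sketch under those assumptions:

#include <linux/kernel.h>
#include <asm/msr.h>

static void decode_vmx_basic(u64 basic)
{
	u32 vmcs_size = (basic >> VMX_BASIC_VMCS_SIZE_SHIFT) & 0x1fff;
	unsigned int mem_type = (basic & VMX_BASIC_MEM_TYPE_MASK)
					>> VMX_BASIC_MEM_TYPE_SHIFT;

	if (mem_type != VMX_BASIC_MEM_TYPE_WB)
		pr_warn("VMCS memory type %u is not write-back\n", mem_type);
	if (basic & VMX_BASIC_INOUT)
		pr_info("INS/OUTS instruction information is reported\n");
	pr_info("VMCS region size: %u bytes\n", vmcs_size);
}
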
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ebbc4d8ab170..a7d2db9a74fb 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -230,6 +230,15 @@ static inline unsigned long long paravirt_sched_clock(void)
230 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); 230 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
231} 231}
232 232
233struct jump_label_key;
234extern struct jump_label_key paravirt_steal_enabled;
235extern struct jump_label_key paravirt_steal_rq_enabled;
236
237static inline u64 paravirt_steal_clock(int cpu)
238{
239 return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu);
240}
241
233static inline unsigned long long paravirt_read_pmc(int counter) 242static inline unsigned long long paravirt_read_pmc(int counter)
234{ 243{
235 return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter); 244 return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 82885099c869..2c7652163111 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -89,6 +89,7 @@ struct pv_lazy_ops {
89 89
90struct pv_time_ops { 90struct pv_time_ops {
91 unsigned long long (*sched_clock)(void); 91 unsigned long long (*sched_clock)(void);
92 unsigned long long (*steal_clock)(int cpu);
92 unsigned long (*get_tsc_khz)(void); 93 unsigned long (*get_tsc_khz)(void);
93}; 94};
94 95
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 59ab4dffa377..2dddb317bb39 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -59,6 +59,7 @@
59#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */ 59#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */
60#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ 60#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
61#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ 61#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
62#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
62#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ 63#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
63#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ 64#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
64 65
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 84471b810460..2caf290e9895 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -132,6 +132,8 @@ enum vmcs_field {
132 GUEST_IA32_PAT_HIGH = 0x00002805, 132 GUEST_IA32_PAT_HIGH = 0x00002805,
133 GUEST_IA32_EFER = 0x00002806, 133 GUEST_IA32_EFER = 0x00002806,
134 GUEST_IA32_EFER_HIGH = 0x00002807, 134 GUEST_IA32_EFER_HIGH = 0x00002807,
135 GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
136 GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
135 GUEST_PDPTR0 = 0x0000280a, 137 GUEST_PDPTR0 = 0x0000280a,
136 GUEST_PDPTR0_HIGH = 0x0000280b, 138 GUEST_PDPTR0_HIGH = 0x0000280b,
137 GUEST_PDPTR1 = 0x0000280c, 139 GUEST_PDPTR1 = 0x0000280c,
@@ -144,6 +146,8 @@ enum vmcs_field {
144 HOST_IA32_PAT_HIGH = 0x00002c01, 146 HOST_IA32_PAT_HIGH = 0x00002c01,
145 HOST_IA32_EFER = 0x00002c02, 147 HOST_IA32_EFER = 0x00002c02,
146 HOST_IA32_EFER_HIGH = 0x00002c03, 148 HOST_IA32_EFER_HIGH = 0x00002c03,
149 HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
150 HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
147 PIN_BASED_VM_EXEC_CONTROL = 0x00004000, 151 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
148 CPU_BASED_VM_EXEC_CONTROL = 0x00004002, 152 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
149 EXCEPTION_BITMAP = 0x00004004, 153 EXCEPTION_BITMAP = 0x00004004,
@@ -426,4 +430,43 @@ struct vmx_msr_entry {
426 u64 value; 430 u64 value;
427} __aligned(16); 431} __aligned(16);
428 432
433/*
434 * Exit Qualifications for entry failure during or after loading guest state
435 */
436#define ENTRY_FAIL_DEFAULT 0
437#define ENTRY_FAIL_PDPTE 2
438#define ENTRY_FAIL_NMI 3
439#define ENTRY_FAIL_VMCS_LINK_PTR 4
440
441/*
442 * VM-instruction error numbers
443 */
444enum vm_instruction_error_number {
445 VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1,
446 VMXERR_VMCLEAR_INVALID_ADDRESS = 2,
447 VMXERR_VMCLEAR_VMXON_POINTER = 3,
448 VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4,
449 VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5,
450 VMXERR_VMRESUME_AFTER_VMXOFF = 6,
451 VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7,
452 VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8,
453 VMXERR_VMPTRLD_INVALID_ADDRESS = 9,
454 VMXERR_VMPTRLD_VMXON_POINTER = 10,
455 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11,
456 VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12,
457 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13,
458 VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15,
459 VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16,
460 VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17,
461 VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18,
462 VMXERR_VMCALL_NONCLEAR_VMCS = 19,
463 VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20,
464 VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22,
465 VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23,
466 VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24,
467 VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25,
468 VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26,
469 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
470};
471
429#endif 472#endif
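These are the numbers a nested hypervisor reads back from the VM-instruction error field after a failed VMX instruction; a short, deliberately partial sketch of turning them into diagnostics:

#include <linux/types.h>
#include <asm/vmx.h>

static const char *vmx_insn_error_name(u32 err)
{
	switch (err) {
	case VMXERR_VMCALL_IN_VMX_ROOT_OPERATION:
		return "VMCALL executed in VMX root operation";
	case VMXERR_VMCLEAR_INVALID_ADDRESS:
		return "VMCLEAR with invalid physical address";
	case VMXERR_VMLAUNCH_NONCLEAR_VMCS:
		return "VMLAUNCH with non-clear VMCS";
	case VMXERR_ENTRY_INVALID_CONTROL_FIELD:
		return "VM entry with invalid control field(s)";
	default:
		return "unrecognized VM-instruction error";
	}
}
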
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33c07b0b122e..a9c2116001d6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg)
51 51
52early_param("no-kvmapf", parse_no_kvmapf); 52early_param("no-kvmapf", parse_no_kvmapf);
53 53
54static int steal_acc = 1;
55static int parse_no_stealacc(char *arg)
56{
57 steal_acc = 0;
58 return 0;
59}
60
61early_param("no-steal-acc", parse_no_stealacc);
62
54struct kvm_para_state { 63struct kvm_para_state {
55 u8 mmu_queue[MMU_QUEUE_SIZE]; 64 u8 mmu_queue[MMU_QUEUE_SIZE];
56 int mmu_queue_len; 65 int mmu_queue_len;
@@ -58,6 +67,8 @@ struct kvm_para_state {
58 67
59static DEFINE_PER_CPU(struct kvm_para_state, para_state); 68static DEFINE_PER_CPU(struct kvm_para_state, para_state);
60static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); 69static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
70static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
71static int has_steal_clock = 0;
61 72
62static struct kvm_para_state *kvm_para_state(void) 73static struct kvm_para_state *kvm_para_state(void)
63{ 74{
@@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void)
441#endif 452#endif
442} 453}
443 454
455static void kvm_register_steal_time(void)
456{
457 int cpu = smp_processor_id();
458 struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
459
460 if (!has_steal_clock)
461 return;
462
463 memset(st, 0, sizeof(*st));
464
465 wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
466 printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
467 cpu, __pa(st));
468}
469
444void __cpuinit kvm_guest_cpu_init(void) 470void __cpuinit kvm_guest_cpu_init(void)
445{ 471{
446 if (!kvm_para_available()) 472 if (!kvm_para_available())
@@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void)
457 printk(KERN_INFO"KVM setup async PF for cpu %d\n", 483 printk(KERN_INFO"KVM setup async PF for cpu %d\n",
458 smp_processor_id()); 484 smp_processor_id());
459 } 485 }
486
487 if (has_steal_clock)
488 kvm_register_steal_time();
460} 489}
461 490
462static void kvm_pv_disable_apf(void *unused) 491static void kvm_pv_disable_apf(void *unused)
@@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = {
483 .notifier_call = kvm_pv_reboot_notify, 512 .notifier_call = kvm_pv_reboot_notify,
484}; 513};
485 514
515static u64 kvm_steal_clock(int cpu)
516{
517 u64 steal;
518 struct kvm_steal_time *src;
519 int version;
520
521 src = &per_cpu(steal_time, cpu);
522 do {
523 version = src->version;
524 rmb();
525 steal = src->steal;
526 rmb();
527 } while ((version & 1) || (version != src->version));
528
529 return steal;
530}
531
532void kvm_disable_steal_time(void)
533{
534 if (!has_steal_clock)
535 return;
536
537 wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
538}
539
486#ifdef CONFIG_SMP 540#ifdef CONFIG_SMP
487static void __init kvm_smp_prepare_boot_cpu(void) 541static void __init kvm_smp_prepare_boot_cpu(void)
488{ 542{
@@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
500 554
501static void kvm_guest_cpu_offline(void *dummy) 555static void kvm_guest_cpu_offline(void *dummy)
502{ 556{
557 kvm_disable_steal_time();
503 kvm_pv_disable_apf(NULL); 558 kvm_pv_disable_apf(NULL);
504 apf_task_wake_all(); 559 apf_task_wake_all();
505} 560}
@@ -548,6 +603,11 @@ void __init kvm_guest_init(void)
548 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) 603 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
549 x86_init.irqs.trap_init = kvm_apf_trap_init; 604 x86_init.irqs.trap_init = kvm_apf_trap_init;
550 605
606 if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
607 has_steal_clock = 1;
608 pv_time_ops.steal_clock = kvm_steal_clock;
609 }
610
551#ifdef CONFIG_SMP 611#ifdef CONFIG_SMP
552 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 612 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
553 register_cpu_notifier(&kvm_cpu_notifier); 613 register_cpu_notifier(&kvm_cpu_notifier);
@@ -555,3 +615,15 @@ void __init kvm_guest_init(void)
555 kvm_guest_cpu_init(); 615 kvm_guest_cpu_init();
556#endif 616#endif
557} 617}
618
619static __init int activate_jump_labels(void)
620{
621 if (has_steal_clock) {
622 jump_label_inc(&paravirt_steal_enabled);
623 if (steal_acc)
624 jump_label_inc(&paravirt_steal_rq_enabled);
625 }
626
627 return 0;
628}
629arch_initcall(activate_jump_labels);
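The retry loop in kvm_steal_clock() above assumes the usual even/odd version protocol on the producer side: bump version to an odd value, update steal, then bump it back to even. A hedged sketch of that assumed host-side sequence (illustrative, not part of this patch):

#include <asm/kvm_para.h>
#include <asm/system.h>	/* smp_wmb(), as provided on x86 in this era */

static void publish_steal_time(struct kvm_steal_time *st, u64 steal)
{
	st->version += 1;	/* odd: guest readers will retry */
	smp_wmb();
	st->steal = steal;
	smp_wmb();
	st->version += 1;	/* even: the snapshot is consistent */
}
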
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 6389a6bca11b..c1a0188e29ae 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -160,6 +160,7 @@ static void __cpuinit kvm_setup_secondary_clock(void)
160static void kvm_crash_shutdown(struct pt_regs *regs) 160static void kvm_crash_shutdown(struct pt_regs *regs)
161{ 161{
162 native_write_msr(msr_kvm_system_time, 0, 0); 162 native_write_msr(msr_kvm_system_time, 0, 0);
163 kvm_disable_steal_time();
163 native_machine_crash_shutdown(regs); 164 native_machine_crash_shutdown(regs);
164} 165}
165#endif 166#endif
@@ -167,6 +168,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
167static void kvm_shutdown(void) 168static void kvm_shutdown(void)
168{ 169{
169 native_write_msr(msr_kvm_system_time, 0, 0); 170 native_write_msr(msr_kvm_system_time, 0, 0);
171 kvm_disable_steal_time();
170 native_machine_shutdown(); 172 native_machine_shutdown();
171} 173}
172 174
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 869e1aeeb71b..613a7931ecc1 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -202,6 +202,14 @@ static void native_flush_tlb_single(unsigned long addr)
202 __native_flush_tlb_single(addr); 202 __native_flush_tlb_single(addr);
203} 203}
204 204
205struct jump_label_key paravirt_steal_enabled;
206struct jump_label_key paravirt_steal_rq_enabled;
207
208static u64 native_steal_clock(int cpu)
209{
210 return 0;
211}
212
205/* These are in entry.S */ 213/* These are in entry.S */
206extern void native_iret(void); 214extern void native_iret(void);
207extern void native_irq_enable_sysexit(void); 215extern void native_irq_enable_sysexit(void);
@@ -307,6 +315,7 @@ struct pv_init_ops pv_init_ops = {
307 315
308struct pv_time_ops pv_time_ops = { 316struct pv_time_ops pv_time_ops = {
309 .sched_clock = native_sched_clock, 317 .sched_clock = native_sched_clock,
318 .steal_clock = native_steal_clock,
310}; 319};
311 320
312struct pv_irq_ops pv_irq_ops = { 321struct pv_irq_ops pv_irq_ops = {
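The jump-label keys and the default steal_clock hook added here exist for a scheduler-side consumer; a hedged sketch of how such a consumer would read steal time only once a backend (such as the KVM guest code earlier in this diff) has enabled the key. static_branch() is assumed to be the jump-label test primitive of this kernel generation:

#include <linux/jump_label.h>
#include <asm/paravirt.h>

static u64 steal_delta_ns(int cpu, u64 *last)
{
	u64 delta = 0;

	if (static_branch(&paravirt_steal_enabled)) {
		u64 now = paravirt_steal_clock(cpu);

		delta = now - *last;	/* time stolen since the last sample */
		*last = now;
	}
	return delta;
}
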
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 65cf8233d25c..988724b236b6 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -31,6 +31,7 @@ config KVM
31 select KVM_ASYNC_PF 31 select KVM_ASYNC_PF
32 select USER_RETURN_NOTIFIER 32 select USER_RETURN_NOTIFIER
33 select KVM_MMIO 33 select KVM_MMIO
34 select TASK_DELAY_ACCT
34 ---help--- 35 ---help---
35 Support hosting fully virtualized guest machines using hardware 36 Support hosting fully virtualized guest machines using hardware
36 virtualization extensions. You will need a fairly recent 37 virtualization extensions. You will need a fairly recent
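TASK_DELAY_ACCT is selected because the host derives the steal value it exposes to the guest from the vcpu task's run-queue delay. A hedged sketch of that accumulation, reusing the st fields added to kvm_vcpu_arch earlier in this series; the function name and exact bookkeeping are assumptions:

#include <linux/sched.h>
#include <linux/kvm_host.h>

static void accumulate_steal_time(struct kvm_vcpu *vcpu)
{
	u64 delta;

	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
		return;

	/* run_delay is maintained by delay accounting for this task */
	delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
	vcpu->arch.st.last_steal = current->sched_info.run_delay;
	vcpu->arch.st.accum_steal = delta;
}
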
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index adc98675cda0..6f08bc940fa8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -407,76 +407,59 @@ struct gprefix {
407 } \ 407 } \
408 } while (0) 408 } while (0)
409 409
410/* Fetch next part of the instruction being emulated. */
411#define insn_fetch(_type, _size, _eip) \
412({ unsigned long _x; \
413 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
414 if (rc != X86EMUL_CONTINUE) \
415 goto done; \
416 (_eip) += (_size); \
417 (_type)_x; \
418})
419
420#define insn_fetch_arr(_arr, _size, _eip) \
421({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
422 if (rc != X86EMUL_CONTINUE) \
423 goto done; \
424 (_eip) += (_size); \
425})
426
427static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, 410static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
428 enum x86_intercept intercept, 411 enum x86_intercept intercept,
429 enum x86_intercept_stage stage) 412 enum x86_intercept_stage stage)
430{ 413{
431 struct x86_instruction_info info = { 414 struct x86_instruction_info info = {
432 .intercept = intercept, 415 .intercept = intercept,
433 .rep_prefix = ctxt->decode.rep_prefix, 416 .rep_prefix = ctxt->rep_prefix,
434 .modrm_mod = ctxt->decode.modrm_mod, 417 .modrm_mod = ctxt->modrm_mod,
435 .modrm_reg = ctxt->decode.modrm_reg, 418 .modrm_reg = ctxt->modrm_reg,
436 .modrm_rm = ctxt->decode.modrm_rm, 419 .modrm_rm = ctxt->modrm_rm,
437 .src_val = ctxt->decode.src.val64, 420 .src_val = ctxt->src.val64,
438 .src_bytes = ctxt->decode.src.bytes, 421 .src_bytes = ctxt->src.bytes,
439 .dst_bytes = ctxt->decode.dst.bytes, 422 .dst_bytes = ctxt->dst.bytes,
440 .ad_bytes = ctxt->decode.ad_bytes, 423 .ad_bytes = ctxt->ad_bytes,
441 .next_rip = ctxt->eip, 424 .next_rip = ctxt->eip,
442 }; 425 };
443 426
444 return ctxt->ops->intercept(ctxt, &info, stage); 427 return ctxt->ops->intercept(ctxt, &info, stage);
445} 428}
446 429
447static inline unsigned long ad_mask(struct decode_cache *c) 430static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
448{ 431{
449 return (1UL << (c->ad_bytes << 3)) - 1; 432 return (1UL << (ctxt->ad_bytes << 3)) - 1;
450} 433}
451 434
452/* Access/update address held in a register, based on addressing mode. */ 435/* Access/update address held in a register, based on addressing mode. */
453static inline unsigned long 436static inline unsigned long
454address_mask(struct decode_cache *c, unsigned long reg) 437address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
455{ 438{
456 if (c->ad_bytes == sizeof(unsigned long)) 439 if (ctxt->ad_bytes == sizeof(unsigned long))
457 return reg; 440 return reg;
458 else 441 else
459 return reg & ad_mask(c); 442 return reg & ad_mask(ctxt);
460} 443}
461 444
462static inline unsigned long 445static inline unsigned long
463register_address(struct decode_cache *c, unsigned long reg) 446register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
464{ 447{
465 return address_mask(c, reg); 448 return address_mask(ctxt, reg);
466} 449}
467 450
468static inline void 451static inline void
469register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) 452register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
470{ 453{
471 if (c->ad_bytes == sizeof(unsigned long)) 454 if (ctxt->ad_bytes == sizeof(unsigned long))
472 *reg += inc; 455 *reg += inc;
473 else 456 else
474 *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c)); 457 *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));
475} 458}
476 459
477static inline void jmp_rel(struct decode_cache *c, int rel) 460static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
478{ 461{
479 register_address_increment(c, &c->eip, rel); 462 register_address_increment(ctxt, &ctxt->_eip, rel);
480} 463}
481 464
482static u32 desc_limit_scaled(struct desc_struct *desc) 465static u32 desc_limit_scaled(struct desc_struct *desc)
@@ -486,28 +469,26 @@ static u32 desc_limit_scaled(struct desc_struct *desc)
486 return desc->g ? (limit << 12) | 0xfff : limit; 469 return desc->g ? (limit << 12) | 0xfff : limit;
487} 470}
488 471
489static void set_seg_override(struct decode_cache *c, int seg) 472static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)
490{ 473{
491 c->has_seg_override = true; 474 ctxt->has_seg_override = true;
492 c->seg_override = seg; 475 ctxt->seg_override = seg;
493} 476}
494 477
495static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, 478static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
496 struct x86_emulate_ops *ops, int seg)
497{ 479{
498 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) 480 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
499 return 0; 481 return 0;
500 482
501 return ops->get_cached_segment_base(ctxt, seg); 483 return ctxt->ops->get_cached_segment_base(ctxt, seg);
502} 484}
503 485
504static unsigned seg_override(struct x86_emulate_ctxt *ctxt, 486static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
505 struct decode_cache *c)
506{ 487{
507 if (!c->has_seg_override) 488 if (!ctxt->has_seg_override)
508 return 0; 489 return 0;
509 490
510 return c->seg_override; 491 return ctxt->seg_override;
511} 492}
512 493
513static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, 494static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
@@ -579,7 +560,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
579 unsigned size, bool write, bool fetch, 560 unsigned size, bool write, bool fetch,
580 ulong *linear) 561 ulong *linear)
581{ 562{
582 struct decode_cache *c = &ctxt->decode;
583 struct desc_struct desc; 563 struct desc_struct desc;
584 bool usable; 564 bool usable;
585 ulong la; 565 ulong la;
@@ -587,7 +567,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
587 u16 sel; 567 u16 sel;
588 unsigned cpl, rpl; 568 unsigned cpl, rpl;
589 569
590 la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; 570 la = seg_base(ctxt, addr.seg) + addr.ea;
591 switch (ctxt->mode) { 571 switch (ctxt->mode) {
592 case X86EMUL_MODE_REAL: 572 case X86EMUL_MODE_REAL:
593 break; 573 break;
@@ -637,7 +617,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
637 } 617 }
638 break; 618 break;
639 } 619 }
640 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8) 620 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
641 la &= (u32)-1; 621 la &= (u32)-1;
642 *linear = la; 622 *linear = la;
643 return X86EMUL_CONTINUE; 623 return X86EMUL_CONTINUE;
@@ -671,11 +651,10 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
671 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); 651 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
672} 652}
673 653
674static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 654static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
675 struct x86_emulate_ops *ops,
676 unsigned long eip, u8 *dest) 655 unsigned long eip, u8 *dest)
677{ 656{
678 struct fetch_cache *fc = &ctxt->decode.fetch; 657 struct fetch_cache *fc = &ctxt->fetch;
679 int rc; 658 int rc;
680 int size, cur_size; 659 int size, cur_size;
681 660
@@ -687,8 +666,8 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
687 rc = __linearize(ctxt, addr, size, false, true, &linear); 666 rc = __linearize(ctxt, addr, size, false, true, &linear);
688 if (rc != X86EMUL_CONTINUE) 667 if (rc != X86EMUL_CONTINUE)
689 return rc; 668 return rc;
690 rc = ops->fetch(ctxt, linear, fc->data + cur_size, 669 rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
691 size, &ctxt->exception); 670 size, &ctxt->exception);
692 if (rc != X86EMUL_CONTINUE) 671 if (rc != X86EMUL_CONTINUE)
693 return rc; 672 return rc;
694 fc->end += size; 673 fc->end += size;
@@ -698,7 +677,6 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
698} 677}
699 678
700static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, 679static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
701 struct x86_emulate_ops *ops,
702 unsigned long eip, void *dest, unsigned size) 680 unsigned long eip, void *dest, unsigned size)
703{ 681{
704 int rc; 682 int rc;
@@ -707,13 +685,30 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
707 if (eip + size - ctxt->eip > 15) 685 if (eip + size - ctxt->eip > 15)
708 return X86EMUL_UNHANDLEABLE; 686 return X86EMUL_UNHANDLEABLE;
709 while (size--) { 687 while (size--) {
710 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 688 rc = do_insn_fetch_byte(ctxt, eip++, dest++);
711 if (rc != X86EMUL_CONTINUE) 689 if (rc != X86EMUL_CONTINUE)
712 return rc; 690 return rc;
713 } 691 }
714 return X86EMUL_CONTINUE; 692 return X86EMUL_CONTINUE;
715} 693}
716 694
695/* Fetch next part of the instruction being emulated. */
696#define insn_fetch(_type, _size, _eip) \
697({ unsigned long _x; \
698 rc = do_insn_fetch(ctxt, (_eip), &_x, (_size)); \
699 if (rc != X86EMUL_CONTINUE) \
700 goto done; \
701 (_eip) += (_size); \
702 (_type)_x; \
703})
704
705#define insn_fetch_arr(_arr, _size, _eip) \
706({ rc = do_insn_fetch(ctxt, (_eip), _arr, (_size)); \
707 if (rc != X86EMUL_CONTINUE) \
708 goto done; \
709 (_eip) += (_size); \
710})
711
717/* 712/*
718 * Given the 'reg' portion of a ModRM byte, and a register block, return a 713 * Given the 'reg' portion of a ModRM byte, and a register block, return a
719 * pointer into the block that addresses the relevant register. 714 * pointer into the block that addresses the relevant register.
@@ -857,16 +852,15 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
857 852
858static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 853static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
859 struct operand *op, 854 struct operand *op,
860 struct decode_cache *c,
861 int inhibit_bytereg) 855 int inhibit_bytereg)
862{ 856{
863 unsigned reg = c->modrm_reg; 857 unsigned reg = ctxt->modrm_reg;
864 int highbyte_regs = c->rex_prefix == 0; 858 int highbyte_regs = ctxt->rex_prefix == 0;
865 859
866 if (!(c->d & ModRM)) 860 if (!(ctxt->d & ModRM))
867 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); 861 reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
868 862
869 if (c->d & Sse) { 863 if (ctxt->d & Sse) {
870 op->type = OP_XMM; 864 op->type = OP_XMM;
871 op->bytes = 16; 865 op->bytes = 16;
872 op->addr.xmm = reg; 866 op->addr.xmm = reg;
@@ -875,49 +869,47 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
875 } 869 }
876 870
877 op->type = OP_REG; 871 op->type = OP_REG;
878 if ((c->d & ByteOp) && !inhibit_bytereg) { 872 if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
879 op->addr.reg = decode_register(reg, c->regs, highbyte_regs); 873 op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
880 op->bytes = 1; 874 op->bytes = 1;
881 } else { 875 } else {
882 op->addr.reg = decode_register(reg, c->regs, 0); 876 op->addr.reg = decode_register(reg, ctxt->regs, 0);
883 op->bytes = c->op_bytes; 877 op->bytes = ctxt->op_bytes;
884 } 878 }
885 fetch_register_operand(op); 879 fetch_register_operand(op);
886 op->orig_val = op->val; 880 op->orig_val = op->val;
887} 881}
888 882
889static int decode_modrm(struct x86_emulate_ctxt *ctxt, 883static int decode_modrm(struct x86_emulate_ctxt *ctxt,
890 struct x86_emulate_ops *ops,
891 struct operand *op) 884 struct operand *op)
892{ 885{
893 struct decode_cache *c = &ctxt->decode;
894 u8 sib; 886 u8 sib;
895 int index_reg = 0, base_reg = 0, scale; 887 int index_reg = 0, base_reg = 0, scale;
896 int rc = X86EMUL_CONTINUE; 888 int rc = X86EMUL_CONTINUE;
897 ulong modrm_ea = 0; 889 ulong modrm_ea = 0;
898 890
899 if (c->rex_prefix) { 891 if (ctxt->rex_prefix) {
900 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ 892 ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */
901 index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ 893 index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */
902 c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ 894 ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
903 } 895 }
904 896
905 c->modrm = insn_fetch(u8, 1, c->eip); 897 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
906 c->modrm_mod |= (c->modrm & 0xc0) >> 6; 898 ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
907 c->modrm_reg |= (c->modrm & 0x38) >> 3; 899 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
908 c->modrm_rm |= (c->modrm & 0x07); 900 ctxt->modrm_rm |= (ctxt->modrm & 0x07);
909 c->modrm_seg = VCPU_SREG_DS; 901 ctxt->modrm_seg = VCPU_SREG_DS;
910 902
911 if (c->modrm_mod == 3) { 903 if (ctxt->modrm_mod == 3) {
912 op->type = OP_REG; 904 op->type = OP_REG;
913 op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 905 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
914 op->addr.reg = decode_register(c->modrm_rm, 906 op->addr.reg = decode_register(ctxt->modrm_rm,
915 c->regs, c->d & ByteOp); 907 ctxt->regs, ctxt->d & ByteOp);
916 if (c->d & Sse) { 908 if (ctxt->d & Sse) {
917 op->type = OP_XMM; 909 op->type = OP_XMM;
918 op->bytes = 16; 910 op->bytes = 16;
919 op->addr.xmm = c->modrm_rm; 911 op->addr.xmm = ctxt->modrm_rm;
920 read_sse_reg(ctxt, &op->vec_val, c->modrm_rm); 912 read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
921 return rc; 913 return rc;
922 } 914 }
923 fetch_register_operand(op); 915 fetch_register_operand(op);
@@ -926,26 +918,26 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
926 918
927 op->type = OP_MEM; 919 op->type = OP_MEM;
928 920
929 if (c->ad_bytes == 2) { 921 if (ctxt->ad_bytes == 2) {
930 unsigned bx = c->regs[VCPU_REGS_RBX]; 922 unsigned bx = ctxt->regs[VCPU_REGS_RBX];
931 unsigned bp = c->regs[VCPU_REGS_RBP]; 923 unsigned bp = ctxt->regs[VCPU_REGS_RBP];
932 unsigned si = c->regs[VCPU_REGS_RSI]; 924 unsigned si = ctxt->regs[VCPU_REGS_RSI];
933 unsigned di = c->regs[VCPU_REGS_RDI]; 925 unsigned di = ctxt->regs[VCPU_REGS_RDI];
934 926
935 /* 16-bit ModR/M decode. */ 927 /* 16-bit ModR/M decode. */
936 switch (c->modrm_mod) { 928 switch (ctxt->modrm_mod) {
937 case 0: 929 case 0:
938 if (c->modrm_rm == 6) 930 if (ctxt->modrm_rm == 6)
939 modrm_ea += insn_fetch(u16, 2, c->eip); 931 modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
940 break; 932 break;
941 case 1: 933 case 1:
942 modrm_ea += insn_fetch(s8, 1, c->eip); 934 modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
943 break; 935 break;
944 case 2: 936 case 2:
945 modrm_ea += insn_fetch(u16, 2, c->eip); 937 modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
946 break; 938 break;
947 } 939 }
948 switch (c->modrm_rm) { 940 switch (ctxt->modrm_rm) {
949 case 0: 941 case 0:
950 modrm_ea += bx + si; 942 modrm_ea += bx + si;
951 break; 943 break;
@@ -965,46 +957,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
965 modrm_ea += di; 957 modrm_ea += di;
966 break; 958 break;
967 case 6: 959 case 6:
968 if (c->modrm_mod != 0) 960 if (ctxt->modrm_mod != 0)
969 modrm_ea += bp; 961 modrm_ea += bp;
970 break; 962 break;
971 case 7: 963 case 7:
972 modrm_ea += bx; 964 modrm_ea += bx;
973 break; 965 break;
974 } 966 }
975 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 967 if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
976 (c->modrm_rm == 6 && c->modrm_mod != 0)) 968 (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
977 c->modrm_seg = VCPU_SREG_SS; 969 ctxt->modrm_seg = VCPU_SREG_SS;
978 modrm_ea = (u16)modrm_ea; 970 modrm_ea = (u16)modrm_ea;
979 } else { 971 } else {
980 /* 32/64-bit ModR/M decode. */ 972 /* 32/64-bit ModR/M decode. */
981 if ((c->modrm_rm & 7) == 4) { 973 if ((ctxt->modrm_rm & 7) == 4) {
982 sib = insn_fetch(u8, 1, c->eip); 974 sib = insn_fetch(u8, 1, ctxt->_eip);
983 index_reg |= (sib >> 3) & 7; 975 index_reg |= (sib >> 3) & 7;
984 base_reg |= sib & 7; 976 base_reg |= sib & 7;
985 scale = sib >> 6; 977 scale = sib >> 6;
986 978
987 if ((base_reg & 7) == 5 && c->modrm_mod == 0) 979 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
988 modrm_ea += insn_fetch(s32, 4, c->eip); 980 modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
989 else 981 else
990 modrm_ea += c->regs[base_reg]; 982 modrm_ea += ctxt->regs[base_reg];
991 if (index_reg != 4) 983 if (index_reg != 4)
992 modrm_ea += c->regs[index_reg] << scale; 984 modrm_ea += ctxt->regs[index_reg] << scale;
993 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { 985 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
994 if (ctxt->mode == X86EMUL_MODE_PROT64) 986 if (ctxt->mode == X86EMUL_MODE_PROT64)
995 c->rip_relative = 1; 987 ctxt->rip_relative = 1;
996 } else 988 } else
997 modrm_ea += c->regs[c->modrm_rm]; 989 modrm_ea += ctxt->regs[ctxt->modrm_rm];
998 switch (c->modrm_mod) { 990 switch (ctxt->modrm_mod) {
999 case 0: 991 case 0:
1000 if (c->modrm_rm == 5) 992 if (ctxt->modrm_rm == 5)
1001 modrm_ea += insn_fetch(s32, 4, c->eip); 993 modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
1002 break; 994 break;
1003 case 1: 995 case 1:
1004 modrm_ea += insn_fetch(s8, 1, c->eip); 996 modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
1005 break; 997 break;
1006 case 2: 998 case 2:
1007 modrm_ea += insn_fetch(s32, 4, c->eip); 999 modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
1008 break; 1000 break;
1009 } 1001 }
1010 } 1002 }
@@ -1014,53 +1006,50 @@ done:
1014} 1006}
1015 1007
1016static int decode_abs(struct x86_emulate_ctxt *ctxt, 1008static int decode_abs(struct x86_emulate_ctxt *ctxt,
1017 struct x86_emulate_ops *ops,
1018 struct operand *op) 1009 struct operand *op)
1019{ 1010{
1020 struct decode_cache *c = &ctxt->decode;
1021 int rc = X86EMUL_CONTINUE; 1011 int rc = X86EMUL_CONTINUE;
1022 1012
1023 op->type = OP_MEM; 1013 op->type = OP_MEM;
1024 switch (c->ad_bytes) { 1014 switch (ctxt->ad_bytes) {
1025 case 2: 1015 case 2:
1026 op->addr.mem.ea = insn_fetch(u16, 2, c->eip); 1016 op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip);
1027 break; 1017 break;
1028 case 4: 1018 case 4:
1029 op->addr.mem.ea = insn_fetch(u32, 4, c->eip); 1019 op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip);
1030 break; 1020 break;
1031 case 8: 1021 case 8:
1032 op->addr.mem.ea = insn_fetch(u64, 8, c->eip); 1022 op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip);
1033 break; 1023 break;
1034 } 1024 }
1035done: 1025done:
1036 return rc; 1026 return rc;
1037} 1027}
1038 1028
1039static void fetch_bit_operand(struct decode_cache *c) 1029static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
1040{ 1030{
1041 long sv = 0, mask; 1031 long sv = 0, mask;
1042 1032
1043 if (c->dst.type == OP_MEM && c->src.type == OP_REG) { 1033 if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
1044 mask = ~(c->dst.bytes * 8 - 1); 1034 mask = ~(ctxt->dst.bytes * 8 - 1);
1045 1035
1046 if (c->src.bytes == 2) 1036 if (ctxt->src.bytes == 2)
1047 sv = (s16)c->src.val & (s16)mask; 1037 sv = (s16)ctxt->src.val & (s16)mask;
1048 else if (c->src.bytes == 4) 1038 else if (ctxt->src.bytes == 4)
1049 sv = (s32)c->src.val & (s32)mask; 1039 sv = (s32)ctxt->src.val & (s32)mask;
1050 1040
1051 c->dst.addr.mem.ea += (sv >> 3); 1041 ctxt->dst.addr.mem.ea += (sv >> 3);
1052 } 1042 }
1053 1043
1054 /* only subword offset */ 1044 /* only subword offset */
1055 c->src.val &= (c->dst.bytes << 3) - 1; 1045 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
1056} 1046}
1057 1047
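fetch_bit_operand() above encodes the BT-family addressing rule: when the destination is memory and the bit index comes from a register, the byte address moves by (index >> 3) and the index is reduced to a sub-word offset. A toy, self-contained version (the buffer and the sample index are invented):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint8_t mem[64] = { 0 };
        long index = 37;                /* e.g. BTS word [mem], reg = 37 */
        int op_bytes = 2;               /* 16-bit operand size */
        long mask = ~(long)(op_bytes * 8 - 1);
        long sv = index & mask;         /* aligned part of the bit index */

        uint8_t *ea = mem + (sv >> 3);          /* byte address shift */
        long bit = index & (op_bytes * 8 - 1);  /* sub-word bit offset */

        *ea |= 1u << bit;               /* the actual bit operation */
        printf("byte %td, bit %ld\n", ea - mem, bit);   /* byte 4, bit 5 */
        return 0;
    }

The kernel version sign-extends the index so negative displacements also move the address backwards; the sketch skips that detail.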
1058static int read_emulated(struct x86_emulate_ctxt *ctxt, 1048static int read_emulated(struct x86_emulate_ctxt *ctxt,
1059 struct x86_emulate_ops *ops,
1060 unsigned long addr, void *dest, unsigned size) 1049 unsigned long addr, void *dest, unsigned size)
1061{ 1050{
1062 int rc; 1051 int rc;
1063 struct read_cache *mc = &ctxt->decode.mem_read; 1052 struct read_cache *mc = &ctxt->mem_read;
1064 1053
1065 while (size) { 1054 while (size) {
1066 int n = min(size, 8u); 1055 int n = min(size, 8u);
@@ -1068,8 +1057,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1068 if (mc->pos < mc->end) 1057 if (mc->pos < mc->end)
1069 goto read_cached; 1058 goto read_cached;
1070 1059
1071 rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n, 1060 rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
1072 &ctxt->exception); 1061 &ctxt->exception);
1073 if (rc != X86EMUL_CONTINUE) 1062 if (rc != X86EMUL_CONTINUE)
1074 return rc; 1063 return rc;
1075 mc->end += n; 1064 mc->end += n;
@@ -1094,7 +1083,7 @@ static int segmented_read(struct x86_emulate_ctxt *ctxt,
1094 rc = linearize(ctxt, addr, size, false, &linear); 1083 rc = linearize(ctxt, addr, size, false, &linear);
1095 if (rc != X86EMUL_CONTINUE) 1084 if (rc != X86EMUL_CONTINUE)
1096 return rc; 1085 return rc;
1097 return read_emulated(ctxt, ctxt->ops, linear, data, size); 1086 return read_emulated(ctxt, linear, data, size);
1098} 1087}
1099 1088
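read_emulated() above maintains a small read-ahead cache (now ctxt->mem_read) so a re-entered instruction can replay bytes it already fetched instead of re-issuing guest reads, and segmented_read() now reaches it without the dropped ops argument. A minimal stand-alone sketch of that caching pattern; backing[] is an invented stand-in for the ->read_emulated callback:

    #include <stdio.h>
    #include <string.h>

    struct read_cache { unsigned char data[1024]; unsigned pos, end; };

    static unsigned char backing[256];          /* fake guest memory */

    /* Copy size bytes from backing[addr..] into dest, refilling the cache
     * at most 8 bytes at a time and serving hits from the cache. */
    static void cached_read(struct read_cache *mc, unsigned addr,
                            void *dest, unsigned size)
    {
        while (size) {
            unsigned n = size < 8 ? size : 8;

            if (mc->pos >= mc->end) {           /* miss: refill the cache */
                memcpy(mc->data + mc->end, backing + addr, n);
                mc->end += n;
            }
            memcpy(dest, mc->data + mc->pos, n); /* serve from the cache */
            mc->pos += n;
            addr += n;
            dest = (unsigned char *)dest + n;
            size -= n;
        }
    }

    int main(void)
    {
        struct read_cache mc = { .pos = 0, .end = 0 };
        unsigned char buf[16];
        unsigned i;

        for (i = 0; i < sizeof(backing); i++)
            backing[i] = (unsigned char)i;
        cached_read(&mc, 0, buf, sizeof(buf));  /* fills the cache */
        mc.pos = 0;                             /* re-entry rewinds pos */
        cached_read(&mc, 0, buf, sizeof(buf));  /* served from the cache */
        printf("%u %u %u\n", buf[0], buf[8], buf[15]);
        return 0;
    }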
1100static int segmented_write(struct x86_emulate_ctxt *ctxt, 1089static int segmented_write(struct x86_emulate_ctxt *ctxt,
@@ -1128,26 +1117,24 @@ static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
1128} 1117}
1129 1118
1130static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, 1119static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1131 struct x86_emulate_ops *ops,
1132 unsigned int size, unsigned short port, 1120 unsigned int size, unsigned short port,
1133 void *dest) 1121 void *dest)
1134{ 1122{
1135 struct read_cache *rc = &ctxt->decode.io_read; 1123 struct read_cache *rc = &ctxt->io_read;
1136 1124
1137 if (rc->pos == rc->end) { /* refill pio read ahead */ 1125 if (rc->pos == rc->end) { /* refill pio read ahead */
1138 struct decode_cache *c = &ctxt->decode;
1139 unsigned int in_page, n; 1126 unsigned int in_page, n;
1140 unsigned int count = c->rep_prefix ? 1127 unsigned int count = ctxt->rep_prefix ?
1141 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1; 1128 address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1;
1142 in_page = (ctxt->eflags & EFLG_DF) ? 1129 in_page = (ctxt->eflags & EFLG_DF) ?
1143 offset_in_page(c->regs[VCPU_REGS_RDI]) : 1130 offset_in_page(ctxt->regs[VCPU_REGS_RDI]) :
1144 PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]); 1131 PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]);
1145 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, 1132 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
1146 count); 1133 count);
1147 if (n == 0) 1134 if (n == 0)
1148 n = 1; 1135 n = 1;
1149 rc->pos = rc->end = 0; 1136 rc->pos = rc->end = 0;
1150 if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n)) 1137 if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
1151 return 0; 1138 return 0;
1152 rc->end = n * size; 1139 rc->end = n * size;
1153 } 1140 }
@@ -1158,9 +1145,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1158} 1145}
1159 1146
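The refill path of pio_in_emulated() above batches string port reads: the REP count comes from RCX, and the batch is clipped both to the read-ahead buffer and to the remainder of the current page in the direction selected by EFLAGS.DF. A sketch of just that sizing arithmetic, with made-up register values:

    #include <stdio.h>

    #define PAGE_SIZE 4096u

    int main(void)
    {
        unsigned size = 2;              /* INSW: 2 bytes per iteration */
        unsigned long rcx = 100;        /* REP count */
        unsigned long rdi = 0x1ff0;     /* destination pointer */
        int df = 0;                     /* EFLAGS.DF clear: rdi goes up */
        unsigned char data[1024];       /* read-ahead buffer (rc->data) */

        unsigned off = rdi & (PAGE_SIZE - 1);
        unsigned in_page = df ? off : PAGE_SIZE - off;
        unsigned n = (in_page < sizeof(data) ? in_page : sizeof(data)) / size;

        if (n > rcx)
            n = rcx;
        if (n == 0)
            n = 1;
        printf("batch %u reads of %u bytes\n", n, size);  /* 8 of 2 */
        return 0;
    }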
1160static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, 1147static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1161 struct x86_emulate_ops *ops,
1162 u16 selector, struct desc_ptr *dt) 1148 u16 selector, struct desc_ptr *dt)
1163{ 1149{
1150 struct x86_emulate_ops *ops = ctxt->ops;
1151
1164 if (selector & 1 << 2) { 1152 if (selector & 1 << 2) {
1165 struct desc_struct desc; 1153 struct desc_struct desc;
1166 u16 sel; 1154 u16 sel;
@@ -1177,48 +1165,42 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1177 1165
1178/* allowed just for 8 bytes segments */ 1166/* allowed just for 8 bytes segments */
1179static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1167static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1180 struct x86_emulate_ops *ops,
1181 u16 selector, struct desc_struct *desc) 1168 u16 selector, struct desc_struct *desc)
1182{ 1169{
1183 struct desc_ptr dt; 1170 struct desc_ptr dt;
1184 u16 index = selector >> 3; 1171 u16 index = selector >> 3;
1185 int ret;
1186 ulong addr; 1172 ulong addr;
1187 1173
1188 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1174 get_descriptor_table_ptr(ctxt, selector, &dt);
1189 1175
1190 if (dt.size < index * 8 + 7) 1176 if (dt.size < index * 8 + 7)
1191 return emulate_gp(ctxt, selector & 0xfffc); 1177 return emulate_gp(ctxt, selector & 0xfffc);
1192 addr = dt.address + index * 8;
1193 ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
1194 1178
1195 return ret; 1179 addr = dt.address + index * 8;
1180 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
1181 &ctxt->exception);
1196} 1182}
1197 1183
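get_descriptor_table_ptr() and read_segment_descriptor() above perform the standard selector-to-descriptor lookup: selector bit 2 picks the LDT over the GDT, bits 15:3 index 8-byte entries, and an entry that does not fit under the table limit raises #GP(selector). The sketch below redoes that arithmetic with invented table bases and limits:

    #include <stdint.h>
    #include <stdio.h>

    struct desc_ptr { uint32_t size; uint64_t address; };  /* limit, base */

    /* Return 0 and the descriptor's address, or -1 where the emulator
     * would inject #GP(selector & 0xfffc).  Tables are made up. */
    static int descriptor_addr(uint16_t selector, uint64_t *addr)
    {
        struct desc_ptr gdt = { .size = 0x7f, .address = 0x1000 };
        struct desc_ptr ldt = { .size = 0x17, .address = 0x2000 };
        struct desc_ptr dt = (selector & (1 << 2)) ? ldt : gdt;
        uint16_t index = selector >> 3;

        if (dt.size < index * 8u + 7)
            return -1;
        *addr = dt.address + index * 8u;
        return 0;
    }

    int main(void)
    {
        uint64_t addr;

        if (descriptor_addr(0x10, &addr) == 0)          /* GDT entry 2 */
            printf("descriptor at %#llx\n", (unsigned long long)addr);
        if (descriptor_addr(0x1c, &addr) != 0)          /* LDT entry 3 */
            printf("#GP: entry past the LDT limit\n");
        return 0;
    }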
1198/* allowed just for 8 bytes segments */ 1184/* allowed just for 8 bytes segments */
1199static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1185static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1200 struct x86_emulate_ops *ops,
1201 u16 selector, struct desc_struct *desc) 1186 u16 selector, struct desc_struct *desc)
1202{ 1187{
1203 struct desc_ptr dt; 1188 struct desc_ptr dt;
1204 u16 index = selector >> 3; 1189 u16 index = selector >> 3;
1205 ulong addr; 1190 ulong addr;
1206 int ret;
1207 1191
1208 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1192 get_descriptor_table_ptr(ctxt, selector, &dt);
1209 1193
1210 if (dt.size < index * 8 + 7) 1194 if (dt.size < index * 8 + 7)
1211 return emulate_gp(ctxt, selector & 0xfffc); 1195 return emulate_gp(ctxt, selector & 0xfffc);
1212 1196
1213 addr = dt.address + index * 8; 1197 addr = dt.address + index * 8;
1214 ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); 1198 return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
1215 1199 &ctxt->exception);
1216 return ret;
1217} 1200}
1218 1201
1219/* Does not support long mode */ 1202/* Does not support long mode */
1220static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1203static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1221 struct x86_emulate_ops *ops,
1222 u16 selector, int seg) 1204 u16 selector, int seg)
1223{ 1205{
1224 struct desc_struct seg_desc; 1206 struct desc_struct seg_desc;
@@ -1253,7 +1235,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1253 if (null_selector) /* for NULL selector skip all following checks */ 1235 if (null_selector) /* for NULL selector skip all following checks */
1254 goto load; 1236 goto load;
1255 1237
1256 ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc); 1238 ret = read_segment_descriptor(ctxt, selector, &seg_desc);
1257 if (ret != X86EMUL_CONTINUE) 1239 if (ret != X86EMUL_CONTINUE)
1258 return ret; 1240 return ret;
1259 1241
@@ -1271,7 +1253,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1271 1253
1272 rpl = selector & 3; 1254 rpl = selector & 3;
1273 dpl = seg_desc.dpl; 1255 dpl = seg_desc.dpl;
1274 cpl = ops->cpl(ctxt); 1256 cpl = ctxt->ops->cpl(ctxt);
1275 1257
1276 switch (seg) { 1258 switch (seg) {
1277 case VCPU_SREG_SS: 1259 case VCPU_SREG_SS:
@@ -1322,12 +1304,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1322 if (seg_desc.s) { 1304 if (seg_desc.s) {
1323 /* mark segment as accessed */ 1305 /* mark segment as accessed */
1324 seg_desc.type |= 1; 1306 seg_desc.type |= 1;
1325 ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc); 1307 ret = write_segment_descriptor(ctxt, selector, &seg_desc);
1326 if (ret != X86EMUL_CONTINUE) 1308 if (ret != X86EMUL_CONTINUE)
1327 return ret; 1309 return ret;
1328 } 1310 }
1329load: 1311load:
1330 ops->set_segment(ctxt, selector, &seg_desc, 0, seg); 1312 ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
1331 return X86EMUL_CONTINUE; 1313 return X86EMUL_CONTINUE;
1332exception: 1314exception:
1333 emulate_exception(ctxt, err_vec, err_code, true); 1315 emulate_exception(ctxt, err_vec, err_code, true);
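The privilege inputs visible above (rpl from the selector, dpl from the fetched descriptor, cpl from ctxt->ops->cpl()) feed load_segment_descriptor()'s per-segment checks, which lie outside this hunk. As a hedged sketch, here is only the plain data-segment rule as stated by the SDM, with sample values; it is not copied from this patch:

    #include <stdio.h>

    int main(void)
    {
        unsigned short selector = 0x2b;     /* sample user data selector */
        unsigned rpl = selector & 3;        /* requested privilege level */
        unsigned dpl = 3;                   /* from the descriptor */
        unsigned cpl = 3;                   /* current privilege level */

        /* A data segment loads only if both RPL and CPL are numerically
         * no greater than DPL; otherwise the emulator injects #GP. */
        if (rpl > dpl || cpl > dpl)
            printf("#GP(0x%x)\n", selector & 0xfffc);
        else
            printf("load allowed (rpl=%u dpl=%u cpl=%u)\n", rpl, dpl, cpl);
        return 0;
    }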
@@ -1356,29 +1338,28 @@ static void write_register_operand(struct operand *op)
1356static int writeback(struct x86_emulate_ctxt *ctxt) 1338static int writeback(struct x86_emulate_ctxt *ctxt)
1357{ 1339{
1358 int rc; 1340 int rc;
1359 struct decode_cache *c = &ctxt->decode;
1360 1341
1361 switch (c->dst.type) { 1342 switch (ctxt->dst.type) {
1362 case OP_REG: 1343 case OP_REG:
1363 write_register_operand(&c->dst); 1344 write_register_operand(&ctxt->dst);
1364 break; 1345 break;
1365 case OP_MEM: 1346 case OP_MEM:
1366 if (c->lock_prefix) 1347 if (ctxt->lock_prefix)
1367 rc = segmented_cmpxchg(ctxt, 1348 rc = segmented_cmpxchg(ctxt,
1368 c->dst.addr.mem, 1349 ctxt->dst.addr.mem,
1369 &c->dst.orig_val, 1350 &ctxt->dst.orig_val,
1370 &c->dst.val, 1351 &ctxt->dst.val,
1371 c->dst.bytes); 1352 ctxt->dst.bytes);
1372 else 1353 else
1373 rc = segmented_write(ctxt, 1354 rc = segmented_write(ctxt,
1374 c->dst.addr.mem, 1355 ctxt->dst.addr.mem,
1375 &c->dst.val, 1356 &ctxt->dst.val,
1376 c->dst.bytes); 1357 ctxt->dst.bytes);
1377 if (rc != X86EMUL_CONTINUE) 1358 if (rc != X86EMUL_CONTINUE)
1378 return rc; 1359 return rc;
1379 break; 1360 break;
1380 case OP_XMM: 1361 case OP_XMM:
1381 write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm); 1362 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
1382 break; 1363 break;
1383 case OP_NONE: 1364 case OP_NONE:
1384 /* no writeback */ 1365 /* no writeback */
@@ -1391,50 +1372,45 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1391 1372
1392static int em_push(struct x86_emulate_ctxt *ctxt) 1373static int em_push(struct x86_emulate_ctxt *ctxt)
1393{ 1374{
1394 struct decode_cache *c = &ctxt->decode;
1395 struct segmented_address addr; 1375 struct segmented_address addr;
1396 1376
1397 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1377 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes);
1398 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); 1378 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
1399 addr.seg = VCPU_SREG_SS; 1379 addr.seg = VCPU_SREG_SS;
1400 1380
1401 /* Disable writeback. */ 1381 /* Disable writeback. */
1402 c->dst.type = OP_NONE; 1382 ctxt->dst.type = OP_NONE;
1403 return segmented_write(ctxt, addr, &c->src.val, c->op_bytes); 1383 return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes);
1404} 1384}
1405 1385
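em_push() and emulate_pop() above implement the stack convention directly: PUSH pre-decrements RSP by the operand size and writes through SS:RSP, POP reads and then post-increments. A toy version of the same movement; stack[] stands in for guest memory and the sketch assumes a little-endian host for the partial-width copies:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static uint8_t stack[64];               /* fake SS-relative memory */

    static void push(uint64_t *rsp, uint64_t val, int op_bytes)
    {
        *rsp -= op_bytes;                   /* decrement first ... */
        memcpy(stack + *rsp, &val, op_bytes);   /* ... then store */
    }

    static uint64_t pop(uint64_t *rsp, int op_bytes)
    {
        uint64_t val = 0;

        memcpy(&val, stack + *rsp, op_bytes);   /* load first ... */
        *rsp += op_bytes;                       /* ... then increment */
        return val;
    }

    int main(void)
    {
        uint64_t rsp = sizeof(stack);

        push(&rsp, 0x1234, 4);
        push(&rsp, 0xbeef, 4);
        printf("%#llx %#llx\n",
               (unsigned long long)pop(&rsp, 4),
               (unsigned long long)pop(&rsp, 4));   /* 0xbeef 0x1234 */
        return 0;
    }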
1406static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1386static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1407 void *dest, int len) 1387 void *dest, int len)
1408{ 1388{
1409 struct decode_cache *c = &ctxt->decode;
1410 int rc; 1389 int rc;
1411 struct segmented_address addr; 1390 struct segmented_address addr;
1412 1391
1413 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); 1392 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
1414 addr.seg = VCPU_SREG_SS; 1393 addr.seg = VCPU_SREG_SS;
1415 rc = segmented_read(ctxt, addr, dest, len); 1394 rc = segmented_read(ctxt, addr, dest, len);
1416 if (rc != X86EMUL_CONTINUE) 1395 if (rc != X86EMUL_CONTINUE)
1417 return rc; 1396 return rc;
1418 1397
1419 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); 1398 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len);
1420 return rc; 1399 return rc;
1421} 1400}
1422 1401
1423static int em_pop(struct x86_emulate_ctxt *ctxt) 1402static int em_pop(struct x86_emulate_ctxt *ctxt)
1424{ 1403{
1425 struct decode_cache *c = &ctxt->decode; 1404 return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
1426
1427 return emulate_pop(ctxt, &c->dst.val, c->op_bytes);
1428} 1405}
1429 1406
1430static int emulate_popf(struct x86_emulate_ctxt *ctxt, 1407static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1431 struct x86_emulate_ops *ops, 1408 void *dest, int len)
1432 void *dest, int len)
1433{ 1409{
1434 int rc; 1410 int rc;
1435 unsigned long val, change_mask; 1411 unsigned long val, change_mask;
1436 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1412 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1437 int cpl = ops->cpl(ctxt); 1413 int cpl = ctxt->ops->cpl(ctxt);
1438 1414
1439 rc = emulate_pop(ctxt, &val, len); 1415 rc = emulate_pop(ctxt, &val, len);
1440 if (rc != X86EMUL_CONTINUE) 1416 if (rc != X86EMUL_CONTINUE)
@@ -1470,49 +1446,41 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1470 1446
1471static int em_popf(struct x86_emulate_ctxt *ctxt) 1447static int em_popf(struct x86_emulate_ctxt *ctxt)
1472{ 1448{
1473 struct decode_cache *c = &ctxt->decode; 1449 ctxt->dst.type = OP_REG;
1474 1450 ctxt->dst.addr.reg = &ctxt->eflags;
1475 c->dst.type = OP_REG; 1451 ctxt->dst.bytes = ctxt->op_bytes;
1476 c->dst.addr.reg = &ctxt->eflags; 1452 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
1477 c->dst.bytes = c->op_bytes;
1478 return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
1479} 1453}
1480 1454
1481static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, 1455static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1482 struct x86_emulate_ops *ops, int seg)
1483{ 1456{
1484 struct decode_cache *c = &ctxt->decode; 1457 ctxt->src.val = get_segment_selector(ctxt, seg);
1485
1486 c->src.val = get_segment_selector(ctxt, seg);
1487 1458
1488 return em_push(ctxt); 1459 return em_push(ctxt);
1489} 1460}
1490 1461
1491static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, 1462static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1492 struct x86_emulate_ops *ops, int seg)
1493{ 1463{
1494 struct decode_cache *c = &ctxt->decode;
1495 unsigned long selector; 1464 unsigned long selector;
1496 int rc; 1465 int rc;
1497 1466
1498 rc = emulate_pop(ctxt, &selector, c->op_bytes); 1467 rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
1499 if (rc != X86EMUL_CONTINUE) 1468 if (rc != X86EMUL_CONTINUE)
1500 return rc; 1469 return rc;
1501 1470
1502 rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg); 1471 rc = load_segment_descriptor(ctxt, (u16)selector, seg);
1503 return rc; 1472 return rc;
1504} 1473}
1505 1474
1506static int em_pusha(struct x86_emulate_ctxt *ctxt) 1475static int em_pusha(struct x86_emulate_ctxt *ctxt)
1507{ 1476{
1508 struct decode_cache *c = &ctxt->decode; 1477 unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP];
1509 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1510 int rc = X86EMUL_CONTINUE; 1478 int rc = X86EMUL_CONTINUE;
1511 int reg = VCPU_REGS_RAX; 1479 int reg = VCPU_REGS_RAX;
1512 1480
1513 while (reg <= VCPU_REGS_RDI) { 1481 while (reg <= VCPU_REGS_RDI) {
1514 (reg == VCPU_REGS_RSP) ? 1482 (reg == VCPU_REGS_RSP) ?
1515 (c->src.val = old_esp) : (c->src.val = c->regs[reg]); 1483 (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]);
1516 1484
1517 rc = em_push(ctxt); 1485 rc = em_push(ctxt);
1518 if (rc != X86EMUL_CONTINUE) 1486 if (rc != X86EMUL_CONTINUE)
@@ -1526,26 +1494,23 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
1526 1494
1527static int em_pushf(struct x86_emulate_ctxt *ctxt) 1495static int em_pushf(struct x86_emulate_ctxt *ctxt)
1528{ 1496{
1529 struct decode_cache *c = &ctxt->decode; 1497 ctxt->src.val = (unsigned long)ctxt->eflags;
1530
1531 c->src.val = (unsigned long)ctxt->eflags;
1532 return em_push(ctxt); 1498 return em_push(ctxt);
1533} 1499}
1534 1500
1535static int em_popa(struct x86_emulate_ctxt *ctxt) 1501static int em_popa(struct x86_emulate_ctxt *ctxt)
1536{ 1502{
1537 struct decode_cache *c = &ctxt->decode;
1538 int rc = X86EMUL_CONTINUE; 1503 int rc = X86EMUL_CONTINUE;
1539 int reg = VCPU_REGS_RDI; 1504 int reg = VCPU_REGS_RDI;
1540 1505
1541 while (reg >= VCPU_REGS_RAX) { 1506 while (reg >= VCPU_REGS_RAX) {
1542 if (reg == VCPU_REGS_RSP) { 1507 if (reg == VCPU_REGS_RSP) {
1543 register_address_increment(c, &c->regs[VCPU_REGS_RSP], 1508 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP],
1544 c->op_bytes); 1509 ctxt->op_bytes);
1545 --reg; 1510 --reg;
1546 } 1511 }
1547 1512
1548 rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes); 1513 rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes);
1549 if (rc != X86EMUL_CONTINUE) 1514 if (rc != X86EMUL_CONTINUE)
1550 break; 1515 break;
1551 --reg; 1516 --reg;
@@ -1553,10 +1518,9 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
1553 return rc; 1518 return rc;
1554} 1519}
1555 1520
1556int emulate_int_real(struct x86_emulate_ctxt *ctxt, 1521int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
1557 struct x86_emulate_ops *ops, int irq)
1558{ 1522{
1559 struct decode_cache *c = &ctxt->decode; 1523 struct x86_emulate_ops *ops = ctxt->ops;
1560 int rc; 1524 int rc;
1561 struct desc_ptr dt; 1525 struct desc_ptr dt;
1562 gva_t cs_addr; 1526 gva_t cs_addr;
@@ -1564,19 +1528,19 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1564 u16 cs, eip; 1528 u16 cs, eip;
1565 1529
1566 /* TODO: Add limit checks */ 1530 /* TODO: Add limit checks */
1567 c->src.val = ctxt->eflags; 1531 ctxt->src.val = ctxt->eflags;
1568 rc = em_push(ctxt); 1532 rc = em_push(ctxt);
1569 if (rc != X86EMUL_CONTINUE) 1533 if (rc != X86EMUL_CONTINUE)
1570 return rc; 1534 return rc;
1571 1535
1572 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); 1536 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
1573 1537
1574 c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); 1538 ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
1575 rc = em_push(ctxt); 1539 rc = em_push(ctxt);
1576 if (rc != X86EMUL_CONTINUE) 1540 if (rc != X86EMUL_CONTINUE)
1577 return rc; 1541 return rc;
1578 1542
1579 c->src.val = c->eip; 1543 ctxt->src.val = ctxt->_eip;
1580 rc = em_push(ctxt); 1544 rc = em_push(ctxt);
1581 if (rc != X86EMUL_CONTINUE) 1545 if (rc != X86EMUL_CONTINUE)
1582 return rc; 1546 return rc;
@@ -1594,21 +1558,20 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1594 if (rc != X86EMUL_CONTINUE) 1558 if (rc != X86EMUL_CONTINUE)
1595 return rc; 1559 return rc;
1596 1560
1597 rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS); 1561 rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
1598 if (rc != X86EMUL_CONTINUE) 1562 if (rc != X86EMUL_CONTINUE)
1599 return rc; 1563 return rc;
1600 1564
1601 c->eip = eip; 1565 ctxt->_eip = eip;
1602 1566
1603 return rc; 1567 return rc;
1604} 1568}
1605 1569
1606static int emulate_int(struct x86_emulate_ctxt *ctxt, 1570static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
1607 struct x86_emulate_ops *ops, int irq)
1608{ 1571{
1609 switch(ctxt->mode) { 1572 switch(ctxt->mode) {
1610 case X86EMUL_MODE_REAL: 1573 case X86EMUL_MODE_REAL:
1611 return emulate_int_real(ctxt, ops, irq); 1574 return emulate_int_real(ctxt, irq);
1612 case X86EMUL_MODE_VM86: 1575 case X86EMUL_MODE_VM86:
1613 case X86EMUL_MODE_PROT16: 1576 case X86EMUL_MODE_PROT16:
1614 case X86EMUL_MODE_PROT32: 1577 case X86EMUL_MODE_PROT32:
@@ -1619,10 +1582,8 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt,
1619 } 1582 }
1620} 1583}
1621 1584
1622static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, 1585static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
1623 struct x86_emulate_ops *ops)
1624{ 1586{
1625 struct decode_cache *c = &ctxt->decode;
1626 int rc = X86EMUL_CONTINUE; 1587 int rc = X86EMUL_CONTINUE;
1627 unsigned long temp_eip = 0; 1588 unsigned long temp_eip = 0;
1628 unsigned long temp_eflags = 0; 1589 unsigned long temp_eflags = 0;
@@ -1634,7 +1595,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1634 1595
1635 /* TODO: Add stack limit check */ 1596 /* TODO: Add stack limit check */
1636 1597
1637 rc = emulate_pop(ctxt, &temp_eip, c->op_bytes); 1598 rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
1638 1599
1639 if (rc != X86EMUL_CONTINUE) 1600 if (rc != X86EMUL_CONTINUE)
1640 return rc; 1601 return rc;
@@ -1642,27 +1603,27 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1642 if (temp_eip & ~0xffff) 1603 if (temp_eip & ~0xffff)
1643 return emulate_gp(ctxt, 0); 1604 return emulate_gp(ctxt, 0);
1644 1605
1645 rc = emulate_pop(ctxt, &cs, c->op_bytes); 1606 rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
1646 1607
1647 if (rc != X86EMUL_CONTINUE) 1608 if (rc != X86EMUL_CONTINUE)
1648 return rc; 1609 return rc;
1649 1610
1650 rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes); 1611 rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
1651 1612
1652 if (rc != X86EMUL_CONTINUE) 1613 if (rc != X86EMUL_CONTINUE)
1653 return rc; 1614 return rc;
1654 1615
1655 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); 1616 rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
1656 1617
1657 if (rc != X86EMUL_CONTINUE) 1618 if (rc != X86EMUL_CONTINUE)
1658 return rc; 1619 return rc;
1659 1620
1660 c->eip = temp_eip; 1621 ctxt->_eip = temp_eip;
1661 1622
1662 1623
1663 if (c->op_bytes == 4) 1624 if (ctxt->op_bytes == 4)
1664 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); 1625 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
1665 else if (c->op_bytes == 2) { 1626 else if (ctxt->op_bytes == 2) {
1666 ctxt->eflags &= ~0xffff; 1627 ctxt->eflags &= ~0xffff;
1667 ctxt->eflags |= temp_eflags; 1628 ctxt->eflags |= temp_eflags;
1668 } 1629 }
@@ -1673,12 +1634,11 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1673 return rc; 1634 return rc;
1674} 1635}
1675 1636
1676static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, 1637static int em_iret(struct x86_emulate_ctxt *ctxt)
1677 struct x86_emulate_ops* ops)
1678{ 1638{
1679 switch(ctxt->mode) { 1639 switch(ctxt->mode) {
1680 case X86EMUL_MODE_REAL: 1640 case X86EMUL_MODE_REAL:
1681 return emulate_iret_real(ctxt, ops); 1641 return emulate_iret_real(ctxt);
1682 case X86EMUL_MODE_VM86: 1642 case X86EMUL_MODE_VM86:
1683 case X86EMUL_MODE_PROT16: 1643 case X86EMUL_MODE_PROT16:
1684 case X86EMUL_MODE_PROT32: 1644 case X86EMUL_MODE_PROT32:
@@ -1691,53 +1651,49 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
1691 1651
1692static int em_jmp_far(struct x86_emulate_ctxt *ctxt) 1652static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
1693{ 1653{
1694 struct decode_cache *c = &ctxt->decode;
1695 int rc; 1654 int rc;
1696 unsigned short sel; 1655 unsigned short sel;
1697 1656
1698 memcpy(&sel, c->src.valptr + c->op_bytes, 2); 1657 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
1699 1658
1700 rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS); 1659 rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS);
1701 if (rc != X86EMUL_CONTINUE) 1660 if (rc != X86EMUL_CONTINUE)
1702 return rc; 1661 return rc;
1703 1662
1704 c->eip = 0; 1663 ctxt->_eip = 0;
1705 memcpy(&c->eip, c->src.valptr, c->op_bytes); 1664 memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
1706 return X86EMUL_CONTINUE; 1665 return X86EMUL_CONTINUE;
1707} 1666}
1708 1667
1709static int em_grp1a(struct x86_emulate_ctxt *ctxt) 1668static int em_grp1a(struct x86_emulate_ctxt *ctxt)
1710{ 1669{
1711 struct decode_cache *c = &ctxt->decode; 1670 return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes);
1712
1713 return emulate_pop(ctxt, &c->dst.val, c->dst.bytes);
1714} 1671}
1715 1672
1716static int em_grp2(struct x86_emulate_ctxt *ctxt) 1673static int em_grp2(struct x86_emulate_ctxt *ctxt)
1717{ 1674{
1718 struct decode_cache *c = &ctxt->decode; 1675 switch (ctxt->modrm_reg) {
1719 switch (c->modrm_reg) {
1720 case 0: /* rol */ 1676 case 0: /* rol */
1721 emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); 1677 emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags);
1722 break; 1678 break;
1723 case 1: /* ror */ 1679 case 1: /* ror */
1724 emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); 1680 emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags);
1725 break; 1681 break;
1726 case 2: /* rcl */ 1682 case 2: /* rcl */
1727 emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); 1683 emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags);
1728 break; 1684 break;
1729 case 3: /* rcr */ 1685 case 3: /* rcr */
1730 emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); 1686 emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags);
1731 break; 1687 break;
1732 case 4: /* sal/shl */ 1688 case 4: /* sal/shl */
1733 case 6: /* sal/shl */ 1689 case 6: /* sal/shl */
1734 emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); 1690 emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags);
1735 break; 1691 break;
1736 case 5: /* shr */ 1692 case 5: /* shr */
1737 emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); 1693 emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags);
1738 break; 1694 break;
1739 case 7: /* sar */ 1695 case 7: /* sar */
1740 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); 1696 emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags);
1741 break; 1697 break;
1742 } 1698 }
1743 return X86EMUL_CONTINUE; 1699 return X86EMUL_CONTINUE;
@@ -1745,33 +1701,32 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt)
1745 1701
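em_grp2() above dispatches on the ModR/M /reg field rather than on the opcode byte, and handles /6 the same way as /4 (SAL/SHL). A tiny sketch of that decode with an arbitrary sample ModR/M byte:

    #include <stdio.h>

    int main(void)
    {
        static const char *grp2[8] = {
            "rol", "ror", "rcl", "rcr", "sal/shl", "shr", "sal/shl", "sar"
        };
        unsigned char modrm = 0xe8;             /* mod=11 reg=101 rm=000 */
        unsigned reg = (modrm >> 3) & 7;        /* opcode extension */

        printf("/%u -> %s\n", reg, grp2[reg]);  /* prints "/5 -> shr" */
        return 0;
    }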
1746static int em_grp3(struct x86_emulate_ctxt *ctxt) 1702static int em_grp3(struct x86_emulate_ctxt *ctxt)
1747{ 1703{
1748 struct decode_cache *c = &ctxt->decode; 1704 unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX];
1749 unsigned long *rax = &c->regs[VCPU_REGS_RAX]; 1705 unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX];
1750 unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
1751 u8 de = 0; 1706 u8 de = 0;
1752 1707
1753 switch (c->modrm_reg) { 1708 switch (ctxt->modrm_reg) {
1754 case 0 ... 1: /* test */ 1709 case 0 ... 1: /* test */
1755 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 1710 emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
1756 break; 1711 break;
1757 case 2: /* not */ 1712 case 2: /* not */
1758 c->dst.val = ~c->dst.val; 1713 ctxt->dst.val = ~ctxt->dst.val;
1759 break; 1714 break;
1760 case 3: /* neg */ 1715 case 3: /* neg */
1761 emulate_1op("neg", c->dst, ctxt->eflags); 1716 emulate_1op("neg", ctxt->dst, ctxt->eflags);
1762 break; 1717 break;
1763 case 4: /* mul */ 1718 case 4: /* mul */
1764 emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags); 1719 emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags);
1765 break; 1720 break;
1766 case 5: /* imul */ 1721 case 5: /* imul */
1767 emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); 1722 emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags);
1768 break; 1723 break;
1769 case 6: /* div */ 1724 case 6: /* div */
1770 emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx, 1725 emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx,
1771 ctxt->eflags, de); 1726 ctxt->eflags, de);
1772 break; 1727 break;
1773 case 7: /* idiv */ 1728 case 7: /* idiv */
1774 emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx, 1729 emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx,
1775 ctxt->eflags, de); 1730 ctxt->eflags, de);
1776 break; 1731 break;
1777 default: 1732 default:
@@ -1784,26 +1739,25 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt)
1784 1739
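The mul/div cases of em_grp3() above operate on the RAX/RDX pair through the emulate_1op_rax_rdx* macros: MUL widens into RDX:RAX and DIV divides RDX:RAX back down, leaving the remainder in RDX. A plain-C sketch of the 32-bit convention with sample operands (the kernel does this in inline assembly, not C arithmetic):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t eax = 0x10000000, src = 0x30, edx;
        uint64_t wide = (uint64_t)eax * src;        /* MUL r/m32 */

        edx = (uint32_t)(wide >> 32);
        eax = (uint32_t)wide;
        printf("mul: edx:eax = %#x:%08x\n", edx, eax);

        wide = ((uint64_t)edx << 32) | eax;         /* DIV r/m32 undoes it */
        eax = (uint32_t)(wide / src);               /* quotient */
        edx = (uint32_t)(wide % src);               /* remainder */
        printf("div: quotient %#x, remainder %#x\n", eax, edx);
        return 0;
    }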
1785static int em_grp45(struct x86_emulate_ctxt *ctxt) 1740static int em_grp45(struct x86_emulate_ctxt *ctxt)
1786{ 1741{
1787 struct decode_cache *c = &ctxt->decode;
1788 int rc = X86EMUL_CONTINUE; 1742 int rc = X86EMUL_CONTINUE;
1789 1743
1790 switch (c->modrm_reg) { 1744 switch (ctxt->modrm_reg) {
1791 case 0: /* inc */ 1745 case 0: /* inc */
1792 emulate_1op("inc", c->dst, ctxt->eflags); 1746 emulate_1op("inc", ctxt->dst, ctxt->eflags);
1793 break; 1747 break;
1794 case 1: /* dec */ 1748 case 1: /* dec */
1795 emulate_1op("dec", c->dst, ctxt->eflags); 1749 emulate_1op("dec", ctxt->dst, ctxt->eflags);
1796 break; 1750 break;
1797 case 2: /* call near abs */ { 1751 case 2: /* call near abs */ {
1798 long int old_eip; 1752 long int old_eip;
1799 old_eip = c->eip; 1753 old_eip = ctxt->_eip;
1800 c->eip = c->src.val; 1754 ctxt->_eip = ctxt->src.val;
1801 c->src.val = old_eip; 1755 ctxt->src.val = old_eip;
1802 rc = em_push(ctxt); 1756 rc = em_push(ctxt);
1803 break; 1757 break;
1804 } 1758 }
1805 case 4: /* jmp abs */ 1759 case 4: /* jmp abs */
1806 c->eip = c->src.val; 1760 ctxt->_eip = ctxt->src.val;
1807 break; 1761 break;
1808 case 5: /* jmp far */ 1762 case 5: /* jmp far */
1809 rc = em_jmp_far(ctxt); 1763 rc = em_jmp_far(ctxt);
@@ -1817,68 +1771,70 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
1817 1771
1818static int em_grp9(struct x86_emulate_ctxt *ctxt) 1772static int em_grp9(struct x86_emulate_ctxt *ctxt)
1819{ 1773{
1820 struct decode_cache *c = &ctxt->decode; 1774 u64 old = ctxt->dst.orig_val64;
1821 u64 old = c->dst.orig_val64;
1822 1775
1823 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || 1776 if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) ||
1824 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { 1777 ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) {
1825 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); 1778 ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1826 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); 1779 ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1827 ctxt->eflags &= ~EFLG_ZF; 1780 ctxt->eflags &= ~EFLG_ZF;
1828 } else { 1781 } else {
1829 c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) | 1782 ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) |
1830 (u32) c->regs[VCPU_REGS_RBX]; 1783 (u32) ctxt->regs[VCPU_REGS_RBX];
1831 1784
1832 ctxt->eflags |= EFLG_ZF; 1785 ctxt->eflags |= EFLG_ZF;
1833 } 1786 }
1834 return X86EMUL_CONTINUE; 1787 return X86EMUL_CONTINUE;
1835} 1788}
1836 1789
1837static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, 1790static int em_ret(struct x86_emulate_ctxt *ctxt)
1838 struct x86_emulate_ops *ops) 1791{
1792 ctxt->dst.type = OP_REG;
1793 ctxt->dst.addr.reg = &ctxt->_eip;
1794 ctxt->dst.bytes = ctxt->op_bytes;
1795 return em_pop(ctxt);
1796}
1797
1798static int em_ret_far(struct x86_emulate_ctxt *ctxt)
1839{ 1799{
1840 struct decode_cache *c = &ctxt->decode;
1841 int rc; 1800 int rc;
1842 unsigned long cs; 1801 unsigned long cs;
1843 1802
1844 rc = emulate_pop(ctxt, &c->eip, c->op_bytes); 1803 rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
1845 if (rc != X86EMUL_CONTINUE) 1804 if (rc != X86EMUL_CONTINUE)
1846 return rc; 1805 return rc;
1847 if (c->op_bytes == 4) 1806 if (ctxt->op_bytes == 4)
1848 c->eip = (u32)c->eip; 1807 ctxt->_eip = (u32)ctxt->_eip;
1849 rc = emulate_pop(ctxt, &cs, c->op_bytes); 1808 rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
1850 if (rc != X86EMUL_CONTINUE) 1809 if (rc != X86EMUL_CONTINUE)
1851 return rc; 1810 return rc;
1852 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); 1811 rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
1853 return rc; 1812 return rc;
1854} 1813}
1855 1814
1856static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, 1815static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg)
1857 struct x86_emulate_ops *ops, int seg)
1858{ 1816{
1859 struct decode_cache *c = &ctxt->decode;
1860 unsigned short sel; 1817 unsigned short sel;
1861 int rc; 1818 int rc;
1862 1819
1863 memcpy(&sel, c->src.valptr + c->op_bytes, 2); 1820 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
1864 1821
1865 rc = load_segment_descriptor(ctxt, ops, sel, seg); 1822 rc = load_segment_descriptor(ctxt, sel, seg);
1866 if (rc != X86EMUL_CONTINUE) 1823 if (rc != X86EMUL_CONTINUE)
1867 return rc; 1824 return rc;
1868 1825
1869 c->dst.val = c->src.val; 1826 ctxt->dst.val = ctxt->src.val;
1870 return rc; 1827 return rc;
1871} 1828}
1872 1829
1873static inline void 1830static void
1874setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1831setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1875 struct x86_emulate_ops *ops, struct desc_struct *cs, 1832 struct desc_struct *cs, struct desc_struct *ss)
1876 struct desc_struct *ss)
1877{ 1833{
1878 u16 selector; 1834 u16 selector;
1879 1835
1880 memset(cs, 0, sizeof(struct desc_struct)); 1836 memset(cs, 0, sizeof(struct desc_struct));
1881 ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); 1837 ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
1882 memset(ss, 0, sizeof(struct desc_struct)); 1838 memset(ss, 0, sizeof(struct desc_struct));
1883 1839
1884 cs->l = 0; /* will be adjusted later */ 1840 cs->l = 0; /* will be adjusted later */
@@ -1901,10 +1857,9 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1901 ss->p = 1; 1857 ss->p = 1;
1902} 1858}
1903 1859
1904static int 1860static int em_syscall(struct x86_emulate_ctxt *ctxt)
1905emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1906{ 1861{
1907 struct decode_cache *c = &ctxt->decode; 1862 struct x86_emulate_ops *ops = ctxt->ops;
1908 struct desc_struct cs, ss; 1863 struct desc_struct cs, ss;
1909 u64 msr_data; 1864 u64 msr_data;
1910 u16 cs_sel, ss_sel; 1865 u16 cs_sel, ss_sel;
@@ -1916,7 +1871,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1916 return emulate_ud(ctxt); 1871 return emulate_ud(ctxt);
1917 1872
1918 ops->get_msr(ctxt, MSR_EFER, &efer); 1873 ops->get_msr(ctxt, MSR_EFER, &efer);
1919 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1874 setup_syscalls_segments(ctxt, &cs, &ss);
1920 1875
1921 ops->get_msr(ctxt, MSR_STAR, &msr_data); 1876 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1922 msr_data >>= 32; 1877 msr_data >>= 32;
@@ -1930,15 +1885,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1930 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); 1885 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
1931 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 1886 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
1932 1887
1933 c->regs[VCPU_REGS_RCX] = c->eip; 1888 ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip;
1934 if (efer & EFER_LMA) { 1889 if (efer & EFER_LMA) {
1935#ifdef CONFIG_X86_64 1890#ifdef CONFIG_X86_64
1936 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 1891 ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1937 1892
1938 ops->get_msr(ctxt, 1893 ops->get_msr(ctxt,
1939 ctxt->mode == X86EMUL_MODE_PROT64 ? 1894 ctxt->mode == X86EMUL_MODE_PROT64 ?
1940 MSR_LSTAR : MSR_CSTAR, &msr_data); 1895 MSR_LSTAR : MSR_CSTAR, &msr_data);
1941 c->eip = msr_data; 1896 ctxt->_eip = msr_data;
1942 1897
1943 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); 1898 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
1944 ctxt->eflags &= ~(msr_data | EFLG_RF); 1899 ctxt->eflags &= ~(msr_data | EFLG_RF);
@@ -1946,7 +1901,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1946 } else { 1901 } else {
1947 /* legacy mode */ 1902 /* legacy mode */
1948 ops->get_msr(ctxt, MSR_STAR, &msr_data); 1903 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1949 c->eip = (u32)msr_data; 1904 ctxt->_eip = (u32)msr_data;
1950 1905
1951 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1906 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1952 } 1907 }
@@ -1954,16 +1909,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1954 return X86EMUL_CONTINUE; 1909 return X86EMUL_CONTINUE;
1955} 1910}
1956 1911
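em_syscall() above saves RIP and RFLAGS into RCX and R11, loads RIP from MSR_LSTAR (MSR_CSTAR in compatibility mode) and clears the RFLAGS bits named by MSR_SYSCALL_MASK plus RF; the CS/SS selectors come from MSR_STAR. The sketch below walks the 64-bit case with invented MSR contents; the selector arithmetic (CS = STAR[47:32], SS = CS + 8) follows the SDM rather than lines visible in this hunk:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t star  = 0x0023001000000000ull;   /* sample MSR_STAR */
        uint64_t lstar = 0xffffffff81000000ull;   /* sample MSR_LSTAR */
        uint64_t fmask = 0x47700;                 /* sample SYSCALL_MASK */
        uint64_t rip = 0x400123, rflags = 0x246;
        uint64_t rcx, r11;
        unsigned cs_sel, ss_sel;

        cs_sel = (uint16_t)(star >> 32);    /* STAR[47:32] */
        ss_sel = cs_sel + 8;
        rcx = rip;                          /* return address for SYSRET */
        r11 = rflags;                       /* saved flags */
        rip = lstar;
        rflags &= ~(fmask | 0x10000);       /* mask flags, clear RF */

        printf("cs=%#x ss=%#x rip=%#llx rcx=%#llx r11=%#llx rflags=%#llx\n",
               cs_sel, ss_sel, (unsigned long long)rip,
               (unsigned long long)rcx, (unsigned long long)r11,
               (unsigned long long)rflags);
        return 0;
    }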
1957static int 1912static int em_sysenter(struct x86_emulate_ctxt *ctxt)
1958emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1959{ 1913{
1960 struct decode_cache *c = &ctxt->decode; 1914 struct x86_emulate_ops *ops = ctxt->ops;
1961 struct desc_struct cs, ss; 1915 struct desc_struct cs, ss;
1962 u64 msr_data; 1916 u64 msr_data;
1963 u16 cs_sel, ss_sel; 1917 u16 cs_sel, ss_sel;
1964 u64 efer = 0; 1918 u64 efer = 0;
1965 1919
1966 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); 1920 ops->get_msr(ctxt, MSR_EFER, &efer);
1967 /* inject #GP if in real mode */ 1921 /* inject #GP if in real mode */
1968 if (ctxt->mode == X86EMUL_MODE_REAL) 1922 if (ctxt->mode == X86EMUL_MODE_REAL)
1969 return emulate_gp(ctxt, 0); 1923 return emulate_gp(ctxt, 0);
@@ -1974,7 +1928,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1974 if (ctxt->mode == X86EMUL_MODE_PROT64) 1928 if (ctxt->mode == X86EMUL_MODE_PROT64)
1975 return emulate_ud(ctxt); 1929 return emulate_ud(ctxt);
1976 1930
1977 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1931 setup_syscalls_segments(ctxt, &cs, &ss);
1978 1932
1979 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); 1933 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
1980 switch (ctxt->mode) { 1934 switch (ctxt->mode) {
@@ -2002,31 +1956,30 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2002 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 1956 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2003 1957
2004 ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); 1958 ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
2005 c->eip = msr_data; 1959 ctxt->_eip = msr_data;
2006 1960
2007 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); 1961 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
2008 c->regs[VCPU_REGS_RSP] = msr_data; 1962 ctxt->regs[VCPU_REGS_RSP] = msr_data;
2009 1963
2010 return X86EMUL_CONTINUE; 1964 return X86EMUL_CONTINUE;
2011} 1965}
2012 1966
2013static int 1967static int em_sysexit(struct x86_emulate_ctxt *ctxt)
2014emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2015{ 1968{
2016 struct decode_cache *c = &ctxt->decode; 1969 struct x86_emulate_ops *ops = ctxt->ops;
2017 struct desc_struct cs, ss; 1970 struct desc_struct cs, ss;
2018 u64 msr_data; 1971 u64 msr_data;
2019 int usermode; 1972 int usermode;
2020 u16 cs_sel, ss_sel; 1973 u16 cs_sel = 0, ss_sel = 0;
2021 1974
2022 /* inject #GP if in real mode or Virtual 8086 mode */ 1975 /* inject #GP if in real mode or Virtual 8086 mode */
2023 if (ctxt->mode == X86EMUL_MODE_REAL || 1976 if (ctxt->mode == X86EMUL_MODE_REAL ||
2024 ctxt->mode == X86EMUL_MODE_VM86) 1977 ctxt->mode == X86EMUL_MODE_VM86)
2025 return emulate_gp(ctxt, 0); 1978 return emulate_gp(ctxt, 0);
2026 1979
2027 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1980 setup_syscalls_segments(ctxt, &cs, &ss);
2028 1981
2029 if ((c->rex_prefix & 0x8) != 0x0) 1982 if ((ctxt->rex_prefix & 0x8) != 0x0)
2030 usermode = X86EMUL_MODE_PROT64; 1983 usermode = X86EMUL_MODE_PROT64;
2031 else 1984 else
2032 usermode = X86EMUL_MODE_PROT32; 1985 usermode = X86EMUL_MODE_PROT32;
@@ -2056,14 +2009,13 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2056 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); 2009 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
2057 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 2010 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2058 2011
2059 c->eip = c->regs[VCPU_REGS_RDX]; 2012 ctxt->_eip = ctxt->regs[VCPU_REGS_RDX];
2060 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; 2013 ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX];
2061 2014
2062 return X86EMUL_CONTINUE; 2015 return X86EMUL_CONTINUE;
2063} 2016}
2064 2017
2065static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, 2018static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
2066 struct x86_emulate_ops *ops)
2067{ 2019{
2068 int iopl; 2020 int iopl;
2069 if (ctxt->mode == X86EMUL_MODE_REAL) 2021 if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -2071,13 +2023,13 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
2071 if (ctxt->mode == X86EMUL_MODE_VM86) 2023 if (ctxt->mode == X86EMUL_MODE_VM86)
2072 return true; 2024 return true;
2073 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2025 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
2074 return ops->cpl(ctxt) > iopl; 2026 return ctxt->ops->cpl(ctxt) > iopl;
2075} 2027}
2076 2028
2077static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2029static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2078 struct x86_emulate_ops *ops,
2079 u16 port, u16 len) 2030 u16 port, u16 len)
2080{ 2031{
2032 struct x86_emulate_ops *ops = ctxt->ops;
2081 struct desc_struct tr_seg; 2033 struct desc_struct tr_seg;
2082 u32 base3; 2034 u32 base3;
2083 int r; 2035 int r;
@@ -2108,14 +2060,13 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2108} 2060}
2109 2061
2110static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, 2062static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2111 struct x86_emulate_ops *ops,
2112 u16 port, u16 len) 2063 u16 port, u16 len)
2113{ 2064{
2114 if (ctxt->perm_ok) 2065 if (ctxt->perm_ok)
2115 return true; 2066 return true;
2116 2067
2117 if (emulator_bad_iopl(ctxt, ops)) 2068 if (emulator_bad_iopl(ctxt))
2118 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 2069 if (!emulator_io_port_access_allowed(ctxt, port, len))
2119 return false; 2070 return false;
2120 2071
2121 ctxt->perm_ok = true; 2072 ctxt->perm_ok = true;
@@ -2124,21 +2075,18 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2124} 2075}
2125 2076
2126static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, 2077static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2127 struct x86_emulate_ops *ops,
2128 struct tss_segment_16 *tss) 2078 struct tss_segment_16 *tss)
2129{ 2079{
2130 struct decode_cache *c = &ctxt->decode; 2080 tss->ip = ctxt->_eip;
2131
2132 tss->ip = c->eip;
2133 tss->flag = ctxt->eflags; 2081 tss->flag = ctxt->eflags;
2134 tss->ax = c->regs[VCPU_REGS_RAX]; 2082 tss->ax = ctxt->regs[VCPU_REGS_RAX];
2135 tss->cx = c->regs[VCPU_REGS_RCX]; 2083 tss->cx = ctxt->regs[VCPU_REGS_RCX];
2136 tss->dx = c->regs[VCPU_REGS_RDX]; 2084 tss->dx = ctxt->regs[VCPU_REGS_RDX];
2137 tss->bx = c->regs[VCPU_REGS_RBX]; 2085 tss->bx = ctxt->regs[VCPU_REGS_RBX];
2138 tss->sp = c->regs[VCPU_REGS_RSP]; 2086 tss->sp = ctxt->regs[VCPU_REGS_RSP];
2139 tss->bp = c->regs[VCPU_REGS_RBP]; 2087 tss->bp = ctxt->regs[VCPU_REGS_RBP];
2140 tss->si = c->regs[VCPU_REGS_RSI]; 2088 tss->si = ctxt->regs[VCPU_REGS_RSI];
2141 tss->di = c->regs[VCPU_REGS_RDI]; 2089 tss->di = ctxt->regs[VCPU_REGS_RDI];
2142 2090
2143 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); 2091 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2144 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2092 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2148,22 +2096,20 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2148} 2096}
2149 2097
2150static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, 2098static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2151 struct x86_emulate_ops *ops,
2152 struct tss_segment_16 *tss) 2099 struct tss_segment_16 *tss)
2153{ 2100{
2154 struct decode_cache *c = &ctxt->decode;
2155 int ret; 2101 int ret;
2156 2102
2157 c->eip = tss->ip; 2103 ctxt->_eip = tss->ip;
2158 ctxt->eflags = tss->flag | 2; 2104 ctxt->eflags = tss->flag | 2;
2159 c->regs[VCPU_REGS_RAX] = tss->ax; 2105 ctxt->regs[VCPU_REGS_RAX] = tss->ax;
2160 c->regs[VCPU_REGS_RCX] = tss->cx; 2106 ctxt->regs[VCPU_REGS_RCX] = tss->cx;
2161 c->regs[VCPU_REGS_RDX] = tss->dx; 2107 ctxt->regs[VCPU_REGS_RDX] = tss->dx;
2162 c->regs[VCPU_REGS_RBX] = tss->bx; 2108 ctxt->regs[VCPU_REGS_RBX] = tss->bx;
2163 c->regs[VCPU_REGS_RSP] = tss->sp; 2109 ctxt->regs[VCPU_REGS_RSP] = tss->sp;
2164 c->regs[VCPU_REGS_RBP] = tss->bp; 2110 ctxt->regs[VCPU_REGS_RBP] = tss->bp;
2165 c->regs[VCPU_REGS_RSI] = tss->si; 2111 ctxt->regs[VCPU_REGS_RSI] = tss->si;
2166 c->regs[VCPU_REGS_RDI] = tss->di; 2112 ctxt->regs[VCPU_REGS_RDI] = tss->di;
2167 2113
2168 /* 2114 /*
2169 * SDM says that segment selectors are loaded before segment 2115 * SDM says that segment selectors are loaded before segment
@@ -2179,19 +2125,19 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2179 * Now load segment descriptors. If fault happens at this stage 2125 * Now load segment descriptors. If fault happens at this stage
2180 * it is handled in the context of the new task 2126 * it is handled in the context of the new task
2181 */ 2127 */
2182 ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR); 2128 ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
2183 if (ret != X86EMUL_CONTINUE) 2129 if (ret != X86EMUL_CONTINUE)
2184 return ret; 2130 return ret;
2185 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); 2131 ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
2186 if (ret != X86EMUL_CONTINUE) 2132 if (ret != X86EMUL_CONTINUE)
2187 return ret; 2133 return ret;
2188 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); 2134 ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
2189 if (ret != X86EMUL_CONTINUE) 2135 if (ret != X86EMUL_CONTINUE)
2190 return ret; 2136 return ret;
2191 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); 2137 ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
2192 if (ret != X86EMUL_CONTINUE) 2138 if (ret != X86EMUL_CONTINUE)
2193 return ret; 2139 return ret;
2194 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); 2140 ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
2195 if (ret != X86EMUL_CONTINUE) 2141 if (ret != X86EMUL_CONTINUE)
2196 return ret; 2142 return ret;
2197 2143
@@ -2199,10 +2145,10 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2199} 2145}
2200 2146
2201static int task_switch_16(struct x86_emulate_ctxt *ctxt, 2147static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2202 struct x86_emulate_ops *ops,
2203 u16 tss_selector, u16 old_tss_sel, 2148 u16 tss_selector, u16 old_tss_sel,
2204 ulong old_tss_base, struct desc_struct *new_desc) 2149 ulong old_tss_base, struct desc_struct *new_desc)
2205{ 2150{
2151 struct x86_emulate_ops *ops = ctxt->ops;
2206 struct tss_segment_16 tss_seg; 2152 struct tss_segment_16 tss_seg;
2207 int ret; 2153 int ret;
2208 u32 new_tss_base = get_desc_base(new_desc); 2154 u32 new_tss_base = get_desc_base(new_desc);
@@ -2213,7 +2159,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2213 /* FIXME: need to provide precise fault address */ 2159 /* FIXME: need to provide precise fault address */
2214 return ret; 2160 return ret;
2215 2161
2216 save_state_to_tss16(ctxt, ops, &tss_seg); 2162 save_state_to_tss16(ctxt, &tss_seg);
2217 2163
2218 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, 2164 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2219 &ctxt->exception); 2165 &ctxt->exception);
@@ -2239,26 +2185,23 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2239 return ret; 2185 return ret;
2240 } 2186 }
2241 2187
2242 return load_state_from_tss16(ctxt, ops, &tss_seg); 2188 return load_state_from_tss16(ctxt, &tss_seg);
2243} 2189}
2244 2190
2245static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, 2191static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2246 struct x86_emulate_ops *ops,
2247 struct tss_segment_32 *tss) 2192 struct tss_segment_32 *tss)
2248{ 2193{
2249 struct decode_cache *c = &ctxt->decode; 2194 tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
2250 2195 tss->eip = ctxt->_eip;
2251 tss->cr3 = ops->get_cr(ctxt, 3);
2252 tss->eip = c->eip;
2253 tss->eflags = ctxt->eflags; 2196 tss->eflags = ctxt->eflags;
2254 tss->eax = c->regs[VCPU_REGS_RAX]; 2197 tss->eax = ctxt->regs[VCPU_REGS_RAX];
2255 tss->ecx = c->regs[VCPU_REGS_RCX]; 2198 tss->ecx = ctxt->regs[VCPU_REGS_RCX];
2256 tss->edx = c->regs[VCPU_REGS_RDX]; 2199 tss->edx = ctxt->regs[VCPU_REGS_RDX];
2257 tss->ebx = c->regs[VCPU_REGS_RBX]; 2200 tss->ebx = ctxt->regs[VCPU_REGS_RBX];
2258 tss->esp = c->regs[VCPU_REGS_RSP]; 2201 tss->esp = ctxt->regs[VCPU_REGS_RSP];
2259 tss->ebp = c->regs[VCPU_REGS_RBP]; 2202 tss->ebp = ctxt->regs[VCPU_REGS_RBP];
2260 tss->esi = c->regs[VCPU_REGS_RSI]; 2203 tss->esi = ctxt->regs[VCPU_REGS_RSI];
2261 tss->edi = c->regs[VCPU_REGS_RDI]; 2204 tss->edi = ctxt->regs[VCPU_REGS_RDI];
2262 2205
2263 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); 2206 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2264 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2207 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2270,24 +2213,22 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2270} 2213}
2271 2214
2272static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, 2215static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2273 struct x86_emulate_ops *ops,
2274 struct tss_segment_32 *tss) 2216 struct tss_segment_32 *tss)
2275{ 2217{
2276 struct decode_cache *c = &ctxt->decode;
2277 int ret; 2218 int ret;
2278 2219
2279 if (ops->set_cr(ctxt, 3, tss->cr3)) 2220 if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
2280 return emulate_gp(ctxt, 0); 2221 return emulate_gp(ctxt, 0);
2281 c->eip = tss->eip; 2222 ctxt->_eip = tss->eip;
2282 ctxt->eflags = tss->eflags | 2; 2223 ctxt->eflags = tss->eflags | 2;
2283 c->regs[VCPU_REGS_RAX] = tss->eax; 2224 ctxt->regs[VCPU_REGS_RAX] = tss->eax;
2284 c->regs[VCPU_REGS_RCX] = tss->ecx; 2225 ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
2285 c->regs[VCPU_REGS_RDX] = tss->edx; 2226 ctxt->regs[VCPU_REGS_RDX] = tss->edx;
2286 c->regs[VCPU_REGS_RBX] = tss->ebx; 2227 ctxt->regs[VCPU_REGS_RBX] = tss->ebx;
2287 c->regs[VCPU_REGS_RSP] = tss->esp; 2228 ctxt->regs[VCPU_REGS_RSP] = tss->esp;
2288 c->regs[VCPU_REGS_RBP] = tss->ebp; 2229 ctxt->regs[VCPU_REGS_RBP] = tss->ebp;
2289 c->regs[VCPU_REGS_RSI] = tss->esi; 2230 ctxt->regs[VCPU_REGS_RSI] = tss->esi;
2290 c->regs[VCPU_REGS_RDI] = tss->edi; 2231 ctxt->regs[VCPU_REGS_RDI] = tss->edi;
2291 2232
2292 /* 2233 /*
2293 * SDM says that segment selectors are loaded before segment 2234 * SDM says that segment selectors are loaded before segment
@@ -2305,25 +2246,25 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2305 * Now load segment descriptors. If fault happens at this stage 2246 * Now load segment descriptors. If fault happens at this stage
2306 * it is handled in the context of the new task 2247 * it is handled in the context of the new task
2307 */ 2248 */
2308 ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR); 2249 ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
2309 if (ret != X86EMUL_CONTINUE) 2250 if (ret != X86EMUL_CONTINUE)
2310 return ret; 2251 return ret;
2311 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); 2252 ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
2312 if (ret != X86EMUL_CONTINUE) 2253 if (ret != X86EMUL_CONTINUE)
2313 return ret; 2254 return ret;
2314 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); 2255 ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
2315 if (ret != X86EMUL_CONTINUE) 2256 if (ret != X86EMUL_CONTINUE)
2316 return ret; 2257 return ret;
2317 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); 2258 ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
2318 if (ret != X86EMUL_CONTINUE) 2259 if (ret != X86EMUL_CONTINUE)
2319 return ret; 2260 return ret;
2320 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); 2261 ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
2321 if (ret != X86EMUL_CONTINUE) 2262 if (ret != X86EMUL_CONTINUE)
2322 return ret; 2263 return ret;
2323 ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS); 2264 ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS);
2324 if (ret != X86EMUL_CONTINUE) 2265 if (ret != X86EMUL_CONTINUE)
2325 return ret; 2266 return ret;
2326 ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS); 2267 ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS);
2327 if (ret != X86EMUL_CONTINUE) 2268 if (ret != X86EMUL_CONTINUE)
2328 return ret; 2269 return ret;
2329 2270
@@ -2331,10 +2272,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2331} 2272}
2332 2273
2333static int task_switch_32(struct x86_emulate_ctxt *ctxt, 2274static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2334 struct x86_emulate_ops *ops,
2335 u16 tss_selector, u16 old_tss_sel, 2275 u16 tss_selector, u16 old_tss_sel,
2336 ulong old_tss_base, struct desc_struct *new_desc) 2276 ulong old_tss_base, struct desc_struct *new_desc)
2337{ 2277{
2278 struct x86_emulate_ops *ops = ctxt->ops;
2338 struct tss_segment_32 tss_seg; 2279 struct tss_segment_32 tss_seg;
2339 int ret; 2280 int ret;
2340 u32 new_tss_base = get_desc_base(new_desc); 2281 u32 new_tss_base = get_desc_base(new_desc);
@@ -2345,7 +2286,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2345 /* FIXME: need to provide precise fault address */ 2286 /* FIXME: need to provide precise fault address */
2346 return ret; 2287 return ret;
2347 2288
2348 save_state_to_tss32(ctxt, ops, &tss_seg); 2289 save_state_to_tss32(ctxt, &tss_seg);
2349 2290
2350 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, 2291 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2351 &ctxt->exception); 2292 &ctxt->exception);
@@ -2371,14 +2312,14 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2371 return ret; 2312 return ret;
2372 } 2313 }
2373 2314
2374 return load_state_from_tss32(ctxt, ops, &tss_seg); 2315 return load_state_from_tss32(ctxt, &tss_seg);
2375} 2316}
2376 2317
2377static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, 2318static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2378 struct x86_emulate_ops *ops,
2379 u16 tss_selector, int reason, 2319 u16 tss_selector, int reason,
2380 bool has_error_code, u32 error_code) 2320 bool has_error_code, u32 error_code)
2381{ 2321{
2322 struct x86_emulate_ops *ops = ctxt->ops;
2382 struct desc_struct curr_tss_desc, next_tss_desc; 2323 struct desc_struct curr_tss_desc, next_tss_desc;
2383 int ret; 2324 int ret;
2384 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); 2325 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
@@ -2388,10 +2329,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2388 2329
2389 /* FIXME: old_tss_base == ~0 ? */ 2330 /* FIXME: old_tss_base == ~0 ? */
2390 2331
2391 ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc); 2332 ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
2392 if (ret != X86EMUL_CONTINUE) 2333 if (ret != X86EMUL_CONTINUE)
2393 return ret; 2334 return ret;
2394 ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc); 2335 ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
2395 if (ret != X86EMUL_CONTINUE) 2336 if (ret != X86EMUL_CONTINUE)
2396 return ret; 2337 return ret;
2397 2338
@@ -2413,8 +2354,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2413 2354
2414 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 2355 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
2415 curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ 2356 curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
2416 write_segment_descriptor(ctxt, ops, old_tss_sel, 2357 write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
2417 &curr_tss_desc);
2418 } 2358 }
2419 2359
2420 if (reason == TASK_SWITCH_IRET) 2360 if (reason == TASK_SWITCH_IRET)
@@ -2426,10 +2366,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2426 old_tss_sel = 0xffff; 2366 old_tss_sel = 0xffff;
2427 2367
2428 if (next_tss_desc.type & 8) 2368 if (next_tss_desc.type & 8)
2429 ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel, 2369 ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
2430 old_tss_base, &next_tss_desc); 2370 old_tss_base, &next_tss_desc);
2431 else 2371 else
2432 ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel, 2372 ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
2433 old_tss_base, &next_tss_desc); 2373 old_tss_base, &next_tss_desc);
2434 if (ret != X86EMUL_CONTINUE) 2374 if (ret != X86EMUL_CONTINUE)
2435 return ret; 2375 return ret;
@@ -2439,19 +2379,16 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2439 2379
2440 if (reason != TASK_SWITCH_IRET) { 2380 if (reason != TASK_SWITCH_IRET) {
2441 next_tss_desc.type |= (1 << 1); /* set busy flag */ 2381 next_tss_desc.type |= (1 << 1); /* set busy flag */
2442 write_segment_descriptor(ctxt, ops, tss_selector, 2382 write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
2443 &next_tss_desc);
2444 } 2383 }
2445 2384
2446 ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); 2385 ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
2447 ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); 2386 ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
2448 2387
2449 if (has_error_code) { 2388 if (has_error_code) {
2450 struct decode_cache *c = &ctxt->decode; 2389 ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2451 2390 ctxt->lock_prefix = 0;
2452 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; 2391 ctxt->src.val = (unsigned long) error_code;
2453 c->lock_prefix = 0;
2454 c->src.val = (unsigned long) error_code;
2455 ret = em_push(ctxt); 2392 ret = em_push(ctxt);
2456 } 2393 }
2457 2394
@@ -2462,18 +2399,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2462 u16 tss_selector, int reason, 2399 u16 tss_selector, int reason,
2463 bool has_error_code, u32 error_code) 2400 bool has_error_code, u32 error_code)
2464{ 2401{
2465 struct x86_emulate_ops *ops = ctxt->ops;
2466 struct decode_cache *c = &ctxt->decode;
2467 int rc; 2402 int rc;
2468 2403
2469 c->eip = ctxt->eip; 2404 ctxt->_eip = ctxt->eip;
2470 c->dst.type = OP_NONE; 2405 ctxt->dst.type = OP_NONE;
2471 2406
2472 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, 2407 rc = emulator_do_task_switch(ctxt, tss_selector, reason,
2473 has_error_code, error_code); 2408 has_error_code, error_code);
2474 2409
2475 if (rc == X86EMUL_CONTINUE) 2410 if (rc == X86EMUL_CONTINUE)
2476 ctxt->eip = c->eip; 2411 ctxt->eip = ctxt->_eip;
2477 2412
2478 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 2413 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
2479} 2414}
@@ -2481,22 +2416,20 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2481static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, 2416static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
2482 int reg, struct operand *op) 2417 int reg, struct operand *op)
2483{ 2418{
2484 struct decode_cache *c = &ctxt->decode;
2485 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2419 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2486 2420
2487 register_address_increment(c, &c->regs[reg], df * op->bytes); 2421 register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
2488 op->addr.mem.ea = register_address(c, c->regs[reg]); 2422 op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
2489 op->addr.mem.seg = seg; 2423 op->addr.mem.seg = seg;
2490} 2424}
2491 2425
2492static int em_das(struct x86_emulate_ctxt *ctxt) 2426static int em_das(struct x86_emulate_ctxt *ctxt)
2493{ 2427{
2494 struct decode_cache *c = &ctxt->decode;
2495 u8 al, old_al; 2428 u8 al, old_al;
2496 bool af, cf, old_cf; 2429 bool af, cf, old_cf;
2497 2430
2498 cf = ctxt->eflags & X86_EFLAGS_CF; 2431 cf = ctxt->eflags & X86_EFLAGS_CF;
2499 al = c->dst.val; 2432 al = ctxt->dst.val;
2500 2433
2501 old_al = al; 2434 old_al = al;
2502 old_cf = cf; 2435 old_cf = cf;
@@ -2514,12 +2447,12 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2514 cf = true; 2447 cf = true;
2515 } 2448 }
2516 2449
2517 c->dst.val = al; 2450 ctxt->dst.val = al;
2518 /* Set PF, ZF, SF */ 2451 /* Set PF, ZF, SF */
2519 c->src.type = OP_IMM; 2452 ctxt->src.type = OP_IMM;
2520 c->src.val = 0; 2453 ctxt->src.val = 0;
2521 c->src.bytes = 1; 2454 ctxt->src.bytes = 1;
2522 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 2455 emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
2523 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); 2456 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
2524 if (cf) 2457 if (cf)
2525 ctxt->eflags |= X86_EFLAGS_CF; 2458 ctxt->eflags |= X86_EFLAGS_CF;
@@ -2530,175 +2463,189 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2530 2463
2531static int em_call_far(struct x86_emulate_ctxt *ctxt) 2464static int em_call_far(struct x86_emulate_ctxt *ctxt)
2532{ 2465{
2533 struct decode_cache *c = &ctxt->decode;
2534 u16 sel, old_cs; 2466 u16 sel, old_cs;
2535 ulong old_eip; 2467 ulong old_eip;
2536 int rc; 2468 int rc;
2537 2469
2538 old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2470 old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
2539 old_eip = c->eip; 2471 old_eip = ctxt->_eip;
2540 2472
2541 memcpy(&sel, c->src.valptr + c->op_bytes, 2); 2473 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
2542 if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) 2474 if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS))
2543 return X86EMUL_CONTINUE; 2475 return X86EMUL_CONTINUE;
2544 2476
2545 c->eip = 0; 2477 ctxt->_eip = 0;
2546 memcpy(&c->eip, c->src.valptr, c->op_bytes); 2478 memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
2547 2479
2548 c->src.val = old_cs; 2480 ctxt->src.val = old_cs;
2549 rc = em_push(ctxt); 2481 rc = em_push(ctxt);
2550 if (rc != X86EMUL_CONTINUE) 2482 if (rc != X86EMUL_CONTINUE)
2551 return rc; 2483 return rc;
2552 2484
2553 c->src.val = old_eip; 2485 ctxt->src.val = old_eip;
2554 return em_push(ctxt); 2486 return em_push(ctxt);
2555} 2487}
2556 2488
2557static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) 2489static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2558{ 2490{
2559 struct decode_cache *c = &ctxt->decode;
2560 int rc; 2491 int rc;
2561 2492
2562 c->dst.type = OP_REG; 2493 ctxt->dst.type = OP_REG;
2563 c->dst.addr.reg = &c->eip; 2494 ctxt->dst.addr.reg = &ctxt->_eip;
2564 c->dst.bytes = c->op_bytes; 2495 ctxt->dst.bytes = ctxt->op_bytes;
2565 rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes); 2496 rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
2566 if (rc != X86EMUL_CONTINUE) 2497 if (rc != X86EMUL_CONTINUE)
2567 return rc; 2498 return rc;
2568 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); 2499 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val);
2569 return X86EMUL_CONTINUE; 2500 return X86EMUL_CONTINUE;
2570} 2501}
2571 2502
2572static int em_add(struct x86_emulate_ctxt *ctxt) 2503static int em_add(struct x86_emulate_ctxt *ctxt)
2573{ 2504{
2574 struct decode_cache *c = &ctxt->decode; 2505 emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
2575
2576 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2577 return X86EMUL_CONTINUE; 2506 return X86EMUL_CONTINUE;
2578} 2507}
2579 2508
2580static int em_or(struct x86_emulate_ctxt *ctxt) 2509static int em_or(struct x86_emulate_ctxt *ctxt)
2581{ 2510{
2582 struct decode_cache *c = &ctxt->decode; 2511 emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
2583
2584 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2585 return X86EMUL_CONTINUE; 2512 return X86EMUL_CONTINUE;
2586} 2513}
2587 2514
2588static int em_adc(struct x86_emulate_ctxt *ctxt) 2515static int em_adc(struct x86_emulate_ctxt *ctxt)
2589{ 2516{
2590 struct decode_cache *c = &ctxt->decode; 2517 emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags);
2591
2592 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2593 return X86EMUL_CONTINUE; 2518 return X86EMUL_CONTINUE;
2594} 2519}
2595 2520
2596static int em_sbb(struct x86_emulate_ctxt *ctxt) 2521static int em_sbb(struct x86_emulate_ctxt *ctxt)
2597{ 2522{
2598 struct decode_cache *c = &ctxt->decode; 2523 emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags);
2599
2600 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2601 return X86EMUL_CONTINUE; 2524 return X86EMUL_CONTINUE;
2602} 2525}
2603 2526
2604static int em_and(struct x86_emulate_ctxt *ctxt) 2527static int em_and(struct x86_emulate_ctxt *ctxt)
2605{ 2528{
2606 struct decode_cache *c = &ctxt->decode; 2529 emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags);
2607
2608 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
2609 return X86EMUL_CONTINUE; 2530 return X86EMUL_CONTINUE;
2610} 2531}
2611 2532
2612static int em_sub(struct x86_emulate_ctxt *ctxt) 2533static int em_sub(struct x86_emulate_ctxt *ctxt)
2613{ 2534{
2614 struct decode_cache *c = &ctxt->decode; 2535 emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags);
2615
2616 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
2617 return X86EMUL_CONTINUE; 2536 return X86EMUL_CONTINUE;
2618} 2537}
2619 2538
2620static int em_xor(struct x86_emulate_ctxt *ctxt) 2539static int em_xor(struct x86_emulate_ctxt *ctxt)
2621{ 2540{
2622 struct decode_cache *c = &ctxt->decode; 2541 emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags);
2623
2624 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
2625 return X86EMUL_CONTINUE; 2542 return X86EMUL_CONTINUE;
2626} 2543}
2627 2544
2628static int em_cmp(struct x86_emulate_ctxt *ctxt) 2545static int em_cmp(struct x86_emulate_ctxt *ctxt)
2629{ 2546{
2630 struct decode_cache *c = &ctxt->decode; 2547 emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
2631
2632 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2633 /* Disable writeback. */ 2548 /* Disable writeback. */
2634 c->dst.type = OP_NONE; 2549 ctxt->dst.type = OP_NONE;
2635 return X86EMUL_CONTINUE; 2550 return X86EMUL_CONTINUE;
2636} 2551}
2637 2552
2638static int em_imul(struct x86_emulate_ctxt *ctxt) 2553static int em_test(struct x86_emulate_ctxt *ctxt)
2554{
2555 emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
2556 return X86EMUL_CONTINUE;
2557}
2558
2559static int em_xchg(struct x86_emulate_ctxt *ctxt)
2639{ 2560{
2640 struct decode_cache *c = &ctxt->decode; 2561 /* Write back the register source. */
2562 ctxt->src.val = ctxt->dst.val;
2563 write_register_operand(&ctxt->src);
2641 2564
2642 emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); 2565 /* Write back the memory destination with implicit LOCK prefix. */
2566 ctxt->dst.val = ctxt->src.orig_val;
2567 ctxt->lock_prefix = 1;
2643 return X86EMUL_CONTINUE; 2568 return X86EMUL_CONTINUE;
2644} 2569}
2645 2570
2646static int em_imul_3op(struct x86_emulate_ctxt *ctxt) 2571static int em_imul(struct x86_emulate_ctxt *ctxt)
2647{ 2572{
2648 struct decode_cache *c = &ctxt->decode; 2573 emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags);
2574 return X86EMUL_CONTINUE;
2575}
2649 2576
2650 c->dst.val = c->src2.val; 2577static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
2578{
2579 ctxt->dst.val = ctxt->src2.val;
2651 return em_imul(ctxt); 2580 return em_imul(ctxt);
2652} 2581}
2653 2582
2654static int em_cwd(struct x86_emulate_ctxt *ctxt) 2583static int em_cwd(struct x86_emulate_ctxt *ctxt)
2655{ 2584{
2656 struct decode_cache *c = &ctxt->decode; 2585 ctxt->dst.type = OP_REG;
2657 2586 ctxt->dst.bytes = ctxt->src.bytes;
2658 c->dst.type = OP_REG; 2587 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
2659 c->dst.bytes = c->src.bytes; 2588 ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
2660 c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
2661 c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
2662 2589
2663 return X86EMUL_CONTINUE; 2590 return X86EMUL_CONTINUE;
2664} 2591}
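
The expression in em_cwd() above is a branch-free sign broadcast: shift the operand's top bit down to bit 0, subtract 1, and invert, which gives all-ones when the source is negative and zero otherwise, exactly what CWD/CDQ must place in DX/EDX. A small standalone check of that expression (plain C, independent of the emulator; the casts mimic the truncation to operand size that happens on write-back):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same shape as the em_cwd() expression: broadcast the sign bit of an n-byte value. */
static unsigned long sign_broadcast(unsigned long val, unsigned int bytes)
{
        return ~((val >> (bytes * 8 - 1)) - 1);
}

int main(void)
{
        /* CWD: AX = 0x8000 (negative) -> DX = 0xffff; AX = 0x7fff -> DX = 0. */
        assert((uint16_t)sign_broadcast(0x8000, 2) == 0xffff);
        assert((uint16_t)sign_broadcast(0x7fff, 2) == 0x0000);

        /* CDQ: EAX = 0x80000000 -> EDX = 0xffffffff. */
        assert((uint32_t)sign_broadcast(0x80000000UL, 4) == 0xffffffffUL);

        printf("sign-broadcast expression behaves as expected\n");
        return 0;
}
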
2665 2592
2666static int em_rdtsc(struct x86_emulate_ctxt *ctxt) 2593static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2667{ 2594{
2668 struct decode_cache *c = &ctxt->decode;
2669 u64 tsc = 0; 2595 u64 tsc = 0;
2670 2596
2671 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); 2597 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
2672 c->regs[VCPU_REGS_RAX] = (u32)tsc; 2598 ctxt->regs[VCPU_REGS_RAX] = (u32)tsc;
2673 c->regs[VCPU_REGS_RDX] = tsc >> 32; 2599 ctxt->regs[VCPU_REGS_RDX] = tsc >> 32;
2674 return X86EMUL_CONTINUE; 2600 return X86EMUL_CONTINUE;
2675} 2601}
2676 2602
2677static int em_mov(struct x86_emulate_ctxt *ctxt) 2603static int em_mov(struct x86_emulate_ctxt *ctxt)
2678{ 2604{
2679 struct decode_cache *c = &ctxt->decode; 2605 ctxt->dst.val = ctxt->src.val;
2680 c->dst.val = c->src.val;
2681 return X86EMUL_CONTINUE; 2606 return X86EMUL_CONTINUE;
2682} 2607}
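
em_mov() shows the mechanical rewrite running through all of these handler hunks in its smallest form: the per-instruction helpers no longer pull out the embedded decode cache via struct decode_cache *c = &ctxt->decode but read the same fields directly from struct x86_emulate_ctxt, and the in-flight instruction pointer becomes the private ctxt->_eip, committed to ctxt->eip only on success as in emulator_task_switch() above. A reduced before/after sketch using stand-in types rather than the real kernel definitions:

/* Stand-ins so the sketch compiles on its own; field names follow the diff. */
#define X86EMUL_CONTINUE 0

struct operand { unsigned long val; };

struct x86_emulate_ctxt {
        struct operand src, dst;
        unsigned long _eip;     /* decode/execute-time RIP, copied to ctxt->eip on success */
};

/* Before: every handler first fetched the embedded decode cache.
 *
 *      static int em_mov(struct x86_emulate_ctxt *ctxt)
 *      {
 *              struct decode_cache *c = &ctxt->decode;
 *              c->dst.val = c->src.val;
 *              return X86EMUL_CONTINUE;
 *      }
 */

/* After: the decode state lives directly in the context. */
static int em_mov(struct x86_emulate_ctxt *ctxt)
{
        ctxt->dst.val = ctxt->src.val;
        return X86EMUL_CONTINUE;
}

int main(void)
{
        struct x86_emulate_ctxt ctxt = { .src = { .val = 42 } };

        em_mov(&ctxt);
        return ctxt.dst.val == 42 ? 0 : 1;
}
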
2683 2608
2609static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
2610{
2611 if (ctxt->modrm_reg > VCPU_SREG_GS)
2612 return emulate_ud(ctxt);
2613
2614 ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg);
2615 return X86EMUL_CONTINUE;
2616}
2617
2618static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
2619{
2620 u16 sel = ctxt->src.val;
2621
2622 if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS)
2623 return emulate_ud(ctxt);
2624
2625 if (ctxt->modrm_reg == VCPU_SREG_SS)
2626 ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
2627
2628 /* Disable writeback. */
2629 ctxt->dst.type = OP_NONE;
2630 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
2631}
2632
2684static int em_movdqu(struct x86_emulate_ctxt *ctxt) 2633static int em_movdqu(struct x86_emulate_ctxt *ctxt)
2685{ 2634{
2686 struct decode_cache *c = &ctxt->decode; 2635 memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
2687 memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes);
2688 return X86EMUL_CONTINUE; 2636 return X86EMUL_CONTINUE;
2689} 2637}
2690 2638
2691static int em_invlpg(struct x86_emulate_ctxt *ctxt) 2639static int em_invlpg(struct x86_emulate_ctxt *ctxt)
2692{ 2640{
2693 struct decode_cache *c = &ctxt->decode;
2694 int rc; 2641 int rc;
2695 ulong linear; 2642 ulong linear;
2696 2643
2697 rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear); 2644 rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear);
2698 if (rc == X86EMUL_CONTINUE) 2645 if (rc == X86EMUL_CONTINUE)
2699 ctxt->ops->invlpg(ctxt, linear); 2646 ctxt->ops->invlpg(ctxt, linear);
2700 /* Disable writeback. */ 2647 /* Disable writeback. */
2701 c->dst.type = OP_NONE; 2648 ctxt->dst.type = OP_NONE;
2702 return X86EMUL_CONTINUE; 2649 return X86EMUL_CONTINUE;
2703} 2650}
2704 2651
@@ -2714,10 +2661,9 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
2714 2661
2715static int em_vmcall(struct x86_emulate_ctxt *ctxt) 2662static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2716{ 2663{
2717 struct decode_cache *c = &ctxt->decode;
2718 int rc; 2664 int rc;
2719 2665
2720 if (c->modrm_mod != 3 || c->modrm_rm != 1) 2666 if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1)
2721 return X86EMUL_UNHANDLEABLE; 2667 return X86EMUL_UNHANDLEABLE;
2722 2668
2723 rc = ctxt->ops->fix_hypercall(ctxt); 2669 rc = ctxt->ops->fix_hypercall(ctxt);
@@ -2725,73 +2671,104 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2725 return rc; 2671 return rc;
2726 2672
2727 /* Let the processor re-execute the fixed hypercall */ 2673 /* Let the processor re-execute the fixed hypercall */
2728 c->eip = ctxt->eip; 2674 ctxt->_eip = ctxt->eip;
2729 /* Disable writeback. */ 2675 /* Disable writeback. */
2730 c->dst.type = OP_NONE; 2676 ctxt->dst.type = OP_NONE;
2731 return X86EMUL_CONTINUE; 2677 return X86EMUL_CONTINUE;
2732} 2678}
2733 2679
2734static int em_lgdt(struct x86_emulate_ctxt *ctxt) 2680static int em_lgdt(struct x86_emulate_ctxt *ctxt)
2735{ 2681{
2736 struct decode_cache *c = &ctxt->decode;
2737 struct desc_ptr desc_ptr; 2682 struct desc_ptr desc_ptr;
2738 int rc; 2683 int rc;
2739 2684
2740 rc = read_descriptor(ctxt, c->src.addr.mem, 2685 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
2741 &desc_ptr.size, &desc_ptr.address, 2686 &desc_ptr.size, &desc_ptr.address,
2742 c->op_bytes); 2687 ctxt->op_bytes);
2743 if (rc != X86EMUL_CONTINUE) 2688 if (rc != X86EMUL_CONTINUE)
2744 return rc; 2689 return rc;
2745 ctxt->ops->set_gdt(ctxt, &desc_ptr); 2690 ctxt->ops->set_gdt(ctxt, &desc_ptr);
2746 /* Disable writeback. */ 2691 /* Disable writeback. */
2747 c->dst.type = OP_NONE; 2692 ctxt->dst.type = OP_NONE;
2748 return X86EMUL_CONTINUE; 2693 return X86EMUL_CONTINUE;
2749} 2694}
2750 2695
2751static int em_vmmcall(struct x86_emulate_ctxt *ctxt) 2696static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
2752{ 2697{
2753 struct decode_cache *c = &ctxt->decode;
2754 int rc; 2698 int rc;
2755 2699
2756 rc = ctxt->ops->fix_hypercall(ctxt); 2700 rc = ctxt->ops->fix_hypercall(ctxt);
2757 2701
2758 /* Disable writeback. */ 2702 /* Disable writeback. */
2759 c->dst.type = OP_NONE; 2703 ctxt->dst.type = OP_NONE;
2760 return rc; 2704 return rc;
2761} 2705}
2762 2706
2763static int em_lidt(struct x86_emulate_ctxt *ctxt) 2707static int em_lidt(struct x86_emulate_ctxt *ctxt)
2764{ 2708{
2765 struct decode_cache *c = &ctxt->decode;
2766 struct desc_ptr desc_ptr; 2709 struct desc_ptr desc_ptr;
2767 int rc; 2710 int rc;
2768 2711
2769 rc = read_descriptor(ctxt, c->src.addr.mem, 2712 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
2770 &desc_ptr.size, &desc_ptr.address, 2713 &desc_ptr.size, &desc_ptr.address,
2771 c->op_bytes); 2714 ctxt->op_bytes);
2772 if (rc != X86EMUL_CONTINUE) 2715 if (rc != X86EMUL_CONTINUE)
2773 return rc; 2716 return rc;
2774 ctxt->ops->set_idt(ctxt, &desc_ptr); 2717 ctxt->ops->set_idt(ctxt, &desc_ptr);
2775 /* Disable writeback. */ 2718 /* Disable writeback. */
2776 c->dst.type = OP_NONE; 2719 ctxt->dst.type = OP_NONE;
2777 return X86EMUL_CONTINUE; 2720 return X86EMUL_CONTINUE;
2778} 2721}
2779 2722
2780static int em_smsw(struct x86_emulate_ctxt *ctxt) 2723static int em_smsw(struct x86_emulate_ctxt *ctxt)
2781{ 2724{
2782 struct decode_cache *c = &ctxt->decode; 2725 ctxt->dst.bytes = 2;
2783 2726 ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
2784 c->dst.bytes = 2;
2785 c->dst.val = ctxt->ops->get_cr(ctxt, 0);
2786 return X86EMUL_CONTINUE; 2727 return X86EMUL_CONTINUE;
2787} 2728}
2788 2729
2789static int em_lmsw(struct x86_emulate_ctxt *ctxt) 2730static int em_lmsw(struct x86_emulate_ctxt *ctxt)
2790{ 2731{
2791 struct decode_cache *c = &ctxt->decode;
2792 ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) 2732 ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
2793 | (c->src.val & 0x0f)); 2733 | (ctxt->src.val & 0x0f));
2794 c->dst.type = OP_NONE; 2734 ctxt->dst.type = OP_NONE;
2735 return X86EMUL_CONTINUE;
2736}
2737
2738static int em_loop(struct x86_emulate_ctxt *ctxt)
2739{
2740 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
2741 if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) &&
2742 (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
2743 jmp_rel(ctxt, ctxt->src.val);
2744
2745 return X86EMUL_CONTINUE;
2746}
2747
2748static int em_jcxz(struct x86_emulate_ctxt *ctxt)
2749{
2750 if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0)
2751 jmp_rel(ctxt, ctxt->src.val);
2752
2753 return X86EMUL_CONTINUE;
2754}
2755
2756static int em_cli(struct x86_emulate_ctxt *ctxt)
2757{
2758 if (emulator_bad_iopl(ctxt))
2759 return emulate_gp(ctxt, 0);
2760
2761 ctxt->eflags &= ~X86_EFLAGS_IF;
2762 return X86EMUL_CONTINUE;
2763}
2764
2765static int em_sti(struct x86_emulate_ctxt *ctxt)
2766{
2767 if (emulator_bad_iopl(ctxt))
2768 return emulate_gp(ctxt, 0);
2769
2770 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
2771 ctxt->eflags |= X86_EFLAGS_IF;
2795 return X86EMUL_CONTINUE; 2772 return X86EMUL_CONTINUE;
2796} 2773}
2797 2774
@@ -2809,9 +2786,7 @@ static bool valid_cr(int nr)
2809 2786
2810static int check_cr_read(struct x86_emulate_ctxt *ctxt) 2787static int check_cr_read(struct x86_emulate_ctxt *ctxt)
2811{ 2788{
2812 struct decode_cache *c = &ctxt->decode; 2789 if (!valid_cr(ctxt->modrm_reg))
2813
2814 if (!valid_cr(c->modrm_reg))
2815 return emulate_ud(ctxt); 2790 return emulate_ud(ctxt);
2816 2791
2817 return X86EMUL_CONTINUE; 2792 return X86EMUL_CONTINUE;
@@ -2819,9 +2794,8 @@ static int check_cr_read(struct x86_emulate_ctxt *ctxt)
2819 2794
2820static int check_cr_write(struct x86_emulate_ctxt *ctxt) 2795static int check_cr_write(struct x86_emulate_ctxt *ctxt)
2821{ 2796{
2822 struct decode_cache *c = &ctxt->decode; 2797 u64 new_val = ctxt->src.val64;
2823 u64 new_val = c->src.val64; 2798 int cr = ctxt->modrm_reg;
2824 int cr = c->modrm_reg;
2825 u64 efer = 0; 2799 u64 efer = 0;
2826 2800
2827 static u64 cr_reserved_bits[] = { 2801 static u64 cr_reserved_bits[] = {
@@ -2898,8 +2872,7 @@ static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
2898 2872
2899static int check_dr_read(struct x86_emulate_ctxt *ctxt) 2873static int check_dr_read(struct x86_emulate_ctxt *ctxt)
2900{ 2874{
2901 struct decode_cache *c = &ctxt->decode; 2875 int dr = ctxt->modrm_reg;
2902 int dr = c->modrm_reg;
2903 u64 cr4; 2876 u64 cr4;
2904 2877
2905 if (dr > 7) 2878 if (dr > 7)
@@ -2917,9 +2890,8 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
2917 2890
2918static int check_dr_write(struct x86_emulate_ctxt *ctxt) 2891static int check_dr_write(struct x86_emulate_ctxt *ctxt)
2919{ 2892{
2920 struct decode_cache *c = &ctxt->decode; 2893 u64 new_val = ctxt->src.val64;
2921 u64 new_val = c->src.val64; 2894 int dr = ctxt->modrm_reg;
2922 int dr = c->modrm_reg;
2923 2895
2924 if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) 2896 if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
2925 return emulate_gp(ctxt, 0); 2897 return emulate_gp(ctxt, 0);
@@ -2941,7 +2913,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt)
2941 2913
2942static int check_svme_pa(struct x86_emulate_ctxt *ctxt) 2914static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
2943{ 2915{
2944 u64 rax = ctxt->decode.regs[VCPU_REGS_RAX]; 2916 u64 rax = ctxt->regs[VCPU_REGS_RAX];
2945 2917
2946 /* Valid physical address? */ 2918 /* Valid physical address? */
2947 if (rax & 0xffff000000000000ULL) 2919 if (rax & 0xffff000000000000ULL)
@@ -2963,7 +2935,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
2963static int check_rdpmc(struct x86_emulate_ctxt *ctxt) 2935static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
2964{ 2936{
2965 u64 cr4 = ctxt->ops->get_cr(ctxt, 4); 2937 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
2966 u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX]; 2938 u64 rcx = ctxt->regs[VCPU_REGS_RCX];
2967 2939
2968 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || 2940 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
2969 (rcx > 3)) 2941 (rcx > 3))
@@ -2974,10 +2946,8 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
2974 2946
2975static int check_perm_in(struct x86_emulate_ctxt *ctxt) 2947static int check_perm_in(struct x86_emulate_ctxt *ctxt)
2976{ 2948{
2977 struct decode_cache *c = &ctxt->decode; 2949 ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
2978 2950 if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes))
2979 c->dst.bytes = min(c->dst.bytes, 4u);
2980 if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes))
2981 return emulate_gp(ctxt, 0); 2951 return emulate_gp(ctxt, 0);
2982 2952
2983 return X86EMUL_CONTINUE; 2953 return X86EMUL_CONTINUE;
@@ -2985,10 +2955,8 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt)
2985 2955
2986static int check_perm_out(struct x86_emulate_ctxt *ctxt) 2956static int check_perm_out(struct x86_emulate_ctxt *ctxt)
2987{ 2957{
2988 struct decode_cache *c = &ctxt->decode; 2958 ctxt->src.bytes = min(ctxt->src.bytes, 4u);
2989 2959 if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes))
2990 c->src.bytes = min(c->src.bytes, 4u);
2991 if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes))
2992 return emulate_gp(ctxt, 0); 2960 return emulate_gp(ctxt, 0);
2993 2961
2994 return X86EMUL_CONTINUE; 2962 return X86EMUL_CONTINUE;
@@ -3165,12 +3133,15 @@ static struct opcode opcode_table[256] = {
3165 G(DstMem | SrcImm | ModRM | Group, group1), 3133 G(DstMem | SrcImm | ModRM | Group, group1),
3166 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), 3134 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
3167 G(DstMem | SrcImmByte | ModRM | Group, group1), 3135 G(DstMem | SrcImmByte | ModRM | Group, group1),
3168 D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), 3136 I2bv(DstMem | SrcReg | ModRM, em_test),
3137 I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg),
3169 /* 0x88 - 0x8F */ 3138 /* 0x88 - 0x8F */
3170 I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), 3139 I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
3171 I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), 3140 I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
3172 D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), 3141 I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg),
3173 D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), 3142 D(ModRM | SrcMem | NoAccess | DstReg),
3143 I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
3144 G(0, group1A),
3174 /* 0x90 - 0x97 */ 3145 /* 0x90 - 0x97 */
3175 DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), 3146 DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
3176 /* 0x98 - 0x9F */ 3147 /* 0x98 - 0x9F */
@@ -3184,7 +3155,7 @@ static struct opcode opcode_table[256] = {
3184 I2bv(SrcSI | DstDI | Mov | String, em_mov), 3155 I2bv(SrcSI | DstDI | Mov | String, em_mov),
3185 I2bv(SrcSI | DstDI | String, em_cmp), 3156 I2bv(SrcSI | DstDI | String, em_cmp),
3186 /* 0xA8 - 0xAF */ 3157 /* 0xA8 - 0xAF */
3187 D2bv(DstAcc | SrcImm), 3158 I2bv(DstAcc | SrcImm, em_test),
3188 I2bv(SrcAcc | DstDI | Mov | String, em_mov), 3159 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
3189 I2bv(SrcSI | DstAcc | Mov | String, em_mov), 3160 I2bv(SrcSI | DstAcc | Mov | String, em_mov),
3190 I2bv(SrcAcc | DstDI | String, em_cmp), 3161 I2bv(SrcAcc | DstDI | String, em_cmp),
@@ -3195,25 +3166,26 @@ static struct opcode opcode_table[256] = {
3195 /* 0xC0 - 0xC7 */ 3166 /* 0xC0 - 0xC7 */
3196 D2bv(DstMem | SrcImmByte | ModRM), 3167 D2bv(DstMem | SrcImmByte | ModRM),
3197 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), 3168 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
3198 D(ImplicitOps | Stack), 3169 I(ImplicitOps | Stack, em_ret),
3199 D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), 3170 D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
3200 G(ByteOp, group11), G(0, group11), 3171 G(ByteOp, group11), G(0, group11),
3201 /* 0xC8 - 0xCF */ 3172 /* 0xC8 - 0xCF */
3202 N, N, N, D(ImplicitOps | Stack), 3173 N, N, N, I(ImplicitOps | Stack, em_ret_far),
3203 D(ImplicitOps), DI(SrcImmByte, intn), 3174 D(ImplicitOps), DI(SrcImmByte, intn),
3204 D(ImplicitOps | No64), DI(ImplicitOps, iret), 3175 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
3205 /* 0xD0 - 0xD7 */ 3176 /* 0xD0 - 0xD7 */
3206 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), 3177 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
3207 N, N, N, N, 3178 N, N, N, N,
3208 /* 0xD8 - 0xDF */ 3179 /* 0xD8 - 0xDF */
3209 N, N, N, N, N, N, N, N, 3180 N, N, N, N, N, N, N, N,
3210 /* 0xE0 - 0xE7 */ 3181 /* 0xE0 - 0xE7 */
3211 X4(D(SrcImmByte)), 3182 X3(I(SrcImmByte, em_loop)),
3183 I(SrcImmByte, em_jcxz),
3212 D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), 3184 D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in),
3213 D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), 3185 D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
3214 /* 0xE8 - 0xEF */ 3186 /* 0xE8 - 0xEF */
3215 D(SrcImm | Stack), D(SrcImm | ImplicitOps), 3187 D(SrcImm | Stack), D(SrcImm | ImplicitOps),
3216 D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), 3188 I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
3217 D2bvIP(SrcDX | DstAcc, in, check_perm_in), 3189 D2bvIP(SrcDX | DstAcc, in, check_perm_in),
3218 D2bvIP(SrcAcc | DstDX, out, check_perm_out), 3190 D2bvIP(SrcAcc | DstDX, out, check_perm_out),
3219 /* 0xF0 - 0xF7 */ 3191 /* 0xF0 - 0xF7 */
@@ -3221,14 +3193,16 @@ static struct opcode opcode_table[256] = {
3221 DI(ImplicitOps | Priv, hlt), D(ImplicitOps), 3193 DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
3222 G(ByteOp, group3), G(0, group3), 3194 G(ByteOp, group3), G(0, group3),
3223 /* 0xF8 - 0xFF */ 3195 /* 0xF8 - 0xFF */
3224 D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), 3196 D(ImplicitOps), D(ImplicitOps),
3197 I(ImplicitOps, em_cli), I(ImplicitOps, em_sti),
3225 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), 3198 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
3226}; 3199};
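
The one-byte table hunks above swap bare D() descriptors for I() entries that attach an em_* callback (the II()/IIP() variants also name an intercept or permission check), so x86_emulate_insn() further down can dispatch through ctxt->execute instead of growing its opcode switch. A heavily reduced standalone sketch of that table-driven dispatch, with hypothetical macro bodies and flags; the real macros also encode operand decoding, groups and SSE prefix handling:

#include <stdio.h>

#define X86EMUL_CONTINUE 0

struct x86_emulate_ctxt;                        /* opaque in this sketch */
typedef int (*em_handler)(struct x86_emulate_ctxt *);

struct opcode {
        unsigned long flags;                    /* decode flags (ByteOp, ModRM, Stack, ...) */
        em_handler execute;                     /* em_* callback, NULL if handled elsewhere */
};

/* Simplified stand-ins for the real table macros. */
#define D(flags)        { (flags), NULL }       /* decode only, run by the big switch */
#define I(flags, h)     { (flags), (h) }        /* decode plus execute callback */
#define N               { 0, NULL }             /* undefined opcode */

static int em_test(struct x86_emulate_ctxt *ctxt) { (void)ctxt; return X86EMUL_CONTINUE; }

static const struct opcode opcode_table_sketch[256] = {
        [0x84] = I(0, em_test),                 /* TEST r/m8, r8 -> em_test, per the hunk above */
        [0x85] = I(0, em_test),                 /* TEST r/m,  r  -> em_test */
};

int main(void)
{
        const struct opcode *op = &opcode_table_sketch[0x84];

        /* After decode, ctxt->execute is taken from the table and called if set. */
        if (op->execute)
                op->execute(NULL);
        return 0;
}
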
3227 3200
3228static struct opcode twobyte_table[256] = { 3201static struct opcode twobyte_table[256] = {
3229 /* 0x00 - 0x0F */ 3202 /* 0x00 - 0x0F */
3230 G(0, group6), GD(0, &group7), N, N, 3203 G(0, group6), GD(0, &group7), N, N,
3231 N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N, 3204 N, I(ImplicitOps | VendorSpecific, em_syscall),
3205 II(ImplicitOps | Priv, em_clts, clts), N,
3232 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, 3206 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
3233 N, D(ImplicitOps | ModRM), N, N, 3207 N, D(ImplicitOps | ModRM), N, N,
3234 /* 0x10 - 0x1F */ 3208 /* 0x10 - 0x1F */
@@ -3245,7 +3219,8 @@ static struct opcode twobyte_table[256] = {
3245 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), 3219 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
3246 DI(ImplicitOps | Priv, rdmsr), 3220 DI(ImplicitOps | Priv, rdmsr),
3247 DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), 3221 DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
3248 D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), 3222 I(ImplicitOps | VendorSpecific, em_sysenter),
3223 I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
3249 N, N, 3224 N, N,
3250 N, N, N, N, N, N, N, N, 3225 N, N, N, N, N, N, N, N,
3251 /* 0x40 - 0x4F */ 3226 /* 0x40 - 0x4F */
@@ -3313,11 +3288,11 @@ static struct opcode twobyte_table[256] = {
3313#undef I2bv 3288#undef I2bv
3314#undef I6ALU 3289#undef I6ALU
3315 3290
3316static unsigned imm_size(struct decode_cache *c) 3291static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
3317{ 3292{
3318 unsigned size; 3293 unsigned size;
3319 3294
3320 size = (c->d & ByteOp) ? 1 : c->op_bytes; 3295 size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3321 if (size == 8) 3296 if (size == 8)
3322 size = 4; 3297 size = 4;
3323 return size; 3298 return size;
@@ -3326,23 +3301,21 @@ static unsigned imm_size(struct decode_cache *c)
3326static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, 3301static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
3327 unsigned size, bool sign_extension) 3302 unsigned size, bool sign_extension)
3328{ 3303{
3329 struct decode_cache *c = &ctxt->decode;
3330 struct x86_emulate_ops *ops = ctxt->ops;
3331 int rc = X86EMUL_CONTINUE; 3304 int rc = X86EMUL_CONTINUE;
3332 3305
3333 op->type = OP_IMM; 3306 op->type = OP_IMM;
3334 op->bytes = size; 3307 op->bytes = size;
3335 op->addr.mem.ea = c->eip; 3308 op->addr.mem.ea = ctxt->_eip;
3336 /* NB. Immediates are sign-extended as necessary. */ 3309 /* NB. Immediates are sign-extended as necessary. */
3337 switch (op->bytes) { 3310 switch (op->bytes) {
3338 case 1: 3311 case 1:
3339 op->val = insn_fetch(s8, 1, c->eip); 3312 op->val = insn_fetch(s8, 1, ctxt->_eip);
3340 break; 3313 break;
3341 case 2: 3314 case 2:
3342 op->val = insn_fetch(s16, 2, c->eip); 3315 op->val = insn_fetch(s16, 2, ctxt->_eip);
3343 break; 3316 break;
3344 case 4: 3317 case 4:
3345 op->val = insn_fetch(s32, 4, c->eip); 3318 op->val = insn_fetch(s32, 4, ctxt->_eip);
3346 break; 3319 break;
3347 } 3320 }
3348 if (!sign_extension) { 3321 if (!sign_extension) {
@@ -3362,11 +3335,8 @@ done:
3362 return rc; 3335 return rc;
3363} 3336}
3364 3337
3365int 3338int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3366x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3367{ 3339{
3368 struct x86_emulate_ops *ops = ctxt->ops;
3369 struct decode_cache *c = &ctxt->decode;
3370 int rc = X86EMUL_CONTINUE; 3340 int rc = X86EMUL_CONTINUE;
3371 int mode = ctxt->mode; 3341 int mode = ctxt->mode;
3372 int def_op_bytes, def_ad_bytes, goffset, simd_prefix; 3342 int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
@@ -3374,11 +3344,11 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3374 struct opcode opcode; 3344 struct opcode opcode;
3375 struct operand memop = { .type = OP_NONE }, *memopp = NULL; 3345 struct operand memop = { .type = OP_NONE }, *memopp = NULL;
3376 3346
3377 c->eip = ctxt->eip; 3347 ctxt->_eip = ctxt->eip;
3378 c->fetch.start = c->eip; 3348 ctxt->fetch.start = ctxt->_eip;
3379 c->fetch.end = c->fetch.start + insn_len; 3349 ctxt->fetch.end = ctxt->fetch.start + insn_len;
3380 if (insn_len > 0) 3350 if (insn_len > 0)
3381 memcpy(c->fetch.data, insn, insn_len); 3351 memcpy(ctxt->fetch.data, insn, insn_len);
3382 3352
3383 switch (mode) { 3353 switch (mode) {
3384 case X86EMUL_MODE_REAL: 3354 case X86EMUL_MODE_REAL:
@@ -3399,46 +3369,46 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3399 return -1; 3369 return -1;
3400 } 3370 }
3401 3371
3402 c->op_bytes = def_op_bytes; 3372 ctxt->op_bytes = def_op_bytes;
3403 c->ad_bytes = def_ad_bytes; 3373 ctxt->ad_bytes = def_ad_bytes;
3404 3374
3405 /* Legacy prefixes. */ 3375 /* Legacy prefixes. */
3406 for (;;) { 3376 for (;;) {
3407 switch (c->b = insn_fetch(u8, 1, c->eip)) { 3377 switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) {
3408 case 0x66: /* operand-size override */ 3378 case 0x66: /* operand-size override */
3409 op_prefix = true; 3379 op_prefix = true;
3410 /* switch between 2/4 bytes */ 3380 /* switch between 2/4 bytes */
3411 c->op_bytes = def_op_bytes ^ 6; 3381 ctxt->op_bytes = def_op_bytes ^ 6;
3412 break; 3382 break;
3413 case 0x67: /* address-size override */ 3383 case 0x67: /* address-size override */
3414 if (mode == X86EMUL_MODE_PROT64) 3384 if (mode == X86EMUL_MODE_PROT64)
3415 /* switch between 4/8 bytes */ 3385 /* switch between 4/8 bytes */
3416 c->ad_bytes = def_ad_bytes ^ 12; 3386 ctxt->ad_bytes = def_ad_bytes ^ 12;
3417 else 3387 else
3418 /* switch between 2/4 bytes */ 3388 /* switch between 2/4 bytes */
3419 c->ad_bytes = def_ad_bytes ^ 6; 3389 ctxt->ad_bytes = def_ad_bytes ^ 6;
3420 break; 3390 break;
3421 case 0x26: /* ES override */ 3391 case 0x26: /* ES override */
3422 case 0x2e: /* CS override */ 3392 case 0x2e: /* CS override */
3423 case 0x36: /* SS override */ 3393 case 0x36: /* SS override */
3424 case 0x3e: /* DS override */ 3394 case 0x3e: /* DS override */
3425 set_seg_override(c, (c->b >> 3) & 3); 3395 set_seg_override(ctxt, (ctxt->b >> 3) & 3);
3426 break; 3396 break;
3427 case 0x64: /* FS override */ 3397 case 0x64: /* FS override */
3428 case 0x65: /* GS override */ 3398 case 0x65: /* GS override */
3429 set_seg_override(c, c->b & 7); 3399 set_seg_override(ctxt, ctxt->b & 7);
3430 break; 3400 break;
3431 case 0x40 ... 0x4f: /* REX */ 3401 case 0x40 ... 0x4f: /* REX */
3432 if (mode != X86EMUL_MODE_PROT64) 3402 if (mode != X86EMUL_MODE_PROT64)
3433 goto done_prefixes; 3403 goto done_prefixes;
3434 c->rex_prefix = c->b; 3404 ctxt->rex_prefix = ctxt->b;
3435 continue; 3405 continue;
3436 case 0xf0: /* LOCK */ 3406 case 0xf0: /* LOCK */
3437 c->lock_prefix = 1; 3407 ctxt->lock_prefix = 1;
3438 break; 3408 break;
3439 case 0xf2: /* REPNE/REPNZ */ 3409 case 0xf2: /* REPNE/REPNZ */
3440 case 0xf3: /* REP/REPE/REPZ */ 3410 case 0xf3: /* REP/REPE/REPZ */
3441 c->rep_prefix = c->b; 3411 ctxt->rep_prefix = ctxt->b;
3442 break; 3412 break;
3443 default: 3413 default:
3444 goto done_prefixes; 3414 goto done_prefixes;
@@ -3446,50 +3416,50 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3446 3416
3447 /* Any legacy prefix after a REX prefix nullifies its effect. */ 3417 /* Any legacy prefix after a REX prefix nullifies its effect. */
3448 3418
3449 c->rex_prefix = 0; 3419 ctxt->rex_prefix = 0;
3450 } 3420 }
3451 3421
3452done_prefixes: 3422done_prefixes:
3453 3423
3454 /* REX prefix. */ 3424 /* REX prefix. */
3455 if (c->rex_prefix & 8) 3425 if (ctxt->rex_prefix & 8)
3456 c->op_bytes = 8; /* REX.W */ 3426 ctxt->op_bytes = 8; /* REX.W */
3457 3427
3458 /* Opcode byte(s). */ 3428 /* Opcode byte(s). */
3459 opcode = opcode_table[c->b]; 3429 opcode = opcode_table[ctxt->b];
3460 /* Two-byte opcode? */ 3430 /* Two-byte opcode? */
3461 if (c->b == 0x0f) { 3431 if (ctxt->b == 0x0f) {
3462 c->twobyte = 1; 3432 ctxt->twobyte = 1;
3463 c->b = insn_fetch(u8, 1, c->eip); 3433 ctxt->b = insn_fetch(u8, 1, ctxt->_eip);
3464 opcode = twobyte_table[c->b]; 3434 opcode = twobyte_table[ctxt->b];
3465 } 3435 }
3466 c->d = opcode.flags; 3436 ctxt->d = opcode.flags;
3467 3437
3468 while (c->d & GroupMask) { 3438 while (ctxt->d & GroupMask) {
3469 switch (c->d & GroupMask) { 3439 switch (ctxt->d & GroupMask) {
3470 case Group: 3440 case Group:
3471 c->modrm = insn_fetch(u8, 1, c->eip); 3441 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
3472 --c->eip; 3442 --ctxt->_eip;
3473 goffset = (c->modrm >> 3) & 7; 3443 goffset = (ctxt->modrm >> 3) & 7;
3474 opcode = opcode.u.group[goffset]; 3444 opcode = opcode.u.group[goffset];
3475 break; 3445 break;
3476 case GroupDual: 3446 case GroupDual:
3477 c->modrm = insn_fetch(u8, 1, c->eip); 3447 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
3478 --c->eip; 3448 --ctxt->_eip;
3479 goffset = (c->modrm >> 3) & 7; 3449 goffset = (ctxt->modrm >> 3) & 7;
3480 if ((c->modrm >> 6) == 3) 3450 if ((ctxt->modrm >> 6) == 3)
3481 opcode = opcode.u.gdual->mod3[goffset]; 3451 opcode = opcode.u.gdual->mod3[goffset];
3482 else 3452 else
3483 opcode = opcode.u.gdual->mod012[goffset]; 3453 opcode = opcode.u.gdual->mod012[goffset];
3484 break; 3454 break;
3485 case RMExt: 3455 case RMExt:
3486 goffset = c->modrm & 7; 3456 goffset = ctxt->modrm & 7;
3487 opcode = opcode.u.group[goffset]; 3457 opcode = opcode.u.group[goffset];
3488 break; 3458 break;
3489 case Prefix: 3459 case Prefix:
3490 if (c->rep_prefix && op_prefix) 3460 if (ctxt->rep_prefix && op_prefix)
3491 return X86EMUL_UNHANDLEABLE; 3461 return X86EMUL_UNHANDLEABLE;
3492 simd_prefix = op_prefix ? 0x66 : c->rep_prefix; 3462 simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
3493 switch (simd_prefix) { 3463 switch (simd_prefix) {
3494 case 0x00: opcode = opcode.u.gprefix->pfx_no; break; 3464 case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
3495 case 0x66: opcode = opcode.u.gprefix->pfx_66; break; 3465 case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
@@ -3501,61 +3471,61 @@ done_prefixes:
3501 return X86EMUL_UNHANDLEABLE; 3471 return X86EMUL_UNHANDLEABLE;
3502 } 3472 }
3503 3473
3504 c->d &= ~GroupMask; 3474 ctxt->d &= ~GroupMask;
3505 c->d |= opcode.flags; 3475 ctxt->d |= opcode.flags;
3506 } 3476 }
3507 3477
3508 c->execute = opcode.u.execute; 3478 ctxt->execute = opcode.u.execute;
3509 c->check_perm = opcode.check_perm; 3479 ctxt->check_perm = opcode.check_perm;
3510 c->intercept = opcode.intercept; 3480 ctxt->intercept = opcode.intercept;
3511 3481
3512 /* Unrecognised? */ 3482 /* Unrecognised? */
3513 if (c->d == 0 || (c->d & Undefined)) 3483 if (ctxt->d == 0 || (ctxt->d & Undefined))
3514 return -1; 3484 return -1;
3515 3485
3516 if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn) 3486 if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
3517 return -1; 3487 return -1;
3518 3488
3519 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 3489 if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
3520 c->op_bytes = 8; 3490 ctxt->op_bytes = 8;
3521 3491
3522 if (c->d & Op3264) { 3492 if (ctxt->d & Op3264) {
3523 if (mode == X86EMUL_MODE_PROT64) 3493 if (mode == X86EMUL_MODE_PROT64)
3524 c->op_bytes = 8; 3494 ctxt->op_bytes = 8;
3525 else 3495 else
3526 c->op_bytes = 4; 3496 ctxt->op_bytes = 4;
3527 } 3497 }
3528 3498
3529 if (c->d & Sse) 3499 if (ctxt->d & Sse)
3530 c->op_bytes = 16; 3500 ctxt->op_bytes = 16;
3531 3501
3532 /* ModRM and SIB bytes. */ 3502 /* ModRM and SIB bytes. */
3533 if (c->d & ModRM) { 3503 if (ctxt->d & ModRM) {
3534 rc = decode_modrm(ctxt, ops, &memop); 3504 rc = decode_modrm(ctxt, &memop);
3535 if (!c->has_seg_override) 3505 if (!ctxt->has_seg_override)
3536 set_seg_override(c, c->modrm_seg); 3506 set_seg_override(ctxt, ctxt->modrm_seg);
3537 } else if (c->d & MemAbs) 3507 } else if (ctxt->d & MemAbs)
3538 rc = decode_abs(ctxt, ops, &memop); 3508 rc = decode_abs(ctxt, &memop);
3539 if (rc != X86EMUL_CONTINUE) 3509 if (rc != X86EMUL_CONTINUE)
3540 goto done; 3510 goto done;
3541 3511
3542 if (!c->has_seg_override) 3512 if (!ctxt->has_seg_override)
3543 set_seg_override(c, VCPU_SREG_DS); 3513 set_seg_override(ctxt, VCPU_SREG_DS);
3544 3514
3545 memop.addr.mem.seg = seg_override(ctxt, c); 3515 memop.addr.mem.seg = seg_override(ctxt);
3546 3516
3547 if (memop.type == OP_MEM && c->ad_bytes != 8) 3517 if (memop.type == OP_MEM && ctxt->ad_bytes != 8)
3548 memop.addr.mem.ea = (u32)memop.addr.mem.ea; 3518 memop.addr.mem.ea = (u32)memop.addr.mem.ea;
3549 3519
3550 /* 3520 /*
3551 * Decode and fetch the source operand: register, memory 3521 * Decode and fetch the source operand: register, memory
3552 * or immediate. 3522 * or immediate.
3553 */ 3523 */
3554 switch (c->d & SrcMask) { 3524 switch (ctxt->d & SrcMask) {
3555 case SrcNone: 3525 case SrcNone:
3556 break; 3526 break;
3557 case SrcReg: 3527 case SrcReg:
3558 decode_register_operand(ctxt, &c->src, c, 0); 3528 decode_register_operand(ctxt, &ctxt->src, 0);
3559 break; 3529 break;
3560 case SrcMem16: 3530 case SrcMem16:
3561 memop.bytes = 2; 3531 memop.bytes = 2;
@@ -3564,60 +3534,60 @@ done_prefixes:
3564 memop.bytes = 4; 3534 memop.bytes = 4;
3565 goto srcmem_common; 3535 goto srcmem_common;
3566 case SrcMem: 3536 case SrcMem:
3567 memop.bytes = (c->d & ByteOp) ? 1 : 3537 memop.bytes = (ctxt->d & ByteOp) ? 1 :
3568 c->op_bytes; 3538 ctxt->op_bytes;
3569 srcmem_common: 3539 srcmem_common:
3570 c->src = memop; 3540 ctxt->src = memop;
3571 memopp = &c->src; 3541 memopp = &ctxt->src;
3572 break; 3542 break;
3573 case SrcImmU16: 3543 case SrcImmU16:
3574 rc = decode_imm(ctxt, &c->src, 2, false); 3544 rc = decode_imm(ctxt, &ctxt->src, 2, false);
3575 break; 3545 break;
3576 case SrcImm: 3546 case SrcImm:
3577 rc = decode_imm(ctxt, &c->src, imm_size(c), true); 3547 rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true);
3578 break; 3548 break;
3579 case SrcImmU: 3549 case SrcImmU:
3580 rc = decode_imm(ctxt, &c->src, imm_size(c), false); 3550 rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false);
3581 break; 3551 break;
3582 case SrcImmByte: 3552 case SrcImmByte:
3583 rc = decode_imm(ctxt, &c->src, 1, true); 3553 rc = decode_imm(ctxt, &ctxt->src, 1, true);
3584 break; 3554 break;
3585 case SrcImmUByte: 3555 case SrcImmUByte:
3586 rc = decode_imm(ctxt, &c->src, 1, false); 3556 rc = decode_imm(ctxt, &ctxt->src, 1, false);
3587 break; 3557 break;
3588 case SrcAcc: 3558 case SrcAcc:
3589 c->src.type = OP_REG; 3559 ctxt->src.type = OP_REG;
3590 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3560 ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3591 c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; 3561 ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
3592 fetch_register_operand(&c->src); 3562 fetch_register_operand(&ctxt->src);
3593 break; 3563 break;
3594 case SrcOne: 3564 case SrcOne:
3595 c->src.bytes = 1; 3565 ctxt->src.bytes = 1;
3596 c->src.val = 1; 3566 ctxt->src.val = 1;
3597 break; 3567 break;
3598 case SrcSI: 3568 case SrcSI:
3599 c->src.type = OP_MEM; 3569 ctxt->src.type = OP_MEM;
3600 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3570 ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3601 c->src.addr.mem.ea = 3571 ctxt->src.addr.mem.ea =
3602 register_address(c, c->regs[VCPU_REGS_RSI]); 3572 register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
3603 c->src.addr.mem.seg = seg_override(ctxt, c); 3573 ctxt->src.addr.mem.seg = seg_override(ctxt);
3604 c->src.val = 0; 3574 ctxt->src.val = 0;
3605 break; 3575 break;
3606 case SrcImmFAddr: 3576 case SrcImmFAddr:
3607 c->src.type = OP_IMM; 3577 ctxt->src.type = OP_IMM;
3608 c->src.addr.mem.ea = c->eip; 3578 ctxt->src.addr.mem.ea = ctxt->_eip;
3609 c->src.bytes = c->op_bytes + 2; 3579 ctxt->src.bytes = ctxt->op_bytes + 2;
3610 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); 3580 insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip);
3611 break; 3581 break;
3612 case SrcMemFAddr: 3582 case SrcMemFAddr:
3613 memop.bytes = c->op_bytes + 2; 3583 memop.bytes = ctxt->op_bytes + 2;
3614 goto srcmem_common; 3584 goto srcmem_common;
3615 break; 3585 break;
3616 case SrcDX: 3586 case SrcDX:
3617 c->src.type = OP_REG; 3587 ctxt->src.type = OP_REG;
3618 c->src.bytes = 2; 3588 ctxt->src.bytes = 2;
3619 c->src.addr.reg = &c->regs[VCPU_REGS_RDX]; 3589 ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
3620 fetch_register_operand(&c->src); 3590 fetch_register_operand(&ctxt->src);
3621 break; 3591 break;
3622 } 3592 }
3623 3593
@@ -3628,22 +3598,22 @@ done_prefixes:
3628 * Decode and fetch the second source operand: register, memory 3598 * Decode and fetch the second source operand: register, memory
3629 * or immediate. 3599 * or immediate.
3630 */ 3600 */
3631 switch (c->d & Src2Mask) { 3601 switch (ctxt->d & Src2Mask) {
3632 case Src2None: 3602 case Src2None:
3633 break; 3603 break;
3634 case Src2CL: 3604 case Src2CL:
3635 c->src2.bytes = 1; 3605 ctxt->src2.bytes = 1;
3636 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; 3606 ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0x8;
3637 break; 3607 break;
3638 case Src2ImmByte: 3608 case Src2ImmByte:
3639 rc = decode_imm(ctxt, &c->src2, 1, true); 3609 rc = decode_imm(ctxt, &ctxt->src2, 1, true);
3640 break; 3610 break;
3641 case Src2One: 3611 case Src2One:
3642 c->src2.bytes = 1; 3612 ctxt->src2.bytes = 1;
3643 c->src2.val = 1; 3613 ctxt->src2.val = 1;
3644 break; 3614 break;
3645 case Src2Imm: 3615 case Src2Imm:
3646 rc = decode_imm(ctxt, &c->src2, imm_size(c), true); 3616 rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true);
3647 break; 3617 break;
3648 } 3618 }
3649 3619
@@ -3651,68 +3621,66 @@ done_prefixes:
3651 goto done; 3621 goto done;
3652 3622
3653 /* Decode and fetch the destination operand: register or memory. */ 3623 /* Decode and fetch the destination operand: register or memory. */
3654 switch (c->d & DstMask) { 3624 switch (ctxt->d & DstMask) {
3655 case DstReg: 3625 case DstReg:
3656 decode_register_operand(ctxt, &c->dst, c, 3626 decode_register_operand(ctxt, &ctxt->dst,
3657 c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); 3627 ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
3658 break; 3628 break;
3659 case DstImmUByte: 3629 case DstImmUByte:
3660 c->dst.type = OP_IMM; 3630 ctxt->dst.type = OP_IMM;
3661 c->dst.addr.mem.ea = c->eip; 3631 ctxt->dst.addr.mem.ea = ctxt->_eip;
3662 c->dst.bytes = 1; 3632 ctxt->dst.bytes = 1;
3663 c->dst.val = insn_fetch(u8, 1, c->eip); 3633 ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip);
3664 break; 3634 break;
3665 case DstMem: 3635 case DstMem:
3666 case DstMem64: 3636 case DstMem64:
3667 c->dst = memop; 3637 ctxt->dst = memop;
3668 memopp = &c->dst; 3638 memopp = &ctxt->dst;
3669 if ((c->d & DstMask) == DstMem64) 3639 if ((ctxt->d & DstMask) == DstMem64)
3670 c->dst.bytes = 8; 3640 ctxt->dst.bytes = 8;
3671 else 3641 else
3672 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3642 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3673 if (c->d & BitOp) 3643 if (ctxt->d & BitOp)
3674 fetch_bit_operand(c); 3644 fetch_bit_operand(ctxt);
3675 c->dst.orig_val = c->dst.val; 3645 ctxt->dst.orig_val = ctxt->dst.val;
3676 break; 3646 break;
3677 case DstAcc: 3647 case DstAcc:
3678 c->dst.type = OP_REG; 3648 ctxt->dst.type = OP_REG;
3679 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3649 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3680 c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; 3650 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
3681 fetch_register_operand(&c->dst); 3651 fetch_register_operand(&ctxt->dst);
3682 c->dst.orig_val = c->dst.val; 3652 ctxt->dst.orig_val = ctxt->dst.val;
3683 break; 3653 break;
3684 case DstDI: 3654 case DstDI:
3685 c->dst.type = OP_MEM; 3655 ctxt->dst.type = OP_MEM;
3686 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3656 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3687 c->dst.addr.mem.ea = 3657 ctxt->dst.addr.mem.ea =
3688 register_address(c, c->regs[VCPU_REGS_RDI]); 3658 register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
3689 c->dst.addr.mem.seg = VCPU_SREG_ES; 3659 ctxt->dst.addr.mem.seg = VCPU_SREG_ES;
3690 c->dst.val = 0; 3660 ctxt->dst.val = 0;
3691 break; 3661 break;
3692 case DstDX: 3662 case DstDX:
3693 c->dst.type = OP_REG; 3663 ctxt->dst.type = OP_REG;
3694 c->dst.bytes = 2; 3664 ctxt->dst.bytes = 2;
3695 c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; 3665 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
3696 fetch_register_operand(&c->dst); 3666 fetch_register_operand(&ctxt->dst);
3697 break; 3667 break;
3698 case ImplicitOps: 3668 case ImplicitOps:
3699 /* Special instructions do their own operand decoding. */ 3669 /* Special instructions do their own operand decoding. */
3700 default: 3670 default:
3701 c->dst.type = OP_NONE; /* Disable writeback. */ 3671 ctxt->dst.type = OP_NONE; /* Disable writeback. */
3702 break; 3672 break;
3703 } 3673 }
3704 3674
3705done: 3675done:
3706 if (memopp && memopp->type == OP_MEM && c->rip_relative) 3676 if (memopp && memopp->type == OP_MEM && ctxt->rip_relative)
3707 memopp->addr.mem.ea += c->eip; 3677 memopp->addr.mem.ea += ctxt->_eip;
3708 3678
3709 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 3679 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3710} 3680}
3711 3681
3712static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) 3682static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
3713{ 3683{
3714 struct decode_cache *c = &ctxt->decode;
3715
3716 /* The second termination condition only applies for REPE 3684 /* The second termination condition only applies for REPE
3717 * and REPNE. Test if the repeat string operation prefix is 3685 * and REPNE. Test if the repeat string operation prefix is
3718 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the 3686 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
@@ -3720,304 +3688,232 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
3720 * - if REPE/REPZ and ZF = 0 then done 3688 * - if REPE/REPZ and ZF = 0 then done
3721 * - if REPNE/REPNZ and ZF = 1 then done 3689 * - if REPNE/REPNZ and ZF = 1 then done
3722 */ 3690 */
3723 if (((c->b == 0xa6) || (c->b == 0xa7) || 3691 if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
3724 (c->b == 0xae) || (c->b == 0xaf)) 3692 (ctxt->b == 0xae) || (ctxt->b == 0xaf))
3725 && (((c->rep_prefix == REPE_PREFIX) && 3693 && (((ctxt->rep_prefix == REPE_PREFIX) &&
3726 ((ctxt->eflags & EFLG_ZF) == 0)) 3694 ((ctxt->eflags & EFLG_ZF) == 0))
3727 || ((c->rep_prefix == REPNE_PREFIX) && 3695 || ((ctxt->rep_prefix == REPNE_PREFIX) &&
3728 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) 3696 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
3729 return true; 3697 return true;
3730 3698
3731 return false; 3699 return false;
3732} 3700}
3733 3701
3734int 3702int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3735x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3736{ 3703{
3737 struct x86_emulate_ops *ops = ctxt->ops; 3704 struct x86_emulate_ops *ops = ctxt->ops;
3738 u64 msr_data; 3705 u64 msr_data;
3739 struct decode_cache *c = &ctxt->decode;
3740 int rc = X86EMUL_CONTINUE; 3706 int rc = X86EMUL_CONTINUE;
3741 int saved_dst_type = c->dst.type; 3707 int saved_dst_type = ctxt->dst.type;
3742 int irq; /* Used for int 3, int, and into */
3743 3708
3744 ctxt->decode.mem_read.pos = 0; 3709 ctxt->mem_read.pos = 0;
3745 3710
3746 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 3711 if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) {
3747 rc = emulate_ud(ctxt); 3712 rc = emulate_ud(ctxt);
3748 goto done; 3713 goto done;
3749 } 3714 }
3750 3715
3751 /* LOCK prefix is allowed only with some instructions */ 3716 /* LOCK prefix is allowed only with some instructions */
3752 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 3717 if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
3753 rc = emulate_ud(ctxt); 3718 rc = emulate_ud(ctxt);
3754 goto done; 3719 goto done;
3755 } 3720 }
3756 3721
3757 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { 3722 if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) {
3758 rc = emulate_ud(ctxt); 3723 rc = emulate_ud(ctxt);
3759 goto done; 3724 goto done;
3760 } 3725 }
3761 3726
3762 if ((c->d & Sse) 3727 if ((ctxt->d & Sse)
3763 && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) 3728 && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
3764 || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { 3729 || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
3765 rc = emulate_ud(ctxt); 3730 rc = emulate_ud(ctxt);
3766 goto done; 3731 goto done;
3767 } 3732 }
3768 3733
3769 if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { 3734 if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
3770 rc = emulate_nm(ctxt); 3735 rc = emulate_nm(ctxt);
3771 goto done; 3736 goto done;
3772 } 3737 }
3773 3738
3774 if (unlikely(ctxt->guest_mode) && c->intercept) { 3739 if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
3775 rc = emulator_check_intercept(ctxt, c->intercept, 3740 rc = emulator_check_intercept(ctxt, ctxt->intercept,
3776 X86_ICPT_PRE_EXCEPT); 3741 X86_ICPT_PRE_EXCEPT);
3777 if (rc != X86EMUL_CONTINUE) 3742 if (rc != X86EMUL_CONTINUE)
3778 goto done; 3743 goto done;
3779 } 3744 }
3780 3745
3781 /* Privileged instruction can be executed only in CPL=0 */ 3746 /* Privileged instruction can be executed only in CPL=0 */
3782 if ((c->d & Priv) && ops->cpl(ctxt)) { 3747 if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
3783 rc = emulate_gp(ctxt, 0); 3748 rc = emulate_gp(ctxt, 0);
3784 goto done; 3749 goto done;
3785 } 3750 }
3786 3751
3787 /* Instruction can only be executed in protected mode */ 3752 /* Instruction can only be executed in protected mode */
3788 if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { 3753 if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
3789 rc = emulate_ud(ctxt); 3754 rc = emulate_ud(ctxt);
3790 goto done; 3755 goto done;
3791 } 3756 }
3792 3757
3793 /* Do instruction specific permission checks */ 3758 /* Do instruction specific permission checks */
3794 if (c->check_perm) { 3759 if (ctxt->check_perm) {
3795 rc = c->check_perm(ctxt); 3760 rc = ctxt->check_perm(ctxt);
3796 if (rc != X86EMUL_CONTINUE) 3761 if (rc != X86EMUL_CONTINUE)
3797 goto done; 3762 goto done;
3798 } 3763 }
3799 3764
3800 if (unlikely(ctxt->guest_mode) && c->intercept) { 3765 if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
3801 rc = emulator_check_intercept(ctxt, c->intercept, 3766 rc = emulator_check_intercept(ctxt, ctxt->intercept,
3802 X86_ICPT_POST_EXCEPT); 3767 X86_ICPT_POST_EXCEPT);
3803 if (rc != X86EMUL_CONTINUE) 3768 if (rc != X86EMUL_CONTINUE)
3804 goto done; 3769 goto done;
3805 } 3770 }
3806 3771
3807 if (c->rep_prefix && (c->d & String)) { 3772 if (ctxt->rep_prefix && (ctxt->d & String)) {
3808 /* All REP prefixes have the same first termination condition */ 3773 /* All REP prefixes have the same first termination condition */
3809 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 3774 if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) {
3810 ctxt->eip = c->eip; 3775 ctxt->eip = ctxt->_eip;
3811 goto done; 3776 goto done;
3812 } 3777 }
3813 } 3778 }
3814 3779
3815 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { 3780 if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) {
3816 rc = segmented_read(ctxt, c->src.addr.mem, 3781 rc = segmented_read(ctxt, ctxt->src.addr.mem,
3817 c->src.valptr, c->src.bytes); 3782 ctxt->src.valptr, ctxt->src.bytes);
3818 if (rc != X86EMUL_CONTINUE) 3783 if (rc != X86EMUL_CONTINUE)
3819 goto done; 3784 goto done;
3820 c->src.orig_val64 = c->src.val64; 3785 ctxt->src.orig_val64 = ctxt->src.val64;
3821 } 3786 }
3822 3787
3823 if (c->src2.type == OP_MEM) { 3788 if (ctxt->src2.type == OP_MEM) {
3824 rc = segmented_read(ctxt, c->src2.addr.mem, 3789 rc = segmented_read(ctxt, ctxt->src2.addr.mem,
3825 &c->src2.val, c->src2.bytes); 3790 &ctxt->src2.val, ctxt->src2.bytes);
3826 if (rc != X86EMUL_CONTINUE) 3791 if (rc != X86EMUL_CONTINUE)
3827 goto done; 3792 goto done;
3828 } 3793 }
3829 3794
3830 if ((c->d & DstMask) == ImplicitOps) 3795 if ((ctxt->d & DstMask) == ImplicitOps)
3831 goto special_insn; 3796 goto special_insn;
3832 3797
3833 3798
3834 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3799 if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) {
3835 /* optimisation - avoid slow emulated read if Mov */ 3800 /* optimisation - avoid slow emulated read if Mov */
3836 rc = segmented_read(ctxt, c->dst.addr.mem, 3801 rc = segmented_read(ctxt, ctxt->dst.addr.mem,
3837 &c->dst.val, c->dst.bytes); 3802 &ctxt->dst.val, ctxt->dst.bytes);
3838 if (rc != X86EMUL_CONTINUE) 3803 if (rc != X86EMUL_CONTINUE)
3839 goto done; 3804 goto done;
3840 } 3805 }
3841 c->dst.orig_val = c->dst.val; 3806 ctxt->dst.orig_val = ctxt->dst.val;
3842 3807
3843special_insn: 3808special_insn:
3844 3809
3845 if (unlikely(ctxt->guest_mode) && c->intercept) { 3810 if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
3846 rc = emulator_check_intercept(ctxt, c->intercept, 3811 rc = emulator_check_intercept(ctxt, ctxt->intercept,
3847 X86_ICPT_POST_MEMACCESS); 3812 X86_ICPT_POST_MEMACCESS);
3848 if (rc != X86EMUL_CONTINUE) 3813 if (rc != X86EMUL_CONTINUE)
3849 goto done; 3814 goto done;
3850 } 3815 }
3851 3816
3852 if (c->execute) { 3817 if (ctxt->execute) {
3853 rc = c->execute(ctxt); 3818 rc = ctxt->execute(ctxt);
3854 if (rc != X86EMUL_CONTINUE) 3819 if (rc != X86EMUL_CONTINUE)
3855 goto done; 3820 goto done;
3856 goto writeback; 3821 goto writeback;
3857 } 3822 }
3858 3823
3859 if (c->twobyte) 3824 if (ctxt->twobyte)
3860 goto twobyte_insn; 3825 goto twobyte_insn;
3861 3826
3862 switch (c->b) { 3827 switch (ctxt->b) {
3863 case 0x06: /* push es */ 3828 case 0x06: /* push es */
3864 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); 3829 rc = emulate_push_sreg(ctxt, VCPU_SREG_ES);
3865 break; 3830 break;
3866 case 0x07: /* pop es */ 3831 case 0x07: /* pop es */
3867 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 3832 rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES);
3868 break; 3833 break;
3869 case 0x0e: /* push cs */ 3834 case 0x0e: /* push cs */
3870 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); 3835 rc = emulate_push_sreg(ctxt, VCPU_SREG_CS);
3871 break; 3836 break;
3872 case 0x16: /* push ss */ 3837 case 0x16: /* push ss */
3873 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); 3838 rc = emulate_push_sreg(ctxt, VCPU_SREG_SS);
3874 break; 3839 break;
3875 case 0x17: /* pop ss */ 3840 case 0x17: /* pop ss */
3876 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 3841 rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS);
3877 break; 3842 break;
3878 case 0x1e: /* push ds */ 3843 case 0x1e: /* push ds */
3879 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); 3844 rc = emulate_push_sreg(ctxt, VCPU_SREG_DS);
3880 break; 3845 break;
3881 case 0x1f: /* pop ds */ 3846 case 0x1f: /* pop ds */
3882 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 3847 rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS);
3883 break; 3848 break;
3884 case 0x40 ... 0x47: /* inc r16/r32 */ 3849 case 0x40 ... 0x47: /* inc r16/r32 */
3885 emulate_1op("inc", c->dst, ctxt->eflags); 3850 emulate_1op("inc", ctxt->dst, ctxt->eflags);
3886 break; 3851 break;
3887 case 0x48 ... 0x4f: /* dec r16/r32 */ 3852 case 0x48 ... 0x4f: /* dec r16/r32 */
3888 emulate_1op("dec", c->dst, ctxt->eflags); 3853 emulate_1op("dec", ctxt->dst, ctxt->eflags);
3889 break; 3854 break;
3890 case 0x63: /* movsxd */ 3855 case 0x63: /* movsxd */
3891 if (ctxt->mode != X86EMUL_MODE_PROT64) 3856 if (ctxt->mode != X86EMUL_MODE_PROT64)
3892 goto cannot_emulate; 3857 goto cannot_emulate;
3893 c->dst.val = (s32) c->src.val; 3858 ctxt->dst.val = (s32) ctxt->src.val;
3894 break; 3859 break;
3895 case 0x6c: /* insb */ 3860 case 0x6c: /* insb */
3896 case 0x6d: /* insw/insd */ 3861 case 0x6d: /* insw/insd */
3897 c->src.val = c->regs[VCPU_REGS_RDX]; 3862 ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
3898 goto do_io_in; 3863 goto do_io_in;
3899 case 0x6e: /* outsb */ 3864 case 0x6e: /* outsb */
3900 case 0x6f: /* outsw/outsd */ 3865 case 0x6f: /* outsw/outsd */
3901 c->dst.val = c->regs[VCPU_REGS_RDX]; 3866 ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
3902 goto do_io_out; 3867 goto do_io_out;
3903 break; 3868 break;
3904 case 0x70 ... 0x7f: /* jcc (short) */ 3869 case 0x70 ... 0x7f: /* jcc (short) */
3905 if (test_cc(c->b, ctxt->eflags)) 3870 if (test_cc(ctxt->b, ctxt->eflags))
3906 jmp_rel(c, c->src.val); 3871 jmp_rel(ctxt, ctxt->src.val);
3907 break;
3908 case 0x84 ... 0x85:
3909 test:
3910 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
3911 break;
3912 case 0x86 ... 0x87: /* xchg */
3913 xchg:
3914 /* Write back the register source. */
3915 c->src.val = c->dst.val;
3916 write_register_operand(&c->src);
3917 /*
3918 * Write back the memory destination with implicit LOCK
3919 * prefix.
3920 */
3921 c->dst.val = c->src.orig_val;
3922 c->lock_prefix = 1;
3923 break;
3924 case 0x8c: /* mov r/m, sreg */
3925 if (c->modrm_reg > VCPU_SREG_GS) {
3926 rc = emulate_ud(ctxt);
3927 goto done;
3928 }
3929 c->dst.val = get_segment_selector(ctxt, c->modrm_reg);
3930 break; 3872 break;
3931 case 0x8d: /* lea r16/r32, m */ 3873 case 0x8d: /* lea r16/r32, m */
3932 c->dst.val = c->src.addr.mem.ea; 3874 ctxt->dst.val = ctxt->src.addr.mem.ea;
3933 break; 3875 break;
3934 case 0x8e: { /* mov seg, r/m16 */
3935 uint16_t sel;
3936
3937 sel = c->src.val;
3938
3939 if (c->modrm_reg == VCPU_SREG_CS ||
3940 c->modrm_reg > VCPU_SREG_GS) {
3941 rc = emulate_ud(ctxt);
3942 goto done;
3943 }
3944
3945 if (c->modrm_reg == VCPU_SREG_SS)
3946 ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
3947
3948 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
3949
3950 c->dst.type = OP_NONE; /* Disable writeback. */
3951 break;
3952 }
3953 case 0x8f: /* pop (sole member of Grp1a) */ 3876 case 0x8f: /* pop (sole member of Grp1a) */
3954 rc = em_grp1a(ctxt); 3877 rc = em_grp1a(ctxt);
3955 break; 3878 break;
3956 case 0x90 ... 0x97: /* nop / xchg reg, rax */ 3879 case 0x90 ... 0x97: /* nop / xchg reg, rax */
3957 if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) 3880 if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
3958 break; 3881 break;
3959 goto xchg; 3882 rc = em_xchg(ctxt);
3883 break;
3960 case 0x98: /* cbw/cwde/cdqe */ 3884 case 0x98: /* cbw/cwde/cdqe */
3961 switch (c->op_bytes) { 3885 switch (ctxt->op_bytes) {
3962 case 2: c->dst.val = (s8)c->dst.val; break; 3886 case 2: ctxt->dst.val = (s8)ctxt->dst.val; break;
3963 case 4: c->dst.val = (s16)c->dst.val; break; 3887 case 4: ctxt->dst.val = (s16)ctxt->dst.val; break;
3964 case 8: c->dst.val = (s32)c->dst.val; break; 3888 case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
3965 } 3889 }
3966 break; 3890 break;
3967 case 0xa8 ... 0xa9: /* test ax, imm */
3968 goto test;
3969 case 0xc0 ... 0xc1: 3891 case 0xc0 ... 0xc1:
3970 rc = em_grp2(ctxt); 3892 rc = em_grp2(ctxt);
3971 break; 3893 break;
3972 case 0xc3: /* ret */
3973 c->dst.type = OP_REG;
3974 c->dst.addr.reg = &c->eip;
3975 c->dst.bytes = c->op_bytes;
3976 rc = em_pop(ctxt);
3977 break;
3978 case 0xc4: /* les */ 3894 case 0xc4: /* les */
3979 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); 3895 rc = emulate_load_segment(ctxt, VCPU_SREG_ES);
3980 break; 3896 break;
3981 case 0xc5: /* lds */ 3897 case 0xc5: /* lds */
3982 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); 3898 rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
3983 break;
3984 case 0xcb: /* ret far */
3985 rc = emulate_ret_far(ctxt, ops);
3986 break; 3899 break;
3987 case 0xcc: /* int3 */ 3900 case 0xcc: /* int3 */
3988 irq = 3; 3901 rc = emulate_int(ctxt, 3);
3989 goto do_interrupt; 3902 break;
3990 case 0xcd: /* int n */ 3903 case 0xcd: /* int n */
3991 irq = c->src.val; 3904 rc = emulate_int(ctxt, ctxt->src.val);
3992 do_interrupt:
3993 rc = emulate_int(ctxt, ops, irq);
3994 break; 3905 break;
3995 case 0xce: /* into */ 3906 case 0xce: /* into */
3996 if (ctxt->eflags & EFLG_OF) { 3907 if (ctxt->eflags & EFLG_OF)
3997 irq = 4; 3908 rc = emulate_int(ctxt, 4);
3998 goto do_interrupt;
3999 }
4000 break;
4001 case 0xcf: /* iret */
4002 rc = emulate_iret(ctxt, ops);
4003 break; 3909 break;
4004 case 0xd0 ... 0xd1: /* Grp2 */ 3910 case 0xd0 ... 0xd1: /* Grp2 */
4005 rc = em_grp2(ctxt); 3911 rc = em_grp2(ctxt);
4006 break; 3912 break;
4007 case 0xd2 ... 0xd3: /* Grp2 */ 3913 case 0xd2 ... 0xd3: /* Grp2 */
4008 c->src.val = c->regs[VCPU_REGS_RCX]; 3914 ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
4009 rc = em_grp2(ctxt); 3915 rc = em_grp2(ctxt);
4010 break; 3916 break;
4011 case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
4012 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
4013 if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
4014 (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
4015 jmp_rel(c, c->src.val);
4016 break;
4017 case 0xe3: /* jcxz/jecxz/jrcxz */
4018 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
4019 jmp_rel(c, c->src.val);
4020 break;
4021 case 0xe4: /* inb */ 3917 case 0xe4: /* inb */
4022 case 0xe5: /* in */ 3918 case 0xe5: /* in */
4023 goto do_io_in; 3919 goto do_io_in;
@@ -4025,35 +3921,30 @@ special_insn:
4025 case 0xe7: /* out */ 3921 case 0xe7: /* out */
4026 goto do_io_out; 3922 goto do_io_out;
4027 case 0xe8: /* call (near) */ { 3923 case 0xe8: /* call (near) */ {
4028 long int rel = c->src.val; 3924 long int rel = ctxt->src.val;
4029 c->src.val = (unsigned long) c->eip; 3925 ctxt->src.val = (unsigned long) ctxt->_eip;
4030 jmp_rel(c, rel); 3926 jmp_rel(ctxt, rel);
4031 rc = em_push(ctxt); 3927 rc = em_push(ctxt);
4032 break; 3928 break;
4033 } 3929 }
4034 case 0xe9: /* jmp rel */ 3930 case 0xe9: /* jmp rel */
4035 goto jmp; 3931 case 0xeb: /* jmp rel short */
4036 case 0xea: /* jmp far */ 3932 jmp_rel(ctxt, ctxt->src.val);
4037 rc = em_jmp_far(ctxt); 3933 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4038 break;
4039 case 0xeb:
4040 jmp: /* jmp rel short */
4041 jmp_rel(c, c->src.val);
4042 c->dst.type = OP_NONE; /* Disable writeback. */
4043 break; 3934 break;
4044 case 0xec: /* in al,dx */ 3935 case 0xec: /* in al,dx */
4045 case 0xed: /* in (e/r)ax,dx */ 3936 case 0xed: /* in (e/r)ax,dx */
4046 do_io_in: 3937 do_io_in:
4047 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 3938 if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
4048 &c->dst.val)) 3939 &ctxt->dst.val))
4049 goto done; /* IO is needed */ 3940 goto done; /* IO is needed */
4050 break; 3941 break;
4051 case 0xee: /* out dx,al */ 3942 case 0xee: /* out dx,al */
4052 case 0xef: /* out dx,(e/r)ax */ 3943 case 0xef: /* out dx,(e/r)ax */
4053 do_io_out: 3944 do_io_out:
4054 ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val, 3945 ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
4055 &c->src.val, 1); 3946 &ctxt->src.val, 1);
4056 c->dst.type = OP_NONE; /* Disable writeback. */ 3947 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4057 break; 3948 break;
4058 case 0xf4: /* hlt */ 3949 case 0xf4: /* hlt */
4059 ctxt->ops->halt(ctxt); 3950 ctxt->ops->halt(ctxt);
@@ -4071,22 +3962,6 @@ special_insn:
4071 case 0xf9: /* stc */ 3962 case 0xf9: /* stc */
4072 ctxt->eflags |= EFLG_CF; 3963 ctxt->eflags |= EFLG_CF;
4073 break; 3964 break;
4074 case 0xfa: /* cli */
4075 if (emulator_bad_iopl(ctxt, ops)) {
4076 rc = emulate_gp(ctxt, 0);
4077 goto done;
4078 } else
4079 ctxt->eflags &= ~X86_EFLAGS_IF;
4080 break;
4081 case 0xfb: /* sti */
4082 if (emulator_bad_iopl(ctxt, ops)) {
4083 rc = emulate_gp(ctxt, 0);
4084 goto done;
4085 } else {
4086 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
4087 ctxt->eflags |= X86_EFLAGS_IF;
4088 }
4089 break;
4090 case 0xfc: /* cld */ 3965 case 0xfc: /* cld */
4091 ctxt->eflags &= ~EFLG_DF; 3966 ctxt->eflags &= ~EFLG_DF;
4092 break; 3967 break;
@@ -4115,40 +3990,40 @@ writeback:
4115 * restore dst type in case the decoding will be reused 3990 * restore dst type in case the decoding will be reused
4116 * (happens for string instructions) 3991 * (happens for string instructions)
4117 */ 3992 */
4118 c->dst.type = saved_dst_type; 3993 ctxt->dst.type = saved_dst_type;
4119 3994
4120 if ((c->d & SrcMask) == SrcSI) 3995 if ((ctxt->d & SrcMask) == SrcSI)
4121 string_addr_inc(ctxt, seg_override(ctxt, c), 3996 string_addr_inc(ctxt, seg_override(ctxt),
4122 VCPU_REGS_RSI, &c->src); 3997 VCPU_REGS_RSI, &ctxt->src);
4123 3998
4124 if ((c->d & DstMask) == DstDI) 3999 if ((ctxt->d & DstMask) == DstDI)
4125 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, 4000 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
4126 &c->dst); 4001 &ctxt->dst);
4127 4002
4128 if (c->rep_prefix && (c->d & String)) { 4003 if (ctxt->rep_prefix && (ctxt->d & String)) {
4129 struct read_cache *r = &ctxt->decode.io_read; 4004 struct read_cache *r = &ctxt->io_read;
4130 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); 4005 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
4131 4006
4132 if (!string_insn_completed(ctxt)) { 4007 if (!string_insn_completed(ctxt)) {
4133 /* 4008 /*
4134 * Re-enter guest when pio read ahead buffer is empty 4009 * Re-enter guest when pio read ahead buffer is empty
4135 * or, if it is not used, after every 1024 iterations. 4010 * or, if it is not used, after every 1024 iterations.
4136 */ 4011 */
4137 if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && 4012 if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) &&
4138 (r->end == 0 || r->end != r->pos)) { 4013 (r->end == 0 || r->end != r->pos)) {
4139 /* 4014 /*
4140 * Reset read cache. Usually happens before 4015 * Reset read cache. Usually happens before
4141 * decode, but since instruction is restarted 4016 * decode, but since instruction is restarted
4142 * we have to do it here. 4017 * we have to do it here.
4143 */ 4018 */
4144 ctxt->decode.mem_read.end = 0; 4019 ctxt->mem_read.end = 0;
4145 return EMULATION_RESTART; 4020 return EMULATION_RESTART;
4146 } 4021 }
4147 goto done; /* skip rip writeback */ 4022 goto done; /* skip rip writeback */
4148 } 4023 }
4149 } 4024 }
4150 4025
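The restart condition just above combines two independent tests: keep emulating while the PIO read-ahead cache (ctxt->io_read) still holds unconsumed data, and otherwise drop back to the guest once RCX reaches a multiple of 1024 so pending events can be serviced. A standalone sketch of that predicate under assumed names (struct io_cache, keep_emulating); only the 0x3ff batching mask and the pos/end test are taken from the code above:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stand-in for ctxt->io_read; names are illustrative only. */
struct io_cache { unsigned long pos, end; };

/*
 * Keep emulating in the kernel when either:
 *  - the read-ahead cache is unused (end == 0) and RCX is not yet a
 *    multiple of 1024, or
 *  - the cache is in use and has not been drained (pos != end).
 * Otherwise return false so the caller re-enters the guest.
 */
bool keep_emulating(const struct io_cache *r, uint64_t rcx)
{
        bool batch_not_done = (r->end != 0) || (rcx & 0x3ff);
        bool cache_not_drained = (r->end == 0) || (r->end != r->pos);

        return batch_not_done && cache_not_drained;
}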
4151 ctxt->eip = c->eip; 4026 ctxt->eip = ctxt->_eip;
4152 4027
4153done: 4028done:
4154 if (rc == X86EMUL_PROPAGATE_FAULT) 4029 if (rc == X86EMUL_PROPAGATE_FAULT)
@@ -4159,13 +4034,7 @@ done:
4159 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 4034 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
4160 4035
4161twobyte_insn: 4036twobyte_insn:
4162 switch (c->b) { 4037 switch (ctxt->b) {
4163 case 0x05: /* syscall */
4164 rc = emulate_syscall(ctxt, ops);
4165 break;
4166 case 0x06:
4167 rc = em_clts(ctxt);
4168 break;
4169 case 0x09: /* wbinvd */ 4038 case 0x09: /* wbinvd */
4170 (ctxt->ops->wbinvd)(ctxt); 4039 (ctxt->ops->wbinvd)(ctxt);
4171 break; 4040 break;
@@ -4174,21 +4043,21 @@ twobyte_insn:
4174 case 0x18: /* Grp16 (prefetch/nop) */ 4043 case 0x18: /* Grp16 (prefetch/nop) */
4175 break; 4044 break;
4176 case 0x20: /* mov cr, reg */ 4045 case 0x20: /* mov cr, reg */
4177 c->dst.val = ops->get_cr(ctxt, c->modrm_reg); 4046 ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
4178 break; 4047 break;
4179 case 0x21: /* mov from dr to reg */ 4048 case 0x21: /* mov from dr to reg */
4180 ops->get_dr(ctxt, c->modrm_reg, &c->dst.val); 4049 ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
4181 break; 4050 break;
4182 case 0x22: /* mov reg, cr */ 4051 case 0x22: /* mov reg, cr */
4183 if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) { 4052 if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) {
4184 emulate_gp(ctxt, 0); 4053 emulate_gp(ctxt, 0);
4185 rc = X86EMUL_PROPAGATE_FAULT; 4054 rc = X86EMUL_PROPAGATE_FAULT;
4186 goto done; 4055 goto done;
4187 } 4056 }
4188 c->dst.type = OP_NONE; 4057 ctxt->dst.type = OP_NONE;
4189 break; 4058 break;
4190 case 0x23: /* mov from reg to dr */ 4059 case 0x23: /* mov from reg to dr */
4191 if (ops->set_dr(ctxt, c->modrm_reg, c->src.val & 4060 if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val &
4192 ((ctxt->mode == X86EMUL_MODE_PROT64) ? 4061 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
4193 ~0ULL : ~0U)) < 0) { 4062 ~0ULL : ~0U)) < 0) {
4194 /* #UD condition is already handled by the code above */ 4063 /* #UD condition is already handled by the code above */
@@ -4197,13 +4066,13 @@ twobyte_insn:
4197 goto done; 4066 goto done;
4198 } 4067 }
4199 4068
4200 c->dst.type = OP_NONE; /* no writeback */ 4069 ctxt->dst.type = OP_NONE; /* no writeback */
4201 break; 4070 break;
4202 case 0x30: 4071 case 0x30:
4203 /* wrmsr */ 4072 /* wrmsr */
4204 msr_data = (u32)c->regs[VCPU_REGS_RAX] 4073 msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
4205 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 4074 | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
4206 if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) { 4075 if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) {
4207 emulate_gp(ctxt, 0); 4076 emulate_gp(ctxt, 0);
4208 rc = X86EMUL_PROPAGATE_FAULT; 4077 rc = X86EMUL_PROPAGATE_FAULT;
4209 goto done; 4078 goto done;
@@ -4212,64 +4081,58 @@ twobyte_insn:
4212 break; 4081 break;
4213 case 0x32: 4082 case 0x32:
4214 /* rdmsr */ 4083 /* rdmsr */
4215 if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) { 4084 if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) {
4216 emulate_gp(ctxt, 0); 4085 emulate_gp(ctxt, 0);
4217 rc = X86EMUL_PROPAGATE_FAULT; 4086 rc = X86EMUL_PROPAGATE_FAULT;
4218 goto done; 4087 goto done;
4219 } else { 4088 } else {
4220 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 4089 ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
4221 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 4090 ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
4222 } 4091 }
4223 rc = X86EMUL_CONTINUE; 4092 rc = X86EMUL_CONTINUE;
4224 break; 4093 break;
4225 case 0x34: /* sysenter */
4226 rc = emulate_sysenter(ctxt, ops);
4227 break;
4228 case 0x35: /* sysexit */
4229 rc = emulate_sysexit(ctxt, ops);
4230 break;
4231 case 0x40 ... 0x4f: /* cmov */ 4094 case 0x40 ... 0x4f: /* cmov */
4232 c->dst.val = c->dst.orig_val = c->src.val; 4095 ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
4233 if (!test_cc(c->b, ctxt->eflags)) 4096 if (!test_cc(ctxt->b, ctxt->eflags))
4234 c->dst.type = OP_NONE; /* no writeback */ 4097 ctxt->dst.type = OP_NONE; /* no writeback */
4235 break; 4098 break;
4236 case 0x80 ... 0x8f: /* jnz rel, etc*/ 4099 case 0x80 ... 0x8f: /* jnz rel, etc*/
4237 if (test_cc(c->b, ctxt->eflags)) 4100 if (test_cc(ctxt->b, ctxt->eflags))
4238 jmp_rel(c, c->src.val); 4101 jmp_rel(ctxt, ctxt->src.val);
4239 break; 4102 break;
4240 case 0x90 ... 0x9f: /* setcc r/m8 */ 4103 case 0x90 ... 0x9f: /* setcc r/m8 */
4241 c->dst.val = test_cc(c->b, ctxt->eflags); 4104 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
4242 break; 4105 break;
4243 case 0xa0: /* push fs */ 4106 case 0xa0: /* push fs */
4244 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); 4107 rc = emulate_push_sreg(ctxt, VCPU_SREG_FS);
4245 break; 4108 break;
4246 case 0xa1: /* pop fs */ 4109 case 0xa1: /* pop fs */
4247 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 4110 rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS);
4248 break; 4111 break;
4249 case 0xa3: 4112 case 0xa3:
4250 bt: /* bt */ 4113 bt: /* bt */
4251 c->dst.type = OP_NONE; 4114 ctxt->dst.type = OP_NONE;
4252 /* only subword offset */ 4115 /* only subword offset */
4253 c->src.val &= (c->dst.bytes << 3) - 1; 4116 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
4254 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); 4117 emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags);
4255 break; 4118 break;
4256 case 0xa4: /* shld imm8, r, r/m */ 4119 case 0xa4: /* shld imm8, r, r/m */
4257 case 0xa5: /* shld cl, r, r/m */ 4120 case 0xa5: /* shld cl, r, r/m */
4258 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 4121 emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
4259 break; 4122 break;
4260 case 0xa8: /* push gs */ 4123 case 0xa8: /* push gs */
4261 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); 4124 rc = emulate_push_sreg(ctxt, VCPU_SREG_GS);
4262 break; 4125 break;
4263 case 0xa9: /* pop gs */ 4126 case 0xa9: /* pop gs */
4264 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 4127 rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS);
4265 break; 4128 break;
4266 case 0xab: 4129 case 0xab:
4267 bts: /* bts */ 4130 bts: /* bts */
4268 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 4131 emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags);
4269 break; 4132 break;
4270 case 0xac: /* shrd imm8, r, r/m */ 4133 case 0xac: /* shrd imm8, r, r/m */
4271 case 0xad: /* shrd cl, r, r/m */ 4134 case 0xad: /* shrd cl, r, r/m */
4272 emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); 4135 emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
4273 break; 4136 break;
4274 case 0xae: /* clflush */ 4137 case 0xae: /* clflush */
4275 break; 4138 break;
@@ -4278,38 +4141,38 @@ twobyte_insn:
4278 * Save real source value, then compare EAX against 4141 * Save real source value, then compare EAX against
4279 * destination. 4142 * destination.
4280 */ 4143 */
4281 c->src.orig_val = c->src.val; 4144 ctxt->src.orig_val = ctxt->src.val;
4282 c->src.val = c->regs[VCPU_REGS_RAX]; 4145 ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
4283 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); 4146 emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
4284 if (ctxt->eflags & EFLG_ZF) { 4147 if (ctxt->eflags & EFLG_ZF) {
4285 /* Success: write back to memory. */ 4148 /* Success: write back to memory. */
4286 c->dst.val = c->src.orig_val; 4149 ctxt->dst.val = ctxt->src.orig_val;
4287 } else { 4150 } else {
4288 /* Failure: write the value we saw to EAX. */ 4151 /* Failure: write the value we saw to EAX. */
4289 c->dst.type = OP_REG; 4152 ctxt->dst.type = OP_REG;
4290 c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 4153 ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
4291 } 4154 }
4292 break; 4155 break;
4293 case 0xb2: /* lss */ 4156 case 0xb2: /* lss */
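For reference, the cmpxchg case above follows the architectural rule: the accumulator is compared with the destination; when they match (ZF set) the source operand is stored to the destination, otherwise the destination value is loaded back into the accumulator. A tiny flag-free model of that semantic, purely illustrative:

#include <assert.h>
#include <stdint.h>

/* Returns 1 when the exchange happened (the ZF-set case). */
static int cmpxchg_model(uint64_t *dst, uint64_t *rax, uint64_t src)
{
        if (*dst == *rax) {
                *dst = src;     /* success: write source to destination */
                return 1;
        }
        *rax = *dst;            /* failure: destination value goes to EAX */
        return 0;
}

int main(void)
{
        uint64_t mem = 5, rax = 5;

        assert(cmpxchg_model(&mem, &rax, 9) == 1 && mem == 9);
        rax = 7;                /* now the comparison fails */
        assert(cmpxchg_model(&mem, &rax, 1) == 0 && rax == 9 && mem == 9);
        return 0;
}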
4294 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); 4157 rc = emulate_load_segment(ctxt, VCPU_SREG_SS);
4295 break; 4158 break;
4296 case 0xb3: 4159 case 0xb3:
4297 btr: /* btr */ 4160 btr: /* btr */
4298 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); 4161 emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags);
4299 break; 4162 break;
4300 case 0xb4: /* lfs */ 4163 case 0xb4: /* lfs */
4301 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); 4164 rc = emulate_load_segment(ctxt, VCPU_SREG_FS);
4302 break; 4165 break;
4303 case 0xb5: /* lgs */ 4166 case 0xb5: /* lgs */
4304 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); 4167 rc = emulate_load_segment(ctxt, VCPU_SREG_GS);
4305 break; 4168 break;
4306 case 0xb6 ... 0xb7: /* movzx */ 4169 case 0xb6 ... 0xb7: /* movzx */
4307 c->dst.bytes = c->op_bytes; 4170 ctxt->dst.bytes = ctxt->op_bytes;
4308 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val 4171 ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
4309 : (u16) c->src.val; 4172 : (u16) ctxt->src.val;
4310 break; 4173 break;
4311 case 0xba: /* Grp8 */ 4174 case 0xba: /* Grp8 */
4312 switch (c->modrm_reg & 3) { 4175 switch (ctxt->modrm_reg & 3) {
4313 case 0: 4176 case 0:
4314 goto bt; 4177 goto bt;
4315 case 1: 4178 case 1:
@@ -4322,47 +4185,47 @@ twobyte_insn:
4322 break; 4185 break;
4323 case 0xbb: 4186 case 0xbb:
4324 btc: /* btc */ 4187 btc: /* btc */
4325 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); 4188 emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags);
4326 break; 4189 break;
4327 case 0xbc: { /* bsf */ 4190 case 0xbc: { /* bsf */
4328 u8 zf; 4191 u8 zf;
4329 __asm__ ("bsf %2, %0; setz %1" 4192 __asm__ ("bsf %2, %0; setz %1"
4330 : "=r"(c->dst.val), "=q"(zf) 4193 : "=r"(ctxt->dst.val), "=q"(zf)
4331 : "r"(c->src.val)); 4194 : "r"(ctxt->src.val));
4332 ctxt->eflags &= ~X86_EFLAGS_ZF; 4195 ctxt->eflags &= ~X86_EFLAGS_ZF;
4333 if (zf) { 4196 if (zf) {
4334 ctxt->eflags |= X86_EFLAGS_ZF; 4197 ctxt->eflags |= X86_EFLAGS_ZF;
4335 c->dst.type = OP_NONE; /* Disable writeback. */ 4198 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4336 } 4199 }
4337 break; 4200 break;
4338 } 4201 }
4339 case 0xbd: { /* bsr */ 4202 case 0xbd: { /* bsr */
4340 u8 zf; 4203 u8 zf;
4341 __asm__ ("bsr %2, %0; setz %1" 4204 __asm__ ("bsr %2, %0; setz %1"
4342 : "=r"(c->dst.val), "=q"(zf) 4205 : "=r"(ctxt->dst.val), "=q"(zf)
4343 : "r"(c->src.val)); 4206 : "r"(ctxt->src.val));
4344 ctxt->eflags &= ~X86_EFLAGS_ZF; 4207 ctxt->eflags &= ~X86_EFLAGS_ZF;
4345 if (zf) { 4208 if (zf) {
4346 ctxt->eflags |= X86_EFLAGS_ZF; 4209 ctxt->eflags |= X86_EFLAGS_ZF;
4347 c->dst.type = OP_NONE; /* Disable writeback. */ 4210 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4348 } 4211 }
4349 break; 4212 break;
4350 } 4213 }
4351 case 0xbe ... 0xbf: /* movsx */ 4214 case 0xbe ... 0xbf: /* movsx */
4352 c->dst.bytes = c->op_bytes; 4215 ctxt->dst.bytes = ctxt->op_bytes;
4353 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : 4216 ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
4354 (s16) c->src.val; 4217 (s16) ctxt->src.val;
4355 break; 4218 break;
4356 case 0xc0 ... 0xc1: /* xadd */ 4219 case 0xc0 ... 0xc1: /* xadd */
4357 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 4220 emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
4358 /* Write back the register source. */ 4221 /* Write back the register source. */
4359 c->src.val = c->dst.orig_val; 4222 ctxt->src.val = ctxt->dst.orig_val;
4360 write_register_operand(&c->src); 4223 write_register_operand(&ctxt->src);
4361 break; 4224 break;
4362 case 0xc3: /* movnti */ 4225 case 0xc3: /* movnti */
4363 c->dst.bytes = c->op_bytes; 4226 ctxt->dst.bytes = ctxt->op_bytes;
4364 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : 4227 ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
4365 (u64) c->src.val; 4228 (u64) ctxt->src.val;
4366 break; 4229 break;
4367 case 0xc7: /* Grp9 (cmpxchg8b) */ 4230 case 0xc7: /* Grp9 (cmpxchg8b) */
4368 rc = em_grp9(ctxt); 4231 rc = em_grp9(ctxt);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aee38623b768..9335e1bf72ad 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,7 @@ module_param(oos_shadow, bool, 0644);
148#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 148#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
149 | PT64_NX_MASK) 149 | PT64_NX_MASK)
150 150
151#define RMAP_EXT 4 151#define PTE_LIST_EXT 4
152 152
153#define ACC_EXEC_MASK 1 153#define ACC_EXEC_MASK 1
154#define ACC_WRITE_MASK PT_WRITABLE_MASK 154#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -164,16 +164,16 @@ module_param(oos_shadow, bool, 0644);
164 164
165#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 165#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
166 166
167struct kvm_rmap_desc { 167struct pte_list_desc {
168 u64 *sptes[RMAP_EXT]; 168 u64 *sptes[PTE_LIST_EXT];
169 struct kvm_rmap_desc *more; 169 struct pte_list_desc *more;
170}; 170};
171 171
172struct kvm_shadow_walk_iterator { 172struct kvm_shadow_walk_iterator {
173 u64 addr; 173 u64 addr;
174 hpa_t shadow_addr; 174 hpa_t shadow_addr;
175 int level;
176 u64 *sptep; 175 u64 *sptep;
176 int level;
177 unsigned index; 177 unsigned index;
178}; 178};
179 179
@@ -182,32 +182,68 @@ struct kvm_shadow_walk_iterator {
182 shadow_walk_okay(&(_walker)); \ 182 shadow_walk_okay(&(_walker)); \
183 shadow_walk_next(&(_walker))) 183 shadow_walk_next(&(_walker)))
184 184
185typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); 185#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
186 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
187 shadow_walk_okay(&(_walker)) && \
188 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
189 __shadow_walk_next(&(_walker), spte))
186 190
187static struct kmem_cache *pte_chain_cache; 191static struct kmem_cache *pte_list_desc_cache;
188static struct kmem_cache *rmap_desc_cache;
189static struct kmem_cache *mmu_page_header_cache; 192static struct kmem_cache *mmu_page_header_cache;
190static struct percpu_counter kvm_total_used_mmu_pages; 193static struct percpu_counter kvm_total_used_mmu_pages;
191 194
192static u64 __read_mostly shadow_trap_nonpresent_pte;
193static u64 __read_mostly shadow_notrap_nonpresent_pte;
194static u64 __read_mostly shadow_nx_mask; 195static u64 __read_mostly shadow_nx_mask;
195static u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */ 196static u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
196static u64 __read_mostly shadow_user_mask; 197static u64 __read_mostly shadow_user_mask;
197static u64 __read_mostly shadow_accessed_mask; 198static u64 __read_mostly shadow_accessed_mask;
198static u64 __read_mostly shadow_dirty_mask; 199static u64 __read_mostly shadow_dirty_mask;
200static u64 __read_mostly shadow_mmio_mask;
199 201
200static inline u64 rsvd_bits(int s, int e) 202static void mmu_spte_set(u64 *sptep, u64 spte);
203
204void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
201{ 205{
202 return ((1ULL << (e - s + 1)) - 1) << s; 206 shadow_mmio_mask = mmio_mask;
207}
208EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
209
210static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
211{
212 access &= ACC_WRITE_MASK | ACC_USER_MASK;
213
214 trace_mark_mmio_spte(sptep, gfn, access);
215 mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
203} 216}
204 217
205void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) 218static bool is_mmio_spte(u64 spte)
206{ 219{
207 shadow_trap_nonpresent_pte = trap_pte; 220 return (spte & shadow_mmio_mask) == shadow_mmio_mask;
208 shadow_notrap_nonpresent_pte = notrap_pte; 221}
222
223static gfn_t get_mmio_spte_gfn(u64 spte)
224{
225 return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
226}
227
228static unsigned get_mmio_spte_access(u64 spte)
229{
230 return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
231}
232
233static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
234{
235 if (unlikely(is_noslot_pfn(pfn))) {
236 mark_mmio_spte(sptep, gfn, access);
237 return true;
238 }
239
240 return false;
241}
242
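The helpers above cache an MMIO access in a non-present spte by OR-ing shadow_mmio_mask, the masked access bits and gfn << PAGE_SHIFT into one word, which is_mmio_spte()/get_mmio_spte_gfn()/get_mmio_spte_access() later take apart. A userspace sketch of that round trip; the mask value below is an arbitrary example, since the real one is installed by the vendor module through kvm_mmu_set_mmio_spte_mask():

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT      12
#define PAGE_MASK       (~((1ULL << PAGE_SHIFT) - 1))

/* Example value only; the real mask comes from kvm_mmu_set_mmio_spte_mask(). */
static const uint64_t mmio_mask = 0xffull << 49;

static uint64_t encode_mmio(uint64_t gfn, unsigned access)
{
        return mmio_mask | access | (gfn << PAGE_SHIFT);
}

static uint64_t decode_gfn(uint64_t spte)
{
        return (spte & ~mmio_mask) >> PAGE_SHIFT;
}

static unsigned decode_access(uint64_t spte)
{
        return (spte & ~mmio_mask) & ~PAGE_MASK;
}

int main(void)
{
        uint64_t spte = encode_mmio(0x12345, 0x6);

        assert((spte & mmio_mask) == mmio_mask);        /* is_mmio_spte() */
        assert(decode_gfn(spte) == 0x12345);
        assert(decode_access(spte) == 0x6);
        return 0;
}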
243static inline u64 rsvd_bits(int s, int e)
244{
245 return ((1ULL << (e - s + 1)) - 1) << s;
209} 246}
210EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
211 247
212void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 248void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
213 u64 dirty_mask, u64 nx_mask, u64 x_mask) 249 u64 dirty_mask, u64 nx_mask, u64 x_mask)
@@ -220,11 +256,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
220} 256}
221EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 257EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
222 258
223static bool is_write_protection(struct kvm_vcpu *vcpu)
224{
225 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
226}
227
228static int is_cpuid_PSE36(void) 259static int is_cpuid_PSE36(void)
229{ 260{
230 return 1; 261 return 1;
@@ -237,8 +268,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
237 268
238static int is_shadow_present_pte(u64 pte) 269static int is_shadow_present_pte(u64 pte)
239{ 270{
240 return pte != shadow_trap_nonpresent_pte 271 return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
241 && pte != shadow_notrap_nonpresent_pte;
242} 272}
243 273
244static int is_large_pte(u64 pte) 274static int is_large_pte(u64 pte)
@@ -246,11 +276,6 @@ static int is_large_pte(u64 pte)
246 return pte & PT_PAGE_SIZE_MASK; 276 return pte & PT_PAGE_SIZE_MASK;
247} 277}
248 278
249static int is_writable_pte(unsigned long pte)
250{
251 return pte & PT_WRITABLE_MASK;
252}
253
254static int is_dirty_gpte(unsigned long pte) 279static int is_dirty_gpte(unsigned long pte)
255{ 280{
256 return pte & PT_DIRTY_MASK; 281 return pte & PT_DIRTY_MASK;
@@ -282,26 +307,154 @@ static gfn_t pse36_gfn_delta(u32 gpte)
282 return (gpte & PT32_DIR_PSE36_MASK) << shift; 307 return (gpte & PT32_DIR_PSE36_MASK) << shift;
283} 308}
284 309
310#ifdef CONFIG_X86_64
285static void __set_spte(u64 *sptep, u64 spte) 311static void __set_spte(u64 *sptep, u64 spte)
286{ 312{
287 set_64bit(sptep, spte); 313 *sptep = spte;
288} 314}
289 315
290static u64 __xchg_spte(u64 *sptep, u64 new_spte) 316static void __update_clear_spte_fast(u64 *sptep, u64 spte)
291{ 317{
292#ifdef CONFIG_X86_64 318 *sptep = spte;
293 return xchg(sptep, new_spte); 319}
320
321static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
322{
323 return xchg(sptep, spte);
324}
325
326static u64 __get_spte_lockless(u64 *sptep)
327{
328 return ACCESS_ONCE(*sptep);
329}
330
331static bool __check_direct_spte_mmio_pf(u64 spte)
332{
333 /* It is valid if the spte is zapped. */
334 return spte == 0ull;
335}
294#else 336#else
295 u64 old_spte; 337union split_spte {
338 struct {
339 u32 spte_low;
340 u32 spte_high;
341 };
342 u64 spte;
343};
296 344
297 do { 345static void count_spte_clear(u64 *sptep, u64 spte)
298 old_spte = *sptep; 346{
299 } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); 347 struct kvm_mmu_page *sp = page_header(__pa(sptep));
300 348
301 return old_spte; 349 if (is_shadow_present_pte(spte))
302#endif 350 return;
351
352 /* Ensure the spte is completely set before we increase the count */
353 smp_wmb();
354 sp->clear_spte_count++;
355}
356
357static void __set_spte(u64 *sptep, u64 spte)
358{
359 union split_spte *ssptep, sspte;
360
361 ssptep = (union split_spte *)sptep;
362 sspte = (union split_spte)spte;
363
364 ssptep->spte_high = sspte.spte_high;
365
366 /*
367 * If we map the spte from nonpresent to present, we should store
368 * the high bits first, then set the present bit, so the CPU cannot
369 * fetch this spte while we are setting it.
370 */
371 smp_wmb();
372
373 ssptep->spte_low = sspte.spte_low;
303} 374}
304 375
376static void __update_clear_spte_fast(u64 *sptep, u64 spte)
377{
378 union split_spte *ssptep, sspte;
379
380 ssptep = (union split_spte *)sptep;
381 sspte = (union split_spte)spte;
382
383 ssptep->spte_low = sspte.spte_low;
384
385 /*
386 * If we map the spte from present to nonpresent, we should clear
387 * the present bit first so the vcpu cannot fetch stale high bits.
388 */
389 smp_wmb();
390
391 ssptep->spte_high = sspte.spte_high;
392 count_spte_clear(sptep, spte);
393}
394
395static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
396{
397 union split_spte *ssptep, sspte, orig;
398
399 ssptep = (union split_spte *)sptep;
400 sspte = (union split_spte)spte;
401
402 /* xchg acts as a barrier before the setting of the high bits */
403 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
404 orig.spte_high = ssptep->spte_high = sspte.spte_high;
405 count_spte_clear(sptep, spte);
406
407 return orig.spte;
408}
409
410/*
411 * The idea of using this lightweight way to get the spte on x86_32 is
412 * from gup_get_pte(arch/x86/mm/gup.c).
413 * The difference is that we cannot catch the spte tlb flush if we leave
414 * guest mode, so we emulate it by increasing clear_spte_count when an
415 * spte is cleared.
416 */
417static u64 __get_spte_lockless(u64 *sptep)
418{
419 struct kvm_mmu_page *sp = page_header(__pa(sptep));
420 union split_spte spte, *orig = (union split_spte *)sptep;
421 int count;
422
423retry:
424 count = sp->clear_spte_count;
425 smp_rmb();
426
427 spte.spte_low = orig->spte_low;
428 smp_rmb();
429
430 spte.spte_high = orig->spte_high;
431 smp_rmb();
432
433 if (unlikely(spte.spte_low != orig->spte_low ||
434 count != sp->clear_spte_count))
435 goto retry;
436
437 return spte.spte;
438}
439
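On 32-bit hosts the 64-bit spte cannot be loaded atomically, so __get_spte_lockless() above treats clear_spte_count as a sequence counter: sample the count, read the low then the high half, and retry if the low half or the count changed in between, which is exactly the window in which __update_clear_spte_fast/slow could have torn the value. A userspace analogue of the same retry loop using C11 atomics; the struct layout and names are illustrative, not the kernel's:

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative stand-in for a shadow PTE plus its page's clear count. */
struct split_pte {
        _Atomic uint32_t lo;
        _Atomic uint32_t hi;
        _Atomic unsigned clear_count;   /* bumped by writers that clear the PTE */
};

uint64_t read_pte_lockless(struct split_pte *p)
{
        uint32_t lo, hi;
        unsigned count;

        do {
                count = atomic_load_explicit(&p->clear_count, memory_order_acquire);
                lo = atomic_load_explicit(&p->lo, memory_order_acquire);
                hi = atomic_load_explicit(&p->hi, memory_order_acquire);
                /* Retry if a concurrent clear raced with the two half-reads. */
        } while (lo != atomic_load_explicit(&p->lo, memory_order_acquire) ||
                 count != atomic_load_explicit(&p->clear_count, memory_order_acquire));

        return ((uint64_t)hi << 32) | lo;
}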
440static bool __check_direct_spte_mmio_pf(u64 spte)
441{
442 union split_spte sspte = (union split_spte)spte;
443 u32 high_mmio_mask = shadow_mmio_mask >> 32;
444
445 /* It is valid if the spte is zapped. */
446 if (spte == 0ull)
447 return true;
448
449 /* It is valid if the spte is being zapped. */
450 if (sspte.spte_low == 0ull &&
451 (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
452 return true;
453
454 return false;
455}
456#endif
457
305static bool spte_has_volatile_bits(u64 spte) 458static bool spte_has_volatile_bits(u64 spte)
306{ 459{
307 if (!shadow_accessed_mask) 460 if (!shadow_accessed_mask)
@@ -322,12 +475,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
322 return (old_spte & bit_mask) && !(new_spte & bit_mask); 475 return (old_spte & bit_mask) && !(new_spte & bit_mask);
323} 476}
324 477
325static void update_spte(u64 *sptep, u64 new_spte) 478/* Rules for using mmu_spte_set:
479 * Set the sptep from nonpresent to present.
480 * Note: the sptep being assigned *must* be either not present
481 * or in a state where the hardware will not attempt to update
482 * the spte.
483 */
484static void mmu_spte_set(u64 *sptep, u64 new_spte)
485{
486 WARN_ON(is_shadow_present_pte(*sptep));
487 __set_spte(sptep, new_spte);
488}
489
490/* Rules for using mmu_spte_update:
491 * Update the state bits; this means the mapped pfn is not changed.
492 */
493static void mmu_spte_update(u64 *sptep, u64 new_spte)
326{ 494{
327 u64 mask, old_spte = *sptep; 495 u64 mask, old_spte = *sptep;
328 496
329 WARN_ON(!is_rmap_spte(new_spte)); 497 WARN_ON(!is_rmap_spte(new_spte));
330 498
499 if (!is_shadow_present_pte(old_spte))
500 return mmu_spte_set(sptep, new_spte);
501
331 new_spte |= old_spte & shadow_dirty_mask; 502 new_spte |= old_spte & shadow_dirty_mask;
332 503
333 mask = shadow_accessed_mask; 504 mask = shadow_accessed_mask;
@@ -335,9 +506,9 @@ static void update_spte(u64 *sptep, u64 new_spte)
335 mask |= shadow_dirty_mask; 506 mask |= shadow_dirty_mask;
336 507
337 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) 508 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
338 __set_spte(sptep, new_spte); 509 __update_clear_spte_fast(sptep, new_spte);
339 else 510 else
340 old_spte = __xchg_spte(sptep, new_spte); 511 old_spte = __update_clear_spte_slow(sptep, new_spte);
341 512
342 if (!shadow_accessed_mask) 513 if (!shadow_accessed_mask)
343 return; 514 return;
@@ -348,6 +519,64 @@ static void update_spte(u64 *sptep, u64 new_spte)
348 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 519 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
349} 520}
350 521
522/*
523 * Rules for using mmu_spte_clear_track_bits:
524 * It sets the sptep from present to nonpresent and tracks the
525 * state bits; it is used to clear the last-level sptep.
526 */
527static int mmu_spte_clear_track_bits(u64 *sptep)
528{
529 pfn_t pfn;
530 u64 old_spte = *sptep;
531
532 if (!spte_has_volatile_bits(old_spte))
533 __update_clear_spte_fast(sptep, 0ull);
534 else
535 old_spte = __update_clear_spte_slow(sptep, 0ull);
536
537 if (!is_rmap_spte(old_spte))
538 return 0;
539
540 pfn = spte_to_pfn(old_spte);
541 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
542 kvm_set_pfn_accessed(pfn);
543 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
544 kvm_set_pfn_dirty(pfn);
545 return 1;
546}
547
548/*
549 * Rules for using mmu_spte_clear_no_track:
550 * Directly clear the spte without caring about its state bits;
551 * it is used for the upper-level spte.
552 */
553static void mmu_spte_clear_no_track(u64 *sptep)
554{
555 __update_clear_spte_fast(sptep, 0ull);
556}
557
558static u64 mmu_spte_get_lockless(u64 *sptep)
559{
560 return __get_spte_lockless(sptep);
561}
562
563static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
564{
565 rcu_read_lock();
566 atomic_inc(&vcpu->kvm->arch.reader_counter);
567
568 /* Increase the counter before walking shadow page table */
569 smp_mb__after_atomic_inc();
570}
571
572static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
573{
574 /* Decrease the counter after walking shadow page table finished */
575 smp_mb__before_atomic_dec();
576 atomic_dec(&vcpu->kvm->arch.reader_counter);
577 rcu_read_unlock();
578}
579
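walk_shadow_page_lockless_begin/end above bracket a lockless walk (see for_each_shadow_entry_lockless earlier in this hunk) with rcu_read_lock() plus an atomic reader_counter, so the rest of the series can tell whether any vcpu is still traversing shadow pages before they are actually freed. A much-simplified userspace analogue of that bracket pattern; it omits RCU and the explicit smp_mb barriers and is only meant to show the counter protocol:

#include <stdatomic.h>

/* Readers advertise themselves around any access to the shared table. */
static _Atomic int reader_counter;

void walk_lockless_begin(void)
{
        atomic_fetch_add_explicit(&reader_counter, 1, memory_order_acq_rel);
}

void walk_lockless_end(void)
{
        atomic_fetch_sub_explicit(&reader_counter, 1, memory_order_acq_rel);
}

/* A reclaimer may only free pages once no lockless walker is in flight. */
int readers_in_flight(void)
{
        return atomic_load_explicit(&reader_counter, memory_order_acquire);
}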
351static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 580static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
352 struct kmem_cache *base_cache, int min) 581 struct kmem_cache *base_cache, int min)
353{ 582{
@@ -397,12 +626,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
397{ 626{
398 int r; 627 int r;
399 628
400 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, 629 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
401 pte_chain_cache, 4); 630 pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
402 if (r)
403 goto out;
404 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
405 rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
406 if (r) 631 if (r)
407 goto out; 632 goto out;
408 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); 633 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -416,8 +641,8 @@ out:
416 641
417static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 642static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
418{ 643{
419 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); 644 mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
420 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); 645 pte_list_desc_cache);
421 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); 646 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
422 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, 647 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
423 mmu_page_header_cache); 648 mmu_page_header_cache);
@@ -433,26 +658,15 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
433 return p; 658 return p;
434} 659}
435 660
436static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) 661static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
437{
438 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
439 sizeof(struct kvm_pte_chain));
440}
441
442static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
443{ 662{
444 kmem_cache_free(pte_chain_cache, pc); 663 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
664 sizeof(struct pte_list_desc));
445} 665}
446 666
447static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) 667static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
448{ 668{
449 return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, 669 kmem_cache_free(pte_list_desc_cache, pte_list_desc);
450 sizeof(struct kvm_rmap_desc));
451}
452
453static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
454{
455 kmem_cache_free(rmap_desc_cache, rd);
456} 670}
457 671
458static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) 672static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
@@ -498,6 +712,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
498 linfo = lpage_info_slot(gfn, slot, i); 712 linfo = lpage_info_slot(gfn, slot, i);
499 linfo->write_count += 1; 713 linfo->write_count += 1;
500 } 714 }
715 kvm->arch.indirect_shadow_pages++;
501} 716}
502 717
503static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 718static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -513,6 +728,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
513 linfo->write_count -= 1; 728 linfo->write_count -= 1;
514 WARN_ON(linfo->write_count < 0); 729 WARN_ON(linfo->write_count < 0);
515 } 730 }
731 kvm->arch.indirect_shadow_pages--;
516} 732}
517 733
518static int has_wrprotected_page(struct kvm *kvm, 734static int has_wrprotected_page(struct kvm *kvm,
@@ -588,67 +804,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
588} 804}
589 805
590/* 806/*
591 * Take gfn and return the reverse mapping to it. 807 * Pte mapping structures:
592 */
593
594static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
595{
596 struct kvm_memory_slot *slot;
597 struct kvm_lpage_info *linfo;
598
599 slot = gfn_to_memslot(kvm, gfn);
600 if (likely(level == PT_PAGE_TABLE_LEVEL))
601 return &slot->rmap[gfn - slot->base_gfn];
602
603 linfo = lpage_info_slot(gfn, slot, level);
604
605 return &linfo->rmap_pde;
606}
607
608/*
609 * Reverse mapping data structures:
610 * 808 *
611 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry 809 * If pte_list bit zero is zero, then pte_list points to the spte.
612 * that points to page_address(page).
613 * 810 *
614 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc 811 * If pte_list bit zero is one, (then pte_list & ~1) points to a struct
615 * containing more mappings. 812 * pte_list_desc containing more mappings.
616 * 813 *
617 * Returns the number of rmap entries before the spte was added or zero if 814 * Returns the number of pte entries before the spte was added or zero if
618 * the spte was not added. 815 * the spte was not added.
619 * 816 *
620 */ 817 */
621static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 818static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
819 unsigned long *pte_list)
622{ 820{
623 struct kvm_mmu_page *sp; 821 struct pte_list_desc *desc;
624 struct kvm_rmap_desc *desc;
625 unsigned long *rmapp;
626 int i, count = 0; 822 int i, count = 0;
627 823
628 if (!is_rmap_spte(*spte)) 824 if (!*pte_list) {
629 return count; 825 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
630 sp = page_header(__pa(spte)); 826 *pte_list = (unsigned long)spte;
631 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); 827 } else if (!(*pte_list & 1)) {
632 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 828 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
633 if (!*rmapp) { 829 desc = mmu_alloc_pte_list_desc(vcpu);
634 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 830 desc->sptes[0] = (u64 *)*pte_list;
635 *rmapp = (unsigned long)spte;
636 } else if (!(*rmapp & 1)) {
637 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
638 desc = mmu_alloc_rmap_desc(vcpu);
639 desc->sptes[0] = (u64 *)*rmapp;
640 desc->sptes[1] = spte; 831 desc->sptes[1] = spte;
641 *rmapp = (unsigned long)desc | 1; 832 *pte_list = (unsigned long)desc | 1;
642 ++count; 833 ++count;
643 } else { 834 } else {
644 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 835 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
645 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 836 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
646 while (desc->sptes[RMAP_EXT-1] && desc->more) { 837 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
647 desc = desc->more; 838 desc = desc->more;
648 count += RMAP_EXT; 839 count += PTE_LIST_EXT;
649 } 840 }
650 if (desc->sptes[RMAP_EXT-1]) { 841 if (desc->sptes[PTE_LIST_EXT-1]) {
651 desc->more = mmu_alloc_rmap_desc(vcpu); 842 desc->more = mmu_alloc_pte_list_desc(vcpu);
652 desc = desc->more; 843 desc = desc->more;
653 } 844 }
654 for (i = 0; desc->sptes[i]; ++i) 845 for (i = 0; desc->sptes[i]; ++i)
@@ -658,59 +849,78 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
658 return count; 849 return count;
659} 850}
660 851
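pte_list_add() above keeps the common single-mapping case allocation-free by tagging the list head: with bit zero clear the word is the lone spte pointer itself, and with bit zero set it points to a pte_list_desc holding up to PTE_LIST_EXT entries chained through ->more. A userspace sketch of that bit-0 tagging trick; the struct and helper names are made up for illustration and assume the pointed-to objects are at least 2-byte aligned so bit zero is free:

#include <assert.h>
#include <stdint.h>

struct desc { void *slots[4]; struct desc *more; };     /* stand-in for pte_list_desc */

static int head_is_desc(unsigned long head)
{
        return head & 1;                        /* bit 0 set: descriptor list */
}

static unsigned long make_single(void *spte)
{
        return (unsigned long)spte;             /* bit 0 clear: one spte */
}

static unsigned long make_list(struct desc *d)
{
        return (unsigned long)d | 1;
}

static struct desc *head_to_desc(unsigned long head)
{
        return (struct desc *)(head & ~1ul);
}

int main(void)
{
        static struct desc d;
        uint64_t spte;
        unsigned long head = make_single(&spte);

        assert(!head_is_desc(head));
        head = make_list(&d);
        assert(head_is_desc(head) && head_to_desc(head) == &d);
        return 0;
}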
661static void rmap_desc_remove_entry(unsigned long *rmapp, 852static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
662 struct kvm_rmap_desc *desc, 853{
663 int i, 854 struct pte_list_desc *desc;
664 struct kvm_rmap_desc *prev_desc) 855 u64 *prev_spte;
856 int i;
857
858 if (!*pte_list)
859 return NULL;
860 else if (!(*pte_list & 1)) {
861 if (!spte)
862 return (u64 *)*pte_list;
863 return NULL;
864 }
865 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
866 prev_spte = NULL;
867 while (desc) {
868 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
869 if (prev_spte == spte)
870 return desc->sptes[i];
871 prev_spte = desc->sptes[i];
872 }
873 desc = desc->more;
874 }
875 return NULL;
876}
877
878static void
879pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
880 int i, struct pte_list_desc *prev_desc)
665{ 881{
666 int j; 882 int j;
667 883
668 for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) 884 for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
669 ; 885 ;
670 desc->sptes[i] = desc->sptes[j]; 886 desc->sptes[i] = desc->sptes[j];
671 desc->sptes[j] = NULL; 887 desc->sptes[j] = NULL;
672 if (j != 0) 888 if (j != 0)
673 return; 889 return;
674 if (!prev_desc && !desc->more) 890 if (!prev_desc && !desc->more)
675 *rmapp = (unsigned long)desc->sptes[0]; 891 *pte_list = (unsigned long)desc->sptes[0];
676 else 892 else
677 if (prev_desc) 893 if (prev_desc)
678 prev_desc->more = desc->more; 894 prev_desc->more = desc->more;
679 else 895 else
680 *rmapp = (unsigned long)desc->more | 1; 896 *pte_list = (unsigned long)desc->more | 1;
681 mmu_free_rmap_desc(desc); 897 mmu_free_pte_list_desc(desc);
682} 898}
683 899
684static void rmap_remove(struct kvm *kvm, u64 *spte) 900static void pte_list_remove(u64 *spte, unsigned long *pte_list)
685{ 901{
686 struct kvm_rmap_desc *desc; 902 struct pte_list_desc *desc;
687 struct kvm_rmap_desc *prev_desc; 903 struct pte_list_desc *prev_desc;
688 struct kvm_mmu_page *sp;
689 gfn_t gfn;
690 unsigned long *rmapp;
691 int i; 904 int i;
692 905
693 sp = page_header(__pa(spte)); 906 if (!*pte_list) {
694 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 907 printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
695 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
696 if (!*rmapp) {
697 printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
698 BUG(); 908 BUG();
699 } else if (!(*rmapp & 1)) { 909 } else if (!(*pte_list & 1)) {
700 rmap_printk("rmap_remove: %p 1->0\n", spte); 910 rmap_printk("pte_list_remove: %p 1->0\n", spte);
701 if ((u64 *)*rmapp != spte) { 911 if ((u64 *)*pte_list != spte) {
702 printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); 912 printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
703 BUG(); 913 BUG();
704 } 914 }
705 *rmapp = 0; 915 *pte_list = 0;
706 } else { 916 } else {
707 rmap_printk("rmap_remove: %p many->many\n", spte); 917 rmap_printk("pte_list_remove: %p many->many\n", spte);
708 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 918 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
709 prev_desc = NULL; 919 prev_desc = NULL;
710 while (desc) { 920 while (desc) {
711 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) 921 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
712 if (desc->sptes[i] == spte) { 922 if (desc->sptes[i] == spte) {
713 rmap_desc_remove_entry(rmapp, 923 pte_list_desc_remove_entry(pte_list,
714 desc, i, 924 desc, i,
715 prev_desc); 925 prev_desc);
716 return; 926 return;
@@ -718,62 +928,80 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
718 prev_desc = desc; 928 prev_desc = desc;
719 desc = desc->more; 929 desc = desc->more;
720 } 930 }
721 pr_err("rmap_remove: %p many->many\n", spte); 931 pr_err("pte_list_remove: %p many->many\n", spte);
722 BUG(); 932 BUG();
723 } 933 }
724} 934}
725 935
726static int set_spte_track_bits(u64 *sptep, u64 new_spte) 936typedef void (*pte_list_walk_fn) (u64 *spte);
937static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
727{ 938{
728 pfn_t pfn; 939 struct pte_list_desc *desc;
729 u64 old_spte = *sptep; 940 int i;
730 941
731 if (!spte_has_volatile_bits(old_spte)) 942 if (!*pte_list)
732 __set_spte(sptep, new_spte); 943 return;
733 else
734 old_spte = __xchg_spte(sptep, new_spte);
735 944
736 if (!is_rmap_spte(old_spte)) 945 if (!(*pte_list & 1))
737 return 0; 946 return fn((u64 *)*pte_list);
738 947
739 pfn = spte_to_pfn(old_spte); 948 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
740 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 949 while (desc) {
741 kvm_set_pfn_accessed(pfn); 950 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
742 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) 951 fn(desc->sptes[i]);
743 kvm_set_pfn_dirty(pfn); 952 desc = desc->more;
744 return 1; 953 }
745} 954}
746 955
747static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) 956/*
957 * Take gfn and return the reverse mapping to it.
958 */
959static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
748{ 960{
749 if (set_spte_track_bits(sptep, new_spte)) 961 struct kvm_memory_slot *slot;
750 rmap_remove(kvm, sptep); 962 struct kvm_lpage_info *linfo;
963
964 slot = gfn_to_memslot(kvm, gfn);
965 if (likely(level == PT_PAGE_TABLE_LEVEL))
966 return &slot->rmap[gfn - slot->base_gfn];
967
968 linfo = lpage_info_slot(gfn, slot, level);
969
970 return &linfo->rmap_pde;
971}
972
973static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
974{
975 struct kvm_mmu_page *sp;
976 unsigned long *rmapp;
977
978 sp = page_header(__pa(spte));
979 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
980 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
981 return pte_list_add(vcpu, spte, rmapp);
751} 982}
752 983
753static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 984static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
754{ 985{
755 struct kvm_rmap_desc *desc; 986 return pte_list_next(rmapp, spte);
756 u64 *prev_spte; 987}
757 int i;
758 988
759 if (!*rmapp) 989static void rmap_remove(struct kvm *kvm, u64 *spte)
760 return NULL; 990{
761 else if (!(*rmapp & 1)) { 991 struct kvm_mmu_page *sp;
762 if (!spte) 992 gfn_t gfn;
763 return (u64 *)*rmapp; 993 unsigned long *rmapp;
764 return NULL; 994
765 } 995 sp = page_header(__pa(spte));
766 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 996 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
767 prev_spte = NULL; 997 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
768 while (desc) { 998 pte_list_remove(spte, rmapp);
769 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { 999}
770 if (prev_spte == spte) 1000
771 return desc->sptes[i]; 1001static void drop_spte(struct kvm *kvm, u64 *sptep)
772 prev_spte = desc->sptes[i]; 1002{
773 } 1003 if (mmu_spte_clear_track_bits(sptep))
774 desc = desc->more; 1004 rmap_remove(kvm, sptep);
775 }
776 return NULL;
777} 1005}
778 1006
779static int rmap_write_protect(struct kvm *kvm, u64 gfn) 1007static int rmap_write_protect(struct kvm *kvm, u64 gfn)
@@ -790,7 +1018,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
790 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1018 BUG_ON(!(*spte & PT_PRESENT_MASK));
791 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 1019 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
792 if (is_writable_pte(*spte)) { 1020 if (is_writable_pte(*spte)) {
793 update_spte(spte, *spte & ~PT_WRITABLE_MASK); 1021 mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
794 write_protected = 1; 1022 write_protected = 1;
795 } 1023 }
796 spte = rmap_next(kvm, rmapp, spte); 1024 spte = rmap_next(kvm, rmapp, spte);
@@ -807,8 +1035,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
807 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 1035 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
808 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 1036 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
809 if (is_writable_pte(*spte)) { 1037 if (is_writable_pte(*spte)) {
810 drop_spte(kvm, spte, 1038 drop_spte(kvm, spte);
811 shadow_trap_nonpresent_pte);
812 --kvm->stat.lpages; 1039 --kvm->stat.lpages;
813 spte = NULL; 1040 spte = NULL;
814 write_protected = 1; 1041 write_protected = 1;
@@ -829,7 +1056,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
829 while ((spte = rmap_next(kvm, rmapp, NULL))) { 1056 while ((spte = rmap_next(kvm, rmapp, NULL))) {
830 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1057 BUG_ON(!(*spte & PT_PRESENT_MASK));
831 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 1058 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
832 drop_spte(kvm, spte, shadow_trap_nonpresent_pte); 1059 drop_spte(kvm, spte);
833 need_tlb_flush = 1; 1060 need_tlb_flush = 1;
834 } 1061 }
835 return need_tlb_flush; 1062 return need_tlb_flush;
@@ -851,7 +1078,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
851 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 1078 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
852 need_flush = 1; 1079 need_flush = 1;
853 if (pte_write(*ptep)) { 1080 if (pte_write(*ptep)) {
854 drop_spte(kvm, spte, shadow_trap_nonpresent_pte); 1081 drop_spte(kvm, spte);
855 spte = rmap_next(kvm, rmapp, NULL); 1082 spte = rmap_next(kvm, rmapp, NULL);
856 } else { 1083 } else {
857 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); 1084 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -860,7 +1087,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
860 new_spte &= ~PT_WRITABLE_MASK; 1087 new_spte &= ~PT_WRITABLE_MASK;
861 new_spte &= ~SPTE_HOST_WRITEABLE; 1088 new_spte &= ~SPTE_HOST_WRITEABLE;
862 new_spte &= ~shadow_accessed_mask; 1089 new_spte &= ~shadow_accessed_mask;
863 set_spte_track_bits(spte, new_spte); 1090 mmu_spte_clear_track_bits(spte);
1091 mmu_spte_set(spte, new_spte);
864 spte = rmap_next(kvm, rmapp, spte); 1092 spte = rmap_next(kvm, rmapp, spte);
865 } 1093 }
866 } 1094 }
@@ -1032,151 +1260,89 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1032 percpu_counter_add(&kvm_total_used_mmu_pages, nr); 1260 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1033} 1261}
1034 1262
1035static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1263/*
 1264 * Remove the sp from the shadow page cache; after this call
 1265 * the sp can no longer be found in the cache, but the shadow
 1266 * page table is still valid.
 1267 * It must be called under the protection of the mmu lock.
1268 */
1269static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
1036{ 1270{
1037 ASSERT(is_empty_shadow_page(sp->spt)); 1271 ASSERT(is_empty_shadow_page(sp->spt));
1038 hlist_del(&sp->hash_link); 1272 hlist_del(&sp->hash_link);
1039 list_del(&sp->link);
1040 free_page((unsigned long)sp->spt);
1041 if (!sp->role.direct) 1273 if (!sp->role.direct)
1042 free_page((unsigned long)sp->gfns); 1274 free_page((unsigned long)sp->gfns);
1043 kmem_cache_free(mmu_page_header_cache, sp);
1044 kvm_mod_used_mmu_pages(kvm, -1);
1045} 1275}
1046 1276
1047static unsigned kvm_page_table_hashfn(gfn_t gfn) 1277/*
 1278 * Free the shadow page table and the sp; this can be done
 1279 * outside the protection of the mmu lock.
1280 */
1281static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1048{ 1282{
1049 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); 1283 list_del(&sp->link);
1284 free_page((unsigned long)sp->spt);
1285 kmem_cache_free(mmu_page_header_cache, sp);
1050} 1286}
1051 1287
1052static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 1288static unsigned kvm_page_table_hashfn(gfn_t gfn)
1053 u64 *parent_pte, int direct)
1054{ 1289{
1055 struct kvm_mmu_page *sp; 1290 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
1056
1057 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
1058 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1059 if (!direct)
1060 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
1061 PAGE_SIZE);
1062 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1063 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1064 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
1065 sp->multimapped = 0;
1066 sp->parent_pte = parent_pte;
1067 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1068 return sp;
1069} 1291}
1070 1292
1071static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, 1293static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
1072 struct kvm_mmu_page *sp, u64 *parent_pte) 1294 struct kvm_mmu_page *sp, u64 *parent_pte)
1073{ 1295{
1074 struct kvm_pte_chain *pte_chain;
1075 struct hlist_node *node;
1076 int i;
1077
1078 if (!parent_pte) 1296 if (!parent_pte)
1079 return; 1297 return;
1080 if (!sp->multimapped) {
1081 u64 *old = sp->parent_pte;
1082 1298
1083 if (!old) { 1299 pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
1084 sp->parent_pte = parent_pte;
1085 return;
1086 }
1087 sp->multimapped = 1;
1088 pte_chain = mmu_alloc_pte_chain(vcpu);
1089 INIT_HLIST_HEAD(&sp->parent_ptes);
1090 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1091 pte_chain->parent_ptes[0] = old;
1092 }
1093 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
1094 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
1095 continue;
1096 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
1097 if (!pte_chain->parent_ptes[i]) {
1098 pte_chain->parent_ptes[i] = parent_pte;
1099 return;
1100 }
1101 }
1102 pte_chain = mmu_alloc_pte_chain(vcpu);
1103 BUG_ON(!pte_chain);
1104 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1105 pte_chain->parent_ptes[0] = parent_pte;
1106} 1300}
1107 1301
1108static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, 1302static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1109 u64 *parent_pte) 1303 u64 *parent_pte)
1110{ 1304{
1111 struct kvm_pte_chain *pte_chain; 1305 pte_list_remove(parent_pte, &sp->parent_ptes);
1112 struct hlist_node *node;
1113 int i;
1114
1115 if (!sp->multimapped) {
1116 BUG_ON(sp->parent_pte != parent_pte);
1117 sp->parent_pte = NULL;
1118 return;
1119 }
1120 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1121 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1122 if (!pte_chain->parent_ptes[i])
1123 break;
1124 if (pte_chain->parent_ptes[i] != parent_pte)
1125 continue;
1126 while (i + 1 < NR_PTE_CHAIN_ENTRIES
1127 && pte_chain->parent_ptes[i + 1]) {
1128 pte_chain->parent_ptes[i]
1129 = pte_chain->parent_ptes[i + 1];
1130 ++i;
1131 }
1132 pte_chain->parent_ptes[i] = NULL;
1133 if (i == 0) {
1134 hlist_del(&pte_chain->link);
1135 mmu_free_pte_chain(pte_chain);
1136 if (hlist_empty(&sp->parent_ptes)) {
1137 sp->multimapped = 0;
1138 sp->parent_pte = NULL;
1139 }
1140 }
1141 return;
1142 }
1143 BUG();
1144} 1306}
1145 1307
1146static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) 1308static void drop_parent_pte(struct kvm_mmu_page *sp,
1309 u64 *parent_pte)
1147{ 1310{
1148 struct kvm_pte_chain *pte_chain; 1311 mmu_page_remove_parent_pte(sp, parent_pte);
1149 struct hlist_node *node; 1312 mmu_spte_clear_no_track(parent_pte);
1150 struct kvm_mmu_page *parent_sp; 1313}
1151 int i;
1152
1153 if (!sp->multimapped && sp->parent_pte) {
1154 parent_sp = page_header(__pa(sp->parent_pte));
1155 fn(parent_sp, sp->parent_pte);
1156 return;
1157 }
1158
1159 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1160 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1161 u64 *spte = pte_chain->parent_ptes[i];
1162 1314
1163 if (!spte) 1315static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1164 break; 1316 u64 *parent_pte, int direct)
1165 parent_sp = page_header(__pa(spte)); 1317{
1166 fn(parent_sp, spte); 1318 struct kvm_mmu_page *sp;
1167 } 1319 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
1320 sizeof *sp);
1321 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1322 if (!direct)
1323 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
1324 PAGE_SIZE);
1325 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1326 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1327 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
1328 sp->parent_ptes = 0;
1329 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1330 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1331 return sp;
1168} 1332}
1169 1333
1170static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); 1334static void mark_unsync(u64 *spte);
1171static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) 1335static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1172{ 1336{
1173 mmu_parent_walk(sp, mark_unsync); 1337 pte_list_walk(&sp->parent_ptes, mark_unsync);
1174} 1338}
1175 1339
1176static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) 1340static void mark_unsync(u64 *spte)
1177{ 1341{
1342 struct kvm_mmu_page *sp;
1178 unsigned int index; 1343 unsigned int index;
1179 1344
1345 sp = page_header(__pa(spte));
1180 index = spte - sp->spt; 1346 index = spte - sp->spt;
1181 if (__test_and_set_bit(index, sp->unsync_child_bitmap)) 1347 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1182 return; 1348 return;
@@ -1185,15 +1351,6 @@ static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1185 kvm_mmu_mark_parents_unsync(sp); 1351 kvm_mmu_mark_parents_unsync(sp);
1186} 1352}
1187 1353
1188static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1189 struct kvm_mmu_page *sp)
1190{
1191 int i;
1192
1193 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1194 sp->spt[i] = shadow_trap_nonpresent_pte;
1195}
1196
1197static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1354static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1198 struct kvm_mmu_page *sp) 1355 struct kvm_mmu_page *sp)
1199{ 1356{
@@ -1475,6 +1632,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1475 } 1632 }
1476} 1633}
1477 1634
1635static void init_shadow_page_table(struct kvm_mmu_page *sp)
1636{
1637 int i;
1638
1639 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1640 sp->spt[i] = 0ull;
1641}
1642
1478static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1643static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1479 gfn_t gfn, 1644 gfn_t gfn,
1480 gva_t gaddr, 1645 gva_t gaddr,
@@ -1537,10 +1702,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1537 1702
1538 account_shadowed(vcpu->kvm, gfn); 1703 account_shadowed(vcpu->kvm, gfn);
1539 } 1704 }
1540 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1705 init_shadow_page_table(sp);
1541 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1542 else
1543 nonpaging_prefetch_page(vcpu, sp);
1544 trace_kvm_mmu_get_page(sp, true); 1706 trace_kvm_mmu_get_page(sp, true);
1545 return sp; 1707 return sp;
1546} 1708}
@@ -1572,21 +1734,28 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1572 if (iterator->level < PT_PAGE_TABLE_LEVEL) 1734 if (iterator->level < PT_PAGE_TABLE_LEVEL)
1573 return false; 1735 return false;
1574 1736
1575 if (iterator->level == PT_PAGE_TABLE_LEVEL)
1576 if (is_large_pte(*iterator->sptep))
1577 return false;
1578
1579 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); 1737 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1580 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; 1738 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1581 return true; 1739 return true;
1582} 1740}
1583 1741
1584static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) 1742static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
1743 u64 spte)
1585{ 1744{
1586 iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; 1745 if (is_last_spte(spte, iterator->level)) {
1746 iterator->level = 0;
1747 return;
1748 }
1749
1750 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
1587 --iterator->level; 1751 --iterator->level;
1588} 1752}
1589 1753
1754static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1755{
1756 return __shadow_walk_next(iterator, *iterator->sptep);
1757}
1758
1590static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) 1759static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1591{ 1760{
1592 u64 spte; 1761 u64 spte;
@@ -1594,13 +1763,13 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1594 spte = __pa(sp->spt) 1763 spte = __pa(sp->spt)
1595 | PT_PRESENT_MASK | PT_ACCESSED_MASK 1764 | PT_PRESENT_MASK | PT_ACCESSED_MASK
1596 | PT_WRITABLE_MASK | PT_USER_MASK; 1765 | PT_WRITABLE_MASK | PT_USER_MASK;
1597 __set_spte(sptep, spte); 1766 mmu_spte_set(sptep, spte);
1598} 1767}
1599 1768
1600static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) 1769static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1601{ 1770{
1602 if (is_large_pte(*sptep)) { 1771 if (is_large_pte(*sptep)) {
1603 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 1772 drop_spte(vcpu->kvm, sptep);
1604 kvm_flush_remote_tlbs(vcpu->kvm); 1773 kvm_flush_remote_tlbs(vcpu->kvm);
1605 } 1774 }
1606} 1775}
@@ -1622,38 +1791,39 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1622 if (child->role.access == direct_access) 1791 if (child->role.access == direct_access)
1623 return; 1792 return;
1624 1793
1625 mmu_page_remove_parent_pte(child, sptep); 1794 drop_parent_pte(child, sptep);
1626 __set_spte(sptep, shadow_trap_nonpresent_pte);
1627 kvm_flush_remote_tlbs(vcpu->kvm); 1795 kvm_flush_remote_tlbs(vcpu->kvm);
1628 } 1796 }
1629} 1797}
1630 1798
1799static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
1800 u64 *spte)
1801{
1802 u64 pte;
1803 struct kvm_mmu_page *child;
1804
1805 pte = *spte;
1806 if (is_shadow_present_pte(pte)) {
1807 if (is_last_spte(pte, sp->role.level))
1808 drop_spte(kvm, spte);
1809 else {
1810 child = page_header(pte & PT64_BASE_ADDR_MASK);
1811 drop_parent_pte(child, spte);
1812 }
1813 } else if (is_mmio_spte(pte))
1814 mmu_spte_clear_no_track(spte);
1815
1816 if (is_large_pte(pte))
1817 --kvm->stat.lpages;
1818}
1819
1631static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1820static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1632 struct kvm_mmu_page *sp) 1821 struct kvm_mmu_page *sp)
1633{ 1822{
1634 unsigned i; 1823 unsigned i;
1635 u64 *pt; 1824
1636 u64 ent; 1825 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1637 1826 mmu_page_zap_pte(kvm, sp, sp->spt + i);
1638 pt = sp->spt;
1639
1640 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1641 ent = pt[i];
1642
1643 if (is_shadow_present_pte(ent)) {
1644 if (!is_last_spte(ent, sp->role.level)) {
1645 ent &= PT64_BASE_ADDR_MASK;
1646 mmu_page_remove_parent_pte(page_header(ent),
1647 &pt[i]);
1648 } else {
1649 if (is_large_pte(ent))
1650 --kvm->stat.lpages;
1651 drop_spte(kvm, &pt[i],
1652 shadow_trap_nonpresent_pte);
1653 }
1654 }
1655 pt[i] = shadow_trap_nonpresent_pte;
1656 }
1657} 1827}
1658 1828
1659static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) 1829static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -1674,20 +1844,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1674{ 1844{
1675 u64 *parent_pte; 1845 u64 *parent_pte;
1676 1846
1677 while (sp->multimapped || sp->parent_pte) { 1847 while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
1678 if (!sp->multimapped) 1848 drop_parent_pte(sp, parent_pte);
1679 parent_pte = sp->parent_pte;
1680 else {
1681 struct kvm_pte_chain *chain;
1682
1683 chain = container_of(sp->parent_ptes.first,
1684 struct kvm_pte_chain, link);
1685 parent_pte = chain->parent_ptes[0];
1686 }
1687 BUG_ON(!parent_pte);
1688 kvm_mmu_put_page(sp, parent_pte);
1689 __set_spte(parent_pte, shadow_trap_nonpresent_pte);
1690 }
1691} 1849}
1692 1850
1693static int mmu_zap_unsync_children(struct kvm *kvm, 1851static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -1734,6 +1892,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1734 /* Count self */ 1892 /* Count self */
1735 ret++; 1893 ret++;
1736 list_move(&sp->link, invalid_list); 1894 list_move(&sp->link, invalid_list);
1895 kvm_mod_used_mmu_pages(kvm, -1);
1737 } else { 1896 } else {
1738 list_move(&sp->link, &kvm->arch.active_mmu_pages); 1897 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1739 kvm_reload_remote_mmus(kvm); 1898 kvm_reload_remote_mmus(kvm);
@@ -1744,6 +1903,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1744 return ret; 1903 return ret;
1745} 1904}
1746 1905
1906static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
1907{
1908 struct kvm_mmu_page *sp;
1909
1910 list_for_each_entry(sp, invalid_list, link)
1911 kvm_mmu_isolate_page(sp);
1912}
1913
1914static void free_pages_rcu(struct rcu_head *head)
1915{
1916 struct kvm_mmu_page *next, *sp;
1917
1918 sp = container_of(head, struct kvm_mmu_page, rcu);
1919 while (sp) {
1920 if (!list_empty(&sp->link))
1921 next = list_first_entry(&sp->link,
1922 struct kvm_mmu_page, link);
1923 else
1924 next = NULL;
1925 kvm_mmu_free_page(sp);
1926 sp = next;
1927 }
1928}
1929
1747static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1930static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1748 struct list_head *invalid_list) 1931 struct list_head *invalid_list)
1749{ 1932{
@@ -1754,10 +1937,21 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1754 1937
1755 kvm_flush_remote_tlbs(kvm); 1938 kvm_flush_remote_tlbs(kvm);
1756 1939
1940 if (atomic_read(&kvm->arch.reader_counter)) {
1941 kvm_mmu_isolate_pages(invalid_list);
1942 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1943 list_del_init(invalid_list);
1944
1945 trace_kvm_mmu_delay_free_pages(sp);
1946 call_rcu(&sp->rcu, free_pages_rcu);
1947 return;
1948 }
1949
1757 do { 1950 do {
1758 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 1951 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1759 WARN_ON(!sp->role.invalid || sp->root_count); 1952 WARN_ON(!sp->role.invalid || sp->root_count);
1760 kvm_mmu_free_page(kvm, sp); 1953 kvm_mmu_isolate_page(sp);
1954 kvm_mmu_free_page(sp);
1761 } while (!list_empty(invalid_list)); 1955 } while (!list_empty(invalid_list));
1762 1956
1763} 1957}
@@ -1783,8 +1977,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1783 page = container_of(kvm->arch.active_mmu_pages.prev, 1977 page = container_of(kvm->arch.active_mmu_pages.prev,
1784 struct kvm_mmu_page, link); 1978 struct kvm_mmu_page, link);
1785 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); 1979 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1786 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1787 } 1980 }
1981 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1788 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; 1982 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1789 } 1983 }
1790 1984
@@ -1833,20 +2027,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1833 __set_bit(slot, sp->slot_bitmap); 2027 __set_bit(slot, sp->slot_bitmap);
1834} 2028}
1835 2029
1836static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1837{
1838 int i;
1839 u64 *pt = sp->spt;
1840
1841 if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1842 return;
1843
1844 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1845 if (pt[i] == shadow_notrap_nonpresent_pte)
1846 __set_spte(&pt[i], shadow_trap_nonpresent_pte);
1847 }
1848}
1849
1850/* 2030/*
1851 * The function is based on mtrr_type_lookup() in 2031 * The function is based on mtrr_type_lookup() in
1852 * arch/x86/kernel/cpu/mtrr/generic.c 2032 * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1959,7 +2139,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1959 sp->unsync = 1; 2139 sp->unsync = 1;
1960 2140
1961 kvm_mmu_mark_parents_unsync(sp); 2141 kvm_mmu_mark_parents_unsync(sp);
1962 mmu_convert_notrap(sp);
1963} 2142}
1964 2143
1965static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) 2144static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -2002,13 +2181,16 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2002 2181
2003static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2182static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2004 unsigned pte_access, int user_fault, 2183 unsigned pte_access, int user_fault,
2005 int write_fault, int dirty, int level, 2184 int write_fault, int level,
2006 gfn_t gfn, pfn_t pfn, bool speculative, 2185 gfn_t gfn, pfn_t pfn, bool speculative,
2007 bool can_unsync, bool host_writable) 2186 bool can_unsync, bool host_writable)
2008{ 2187{
2009 u64 spte, entry = *sptep; 2188 u64 spte, entry = *sptep;
2010 int ret = 0; 2189 int ret = 0;
2011 2190
2191 if (set_mmio_spte(sptep, gfn, pfn, pte_access))
2192 return 0;
2193
2012 /* 2194 /*
2013 * We don't set the accessed bit, since we sometimes want to see 2195 * We don't set the accessed bit, since we sometimes want to see
2014 * whether the guest actually used the pte (in order to detect 2196 * whether the guest actually used the pte (in order to detect
@@ -2017,8 +2199,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2017 spte = PT_PRESENT_MASK; 2199 spte = PT_PRESENT_MASK;
2018 if (!speculative) 2200 if (!speculative)
2019 spte |= shadow_accessed_mask; 2201 spte |= shadow_accessed_mask;
2020 if (!dirty) 2202
2021 pte_access &= ~ACC_WRITE_MASK;
2022 if (pte_access & ACC_EXEC_MASK) 2203 if (pte_access & ACC_EXEC_MASK)
2023 spte |= shadow_x_mask; 2204 spte |= shadow_x_mask;
2024 else 2205 else
@@ -2045,15 +2226,24 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2045 if (level > PT_PAGE_TABLE_LEVEL && 2226 if (level > PT_PAGE_TABLE_LEVEL &&
2046 has_wrprotected_page(vcpu->kvm, gfn, level)) { 2227 has_wrprotected_page(vcpu->kvm, gfn, level)) {
2047 ret = 1; 2228 ret = 1;
2048 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2229 drop_spte(vcpu->kvm, sptep);
2049 goto done; 2230 goto done;
2050 } 2231 }
2051 2232
2052 spte |= PT_WRITABLE_MASK; 2233 spte |= PT_WRITABLE_MASK;
2053 2234
2054 if (!vcpu->arch.mmu.direct_map 2235 if (!vcpu->arch.mmu.direct_map
2055 && !(pte_access & ACC_WRITE_MASK)) 2236 && !(pte_access & ACC_WRITE_MASK)) {
2056 spte &= ~PT_USER_MASK; 2237 spte &= ~PT_USER_MASK;
2238 /*
 2239 * If we converted a user page to a kernel page so that
 2240 * the kernel can write to it when cr0.wp=0, then we must
 2241 * prevent the kernel from executing it
 2242 * if SMEP is enabled.
2243 */
2244 if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
2245 spte |= PT64_NX_MASK;
2246 }
2057 2247
2058 /* 2248 /*
2059 * Optimization: for pte sync, if spte was writable the hash 2249 * Optimization: for pte sync, if spte was writable the hash
@@ -2078,7 +2268,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2078 mark_page_dirty(vcpu->kvm, gfn); 2268 mark_page_dirty(vcpu->kvm, gfn);
2079 2269
2080set_pte: 2270set_pte:
2081 update_spte(sptep, spte); 2271 mmu_spte_update(sptep, spte);
2082 /* 2272 /*
2083 * If we overwrite a writable spte with a read-only one we 2273 * If we overwrite a writable spte with a read-only one we
2084 * should flush remote TLBs. Otherwise rmap_write_protect 2274 * should flush remote TLBs. Otherwise rmap_write_protect
@@ -2093,8 +2283,8 @@ done:
2093 2283
2094static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2284static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2095 unsigned pt_access, unsigned pte_access, 2285 unsigned pt_access, unsigned pte_access,
2096 int user_fault, int write_fault, int dirty, 2286 int user_fault, int write_fault,
2097 int *ptwrite, int level, gfn_t gfn, 2287 int *emulate, int level, gfn_t gfn,
2098 pfn_t pfn, bool speculative, 2288 pfn_t pfn, bool speculative,
2099 bool host_writable) 2289 bool host_writable)
2100{ 2290{
@@ -2117,26 +2307,28 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2117 u64 pte = *sptep; 2307 u64 pte = *sptep;
2118 2308
2119 child = page_header(pte & PT64_BASE_ADDR_MASK); 2309 child = page_header(pte & PT64_BASE_ADDR_MASK);
2120 mmu_page_remove_parent_pte(child, sptep); 2310 drop_parent_pte(child, sptep);
2121 __set_spte(sptep, shadow_trap_nonpresent_pte);
2122 kvm_flush_remote_tlbs(vcpu->kvm); 2311 kvm_flush_remote_tlbs(vcpu->kvm);
2123 } else if (pfn != spte_to_pfn(*sptep)) { 2312 } else if (pfn != spte_to_pfn(*sptep)) {
2124 pgprintk("hfn old %llx new %llx\n", 2313 pgprintk("hfn old %llx new %llx\n",
2125 spte_to_pfn(*sptep), pfn); 2314 spte_to_pfn(*sptep), pfn);
2126 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2315 drop_spte(vcpu->kvm, sptep);
2127 kvm_flush_remote_tlbs(vcpu->kvm); 2316 kvm_flush_remote_tlbs(vcpu->kvm);
2128 } else 2317 } else
2129 was_rmapped = 1; 2318 was_rmapped = 1;
2130 } 2319 }
2131 2320
2132 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2321 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2133 dirty, level, gfn, pfn, speculative, true, 2322 level, gfn, pfn, speculative, true,
2134 host_writable)) { 2323 host_writable)) {
2135 if (write_fault) 2324 if (write_fault)
2136 *ptwrite = 1; 2325 *emulate = 1;
2137 kvm_mmu_flush_tlb(vcpu); 2326 kvm_mmu_flush_tlb(vcpu);
2138 } 2327 }
2139 2328
2329 if (unlikely(is_mmio_spte(*sptep) && emulate))
2330 *emulate = 1;
2331
2140 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2332 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2141 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", 2333 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2142 is_large_pte(*sptep)? "2MB" : "4kB", 2334 is_large_pte(*sptep)? "2MB" : "4kB",
@@ -2145,11 +2337,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2145 if (!was_rmapped && is_large_pte(*sptep)) 2337 if (!was_rmapped && is_large_pte(*sptep))
2146 ++vcpu->kvm->stat.lpages; 2338 ++vcpu->kvm->stat.lpages;
2147 2339
2148 page_header_update_slot(vcpu->kvm, sptep, gfn); 2340 if (is_shadow_present_pte(*sptep)) {
2149 if (!was_rmapped) { 2341 page_header_update_slot(vcpu->kvm, sptep, gfn);
2150 rmap_count = rmap_add(vcpu, sptep, gfn); 2342 if (!was_rmapped) {
2151 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2343 rmap_count = rmap_add(vcpu, sptep, gfn);
2152 rmap_recycle(vcpu, sptep, gfn); 2344 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
2345 rmap_recycle(vcpu, sptep, gfn);
2346 }
2153 } 2347 }
2154 kvm_release_pfn_clean(pfn); 2348 kvm_release_pfn_clean(pfn);
2155 if (speculative) { 2349 if (speculative) {
@@ -2170,8 +2364,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2170 2364
2171 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); 2365 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2172 if (!slot) { 2366 if (!slot) {
2173 get_page(bad_page); 2367 get_page(fault_page);
2174 return page_to_pfn(bad_page); 2368 return page_to_pfn(fault_page);
2175 } 2369 }
2176 2370
2177 hva = gfn_to_hva_memslot(slot, gfn); 2371 hva = gfn_to_hva_memslot(slot, gfn);
@@ -2198,7 +2392,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2198 2392
2199 for (i = 0; i < ret; i++, gfn++, start++) 2393 for (i = 0; i < ret; i++, gfn++, start++)
2200 mmu_set_spte(vcpu, start, ACC_ALL, 2394 mmu_set_spte(vcpu, start, ACC_ALL,
2201 access, 0, 0, 1, NULL, 2395 access, 0, 0, NULL,
2202 sp->role.level, gfn, 2396 sp->role.level, gfn,
2203 page_to_pfn(pages[i]), true, true); 2397 page_to_pfn(pages[i]), true, true);
2204 2398
@@ -2217,7 +2411,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2217 spte = sp->spt + i; 2411 spte = sp->spt + i;
2218 2412
2219 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 2413 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2220 if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { 2414 if (is_shadow_present_pte(*spte) || spte == sptep) {
2221 if (!start) 2415 if (!start)
2222 continue; 2416 continue;
2223 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) 2417 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
@@ -2254,7 +2448,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2254{ 2448{
2255 struct kvm_shadow_walk_iterator iterator; 2449 struct kvm_shadow_walk_iterator iterator;
2256 struct kvm_mmu_page *sp; 2450 struct kvm_mmu_page *sp;
2257 int pt_write = 0; 2451 int emulate = 0;
2258 gfn_t pseudo_gfn; 2452 gfn_t pseudo_gfn;
2259 2453
2260 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2454 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -2262,14 +2456,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2262 unsigned pte_access = ACC_ALL; 2456 unsigned pte_access = ACC_ALL;
2263 2457
2264 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, 2458 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2265 0, write, 1, &pt_write, 2459 0, write, &emulate,
2266 level, gfn, pfn, prefault, map_writable); 2460 level, gfn, pfn, prefault, map_writable);
2267 direct_pte_prefetch(vcpu, iterator.sptep); 2461 direct_pte_prefetch(vcpu, iterator.sptep);
2268 ++vcpu->stat.pf_fixed; 2462 ++vcpu->stat.pf_fixed;
2269 break; 2463 break;
2270 } 2464 }
2271 2465
2272 if (*iterator.sptep == shadow_trap_nonpresent_pte) { 2466 if (!is_shadow_present_pte(*iterator.sptep)) {
2273 u64 base_addr = iterator.addr; 2467 u64 base_addr = iterator.addr;
2274 2468
2275 base_addr &= PT64_LVL_ADDR_MASK(iterator.level); 2469 base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
@@ -2283,14 +2477,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2283 return -ENOMEM; 2477 return -ENOMEM;
2284 } 2478 }
2285 2479
2286 __set_spte(iterator.sptep, 2480 mmu_spte_set(iterator.sptep,
2287 __pa(sp->spt) 2481 __pa(sp->spt)
2288 | PT_PRESENT_MASK | PT_WRITABLE_MASK 2482 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2289 | shadow_user_mask | shadow_x_mask 2483 | shadow_user_mask | shadow_x_mask
2290 | shadow_accessed_mask); 2484 | shadow_accessed_mask);
2291 } 2485 }
2292 } 2486 }
2293 return pt_write; 2487 return emulate;
2294} 2488}
2295 2489
2296static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) 2490static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -2306,16 +2500,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
2306 send_sig_info(SIGBUS, &info, tsk); 2500 send_sig_info(SIGBUS, &info, tsk);
2307} 2501}
2308 2502
2309static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 2503static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
2310{ 2504{
2311 kvm_release_pfn_clean(pfn); 2505 kvm_release_pfn_clean(pfn);
2312 if (is_hwpoison_pfn(pfn)) { 2506 if (is_hwpoison_pfn(pfn)) {
2313 kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); 2507 kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
2314 return 0; 2508 return 0;
2315 } else if (is_fault_pfn(pfn)) 2509 }
2316 return -EFAULT;
2317 2510
2318 return 1; 2511 return -EFAULT;
2319} 2512}
2320 2513
2321static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, 2514static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
@@ -2360,6 +2553,30 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2360 } 2553 }
2361} 2554}
2362 2555
2556static bool mmu_invalid_pfn(pfn_t pfn)
2557{
2558 return unlikely(is_invalid_pfn(pfn));
2559}
2560
2561static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
2562 pfn_t pfn, unsigned access, int *ret_val)
2563{
2564 bool ret = true;
2565
2566 /* The pfn is invalid, report the error! */
2567 if (unlikely(is_invalid_pfn(pfn))) {
2568 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
2569 goto exit;
2570 }
2571
2572 if (unlikely(is_noslot_pfn(pfn)))
2573 vcpu_cache_mmio_info(vcpu, gva, gfn, access);
2574
2575 ret = false;
2576exit:
2577 return ret;
2578}
2579
2363static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2580static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2364 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2581 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2365 2582
@@ -2394,9 +2611,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2394 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) 2611 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2395 return 0; 2612 return 0;
2396 2613
2397 /* mmio */ 2614 if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
2398 if (is_error_pfn(pfn)) 2615 return r;
2399 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2400 2616
2401 spin_lock(&vcpu->kvm->mmu_lock); 2617 spin_lock(&vcpu->kvm->mmu_lock);
2402 if (mmu_notifier_retry(vcpu, mmu_seq)) 2618 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2623,6 +2839,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2623 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2839 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2624 return; 2840 return;
2625 2841
2842 vcpu_clear_mmio_info(vcpu, ~0ul);
2626 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 2843 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2627 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 2844 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2628 hpa_t root = vcpu->arch.mmu.root_hpa; 2845 hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -2667,6 +2884,94 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2667 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); 2884 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2668} 2885}
2669 2886
2887static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
2888{
2889 if (direct)
2890 return vcpu_match_mmio_gpa(vcpu, addr);
2891
2892 return vcpu_match_mmio_gva(vcpu, addr);
2893}
2894
2895
2896/*
 2897 * On direct hosts, the last spte allows only two states
 2898 * for an mmio page fault:
2899 * - It is the mmio spte
2900 * - It is zapped or it is being zapped.
2901 *
2902 * This function completely checks the spte when the last spte
2903 * is not the mmio spte.
2904 */
2905static bool check_direct_spte_mmio_pf(u64 spte)
2906{
2907 return __check_direct_spte_mmio_pf(spte);
2908}
2909
2910static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
2911{
2912 struct kvm_shadow_walk_iterator iterator;
2913 u64 spte = 0ull;
2914
2915 walk_shadow_page_lockless_begin(vcpu);
2916 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
2917 if (!is_shadow_present_pte(spte))
2918 break;
2919 walk_shadow_page_lockless_end(vcpu);
2920
2921 return spte;
2922}
2923
2924/*
 2925 * If it is a real mmio page fault, return 1 and emulate the
 2926 * instruction directly; return 0 to let the CPU fault again on
 2927 * the address, or -1 if a bug is detected.
2928 */
2929int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
2930{
2931 u64 spte;
2932
2933 if (quickly_check_mmio_pf(vcpu, addr, direct))
2934 return 1;
2935
2936 spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
2937
2938 if (is_mmio_spte(spte)) {
2939 gfn_t gfn = get_mmio_spte_gfn(spte);
2940 unsigned access = get_mmio_spte_access(spte);
2941
2942 if (direct)
2943 addr = 0;
2944
2945 trace_handle_mmio_page_fault(addr, gfn, access);
2946 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
2947 return 1;
2948 }
2949
2950 /*
 2951 * It's ok if the gva is remapped by other cpus on a shadow-paging
 2952 * guest, but it's a BUG if the gfn is not an mmio page.
2953 */
2954 if (direct && !check_direct_spte_mmio_pf(spte))
2955 return -1;
2956
2957 /*
 2958 * If the page table is zapped by other cpus, let the CPU
 2959 * fault again on the address.
2960 */
2961 return 0;
2962}
2963EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
2964
2965static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
2966 u32 error_code, bool direct)
2967{
2968 int ret;
2969
2970 ret = handle_mmio_page_fault_common(vcpu, addr, direct);
2971 WARN_ON(ret < 0);
2972 return ret;
2973}
2974
2670static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2975static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2671 u32 error_code, bool prefault) 2976 u32 error_code, bool prefault)
2672{ 2977{
@@ -2674,6 +2979,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2674 int r; 2979 int r;
2675 2980
2676 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); 2981 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
2982
2983 if (unlikely(error_code & PFERR_RSVD_MASK))
2984 return handle_mmio_page_fault(vcpu, gva, error_code, true);
2985
2677 r = mmu_topup_memory_caches(vcpu); 2986 r = mmu_topup_memory_caches(vcpu);
2678 if (r) 2987 if (r)
2679 return r; 2988 return r;
@@ -2750,6 +3059,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2750 ASSERT(vcpu); 3059 ASSERT(vcpu);
2751 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3060 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2752 3061
3062 if (unlikely(error_code & PFERR_RSVD_MASK))
3063 return handle_mmio_page_fault(vcpu, gpa, error_code, true);
3064
2753 r = mmu_topup_memory_caches(vcpu); 3065 r = mmu_topup_memory_caches(vcpu);
2754 if (r) 3066 if (r)
2755 return r; 3067 return r;
@@ -2767,9 +3079,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2767 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) 3079 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2768 return 0; 3080 return 0;
2769 3081
2770 /* mmio */ 3082 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
2771 if (is_error_pfn(pfn)) 3083 return r;
2772 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 3084
2773 spin_lock(&vcpu->kvm->mmu_lock); 3085 spin_lock(&vcpu->kvm->mmu_lock);
2774 if (mmu_notifier_retry(vcpu, mmu_seq)) 3086 if (mmu_notifier_retry(vcpu, mmu_seq))
2775 goto out_unlock; 3087 goto out_unlock;
@@ -2800,7 +3112,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2800 context->page_fault = nonpaging_page_fault; 3112 context->page_fault = nonpaging_page_fault;
2801 context->gva_to_gpa = nonpaging_gva_to_gpa; 3113 context->gva_to_gpa = nonpaging_gva_to_gpa;
2802 context->free = nonpaging_free; 3114 context->free = nonpaging_free;
2803 context->prefetch_page = nonpaging_prefetch_page;
2804 context->sync_page = nonpaging_sync_page; 3115 context->sync_page = nonpaging_sync_page;
2805 context->invlpg = nonpaging_invlpg; 3116 context->invlpg = nonpaging_invlpg;
2806 context->update_pte = nonpaging_update_pte; 3117 context->update_pte = nonpaging_update_pte;
@@ -2848,6 +3159,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2848 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; 3159 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2849} 3160}
2850 3161
3162static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
3163 int *nr_present)
3164{
3165 if (unlikely(is_mmio_spte(*sptep))) {
3166 if (gfn != get_mmio_spte_gfn(*sptep)) {
3167 mmu_spte_clear_no_track(sptep);
3168 return true;
3169 }
3170
3171 (*nr_present)++;
3172 mark_mmio_spte(sptep, gfn, access);
3173 return true;
3174 }
3175
3176 return false;
3177}
3178
2851#define PTTYPE 64 3179#define PTTYPE 64
2852#include "paging_tmpl.h" 3180#include "paging_tmpl.h"
2853#undef PTTYPE 3181#undef PTTYPE
@@ -2930,7 +3258,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2930 context->new_cr3 = paging_new_cr3; 3258 context->new_cr3 = paging_new_cr3;
2931 context->page_fault = paging64_page_fault; 3259 context->page_fault = paging64_page_fault;
2932 context->gva_to_gpa = paging64_gva_to_gpa; 3260 context->gva_to_gpa = paging64_gva_to_gpa;
2933 context->prefetch_page = paging64_prefetch_page;
2934 context->sync_page = paging64_sync_page; 3261 context->sync_page = paging64_sync_page;
2935 context->invlpg = paging64_invlpg; 3262 context->invlpg = paging64_invlpg;
2936 context->update_pte = paging64_update_pte; 3263 context->update_pte = paging64_update_pte;
@@ -2959,7 +3286,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
2959 context->page_fault = paging32_page_fault; 3286 context->page_fault = paging32_page_fault;
2960 context->gva_to_gpa = paging32_gva_to_gpa; 3287 context->gva_to_gpa = paging32_gva_to_gpa;
2961 context->free = paging_free; 3288 context->free = paging_free;
2962 context->prefetch_page = paging32_prefetch_page;
2963 context->sync_page = paging32_sync_page; 3289 context->sync_page = paging32_sync_page;
2964 context->invlpg = paging32_invlpg; 3290 context->invlpg = paging32_invlpg;
2965 context->update_pte = paging32_update_pte; 3291 context->update_pte = paging32_update_pte;
@@ -2984,7 +3310,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2984 context->new_cr3 = nonpaging_new_cr3; 3310 context->new_cr3 = nonpaging_new_cr3;
2985 context->page_fault = tdp_page_fault; 3311 context->page_fault = tdp_page_fault;
2986 context->free = nonpaging_free; 3312 context->free = nonpaging_free;
2987 context->prefetch_page = nonpaging_prefetch_page;
2988 context->sync_page = nonpaging_sync_page; 3313 context->sync_page = nonpaging_sync_page;
2989 context->invlpg = nonpaging_invlpg; 3314 context->invlpg = nonpaging_invlpg;
2990 context->update_pte = nonpaging_update_pte; 3315 context->update_pte = nonpaging_update_pte;
@@ -3023,6 +3348,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3023int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 3348int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3024{ 3349{
3025 int r; 3350 int r;
3351 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
3026 ASSERT(vcpu); 3352 ASSERT(vcpu);
3027 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3353 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3028 3354
@@ -3037,6 +3363,8 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3037 3363
3038 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3364 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
3039 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3365 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
3366 vcpu->arch.mmu.base_role.smep_andnot_wp
3367 = smep && !is_write_protection(vcpu);
3040 3368
3041 return r; 3369 return r;
3042} 3370}
@@ -3141,27 +3469,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
3141} 3469}
3142EXPORT_SYMBOL_GPL(kvm_mmu_unload); 3470EXPORT_SYMBOL_GPL(kvm_mmu_unload);
3143 3471
3144static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3145 struct kvm_mmu_page *sp,
3146 u64 *spte)
3147{
3148 u64 pte;
3149 struct kvm_mmu_page *child;
3150
3151 pte = *spte;
3152 if (is_shadow_present_pte(pte)) {
3153 if (is_last_spte(pte, sp->role.level))
3154 drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
3155 else {
3156 child = page_header(pte & PT64_BASE_ADDR_MASK);
3157 mmu_page_remove_parent_pte(child, spte);
3158 }
3159 }
3160 __set_spte(spte, shadow_trap_nonpresent_pte);
3161 if (is_large_pte(pte))
3162 --vcpu->kvm->stat.lpages;
3163}
3164
3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 3472static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3166 struct kvm_mmu_page *sp, u64 *spte, 3473 struct kvm_mmu_page *sp, u64 *spte,
3167 const void *new) 3474 const void *new)
@@ -3233,6 +3540,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3233 int level, npte, invlpg_counter, r, flooded = 0; 3540 int level, npte, invlpg_counter, r, flooded = 0;
3234 bool remote_flush, local_flush, zap_page; 3541 bool remote_flush, local_flush, zap_page;
3235 3542
3543 /*
 3544 * If we don't have indirect shadow pages, it means no page is
 3545 * write-protected, so we can simply return.
3546 */
3547 if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
3548 return;
3549
3236 zap_page = remote_flush = local_flush = false; 3550 zap_page = remote_flush = local_flush = false;
3237 offset = offset_in_page(gpa); 3551 offset = offset_in_page(gpa);
3238 3552
@@ -3336,7 +3650,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3336 spte = &sp->spt[page_offset / sizeof(*spte)]; 3650 spte = &sp->spt[page_offset / sizeof(*spte)];
3337 while (npte--) { 3651 while (npte--) {
3338 entry = *spte; 3652 entry = *spte;
3339 mmu_pte_write_zap_pte(vcpu, sp, spte); 3653 mmu_page_zap_pte(vcpu->kvm, sp, spte);
3340 if (gentry && 3654 if (gentry &&
3341 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3655 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3342 & mask.word)) 3656 & mask.word))
@@ -3380,9 +3694,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3380 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 3694 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
3381 struct kvm_mmu_page, link); 3695 struct kvm_mmu_page, link);
3382 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 3696 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
3383 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3384 ++vcpu->kvm->stat.mmu_recycled; 3697 ++vcpu->kvm->stat.mmu_recycled;
3385 } 3698 }
3699 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3386} 3700}
3387 3701
3388int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, 3702int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
@@ -3506,15 +3820,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3506 continue; 3820 continue;
3507 3821
3508 if (is_large_pte(pt[i])) { 3822 if (is_large_pte(pt[i])) {
3509 drop_spte(kvm, &pt[i], 3823 drop_spte(kvm, &pt[i]);
3510 shadow_trap_nonpresent_pte);
3511 --kvm->stat.lpages; 3824 --kvm->stat.lpages;
3512 continue; 3825 continue;
3513 } 3826 }
3514 3827
3515 /* avoid RMW */ 3828 /* avoid RMW */
3516 if (is_writable_pte(pt[i])) 3829 if (is_writable_pte(pt[i]))
3517 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); 3830 mmu_spte_update(&pt[i],
3831 pt[i] & ~PT_WRITABLE_MASK);
3518 } 3832 }
3519 } 3833 }
3520 kvm_flush_remote_tlbs(kvm); 3834 kvm_flush_remote_tlbs(kvm);
@@ -3590,25 +3904,18 @@ static struct shrinker mmu_shrinker = {
3590 3904
3591static void mmu_destroy_caches(void) 3905static void mmu_destroy_caches(void)
3592{ 3906{
3593 if (pte_chain_cache) 3907 if (pte_list_desc_cache)
3594 kmem_cache_destroy(pte_chain_cache); 3908 kmem_cache_destroy(pte_list_desc_cache);
3595 if (rmap_desc_cache)
3596 kmem_cache_destroy(rmap_desc_cache);
3597 if (mmu_page_header_cache) 3909 if (mmu_page_header_cache)
3598 kmem_cache_destroy(mmu_page_header_cache); 3910 kmem_cache_destroy(mmu_page_header_cache);
3599} 3911}
3600 3912
3601int kvm_mmu_module_init(void) 3913int kvm_mmu_module_init(void)
3602{ 3914{
3603 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3915 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
3604 sizeof(struct kvm_pte_chain), 3916 sizeof(struct pte_list_desc),
3605 0, 0, NULL);
3606 if (!pte_chain_cache)
3607 goto nomem;
3608 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
3609 sizeof(struct kvm_rmap_desc),
3610 0, 0, NULL); 3917 0, 0, NULL);
3611 if (!rmap_desc_cache) 3918 if (!pte_list_desc_cache)
3612 goto nomem; 3919 goto nomem;
3613 3920
3614 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", 3921 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
@@ -3775,16 +4082,17 @@ out:
3775int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) 4082int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3776{ 4083{
3777 struct kvm_shadow_walk_iterator iterator; 4084 struct kvm_shadow_walk_iterator iterator;
4085 u64 spte;
3778 int nr_sptes = 0; 4086 int nr_sptes = 0;
3779 4087
3780 spin_lock(&vcpu->kvm->mmu_lock); 4088 walk_shadow_page_lockless_begin(vcpu);
3781 for_each_shadow_entry(vcpu, addr, iterator) { 4089 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
3782 sptes[iterator.level-1] = *iterator.sptep; 4090 sptes[iterator.level-1] = spte;
3783 nr_sptes++; 4091 nr_sptes++;
3784 if (!is_shadow_present_pte(*iterator.sptep)) 4092 if (!is_shadow_present_pte(spte))
3785 break; 4093 break;
3786 } 4094 }
3787 spin_unlock(&vcpu->kvm->mmu_lock); 4095 walk_shadow_page_lockless_end(vcpu);
3788 4096
3789 return nr_sptes; 4097 return nr_sptes;
3790} 4098}
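
A minimal userspace C sketch of the mmio-spte idea used throughout the mmu.c hunks above: a not-present spte is tagged with a reserved "mmio" bit and carries the gfn and access bits inline, so handle_mmio_page_fault_common() can recognise and emulate an mmio access without walking the guest page tables. The bit layout, mask names and helper bodies below are assumptions for illustration only and do not reproduce the kernel's actual encoding.

/* toy_mmio_spte.c - illustrative only, not the kernel's encoding */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_MMIO_MASK    (1ull << 62)                /* assumed tag bit      */
#define SPTE_ACCESS_SHIFT 52                          /* assumed access field */
#define SPTE_ACCESS_MASK  (0x7ull << SPTE_ACCESS_SHIFT)

/* Build a not-present spte that caches the gfn and the access bits. */
static uint64_t mark_mmio_spte(uint64_t gfn, unsigned access)
{
	return SPTE_MMIO_MASK |
	       ((uint64_t)access << SPTE_ACCESS_SHIFT) |
	       (gfn << 12);                             /* gfn kept in the pfn field */
}

static bool is_mmio_spte(uint64_t spte)
{
	return (spte & SPTE_MMIO_MASK) != 0;
}

static uint64_t get_mmio_spte_gfn(uint64_t spte)
{
	return (spte & ~(SPTE_MMIO_MASK | SPTE_ACCESS_MASK)) >> 12;
}

static unsigned get_mmio_spte_access(uint64_t spte)
{
	return (unsigned)((spte & SPTE_ACCESS_MASK) >> SPTE_ACCESS_SHIFT);
}

int main(void)
{
	uint64_t spte = mark_mmio_spte(0xfee00, 0x3);

	/* On a fault with PFERR_RSVD_MASK set, the spte alone is enough. */
	if (is_mmio_spte(spte))
		printf("mmio fault: gfn=%llx access=%x\n",
		       (unsigned long long)get_mmio_spte_gfn(spte),
		       get_mmio_spte_access(spte));
	return 0;
}
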
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca85d3e7..e374db9af021 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,6 +49,8 @@
49#define PFERR_FETCH_MASK (1U << 4) 49#define PFERR_FETCH_MASK (1U << 4)
50 50
51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); 51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
52void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
53int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
52int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 54int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
53 55
54static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 56static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
@@ -76,4 +78,27 @@ static inline int is_present_gpte(unsigned long pte)
76 return pte & PT_PRESENT_MASK; 78 return pte & PT_PRESENT_MASK;
77} 79}
78 80
81static inline int is_writable_pte(unsigned long pte)
82{
83 return pte & PT_WRITABLE_MASK;
84}
85
86static inline bool is_write_protection(struct kvm_vcpu *vcpu)
87{
88 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
89}
90
91static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
92 bool write_fault, bool user_fault,
93 unsigned long pte)
94{
95 if (unlikely(write_fault && !is_writable_pte(pte)
96 && (user_fault || is_write_protection(vcpu))))
97 return false;
98
99 if (unlikely(user_fault && !(pte & PT_USER_MASK)))
100 return false;
101
102 return true;
103}
79#endif 104#endif
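
The mmu.h hunk above moves is_writable_pte() and is_write_protection() into the header and adds check_write_user_access(). A minimal userspace sketch of that permission check, with illustrative PT_* masks and CR0.WP passed in as a flag rather than read from the vcpu:

/* toy_write_user_check.c - mirrors the logic of the new helper, simplified */
#include <stdbool.h>
#include <stdio.h>

#define PT_WRITABLE_MASK (1ul << 1)   /* illustrative bit positions */
#define PT_USER_MASK     (1ul << 2)

static bool check_write_user_access(bool cr0_wp, bool write_fault,
				    bool user_fault, unsigned long pte)
{
	/* A write to a non-writable pte fails from user mode, or when CR0.WP=1. */
	if (write_fault && !(pte & PT_WRITABLE_MASK) &&
	    (user_fault || cr0_wp))
		return false;

	/* A user-mode access needs the user bit in the pte. */
	if (user_fault && !(pte & PT_USER_MASK))
		return false;

	return true;
}

int main(void)
{
	/* Kernel write to a read-only user pte with CR0.WP=0 is allowed... */
	printf("%d\n", check_write_user_access(false, true, false, PT_USER_MASK));
	/* ...but the same write from user mode is not. */
	printf("%d\n", check_write_user_access(false, true, true, PT_USER_MASK));
	return 0;
}
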
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 5f6223b8bcf7..2460a265be23 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -99,18 +99,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
99 "level = %d\n", sp, level); 99 "level = %d\n", sp, level);
100 return; 100 return;
101 } 101 }
102
103 if (*sptep == shadow_notrap_nonpresent_pte) {
104 audit_printk(vcpu->kvm, "notrap spte in unsync "
105 "sp: %p\n", sp);
106 return;
107 }
108 }
109
110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
111 audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
112 sp);
113 return;
114 } 102 }
115 103
116 if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) 104 if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b60b4fdb3eda..eed67f34146d 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -196,6 +196,54 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
196 TP_ARGS(sp) 196 TP_ARGS(sp)
197); 197);
198 198
199DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
200 TP_PROTO(struct kvm_mmu_page *sp),
201
202 TP_ARGS(sp)
203);
204
205TRACE_EVENT(
206 mark_mmio_spte,
207 TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
208 TP_ARGS(sptep, gfn, access),
209
210 TP_STRUCT__entry(
211 __field(void *, sptep)
212 __field(gfn_t, gfn)
213 __field(unsigned, access)
214 ),
215
216 TP_fast_assign(
217 __entry->sptep = sptep;
218 __entry->gfn = gfn;
219 __entry->access = access;
220 ),
221
222 TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
223 __entry->access)
224);
225
226TRACE_EVENT(
227 handle_mmio_page_fault,
228 TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
229 TP_ARGS(addr, gfn, access),
230
231 TP_STRUCT__entry(
232 __field(u64, addr)
233 __field(gfn_t, gfn)
234 __field(unsigned, access)
235 ),
236
237 TP_fast_assign(
238 __entry->addr = addr;
239 __entry->gfn = gfn;
240 __entry->access = access;
241 ),
242
243 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
244 __entry->access)
245);
246
199TRACE_EVENT( 247TRACE_EVENT(
200 kvm_mmu_audit, 248 kvm_mmu_audit,
201 TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), 249 TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9d03ad4dd5ec..507e2b844cfa 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -101,11 +101,15 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
101 return (ret != orig_pte); 101 return (ret != orig_pte);
102} 102}
103 103
104static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) 104static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
105 bool last)
105{ 106{
106 unsigned access; 107 unsigned access;
107 108
108 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; 109 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
110 if (last && !is_dirty_gpte(gpte))
111 access &= ~ACC_WRITE_MASK;
112
109#if PTTYPE == 64 113#if PTTYPE == 64
110 if (vcpu->arch.mmu.nx) 114 if (vcpu->arch.mmu.nx)
111 access &= ~(gpte >> PT64_NX_SHIFT); 115 access &= ~(gpte >> PT64_NX_SHIFT);
@@ -113,6 +117,24 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
113 return access; 117 return access;
114} 118}
115 119
120static bool FNAME(is_last_gpte)(struct guest_walker *walker,
121 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
122 pt_element_t gpte)
123{
124 if (walker->level == PT_PAGE_TABLE_LEVEL)
125 return true;
126
127 if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
128 (PTTYPE == 64 || is_pse(vcpu)))
129 return true;
130
131 if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
132 (mmu->root_level == PT64_ROOT_LEVEL))
133 return true;
134
135 return false;
136}
137
116/* 138/*
117 * Fetch a guest pte for a guest virtual address 139 * Fetch a guest pte for a guest virtual address
118 */ 140 */
@@ -125,18 +147,17 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
125 gfn_t table_gfn; 147 gfn_t table_gfn;
126 unsigned index, pt_access, uninitialized_var(pte_access); 148 unsigned index, pt_access, uninitialized_var(pte_access);
127 gpa_t pte_gpa; 149 gpa_t pte_gpa;
128 bool eperm, present, rsvd_fault; 150 bool eperm;
129 int offset, write_fault, user_fault, fetch_fault; 151 int offset;
130 152 const int write_fault = access & PFERR_WRITE_MASK;
131 write_fault = access & PFERR_WRITE_MASK; 153 const int user_fault = access & PFERR_USER_MASK;
132 user_fault = access & PFERR_USER_MASK; 154 const int fetch_fault = access & PFERR_FETCH_MASK;
133 fetch_fault = access & PFERR_FETCH_MASK; 155 u16 errcode = 0;
134 156
135 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 157 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
136 fetch_fault); 158 fetch_fault);
137walk: 159retry_walk:
138 present = true; 160 eperm = false;
139 eperm = rsvd_fault = false;
140 walker->level = mmu->root_level; 161 walker->level = mmu->root_level;
141 pte = mmu->get_cr3(vcpu); 162 pte = mmu->get_cr3(vcpu);
142 163
@@ -144,10 +165,8 @@ walk:
144 if (walker->level == PT32E_ROOT_LEVEL) { 165 if (walker->level == PT32E_ROOT_LEVEL) {
145 pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); 166 pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
146 trace_kvm_mmu_paging_element(pte, walker->level); 167 trace_kvm_mmu_paging_element(pte, walker->level);
147 if (!is_present_gpte(pte)) { 168 if (!is_present_gpte(pte))
148 present = false;
149 goto error; 169 goto error;
150 }
151 --walker->level; 170 --walker->level;
152 } 171 }
153#endif 172#endif
@@ -170,42 +189,31 @@ walk:
170 189
171 real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), 190 real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
172 PFERR_USER_MASK|PFERR_WRITE_MASK); 191 PFERR_USER_MASK|PFERR_WRITE_MASK);
173 if (unlikely(real_gfn == UNMAPPED_GVA)) { 192 if (unlikely(real_gfn == UNMAPPED_GVA))
174 present = false; 193 goto error;
175 break;
176 }
177 real_gfn = gpa_to_gfn(real_gfn); 194 real_gfn = gpa_to_gfn(real_gfn);
178 195
179 host_addr = gfn_to_hva(vcpu->kvm, real_gfn); 196 host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
180 if (unlikely(kvm_is_error_hva(host_addr))) { 197 if (unlikely(kvm_is_error_hva(host_addr)))
181 present = false; 198 goto error;
182 break;
183 }
184 199
185 ptep_user = (pt_element_t __user *)((void *)host_addr + offset); 200 ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
186 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) { 201 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
187 present = false; 202 goto error;
188 break;
189 }
190 203
191 trace_kvm_mmu_paging_element(pte, walker->level); 204 trace_kvm_mmu_paging_element(pte, walker->level);
192 205
193 if (unlikely(!is_present_gpte(pte))) { 206 if (unlikely(!is_present_gpte(pte)))
194 present = false; 207 goto error;
195 break;
196 }
197 208
198 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, 209 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
199 walker->level))) { 210 walker->level))) {
200 rsvd_fault = true; 211 errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
201 break; 212 goto error;
202 } 213 }
203 214
204 if (unlikely(write_fault && !is_writable_pte(pte) 215 if (!check_write_user_access(vcpu, write_fault, user_fault,
205 && (user_fault || is_write_protection(vcpu)))) 216 pte))
206 eperm = true;
207
208 if (unlikely(user_fault && !(pte & PT_USER_MASK)))
209 eperm = true; 217 eperm = true;
210 218
211#if PTTYPE == 64 219#if PTTYPE == 64
@@ -213,39 +221,35 @@ walk:
213 eperm = true; 221 eperm = true;
214#endif 222#endif
215 223
216 if (!eperm && !rsvd_fault 224 if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
217 && unlikely(!(pte & PT_ACCESSED_MASK))) {
218 int ret; 225 int ret;
219 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 226 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
220 sizeof(pte)); 227 sizeof(pte));
221 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, 228 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
222 pte, pte|PT_ACCESSED_MASK); 229 pte, pte|PT_ACCESSED_MASK);
223 if (unlikely(ret < 0)) { 230 if (unlikely(ret < 0))
224 present = false; 231 goto error;
225 break; 232 else if (ret)
226 } else if (ret) 233 goto retry_walk;
227 goto walk;
228 234
229 mark_page_dirty(vcpu->kvm, table_gfn); 235 mark_page_dirty(vcpu->kvm, table_gfn);
230 pte |= PT_ACCESSED_MASK; 236 pte |= PT_ACCESSED_MASK;
231 } 237 }
232 238
233 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
234
235 walker->ptes[walker->level - 1] = pte; 239 walker->ptes[walker->level - 1] = pte;
236 240
237 if ((walker->level == PT_PAGE_TABLE_LEVEL) || 241 if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) {
238 ((walker->level == PT_DIRECTORY_LEVEL) &&
239 is_large_pte(pte) &&
240 (PTTYPE == 64 || is_pse(vcpu))) ||
241 ((walker->level == PT_PDPE_LEVEL) &&
242 is_large_pte(pte) &&
243 mmu->root_level == PT64_ROOT_LEVEL)) {
244 int lvl = walker->level; 242 int lvl = walker->level;
245 gpa_t real_gpa; 243 gpa_t real_gpa;
246 gfn_t gfn; 244 gfn_t gfn;
247 u32 ac; 245 u32 ac;
248 246
 247 /* check if the kernel is fetching from a user page */
248 if (unlikely(pte_access & PT_USER_MASK) &&
249 kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
250 if (fetch_fault && !user_fault)
251 eperm = true;
252
249 gfn = gpte_to_gfn_lvl(pte, lvl); 253 gfn = gpte_to_gfn_lvl(pte, lvl);
250 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; 254 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
251 255
@@ -266,12 +270,14 @@ walk:
266 break; 270 break;
267 } 271 }
268 272
269 pt_access = pte_access; 273 pt_access &= FNAME(gpte_access)(vcpu, pte, false);
270 --walker->level; 274 --walker->level;
271 } 275 }
272 276
273 if (unlikely(!present || eperm || rsvd_fault)) 277 if (unlikely(eperm)) {
278 errcode |= PFERR_PRESENT_MASK;
274 goto error; 279 goto error;
280 }
275 281
276 if (write_fault && unlikely(!is_dirty_gpte(pte))) { 282 if (write_fault && unlikely(!is_dirty_gpte(pte))) {
277 int ret; 283 int ret;
@@ -279,17 +285,17 @@ walk:
279 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 285 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
280 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, 286 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
281 pte, pte|PT_DIRTY_MASK); 287 pte, pte|PT_DIRTY_MASK);
282 if (unlikely(ret < 0)) { 288 if (unlikely(ret < 0))
283 present = false;
284 goto error; 289 goto error;
285 } else if (ret) 290 else if (ret)
286 goto walk; 291 goto retry_walk;
287 292
288 mark_page_dirty(vcpu->kvm, table_gfn); 293 mark_page_dirty(vcpu->kvm, table_gfn);
289 pte |= PT_DIRTY_MASK; 294 pte |= PT_DIRTY_MASK;
290 walker->ptes[walker->level - 1] = pte; 295 walker->ptes[walker->level - 1] = pte;
291 } 296 }
292 297
298 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
293 walker->pt_access = pt_access; 299 walker->pt_access = pt_access;
294 walker->pte_access = pte_access; 300 walker->pte_access = pte_access;
295 pgprintk("%s: pte %llx pte_access %x pt_access %x\n", 301 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
@@ -297,19 +303,14 @@ walk:
297 return 1; 303 return 1;
298 304
299error: 305error:
306 errcode |= write_fault | user_fault;
307 if (fetch_fault && (mmu->nx ||
308 kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
309 errcode |= PFERR_FETCH_MASK;
310
300 walker->fault.vector = PF_VECTOR; 311 walker->fault.vector = PF_VECTOR;
301 walker->fault.error_code_valid = true; 312 walker->fault.error_code_valid = true;
302 walker->fault.error_code = 0; 313 walker->fault.error_code = errcode;
303 if (present)
304 walker->fault.error_code |= PFERR_PRESENT_MASK;
305
306 walker->fault.error_code |= write_fault | user_fault;
307
308 if (fetch_fault && mmu->nx)
309 walker->fault.error_code |= PFERR_FETCH_MASK;
310 if (rsvd_fault)
311 walker->fault.error_code |= PFERR_RSVD_MASK;
312
313 walker->fault.address = addr; 314 walker->fault.address = addr;
314 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; 315 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
315 316
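A userspace sketch of how the rewritten error path above assembles the #PF error code: write_fault and user_fault already arrive as PFERR_WRITE_MASK and PFERR_USER_MASK (they are masked out of the incoming error code), so they can be OR'ed in directly; reserved-bit and permission faults have set PFERR_PRESENT_MASK earlier; the fetch bit is reported only when NX or SMEP makes instruction fetches distinguishable. The PFERR_* values mirror the architectural #PF error-code bits.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK   (1U << 1)
#define PFERR_USER_MASK    (1U << 2)
#define PFERR_RSVD_MASK    (1U << 3)
#define PFERR_FETCH_MASK   (1U << 4)

static uint32_t walk_error_code(uint32_t errcode, uint32_t write_fault,
				uint32_t user_fault, bool fetch_fault,
				bool nx_or_smep)
{
	errcode |= write_fault | user_fault;
	if (fetch_fault && nx_or_smep)
		errcode |= PFERR_FETCH_MASK;
	return errcode;
}

int main(void)
{
	/* permission fault on a user write: present + write + user */
	uint32_t ec = walk_error_code(PFERR_PRESENT_MASK, PFERR_WRITE_MASK,
				      PFERR_USER_MASK, false, true);

	printf("error code = %#x\n", (unsigned)ec);	/* prints 0x7 */
	return 0;
}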
@@ -336,16 +337,11 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
336 struct kvm_mmu_page *sp, u64 *spte, 337 struct kvm_mmu_page *sp, u64 *spte,
337 pt_element_t gpte) 338 pt_element_t gpte)
338{ 339{
339 u64 nonpresent = shadow_trap_nonpresent_pte;
340
341 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) 340 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
342 goto no_present; 341 goto no_present;
343 342
344 if (!is_present_gpte(gpte)) { 343 if (!is_present_gpte(gpte))
345 if (!sp->unsync)
346 nonpresent = shadow_notrap_nonpresent_pte;
347 goto no_present; 344 goto no_present;
348 }
349 345
350 if (!(gpte & PT_ACCESSED_MASK)) 346 if (!(gpte & PT_ACCESSED_MASK))
351 goto no_present; 347 goto no_present;
@@ -353,7 +349,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
353 return false; 349 return false;
354 350
355no_present: 351no_present:
356 drop_spte(vcpu->kvm, spte, nonpresent); 352 drop_spte(vcpu->kvm, spte);
357 return true; 353 return true;
358} 354}
359 355
@@ -369,9 +365,9 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
369 return; 365 return;
370 366
371 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 367 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
372 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 368 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
373 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); 369 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
374 if (is_error_pfn(pfn)) { 370 if (mmu_invalid_pfn(pfn)) {
375 kvm_release_pfn_clean(pfn); 371 kvm_release_pfn_clean(pfn);
376 return; 372 return;
377 } 373 }
@@ -381,7 +377,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
381 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 377 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
382 */ 378 */
383 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 379 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
384 is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, 380 NULL, PT_PAGE_TABLE_LEVEL,
385 gpte_to_gfn(gpte), pfn, true, true); 381 gpte_to_gfn(gpte), pfn, true, true);
386} 382}
387 383
@@ -432,12 +428,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
432 unsigned pte_access; 428 unsigned pte_access;
433 gfn_t gfn; 429 gfn_t gfn;
434 pfn_t pfn; 430 pfn_t pfn;
435 bool dirty;
436 431
437 if (spte == sptep) 432 if (spte == sptep)
438 continue; 433 continue;
439 434
440 if (*spte != shadow_trap_nonpresent_pte) 435 if (is_shadow_present_pte(*spte))
441 continue; 436 continue;
442 437
443 gpte = gptep[i]; 438 gpte = gptep[i];
@@ -445,18 +440,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
445 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) 440 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
446 continue; 441 continue;
447 442
448 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 443 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
444 true);
449 gfn = gpte_to_gfn(gpte); 445 gfn = gpte_to_gfn(gpte);
450 dirty = is_dirty_gpte(gpte);
451 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 446 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
452 (pte_access & ACC_WRITE_MASK) && dirty); 447 pte_access & ACC_WRITE_MASK);
453 if (is_error_pfn(pfn)) { 448 if (mmu_invalid_pfn(pfn)) {
454 kvm_release_pfn_clean(pfn); 449 kvm_release_pfn_clean(pfn);
455 break; 450 break;
456 } 451 }
457 452
458 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 453 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
459 dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, 454 NULL, PT_PAGE_TABLE_LEVEL, gfn,
460 pfn, true, true); 455 pfn, true, true);
461 } 456 }
462} 457}
@@ -467,12 +462,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
467static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 462static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
468 struct guest_walker *gw, 463 struct guest_walker *gw,
469 int user_fault, int write_fault, int hlevel, 464 int user_fault, int write_fault, int hlevel,
470 int *ptwrite, pfn_t pfn, bool map_writable, 465 int *emulate, pfn_t pfn, bool map_writable,
471 bool prefault) 466 bool prefault)
472{ 467{
473 unsigned access = gw->pt_access; 468 unsigned access = gw->pt_access;
474 struct kvm_mmu_page *sp = NULL; 469 struct kvm_mmu_page *sp = NULL;
475 bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
476 int top_level; 470 int top_level;
477 unsigned direct_access; 471 unsigned direct_access;
478 struct kvm_shadow_walk_iterator it; 472 struct kvm_shadow_walk_iterator it;
@@ -480,9 +474,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
480 if (!is_present_gpte(gw->ptes[gw->level - 1])) 474 if (!is_present_gpte(gw->ptes[gw->level - 1]))
481 return NULL; 475 return NULL;
482 476
483 direct_access = gw->pt_access & gw->pte_access; 477 direct_access = gw->pte_access;
484 if (!dirty)
485 direct_access &= ~ACC_WRITE_MASK;
486 478
487 top_level = vcpu->arch.mmu.root_level; 479 top_level = vcpu->arch.mmu.root_level;
488 if (top_level == PT32E_ROOT_LEVEL) 480 if (top_level == PT32E_ROOT_LEVEL)
@@ -540,8 +532,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
540 link_shadow_page(it.sptep, sp); 532 link_shadow_page(it.sptep, sp);
541 } 533 }
542 534
543 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 535 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
544 user_fault, write_fault, dirty, ptwrite, it.level, 536 user_fault, write_fault, emulate, it.level,
545 gw->gfn, pfn, prefault, map_writable); 537 gw->gfn, pfn, prefault, map_writable);
546 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 538 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
547 539
@@ -575,7 +567,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
575 int user_fault = error_code & PFERR_USER_MASK; 567 int user_fault = error_code & PFERR_USER_MASK;
576 struct guest_walker walker; 568 struct guest_walker walker;
577 u64 *sptep; 569 u64 *sptep;
578 int write_pt = 0; 570 int emulate = 0;
579 int r; 571 int r;
580 pfn_t pfn; 572 pfn_t pfn;
581 int level = PT_PAGE_TABLE_LEVEL; 573 int level = PT_PAGE_TABLE_LEVEL;
@@ -585,6 +577,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
585 577
586 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 578 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
587 579
580 if (unlikely(error_code & PFERR_RSVD_MASK))
581 return handle_mmio_page_fault(vcpu, addr, error_code,
582 mmu_is_nested(vcpu));
583
588 r = mmu_topup_memory_caches(vcpu); 584 r = mmu_topup_memory_caches(vcpu);
589 if (r) 585 if (r)
590 return r; 586 return r;
@@ -623,9 +619,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
623 &map_writable)) 619 &map_writable))
624 return 0; 620 return 0;
625 621
626 /* mmio */ 622 if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
627 if (is_error_pfn(pfn)) 623 walker.gfn, pfn, walker.pte_access, &r))
628 return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); 624 return r;
629 625
630 spin_lock(&vcpu->kvm->mmu_lock); 626 spin_lock(&vcpu->kvm->mmu_lock);
631 if (mmu_notifier_retry(vcpu, mmu_seq)) 627 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -636,19 +632,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
636 if (!force_pt_level) 632 if (!force_pt_level)
637 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 633 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
638 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 634 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
639 level, &write_pt, pfn, map_writable, prefault); 635 level, &emulate, pfn, map_writable, prefault);
640 (void)sptep; 636 (void)sptep;
641 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 637 pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
642 sptep, *sptep, write_pt); 638 sptep, *sptep, emulate);
643 639
644 if (!write_pt) 640 if (!emulate)
645 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 641 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
646 642
647 ++vcpu->stat.pf_fixed; 643 ++vcpu->stat.pf_fixed;
648 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); 644 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
649 spin_unlock(&vcpu->kvm->mmu_lock); 645 spin_unlock(&vcpu->kvm->mmu_lock);
650 646
651 return write_pt; 647 return emulate;
652 648
653out_unlock: 649out_unlock:
654 spin_unlock(&vcpu->kvm->mmu_lock); 650 spin_unlock(&vcpu->kvm->mmu_lock);
@@ -665,6 +661,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
665 u64 *sptep; 661 u64 *sptep;
666 int need_flush = 0; 662 int need_flush = 0;
667 663
664 vcpu_clear_mmio_info(vcpu, gva);
665
668 spin_lock(&vcpu->kvm->mmu_lock); 666 spin_lock(&vcpu->kvm->mmu_lock);
669 667
670 for_each_shadow_entry(vcpu, gva, iterator) { 668 for_each_shadow_entry(vcpu, gva, iterator) {
@@ -688,11 +686,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
688 if (is_shadow_present_pte(*sptep)) { 686 if (is_shadow_present_pte(*sptep)) {
689 if (is_large_pte(*sptep)) 687 if (is_large_pte(*sptep))
690 --vcpu->kvm->stat.lpages; 688 --vcpu->kvm->stat.lpages;
691 drop_spte(vcpu->kvm, sptep, 689 drop_spte(vcpu->kvm, sptep);
692 shadow_trap_nonpresent_pte);
693 need_flush = 1; 690 need_flush = 1;
694 } else 691 } else if (is_mmio_spte(*sptep))
695 __set_spte(sptep, shadow_trap_nonpresent_pte); 692 mmu_spte_clear_no_track(sptep);
693
696 break; 694 break;
697 } 695 }
698 696
@@ -752,36 +750,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
752 return gpa; 750 return gpa;
753} 751}
754 752
755static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
756 struct kvm_mmu_page *sp)
757{
758 int i, j, offset, r;
759 pt_element_t pt[256 / sizeof(pt_element_t)];
760 gpa_t pte_gpa;
761
762 if (sp->role.direct
763 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
764 nonpaging_prefetch_page(vcpu, sp);
765 return;
766 }
767
768 pte_gpa = gfn_to_gpa(sp->gfn);
769 if (PTTYPE == 32) {
770 offset = sp->role.quadrant << PT64_LEVEL_BITS;
771 pte_gpa += offset * sizeof(pt_element_t);
772 }
773
774 for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
775 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
776 pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
777 for (j = 0; j < ARRAY_SIZE(pt); ++j)
778 if (r || is_present_gpte(pt[j]))
779 sp->spt[i+j] = shadow_trap_nonpresent_pte;
780 else
781 sp->spt[i+j] = shadow_notrap_nonpresent_pte;
782 }
783}
784
785/* 753/*
786 * Using the cached information from sp->gfns is safe because: 754 * Using the cached information from sp->gfns is safe because:
787 * - The spte has a reference to the struct page, so the pfn for a given gfn 755 * - The spte has a reference to the struct page, so the pfn for a given gfn
@@ -817,7 +785,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
817 gpa_t pte_gpa; 785 gpa_t pte_gpa;
818 gfn_t gfn; 786 gfn_t gfn;
819 787
820 if (!is_shadow_present_pte(sp->spt[i])) 788 if (!sp->spt[i])
821 continue; 789 continue;
822 790
823 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); 791 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
@@ -826,26 +794,30 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
826 sizeof(pt_element_t))) 794 sizeof(pt_element_t)))
827 return -EINVAL; 795 return -EINVAL;
828 796
829 gfn = gpte_to_gfn(gpte);
830
831 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { 797 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
832 vcpu->kvm->tlbs_dirty++; 798 vcpu->kvm->tlbs_dirty++;
833 continue; 799 continue;
834 } 800 }
835 801
802 gfn = gpte_to_gfn(gpte);
803 pte_access = sp->role.access;
804 pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
805
806 if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
807 continue;
808
836 if (gfn != sp->gfns[i]) { 809 if (gfn != sp->gfns[i]) {
837 drop_spte(vcpu->kvm, &sp->spt[i], 810 drop_spte(vcpu->kvm, &sp->spt[i]);
838 shadow_trap_nonpresent_pte);
839 vcpu->kvm->tlbs_dirty++; 811 vcpu->kvm->tlbs_dirty++;
840 continue; 812 continue;
841 } 813 }
842 814
843 nr_present++; 815 nr_present++;
844 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 816
845 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; 817 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
846 818
847 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 819 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
848 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, 820 PT_PAGE_TABLE_LEVEL, gfn,
849 spte_to_pfn(sp->spt[i]), true, false, 821 spte_to_pfn(sp->spt[i]), true, false,
850 host_writable); 822 host_writable);
851 } 823 }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 506e4fe23adc..475d1c948501 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1496,11 +1496,14 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1496 update_cr0_intercept(svm); 1496 update_cr0_intercept(svm);
1497} 1497}
1498 1498
1499static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1499static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1500{ 1500{
1501 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; 1501 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
1502 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; 1502 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1503 1503
1504 if (cr4 & X86_CR4_VMXE)
1505 return 1;
1506
1504 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) 1507 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1505 svm_flush_tlb(vcpu); 1508 svm_flush_tlb(vcpu);
1506 1509
@@ -1510,6 +1513,7 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1510 cr4 |= host_cr4_mce; 1513 cr4 |= host_cr4_mce;
1511 to_svm(vcpu)->vmcb->save.cr4 = cr4; 1514 to_svm(vcpu)->vmcb->save.cr4 = cr4;
1512 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); 1515 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1516 return 0;
1513} 1517}
1514 1518
1515static void svm_set_segment(struct kvm_vcpu *vcpu, 1519static void svm_set_segment(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index db932760ea82..3ff898c104f7 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -675,12 +675,12 @@ TRACE_EVENT(kvm_emulate_insn,
675 ), 675 ),
676 676
677 TP_fast_assign( 677 TP_fast_assign(
678 __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start; 678 __entry->rip = vcpu->arch.emulate_ctxt.fetch.start;
679 __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); 679 __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
680 __entry->len = vcpu->arch.emulate_ctxt.decode.eip 680 __entry->len = vcpu->arch.emulate_ctxt._eip
681 - vcpu->arch.emulate_ctxt.decode.fetch.start; 681 - vcpu->arch.emulate_ctxt.fetch.start;
682 memcpy(__entry->insn, 682 memcpy(__entry->insn,
683 vcpu->arch.emulate_ctxt.decode.fetch.data, 683 vcpu->arch.emulate_ctxt.fetch.data,
684 15); 684 15);
685 __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); 685 __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
686 __entry->failed = failed; 686 __entry->failed = failed;
@@ -698,6 +698,29 @@ TRACE_EVENT(kvm_emulate_insn,
698#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) 698#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
699#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) 699#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
700 700
701TRACE_EVENT(
702 vcpu_match_mmio,
703 TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
704 TP_ARGS(gva, gpa, write, gpa_match),
705
706 TP_STRUCT__entry(
707 __field(gva_t, gva)
708 __field(gpa_t, gpa)
709 __field(bool, write)
710 __field(bool, gpa_match)
711 ),
712
713 TP_fast_assign(
714 __entry->gva = gva;
715 __entry->gpa = gpa;
716 __entry->write = write;
717 __entry->gpa_match = gpa_match
718 ),
719
720 TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
721 __entry->write ? "Write" : "Read",
722 __entry->gpa_match ? "GPA" : "GVA")
723);
701#endif /* _TRACE_KVM_H */ 724#endif /* _TRACE_KVM_H */
702 725
703#undef TRACE_INCLUDE_PATH 726#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d48ec60ea421..e65a158dee64 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -43,13 +43,12 @@
43#include "trace.h" 43#include "trace.h"
44 44
45#define __ex(x) __kvm_handle_fault_on_reboot(x) 45#define __ex(x) __kvm_handle_fault_on_reboot(x)
46#define __ex_clear(x, reg) \
47 ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
46 48
47MODULE_AUTHOR("Qumranet"); 49MODULE_AUTHOR("Qumranet");
48MODULE_LICENSE("GPL"); 50MODULE_LICENSE("GPL");
49 51
50static int __read_mostly bypass_guest_pf = 1;
51module_param(bypass_guest_pf, bool, S_IRUGO);
52
53static int __read_mostly enable_vpid = 1; 52static int __read_mostly enable_vpid = 1;
54module_param_named(vpid, enable_vpid, bool, 0444); 53module_param_named(vpid, enable_vpid, bool, 0444);
55 54
@@ -72,6 +71,14 @@ module_param(vmm_exclusive, bool, S_IRUGO);
72static int __read_mostly yield_on_hlt = 1; 71static int __read_mostly yield_on_hlt = 1;
73module_param(yield_on_hlt, bool, S_IRUGO); 72module_param(yield_on_hlt, bool, S_IRUGO);
74 73
74/*
75 * If nested=1, nested virtualization is supported, i.e., guests may use
76 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
77 * use VMX instructions.
78 */
79static int __read_mostly nested = 0;
80module_param(nested, bool, S_IRUGO);
81
75#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 82#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
76 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 83 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
77#define KVM_GUEST_CR0_MASK \ 84#define KVM_GUEST_CR0_MASK \
@@ -109,6 +116,7 @@ static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
109module_param(ple_window, int, S_IRUGO); 116module_param(ple_window, int, S_IRUGO);
110 117
111#define NR_AUTOLOAD_MSRS 1 118#define NR_AUTOLOAD_MSRS 1
119#define VMCS02_POOL_SIZE 1
112 120
113struct vmcs { 121struct vmcs {
114 u32 revision_id; 122 u32 revision_id;
@@ -116,17 +124,237 @@ struct vmcs {
116 char data[0]; 124 char data[0];
117}; 125};
118 126
127/*
128 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
129 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
130 * loaded on this CPU (so we can clear them if the CPU goes down).
131 */
132struct loaded_vmcs {
133 struct vmcs *vmcs;
134 int cpu;
135 int launched;
136 struct list_head loaded_vmcss_on_cpu_link;
137};
138
119struct shared_msr_entry { 139struct shared_msr_entry {
120 unsigned index; 140 unsigned index;
121 u64 data; 141 u64 data;
122 u64 mask; 142 u64 mask;
123}; 143};
124 144
145/*
146 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
147 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
148 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
149 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
150 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
151 * More than one of these structures may exist, if L1 runs multiple L2 guests.
152 * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
153 * underlying hardware which will be used to run L2.
154 * This structure is packed to ensure that its layout is identical across
155 * machines (necessary for live migration).
156 * If there are changes in this struct, VMCS12_REVISION must be changed.
157 */
158typedef u64 natural_width;
159struct __packed vmcs12 {
160 /* According to the Intel spec, a VMCS region must start with the
161 * following two fields. Then follow implementation-specific data.
162 */
163 u32 revision_id;
164 u32 abort;
165
166 u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
167 u32 padding[7]; /* room for future expansion */
168
169 u64 io_bitmap_a;
170 u64 io_bitmap_b;
171 u64 msr_bitmap;
172 u64 vm_exit_msr_store_addr;
173 u64 vm_exit_msr_load_addr;
174 u64 vm_entry_msr_load_addr;
175 u64 tsc_offset;
176 u64 virtual_apic_page_addr;
177 u64 apic_access_addr;
178 u64 ept_pointer;
179 u64 guest_physical_address;
180 u64 vmcs_link_pointer;
181 u64 guest_ia32_debugctl;
182 u64 guest_ia32_pat;
183 u64 guest_ia32_efer;
184 u64 guest_ia32_perf_global_ctrl;
185 u64 guest_pdptr0;
186 u64 guest_pdptr1;
187 u64 guest_pdptr2;
188 u64 guest_pdptr3;
189 u64 host_ia32_pat;
190 u64 host_ia32_efer;
191 u64 host_ia32_perf_global_ctrl;
192 u64 padding64[8]; /* room for future expansion */
193 /*
194 * To allow migration of L1 (complete with its L2 guests) between
195 * machines of different natural widths (32 or 64 bit), we cannot have
196 * unsigned long fields with no explicit size. We use u64 (aliased
197 * natural_width) instead. Luckily, x86 is little-endian.
198 */
199 natural_width cr0_guest_host_mask;
200 natural_width cr4_guest_host_mask;
201 natural_width cr0_read_shadow;
202 natural_width cr4_read_shadow;
203 natural_width cr3_target_value0;
204 natural_width cr3_target_value1;
205 natural_width cr3_target_value2;
206 natural_width cr3_target_value3;
207 natural_width exit_qualification;
208 natural_width guest_linear_address;
209 natural_width guest_cr0;
210 natural_width guest_cr3;
211 natural_width guest_cr4;
212 natural_width guest_es_base;
213 natural_width guest_cs_base;
214 natural_width guest_ss_base;
215 natural_width guest_ds_base;
216 natural_width guest_fs_base;
217 natural_width guest_gs_base;
218 natural_width guest_ldtr_base;
219 natural_width guest_tr_base;
220 natural_width guest_gdtr_base;
221 natural_width guest_idtr_base;
222 natural_width guest_dr7;
223 natural_width guest_rsp;
224 natural_width guest_rip;
225 natural_width guest_rflags;
226 natural_width guest_pending_dbg_exceptions;
227 natural_width guest_sysenter_esp;
228 natural_width guest_sysenter_eip;
229 natural_width host_cr0;
230 natural_width host_cr3;
231 natural_width host_cr4;
232 natural_width host_fs_base;
233 natural_width host_gs_base;
234 natural_width host_tr_base;
235 natural_width host_gdtr_base;
236 natural_width host_idtr_base;
237 natural_width host_ia32_sysenter_esp;
238 natural_width host_ia32_sysenter_eip;
239 natural_width host_rsp;
240 natural_width host_rip;
241 natural_width paddingl[8]; /* room for future expansion */
242 u32 pin_based_vm_exec_control;
243 u32 cpu_based_vm_exec_control;
244 u32 exception_bitmap;
245 u32 page_fault_error_code_mask;
246 u32 page_fault_error_code_match;
247 u32 cr3_target_count;
248 u32 vm_exit_controls;
249 u32 vm_exit_msr_store_count;
250 u32 vm_exit_msr_load_count;
251 u32 vm_entry_controls;
252 u32 vm_entry_msr_load_count;
253 u32 vm_entry_intr_info_field;
254 u32 vm_entry_exception_error_code;
255 u32 vm_entry_instruction_len;
256 u32 tpr_threshold;
257 u32 secondary_vm_exec_control;
258 u32 vm_instruction_error;
259 u32 vm_exit_reason;
260 u32 vm_exit_intr_info;
261 u32 vm_exit_intr_error_code;
262 u32 idt_vectoring_info_field;
263 u32 idt_vectoring_error_code;
264 u32 vm_exit_instruction_len;
265 u32 vmx_instruction_info;
266 u32 guest_es_limit;
267 u32 guest_cs_limit;
268 u32 guest_ss_limit;
269 u32 guest_ds_limit;
270 u32 guest_fs_limit;
271 u32 guest_gs_limit;
272 u32 guest_ldtr_limit;
273 u32 guest_tr_limit;
274 u32 guest_gdtr_limit;
275 u32 guest_idtr_limit;
276 u32 guest_es_ar_bytes;
277 u32 guest_cs_ar_bytes;
278 u32 guest_ss_ar_bytes;
279 u32 guest_ds_ar_bytes;
280 u32 guest_fs_ar_bytes;
281 u32 guest_gs_ar_bytes;
282 u32 guest_ldtr_ar_bytes;
283 u32 guest_tr_ar_bytes;
284 u32 guest_interruptibility_info;
285 u32 guest_activity_state;
286 u32 guest_sysenter_cs;
287 u32 host_ia32_sysenter_cs;
288 u32 padding32[8]; /* room for future expansion */
289 u16 virtual_processor_id;
290 u16 guest_es_selector;
291 u16 guest_cs_selector;
292 u16 guest_ss_selector;
293 u16 guest_ds_selector;
294 u16 guest_fs_selector;
295 u16 guest_gs_selector;
296 u16 guest_ldtr_selector;
297 u16 guest_tr_selector;
298 u16 host_es_selector;
299 u16 host_cs_selector;
300 u16 host_ss_selector;
301 u16 host_ds_selector;
302 u16 host_fs_selector;
303 u16 host_gs_selector;
304 u16 host_tr_selector;
305};
306
307/*
308 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
309 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
310 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
311 */
312#define VMCS12_REVISION 0x11e57ed0
313
314/*
315 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
316 * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
317 * current implementation, 4K are reserved to avoid future complications.
318 */
319#define VMCS12_SIZE 0x1000
320
321/* Used to remember the last vmcs02 used for some recently used vmcs12s */
322struct vmcs02_list {
323 struct list_head list;
324 gpa_t vmptr;
325 struct loaded_vmcs vmcs02;
326};
327
328/*
329 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
330 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
331 */
332struct nested_vmx {
333 /* Has the level1 guest done vmxon? */
334 bool vmxon;
335
336 /* The guest-physical address of the current VMCS L1 keeps for L2 */
337 gpa_t current_vmptr;
338 /* The host-usable pointer to the above */
339 struct page *current_vmcs12_page;
340 struct vmcs12 *current_vmcs12;
341
342 /* vmcs02_list cache of VMCSs recently used to run L2 guests */
343 struct list_head vmcs02_pool;
344 int vmcs02_num;
345 u64 vmcs01_tsc_offset;
346 /* L2 must run next, and mustn't decide to exit to L1. */
347 bool nested_run_pending;
348 /*
349 * Guest pages referred to in vmcs02 with host-physical pointers, so
350 * we must keep them pinned while L2 runs.
351 */
352 struct page *apic_access_page;
353};
354
125struct vcpu_vmx { 355struct vcpu_vmx {
126 struct kvm_vcpu vcpu; 356 struct kvm_vcpu vcpu;
127 struct list_head local_vcpus_link;
128 unsigned long host_rsp; 357 unsigned long host_rsp;
129 int launched;
130 u8 fail; 358 u8 fail;
131 u8 cpl; 359 u8 cpl;
132 bool nmi_known_unmasked; 360 bool nmi_known_unmasked;
@@ -140,7 +368,14 @@ struct vcpu_vmx {
140 u64 msr_host_kernel_gs_base; 368 u64 msr_host_kernel_gs_base;
141 u64 msr_guest_kernel_gs_base; 369 u64 msr_guest_kernel_gs_base;
142#endif 370#endif
143 struct vmcs *vmcs; 371 /*
372 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
373 * non-nested (L1) guest, it always points to vmcs01. For a nested
374 * guest (L2), it points to a different VMCS.
375 */
376 struct loaded_vmcs vmcs01;
377 struct loaded_vmcs *loaded_vmcs;
378 bool __launched; /* temporary, used in vmx_vcpu_run */
144 struct msr_autoload { 379 struct msr_autoload {
145 unsigned nr; 380 unsigned nr;
146 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; 381 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
@@ -176,6 +411,9 @@ struct vcpu_vmx {
176 u32 exit_reason; 411 u32 exit_reason;
177 412
178 bool rdtscp_enabled; 413 bool rdtscp_enabled;
414
415 /* Support for a guest hypervisor (nested VMX) */
416 struct nested_vmx nested;
179}; 417};
180 418
181enum segment_cache_field { 419enum segment_cache_field {
@@ -192,6 +430,174 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
192 return container_of(vcpu, struct vcpu_vmx, vcpu); 430 return container_of(vcpu, struct vcpu_vmx, vcpu);
193} 431}
194 432
433#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
434#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
435#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
436 [number##_HIGH] = VMCS12_OFFSET(name)+4
437
438static unsigned short vmcs_field_to_offset_table[] = {
439 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
440 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
441 FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
442 FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
443 FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
444 FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
445 FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
446 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
447 FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
448 FIELD(HOST_ES_SELECTOR, host_es_selector),
449 FIELD(HOST_CS_SELECTOR, host_cs_selector),
450 FIELD(HOST_SS_SELECTOR, host_ss_selector),
451 FIELD(HOST_DS_SELECTOR, host_ds_selector),
452 FIELD(HOST_FS_SELECTOR, host_fs_selector),
453 FIELD(HOST_GS_SELECTOR, host_gs_selector),
454 FIELD(HOST_TR_SELECTOR, host_tr_selector),
455 FIELD64(IO_BITMAP_A, io_bitmap_a),
456 FIELD64(IO_BITMAP_B, io_bitmap_b),
457 FIELD64(MSR_BITMAP, msr_bitmap),
458 FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
459 FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
460 FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
461 FIELD64(TSC_OFFSET, tsc_offset),
462 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
463 FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
464 FIELD64(EPT_POINTER, ept_pointer),
465 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
466 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
467 FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
468 FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
469 FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
470 FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
471 FIELD64(GUEST_PDPTR0, guest_pdptr0),
472 FIELD64(GUEST_PDPTR1, guest_pdptr1),
473 FIELD64(GUEST_PDPTR2, guest_pdptr2),
474 FIELD64(GUEST_PDPTR3, guest_pdptr3),
475 FIELD64(HOST_IA32_PAT, host_ia32_pat),
476 FIELD64(HOST_IA32_EFER, host_ia32_efer),
477 FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
478 FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
479 FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
480 FIELD(EXCEPTION_BITMAP, exception_bitmap),
481 FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
482 FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
483 FIELD(CR3_TARGET_COUNT, cr3_target_count),
484 FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
485 FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
486 FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
487 FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
488 FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
489 FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
490 FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
491 FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
492 FIELD(TPR_THRESHOLD, tpr_threshold),
493 FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
494 FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
495 FIELD(VM_EXIT_REASON, vm_exit_reason),
496 FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
497 FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
498 FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
499 FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
500 FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
501 FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
502 FIELD(GUEST_ES_LIMIT, guest_es_limit),
503 FIELD(GUEST_CS_LIMIT, guest_cs_limit),
504 FIELD(GUEST_SS_LIMIT, guest_ss_limit),
505 FIELD(GUEST_DS_LIMIT, guest_ds_limit),
506 FIELD(GUEST_FS_LIMIT, guest_fs_limit),
507 FIELD(GUEST_GS_LIMIT, guest_gs_limit),
508 FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
509 FIELD(GUEST_TR_LIMIT, guest_tr_limit),
510 FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
511 FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
512 FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
513 FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
514 FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
515 FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
516 FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
517 FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
518 FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
519 FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
520 FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
521 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
522 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
523 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
524 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
525 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
526 FIELD(CR0_READ_SHADOW, cr0_read_shadow),
527 FIELD(CR4_READ_SHADOW, cr4_read_shadow),
528 FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
529 FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
530 FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
531 FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
532 FIELD(EXIT_QUALIFICATION, exit_qualification),
533 FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
534 FIELD(GUEST_CR0, guest_cr0),
535 FIELD(GUEST_CR3, guest_cr3),
536 FIELD(GUEST_CR4, guest_cr4),
537 FIELD(GUEST_ES_BASE, guest_es_base),
538 FIELD(GUEST_CS_BASE, guest_cs_base),
539 FIELD(GUEST_SS_BASE, guest_ss_base),
540 FIELD(GUEST_DS_BASE, guest_ds_base),
541 FIELD(GUEST_FS_BASE, guest_fs_base),
542 FIELD(GUEST_GS_BASE, guest_gs_base),
543 FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
544 FIELD(GUEST_TR_BASE, guest_tr_base),
545 FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
546 FIELD(GUEST_IDTR_BASE, guest_idtr_base),
547 FIELD(GUEST_DR7, guest_dr7),
548 FIELD(GUEST_RSP, guest_rsp),
549 FIELD(GUEST_RIP, guest_rip),
550 FIELD(GUEST_RFLAGS, guest_rflags),
551 FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
552 FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
553 FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
554 FIELD(HOST_CR0, host_cr0),
555 FIELD(HOST_CR3, host_cr3),
556 FIELD(HOST_CR4, host_cr4),
557 FIELD(HOST_FS_BASE, host_fs_base),
558 FIELD(HOST_GS_BASE, host_gs_base),
559 FIELD(HOST_TR_BASE, host_tr_base),
560 FIELD(HOST_GDTR_BASE, host_gdtr_base),
561 FIELD(HOST_IDTR_BASE, host_idtr_base),
562 FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
563 FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
564 FIELD(HOST_RSP, host_rsp),
565 FIELD(HOST_RIP, host_rip),
566};
567static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
568
569static inline short vmcs_field_to_offset(unsigned long field)
570{
571 if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
572 return -1;
573 return vmcs_field_to_offset_table[field];
574}
575
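A userspace sketch of the FIELD()/vmcs_field_to_offset() technique above: VMCS field encodings index a table of offsetof() values into the in-memory vmcs12 image, so VMREAD/VMWRITE emulation becomes a bounds-checked table lookup plus an ordinary memory access, with offset 0 doubling as "unknown field" (revision_id, at offset 0, has no field encoding of its own). The field encodings and the toy struct below are made up for illustration.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_vmcs12 {
	uint32_t revision_id;
	uint64_t guest_cr0;
	uint16_t guest_cs_selector;
};

#define TOY_GUEST_CR0		0x6800	/* illustrative encodings only */
#define TOY_GUEST_CS_SELECTOR	0x0802

#define OFF(x) offsetof(struct toy_vmcs12, x)

static const unsigned short field_to_offset[0x7000] = {
	[TOY_GUEST_CR0]		= OFF(guest_cr0),
	[TOY_GUEST_CS_SELECTOR]	= OFF(guest_cs_selector),
};

static int toy_read(const struct toy_vmcs12 *v, unsigned long field,
		    uint64_t *value)
{
	unsigned short off;

	if (field >= sizeof(field_to_offset) / sizeof(field_to_offset[0]))
		return -1;
	off = field_to_offset[field];
	if (!off)
		return -1;	/* offset 0 doubles as "no such field" */
	/* dispatch on field width (16/32/64-bit) elided for brevity */
	memcpy(value, (const char *)v + off, sizeof(*value));
	return 0;
}

int main(void)
{
	struct toy_vmcs12 v = { .revision_id = 1,
				.guest_cr0 = 0x80000031ULL,
				.guest_cs_selector = 0x10 };
	uint64_t cr0;

	if (toy_read(&v, TOY_GUEST_CR0, &cr0) == 0)
		printf("GUEST_CR0 = %#llx\n", (unsigned long long)cr0);
	return 0;
}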
576static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
577{
578 return to_vmx(vcpu)->nested.current_vmcs12;
579}
580
581static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
582{
583 struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
584 if (is_error_page(page)) {
585 kvm_release_page_clean(page);
586 return NULL;
587 }
588 return page;
589}
590
591static void nested_release_page(struct page *page)
592{
593 kvm_release_page_dirty(page);
594}
595
596static void nested_release_page_clean(struct page *page)
597{
598 kvm_release_page_clean(page);
599}
600
195static u64 construct_eptp(unsigned long root_hpa); 601static u64 construct_eptp(unsigned long root_hpa);
196static void kvm_cpu_vmxon(u64 addr); 602static void kvm_cpu_vmxon(u64 addr);
197static void kvm_cpu_vmxoff(void); 603static void kvm_cpu_vmxoff(void);
@@ -200,7 +606,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
200 606
201static DEFINE_PER_CPU(struct vmcs *, vmxarea); 607static DEFINE_PER_CPU(struct vmcs *, vmxarea);
202static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 608static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
203static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 609/*
610 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
611 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
612 */
613static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
204static DEFINE_PER_CPU(struct desc_ptr, host_gdt); 614static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
205 615
206static unsigned long *vmx_io_bitmap_a; 616static unsigned long *vmx_io_bitmap_a;
@@ -442,6 +852,35 @@ static inline bool report_flexpriority(void)
442 return flexpriority_enabled; 852 return flexpriority_enabled;
443} 853}
444 854
855static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
856{
857 return vmcs12->cpu_based_vm_exec_control & bit;
858}
859
860static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
861{
862 return (vmcs12->cpu_based_vm_exec_control &
863 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
864 (vmcs12->secondary_vm_exec_control & bit);
865}
866
867static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
868 struct kvm_vcpu *vcpu)
869{
870 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
871}
872
873static inline bool is_exception(u32 intr_info)
874{
875 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
876 == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
877}
878
879static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
880static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
881 struct vmcs12 *vmcs12,
882 u32 reason, unsigned long qualification);
883
445static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 884static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
446{ 885{
447 int i; 886 int i;
@@ -501,6 +940,13 @@ static void vmcs_clear(struct vmcs *vmcs)
501 vmcs, phys_addr); 940 vmcs, phys_addr);
502} 941}
503 942
943static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
944{
945 vmcs_clear(loaded_vmcs->vmcs);
946 loaded_vmcs->cpu = -1;
947 loaded_vmcs->launched = 0;
948}
949
504static void vmcs_load(struct vmcs *vmcs) 950static void vmcs_load(struct vmcs *vmcs)
505{ 951{
506 u64 phys_addr = __pa(vmcs); 952 u64 phys_addr = __pa(vmcs);
@@ -510,29 +956,28 @@ static void vmcs_load(struct vmcs *vmcs)
510 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) 956 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
511 : "cc", "memory"); 957 : "cc", "memory");
512 if (error) 958 if (error)
513 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 959 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
514 vmcs, phys_addr); 960 vmcs, phys_addr);
515} 961}
516 962
517static void __vcpu_clear(void *arg) 963static void __loaded_vmcs_clear(void *arg)
518{ 964{
519 struct vcpu_vmx *vmx = arg; 965 struct loaded_vmcs *loaded_vmcs = arg;
520 int cpu = raw_smp_processor_id(); 966 int cpu = raw_smp_processor_id();
521 967
522 if (vmx->vcpu.cpu == cpu) 968 if (loaded_vmcs->cpu != cpu)
523 vmcs_clear(vmx->vmcs); 969 return; /* vcpu migration can race with cpu offline */
524 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 970 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
525 per_cpu(current_vmcs, cpu) = NULL; 971 per_cpu(current_vmcs, cpu) = NULL;
526 list_del(&vmx->local_vcpus_link); 972 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
527 vmx->vcpu.cpu = -1; 973 loaded_vmcs_init(loaded_vmcs);
528 vmx->launched = 0;
529} 974}
530 975
531static void vcpu_clear(struct vcpu_vmx *vmx) 976static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
532{ 977{
533 if (vmx->vcpu.cpu == -1) 978 if (loaded_vmcs->cpu != -1)
534 return; 979 smp_call_function_single(
535 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); 980 loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
536} 981}
537 982
538static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) 983static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
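A simplified userspace model (not kernel code) of the bookkeeping the loaded_vmcs rework above introduces: each CPU keeps a list of the VMCS objects currently loaded on it, so CPU teardown can walk the list and clear every entry, and a clear request that races with vcpu migration simply finds the cpu field no longer matching and bails out. The names and the plain singly linked list are illustrative stand-ins for the kernel's list_head machinery.

#include <stdio.h>

struct loaded_vmcs {
	int cpu;			/* -1 when not loaded anywhere */
	int launched;
	struct loaded_vmcs *next;	/* stand-in for loaded_vmcss_on_cpu_link */
};

static struct loaded_vmcs *loaded_on_cpu[4];	/* per-CPU list heads */

static void loaded_vmcs_init(struct loaded_vmcs *v)
{
	v->cpu = -1;
	v->launched = 0;
}

/* Runs "on" the CPU that owns the list, like __loaded_vmcs_clear(). */
static void clear_on_cpu(int cpu, struct loaded_vmcs *victim)
{
	struct loaded_vmcs **pp = &loaded_on_cpu[cpu];

	if (victim->cpu != cpu)
		return;		/* raced with migration: nothing to do here */
	while (*pp && *pp != victim)
		pp = &(*pp)->next;
	if (*pp)
		*pp = victim->next;	/* unlink, i.e. list_del() */
	loaded_vmcs_init(victim);
}

int main(void)
{
	struct loaded_vmcs a = { .cpu = 1, .launched = 1, .next = NULL };

	loaded_on_cpu[1] = &a;
	clear_on_cpu(1, &a);
	printf("cpu=%d launched=%d\n", a.cpu, a.launched);	/* -1 0 */
	return 0;
}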
@@ -585,26 +1030,26 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
585 } 1030 }
586} 1031}
587 1032
588static unsigned long vmcs_readl(unsigned long field) 1033static __always_inline unsigned long vmcs_readl(unsigned long field)
589{ 1034{
590 unsigned long value = 0; 1035 unsigned long value;
591 1036
592 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) 1037 asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
593 : "+a"(value) : "d"(field) : "cc"); 1038 : "=a"(value) : "d"(field) : "cc");
594 return value; 1039 return value;
595} 1040}
596 1041
597static u16 vmcs_read16(unsigned long field) 1042static __always_inline u16 vmcs_read16(unsigned long field)
598{ 1043{
599 return vmcs_readl(field); 1044 return vmcs_readl(field);
600} 1045}
601 1046
602static u32 vmcs_read32(unsigned long field) 1047static __always_inline u32 vmcs_read32(unsigned long field)
603{ 1048{
604 return vmcs_readl(field); 1049 return vmcs_readl(field);
605} 1050}
606 1051
607static u64 vmcs_read64(unsigned long field) 1052static __always_inline u64 vmcs_read64(unsigned long field)
608{ 1053{
609#ifdef CONFIG_X86_64 1054#ifdef CONFIG_X86_64
610 return vmcs_readl(field); 1055 return vmcs_readl(field);
@@ -731,6 +1176,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
731 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 1176 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
732 if (vcpu->fpu_active) 1177 if (vcpu->fpu_active)
733 eb &= ~(1u << NM_VECTOR); 1178 eb &= ~(1u << NM_VECTOR);
1179
1180 /* When we are running a nested L2 guest and L1 specified for it a
1181 * certain exception bitmap, we must trap the same exceptions and pass
1182 * them to L1. When running L2, we will only handle the exceptions
1183 * specified above if L1 did not want them.
1184 */
1185 if (is_guest_mode(vcpu))
1186 eb |= get_vmcs12(vcpu)->exception_bitmap;
1187
734 vmcs_write32(EXCEPTION_BITMAP, eb); 1188 vmcs_write32(EXCEPTION_BITMAP, eb);
735} 1189}
736 1190
@@ -971,22 +1425,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
971 1425
972 if (!vmm_exclusive) 1426 if (!vmm_exclusive)
973 kvm_cpu_vmxon(phys_addr); 1427 kvm_cpu_vmxon(phys_addr);
974 else if (vcpu->cpu != cpu) 1428 else if (vmx->loaded_vmcs->cpu != cpu)
975 vcpu_clear(vmx); 1429 loaded_vmcs_clear(vmx->loaded_vmcs);
976 1430
977 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 1431 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
978 per_cpu(current_vmcs, cpu) = vmx->vmcs; 1432 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
979 vmcs_load(vmx->vmcs); 1433 vmcs_load(vmx->loaded_vmcs->vmcs);
980 } 1434 }
981 1435
982 if (vcpu->cpu != cpu) { 1436 if (vmx->loaded_vmcs->cpu != cpu) {
983 struct desc_ptr *gdt = &__get_cpu_var(host_gdt); 1437 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
984 unsigned long sysenter_esp; 1438 unsigned long sysenter_esp;
985 1439
986 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1440 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
987 local_irq_disable(); 1441 local_irq_disable();
988 list_add(&vmx->local_vcpus_link, 1442 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
989 &per_cpu(vcpus_on_cpu, cpu)); 1443 &per_cpu(loaded_vmcss_on_cpu, cpu));
990 local_irq_enable(); 1444 local_irq_enable();
991 1445
992 /* 1446 /*
@@ -998,6 +1452,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
998 1452
999 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 1453 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1000 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 1454 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1455 vmx->loaded_vmcs->cpu = cpu;
1001 } 1456 }
1002} 1457}
1003 1458
@@ -1005,7 +1460,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1005{ 1460{
1006 __vmx_load_host_state(to_vmx(vcpu)); 1461 __vmx_load_host_state(to_vmx(vcpu));
1007 if (!vmm_exclusive) { 1462 if (!vmm_exclusive) {
1008 __vcpu_clear(to_vmx(vcpu)); 1463 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
1464 vcpu->cpu = -1;
1009 kvm_cpu_vmxoff(); 1465 kvm_cpu_vmxoff();
1010 } 1466 }
1011} 1467}
@@ -1023,19 +1479,55 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
1023 vmcs_writel(GUEST_CR0, cr0); 1479 vmcs_writel(GUEST_CR0, cr0);
1024 update_exception_bitmap(vcpu); 1480 update_exception_bitmap(vcpu);
1025 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 1481 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
1482 if (is_guest_mode(vcpu))
1483 vcpu->arch.cr0_guest_owned_bits &=
1484 ~get_vmcs12(vcpu)->cr0_guest_host_mask;
1026 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 1485 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1027} 1486}
1028 1487
1029static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); 1488static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
1030 1489
1490/*
1491 * Return the cr0 value that a nested guest would read. This is a combination
1492 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
1493 * its hypervisor (cr0_read_shadow).
1494 */
1495static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
1496{
1497 return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
1498 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
1499}
1500static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
1501{
1502 return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
1503 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
1504}
1505
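A standalone sketch of the combination computed by nested_read_cr0()/nested_read_cr4() above: for every bit L1 chose to shadow (set in the guest/host mask), L2 observes L1's read shadow; for every other bit it observes the real guest value. The CR0 bit values used below are the architectural ones and the example numbers are arbitrary.

#include <stdint.h>
#include <stdio.h>

static uint64_t nested_read_reg(uint64_t guest_val, uint64_t read_shadow,
				uint64_t guest_host_mask)
{
	return (guest_val & ~guest_host_mask) |
	       (read_shadow & guest_host_mask);
}

int main(void)
{
	uint64_t guest_cr0       = 0x80000039;	/* PG|NE|ET|TS|PE, TS really set */
	uint64_t cr0_read_shadow = 0x80000031;	/* L1 tells L2 that TS is clear */
	uint64_t mask            = 0x8;		/* L1 shadows only CR0.TS */

	printf("L2 reads cr0 = %#llx\n",
	       (unsigned long long)nested_read_reg(guest_cr0, cr0_read_shadow,
						   mask));
	/* prints 0x80000031: TS comes from the shadow, everything else is real */
	return 0;
}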
1031static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) 1506static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
1032{ 1507{
1508 /* Note that there is no vcpu->fpu_active = 0 here. The caller must
1509 * set this *before* calling this function.
1510 */
1033 vmx_decache_cr0_guest_bits(vcpu); 1511 vmx_decache_cr0_guest_bits(vcpu);
1034 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); 1512 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
1035 update_exception_bitmap(vcpu); 1513 update_exception_bitmap(vcpu);
1036 vcpu->arch.cr0_guest_owned_bits = 0; 1514 vcpu->arch.cr0_guest_owned_bits = 0;
1037 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 1515 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1038 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 1516 if (is_guest_mode(vcpu)) {
1517 /*
1518 * L1's specified read shadow might not contain the TS bit,
1519 * so now that we turned on shadowing of this bit, we need to
1520 * set this bit of the shadow. Like in nested_vmx_run we need
1521 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
1522 * up-to-date here because we just decached cr0.TS (and we'll
1523 * only update vmcs12->guest_cr0 on nested exit).
1524 */
1525 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1526 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
1527 (vcpu->arch.cr0 & X86_CR0_TS);
1528 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
1529 } else
1530 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1039} 1531}
1040 1532
1041static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1533static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -1119,6 +1611,25 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1119 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 1611 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1120} 1612}
1121 1613
1614/*
1615 * KVM wants to inject page-faults which it got to the guest. This function
1616 * checks whether in a nested guest, we need to inject them to L1 or L2.
1617 * This function assumes it is called with the exit reason in vmcs02 being
1618 * a #PF exception (this is the only case in which KVM injects a #PF when L2
1619 * is running).
1620 */
1621static int nested_pf_handled(struct kvm_vcpu *vcpu)
1622{
1623 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1624
1625 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
1626 if (!(vmcs12->exception_bitmap & PF_VECTOR))
1627 return 0;
1628
1629 nested_vmx_vmexit(vcpu);
1630 return 1;
1631}
1632
1122static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 1633static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1123 bool has_error_code, u32 error_code, 1634 bool has_error_code, u32 error_code,
1124 bool reinject) 1635 bool reinject)
@@ -1126,6 +1637,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1126 struct vcpu_vmx *vmx = to_vmx(vcpu); 1637 struct vcpu_vmx *vmx = to_vmx(vcpu);
1127 u32 intr_info = nr | INTR_INFO_VALID_MASK; 1638 u32 intr_info = nr | INTR_INFO_VALID_MASK;
1128 1639
1640 if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
1641 nested_pf_handled(vcpu))
1642 return;
1643
1129 if (has_error_code) { 1644 if (has_error_code) {
1130 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); 1645 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1131 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 1646 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -1248,12 +1763,24 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1248static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1763static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1249{ 1764{
1250 vmcs_write64(TSC_OFFSET, offset); 1765 vmcs_write64(TSC_OFFSET, offset);
1766 if (is_guest_mode(vcpu))
1767 /*
1768 * We're here if L1 chose not to trap the TSC MSR. Since
1769 * prepare_vmcs12() does not copy tsc_offset, we need to also
1770 * set the vmcs12 field here.
1771 */
1772 get_vmcs12(vcpu)->tsc_offset = offset -
1773 to_vmx(vcpu)->nested.vmcs01_tsc_offset;
1251} 1774}
1252 1775
1253static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 1776static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
1254{ 1777{
1255 u64 offset = vmcs_read64(TSC_OFFSET); 1778 u64 offset = vmcs_read64(TSC_OFFSET);
1256 vmcs_write64(TSC_OFFSET, offset + adjustment); 1779 vmcs_write64(TSC_OFFSET, offset + adjustment);
1780 if (is_guest_mode(vcpu)) {
1781 /* Even when running L2, the adjustment needs to apply to L1 */
1782 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
1783 }
1257} 1784}
1258 1785
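An arithmetic sketch of the nested TSC-offset bookkeeping in the two hunks above, under the assumption the subtraction there implies: while L2 runs, the hardware offset in vmcs02 is the sum of L0's offset for L1 (vmcs01_tsc_offset) and L1's offset for L2 (vmcs12->tsc_offset), so writing a new total offset means storing the difference into vmcs12, and adjusting the total also bumps the L1 part.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t vmcs01_tsc_offset = 1000;	/* L0 -> L1 */
	int64_t vmcs12_tsc_offset = 250;	/* L1 -> L2 */
	int64_t vmcs02_tsc_offset = vmcs01_tsc_offset + vmcs12_tsc_offset;

	/* vmx_write_tsc_offset(offset) while L2 runs, L1 not trapping TSC */
	int64_t new_total = 1500;
	vmcs02_tsc_offset = new_total;
	vmcs12_tsc_offset = new_total - vmcs01_tsc_offset;	/* 500 */

	/* vmx_adjust_tsc_offset(adjustment): must apply to L1 as well */
	int64_t adjustment = 100;
	vmcs02_tsc_offset += adjustment;
	vmcs01_tsc_offset += adjustment;

	printf("vmcs01=%lld vmcs12=%lld vmcs02=%lld\n",
	       (long long)vmcs01_tsc_offset, (long long)vmcs12_tsc_offset,
	       (long long)vmcs02_tsc_offset);
	return 0;
}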
1259static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 1786static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
@@ -1261,6 +1788,236 @@ static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1261 return target_tsc - native_read_tsc(); 1788 return target_tsc - native_read_tsc();
1262} 1789}
1263 1790
1791static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
1792{
1793 struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
1794 return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
1795}
1796
1797/*
1798 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1799 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1800 * all guests if the "nested" module option is off, and can also be disabled
1801 * for a single guest by disabling its VMX cpuid bit.
1802 */
1803static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1804{
1805 return nested && guest_cpuid_has_vmx(vcpu);
1806}
1807
1808/*
1809 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
1810 * returned for the various VMX controls MSRs when nested VMX is enabled.
1811 * The same values should also be used to verify that vmcs12 control fields are
1812 * valid during nested entry from L1 to L2.
1813 * Each of these control msrs has a low and high 32-bit half: A low bit is on
1814 * if the corresponding bit in the (32-bit) control field *must* be on, and a
1815 * bit in the high half is on if the corresponding bit in the control field
1816 * may be on. See also vmx_control_verify().
1817 * TODO: allow these variables to be modified (downgraded) by module options
1818 * or other means.
1819 */
1820static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
1821static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
1822static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
1823static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
1824static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
1825static __init void nested_vmx_setup_ctls_msrs(void)
1826{
1827 /*
1828 * Note that as a general rule, the high half of the MSRs (bits in
1829 * the control fields which may be 1) should be initialized by the
1830 * intersection of the underlying hardware's MSR (i.e., features which
1831 * can be supported) and the list of features we want to expose -
1832 * because they are known to be properly supported in our code.
1833 * Also, usually, the low half of the MSRs (bits which must be 1) can
1834 * be set to 0, meaning that L1 may turn off any of these bits. The
1835 * reason is that if one of these bits is necessary, it will appear
1836 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
1837 * fields of vmcs01 and vmcs02, will turn these bits off - and
1838 * nested_vmx_exit_handled() will not pass related exits to L1.
1839 * These rules have exceptions below.
1840 */
1841
1842 /* pin-based controls */
1843 /*
1844 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
1845 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
1846 */
1847 nested_vmx_pinbased_ctls_low = 0x16 ;
1848 nested_vmx_pinbased_ctls_high = 0x16 |
1849 PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
1850 PIN_BASED_VIRTUAL_NMIS;
1851
1852 /* exit controls */
1853 nested_vmx_exit_ctls_low = 0;
1854 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
1855#ifdef CONFIG_X86_64
1856 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
1857#else
1858 nested_vmx_exit_ctls_high = 0;
1859#endif
1860
1861 /* entry controls */
1862 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
1863 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
1864 nested_vmx_entry_ctls_low = 0;
1865 nested_vmx_entry_ctls_high &=
1866 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
1867
1868 /* cpu-based controls */
1869 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
1870 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
1871 nested_vmx_procbased_ctls_low = 0;
1872 nested_vmx_procbased_ctls_high &=
1873 CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
1874 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
1875 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
1876 CPU_BASED_CR3_STORE_EXITING |
1877#ifdef CONFIG_X86_64
1878 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
1879#endif
1880 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
1881 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
1882 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1883 /*
1884 * We can allow some features even when not supported by the
1885 * hardware. For example, L1 can specify an MSR bitmap - and we
1886 * can use it to avoid exits to L1 - even when L0 runs L2
1887 * without MSR bitmaps.
1888 */
1889 nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
1890
1891 /* secondary cpu-based controls */
1892 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
1893 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
1894 nested_vmx_secondary_ctls_low = 0;
1895 nested_vmx_secondary_ctls_high &=
1896 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1897}
1898
1899static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
1900{
1901 /*
 1902 * Bits that are 0 in high must be 0, and bits that are 1 in low must be 1.
1903 */
1904 return ((control & high) | low) == control;
1905}
1906
1907static inline u64 vmx_control_msr(u32 low, u32 high)
1908{
1909 return low | ((u64)high << 32);
1910}
1911
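The low/high convention described in the comment above nested_vmx_setup_ctls_msrs() can be exercised in isolation. Below is a minimal userspace sketch (not part of this patch; the sample values are made up, only 0x16 echoes the pin-based case above) mirroring vmx_control_verify():

#include <stdint.h>
#include <stdio.h>

static int control_verify(uint32_t control, uint32_t low, uint32_t high)
{
	/* every bit required by low must be set, every set bit must be allowed by high */
	return ((control & high) | low) == control;
}

int main(void)
{
	uint32_t low = 0x16;			/* bits that must be 1 */
	uint32_t high = 0x16 | (1u << 3);	/* bits that may be 1 */

	printf("%d\n", control_verify(0x16, low, high));		/* 1: ok */
	printf("%d\n", control_verify(0x16 | (1u << 3), low, high));	/* 1: optional bit allowed */
	printf("%d\n", control_verify(0x06, low, high));		/* 0: a required bit is clear */
	printf("%d\n", control_verify(0x16 | (1u << 5), low, high));	/* 0: bit 5 is not allowed */
	return 0;
}
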
1912/*
1913 * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
1914 * also let it use VMX-specific MSRs.
1915 * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
1916 * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
1917 * like all other MSRs).
1918 */
1919static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1920{
1921 if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
1922 msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
1923 /*
1924 * According to the spec, processors which do not support VMX
1925 * should throw a #GP(0) when VMX capability MSRs are read.
1926 */
1927 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
1928 return 1;
1929 }
1930
1931 switch (msr_index) {
1932 case MSR_IA32_FEATURE_CONTROL:
1933 *pdata = 0;
1934 break;
1935 case MSR_IA32_VMX_BASIC:
1936 /*
1937 * This MSR reports some information about VMX support. We
1938 * should return information about the VMX we emulate for the
1939 * guest, and the VMCS structure we give it - not about the
1940 * VMX support of the underlying hardware.
1941 */
1942 *pdata = VMCS12_REVISION |
1943 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
1944 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
1945 break;
1946 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1947 case MSR_IA32_VMX_PINBASED_CTLS:
1948 *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
1949 nested_vmx_pinbased_ctls_high);
1950 break;
1951 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1952 case MSR_IA32_VMX_PROCBASED_CTLS:
1953 *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
1954 nested_vmx_procbased_ctls_high);
1955 break;
1956 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1957 case MSR_IA32_VMX_EXIT_CTLS:
1958 *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
1959 nested_vmx_exit_ctls_high);
1960 break;
1961 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1962 case MSR_IA32_VMX_ENTRY_CTLS:
1963 *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
1964 nested_vmx_entry_ctls_high);
1965 break;
1966 case MSR_IA32_VMX_MISC:
1967 *pdata = 0;
1968 break;
1969 /*
1970 * These MSRs specify bits which the guest must keep fixed (on or off)
1971 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
1972 * We picked the standard core2 setting.
1973 */
1974#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
1975#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
1976 case MSR_IA32_VMX_CR0_FIXED0:
1977 *pdata = VMXON_CR0_ALWAYSON;
1978 break;
1979 case MSR_IA32_VMX_CR0_FIXED1:
1980 *pdata = -1ULL;
1981 break;
1982 case MSR_IA32_VMX_CR4_FIXED0:
1983 *pdata = VMXON_CR4_ALWAYSON;
1984 break;
1985 case MSR_IA32_VMX_CR4_FIXED1:
1986 *pdata = -1ULL;
1987 break;
1988 case MSR_IA32_VMX_VMCS_ENUM:
1989 *pdata = 0x1f;
1990 break;
1991 case MSR_IA32_VMX_PROCBASED_CTLS2:
1992 *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
1993 nested_vmx_secondary_ctls_high);
1994 break;
1995 case MSR_IA32_VMX_EPT_VPID_CAP:
1996 /* Currently, no nested ept or nested vpid */
1997 *pdata = 0;
1998 break;
1999 default:
2000 return 0;
2001 }
2002
2003 return 1;
2004}
2005
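For reference, a userspace sketch of how the MSR_IA32_VMX_BASIC value returned above is packed; the shift/width values follow the SDM layout (revision id in the low 31 bits, VMCS size at bit 32, memory type at bit 50, write-back encoded as 6) and the example revision/size numbers are made up:

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_REVISION	  0x12abcu	/* made-up revision id */
#define EXAMPLE_VMCS_SIZE	  0x1000u
#define VMX_BASIC_VMCS_SIZE_SHIFT 32		/* assumed, per SDM layout */
#define VMX_BASIC_MEM_TYPE_SHIFT  50		/* assumed, per SDM layout */
#define VMX_BASIC_MEM_TYPE_WB	  6ULL

int main(void)
{
	uint64_t basic = EXAMPLE_REVISION |
			 ((uint64_t)EXAMPLE_VMCS_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
			 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	printf("revision : 0x%x\n", (uint32_t)(basic & 0x7fffffff));
	printf("vmcs size: %u bytes\n",
	       (uint32_t)((basic >> VMX_BASIC_VMCS_SIZE_SHIFT) & 0x1fff));
	printf("mem type : %u (6 == write-back)\n",
	       (uint32_t)((basic >> VMX_BASIC_MEM_TYPE_SHIFT) & 0xf));
	return 0;
}
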
2006static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2007{
2008 if (!nested_vmx_allowed(vcpu))
2009 return 0;
2010
2011 if (msr_index == MSR_IA32_FEATURE_CONTROL)
2012 /* TODO: the right thing. */
2013 return 1;
2014 /*
2015 * No need to treat VMX capability MSRs specially: If we don't handle
2016 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
2017 */
2018 return 0;
2019}
2020
1264/* 2021/*
1265 * Reads an msr value (of 'msr_index') into 'pdata'. 2022 * Reads an msr value (of 'msr_index') into 'pdata'.
1266 * Returns 0 on success, non-0 otherwise. 2023 * Returns 0 on success, non-0 otherwise.
@@ -1309,6 +2066,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1309 /* Otherwise falls through */ 2066 /* Otherwise falls through */
1310 default: 2067 default:
1311 vmx_load_host_state(to_vmx(vcpu)); 2068 vmx_load_host_state(to_vmx(vcpu));
2069 if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
2070 return 0;
1312 msr = find_msr_entry(to_vmx(vcpu), msr_index); 2071 msr = find_msr_entry(to_vmx(vcpu), msr_index);
1313 if (msr) { 2072 if (msr) {
1314 vmx_load_host_state(to_vmx(vcpu)); 2073 vmx_load_host_state(to_vmx(vcpu));
@@ -1380,6 +2139,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1380 return 1; 2139 return 1;
1381 /* Otherwise falls through */ 2140 /* Otherwise falls through */
1382 default: 2141 default:
2142 if (vmx_set_vmx_msr(vcpu, msr_index, data))
2143 break;
1383 msr = find_msr_entry(vmx, msr_index); 2144 msr = find_msr_entry(vmx, msr_index);
1384 if (msr) { 2145 if (msr) {
1385 vmx_load_host_state(vmx); 2146 vmx_load_host_state(vmx);
@@ -1469,7 +2230,7 @@ static int hardware_enable(void *garbage)
1469 if (read_cr4() & X86_CR4_VMXE) 2230 if (read_cr4() & X86_CR4_VMXE)
1470 return -EBUSY; 2231 return -EBUSY;
1471 2232
1472 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 2233 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
1473 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2234 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1474 2235
1475 test_bits = FEATURE_CONTROL_LOCKED; 2236 test_bits = FEATURE_CONTROL_LOCKED;
@@ -1493,14 +2254,14 @@ static int hardware_enable(void *garbage)
1493 return 0; 2254 return 0;
1494} 2255}
1495 2256
1496static void vmclear_local_vcpus(void) 2257static void vmclear_local_loaded_vmcss(void)
1497{ 2258{
1498 int cpu = raw_smp_processor_id(); 2259 int cpu = raw_smp_processor_id();
1499 struct vcpu_vmx *vmx, *n; 2260 struct loaded_vmcs *v, *n;
1500 2261
1501 list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), 2262 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
1502 local_vcpus_link) 2263 loaded_vmcss_on_cpu_link)
1503 __vcpu_clear(vmx); 2264 __loaded_vmcs_clear(v);
1504} 2265}
1505 2266
1506 2267
@@ -1515,7 +2276,7 @@ static void kvm_cpu_vmxoff(void)
1515static void hardware_disable(void *garbage) 2276static void hardware_disable(void *garbage)
1516{ 2277{
1517 if (vmm_exclusive) { 2278 if (vmm_exclusive) {
1518 vmclear_local_vcpus(); 2279 vmclear_local_loaded_vmcss();
1519 kvm_cpu_vmxoff(); 2280 kvm_cpu_vmxoff();
1520 } 2281 }
1521 write_cr4(read_cr4() & ~X86_CR4_VMXE); 2282 write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -1696,6 +2457,18 @@ static void free_vmcs(struct vmcs *vmcs)
1696 free_pages((unsigned long)vmcs, vmcs_config.order); 2457 free_pages((unsigned long)vmcs, vmcs_config.order);
1697} 2458}
1698 2459
2460/*
2461 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2462 */
2463static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2464{
2465 if (!loaded_vmcs->vmcs)
2466 return;
2467 loaded_vmcs_clear(loaded_vmcs);
2468 free_vmcs(loaded_vmcs->vmcs);
2469 loaded_vmcs->vmcs = NULL;
2470}
2471
1699static void free_kvm_area(void) 2472static void free_kvm_area(void)
1700{ 2473{
1701 int cpu; 2474 int cpu;
@@ -1756,6 +2529,9 @@ static __init int hardware_setup(void)
1756 if (!cpu_has_vmx_ple()) 2529 if (!cpu_has_vmx_ple())
1757 ple_gap = 0; 2530 ple_gap = 0;
1758 2531
2532 if (nested)
2533 nested_vmx_setup_ctls_msrs();
2534
1759 return alloc_kvm_area(); 2535 return alloc_kvm_area();
1760} 2536}
1761 2537
@@ -2041,7 +2817,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
2041 (unsigned long *)&vcpu->arch.regs_dirty); 2817 (unsigned long *)&vcpu->arch.regs_dirty);
2042} 2818}
2043 2819
2044static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 2820static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
2045 2821
2046static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, 2822static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
2047 unsigned long cr0, 2823 unsigned long cr0,
@@ -2139,11 +2915,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
2139 vmcs_writel(GUEST_CR3, guest_cr3); 2915 vmcs_writel(GUEST_CR3, guest_cr3);
2140} 2916}
2141 2917
2142static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 2918static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2143{ 2919{
2144 unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? 2920 unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
2145 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 2921 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
2146 2922
2923 if (cr4 & X86_CR4_VMXE) {
2924 /*
2925 * To use VMXON (and later other VMX instructions), a guest
2926 * must first be able to turn on cr4.VMXE (see handle_vmon()).
2927 * So basically the check on whether to allow nested VMX
2928 * is here.
2929 */
2930 if (!nested_vmx_allowed(vcpu))
2931 return 1;
2932 } else if (to_vmx(vcpu)->nested.vmxon)
2933 return 1;
2934
2147 vcpu->arch.cr4 = cr4; 2935 vcpu->arch.cr4 = cr4;
2148 if (enable_ept) { 2936 if (enable_ept) {
2149 if (!is_paging(vcpu)) { 2937 if (!is_paging(vcpu)) {
@@ -2156,6 +2944,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2156 2944
2157 vmcs_writel(CR4_READ_SHADOW, cr4); 2945 vmcs_writel(CR4_READ_SHADOW, cr4);
2158 vmcs_writel(GUEST_CR4, hw_cr4); 2946 vmcs_writel(GUEST_CR4, hw_cr4);
2947 return 0;
2159} 2948}
2160 2949
2161static void vmx_get_segment(struct kvm_vcpu *vcpu, 2950static void vmx_get_segment(struct kvm_vcpu *vcpu,
@@ -2721,18 +3510,110 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2721} 3510}
2722 3511
2723/* 3512/*
3513 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
3514 * will not change in the lifetime of the guest.
3515 * Note that host-state that does change is set elsewhere. E.g., host-state
3516 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
3517 */
3518static void vmx_set_constant_host_state(void)
3519{
3520 u32 low32, high32;
3521 unsigned long tmpl;
3522 struct desc_ptr dt;
3523
3524 vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
3525 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
3526 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
3527
3528 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
3529 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3530 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3531 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3532 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
3533
3534 native_store_idt(&dt);
3535 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
3536
3537 asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
3538 vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
3539
3540 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
3541 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
3542 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
3543 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
3544
3545 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
3546 rdmsr(MSR_IA32_CR_PAT, low32, high32);
3547 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
3548 }
3549}
3550
3551static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
3552{
3553 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
3554 if (enable_ept)
3555 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
3556 if (is_guest_mode(&vmx->vcpu))
3557 vmx->vcpu.arch.cr4_guest_owned_bits &=
3558 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
3559 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
3560}
3561
3562static u32 vmx_exec_control(struct vcpu_vmx *vmx)
3563{
3564 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
3565 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
3566 exec_control &= ~CPU_BASED_TPR_SHADOW;
3567#ifdef CONFIG_X86_64
3568 exec_control |= CPU_BASED_CR8_STORE_EXITING |
3569 CPU_BASED_CR8_LOAD_EXITING;
3570#endif
3571 }
3572 if (!enable_ept)
3573 exec_control |= CPU_BASED_CR3_STORE_EXITING |
3574 CPU_BASED_CR3_LOAD_EXITING |
3575 CPU_BASED_INVLPG_EXITING;
3576 return exec_control;
3577}
3578
3579static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3580{
3581 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
3582 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
3583 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
3584 if (vmx->vpid == 0)
3585 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
3586 if (!enable_ept) {
3587 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
3588 enable_unrestricted_guest = 0;
3589 }
3590 if (!enable_unrestricted_guest)
3591 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
3592 if (!ple_gap)
3593 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
3594 return exec_control;
3595}
3596
3597static void ept_set_mmio_spte_mask(void)
3598{
3599 /*
3600 * EPT Misconfigurations can be generated if the value of bits 2:0
3601 * of an EPT paging-structure entry is 110b (write/execute).
 3602 * Also, magic bits (0xffull << 49) are set to quickly identify mmio
 3603 * sptes.
3604 */
3605 kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
3606}
3607
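A tiny userspace sketch (not part of this patch) of the mask value installed above, showing that bits 2:0 carry the write/execute-without-read combination that EPT flags as a misconfiguration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t mmio_mask = (0xffULL << 49) | 0x6ULL;

	printf("mask = 0x%llx\n", (unsigned long long)mmio_mask);
	printf("bits 2:0 = %llu (6 == write+execute, no read)\n",
	       (unsigned long long)(mmio_mask & 0x7));
	return 0;
}
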
3608/*
2724 * Sets up the vmcs for emulated real mode. 3609 * Sets up the vmcs for emulated real mode.
2725 */ 3610 */
2726static int vmx_vcpu_setup(struct vcpu_vmx *vmx) 3611static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2727{ 3612{
2728 u32 host_sysenter_cs, msr_low, msr_high; 3613#ifdef CONFIG_X86_64
2729 u32 junk;
2730 u64 host_pat;
2731 unsigned long a; 3614 unsigned long a;
2732 struct desc_ptr dt; 3615#endif
2733 int i; 3616 int i;
2734 unsigned long kvm_vmx_return;
2735 u32 exec_control;
2736 3617
2737 /* I/O */ 3618 /* I/O */
2738 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); 3619 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
@@ -2747,36 +3628,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2747 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, 3628 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
2748 vmcs_config.pin_based_exec_ctrl); 3629 vmcs_config.pin_based_exec_ctrl);
2749 3630
2750 exec_control = vmcs_config.cpu_based_exec_ctrl; 3631 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
2751 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
2752 exec_control &= ~CPU_BASED_TPR_SHADOW;
2753#ifdef CONFIG_X86_64
2754 exec_control |= CPU_BASED_CR8_STORE_EXITING |
2755 CPU_BASED_CR8_LOAD_EXITING;
2756#endif
2757 }
2758 if (!enable_ept)
2759 exec_control |= CPU_BASED_CR3_STORE_EXITING |
2760 CPU_BASED_CR3_LOAD_EXITING |
2761 CPU_BASED_INVLPG_EXITING;
2762 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
2763 3632
2764 if (cpu_has_secondary_exec_ctrls()) { 3633 if (cpu_has_secondary_exec_ctrls()) {
2765 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 3634 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
2766 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) 3635 vmx_secondary_exec_control(vmx));
2767 exec_control &=
2768 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2769 if (vmx->vpid == 0)
2770 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2771 if (!enable_ept) {
2772 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2773 enable_unrestricted_guest = 0;
2774 }
2775 if (!enable_unrestricted_guest)
2776 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2777 if (!ple_gap)
2778 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
2779 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2780 } 3636 }
2781 3637
2782 if (ple_gap) { 3638 if (ple_gap) {
@@ -2784,20 +3640,13 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2784 vmcs_write32(PLE_WINDOW, ple_window); 3640 vmcs_write32(PLE_WINDOW, ple_window);
2785 } 3641 }
2786 3642
2787 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); 3643 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2788 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 3644 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2789 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 3645 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
2790 3646
2791 vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
2792 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
2793 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
2794
2795 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
2796 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
2797 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
2798 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 3647 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
2799 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 3648 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
2800 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3649 vmx_set_constant_host_state();
2801#ifdef CONFIG_X86_64 3650#ifdef CONFIG_X86_64
2802 rdmsrl(MSR_FS_BASE, a); 3651 rdmsrl(MSR_FS_BASE, a);
2803 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ 3652 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -2808,32 +3657,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2808 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 3657 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
2809#endif 3658#endif
2810 3659
2811 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
2812
2813 native_store_idt(&dt);
2814 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
2815
2816 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
2817 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
2818 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 3660 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
2819 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3661 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2820 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); 3662 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
2821 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3663 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2822 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); 3664 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
2823 3665
2824 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
2825 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
2826 rdmsrl(MSR_IA32_SYSENTER_ESP, a);
2827 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
2828 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
2829 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
2830
2831 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
2832 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2833 host_pat = msr_low | ((u64) msr_high << 32);
2834 vmcs_write64(HOST_IA32_PAT, host_pat);
2835 }
2836 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 3666 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
3667 u32 msr_low, msr_high;
3668 u64 host_pat;
2837 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); 3669 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2838 host_pat = msr_low | ((u64) msr_high << 32); 3670 host_pat = msr_low | ((u64) msr_high << 32);
2839 /* Write the default value follow host pat */ 3671 /* Write the default value follow host pat */
@@ -2863,10 +3695,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2863 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); 3695 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2864 3696
2865 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 3697 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2866 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; 3698 set_cr4_guest_host_mask(vmx);
2867 if (enable_ept)
2868 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2869 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2870 3699
2871 kvm_write_tsc(&vmx->vcpu, 0); 3700 kvm_write_tsc(&vmx->vcpu, 0);
2872 3701
@@ -2990,9 +3819,25 @@ out:
2990 return ret; 3819 return ret;
2991} 3820}
2992 3821
3822/*
3823 * In nested virtualization, check if L1 asked to exit on external interrupts.
3824 * For most existing hypervisors, this will always return true.
3825 */
3826static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
3827{
3828 return get_vmcs12(vcpu)->pin_based_vm_exec_control &
3829 PIN_BASED_EXT_INTR_MASK;
3830}
3831
2993static void enable_irq_window(struct kvm_vcpu *vcpu) 3832static void enable_irq_window(struct kvm_vcpu *vcpu)
2994{ 3833{
2995 u32 cpu_based_vm_exec_control; 3834 u32 cpu_based_vm_exec_control;
3835 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
3836 /* We can get here when nested_run_pending caused
3837 * vmx_interrupt_allowed() to return false. In this case, do
3838 * nothing - the interrupt will be injected later.
3839 */
3840 return;
2996 3841
2997 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 3842 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2998 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; 3843 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -3049,6 +3894,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
3049{ 3894{
3050 struct vcpu_vmx *vmx = to_vmx(vcpu); 3895 struct vcpu_vmx *vmx = to_vmx(vcpu);
3051 3896
3897 if (is_guest_mode(vcpu))
3898 return;
3899
3052 if (!cpu_has_virtual_nmis()) { 3900 if (!cpu_has_virtual_nmis()) {
3053 /* 3901 /*
3054 * Tracking the NMI-blocked state in software is built upon 3902 * Tracking the NMI-blocked state in software is built upon
@@ -3115,6 +3963,17 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3115 3963
3116static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 3964static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
3117{ 3965{
3966 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
3967 struct vmcs12 *vmcs12;
3968 if (to_vmx(vcpu)->nested.nested_run_pending)
3969 return 0;
3970 nested_vmx_vmexit(vcpu);
3971 vmcs12 = get_vmcs12(vcpu);
3972 vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
3973 vmcs12->vm_exit_intr_info = 0;
3974 /* fall through to normal code, but now in L1, not L2 */
3975 }
3976
3118 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 3977 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
3119 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3978 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3120 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 3979 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -3356,6 +4215,58 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3356 hypercall[2] = 0xc1; 4215 hypercall[2] = 0xc1;
3357} 4216}
3358 4217
 4218/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4219static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4220{
4221 if (to_vmx(vcpu)->nested.vmxon &&
4222 ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
4223 return 1;
4224
4225 if (is_guest_mode(vcpu)) {
4226 /*
4227 * We get here when L2 changed cr0 in a way that did not change
4228 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
4229 * but did change L0 shadowed bits. This can currently happen
4230 * with the TS bit: L0 may want to leave TS on (for lazy fpu
4231 * loading) while pretending to allow the guest to change it.
4232 */
4233 if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
4234 (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
4235 return 1;
4236 vmcs_writel(CR0_READ_SHADOW, val);
4237 return 0;
4238 } else
4239 return kvm_set_cr0(vcpu, val);
4240}
4241
4242static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
4243{
4244 if (is_guest_mode(vcpu)) {
4245 if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
4246 (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
4247 return 1;
4248 vmcs_writel(CR4_READ_SHADOW, val);
4249 return 0;
4250 } else
4251 return kvm_set_cr4(vcpu, val);
4252}
4253
 4254/* called to set cr0 as appropriate for clts instruction exit. */
4255static void handle_clts(struct kvm_vcpu *vcpu)
4256{
4257 if (is_guest_mode(vcpu)) {
4258 /*
4259 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
4260 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
4261 * just pretend it's off (also in arch.cr0 for fpu_activate).
4262 */
4263 vmcs_writel(CR0_READ_SHADOW,
4264 vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
4265 vcpu->arch.cr0 &= ~X86_CR0_TS;
4266 } else
4267 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
4268}
4269
3359static int handle_cr(struct kvm_vcpu *vcpu) 4270static int handle_cr(struct kvm_vcpu *vcpu)
3360{ 4271{
3361 unsigned long exit_qualification, val; 4272 unsigned long exit_qualification, val;
@@ -3372,7 +4283,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3372 trace_kvm_cr_write(cr, val); 4283 trace_kvm_cr_write(cr, val);
3373 switch (cr) { 4284 switch (cr) {
3374 case 0: 4285 case 0:
3375 err = kvm_set_cr0(vcpu, val); 4286 err = handle_set_cr0(vcpu, val);
3376 kvm_complete_insn_gp(vcpu, err); 4287 kvm_complete_insn_gp(vcpu, err);
3377 return 1; 4288 return 1;
3378 case 3: 4289 case 3:
@@ -3380,7 +4291,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3380 kvm_complete_insn_gp(vcpu, err); 4291 kvm_complete_insn_gp(vcpu, err);
3381 return 1; 4292 return 1;
3382 case 4: 4293 case 4:
3383 err = kvm_set_cr4(vcpu, val); 4294 err = handle_set_cr4(vcpu, val);
3384 kvm_complete_insn_gp(vcpu, err); 4295 kvm_complete_insn_gp(vcpu, err);
3385 return 1; 4296 return 1;
3386 case 8: { 4297 case 8: {
@@ -3398,7 +4309,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3398 }; 4309 };
3399 break; 4310 break;
3400 case 2: /* clts */ 4311 case 2: /* clts */
3401 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 4312 handle_clts(vcpu);
3402 trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); 4313 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
3403 skip_emulated_instruction(vcpu); 4314 skip_emulated_instruction(vcpu);
3404 vmx_fpu_activate(vcpu); 4315 vmx_fpu_activate(vcpu);
@@ -3574,12 +4485,6 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
3574 return 1; 4485 return 1;
3575} 4486}
3576 4487
3577static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3578{
3579 kvm_queue_exception(vcpu, UD_VECTOR);
3580 return 1;
3581}
3582
3583static int handle_invd(struct kvm_vcpu *vcpu) 4488static int handle_invd(struct kvm_vcpu *vcpu)
3584{ 4489{
3585 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 4490 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
@@ -3777,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3777static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 4682static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
3778{ 4683{
3779 u64 sptes[4]; 4684 u64 sptes[4];
3780 int nr_sptes, i; 4685 int nr_sptes, i, ret;
3781 gpa_t gpa; 4686 gpa_t gpa;
3782 4687
3783 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 4688 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3784 4689
4690 ret = handle_mmio_page_fault_common(vcpu, gpa, true);
4691 if (likely(ret == 1))
4692 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
4693 EMULATE_DONE;
4694 if (unlikely(!ret))
4695 return 1;
4696
4697 /* It is the real ept misconfig */
3785 printk(KERN_ERR "EPT: Misconfiguration.\n"); 4698 printk(KERN_ERR "EPT: Misconfiguration.\n");
3786 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); 4699 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
3787 4700
@@ -3866,6 +4779,639 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
3866} 4779}
3867 4780
3868/* 4781/*
4782 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
4783 * We could reuse a single VMCS for all the L2 guests, but we also want the
4784 * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
4785 * allows keeping them loaded on the processor, and in the future will allow
4786 * optimizations where prepare_vmcs02 doesn't need to set all the fields on
4787 * every entry if they never change.
4788 * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
 4789 * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first.
4790 *
4791 * The following functions allocate and free a vmcs02 in this pool.
4792 */
4793
4794/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
4795static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
4796{
4797 struct vmcs02_list *item;
4798 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
4799 if (item->vmptr == vmx->nested.current_vmptr) {
4800 list_move(&item->list, &vmx->nested.vmcs02_pool);
4801 return &item->vmcs02;
4802 }
4803
4804 if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
4805 /* Recycle the least recently used VMCS. */
4806 item = list_entry(vmx->nested.vmcs02_pool.prev,
4807 struct vmcs02_list, list);
4808 item->vmptr = vmx->nested.current_vmptr;
4809 list_move(&item->list, &vmx->nested.vmcs02_pool);
4810 return &item->vmcs02;
4811 }
4812
4813 /* Create a new VMCS */
4814 item = (struct vmcs02_list *)
4815 kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
4816 if (!item)
4817 return NULL;
4818 item->vmcs02.vmcs = alloc_vmcs();
4819 if (!item->vmcs02.vmcs) {
4820 kfree(item);
4821 return NULL;
4822 }
4823 loaded_vmcs_init(&item->vmcs02);
4824 item->vmptr = vmx->nested.current_vmptr;
4825 list_add(&(item->list), &(vmx->nested.vmcs02_pool));
4826 vmx->nested.vmcs02_num++;
4827 return &item->vmcs02;
4828}
4829
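The pool behaviour described in the comment block above (a hit moves the entry to the front, a full pool recycles the least recently used entry, otherwise a new one is allocated) can be sketched in userspace; the list handling is deliberately simplified to an array and the pool size is a made-up stand-in for VMCS02_POOL_SIZE:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define POOL_SIZE 3	/* stands in for VMCS02_POOL_SIZE (assumed) */

static uint64_t pool[POOL_SIZE];	/* vmptrs, most recent first; 0 == empty */

static void get_vmcs02(uint64_t vmptr)
{
	int i;

	for (i = 0; i < POOL_SIZE && pool[i]; i++)
		if (pool[i] == vmptr)
			break;
	if (i == POOL_SIZE || !pool[i]) {
		/* miss: take a free slot, or recycle the last (LRU) one */
		if (i == POOL_SIZE)
			i = POOL_SIZE - 1;
		printf("miss for %#llx, using slot %d\n",
		       (unsigned long long)vmptr, i);
	} else {
		printf("hit for %#llx\n", (unsigned long long)vmptr);
	}
	/* move to front, as list_move() does in the kernel code */
	memmove(&pool[1], &pool[0], i * sizeof(pool[0]));
	pool[0] = vmptr;
}

int main(void)
{
	get_vmcs02(0x1000);	/* miss, allocate */
	get_vmcs02(0x2000);	/* miss, allocate */
	get_vmcs02(0x1000);	/* hit, moves to front */
	get_vmcs02(0x3000);	/* miss, allocate */
	get_vmcs02(0x4000);	/* pool full: recycles 0x2000, the LRU entry */
	return 0;
}
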
4830/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
4831static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
4832{
4833 struct vmcs02_list *item;
4834 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
4835 if (item->vmptr == vmptr) {
4836 free_loaded_vmcs(&item->vmcs02);
4837 list_del(&item->list);
4838 kfree(item);
4839 vmx->nested.vmcs02_num--;
4840 return;
4841 }
4842}
4843
4844/*
4845 * Free all VMCSs saved for this vcpu, except the one pointed by
4846 * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
4847 * currently used, if running L2), and vmcs01 when running L2.
4848 */
4849static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
4850{
4851 struct vmcs02_list *item, *n;
4852 list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
4853 if (vmx->loaded_vmcs != &item->vmcs02)
4854 free_loaded_vmcs(&item->vmcs02);
4855 list_del(&item->list);
4856 kfree(item);
4857 }
4858 vmx->nested.vmcs02_num = 0;
4859
4860 if (vmx->loaded_vmcs != &vmx->vmcs01)
4861 free_loaded_vmcs(&vmx->vmcs01);
4862}
4863
4864/*
4865 * Emulate the VMXON instruction.
4866 * Currently, we just remember that VMX is active, and do not save or even
4867 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4868 * do not currently need to store anything in that guest-allocated memory
 4869 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
4870 * argument is different from the VMXON pointer (which the spec says they do).
4871 */
4872static int handle_vmon(struct kvm_vcpu *vcpu)
4873{
4874 struct kvm_segment cs;
4875 struct vcpu_vmx *vmx = to_vmx(vcpu);
4876
4877 /* The Intel VMX Instruction Reference lists a bunch of bits that
4878 * are prerequisite to running VMXON, most notably cr4.VMXE must be
4879 * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
4880 * Otherwise, we should fail with #UD. We test these now:
4881 */
4882 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
4883 !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
4884 (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
4885 kvm_queue_exception(vcpu, UD_VECTOR);
4886 return 1;
4887 }
4888
4889 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4890 if (is_long_mode(vcpu) && !cs.l) {
4891 kvm_queue_exception(vcpu, UD_VECTOR);
4892 return 1;
4893 }
4894
4895 if (vmx_get_cpl(vcpu)) {
4896 kvm_inject_gp(vcpu, 0);
4897 return 1;
4898 }
4899
4900 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
4901 vmx->nested.vmcs02_num = 0;
4902
4903 vmx->nested.vmxon = true;
4904
4905 skip_emulated_instruction(vcpu);
4906 return 1;
4907}
4908
4909/*
4910 * Intel's VMX Instruction Reference specifies a common set of prerequisites
4911 * for running VMX instructions (except VMXON, whose prerequisites are
4912 * slightly different). It also specifies what exception to inject otherwise.
4913 */
4914static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
4915{
4916 struct kvm_segment cs;
4917 struct vcpu_vmx *vmx = to_vmx(vcpu);
4918
4919 if (!vmx->nested.vmxon) {
4920 kvm_queue_exception(vcpu, UD_VECTOR);
4921 return 0;
4922 }
4923
4924 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4925 if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
4926 (is_long_mode(vcpu) && !cs.l)) {
4927 kvm_queue_exception(vcpu, UD_VECTOR);
4928 return 0;
4929 }
4930
4931 if (vmx_get_cpl(vcpu)) {
4932 kvm_inject_gp(vcpu, 0);
4933 return 0;
4934 }
4935
4936 return 1;
4937}
4938
4939/*
4940 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
4941 * just stops using VMX.
4942 */
4943static void free_nested(struct vcpu_vmx *vmx)
4944{
4945 if (!vmx->nested.vmxon)
4946 return;
4947 vmx->nested.vmxon = false;
4948 if (vmx->nested.current_vmptr != -1ull) {
4949 kunmap(vmx->nested.current_vmcs12_page);
4950 nested_release_page(vmx->nested.current_vmcs12_page);
4951 vmx->nested.current_vmptr = -1ull;
4952 vmx->nested.current_vmcs12 = NULL;
4953 }
4954 /* Unpin physical memory we referred to in current vmcs02 */
4955 if (vmx->nested.apic_access_page) {
4956 nested_release_page(vmx->nested.apic_access_page);
4957 vmx->nested.apic_access_page = 0;
4958 }
4959
4960 nested_free_all_saved_vmcss(vmx);
4961}
4962
4963/* Emulate the VMXOFF instruction */
4964static int handle_vmoff(struct kvm_vcpu *vcpu)
4965{
4966 if (!nested_vmx_check_permission(vcpu))
4967 return 1;
4968 free_nested(to_vmx(vcpu));
4969 skip_emulated_instruction(vcpu);
4970 return 1;
4971}
4972
4973/*
4974 * Decode the memory-address operand of a vmx instruction, as recorded on an
4975 * exit caused by such an instruction (run by a guest hypervisor).
4976 * On success, returns 0. When the operand is invalid, returns 1 and throws
4977 * #UD or #GP.
4978 */
4979static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
4980 unsigned long exit_qualification,
4981 u32 vmx_instruction_info, gva_t *ret)
4982{
4983 /*
4984 * According to Vol. 3B, "Information for VM Exits Due to Instruction
4985 * Execution", on an exit, vmx_instruction_info holds most of the
4986 * addressing components of the operand. Only the displacement part
4987 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4988 * For how an actual address is calculated from all these components,
4989 * refer to Vol. 1, "Operand Addressing".
4990 */
4991 int scaling = vmx_instruction_info & 3;
4992 int addr_size = (vmx_instruction_info >> 7) & 7;
4993 bool is_reg = vmx_instruction_info & (1u << 10);
4994 int seg_reg = (vmx_instruction_info >> 15) & 7;
4995 int index_reg = (vmx_instruction_info >> 18) & 0xf;
4996 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4997 int base_reg = (vmx_instruction_info >> 23) & 0xf;
4998 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
4999
5000 if (is_reg) {
5001 kvm_queue_exception(vcpu, UD_VECTOR);
5002 return 1;
5003 }
5004
5005 /* Addr = segment_base + offset */
5006 /* offset = base + [index * scale] + displacement */
5007 *ret = vmx_get_segment_base(vcpu, seg_reg);
5008 if (base_is_valid)
5009 *ret += kvm_register_read(vcpu, base_reg);
5010 if (index_is_valid)
5011 *ret += kvm_register_read(vcpu, index_reg)<<scaling;
5012 *ret += exit_qualification; /* holds the displacement */
5013
5014 if (addr_size == 1) /* 32 bit */
5015 *ret &= 0xffffffff;
5016
5017 /*
5018 * TODO: throw #GP (and return 1) in various cases that the VM*
5019 * instructions require it - e.g., offset beyond segment limit,
5020 * unusable or unreadable/unwritable segment, non-canonical 64-bit
5021 * address, and so on. Currently these are not checked.
5022 */
5023 return 0;
5024}
5025
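The bit positions pulled out of vmx_instruction_info above can be checked in isolation. Below is a userspace sketch with one made-up encoding; the field layout is assumed from the decoding done in get_vmx_mem_address():

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t info = (2u << 28)	/* reg2 field (used by VMREAD/VMWRITE) */
		      | (3u << 23)	/* base register number 3 */
		      | (1u << 22)	/* index register invalid */
		      | (2u << 7);	/* address size: 2 == 64 bit */

	printf("scaling       = %u\n", info & 3);
	printf("address size  = %u (1: 32 bit, 2: 64 bit)\n", (info >> 7) & 7);
	printf("register form = %u\n", (info >> 10) & 1);
	printf("segment       = %u\n", (info >> 15) & 7);
	printf("index valid   = %d\n", !((info >> 22) & 1));
	printf("base register = %u, valid = %d\n",
	       (info >> 23) & 0xf, !((info >> 27) & 1));
	return 0;
}
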
5026/*
5027 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5028 * set the success or error code of an emulated VMX instruction, as specified
5029 * by Vol 2B, VMX Instruction Reference, "Conventions".
5030 */
5031static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5032{
5033 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5034 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5035 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5036}
5037
5038static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5039{
5040 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5041 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5042 X86_EFLAGS_SF | X86_EFLAGS_OF))
5043 | X86_EFLAGS_CF);
5044}
5045
5046static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5047 u32 vm_instruction_error)
5048{
5049 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5050 /*
5051 * failValid writes the error number to the current VMCS, which
 5052 * can't be done when there isn't a current VMCS.
5053 */
5054 nested_vmx_failInvalid(vcpu);
5055 return;
5056 }
5057 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5058 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5059 X86_EFLAGS_SF | X86_EFLAGS_OF))
5060 | X86_EFLAGS_ZF);
5061 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5062}
5063
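A userspace sketch (not part of this patch) of the three RFLAGS outcomes the helpers above produce, using the architectural flag bit positions: VMsucceed clears all six arithmetic flags, VMfailInvalid sets only CF, VMfailValid sets only ZF (and stores the error number in the current vmcs12):

#include <stdio.h>

#define FL_CF 0x001
#define FL_PF 0x004
#define FL_AF 0x010
#define FL_ZF 0x040
#define FL_SF 0x080
#define FL_OF 0x800
#define FL_ARITH (FL_CF | FL_PF | FL_AF | FL_ZF | FL_SF | FL_OF)

int main(void)
{
	unsigned long rflags = 0x202 | FL_ARITH;	/* some prior state */

	printf("succeed:     0x%lx\n", rflags & ~FL_ARITH);
	printf("failInvalid: 0x%lx\n", (rflags & ~FL_ARITH) | FL_CF);
	printf("failValid:   0x%lx\n", (rflags & ~FL_ARITH) | FL_ZF);
	return 0;
}
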
5064/* Emulate the VMCLEAR instruction */
5065static int handle_vmclear(struct kvm_vcpu *vcpu)
5066{
5067 struct vcpu_vmx *vmx = to_vmx(vcpu);
5068 gva_t gva;
5069 gpa_t vmptr;
5070 struct vmcs12 *vmcs12;
5071 struct page *page;
5072 struct x86_exception e;
5073
5074 if (!nested_vmx_check_permission(vcpu))
5075 return 1;
5076
5077 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5078 vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
5079 return 1;
5080
5081 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
5082 sizeof(vmptr), &e)) {
5083 kvm_inject_page_fault(vcpu, &e);
5084 return 1;
5085 }
5086
5087 if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
5088 nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5089 skip_emulated_instruction(vcpu);
5090 return 1;
5091 }
5092
5093 if (vmptr == vmx->nested.current_vmptr) {
5094 kunmap(vmx->nested.current_vmcs12_page);
5095 nested_release_page(vmx->nested.current_vmcs12_page);
5096 vmx->nested.current_vmptr = -1ull;
5097 vmx->nested.current_vmcs12 = NULL;
5098 }
5099
5100 page = nested_get_page(vcpu, vmptr);
5101 if (page == NULL) {
5102 /*
5103 * For accurate processor emulation, VMCLEAR beyond available
5104 * physical memory should do nothing at all. However, it is
5105 * possible that a nested vmx bug, not a guest hypervisor bug,
5106 * resulted in this case, so let's shut down before doing any
5107 * more damage:
5108 */
5109 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5110 return 1;
5111 }
5112 vmcs12 = kmap(page);
5113 vmcs12->launch_state = 0;
5114 kunmap(page);
5115 nested_release_page(page);
5116
5117 nested_free_vmcs02(vmx, vmptr);
5118
5119 skip_emulated_instruction(vcpu);
5120 nested_vmx_succeed(vcpu);
5121 return 1;
5122}
5123
5124static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
5125
5126/* Emulate the VMLAUNCH instruction */
5127static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5128{
5129 return nested_vmx_run(vcpu, true);
5130}
5131
5132/* Emulate the VMRESUME instruction */
5133static int handle_vmresume(struct kvm_vcpu *vcpu)
5134{
5135
5136 return nested_vmx_run(vcpu, false);
5137}
5138
5139enum vmcs_field_type {
5140 VMCS_FIELD_TYPE_U16 = 0,
5141 VMCS_FIELD_TYPE_U64 = 1,
5142 VMCS_FIELD_TYPE_U32 = 2,
5143 VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
5144};
5145
5146static inline int vmcs_field_type(unsigned long field)
5147{
5148 if (0x1 & field) /* the *_HIGH fields are all 32 bit */
5149 return VMCS_FIELD_TYPE_U32;
 5150 return (field >> 13) & 0x3;
5151}
5152
5153static inline int vmcs_field_readonly(unsigned long field)
5154{
5155 return (((field >> 10) & 0x3) == 1);
5156}
5157
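A userspace sketch (not part of this patch) of the same encoding checks, applied to two well-known field numbers taken as assumptions here: 0x4402 (VM_EXIT_REASON, a read-only 32-bit field) and 0x6800 (GUEST_CR0, a natural-width field):

#include <stdio.h>

/* types: 0 = u16, 1 = u64, 2 = u32, 3 = natural width */
static int field_type(unsigned long field)
{
	if (field & 0x1)	/* the *_HIGH fields are the upper 32 bits */
		return 2;
	return (field >> 13) & 0x3;
}

static int field_readonly(unsigned long field)
{
	return ((field >> 10) & 0x3) == 1;
}

int main(void)
{
	printf("0x4402: type %d, readonly %d\n",
	       field_type(0x4402), field_readonly(0x4402));	/* 2, 1 */
	printf("0x6800: type %d, readonly %d\n",
	       field_type(0x6800), field_readonly(0x6800));	/* 3, 0 */
	return 0;
}
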
5158/*
5159 * Read a vmcs12 field. Since these can have varying lengths and we return
5160 * one type, we chose the biggest type (u64) and zero-extend the return value
5161 * to that size. Note that the caller, handle_vmread, might need to use only
5162 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
5163 * 64-bit fields are to be returned).
5164 */
5165static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
5166 unsigned long field, u64 *ret)
5167{
5168 short offset = vmcs_field_to_offset(field);
5169 char *p;
5170
5171 if (offset < 0)
5172 return 0;
5173
5174 p = ((char *)(get_vmcs12(vcpu))) + offset;
5175
5176 switch (vmcs_field_type(field)) {
5177 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5178 *ret = *((natural_width *)p);
5179 return 1;
5180 case VMCS_FIELD_TYPE_U16:
5181 *ret = *((u16 *)p);
5182 return 1;
5183 case VMCS_FIELD_TYPE_U32:
5184 *ret = *((u32 *)p);
5185 return 1;
5186 case VMCS_FIELD_TYPE_U64:
5187 *ret = *((u64 *)p);
5188 return 1;
5189 default:
5190 return 0; /* can never happen. */
5191 }
5192}
5193
5194/*
5195 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
 5196 * used before) all generate the same failure when one is missing.
5197 */
5198static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
5199{
5200 struct vcpu_vmx *vmx = to_vmx(vcpu);
5201 if (vmx->nested.current_vmptr == -1ull) {
5202 nested_vmx_failInvalid(vcpu);
5203 skip_emulated_instruction(vcpu);
5204 return 0;
5205 }
5206 return 1;
5207}
5208
5209static int handle_vmread(struct kvm_vcpu *vcpu)
5210{
5211 unsigned long field;
5212 u64 field_value;
5213 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5214 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5215 gva_t gva = 0;
5216
5217 if (!nested_vmx_check_permission(vcpu) ||
5218 !nested_vmx_check_vmcs12(vcpu))
5219 return 1;
5220
5221 /* Decode instruction info and find the field to read */
5222 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5223 /* Read the field, zero-extended to a u64 field_value */
5224 if (!vmcs12_read_any(vcpu, field, &field_value)) {
5225 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5226 skip_emulated_instruction(vcpu);
5227 return 1;
5228 }
5229 /*
5230 * Now copy part of this value to register or memory, as requested.
5231 * Note that the number of bits actually copied is 32 or 64 depending
5232 * on the guest's mode (32 or 64 bit), not on the given field's length.
5233 */
5234 if (vmx_instruction_info & (1u << 10)) {
5235 kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
5236 field_value);
5237 } else {
5238 if (get_vmx_mem_address(vcpu, exit_qualification,
5239 vmx_instruction_info, &gva))
5240 return 1;
5241 /* _system ok, as nested_vmx_check_permission verified cpl=0 */
5242 kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
5243 &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
5244 }
5245
5246 nested_vmx_succeed(vcpu);
5247 skip_emulated_instruction(vcpu);
5248 return 1;
5249}
5250
5251
5252static int handle_vmwrite(struct kvm_vcpu *vcpu)
5253{
5254 unsigned long field;
5255 gva_t gva;
5256 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5257 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5258 char *p;
5259 short offset;
5260 /* The value to write might be 32 or 64 bits, depending on L1's long
5261 * mode, and eventually we need to write that into a field of several
5262 * possible lengths. The code below first zero-extends the value to 64
 5263 * bit (field_value), and then copies only the appropriate number of
5264 * bits into the vmcs12 field.
5265 */
5266 u64 field_value = 0;
5267 struct x86_exception e;
5268
5269 if (!nested_vmx_check_permission(vcpu) ||
5270 !nested_vmx_check_vmcs12(vcpu))
5271 return 1;
5272
5273 if (vmx_instruction_info & (1u << 10))
5274 field_value = kvm_register_read(vcpu,
5275 (((vmx_instruction_info) >> 3) & 0xf));
5276 else {
5277 if (get_vmx_mem_address(vcpu, exit_qualification,
5278 vmx_instruction_info, &gva))
5279 return 1;
5280 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
5281 &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
5282 kvm_inject_page_fault(vcpu, &e);
5283 return 1;
5284 }
5285 }
5286
5287
5288 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5289 if (vmcs_field_readonly(field)) {
5290 nested_vmx_failValid(vcpu,
5291 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
5292 skip_emulated_instruction(vcpu);
5293 return 1;
5294 }
5295
5296 offset = vmcs_field_to_offset(field);
5297 if (offset < 0) {
5298 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5299 skip_emulated_instruction(vcpu);
5300 return 1;
5301 }
5302 p = ((char *) get_vmcs12(vcpu)) + offset;
5303
5304 switch (vmcs_field_type(field)) {
5305 case VMCS_FIELD_TYPE_U16:
5306 *(u16 *)p = field_value;
5307 break;
5308 case VMCS_FIELD_TYPE_U32:
5309 *(u32 *)p = field_value;
5310 break;
5311 case VMCS_FIELD_TYPE_U64:
5312 *(u64 *)p = field_value;
5313 break;
5314 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5315 *(natural_width *)p = field_value;
5316 break;
5317 default:
5318 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5319 skip_emulated_instruction(vcpu);
5320 return 1;
5321 }
5322
5323 nested_vmx_succeed(vcpu);
5324 skip_emulated_instruction(vcpu);
5325 return 1;
5326}
5327
5328/* Emulate the VMPTRLD instruction */
5329static int handle_vmptrld(struct kvm_vcpu *vcpu)
5330{
5331 struct vcpu_vmx *vmx = to_vmx(vcpu);
5332 gva_t gva;
5333 gpa_t vmptr;
5334 struct x86_exception e;
5335
5336 if (!nested_vmx_check_permission(vcpu))
5337 return 1;
5338
5339 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5340 vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
5341 return 1;
5342
5343 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
5344 sizeof(vmptr), &e)) {
5345 kvm_inject_page_fault(vcpu, &e);
5346 return 1;
5347 }
5348
5349 if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
5350 nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
5351 skip_emulated_instruction(vcpu);
5352 return 1;
5353 }
5354
5355 if (vmx->nested.current_vmptr != vmptr) {
5356 struct vmcs12 *new_vmcs12;
5357 struct page *page;
5358 page = nested_get_page(vcpu, vmptr);
5359 if (page == NULL) {
5360 nested_vmx_failInvalid(vcpu);
5361 skip_emulated_instruction(vcpu);
5362 return 1;
5363 }
5364 new_vmcs12 = kmap(page);
5365 if (new_vmcs12->revision_id != VMCS12_REVISION) {
5366 kunmap(page);
5367 nested_release_page_clean(page);
5368 nested_vmx_failValid(vcpu,
5369 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5370 skip_emulated_instruction(vcpu);
5371 return 1;
5372 }
5373 if (vmx->nested.current_vmptr != -1ull) {
5374 kunmap(vmx->nested.current_vmcs12_page);
5375 nested_release_page(vmx->nested.current_vmcs12_page);
5376 }
5377
5378 vmx->nested.current_vmptr = vmptr;
5379 vmx->nested.current_vmcs12 = new_vmcs12;
5380 vmx->nested.current_vmcs12_page = page;
5381 }
5382
5383 nested_vmx_succeed(vcpu);
5384 skip_emulated_instruction(vcpu);
5385 return 1;
5386}
5387
5388/* Emulate the VMPTRST instruction */
5389static int handle_vmptrst(struct kvm_vcpu *vcpu)
5390{
5391 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5392 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5393 gva_t vmcs_gva;
5394 struct x86_exception e;
5395
5396 if (!nested_vmx_check_permission(vcpu))
5397 return 1;
5398
5399 if (get_vmx_mem_address(vcpu, exit_qualification,
5400 vmx_instruction_info, &vmcs_gva))
5401 return 1;
5402 /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
5403 if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
5404 (void *)&to_vmx(vcpu)->nested.current_vmptr,
5405 sizeof(u64), &e)) {
5406 kvm_inject_page_fault(vcpu, &e);
5407 return 1;
5408 }
5409 nested_vmx_succeed(vcpu);
5410 skip_emulated_instruction(vcpu);
5411 return 1;
5412}
5413
5414/*
3869 * The exit handlers return 1 if the exit was handled fully and guest execution 5415 * The exit handlers return 1 if the exit was handled fully and guest execution
3870 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 5416 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
3871 * to be done to userspace and return 0. 5417 * to be done to userspace and return 0.
@@ -3886,15 +5432,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3886 [EXIT_REASON_INVD] = handle_invd, 5432 [EXIT_REASON_INVD] = handle_invd,
3887 [EXIT_REASON_INVLPG] = handle_invlpg, 5433 [EXIT_REASON_INVLPG] = handle_invlpg,
3888 [EXIT_REASON_VMCALL] = handle_vmcall, 5434 [EXIT_REASON_VMCALL] = handle_vmcall,
3889 [EXIT_REASON_VMCLEAR] = handle_vmx_insn, 5435 [EXIT_REASON_VMCLEAR] = handle_vmclear,
3890 [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, 5436 [EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
3891 [EXIT_REASON_VMPTRLD] = handle_vmx_insn, 5437 [EXIT_REASON_VMPTRLD] = handle_vmptrld,
3892 [EXIT_REASON_VMPTRST] = handle_vmx_insn, 5438 [EXIT_REASON_VMPTRST] = handle_vmptrst,
3893 [EXIT_REASON_VMREAD] = handle_vmx_insn, 5439 [EXIT_REASON_VMREAD] = handle_vmread,
3894 [EXIT_REASON_VMRESUME] = handle_vmx_insn, 5440 [EXIT_REASON_VMRESUME] = handle_vmresume,
3895 [EXIT_REASON_VMWRITE] = handle_vmx_insn, 5441 [EXIT_REASON_VMWRITE] = handle_vmwrite,
3896 [EXIT_REASON_VMOFF] = handle_vmx_insn, 5442 [EXIT_REASON_VMOFF] = handle_vmoff,
3897 [EXIT_REASON_VMON] = handle_vmx_insn, 5443 [EXIT_REASON_VMON] = handle_vmon,
3898 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 5444 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
3899 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 5445 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
3900 [EXIT_REASON_WBINVD] = handle_wbinvd, 5446 [EXIT_REASON_WBINVD] = handle_wbinvd,
@@ -3911,6 +5457,229 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3911static const int kvm_vmx_max_exit_handlers = 5457static const int kvm_vmx_max_exit_handlers =
3912 ARRAY_SIZE(kvm_vmx_exit_handlers); 5458 ARRAY_SIZE(kvm_vmx_exit_handlers);
3913 5459
5460/*
 5461 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
5462 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5463 * disinterest in the current event (read or write a specific MSR) by using an
5464 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5465 */
5466static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5467 struct vmcs12 *vmcs12, u32 exit_reason)
5468{
5469 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
5470 gpa_t bitmap;
5471
5472 if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
5473 return 1;
5474
5475 /*
5476 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5477 * for the four combinations of read/write and low/high MSR numbers.
5478 * First we need to figure out which of the four to use:
5479 */
5480 bitmap = vmcs12->msr_bitmap;
5481 if (exit_reason == EXIT_REASON_MSR_WRITE)
5482 bitmap += 2048;
5483 if (msr_index >= 0xc0000000) {
5484 msr_index -= 0xc0000000;
5485 bitmap += 1024;
5486 }
5487
5488 /* Then read the msr_index'th bit from this bitmap: */
5489 if (msr_index < 1024*8) {
5490 unsigned char b;
5491 kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
5492 return 1 & (b >> (msr_index & 7));
5493 } else
5494 return 1; /* let L1 handle the wrong parameter */
5495}
5496
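The bitmap arithmetic above maps an MSR number to a byte and bit inside L1's 4KB bitmap page: four 1024-byte bitmaps for read-low, read-high, write-low and write-high. A userspace sketch of that mapping (the example MSR numbers are the architectural IA32_SYSENTER_ESP and IA32_EFER):

#include <stdint.h>
#include <stdio.h>

static void locate(uint32_t msr, int is_write)
{
	uint32_t off = 0;

	if (is_write)
		off += 2048;	/* write bitmaps are the second half of the page */
	if (msr >= 0xc0000000) {
		msr -= 0xc0000000;
		off += 1024;	/* high-MSR bitmap within each half */
	}
	printf("byte offset %u, bit %u\n", off + msr / 8, msr & 7);
}

int main(void)
{
	locate(0x00000175, 0);	/* IA32_SYSENTER_ESP, read  */
	locate(0xc0000080, 1);	/* IA32_EFER,         write */
	return 0;
}
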
5497/*
5498 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
5499 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5500 * intercept (via guest_host_mask etc.) the current event.
5501 */
5502static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5503 struct vmcs12 *vmcs12)
5504{
5505 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5506 int cr = exit_qualification & 15;
5507 int reg = (exit_qualification >> 8) & 15;
5508 unsigned long val = kvm_register_read(vcpu, reg);
5509
5510 switch ((exit_qualification >> 4) & 3) {
5511 case 0: /* mov to cr */
5512 switch (cr) {
5513 case 0:
5514 if (vmcs12->cr0_guest_host_mask &
5515 (val ^ vmcs12->cr0_read_shadow))
5516 return 1;
5517 break;
5518 case 3:
5519 if ((vmcs12->cr3_target_count >= 1 &&
5520 vmcs12->cr3_target_value0 == val) ||
5521 (vmcs12->cr3_target_count >= 2 &&
5522 vmcs12->cr3_target_value1 == val) ||
5523 (vmcs12->cr3_target_count >= 3 &&
5524 vmcs12->cr3_target_value2 == val) ||
5525 (vmcs12->cr3_target_count >= 4 &&
5526 vmcs12->cr3_target_value3 == val))
5527 return 0;
5528 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5529 return 1;
5530 break;
5531 case 4:
5532 if (vmcs12->cr4_guest_host_mask &
5533 (vmcs12->cr4_read_shadow ^ val))
5534 return 1;
5535 break;
5536 case 8:
5537 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5538 return 1;
5539 break;
5540 }
5541 break;
5542 case 2: /* clts */
5543 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5544 (vmcs12->cr0_read_shadow & X86_CR0_TS))
5545 return 1;
5546 break;
5547 case 1: /* mov from cr */
5548 switch (cr) {
5549 case 3:
5550 if (vmcs12->cpu_based_vm_exec_control &
5551 CPU_BASED_CR3_STORE_EXITING)
5552 return 1;
5553 break;
5554 case 8:
5555 if (vmcs12->cpu_based_vm_exec_control &
5556 CPU_BASED_CR8_STORE_EXITING)
5557 return 1;
5558 break;
5559 }
5560 break;
5561 case 3: /* lmsw */
5562 /*
5563 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5564 * cr0. Other attempted changes are ignored, with no exit.
5565 */
5566 if (vmcs12->cr0_guest_host_mask & 0xe &
5567 (val ^ vmcs12->cr0_read_shadow))
5568 return 1;
5569 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5570 !(vmcs12->cr0_read_shadow & 0x1) &&
5571 (val & 0x1))
5572 return 1;
5573 break;
5574 }
5575 return 0;
5576}
5577
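The mov-to-cr0/cr4 cases above reduce to one test: does the write flip a bit that L1 masks, relative to L1's read shadow. A userspace sketch with made-up mask/shadow values:

#include <stdio.h>

static int l1_wants_exit(unsigned long mask, unsigned long shadow, unsigned long val)
{
	/* a bit matters to L1 only if it is masked and the write changes it */
	return (mask & (val ^ shadow)) != 0;
}

int main(void)
{
	unsigned long mask = 0x8;	/* say L1 only cares about CR0.TS (bit 3) */
	unsigned long shadow = 0x0;	/* L1 believes TS is clear */

	printf("%d\n", l1_wants_exit(mask, shadow, 0x8));	/* 1: TS toggled */
	printf("%d\n", l1_wants_exit(mask, shadow, 0x1));	/* 0: only PE changed */
	return 0;
}
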
5578/*
5579 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
5580 * should handle it ourselves in L0 (and then continue L2). Only call this
5581 * when in is_guest_mode (L2).
5582 */
5583static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
5584{
5585 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
5586 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
5587 struct vcpu_vmx *vmx = to_vmx(vcpu);
5588 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5589
5590 if (vmx->nested.nested_run_pending)
5591 return 0;
5592
5593 if (unlikely(vmx->fail)) {
5594 printk(KERN_INFO "%s failed vm entry %x\n",
5595 __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
5596 return 1;
5597 }
5598
5599 switch (exit_reason) {
5600 case EXIT_REASON_EXCEPTION_NMI:
5601 if (!is_exception(intr_info))
5602 return 0;
5603 else if (is_page_fault(intr_info))
5604 return enable_ept;
5605 return vmcs12->exception_bitmap &
5606 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5607 case EXIT_REASON_EXTERNAL_INTERRUPT:
5608 return 0;
5609 case EXIT_REASON_TRIPLE_FAULT:
5610 return 1;
5611 case EXIT_REASON_PENDING_INTERRUPT:
5612 case EXIT_REASON_NMI_WINDOW:
5613 /*
5614 * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
5615 * (aka Interrupt Window Exiting) only when L1 turned it on,
5616 * so if we got a PENDING_INTERRUPT exit, this must be for L1.
5617 * Same for NMI Window Exiting.
5618 */
5619 return 1;
5620 case EXIT_REASON_TASK_SWITCH:
5621 return 1;
5622 case EXIT_REASON_CPUID:
5623 return 1;
5624 case EXIT_REASON_HLT:
5625 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5626 case EXIT_REASON_INVD:
5627 return 1;
5628 case EXIT_REASON_INVLPG:
5629 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5630 case EXIT_REASON_RDPMC:
5631 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5632 case EXIT_REASON_RDTSC:
5633 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5634 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5635 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5636 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
5637 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
5638 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5639 /*
5640 * VMX instructions trap unconditionally. This allows L1 to
5641 * emulate them for its L2 guest, i.e., allows 3-level nesting!
5642 */
5643 return 1;
5644 case EXIT_REASON_CR_ACCESS:
5645 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5646 case EXIT_REASON_DR_ACCESS:
5647 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5648 case EXIT_REASON_IO_INSTRUCTION:
5649 /* TODO: support IO bitmaps */
5650 return 1;
5651 case EXIT_REASON_MSR_READ:
5652 case EXIT_REASON_MSR_WRITE:
5653 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5654 case EXIT_REASON_INVALID_STATE:
5655 return 1;
5656 case EXIT_REASON_MWAIT_INSTRUCTION:
5657 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5658 case EXIT_REASON_MONITOR_INSTRUCTION:
5659 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5660 case EXIT_REASON_PAUSE_INSTRUCTION:
5661 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5662 nested_cpu_has2(vmcs12,
5663 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5664 case EXIT_REASON_MCE_DURING_VMENTRY:
5665 return 0;
5666 case EXIT_REASON_TPR_BELOW_THRESHOLD:
5667 return 1;
5668 case EXIT_REASON_APIC_ACCESS:
5669 return nested_cpu_has2(vmcs12,
5670 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
5671 case EXIT_REASON_EPT_VIOLATION:
5672 case EXIT_REASON_EPT_MISCONFIG:
5673 return 0;
5674 case EXIT_REASON_WBINVD:
5675 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5676 case EXIT_REASON_XSETBV:
5677 return 1;
5678 default:
5679 return 1;
5680 }
5681}
5682
3914 5683 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3915 5684 {
3916 5685 *info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -3933,6 +5702,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3933 5702 if (vmx->emulation_required && emulate_invalid_guest_state)
3934 5703 return handle_invalid_guest_state(vcpu);
3935 5704
5705 /*
5706 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
5707 * we did not inject a still-pending event to L1 now because of
5708 * nested_run_pending, we need to re-enable this bit.
5709 */
5710 if (vmx->nested.nested_run_pending)
5711 kvm_make_request(KVM_REQ_EVENT, vcpu);
5712
5713 if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
5714 exit_reason == EXIT_REASON_VMRESUME))
5715 vmx->nested.nested_run_pending = 1;
5716 else
5717 vmx->nested.nested_run_pending = 0;
5718
5719 if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
5720 nested_vmx_vmexit(vcpu);
5721 return 1;
5722 }
5723
3936 5724 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3937 5725 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3938 5726 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3955,7 +5743,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3955 5743 "(0x%x) and exit reason is 0x%x\n",
3956 5744 __func__, vectoring_info, exit_reason);
3957 5745
3958 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
5746 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
5747 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
5748 get_vmcs12(vcpu), vcpu)))) {
3959 5749 if (vmx_interrupt_allowed(vcpu)) {
3960 5750 vmx->soft_vnmi_blocked = 0;
3961 5751 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -4118,6 +5908,8 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
4118 5908
4119 5909 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
4120 5910 {
5911 if (is_guest_mode(&vmx->vcpu))
5912 return;
4121 5913 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
4122 5914 VM_EXIT_INSTRUCTION_LEN,
4123 5915 IDT_VECTORING_ERROR_CODE);
@@ -4125,6 +5917,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
4125 5917
4126 5918 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
4127 5919 {
5920 if (is_guest_mode(vcpu))
5921 return;
4128 5922 __vmx_complete_interrupts(to_vmx(vcpu),
4129 5923 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
4130 5924 VM_ENTRY_INSTRUCTION_LEN,
@@ -4145,6 +5939,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4145 5939 {
4146 5940 struct vcpu_vmx *vmx = to_vmx(vcpu);
4147 5941
5942 if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
5943 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5944 if (vmcs12->idt_vectoring_info_field &
5945 VECTORING_INFO_VALID_MASK) {
5946 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
5947 vmcs12->idt_vectoring_info_field);
5948 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
5949 vmcs12->vm_exit_instruction_len);
5950 if (vmcs12->idt_vectoring_info_field &
5951 VECTORING_INFO_DELIVER_CODE_MASK)
5952 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
5953 vmcs12->idt_vectoring_error_code);
5954 }
5955 }
5956
4148 5957 /* Record the guest's net vcpu time for enforced NMI injections. */
4149 5958 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
4150 5959 vmx->entry_time = ktime_get();
@@ -4167,6 +5976,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4167 5976 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
4168 5977 vmx_set_interrupt_shadow(vcpu, 0);
4169 5978
5979 vmx->__launched = vmx->loaded_vmcs->launched;
4170 5980 asm(
4171 5981 /* Store host registers */
4172 5982 "push %%"R"dx; push %%"R"bp;"
@@ -4237,7 +6047,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4237 6047 "pop %%"R"bp; pop %%"R"dx \n\t"
4238 6048 "setbe %c[fail](%0) \n\t"
4239 6049 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
4240 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
6050 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
4241 6051 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
4242 6052 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
4243 6053 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
@@ -4276,8 +6086,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4276 6086
4277 6087 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
4278 6088
6089 if (is_guest_mode(vcpu)) {
6090 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6091 vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
6092 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
6093 vmcs12->idt_vectoring_error_code =
6094 vmcs_read32(IDT_VECTORING_ERROR_CODE);
6095 vmcs12->vm_exit_instruction_len =
6096 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6097 }
6098 }
6099
4279 6100 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
4280 vmx->launched = 1;
6101 vmx->loaded_vmcs->launched = 1;
4281 6102
4282 6103 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
4283 6104
@@ -4289,41 +6110,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4289 6110 #undef R
4290 6111 #undef Q
4291 6112
4292static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
4293{
4294 struct vcpu_vmx *vmx = to_vmx(vcpu);
4295
4296 if (vmx->vmcs) {
4297 vcpu_clear(vmx);
4298 free_vmcs(vmx->vmcs);
4299 vmx->vmcs = NULL;
4300 }
4301}
4302
4303 6113 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
4304 6114 {
4305 6115 struct vcpu_vmx *vmx = to_vmx(vcpu);
4306 6116
4307 6117 free_vpid(vmx);
4308 vmx_free_vmcs(vcpu);
6118 free_nested(vmx);
6119 free_loaded_vmcs(vmx->loaded_vmcs);
4309 6120 kfree(vmx->guest_msrs);
4310 6121 kvm_vcpu_uninit(vcpu);
4311 6122 kmem_cache_free(kvm_vcpu_cache, vmx);
4312 6123 }
4313 6124
4314static inline void vmcs_init(struct vmcs *vmcs)
4315{
4316 u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
4317
4318 if (!vmm_exclusive)
4319 kvm_cpu_vmxon(phys_addr);
4320
4321 vmcs_clear(vmcs);
4322
4323 if (!vmm_exclusive)
4324 kvm_cpu_vmxoff();
4325}
4326
4327 6125 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4328 6126 {
4329 6127 int err;
@@ -4345,11 +6143,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4345 6143 goto uninit_vcpu;
4346 6144 }
4347 6145
4348 vmx->vmcs = alloc_vmcs();
6146 vmx->loaded_vmcs = &vmx->vmcs01;
4349 if (!vmx->vmcs)
6147 vmx->loaded_vmcs->vmcs = alloc_vmcs();
6148 if (!vmx->loaded_vmcs->vmcs)
4350 6149 goto free_msrs;
4351
6150 if (!vmm_exclusive)
4352 vmcs_init(vmx->vmcs);
6151 kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
6152 loaded_vmcs_init(vmx->loaded_vmcs);
6153 if (!vmm_exclusive)
6154 kvm_cpu_vmxoff();
4353 6155
4354 6156 cpu = get_cpu();
4355 6157 vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4375,10 +6177,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4375 6177 goto free_vmcs;
4376 6178 }
4377 6179
6180 vmx->nested.current_vmptr = -1ull;
6181 vmx->nested.current_vmcs12 = NULL;
6182
4378 6183 return &vmx->vcpu;
4379 6184
4380 6185 free_vmcs:
4381 free_vmcs(vmx->vmcs);
6186 free_vmcs(vmx->loaded_vmcs->vmcs);
4382 6187 free_msrs:
4383 6188 kfree(vmx->guest_msrs);
4384 6189 uninit_vcpu:
@@ -4512,6 +6317,650 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4512 6317
4513 6318 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4514 6319 {
6320 if (func == 1 && nested)
6321 entry->ecx |= bit(X86_FEATURE_VMX);
6322}
6323
6324/*
6325 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
6326 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
6327 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
6328 * guest in a way that will both be appropriate to L1's requests, and our
6329 * needs. In addition to modifying the active vmcs (which is vmcs02), this
6330 * function also has additional necessary side-effects, like setting various
6331 * vcpu->arch fields.
6332 */
6333static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6334{
6335 struct vcpu_vmx *vmx = to_vmx(vcpu);
6336 u32 exec_control;
6337
6338 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
6339 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
6340 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
6341 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
6342 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
6343 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
6344 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
6345 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
6346 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
6347 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
6348 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
6349 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
6350 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
6351 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
6352 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
6353 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
6354 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
6355 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
6356 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
6357 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
6358 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
6359 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
6360 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
6361 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
6362 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
6363 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
6364 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
6365 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
6366 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
6367 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
6368 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
6369 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
6370 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
6371 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
6372 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
6373 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
6374
6375 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
6376 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
6377 vmcs12->vm_entry_intr_info_field);
6378 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
6379 vmcs12->vm_entry_exception_error_code);
6380 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
6381 vmcs12->vm_entry_instruction_len);
6382 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
6383 vmcs12->guest_interruptibility_info);
6384 vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
6385 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
6386 vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
6387 vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
6388 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
6389 vmcs12->guest_pending_dbg_exceptions);
6390 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
6391 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
6392
6393 vmcs_write64(VMCS_LINK_POINTER, -1ull);
6394
6395 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
6396 (vmcs_config.pin_based_exec_ctrl |
6397 vmcs12->pin_based_vm_exec_control));
6398
6399 /*
6400 * Whether page-faults are trapped is determined by a combination of
6401 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
6402 * If enable_ept, L0 doesn't care about page faults and we should
6403 * set all of these to L1's desires. However, if !enable_ept, L0 does
6404 * care about (at least some) page faults, and because it is not easy
6405 * (if at all possible?) to merge L0 and L1's desires, we simply ask
6406 * to exit on each and every L2 page fault. This is done by setting
6407 * MASK=MATCH=0 and (see below) EB.PF=1.
6408 * Note that below we don't need special code to set EB.PF beyond the
6409 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
6410 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
6411 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
6412 *
6413 * A problem with this approach (when !enable_ept) is that L1 may be
6414 * injected with more page faults than it asked for. This could have
6415 * caused problems, but in practice existing hypervisors don't care.
6416 * To fix this, we will need to emulate the PFEC checking (on the L1
6417 * page tables), using walk_addr(), when injecting PFs to L1.
6418 */
6419 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
6420 enable_ept ? vmcs12->page_fault_error_code_mask : 0);
6421 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
6422 enable_ept ? vmcs12->page_fault_error_code_match : 0);
6423
6424 if (cpu_has_secondary_exec_ctrls()) {
6425 u32 exec_control = vmx_secondary_exec_control(vmx);
6426 if (!vmx->rdtscp_enabled)
6427 exec_control &= ~SECONDARY_EXEC_RDTSCP;
6428 /* Take the following fields only from vmcs12 */
6429 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6430 if (nested_cpu_has(vmcs12,
6431 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
6432 exec_control |= vmcs12->secondary_vm_exec_control;
6433
6434 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
6435 /*
6436 * Translate L1 physical address to host physical
6437 * address for vmcs02. Keep the page pinned, so this
6438 * physical address remains valid. We keep a reference
6439 * to it so we can release it later.
6440 */
6441 if (vmx->nested.apic_access_page) /* shouldn't happen */
6442 nested_release_page(vmx->nested.apic_access_page);
6443 vmx->nested.apic_access_page =
6444 nested_get_page(vcpu, vmcs12->apic_access_addr);
6445 /*
6446 * If translation failed, no matter: This feature asks
6447 * to exit when accessing the given address, and if it
6448 * can never be accessed, this feature won't do
6449 * anything anyway.
6450 */
6451 if (!vmx->nested.apic_access_page)
6452 exec_control &=
6453 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6454 else
6455 vmcs_write64(APIC_ACCESS_ADDR,
6456 page_to_phys(vmx->nested.apic_access_page));
6457 }
6458
6459 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6460 }
6461
6462
6463 /*
6464 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
6465 * Some constant fields are set here by vmx_set_constant_host_state().
6466 * Other fields are different per CPU, and will be set later when
6467 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
6468 */
6469 vmx_set_constant_host_state();
6470
6471 /*
6472 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
6473 * entry, but only if the current (host) sp changed from the value
6474 * we wrote last (vmx->host_rsp). This cache is no longer relevant
6475 * if we switch vmcs, and rather than hold a separate cache per vmcs,
6476 * here we just force the write to happen on entry.
6477 */
6478 vmx->host_rsp = 0;
6479
6480 exec_control = vmx_exec_control(vmx); /* L0's desires */
6481 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
6482 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
6483 exec_control &= ~CPU_BASED_TPR_SHADOW;
6484 exec_control |= vmcs12->cpu_based_vm_exec_control;
6485 /*
6486 * Merging of IO and MSR bitmaps not currently supported.
6487 * Rather, exit every time.
6488 */
6489 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
6490 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
6491 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
6492
6493 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
6494
6495 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
6496 * bitwise-or of what L1 wants to trap for L2, and what we want to
6497 * trap. Note that CR0.TS also needs updating - we do this later.
6498 */
6499 update_exception_bitmap(vcpu);
6500 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
6501 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
6502
6503 /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
6504 vmcs_write32(VM_EXIT_CONTROLS,
6505 vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
6506 vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
6507 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
6508
6509 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
6510 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
6511 else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
6512 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
6513
6514
6515 set_cr4_guest_host_mask(vmx);
6516
6517 vmcs_write64(TSC_OFFSET,
6518 vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
6519
6520 if (enable_vpid) {
6521 /*
6522 * Trivially support vpid by letting L2s share their parent
6523 * L1's vpid. TODO: move to a more elaborate solution, giving
6524 * each L2 its own vpid and exposing the vpid feature to L1.
6525 */
6526 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
6527 vmx_flush_tlb(vcpu);
6528 }
6529
6530 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
6531 vcpu->arch.efer = vmcs12->guest_ia32_efer;
6532 if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
6533 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
6534 else
6535 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
6536 /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
6537 vmx_set_efer(vcpu, vcpu->arch.efer);
6538
6539 /*
6540 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
6541 * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
6542 * The CR0_READ_SHADOW is what L2 should have expected to read given
6543 * the specifications by L1; It's not enough to take
6544 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may
6545 * have more bits than L1 expected.
6546 */
6547 vmx_set_cr0(vcpu, vmcs12->guest_cr0);
6548 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
6549
6550 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
6551 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
6552
6553 /* shadow page tables on either EPT or shadow page tables */
6554 kvm_set_cr3(vcpu, vmcs12->guest_cr3);
6555 kvm_mmu_reset_context(vcpu);
6556
6557 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
6558 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
6559}
6560
6561/*
6562 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
6563 * for running an L2 nested guest.
6564 */
6565static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
6566{
6567 struct vmcs12 *vmcs12;
6568 struct vcpu_vmx *vmx = to_vmx(vcpu);
6569 int cpu;
6570 struct loaded_vmcs *vmcs02;
6571
6572 if (!nested_vmx_check_permission(vcpu) ||
6573 !nested_vmx_check_vmcs12(vcpu))
6574 return 1;
6575
6576 skip_emulated_instruction(vcpu);
6577 vmcs12 = get_vmcs12(vcpu);
6578
6579 /*
6580 * The nested entry process starts with enforcing various prerequisites
6581 * on vmcs12 as required by the Intel SDM, and acting appropriately when
6582 * they fail: As the SDM explains, some conditions should cause the
6583 * instruction to fail, while others will cause the instruction to seem
6584 * to succeed, but return an EXIT_REASON_INVALID_STATE.
6585 * To speed up the normal (success) code path, we should avoid checking
6586 * for misconfigurations which will anyway be caught by the processor
6587 * when using the merged vmcs02.
6588 */
6589 if (vmcs12->launch_state == launch) {
6590 nested_vmx_failValid(vcpu,
6591 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
6592 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
6593 return 1;
6594 }
6595
6596 if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
6597 !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
6598 /*TODO: Also verify bits beyond physical address width are 0*/
6599 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6600 return 1;
6601 }
6602
6603 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
6604 !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
6605 /*TODO: Also verify bits beyond physical address width are 0*/
6606 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6607 return 1;
6608 }
6609
6610 if (vmcs12->vm_entry_msr_load_count > 0 ||
6611 vmcs12->vm_exit_msr_load_count > 0 ||
6612 vmcs12->vm_exit_msr_store_count > 0) {
6613 if (printk_ratelimit())
6614 printk(KERN_WARNING
6615 "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
6616 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6617 return 1;
6618 }
6619
6620 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
6621 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
6622 !vmx_control_verify(vmcs12->secondary_vm_exec_control,
6623 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
6624 !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
6625 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
6626 !vmx_control_verify(vmcs12->vm_exit_controls,
6627 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
6628 !vmx_control_verify(vmcs12->vm_entry_controls,
6629 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
6630 {
6631 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6632 return 1;
6633 }
6634
6635 if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
6636 ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
6637 nested_vmx_failValid(vcpu,
6638 VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
6639 return 1;
6640 }
6641
6642 if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
6643 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
6644 nested_vmx_entry_failure(vcpu, vmcs12,
6645 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
6646 return 1;
6647 }
6648 if (vmcs12->vmcs_link_pointer != -1ull) {
6649 nested_vmx_entry_failure(vcpu, vmcs12,
6650 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
6651 return 1;
6652 }
6653
6654 /*
6655 * We're finally done with prerequisite checking, and can start with
6656 * the nested entry.
6657 */
6658
6659 vmcs02 = nested_get_current_vmcs02(vmx);
6660 if (!vmcs02)
6661 return -ENOMEM;
6662
6663 enter_guest_mode(vcpu);
6664
6665 vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
6666
6667 cpu = get_cpu();
6668 vmx->loaded_vmcs = vmcs02;
6669 vmx_vcpu_put(vcpu);
6670 vmx_vcpu_load(vcpu, cpu);
6671 vcpu->cpu = cpu;
6672 put_cpu();
6673
6674 vmcs12->launch_state = 1;
6675
6676 prepare_vmcs02(vcpu, vmcs12);
6677
6678 /*
6679 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
6680 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
6681 * returned as far as L1 is concerned. It will only return (and set
6682 * the success flag) when L2 exits (see nested_vmx_vmexit()).
6683 */
6684 return 1;
6685}
6686
6687/*
6688 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
6689 * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
6690 * This function returns the new value we should put in vmcs12.guest_cr0.
6691 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
6692 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
6693 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
6694 * didn't trap the bit, because if L1 did, so would L0).
6695 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
6696 * been modified by L2, and L1 knows it. So just leave the old value of
6697 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
6698 * isn't relevant, because if L0 traps this bit it can set it to anything.
6699 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
6700 * changed these bits, and therefore they need to be updated, but L0
6701 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
6702 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
6703 */
6704static inline unsigned long
6705vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6706{
6707 return
6708 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
6709 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
6710 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
6711 vcpu->arch.cr0_guest_owned_bits));
6712}
6713
6714static inline unsigned long
6715vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6716{
6717 return
6718 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
6719 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
6720 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
6721 vcpu->arch.cr4_guest_owned_bits));
6722}
6723
6724/*
6725 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
6726 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
6727 * and this function updates it to reflect the changes to the guest state while
6728 * L2 was running (and perhaps made some exits which were handled directly by L0
6729 * without going back to L1), and to reflect the exit reason.
6730 * Note that we do not have to copy here all VMCS fields, just those that
6731 * could have changed by the L2 guest or the exit - i.e., the guest-state and
6732 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
6733 * which already writes to vmcs12 directly.
6734 */
6735void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6736{
6737 /* update guest state fields: */
6738 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
6739 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
6740
6741 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
6742 vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
6743 vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
6744 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
6745
6746 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
6747 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
6748 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
6749 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
6750 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
6751 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
6752 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
6753 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
6754 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
6755 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
6756 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
6757 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
6758 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
6759 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
6760 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
6761 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
6762 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
6763 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
6764 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
6765 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
6766 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
6767 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
6768 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
6769 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
6770 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
6771 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
6772 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
6773 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
6774 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
6775 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
6776 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
6777 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
6778 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
6779 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
6780 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
6781 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
6782
6783 vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
6784 vmcs12->guest_interruptibility_info =
6785 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
6786 vmcs12->guest_pending_dbg_exceptions =
6787 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
6788
6789 /* TODO: These cannot have changed unless we have MSR bitmaps and
6790 * the relevant bit asks not to trap the change */
6791 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
6792 if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT)
6793 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
6794 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
6795 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
6796 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
6797
6798 /* update exit information fields: */
6799
6800 vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
6801 vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6802
6803 vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6804 vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6805 vmcs12->idt_vectoring_info_field =
6806 vmcs_read32(IDT_VECTORING_INFO_FIELD);
6807 vmcs12->idt_vectoring_error_code =
6808 vmcs_read32(IDT_VECTORING_ERROR_CODE);
6809 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6810 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6811
6812 /* clear vm-entry fields which are to be cleared on exit */
6813 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
6814 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
6815}
6816
6817/*
6818 * A part of what we need to do when the nested L2 guest exits and we want to
6819 * run its L1 parent, is to reset L1's guest state to the host state specified
6820 * in vmcs12.
6821 * This function is to be called not only on normal nested exit, but also on
6822 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
6823 * Failures During or After Loading Guest State").
6824 * This function should be called when the active VMCS is L1's (vmcs01).
6825 */
6826void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6827{
6828 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
6829 vcpu->arch.efer = vmcs12->host_ia32_efer;
6830 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
6831 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
6832 else
6833 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
6834 vmx_set_efer(vcpu, vcpu->arch.efer);
6835
6836 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
6837 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
6838 /*
6839 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
6840 * actually changed, because it depends on the current state of
6841 * fpu_active (which may have changed).
6842 * Note that vmx_set_cr0 refers to efer set above.
6843 */
6844 kvm_set_cr0(vcpu, vmcs12->host_cr0);
6845 /*
6846 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
6847 * to apply the same changes to L1's vmcs. We just set cr0 correctly,
6848 * but we also need to update cr0_guest_host_mask and exception_bitmap.
6849 */
6850 update_exception_bitmap(vcpu);
6851 vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
6852 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
6853
6854 /*
6855 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
6856 * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
6857 */
6858 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
6859 kvm_set_cr4(vcpu, vmcs12->host_cr4);
6860
6861 /* shadow page tables on either EPT or shadow page tables */
6862 kvm_set_cr3(vcpu, vmcs12->host_cr3);
6863 kvm_mmu_reset_context(vcpu);
6864
6865 if (enable_vpid) {
6866 /*
6867 * Trivially support vpid by letting L2s share their parent
6868 * L1's vpid. TODO: move to a more elaborate solution, giving
6869 * each L2 its own vpid and exposing the vpid feature to L1.
6870 */
6871 vmx_flush_tlb(vcpu);
6872 }
6873
6874
6875 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
6876 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
6877 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
6878 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
6879 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
6880 vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
6881 vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
6882 vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
6883 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
6884 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
6885 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
6886 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
6887 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
6888 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
6889 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
6890
6891 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
6892 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
6893 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6894 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
6895 vmcs12->host_ia32_perf_global_ctrl);
6896}
6897
6898/*
6899 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
6900 * and modify vmcs12 to make it see what it would expect to see there if
6901 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
6902 */
6903static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
6904{
6905 struct vcpu_vmx *vmx = to_vmx(vcpu);
6906 int cpu;
6907 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6908
6909 leave_guest_mode(vcpu);
6910 prepare_vmcs12(vcpu, vmcs12);
6911
6912 cpu = get_cpu();
6913 vmx->loaded_vmcs = &vmx->vmcs01;
6914 vmx_vcpu_put(vcpu);
6915 vmx_vcpu_load(vcpu, cpu);
6916 vcpu->cpu = cpu;
6917 put_cpu();
6918
6919 /* if no vmcs02 cache requested, remove the one we used */
6920 if (VMCS02_POOL_SIZE == 0)
6921 nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
6922
6923 load_vmcs12_host_state(vcpu, vmcs12);
6924
6925 /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
6926 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
6927
6928 /* This is needed for the same reason as it was needed in prepare_vmcs02 */
6929 vmx->host_rsp = 0;
6930
6931 /* Unpin physical memory we referred to in vmcs02 */
6932 if (vmx->nested.apic_access_page) {
6933 nested_release_page(vmx->nested.apic_access_page);
6934 vmx->nested.apic_access_page = 0;
6935 }
6936
6937 /*
6938 * Exiting from L2 to L1, we're now back to L1 which thinks it just
6939 * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
6940 * success or failure flag accordingly.
6941 */
6942 if (unlikely(vmx->fail)) {
6943 vmx->fail = 0;
6944 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
6945 } else
6946 nested_vmx_succeed(vcpu);
6947}
6948
6949/*
6950 * L1's failure to enter L2 is a subset of a normal exit, as explained in
6951 * 23.7 "VM-entry failures during or after loading guest state" (this also
6952 * lists the acceptable exit-reason and exit-qualification parameters).
6953 * It should only be called before L2 actually succeeded in running, and when
6954 * vmcs01 is current (it doesn't leave_guest_mode() or switch VMCSs).
6955 */
6956static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
6957 struct vmcs12 *vmcs12,
6958 u32 reason, unsigned long qualification)
6959{
6960 load_vmcs12_host_state(vcpu, vmcs12);
6961 vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
6962 vmcs12->exit_qualification = qualification;
6963 nested_vmx_succeed(vcpu);
4515 6964 }
4516 6965
4517 6966 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -4670,16 +7119,13 @@ static int __init vmx_init(void)
4670 7119 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
4671 7120
4672 7121 if (enable_ept) {
4673 bypass_guest_pf = 0;
4674 7122 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
4675 7123 VMX_EPT_EXECUTABLE_MASK);
7124 ept_set_mmio_spte_mask();
4676 7125 kvm_enable_tdp();
4677 7126 } else
4678 7127 kvm_disable_tdp();
4679 7128
4680 if (bypass_guest_pf)
4681 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
4682
4683 7129 return 0;
4684 7130
4685 7131 out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 77c9d8673dc4..84a28ea45fa4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -347,6 +347,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
347 347 vcpu->arch.cr2 = fault->address;
348 348 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
349 349 }
350EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
350 351
351 352 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
352 353 {
@@ -579,6 +580,22 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
579 580 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
580 581 }
581 582
583static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
584{
585 struct kvm_cpuid_entry2 *best;
586
587 best = kvm_find_cpuid_entry(vcpu, 7, 0);
588 return best && (best->ebx & bit(X86_FEATURE_SMEP));
589}
590
591static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
592{
593 struct kvm_cpuid_entry2 *best;
594
595 best = kvm_find_cpuid_entry(vcpu, 7, 0);
596 return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
597}
598
582 599 static void update_cpuid(struct kvm_vcpu *vcpu)
583 600 {
584 601 struct kvm_cpuid_entry2 *best;
@@ -598,14 +615,20 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
598 615 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
599 616 {
600 617 unsigned long old_cr4 = kvm_read_cr4(vcpu);
601 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
602
618 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
619 X86_CR4_PAE | X86_CR4_SMEP;
603 620 if (cr4 & CR4_RESERVED_BITS)
604 621 return 1;
605 622
606 623 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
607 624 return 1;
608 625
626 if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
627 return 1;
628
629 if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
630 return 1;
631
609 632 if (is_long_mode(vcpu)) {
610 633 if (!(cr4 & X86_CR4_PAE))
611 634 return 1;
@@ -615,11 +638,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
615 638 kvm_read_cr3(vcpu)))
616 639 return 1;
617 640
618 if (cr4 & X86_CR4_VMXE)
641 if (kvm_x86_ops->set_cr4(vcpu, cr4))
619 642 return 1;
620 643
621 kvm_x86_ops->set_cr4(vcpu, cr4);
622
623 644 if ((cr4 ^ old_cr4) & pdptr_bits)
624 645 kvm_mmu_reset_context(vcpu);
625 646
@@ -787,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
787 808 * kvm-specific. Those are put in the beginning of the list.
788 809 */
789 810
790 #define KVM_SAVE_MSRS_BEGIN 8
811 #define KVM_SAVE_MSRS_BEGIN 9
791 812 static u32 msrs_to_save[] = {
792 813 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
793 814 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
794 815 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
795 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
816 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
796 817 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
797 818 MSR_STAR,
798 819 #ifdef CONFIG_X86_64
@@ -1388,7 +1409,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1388 1409 return 1;
1389 1410 kvm_x86_ops->patch_hypercall(vcpu, instructions);
1390 1411 ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1391 if (copy_to_user((void __user *)addr, instructions, 4))
1412 if (__copy_to_user((void __user *)addr, instructions, 4))
1392 1413 return 1;
1393 1414 kvm->arch.hv_hypercall = data;
1394 1415 break;
@@ -1415,7 +1436,7 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1415 1436 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1416 1437 if (kvm_is_error_hva(addr))
1417 1438 return 1;
1418 if (clear_user((void __user *)addr, PAGE_SIZE))
1439 if (__clear_user((void __user *)addr, PAGE_SIZE))
1419 1440 return 1;
1420 1441 vcpu->arch.hv_vapic = data;
1421 1442 break;
@@ -1467,6 +1488,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
1467 1488 }
1468 1489 }
1469 1490
1491static void accumulate_steal_time(struct kvm_vcpu *vcpu)
1492{
1493 u64 delta;
1494
1495 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1496 return;
1497
1498 delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
1499 vcpu->arch.st.last_steal = current->sched_info.run_delay;
1500 vcpu->arch.st.accum_steal = delta;
1501}
1502
1503static void record_steal_time(struct kvm_vcpu *vcpu)
1504{
1505 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1506 return;
1507
1508 if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1509 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
1510 return;
1511
1512 vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
1513 vcpu->arch.st.steal.version += 2;
1514 vcpu->arch.st.accum_steal = 0;
1515
1516 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1517 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
1518}
1519
1470 1520 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1471 1521 {
1472 1522 switch (msr) {
@@ -1549,6 +1599,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1549 1599 if (kvm_pv_enable_async_pf(vcpu, data))
1550 1600 return 1;
1551 1601 break;
1602 case MSR_KVM_STEAL_TIME:
1603
1604 if (unlikely(!sched_info_on()))
1605 return 1;
1606
1607 if (data & KVM_STEAL_RESERVED_MASK)
1608 return 1;
1609
1610 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
1611 data & KVM_STEAL_VALID_BITS))
1612 return 1;
1613
1614 vcpu->arch.st.msr_val = data;
1615
1616 if (!(data & KVM_MSR_ENABLED))
1617 break;
1618
1619 vcpu->arch.st.last_steal = current->sched_info.run_delay;
1620
1621 preempt_disable();
1622 accumulate_steal_time(vcpu);
1623 preempt_enable();
1624
1625 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
1626
1627 break;
1628
1552 1629 case MSR_IA32_MCG_CTL:
1553 1630 case MSR_IA32_MCG_STATUS:
1554 1631 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1834,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1834 1911 case MSR_KVM_ASYNC_PF_EN:
1835 1912 data = vcpu->arch.apf.msr_val;
1836 1913 break;
1914 case MSR_KVM_STEAL_TIME:
1915 data = vcpu->arch.st.msr_val;
1916 break;
1837 1917 case MSR_IA32_P5_MC_ADDR:
1838 1918 case MSR_IA32_P5_MC_TYPE:
1839 1919 case MSR_IA32_MCG_CAP:
@@ -2145,6 +2225,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2145 2225 kvm_migrate_timers(vcpu);
2146 2226 vcpu->cpu = cpu;
2147 2227 }
2228
2229 accumulate_steal_time(vcpu);
2230 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2148 2231 }
2149 2232
2150 2233 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2283,6 +2366,13 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2283 2366 entry->flags = 0;
2284 2367 }
2285 2368
2369static bool supported_xcr0_bit(unsigned bit)
2370{
2371 u64 mask = ((u64)1 << bit);
2372
2373 return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
2374}
2375
2286 2376 #define F(x) bit(X86_FEATURE_##x)
2287 2377
2288 2378 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
@@ -2328,7 +2418,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2328 2418 0 /* Reserved, DCA */ | F(XMM4_1) |
2329 2419 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
2330 2420 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
2331 F(F16C);
2421 F(F16C) | F(RDRAND);
2332 2422 /* cpuid 0x80000001.ecx */
2333 2423 const u32 kvm_supported_word6_x86_features =
2334 2424 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
@@ -2342,6 +2432,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2342 2432 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
2343 2433 F(PMM) | F(PMM_EN);
2344 2434
2435 /* cpuid 7.0.ebx */
2436 const u32 kvm_supported_word9_x86_features =
2437 F(SMEP) | F(FSGSBASE) | F(ERMS);
2438
2345 2439 /* all calls to cpuid_count() should be made on the same cpu */
2346 2440 get_cpu();
2347 2441 do_cpuid_1_ent(entry, function, index);
@@ -2376,7 +2470,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2376 2470 }
2377 2471 break;
2378 2472 }
2379 /* function 4 and 0xb have additional index. */
2473 /* function 4 has additional index. */
2380 2474 case 4: {
2381 2475 int i, cache_type;
2382 2476
@@ -2393,6 +2487,22 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2393 2487 }
2394 2488 break;
2395 2489 }
2490 case 7: {
2491 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2492 /* Mask ebx against host capbability word 9 */
2493 if (index == 0) {
2494 entry->ebx &= kvm_supported_word9_x86_features;
2495 cpuid_mask(&entry->ebx, 9);
2496 } else
2497 entry->ebx = 0;
2498 entry->eax = 0;
2499 entry->ecx = 0;
2500 entry->edx = 0;
2501 break;
2502 }
2503 case 9:
2504 break;
2505 /* function 0xb has additional index. */
2396 2506 case 0xb: {
2397 2507 int i, level_type;
2398 2508
@@ -2410,16 +2520,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2410 2520 break;
2411 2521 }
2412 2522 case 0xd: {
2413 int i;
2523 int idx, i;
2414 2524
2415 2525 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2416 for (i = 1; *nent < maxnent && i < 64; ++i) {
2526 for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
2417 if (entry[i].eax == 0)
2527 do_cpuid_1_ent(&entry[i], function, idx);
2528 if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
2418 2529 continue;
2419 do_cpuid_1_ent(&entry[i], function, i);
2420 2530 entry[i].flags |=
2421 2531 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2422 2532 ++*nent;
2533 ++i;
2423 2534 }
2424 2535 break;
2425 2536 }
@@ -2438,6 +2549,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2438 2549 (1 << KVM_FEATURE_CLOCKSOURCE2) |
2439 2550 (1 << KVM_FEATURE_ASYNC_PF) |
2440 2551 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
2552
2553 if (sched_info_on())
2554 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
2555
2441 2556 entry->ebx = 0;
2442 2557 entry->ecx = 0;
2443 2558 entry->edx = 0;
@@ -2451,6 +2566,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2451 2566 entry->ecx &= kvm_supported_word6_x86_features;
2452 2567 cpuid_mask(&entry->ecx, 6);
2453 2568 break;
2569 case 0x80000008: {
2570 unsigned g_phys_as = (entry->eax >> 16) & 0xff;
2571 unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
2572 unsigned phys_as = entry->eax & 0xff;
2573
2574 if (!g_phys_as)
2575 g_phys_as = phys_as;
2576 entry->eax = g_phys_as | (virt_as << 8);
2577 entry->ebx = entry->edx = 0;
2578 break;
2579 }
2580 case 0x80000019:
2581 entry->ecx = entry->edx = 0;
2582 break;
2583 case 0x8000001a:
2584 break;
2585 case 0x8000001d:
2586 break;
2454 2587 /*Add support for Centaur's CPUID instruction*/
2455 2588 case 0xC0000000:
2456 2589 /*Just support up to 0xC0000004 now*/
@@ -2460,10 +2593,16 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2460 2593 entry->edx &= kvm_supported_word5_x86_features;
2461 2594 cpuid_mask(&entry->edx, 5);
2462 2595 break;
2596 case 3: /* Processor serial number */
2597 case 5: /* MONITOR/MWAIT */
2598 case 6: /* Thermal management */
2599 case 0xA: /* Architectural Performance Monitoring */
2600 case 0x80000007: /* Advanced power management */
2463 2601 case 0xC0000002:
2464 2602 case 0xC0000003:
2465 2603 case 0xC0000004:
2466 /*Now nothing to do, reserved for the future*/
2604 default:
2605 entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
2467 2606 break;
2468 2607 }
2469 2608
@@ -3817,7 +3956,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
3817 3956 exception);
3818 3957 }
3819 3958
3820 static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3959 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3821 3960 gva_t addr, void *val, unsigned int bytes,
3822 3961 struct x86_exception *exception)
3823 3962 {
@@ -3827,6 +3966,7 @@ static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3827 3966 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3828 3967 exception);
3829 3968 }
3969EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
3830 3970
3831 3971 static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3832 3972 gva_t addr, void *val, unsigned int bytes,
@@ -3836,7 +3976,7 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3836 3976 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3837 3977 }
3838 3978
3839 static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3979 int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3840 3980 gva_t addr, void *val,
3841 3981 unsigned int bytes,
3842 3982 struct x86_exception *exception)
@@ -3868,6 +4008,42 @@ static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3868 4008 out:
3869 4009 return r;
3870 4010 }
4011EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
4012
4013static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4014 gpa_t *gpa, struct x86_exception *exception,
4015 bool write)
4016{
4017 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4018
4019 if (vcpu_match_mmio_gva(vcpu, gva) &&
4020 check_write_user_access(vcpu, write, access,
4021 vcpu->arch.access)) {
4022 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
4023 (gva & (PAGE_SIZE - 1));
4024 trace_vcpu_match_mmio(gva, *gpa, write, false);
4025 return 1;
4026 }
4027
4028 if (write)
4029 access |= PFERR_WRITE_MASK;
4030
4031 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4032
4033 if (*gpa == UNMAPPED_GVA)
4034 return -1;
4035
4036 /* For APIC access vmexit */
4037 if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4038 return 1;
4039
4040 if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
4041 trace_vcpu_match_mmio(gva, *gpa, write, true);
4042 return 1;
4043 }
4044
4045 return 0;
4046}
3871 4047
3872static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, 4048static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
3873 unsigned long addr, 4049 unsigned long addr,
@@ -3876,8 +4052,8 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
3876 struct x86_exception *exception) 4052 struct x86_exception *exception)
3877{ 4053{
3878 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4054 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3879 gpa_t gpa; 4055 gpa_t gpa;
3880 int handled; 4056 int handled, ret;
3881 4057
3882 if (vcpu->mmio_read_completed) { 4058 if (vcpu->mmio_read_completed) {
3883 memcpy(val, vcpu->mmio_data, bytes); 4059 memcpy(val, vcpu->mmio_data, bytes);
@@ -3887,13 +4063,12 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
3887 return X86EMUL_CONTINUE; 4063 return X86EMUL_CONTINUE;
3888 } 4064 }
3889 4065
3890 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); 4066 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);
3891 4067
3892 if (gpa == UNMAPPED_GVA) 4068 if (ret < 0)
3893 return X86EMUL_PROPAGATE_FAULT; 4069 return X86EMUL_PROPAGATE_FAULT;
3894 4070
3895 /* For APIC access vmexit */ 4071 if (ret)
3896 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3897 goto mmio; 4072 goto mmio;
3898 4073
3899 if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) 4074 if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
@@ -3944,16 +4119,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
3944 struct x86_exception *exception, 4119 struct x86_exception *exception,
3945 struct kvm_vcpu *vcpu) 4120 struct kvm_vcpu *vcpu)
3946{ 4121{
3947 gpa_t gpa; 4122 gpa_t gpa;
3948 int handled; 4123 int handled, ret;
3949 4124
3950 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); 4125 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);
3951 4126
3952 if (gpa == UNMAPPED_GVA) 4127 if (ret < 0)
3953 return X86EMUL_PROPAGATE_FAULT; 4128 return X86EMUL_PROPAGATE_FAULT;
3954 4129
3955 /* For APIC access vmexit */ 4130 /* For APIC access vmexit */
3956 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 4131 if (ret)
3957 goto mmio; 4132 goto mmio;
3958 4133
3959 if (emulator_write_phys(vcpu, gpa, val, bytes)) 4134 if (emulator_write_phys(vcpu, gpa, val, bytes))
@@ -4473,9 +4648,24 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4473 kvm_queue_exception(vcpu, ctxt->exception.vector); 4648 kvm_queue_exception(vcpu, ctxt->exception.vector);
4474} 4649}
4475 4650
4651static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
4652 const unsigned long *regs)
4653{
4654 memset(&ctxt->twobyte, 0,
4655 (void *)&ctxt->regs - (void *)&ctxt->twobyte);
4656 memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
4657
4658 ctxt->fetch.start = 0;
4659 ctxt->fetch.end = 0;
4660 ctxt->io_read.pos = 0;
4661 ctxt->io_read.end = 0;
4662 ctxt->mem_read.pos = 0;
4663 ctxt->mem_read.end = 0;
4664}
4665
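
The new init_decode_cache() above leans on field ordering inside struct x86_emulate_ctxt: one memset() wipes everything from 'twobyte' up to (but not including) 'regs', and 'regs' is then refilled from the vcpu copy. A minimal userspace sketch of that partial-struct-reset idiom, using a made-up struct rather than the real x86_emulate_ctxt:

#include <stdio.h>
#include <string.h>

/* Hypothetical layout; only the ordering matters: 'twobyte' is the first
 * field to be cleared and 'regs' is the first field to be preserved. */
struct demo_ctxt {
	unsigned char twobyte;
	int op_bytes;
	int ad_bytes;
	unsigned long _eip;
	unsigned long regs[16];		/* preserved, restored from the vcpu copy */
};

int main(void)
{
	struct demo_ctxt c = { .op_bytes = 4, ._eip = 0x1234,
			       .regs = { [0] = 0xdeadbeef } };

	/* Clear every field from 'twobyte' up to, but not including, 'regs'. */
	memset(&c.twobyte, 0, (char *)&c.regs - (char *)&c.twobyte);

	/* Prints: op_bytes=0 _eip=0 regs[0]=deadbeef */
	printf("op_bytes=%d _eip=%lx regs[0]=%lx\n",
	       c.op_bytes, c._eip, c.regs[0]);
	return 0;
}

The idiom only stays correct while the decode-cache fields remain contiguous and declared before 'regs', which is exactly what the offset arithmetic in init_decode_cache() assumes.
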
4476static void init_emulate_ctxt(struct kvm_vcpu *vcpu) 4666static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4477{ 4667{
4478 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4668 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4479 int cs_db, cs_l; 4669 int cs_db, cs_l;
4480 4670
4481 /* 4671 /*
@@ -4488,40 +4678,38 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4488 4678
4489 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 4679 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4490 4680
4491 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); 4681 ctxt->eflags = kvm_get_rflags(vcpu);
4492 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); 4682 ctxt->eip = kvm_rip_read(vcpu);
4493 vcpu->arch.emulate_ctxt.mode = 4683 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4494 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 4684 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
4495 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 4685 cs_l ? X86EMUL_MODE_PROT64 :
4496 ? X86EMUL_MODE_VM86 : cs_l 4686 cs_db ? X86EMUL_MODE_PROT32 :
4497 ? X86EMUL_MODE_PROT64 : cs_db 4687 X86EMUL_MODE_PROT16;
4498 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 4688 ctxt->guest_mode = is_guest_mode(vcpu);
4499 vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); 4689
4500 memset(c, 0, sizeof(struct decode_cache)); 4690 init_decode_cache(ctxt, vcpu->arch.regs);
4501 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4502 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4691 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4503} 4692}
4504 4693
4505int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) 4694int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4506{ 4695{
4507 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4696 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4508 int ret; 4697 int ret;
4509 4698
4510 init_emulate_ctxt(vcpu); 4699 init_emulate_ctxt(vcpu);
4511 4700
4512 vcpu->arch.emulate_ctxt.decode.op_bytes = 2; 4701 ctxt->op_bytes = 2;
4513 vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; 4702 ctxt->ad_bytes = 2;
4514 vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + 4703 ctxt->_eip = ctxt->eip + inc_eip;
4515 inc_eip; 4704 ret = emulate_int_real(ctxt, irq);
4516 ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
4517 4705
4518 if (ret != X86EMUL_CONTINUE) 4706 if (ret != X86EMUL_CONTINUE)
4519 return EMULATE_FAIL; 4707 return EMULATE_FAIL;
4520 4708
4521 vcpu->arch.emulate_ctxt.eip = c->eip; 4709 ctxt->eip = ctxt->_eip;
4522 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 4710 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4523 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 4711 kvm_rip_write(vcpu, ctxt->eip);
4524 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 4712 kvm_set_rflags(vcpu, ctxt->eflags);
4525 4713
4526 if (irq == NMI_VECTOR) 4714 if (irq == NMI_VECTOR)
4527 vcpu->arch.nmi_pending = false; 4715 vcpu->arch.nmi_pending = false;
@@ -4582,21 +4770,21 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4582 int insn_len) 4770 int insn_len)
4583{ 4771{
4584 int r; 4772 int r;
4585 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4773 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4586 bool writeback = true; 4774 bool writeback = true;
4587 4775
4588 kvm_clear_exception_queue(vcpu); 4776 kvm_clear_exception_queue(vcpu);
4589 4777
4590 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4778 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4591 init_emulate_ctxt(vcpu); 4779 init_emulate_ctxt(vcpu);
4592 vcpu->arch.emulate_ctxt.interruptibility = 0; 4780 ctxt->interruptibility = 0;
4593 vcpu->arch.emulate_ctxt.have_exception = false; 4781 ctxt->have_exception = false;
4594 vcpu->arch.emulate_ctxt.perm_ok = false; 4782 ctxt->perm_ok = false;
4595 4783
4596 vcpu->arch.emulate_ctxt.only_vendor_specific_insn 4784 ctxt->only_vendor_specific_insn
4597 = emulation_type & EMULTYPE_TRAP_UD; 4785 = emulation_type & EMULTYPE_TRAP_UD;
4598 4786
4599 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); 4787 r = x86_decode_insn(ctxt, insn, insn_len);
4600 4788
4601 trace_kvm_emulate_insn_start(vcpu); 4789 trace_kvm_emulate_insn_start(vcpu);
4602 ++vcpu->stat.insn_emulation; 4790 ++vcpu->stat.insn_emulation;
@@ -4612,7 +4800,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4612 } 4800 }
4613 4801
4614 if (emulation_type & EMULTYPE_SKIP) { 4802 if (emulation_type & EMULTYPE_SKIP) {
4615 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); 4803 kvm_rip_write(vcpu, ctxt->_eip);
4616 return EMULATE_DONE; 4804 return EMULATE_DONE;
4617 } 4805 }
4618 4806
@@ -4620,11 +4808,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4620 changes register values during IO operation */ 4808
4621 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { 4809 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
4622 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4810 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4623 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 4811 memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
4624 } 4812 }
4625 4813
4626restart: 4814restart:
4627 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); 4815 r = x86_emulate_insn(ctxt);
4628 4816
4629 if (r == EMULATION_INTERCEPTED) 4817 if (r == EMULATION_INTERCEPTED)
4630 return EMULATE_DONE; 4818 return EMULATE_DONE;
@@ -4636,7 +4824,7 @@ restart:
4636 return handle_emulation_failure(vcpu); 4824 return handle_emulation_failure(vcpu);
4637 } 4825 }
4638 4826
4639 if (vcpu->arch.emulate_ctxt.have_exception) { 4827 if (ctxt->have_exception) {
4640 inject_emulated_exception(vcpu); 4828 inject_emulated_exception(vcpu);
4641 r = EMULATE_DONE; 4829 r = EMULATE_DONE;
4642 } else if (vcpu->arch.pio.count) { 4830 } else if (vcpu->arch.pio.count) {
@@ -4655,13 +4843,12 @@ restart:
4655 r = EMULATE_DONE; 4843 r = EMULATE_DONE;
4656 4844
4657 if (writeback) { 4845 if (writeback) {
4658 toggle_interruptibility(vcpu, 4846 toggle_interruptibility(vcpu, ctxt->interruptibility);
4659 vcpu->arch.emulate_ctxt.interruptibility); 4847 kvm_set_rflags(vcpu, ctxt->eflags);
4660 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4661 kvm_make_request(KVM_REQ_EVENT, vcpu); 4848 kvm_make_request(KVM_REQ_EVENT, vcpu);
4662 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 4849 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4663 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 4850 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
4664 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 4851 kvm_rip_write(vcpu, ctxt->eip);
4665 } else 4852 } else
4666 vcpu->arch.emulate_regs_need_sync_to_vcpu = true; 4853 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
4667 4854
@@ -4878,6 +5065,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
4878} 5065}
4879EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); 5066EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
4880 5067
5068static void kvm_set_mmio_spte_mask(void)
5069{
5070 u64 mask;
5071 int maxphyaddr = boot_cpu_data.x86_phys_bits;
5072
5073 /*
5074 * Set the reserved bits and the present bit of a paging-structure
5075 * entry to generate a page fault with PFERR.RSVD = 1.
5076 */
5077 mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
5078 mask |= 1ull;
5079
5080#ifdef CONFIG_X86_64
5081 /*
5082 * If the reserved bit is not supported, clear the present bit to disable
5083 * mmio page fault.
5084 */
5085 if (maxphyaddr == 52)
5086 mask &= ~1ull;
5087#endif
5088
5089 kvm_mmu_set_mmio_spte_mask(mask);
5090}
5091
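
For reference, the mask built by kvm_set_mmio_spte_mask() sets the PTE reserved bits [maxphyaddr, 62] plus the present bit, so a shadow PTE carrying it always faults with the reserved-bit error code that the new MMIO page-fault path looks for. A standalone sketch of the arithmetic, assuming a hypothetical maxphyaddr of 40 (the real value comes from boot_cpu_data.x86_phys_bits):

#include <stdio.h>

int main(void)
{
	int maxphyaddr = 40;		/* assumed for the example */
	unsigned long long mask;

	/* Reserved bits [maxphyaddr, 62] ... */
	mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
	/* ... plus the present bit. */
	mask |= 1ull;

	/* Prints: mmio spte mask = 0x7fffff0000000001 */
	printf("mmio spte mask = 0x%llx\n", mask);
	return 0;
}

With maxphyaddr == 52 the bits above the physical-address width are no longer reserved, so the function clears the present bit again and effectively disables the MMIO page-fault trick, matching the comment in the #ifdef above.
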
4881int kvm_arch_init(void *opaque) 5092int kvm_arch_init(void *opaque)
4882{ 5093{
4883 int r; 5094 int r;
@@ -4904,10 +5115,10 @@ int kvm_arch_init(void *opaque)
4904 if (r) 5115 if (r)
4905 goto out; 5116 goto out;
4906 5117
5118 kvm_set_mmio_spte_mask();
4907 kvm_init_msr_list(); 5119 kvm_init_msr_list();
4908 5120
4909 kvm_x86_ops = ops; 5121 kvm_x86_ops = ops;
4910 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
4911 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 5122 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
4912 PT_DIRTY_MASK, PT64_NX_MASK, 0); 5123 PT_DIRTY_MASK, PT64_NX_MASK, 0);
4913 5124
@@ -5082,8 +5293,7 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5082 5293
5083 kvm_x86_ops->patch_hypercall(vcpu, instruction); 5294 kvm_x86_ops->patch_hypercall(vcpu, instruction);
5084 5295
5085 return emulator_write_emulated(&vcpu->arch.emulate_ctxt, 5296 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
5086 rip, instruction, 3, NULL);
5087} 5297}
5088 5298
5089static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 5299static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
@@ -5384,6 +5594,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5384 r = 1; 5594 r = 1;
5385 goto out; 5595 goto out;
5386 } 5596 }
5597 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
5598 record_steal_time(vcpu);
5599
5387 } 5600 }
5388 5601
5389 r = kvm_mmu_reload(vcpu); 5602 r = kvm_mmu_reload(vcpu);
@@ -5671,8 +5884,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5671 * that usually, but some badly designed PV devices (vmware 5884
5672 * backdoor interface) need this to work 5885 * backdoor interface) need this to work
5673 */ 5886 */
5674 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 5887 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5675 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 5888 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
5676 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 5889 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5677 } 5890 }
5678 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 5891 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -5801,21 +6014,20 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
5801int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 6014int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5802 bool has_error_code, u32 error_code) 6015 bool has_error_code, u32 error_code)
5803{ 6016{
5804 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 6017 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5805 int ret; 6018 int ret;
5806 6019
5807 init_emulate_ctxt(vcpu); 6020 init_emulate_ctxt(vcpu);
5808 6021
5809 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, 6022 ret = emulator_task_switch(ctxt, tss_selector, reason,
5810 tss_selector, reason, has_error_code, 6023 has_error_code, error_code);
5811 error_code);
5812 6024
5813 if (ret) 6025 if (ret)
5814 return EMULATE_FAIL; 6026 return EMULATE_FAIL;
5815 6027
5816 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 6028 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
5817 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 6029 kvm_rip_write(vcpu, ctxt->eip);
5818 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 6030 kvm_set_rflags(vcpu, ctxt->eflags);
5819 kvm_make_request(KVM_REQ_EVENT, vcpu); 6031 kvm_make_request(KVM_REQ_EVENT, vcpu);
5820 return EMULATE_DONE; 6032 return EMULATE_DONE;
5821} 6033}
@@ -6093,12 +6305,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6093 if (r == 0) 6305 if (r == 0)
6094 r = kvm_mmu_setup(vcpu); 6306 r = kvm_mmu_setup(vcpu);
6095 vcpu_put(vcpu); 6307 vcpu_put(vcpu);
6096 if (r < 0)
6097 goto free_vcpu;
6098 6308
6099 return 0;
6100free_vcpu:
6101 kvm_x86_ops->vcpu_free(vcpu);
6102 return r; 6309 return r;
6103} 6310}
6104 6311
@@ -6126,6 +6333,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
6126 6333
6127 kvm_make_request(KVM_REQ_EVENT, vcpu); 6334 kvm_make_request(KVM_REQ_EVENT, vcpu);
6128 vcpu->arch.apf.msr_val = 0; 6335 vcpu->arch.apf.msr_val = 0;
6336 vcpu->arch.st.msr_val = 0;
6129 6337
6130 kvmclock_reset(vcpu); 6338 kvmclock_reset(vcpu);
6131 6339
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e407ed3df817..d36fe237c665 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -75,10 +75,54 @@ static inline u32 bit(int bitno)
75 return 1 << (bitno & 31); 75 return 1 << (bitno & 31);
76} 76}
77 77
78static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
79 gva_t gva, gfn_t gfn, unsigned access)
80{
81 vcpu->arch.mmio_gva = gva & PAGE_MASK;
82 vcpu->arch.access = access;
83 vcpu->arch.mmio_gfn = gfn;
84}
85
86/*
87 * Clear the mmio cache info for the given gva,
88 * as a special case, if gva is ~0ul, we clear all mmio cache info.
89 */
90static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
91{
92 if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
93 return;
94
95 vcpu->arch.mmio_gva = 0;
96}
97
98static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
99{
100 if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
101 return true;
102
103 return false;
104}
105
106static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
107{
108 if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
109 return true;
110
111 return false;
112}
113
78void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 114void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
79void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 115void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
80int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); 116int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
81 117
82void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); 118void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
83 119
120int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
121 gva_t addr, void *val, unsigned int bytes,
122 struct x86_exception *exception);
123
124int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
125 gva_t addr, void *val, unsigned int bytes,
126 struct x86_exception *exception);
127
84#endif 128#endif
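
The helpers added to x86.h above cache the most recent MMIO access: page-granular on the guest-virtual side (mmio_gva), frame-granular on the physical side (mmio_gfn), so vcpu_mmio_gva_to_gpa() in x86.c can skip the full page-table walk when the same MMIO page is touched again. A small standalone sketch of the matching behaviour; the 4K page size and the struct below are assumptions for the example, not the real kvm_vcpu layout:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_SIZE  (1ul << DEMO_PAGE_SHIFT)
#define DEMO_PAGE_MASK  (~(DEMO_PAGE_SIZE - 1))

struct demo_vcpu {
	unsigned long mmio_gva;		/* page-aligned gva of the last MMIO hit */
	unsigned long long mmio_gfn;
	unsigned access;
};

static void cache_mmio_info(struct demo_vcpu *v, unsigned long gva,
			    unsigned long long gfn, unsigned access)
{
	v->mmio_gva = gva & DEMO_PAGE_MASK;
	v->access = access;
	v->mmio_gfn = gfn;
}

static bool match_mmio_gva(struct demo_vcpu *v, unsigned long gva)
{
	return v->mmio_gva && v->mmio_gva == (gva & DEMO_PAGE_MASK);
}

int main(void)
{
	struct demo_vcpu v = { 0 };

	cache_mmio_info(&v, 0xfee000f0ul, 0xfee00ull, 0);
	printf("same page:  %d\n", match_mmio_gva(&v, 0xfee00020ul));	/* 1 */
	printf("other page: %d\n", match_mmio_gva(&v, 0xfee01000ul));	/* 0 */
	return 0;
}
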
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 55ef181521ff..2c366b52f505 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
161#define KVM_EXIT_NMI 16 161#define KVM_EXIT_NMI 16
162#define KVM_EXIT_INTERNAL_ERROR 17 162#define KVM_EXIT_INTERNAL_ERROR 17
163#define KVM_EXIT_OSI 18 163#define KVM_EXIT_OSI 18
164#define KVM_EXIT_PAPR_HCALL 19
164 165
165/* For KVM_EXIT_INTERNAL_ERROR */ 166/* For KVM_EXIT_INTERNAL_ERROR */
166#define KVM_INTERNAL_ERROR_EMULATION 1 167#define KVM_INTERNAL_ERROR_EMULATION 1
@@ -264,6 +265,11 @@ struct kvm_run {
264 struct { 265 struct {
265 __u64 gprs[32]; 266 __u64 gprs[32];
266 } osi; 267 } osi;
268 struct {
269 __u64 nr;
270 __u64 ret;
271 __u64 args[9];
272 } papr_hcall;
267 /* Fix the size of the union. */ 273 /* Fix the size of the union. */
268 char padding[256]; 274 char padding[256];
269 }; 275 };
@@ -544,6 +550,9 @@ struct kvm_ppc_pvinfo {
544#define KVM_CAP_TSC_CONTROL 60 550#define KVM_CAP_TSC_CONTROL 60
545#define KVM_CAP_GET_TSC_KHZ 61 551#define KVM_CAP_GET_TSC_KHZ 61
546#define KVM_CAP_PPC_BOOKE_SREGS 62 552#define KVM_CAP_PPC_BOOKE_SREGS 62
553#define KVM_CAP_SPAPR_TCE 63
554#define KVM_CAP_PPC_SMT 64
555#define KVM_CAP_PPC_RMA 65
547 556
548#ifdef KVM_CAP_IRQ_ROUTING 557#ifdef KVM_CAP_IRQ_ROUTING
549 558
@@ -746,6 +755,9 @@ struct kvm_clock_data {
746/* Available with KVM_CAP_XCRS */ 755/* Available with KVM_CAP_XCRS */
747#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) 756#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs)
748#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) 757#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs)
758#define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce)
759/* Available with KVM_CAP_RMA */
760#define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
749 761
750#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) 762#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
751 763
@@ -773,20 +785,14 @@ struct kvm_assigned_pci_dev {
773 785
774struct kvm_assigned_irq { 786struct kvm_assigned_irq {
775 __u32 assigned_dev_id; 787 __u32 assigned_dev_id;
776 __u32 host_irq; 788 __u32 host_irq; /* ignored (legacy field) */
777 __u32 guest_irq; 789 __u32 guest_irq;
778 __u32 flags; 790 __u32 flags;
779 union { 791 union {
780 struct {
781 __u32 addr_lo;
782 __u32 addr_hi;
783 __u32 data;
784 } guest_msi;
785 __u32 reserved[12]; 792 __u32 reserved[12];
786 }; 793 };
787}; 794};
788 795
789
790struct kvm_assigned_msix_nr { 796struct kvm_assigned_msix_nr {
791 __u32 assigned_dev_id; 797 __u32 assigned_dev_id;
792 __u16 entry_nr; 798 __u16 entry_nr;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 31ebb59cbd2f..eabb21a30c34 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -47,6 +47,7 @@
47#define KVM_REQ_DEACTIVATE_FPU 10 47#define KVM_REQ_DEACTIVATE_FPU 10
48#define KVM_REQ_EVENT 11 48#define KVM_REQ_EVENT 11
49#define KVM_REQ_APF_HALT 12 49#define KVM_REQ_APF_HALT 12
50#define KVM_REQ_STEAL_UPDATE 13
50 51
51#define KVM_USERSPACE_IRQ_SOURCE_ID 0 52#define KVM_USERSPACE_IRQ_SOURCE_ID 0
52 53
@@ -326,12 +327,17 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
326static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } 327static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
327 328
328extern struct page *bad_page; 329extern struct page *bad_page;
330extern struct page *fault_page;
331
329extern pfn_t bad_pfn; 332extern pfn_t bad_pfn;
333extern pfn_t fault_pfn;
330 334
331int is_error_page(struct page *page); 335int is_error_page(struct page *page);
332int is_error_pfn(pfn_t pfn); 336int is_error_pfn(pfn_t pfn);
333int is_hwpoison_pfn(pfn_t pfn); 337int is_hwpoison_pfn(pfn_t pfn);
334int is_fault_pfn(pfn_t pfn); 338int is_fault_pfn(pfn_t pfn);
339int is_noslot_pfn(pfn_t pfn);
340int is_invalid_pfn(pfn_t pfn);
335int kvm_is_error_hva(unsigned long addr); 341int kvm_is_error_hva(unsigned long addr);
336int kvm_set_memory_region(struct kvm *kvm, 342int kvm_set_memory_region(struct kvm *kvm,
337 struct kvm_userspace_memory_region *mem, 343 struct kvm_userspace_memory_region *mem,
@@ -381,6 +387,8 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
381int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 387int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
382 unsigned long len); 388 unsigned long len);
383int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); 389int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
390int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
391 void *data, unsigned long len);
384int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 392int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
385 int offset, int len); 393 int offset, int len);
386int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 394int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
diff --git a/kernel/compat.c b/kernel/compat.c
index fc9eb093acd5..18197ae2d465 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -890,6 +890,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
890 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); 890 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
891 } 891 }
892} 892}
893EXPORT_SYMBOL_GPL(sigset_from_compat);
893 894
894asmlinkage long 895asmlinkage long
895compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, 896compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
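
sigset_from_compat() is exported here because the new compat vcpu ioctl in virt/kvm/kvm_main.c (further down in this diff) needs it to translate a compat_sigset_t for KVM_SET_SIGNAL_MASK. The conversion itself folds pairs of 32-bit compat words into 64-bit signal words, as the case 1 line above shows. A quick userspace sketch of that combination:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Two 32-bit compat words: low word first, then high word. */
	uint32_t compat_sig[2] = { 0x00000004u, 0x00000001u };
	uint64_t sig;

	sig = (uint64_t)compat_sig[0] | ((uint64_t)compat_sig[1] << 32);

	/* Prints: 0x0000000100000004 */
	printf("0x%016llx\n", (unsigned long long)sig);
	return 0;
}
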
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ead9b610aa71..418b3f7053aa 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,8 +19,10 @@
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/delayacct.h> 21#include <linux/delayacct.h>
22#include <linux/module.h>
22 23
23int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ 24int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
25EXPORT_SYMBOL_GPL(delayacct_on);
24struct kmem_cache *delayacct_cache; 26struct kmem_cache *delayacct_cache;
25 27
26static int __init delayacct_setup_disable(char *str) 28static int __init delayacct_setup_disable(char *str)
diff --git a/kernel/sched.c b/kernel/sched.c
index 9aaf567c5da5..751a7cc6a5cd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,9 @@
75#include <asm/tlb.h> 75#include <asm/tlb.h>
76#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
77#include <asm/mutex.h> 77#include <asm/mutex.h>
78#ifdef CONFIG_PARAVIRT
79#include <asm/paravirt.h>
80#endif
78 81
79#include "sched_cpupri.h" 82#include "sched_cpupri.h"
80#include "workqueue_sched.h" 83#include "workqueue_sched.h"
@@ -528,6 +531,12 @@ struct rq {
528#ifdef CONFIG_IRQ_TIME_ACCOUNTING 531#ifdef CONFIG_IRQ_TIME_ACCOUNTING
529 u64 prev_irq_time; 532 u64 prev_irq_time;
530#endif 533#endif
534#ifdef CONFIG_PARAVIRT
535 u64 prev_steal_time;
536#endif
537#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
538 u64 prev_steal_time_rq;
539#endif
531 540
532 /* calc_load related fields */ 541 /* calc_load related fields */
533 unsigned long calc_load_update; 542 unsigned long calc_load_update;
@@ -1921,10 +1930,28 @@ void account_system_vtime(struct task_struct *curr)
1921} 1930}
1922EXPORT_SYMBOL_GPL(account_system_vtime); 1931EXPORT_SYMBOL_GPL(account_system_vtime);
1923 1932
1924static void update_rq_clock_task(struct rq *rq, s64 delta) 1933#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1934
1935#ifdef CONFIG_PARAVIRT
1936static inline u64 steal_ticks(u64 steal)
1925{ 1937{
1926 s64 irq_delta; 1938 if (unlikely(steal > NSEC_PER_SEC))
1939 return div_u64(steal, TICK_NSEC);
1927 1940
1941 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
1942}
1943#endif
1944
1945static void update_rq_clock_task(struct rq *rq, s64 delta)
1946{
1947/*
1948 * In theory, the compiler should just see 0 here, and optimize out the call
1949 * to sched_rt_avg_update. But I don't trust it...
1950 */
1951#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1952 s64 steal = 0, irq_delta = 0;
1953#endif
1954#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1928 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 1955 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1929 1956
1930 /* 1957 /*
@@ -1947,12 +1974,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1947 1974
1948 rq->prev_irq_time += irq_delta; 1975 rq->prev_irq_time += irq_delta;
1949 delta -= irq_delta; 1976 delta -= irq_delta;
1977#endif
1978#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
1979 if (static_branch((&paravirt_steal_rq_enabled))) {
1980 u64 st;
1981
1982 steal = paravirt_steal_clock(cpu_of(rq));
1983 steal -= rq->prev_steal_time_rq;
1984
1985 if (unlikely(steal > delta))
1986 steal = delta;
1987
1988 st = steal_ticks(steal);
1989 steal = st * TICK_NSEC;
1990
1991 rq->prev_steal_time_rq += steal;
1992
1993 delta -= steal;
1994 }
1995#endif
1996
1950 rq->clock_task += delta; 1997 rq->clock_task += delta;
1951 1998
1952 if (irq_delta && sched_feat(NONIRQ_POWER)) 1999#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1953 sched_rt_avg_update(rq, irq_delta); 2000 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
2001 sched_rt_avg_update(rq, irq_delta + steal);
2002#endif
1954} 2003}
1955 2004
2005#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1956static int irqtime_account_hi_update(void) 2006static int irqtime_account_hi_update(void)
1957{ 2007{
1958 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2008 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -1987,12 +2037,7 @@ static int irqtime_account_si_update(void)
1987 2037
1988#define sched_clock_irqtime (0) 2038#define sched_clock_irqtime (0)
1989 2039
1990static void update_rq_clock_task(struct rq *rq, s64 delta) 2040#endif
1991{
1992 rq->clock_task += delta;
1993}
1994
1995#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1996 2041
1997#include "sched_idletask.c" 2042#include "sched_idletask.c"
1998#include "sched_fair.c" 2043#include "sched_fair.c"
@@ -3845,6 +3890,25 @@ void account_idle_time(cputime_t cputime)
3845 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3890 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3846} 3891}
3847 3892
3893static __always_inline bool steal_account_process_tick(void)
3894{
3895#ifdef CONFIG_PARAVIRT
3896 if (static_branch(&paravirt_steal_enabled)) {
3897 u64 steal, st = 0;
3898
3899 steal = paravirt_steal_clock(smp_processor_id());
3900 steal -= this_rq()->prev_steal_time;
3901
3902 st = steal_ticks(steal);
3903 this_rq()->prev_steal_time += st * TICK_NSEC;
3904
3905 account_steal_time(st);
3906 return st;
3907 }
3908#endif
3909 return false;
3910}
3911
3848#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3912#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3849 3913
3850#ifdef CONFIG_IRQ_TIME_ACCOUNTING 3914#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3876,6 +3940,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3876 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 3940 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3877 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3941 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3878 3942
3943 if (steal_account_process_tick())
3944 return;
3945
3879 if (irqtime_account_hi_update()) { 3946 if (irqtime_account_hi_update()) {
3880 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3947 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3881 } else if (irqtime_account_si_update()) { 3948 } else if (irqtime_account_si_update()) {
@@ -3929,6 +3996,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
3929 return; 3996 return;
3930 } 3997 }
3931 3998
3999 if (steal_account_process_tick())
4000 return;
4001
3932 if (user_tick) 4002 if (user_tick)
3933 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 4003 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3934 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 4004 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
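
Taken together, the scheduler hunks above make steal time visible in two places: steal_account_process_tick() feeds whole stolen ticks into account_steal_time() on each tick, and update_rq_clock_task() subtracts the stolen nanoseconds from clock_task so task runtime accounting does not charge tasks for time the host gave to someone else. A userspace sketch of the nanoseconds-to-ticks step done by steal_ticks(), assuming HZ = 1000 so TICK_NSEC is 1,000,000 ns:

#include <stdio.h>

#define DEMO_TICK_NSEC 1000000ull	/* assumes HZ = 1000 */

/* Same integer quotient as the kernel's div_u64()/__iter_div_u64_rem()
 * based helper; plain 64-bit division is enough for the sketch. */
static unsigned long long steal_ticks(unsigned long long steal_ns)
{
	return steal_ns / DEMO_TICK_NSEC;
}

int main(void)
{
	/* 3.5 ms of reported steal time -> 3 whole ticks; the 0.5 ms
	 * remainder is not lost, because the callers only advance
	 * prev_steal_time by st * TICK_NSEC and re-read the steal clock
	 * on the next tick. */
	printf("%llu\n", steal_ticks(3500000ull));	/* prints 3 */
	return 0;
}
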
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 1e7066d76c26..2e74677cb040 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,9 +61,9 @@ SCHED_FEAT(LB_BIAS, 1)
61SCHED_FEAT(OWNER_SPIN, 1) 61SCHED_FEAT(OWNER_SPIN, 1)
62 62
63/* 63/*
64 * Decrement CPU power based on irq activity 64 * Decrement CPU power based on time not spent running tasks
65 */ 65 */
66SCHED_FEAT(NONIRQ_POWER, 1) 66SCHED_FEAT(NONTASK_POWER, 1)
67 67
68/* 68/*
69 * Queue remote wakeups on the target CPU and process them 69 * Queue remote wakeups on the target CPU and process them
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 6cc4b97ec458..4e9eaeb518c7 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -617,7 +617,7 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
617 if (adev->entries_nr == 0) { 617 if (adev->entries_nr == 0) {
618 adev->entries_nr = entry_nr->entry_nr; 618 adev->entries_nr = entry_nr->entry_nr;
619 if (adev->entries_nr == 0 || 619 if (adev->entries_nr == 0 ||
620 adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { 620 adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
621 r = -EINVAL; 621 r = -EINVAL;
622 goto msix_nr_out; 622 goto msix_nr_out;
623 } 623 }
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 62a9caf0563c..78c80f67f535 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -30,6 +30,12 @@
30#include <linux/iommu.h> 30#include <linux/iommu.h>
31#include <linux/intel-iommu.h> 31#include <linux/intel-iommu.h>
32 32
33static int allow_unsafe_assigned_interrupts;
34module_param_named(allow_unsafe_assigned_interrupts,
35 allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR);
36MODULE_PARM_DESC(allow_unsafe_assigned_interrupts,
37 "Enable device assignment on platforms without interrupt remapping support.");
38
33static int kvm_iommu_unmap_memslots(struct kvm *kvm); 39static int kvm_iommu_unmap_memslots(struct kvm *kvm);
34static void kvm_iommu_put_pages(struct kvm *kvm, 40static void kvm_iommu_put_pages(struct kvm *kvm,
35 gfn_t base_gfn, unsigned long npages); 41 gfn_t base_gfn, unsigned long npages);
@@ -231,6 +237,18 @@ int kvm_iommu_map_guest(struct kvm *kvm)
231 if (!kvm->arch.iommu_domain) 237 if (!kvm->arch.iommu_domain)
232 return -ENOMEM; 238 return -ENOMEM;
233 239
240 if (!allow_unsafe_assigned_interrupts &&
241 !iommu_domain_has_cap(kvm->arch.iommu_domain,
242 IOMMU_CAP_INTR_REMAP)) {
243 printk(KERN_WARNING "%s: No interrupt remapping support,"
244 " disallowing device assignment."
245 " Re-enble with \"allow_unsafe_assigned_interrupts=1\""
246 " module option.\n", __func__);
247 iommu_domain_free(kvm->arch.iommu_domain);
248 kvm->arch.iommu_domain = NULL;
249 return -EPERM;
250 }
251
234 r = kvm_iommu_map_memslots(kvm); 252 r = kvm_iommu_map_memslots(kvm);
235 if (r) 253 if (r)
236 goto out_unmap; 254 goto out_unmap;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 96ebc0679415..aefdda390f5e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -84,6 +84,10 @@ struct dentry *kvm_debugfs_dir;
84 84
85static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 85static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
86 unsigned long arg); 86 unsigned long arg);
87#ifdef CONFIG_COMPAT
88static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
89 unsigned long arg);
90#endif
87static int hardware_enable_all(void); 91static int hardware_enable_all(void);
88static void hardware_disable_all(void); 92static void hardware_disable_all(void);
89 93
@@ -97,8 +101,8 @@ static bool largepages_enabled = true;
97static struct page *hwpoison_page; 101static struct page *hwpoison_page;
98static pfn_t hwpoison_pfn; 102static pfn_t hwpoison_pfn;
99 103
100static struct page *fault_page; 104struct page *fault_page;
101static pfn_t fault_pfn; 105pfn_t fault_pfn;
102 106
103inline int kvm_is_mmio_pfn(pfn_t pfn) 107inline int kvm_is_mmio_pfn(pfn_t pfn)
104{ 108{
@@ -827,6 +831,13 @@ skip_lpage:
827 831
828 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); 832 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
829 833
834 /*
835 * If a new memory slot is created, we need to clear all
836 * mmio sptes.
837 */
838 if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
839 kvm_arch_flush_shadow(kvm);
840
830 kvm_free_physmem_slot(&old, &new); 841 kvm_free_physmem_slot(&old, &new);
831 kfree(old_memslots); 842 kfree(old_memslots);
832 843
@@ -927,6 +938,18 @@ int is_fault_pfn(pfn_t pfn)
927} 938}
928EXPORT_SYMBOL_GPL(is_fault_pfn); 939EXPORT_SYMBOL_GPL(is_fault_pfn);
929 940
941int is_noslot_pfn(pfn_t pfn)
942{
943 return pfn == bad_pfn;
944}
945EXPORT_SYMBOL_GPL(is_noslot_pfn);
946
947int is_invalid_pfn(pfn_t pfn)
948{
949 return pfn == hwpoison_pfn || pfn == fault_pfn;
950}
951EXPORT_SYMBOL_GPL(is_invalid_pfn);
952
930static inline unsigned long bad_hva(void) 953static inline unsigned long bad_hva(void)
931{ 954{
932 return PAGE_OFFSET; 955 return PAGE_OFFSET;
@@ -1345,7 +1368,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
1345 addr = gfn_to_hva(kvm, gfn); 1368 addr = gfn_to_hva(kvm, gfn);
1346 if (kvm_is_error_hva(addr)) 1369 if (kvm_is_error_hva(addr))
1347 return -EFAULT; 1370 return -EFAULT;
1348 r = copy_to_user((void __user *)addr + offset, data, len); 1371 r = __copy_to_user((void __user *)addr + offset, data, len);
1349 if (r) 1372 if (r)
1350 return -EFAULT; 1373 return -EFAULT;
1351 mark_page_dirty(kvm, gfn); 1374 mark_page_dirty(kvm, gfn);
@@ -1405,7 +1428,7 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1405 if (kvm_is_error_hva(ghc->hva)) 1428 if (kvm_is_error_hva(ghc->hva))
1406 return -EFAULT; 1429 return -EFAULT;
1407 1430
1408 r = copy_to_user((void __user *)ghc->hva, data, len); 1431 r = __copy_to_user((void __user *)ghc->hva, data, len);
1409 if (r) 1432 if (r)
1410 return -EFAULT; 1433 return -EFAULT;
1411 mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); 1434 mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
@@ -1414,6 +1437,26 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1414} 1437}
1415EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 1438EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
1416 1439
1440int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1441 void *data, unsigned long len)
1442{
1443 struct kvm_memslots *slots = kvm_memslots(kvm);
1444 int r;
1445
1446 if (slots->generation != ghc->generation)
1447 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1448
1449 if (kvm_is_error_hva(ghc->hva))
1450 return -EFAULT;
1451
1452 r = __copy_from_user(data, (void __user *)ghc->hva, len);
1453 if (r)
1454 return -EFAULT;
1455
1456 return 0;
1457}
1458EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
1459
1417int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1460int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1418{ 1461{
1419 return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, 1462 return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
@@ -1586,7 +1629,9 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1586static struct file_operations kvm_vcpu_fops = { 1629static struct file_operations kvm_vcpu_fops = {
1587 .release = kvm_vcpu_release, 1630 .release = kvm_vcpu_release,
1588 .unlocked_ioctl = kvm_vcpu_ioctl, 1631 .unlocked_ioctl = kvm_vcpu_ioctl,
1589 .compat_ioctl = kvm_vcpu_ioctl, 1632#ifdef CONFIG_COMPAT
1633 .compat_ioctl = kvm_vcpu_compat_ioctl,
1634#endif
1590 .mmap = kvm_vcpu_mmap, 1635 .mmap = kvm_vcpu_mmap,
1591 .llseek = noop_llseek, 1636 .llseek = noop_llseek,
1592}; 1637};
@@ -1615,18 +1660,18 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1615 1660
1616 r = kvm_arch_vcpu_setup(vcpu); 1661 r = kvm_arch_vcpu_setup(vcpu);
1617 if (r) 1662 if (r)
1618 return r; 1663 goto vcpu_destroy;
1619 1664
1620 mutex_lock(&kvm->lock); 1665 mutex_lock(&kvm->lock);
1621 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { 1666 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1622 r = -EINVAL; 1667 r = -EINVAL;
1623 goto vcpu_destroy; 1668 goto unlock_vcpu_destroy;
1624 } 1669 }
1625 1670
1626 kvm_for_each_vcpu(r, v, kvm) 1671 kvm_for_each_vcpu(r, v, kvm)
1627 if (v->vcpu_id == id) { 1672 if (v->vcpu_id == id) {
1628 r = -EEXIST; 1673 r = -EEXIST;
1629 goto vcpu_destroy; 1674 goto unlock_vcpu_destroy;
1630 } 1675 }
1631 1676
1632 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 1677 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
@@ -1636,7 +1681,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1636 r = create_vcpu_fd(vcpu); 1681 r = create_vcpu_fd(vcpu);
1637 if (r < 0) { 1682 if (r < 0) {
1638 kvm_put_kvm(kvm); 1683 kvm_put_kvm(kvm);
1639 goto vcpu_destroy; 1684 goto unlock_vcpu_destroy;
1640 } 1685 }
1641 1686
1642 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 1687 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
@@ -1650,8 +1695,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1650 mutex_unlock(&kvm->lock); 1695 mutex_unlock(&kvm->lock);
1651 return r; 1696 return r;
1652 1697
1653vcpu_destroy: 1698unlock_vcpu_destroy:
1654 mutex_unlock(&kvm->lock); 1699 mutex_unlock(&kvm->lock);
1700vcpu_destroy:
1655 kvm_arch_vcpu_destroy(vcpu); 1701 kvm_arch_vcpu_destroy(vcpu);
1656 return r; 1702 return r;
1657} 1703}
@@ -1874,6 +1920,50 @@ out:
1874 return r; 1920 return r;
1875} 1921}
1876 1922
1923#ifdef CONFIG_COMPAT
1924static long kvm_vcpu_compat_ioctl(struct file *filp,
1925 unsigned int ioctl, unsigned long arg)
1926{
1927 struct kvm_vcpu *vcpu = filp->private_data;
1928 void __user *argp = compat_ptr(arg);
1929 int r;
1930
1931 if (vcpu->kvm->mm != current->mm)
1932 return -EIO;
1933
1934 switch (ioctl) {
1935 case KVM_SET_SIGNAL_MASK: {
1936 struct kvm_signal_mask __user *sigmask_arg = argp;
1937 struct kvm_signal_mask kvm_sigmask;
1938 compat_sigset_t csigset;
1939 sigset_t sigset;
1940
1941 if (argp) {
1942 r = -EFAULT;
1943 if (copy_from_user(&kvm_sigmask, argp,
1944 sizeof kvm_sigmask))
1945 goto out;
1946 r = -EINVAL;
1947 if (kvm_sigmask.len != sizeof csigset)
1948 goto out;
1949 r = -EFAULT;
1950 if (copy_from_user(&csigset, sigmask_arg->sigset,
1951 sizeof csigset))
1952 goto out;
1953 }
1954 sigset_from_compat(&sigset, &csigset);
1955 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
1956 break;
1957 }
1958 default:
1959 r = kvm_vcpu_ioctl(filp, ioctl, arg);
1960 }
1961
1962out:
1963 return r;
1964}
1965#endif
1966
1877static long kvm_vm_ioctl(struct file *filp, 1967static long kvm_vm_ioctl(struct file *filp,
1878 unsigned int ioctl, unsigned long arg) 1968 unsigned int ioctl, unsigned long arg)
1879{ 1969{