aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/virtual/kvm/api.txt259
-rw-r--r--Documentation/virtual/kvm/ppc-pv.txt24
-rw-r--r--arch/ia64/include/asm/kvm.h4
-rw-r--r--arch/ia64/include/asm/kvm_host.h3
-rw-r--r--arch/ia64/kvm/kvm-ia64.c25
-rw-r--r--arch/powerpc/include/asm/kvm.h46
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h98
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_32.h6
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h180
-rw-r--r--arch/powerpc/include/asm/kvm_e500.h52
-rw-r--r--arch/powerpc/include/asm/kvm_host.h90
-rw-r--r--arch/powerpc/include/asm/kvm_para.h41
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h25
-rw-r--r--arch/powerpc/include/asm/mmu-book3e.h6
-rw-r--r--arch/powerpc/include/asm/mmu-hash64.h2
-rw-r--r--arch/powerpc/include/asm/ppc-opcode.h4
-rw-r--r--arch/powerpc/include/asm/reg.h5
-rw-r--r--arch/powerpc/kernel/asm-offsets.c16
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S8
-rw-r--r--arch/powerpc/kernel/kvm.c307
-rw-r--r--arch/powerpc/kernel/kvm_emul.S112
-rw-r--r--arch/powerpc/kernel/setup_64.c2
-rw-r--r--arch/powerpc/kvm/Kconfig1
-rw-r--r--arch/powerpc/kvm/book3s.c57
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu_host.c21
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_host.c66
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c919
-rw-r--r--arch/powerpc/kvm/book3s_emulate.c8
-rw-r--r--arch/powerpc/kvm/book3s_hv.c465
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c209
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c835
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S176
-rw-r--r--arch/powerpc/kvm/book3s_paired_singles.c9
-rw-r--r--arch/powerpc/kvm/book3s_pr.c178
-rw-r--r--arch/powerpc/kvm/booke.c150
-rw-r--r--arch/powerpc/kvm/booke.h4
-rw-r--r--arch/powerpc/kvm/booke_emulate.c23
-rw-r--r--arch/powerpc/kvm/booke_interrupts.S18
-rw-r--r--arch/powerpc/kvm/e500.c32
-rw-r--r--arch/powerpc/kvm/e500_emulate.c38
-rw-r--r--arch/powerpc/kvm/e500_tlb.c775
-rw-r--r--arch/powerpc/kvm/e500_tlb.h80
-rw-r--r--arch/powerpc/kvm/emulate.c61
-rw-r--r--arch/powerpc/kvm/powerpc.c148
-rw-r--r--arch/powerpc/kvm/trace.h62
-rw-r--r--arch/powerpc/mm/hugetlbpage.c2
-rw-r--r--arch/s390/include/asm/kvm.h11
-rw-r--r--arch/s390/include/asm/kvm_host.h12
-rw-r--r--arch/s390/kvm/Kconfig9
-rw-r--r--arch/s390/kvm/diag.c6
-rw-r--r--arch/s390/kvm/intercept.c24
-rw-r--r--arch/s390/kvm/interrupt.c3
-rw-r--r--arch/s390/kvm/kvm-s390.c221
-rw-r--r--arch/s390/kvm/kvm-s390.h18
-rw-r--r--arch/s390/kvm/priv.c27
-rw-r--r--arch/s390/kvm/sigp.c57
-rw-r--r--arch/x86/include/asm/kvm.h4
-rw-r--r--arch/x86/include/asm/kvm_emulate.h3
-rw-r--r--arch/x86/include/asm/kvm_host.h63
-rw-r--r--arch/x86/include/asm/perf_event.h1
-rw-r--r--arch/x86/include/asm/tsc.h4
-rw-r--r--arch/x86/include/asm/x86_init.h6
-rw-r--r--arch/x86/kernel/kvmclock.c15
-rw-r--r--arch/x86/kernel/smpboot.c1
-rw-r--r--arch/x86/kernel/tsc.c4
-rw-r--r--arch/x86/kernel/x86_init.c5
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/emulate.c112
-rw-r--r--arch/x86/kvm/i8259.c1
-rw-r--r--arch/x86/kvm/lapic.c4
-rw-r--r--arch/x86/kvm/mmu.c85
-rw-r--r--arch/x86/kvm/mmu_audit.c4
-rw-r--r--arch/x86/kvm/pmu.c10
-rw-r--r--arch/x86/kvm/svm.c119
-rw-r--r--arch/x86/kvm/vmx.c53
-rw-r--r--arch/x86/kvm/x86.c403
-rw-r--r--arch/x86/power/cpu.c4
-rw-r--r--include/linux/kvm.h98
-rw-r--r--include/linux/kvm_host.h69
-rw-r--r--virt/kvm/assigned-dev.c213
-rw-r--r--virt/kvm/kvm_main.c144
82 files changed, 5808 insertions, 1667 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e1d94bf4056e..6386f8c0482e 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -95,7 +95,7 @@ described as 'basic' will be available.
95Capability: basic 95Capability: basic
96Architectures: all 96Architectures: all
97Type: system ioctl 97Type: system ioctl
98Parameters: none 98Parameters: machine type identifier (KVM_VM_*)
99Returns: a VM fd that can be used to control the new virtual machine. 99Returns: a VM fd that can be used to control the new virtual machine.
100 100
101The new VM has no virtual cpus and no memory. An mmap() of a VM fd 101The new VM has no virtual cpus and no memory. An mmap() of a VM fd
@@ -103,6 +103,11 @@ will access the virtual machine's physical address space; offset zero
103corresponds to guest physical address zero. Use of mmap() on a VM fd 103corresponds to guest physical address zero. Use of mmap() on a VM fd
104is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is 104is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is
105available. 105available.
106You most certainly want to use 0 as machine type.
107
108In order to create user controlled virtual machines on S390, check
109KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as
110privileged user (CAP_SYS_ADMIN).
106 111
1074.3 KVM_GET_MSR_INDEX_LIST 1124.3 KVM_GET_MSR_INDEX_LIST
108 113
@@ -213,6 +218,11 @@ allocation of vcpu ids. For example, if userspace wants
213single-threaded guest vcpus, it should make all vcpu ids be a multiple 218single-threaded guest vcpus, it should make all vcpu ids be a multiple
214of the number of vcpus per vcore. 219of the number of vcpus per vcore.
215 220
221For virtual cpus that have been created with S390 user controlled virtual
222machines, the resulting vcpu fd can be memory mapped at page offset
223KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual
224cpu's hardware control block.
225
2164.8 KVM_GET_DIRTY_LOG (vm ioctl) 2264.8 KVM_GET_DIRTY_LOG (vm ioctl)
217 227
218Capability: basic 228Capability: basic
@@ -1159,6 +1169,14 @@ following flags are specified:
1159 1169
1160/* Depends on KVM_CAP_IOMMU */ 1170/* Depends on KVM_CAP_IOMMU */
1161#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) 1171#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
1172/* The following two depend on KVM_CAP_PCI_2_3 */
1173#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
1174#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
1175
1176If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
1177via the PCI-2.3-compliant device-level mask, thus enabling IRQ sharing with other
1178assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
1179guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
1162 1180
1163The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure 1181The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
1164isolation of the device. Usages not specifying this flag are deprecated. 1182isolation of the device. Usages not specifying this flag are deprecated.
@@ -1399,6 +1417,71 @@ The following flags are defined:
1399If datamatch flag is set, the event will be signaled only if the written value 1417If datamatch flag is set, the event will be signaled only if the written value
1400to the registered address is equal to datamatch in struct kvm_ioeventfd. 1418to the registered address is equal to datamatch in struct kvm_ioeventfd.
1401 1419
14204.59 KVM_DIRTY_TLB
1421
1422Capability: KVM_CAP_SW_TLB
1423Architectures: ppc
1424Type: vcpu ioctl
1425Parameters: struct kvm_dirty_tlb (in)
1426Returns: 0 on success, -1 on error
1427
1428struct kvm_dirty_tlb {
1429 __u64 bitmap;
1430 __u32 num_dirty;
1431};
1432
1433This must be called whenever userspace has changed an entry in the shared
1434TLB, prior to calling KVM_RUN on the associated vcpu.
1435
1436The "bitmap" field is the userspace address of an array. This array
1437consists of a number of bits, equal to the total number of TLB entries as
1438determined by the last successful call to KVM_CONFIG_TLB, rounded up to the
1439nearest multiple of 64.
1440
1441Each bit corresponds to one TLB entry, ordered the same as in the shared TLB
1442array.
1443
1444The array is little-endian: the bit 0 is the least significant bit of the
1445first byte, bit 8 is the least significant bit of the second byte, etc.
1446This avoids any complications with differing word sizes.
1447
1448The "num_dirty" field is a performance hint for KVM to determine whether it
1449should skip processing the bitmap and just invalidate everything. It must
1450be set to the number of set bits in the bitmap.
1451
14524.60 KVM_ASSIGN_SET_INTX_MASK
1453
1454Capability: KVM_CAP_PCI_2_3
1455Architectures: x86
1456Type: vm ioctl
1457Parameters: struct kvm_assigned_pci_dev (in)
1458Returns: 0 on success, -1 on error
1459
1460Allows userspace to mask PCI INTx interrupts from the assigned device. The
1461kernel will not deliver INTx interrupts to the guest between setting and
1462clearing of KVM_ASSIGN_SET_INTX_MASK via this interface. This enables use of
1463and emulation of PCI 2.3 INTx disable command register behavior.
1464
1465This may be used for both PCI 2.3 devices supporting INTx disable natively and
1466older devices lacking this support. Userspace is responsible for emulating the
1467read value of the INTx disable bit in the guest visible PCI command register.
1468When modifying the INTx disable state, userspace should precede updating the
1469physical device command register by calling this ioctl to inform the kernel of
1470the new intended INTx mask state.
1471
1472Note that the kernel uses the device INTx disable bit to internally manage the
1473device interrupt state for PCI 2.3 devices. Reads of this register may
1474therefore not match the expected value. Writes should always use the guest
1475intended INTx disable value rather than attempting to read-copy-update the
1476current physical device state. Races between user and kernel updates to the
1477INTx disable bit are handled lazily in the kernel. It's possible the device
1478may generate unintended interrupts, but they will not be injected into the
1479guest.
1480
1481See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
1482by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
1483evaluated.
1484
14024.62 KVM_CREATE_SPAPR_TCE 14854.62 KVM_CREATE_SPAPR_TCE
1403 1486
1404Capability: KVM_CAP_SPAPR_TCE 1487Capability: KVM_CAP_SPAPR_TCE
@@ -1491,6 +1574,101 @@ following algorithm:
1491Some guests configure the LINT1 NMI input to cause a panic, aiding in 1574Some guests configure the LINT1 NMI input to cause a panic, aiding in
1492debugging. 1575debugging.
1493 1576
15774.65 KVM_S390_UCAS_MAP
1578
1579Capability: KVM_CAP_S390_UCONTROL
1580Architectures: s390
1581Type: vcpu ioctl
1582Parameters: struct kvm_s390_ucas_mapping (in)
1583Returns: 0 in case of success
1584
1585The parameter is defined like this:
1586 struct kvm_s390_ucas_mapping {
1587 __u64 user_addr;
1588 __u64 vcpu_addr;
1589 __u64 length;
1590 };
1591
1592This ioctl maps the memory at "user_addr" with the length "length" to
1593the vcpu's address space starting at "vcpu_addr". All parameters need to
1594be aligned by 1 megabyte.
1595
15964.66 KVM_S390_UCAS_UNMAP
1597
1598Capability: KVM_CAP_S390_UCONTROL
1599Architectures: s390
1600Type: vcpu ioctl
1601Parameters: struct kvm_s390_ucas_mapping (in)
1602Returns: 0 in case of success
1603
1604The parameter is defined like this:
1605 struct kvm_s390_ucas_mapping {
1606 __u64 user_addr;
1607 __u64 vcpu_addr;
1608 __u64 length;
1609 };
1610
1611This ioctl unmaps the memory in the vcpu's address space starting at
1612"vcpu_addr" with the length "length". The field "user_addr" is ignored.
1613All parameters need to be aligned by 1 megabyte.
1614
16154.67 KVM_S390_VCPU_FAULT
1616
1617Capability: KVM_CAP_S390_UCONTROL
1618Architectures: s390
1619Type: vcpu ioctl
1620Parameters: vcpu absolute address (in)
1621Returns: 0 in case of success
1622
1623This call creates a page table entry on the virtual cpu's address space
1624(for user controlled virtual machines) or the virtual machine's address
1625space (for regular virtual machines). This only works for minor faults,
1626thus it's recommended to access subject memory page via the user page
1627table upfront. This is useful to handle validity intercepts for user
1628controlled virtual machines to fault in the virtual cpu's lowcore pages
1629prior to calling the KVM_RUN ioctl.
1630
16314.68 KVM_SET_ONE_REG
1632
1633Capability: KVM_CAP_ONE_REG
1634Architectures: all
1635Type: vcpu ioctl
1636Parameters: struct kvm_one_reg (in)
1637Returns: 0 on success, negative value on failure
1638
1639struct kvm_one_reg {
1640 __u64 id;
1641 __u64 addr;
1642};
1643
1644Using this ioctl, a single vcpu register can be set to a specific value
1645defined by user space with the passed in struct kvm_one_reg, where id
1646refers to the register identifier as described below and addr is a pointer
1647to a variable with the respective size. There can be architecture agnostic
1648and architecture specific registers. Each have their own range of operation
1649and their own constants and width. To keep track of the implemented
1650registers, find a list below:
1651
1652 Arch | Register | Width (bits)
1653 | |
1654 PPC | KVM_REG_PPC_HIOR | 64
1655
16564.69 KVM_GET_ONE_REG
1657
1658Capability: KVM_CAP_ONE_REG
1659Architectures: all
1660Type: vcpu ioctl
1661Parameters: struct kvm_one_reg (in and out)
1662Returns: 0 on success, negative value on failure
1663
1664This ioctl allows userspace to read the value of a single register implemented
1665in a vcpu. The register to read is indicated by the "id" field of the
1666kvm_one_reg struct passed in. On success, the register value can be found
1667at the memory location pointed to by "addr".
1668
1669The list of registers accessible using this interface is identical to the
1670list in 4.68.
1671
14945. The kvm_run structure 16725. The kvm_run structure
1495 1673
1496Application code obtains a pointer to the kvm_run structure by 1674Application code obtains a pointer to the kvm_run structure by
@@ -1651,6 +1829,20 @@ s390 specific.
1651 1829
1652s390 specific. 1830s390 specific.
1653 1831
1832 /* KVM_EXIT_S390_UCONTROL */
1833 struct {
1834 __u64 trans_exc_code;
1835 __u32 pgm_code;
1836 } s390_ucontrol;
1837
1838s390 specific. A page fault has occurred for a user controlled virtual
1839machine (KVM_VM_S390_UCONTROL) on its host page table that cannot be
1840resolved by the kernel.
1841The program code and the translation exception code that were placed
1842in the cpu's lowcore are presented here as defined by the z Architecture
1843Principles of Operation Book in the Chapter for Dynamic Address Translation
1844(DAT)
1845
1654 /* KVM_EXIT_DCR */ 1846 /* KVM_EXIT_DCR */
1655 struct { 1847 struct {
1656 __u32 dcrn; 1848 __u32 dcrn;
@@ -1693,6 +1885,29 @@ developer registration required to access it).
1693 /* Fix the size of the union. */ 1885 /* Fix the size of the union. */
1694 char padding[256]; 1886 char padding[256];
1695 }; 1887 };
1888
1889 /*
1890 * shared registers between kvm and userspace.
1891 * kvm_valid_regs specifies the register classes set by the host
1892 * kvm_dirty_regs specifies the register classes dirtied by userspace
1893 * struct kvm_sync_regs is architecture specific, as well as the
1894 * bits for kvm_valid_regs and kvm_dirty_regs
1895 */
1896 __u64 kvm_valid_regs;
1897 __u64 kvm_dirty_regs;
1898 union {
1899 struct kvm_sync_regs regs;
1900 char padding[1024];
1901 } s;
1902
1903If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access
1904certain guest registers without having to call SET/GET_*REGS. Thus we can
1905avoid some system call overhead if userspace has to handle the exit.
1906Userspace can query the validity of the structure by checking
1907kvm_valid_regs for specific bits. These bits are architecture specific
1908and usually define the validity of a group of registers. (e.g. one bit
1909 for general purpose registers)
1910
1696}; 1911};
1697 1912
16986. Capabilities that can be enabled 19136. Capabilities that can be enabled
@@ -1741,3 +1956,45 @@ HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the
1741HTAB invisible to the guest. 1956HTAB invisible to the guest.
1742 1957
1743When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur. 1958When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur.
1959
19606.3 KVM_CAP_SW_TLB
1961
1962Architectures: ppc
1963Parameters: args[0] is the address of a struct kvm_config_tlb
1964Returns: 0 on success; -1 on error
1965
1966struct kvm_config_tlb {
1967 __u64 params;
1968 __u64 array;
1969 __u32 mmu_type;
1970 __u32 array_len;
1971};
1972
1973Configures the virtual CPU's TLB array, establishing a shared memory area
1974between userspace and KVM. The "params" and "array" fields are userspace
1975addresses of mmu-type-specific data structures. The "array_len" field is a
1976safety mechanism, and should be set to the size in bytes of the memory that
1977userspace has reserved for the array. It must be at least the size dictated
1978by "mmu_type" and "params".
1979
1980While KVM_RUN is active, the shared region is under control of KVM. Its
1981contents are undefined, and any modification by userspace results in
1982boundedly undefined behavior.
1983
1984On return from KVM_RUN, the shared region will reflect the current state of
1985the guest's TLB. If userspace makes any changes, it must call KVM_DIRTY_TLB
1986to tell KVM which entries have been changed, prior to calling KVM_RUN again
1987on this vcpu.
1988
1989For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV:
1990 - The "params" field is of type "struct kvm_book3e_206_tlb_params".
1991 - The "array" field points to an array of type "struct
1992 kvm_book3e_206_tlb_entry".
1993 - The array consists of all entries in the first TLB, followed by all
1994 entries in the second TLB.
1995 - Within a TLB, entries are ordered first by increasing set number. Within a
1996 set, entries are ordered by way (increasing ESEL).
1997 - The hash for determining set number in TLB0 is: (MAS2 >> 12) & (num_sets - 1)
1998 where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value.
1999 - The tsize field of mas1 shall be set to 4K on TLB0, even though the
2000 hardware ignores this value for TLB0.
diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt
index 2b7ce190cde4..6e7c37050930 100644
--- a/Documentation/virtual/kvm/ppc-pv.txt
+++ b/Documentation/virtual/kvm/ppc-pv.txt
@@ -81,28 +81,8 @@ additional registers to the magic page. If you add fields to the magic page,
81also define a new hypercall feature to indicate that the host can give you more 81also define a new hypercall feature to indicate that the host can give you more
82registers. Only if the host supports the additional features, make use of them. 82registers. Only if the host supports the additional features, make use of them.
83 83
84The magic page has the following layout as described in 84The magic page layout is described by struct kvm_vcpu_arch_shared
85arch/powerpc/include/asm/kvm_para.h: 85in arch/powerpc/include/asm/kvm_para.h.
86
87struct kvm_vcpu_arch_shared {
88 __u64 scratch1;
89 __u64 scratch2;
90 __u64 scratch3;
91 __u64 critical; /* Guest may not get interrupts if == r1 */
92 __u64 sprg0;
93 __u64 sprg1;
94 __u64 sprg2;
95 __u64 sprg3;
96 __u64 srr0;
97 __u64 srr1;
98 __u64 dar;
99 __u64 msr;
100 __u32 dsisr;
101 __u32 int_pending; /* Tells the guest if we have an interrupt */
102};
103
104Additions to the page must only occur at the end. Struct fields are always 32
105or 64 bit aligned, depending on them being 32 or 64 bit wide respectively.
106 86
107Magic page features 87Magic page features
108=================== 88===================
diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h
index bc90c75adf67..b9f82c84f093 100644
--- a/arch/ia64/include/asm/kvm.h
+++ b/arch/ia64/include/asm/kvm.h
@@ -261,4 +261,8 @@ struct kvm_debug_exit_arch {
261struct kvm_guest_debug_arch { 261struct kvm_guest_debug_arch {
262}; 262};
263 263
264/* definition of registers in kvm_run */
265struct kvm_sync_regs {
266};
267
264#endif 268#endif
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 2689ee54a1c9..e35b3a84a40b 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -459,6 +459,9 @@ struct kvm_sal_data {
459 unsigned long boot_gp; 459 unsigned long boot_gp;
460}; 460};
461 461
462struct kvm_arch_memory_slot {
463};
464
462struct kvm_arch { 465struct kvm_arch {
463 spinlock_t dirty_log_lock; 466 spinlock_t dirty_log_lock;
464 467
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 405052002493..f5104b7c52cd 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -809,10 +809,13 @@ static void kvm_build_io_pmt(struct kvm *kvm)
809#define GUEST_PHYSICAL_RR4 0x2739 809#define GUEST_PHYSICAL_RR4 0x2739
810#define VMM_INIT_RR 0x1660 810#define VMM_INIT_RR 0x1660
811 811
812int kvm_arch_init_vm(struct kvm *kvm) 812int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
813{ 813{
814 BUG_ON(!kvm); 814 BUG_ON(!kvm);
815 815
816 if (type)
817 return -EINVAL;
818
816 kvm->arch.is_sn2 = ia64_platform_is("sn2"); 819 kvm->arch.is_sn2 = ia64_platform_is("sn2");
817 820
818 kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0; 821 kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0;
@@ -1169,6 +1172,11 @@ out:
1169 1172
1170#define PALE_RESET_ENTRY 0x80000000ffffffb0UL 1173#define PALE_RESET_ENTRY 0x80000000ffffffb0UL
1171 1174
1175bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
1176{
1177 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
1178}
1179
1172int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 1180int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
1173{ 1181{
1174 struct kvm_vcpu *v; 1182 struct kvm_vcpu *v;
@@ -1563,6 +1571,21 @@ out:
1563 return r; 1571 return r;
1564} 1572}
1565 1573
1574int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
1575{
1576 return VM_FAULT_SIGBUS;
1577}
1578
1579void kvm_arch_free_memslot(struct kvm_memory_slot *free,
1580 struct kvm_memory_slot *dont)
1581{
1582}
1583
1584int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
1585{
1586 return 0;
1587}
1588
1566int kvm_arch_prepare_memory_region(struct kvm *kvm, 1589int kvm_arch_prepare_memory_region(struct kvm *kvm,
1567 struct kvm_memory_slot *memslot, 1590 struct kvm_memory_slot *memslot,
1568 struct kvm_memory_slot old, 1591 struct kvm_memory_slot old,
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index f7727d91ac6b..b921c3f48928 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -265,12 +265,9 @@ struct kvm_debug_exit_arch {
265struct kvm_guest_debug_arch { 265struct kvm_guest_debug_arch {
266}; 266};
267 267
268#define KVM_REG_MASK 0x001f 268/* definition of registers in kvm_run */
269#define KVM_REG_EXT_MASK 0xffe0 269struct kvm_sync_regs {
270#define KVM_REG_GPR 0x0000 270};
271#define KVM_REG_FPR 0x0020
272#define KVM_REG_QPR 0x0040
273#define KVM_REG_FQPR 0x0060
274 271
275#define KVM_INTERRUPT_SET -1U 272#define KVM_INTERRUPT_SET -1U
276#define KVM_INTERRUPT_UNSET -2U 273#define KVM_INTERRUPT_UNSET -2U
@@ -292,4 +289,41 @@ struct kvm_allocate_rma {
292 __u64 rma_size; 289 __u64 rma_size;
293}; 290};
294 291
292struct kvm_book3e_206_tlb_entry {
293 __u32 mas8;
294 __u32 mas1;
295 __u64 mas2;
296 __u64 mas7_3;
297};
298
299struct kvm_book3e_206_tlb_params {
300 /*
301 * For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV:
302 *
303 * - The number of ways of TLB0 must be a power of two between 2 and
304 * 16.
305 * - TLB1 must be fully associative.
306 * - The size of TLB0 must be a multiple of the number of ways, and
307 * the number of sets must be a power of two.
308 * - The size of TLB1 may not exceed 64 entries.
309 * - TLB0 supports 4 KiB pages.
310 * - The page sizes supported by TLB1 are as indicated by
311 * TLB1CFG (if MMUCFG[MAVN] = 0) or TLB1PS (if MMUCFG[MAVN] = 1)
312 * as returned by KVM_GET_SREGS.
313 * - TLB2 and TLB3 are reserved, and their entries in tlb_sizes[]
314 * and tlb_ways[] must be zero.
315 *
316 * tlb_ways[n] = tlb_sizes[n] means the array is fully associative.
317 *
318 * KVM will adjust TLBnCFG based on the sizes configured here,
319 * though arrays greater than 2048 entries will have TLBnCFG[NENTRY]
320 * set to zero.
321 */
322 __u32 tlb_sizes[4];
323 __u32 tlb_ways[4];
324 __u32 reserved[8];
325};
326
327#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
328
295#endif /* __LINUX_KVM_POWERPC_H */ 329#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 69c7377d2071..aa795ccef294 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s {
90#endif 90#endif
91 int context_id[SID_CONTEXTS]; 91 int context_id[SID_CONTEXTS];
92 92
93 bool hior_explicit; /* HIOR is set by ioctl, not PVR */
94
93 struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; 95 struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
94 struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; 96 struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
95 struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; 97 struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
@@ -119,6 +121,11 @@ extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
119extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); 121extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
120extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr); 122extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
121extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu); 123extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
124extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run,
125 struct kvm_vcpu *vcpu, unsigned long addr,
126 unsigned long status);
127extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr,
128 unsigned long slb_v, unsigned long valid);
122 129
123extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte); 130extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
124extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu); 131extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
@@ -138,6 +145,21 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
138extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); 145extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
139extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); 146extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
140extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); 147extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
148extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
149 unsigned long *rmap, long pte_index, int realmode);
150extern void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
151 unsigned long pte_index);
152void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
153 unsigned long pte_index);
154extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
155 unsigned long *nb_ret);
156extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr);
157extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
158 long pte_index, unsigned long pteh, unsigned long ptel);
159extern long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
160 long pte_index, unsigned long pteh, unsigned long ptel);
161extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
162 struct kvm_memory_slot *memslot);
141 163
142extern void kvmppc_entry_trampoline(void); 164extern void kvmppc_entry_trampoline(void);
143extern void kvmppc_hv_entry_trampoline(void); 165extern void kvmppc_hv_entry_trampoline(void);
@@ -183,7 +205,9 @@ static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
183static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) 205static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
184{ 206{
185 if ( num < 14 ) { 207 if ( num < 14 ) {
186 to_svcpu(vcpu)->gpr[num] = val; 208 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
209 svcpu->gpr[num] = val;
210 svcpu_put(svcpu);
187 to_book3s(vcpu)->shadow_vcpu->gpr[num] = val; 211 to_book3s(vcpu)->shadow_vcpu->gpr[num] = val;
188 } else 212 } else
189 vcpu->arch.gpr[num] = val; 213 vcpu->arch.gpr[num] = val;
@@ -191,80 +215,120 @@ static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
191 215
192static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) 216static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
193{ 217{
194 if ( num < 14 ) 218 if ( num < 14 ) {
195 return to_svcpu(vcpu)->gpr[num]; 219 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
196 else 220 ulong r = svcpu->gpr[num];
221 svcpu_put(svcpu);
222 return r;
223 } else
197 return vcpu->arch.gpr[num]; 224 return vcpu->arch.gpr[num];
198} 225}
199 226
200static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) 227static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
201{ 228{
202 to_svcpu(vcpu)->cr = val; 229 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
230 svcpu->cr = val;
231 svcpu_put(svcpu);
203 to_book3s(vcpu)->shadow_vcpu->cr = val; 232 to_book3s(vcpu)->shadow_vcpu->cr = val;
204} 233}
205 234
206static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) 235static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
207{ 236{
208 return to_svcpu(vcpu)->cr; 237 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
238 u32 r;
239 r = svcpu->cr;
240 svcpu_put(svcpu);
241 return r;
209} 242}
210 243
211static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val) 244static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
212{ 245{
213 to_svcpu(vcpu)->xer = val; 246 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
247 svcpu->xer = val;
214 to_book3s(vcpu)->shadow_vcpu->xer = val; 248 to_book3s(vcpu)->shadow_vcpu->xer = val;
249 svcpu_put(svcpu);
215} 250}
216 251
217static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu) 252static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
218{ 253{
219 return to_svcpu(vcpu)->xer; 254 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
255 u32 r;
256 r = svcpu->xer;
257 svcpu_put(svcpu);
258 return r;
220} 259}
221 260
222static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val) 261static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
223{ 262{
224 to_svcpu(vcpu)->ctr = val; 263 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
264 svcpu->ctr = val;
265 svcpu_put(svcpu);
225} 266}
226 267
227static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu) 268static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
228{ 269{
229 return to_svcpu(vcpu)->ctr; 270 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
271 ulong r;
272 r = svcpu->ctr;
273 svcpu_put(svcpu);
274 return r;
230} 275}
231 276
232static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val) 277static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
233{ 278{
234 to_svcpu(vcpu)->lr = val; 279 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
280 svcpu->lr = val;
281 svcpu_put(svcpu);
235} 282}
236 283
237static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu) 284static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
238{ 285{
239 return to_svcpu(vcpu)->lr; 286 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
287 ulong r;
288 r = svcpu->lr;
289 svcpu_put(svcpu);
290 return r;
240} 291}
241 292
242static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val) 293static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
243{ 294{
244 to_svcpu(vcpu)->pc = val; 295 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
296 svcpu->pc = val;
297 svcpu_put(svcpu);
245} 298}
246 299
247static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu) 300static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
248{ 301{
249 return to_svcpu(vcpu)->pc; 302 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
303 ulong r;
304 r = svcpu->pc;
305 svcpu_put(svcpu);
306 return r;
250} 307}
251 308
252static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) 309static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
253{ 310{
254 ulong pc = kvmppc_get_pc(vcpu); 311 ulong pc = kvmppc_get_pc(vcpu);
255 struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); 312 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
313 u32 r;
256 314
257 /* Load the instruction manually if it failed to do so in the 315 /* Load the instruction manually if it failed to do so in the
258 * exit path */ 316 * exit path */
259 if (svcpu->last_inst == KVM_INST_FETCH_FAILED) 317 if (svcpu->last_inst == KVM_INST_FETCH_FAILED)
260 kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false); 318 kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false);
261 319
262 return svcpu->last_inst; 320 r = svcpu->last_inst;
321 svcpu_put(svcpu);
322 return r;
263} 323}
264 324
265static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) 325static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
266{ 326{
267 return to_svcpu(vcpu)->fault_dar; 327 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
328 ulong r;
329 r = svcpu->fault_dar;
330 svcpu_put(svcpu);
331 return r;
268} 332}
269 333
270static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) 334static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/include/asm/kvm_book3s_32.h b/arch/powerpc/include/asm/kvm_book3s_32.h
index de604db135f5..38040ff82063 100644
--- a/arch/powerpc/include/asm/kvm_book3s_32.h
+++ b/arch/powerpc/include/asm/kvm_book3s_32.h
@@ -20,11 +20,15 @@
20#ifndef __ASM_KVM_BOOK3S_32_H__ 20#ifndef __ASM_KVM_BOOK3S_32_H__
21#define __ASM_KVM_BOOK3S_32_H__ 21#define __ASM_KVM_BOOK3S_32_H__
22 22
23static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) 23static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
24{ 24{
25 return to_book3s(vcpu)->shadow_vcpu; 25 return to_book3s(vcpu)->shadow_vcpu;
26} 26}
27 27
/*
 * Release a shadow vcpu obtained from svcpu_get().
 * Nothing needs to be undone in this (32-bit) variant, so the body is empty.
 */
static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
{
	/* intentionally empty */
}
31
28#define PTE_SIZE 12 32#define PTE_SIZE 12
29#define VSID_ALL 0 33#define VSID_ALL 0
30#define SR_INVALID 0x00000001 /* VSID 1 should always be unused */ 34#define SR_INVALID 0x00000001 /* VSID 1 should always be unused */
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index d0ac94f98f9e..b0c08b142770 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -21,14 +21,56 @@
21#define __ASM_KVM_BOOK3S_64_H__ 21#define __ASM_KVM_BOOK3S_64_H__
22 22
23#ifdef CONFIG_KVM_BOOK3S_PR 23#ifdef CONFIG_KVM_BOOK3S_PR
24static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) 24static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
25{ 25{
26 preempt_disable();
26 return &get_paca()->shadow_vcpu; 27 return &get_paca()->shadow_vcpu;
27} 28}
29
/*
 * Release a shadow vcpu obtained from svcpu_get().
 * The only action taken is to re-enable preemption; @svcpu itself is
 * not touched.
 */
static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
{
	preempt_enable();
}
28#endif 34#endif
29 35
30#define SPAPR_TCE_SHIFT 12 36#define SPAPR_TCE_SHIFT 12
31 37
38#ifdef CONFIG_KVM_BOOK3S_64_HV
39/* For now use fixed-size 16MB page table */
40#define HPT_ORDER 24
41#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */
42#define HPT_NPTE (HPT_NPTEG << 3) /* 8 PTEs per PTEG */
43#define HPT_HASH_MASK (HPT_NPTEG - 1)
44#endif
45
46#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */
47
48/*
49 * We use a lock bit in HPTE dword 0 to synchronize updates and
50 * accesses to each HPTE, and another bit to indicate non-present
51 * HPTEs.
52 */
53#define HPTE_V_HVLOCK 0x40UL
54#define HPTE_V_ABSENT 0x20UL
55
56static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
57{
58 unsigned long tmp, old;
59
60 asm volatile(" ldarx %0,0,%2\n"
61 " and. %1,%0,%3\n"
62 " bne 2f\n"
63 " ori %0,%0,%4\n"
64 " stdcx. %0,0,%2\n"
65 " beq+ 2f\n"
66 " li %1,%3\n"
67 "2: isync"
68 : "=&r" (tmp), "=&r" (old)
69 : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
70 : "cc", "memory");
71 return old == 0;
72}
73
32static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, 74static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
33 unsigned long pte_index) 75 unsigned long pte_index)
34{ 76{
@@ -62,4 +104,140 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
62 return rb; 104 return rb;
63} 105}
64 106
107static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
108{
109 /* only handle 4k, 64k and 16M pages for now */
110 if (!(h & HPTE_V_LARGE))
111 return 1ul << 12; /* 4k page */
112 if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
113 return 1ul << 16; /* 64k page */
114 if ((l & 0xff000) == 0)
115 return 1ul << 24; /* 16M page */
116 return 0; /* error */
117}
118
119static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
120{
121 return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
122}
123
124static inline int hpte_is_writable(unsigned long ptel)
125{
126 unsigned long pp = ptel & (HPTE_R_PP0 | HPTE_R_PP);
127
128 return pp != PP_RXRX && pp != PP_RXXX;
129}
130
131static inline unsigned long hpte_make_readonly(unsigned long ptel)
132{
133 if ((ptel & HPTE_R_PP0) || (ptel & HPTE_R_PP) == PP_RWXX)
134 ptel = (ptel & ~HPTE_R_PP) | PP_RXXX;
135 else
136 ptel |= PP_RXRX;
137 return ptel;
138}
139
140static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
141{
142 unsigned int wimg = ptel & HPTE_R_WIMG;
143
144 /* Handle SAO */
145 if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) &&
146 cpu_has_feature(CPU_FTR_ARCH_206))
147 wimg = HPTE_R_M;
148
149 if (!io_type)
150 return wimg == HPTE_R_M;
151
152 return (wimg & (HPTE_R_W | HPTE_R_I)) == io_type;
153}
154
155/*
156 * Lock and read a linux PTE. If it's present and writable, atomically
157 * set dirty and referenced bits and return the PTE, otherwise return 0.
158 */
159static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
160{
161 pte_t pte, tmp;
162
163 /* wait until _PAGE_BUSY is clear then set it atomically */
164 __asm__ __volatile__ (
165 "1: ldarx %0,0,%3\n"
166 " andi. %1,%0,%4\n"
167 " bne- 1b\n"
168 " ori %1,%0,%4\n"
169 " stdcx. %1,0,%3\n"
170 " bne- 1b"
171 : "=&r" (pte), "=&r" (tmp), "=m" (*p)
172 : "r" (p), "i" (_PAGE_BUSY)
173 : "cc");
174
175 if (pte_present(pte)) {
176 pte = pte_mkyoung(pte);
177 if (writing && pte_write(pte))
178 pte = pte_mkdirty(pte);
179 }
180
181 *p = pte; /* clears _PAGE_BUSY */
182
183 return pte;
184}
185
186/* Return HPTE cache control bits corresponding to Linux pte bits */
187static inline unsigned long hpte_cache_bits(unsigned long pte_val)
188{
189#if _PAGE_NO_CACHE == HPTE_R_I && _PAGE_WRITETHRU == HPTE_R_W
190 return pte_val & (HPTE_R_W | HPTE_R_I);
191#else
192 return ((pte_val & _PAGE_NO_CACHE) ? HPTE_R_I : 0) +
193 ((pte_val & _PAGE_WRITETHRU) ? HPTE_R_W : 0);
194#endif
195}
196
197static inline bool hpte_read_permission(unsigned long pp, unsigned long key)
198{
199 if (key)
200 return PP_RWRX <= pp && pp <= PP_RXRX;
201 return 1;
202}
203
204static inline bool hpte_write_permission(unsigned long pp, unsigned long key)
205{
206 if (key)
207 return pp == PP_RWRW;
208 return pp <= PP_RWRW;
209}
210
211static inline int hpte_get_skey_perm(unsigned long hpte_r, unsigned long amr)
212{
213 unsigned long skey;
214
215 skey = ((hpte_r & HPTE_R_KEY_HI) >> 57) |
216 ((hpte_r & HPTE_R_KEY_LO) >> 9);
217 return (amr >> (62 - 2 * skey)) & 3;
218}
219
220static inline void lock_rmap(unsigned long *rmap)
221{
222 do {
223 while (test_bit(KVMPPC_RMAP_LOCK_BIT, rmap))
224 cpu_relax();
225 } while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmap));
226}
227
228static inline void unlock_rmap(unsigned long *rmap)
229{
230 __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmap);
231}
232
233static inline bool slot_is_aligned(struct kvm_memory_slot *memslot,
234 unsigned long pagesize)
235{
236 unsigned long mask = (pagesize >> PAGE_SHIFT) - 1;
237
238 if (pagesize <= PAGE_SIZE)
239 return 1;
240 return !(memslot->base_gfn & mask) && !(memslot->npages & mask);
241}
242
65#endif /* __ASM_KVM_BOOK3S_64_H__ */ 243#endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h
index adbfca9dd100..8cd50a514271 100644
--- a/arch/powerpc/include/asm/kvm_e500.h
+++ b/arch/powerpc/include/asm/kvm_e500.h
@@ -22,46 +22,55 @@
22#define E500_PID_NUM 3 22#define E500_PID_NUM 3
23#define E500_TLB_NUM 2 23#define E500_TLB_NUM 2
24 24
25struct tlbe{
26 u32 mas1;
27 u32 mas2;
28 u32 mas3;
29 u32 mas7;
30};
31
32#define E500_TLB_VALID 1 25#define E500_TLB_VALID 1
33#define E500_TLB_DIRTY 2 26#define E500_TLB_DIRTY 2
34 27
35struct tlbe_priv { 28struct tlbe_ref {
36 pfn_t pfn; 29 pfn_t pfn;
37 unsigned int flags; /* E500_TLB_* */ 30 unsigned int flags; /* E500_TLB_* */
38}; 31};
39 32
33struct tlbe_priv {
34 struct tlbe_ref ref; /* TLB0 only -- TLB1 uses tlb_refs */
35};
36
40struct vcpu_id_table; 37struct vcpu_id_table;
41 38
39struct kvmppc_e500_tlb_params {
40 int entries, ways, sets;
41};
42
42struct kvmppc_vcpu_e500 { 43struct kvmppc_vcpu_e500 {
43 /* Unmodified copy of the guest's TLB. */ 44 /* Unmodified copy of the guest's TLB -- shared with host userspace. */
44 struct tlbe *gtlb_arch[E500_TLB_NUM]; 45 struct kvm_book3e_206_tlb_entry *gtlb_arch;
46
47 /* Starting entry number in gtlb_arch[] */
48 int gtlb_offset[E500_TLB_NUM];
45 49
46 /* KVM internal information associated with each guest TLB entry */ 50 /* KVM internal information associated with each guest TLB entry */
47 struct tlbe_priv *gtlb_priv[E500_TLB_NUM]; 51 struct tlbe_priv *gtlb_priv[E500_TLB_NUM];
48 52
49 unsigned int gtlb_size[E500_TLB_NUM]; 53 struct kvmppc_e500_tlb_params gtlb_params[E500_TLB_NUM];
54
50 unsigned int gtlb_nv[E500_TLB_NUM]; 55 unsigned int gtlb_nv[E500_TLB_NUM];
51 56
57 /*
58 * information associated with each host TLB entry --
59 * TLB1 only for now. If/when guest TLB1 entries can be
60 * mapped with host TLB0, this will be used for that too.
61 *
62 * We don't want to use this for guest TLB0 because then we'd
63 * have the overhead of doing the translation again even if
64 * the entry is still in the guest TLB (e.g. we swapped out
65 * and back, and our host TLB entries got evicted).
66 */
67 struct tlbe_ref *tlb_refs[E500_TLB_NUM];
68 unsigned int host_tlb1_nv;
69
52 u32 host_pid[E500_PID_NUM]; 70 u32 host_pid[E500_PID_NUM];
53 u32 pid[E500_PID_NUM]; 71 u32 pid[E500_PID_NUM];
54 u32 svr; 72 u32 svr;
55 73
56 u32 mas0;
57 u32 mas1;
58 u32 mas2;
59 u32 mas3;
60 u32 mas4;
61 u32 mas5;
62 u32 mas6;
63 u32 mas7;
64
65 /* vcpu id table */ 74 /* vcpu id table */
66 struct vcpu_id_table *idt; 75 struct vcpu_id_table *idt;
67 76
@@ -73,6 +82,9 @@ struct kvmppc_vcpu_e500 {
73 u32 tlb1cfg; 82 u32 tlb1cfg;
74 u64 mcar; 83 u64 mcar;
75 84
85 struct page **shared_tlb_pages;
86 int num_shared_tlb_pages;
87
76 struct kvm_vcpu vcpu; 88 struct kvm_vcpu vcpu;
77}; 89};
78 90
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index bf8af5d5d5dc..52eb9c1f4fe0 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -32,17 +32,32 @@
32#include <linux/atomic.h> 32#include <linux/atomic.h>
33#include <asm/kvm_asm.h> 33#include <asm/kvm_asm.h>
34#include <asm/processor.h> 34#include <asm/processor.h>
35#include <asm/page.h>
35 36
36#define KVM_MAX_VCPUS NR_CPUS 37#define KVM_MAX_VCPUS NR_CPUS
37#define KVM_MAX_VCORES NR_CPUS 38#define KVM_MAX_VCORES NR_CPUS
38#define KVM_MEMORY_SLOTS 32 39#define KVM_MEMORY_SLOTS 32
39/* memory slots that does not exposed to userspace */ 40/* memory slots that does not exposed to userspace */
40#define KVM_PRIVATE_MEM_SLOTS 4 41#define KVM_PRIVATE_MEM_SLOTS 4
42#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
41 43
42#ifdef CONFIG_KVM_MMIO 44#ifdef CONFIG_KVM_MMIO
43#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 45#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
44#endif 46#endif
45 47
48#ifdef CONFIG_KVM_BOOK3S_64_HV
49#include <linux/mmu_notifier.h>
50
51#define KVM_ARCH_WANT_MMU_NOTIFIER
52
53struct kvm;
54extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
55extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
56extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
57extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
58
59#endif
60
46/* We don't currently support large pages. */ 61/* We don't currently support large pages. */
47#define KVM_HPAGE_GFN_SHIFT(x) 0 62#define KVM_HPAGE_GFN_SHIFT(x) 0
48#define KVM_NR_PAGE_SIZES 1 63#define KVM_NR_PAGE_SIZES 1
@@ -158,34 +173,72 @@ struct kvmppc_spapr_tce_table {
158 struct page *pages[0]; 173 struct page *pages[0];
159}; 174};
160 175
161struct kvmppc_rma_info { 176struct kvmppc_linear_info {
162 void *base_virt; 177 void *base_virt;
163 unsigned long base_pfn; 178 unsigned long base_pfn;
164 unsigned long npages; 179 unsigned long npages;
165 struct list_head list; 180 struct list_head list;
166 atomic_t use_count; 181 atomic_t use_count;
182 int type;
183};
184
/*
 * Reverse-mapping array element; there is one per HPTE.
 *
 * It records the guest's view of the second word of the HPTE
 * (which includes the guest physical address of the mapping),
 * together with forward and backward links in a doubly-linked
 * ring of the HPTEs that map the same host page.  To save space
 * the links are 32-bit HPTE indexes rather than pointers.
 */
struct revmap_entry {
	unsigned long guest_rpte;	/* guest view of HPTE second word */
	unsigned int forw;		/* next HPTE index in the ring */
	unsigned int back;		/* previous HPTE index in the ring */
};
197
198/*
199 * We use the top bit of each memslot->rmap entry as a lock bit,
200 * and bit 32 as a present flag. The bottom 32 bits are the
201 * index in the guest HPT of a HPTE that points to the page.
202 */
203#define KVMPPC_RMAP_LOCK_BIT 63
204#define KVMPPC_RMAP_RC_SHIFT 32
205#define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
206#define KVMPPC_RMAP_CHANGED (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)
207#define KVMPPC_RMAP_PRESENT 0x100000000ul
208#define KVMPPC_RMAP_INDEX 0xfffffffful
209
210/* Low-order bits in kvm->arch.slot_phys[][] */
211#define KVMPPC_PAGE_ORDER_MASK 0x1f
212#define KVMPPC_PAGE_NO_CACHE HPTE_R_I /* 0x20 */
213#define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */
214#define KVMPPC_GOT_PAGE 0x80
215
216struct kvm_arch_memory_slot {
167}; 217};
168 218
169struct kvm_arch { 219struct kvm_arch {
170#ifdef CONFIG_KVM_BOOK3S_64_HV 220#ifdef CONFIG_KVM_BOOK3S_64_HV
171 unsigned long hpt_virt; 221 unsigned long hpt_virt;
172 unsigned long ram_npages; 222 struct revmap_entry *revmap;
173 unsigned long ram_psize;
174 unsigned long ram_porder;
175 struct kvmppc_pginfo *ram_pginfo;
176 unsigned int lpid; 223 unsigned int lpid;
177 unsigned int host_lpid; 224 unsigned int host_lpid;
178 unsigned long host_lpcr; 225 unsigned long host_lpcr;
179 unsigned long sdr1; 226 unsigned long sdr1;
180 unsigned long host_sdr1; 227 unsigned long host_sdr1;
181 int tlbie_lock; 228 int tlbie_lock;
182 int n_rma_pages;
183 unsigned long lpcr; 229 unsigned long lpcr;
184 unsigned long rmor; 230 unsigned long rmor;
185 struct kvmppc_rma_info *rma; 231 struct kvmppc_linear_info *rma;
232 unsigned long vrma_slb_v;
233 int rma_setup_done;
234 int using_mmu_notifiers;
186 struct list_head spapr_tce_tables; 235 struct list_head spapr_tce_tables;
236 spinlock_t slot_phys_lock;
237 unsigned long *slot_phys[KVM_MEM_SLOTS_NUM];
238 int slot_npages[KVM_MEM_SLOTS_NUM];
187 unsigned short last_vcpu[NR_CPUS]; 239 unsigned short last_vcpu[NR_CPUS];
188 struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; 240 struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
241 struct kvmppc_linear_info *hpt_li;
189#endif /* CONFIG_KVM_BOOK3S_64_HV */ 242#endif /* CONFIG_KVM_BOOK3S_64_HV */
190}; 243};
191 244
@@ -318,10 +371,6 @@ struct kvm_vcpu_arch {
318 u32 vrsave; /* also USPRG0 */ 371 u32 vrsave; /* also USPRG0 */
319 u32 mmucr; 372 u32 mmucr;
320 ulong shadow_msr; 373 ulong shadow_msr;
321 ulong sprg4;
322 ulong sprg5;
323 ulong sprg6;
324 ulong sprg7;
325 ulong csrr0; 374 ulong csrr0;
326 ulong csrr1; 375 ulong csrr1;
327 ulong dsrr0; 376 ulong dsrr0;
@@ -329,16 +378,14 @@ struct kvm_vcpu_arch {
329 ulong mcsrr0; 378 ulong mcsrr0;
330 ulong mcsrr1; 379 ulong mcsrr1;
331 ulong mcsr; 380 ulong mcsr;
332 ulong esr;
333 u32 dec; 381 u32 dec;
334 u32 decar; 382 u32 decar;
335 u32 tbl; 383 u32 tbl;
336 u32 tbu; 384 u32 tbu;
337 u32 tcr; 385 u32 tcr;
338 u32 tsr; 386 ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
339 u32 ivor[64]; 387 u32 ivor[64];
340 ulong ivpr; 388 ulong ivpr;
341 u32 pir;
342 u32 pvr; 389 u32 pvr;
343 390
344 u32 shadow_pid; 391 u32 shadow_pid;
@@ -427,9 +474,14 @@ struct kvm_vcpu_arch {
427#ifdef CONFIG_KVM_BOOK3S_64_HV 474#ifdef CONFIG_KVM_BOOK3S_64_HV
428 struct kvm_vcpu_arch_shared shregs; 475 struct kvm_vcpu_arch_shared shregs;
429 476
477 unsigned long pgfault_addr;
478 long pgfault_index;
479 unsigned long pgfault_hpte[2];
480
430 struct list_head run_list; 481 struct list_head run_list;
431 struct task_struct *run_task; 482 struct task_struct *run_task;
432 struct kvm_run *kvm_run; 483 struct kvm_run *kvm_run;
484 pgd_t *pgdir;
433#endif 485#endif
434}; 486};
435 487
@@ -438,4 +490,12 @@ struct kvm_vcpu_arch {
438#define KVMPPC_VCPU_BUSY_IN_HOST 1 490#define KVMPPC_VCPU_BUSY_IN_HOST 1
439#define KVMPPC_VCPU_RUNNABLE 2 491#define KVMPPC_VCPU_RUNNABLE 2
440 492
493/* Values for vcpu->arch.io_gpr */
494#define KVM_MMIO_REG_MASK 0x001f
495#define KVM_MMIO_REG_EXT_MASK 0xffe0
496#define KVM_MMIO_REG_GPR 0x0000
497#define KVM_MMIO_REG_FPR 0x0020
498#define KVM_MMIO_REG_QPR 0x0040
499#define KVM_MMIO_REG_FQPR 0x0060
500
441#endif /* __POWERPC_KVM_HOST_H__ */ 501#endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 50533f9adf40..7b754e743003 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -22,6 +22,16 @@
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24 24
25/*
26 * Additions to this struct must only occur at the end, and should be
27 * accompanied by a KVM_MAGIC_FEAT flag to advertise that they are present
28 * (albeit not necessarily relevant to the current target hardware platform).
29 *
30 * Struct fields are always 32 or 64 bit aligned, depending on them being 32
31 * or 64 bit wide respectively.
32 *
33 * See Documentation/virtual/kvm/ppc-pv.txt
34 */
25struct kvm_vcpu_arch_shared { 35struct kvm_vcpu_arch_shared {
26 __u64 scratch1; 36 __u64 scratch1;
27 __u64 scratch2; 37 __u64 scratch2;
@@ -33,11 +43,35 @@ struct kvm_vcpu_arch_shared {
33 __u64 sprg3; 43 __u64 sprg3;
34 __u64 srr0; 44 __u64 srr0;
35 __u64 srr1; 45 __u64 srr1;
36 __u64 dar; 46 __u64 dar; /* dear on BookE */
37 __u64 msr; 47 __u64 msr;
38 __u32 dsisr; 48 __u32 dsisr;
39 __u32 int_pending; /* Tells the guest if we have an interrupt */ 49 __u32 int_pending; /* Tells the guest if we have an interrupt */
40 __u32 sr[16]; 50 __u32 sr[16];
51 __u32 mas0;
52 __u32 mas1;
53 __u64 mas7_3;
54 __u64 mas2;
55 __u32 mas4;
56 __u32 mas6;
57 __u32 esr;
58 __u32 pir;
59
60 /*
61 * SPRG4-7 are user-readable, so we can only keep these consistent
62 * between the shared area and the real registers when there's an
63 * intervening exit to KVM. This also applies to SPRG3 on some
64 * chips.
65 *
66 * This suffices for access by guest userspace, since in PR-mode
67 * KVM, an exit must occur when changing the guest's MSR[PR].
68 * If the guest kernel writes to SPRG3-7 via the shared area, it
69 * must also use the shared area for reading while in kernel space.
70 */
71 __u64 sprg4;
72 __u64 sprg5;
73 __u64 sprg6;
74 __u64 sprg7;
41}; 75};
42 76
43#define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */ 77#define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */
@@ -47,7 +81,10 @@ struct kvm_vcpu_arch_shared {
47 81
48#define KVM_FEATURE_MAGIC_PAGE 1 82#define KVM_FEATURE_MAGIC_PAGE 1
49 83
50#define KVM_MAGIC_FEAT_SR (1 << 0) 84#define KVM_MAGIC_FEAT_SR (1 << 0)
85
86/* MASn, ESR, PIR, and high SPRGs */
87#define KVM_MAGIC_FEAT_MAS0_TO_SPRG7 (1 << 1)
51 88
52#ifdef __KERNEL__ 89#ifdef __KERNEL__
53 90
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 46efd1a265c9..9d6dee0f7d48 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -66,6 +66,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run,
66extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); 66extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
67extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); 67extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
68extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); 68extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
69extern void kvmppc_decrementer_func(unsigned long data);
69extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); 70extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
70 71
71/* Core-specific hooks */ 72/* Core-specific hooks */
@@ -94,7 +95,7 @@ extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
94extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu); 95extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
95extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); 96extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
96 97
97extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu); 98extern void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu);
98extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); 99extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
99extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags); 100extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags);
100extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu); 101extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
@@ -120,15 +121,17 @@ extern long kvmppc_alloc_hpt(struct kvm *kvm);
120extern void kvmppc_free_hpt(struct kvm *kvm); 121extern void kvmppc_free_hpt(struct kvm *kvm);
121extern long kvmppc_prepare_vrma(struct kvm *kvm, 122extern long kvmppc_prepare_vrma(struct kvm *kvm,
122 struct kvm_userspace_memory_region *mem); 123 struct kvm_userspace_memory_region *mem);
123extern void kvmppc_map_vrma(struct kvm *kvm, 124extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
124 struct kvm_userspace_memory_region *mem); 125 struct kvm_memory_slot *memslot, unsigned long porder);
125extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); 126extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
126extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, 127extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
127 struct kvm_create_spapr_tce *args); 128 struct kvm_create_spapr_tce *args);
128extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, 129extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
129 struct kvm_allocate_rma *rma); 130 struct kvm_allocate_rma *rma);
130extern struct kvmppc_rma_info *kvm_alloc_rma(void); 131extern struct kvmppc_linear_info *kvm_alloc_rma(void);
131extern void kvm_release_rma(struct kvmppc_rma_info *ri); 132extern void kvm_release_rma(struct kvmppc_linear_info *ri);
133extern struct kvmppc_linear_info *kvm_alloc_hpt(void);
134extern void kvm_release_hpt(struct kvmppc_linear_info *li);
132extern int kvmppc_core_init_vm(struct kvm *kvm); 135extern int kvmppc_core_init_vm(struct kvm *kvm);
133extern void kvmppc_core_destroy_vm(struct kvm *kvm); 136extern void kvmppc_core_destroy_vm(struct kvm *kvm);
134extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, 137extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
@@ -175,6 +178,9 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
175void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); 178void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
176int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); 179int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
177 180
181int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
182int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
183
178void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); 184void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
179 185
180#ifdef CONFIG_KVM_BOOK3S_64_HV 186#ifdef CONFIG_KVM_BOOK3S_64_HV
@@ -183,14 +189,19 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
183 paca[cpu].kvm_hstate.xics_phys = addr; 189 paca[cpu].kvm_hstate.xics_phys = addr;
184} 190}
185 191
186extern void kvm_rma_init(void); 192extern void kvm_linear_init(void);
187 193
188#else 194#else
189static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) 195static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
190{} 196{}
191 197
192static inline void kvm_rma_init(void) 198static inline void kvm_linear_init(void)
193{} 199{}
194#endif 200#endif
195 201
202int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
203 struct kvm_config_tlb *cfg);
204int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
205 struct kvm_dirty_tlb *cfg);
206
196#endif /* __POWERPC_KVM_PPC_H__ */ 207#endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index f5f89cafebd0..cdb5421877e2 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -41,9 +41,10 @@
41/* MAS registers bit definitions */ 41/* MAS registers bit definitions */
42 42
43#define MAS0_TLBSEL(x) (((x) << 28) & 0x30000000) 43#define MAS0_TLBSEL(x) (((x) << 28) & 0x30000000)
44#define MAS0_ESEL(x) (((x) << 16) & 0x0FFF0000)
45#define MAS0_NV(x) ((x) & 0x00000FFF)
46#define MAS0_ESEL_MASK 0x0FFF0000 44#define MAS0_ESEL_MASK 0x0FFF0000
45#define MAS0_ESEL_SHIFT 16
46#define MAS0_ESEL(x) (((x) << MAS0_ESEL_SHIFT) & MAS0_ESEL_MASK)
47#define MAS0_NV(x) ((x) & 0x00000FFF)
47#define MAS0_HES 0x00004000 48#define MAS0_HES 0x00004000
48#define MAS0_WQ_ALLWAYS 0x00000000 49#define MAS0_WQ_ALLWAYS 0x00000000
49#define MAS0_WQ_COND 0x00001000 50#define MAS0_WQ_COND 0x00001000
@@ -167,6 +168,7 @@
167#define TLBnCFG_MAXSIZE 0x000f0000 /* Maximum Page Size (v1.0) */ 168#define TLBnCFG_MAXSIZE 0x000f0000 /* Maximum Page Size (v1.0) */
168#define TLBnCFG_MAXSIZE_SHIFT 16 169#define TLBnCFG_MAXSIZE_SHIFT 16
169#define TLBnCFG_ASSOC 0xff000000 /* Associativity */ 170#define TLBnCFG_ASSOC 0xff000000 /* Associativity */
171#define TLBnCFG_ASSOC_SHIFT 24
170 172
171/* TLBnPS encoding */ 173/* TLBnPS encoding */
172#define TLBnPS_4K 0x00000004 174#define TLBnPS_4K 0x00000004
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 412ba493cb98..0759dd8bf5aa 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -108,11 +108,11 @@ extern char initial_stab[];
108#define HPTE_V_VRMA_MASK ASM_CONST(0x4001ffffff000000) 108#define HPTE_V_VRMA_MASK ASM_CONST(0x4001ffffff000000)
109 109
110/* Values for PP (assumes Ks=0, Kp=1) */ 110/* Values for PP (assumes Ks=0, Kp=1) */
111/* pp0 will always be 0 for linux */
112#define PP_RWXX 0 /* Supervisor read/write, User none */ 111#define PP_RWXX 0 /* Supervisor read/write, User none */
113#define PP_RWRX 1 /* Supervisor read/write, User read */ 112#define PP_RWRX 1 /* Supervisor read/write, User read */
114#define PP_RWRW 2 /* Supervisor read/write, User read/write */ 113#define PP_RWRW 2 /* Supervisor read/write, User read/write */
115#define PP_RXRX 3 /* Supervisor read, User read */ 114#define PP_RXRX 3 /* Supervisor read, User read */
115#define PP_RXXX (HPTE_R_PP0 | 2) /* Supervisor read, user none */
116 116
117#ifndef __ASSEMBLY__ 117#ifndef __ASSEMBLY__
118 118
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index e980faae4225..d81f99430fe7 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -45,6 +45,7 @@
45#define PPC_INST_MFSPR_DSCR_MASK 0xfc1fffff 45#define PPC_INST_MFSPR_DSCR_MASK 0xfc1fffff
46#define PPC_INST_MTSPR_DSCR 0x7c1103a6 46#define PPC_INST_MTSPR_DSCR 0x7c1103a6
47#define PPC_INST_MTSPR_DSCR_MASK 0xfc1fffff 47#define PPC_INST_MTSPR_DSCR_MASK 0xfc1fffff
48#define PPC_INST_SLBFEE 0x7c0007a7
48 49
49#define PPC_INST_STRING 0x7c00042a 50#define PPC_INST_STRING 0x7c00042a
50#define PPC_INST_STRING_MASK 0xfc0007fe 51#define PPC_INST_STRING_MASK 0xfc0007fe
@@ -183,7 +184,8 @@
183 __PPC_RS(t) | __PPC_RA(a) | __PPC_RB(b)) 184 __PPC_RS(t) | __PPC_RA(a) | __PPC_RB(b))
184#define PPC_ERATSX_DOT(t, a, w) stringify_in_c(.long PPC_INST_ERATSX_DOT | \ 185#define PPC_ERATSX_DOT(t, a, w) stringify_in_c(.long PPC_INST_ERATSX_DOT | \
185 __PPC_RS(t) | __PPC_RA(a) | __PPC_RB(b)) 186 __PPC_RS(t) | __PPC_RA(a) | __PPC_RB(b))
186 187#define PPC_SLBFEE_DOT(t, b) stringify_in_c(.long PPC_INST_SLBFEE | \
188 __PPC_RT(t) | __PPC_RB(b))
187 189
188/* 190/*
189 * Define what the VSX XX1 form instructions will look like, then add 191 * Define what the VSX XX1 form instructions will look like, then add
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index b1a215eabef6..9d7f0fb69028 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -216,6 +216,7 @@
216#define DSISR_ISSTORE 0x02000000 /* access was a store */ 216#define DSISR_ISSTORE 0x02000000 /* access was a store */
217#define DSISR_DABRMATCH 0x00400000 /* hit data breakpoint */ 217#define DSISR_DABRMATCH 0x00400000 /* hit data breakpoint */
218#define DSISR_NOSEGMENT 0x00200000 /* STAB/SLB miss */ 218#define DSISR_NOSEGMENT 0x00200000 /* STAB/SLB miss */
219#define DSISR_KEYFAULT 0x00200000 /* Key fault */
219#define SPRN_TBRL 0x10C /* Time Base Read Lower Register (user, R/O) */ 220#define SPRN_TBRL 0x10C /* Time Base Read Lower Register (user, R/O) */
220#define SPRN_TBRU 0x10D /* Time Base Read Upper Register (user, R/O) */ 221#define SPRN_TBRU 0x10D /* Time Base Read Upper Register (user, R/O) */
221#define SPRN_TBWL 0x11C /* Time Base Lower Register (super, R/W) */ 222#define SPRN_TBWL 0x11C /* Time Base Lower Register (super, R/W) */
@@ -237,6 +238,7 @@
237#define LPCR_ISL (1ul << (63-2)) 238#define LPCR_ISL (1ul << (63-2))
238#define LPCR_VC_SH (63-2) 239#define LPCR_VC_SH (63-2)
239#define LPCR_DPFD_SH (63-11) 240#define LPCR_DPFD_SH (63-11)
241#define LPCR_VRMASD (0x1ful << (63-16))
240#define LPCR_VRMA_L (1ul << (63-12)) 242#define LPCR_VRMA_L (1ul << (63-12))
241#define LPCR_VRMA_LP0 (1ul << (63-15)) 243#define LPCR_VRMA_LP0 (1ul << (63-15))
242#define LPCR_VRMA_LP1 (1ul << (63-16)) 244#define LPCR_VRMA_LP1 (1ul << (63-16))
@@ -493,6 +495,9 @@
493#define SPRN_SPRG7 0x117 /* Special Purpose Register General 7 */ 495#define SPRN_SPRG7 0x117 /* Special Purpose Register General 7 */
494#define SPRN_SRR0 0x01A /* Save/Restore Register 0 */ 496#define SPRN_SRR0 0x01A /* Save/Restore Register 0 */
495#define SPRN_SRR1 0x01B /* Save/Restore Register 1 */ 497#define SPRN_SRR1 0x01B /* Save/Restore Register 1 */
498#define SRR1_ISI_NOPT 0x40000000 /* ISI: Not found in hash */
499#define SRR1_ISI_N_OR_G 0x10000000 /* ISI: Access is no-exec or G */
500#define SRR1_ISI_PROT 0x08000000 /* ISI: Other protection fault */
496#define SRR1_WAKEMASK 0x00380000 /* reason for wakeup */ 501#define SRR1_WAKEMASK 0x00380000 /* reason for wakeup */
497#define SRR1_WAKESYSERR 0x00300000 /* System error */ 502#define SRR1_WAKESYSERR 0x00300000 /* System error */
498#define SRR1_WAKEEE 0x00200000 /* External interrupt */ 503#define SRR1_WAKEEE 0x00200000 /* External interrupt */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index cc492e48ddfa..34b8afe94a50 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -412,16 +412,23 @@ int main(void)
412 DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2)); 412 DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
413 DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3)); 413 DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
414#endif 414#endif
415 DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4)); 415 DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
416 DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5)); 416 DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
417 DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6)); 417 DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6));
418 DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7)); 418 DEFINE(VCPU_SHARED_SPRG7, offsetof(struct kvm_vcpu_arch_shared, sprg7));
419 DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid)); 419 DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
420 DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1)); 420 DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1));
421 DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared)); 421 DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
422 DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr)); 422 DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
423 DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr)); 423 DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
424 424
425 DEFINE(VCPU_SHARED_MAS0, offsetof(struct kvm_vcpu_arch_shared, mas0));
426 DEFINE(VCPU_SHARED_MAS1, offsetof(struct kvm_vcpu_arch_shared, mas1));
427 DEFINE(VCPU_SHARED_MAS2, offsetof(struct kvm_vcpu_arch_shared, mas2));
428 DEFINE(VCPU_SHARED_MAS7_3, offsetof(struct kvm_vcpu_arch_shared, mas7_3));
429 DEFINE(VCPU_SHARED_MAS4, offsetof(struct kvm_vcpu_arch_shared, mas4));
430 DEFINE(VCPU_SHARED_MAS6, offsetof(struct kvm_vcpu_arch_shared, mas6));
431
425 /* book3s */ 432 /* book3s */
426#ifdef CONFIG_KVM_BOOK3S_64_HV 433#ifdef CONFIG_KVM_BOOK3S_64_HV
427 DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); 434 DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
@@ -434,6 +441,7 @@ int main(void)
434 DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu)); 441 DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
435 DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); 442 DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
436 DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); 443 DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
444 DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
437 DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); 445 DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
438 DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); 446 DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
439#endif 447#endif
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 2d0868a4e2f0..cb705fdbb458 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -101,14 +101,14 @@ data_access_not_stab:
101END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB) 101END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
102#endif 102#endif
103 EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD, 103 EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
104 KVMTEST_PR, 0x300) 104 KVMTEST, 0x300)
105 105
106 . = 0x380 106 . = 0x380
107 .globl data_access_slb_pSeries 107 .globl data_access_slb_pSeries
108data_access_slb_pSeries: 108data_access_slb_pSeries:
109 HMT_MEDIUM 109 HMT_MEDIUM
110 SET_SCRATCH0(r13) 110 SET_SCRATCH0(r13)
111 EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) 111 EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
112 std r3,PACA_EXSLB+EX_R3(r13) 112 std r3,PACA_EXSLB+EX_R3(r13)
113 mfspr r3,SPRN_DAR 113 mfspr r3,SPRN_DAR
114#ifdef __DISABLED__ 114#ifdef __DISABLED__
@@ -330,8 +330,8 @@ do_stab_bolted_pSeries:
330 EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD) 330 EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
331#endif /* CONFIG_POWER4_ONLY */ 331#endif /* CONFIG_POWER4_ONLY */
332 332
333 KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300) 333 KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
334 KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380) 334 KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
335 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400) 335 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
336 KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480) 336 KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
337 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900) 337 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 2985338d0e10..62bdf2389669 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved. 2 * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
3 * Copyright 2010-2011 Freescale Semiconductor, Inc.
3 * 4 *
4 * Authors: 5 * Authors:
5 * Alexander Graf <agraf@suse.de> 6 * Alexander Graf <agraf@suse.de>
@@ -29,6 +30,7 @@
29#include <asm/sections.h> 30#include <asm/sections.h>
30#include <asm/cacheflush.h> 31#include <asm/cacheflush.h>
31#include <asm/disassemble.h> 32#include <asm/disassemble.h>
33#include <asm/ppc-opcode.h>
32 34
33#define KVM_MAGIC_PAGE (-4096L) 35#define KVM_MAGIC_PAGE (-4096L)
34#define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x) 36#define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
@@ -41,34 +43,30 @@
41#define KVM_INST_B 0x48000000 43#define KVM_INST_B 0x48000000
42#define KVM_INST_B_MASK 0x03ffffff 44#define KVM_INST_B_MASK 0x03ffffff
43#define KVM_INST_B_MAX 0x01ffffff 45#define KVM_INST_B_MAX 0x01ffffff
46#define KVM_INST_LI 0x38000000
44 47
45#define KVM_MASK_RT 0x03e00000 48#define KVM_MASK_RT 0x03e00000
46#define KVM_RT_30 0x03c00000 49#define KVM_RT_30 0x03c00000
47#define KVM_MASK_RB 0x0000f800 50#define KVM_MASK_RB 0x0000f800
48#define KVM_INST_MFMSR 0x7c0000a6 51#define KVM_INST_MFMSR 0x7c0000a6
49#define KVM_INST_MFSPR_SPRG0 0x7c1042a6 52
50#define KVM_INST_MFSPR_SPRG1 0x7c1142a6 53#define SPR_FROM 0
51#define KVM_INST_MFSPR_SPRG2 0x7c1242a6 54#define SPR_TO 0x100
52#define KVM_INST_MFSPR_SPRG3 0x7c1342a6 55
53#define KVM_INST_MFSPR_SRR0 0x7c1a02a6 56#define KVM_INST_SPR(sprn, moveto) (0x7c0002a6 | \
54#define KVM_INST_MFSPR_SRR1 0x7c1b02a6 57 (((sprn) & 0x1f) << 16) | \
55#define KVM_INST_MFSPR_DAR 0x7c1302a6 58 (((sprn) & 0x3e0) << 6) | \
56#define KVM_INST_MFSPR_DSISR 0x7c1202a6 59 (moveto))
57 60
58#define KVM_INST_MTSPR_SPRG0 0x7c1043a6 61#define KVM_INST_MFSPR(sprn) KVM_INST_SPR(sprn, SPR_FROM)
59#define KVM_INST_MTSPR_SPRG1 0x7c1143a6 62#define KVM_INST_MTSPR(sprn) KVM_INST_SPR(sprn, SPR_TO)
60#define KVM_INST_MTSPR_SPRG2 0x7c1243a6
61#define KVM_INST_MTSPR_SPRG3 0x7c1343a6
62#define KVM_INST_MTSPR_SRR0 0x7c1a03a6
63#define KVM_INST_MTSPR_SRR1 0x7c1b03a6
64#define KVM_INST_MTSPR_DAR 0x7c1303a6
65#define KVM_INST_MTSPR_DSISR 0x7c1203a6
66 63
67#define KVM_INST_TLBSYNC 0x7c00046c 64#define KVM_INST_TLBSYNC 0x7c00046c
68#define KVM_INST_MTMSRD_L0 0x7c000164 65#define KVM_INST_MTMSRD_L0 0x7c000164
69#define KVM_INST_MTMSRD_L1 0x7c010164 66#define KVM_INST_MTMSRD_L1 0x7c010164
70#define KVM_INST_MTMSR 0x7c000124 67#define KVM_INST_MTMSR 0x7c000124
71 68
69#define KVM_INST_WRTEE 0x7c000106
72#define KVM_INST_WRTEEI_0 0x7c000146 70#define KVM_INST_WRTEEI_0 0x7c000146
73#define KVM_INST_WRTEEI_1 0x7c008146 71#define KVM_INST_WRTEEI_1 0x7c008146
74 72
@@ -270,26 +268,27 @@ static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
270 268
271#ifdef CONFIG_BOOKE 269#ifdef CONFIG_BOOKE
272 270
273extern u32 kvm_emulate_wrteei_branch_offs; 271extern u32 kvm_emulate_wrtee_branch_offs;
274extern u32 kvm_emulate_wrteei_ee_offs; 272extern u32 kvm_emulate_wrtee_reg_offs;
275extern u32 kvm_emulate_wrteei_len; 273extern u32 kvm_emulate_wrtee_orig_ins_offs;
276extern u32 kvm_emulate_wrteei[]; 274extern u32 kvm_emulate_wrtee_len;
275extern u32 kvm_emulate_wrtee[];
277 276
278static void kvm_patch_ins_wrteei(u32 *inst) 277static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one)
279{ 278{
280 u32 *p; 279 u32 *p;
281 int distance_start; 280 int distance_start;
282 int distance_end; 281 int distance_end;
283 ulong next_inst; 282 ulong next_inst;
284 283
285 p = kvm_alloc(kvm_emulate_wrteei_len * 4); 284 p = kvm_alloc(kvm_emulate_wrtee_len * 4);
286 if (!p) 285 if (!p)
287 return; 286 return;
288 287
289 /* Find out where we are and put everything there */ 288 /* Find out where we are and put everything there */
290 distance_start = (ulong)p - (ulong)inst; 289 distance_start = (ulong)p - (ulong)inst;
291 next_inst = ((ulong)inst + 4); 290 next_inst = ((ulong)inst + 4);
292 distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_branch_offs]; 291 distance_end = next_inst - (ulong)&p[kvm_emulate_wrtee_branch_offs];
293 292
294 /* Make sure we only write valid b instructions */ 293 /* Make sure we only write valid b instructions */
295 if (distance_start > KVM_INST_B_MAX) { 294 if (distance_start > KVM_INST_B_MAX) {
@@ -298,10 +297,65 @@ static void kvm_patch_ins_wrteei(u32 *inst)
298 } 297 }
299 298
300 /* Modify the chunk to fit the invocation */ 299 /* Modify the chunk to fit the invocation */
301 memcpy(p, kvm_emulate_wrteei, kvm_emulate_wrteei_len * 4); 300 memcpy(p, kvm_emulate_wrtee, kvm_emulate_wrtee_len * 4);
302 p[kvm_emulate_wrteei_branch_offs] |= distance_end & KVM_INST_B_MASK; 301 p[kvm_emulate_wrtee_branch_offs] |= distance_end & KVM_INST_B_MASK;
303 p[kvm_emulate_wrteei_ee_offs] |= (*inst & MSR_EE); 302
304 flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_len * 4); 303 if (imm_one) {
304 p[kvm_emulate_wrtee_reg_offs] =
305 KVM_INST_LI | __PPC_RT(30) | MSR_EE;
306 } else {
307 /* Make clobbered registers work too */
308 switch (get_rt(rt)) {
309 case 30:
310 kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs],
311 magic_var(scratch2), KVM_RT_30);
312 break;
313 case 31:
314 kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs],
315 magic_var(scratch1), KVM_RT_30);
316 break;
317 default:
318 p[kvm_emulate_wrtee_reg_offs] |= rt;
319 break;
320 }
321 }
322
323 p[kvm_emulate_wrtee_orig_ins_offs] = *inst;
324 flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrtee_len * 4);
325
326 /* Patch the invocation */
327 kvm_patch_ins_b(inst, distance_start);
328}
329
330extern u32 kvm_emulate_wrteei_0_branch_offs;
331extern u32 kvm_emulate_wrteei_0_len;
332extern u32 kvm_emulate_wrteei_0[];
333
334static void kvm_patch_ins_wrteei_0(u32 *inst)
335{
336 u32 *p;
337 int distance_start;
338 int distance_end;
339 ulong next_inst;
340
341 p = kvm_alloc(kvm_emulate_wrteei_0_len * 4);
342 if (!p)
343 return;
344
345 /* Find out where we are and put everything there */
346 distance_start = (ulong)p - (ulong)inst;
347 next_inst = ((ulong)inst + 4);
348 distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_0_branch_offs];
349
350 /* Make sure we only write valid b instructions */
351 if (distance_start > KVM_INST_B_MAX) {
352 kvm_patching_worked = false;
353 return;
354 }
355
356 memcpy(p, kvm_emulate_wrteei_0, kvm_emulate_wrteei_0_len * 4);
357 p[kvm_emulate_wrteei_0_branch_offs] |= distance_end & KVM_INST_B_MASK;
358 flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_0_len * 4);
305 359
306 /* Patch the invocation */ 360 /* Patch the invocation */
307 kvm_patch_ins_b(inst, distance_start); 361 kvm_patch_ins_b(inst, distance_start);
@@ -380,56 +434,191 @@ static void kvm_check_ins(u32 *inst, u32 features)
380 case KVM_INST_MFMSR: 434 case KVM_INST_MFMSR:
381 kvm_patch_ins_ld(inst, magic_var(msr), inst_rt); 435 kvm_patch_ins_ld(inst, magic_var(msr), inst_rt);
382 break; 436 break;
383 case KVM_INST_MFSPR_SPRG0: 437 case KVM_INST_MFSPR(SPRN_SPRG0):
384 kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt); 438 kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt);
385 break; 439 break;
386 case KVM_INST_MFSPR_SPRG1: 440 case KVM_INST_MFSPR(SPRN_SPRG1):
387 kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt); 441 kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt);
388 break; 442 break;
389 case KVM_INST_MFSPR_SPRG2: 443 case KVM_INST_MFSPR(SPRN_SPRG2):
390 kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt); 444 kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt);
391 break; 445 break;
392 case KVM_INST_MFSPR_SPRG3: 446 case KVM_INST_MFSPR(SPRN_SPRG3):
393 kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt); 447 kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt);
394 break; 448 break;
395 case KVM_INST_MFSPR_SRR0: 449 case KVM_INST_MFSPR(SPRN_SRR0):
396 kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt); 450 kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt);
397 break; 451 break;
398 case KVM_INST_MFSPR_SRR1: 452 case KVM_INST_MFSPR(SPRN_SRR1):
399 kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt); 453 kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt);
400 break; 454 break;
401 case KVM_INST_MFSPR_DAR: 455#ifdef CONFIG_BOOKE
456 case KVM_INST_MFSPR(SPRN_DEAR):
457#else
458 case KVM_INST_MFSPR(SPRN_DAR):
459#endif
402 kvm_patch_ins_ld(inst, magic_var(dar), inst_rt); 460 kvm_patch_ins_ld(inst, magic_var(dar), inst_rt);
403 break; 461 break;
404 case KVM_INST_MFSPR_DSISR: 462 case KVM_INST_MFSPR(SPRN_DSISR):
405 kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt); 463 kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt);
406 break; 464 break;
407 465
466#ifdef CONFIG_PPC_BOOK3E_MMU
467 case KVM_INST_MFSPR(SPRN_MAS0):
468 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
469 kvm_patch_ins_lwz(inst, magic_var(mas0), inst_rt);
470 break;
471 case KVM_INST_MFSPR(SPRN_MAS1):
472 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
473 kvm_patch_ins_lwz(inst, magic_var(mas1), inst_rt);
474 break;
475 case KVM_INST_MFSPR(SPRN_MAS2):
476 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
477 kvm_patch_ins_ld(inst, magic_var(mas2), inst_rt);
478 break;
479 case KVM_INST_MFSPR(SPRN_MAS3):
480 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
481 kvm_patch_ins_lwz(inst, magic_var(mas7_3) + 4, inst_rt);
482 break;
483 case KVM_INST_MFSPR(SPRN_MAS4):
484 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
485 kvm_patch_ins_lwz(inst, magic_var(mas4), inst_rt);
486 break;
487 case KVM_INST_MFSPR(SPRN_MAS6):
488 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
489 kvm_patch_ins_lwz(inst, magic_var(mas6), inst_rt);
490 break;
491 case KVM_INST_MFSPR(SPRN_MAS7):
492 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
493 kvm_patch_ins_lwz(inst, magic_var(mas7_3), inst_rt);
494 break;
495#endif /* CONFIG_PPC_BOOK3E_MMU */
496
497 case KVM_INST_MFSPR(SPRN_SPRG4):
498#ifdef CONFIG_BOOKE
499 case KVM_INST_MFSPR(SPRN_SPRG4R):
500#endif
501 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
502 kvm_patch_ins_ld(inst, magic_var(sprg4), inst_rt);
503 break;
504 case KVM_INST_MFSPR(SPRN_SPRG5):
505#ifdef CONFIG_BOOKE
506 case KVM_INST_MFSPR(SPRN_SPRG5R):
507#endif
508 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
509 kvm_patch_ins_ld(inst, magic_var(sprg5), inst_rt);
510 break;
511 case KVM_INST_MFSPR(SPRN_SPRG6):
512#ifdef CONFIG_BOOKE
513 case KVM_INST_MFSPR(SPRN_SPRG6R):
514#endif
515 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
516 kvm_patch_ins_ld(inst, magic_var(sprg6), inst_rt);
517 break;
518 case KVM_INST_MFSPR(SPRN_SPRG7):
519#ifdef CONFIG_BOOKE
520 case KVM_INST_MFSPR(SPRN_SPRG7R):
521#endif
522 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
523 kvm_patch_ins_ld(inst, magic_var(sprg7), inst_rt);
524 break;
525
526#ifdef CONFIG_BOOKE
527 case KVM_INST_MFSPR(SPRN_ESR):
528 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
529 kvm_patch_ins_lwz(inst, magic_var(esr), inst_rt);
530 break;
531#endif
532
533 case KVM_INST_MFSPR(SPRN_PIR):
534 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
535 kvm_patch_ins_lwz(inst, magic_var(pir), inst_rt);
536 break;
537
538
408 /* Stores */ 539 /* Stores */
409 case KVM_INST_MTSPR_SPRG0: 540 case KVM_INST_MTSPR(SPRN_SPRG0):
410 kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt); 541 kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt);
411 break; 542 break;
412 case KVM_INST_MTSPR_SPRG1: 543 case KVM_INST_MTSPR(SPRN_SPRG1):
413 kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt); 544 kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt);
414 break; 545 break;
415 case KVM_INST_MTSPR_SPRG2: 546 case KVM_INST_MTSPR(SPRN_SPRG2):
416 kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt); 547 kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt);
417 break; 548 break;
418 case KVM_INST_MTSPR_SPRG3: 549 case KVM_INST_MTSPR(SPRN_SPRG3):
419 kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt); 550 kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt);
420 break; 551 break;
421 case KVM_INST_MTSPR_SRR0: 552 case KVM_INST_MTSPR(SPRN_SRR0):
422 kvm_patch_ins_std(inst, magic_var(srr0), inst_rt); 553 kvm_patch_ins_std(inst, magic_var(srr0), inst_rt);
423 break; 554 break;
424 case KVM_INST_MTSPR_SRR1: 555 case KVM_INST_MTSPR(SPRN_SRR1):
425 kvm_patch_ins_std(inst, magic_var(srr1), inst_rt); 556 kvm_patch_ins_std(inst, magic_var(srr1), inst_rt);
426 break; 557 break;
427 case KVM_INST_MTSPR_DAR: 558#ifdef CONFIG_BOOKE
559 case KVM_INST_MTSPR(SPRN_DEAR):
560#else
561 case KVM_INST_MTSPR(SPRN_DAR):
562#endif
428 kvm_patch_ins_std(inst, magic_var(dar), inst_rt); 563 kvm_patch_ins_std(inst, magic_var(dar), inst_rt);
429 break; 564 break;
430 case KVM_INST_MTSPR_DSISR: 565 case KVM_INST_MTSPR(SPRN_DSISR):
431 kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt); 566 kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt);
432 break; 567 break;
568#ifdef CONFIG_PPC_BOOK3E_MMU
569 case KVM_INST_MTSPR(SPRN_MAS0):
570 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
571 kvm_patch_ins_stw(inst, magic_var(mas0), inst_rt);
572 break;
573 case KVM_INST_MTSPR(SPRN_MAS1):
574 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
575 kvm_patch_ins_stw(inst, magic_var(mas1), inst_rt);
576 break;
577 case KVM_INST_MTSPR(SPRN_MAS2):
578 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
579 kvm_patch_ins_std(inst, magic_var(mas2), inst_rt);
580 break;
581 case KVM_INST_MTSPR(SPRN_MAS3):
582 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
583 kvm_patch_ins_stw(inst, magic_var(mas7_3) + 4, inst_rt);
584 break;
585 case KVM_INST_MTSPR(SPRN_MAS4):
586 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
587 kvm_patch_ins_stw(inst, magic_var(mas4), inst_rt);
588 break;
589 case KVM_INST_MTSPR(SPRN_MAS6):
590 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
591 kvm_patch_ins_stw(inst, magic_var(mas6), inst_rt);
592 break;
593 case KVM_INST_MTSPR(SPRN_MAS7):
594 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
595 kvm_patch_ins_stw(inst, magic_var(mas7_3), inst_rt);
596 break;
597#endif /* CONFIG_PPC_BOOK3E_MMU */
598
599 case KVM_INST_MTSPR(SPRN_SPRG4):
600 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
601 kvm_patch_ins_std(inst, magic_var(sprg4), inst_rt);
602 break;
603 case KVM_INST_MTSPR(SPRN_SPRG5):
604 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
605 kvm_patch_ins_std(inst, magic_var(sprg5), inst_rt);
606 break;
607 case KVM_INST_MTSPR(SPRN_SPRG6):
608 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
609 kvm_patch_ins_std(inst, magic_var(sprg6), inst_rt);
610 break;
611 case KVM_INST_MTSPR(SPRN_SPRG7):
612 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
613 kvm_patch_ins_std(inst, magic_var(sprg7), inst_rt);
614 break;
615
616#ifdef CONFIG_BOOKE
617 case KVM_INST_MTSPR(SPRN_ESR):
618 if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
619 kvm_patch_ins_stw(inst, magic_var(esr), inst_rt);
620 break;
621#endif
433 622
434 /* Nops */ 623 /* Nops */
435 case KVM_INST_TLBSYNC: 624 case KVM_INST_TLBSYNC:
@@ -444,6 +633,11 @@ static void kvm_check_ins(u32 *inst, u32 features)
444 case KVM_INST_MTMSRD_L0: 633 case KVM_INST_MTMSRD_L0:
445 kvm_patch_ins_mtmsr(inst, inst_rt); 634 kvm_patch_ins_mtmsr(inst, inst_rt);
446 break; 635 break;
636#ifdef CONFIG_BOOKE
637 case KVM_INST_WRTEE:
638 kvm_patch_ins_wrtee(inst, inst_rt, 0);
639 break;
640#endif
447 } 641 }
448 642
449 switch (inst_no_rt & ~KVM_MASK_RB) { 643 switch (inst_no_rt & ~KVM_MASK_RB) {
@@ -461,13 +655,19 @@ static void kvm_check_ins(u32 *inst, u32 features)
461 switch (_inst) { 655 switch (_inst) {
462#ifdef CONFIG_BOOKE 656#ifdef CONFIG_BOOKE
463 case KVM_INST_WRTEEI_0: 657 case KVM_INST_WRTEEI_0:
658 kvm_patch_ins_wrteei_0(inst);
659 break;
660
464 case KVM_INST_WRTEEI_1: 661 case KVM_INST_WRTEEI_1:
465 kvm_patch_ins_wrteei(inst); 662 kvm_patch_ins_wrtee(inst, 0, 1);
466 break; 663 break;
467#endif 664#endif
468 } 665 }
469} 666}
470 667
668extern u32 kvm_template_start[];
669extern u32 kvm_template_end[];
670
471static void kvm_use_magic_page(void) 671static void kvm_use_magic_page(void)
472{ 672{
473 u32 *p; 673 u32 *p;
@@ -488,8 +688,23 @@ static void kvm_use_magic_page(void)
488 start = (void*)_stext; 688 start = (void*)_stext;
489 end = (void*)_etext; 689 end = (void*)_etext;
490 690
491 for (p = start; p < end; p++) 691 /*
692 * Being interrupted in the middle of patching would
693 * be bad for SPRG4-7, which KVM can't keep in sync
694 * with emulated accesses because reads don't trap.
695 */
696 local_irq_disable();
697
698 for (p = start; p < end; p++) {
699 /* Avoid patching the template code */
700 if (p >= kvm_template_start && p < kvm_template_end) {
701 p = kvm_template_end - 1;
702 continue;
703 }
492 kvm_check_ins(p, features); 704 kvm_check_ins(p, features);
705 }
706
707 local_irq_enable();
493 708
494 printk(KERN_INFO "KVM: Live patching for a fast VM %s\n", 709 printk(KERN_INFO "KVM: Live patching for a fast VM %s\n",
495 kvm_patching_worked ? "worked" : "failed"); 710 kvm_patching_worked ? "worked" : "failed");
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index f2b1b2523e61..e291cf3cf954 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -13,6 +13,7 @@
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 * 14 *
15 * Copyright SUSE Linux Products GmbH 2010 15 * Copyright SUSE Linux Products GmbH 2010
16 * Copyright 2010-2011 Freescale Semiconductor, Inc.
16 * 17 *
17 * Authors: Alexander Graf <agraf@suse.de> 18 * Authors: Alexander Graf <agraf@suse.de>
18 */ 19 */
@@ -65,6 +66,9 @@ kvm_hypercall_start:
65 shared->critical == r1 and r2 is always != r1 */ \ 66 shared->critical == r1 and r2 is always != r1 */ \
66 STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0); 67 STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);
67 68
69.global kvm_template_start
70kvm_template_start:
71
68.global kvm_emulate_mtmsrd 72.global kvm_emulate_mtmsrd
69kvm_emulate_mtmsrd: 73kvm_emulate_mtmsrd:
70 74
@@ -167,6 +171,9 @@ maybe_stay_in_guest:
167kvm_emulate_mtmsr_reg2: 171kvm_emulate_mtmsr_reg2:
168 ori r30, r0, 0 172 ori r30, r0, 0
169 173
174 /* Put MSR into magic page because we don't call mtmsr */
175 STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
176
170 /* Check if we have to fetch an interrupt */ 177 /* Check if we have to fetch an interrupt */
171 lwz r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0) 178 lwz r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
172 cmpwi r31, 0 179 cmpwi r31, 0
@@ -174,15 +181,10 @@ kvm_emulate_mtmsr_reg2:
174 181
175 /* Check if we may trigger an interrupt */ 182 /* Check if we may trigger an interrupt */
176 andi. r31, r30, MSR_EE 183 andi. r31, r30, MSR_EE
177 beq no_mtmsr 184 bne do_mtmsr
178
179 b do_mtmsr
180 185
181no_mtmsr: 186no_mtmsr:
182 187
183 /* Put MSR into magic page because we don't call mtmsr */
184 STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
185
186 SCRATCH_RESTORE 188 SCRATCH_RESTORE
187 189
188 /* Go back to caller */ 190 /* Go back to caller */
@@ -210,24 +212,80 @@ kvm_emulate_mtmsr_orig_ins_offs:
210kvm_emulate_mtmsr_len: 212kvm_emulate_mtmsr_len:
211 .long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4 213 .long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
212 214
215/* also used for wrteei 1 */
216.global kvm_emulate_wrtee
217kvm_emulate_wrtee:
218
219 SCRATCH_SAVE
220
221 /* Fetch old MSR in r31 */
222 LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
223
224 /* Insert new MSR[EE] */
225kvm_emulate_wrtee_reg:
226 ori r30, r0, 0
227 rlwimi r31, r30, 0, MSR_EE
228
229 /*
230 * If MSR[EE] is now set, check for a pending interrupt.
231 * We could skip this if MSR[EE] was already on, but that
232 * should be rare, so don't bother.
233 */
234 andi. r30, r30, MSR_EE
235
236 /* Put MSR into magic page because we don't call wrtee */
237 STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
238
239 beq no_wrtee
240
241 /* Check if we have to fetch an interrupt */
242 lwz r30, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
243 cmpwi r30, 0
244 bne do_wrtee
245
246no_wrtee:
247 SCRATCH_RESTORE
248
249 /* Go back to caller */
250kvm_emulate_wrtee_branch:
251 b .
252
253do_wrtee:
254 SCRATCH_RESTORE
213 255
256 /* Just fire off the wrtee if it's critical */
257kvm_emulate_wrtee_orig_ins:
258 wrtee r0
214 259
215.global kvm_emulate_wrteei 260 b kvm_emulate_wrtee_branch
216kvm_emulate_wrteei:
217 261
262kvm_emulate_wrtee_end:
263
264.global kvm_emulate_wrtee_branch_offs
265kvm_emulate_wrtee_branch_offs:
266 .long (kvm_emulate_wrtee_branch - kvm_emulate_wrtee) / 4
267
268.global kvm_emulate_wrtee_reg_offs
269kvm_emulate_wrtee_reg_offs:
270 .long (kvm_emulate_wrtee_reg - kvm_emulate_wrtee) / 4
271
272.global kvm_emulate_wrtee_orig_ins_offs
273kvm_emulate_wrtee_orig_ins_offs:
274 .long (kvm_emulate_wrtee_orig_ins - kvm_emulate_wrtee) / 4
275
276.global kvm_emulate_wrtee_len
277kvm_emulate_wrtee_len:
278 .long (kvm_emulate_wrtee_end - kvm_emulate_wrtee) / 4
279
280.global kvm_emulate_wrteei_0
281kvm_emulate_wrteei_0:
218 SCRATCH_SAVE 282 SCRATCH_SAVE
219 283
220 /* Fetch old MSR in r31 */ 284 /* Fetch old MSR in r31 */
221 LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) 285 LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
222 286
223 /* Remove MSR_EE from old MSR */ 287 /* Remove MSR_EE from old MSR */
224 li r30, 0 288 rlwinm r31, r31, 0, ~MSR_EE
225 ori r30, r30, MSR_EE
226 andc r31, r31, r30
227
228 /* OR new MSR_EE onto the old MSR */
229kvm_emulate_wrteei_ee:
230 ori r31, r31, 0
231 289
232 /* Write new MSR value back */ 290 /* Write new MSR value back */
233 STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) 291 STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
@@ -235,22 +293,17 @@ kvm_emulate_wrteei_ee:
235 SCRATCH_RESTORE 293 SCRATCH_RESTORE
236 294
237 /* Go back to caller */ 295 /* Go back to caller */
238kvm_emulate_wrteei_branch: 296kvm_emulate_wrteei_0_branch:
239 b . 297 b .
240kvm_emulate_wrteei_end: 298kvm_emulate_wrteei_0_end:
241
242.global kvm_emulate_wrteei_branch_offs
243kvm_emulate_wrteei_branch_offs:
244 .long (kvm_emulate_wrteei_branch - kvm_emulate_wrteei) / 4
245 299
246.global kvm_emulate_wrteei_ee_offs 300.global kvm_emulate_wrteei_0_branch_offs
247kvm_emulate_wrteei_ee_offs: 301kvm_emulate_wrteei_0_branch_offs:
248 .long (kvm_emulate_wrteei_ee - kvm_emulate_wrteei) / 4 302 .long (kvm_emulate_wrteei_0_branch - kvm_emulate_wrteei_0) / 4
249
250.global kvm_emulate_wrteei_len
251kvm_emulate_wrteei_len:
252 .long (kvm_emulate_wrteei_end - kvm_emulate_wrteei) / 4
253 303
304.global kvm_emulate_wrteei_0_len
305kvm_emulate_wrteei_0_len:
306 .long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4
254 307
255.global kvm_emulate_mtsrin 308.global kvm_emulate_mtsrin
256kvm_emulate_mtsrin: 309kvm_emulate_mtsrin:
@@ -300,3 +353,6 @@ kvm_emulate_mtsrin_orig_ins_offs:
300.global kvm_emulate_mtsrin_len 353.global kvm_emulate_mtsrin_len
301kvm_emulate_mtsrin_len: 354kvm_emulate_mtsrin_len:
302 .long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4 355 .long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4
356
357.global kvm_template_end
358kvm_template_end:
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 4cb8f1e9d044..4721b0c8d7b7 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -598,7 +598,7 @@ void __init setup_arch(char **cmdline_p)
598 /* Initialize the MMU context management stuff */ 598 /* Initialize the MMU context management stuff */
599 mmu_context_init(); 599 mmu_context_init();
600 600
601 kvm_rma_init(); 601 kvm_linear_init();
602 602
603 ppc64_boot_msg(0x15, "Setup Done"); 603 ppc64_boot_msg(0x15, "Setup Done");
604} 604}
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 78133deb4b64..8f64709ae331 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -69,6 +69,7 @@ config KVM_BOOK3S_64
69config KVM_BOOK3S_64_HV 69config KVM_BOOK3S_64_HV
70 bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" 70 bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
71 depends on KVM_BOOK3S_64 71 depends on KVM_BOOK3S_64
72 select MMU_NOTIFIER
72 ---help--- 73 ---help---
73 Support running unmodified book3s_64 guest kernels in 74 Support running unmodified book3s_64 guest kernels in
74 virtual machines on POWER7 and PPC970 processors that have 75 virtual machines on POWER7 and PPC970 processors that have
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index e41ac6f7dcf1..7d54f4ed6d96 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -258,7 +258,7 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
258 return true; 258 return true;
259} 259}
260 260
261void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) 261void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
262{ 262{
263 unsigned long *pending = &vcpu->arch.pending_exceptions; 263 unsigned long *pending = &vcpu->arch.pending_exceptions;
264 unsigned long old_pending = vcpu->arch.pending_exceptions; 264 unsigned long old_pending = vcpu->arch.pending_exceptions;
@@ -423,10 +423,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
423 regs->sprg1 = vcpu->arch.shared->sprg1; 423 regs->sprg1 = vcpu->arch.shared->sprg1;
424 regs->sprg2 = vcpu->arch.shared->sprg2; 424 regs->sprg2 = vcpu->arch.shared->sprg2;
425 regs->sprg3 = vcpu->arch.shared->sprg3; 425 regs->sprg3 = vcpu->arch.shared->sprg3;
426 regs->sprg4 = vcpu->arch.sprg4; 426 regs->sprg4 = vcpu->arch.shared->sprg4;
427 regs->sprg5 = vcpu->arch.sprg5; 427 regs->sprg5 = vcpu->arch.shared->sprg5;
428 regs->sprg6 = vcpu->arch.sprg6; 428 regs->sprg6 = vcpu->arch.shared->sprg6;
429 regs->sprg7 = vcpu->arch.sprg7; 429 regs->sprg7 = vcpu->arch.shared->sprg7;
430 430
431 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 431 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
432 regs->gpr[i] = kvmppc_get_gpr(vcpu, i); 432 regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
@@ -450,10 +450,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
450 vcpu->arch.shared->sprg1 = regs->sprg1; 450 vcpu->arch.shared->sprg1 = regs->sprg1;
451 vcpu->arch.shared->sprg2 = regs->sprg2; 451 vcpu->arch.shared->sprg2 = regs->sprg2;
452 vcpu->arch.shared->sprg3 = regs->sprg3; 452 vcpu->arch.shared->sprg3 = regs->sprg3;
453 vcpu->arch.sprg4 = regs->sprg4; 453 vcpu->arch.shared->sprg4 = regs->sprg4;
454 vcpu->arch.sprg5 = regs->sprg5; 454 vcpu->arch.shared->sprg5 = regs->sprg5;
455 vcpu->arch.sprg6 = regs->sprg6; 455 vcpu->arch.shared->sprg6 = regs->sprg6;
456 vcpu->arch.sprg7 = regs->sprg7; 456 vcpu->arch.shared->sprg7 = regs->sprg7;
457 457
458 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 458 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
459 kvmppc_set_gpr(vcpu, i, regs->gpr[i]); 459 kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
@@ -477,41 +477,10 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
477 return 0; 477 return 0;
478} 478}
479 479
480/* 480void kvmppc_decrementer_func(unsigned long data)
481 * Get (and clear) the dirty memory log for a memory slot.
482 */
483int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
484 struct kvm_dirty_log *log)
485{ 481{
486 struct kvm_memory_slot *memslot; 482 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
487 struct kvm_vcpu *vcpu;
488 ulong ga, ga_end;
489 int is_dirty = 0;
490 int r;
491 unsigned long n;
492
493 mutex_lock(&kvm->slots_lock);
494
495 r = kvm_get_dirty_log(kvm, log, &is_dirty);
496 if (r)
497 goto out;
498
499 /* If nothing is dirty, don't bother messing with page tables. */
500 if (is_dirty) {
501 memslot = id_to_memslot(kvm->memslots, log->slot);
502 483
503 ga = memslot->base_gfn << PAGE_SHIFT; 484 kvmppc_core_queue_dec(vcpu);
504 ga_end = ga + (memslot->npages << PAGE_SHIFT); 485 kvm_vcpu_kick(vcpu);
505
506 kvm_for_each_vcpu(n, vcpu, kvm)
507 kvmppc_mmu_pte_pflush(vcpu, ga, ga_end);
508
509 n = kvm_dirty_bitmap_bytes(memslot);
510 memset(memslot->dirty_bitmap, 0, n);
511 }
512
513 r = 0;
514out:
515 mutex_unlock(&kvm->slots_lock);
516 return r;
517} 486}
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 9fecbfbce773..f922c29bb234 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -151,13 +151,15 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
151 bool primary = false; 151 bool primary = false;
152 bool evict = false; 152 bool evict = false;
153 struct hpte_cache *pte; 153 struct hpte_cache *pte;
154 int r = 0;
154 155
155 /* Get host physical address for gpa */ 156 /* Get host physical address for gpa */
156 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); 157 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
157 if (is_error_pfn(hpaddr)) { 158 if (is_error_pfn(hpaddr)) {
158 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", 159 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",
159 orig_pte->eaddr); 160 orig_pte->eaddr);
160 return -EINVAL; 161 r = -EINVAL;
162 goto out;
161 } 163 }
162 hpaddr <<= PAGE_SHIFT; 164 hpaddr <<= PAGE_SHIFT;
163 165
@@ -249,7 +251,8 @@ next_pteg:
249 251
250 kvmppc_mmu_hpte_cache_map(vcpu, pte); 252 kvmppc_mmu_hpte_cache_map(vcpu, pte);
251 253
252 return 0; 254out:
255 return r;
253} 256}
254 257
255static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) 258static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
@@ -297,12 +300,14 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
297 u64 gvsid; 300 u64 gvsid;
298 u32 sr; 301 u32 sr;
299 struct kvmppc_sid_map *map; 302 struct kvmppc_sid_map *map;
300 struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); 303 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
304 int r = 0;
301 305
302 if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) { 306 if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
303 /* Invalidate an entry */ 307 /* Invalidate an entry */
304 svcpu->sr[esid] = SR_INVALID; 308 svcpu->sr[esid] = SR_INVALID;
305 return -ENOENT; 309 r = -ENOENT;
310 goto out;
306 } 311 }
307 312
308 map = find_sid_vsid(vcpu, gvsid); 313 map = find_sid_vsid(vcpu, gvsid);
@@ -315,17 +320,21 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
315 320
316 dprintk_sr("MMU: mtsr %d, 0x%x\n", esid, sr); 321 dprintk_sr("MMU: mtsr %d, 0x%x\n", esid, sr);
317 322
318 return 0; 323out:
324 svcpu_put(svcpu);
325 return r;
319} 326}
320 327
321void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) 328void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
322{ 329{
323 int i; 330 int i;
324 struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); 331 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
325 332
326 dprintk_sr("MMU: flushing all segments (%d)\n", ARRAY_SIZE(svcpu->sr)); 333 dprintk_sr("MMU: flushing all segments (%d)\n", ARRAY_SIZE(svcpu->sr));
327 for (i = 0; i < ARRAY_SIZE(svcpu->sr); i++) 334 for (i = 0; i < ARRAY_SIZE(svcpu->sr); i++)
328 svcpu->sr[i] = SR_INVALID; 335 svcpu->sr[i] = SR_INVALID;
336
337 svcpu_put(svcpu);
329} 338}
330 339
331void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) 340void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index fa2f08434ba5..6f87f39a1ac2 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -88,12 +88,14 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
88 int vflags = 0; 88 int vflags = 0;
89 int attempt = 0; 89 int attempt = 0;
90 struct kvmppc_sid_map *map; 90 struct kvmppc_sid_map *map;
91 int r = 0;
91 92
92 /* Get host physical address for gpa */ 93 /* Get host physical address for gpa */
93 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); 94 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
94 if (is_error_pfn(hpaddr)) { 95 if (is_error_pfn(hpaddr)) {
95 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); 96 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr);
96 return -EINVAL; 97 r = -EINVAL;
98 goto out;
97 } 99 }
98 hpaddr <<= PAGE_SHIFT; 100 hpaddr <<= PAGE_SHIFT;
99 hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); 101 hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK);
@@ -110,7 +112,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
110 printk(KERN_ERR "KVM: Segment map for 0x%llx (0x%lx) failed\n", 112 printk(KERN_ERR "KVM: Segment map for 0x%llx (0x%lx) failed\n",
111 vsid, orig_pte->eaddr); 113 vsid, orig_pte->eaddr);
112 WARN_ON(true); 114 WARN_ON(true);
113 return -EINVAL; 115 r = -EINVAL;
116 goto out;
114 } 117 }
115 118
116 vsid = map->host_vsid; 119 vsid = map->host_vsid;
@@ -131,8 +134,10 @@ map_again:
131 134
132 /* In case we tried normal mapping already, let's nuke old entries */ 135 /* In case we tried normal mapping already, let's nuke old entries */
133 if (attempt > 1) 136 if (attempt > 1)
134 if (ppc_md.hpte_remove(hpteg) < 0) 137 if (ppc_md.hpte_remove(hpteg) < 0) {
135 return -1; 138 r = -1;
139 goto out;
140 }
136 141
137 ret = ppc_md.hpte_insert(hpteg, va, hpaddr, rflags, vflags, MMU_PAGE_4K, MMU_SEGSIZE_256M); 142 ret = ppc_md.hpte_insert(hpteg, va, hpaddr, rflags, vflags, MMU_PAGE_4K, MMU_SEGSIZE_256M);
138 143
@@ -162,7 +167,8 @@ map_again:
162 kvmppc_mmu_hpte_cache_map(vcpu, pte); 167 kvmppc_mmu_hpte_cache_map(vcpu, pte);
163 } 168 }
164 169
165 return 0; 170out:
171 return r;
166} 172}
167 173
168static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) 174static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
@@ -207,25 +213,30 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
207 213
208static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid) 214static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
209{ 215{
216 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
210 int i; 217 int i;
211 int max_slb_size = 64; 218 int max_slb_size = 64;
212 int found_inval = -1; 219 int found_inval = -1;
213 int r; 220 int r;
214 221
215 if (!to_svcpu(vcpu)->slb_max) 222 if (!svcpu->slb_max)
216 to_svcpu(vcpu)->slb_max = 1; 223 svcpu->slb_max = 1;
217 224
218 /* Are we overwriting? */ 225 /* Are we overwriting? */
219 for (i = 1; i < to_svcpu(vcpu)->slb_max; i++) { 226 for (i = 1; i < svcpu->slb_max; i++) {
220 if (!(to_svcpu(vcpu)->slb[i].esid & SLB_ESID_V)) 227 if (!(svcpu->slb[i].esid & SLB_ESID_V))
221 found_inval = i; 228 found_inval = i;
222 else if ((to_svcpu(vcpu)->slb[i].esid & ESID_MASK) == esid) 229 else if ((svcpu->slb[i].esid & ESID_MASK) == esid) {
223 return i; 230 r = i;
231 goto out;
232 }
224 } 233 }
225 234
226 /* Found a spare entry that was invalidated before */ 235 /* Found a spare entry that was invalidated before */
227 if (found_inval > 0) 236 if (found_inval > 0) {
228 return found_inval; 237 r = found_inval;
238 goto out;
239 }
229 240
230 /* No spare invalid entry, so create one */ 241 /* No spare invalid entry, so create one */
231 242
@@ -233,30 +244,35 @@ static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
233 max_slb_size = mmu_slb_size; 244 max_slb_size = mmu_slb_size;
234 245
235 /* Overflowing -> purge */ 246 /* Overflowing -> purge */
236 if ((to_svcpu(vcpu)->slb_max) == max_slb_size) 247 if ((svcpu->slb_max) == max_slb_size)
237 kvmppc_mmu_flush_segments(vcpu); 248 kvmppc_mmu_flush_segments(vcpu);
238 249
239 r = to_svcpu(vcpu)->slb_max; 250 r = svcpu->slb_max;
240 to_svcpu(vcpu)->slb_max++; 251 svcpu->slb_max++;
241 252
253out:
254 svcpu_put(svcpu);
242 return r; 255 return r;
243} 256}
244 257
245int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) 258int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
246{ 259{
260 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
247 u64 esid = eaddr >> SID_SHIFT; 261 u64 esid = eaddr >> SID_SHIFT;
248 u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V; 262 u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V;
249 u64 slb_vsid = SLB_VSID_USER; 263 u64 slb_vsid = SLB_VSID_USER;
250 u64 gvsid; 264 u64 gvsid;
251 int slb_index; 265 int slb_index;
252 struct kvmppc_sid_map *map; 266 struct kvmppc_sid_map *map;
267 int r = 0;
253 268
254 slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK); 269 slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK);
255 270
256 if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) { 271 if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
257 /* Invalidate an entry */ 272 /* Invalidate an entry */
258 to_svcpu(vcpu)->slb[slb_index].esid = 0; 273 svcpu->slb[slb_index].esid = 0;
259 return -ENOENT; 274 r = -ENOENT;
275 goto out;
260 } 276 }
261 277
262 map = find_sid_vsid(vcpu, gvsid); 278 map = find_sid_vsid(vcpu, gvsid);
@@ -269,18 +285,22 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
269 slb_vsid &= ~SLB_VSID_KP; 285 slb_vsid &= ~SLB_VSID_KP;
270 slb_esid |= slb_index; 286 slb_esid |= slb_index;
271 287
272 to_svcpu(vcpu)->slb[slb_index].esid = slb_esid; 288 svcpu->slb[slb_index].esid = slb_esid;
273 to_svcpu(vcpu)->slb[slb_index].vsid = slb_vsid; 289 svcpu->slb[slb_index].vsid = slb_vsid;
274 290
275 trace_kvm_book3s_slbmte(slb_vsid, slb_esid); 291 trace_kvm_book3s_slbmte(slb_vsid, slb_esid);
276 292
277 return 0; 293out:
294 svcpu_put(svcpu);
295 return r;
278} 296}
279 297
280void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) 298void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
281{ 299{
282 to_svcpu(vcpu)->slb_max = 1; 300 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
283 to_svcpu(vcpu)->slb[0].esid = 0; 301 svcpu->slb_max = 1;
302 svcpu->slb[0].esid = 0;
303 svcpu_put(svcpu);
284} 304}
285 305
286void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) 306void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index bc3a2ea94217..ddc485a529f2 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -23,6 +23,7 @@
23#include <linux/gfp.h> 23#include <linux/gfp.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/hugetlb.h> 25#include <linux/hugetlb.h>
26#include <linux/vmalloc.h>
26 27
27#include <asm/tlbflush.h> 28#include <asm/tlbflush.h>
28#include <asm/kvm_ppc.h> 29#include <asm/kvm_ppc.h>
@@ -33,15 +34,6 @@
33#include <asm/ppc-opcode.h> 34#include <asm/ppc-opcode.h>
34#include <asm/cputable.h> 35#include <asm/cputable.h>
35 36
36/* For now use fixed-size 16MB page table */
37#define HPT_ORDER 24
38#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */
39#define HPT_HASH_MASK (HPT_NPTEG - 1)
40
41/* Pages in the VRMA are 16MB pages */
42#define VRMA_PAGE_ORDER 24
43#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */
44
45/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ 37/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
46#define MAX_LPID_970 63 38#define MAX_LPID_970 63
47#define NR_LPIDS (LPID_RSVD + 1) 39#define NR_LPIDS (LPID_RSVD + 1)
@@ -51,21 +43,41 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
51{ 43{
52 unsigned long hpt; 44 unsigned long hpt;
53 unsigned long lpid; 45 unsigned long lpid;
46 struct revmap_entry *rev;
47 struct kvmppc_linear_info *li;
48
49 /* Allocate guest's hashed page table */
50 li = kvm_alloc_hpt();
51 if (li) {
52 /* using preallocated memory */
53 hpt = (ulong)li->base_virt;
54 kvm->arch.hpt_li = li;
55 } else {
56 /* using dynamic memory */
57 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
58 __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT);
59 }
54 60
55 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
56 HPT_ORDER - PAGE_SHIFT);
57 if (!hpt) { 61 if (!hpt) {
58 pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n"); 62 pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
59 return -ENOMEM; 63 return -ENOMEM;
60 } 64 }
61 kvm->arch.hpt_virt = hpt; 65 kvm->arch.hpt_virt = hpt;
62 66
67 /* Allocate reverse map array */
68 rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE);
69 if (!rev) {
70 pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
71 goto out_freehpt;
72 }
73 kvm->arch.revmap = rev;
74
75 /* Allocate the guest's logical partition ID */
63 do { 76 do {
64 lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS); 77 lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
65 if (lpid >= NR_LPIDS) { 78 if (lpid >= NR_LPIDS) {
66 pr_err("kvm_alloc_hpt: No LPIDs free\n"); 79 pr_err("kvm_alloc_hpt: No LPIDs free\n");
67 free_pages(hpt, HPT_ORDER - PAGE_SHIFT); 80 goto out_freeboth;
68 return -ENOMEM;
69 } 81 }
70 } while (test_and_set_bit(lpid, lpid_inuse)); 82 } while (test_and_set_bit(lpid, lpid_inuse));
71 83
@@ -74,37 +86,64 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
74 86
75 pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid); 87 pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
76 return 0; 88 return 0;
89
90 out_freeboth:
91 vfree(rev);
92 out_freehpt:
93 free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
94 return -ENOMEM;
77} 95}
78 96
79void kvmppc_free_hpt(struct kvm *kvm) 97void kvmppc_free_hpt(struct kvm *kvm)
80{ 98{
81 clear_bit(kvm->arch.lpid, lpid_inuse); 99 clear_bit(kvm->arch.lpid, lpid_inuse);
82 free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); 100 vfree(kvm->arch.revmap);
101 if (kvm->arch.hpt_li)
102 kvm_release_hpt(kvm->arch.hpt_li);
103 else
104 free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
105}
106
107/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
108static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
109{
110 return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
111}
112
113/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
114static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
115{
116 return (pgsize == 0x10000) ? 0x1000 : 0;
83} 117}
84 118
85void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) 119void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
120 unsigned long porder)
86{ 121{
87 unsigned long i; 122 unsigned long i;
88 unsigned long npages = kvm->arch.ram_npages; 123 unsigned long npages;
89 unsigned long pfn; 124 unsigned long hp_v, hp_r;
90 unsigned long *hpte; 125 unsigned long addr, hash;
91 unsigned long hash; 126 unsigned long psize;
92 struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo; 127 unsigned long hp0, hp1;
128 long ret;
93 129
94 if (!pginfo) 130 psize = 1ul << porder;
95 return; 131 npages = memslot->npages >> (porder - PAGE_SHIFT);
96 132
97 /* VRMA can't be > 1TB */ 133 /* VRMA can't be > 1TB */
98 if (npages > 1ul << (40 - kvm->arch.ram_porder)) 134 if (npages > 1ul << (40 - porder))
99 npages = 1ul << (40 - kvm->arch.ram_porder); 135 npages = 1ul << (40 - porder);
100 /* Can't use more than 1 HPTE per HPTEG */ 136 /* Can't use more than 1 HPTE per HPTEG */
101 if (npages > HPT_NPTEG) 137 if (npages > HPT_NPTEG)
102 npages = HPT_NPTEG; 138 npages = HPT_NPTEG;
103 139
140 hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
141 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
142 hp1 = hpte1_pgsize_encoding(psize) |
143 HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
144
104 for (i = 0; i < npages; ++i) { 145 for (i = 0; i < npages; ++i) {
105 pfn = pginfo[i].pfn; 146 addr = i << porder;
106 if (!pfn)
107 break;
108 /* can't use hpt_hash since va > 64 bits */ 147 /* can't use hpt_hash since va > 64 bits */
109 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; 148 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
110 /* 149 /*
@@ -113,15 +152,15 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
113 * at most one HPTE per HPTEG, we just assume entry 7 152 * at most one HPTE per HPTEG, we just assume entry 7
114 * is available and use it. 153 * is available and use it.
115 */ 154 */
116 hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7)); 155 hash = (hash << 3) + 7;
117 hpte += 7 * 2; 156 hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
118 /* HPTE low word - RPN, protection, etc. */ 157 hp_r = hp1 | addr;
119 hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C | 158 ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r);
120 HPTE_R_M | PP_RWXX; 159 if (ret != H_SUCCESS) {
121 wmb(); 160 pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
122 hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | 161 addr, ret);
123 (i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED | 162 break;
124 HPTE_V_LARGE | HPTE_V_VALID; 163 }
125 } 164 }
126} 165}
127 166
@@ -158,10 +197,814 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
158 kvmppc_set_msr(vcpu, MSR_SF | MSR_ME); 197 kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
159} 198}
160 199
200/*
201 * This is called to get a reference to a guest page if there isn't
202 * one already in the kvm->arch.slot_phys[][] arrays.
203 */
204static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
205 struct kvm_memory_slot *memslot,
206 unsigned long psize)
207{
208 unsigned long start;
209 long np, err;
210 struct page *page, *hpage, *pages[1];
211 unsigned long s, pgsize;
212 unsigned long *physp;
213 unsigned int is_io, got, pgorder;
214 struct vm_area_struct *vma;
215 unsigned long pfn, i, npages;
216
217 physp = kvm->arch.slot_phys[memslot->id];
218 if (!physp)
219 return -EINVAL;
220 if (physp[gfn - memslot->base_gfn])
221 return 0;
222
223 is_io = 0;
224 got = 0;
225 page = NULL;
226 pgsize = psize;
227 err = -EINVAL;
228 start = gfn_to_hva_memslot(memslot, gfn);
229
230 /* Instantiate and get the page we want access to */
231 np = get_user_pages_fast(start, 1, 1, pages);
232 if (np != 1) {
233 /* Look up the vma for the page */
234 down_read(&current->mm->mmap_sem);
235 vma = find_vma(current->mm, start);
236 if (!vma || vma->vm_start > start ||
237 start + psize > vma->vm_end ||
238 !(vma->vm_flags & VM_PFNMAP))
239 goto up_err;
240 is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
241 pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
242 /* check alignment of pfn vs. requested page size */
243 if (psize > PAGE_SIZE && (pfn & ((psize >> PAGE_SHIFT) - 1)))
244 goto up_err;
245 up_read(&current->mm->mmap_sem);
246
247 } else {
248 page = pages[0];
249 got = KVMPPC_GOT_PAGE;
250
251 /* See if this is a large page */
252 s = PAGE_SIZE;
253 if (PageHuge(page)) {
254 hpage = compound_head(page);
255 s <<= compound_order(hpage);
256 /* Get the whole large page if slot alignment is ok */
257 if (s > psize && slot_is_aligned(memslot, s) &&
258 !(memslot->userspace_addr & (s - 1))) {
259 start &= ~(s - 1);
260 pgsize = s;
261 page = hpage;
262 }
263 }
264 if (s < psize)
265 goto out;
266 pfn = page_to_pfn(page);
267 }
268
269 npages = pgsize >> PAGE_SHIFT;
270 pgorder = __ilog2(npages);
271 physp += (gfn - memslot->base_gfn) & ~(npages - 1);
272 spin_lock(&kvm->arch.slot_phys_lock);
273 for (i = 0; i < npages; ++i) {
274 if (!physp[i]) {
275 physp[i] = ((pfn + i) << PAGE_SHIFT) +
276 got + is_io + pgorder;
277 got = 0;
278 }
279 }
280 spin_unlock(&kvm->arch.slot_phys_lock);
281 err = 0;
282
283 out:
284 if (got) {
285 if (PageHuge(page))
286 page = compound_head(page);
287 put_page(page);
288 }
289 return err;
290
291 up_err:
292 up_read(&current->mm->mmap_sem);
293 return err;
294}
295
296/*
297 * We come here on a H_ENTER call from the guest when we are not
298 * using mmu notifiers and we don't have the requested page pinned
299 * already.
300 */
301long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
302 long pte_index, unsigned long pteh, unsigned long ptel)
303{
304 struct kvm *kvm = vcpu->kvm;
305 unsigned long psize, gpa, gfn;
306 struct kvm_memory_slot *memslot;
307 long ret;
308
309 if (kvm->arch.using_mmu_notifiers)
310 goto do_insert;
311
312 psize = hpte_page_size(pteh, ptel);
313 if (!psize)
314 return H_PARAMETER;
315
316 pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
317
318 /* Find the memslot (if any) for this address */
319 gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
320 gfn = gpa >> PAGE_SHIFT;
321 memslot = gfn_to_memslot(kvm, gfn);
322 if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) {
323 if (!slot_is_aligned(memslot, psize))
324 return H_PARAMETER;
325 if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0)
326 return H_PARAMETER;
327 }
328
329 do_insert:
330 /* Protect linux PTE lookup from page table destruction */
331 rcu_read_lock_sched(); /* this disables preemption too */
332 vcpu->arch.pgdir = current->mm->pgd;
333 ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel);
334 rcu_read_unlock_sched();
335 if (ret == H_TOO_HARD) {
336 /* this can't happen */
337 pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
338 ret = H_RESOURCE; /* or something */
339 }
340 return ret;
341
342}
343
344static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
345 gva_t eaddr)
346{
347 u64 mask;
348 int i;
349
350 for (i = 0; i < vcpu->arch.slb_nr; i++) {
351 if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
352 continue;
353
354 if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
355 mask = ESID_MASK_1T;
356 else
357 mask = ESID_MASK;
358
359 if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
360 return &vcpu->arch.slb[i];
361 }
362 return NULL;
363}
364
365static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
366 unsigned long ea)
367{
368 unsigned long ra_mask;
369
370 ra_mask = hpte_page_size(v, r) - 1;
371 return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
372}
373
161static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, 374static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
162 struct kvmppc_pte *gpte, bool data) 375 struct kvmppc_pte *gpte, bool data)
376{
377 struct kvm *kvm = vcpu->kvm;
378 struct kvmppc_slb *slbe;
379 unsigned long slb_v;
380 unsigned long pp, key;
381 unsigned long v, gr;
382 unsigned long *hptep;
383 int index;
384 int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
385
386 /* Get SLB entry */
387 if (virtmode) {
388 slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
389 if (!slbe)
390 return -EINVAL;
391 slb_v = slbe->origv;
392 } else {
393 /* real mode access */
394 slb_v = vcpu->kvm->arch.vrma_slb_v;
395 }
396
397 /* Find the HPTE in the hash table */
398 index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
399 HPTE_V_VALID | HPTE_V_ABSENT);
400 if (index < 0)
401 return -ENOENT;
402 hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
403 v = hptep[0] & ~HPTE_V_HVLOCK;
404 gr = kvm->arch.revmap[index].guest_rpte;
405
406 /* Unlock the HPTE */
407 asm volatile("lwsync" : : : "memory");
408 hptep[0] = v;
409
410 gpte->eaddr = eaddr;
411 gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
412
413 /* Get PP bits and key for permission check */
414 pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
415 key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
416 key &= slb_v;
417
418 /* Calculate permissions */
419 gpte->may_read = hpte_read_permission(pp, key);
420 gpte->may_write = hpte_write_permission(pp, key);
421 gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));
422
423 /* Storage key permission check for POWER7 */
424 if (data && virtmode && cpu_has_feature(CPU_FTR_ARCH_206)) {
425 int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
426 if (amrfield & 1)
427 gpte->may_read = 0;
428 if (amrfield & 2)
429 gpte->may_write = 0;
430 }
431
432 /* Get the guest physical address */
433 gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
434 return 0;
435}
436
437/*
438 * Quick test for whether an instruction is a load or a store.
439 * If the instruction is a load or a store, then this will indicate
440 * which it is, at least on server processors. (Embedded processors
441 * have some external PID instructions that don't follow the rule
442 * embodied here.) If the instruction isn't a load or store, then
443 * this doesn't return anything useful.
444 */
445static int instruction_is_store(unsigned int instr)
446{
447 unsigned int mask;
448
449 mask = 0x10000000;
450 if ((instr & 0xfc000000) == 0x7c000000)
451 mask = 0x100; /* major opcode 31 */
452 return (instr & mask) != 0;
453}
454
455static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
456 unsigned long gpa, int is_store)
457{
458 int ret;
459 u32 last_inst;
460 unsigned long srr0 = kvmppc_get_pc(vcpu);
461
462 /* We try to load the last instruction. We don't let
463 * emulate_instruction do it as it doesn't check what
464 * kvmppc_ld returns.
465 * If we fail, we just return to the guest and try executing it again.
466 */
467 if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) {
468 ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
469 if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED)
470 return RESUME_GUEST;
471 vcpu->arch.last_inst = last_inst;
472 }
473
474 /*
475 * WARNING: We do not know for sure whether the instruction we just
476 * read from memory is the same that caused the fault in the first
477 * place. If the instruction we read is neither an load or a store,
478 * then it can't access memory, so we don't need to worry about
479 * enforcing access permissions. So, assuming it is a load or
480 * store, we just check that its direction (load or store) is
481 * consistent with the original fault, since that's what we
482 * checked the access permissions against. If there is a mismatch
483 * we just return and retry the instruction.
484 */
485
486 if (instruction_is_store(vcpu->arch.last_inst) != !!is_store)
487 return RESUME_GUEST;
488
489 /*
490 * Emulated accesses are emulated by looking at the hash for
491 * translation once, then performing the access later. The
492 * translation could be invalidated in the meantime in which
493 * point performing the subsequent memory access on the old
494 * physical address could possibly be a security hole for the
495 * guest (but not the host).
496 *
497 * This is less of an issue for MMIO stores since they aren't
498 * globally visible. It could be an issue for MMIO loads to
499 * a certain extent but we'll ignore it for now.
500 */
501
502 vcpu->arch.paddr_accessed = gpa;
503 return kvmppc_emulate_mmio(run, vcpu);
504}
505
506int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
507 unsigned long ea, unsigned long dsisr)
508{
509 struct kvm *kvm = vcpu->kvm;
510 unsigned long *hptep, hpte[3], r;
511 unsigned long mmu_seq, psize, pte_size;
512 unsigned long gfn, hva, pfn;
513 struct kvm_memory_slot *memslot;
514 unsigned long *rmap;
515 struct revmap_entry *rev;
516 struct page *page, *pages[1];
517 long index, ret, npages;
518 unsigned long is_io;
519 unsigned int writing, write_ok;
520 struct vm_area_struct *vma;
521 unsigned long rcbits;
522
523 /*
524 * Real-mode code has already searched the HPT and found the
525 * entry we're interested in. Lock the entry and check that
526 * it hasn't changed. If it has, just return and re-execute the
527 * instruction.
528 */
529 if (ea != vcpu->arch.pgfault_addr)
530 return RESUME_GUEST;
531 index = vcpu->arch.pgfault_index;
532 hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
533 rev = &kvm->arch.revmap[index];
534 preempt_disable();
535 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
536 cpu_relax();
537 hpte[0] = hptep[0] & ~HPTE_V_HVLOCK;
538 hpte[1] = hptep[1];
539 hpte[2] = r = rev->guest_rpte;
540 asm volatile("lwsync" : : : "memory");
541 hptep[0] = hpte[0];
542 preempt_enable();
543
544 if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
545 hpte[1] != vcpu->arch.pgfault_hpte[1])
546 return RESUME_GUEST;
547
548 /* Translate the logical address and get the page */
549 psize = hpte_page_size(hpte[0], r);
550 gfn = hpte_rpn(r, psize);
551 memslot = gfn_to_memslot(kvm, gfn);
552
553 /* No memslot means it's an emulated MMIO region */
554 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
555 unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
556 return kvmppc_hv_emulate_mmio(run, vcpu, gpa,
557 dsisr & DSISR_ISSTORE);
558 }
559
560 if (!kvm->arch.using_mmu_notifiers)
561 return -EFAULT; /* should never get here */
562
563 /* used to check for invalidations in progress */
564 mmu_seq = kvm->mmu_notifier_seq;
565 smp_rmb();
566
567 is_io = 0;
568 pfn = 0;
569 page = NULL;
570 pte_size = PAGE_SIZE;
571 writing = (dsisr & DSISR_ISSTORE) != 0;
572 /* If writing != 0, then the HPTE must allow writing, if we get here */
573 write_ok = writing;
574 hva = gfn_to_hva_memslot(memslot, gfn);
575 npages = get_user_pages_fast(hva, 1, writing, pages);
576 if (npages < 1) {
577 /* Check if it's an I/O mapping */
578 down_read(&current->mm->mmap_sem);
579 vma = find_vma(current->mm, hva);
580 if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
581 (vma->vm_flags & VM_PFNMAP)) {
582 pfn = vma->vm_pgoff +
583 ((hva - vma->vm_start) >> PAGE_SHIFT);
584 pte_size = psize;
585 is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
586 write_ok = vma->vm_flags & VM_WRITE;
587 }
588 up_read(&current->mm->mmap_sem);
589 if (!pfn)
590 return -EFAULT;
591 } else {
592 page = pages[0];
593 if (PageHuge(page)) {
594 page = compound_head(page);
595 pte_size <<= compound_order(page);
596 }
597 /* if the guest wants write access, see if that is OK */
598 if (!writing && hpte_is_writable(r)) {
599 pte_t *ptep, pte;
600
601 /*
602 * We need to protect against page table destruction
603 * while looking up and updating the pte.
604 */
605 rcu_read_lock_sched();
606 ptep = find_linux_pte_or_hugepte(current->mm->pgd,
607 hva, NULL);
608 if (ptep && pte_present(*ptep)) {
609 pte = kvmppc_read_update_linux_pte(ptep, 1);
610 if (pte_write(pte))
611 write_ok = 1;
612 }
613 rcu_read_unlock_sched();
614 }
615 pfn = page_to_pfn(page);
616 }
617
618 ret = -EFAULT;
619 if (psize > pte_size)
620 goto out_put;
621
622 /* Check WIMG vs. the actual page we're accessing */
623 if (!hpte_cache_flags_ok(r, is_io)) {
624 if (is_io)
625 return -EFAULT;
626 /*
627 * Allow guest to map emulated device memory as
628 * uncacheable, but actually make it cacheable.
629 */
630 r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
631 }
632
633 /* Set the HPTE to point to pfn */
634 r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT);
635 if (hpte_is_writable(r) && !write_ok)
636 r = hpte_make_readonly(r);
637 ret = RESUME_GUEST;
638 preempt_disable();
639 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
640 cpu_relax();
641 if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] ||
642 rev->guest_rpte != hpte[2])
643 /* HPTE has been changed under us; let the guest retry */
644 goto out_unlock;
645 hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
646
647 rmap = &memslot->rmap[gfn - memslot->base_gfn];
648 lock_rmap(rmap);
649
650 /* Check if we might have been invalidated; let the guest retry if so */
651 ret = RESUME_GUEST;
652 if (mmu_notifier_retry(vcpu, mmu_seq)) {
653 unlock_rmap(rmap);
654 goto out_unlock;
655 }
656
657 /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
658 rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
659 r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
660
661 if (hptep[0] & HPTE_V_VALID) {
662 /* HPTE was previously valid, so we need to invalidate it */
663 unlock_rmap(rmap);
664 hptep[0] |= HPTE_V_ABSENT;
665 kvmppc_invalidate_hpte(kvm, hptep, index);
666 /* don't lose previous R and C bits */
667 r |= hptep[1] & (HPTE_R_R | HPTE_R_C);
668 } else {
669 kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
670 }
671
672 hptep[1] = r;
673 eieio();
674 hptep[0] = hpte[0];
675 asm volatile("ptesync" : : : "memory");
676 preempt_enable();
677 if (page && hpte_is_writable(r))
678 SetPageDirty(page);
679
680 out_put:
681 if (page)
682 put_page(page);
683 return ret;
684
685 out_unlock:
686 hptep[0] &= ~HPTE_V_HVLOCK;
687 preempt_enable();
688 goto out_put;
689}
690
691static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
692 int (*handler)(struct kvm *kvm, unsigned long *rmapp,
693 unsigned long gfn))
694{
695 int ret;
696 int retval = 0;
697 struct kvm_memslots *slots;
698 struct kvm_memory_slot *memslot;
699
700 slots = kvm_memslots(kvm);
701 kvm_for_each_memslot(memslot, slots) {
702 unsigned long start = memslot->userspace_addr;
703 unsigned long end;
704
705 end = start + (memslot->npages << PAGE_SHIFT);
706 if (hva >= start && hva < end) {
707 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
708
709 ret = handler(kvm, &memslot->rmap[gfn_offset],
710 memslot->base_gfn + gfn_offset);
711 retval |= ret;
712 }
713 }
714
715 return retval;
716}
717
718static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
719 unsigned long gfn)
720{
721 struct revmap_entry *rev = kvm->arch.revmap;
722 unsigned long h, i, j;
723 unsigned long *hptep;
724 unsigned long ptel, psize, rcbits;
725
726 for (;;) {
727 lock_rmap(rmapp);
728 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
729 unlock_rmap(rmapp);
730 break;
731 }
732
733 /*
734 * To avoid an ABBA deadlock with the HPTE lock bit,
735 * we can't spin on the HPTE lock while holding the
736 * rmap chain lock.
737 */
738 i = *rmapp & KVMPPC_RMAP_INDEX;
739 hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
740 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
741 /* unlock rmap before spinning on the HPTE lock */
742 unlock_rmap(rmapp);
743 while (hptep[0] & HPTE_V_HVLOCK)
744 cpu_relax();
745 continue;
746 }
747 j = rev[i].forw;
748 if (j == i) {
749 /* chain is now empty */
750 *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
751 } else {
752 /* remove i from chain */
753 h = rev[i].back;
754 rev[h].forw = j;
755 rev[j].back = h;
756 rev[i].forw = rev[i].back = i;
757 *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
758 }
759
760 /* Now check and modify the HPTE */
761 ptel = rev[i].guest_rpte;
762 psize = hpte_page_size(hptep[0], ptel);
763 if ((hptep[0] & HPTE_V_VALID) &&
764 hpte_rpn(ptel, psize) == gfn) {
765 hptep[0] |= HPTE_V_ABSENT;
766 kvmppc_invalidate_hpte(kvm, hptep, i);
767 /* Harvest R and C */
768 rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
769 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
770 rev[i].guest_rpte = ptel | rcbits;
771 }
772 unlock_rmap(rmapp);
773 hptep[0] &= ~HPTE_V_HVLOCK;
774 }
775 return 0;
776}
777
778int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
779{
780 if (kvm->arch.using_mmu_notifiers)
781 kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
782 return 0;
783}
784
785static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
786 unsigned long gfn)
787{
788 struct revmap_entry *rev = kvm->arch.revmap;
789 unsigned long head, i, j;
790 unsigned long *hptep;
791 int ret = 0;
792
793 retry:
794 lock_rmap(rmapp);
795 if (*rmapp & KVMPPC_RMAP_REFERENCED) {
796 *rmapp &= ~KVMPPC_RMAP_REFERENCED;
797 ret = 1;
798 }
799 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
800 unlock_rmap(rmapp);
801 return ret;
802 }
803
804 i = head = *rmapp & KVMPPC_RMAP_INDEX;
805 do {
806 hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
807 j = rev[i].forw;
808
809 /* If this HPTE isn't referenced, ignore it */
810 if (!(hptep[1] & HPTE_R_R))
811 continue;
812
813 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
814 /* unlock rmap before spinning on the HPTE lock */
815 unlock_rmap(rmapp);
816 while (hptep[0] & HPTE_V_HVLOCK)
817 cpu_relax();
818 goto retry;
819 }
820
821 /* Now check and modify the HPTE */
822 if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
823 kvmppc_clear_ref_hpte(kvm, hptep, i);
824 rev[i].guest_rpte |= HPTE_R_R;
825 ret = 1;
826 }
827 hptep[0] &= ~HPTE_V_HVLOCK;
828 } while ((i = j) != head);
829
830 unlock_rmap(rmapp);
831 return ret;
832}
833
834int kvm_age_hva(struct kvm *kvm, unsigned long hva)
835{
836 if (!kvm->arch.using_mmu_notifiers)
837 return 0;
838 return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
839}
840
841static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
842 unsigned long gfn)
843{
844 struct revmap_entry *rev = kvm->arch.revmap;
845 unsigned long head, i, j;
846 unsigned long *hp;
847 int ret = 1;
848
849 if (*rmapp & KVMPPC_RMAP_REFERENCED)
850 return 1;
851
852 lock_rmap(rmapp);
853 if (*rmapp & KVMPPC_RMAP_REFERENCED)
854 goto out;
855
856 if (*rmapp & KVMPPC_RMAP_PRESENT) {
857 i = head = *rmapp & KVMPPC_RMAP_INDEX;
858 do {
859 hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
860 j = rev[i].forw;
861 if (hp[1] & HPTE_R_R)
862 goto out;
863 } while ((i = j) != head);
864 }
865 ret = 0;
866
867 out:
868 unlock_rmap(rmapp);
869 return ret;
870}
871
872int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
873{
874 if (!kvm->arch.using_mmu_notifiers)
875 return 0;
876 return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
877}
878
879void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
163{ 880{
164 return -ENOENT; 881 if (!kvm->arch.using_mmu_notifiers)
882 return;
883 kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
884}
885
886static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
887{
888 struct revmap_entry *rev = kvm->arch.revmap;
889 unsigned long head, i, j;
890 unsigned long *hptep;
891 int ret = 0;
892
893 retry:
894 lock_rmap(rmapp);
895 if (*rmapp & KVMPPC_RMAP_CHANGED) {
896 *rmapp &= ~KVMPPC_RMAP_CHANGED;
897 ret = 1;
898 }
899 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
900 unlock_rmap(rmapp);
901 return ret;
902 }
903
904 i = head = *rmapp & KVMPPC_RMAP_INDEX;
905 do {
906 hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
907 j = rev[i].forw;
908
909 if (!(hptep[1] & HPTE_R_C))
910 continue;
911
912 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
913 /* unlock rmap before spinning on the HPTE lock */
914 unlock_rmap(rmapp);
915 while (hptep[0] & HPTE_V_HVLOCK)
916 cpu_relax();
917 goto retry;
918 }
919
920 /* Now check and modify the HPTE */
921 if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) {
922 /* need to make it temporarily absent to clear C */
923 hptep[0] |= HPTE_V_ABSENT;
924 kvmppc_invalidate_hpte(kvm, hptep, i);
925 hptep[1] &= ~HPTE_R_C;
926 eieio();
927 hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
928 rev[i].guest_rpte |= HPTE_R_C;
929 ret = 1;
930 }
931 hptep[0] &= ~HPTE_V_HVLOCK;
932 } while ((i = j) != head);
933
934 unlock_rmap(rmapp);
935 return ret;
936}
937
938long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
939{
940 unsigned long i;
941 unsigned long *rmapp, *map;
942
943 preempt_disable();
944 rmapp = memslot->rmap;
945 map = memslot->dirty_bitmap;
946 for (i = 0; i < memslot->npages; ++i) {
947 if (kvm_test_clear_dirty(kvm, rmapp))
948 __set_bit_le(i, map);
949 ++rmapp;
950 }
951 preempt_enable();
952 return 0;
953}
954
955void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
956 unsigned long *nb_ret)
957{
958 struct kvm_memory_slot *memslot;
959 unsigned long gfn = gpa >> PAGE_SHIFT;
960 struct page *page, *pages[1];
961 int npages;
962 unsigned long hva, psize, offset;
963 unsigned long pa;
964 unsigned long *physp;
965
966 memslot = gfn_to_memslot(kvm, gfn);
967 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
968 return NULL;
969 if (!kvm->arch.using_mmu_notifiers) {
970 physp = kvm->arch.slot_phys[memslot->id];
971 if (!physp)
972 return NULL;
973 physp += gfn - memslot->base_gfn;
974 pa = *physp;
975 if (!pa) {
976 if (kvmppc_get_guest_page(kvm, gfn, memslot,
977 PAGE_SIZE) < 0)
978 return NULL;
979 pa = *physp;
980 }
981 page = pfn_to_page(pa >> PAGE_SHIFT);
982 } else {
983 hva = gfn_to_hva_memslot(memslot, gfn);
984 npages = get_user_pages_fast(hva, 1, 1, pages);
985 if (npages < 1)
986 return NULL;
987 page = pages[0];
988 }
989 psize = PAGE_SIZE;
990 if (PageHuge(page)) {
991 page = compound_head(page);
992 psize <<= compound_order(page);
993 }
994 if (!kvm->arch.using_mmu_notifiers)
995 get_page(page);
996 offset = gpa & (psize - 1);
997 if (nb_ret)
998 *nb_ret = psize - offset;
999 return page_address(page) + offset;
1000}
1001
/*
 * Drop the page reference for the page containing kernel virtual address
 * @va.  The reference is dropped on the compound head page.
 */
void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
{
	struct page *head = compound_head(virt_to_page(va));

	put_page(head);
}
166 1009
167void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) 1010void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 0c9dc62532d0..f1950d131827 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -230,9 +230,12 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
230 230
231 r = kvmppc_st(vcpu, &addr, 32, zeros, true); 231 r = kvmppc_st(vcpu, &addr, 32, zeros, true);
232 if ((r == -ENOENT) || (r == -EPERM)) { 232 if ((r == -ENOENT) || (r == -EPERM)) {
233 struct kvmppc_book3s_shadow_vcpu *svcpu;
234
235 svcpu = svcpu_get(vcpu);
233 *advance = 0; 236 *advance = 0;
234 vcpu->arch.shared->dar = vaddr; 237 vcpu->arch.shared->dar = vaddr;
235 to_svcpu(vcpu)->fault_dar = vaddr; 238 svcpu->fault_dar = vaddr;
236 239
237 dsisr = DSISR_ISSTORE; 240 dsisr = DSISR_ISSTORE;
238 if (r == -ENOENT) 241 if (r == -ENOENT)
@@ -241,7 +244,8 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
241 dsisr |= DSISR_PROTFAULT; 244 dsisr |= DSISR_PROTFAULT;
242 245
243 vcpu->arch.shared->dsisr = dsisr; 246 vcpu->arch.shared->dsisr = dsisr;
244 to_svcpu(vcpu)->fault_dsisr = dsisr; 247 svcpu->fault_dsisr = dsisr;
248 svcpu_put(svcpu);
245 249
246 kvmppc_book3s_queue_irqprio(vcpu, 250 kvmppc_book3s_queue_irqprio(vcpu,
247 BOOK3S_INTERRUPT_DATA_STORAGE); 251 BOOK3S_INTERRUPT_DATA_STORAGE);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a7267167a550..d386b6198bc7 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -48,22 +48,14 @@
48#include <linux/gfp.h> 48#include <linux/gfp.h>
49#include <linux/vmalloc.h> 49#include <linux/vmalloc.h>
50#include <linux/highmem.h> 50#include <linux/highmem.h>
51 51#include <linux/hugetlb.h>
52/*
53 * For now, limit memory to 64GB and require it to be large pages.
54 * This value is chosen because it makes the ram_pginfo array be
55 * 64kB in size, which is about as large as we want to be trying
56 * to allocate with kmalloc.
57 */
58#define MAX_MEM_ORDER 36
59
60#define LARGE_PAGE_ORDER 24 /* 16MB pages */
61 52
62/* #define EXIT_DEBUG */ 53/* #define EXIT_DEBUG */
63/* #define EXIT_DEBUG_SIMPLE */ 54/* #define EXIT_DEBUG_SIMPLE */
64/* #define EXIT_DEBUG_INT */ 55/* #define EXIT_DEBUG_INT */
65 56
66static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 57static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
58static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu);
67 59
68void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 60void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
69{ 61{
@@ -146,10 +138,10 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
146 unsigned long vcpuid, unsigned long vpa) 138 unsigned long vcpuid, unsigned long vpa)
147{ 139{
148 struct kvm *kvm = vcpu->kvm; 140 struct kvm *kvm = vcpu->kvm;
149 unsigned long pg_index, ra, len; 141 unsigned long len, nb;
150 unsigned long pg_offset;
151 void *va; 142 void *va;
152 struct kvm_vcpu *tvcpu; 143 struct kvm_vcpu *tvcpu;
144 int err = H_PARAMETER;
153 145
154 tvcpu = kvmppc_find_vcpu(kvm, vcpuid); 146 tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
155 if (!tvcpu) 147 if (!tvcpu)
@@ -162,45 +154,41 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
162 if (flags < 4) { 154 if (flags < 4) {
163 if (vpa & 0x7f) 155 if (vpa & 0x7f)
164 return H_PARAMETER; 156 return H_PARAMETER;
157 if (flags >= 2 && !tvcpu->arch.vpa)
158 return H_RESOURCE;
165 /* registering new area; convert logical addr to real */ 159 /* registering new area; convert logical addr to real */
166 pg_index = vpa >> kvm->arch.ram_porder; 160 va = kvmppc_pin_guest_page(kvm, vpa, &nb);
167 pg_offset = vpa & (kvm->arch.ram_psize - 1); 161 if (va == NULL)
168 if (pg_index >= kvm->arch.ram_npages)
169 return H_PARAMETER; 162 return H_PARAMETER;
170 if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
171 return H_PARAMETER;
172 ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
173 ra |= pg_offset;
174 va = __va(ra);
175 if (flags <= 1) 163 if (flags <= 1)
176 len = *(unsigned short *)(va + 4); 164 len = *(unsigned short *)(va + 4);
177 else 165 else
178 len = *(unsigned int *)(va + 4); 166 len = *(unsigned int *)(va + 4);
179 if (pg_offset + len > kvm->arch.ram_psize) 167 if (len > nb)
180 return H_PARAMETER; 168 goto out_unpin;
181 switch (flags) { 169 switch (flags) {
182 case 1: /* register VPA */ 170 case 1: /* register VPA */
183 if (len < 640) 171 if (len < 640)
184 return H_PARAMETER; 172 goto out_unpin;
173 if (tvcpu->arch.vpa)
174 kvmppc_unpin_guest_page(kvm, vcpu->arch.vpa);
185 tvcpu->arch.vpa = va; 175 tvcpu->arch.vpa = va;
186 init_vpa(vcpu, va); 176 init_vpa(vcpu, va);
187 break; 177 break;
188 case 2: /* register DTL */ 178 case 2: /* register DTL */
189 if (len < 48) 179 if (len < 48)
190 return H_PARAMETER; 180 goto out_unpin;
191 if (!tvcpu->arch.vpa)
192 return H_RESOURCE;
193 len -= len % 48; 181 len -= len % 48;
182 if (tvcpu->arch.dtl)
183 kvmppc_unpin_guest_page(kvm, vcpu->arch.dtl);
194 tvcpu->arch.dtl = va; 184 tvcpu->arch.dtl = va;
195 tvcpu->arch.dtl_end = va + len; 185 tvcpu->arch.dtl_end = va + len;
196 break; 186 break;
197 case 3: /* register SLB shadow buffer */ 187 case 3: /* register SLB shadow buffer */
198 if (len < 8) 188 if (len < 16)
199 return H_PARAMETER; 189 goto out_unpin;
200 if (!tvcpu->arch.vpa) 190 if (tvcpu->arch.slb_shadow)
201 return H_RESOURCE; 191 kvmppc_unpin_guest_page(kvm, vcpu->arch.slb_shadow);
202 tvcpu->arch.slb_shadow = va;
203 len = (len - 16) / 16;
204 tvcpu->arch.slb_shadow = va; 192 tvcpu->arch.slb_shadow = va;
205 break; 193 break;
206 } 194 }
@@ -209,17 +197,30 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
209 case 5: /* unregister VPA */ 197 case 5: /* unregister VPA */
210 if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl) 198 if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
211 return H_RESOURCE; 199 return H_RESOURCE;
200 if (!tvcpu->arch.vpa)
201 break;
202 kvmppc_unpin_guest_page(kvm, tvcpu->arch.vpa);
212 tvcpu->arch.vpa = NULL; 203 tvcpu->arch.vpa = NULL;
213 break; 204 break;
214 case 6: /* unregister DTL */ 205 case 6: /* unregister DTL */
206 if (!tvcpu->arch.dtl)
207 break;
208 kvmppc_unpin_guest_page(kvm, tvcpu->arch.dtl);
215 tvcpu->arch.dtl = NULL; 209 tvcpu->arch.dtl = NULL;
216 break; 210 break;
217 case 7: /* unregister SLB shadow buffer */ 211 case 7: /* unregister SLB shadow buffer */
212 if (!tvcpu->arch.slb_shadow)
213 break;
214 kvmppc_unpin_guest_page(kvm, tvcpu->arch.slb_shadow);
218 tvcpu->arch.slb_shadow = NULL; 215 tvcpu->arch.slb_shadow = NULL;
219 break; 216 break;
220 } 217 }
221 } 218 }
222 return H_SUCCESS; 219 return H_SUCCESS;
220
221 out_unpin:
222 kvmppc_unpin_guest_page(kvm, va);
223 return err;
223} 224}
224 225
225int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) 226int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
@@ -229,6 +230,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
229 struct kvm_vcpu *tvcpu; 230 struct kvm_vcpu *tvcpu;
230 231
231 switch (req) { 232 switch (req) {
233 case H_ENTER:
234 ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
235 kvmppc_get_gpr(vcpu, 5),
236 kvmppc_get_gpr(vcpu, 6),
237 kvmppc_get_gpr(vcpu, 7));
238 break;
232 case H_CEDE: 239 case H_CEDE:
233 break; 240 break;
234 case H_PROD: 241 case H_PROD:
@@ -318,20 +325,19 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
318 break; 325 break;
319 } 326 }
320 /* 327 /*
321 * We get these next two if the guest does a bad real-mode access, 328 * We get these next two if the guest accesses a page which it thinks
322 * as we have enabled VRMA (virtualized real mode area) mode in the 329 * it has mapped but which is not actually present, either because
323 * LPCR. We just generate an appropriate DSI/ISI to the guest. 330 * it is for an emulated I/O device or because the corresponding
331 * host page has been paged out. Any other HDSI/HISI interrupts
332 * have been handled already.
324 */ 333 */
325 case BOOK3S_INTERRUPT_H_DATA_STORAGE: 334 case BOOK3S_INTERRUPT_H_DATA_STORAGE:
326 vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr; 335 r = kvmppc_book3s_hv_page_fault(run, vcpu,
327 vcpu->arch.shregs.dar = vcpu->arch.fault_dar; 336 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
328 kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
329 r = RESUME_GUEST;
330 break; 337 break;
331 case BOOK3S_INTERRUPT_H_INST_STORAGE: 338 case BOOK3S_INTERRUPT_H_INST_STORAGE:
332 kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE, 339 r = kvmppc_book3s_hv_page_fault(run, vcpu,
333 0x08000000); 340 kvmppc_get_pc(vcpu), 0);
334 r = RESUME_GUEST;
335 break; 341 break;
336 /* 342 /*
337 * This occurs if the guest executes an illegal instruction. 343 * This occurs if the guest executes an illegal instruction.
@@ -391,6 +397,42 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
391 return 0; 397 return 0;
392} 398}
393 399
400int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
401{
402 int r = -EINVAL;
403
404 switch (reg->id) {
405 case KVM_REG_PPC_HIOR:
406 r = put_user(0, (u64 __user *)reg->addr);
407 break;
408 default:
409 break;
410 }
411
412 return r;
413}
414
415int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
416{
417 int r = -EINVAL;
418
419 switch (reg->id) {
420 case KVM_REG_PPC_HIOR:
421 {
422 u64 hior;
423 /* Only allow this to be set to zero */
424 r = get_user(hior, (u64 __user *)reg->addr);
425 if (!r && (hior != 0))
426 r = -EINVAL;
427 break;
428 }
429 default:
430 break;
431 }
432
433 return r;
434}
435
394int kvmppc_core_check_processor_compat(void) 436int kvmppc_core_check_processor_compat(void)
395{ 437{
396 if (cpu_has_feature(CPU_FTR_HVMODE)) 438 if (cpu_has_feature(CPU_FTR_HVMODE))
@@ -410,7 +452,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
410 goto out; 452 goto out;
411 453
412 err = -ENOMEM; 454 err = -ENOMEM;
413 vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); 455 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
414 if (!vcpu) 456 if (!vcpu)
415 goto out; 457 goto out;
416 458
@@ -462,15 +504,21 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
462 return vcpu; 504 return vcpu;
463 505
464free_vcpu: 506free_vcpu:
465 kfree(vcpu); 507 kmem_cache_free(kvm_vcpu_cache, vcpu);
466out: 508out:
467 return ERR_PTR(err); 509 return ERR_PTR(err);
468} 510}
469 511
470void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) 512void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
471{ 513{
514 if (vcpu->arch.dtl)
515 kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl);
516 if (vcpu->arch.slb_shadow)
517 kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow);
518 if (vcpu->arch.vpa)
519 kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa);
472 kvm_vcpu_uninit(vcpu); 520 kvm_vcpu_uninit(vcpu);
473 kfree(vcpu); 521 kmem_cache_free(kvm_vcpu_cache, vcpu);
474} 522}
475 523
476static void kvmppc_set_timer(struct kvm_vcpu *vcpu) 524static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
@@ -481,7 +529,7 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
481 if (now > vcpu->arch.dec_expires) { 529 if (now > vcpu->arch.dec_expires) {
482 /* decrementer has already gone negative */ 530 /* decrementer has already gone negative */
483 kvmppc_core_queue_dec(vcpu); 531 kvmppc_core_queue_dec(vcpu);
484 kvmppc_core_deliver_interrupts(vcpu); 532 kvmppc_core_prepare_to_enter(vcpu);
485 return; 533 return;
486 } 534 }
487 dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC 535 dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
@@ -796,7 +844,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
796 844
797 list_for_each_entry_safe(v, vn, &vc->runnable_threads, 845 list_for_each_entry_safe(v, vn, &vc->runnable_threads,
798 arch.run_list) { 846 arch.run_list) {
799 kvmppc_core_deliver_interrupts(v); 847 kvmppc_core_prepare_to_enter(v);
800 if (signal_pending(v->arch.run_task)) { 848 if (signal_pending(v->arch.run_task)) {
801 kvmppc_remove_runnable(vc, v); 849 kvmppc_remove_runnable(vc, v);
802 v->stat.signal_exits++; 850 v->stat.signal_exits++;
@@ -835,20 +883,26 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
835 return -EINVAL; 883 return -EINVAL;
836 } 884 }
837 885
886 kvmppc_core_prepare_to_enter(vcpu);
887
838 /* No need to go into the guest when all we'll do is come back out */ 888 /* No need to go into the guest when all we'll do is come back out */
839 if (signal_pending(current)) { 889 if (signal_pending(current)) {
840 run->exit_reason = KVM_EXIT_INTR; 890 run->exit_reason = KVM_EXIT_INTR;
841 return -EINTR; 891 return -EINTR;
842 } 892 }
843 893
844 /* On PPC970, check that we have an RMA region */ 894 /* On the first time here, set up VRMA or RMA */
845 if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) 895 if (!vcpu->kvm->arch.rma_setup_done) {
846 return -EPERM; 896 r = kvmppc_hv_setup_rma(vcpu);
897 if (r)
898 return r;
899 }
847 900
848 flush_fp_to_thread(current); 901 flush_fp_to_thread(current);
849 flush_altivec_to_thread(current); 902 flush_altivec_to_thread(current);
850 flush_vsx_to_thread(current); 903 flush_vsx_to_thread(current);
851 vcpu->arch.wqp = &vcpu->arch.vcore->wq; 904 vcpu->arch.wqp = &vcpu->arch.vcore->wq;
905 vcpu->arch.pgdir = current->mm->pgd;
852 906
853 do { 907 do {
854 r = kvmppc_run_vcpu(run, vcpu); 908 r = kvmppc_run_vcpu(run, vcpu);
@@ -856,7 +910,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
856 if (run->exit_reason == KVM_EXIT_PAPR_HCALL && 910 if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
857 !(vcpu->arch.shregs.msr & MSR_PR)) { 911 !(vcpu->arch.shregs.msr & MSR_PR)) {
858 r = kvmppc_pseries_do_hcall(vcpu); 912 r = kvmppc_pseries_do_hcall(vcpu);
859 kvmppc_core_deliver_interrupts(vcpu); 913 kvmppc_core_prepare_to_enter(vcpu);
860 } 914 }
861 } while (r == RESUME_GUEST); 915 } while (r == RESUME_GUEST);
862 return r; 916 return r;
@@ -1000,7 +1054,7 @@ static inline int lpcr_rmls(unsigned long rma_size)
1000 1054
1001static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1055static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1002{ 1056{
1003 struct kvmppc_rma_info *ri = vma->vm_file->private_data; 1057 struct kvmppc_linear_info *ri = vma->vm_file->private_data;
1004 struct page *page; 1058 struct page *page;
1005 1059
1006 if (vmf->pgoff >= ri->npages) 1060 if (vmf->pgoff >= ri->npages)
@@ -1025,7 +1079,7 @@ static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
1025 1079
1026static int kvm_rma_release(struct inode *inode, struct file *filp) 1080static int kvm_rma_release(struct inode *inode, struct file *filp)
1027{ 1081{
1028 struct kvmppc_rma_info *ri = filp->private_data; 1082 struct kvmppc_linear_info *ri = filp->private_data;
1029 1083
1030 kvm_release_rma(ri); 1084 kvm_release_rma(ri);
1031 return 0; 1085 return 0;
@@ -1038,7 +1092,7 @@ static struct file_operations kvm_rma_fops = {
1038 1092
1039long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) 1093long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
1040{ 1094{
1041 struct kvmppc_rma_info *ri; 1095 struct kvmppc_linear_info *ri;
1042 long fd; 1096 long fd;
1043 1097
1044 ri = kvm_alloc_rma(); 1098 ri = kvm_alloc_rma();
@@ -1053,89 +1107,189 @@ long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
1053 return fd; 1107 return fd;
1054} 1108}
1055 1109
1056static struct page *hva_to_page(unsigned long addr) 1110/*
1111 * Get (and clear) the dirty memory log for a memory slot.
1112 */
1113int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
1057{ 1114{
1058 struct page *page[1]; 1115 struct kvm_memory_slot *memslot;
1059 int npages; 1116 int r;
1117 unsigned long n;
1060 1118
1061 might_sleep(); 1119 mutex_lock(&kvm->slots_lock);
1062 1120
1063 npages = get_user_pages_fast(addr, 1, 1, page); 1121 r = -EINVAL;
1122 if (log->slot >= KVM_MEMORY_SLOTS)
1123 goto out;
1064 1124
1065 if (unlikely(npages != 1)) 1125 memslot = id_to_memslot(kvm->memslots, log->slot);
1066 return 0; 1126 r = -ENOENT;
1127 if (!memslot->dirty_bitmap)
1128 goto out;
1129
1130 n = kvm_dirty_bitmap_bytes(memslot);
1131 memset(memslot->dirty_bitmap, 0, n);
1132
1133 r = kvmppc_hv_get_dirty_log(kvm, memslot);
1134 if (r)
1135 goto out;
1067 1136
1068 return page[0]; 1137 r = -EFAULT;
1138 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1139 goto out;
1140
1141 r = 0;
1142out:
1143 mutex_unlock(&kvm->slots_lock);
1144 return r;
1145}
1146
1147static unsigned long slb_pgsize_encoding(unsigned long psize)
1148{
1149 unsigned long senc = 0;
1150
1151 if (psize > 0x1000) {
1152 senc = SLB_VSID_L;
1153 if (psize == 0x10000)
1154 senc |= SLB_VSID_LP_01;
1155 }
1156 return senc;
1069} 1157}
1070 1158
1071int kvmppc_core_prepare_memory_region(struct kvm *kvm, 1159int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1072 struct kvm_userspace_memory_region *mem) 1160 struct kvm_userspace_memory_region *mem)
1073{ 1161{
1074 unsigned long psize, porder; 1162 unsigned long npages;
1075 unsigned long i, npages, totalpages; 1163 unsigned long *phys;
1076 unsigned long pg_ix; 1164
1077 struct kvmppc_pginfo *pginfo; 1165 /* Allocate a slot_phys array */
1078 unsigned long hva; 1166 phys = kvm->arch.slot_phys[mem->slot];
1079 struct kvmppc_rma_info *ri = NULL; 1167 if (!kvm->arch.using_mmu_notifiers && !phys) {
1168 npages = mem->memory_size >> PAGE_SHIFT;
1169 phys = vzalloc(npages * sizeof(unsigned long));
1170 if (!phys)
1171 return -ENOMEM;
1172 kvm->arch.slot_phys[mem->slot] = phys;
1173 kvm->arch.slot_npages[mem->slot] = npages;
1174 }
1175
1176 return 0;
1177}
1178
1179static void unpin_slot(struct kvm *kvm, int slot_id)
1180{
1181 unsigned long *physp;
1182 unsigned long j, npages, pfn;
1080 struct page *page; 1183 struct page *page;
1081 1184
1082 /* For now, only allow 16MB pages */ 1185 physp = kvm->arch.slot_phys[slot_id];
1083 porder = LARGE_PAGE_ORDER; 1186 npages = kvm->arch.slot_npages[slot_id];
1084 psize = 1ul << porder; 1187 if (physp) {
1085 if ((mem->memory_size & (psize - 1)) || 1188 spin_lock(&kvm->arch.slot_phys_lock);
1086 (mem->guest_phys_addr & (psize - 1))) { 1189 for (j = 0; j < npages; j++) {
1087 pr_err("bad memory_size=%llx @ %llx\n", 1190 if (!(physp[j] & KVMPPC_GOT_PAGE))
1088 mem->memory_size, mem->guest_phys_addr); 1191 continue;
1089 return -EINVAL; 1192 pfn = physp[j] >> PAGE_SHIFT;
1193 page = pfn_to_page(pfn);
1194 if (PageHuge(page))
1195 page = compound_head(page);
1196 SetPageDirty(page);
1197 put_page(page);
1198 }
1199 kvm->arch.slot_phys[slot_id] = NULL;
1200 spin_unlock(&kvm->arch.slot_phys_lock);
1201 vfree(physp);
1090 } 1202 }
1203}
1091 1204
1092 npages = mem->memory_size >> porder; 1205void kvmppc_core_commit_memory_region(struct kvm *kvm,
1093 totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder; 1206 struct kvm_userspace_memory_region *mem)
1207{
1208}
1094 1209
1095 /* More memory than we have space to track? */ 1210static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
1096 if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER))) 1211{
1097 return -EINVAL; 1212 int err = 0;
1213 struct kvm *kvm = vcpu->kvm;
1214 struct kvmppc_linear_info *ri = NULL;
1215 unsigned long hva;
1216 struct kvm_memory_slot *memslot;
1217 struct vm_area_struct *vma;
1218 unsigned long lpcr, senc;
1219 unsigned long psize, porder;
1220 unsigned long rma_size;
1221 unsigned long rmls;
1222 unsigned long *physp;
1223 unsigned long i, npages;
1098 1224
1099 /* Do we already have an RMA registered? */ 1225 mutex_lock(&kvm->lock);
1100 if (mem->guest_phys_addr == 0 && kvm->arch.rma) 1226 if (kvm->arch.rma_setup_done)
1101 return -EINVAL; 1227 goto out; /* another vcpu beat us to it */
1102 1228
1103 if (totalpages > kvm->arch.ram_npages) 1229 /* Look up the memslot for guest physical address 0 */
1104 kvm->arch.ram_npages = totalpages; 1230 memslot = gfn_to_memslot(kvm, 0);
1231
1232 /* We must have some memory at 0 by now */
1233 err = -EINVAL;
1234 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
1235 goto out;
1236
1237 /* Look up the VMA for the start of this memory slot */
1238 hva = memslot->userspace_addr;
1239 down_read(&current->mm->mmap_sem);
1240 vma = find_vma(current->mm, hva);
1241 if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO))
1242 goto up_out;
1243
1244 psize = vma_kernel_pagesize(vma);
1245 porder = __ilog2(psize);
1105 1246
1106 /* Is this one of our preallocated RMAs? */ 1247 /* Is this one of our preallocated RMAs? */
1107 if (mem->guest_phys_addr == 0) { 1248 if (vma->vm_file && vma->vm_file->f_op == &kvm_rma_fops &&
1108 struct vm_area_struct *vma; 1249 hva == vma->vm_start)
1109 1250 ri = vma->vm_file->private_data;
1110 down_read(&current->mm->mmap_sem); 1251
1111 vma = find_vma(current->mm, mem->userspace_addr); 1252 up_read(&current->mm->mmap_sem);
1112 if (vma && vma->vm_file && 1253
1113 vma->vm_file->f_op == &kvm_rma_fops && 1254 if (!ri) {
1114 mem->userspace_addr == vma->vm_start) 1255 /* On POWER7, use VRMA; on PPC970, give up */
1115 ri = vma->vm_file->private_data; 1256 err = -EPERM;
1116 up_read(&current->mm->mmap_sem); 1257 if (cpu_has_feature(CPU_FTR_ARCH_201)) {
1117 if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) { 1258 pr_err("KVM: CPU requires an RMO\n");
1118 pr_err("CPU requires an RMO\n"); 1259 goto out;
1119 return -EINVAL;
1120 } 1260 }
1121 }
1122 1261
1123 if (ri) { 1262 /* We can handle 4k, 64k or 16M pages in the VRMA */
1124 unsigned long rma_size; 1263 err = -EINVAL;
1125 unsigned long lpcr; 1264 if (!(psize == 0x1000 || psize == 0x10000 ||
1126 long rmls; 1265 psize == 0x1000000))
1266 goto out;
1267
1268 /* Update VRMASD field in the LPCR */
1269 senc = slb_pgsize_encoding(psize);
1270 kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
1271 (VRMA_VSID << SLB_VSID_SHIFT_1T);
1272 lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
1273 lpcr |= senc << (LPCR_VRMASD_SH - 4);
1274 kvm->arch.lpcr = lpcr;
1127 1275
1128 rma_size = ri->npages << PAGE_SHIFT; 1276 /* Create HPTEs in the hash page table for the VRMA */
1129 if (rma_size > mem->memory_size) 1277 kvmppc_map_vrma(vcpu, memslot, porder);
1130 rma_size = mem->memory_size; 1278
1279 } else {
1280 /* Set up to use an RMO region */
1281 rma_size = ri->npages;
1282 if (rma_size > memslot->npages)
1283 rma_size = memslot->npages;
1284 rma_size <<= PAGE_SHIFT;
1131 rmls = lpcr_rmls(rma_size); 1285 rmls = lpcr_rmls(rma_size);
1286 err = -EINVAL;
1132 if (rmls < 0) { 1287 if (rmls < 0) {
1133 pr_err("Can't use RMA of 0x%lx bytes\n", rma_size); 1288 pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size);
1134 return -EINVAL; 1289 goto out;
1135 } 1290 }
1136 atomic_inc(&ri->use_count); 1291 atomic_inc(&ri->use_count);
1137 kvm->arch.rma = ri; 1292 kvm->arch.rma = ri;
1138 kvm->arch.n_rma_pages = rma_size >> porder;
1139 1293
1140 /* Update LPCR and RMOR */ 1294 /* Update LPCR and RMOR */
1141 lpcr = kvm->arch.lpcr; 1295 lpcr = kvm->arch.lpcr;
@@ -1155,53 +1309,35 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1155 kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; 1309 kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
1156 } 1310 }
1157 kvm->arch.lpcr = lpcr; 1311 kvm->arch.lpcr = lpcr;
1158 pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n", 1312 pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n",
1159 ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); 1313 ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
1160 }
1161 1314
1162 pg_ix = mem->guest_phys_addr >> porder; 1315 /* Initialize phys addrs of pages in RMO */
1163 pginfo = kvm->arch.ram_pginfo + pg_ix; 1316 npages = ri->npages;
1164 for (i = 0; i < npages; ++i, ++pg_ix) { 1317 porder = __ilog2(npages);
1165 if (ri && pg_ix < kvm->arch.n_rma_pages) { 1318 physp = kvm->arch.slot_phys[memslot->id];
1166 pginfo[i].pfn = ri->base_pfn + 1319 spin_lock(&kvm->arch.slot_phys_lock);
1167 (pg_ix << (porder - PAGE_SHIFT)); 1320 for (i = 0; i < npages; ++i)
1168 continue; 1321 physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + porder;
1169 } 1322 spin_unlock(&kvm->arch.slot_phys_lock);
1170 hva = mem->userspace_addr + (i << porder);
1171 page = hva_to_page(hva);
1172 if (!page) {
1173 pr_err("oops, no pfn for hva %lx\n", hva);
1174 goto err;
1175 }
1176 /* Check it's a 16MB page */
1177 if (!PageHead(page) ||
1178 compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
1179 pr_err("page at %lx isn't 16MB (o=%d)\n",
1180 hva, compound_order(page));
1181 goto err;
1182 }
1183 pginfo[i].pfn = page_to_pfn(page);
1184 } 1323 }
1185 1324
1186 return 0; 1325 /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
1187 1326 smp_wmb();
1188 err: 1327 kvm->arch.rma_setup_done = 1;
1189 return -EINVAL; 1328 err = 0;
1190} 1329 out:
1330 mutex_unlock(&kvm->lock);
1331 return err;
1191 1332
1192void kvmppc_core_commit_memory_region(struct kvm *kvm, 1333 up_out:
1193 struct kvm_userspace_memory_region *mem) 1334 up_read(&current->mm->mmap_sem);
1194{ 1335 goto out;
1195 if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
1196 !kvm->arch.rma)
1197 kvmppc_map_vrma(kvm, mem);
1198} 1336}
1199 1337
1200int kvmppc_core_init_vm(struct kvm *kvm) 1338int kvmppc_core_init_vm(struct kvm *kvm)
1201{ 1339{
1202 long r; 1340 long r;
1203 unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
1204 long err = -ENOMEM;
1205 unsigned long lpcr; 1341 unsigned long lpcr;
1206 1342
1207 /* Allocate hashed page table */ 1343 /* Allocate hashed page table */
@@ -1211,19 +1347,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
1211 1347
1212 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); 1348 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
1213 1349
1214 kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
1215 GFP_KERNEL);
1216 if (!kvm->arch.ram_pginfo) {
1217 pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
1218 npages * sizeof(struct kvmppc_pginfo));
1219 goto out_free;
1220 }
1221
1222 kvm->arch.ram_npages = 0;
1223 kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
1224 kvm->arch.ram_porder = LARGE_PAGE_ORDER;
1225 kvm->arch.rma = NULL; 1350 kvm->arch.rma = NULL;
1226 kvm->arch.n_rma_pages = 0;
1227 1351
1228 kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); 1352 kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
1229 1353
@@ -1241,30 +1365,25 @@ int kvmppc_core_init_vm(struct kvm *kvm)
1241 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); 1365 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
1242 lpcr &= LPCR_PECE | LPCR_LPES; 1366 lpcr &= LPCR_PECE | LPCR_LPES;
1243 lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | 1367 lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
1244 LPCR_VPM0 | LPCR_VRMA_L; 1368 LPCR_VPM0 | LPCR_VPM1;
1369 kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
1370 (VRMA_VSID << SLB_VSID_SHIFT_1T);
1245 } 1371 }
1246 kvm->arch.lpcr = lpcr; 1372 kvm->arch.lpcr = lpcr;
1247 1373
1374 kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206);
1375 spin_lock_init(&kvm->arch.slot_phys_lock);
1248 return 0; 1376 return 0;
1249
1250 out_free:
1251 kvmppc_free_hpt(kvm);
1252 return err;
1253} 1377}
1254 1378
1255void kvmppc_core_destroy_vm(struct kvm *kvm) 1379void kvmppc_core_destroy_vm(struct kvm *kvm)
1256{ 1380{
1257 struct kvmppc_pginfo *pginfo;
1258 unsigned long i; 1381 unsigned long i;
1259 1382
1260 if (kvm->arch.ram_pginfo) { 1383 if (!kvm->arch.using_mmu_notifiers)
1261 pginfo = kvm->arch.ram_pginfo; 1384 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
1262 kvm->arch.ram_pginfo = NULL; 1385 unpin_slot(kvm, i);
1263 for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i) 1386
1264 if (pginfo[i].pfn)
1265 put_page(pfn_to_page(pginfo[i].pfn));
1266 kfree(pginfo);
1267 }
1268 if (kvm->arch.rma) { 1387 if (kvm->arch.rma) {
1269 kvm_release_rma(kvm->arch.rma); 1388 kvm_release_rma(kvm->arch.rma);
1270 kvm->arch.rma = NULL; 1389 kvm->arch.rma = NULL;
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index a795a13f4a70..bed1279aa6a8 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -18,6 +18,15 @@
18#include <asm/kvm_ppc.h> 18#include <asm/kvm_ppc.h>
19#include <asm/kvm_book3s.h> 19#include <asm/kvm_book3s.h>
20 20
21#define KVM_LINEAR_RMA 0
22#define KVM_LINEAR_HPT 1
23
24static void __init kvm_linear_init_one(ulong size, int count, int type);
25static struct kvmppc_linear_info *kvm_alloc_linear(int type);
26static void kvm_release_linear(struct kvmppc_linear_info *ri);
27
28/*************** RMA *************/
29
21/* 30/*
22 * This maintains a list of RMAs (real mode areas) for KVM guests to use. 31 * This maintains a list of RMAs (real mode areas) for KVM guests to use.
23 * Each RMA has to be physically contiguous and of a size that the 32 * Each RMA has to be physically contiguous and of a size that the
@@ -29,32 +38,6 @@
29static unsigned long kvm_rma_size = 64 << 20; /* 64MB */ 38static unsigned long kvm_rma_size = 64 << 20; /* 64MB */
30static unsigned long kvm_rma_count; 39static unsigned long kvm_rma_count;
31 40
32static int __init early_parse_rma_size(char *p)
33{
34 if (!p)
35 return 1;
36
37 kvm_rma_size = memparse(p, &p);
38
39 return 0;
40}
41early_param("kvm_rma_size", early_parse_rma_size);
42
43static int __init early_parse_rma_count(char *p)
44{
45 if (!p)
46 return 1;
47
48 kvm_rma_count = simple_strtoul(p, NULL, 0);
49
50 return 0;
51}
52early_param("kvm_rma_count", early_parse_rma_count);
53
54static struct kvmppc_rma_info *rma_info;
55static LIST_HEAD(free_rmas);
56static DEFINE_SPINLOCK(rma_lock);
57
58/* Work out RMLS (real mode limit selector) field value for a given RMA size. 41/* Work out RMLS (real mode limit selector) field value for a given RMA size.
59 Assumes POWER7 or PPC970. */ 42 Assumes POWER7 or PPC970. */
60static inline int lpcr_rmls(unsigned long rma_size) 43static inline int lpcr_rmls(unsigned long rma_size)
@@ -81,45 +64,106 @@ static inline int lpcr_rmls(unsigned long rma_size)
81 } 64 }
82} 65}
83 66
67static int __init early_parse_rma_size(char *p)
68{
69 if (!p)
70 return 1;
71
72 kvm_rma_size = memparse(p, &p);
73
74 return 0;
75}
76early_param("kvm_rma_size", early_parse_rma_size);
77
78static int __init early_parse_rma_count(char *p)
79{
80 if (!p)
81 return 1;
82
83 kvm_rma_count = simple_strtoul(p, NULL, 0);
84
85 return 0;
86}
87early_param("kvm_rma_count", early_parse_rma_count);
88
89struct kvmppc_linear_info *kvm_alloc_rma(void)
90{
91 return kvm_alloc_linear(KVM_LINEAR_RMA);
92}
93EXPORT_SYMBOL_GPL(kvm_alloc_rma);
94
95void kvm_release_rma(struct kvmppc_linear_info *ri)
96{
97 kvm_release_linear(ri);
98}
99EXPORT_SYMBOL_GPL(kvm_release_rma);
100
101/*************** HPT *************/
102
84/* 103/*
85 * Called at boot time while the bootmem allocator is active, 104 * This maintains a list of big linear HPT tables that contain the GVA->HPA
86 * to allocate contiguous physical memory for the real memory 105 * memory mappings. If we don't reserve those early on, we might not be able
87 * areas for guests. 106 * to get a big (usually 16MB) linear memory region from the kernel anymore.
88 */ 107 */
89void __init kvm_rma_init(void) 108
109static unsigned long kvm_hpt_count;
110
111static int __init early_parse_hpt_count(char *p)
112{
113 if (!p)
114 return 1;
115
116 kvm_hpt_count = simple_strtoul(p, NULL, 0);
117
118 return 0;
119}
120early_param("kvm_hpt_count", early_parse_hpt_count);
121
122struct kvmppc_linear_info *kvm_alloc_hpt(void)
123{
124 return kvm_alloc_linear(KVM_LINEAR_HPT);
125}
126EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
127
128void kvm_release_hpt(struct kvmppc_linear_info *li)
129{
130 kvm_release_linear(li);
131}
132EXPORT_SYMBOL_GPL(kvm_release_hpt);
133
134/*************** generic *************/
135
136static LIST_HEAD(free_linears);
137static DEFINE_SPINLOCK(linear_lock);
138
139static void __init kvm_linear_init_one(ulong size, int count, int type)
90{ 140{
91 unsigned long i; 141 unsigned long i;
92 unsigned long j, npages; 142 unsigned long j, npages;
93 void *rma; 143 void *linear;
94 struct page *pg; 144 struct page *pg;
145 const char *typestr;
146 struct kvmppc_linear_info *linear_info;
95 147
96 /* Only do this on PPC970 in HV mode */ 148 if (!count)
97 if (!cpu_has_feature(CPU_FTR_HVMODE) ||
98 !cpu_has_feature(CPU_FTR_ARCH_201))
99 return;
100
101 if (!kvm_rma_size || !kvm_rma_count)
102 return; 149 return;
103 150
104 /* Check that the requested size is one supported in hardware */ 151 typestr = (type == KVM_LINEAR_RMA) ? "RMA" : "HPT";
105 if (lpcr_rmls(kvm_rma_size) < 0) { 152
106 pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); 153 npages = size >> PAGE_SHIFT;
107 return; 154 linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info));
108 } 155 for (i = 0; i < count; ++i) {
109 156 linear = alloc_bootmem_align(size, size);
110 npages = kvm_rma_size >> PAGE_SHIFT; 157 pr_info("Allocated KVM %s at %p (%ld MB)\n", typestr, linear,
111 rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info)); 158 size >> 20);
112 for (i = 0; i < kvm_rma_count; ++i) { 159 linear_info[i].base_virt = linear;
113 rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size); 160 linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT;
114 pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma, 161 linear_info[i].npages = npages;
115 kvm_rma_size >> 20); 162 linear_info[i].type = type;
116 rma_info[i].base_virt = rma; 163 list_add_tail(&linear_info[i].list, &free_linears);
117 rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT; 164 atomic_set(&linear_info[i].use_count, 0);
118 rma_info[i].npages = npages; 165
119 list_add_tail(&rma_info[i].list, &free_rmas); 166 pg = pfn_to_page(linear_info[i].base_pfn);
120 atomic_set(&rma_info[i].use_count, 0);
121
122 pg = pfn_to_page(rma_info[i].base_pfn);
123 for (j = 0; j < npages; ++j) { 167 for (j = 0; j < npages; ++j) {
124 atomic_inc(&pg->_count); 168 atomic_inc(&pg->_count);
125 ++pg; 169 ++pg;
@@ -127,30 +171,59 @@ void __init kvm_rma_init(void)
127 } 171 }
128} 172}
129 173
130struct kvmppc_rma_info *kvm_alloc_rma(void) 174static struct kvmppc_linear_info *kvm_alloc_linear(int type)
131{ 175{
132 struct kvmppc_rma_info *ri; 176 struct kvmppc_linear_info *ri;
133 177
134 ri = NULL; 178 ri = NULL;
135 spin_lock(&rma_lock); 179 spin_lock(&linear_lock);
136 if (!list_empty(&free_rmas)) { 180 list_for_each_entry(ri, &free_linears, list) {
137 ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list); 181 if (ri->type != type)
182 continue;
183
138 list_del(&ri->list); 184 list_del(&ri->list);
139 atomic_inc(&ri->use_count); 185 atomic_inc(&ri->use_count);
186 break;
140 } 187 }
141 spin_unlock(&rma_lock); 188 spin_unlock(&linear_lock);
189 memset(ri->base_virt, 0, ri->npages << PAGE_SHIFT);
142 return ri; 190 return ri;
143} 191}
144EXPORT_SYMBOL_GPL(kvm_alloc_rma);
145 192
146void kvm_release_rma(struct kvmppc_rma_info *ri) 193static void kvm_release_linear(struct kvmppc_linear_info *ri)
147{ 194{
148 if (atomic_dec_and_test(&ri->use_count)) { 195 if (atomic_dec_and_test(&ri->use_count)) {
149 spin_lock(&rma_lock); 196 spin_lock(&linear_lock);
150 list_add_tail(&ri->list, &free_rmas); 197 list_add_tail(&ri->list, &free_linears);
151 spin_unlock(&rma_lock); 198 spin_unlock(&linear_lock);
152 199
153 } 200 }
154} 201}
155EXPORT_SYMBOL_GPL(kvm_release_rma);
156 202
203/*
204 * Called at boot time while the bootmem allocator is active,
205 * to allocate contiguous physical memory for the hash page
206 * tables for guests.
207 */
208void __init kvm_linear_init(void)
209{
210 /* HPT */
211 kvm_linear_init_one(1 << HPT_ORDER, kvm_hpt_count, KVM_LINEAR_HPT);
212
213 /* RMA */
214 /* Only do this on PPC970 in HV mode */
215 if (!cpu_has_feature(CPU_FTR_HVMODE) ||
216 !cpu_has_feature(CPU_FTR_ARCH_201))
217 return;
218
219 if (!kvm_rma_size || !kvm_rma_count)
220 return;
221
222 /* Check that the requested size is one supported in hardware */
223 if (lpcr_rmls(kvm_rma_size) < 0) {
224 pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
225 return;
226 }
227
228 kvm_linear_init_one(kvm_rma_size, kvm_rma_count, KVM_LINEAR_RMA);
229}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index bacb0cfa3602..def880aea63a 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -11,6 +11,7 @@
11#include <linux/kvm.h> 11#include <linux/kvm.h>
12#include <linux/kvm_host.h> 12#include <linux/kvm_host.h>
13#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
14#include <linux/module.h>
14 15
15#include <asm/tlbflush.h> 16#include <asm/tlbflush.h>
16#include <asm/kvm_ppc.h> 17#include <asm/kvm_ppc.h>
@@ -20,95 +21,307 @@
20#include <asm/synch.h> 21#include <asm/synch.h>
21#include <asm/ppc-opcode.h> 22#include <asm/ppc-opcode.h>
22 23
23/* For now use fixed-size 16MB page table */ 24/* Translate address of a vmalloc'd thing to a linear map address */
24#define HPT_ORDER 24 25static void *real_vmalloc_addr(void *x)
25#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ 26{
26#define HPT_HASH_MASK (HPT_NPTEG - 1) 27 unsigned long addr = (unsigned long) x;
28 pte_t *p;
27 29
28#define HPTE_V_HVLOCK 0x40UL 30 p = find_linux_pte(swapper_pg_dir, addr);
31 if (!p || !pte_present(*p))
32 return NULL;
33 /* assume we don't have huge pages in vmalloc space... */
34 addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
35 return __va(addr);
36}
29 37
30static inline long lock_hpte(unsigned long *hpte, unsigned long bits) 38/*
39 * Add this HPTE into the chain for the real page.
40 * Must be called with the chain locked; it unlocks the chain.
41 */
42void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
43 unsigned long *rmap, long pte_index, int realmode)
31{ 44{
32 unsigned long tmp, old; 45 struct revmap_entry *head, *tail;
46 unsigned long i;
33 47
34 asm volatile(" ldarx %0,0,%2\n" 48 if (*rmap & KVMPPC_RMAP_PRESENT) {
35 " and. %1,%0,%3\n" 49 i = *rmap & KVMPPC_RMAP_INDEX;
36 " bne 2f\n" 50 head = &kvm->arch.revmap[i];
37 " ori %0,%0,%4\n" 51 if (realmode)
38 " stdcx. %0,0,%2\n" 52 head = real_vmalloc_addr(head);
39 " beq+ 2f\n" 53 tail = &kvm->arch.revmap[head->back];
40 " li %1,%3\n" 54 if (realmode)
41 "2: isync" 55 tail = real_vmalloc_addr(tail);
42 : "=&r" (tmp), "=&r" (old) 56 rev->forw = i;
43 : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) 57 rev->back = head->back;
44 : "cc", "memory"); 58 tail->forw = pte_index;
45 return old == 0; 59 head->back = pte_index;
60 } else {
61 rev->forw = rev->back = pte_index;
62 i = pte_index;
63 }
64 smp_wmb();
65 *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */
66}
67EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
68
69/* Remove this HPTE from the chain for a real page */
70static void remove_revmap_chain(struct kvm *kvm, long pte_index,
71 struct revmap_entry *rev,
72 unsigned long hpte_v, unsigned long hpte_r)
73{
74 struct revmap_entry *next, *prev;
75 unsigned long gfn, ptel, head;
76 struct kvm_memory_slot *memslot;
77 unsigned long *rmap;
78 unsigned long rcbits;
79
80 rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
81 ptel = rev->guest_rpte |= rcbits;
82 gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
83 memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
84 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
85 return;
86
87 rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]);
88 lock_rmap(rmap);
89
90 head = *rmap & KVMPPC_RMAP_INDEX;
91 next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
92 prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
93 next->back = rev->back;
94 prev->forw = rev->forw;
95 if (head == pte_index) {
96 head = rev->forw;
97 if (head == pte_index)
98 *rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
99 else
100 *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
101 }
102 *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
103 unlock_rmap(rmap);
104}
105
106static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva,
107 int writing, unsigned long *pte_sizep)
108{
109 pte_t *ptep;
110 unsigned long ps = *pte_sizep;
111 unsigned int shift;
112
113 ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift);
114 if (!ptep)
115 return __pte(0);
116 if (shift)
117 *pte_sizep = 1ul << shift;
118 else
119 *pte_sizep = PAGE_SIZE;
120 if (ps > *pte_sizep)
121 return __pte(0);
122 if (!pte_present(*ptep))
123 return __pte(0);
124 return kvmppc_read_update_linux_pte(ptep, writing);
125}
126
127static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
128{
129 asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
130 hpte[0] = hpte_v;
46} 131}
47 132
48long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 133long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
49 long pte_index, unsigned long pteh, unsigned long ptel) 134 long pte_index, unsigned long pteh, unsigned long ptel)
50{ 135{
51 unsigned long porder;
52 struct kvm *kvm = vcpu->kvm; 136 struct kvm *kvm = vcpu->kvm;
53 unsigned long i, lpn, pa; 137 unsigned long i, pa, gpa, gfn, psize;
138 unsigned long slot_fn, hva;
54 unsigned long *hpte; 139 unsigned long *hpte;
140 struct revmap_entry *rev;
141 unsigned long g_ptel = ptel;
142 struct kvm_memory_slot *memslot;
143 unsigned long *physp, pte_size;
144 unsigned long is_io;
145 unsigned long *rmap;
146 pte_t pte;
147 unsigned int writing;
148 unsigned long mmu_seq;
149 unsigned long rcbits;
150 bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING;
55 151
56 /* only handle 4k, 64k and 16M pages for now */ 152 psize = hpte_page_size(pteh, ptel);
57 porder = 12; 153 if (!psize)
58 if (pteh & HPTE_V_LARGE) { 154 return H_PARAMETER;
59 if (cpu_has_feature(CPU_FTR_ARCH_206) && 155 writing = hpte_is_writable(ptel);
60 (ptel & 0xf000) == 0x1000) { 156 pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
61 /* 64k page */ 157
62 porder = 16; 158 /* used later to detect if we might have been invalidated */
63 } else if ((ptel & 0xff000) == 0) { 159 mmu_seq = kvm->mmu_notifier_seq;
64 /* 16M page */ 160 smp_rmb();
65 porder = 24; 161
66 /* lowest AVA bit must be 0 for 16M pages */ 162 /* Find the memslot (if any) for this address */
67 if (pteh & 0x80) 163 gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
68 return H_PARAMETER; 164 gfn = gpa >> PAGE_SHIFT;
69 } else 165 memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
166 pa = 0;
167 is_io = ~0ul;
168 rmap = NULL;
169 if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
170 /* PPC970 can't do emulated MMIO */
171 if (!cpu_has_feature(CPU_FTR_ARCH_206))
70 return H_PARAMETER; 172 return H_PARAMETER;
173 /* Emulated MMIO - mark this with key=31 */
174 pteh |= HPTE_V_ABSENT;
175 ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO;
176 goto do_insert;
71 } 177 }
72 lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder; 178
73 if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder) 179 /* Check if the requested page fits entirely in the memslot. */
74 return H_PARAMETER; 180 if (!slot_is_aligned(memslot, psize))
75 pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
76 if (!pa)
77 return H_PARAMETER; 181 return H_PARAMETER;
78 /* Check WIMG */ 182 slot_fn = gfn - memslot->base_gfn;
79 if ((ptel & HPTE_R_WIMG) != HPTE_R_M && 183 rmap = &memslot->rmap[slot_fn];
80 (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M)) 184
185 if (!kvm->arch.using_mmu_notifiers) {
186 physp = kvm->arch.slot_phys[memslot->id];
187 if (!physp)
188 return H_PARAMETER;
189 physp += slot_fn;
190 if (realmode)
191 physp = real_vmalloc_addr(physp);
192 pa = *physp;
193 if (!pa)
194 return H_TOO_HARD;
195 is_io = pa & (HPTE_R_I | HPTE_R_W);
196 pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK);
197 pa &= PAGE_MASK;
198 } else {
199 /* Translate to host virtual address */
200 hva = gfn_to_hva_memslot(memslot, gfn);
201
202 /* Look up the Linux PTE for the backing page */
203 pte_size = psize;
204 pte = lookup_linux_pte(vcpu, hva, writing, &pte_size);
205 if (pte_present(pte)) {
206 if (writing && !pte_write(pte))
207 /* make the actual HPTE be read-only */
208 ptel = hpte_make_readonly(ptel);
209 is_io = hpte_cache_bits(pte_val(pte));
210 pa = pte_pfn(pte) << PAGE_SHIFT;
211 }
212 }
213 if (pte_size < psize)
81 return H_PARAMETER; 214 return H_PARAMETER;
82 pteh &= ~0x60UL; 215 if (pa && pte_size > psize)
83 ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize); 216 pa |= gpa & (pte_size - 1);
217
218 ptel &= ~(HPTE_R_PP0 - psize);
84 ptel |= pa; 219 ptel |= pa;
85 if (pte_index >= (HPT_NPTEG << 3)) 220
221 if (pa)
222 pteh |= HPTE_V_VALID;
223 else
224 pteh |= HPTE_V_ABSENT;
225
226 /* Check WIMG */
227 if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) {
228 if (is_io)
229 return H_PARAMETER;
230 /*
231 * Allow guest to map emulated device memory as
232 * uncacheable, but actually make it cacheable.
233 */
234 ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G);
235 ptel |= HPTE_R_M;
236 }
237
238 /* Find and lock the HPTEG slot to use */
239 do_insert:
240 if (pte_index >= HPT_NPTE)
86 return H_PARAMETER; 241 return H_PARAMETER;
87 if (likely((flags & H_EXACT) == 0)) { 242 if (likely((flags & H_EXACT) == 0)) {
88 pte_index &= ~7UL; 243 pte_index &= ~7UL;
89 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); 244 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
90 for (i = 0; ; ++i) { 245 for (i = 0; i < 8; ++i) {
91 if (i == 8)
92 return H_PTEG_FULL;
93 if ((*hpte & HPTE_V_VALID) == 0 && 246 if ((*hpte & HPTE_V_VALID) == 0 &&
94 lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) 247 try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
248 HPTE_V_ABSENT))
95 break; 249 break;
96 hpte += 2; 250 hpte += 2;
97 } 251 }
252 if (i == 8) {
253 /*
254 * Since try_lock_hpte doesn't retry (not even stdcx.
255 * failures), it could be that there is a free slot
256 * but we transiently failed to lock it. Try again,
257 * actually locking each slot and checking it.
258 */
259 hpte -= 16;
260 for (i = 0; i < 8; ++i) {
261 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
262 cpu_relax();
263 if (!(*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)))
264 break;
265 *hpte &= ~HPTE_V_HVLOCK;
266 hpte += 2;
267 }
268 if (i == 8)
269 return H_PTEG_FULL;
270 }
271 pte_index += i;
98 } else { 272 } else {
99 i = 0;
100 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); 273 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
101 if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) 274 if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
102 return H_PTEG_FULL; 275 HPTE_V_ABSENT)) {
276 /* Lock the slot and check again */
277 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
278 cpu_relax();
279 if (*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
280 *hpte &= ~HPTE_V_HVLOCK;
281 return H_PTEG_FULL;
282 }
283 }
103 } 284 }
285
286 /* Save away the guest's idea of the second HPTE dword */
287 rev = &kvm->arch.revmap[pte_index];
288 if (realmode)
289 rev = real_vmalloc_addr(rev);
290 if (rev)
291 rev->guest_rpte = g_ptel;
292
293 /* Link HPTE into reverse-map chain */
294 if (pteh & HPTE_V_VALID) {
295 if (realmode)
296 rmap = real_vmalloc_addr(rmap);
297 lock_rmap(rmap);
298 /* Check for pending invalidations under the rmap chain lock */
299 if (kvm->arch.using_mmu_notifiers &&
300 mmu_notifier_retry(vcpu, mmu_seq)) {
301 /* inval in progress, write a non-present HPTE */
302 pteh |= HPTE_V_ABSENT;
303 pteh &= ~HPTE_V_VALID;
304 unlock_rmap(rmap);
305 } else {
306 kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
307 realmode);
308 /* Only set R/C in real HPTE if already set in *rmap */
309 rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
310 ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);
311 }
312 }
313
104 hpte[1] = ptel; 314 hpte[1] = ptel;
315
316 /* Write the first HPTE dword, unlocking the HPTE and making it valid */
105 eieio(); 317 eieio();
106 hpte[0] = pteh; 318 hpte[0] = pteh;
107 asm volatile("ptesync" : : : "memory"); 319 asm volatile("ptesync" : : : "memory");
108 atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt); 320
109 vcpu->arch.gpr[4] = pte_index + i; 321 vcpu->arch.gpr[4] = pte_index;
110 return H_SUCCESS; 322 return H_SUCCESS;
111} 323}
324EXPORT_SYMBOL_GPL(kvmppc_h_enter);
112 325
113#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) 326#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
114 327
@@ -137,37 +350,46 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
137 struct kvm *kvm = vcpu->kvm; 350 struct kvm *kvm = vcpu->kvm;
138 unsigned long *hpte; 351 unsigned long *hpte;
139 unsigned long v, r, rb; 352 unsigned long v, r, rb;
353 struct revmap_entry *rev;
140 354
141 if (pte_index >= (HPT_NPTEG << 3)) 355 if (pte_index >= HPT_NPTE)
142 return H_PARAMETER; 356 return H_PARAMETER;
143 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); 357 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
144 while (!lock_hpte(hpte, HPTE_V_HVLOCK)) 358 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
145 cpu_relax(); 359 cpu_relax();
146 if ((hpte[0] & HPTE_V_VALID) == 0 || 360 if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
147 ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) || 361 ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
148 ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) { 362 ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
149 hpte[0] &= ~HPTE_V_HVLOCK; 363 hpte[0] &= ~HPTE_V_HVLOCK;
150 return H_NOT_FOUND; 364 return H_NOT_FOUND;
151 } 365 }
152 if (atomic_read(&kvm->online_vcpus) == 1) 366
153 flags |= H_LOCAL; 367 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
154 vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK; 368 v = hpte[0] & ~HPTE_V_HVLOCK;
155 vcpu->arch.gpr[5] = r = hpte[1]; 369 if (v & HPTE_V_VALID) {
156 rb = compute_tlbie_rb(v, r, pte_index); 370 hpte[0] &= ~HPTE_V_VALID;
157 hpte[0] = 0; 371 rb = compute_tlbie_rb(v, hpte[1], pte_index);
158 if (!(flags & H_LOCAL)) { 372 if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) {
159 while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) 373 while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
160 cpu_relax(); 374 cpu_relax();
161 asm volatile("ptesync" : : : "memory"); 375 asm volatile("ptesync" : : : "memory");
162 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" 376 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
163 : : "r" (rb), "r" (kvm->arch.lpid)); 377 : : "r" (rb), "r" (kvm->arch.lpid));
164 asm volatile("ptesync" : : : "memory"); 378 asm volatile("ptesync" : : : "memory");
165 kvm->arch.tlbie_lock = 0; 379 kvm->arch.tlbie_lock = 0;
166 } else { 380 } else {
167 asm volatile("ptesync" : : : "memory"); 381 asm volatile("ptesync" : : : "memory");
168 asm volatile("tlbiel %0" : : "r" (rb)); 382 asm volatile("tlbiel %0" : : "r" (rb));
169 asm volatile("ptesync" : : : "memory"); 383 asm volatile("ptesync" : : : "memory");
384 }
385 /* Read PTE low word after tlbie to get final R/C values */
386 remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
170 } 387 }
388 r = rev->guest_rpte;
389 unlock_hpte(hpte, 0);
390
391 vcpu->arch.gpr[4] = v;
392 vcpu->arch.gpr[5] = r;
171 return H_SUCCESS; 393 return H_SUCCESS;
172} 394}
173 395
@@ -175,78 +397,117 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
175{ 397{
176 struct kvm *kvm = vcpu->kvm; 398 struct kvm *kvm = vcpu->kvm;
177 unsigned long *args = &vcpu->arch.gpr[4]; 399 unsigned long *args = &vcpu->arch.gpr[4];
178 unsigned long *hp, tlbrb[4]; 400 unsigned long *hp, *hptes[4], tlbrb[4];
179 long int i, found; 401 long int i, j, k, n, found, indexes[4];
180 long int n_inval = 0; 402 unsigned long flags, req, pte_index, rcbits;
181 unsigned long flags, req, pte_index;
182 long int local = 0; 403 long int local = 0;
183 long int ret = H_SUCCESS; 404 long int ret = H_SUCCESS;
405 struct revmap_entry *rev, *revs[4];
184 406
185 if (atomic_read(&kvm->online_vcpus) == 1) 407 if (atomic_read(&kvm->online_vcpus) == 1)
186 local = 1; 408 local = 1;
187 for (i = 0; i < 4; ++i) { 409 for (i = 0; i < 4 && ret == H_SUCCESS; ) {
188 pte_index = args[i * 2]; 410 n = 0;
189 flags = pte_index >> 56; 411 for (; i < 4; ++i) {
190 pte_index &= ((1ul << 56) - 1); 412 j = i * 2;
191 req = flags >> 6; 413 pte_index = args[j];
192 flags &= 3; 414 flags = pte_index >> 56;
193 if (req == 3) 415 pte_index &= ((1ul << 56) - 1);
194 break; 416 req = flags >> 6;
195 if (req != 1 || flags == 3 || 417 flags &= 3;
196 pte_index >= (HPT_NPTEG << 3)) { 418 if (req == 3) { /* no more requests */
197 /* parameter error */ 419 i = 4;
198 args[i * 2] = ((0xa0 | flags) << 56) + pte_index;
199 ret = H_PARAMETER;
200 break;
201 }
202 hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
203 while (!lock_hpte(hp, HPTE_V_HVLOCK))
204 cpu_relax();
205 found = 0;
206 if (hp[0] & HPTE_V_VALID) {
207 switch (flags & 3) {
208 case 0: /* absolute */
209 found = 1;
210 break; 420 break;
211 case 1: /* andcond */ 421 }
212 if (!(hp[0] & args[i * 2 + 1])) 422 if (req != 1 || flags == 3 || pte_index >= HPT_NPTE) {
213 found = 1; 423 /* parameter error */
424 args[j] = ((0xa0 | flags) << 56) + pte_index;
425 ret = H_PARAMETER;
214 break; 426 break;
215 case 2: /* AVPN */ 427 }
216 if ((hp[0] & ~0x7fUL) == args[i * 2 + 1]) 428 hp = (unsigned long *)
429 (kvm->arch.hpt_virt + (pte_index << 4));
430 /* to avoid deadlock, don't spin except for first */
431 if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
432 if (n)
433 break;
434 while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
435 cpu_relax();
436 }
437 found = 0;
438 if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) {
439 switch (flags & 3) {
440 case 0: /* absolute */
217 found = 1; 441 found = 1;
218 break; 442 break;
443 case 1: /* andcond */
444 if (!(hp[0] & args[j + 1]))
445 found = 1;
446 break;
447 case 2: /* AVPN */
448 if ((hp[0] & ~0x7fUL) == args[j + 1])
449 found = 1;
450 break;
451 }
452 }
453 if (!found) {
454 hp[0] &= ~HPTE_V_HVLOCK;
455 args[j] = ((0x90 | flags) << 56) + pte_index;
456 continue;
219 } 457 }
458
459 args[j] = ((0x80 | flags) << 56) + pte_index;
460 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
461
462 if (!(hp[0] & HPTE_V_VALID)) {
463 /* insert R and C bits from PTE */
464 rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
465 args[j] |= rcbits << (56 - 5);
466 continue;
467 }
468
469 hp[0] &= ~HPTE_V_VALID; /* leave it locked */
470 tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index);
471 indexes[n] = j;
472 hptes[n] = hp;
473 revs[n] = rev;
474 ++n;
475 }
476
477 if (!n)
478 break;
479
480 /* Now that we've collected a batch, do the tlbies */
481 if (!local) {
482 while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
483 cpu_relax();
484 asm volatile("ptesync" : : : "memory");
485 for (k = 0; k < n; ++k)
486 asm volatile(PPC_TLBIE(%1,%0) : :
487 "r" (tlbrb[k]),
488 "r" (kvm->arch.lpid));
489 asm volatile("eieio; tlbsync; ptesync" : : : "memory");
490 kvm->arch.tlbie_lock = 0;
491 } else {
492 asm volatile("ptesync" : : : "memory");
493 for (k = 0; k < n; ++k)
494 asm volatile("tlbiel %0" : : "r" (tlbrb[k]));
495 asm volatile("ptesync" : : : "memory");
220 } 496 }
221 if (!found) { 497
222 hp[0] &= ~HPTE_V_HVLOCK; 498 /* Read PTE low words after tlbie to get final R/C values */
223 args[i * 2] = ((0x90 | flags) << 56) + pte_index; 499 for (k = 0; k < n; ++k) {
224 continue; 500 j = indexes[k];
501 pte_index = args[j] & ((1ul << 56) - 1);
502 hp = hptes[k];
503 rev = revs[k];
504 remove_revmap_chain(kvm, pte_index, rev, hp[0], hp[1]);
505 rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
506 args[j] |= rcbits << (56 - 5);
507 hp[0] = 0;
225 } 508 }
226 /* insert R and C bits from PTE */
227 flags |= (hp[1] >> 5) & 0x0c;
228 args[i * 2] = ((0x80 | flags) << 56) + pte_index;
229 tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index);
230 hp[0] = 0;
231 }
232 if (n_inval == 0)
233 return ret;
234
235 if (!local) {
236 while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
237 cpu_relax();
238 asm volatile("ptesync" : : : "memory");
239 for (i = 0; i < n_inval; ++i)
240 asm volatile(PPC_TLBIE(%1,%0)
241 : : "r" (tlbrb[i]), "r" (kvm->arch.lpid));
242 asm volatile("eieio; tlbsync; ptesync" : : : "memory");
243 kvm->arch.tlbie_lock = 0;
244 } else {
245 asm volatile("ptesync" : : : "memory");
246 for (i = 0; i < n_inval; ++i)
247 asm volatile("tlbiel %0" : : "r" (tlbrb[i]));
248 asm volatile("ptesync" : : : "memory");
249 } 509 }
510
250 return ret; 511 return ret;
251} 512}
252 513
@@ -256,40 +517,55 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
256{ 517{
257 struct kvm *kvm = vcpu->kvm; 518 struct kvm *kvm = vcpu->kvm;
258 unsigned long *hpte; 519 unsigned long *hpte;
259 unsigned long v, r, rb; 520 struct revmap_entry *rev;
521 unsigned long v, r, rb, mask, bits;
260 522
261 if (pte_index >= (HPT_NPTEG << 3)) 523 if (pte_index >= HPT_NPTE)
262 return H_PARAMETER; 524 return H_PARAMETER;
525
263 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); 526 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
264 while (!lock_hpte(hpte, HPTE_V_HVLOCK)) 527 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
265 cpu_relax(); 528 cpu_relax();
266 if ((hpte[0] & HPTE_V_VALID) == 0 || 529 if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
267 ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) { 530 ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
268 hpte[0] &= ~HPTE_V_HVLOCK; 531 hpte[0] &= ~HPTE_V_HVLOCK;
269 return H_NOT_FOUND; 532 return H_NOT_FOUND;
270 } 533 }
534
271 if (atomic_read(&kvm->online_vcpus) == 1) 535 if (atomic_read(&kvm->online_vcpus) == 1)
272 flags |= H_LOCAL; 536 flags |= H_LOCAL;
273 v = hpte[0]; 537 v = hpte[0];
274 r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | 538 bits = (flags << 55) & HPTE_R_PP0;
275 HPTE_R_KEY_HI | HPTE_R_KEY_LO); 539 bits |= (flags << 48) & HPTE_R_KEY_HI;
276 r |= (flags << 55) & HPTE_R_PP0; 540 bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
277 r |= (flags << 48) & HPTE_R_KEY_HI; 541
278 r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); 542 /* Update guest view of 2nd HPTE dword */
279 rb = compute_tlbie_rb(v, r, pte_index); 543 mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
280 hpte[0] = v & ~HPTE_V_VALID; 544 HPTE_R_KEY_HI | HPTE_R_KEY_LO;
281 if (!(flags & H_LOCAL)) { 545 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
282 while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) 546 if (rev) {
283 cpu_relax(); 547 r = (rev->guest_rpte & ~mask) | bits;
284 asm volatile("ptesync" : : : "memory"); 548 rev->guest_rpte = r;
285 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" 549 }
286 : : "r" (rb), "r" (kvm->arch.lpid)); 550 r = (hpte[1] & ~mask) | bits;
287 asm volatile("ptesync" : : : "memory"); 551
288 kvm->arch.tlbie_lock = 0; 552 /* Update HPTE */
289 } else { 553 if (v & HPTE_V_VALID) {
290 asm volatile("ptesync" : : : "memory"); 554 rb = compute_tlbie_rb(v, r, pte_index);
291 asm volatile("tlbiel %0" : : "r" (rb)); 555 hpte[0] = v & ~HPTE_V_VALID;
292 asm volatile("ptesync" : : : "memory"); 556 if (!(flags & H_LOCAL)) {
557 while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
558 cpu_relax();
559 asm volatile("ptesync" : : : "memory");
560 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
561 : : "r" (rb), "r" (kvm->arch.lpid));
562 asm volatile("ptesync" : : : "memory");
563 kvm->arch.tlbie_lock = 0;
564 } else {
565 asm volatile("ptesync" : : : "memory");
566 asm volatile("tlbiel %0" : : "r" (rb));
567 asm volatile("ptesync" : : : "memory");
568 }
293 } 569 }
294 hpte[1] = r; 570 hpte[1] = r;
295 eieio(); 571 eieio();
@@ -298,40 +574,243 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
298 return H_SUCCESS; 574 return H_SUCCESS;
299} 575}
300 576
301static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
302{
303 long int i;
304 unsigned long offset, rpn;
305
306 offset = realaddr & (kvm->arch.ram_psize - 1);
307 rpn = (realaddr - offset) >> PAGE_SHIFT;
308 for (i = 0; i < kvm->arch.ram_npages; ++i)
309 if (rpn == kvm->arch.ram_pginfo[i].pfn)
310 return (i << PAGE_SHIFT) + offset;
311 return HPTE_R_RPN; /* all 1s in the RPN field */
312}
313
314long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, 577long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
315 unsigned long pte_index) 578 unsigned long pte_index)
316{ 579{
317 struct kvm *kvm = vcpu->kvm; 580 struct kvm *kvm = vcpu->kvm;
318 unsigned long *hpte, r; 581 unsigned long *hpte, v, r;
319 int i, n = 1; 582 int i, n = 1;
583 struct revmap_entry *rev = NULL;
320 584
321 if (pte_index >= (HPT_NPTEG << 3)) 585 if (pte_index >= HPT_NPTE)
322 return H_PARAMETER; 586 return H_PARAMETER;
323 if (flags & H_READ_4) { 587 if (flags & H_READ_4) {
324 pte_index &= ~3; 588 pte_index &= ~3;
325 n = 4; 589 n = 4;
326 } 590 }
591 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
327 for (i = 0; i < n; ++i, ++pte_index) { 592 for (i = 0; i < n; ++i, ++pte_index) {
328 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); 593 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
594 v = hpte[0] & ~HPTE_V_HVLOCK;
329 r = hpte[1]; 595 r = hpte[1];
330 if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID)) 596 if (v & HPTE_V_ABSENT) {
331 r = reverse_xlate(kvm, r & HPTE_R_RPN) | 597 v &= ~HPTE_V_ABSENT;
332 (r & ~HPTE_R_RPN); 598 v |= HPTE_V_VALID;
333 vcpu->arch.gpr[4 + i * 2] = hpte[0]; 599 }
600 if (v & HPTE_V_VALID)
601 r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
602 vcpu->arch.gpr[4 + i * 2] = v;
334 vcpu->arch.gpr[5 + i * 2] = r; 603 vcpu->arch.gpr[5 + i * 2] = r;
335 } 604 }
336 return H_SUCCESS; 605 return H_SUCCESS;
337} 606}
607
608void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
609 unsigned long pte_index)
610{
611 unsigned long rb;
612
613 hptep[0] &= ~HPTE_V_VALID;
614 rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
615 while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
616 cpu_relax();
617 asm volatile("ptesync" : : : "memory");
618 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
619 : : "r" (rb), "r" (kvm->arch.lpid));
620 asm volatile("ptesync" : : : "memory");
621 kvm->arch.tlbie_lock = 0;
622}
623EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);
624
625void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
626 unsigned long pte_index)
627{
628 unsigned long rb;
629 unsigned char rbyte;
630
631 rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
632 rbyte = (hptep[1] & ~HPTE_R_R) >> 8;
633 /* modify only the second-last byte, which contains the ref bit */
634 *((char *)hptep + 14) = rbyte;
635 while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
636 cpu_relax();
637 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
638 : : "r" (rb), "r" (kvm->arch.lpid));
639 asm volatile("ptesync" : : : "memory");
640 kvm->arch.tlbie_lock = 0;
641}
642EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);
643
644static int slb_base_page_shift[4] = {
645 24, /* 16M */
646 16, /* 64k */
647 34, /* 16G */
648 20, /* 1M, unsupported */
649};
650
651long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
652 unsigned long valid)
653{
654 unsigned int i;
655 unsigned int pshift;
656 unsigned long somask;
657 unsigned long vsid, hash;
658 unsigned long avpn;
659 unsigned long *hpte;
660 unsigned long mask, val;
661 unsigned long v, r;
662
663 /* Get page shift, work out hash and AVPN etc. */
664 mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY;
665 val = 0;
666 pshift = 12;
667 if (slb_v & SLB_VSID_L) {
668 mask |= HPTE_V_LARGE;
669 val |= HPTE_V_LARGE;
670 pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4];
671 }
672 if (slb_v & SLB_VSID_B_1T) {
673 somask = (1UL << 40) - 1;
674 vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T;
675 vsid ^= vsid << 25;
676 } else {
677 somask = (1UL << 28) - 1;
678 vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
679 }
680 hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK;
681 avpn = slb_v & ~(somask >> 16); /* also includes B */
682 avpn |= (eaddr & somask) >> 16;
683
684 if (pshift >= 24)
685 avpn &= ~((1UL << (pshift - 16)) - 1);
686 else
687 avpn &= ~0x7fUL;
688 val |= avpn;
689
690 for (;;) {
691 hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7));
692
693 for (i = 0; i < 16; i += 2) {
694 /* Read the PTE racily */
695 v = hpte[i] & ~HPTE_V_HVLOCK;
696
697 /* Check valid/absent, hash, segment size and AVPN */
698 if (!(v & valid) || (v & mask) != val)
699 continue;
700
701 /* Lock the PTE and read it under the lock */
702 while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
703 cpu_relax();
704 v = hpte[i] & ~HPTE_V_HVLOCK;
705 r = hpte[i+1];
706
707 /*
708 * Check the HPTE again, including large page size
709 * Since we don't currently allow any MPSS (mixed
710 * page-size segment) page sizes, it is sufficient
711 * to check against the actual page size.
712 */
713 if ((v & valid) && (v & mask) == val &&
714 hpte_page_size(v, r) == (1ul << pshift))
715 /* Return with the HPTE still locked */
716 return (hash << 3) + (i >> 1);
717
718 /* Unlock and move on */
719 hpte[i] = v;
720 }
721
722 if (val & HPTE_V_SECONDARY)
723 break;
724 val |= HPTE_V_SECONDARY;
725 hash = hash ^ HPT_HASH_MASK;
726 }
727 return -1;
728}
729EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte);
730
731/*
732 * Called in real mode to check whether an HPTE not found fault
733 * is due to accessing a paged-out page or an emulated MMIO page,
734 * or if a protection fault is due to accessing a page that the
735 * guest wanted read/write access to but which we made read-only.
736 * Returns a possibly modified status (DSISR) value if not
737 * (i.e. pass the interrupt to the guest),
738 * -1 to pass the fault up to host kernel mode code, -2 to do that
739 * and also load the instruction word (for MMIO emulation),
740 * or 0 if we should make the guest retry the access.
741 */
742long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
743 unsigned long slb_v, unsigned int status, bool data)
744{
745 struct kvm *kvm = vcpu->kvm;
746 long int index;
747 unsigned long v, r, gr;
748 unsigned long *hpte;
749 unsigned long valid;
750 struct revmap_entry *rev;
751 unsigned long pp, key;
752
753 /* For protection fault, expect to find a valid HPTE */
754 valid = HPTE_V_VALID;
755 if (status & DSISR_NOHPTE)
756 valid |= HPTE_V_ABSENT;
757
758 index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
759 if (index < 0) {
760 if (status & DSISR_NOHPTE)
761 return status; /* there really was no HPTE */
762 return 0; /* for prot fault, HPTE disappeared */
763 }
764 hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
765 v = hpte[0] & ~HPTE_V_HVLOCK;
766 r = hpte[1];
767 rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
768 gr = rev->guest_rpte;
769
770 unlock_hpte(hpte, v);
771
772 /* For not found, if the HPTE is valid by now, retry the instruction */
773 if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
774 return 0;
775
776 /* Check access permissions to the page */
777 pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
778 key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
779 status &= ~DSISR_NOHPTE; /* DSISR_NOHPTE == SRR1_ISI_NOPT */
780 if (!data) {
781 if (gr & (HPTE_R_N | HPTE_R_G))
782 return status | SRR1_ISI_N_OR_G;
783 if (!hpte_read_permission(pp, slb_v & key))
784 return status | SRR1_ISI_PROT;
785 } else if (status & DSISR_ISSTORE) {
786 /* check write permission */
787 if (!hpte_write_permission(pp, slb_v & key))
788 return status | DSISR_PROTFAULT;
789 } else {
790 if (!hpte_read_permission(pp, slb_v & key))
791 return status | DSISR_PROTFAULT;
792 }
793
794 /* Check storage key, if applicable */
795 if (data && (vcpu->arch.shregs.msr & MSR_DR)) {
796 unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr);
797 if (status & DSISR_ISSTORE)
798 perm >>= 1;
799 if (perm & 1)
800 return status | DSISR_KEYFAULT;
801 }
802
803 /* Save HPTE info for virtual-mode handler */
804 vcpu->arch.pgfault_addr = addr;
805 vcpu->arch.pgfault_index = index;
806 vcpu->arch.pgfault_hpte[0] = v;
807 vcpu->arch.pgfault_hpte[1] = r;
808
809 /* Check the storage key to see if it is possibly emulated MMIO */
810 if (data && (vcpu->arch.shregs.msr & MSR_IR) &&
811 (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
812 (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
813 return -2; /* MMIO emulation - load instr word */
814
815 return -1; /* send fault up to host kernel mode */
816}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 5c8b26183f50..b70bf22a3ff3 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -601,6 +601,30 @@ kvmppc_interrupt:
601 601
602 stw r12,VCPU_TRAP(r9) 602 stw r12,VCPU_TRAP(r9)
603 603
604 /* Save HEIR (HV emulation assist reg) in last_inst
605 if this is an HEI (HV emulation interrupt, e40) */
606 li r3,KVM_INST_FETCH_FAILED
607BEGIN_FTR_SECTION
608 cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
609 bne 11f
610 mfspr r3,SPRN_HEIR
611END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
61211: stw r3,VCPU_LAST_INST(r9)
613
614 /* these are volatile across C function calls */
615 mfctr r3
616 mfxer r4
617 std r3, VCPU_CTR(r9)
618 stw r4, VCPU_XER(r9)
619
620BEGIN_FTR_SECTION
621 /* If this is a page table miss then see if it's theirs or ours */
622 cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
623 beq kvmppc_hdsi
624 cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE
625 beq kvmppc_hisi
626END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
627
604 /* See if this is a leftover HDEC interrupt */ 628 /* See if this is a leftover HDEC interrupt */
605 cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER 629 cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER
606 bne 2f 630 bne 2f
@@ -608,7 +632,7 @@ kvmppc_interrupt:
608 cmpwi r3,0 632 cmpwi r3,0
609 bge ignore_hdec 633 bge ignore_hdec
6102: 6342:
611 /* See if this is something we can handle in real mode */ 635 /* See if this is an hcall we can handle in real mode */
612 cmpwi r12,BOOK3S_INTERRUPT_SYSCALL 636 cmpwi r12,BOOK3S_INTERRUPT_SYSCALL
613 beq hcall_try_real_mode 637 beq hcall_try_real_mode
614 638
@@ -624,6 +648,7 @@ BEGIN_FTR_SECTION
6241: 6481:
625END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 649END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
626 650
651nohpte_cont:
627hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ 652hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
628 /* Save DEC */ 653 /* Save DEC */
629 mfspr r5,SPRN_DEC 654 mfspr r5,SPRN_DEC
@@ -632,36 +657,21 @@ hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
632 add r5,r5,r6 657 add r5,r5,r6
633 std r5,VCPU_DEC_EXPIRES(r9) 658 std r5,VCPU_DEC_EXPIRES(r9)
634 659
635 /* Save HEIR (HV emulation assist reg) in last_inst
636 if this is an HEI (HV emulation interrupt, e40) */
637 li r3,-1
638BEGIN_FTR_SECTION
639 cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
640 bne 11f
641 mfspr r3,SPRN_HEIR
642END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
64311: stw r3,VCPU_LAST_INST(r9)
644
645 /* Save more register state */ 660 /* Save more register state */
646 mfxer r5
647 mfdar r6 661 mfdar r6
648 mfdsisr r7 662 mfdsisr r7
649 mfctr r8
650
651 stw r5, VCPU_XER(r9)
652 std r6, VCPU_DAR(r9) 663 std r6, VCPU_DAR(r9)
653 stw r7, VCPU_DSISR(r9) 664 stw r7, VCPU_DSISR(r9)
654 std r8, VCPU_CTR(r9)
655 /* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */
656BEGIN_FTR_SECTION 665BEGIN_FTR_SECTION
666 /* don't overwrite fault_dar/fault_dsisr if HDSI */
657 cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE 667 cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
658 beq 6f 668 beq 6f
659END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 669END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
6607: std r6, VCPU_FAULT_DAR(r9) 670 std r6, VCPU_FAULT_DAR(r9)
661 stw r7, VCPU_FAULT_DSISR(r9) 671 stw r7, VCPU_FAULT_DSISR(r9)
662 672
663 /* Save guest CTRL register, set runlatch to 1 */ 673 /* Save guest CTRL register, set runlatch to 1 */
664 mfspr r6,SPRN_CTRLF 6746: mfspr r6,SPRN_CTRLF
665 stw r6,VCPU_CTRL(r9) 675 stw r6,VCPU_CTRL(r9)
666 andi. r0,r6,1 676 andi. r0,r6,1
667 bne 4f 677 bne 4f
@@ -1094,9 +1104,131 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
1094 mtspr SPRN_HSRR1, r7 1104 mtspr SPRN_HSRR1, r7
1095 ba 0x500 1105 ba 0x500
1096 1106
10976: mfspr r6,SPRN_HDAR 1107/*
1098 mfspr r7,SPRN_HDSISR 1108 * Check whether an HDSI is an HPTE not found fault or something else.
1099 b 7b 1109 * If it is an HPTE not found fault that is due to the guest accessing
1110 * a page that they have mapped but which we have paged out, then
1111 * we continue on with the guest exit path. In all other cases,
1112 * reflect the HDSI to the guest as a DSI.
1113 */
1114kvmppc_hdsi:
1115 mfspr r4, SPRN_HDAR
1116 mfspr r6, SPRN_HDSISR
1117 /* HPTE not found fault or protection fault? */
1118 andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h
1119 beq 1f /* if not, send it to the guest */
1120 andi. r0, r11, MSR_DR /* data relocation enabled? */
1121 beq 3f
1122 clrrdi r0, r4, 28
1123 PPC_SLBFEE_DOT(r5, r0) /* if so, look up SLB */
1124 bne 1f /* if no SLB entry found */
11254: std r4, VCPU_FAULT_DAR(r9)
1126 stw r6, VCPU_FAULT_DSISR(r9)
1127
1128 /* Search the hash table. */
1129 mr r3, r9 /* vcpu pointer */
1130 li r7, 1 /* data fault */
1131 bl .kvmppc_hpte_hv_fault
1132 ld r9, HSTATE_KVM_VCPU(r13)
1133 ld r10, VCPU_PC(r9)
1134 ld r11, VCPU_MSR(r9)
1135 li r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
1136 cmpdi r3, 0 /* retry the instruction */
1137 beq 6f
1138 cmpdi r3, -1 /* handle in kernel mode */
1139 beq nohpte_cont
1140 cmpdi r3, -2 /* MMIO emulation; need instr word */
1141 beq 2f
1142
1143 /* Synthesize a DSI for the guest */
1144 ld r4, VCPU_FAULT_DAR(r9)
1145 mr r6, r3
11461: mtspr SPRN_DAR, r4
1147 mtspr SPRN_DSISR, r6
1148 mtspr SPRN_SRR0, r10
1149 mtspr SPRN_SRR1, r11
1150 li r10, BOOK3S_INTERRUPT_DATA_STORAGE
1151 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
1152 rotldi r11, r11, 63
11536: ld r7, VCPU_CTR(r9)
1154 lwz r8, VCPU_XER(r9)
1155 mtctr r7
1156 mtxer r8
1157 mr r4, r9
1158 b fast_guest_return
1159
11603: ld r5, VCPU_KVM(r9) /* not relocated, use VRMA */
1161 ld r5, KVM_VRMA_SLB_V(r5)
1162 b 4b
1163
1164 /* If this is for emulated MMIO, load the instruction word */
11652: li r8, KVM_INST_FETCH_FAILED /* In case lwz faults */
1166
1167 /* Set guest mode to 'jump over instruction' so if lwz faults
1168 * we'll just continue at the next IP. */
1169 li r0, KVM_GUEST_MODE_SKIP
1170 stb r0, HSTATE_IN_GUEST(r13)
1171
1172 /* Do the access with MSR:DR enabled */
1173 mfmsr r3
1174 ori r4, r3, MSR_DR /* Enable paging for data */
1175 mtmsrd r4
1176 lwz r8, 0(r10)
1177 mtmsrd r3
1178
1179 /* Store the result */
1180 stw r8, VCPU_LAST_INST(r9)
1181
1182 /* Unset guest mode. */
1183 li r0, KVM_GUEST_MODE_NONE
1184 stb r0, HSTATE_IN_GUEST(r13)
1185 b nohpte_cont
1186
1187/*
1188 * Similarly for an HISI, reflect it to the guest as an ISI unless
1189 * it is an HPTE not found fault for a page that we have paged out.
1190 */
1191kvmppc_hisi:
1192 andis. r0, r11, SRR1_ISI_NOPT@h
1193 beq 1f
1194 andi. r0, r11, MSR_IR /* instruction relocation enabled? */
1195 beq 3f
1196 clrrdi r0, r10, 28
1197 PPC_SLBFEE_DOT(r5, r0) /* if so, look up SLB */
1198 bne 1f /* if no SLB entry found */
11994:
1200 /* Search the hash table. */
1201 mr r3, r9 /* vcpu pointer */
1202 mr r4, r10
1203 mr r6, r11
1204 li r7, 0 /* instruction fault */
1205 bl .kvmppc_hpte_hv_fault
1206 ld r9, HSTATE_KVM_VCPU(r13)
1207 ld r10, VCPU_PC(r9)
1208 ld r11, VCPU_MSR(r9)
1209 li r12, BOOK3S_INTERRUPT_H_INST_STORAGE
1210 cmpdi r3, 0 /* retry the instruction */
1211 beq 6f
1212 cmpdi r3, -1 /* handle in kernel mode */
1213 beq nohpte_cont
1214
1215 /* Synthesize an ISI for the guest */
1216 mr r11, r3
12171: mtspr SPRN_SRR0, r10
1218 mtspr SPRN_SRR1, r11
1219 li r10, BOOK3S_INTERRUPT_INST_STORAGE
1220 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
1221 rotldi r11, r11, 63
12226: ld r7, VCPU_CTR(r9)
1223 lwz r8, VCPU_XER(r9)
1224 mtctr r7
1225 mtxer r8
1226 mr r4, r9
1227 b fast_guest_return
1228
12293: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */
1230 ld r5, KVM_VRMA_SLB_V(r6)
1231 b 4b
1100 1232
1101/* 1233/*
1102 * Try to handle an hcall in real mode. 1234 * Try to handle an hcall in real mode.
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
index 7b0ee96c1bed..e70ef2d86431 100644
--- a/arch/powerpc/kvm/book3s_paired_singles.c
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -196,7 +196,8 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
196 kvmppc_inject_pf(vcpu, addr, false); 196 kvmppc_inject_pf(vcpu, addr, false);
197 goto done_load; 197 goto done_load;
198 } else if (r == EMULATE_DO_MMIO) { 198 } else if (r == EMULATE_DO_MMIO) {
199 emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, len, 1); 199 emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs,
200 len, 1);
200 goto done_load; 201 goto done_load;
201 } 202 }
202 203
@@ -286,11 +287,13 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
286 kvmppc_inject_pf(vcpu, addr, false); 287 kvmppc_inject_pf(vcpu, addr, false);
287 goto done_load; 288 goto done_load;
288 } else if ((r == EMULATE_DO_MMIO) && w) { 289 } else if ((r == EMULATE_DO_MMIO) && w) {
289 emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, 4, 1); 290 emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs,
291 4, 1);
290 vcpu->arch.qpr[rs] = tmp[1]; 292 vcpu->arch.qpr[rs] = tmp[1];
291 goto done_load; 293 goto done_load;
292 } else if (r == EMULATE_DO_MMIO) { 294 } else if (r == EMULATE_DO_MMIO) {
293 emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FQPR | rs, 8, 1); 295 emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FQPR | rs,
296 8, 1);
294 goto done_load; 297 goto done_load;
295 } 298 }
296 299
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 220fcdf26978..7340e1090b77 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -51,15 +51,19 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
51#define MSR_USER32 MSR_USER 51#define MSR_USER32 MSR_USER
52#define MSR_USER64 MSR_USER 52#define MSR_USER64 MSR_USER
53#define HW_PAGE_SIZE PAGE_SIZE 53#define HW_PAGE_SIZE PAGE_SIZE
54#define __hard_irq_disable local_irq_disable
55#define __hard_irq_enable local_irq_enable
54#endif 56#endif
55 57
56void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 58void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
57{ 59{
58#ifdef CONFIG_PPC_BOOK3S_64 60#ifdef CONFIG_PPC_BOOK3S_64
59 memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb)); 61 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
62 memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb));
60 memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu, 63 memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
61 sizeof(get_paca()->shadow_vcpu)); 64 sizeof(get_paca()->shadow_vcpu));
62 to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max; 65 svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max;
66 svcpu_put(svcpu);
63#endif 67#endif
64 68
65#ifdef CONFIG_PPC_BOOK3S_32 69#ifdef CONFIG_PPC_BOOK3S_32
@@ -70,10 +74,12 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
70void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) 74void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
71{ 75{
72#ifdef CONFIG_PPC_BOOK3S_64 76#ifdef CONFIG_PPC_BOOK3S_64
73 memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb)); 77 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
78 memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb));
74 memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, 79 memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
75 sizeof(get_paca()->shadow_vcpu)); 80 sizeof(get_paca()->shadow_vcpu));
76 to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max; 81 to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max;
82 svcpu_put(svcpu);
77#endif 83#endif
78 84
79 kvmppc_giveup_ext(vcpu, MSR_FP); 85 kvmppc_giveup_ext(vcpu, MSR_FP);
@@ -151,14 +157,16 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
151#ifdef CONFIG_PPC_BOOK3S_64 157#ifdef CONFIG_PPC_BOOK3S_64
152 if ((pvr >= 0x330000) && (pvr < 0x70330000)) { 158 if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
153 kvmppc_mmu_book3s_64_init(vcpu); 159 kvmppc_mmu_book3s_64_init(vcpu);
154 to_book3s(vcpu)->hior = 0xfff00000; 160 if (!to_book3s(vcpu)->hior_explicit)
161 to_book3s(vcpu)->hior = 0xfff00000;
155 to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; 162 to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
156 vcpu->arch.cpu_type = KVM_CPU_3S_64; 163 vcpu->arch.cpu_type = KVM_CPU_3S_64;
157 } else 164 } else
158#endif 165#endif
159 { 166 {
160 kvmppc_mmu_book3s_32_init(vcpu); 167 kvmppc_mmu_book3s_32_init(vcpu);
161 to_book3s(vcpu)->hior = 0; 168 if (!to_book3s(vcpu)->hior_explicit)
169 to_book3s(vcpu)->hior = 0;
162 to_book3s(vcpu)->msr_mask = 0xffffffffULL; 170 to_book3s(vcpu)->msr_mask = 0xffffffffULL;
163 vcpu->arch.cpu_type = KVM_CPU_3S_32; 171 vcpu->arch.cpu_type = KVM_CPU_3S_32;
164 } 172 }
@@ -308,19 +316,22 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
308 316
309 if (page_found == -ENOENT) { 317 if (page_found == -ENOENT) {
310 /* Page not found in guest PTE entries */ 318 /* Page not found in guest PTE entries */
319 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
311 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); 320 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
312 vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; 321 vcpu->arch.shared->dsisr = svcpu->fault_dsisr;
313 vcpu->arch.shared->msr |= 322 vcpu->arch.shared->msr |=
314 (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); 323 (svcpu->shadow_srr1 & 0x00000000f8000000ULL);
324 svcpu_put(svcpu);
315 kvmppc_book3s_queue_irqprio(vcpu, vec); 325 kvmppc_book3s_queue_irqprio(vcpu, vec);
316 } else if (page_found == -EPERM) { 326 } else if (page_found == -EPERM) {
317 /* Storage protection */ 327 /* Storage protection */
328 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
318 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); 329 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
319 vcpu->arch.shared->dsisr = 330 vcpu->arch.shared->dsisr = svcpu->fault_dsisr & ~DSISR_NOHPTE;
320 to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
321 vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; 331 vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
322 vcpu->arch.shared->msr |= 332 vcpu->arch.shared->msr |=
323 (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); 333 svcpu->shadow_srr1 & 0x00000000f8000000ULL;
334 svcpu_put(svcpu);
324 kvmppc_book3s_queue_irqprio(vcpu, vec); 335 kvmppc_book3s_queue_irqprio(vcpu, vec);
325 } else if (page_found == -EINVAL) { 336 } else if (page_found == -EINVAL) {
326 /* Page not found in guest SLB */ 337 /* Page not found in guest SLB */
@@ -517,24 +528,29 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
517 run->ready_for_interrupt_injection = 1; 528 run->ready_for_interrupt_injection = 1;
518 529
519 trace_kvm_book3s_exit(exit_nr, vcpu); 530 trace_kvm_book3s_exit(exit_nr, vcpu);
531 preempt_enable();
520 kvm_resched(vcpu); 532 kvm_resched(vcpu);
521 switch (exit_nr) { 533 switch (exit_nr) {
522 case BOOK3S_INTERRUPT_INST_STORAGE: 534 case BOOK3S_INTERRUPT_INST_STORAGE:
535 {
536 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
537 ulong shadow_srr1 = svcpu->shadow_srr1;
523 vcpu->stat.pf_instruc++; 538 vcpu->stat.pf_instruc++;
524 539
525#ifdef CONFIG_PPC_BOOK3S_32 540#ifdef CONFIG_PPC_BOOK3S_32
526 /* We set segments as unused segments when invalidating them. So 541 /* We set segments as unused segments when invalidating them. So
527 * treat the respective fault as segment fault. */ 542 * treat the respective fault as segment fault. */
528 if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] 543 if (svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] == SR_INVALID) {
529 == SR_INVALID) {
530 kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); 544 kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
531 r = RESUME_GUEST; 545 r = RESUME_GUEST;
546 svcpu_put(svcpu);
532 break; 547 break;
533 } 548 }
534#endif 549#endif
550 svcpu_put(svcpu);
535 551
536 /* only care about PTEG not found errors, but leave NX alone */ 552 /* only care about PTEG not found errors, but leave NX alone */
537 if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) { 553 if (shadow_srr1 & 0x40000000) {
538 r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); 554 r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
539 vcpu->stat.sp_instruc++; 555 vcpu->stat.sp_instruc++;
540 } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && 556 } else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
@@ -547,33 +563,37 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
547 kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); 563 kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
548 r = RESUME_GUEST; 564 r = RESUME_GUEST;
549 } else { 565 } else {
550 vcpu->arch.shared->msr |= 566 vcpu->arch.shared->msr |= shadow_srr1 & 0x58000000;
551 to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
552 kvmppc_book3s_queue_irqprio(vcpu, exit_nr); 567 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
553 r = RESUME_GUEST; 568 r = RESUME_GUEST;
554 } 569 }
555 break; 570 break;
571 }
556 case BOOK3S_INTERRUPT_DATA_STORAGE: 572 case BOOK3S_INTERRUPT_DATA_STORAGE:
557 { 573 {
558 ulong dar = kvmppc_get_fault_dar(vcpu); 574 ulong dar = kvmppc_get_fault_dar(vcpu);
575 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
576 u32 fault_dsisr = svcpu->fault_dsisr;
559 vcpu->stat.pf_storage++; 577 vcpu->stat.pf_storage++;
560 578
561#ifdef CONFIG_PPC_BOOK3S_32 579#ifdef CONFIG_PPC_BOOK3S_32
562 /* We set segments as unused segments when invalidating them. So 580 /* We set segments as unused segments when invalidating them. So
563 * treat the respective fault as segment fault. */ 581 * treat the respective fault as segment fault. */
564 if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) { 582 if ((svcpu->sr[dar >> SID_SHIFT]) == SR_INVALID) {
565 kvmppc_mmu_map_segment(vcpu, dar); 583 kvmppc_mmu_map_segment(vcpu, dar);
566 r = RESUME_GUEST; 584 r = RESUME_GUEST;
585 svcpu_put(svcpu);
567 break; 586 break;
568 } 587 }
569#endif 588#endif
589 svcpu_put(svcpu);
570 590
571 /* The only case we need to handle is missing shadow PTEs */ 591 /* The only case we need to handle is missing shadow PTEs */
572 if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) { 592 if (fault_dsisr & DSISR_NOHPTE) {
573 r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); 593 r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
574 } else { 594 } else {
575 vcpu->arch.shared->dar = dar; 595 vcpu->arch.shared->dar = dar;
576 vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; 596 vcpu->arch.shared->dsisr = fault_dsisr;
577 kvmppc_book3s_queue_irqprio(vcpu, exit_nr); 597 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
578 r = RESUME_GUEST; 598 r = RESUME_GUEST;
579 } 599 }
@@ -609,10 +629,13 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
609 case BOOK3S_INTERRUPT_PROGRAM: 629 case BOOK3S_INTERRUPT_PROGRAM:
610 { 630 {
611 enum emulation_result er; 631 enum emulation_result er;
632 struct kvmppc_book3s_shadow_vcpu *svcpu;
612 ulong flags; 633 ulong flags;
613 634
614program_interrupt: 635program_interrupt:
615 flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull; 636 svcpu = svcpu_get(vcpu);
637 flags = svcpu->shadow_srr1 & 0x1f0000ull;
638 svcpu_put(svcpu);
616 639
617 if (vcpu->arch.shared->msr & MSR_PR) { 640 if (vcpu->arch.shared->msr & MSR_PR) {
618#ifdef EXIT_DEBUG 641#ifdef EXIT_DEBUG
@@ -740,20 +763,33 @@ program_interrupt:
740 r = RESUME_GUEST; 763 r = RESUME_GUEST;
741 break; 764 break;
742 default: 765 default:
766 {
767 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
768 ulong shadow_srr1 = svcpu->shadow_srr1;
769 svcpu_put(svcpu);
743 /* Ugh - bork here! What did we get? */ 770 /* Ugh - bork here! What did we get? */
744 printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", 771 printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
745 exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1); 772 exit_nr, kvmppc_get_pc(vcpu), shadow_srr1);
746 r = RESUME_HOST; 773 r = RESUME_HOST;
747 BUG(); 774 BUG();
748 break; 775 break;
749 } 776 }
750 777 }
751 778
752 if (!(r & RESUME_HOST)) { 779 if (!(r & RESUME_HOST)) {
753 /* To avoid clobbering exit_reason, only check for signals if 780 /* To avoid clobbering exit_reason, only check for signals if
754 * we aren't already exiting to userspace for some other 781 * we aren't already exiting to userspace for some other
755 * reason. */ 782 * reason. */
783
784 /*
785 * Interrupts could be timers for the guest which we have to
786 * inject again, so let's postpone them until we're in the guest
787 * and if we really did time things so badly, then we just exit
788 * again due to a host external interrupt.
789 */
790 __hard_irq_disable();
756 if (signal_pending(current)) { 791 if (signal_pending(current)) {
792 __hard_irq_enable();
757#ifdef EXIT_DEBUG 793#ifdef EXIT_DEBUG
758 printk(KERN_EMERG "KVM: Going back to host\n"); 794 printk(KERN_EMERG "KVM: Going back to host\n");
759#endif 795#endif
@@ -761,10 +797,12 @@ program_interrupt:
761 run->exit_reason = KVM_EXIT_INTR; 797 run->exit_reason = KVM_EXIT_INTR;
762 r = -EINTR; 798 r = -EINTR;
763 } else { 799 } else {
800 preempt_disable();
801
764 /* In case an interrupt came in that was triggered 802 /* In case an interrupt came in that was triggered
765 * from userspace (like DEC), we need to check what 803 * from userspace (like DEC), we need to check what
766 * to inject now! */ 804 * to inject now! */
767 kvmppc_core_deliver_interrupts(vcpu); 805 kvmppc_core_prepare_to_enter(vcpu);
768 } 806 }
769 } 807 }
770 808
@@ -836,6 +874,38 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
836 return 0; 874 return 0;
837} 875}
838 876
877int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
878{
879 int r = -EINVAL;
880
881 switch (reg->id) {
882 case KVM_REG_PPC_HIOR:
883 r = put_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr);
884 break;
885 default:
886 break;
887 }
888
889 return r;
890}
891
892int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
893{
894 int r = -EINVAL;
895
896 switch (reg->id) {
897 case KVM_REG_PPC_HIOR:
898 r = get_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr);
899 if (!r)
900 to_book3s(vcpu)->hior_explicit = true;
901 break;
902 default:
903 break;
904 }
905
906 return r;
907}
908
839int kvmppc_core_check_processor_compat(void) 909int kvmppc_core_check_processor_compat(void)
840{ 910{
841 return 0; 911 return 0;
@@ -923,16 +993,31 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
923#endif 993#endif
924 ulong ext_msr; 994 ulong ext_msr;
925 995
996 preempt_disable();
997
926 /* Check if we can run the vcpu at all */ 998 /* Check if we can run the vcpu at all */
927 if (!vcpu->arch.sane) { 999 if (!vcpu->arch.sane) {
928 kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 1000 kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
929 return -EINVAL; 1001 ret = -EINVAL;
1002 goto out;
930 } 1003 }
931 1004
1005 kvmppc_core_prepare_to_enter(vcpu);
1006
1007 /*
1008 * Interrupts could be timers for the guest which we have to inject
1009 * again, so let's postpone them until we're in the guest and if we
1010 * really did time things so badly, then we just exit again due to
1011 * a host external interrupt.
1012 */
1013 __hard_irq_disable();
1014
932 /* No need to go into the guest when all we do is going out */ 1015 /* No need to go into the guest when all we do is going out */
933 if (signal_pending(current)) { 1016 if (signal_pending(current)) {
1017 __hard_irq_enable();
934 kvm_run->exit_reason = KVM_EXIT_INTR; 1018 kvm_run->exit_reason = KVM_EXIT_INTR;
935 return -EINTR; 1019 ret = -EINTR;
1020 goto out;
936 } 1021 }
937 1022
938 /* Save FPU state in stack */ 1023 /* Save FPU state in stack */
@@ -974,8 +1059,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
974 1059
975 kvm_guest_exit(); 1060 kvm_guest_exit();
976 1061
977 local_irq_disable();
978
979 current->thread.regs->msr = ext_msr; 1062 current->thread.regs->msr = ext_msr;
980 1063
981 /* Make sure we save the guest FPU/Altivec/VSX state */ 1064 /* Make sure we save the guest FPU/Altivec/VSX state */
@@ -1002,9 +1085,50 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1002 current->thread.used_vsr = used_vsr; 1085 current->thread.used_vsr = used_vsr;
1003#endif 1086#endif
1004 1087
1088out:
1089 preempt_enable();
1005 return ret; 1090 return ret;
1006} 1091}
1007 1092
1093/*
1094 * Get (and clear) the dirty memory log for a memory slot.
1095 */
1096int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1097 struct kvm_dirty_log *log)
1098{
1099 struct kvm_memory_slot *memslot;
1100 struct kvm_vcpu *vcpu;
1101 ulong ga, ga_end;
1102 int is_dirty = 0;
1103 int r;
1104 unsigned long n;
1105
1106 mutex_lock(&kvm->slots_lock);
1107
1108 r = kvm_get_dirty_log(kvm, log, &is_dirty);
1109 if (r)
1110 goto out;
1111
1112 /* If nothing is dirty, don't bother messing with page tables. */
1113 if (is_dirty) {
1114 memslot = id_to_memslot(kvm->memslots, log->slot);
1115
1116 ga = memslot->base_gfn << PAGE_SHIFT;
1117 ga_end = ga + (memslot->npages << PAGE_SHIFT);
1118
1119 kvm_for_each_vcpu(n, vcpu, kvm)
1120 kvmppc_mmu_pte_pflush(vcpu, ga, ga_end);
1121
1122 n = kvm_dirty_bitmap_bytes(memslot);
1123 memset(memslot->dirty_bitmap, 0, n);
1124 }
1125
1126 r = 0;
1127out:
1128 mutex_unlock(&kvm->slots_lock);
1129 return r;
1130}
1131
1008int kvmppc_core_prepare_memory_region(struct kvm *kvm, 1132int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1009 struct kvm_userspace_memory_region *mem) 1133 struct kvm_userspace_memory_region *mem)
1010{ 1134{
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index bb6c988f010a..ee9e1ee9c858 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -124,12 +124,6 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
124 vcpu->arch.shared->msr = new_msr; 124 vcpu->arch.shared->msr = new_msr;
125 125
126 kvmppc_mmu_msr_notify(vcpu, old_msr); 126 kvmppc_mmu_msr_notify(vcpu, old_msr);
127
128 if (vcpu->arch.shared->msr & MSR_WE) {
129 kvm_vcpu_block(vcpu);
130 kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
131 };
132
133 kvmppc_vcpu_sync_spe(vcpu); 127 kvmppc_vcpu_sync_spe(vcpu);
134} 128}
135 129
@@ -258,9 +252,11 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
258 allowed = vcpu->arch.shared->msr & MSR_ME; 252 allowed = vcpu->arch.shared->msr & MSR_ME;
259 msr_mask = 0; 253 msr_mask = 0;
260 break; 254 break;
261 case BOOKE_IRQPRIO_EXTERNAL:
262 case BOOKE_IRQPRIO_DECREMENTER: 255 case BOOKE_IRQPRIO_DECREMENTER:
263 case BOOKE_IRQPRIO_FIT: 256 case BOOKE_IRQPRIO_FIT:
257 keep_irq = true;
258 /* fall through */
259 case BOOKE_IRQPRIO_EXTERNAL:
264 allowed = vcpu->arch.shared->msr & MSR_EE; 260 allowed = vcpu->arch.shared->msr & MSR_EE;
265 allowed = allowed && !crit; 261 allowed = allowed && !crit;
266 msr_mask = MSR_CE|MSR_ME|MSR_DE; 262 msr_mask = MSR_CE|MSR_ME|MSR_DE;
@@ -276,7 +272,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
276 vcpu->arch.shared->srr1 = vcpu->arch.shared->msr; 272 vcpu->arch.shared->srr1 = vcpu->arch.shared->msr;
277 vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; 273 vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
278 if (update_esr == true) 274 if (update_esr == true)
279 vcpu->arch.esr = vcpu->arch.queued_esr; 275 vcpu->arch.shared->esr = vcpu->arch.queued_esr;
280 if (update_dear == true) 276 if (update_dear == true)
281 vcpu->arch.shared->dar = vcpu->arch.queued_dear; 277 vcpu->arch.shared->dar = vcpu->arch.queued_dear;
282 kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); 278 kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask);
@@ -288,13 +284,26 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
288 return allowed; 284 return allowed;
289} 285}
290 286
291/* Check pending exceptions and deliver one, if possible. */ 287static void update_timer_ints(struct kvm_vcpu *vcpu)
292void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) 288{
289 if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
290 kvmppc_core_queue_dec(vcpu);
291 else
292 kvmppc_core_dequeue_dec(vcpu);
293}
294
295static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
293{ 296{
294 unsigned long *pending = &vcpu->arch.pending_exceptions; 297 unsigned long *pending = &vcpu->arch.pending_exceptions;
295 unsigned long old_pending = vcpu->arch.pending_exceptions;
296 unsigned int priority; 298 unsigned int priority;
297 299
300 if (vcpu->requests) {
301 if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) {
302 smp_mb();
303 update_timer_ints(vcpu);
304 }
305 }
306
298 priority = __ffs(*pending); 307 priority = __ffs(*pending);
299 while (priority <= BOOKE_IRQPRIO_MAX) { 308 while (priority <= BOOKE_IRQPRIO_MAX) {
300 if (kvmppc_booke_irqprio_deliver(vcpu, priority)) 309 if (kvmppc_booke_irqprio_deliver(vcpu, priority))
@@ -306,10 +315,24 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
306 } 315 }
307 316
308 /* Tell the guest about our interrupt status */ 317 /* Tell the guest about our interrupt status */
309 if (*pending) 318 vcpu->arch.shared->int_pending = !!*pending;
310 vcpu->arch.shared->int_pending = 1; 319}
311 else if (old_pending) 320
312 vcpu->arch.shared->int_pending = 0; 321/* Check pending exceptions and deliver one, if possible. */
322void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
323{
324 WARN_ON_ONCE(!irqs_disabled());
325
326 kvmppc_core_check_exceptions(vcpu);
327
328 if (vcpu->arch.shared->msr & MSR_WE) {
329 local_irq_enable();
330 kvm_vcpu_block(vcpu);
331 local_irq_disable();
332
333 kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
334 kvmppc_core_check_exceptions(vcpu);
335 };
313} 336}
314 337
315int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 338int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
@@ -322,11 +345,21 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
322 } 345 }
323 346
324 local_irq_disable(); 347 local_irq_disable();
348
349 kvmppc_core_prepare_to_enter(vcpu);
350
351 if (signal_pending(current)) {
352 kvm_run->exit_reason = KVM_EXIT_INTR;
353 ret = -EINTR;
354 goto out;
355 }
356
325 kvm_guest_enter(); 357 kvm_guest_enter();
326 ret = __kvmppc_vcpu_run(kvm_run, vcpu); 358 ret = __kvmppc_vcpu_run(kvm_run, vcpu);
327 kvm_guest_exit(); 359 kvm_guest_exit();
328 local_irq_enable();
329 360
361out:
362 local_irq_enable();
330 return ret; 363 return ret;
331} 364}
332 365
@@ -603,7 +636,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
603 636
604 local_irq_disable(); 637 local_irq_disable();
605 638
606 kvmppc_core_deliver_interrupts(vcpu); 639 kvmppc_core_prepare_to_enter(vcpu);
607 640
608 if (!(r & RESUME_HOST)) { 641 if (!(r & RESUME_HOST)) {
609 /* To avoid clobbering exit_reason, only check for signals if 642 /* To avoid clobbering exit_reason, only check for signals if
@@ -628,6 +661,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
628 vcpu->arch.pc = 0; 661 vcpu->arch.pc = 0;
629 vcpu->arch.shared->msr = 0; 662 vcpu->arch.shared->msr = 0;
630 vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; 663 vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS;
664 vcpu->arch.shared->pir = vcpu->vcpu_id;
631 kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ 665 kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
632 666
633 vcpu->arch.shadow_pid = 1; 667 vcpu->arch.shadow_pid = 1;
@@ -662,10 +696,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
662 regs->sprg1 = vcpu->arch.shared->sprg1; 696 regs->sprg1 = vcpu->arch.shared->sprg1;
663 regs->sprg2 = vcpu->arch.shared->sprg2; 697 regs->sprg2 = vcpu->arch.shared->sprg2;
664 regs->sprg3 = vcpu->arch.shared->sprg3; 698 regs->sprg3 = vcpu->arch.shared->sprg3;
665 regs->sprg4 = vcpu->arch.sprg4; 699 regs->sprg4 = vcpu->arch.shared->sprg4;
666 regs->sprg5 = vcpu->arch.sprg5; 700 regs->sprg5 = vcpu->arch.shared->sprg5;
667 regs->sprg6 = vcpu->arch.sprg6; 701 regs->sprg6 = vcpu->arch.shared->sprg6;
668 regs->sprg7 = vcpu->arch.sprg7; 702 regs->sprg7 = vcpu->arch.shared->sprg7;
669 703
670 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 704 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
671 regs->gpr[i] = kvmppc_get_gpr(vcpu, i); 705 regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
@@ -690,10 +724,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
690 vcpu->arch.shared->sprg1 = regs->sprg1; 724 vcpu->arch.shared->sprg1 = regs->sprg1;
691 vcpu->arch.shared->sprg2 = regs->sprg2; 725 vcpu->arch.shared->sprg2 = regs->sprg2;
692 vcpu->arch.shared->sprg3 = regs->sprg3; 726 vcpu->arch.shared->sprg3 = regs->sprg3;
693 vcpu->arch.sprg4 = regs->sprg4; 727 vcpu->arch.shared->sprg4 = regs->sprg4;
694 vcpu->arch.sprg5 = regs->sprg5; 728 vcpu->arch.shared->sprg5 = regs->sprg5;
695 vcpu->arch.sprg6 = regs->sprg6; 729 vcpu->arch.shared->sprg6 = regs->sprg6;
696 vcpu->arch.sprg7 = regs->sprg7; 730 vcpu->arch.shared->sprg7 = regs->sprg7;
697 731
698 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 732 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
699 kvmppc_set_gpr(vcpu, i, regs->gpr[i]); 733 kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
@@ -711,7 +745,7 @@ static void get_sregs_base(struct kvm_vcpu *vcpu,
711 sregs->u.e.csrr0 = vcpu->arch.csrr0; 745 sregs->u.e.csrr0 = vcpu->arch.csrr0;
712 sregs->u.e.csrr1 = vcpu->arch.csrr1; 746 sregs->u.e.csrr1 = vcpu->arch.csrr1;
713 sregs->u.e.mcsr = vcpu->arch.mcsr; 747 sregs->u.e.mcsr = vcpu->arch.mcsr;
714 sregs->u.e.esr = vcpu->arch.esr; 748 sregs->u.e.esr = vcpu->arch.shared->esr;
715 sregs->u.e.dear = vcpu->arch.shared->dar; 749 sregs->u.e.dear = vcpu->arch.shared->dar;
716 sregs->u.e.tsr = vcpu->arch.tsr; 750 sregs->u.e.tsr = vcpu->arch.tsr;
717 sregs->u.e.tcr = vcpu->arch.tcr; 751 sregs->u.e.tcr = vcpu->arch.tcr;
@@ -729,28 +763,19 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
729 vcpu->arch.csrr0 = sregs->u.e.csrr0; 763 vcpu->arch.csrr0 = sregs->u.e.csrr0;
730 vcpu->arch.csrr1 = sregs->u.e.csrr1; 764 vcpu->arch.csrr1 = sregs->u.e.csrr1;
731 vcpu->arch.mcsr = sregs->u.e.mcsr; 765 vcpu->arch.mcsr = sregs->u.e.mcsr;
732 vcpu->arch.esr = sregs->u.e.esr; 766 vcpu->arch.shared->esr = sregs->u.e.esr;
733 vcpu->arch.shared->dar = sregs->u.e.dear; 767 vcpu->arch.shared->dar = sregs->u.e.dear;
734 vcpu->arch.vrsave = sregs->u.e.vrsave; 768 vcpu->arch.vrsave = sregs->u.e.vrsave;
735 vcpu->arch.tcr = sregs->u.e.tcr; 769 kvmppc_set_tcr(vcpu, sregs->u.e.tcr);
736 770
737 if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) 771 if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) {
738 vcpu->arch.dec = sregs->u.e.dec; 772 vcpu->arch.dec = sregs->u.e.dec;
739 773 kvmppc_emulate_dec(vcpu);
740 kvmppc_emulate_dec(vcpu); 774 }
741 775
742 if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { 776 if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
743 /* 777 vcpu->arch.tsr = sregs->u.e.tsr;
744 * FIXME: existing KVM timer handling is incomplete. 778 update_timer_ints(vcpu);
745 * TSR cannot be read by the guest, and its value in
746 * vcpu->arch is always zero. For now, just handle
747 * the case where the caller is trying to inject a
748 * decrementer interrupt.
749 */
750
751 if ((sregs->u.e.tsr & TSR_DIS) &&
752 (vcpu->arch.tcr & TCR_DIE))
753 kvmppc_core_queue_dec(vcpu);
754 } 779 }
755 780
756 return 0; 781 return 0;
@@ -761,7 +786,7 @@ static void get_sregs_arch206(struct kvm_vcpu *vcpu,
761{ 786{
762 sregs->u.e.features |= KVM_SREGS_E_ARCH206; 787 sregs->u.e.features |= KVM_SREGS_E_ARCH206;
763 788
764 sregs->u.e.pir = 0; 789 sregs->u.e.pir = vcpu->vcpu_id;
765 sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0; 790 sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0;
766 sregs->u.e.mcsrr1 = vcpu->arch.mcsrr1; 791 sregs->u.e.mcsrr1 = vcpu->arch.mcsrr1;
767 sregs->u.e.decar = vcpu->arch.decar; 792 sregs->u.e.decar = vcpu->arch.decar;
@@ -774,7 +799,7 @@ static int set_sregs_arch206(struct kvm_vcpu *vcpu,
774 if (!(sregs->u.e.features & KVM_SREGS_E_ARCH206)) 799 if (!(sregs->u.e.features & KVM_SREGS_E_ARCH206))
775 return 0; 800 return 0;
776 801
777 if (sregs->u.e.pir != 0) 802 if (sregs->u.e.pir != vcpu->vcpu_id)
778 return -EINVAL; 803 return -EINVAL;
779 804
780 vcpu->arch.mcsrr0 = sregs->u.e.mcsrr0; 805 vcpu->arch.mcsrr0 = sregs->u.e.mcsrr0;
@@ -862,6 +887,16 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
862 return kvmppc_core_set_sregs(vcpu, sregs); 887 return kvmppc_core_set_sregs(vcpu, sregs);
863} 888}
864 889
890int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
891{
892 return -EINVAL;
893}
894
895int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
896{
897 return -EINVAL;
898}
899
865int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 900int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
866{ 901{
867 return -ENOTSUPP; 902 return -ENOTSUPP;
@@ -906,6 +941,33 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
906{ 941{
907} 942}
908 943
944void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
945{
946 vcpu->arch.tcr = new_tcr;
947 update_timer_ints(vcpu);
948}
949
950void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
951{
952 set_bits(tsr_bits, &vcpu->arch.tsr);
953 smp_wmb();
954 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
955 kvm_vcpu_kick(vcpu);
956}
957
958void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
959{
960 clear_bits(tsr_bits, &vcpu->arch.tsr);
961 update_timer_ints(vcpu);
962}
963
964void kvmppc_decrementer_func(unsigned long data)
965{
966 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
967
968 kvmppc_set_tsr_bits(vcpu, TSR_DIS);
969}
970
909int __init kvmppc_booke_init(void) 971int __init kvmppc_booke_init(void)
910{ 972{
911 unsigned long ivor[16]; 973 unsigned long ivor[16];
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index 8e1fe33d64e5..2fe202705a3f 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -55,6 +55,10 @@ extern unsigned long kvmppc_booke_handlers;
55void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); 55void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
56void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); 56void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);
57 57
58void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr);
59void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
60void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
61
58int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, 62int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
59 unsigned int inst, int *advance); 63 unsigned int inst, int *advance);
60int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); 64int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt);
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 1260f5f24c0c..3e652da36534 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -13,6 +13,7 @@
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 * 14 *
15 * Copyright IBM Corp. 2008 15 * Copyright IBM Corp. 2008
16 * Copyright 2011 Freescale Semiconductor, Inc.
16 * 17 *
17 * Authors: Hollis Blanchard <hollisb@us.ibm.com> 18 * Authors: Hollis Blanchard <hollisb@us.ibm.com>
18 */ 19 */
@@ -107,7 +108,7 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
107 case SPRN_DEAR: 108 case SPRN_DEAR:
108 vcpu->arch.shared->dar = spr_val; break; 109 vcpu->arch.shared->dar = spr_val; break;
109 case SPRN_ESR: 110 case SPRN_ESR:
110 vcpu->arch.esr = spr_val; break; 111 vcpu->arch.shared->esr = spr_val; break;
111 case SPRN_DBCR0: 112 case SPRN_DBCR0:
112 vcpu->arch.dbcr0 = spr_val; break; 113 vcpu->arch.dbcr0 = spr_val; break;
113 case SPRN_DBCR1: 114 case SPRN_DBCR1:
@@ -115,23 +116,23 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
115 case SPRN_DBSR: 116 case SPRN_DBSR:
116 vcpu->arch.dbsr &= ~spr_val; break; 117 vcpu->arch.dbsr &= ~spr_val; break;
117 case SPRN_TSR: 118 case SPRN_TSR:
118 vcpu->arch.tsr &= ~spr_val; break; 119 kvmppc_clr_tsr_bits(vcpu, spr_val);
120 break;
119 case SPRN_TCR: 121 case SPRN_TCR:
120 vcpu->arch.tcr = spr_val; 122 kvmppc_set_tcr(vcpu, spr_val);
121 kvmppc_emulate_dec(vcpu);
122 break; 123 break;
123 124
124 /* Note: SPRG4-7 are user-readable. These values are 125 /* Note: SPRG4-7 are user-readable. These values are
125 * loaded into the real SPRGs when resuming the 126 * loaded into the real SPRGs when resuming the
126 * guest. */ 127 * guest. */
127 case SPRN_SPRG4: 128 case SPRN_SPRG4:
128 vcpu->arch.sprg4 = spr_val; break; 129 vcpu->arch.shared->sprg4 = spr_val; break;
129 case SPRN_SPRG5: 130 case SPRN_SPRG5:
130 vcpu->arch.sprg5 = spr_val; break; 131 vcpu->arch.shared->sprg5 = spr_val; break;
131 case SPRN_SPRG6: 132 case SPRN_SPRG6:
132 vcpu->arch.sprg6 = spr_val; break; 133 vcpu->arch.shared->sprg6 = spr_val; break;
133 case SPRN_SPRG7: 134 case SPRN_SPRG7:
134 vcpu->arch.sprg7 = spr_val; break; 135 vcpu->arch.shared->sprg7 = spr_val; break;
135 136
136 case SPRN_IVPR: 137 case SPRN_IVPR:
137 vcpu->arch.ivpr = spr_val; 138 vcpu->arch.ivpr = spr_val;
@@ -202,13 +203,17 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
202 case SPRN_DEAR: 203 case SPRN_DEAR:
203 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break; 204 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break;
204 case SPRN_ESR: 205 case SPRN_ESR:
205 kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break; 206 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->esr); break;
206 case SPRN_DBCR0: 207 case SPRN_DBCR0:
207 kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break; 208 kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break;
208 case SPRN_DBCR1: 209 case SPRN_DBCR1:
209 kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break; 210 kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break;
210 case SPRN_DBSR: 211 case SPRN_DBSR:
211 kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break; 212 kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break;
213 case SPRN_TSR:
214 kvmppc_set_gpr(vcpu, rt, vcpu->arch.tsr); break;
215 case SPRN_TCR:
216 kvmppc_set_gpr(vcpu, rt, vcpu->arch.tcr); break;
212 217
213 case SPRN_IVOR0: 218 case SPRN_IVOR0:
214 kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]); 219 kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]);
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 42f2fb1f66e9..10d8ef602e5c 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -402,19 +402,25 @@ lightweight_exit:
402 /* Save vcpu pointer for the exception handlers. */ 402 /* Save vcpu pointer for the exception handlers. */
403 mtspr SPRN_SPRG_WVCPU, r4 403 mtspr SPRN_SPRG_WVCPU, r4
404 404
405 lwz r5, VCPU_SHARED(r4)
406
405 /* Can't switch the stack pointer until after IVPR is switched, 407 /* Can't switch the stack pointer until after IVPR is switched,
406 * because host interrupt handlers would get confused. */ 408 * because host interrupt handlers would get confused. */
407 lwz r1, VCPU_GPR(r1)(r4) 409 lwz r1, VCPU_GPR(r1)(r4)
408 410
409 /* Host interrupt handlers may have clobbered these guest-readable 411 /*
410 * SPRGs, so we need to reload them here with the guest's values. */ 412 * Host interrupt handlers may have clobbered these
411 lwz r3, VCPU_SPRG4(r4) 413 * guest-readable SPRGs, or the guest kernel may have
414 * written directly to the shared area, so we
415 * need to reload them here with the guest's values.
416 */
417 lwz r3, VCPU_SHARED_SPRG4(r5)
412 mtspr SPRN_SPRG4W, r3 418 mtspr SPRN_SPRG4W, r3
413 lwz r3, VCPU_SPRG5(r4) 419 lwz r3, VCPU_SHARED_SPRG5(r5)
414 mtspr SPRN_SPRG5W, r3 420 mtspr SPRN_SPRG5W, r3
415 lwz r3, VCPU_SPRG6(r4) 421 lwz r3, VCPU_SHARED_SPRG6(r5)
416 mtspr SPRN_SPRG6W, r3 422 mtspr SPRN_SPRG6W, r3
417 lwz r3, VCPU_SPRG7(r4) 423 lwz r3, VCPU_SHARED_SPRG7(r5)
418 mtspr SPRN_SPRG7W, r3 424 mtspr SPRN_SPRG7W, r3
419 425
420#ifdef CONFIG_KVM_EXIT_TIMING 426#ifdef CONFIG_KVM_EXIT_TIMING
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 8c0d45a6faf7..ddcd896fa2ff 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -71,9 +71,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
71 vcpu->arch.pvr = mfspr(SPRN_PVR); 71 vcpu->arch.pvr = mfspr(SPRN_PVR);
72 vcpu_e500->svr = mfspr(SPRN_SVR); 72 vcpu_e500->svr = mfspr(SPRN_SVR);
73 73
74 /* Since booke kvm only support one core, update all vcpus' PIR to 0 */
75 vcpu->vcpu_id = 0;
76
77 vcpu->arch.cpu_type = KVM_CPU_E500V2; 74 vcpu->arch.cpu_type = KVM_CPU_E500V2;
78 75
79 return 0; 76 return 0;
@@ -118,12 +115,12 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
118 sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0; 115 sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0;
119 sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar; 116 sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar;
120 117
121 sregs->u.e.mas0 = vcpu_e500->mas0; 118 sregs->u.e.mas0 = vcpu->arch.shared->mas0;
122 sregs->u.e.mas1 = vcpu_e500->mas1; 119 sregs->u.e.mas1 = vcpu->arch.shared->mas1;
123 sregs->u.e.mas2 = vcpu_e500->mas2; 120 sregs->u.e.mas2 = vcpu->arch.shared->mas2;
124 sregs->u.e.mas7_3 = ((u64)vcpu_e500->mas7 << 32) | vcpu_e500->mas3; 121 sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3;
125 sregs->u.e.mas4 = vcpu_e500->mas4; 122 sregs->u.e.mas4 = vcpu->arch.shared->mas4;
126 sregs->u.e.mas6 = vcpu_e500->mas6; 123 sregs->u.e.mas6 = vcpu->arch.shared->mas6;
127 124
128 sregs->u.e.mmucfg = mfspr(SPRN_MMUCFG); 125 sregs->u.e.mmucfg = mfspr(SPRN_MMUCFG);
129 sregs->u.e.tlbcfg[0] = vcpu_e500->tlb0cfg; 126 sregs->u.e.tlbcfg[0] = vcpu_e500->tlb0cfg;
@@ -151,13 +148,12 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
151 } 148 }
152 149
153 if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) { 150 if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) {
154 vcpu_e500->mas0 = sregs->u.e.mas0; 151 vcpu->arch.shared->mas0 = sregs->u.e.mas0;
155 vcpu_e500->mas1 = sregs->u.e.mas1; 152 vcpu->arch.shared->mas1 = sregs->u.e.mas1;
156 vcpu_e500->mas2 = sregs->u.e.mas2; 153 vcpu->arch.shared->mas2 = sregs->u.e.mas2;
157 vcpu_e500->mas7 = sregs->u.e.mas7_3 >> 32; 154 vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3;
158 vcpu_e500->mas3 = (u32)sregs->u.e.mas7_3; 155 vcpu->arch.shared->mas4 = sregs->u.e.mas4;
159 vcpu_e500->mas4 = sregs->u.e.mas4; 156 vcpu->arch.shared->mas6 = sregs->u.e.mas6;
160 vcpu_e500->mas6 = sregs->u.e.mas6;
161 } 157 }
162 158
163 if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) 159 if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))
@@ -233,6 +229,10 @@ static int __init kvmppc_e500_init(void)
233 unsigned long ivor[3]; 229 unsigned long ivor[3];
234 unsigned long max_ivor = 0; 230 unsigned long max_ivor = 0;
235 231
232 r = kvmppc_core_check_processor_compat();
233 if (r)
234 return r;
235
236 r = kvmppc_booke_init(); 236 r = kvmppc_booke_init();
237 if (r) 237 if (r)
238 return r; 238 return r;
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index d48ae396f41e..6d0b2bd54fb0 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -89,19 +89,23 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
89 return EMULATE_FAIL; 89 return EMULATE_FAIL;
90 vcpu_e500->pid[2] = spr_val; break; 90 vcpu_e500->pid[2] = spr_val; break;
91 case SPRN_MAS0: 91 case SPRN_MAS0:
92 vcpu_e500->mas0 = spr_val; break; 92 vcpu->arch.shared->mas0 = spr_val; break;
93 case SPRN_MAS1: 93 case SPRN_MAS1:
94 vcpu_e500->mas1 = spr_val; break; 94 vcpu->arch.shared->mas1 = spr_val; break;
95 case SPRN_MAS2: 95 case SPRN_MAS2:
96 vcpu_e500->mas2 = spr_val; break; 96 vcpu->arch.shared->mas2 = spr_val; break;
97 case SPRN_MAS3: 97 case SPRN_MAS3:
98 vcpu_e500->mas3 = spr_val; break; 98 vcpu->arch.shared->mas7_3 &= ~(u64)0xffffffff;
99 vcpu->arch.shared->mas7_3 |= spr_val;
100 break;
99 case SPRN_MAS4: 101 case SPRN_MAS4:
100 vcpu_e500->mas4 = spr_val; break; 102 vcpu->arch.shared->mas4 = spr_val; break;
101 case SPRN_MAS6: 103 case SPRN_MAS6:
102 vcpu_e500->mas6 = spr_val; break; 104 vcpu->arch.shared->mas6 = spr_val; break;
103 case SPRN_MAS7: 105 case SPRN_MAS7:
104 vcpu_e500->mas7 = spr_val; break; 106 vcpu->arch.shared->mas7_3 &= (u64)0xffffffff;
107 vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32;
108 break;
105 case SPRN_L1CSR0: 109 case SPRN_L1CSR0:
106 vcpu_e500->l1csr0 = spr_val; 110 vcpu_e500->l1csr0 = spr_val;
107 vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC); 111 vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC);
@@ -143,6 +147,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
143{ 147{
144 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 148 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
145 int emulated = EMULATE_DONE; 149 int emulated = EMULATE_DONE;
150 unsigned long val;
146 151
147 switch (sprn) { 152 switch (sprn) {
148 case SPRN_PID: 153 case SPRN_PID:
@@ -152,20 +157,23 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
152 case SPRN_PID2: 157 case SPRN_PID2:
153 kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break; 158 kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break;
154 case SPRN_MAS0: 159 case SPRN_MAS0:
155 kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas0); break; 160 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas0); break;
156 case SPRN_MAS1: 161 case SPRN_MAS1:
157 kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas1); break; 162 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas1); break;
158 case SPRN_MAS2: 163 case SPRN_MAS2:
159 kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas2); break; 164 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas2); break;
160 case SPRN_MAS3: 165 case SPRN_MAS3:
161 kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas3); break; 166 val = (u32)vcpu->arch.shared->mas7_3;
167 kvmppc_set_gpr(vcpu, rt, val);
168 break;
162 case SPRN_MAS4: 169 case SPRN_MAS4:
163 kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas4); break; 170 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas4); break;
164 case SPRN_MAS6: 171 case SPRN_MAS6:
165 kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas6); break; 172 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas6); break;
166 case SPRN_MAS7: 173 case SPRN_MAS7:
167 kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7); break; 174 val = vcpu->arch.shared->mas7_3 >> 32;
168 175 kvmppc_set_gpr(vcpu, rt, val);
176 break;
169 case SPRN_TLB0CFG: 177 case SPRN_TLB0CFG:
170 kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break; 178 kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break;
171 case SPRN_TLB1CFG: 179 case SPRN_TLB1CFG:
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 13c432ea2fa8..6e53e4164de1 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -12,12 +12,19 @@
12 * published by the Free Software Foundation. 12 * published by the Free Software Foundation.
13 */ 13 */
14 14
15#include <linux/kernel.h>
15#include <linux/types.h> 16#include <linux/types.h>
16#include <linux/slab.h> 17#include <linux/slab.h>
17#include <linux/string.h> 18#include <linux/string.h>
18#include <linux/kvm.h> 19#include <linux/kvm.h>
19#include <linux/kvm_host.h> 20#include <linux/kvm_host.h>
20#include <linux/highmem.h> 21#include <linux/highmem.h>
22#include <linux/log2.h>
23#include <linux/uaccess.h>
24#include <linux/sched.h>
25#include <linux/rwsem.h>
26#include <linux/vmalloc.h>
27#include <linux/hugetlb.h>
21#include <asm/kvm_ppc.h> 28#include <asm/kvm_ppc.h>
22#include <asm/kvm_e500.h> 29#include <asm/kvm_e500.h>
23 30
@@ -26,7 +33,7 @@
26#include "trace.h" 33#include "trace.h"
27#include "timing.h" 34#include "timing.h"
28 35
29#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) 36#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1)
30 37
31struct id { 38struct id {
32 unsigned long val; 39 unsigned long val;
@@ -63,7 +70,14 @@ static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids);
63 * The valid range of shadow ID is [1..255] */ 70 * The valid range of shadow ID is [1..255] */
64static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid); 71static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid);
65 72
66static unsigned int tlb1_entry_num; 73static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
74
75static struct kvm_book3e_206_tlb_entry *get_entry(
76 struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry)
77{
78 int offset = vcpu_e500->gtlb_offset[tlbsel];
79 return &vcpu_e500->gtlb_arch[offset + entry];
80}
67 81
68/* 82/*
69 * Allocate a free shadow id and setup a valid sid mapping in given entry. 83 * Allocate a free shadow id and setup a valid sid mapping in given entry.
@@ -116,13 +130,11 @@ static inline int local_sid_lookup(struct id *entry)
116 return -1; 130 return -1;
117} 131}
118 132
119/* Invalidate all id mappings on local core */ 133/* Invalidate all id mappings on local core -- call with preempt disabled */
120static inline void local_sid_destroy_all(void) 134static inline void local_sid_destroy_all(void)
121{ 135{
122 preempt_disable();
123 __get_cpu_var(pcpu_last_used_sid) = 0; 136 __get_cpu_var(pcpu_last_used_sid) = 0;
124 memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids))); 137 memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids)));
125 preempt_enable();
126} 138}
127 139
128static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500) 140static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500)
@@ -218,34 +230,13 @@ void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500)
218 preempt_enable(); 230 preempt_enable();
219} 231}
220 232
221void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) 233static inline unsigned int gtlb0_get_next_victim(
222{
223 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
224 struct tlbe *tlbe;
225 int i, tlbsel;
226
227 printk("| %8s | %8s | %8s | %8s | %8s |\n",
228 "nr", "mas1", "mas2", "mas3", "mas7");
229
230 for (tlbsel = 0; tlbsel < 2; tlbsel++) {
231 printk("Guest TLB%d:\n", tlbsel);
232 for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) {
233 tlbe = &vcpu_e500->gtlb_arch[tlbsel][i];
234 if (tlbe->mas1 & MAS1_VALID)
235 printk(" G[%d][%3d] | %08X | %08X | %08X | %08X |\n",
236 tlbsel, i, tlbe->mas1, tlbe->mas2,
237 tlbe->mas3, tlbe->mas7);
238 }
239 }
240}
241
242static inline unsigned int tlb0_get_next_victim(
243 struct kvmppc_vcpu_e500 *vcpu_e500) 234 struct kvmppc_vcpu_e500 *vcpu_e500)
244{ 235{
245 unsigned int victim; 236 unsigned int victim;
246 237
247 victim = vcpu_e500->gtlb_nv[0]++; 238 victim = vcpu_e500->gtlb_nv[0]++;
248 if (unlikely(vcpu_e500->gtlb_nv[0] >= KVM_E500_TLB0_WAY_NUM)) 239 if (unlikely(vcpu_e500->gtlb_nv[0] >= vcpu_e500->gtlb_params[0].ways))
249 vcpu_e500->gtlb_nv[0] = 0; 240 vcpu_e500->gtlb_nv[0] = 0;
250 241
251 return victim; 242 return victim;
@@ -254,12 +245,12 @@ static inline unsigned int tlb0_get_next_victim(
254static inline unsigned int tlb1_max_shadow_size(void) 245static inline unsigned int tlb1_max_shadow_size(void)
255{ 246{
256 /* reserve one entry for magic page */ 247 /* reserve one entry for magic page */
257 return tlb1_entry_num - tlbcam_index - 1; 248 return host_tlb_params[1].entries - tlbcam_index - 1;
258} 249}
259 250
260static inline int tlbe_is_writable(struct tlbe *tlbe) 251static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
261{ 252{
262 return tlbe->mas3 & (MAS3_SW|MAS3_UW); 253 return tlbe->mas7_3 & (MAS3_SW|MAS3_UW);
263} 254}
264 255
265static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) 256static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
@@ -290,40 +281,66 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
290/* 281/*
291 * writing shadow tlb entry to host TLB 282 * writing shadow tlb entry to host TLB
292 */ 283 */
293static inline void __write_host_tlbe(struct tlbe *stlbe, uint32_t mas0) 284static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
285 uint32_t mas0)
294{ 286{
295 unsigned long flags; 287 unsigned long flags;
296 288
297 local_irq_save(flags); 289 local_irq_save(flags);
298 mtspr(SPRN_MAS0, mas0); 290 mtspr(SPRN_MAS0, mas0);
299 mtspr(SPRN_MAS1, stlbe->mas1); 291 mtspr(SPRN_MAS1, stlbe->mas1);
300 mtspr(SPRN_MAS2, stlbe->mas2); 292 mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2);
301 mtspr(SPRN_MAS3, stlbe->mas3); 293 mtspr(SPRN_MAS3, (u32)stlbe->mas7_3);
302 mtspr(SPRN_MAS7, stlbe->mas7); 294 mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32));
303 asm volatile("isync; tlbwe" : : : "memory"); 295 asm volatile("isync; tlbwe" : : : "memory");
304 local_irq_restore(flags); 296 local_irq_restore(flags);
297
298 trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1,
299 stlbe->mas2, stlbe->mas7_3);
300}
301
302/*
303 * Acquire a mas0 with victim hint, as if we just took a TLB miss.
304 *
305 * We don't care about the address we're searching for, other than that it's
306 * in the right set and is not present in the TLB. Using a zero PID and a
307 * userspace address means we don't have to set and then restore MAS5, or
308 * calculate a proper MAS6 value.
309 */
310static u32 get_host_mas0(unsigned long eaddr)
311{
312 unsigned long flags;
313 u32 mas0;
314
315 local_irq_save(flags);
316 mtspr(SPRN_MAS6, 0);
317 asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET));
318 mas0 = mfspr(SPRN_MAS0);
319 local_irq_restore(flags);
320
321 return mas0;
305} 322}
306 323
324/* sesel is for tlb1 only */
307static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, 325static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
308 int tlbsel, int esel, struct tlbe *stlbe) 326 int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe)
309{ 327{
328 u32 mas0;
329
310 if (tlbsel == 0) { 330 if (tlbsel == 0) {
311 __write_host_tlbe(stlbe, 331 mas0 = get_host_mas0(stlbe->mas2);
312 MAS0_TLBSEL(0) | 332 __write_host_tlbe(stlbe, mas0);
313 MAS0_ESEL(esel & (KVM_E500_TLB0_WAY_NUM - 1)));
314 } else { 333 } else {
315 __write_host_tlbe(stlbe, 334 __write_host_tlbe(stlbe,
316 MAS0_TLBSEL(1) | 335 MAS0_TLBSEL(1) |
317 MAS0_ESEL(to_htlb1_esel(esel))); 336 MAS0_ESEL(to_htlb1_esel(sesel)));
318 } 337 }
319 trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
320 stlbe->mas3, stlbe->mas7);
321} 338}
322 339
323void kvmppc_map_magic(struct kvm_vcpu *vcpu) 340void kvmppc_map_magic(struct kvm_vcpu *vcpu)
324{ 341{
325 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 342 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
326 struct tlbe magic; 343 struct kvm_book3e_206_tlb_entry magic;
327 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; 344 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
328 unsigned int stid; 345 unsigned int stid;
329 pfn_t pfn; 346 pfn_t pfn;
@@ -337,9 +354,9 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu)
337 magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) | 354 magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
338 MAS1_TSIZE(BOOK3E_PAGESZ_4K); 355 MAS1_TSIZE(BOOK3E_PAGESZ_4K);
339 magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M; 356 magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
340 magic.mas3 = (pfn << PAGE_SHIFT) | 357 magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) |
341 MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; 358 MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
342 magic.mas7 = pfn >> (32 - PAGE_SHIFT); 359 magic.mas8 = 0;
343 360
344 __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); 361 __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
345 preempt_enable(); 362 preempt_enable();
@@ -357,10 +374,11 @@ void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu)
357{ 374{
358} 375}
359 376
360static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, 377static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500,
361 int tlbsel, int esel) 378 int tlbsel, int esel)
362{ 379{
363 struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; 380 struct kvm_book3e_206_tlb_entry *gtlbe =
381 get_entry(vcpu_e500, tlbsel, esel);
364 struct vcpu_id_table *idt = vcpu_e500->idt; 382 struct vcpu_id_table *idt = vcpu_e500->idt;
365 unsigned int pr, tid, ts, pid; 383 unsigned int pr, tid, ts, pid;
366 u32 val, eaddr; 384 u32 val, eaddr;
@@ -414,25 +432,57 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
414 preempt_enable(); 432 preempt_enable();
415} 433}
416 434
435static int tlb0_set_base(gva_t addr, int sets, int ways)
436{
437 int set_base;
438
439 set_base = (addr >> PAGE_SHIFT) & (sets - 1);
440 set_base *= ways;
441
442 return set_base;
443}
444
445static int gtlb0_set_base(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t addr)
446{
447 return tlb0_set_base(addr, vcpu_e500->gtlb_params[0].sets,
448 vcpu_e500->gtlb_params[0].ways);
449}
450
451static unsigned int get_tlb_esel(struct kvm_vcpu *vcpu, int tlbsel)
452{
453 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
454 int esel = get_tlb_esel_bit(vcpu);
455
456 if (tlbsel == 0) {
457 esel &= vcpu_e500->gtlb_params[0].ways - 1;
458 esel += gtlb0_set_base(vcpu_e500, vcpu->arch.shared->mas2);
459 } else {
460 esel &= vcpu_e500->gtlb_params[tlbsel].entries - 1;
461 }
462
463 return esel;
464}
465
417/* Search the guest TLB for a matching entry. */ 466/* Search the guest TLB for a matching entry. */
418static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, 467static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
419 gva_t eaddr, int tlbsel, unsigned int pid, int as) 468 gva_t eaddr, int tlbsel, unsigned int pid, int as)
420{ 469{
421 int size = vcpu_e500->gtlb_size[tlbsel]; 470 int size = vcpu_e500->gtlb_params[tlbsel].entries;
422 int set_base; 471 unsigned int set_base, offset;
423 int i; 472 int i;
424 473
425 if (tlbsel == 0) { 474 if (tlbsel == 0) {
426 int mask = size / KVM_E500_TLB0_WAY_NUM - 1; 475 set_base = gtlb0_set_base(vcpu_e500, eaddr);
427 set_base = (eaddr >> PAGE_SHIFT) & mask; 476 size = vcpu_e500->gtlb_params[0].ways;
428 set_base *= KVM_E500_TLB0_WAY_NUM;
429 size = KVM_E500_TLB0_WAY_NUM;
430 } else { 477 } else {
431 set_base = 0; 478 set_base = 0;
432 } 479 }
433 480
481 offset = vcpu_e500->gtlb_offset[tlbsel];
482
434 for (i = 0; i < size; i++) { 483 for (i = 0; i < size; i++) {
435 struct tlbe *tlbe = &vcpu_e500->gtlb_arch[tlbsel][set_base + i]; 484 struct kvm_book3e_206_tlb_entry *tlbe =
485 &vcpu_e500->gtlb_arch[offset + set_base + i];
436 unsigned int tid; 486 unsigned int tid;
437 487
438 if (eaddr < get_tlb_eaddr(tlbe)) 488 if (eaddr < get_tlb_eaddr(tlbe))
@@ -457,27 +507,55 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
457 return -1; 507 return -1;
458} 508}
459 509
460static inline void kvmppc_e500_priv_setup(struct tlbe_priv *priv, 510static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
461 struct tlbe *gtlbe, 511 struct kvm_book3e_206_tlb_entry *gtlbe,
462 pfn_t pfn) 512 pfn_t pfn)
463{ 513{
464 priv->pfn = pfn; 514 ref->pfn = pfn;
465 priv->flags = E500_TLB_VALID; 515 ref->flags = E500_TLB_VALID;
466 516
467 if (tlbe_is_writable(gtlbe)) 517 if (tlbe_is_writable(gtlbe))
468 priv->flags |= E500_TLB_DIRTY; 518 ref->flags |= E500_TLB_DIRTY;
469} 519}
470 520
471static inline void kvmppc_e500_priv_release(struct tlbe_priv *priv) 521static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
472{ 522{
473 if (priv->flags & E500_TLB_VALID) { 523 if (ref->flags & E500_TLB_VALID) {
474 if (priv->flags & E500_TLB_DIRTY) 524 if (ref->flags & E500_TLB_DIRTY)
475 kvm_release_pfn_dirty(priv->pfn); 525 kvm_release_pfn_dirty(ref->pfn);
476 else 526 else
477 kvm_release_pfn_clean(priv->pfn); 527 kvm_release_pfn_clean(ref->pfn);
528
529 ref->flags = 0;
530 }
531}
532
533static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
534{
535 int tlbsel = 0;
536 int i;
537
538 for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) {
539 struct tlbe_ref *ref =
540 &vcpu_e500->gtlb_priv[tlbsel][i].ref;
541 kvmppc_e500_ref_release(ref);
542 }
543}
544
545static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
546{
547 int stlbsel = 1;
548 int i;
549
550 kvmppc_e500_id_table_reset_all(vcpu_e500);
478 551
479 priv->flags = 0; 552 for (i = 0; i < host_tlb_params[stlbsel].entries; i++) {
553 struct tlbe_ref *ref =
554 &vcpu_e500->tlb_refs[stlbsel][i];
555 kvmppc_e500_ref_release(ref);
480 } 556 }
557
558 clear_tlb_privs(vcpu_e500);
481} 559}
482 560
483static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, 561static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
@@ -488,59 +566,54 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
488 int tlbsel; 566 int tlbsel;
489 567
490 /* since we only have two TLBs, only lower bit is used. */ 568 /* since we only have two TLBs, only lower bit is used. */
491 tlbsel = (vcpu_e500->mas4 >> 28) & 0x1; 569 tlbsel = (vcpu->arch.shared->mas4 >> 28) & 0x1;
492 victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; 570 victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0;
493 pidsel = (vcpu_e500->mas4 >> 16) & 0xf; 571 pidsel = (vcpu->arch.shared->mas4 >> 16) & 0xf;
494 tsized = (vcpu_e500->mas4 >> 7) & 0x1f; 572 tsized = (vcpu->arch.shared->mas4 >> 7) & 0x1f;
495 573
496 vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) 574 vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
497 | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); 575 | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
498 vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) 576 vcpu->arch.shared->mas1 = MAS1_VALID | (as ? MAS1_TS : 0)
499 | MAS1_TID(vcpu_e500->pid[pidsel]) 577 | MAS1_TID(vcpu_e500->pid[pidsel])
500 | MAS1_TSIZE(tsized); 578 | MAS1_TSIZE(tsized);
501 vcpu_e500->mas2 = (eaddr & MAS2_EPN) 579 vcpu->arch.shared->mas2 = (eaddr & MAS2_EPN)
502 | (vcpu_e500->mas4 & MAS2_ATTRIB_MASK); 580 | (vcpu->arch.shared->mas4 & MAS2_ATTRIB_MASK);
503 vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; 581 vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3;
504 vcpu_e500->mas6 = (vcpu_e500->mas6 & MAS6_SPID1) 582 vcpu->arch.shared->mas6 = (vcpu->arch.shared->mas6 & MAS6_SPID1)
505 | (get_cur_pid(vcpu) << 16) 583 | (get_cur_pid(vcpu) << 16)
506 | (as ? MAS6_SAS : 0); 584 | (as ? MAS6_SAS : 0);
507 vcpu_e500->mas7 = 0;
508} 585}
509 586
510static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, 587/* TID must be supplied by the caller */
511 struct tlbe *gtlbe, int tsize, 588static inline void kvmppc_e500_setup_stlbe(
512 struct tlbe_priv *priv, 589 struct kvmppc_vcpu_e500 *vcpu_e500,
513 u64 gvaddr, struct tlbe *stlbe) 590 struct kvm_book3e_206_tlb_entry *gtlbe,
591 int tsize, struct tlbe_ref *ref, u64 gvaddr,
592 struct kvm_book3e_206_tlb_entry *stlbe)
514{ 593{
515 pfn_t pfn = priv->pfn; 594 pfn_t pfn = ref->pfn;
516 unsigned int stid;
517 595
518 stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe), 596 BUG_ON(!(ref->flags & E500_TLB_VALID));
519 get_tlb_tid(gtlbe),
520 get_cur_pr(&vcpu_e500->vcpu), 0);
521 597
522 /* Force TS=1 IPROT=0 for all guest mappings. */ 598 /* Force TS=1 IPROT=0 for all guest mappings. */
523 stlbe->mas1 = MAS1_TSIZE(tsize) 599 stlbe->mas1 = MAS1_TSIZE(tsize) | MAS1_TS | MAS1_VALID;
524 | MAS1_TID(stid) | MAS1_TS | MAS1_VALID;
525 stlbe->mas2 = (gvaddr & MAS2_EPN) 600 stlbe->mas2 = (gvaddr & MAS2_EPN)
526 | e500_shadow_mas2_attrib(gtlbe->mas2, 601 | e500_shadow_mas2_attrib(gtlbe->mas2,
527 vcpu_e500->vcpu.arch.shared->msr & MSR_PR); 602 vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
528 stlbe->mas3 = ((pfn << PAGE_SHIFT) & MAS3_RPN) 603 stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT)
529 | e500_shadow_mas3_attrib(gtlbe->mas3, 604 | e500_shadow_mas3_attrib(gtlbe->mas7_3,
530 vcpu_e500->vcpu.arch.shared->msr & MSR_PR); 605 vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
531 stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN;
532} 606}
533 607
534
535static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, 608static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
536 u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel, 609 u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
537 struct tlbe *stlbe) 610 int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe,
611 struct tlbe_ref *ref)
538{ 612{
539 struct kvm_memory_slot *slot; 613 struct kvm_memory_slot *slot;
540 unsigned long pfn, hva; 614 unsigned long pfn, hva;
541 int pfnmap = 0; 615 int pfnmap = 0;
542 int tsize = BOOK3E_PAGESZ_4K; 616 int tsize = BOOK3E_PAGESZ_4K;
543 struct tlbe_priv *priv;
544 617
545 /* 618 /*
546 * Translate guest physical to true physical, acquiring 619 * Translate guest physical to true physical, acquiring
@@ -621,12 +694,31 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
621 pfn &= ~(tsize_pages - 1); 694 pfn &= ~(tsize_pages - 1);
622 break; 695 break;
623 } 696 }
697 } else if (vma && hva >= vma->vm_start &&
698 (vma->vm_flags & VM_HUGETLB)) {
699 unsigned long psize = vma_kernel_pagesize(vma);
700
701 tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
702 MAS1_TSIZE_SHIFT;
703
704 /*
705 * Take the largest page size that satisfies both host
706 * and guest mapping
707 */
708 tsize = min(__ilog2(psize) - 10, tsize);
709
710 /*
711 * e500 doesn't implement the lowest tsize bit,
712 * or 1K pages.
713 */
714 tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
624 } 715 }
625 716
626 up_read(&current->mm->mmap_sem); 717 up_read(&current->mm->mmap_sem);
627 } 718 }
628 719
629 if (likely(!pfnmap)) { 720 if (likely(!pfnmap)) {
721 unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
630 pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn); 722 pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn);
631 if (is_error_pfn(pfn)) { 723 if (is_error_pfn(pfn)) {
632 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", 724 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
@@ -634,45 +726,52 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
634 kvm_release_pfn_clean(pfn); 726 kvm_release_pfn_clean(pfn);
635 return; 727 return;
636 } 728 }
729
730 /* Align guest and physical address to page map boundaries */
731 pfn &= ~(tsize_pages - 1);
732 gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
637 } 733 }
638 734
639 /* Drop old priv and setup new one. */ 735 /* Drop old ref and setup new one. */
640 priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; 736 kvmppc_e500_ref_release(ref);
641 kvmppc_e500_priv_release(priv); 737 kvmppc_e500_ref_setup(ref, gtlbe, pfn);
642 kvmppc_e500_priv_setup(priv, gtlbe, pfn);
643 738
644 kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, priv, gvaddr, stlbe); 739 kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, ref, gvaddr, stlbe);
645} 740}
646 741
647/* XXX only map the one-one case, for now use TLB0 */ 742/* XXX only map the one-one case, for now use TLB0 */
648static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, 743static void kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500,
649 int esel, struct tlbe *stlbe) 744 int esel,
745 struct kvm_book3e_206_tlb_entry *stlbe)
650{ 746{
651 struct tlbe *gtlbe; 747 struct kvm_book3e_206_tlb_entry *gtlbe;
748 struct tlbe_ref *ref;
652 749
653 gtlbe = &vcpu_e500->gtlb_arch[0][esel]; 750 gtlbe = get_entry(vcpu_e500, 0, esel);
751 ref = &vcpu_e500->gtlb_priv[0][esel].ref;
654 752
655 kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), 753 kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
656 get_tlb_raddr(gtlbe) >> PAGE_SHIFT, 754 get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
657 gtlbe, 0, esel, stlbe); 755 gtlbe, 0, stlbe, ref);
658
659 return esel;
660} 756}
661 757
662/* Caller must ensure that the specified guest TLB entry is safe to insert into 758/* Caller must ensure that the specified guest TLB entry is safe to insert into
663 * the shadow TLB. */ 759 * the shadow TLB. */
664/* XXX for both one-one and one-to-many , for now use TLB1 */ 760/* XXX for both one-one and one-to-many , for now use TLB1 */
665static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, 761static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
666 u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, struct tlbe *stlbe) 762 u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
763 struct kvm_book3e_206_tlb_entry *stlbe)
667{ 764{
765 struct tlbe_ref *ref;
668 unsigned int victim; 766 unsigned int victim;
669 767
670 victim = vcpu_e500->gtlb_nv[1]++; 768 victim = vcpu_e500->host_tlb1_nv++;
671 769
672 if (unlikely(vcpu_e500->gtlb_nv[1] >= tlb1_max_shadow_size())) 770 if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size()))
673 vcpu_e500->gtlb_nv[1] = 0; 771 vcpu_e500->host_tlb1_nv = 0;
674 772
675 kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim, stlbe); 773 ref = &vcpu_e500->tlb_refs[1][victim];
774 kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, ref);
676 775
677 return victim; 776 return victim;
678} 777}
@@ -689,7 +788,8 @@ static inline int kvmppc_e500_gtlbe_invalidate(
689 struct kvmppc_vcpu_e500 *vcpu_e500, 788 struct kvmppc_vcpu_e500 *vcpu_e500,
690 int tlbsel, int esel) 789 int tlbsel, int esel)
691{ 790{
692 struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; 791 struct kvm_book3e_206_tlb_entry *gtlbe =
792 get_entry(vcpu_e500, tlbsel, esel);
693 793
694 if (unlikely(get_tlb_iprot(gtlbe))) 794 if (unlikely(get_tlb_iprot(gtlbe)))
695 return -1; 795 return -1;
@@ -704,10 +804,10 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
704 int esel; 804 int esel;
705 805
706 if (value & MMUCSR0_TLB0FI) 806 if (value & MMUCSR0_TLB0FI)
707 for (esel = 0; esel < vcpu_e500->gtlb_size[0]; esel++) 807 for (esel = 0; esel < vcpu_e500->gtlb_params[0].entries; esel++)
708 kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); 808 kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel);
709 if (value & MMUCSR0_TLB1FI) 809 if (value & MMUCSR0_TLB1FI)
710 for (esel = 0; esel < vcpu_e500->gtlb_size[1]; esel++) 810 for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++)
711 kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); 811 kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
712 812
713 /* Invalidate all vcpu id mappings */ 813 /* Invalidate all vcpu id mappings */
@@ -732,7 +832,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
732 832
733 if (ia) { 833 if (ia) {
734 /* invalidate all entries */ 834 /* invalidate all entries */
735 for (esel = 0; esel < vcpu_e500->gtlb_size[tlbsel]; esel++) 835 for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries;
836 esel++)
736 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); 837 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
737 } else { 838 } else {
738 ea &= 0xfffff000; 839 ea &= 0xfffff000;
@@ -752,18 +853,17 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
752{ 853{
753 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 854 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
754 int tlbsel, esel; 855 int tlbsel, esel;
755 struct tlbe *gtlbe; 856 struct kvm_book3e_206_tlb_entry *gtlbe;
756 857
757 tlbsel = get_tlb_tlbsel(vcpu_e500); 858 tlbsel = get_tlb_tlbsel(vcpu);
758 esel = get_tlb_esel(vcpu_e500, tlbsel); 859 esel = get_tlb_esel(vcpu, tlbsel);
759 860
760 gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; 861 gtlbe = get_entry(vcpu_e500, tlbsel, esel);
761 vcpu_e500->mas0 &= ~MAS0_NV(~0); 862 vcpu->arch.shared->mas0 &= ~MAS0_NV(~0);
762 vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); 863 vcpu->arch.shared->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
763 vcpu_e500->mas1 = gtlbe->mas1; 864 vcpu->arch.shared->mas1 = gtlbe->mas1;
764 vcpu_e500->mas2 = gtlbe->mas2; 865 vcpu->arch.shared->mas2 = gtlbe->mas2;
765 vcpu_e500->mas3 = gtlbe->mas3; 866 vcpu->arch.shared->mas7_3 = gtlbe->mas7_3;
766 vcpu_e500->mas7 = gtlbe->mas7;
767 867
768 return EMULATE_DONE; 868 return EMULATE_DONE;
769} 869}
@@ -771,10 +871,10 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
771int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) 871int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
772{ 872{
773 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 873 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
774 int as = !!get_cur_sas(vcpu_e500); 874 int as = !!get_cur_sas(vcpu);
775 unsigned int pid = get_cur_spid(vcpu_e500); 875 unsigned int pid = get_cur_spid(vcpu);
776 int esel, tlbsel; 876 int esel, tlbsel;
777 struct tlbe *gtlbe = NULL; 877 struct kvm_book3e_206_tlb_entry *gtlbe = NULL;
778 gva_t ea; 878 gva_t ea;
779 879
780 ea = kvmppc_get_gpr(vcpu, rb); 880 ea = kvmppc_get_gpr(vcpu, rb);
@@ -782,70 +882,90 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
782 for (tlbsel = 0; tlbsel < 2; tlbsel++) { 882 for (tlbsel = 0; tlbsel < 2; tlbsel++) {
783 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); 883 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
784 if (esel >= 0) { 884 if (esel >= 0) {
785 gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; 885 gtlbe = get_entry(vcpu_e500, tlbsel, esel);
786 break; 886 break;
787 } 887 }
788 } 888 }
789 889
790 if (gtlbe) { 890 if (gtlbe) {
791 vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) 891 esel &= vcpu_e500->gtlb_params[tlbsel].ways - 1;
892
893 vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel)
792 | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); 894 | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
793 vcpu_e500->mas1 = gtlbe->mas1; 895 vcpu->arch.shared->mas1 = gtlbe->mas1;
794 vcpu_e500->mas2 = gtlbe->mas2; 896 vcpu->arch.shared->mas2 = gtlbe->mas2;
795 vcpu_e500->mas3 = gtlbe->mas3; 897 vcpu->arch.shared->mas7_3 = gtlbe->mas7_3;
796 vcpu_e500->mas7 = gtlbe->mas7;
797 } else { 898 } else {
798 int victim; 899 int victim;
799 900
800 /* since we only have two TLBs, only lower bit is used. */ 901 /* since we only have two TLBs, only lower bit is used. */
801 tlbsel = vcpu_e500->mas4 >> 28 & 0x1; 902 tlbsel = vcpu->arch.shared->mas4 >> 28 & 0x1;
802 victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; 903 victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0;
803 904
804 vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) 905 vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel)
906 | MAS0_ESEL(victim)
805 | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); 907 | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
806 vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0) 908 vcpu->arch.shared->mas1 =
807 | (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0)) 909 (vcpu->arch.shared->mas6 & MAS6_SPID0)
808 | (vcpu_e500->mas4 & MAS4_TSIZED(~0)); 910 | (vcpu->arch.shared->mas6 & (MAS6_SAS ? MAS1_TS : 0))
809 vcpu_e500->mas2 &= MAS2_EPN; 911 | (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0));
810 vcpu_e500->mas2 |= vcpu_e500->mas4 & MAS2_ATTRIB_MASK; 912 vcpu->arch.shared->mas2 &= MAS2_EPN;
811 vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; 913 vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 &
812 vcpu_e500->mas7 = 0; 914 MAS2_ATTRIB_MASK;
915 vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 |
916 MAS3_U2 | MAS3_U3;
813 } 917 }
814 918
815 kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); 919 kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS);
816 return EMULATE_DONE; 920 return EMULATE_DONE;
817} 921}
818 922
923/* sesel is for tlb1 only */
924static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
925 struct kvm_book3e_206_tlb_entry *gtlbe,
926 struct kvm_book3e_206_tlb_entry *stlbe,
927 int stlbsel, int sesel)
928{
929 int stid;
930
931 preempt_disable();
932 stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe),
933 get_tlb_tid(gtlbe),
934 get_cur_pr(&vcpu_e500->vcpu), 0);
935
936 stlbe->mas1 |= MAS1_TID(stid);
937 write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe);
938 preempt_enable();
939}
940
819int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) 941int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
820{ 942{
821 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 943 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
822 struct tlbe *gtlbe; 944 struct kvm_book3e_206_tlb_entry *gtlbe;
823 int tlbsel, esel; 945 int tlbsel, esel;
824 946
825 tlbsel = get_tlb_tlbsel(vcpu_e500); 947 tlbsel = get_tlb_tlbsel(vcpu);
826 esel = get_tlb_esel(vcpu_e500, tlbsel); 948 esel = get_tlb_esel(vcpu, tlbsel);
827 949
828 gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; 950 gtlbe = get_entry(vcpu_e500, tlbsel, esel);
829 951
830 if (get_tlb_v(gtlbe)) 952 if (get_tlb_v(gtlbe))
831 kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel); 953 inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
832 954
833 gtlbe->mas1 = vcpu_e500->mas1; 955 gtlbe->mas1 = vcpu->arch.shared->mas1;
834 gtlbe->mas2 = vcpu_e500->mas2; 956 gtlbe->mas2 = vcpu->arch.shared->mas2;
835 gtlbe->mas3 = vcpu_e500->mas3; 957 gtlbe->mas7_3 = vcpu->arch.shared->mas7_3;
836 gtlbe->mas7 = vcpu_e500->mas7;
837 958
838 trace_kvm_gtlb_write(vcpu_e500->mas0, gtlbe->mas1, gtlbe->mas2, 959 trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1,
839 gtlbe->mas3, gtlbe->mas7); 960 gtlbe->mas2, gtlbe->mas7_3);
840 961
841 /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ 962 /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
842 if (tlbe_is_host_safe(vcpu, gtlbe)) { 963 if (tlbe_is_host_safe(vcpu, gtlbe)) {
843 struct tlbe stlbe; 964 struct kvm_book3e_206_tlb_entry stlbe;
844 int stlbsel, sesel; 965 int stlbsel, sesel;
845 u64 eaddr; 966 u64 eaddr;
846 u64 raddr; 967 u64 raddr;
847 968
848 preempt_disable();
849 switch (tlbsel) { 969 switch (tlbsel) {
850 case 0: 970 case 0:
851 /* TLB0 */ 971 /* TLB0 */
@@ -853,7 +973,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
853 gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); 973 gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
854 974
855 stlbsel = 0; 975 stlbsel = 0;
856 sesel = kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); 976 kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
977 sesel = 0; /* unused */
857 978
858 break; 979 break;
859 980
@@ -874,8 +995,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
874 default: 995 default:
875 BUG(); 996 BUG();
876 } 997 }
877 write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); 998
878 preempt_enable(); 999 write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
879 } 1000 }
880 1001
881 kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); 1002 kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
@@ -914,9 +1035,11 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index,
914 gva_t eaddr) 1035 gva_t eaddr)
915{ 1036{
916 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 1037 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
917 struct tlbe *gtlbe = 1038 struct kvm_book3e_206_tlb_entry *gtlbe;
918 &vcpu_e500->gtlb_arch[tlbsel_of(index)][esel_of(index)]; 1039 u64 pgmask;
919 u64 pgmask = get_tlb_bytes(gtlbe) - 1; 1040
1041 gtlbe = get_entry(vcpu_e500, tlbsel_of(index), esel_of(index));
1042 pgmask = get_tlb_bytes(gtlbe) - 1;
920 1043
921 return get_tlb_raddr(gtlbe) | (eaddr & pgmask); 1044 return get_tlb_raddr(gtlbe) | (eaddr & pgmask);
922} 1045}
@@ -930,22 +1053,21 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
930{ 1053{
931 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 1054 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
932 struct tlbe_priv *priv; 1055 struct tlbe_priv *priv;
933 struct tlbe *gtlbe, stlbe; 1056 struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
934 int tlbsel = tlbsel_of(index); 1057 int tlbsel = tlbsel_of(index);
935 int esel = esel_of(index); 1058 int esel = esel_of(index);
936 int stlbsel, sesel; 1059 int stlbsel, sesel;
937 1060
938 gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; 1061 gtlbe = get_entry(vcpu_e500, tlbsel, esel);
939 1062
940 preempt_disable();
941 switch (tlbsel) { 1063 switch (tlbsel) {
942 case 0: 1064 case 0:
943 stlbsel = 0; 1065 stlbsel = 0;
944 sesel = esel; 1066 sesel = 0; /* unused */
945 priv = &vcpu_e500->gtlb_priv[stlbsel][sesel]; 1067 priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
946 1068
947 kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K, 1069 kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K,
948 priv, eaddr, &stlbe); 1070 &priv->ref, eaddr, &stlbe);
949 break; 1071 break;
950 1072
951 case 1: { 1073 case 1: {
@@ -962,8 +1084,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
962 break; 1084 break;
963 } 1085 }
964 1086
965 write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); 1087 write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
966 preempt_enable();
967} 1088}
968 1089
969int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, 1090int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
@@ -993,85 +1114,279 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
993 1114
994void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) 1115void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
995{ 1116{
996 struct tlbe *tlbe; 1117 struct kvm_book3e_206_tlb_entry *tlbe;
997 1118
998 /* Insert large initial mapping for guest. */ 1119 /* Insert large initial mapping for guest. */
999 tlbe = &vcpu_e500->gtlb_arch[1][0]; 1120 tlbe = get_entry(vcpu_e500, 1, 0);
1000 tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); 1121 tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
1001 tlbe->mas2 = 0; 1122 tlbe->mas2 = 0;
1002 tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; 1123 tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK;
1003 tlbe->mas7 = 0;
1004 1124
1005 /* 4K map for serial output. Used by kernel wrapper. */ 1125 /* 4K map for serial output. Used by kernel wrapper. */
1006 tlbe = &vcpu_e500->gtlb_arch[1][1]; 1126 tlbe = get_entry(vcpu_e500, 1, 1);
1007 tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); 1127 tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
1008 tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; 1128 tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
1009 tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; 1129 tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
1010 tlbe->mas7 = 0; 1130}
1131
1132static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
1133{
1134 int i;
1135
1136 clear_tlb_refs(vcpu_e500);
1137 kfree(vcpu_e500->gtlb_priv[0]);
1138 kfree(vcpu_e500->gtlb_priv[1]);
1139
1140 if (vcpu_e500->shared_tlb_pages) {
1141 vfree((void *)(round_down((uintptr_t)vcpu_e500->gtlb_arch,
1142 PAGE_SIZE)));
1143
1144 for (i = 0; i < vcpu_e500->num_shared_tlb_pages; i++) {
1145 set_page_dirty_lock(vcpu_e500->shared_tlb_pages[i]);
1146 put_page(vcpu_e500->shared_tlb_pages[i]);
1147 }
1148
1149 vcpu_e500->num_shared_tlb_pages = 0;
1150 vcpu_e500->shared_tlb_pages = NULL;
1151 } else {
1152 kfree(vcpu_e500->gtlb_arch);
1153 }
1154
1155 vcpu_e500->gtlb_arch = NULL;
1156}
1157
1158int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
1159 struct kvm_config_tlb *cfg)
1160{
1161 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
1162 struct kvm_book3e_206_tlb_params params;
1163 char *virt;
1164 struct page **pages;
1165 struct tlbe_priv *privs[2] = {};
1166 size_t array_len;
1167 u32 sets;
1168 int num_pages, ret, i;
1169
1170 if (cfg->mmu_type != KVM_MMU_FSL_BOOKE_NOHV)
1171 return -EINVAL;
1172
1173 if (copy_from_user(&params, (void __user *)(uintptr_t)cfg->params,
1174 sizeof(params)))
1175 return -EFAULT;
1176
1177 if (params.tlb_sizes[1] > 64)
1178 return -EINVAL;
1179 if (params.tlb_ways[1] != params.tlb_sizes[1])
1180 return -EINVAL;
1181 if (params.tlb_sizes[2] != 0 || params.tlb_sizes[3] != 0)
1182 return -EINVAL;
1183 if (params.tlb_ways[2] != 0 || params.tlb_ways[3] != 0)
1184 return -EINVAL;
1185
1186 if (!is_power_of_2(params.tlb_ways[0]))
1187 return -EINVAL;
1188
1189 sets = params.tlb_sizes[0] >> ilog2(params.tlb_ways[0]);
1190 if (!is_power_of_2(sets))
1191 return -EINVAL;
1192
1193 array_len = params.tlb_sizes[0] + params.tlb_sizes[1];
1194 array_len *= sizeof(struct kvm_book3e_206_tlb_entry);
1195
1196 if (cfg->array_len < array_len)
1197 return -EINVAL;
1198
1199 num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) -
1200 cfg->array / PAGE_SIZE;
1201 pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
1202 if (!pages)
1203 return -ENOMEM;
1204
1205 ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
1206 if (ret < 0)
1207 goto err_pages;
1208
1209 if (ret != num_pages) {
1210 num_pages = ret;
1211 ret = -EFAULT;
1212 goto err_put_page;
1213 }
1214
1215 virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
1216 if (!virt)
1217 goto err_put_page;
1218
1219 privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
1220 GFP_KERNEL);
1221 privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
1222 GFP_KERNEL);
1223
1224 if (!privs[0] || !privs[1])
1225 goto err_put_page;
1226
1227 free_gtlb(vcpu_e500);
1228
1229 vcpu_e500->gtlb_priv[0] = privs[0];
1230 vcpu_e500->gtlb_priv[1] = privs[1];
1231
1232 vcpu_e500->gtlb_arch = (struct kvm_book3e_206_tlb_entry *)
1233 (virt + (cfg->array & (PAGE_SIZE - 1)));
1234
1235 vcpu_e500->gtlb_params[0].entries = params.tlb_sizes[0];
1236 vcpu_e500->gtlb_params[1].entries = params.tlb_sizes[1];
1237
1238 vcpu_e500->gtlb_offset[0] = 0;
1239 vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0];
1240
1241 vcpu_e500->tlb0cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
1242 if (params.tlb_sizes[0] <= 2048)
1243 vcpu_e500->tlb0cfg |= params.tlb_sizes[0];
1244 vcpu_e500->tlb0cfg |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
1245
1246 vcpu_e500->tlb1cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
1247 vcpu_e500->tlb1cfg |= params.tlb_sizes[1];
1248 vcpu_e500->tlb1cfg |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
1249
1250 vcpu_e500->shared_tlb_pages = pages;
1251 vcpu_e500->num_shared_tlb_pages = num_pages;
1252
1253 vcpu_e500->gtlb_params[0].ways = params.tlb_ways[0];
1254 vcpu_e500->gtlb_params[0].sets = sets;
1255
1256 vcpu_e500->gtlb_params[1].ways = params.tlb_sizes[1];
1257 vcpu_e500->gtlb_params[1].sets = 1;
1258
1259 return 0;
1260
1261err_put_page:
1262 kfree(privs[0]);
1263 kfree(privs[1]);
1264
1265 for (i = 0; i < num_pages; i++)
1266 put_page(pages[i]);
1267
1268err_pages:
1269 kfree(pages);
1270 return ret;
1271}
1272
1273int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
1274 struct kvm_dirty_tlb *dirty)
1275{
1276 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
1277
1278 clear_tlb_refs(vcpu_e500);
1279 return 0;
1011} 1280}
1012 1281
1013int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) 1282int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
1014{ 1283{
1015 tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF; 1284 int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
1016 1285 int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
1017 vcpu_e500->gtlb_size[0] = KVM_E500_TLB0_SIZE; 1286
1018 vcpu_e500->gtlb_arch[0] = 1287 host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY;
1019 kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); 1288 host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
1020 if (vcpu_e500->gtlb_arch[0] == NULL) 1289
1021 goto err_out; 1290 /*
1022 1291 * This should never happen on real e500 hardware, but is
1023 vcpu_e500->gtlb_size[1] = KVM_E500_TLB1_SIZE; 1292 * architecturally possible -- e.g. in some weird nested
1024 vcpu_e500->gtlb_arch[1] = 1293 * virtualization case.
1025 kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL); 1294 */
1026 if (vcpu_e500->gtlb_arch[1] == NULL) 1295 if (host_tlb_params[0].entries == 0 ||
1027 goto err_out_guest0; 1296 host_tlb_params[1].entries == 0) {
1028 1297 pr_err("%s: need to know host tlb size\n", __func__);
1029 vcpu_e500->gtlb_priv[0] = (struct tlbe_priv *) 1298 return -ENODEV;
1030 kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB0_SIZE, GFP_KERNEL); 1299 }
1031 if (vcpu_e500->gtlb_priv[0] == NULL) 1300
1032 goto err_out_guest1; 1301 host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >>
1033 vcpu_e500->gtlb_priv[1] = (struct tlbe_priv *) 1302 TLBnCFG_ASSOC_SHIFT;
1034 kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB1_SIZE, GFP_KERNEL); 1303 host_tlb_params[1].ways = host_tlb_params[1].entries;
1035 1304
1036 if (vcpu_e500->gtlb_priv[1] == NULL) 1305 if (!is_power_of_2(host_tlb_params[0].entries) ||
1037 goto err_out_priv0; 1306 !is_power_of_2(host_tlb_params[0].ways) ||
1307 host_tlb_params[0].entries < host_tlb_params[0].ways ||
1308 host_tlb_params[0].ways == 0) {
1309 pr_err("%s: bad tlb0 host config: %u entries %u ways\n",
1310 __func__, host_tlb_params[0].entries,
1311 host_tlb_params[0].ways);
1312 return -ENODEV;
1313 }
1314
1315 host_tlb_params[0].sets =
1316 host_tlb_params[0].entries / host_tlb_params[0].ways;
1317 host_tlb_params[1].sets = 1;
1318
1319 vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE;
1320 vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE;
1321
1322 vcpu_e500->gtlb_params[0].ways = KVM_E500_TLB0_WAY_NUM;
1323 vcpu_e500->gtlb_params[0].sets =
1324 KVM_E500_TLB0_SIZE / KVM_E500_TLB0_WAY_NUM;
1325
1326 vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE;
1327 vcpu_e500->gtlb_params[1].sets = 1;
1328
1329 vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL);
1330 if (!vcpu_e500->gtlb_arch)
1331 return -ENOMEM;
1332
1333 vcpu_e500->gtlb_offset[0] = 0;
1334 vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE;
1335
1336 vcpu_e500->tlb_refs[0] =
1337 kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries,
1338 GFP_KERNEL);
1339 if (!vcpu_e500->tlb_refs[0])
1340 goto err;
1341
1342 vcpu_e500->tlb_refs[1] =
1343 kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[1].entries,
1344 GFP_KERNEL);
1345 if (!vcpu_e500->tlb_refs[1])
1346 goto err;
1347
1348 vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) *
1349 vcpu_e500->gtlb_params[0].entries,
1350 GFP_KERNEL);
1351 if (!vcpu_e500->gtlb_priv[0])
1352 goto err;
1353
1354 vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) *
1355 vcpu_e500->gtlb_params[1].entries,
1356 GFP_KERNEL);
1357 if (!vcpu_e500->gtlb_priv[1])
1358 goto err;
1038 1359
1039 if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) 1360 if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL)
1040 goto err_out_priv1; 1361 goto err;
1041 1362
1042 /* Init TLB configuration register */ 1363 /* Init TLB configuration register */
1043 vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; 1364 vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) &
1044 vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_size[0]; 1365 ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
1045 vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; 1366 vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[0].entries;
1046 vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_size[1]; 1367 vcpu_e500->tlb0cfg |=
1368 vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT;
1369
1370 vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) &
1371 ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
1372 vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[1].entries;
1373 vcpu_e500->tlb0cfg |=
1374 vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT;
1047 1375
1048 return 0; 1376 return 0;
1049 1377
1050err_out_priv1: 1378err:
1051 kfree(vcpu_e500->gtlb_priv[1]); 1379 free_gtlb(vcpu_e500);
1052err_out_priv0: 1380 kfree(vcpu_e500->tlb_refs[0]);
1053 kfree(vcpu_e500->gtlb_priv[0]); 1381 kfree(vcpu_e500->tlb_refs[1]);
1054err_out_guest1:
1055 kfree(vcpu_e500->gtlb_arch[1]);
1056err_out_guest0:
1057 kfree(vcpu_e500->gtlb_arch[0]);
1058err_out:
1059 return -1; 1382 return -1;
1060} 1383}
1061 1384
1062void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) 1385void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
1063{ 1386{
1064 int stlbsel, i; 1387 free_gtlb(vcpu_e500);
1065
1066 /* release all privs */
1067 for (stlbsel = 0; stlbsel < 2; stlbsel++)
1068 for (i = 0; i < vcpu_e500->gtlb_size[stlbsel]; i++) {
1069 struct tlbe_priv *priv =
1070 &vcpu_e500->gtlb_priv[stlbsel][i];
1071 kvmppc_e500_priv_release(priv);
1072 }
1073
1074 kvmppc_e500_id_table_free(vcpu_e500); 1388 kvmppc_e500_id_table_free(vcpu_e500);
1075 kfree(vcpu_e500->gtlb_arch[1]); 1389
1076 kfree(vcpu_e500->gtlb_arch[0]); 1390 kfree(vcpu_e500->tlb_refs[0]);
1391 kfree(vcpu_e500->tlb_refs[1]);
1077} 1392}
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h
index 59b88e99a235..5c6d2d7bf058 100644
--- a/arch/powerpc/kvm/e500_tlb.h
+++ b/arch/powerpc/kvm/e500_tlb.h
@@ -20,13 +20,9 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/kvm_e500.h> 21#include <asm/kvm_e500.h>
22 22
23#define KVM_E500_TLB0_WAY_SIZE_BIT 7 /* Fixed */ 23/* This geometry is the legacy default -- can be overridden by userspace */
24#define KVM_E500_TLB0_WAY_SIZE (1UL << KVM_E500_TLB0_WAY_SIZE_BIT) 24#define KVM_E500_TLB0_WAY_SIZE 128
25#define KVM_E500_TLB0_WAY_SIZE_MASK (KVM_E500_TLB0_WAY_SIZE - 1) 25#define KVM_E500_TLB0_WAY_NUM 2
26
27#define KVM_E500_TLB0_WAY_NUM_BIT 1 /* No greater than 7 */
28#define KVM_E500_TLB0_WAY_NUM (1UL << KVM_E500_TLB0_WAY_NUM_BIT)
29#define KVM_E500_TLB0_WAY_NUM_MASK (KVM_E500_TLB0_WAY_NUM - 1)
30 26
31#define KVM_E500_TLB0_SIZE (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM) 27#define KVM_E500_TLB0_SIZE (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM)
32#define KVM_E500_TLB1_SIZE 16 28#define KVM_E500_TLB1_SIZE 16
@@ -58,50 +54,54 @@ extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
58extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *); 54extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *);
59 55
60/* TLB helper functions */ 56/* TLB helper functions */
61static inline unsigned int get_tlb_size(const struct tlbe *tlbe) 57static inline unsigned int
58get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe)
62{ 59{
63 return (tlbe->mas1 >> 7) & 0x1f; 60 return (tlbe->mas1 >> 7) & 0x1f;
64} 61}
65 62
66static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) 63static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe)
67{ 64{
68 return tlbe->mas2 & 0xfffff000; 65 return tlbe->mas2 & 0xfffff000;
69} 66}
70 67
71static inline u64 get_tlb_bytes(const struct tlbe *tlbe) 68static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe)
72{ 69{
73 unsigned int pgsize = get_tlb_size(tlbe); 70 unsigned int pgsize = get_tlb_size(tlbe);
74 return 1ULL << 10 << pgsize; 71 return 1ULL << 10 << pgsize;
75} 72}
76 73
77static inline gva_t get_tlb_end(const struct tlbe *tlbe) 74static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe)
78{ 75{
79 u64 bytes = get_tlb_bytes(tlbe); 76 u64 bytes = get_tlb_bytes(tlbe);
80 return get_tlb_eaddr(tlbe) + bytes - 1; 77 return get_tlb_eaddr(tlbe) + bytes - 1;
81} 78}
82 79
83static inline u64 get_tlb_raddr(const struct tlbe *tlbe) 80static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe)
84{ 81{
85 u64 rpn = tlbe->mas7; 82 return tlbe->mas7_3 & ~0xfffULL;
86 return (rpn << 32) | (tlbe->mas3 & 0xfffff000);
87} 83}
88 84
89static inline unsigned int get_tlb_tid(const struct tlbe *tlbe) 85static inline unsigned int
86get_tlb_tid(const struct kvm_book3e_206_tlb_entry *tlbe)
90{ 87{
91 return (tlbe->mas1 >> 16) & 0xff; 88 return (tlbe->mas1 >> 16) & 0xff;
92} 89}
93 90
94static inline unsigned int get_tlb_ts(const struct tlbe *tlbe) 91static inline unsigned int
92get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe)
95{ 93{
96 return (tlbe->mas1 >> 12) & 0x1; 94 return (tlbe->mas1 >> 12) & 0x1;
97} 95}
98 96
99static inline unsigned int get_tlb_v(const struct tlbe *tlbe) 97static inline unsigned int
98get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe)
100{ 99{
101 return (tlbe->mas1 >> 31) & 0x1; 100 return (tlbe->mas1 >> 31) & 0x1;
102} 101}
103 102
104static inline unsigned int get_tlb_iprot(const struct tlbe *tlbe) 103static inline unsigned int
104get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe)
105{ 105{
106 return (tlbe->mas1 >> 30) & 0x1; 106 return (tlbe->mas1 >> 30) & 0x1;
107} 107}
@@ -121,59 +121,37 @@ static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu)
121 return !!(vcpu->arch.shared->msr & MSR_PR); 121 return !!(vcpu->arch.shared->msr & MSR_PR);
122} 122}
123 123
124static inline unsigned int get_cur_spid( 124static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu)
125 const struct kvmppc_vcpu_e500 *vcpu_e500)
126{ 125{
127 return (vcpu_e500->mas6 >> 16) & 0xff; 126 return (vcpu->arch.shared->mas6 >> 16) & 0xff;
128} 127}
129 128
130static inline unsigned int get_cur_sas( 129static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu)
131 const struct kvmppc_vcpu_e500 *vcpu_e500)
132{ 130{
133 return vcpu_e500->mas6 & 0x1; 131 return vcpu->arch.shared->mas6 & 0x1;
134} 132}
135 133
136static inline unsigned int get_tlb_tlbsel( 134static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu)
137 const struct kvmppc_vcpu_e500 *vcpu_e500)
138{ 135{
139 /* 136 /*
140 * Manual says that tlbsel has 2 bits wide. 137 * Manual says that tlbsel has 2 bits wide.
141 * Since we only have two TLBs, only lower bit is used. 138 * Since we only have two TLBs, only lower bit is used.
142 */ 139 */
143 return (vcpu_e500->mas0 >> 28) & 0x1; 140 return (vcpu->arch.shared->mas0 >> 28) & 0x1;
144}
145
146static inline unsigned int get_tlb_nv_bit(
147 const struct kvmppc_vcpu_e500 *vcpu_e500)
148{
149 return vcpu_e500->mas0 & 0xfff;
150} 141}
151 142
152static inline unsigned int get_tlb_esel_bit( 143static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu)
153 const struct kvmppc_vcpu_e500 *vcpu_e500)
154{ 144{
155 return (vcpu_e500->mas0 >> 16) & 0xfff; 145 return vcpu->arch.shared->mas0 & 0xfff;
156} 146}
157 147
158static inline unsigned int get_tlb_esel( 148static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu)
159 const struct kvmppc_vcpu_e500 *vcpu_e500,
160 int tlbsel)
161{ 149{
162 unsigned int esel = get_tlb_esel_bit(vcpu_e500); 150 return (vcpu->arch.shared->mas0 >> 16) & 0xfff;
163
164 if (tlbsel == 0) {
165 esel &= KVM_E500_TLB0_WAY_NUM_MASK;
166 esel |= ((vcpu_e500->mas2 >> 12) & KVM_E500_TLB0_WAY_SIZE_MASK)
167 << KVM_E500_TLB0_WAY_NUM_BIT;
168 } else {
169 esel &= KVM_E500_TLB1_SIZE - 1;
170 }
171
172 return esel;
173} 151}
174 152
175static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, 153static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
176 const struct tlbe *tlbe) 154 const struct kvm_book3e_206_tlb_entry *tlbe)
177{ 155{
178 gpa_t gpa; 156 gpa_t gpa;
179 157
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 141dce3c6810..968f40101883 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -13,6 +13,7 @@
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 * 14 *
15 * Copyright IBM Corp. 2007 15 * Copyright IBM Corp. 2007
16 * Copyright 2011 Freescale Semiconductor, Inc.
16 * 17 *
17 * Authors: Hollis Blanchard <hollisb@us.ibm.com> 18 * Authors: Hollis Blanchard <hollisb@us.ibm.com>
18 */ 19 */
@@ -69,54 +70,55 @@
69#define OP_STH 44 70#define OP_STH 44
70#define OP_STHU 45 71#define OP_STHU 45
71 72
72#ifdef CONFIG_PPC_BOOK3S
73static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu)
74{
75 return 1;
76}
77#else
78static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu)
79{
80 return vcpu->arch.tcr & TCR_DIE;
81}
82#endif
83
84void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) 73void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
85{ 74{
86 unsigned long dec_nsec; 75 unsigned long dec_nsec;
76 unsigned long long dec_time;
87 77
88 pr_debug("mtDEC: %x\n", vcpu->arch.dec); 78 pr_debug("mtDEC: %x\n", vcpu->arch.dec);
79 hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
80
89#ifdef CONFIG_PPC_BOOK3S 81#ifdef CONFIG_PPC_BOOK3S
90 /* mtdec lowers the interrupt line when positive. */ 82 /* mtdec lowers the interrupt line when positive. */
91 kvmppc_core_dequeue_dec(vcpu); 83 kvmppc_core_dequeue_dec(vcpu);
92 84
93 /* POWER4+ triggers a dec interrupt if the value is < 0 */ 85 /* POWER4+ triggers a dec interrupt if the value is < 0 */
94 if (vcpu->arch.dec & 0x80000000) { 86 if (vcpu->arch.dec & 0x80000000) {
95 hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
96 kvmppc_core_queue_dec(vcpu); 87 kvmppc_core_queue_dec(vcpu);
97 return; 88 return;
98 } 89 }
99#endif 90#endif
100 if (kvmppc_dec_enabled(vcpu)) { 91
101 /* The decrementer ticks at the same rate as the timebase, so 92#ifdef CONFIG_BOOKE
102 * that's how we convert the guest DEC value to the number of 93 /* On BOOKE, DEC = 0 is as good as decrementer not enabled */
103 * host ticks. */ 94 if (vcpu->arch.dec == 0)
104 95 return;
105 hrtimer_try_to_cancel(&vcpu->arch.dec_timer); 96#endif
106 dec_nsec = vcpu->arch.dec; 97
107 dec_nsec *= 1000; 98 /*
108 dec_nsec /= tb_ticks_per_usec; 99 * The decrementer ticks at the same rate as the timebase, so
109 hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), 100 * that's how we convert the guest DEC value to the number of
110 HRTIMER_MODE_REL); 101 * host ticks.
111 vcpu->arch.dec_jiffies = get_tb(); 102 */
112 } else { 103
113 hrtimer_try_to_cancel(&vcpu->arch.dec_timer); 104 dec_time = vcpu->arch.dec;
114 } 105 dec_time *= 1000;
106 do_div(dec_time, tb_ticks_per_usec);
107 dec_nsec = do_div(dec_time, NSEC_PER_SEC);
108 hrtimer_start(&vcpu->arch.dec_timer,
109 ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL);
110 vcpu->arch.dec_jiffies = get_tb();
115} 111}
116 112
117u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) 113u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb)
118{ 114{
119 u64 jd = tb - vcpu->arch.dec_jiffies; 115 u64 jd = tb - vcpu->arch.dec_jiffies;
116
117#ifdef CONFIG_BOOKE
118 if (vcpu->arch.dec < jd)
119 return 0;
120#endif
121
120 return vcpu->arch.dec - jd; 122 return vcpu->arch.dec - jd;
121} 123}
122 124
@@ -159,7 +161,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
159 case OP_TRAP_64: 161 case OP_TRAP_64:
160 kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP); 162 kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP);
161#else 163#else
162 kvmppc_core_queue_program(vcpu, vcpu->arch.esr | ESR_PTR); 164 kvmppc_core_queue_program(vcpu,
165 vcpu->arch.shared->esr | ESR_PTR);
163#endif 166#endif
164 advance = 0; 167 advance = 0;
165 break; 168 break;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 607fbdf24b84..00d7e345b3fe 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -39,7 +39,8 @@
39int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 39int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
40{ 40{
41 return !(v->arch.shared->msr & MSR_WE) || 41 return !(v->arch.shared->msr & MSR_WE) ||
42 !!(v->arch.pending_exceptions); 42 !!(v->arch.pending_exceptions) ||
43 v->requests;
43} 44}
44 45
45int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) 46int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
@@ -66,7 +67,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
66 vcpu->arch.magic_page_pa = param1; 67 vcpu->arch.magic_page_pa = param1;
67 vcpu->arch.magic_page_ea = param2; 68 vcpu->arch.magic_page_ea = param2;
68 69
69 r2 = KVM_MAGIC_FEAT_SR; 70 r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7;
70 71
71 r = HC_EV_SUCCESS; 72 r = HC_EV_SUCCESS;
72 break; 73 break;
@@ -171,8 +172,11 @@ void kvm_arch_check_processor_compat(void *rtn)
171 *(int *)rtn = kvmppc_core_check_processor_compat(); 172 *(int *)rtn = kvmppc_core_check_processor_compat();
172} 173}
173 174
174int kvm_arch_init_vm(struct kvm *kvm) 175int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
175{ 176{
177 if (type)
178 return -EINVAL;
179
176 return kvmppc_core_init_vm(kvm); 180 return kvmppc_core_init_vm(kvm);
177} 181}
178 182
@@ -208,17 +212,22 @@ int kvm_dev_ioctl_check_extension(long ext)
208 case KVM_CAP_PPC_BOOKE_SREGS: 212 case KVM_CAP_PPC_BOOKE_SREGS:
209#else 213#else
210 case KVM_CAP_PPC_SEGSTATE: 214 case KVM_CAP_PPC_SEGSTATE:
215 case KVM_CAP_PPC_HIOR:
211 case KVM_CAP_PPC_PAPR: 216 case KVM_CAP_PPC_PAPR:
212#endif 217#endif
213 case KVM_CAP_PPC_UNSET_IRQ: 218 case KVM_CAP_PPC_UNSET_IRQ:
214 case KVM_CAP_PPC_IRQ_LEVEL: 219 case KVM_CAP_PPC_IRQ_LEVEL:
215 case KVM_CAP_ENABLE_CAP: 220 case KVM_CAP_ENABLE_CAP:
221 case KVM_CAP_ONE_REG:
216 r = 1; 222 r = 1;
217 break; 223 break;
218#ifndef CONFIG_KVM_BOOK3S_64_HV 224#ifndef CONFIG_KVM_BOOK3S_64_HV
219 case KVM_CAP_PPC_PAIRED_SINGLES: 225 case KVM_CAP_PPC_PAIRED_SINGLES:
220 case KVM_CAP_PPC_OSI: 226 case KVM_CAP_PPC_OSI:
221 case KVM_CAP_PPC_GET_PVINFO: 227 case KVM_CAP_PPC_GET_PVINFO:
228#ifdef CONFIG_KVM_E500
229 case KVM_CAP_SW_TLB:
230#endif
222 r = 1; 231 r = 1;
223 break; 232 break;
224 case KVM_CAP_COALESCED_MMIO: 233 case KVM_CAP_COALESCED_MMIO:
@@ -238,7 +247,26 @@ int kvm_dev_ioctl_check_extension(long ext)
238 if (cpu_has_feature(CPU_FTR_ARCH_201)) 247 if (cpu_has_feature(CPU_FTR_ARCH_201))
239 r = 2; 248 r = 2;
240 break; 249 break;
250 case KVM_CAP_SYNC_MMU:
251 r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0;
252 break;
241#endif 253#endif
254 case KVM_CAP_NR_VCPUS:
255 /*
256 * Recommending a number of CPUs is somewhat arbitrary; we
257 * return the number of present CPUs for -HV (since a host
258 * will have secondary threads "offline"), and for other KVM
259 * implementations just count online CPUs.
260 */
261#ifdef CONFIG_KVM_BOOK3S_64_HV
262 r = num_present_cpus();
263#else
264 r = num_online_cpus();
265#endif
266 break;
267 case KVM_CAP_MAX_VCPUS:
268 r = KVM_MAX_VCPUS;
269 break;
242 default: 270 default:
243 r = 0; 271 r = 0;
244 break; 272 break;
@@ -253,6 +281,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
253 return -EINVAL; 281 return -EINVAL;
254} 282}
255 283
284void kvm_arch_free_memslot(struct kvm_memory_slot *free,
285 struct kvm_memory_slot *dont)
286{
287}
288
289int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
290{
291 return 0;
292}
293
256int kvm_arch_prepare_memory_region(struct kvm *kvm, 294int kvm_arch_prepare_memory_region(struct kvm *kvm,
257 struct kvm_memory_slot *memslot, 295 struct kvm_memory_slot *memslot,
258 struct kvm_memory_slot old, 296 struct kvm_memory_slot old,
@@ -279,9 +317,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
279{ 317{
280 struct kvm_vcpu *vcpu; 318 struct kvm_vcpu *vcpu;
281 vcpu = kvmppc_core_vcpu_create(kvm, id); 319 vcpu = kvmppc_core_vcpu_create(kvm, id);
282 vcpu->arch.wqp = &vcpu->wq; 320 if (!IS_ERR(vcpu)) {
283 if (!IS_ERR(vcpu)) 321 vcpu->arch.wqp = &vcpu->wq;
284 kvmppc_create_vcpu_debugfs(vcpu, id); 322 kvmppc_create_vcpu_debugfs(vcpu, id);
323 }
285 return vcpu; 324 return vcpu;
286} 325}
287 326
@@ -305,18 +344,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
305 return kvmppc_core_pending_dec(vcpu); 344 return kvmppc_core_pending_dec(vcpu);
306} 345}
307 346
308static void kvmppc_decrementer_func(unsigned long data)
309{
310 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
311
312 kvmppc_core_queue_dec(vcpu);
313
314 if (waitqueue_active(vcpu->arch.wqp)) {
315 wake_up_interruptible(vcpu->arch.wqp);
316 vcpu->stat.halt_wakeup++;
317 }
318}
319
320/* 347/*
321 * low level hrtimer wake routine. Because this runs in hardirq context 348 * low level hrtimer wake routine. Because this runs in hardirq context
322 * we schedule a tasklet to do the real work. 349 * we schedule a tasklet to do the real work.
@@ -431,20 +458,20 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
431 458
432 kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); 459 kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
433 460
434 switch (vcpu->arch.io_gpr & KVM_REG_EXT_MASK) { 461 switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) {
435 case KVM_REG_GPR: 462 case KVM_MMIO_REG_GPR:
436 kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); 463 kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
437 break; 464 break;
438 case KVM_REG_FPR: 465 case KVM_MMIO_REG_FPR:
439 vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; 466 vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
440 break; 467 break;
441#ifdef CONFIG_PPC_BOOK3S 468#ifdef CONFIG_PPC_BOOK3S
442 case KVM_REG_QPR: 469 case KVM_MMIO_REG_QPR:
443 vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; 470 vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
444 break; 471 break;
445 case KVM_REG_FQPR: 472 case KVM_MMIO_REG_FQPR:
446 vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; 473 vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
447 vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; 474 vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
448 break; 475 break;
449#endif 476#endif
450 default: 477 default:
@@ -553,8 +580,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
553 vcpu->arch.hcall_needed = 0; 580 vcpu->arch.hcall_needed = 0;
554 } 581 }
555 582
556 kvmppc_core_deliver_interrupts(vcpu);
557
558 r = kvmppc_vcpu_run(run, vcpu); 583 r = kvmppc_vcpu_run(run, vcpu);
559 584
560 if (vcpu->sigset_active) 585 if (vcpu->sigset_active)
@@ -563,6 +588,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
563 return r; 588 return r;
564} 589}
565 590
591void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
592{
593 int me;
594 int cpu = vcpu->cpu;
595
596 me = get_cpu();
597 if (waitqueue_active(vcpu->arch.wqp)) {
598 wake_up_interruptible(vcpu->arch.wqp);
599 vcpu->stat.halt_wakeup++;
600 } else if (cpu != me && cpu != -1) {
601 smp_send_reschedule(vcpu->cpu);
602 }
603 put_cpu();
604}
605
566int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) 606int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
567{ 607{
568 if (irq->irq == KVM_INTERRUPT_UNSET) { 608 if (irq->irq == KVM_INTERRUPT_UNSET) {
@@ -571,13 +611,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
571 } 611 }
572 612
573 kvmppc_core_queue_external(vcpu, irq); 613 kvmppc_core_queue_external(vcpu, irq);
574 614 kvm_vcpu_kick(vcpu);
575 if (waitqueue_active(vcpu->arch.wqp)) {
576 wake_up_interruptible(vcpu->arch.wqp);
577 vcpu->stat.halt_wakeup++;
578 } else if (vcpu->cpu != -1) {
579 smp_send_reschedule(vcpu->cpu);
580 }
581 615
582 return 0; 616 return 0;
583} 617}
@@ -599,6 +633,19 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
599 r = 0; 633 r = 0;
600 vcpu->arch.papr_enabled = true; 634 vcpu->arch.papr_enabled = true;
601 break; 635 break;
636#ifdef CONFIG_KVM_E500
637 case KVM_CAP_SW_TLB: {
638 struct kvm_config_tlb cfg;
639 void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0];
640
641 r = -EFAULT;
642 if (copy_from_user(&cfg, user_ptr, sizeof(cfg)))
643 break;
644
645 r = kvm_vcpu_ioctl_config_tlb(vcpu, &cfg);
646 break;
647 }
648#endif
602 default: 649 default:
603 r = -EINVAL; 650 r = -EINVAL;
604 break; 651 break;
@@ -648,6 +695,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
648 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); 695 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
649 break; 696 break;
650 } 697 }
698
699 case KVM_SET_ONE_REG:
700 case KVM_GET_ONE_REG:
701 {
702 struct kvm_one_reg reg;
703 r = -EFAULT;
704 if (copy_from_user(&reg, argp, sizeof(reg)))
705 goto out;
706 if (ioctl == KVM_SET_ONE_REG)
707 r = kvm_vcpu_ioctl_set_one_reg(vcpu, &reg);
708 else
709 r = kvm_vcpu_ioctl_get_one_reg(vcpu, &reg);
710 break;
711 }
712
713#ifdef CONFIG_KVM_E500
714 case KVM_DIRTY_TLB: {
715 struct kvm_dirty_tlb dirty;
716 r = -EFAULT;
717 if (copy_from_user(&dirty, argp, sizeof(dirty)))
718 goto out;
719 r = kvm_vcpu_ioctl_dirty_tlb(vcpu, &dirty);
720 break;
721 }
722#endif
723
651 default: 724 default:
652 r = -EINVAL; 725 r = -EINVAL;
653 } 726 }
@@ -656,6 +729,11 @@ out:
656 return r; 729 return r;
657} 730}
658 731
732int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
733{
734 return VM_FAULT_SIGBUS;
735}
736
659static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) 737static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
660{ 738{
661 u32 inst_lis = 0x3c000000; 739 u32 inst_lis = 0x3c000000;
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index b135d3d397db..877186b7b1c3 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -118,11 +118,14 @@ TRACE_EVENT(kvm_book3s_exit,
118 ), 118 ),
119 119
120 TP_fast_assign( 120 TP_fast_assign(
121 struct kvmppc_book3s_shadow_vcpu *svcpu;
121 __entry->exit_nr = exit_nr; 122 __entry->exit_nr = exit_nr;
122 __entry->pc = kvmppc_get_pc(vcpu); 123 __entry->pc = kvmppc_get_pc(vcpu);
123 __entry->dar = kvmppc_get_fault_dar(vcpu); 124 __entry->dar = kvmppc_get_fault_dar(vcpu);
124 __entry->msr = vcpu->arch.shared->msr; 125 __entry->msr = vcpu->arch.shared->msr;
125 __entry->srr1 = to_svcpu(vcpu)->shadow_srr1; 126 svcpu = svcpu_get(vcpu);
127 __entry->srr1 = svcpu->shadow_srr1;
128 svcpu_put(svcpu);
126 ), 129 ),
127 130
128 TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx", 131 TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx",
@@ -337,6 +340,63 @@ TRACE_EVENT(kvm_book3s_slbmte,
337 340
338#endif /* CONFIG_PPC_BOOK3S */ 341#endif /* CONFIG_PPC_BOOK3S */
339 342
343
344/*************************************************************************
345 * Book3E trace points *
346 *************************************************************************/
347
348#ifdef CONFIG_BOOKE
349
350TRACE_EVENT(kvm_booke206_stlb_write,
351 TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3),
352 TP_ARGS(mas0, mas8, mas1, mas2, mas7_3),
353
354 TP_STRUCT__entry(
355 __field( __u32, mas0 )
356 __field( __u32, mas8 )
357 __field( __u32, mas1 )
358 __field( __u64, mas2 )
359 __field( __u64, mas7_3 )
360 ),
361
362 TP_fast_assign(
363 __entry->mas0 = mas0;
364 __entry->mas8 = mas8;
365 __entry->mas1 = mas1;
366 __entry->mas2 = mas2;
367 __entry->mas7_3 = mas7_3;
368 ),
369
370 TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx",
371 __entry->mas0, __entry->mas8, __entry->mas1,
372 __entry->mas2, __entry->mas7_3)
373);
374
375TRACE_EVENT(kvm_booke206_gtlb_write,
376 TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3),
377 TP_ARGS(mas0, mas1, mas2, mas7_3),
378
379 TP_STRUCT__entry(
380 __field( __u32, mas0 )
381 __field( __u32, mas1 )
382 __field( __u64, mas2 )
383 __field( __u64, mas7_3 )
384 ),
385
386 TP_fast_assign(
387 __entry->mas0 = mas0;
388 __entry->mas1 = mas1;
389 __entry->mas2 = mas2;
390 __entry->mas7_3 = mas7_3;
391 ),
392
393 TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx",
394 __entry->mas0, __entry->mas1,
395 __entry->mas2, __entry->mas7_3)
396);
397
398#endif
399
340#endif /* _TRACE_KVM_H */ 400#endif /* _TRACE_KVM_H */
341 401
342/* This part must be outside protection */ 402/* This part must be outside protection */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a3e628727697..fb05b123218f 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -12,6 +12,7 @@
12#include <linux/io.h> 12#include <linux/io.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/hugetlb.h> 14#include <linux/hugetlb.h>
15#include <linux/export.h>
15#include <linux/of_fdt.h> 16#include <linux/of_fdt.h>
16#include <linux/memblock.h> 17#include <linux/memblock.h>
17#include <linux/bootmem.h> 18#include <linux/bootmem.h>
@@ -103,6 +104,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
103 *shift = hugepd_shift(*hpdp); 104 *shift = hugepd_shift(*hpdp);
104 return hugepte_offset(hpdp, ea, pdshift); 105 return hugepte_offset(hpdp, ea, pdshift);
105} 106}
107EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
106 108
107pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 109pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
108{ 110{
diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h
index 82b32a100c7d..96076676e224 100644
--- a/arch/s390/include/asm/kvm.h
+++ b/arch/s390/include/asm/kvm.h
@@ -41,4 +41,15 @@ struct kvm_debug_exit_arch {
41struct kvm_guest_debug_arch { 41struct kvm_guest_debug_arch {
42}; 42};
43 43
44#define KVM_SYNC_PREFIX (1UL << 0)
45#define KVM_SYNC_GPRS (1UL << 1)
46#define KVM_SYNC_ACRS (1UL << 2)
47#define KVM_SYNC_CRS (1UL << 3)
48/* definition of registers in kvm_run */
49struct kvm_sync_regs {
50 __u64 prefix; /* prefix register */
51 __u64 gprs[16]; /* general purpose registers */
52 __u32 acrs[16]; /* access registers */
53 __u64 crs[16]; /* control registers */
54};
44#endif 55#endif
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index b0c235cb6ad5..7343872890a2 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -220,18 +220,17 @@ struct kvm_s390_float_interrupt {
220 struct list_head list; 220 struct list_head list;
221 atomic_t active; 221 atomic_t active;
222 int next_rr_cpu; 222 int next_rr_cpu;
223 unsigned long idle_mask [(64 + sizeof(long) - 1) / sizeof(long)]; 223 unsigned long idle_mask[(KVM_MAX_VCPUS + sizeof(long) - 1)
224 struct kvm_s390_local_interrupt *local_int[64]; 224 / sizeof(long)];
225 struct kvm_s390_local_interrupt *local_int[KVM_MAX_VCPUS];
225}; 226};
226 227
227 228
228struct kvm_vcpu_arch { 229struct kvm_vcpu_arch {
229 struct kvm_s390_sie_block *sie_block; 230 struct kvm_s390_sie_block *sie_block;
230 unsigned long guest_gprs[16];
231 s390_fp_regs host_fpregs; 231 s390_fp_regs host_fpregs;
232 unsigned int host_acrs[NUM_ACRS]; 232 unsigned int host_acrs[NUM_ACRS];
233 s390_fp_regs guest_fpregs; 233 s390_fp_regs guest_fpregs;
234 unsigned int guest_acrs[NUM_ACRS];
235 struct kvm_s390_local_interrupt local_int; 234 struct kvm_s390_local_interrupt local_int;
236 struct hrtimer ckc_timer; 235 struct hrtimer ckc_timer;
237 struct tasklet_struct tasklet; 236 struct tasklet_struct tasklet;
@@ -246,6 +245,9 @@ struct kvm_vm_stat {
246 u32 remote_tlb_flush; 245 u32 remote_tlb_flush;
247}; 246};
248 247
248struct kvm_arch_memory_slot {
249};
250
249struct kvm_arch{ 251struct kvm_arch{
250 struct sca_block *sca; 252 struct sca_block *sca;
251 debug_info_t *dbf; 253 debug_info_t *dbf;
@@ -253,5 +255,5 @@ struct kvm_arch{
253 struct gmap *gmap; 255 struct gmap *gmap;
254}; 256};
255 257
256extern int sie64a(struct kvm_s390_sie_block *, unsigned long *); 258extern int sie64a(struct kvm_s390_sie_block *, u64 *);
257#endif 259#endif
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index a21634173a66..78eb9847008f 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -34,6 +34,15 @@ config KVM
34 34
35 If unsure, say N. 35 If unsure, say N.
36 36
37config KVM_S390_UCONTROL
38 bool "Userspace controlled virtual machines"
39 depends on KVM
40 ---help---
41 Allow CAP_SYS_ADMIN users to create KVM virtual machines that are
42 controlled by userspace.
43
44 If unsure, say N.
45
37# OK, it's a little counter-intuitive to do this, but it puts it neatly under 46# OK, it's a little counter-intuitive to do this, but it puts it neatly under
38# the virtualization menu. 47# the virtualization menu.
39source drivers/vhost/Kconfig 48source drivers/vhost/Kconfig
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 8943e82cd4d9..a353f0ea45c2 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -20,8 +20,8 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
20 unsigned long start, end; 20 unsigned long start, end;
21 unsigned long prefix = vcpu->arch.sie_block->prefix; 21 unsigned long prefix = vcpu->arch.sie_block->prefix;
22 22
23 start = vcpu->arch.guest_gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; 23 start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
24 end = vcpu->arch.guest_gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096; 24 end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096;
25 25
26 if (start & ~PAGE_MASK || end & ~PAGE_MASK || start > end 26 if (start & ~PAGE_MASK || end & ~PAGE_MASK || start > end
27 || start < 2 * PAGE_SIZE) 27 || start < 2 * PAGE_SIZE)
@@ -56,7 +56,7 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
56static int __diag_ipl_functions(struct kvm_vcpu *vcpu) 56static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
57{ 57{
58 unsigned int reg = vcpu->arch.sie_block->ipa & 0xf; 58 unsigned int reg = vcpu->arch.sie_block->ipa & 0xf;
59 unsigned long subcode = vcpu->arch.guest_gprs[reg] & 0xffff; 59 unsigned long subcode = vcpu->run->s.regs.gprs[reg] & 0xffff;
60 60
61 VCPU_EVENT(vcpu, 5, "diag ipl functions, subcode %lx", subcode); 61 VCPU_EVENT(vcpu, 5, "diag ipl functions, subcode %lx", subcode);
62 switch (subcode) { 62 switch (subcode) {
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 02434543eabb..361456577c6f 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -36,7 +36,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
36 36
37 useraddr = disp2; 37 useraddr = disp2;
38 if (base2) 38 if (base2)
39 useraddr += vcpu->arch.guest_gprs[base2]; 39 useraddr += vcpu->run->s.regs.gprs[base2];
40 40
41 if (useraddr & 7) 41 if (useraddr & 7)
42 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 42 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -75,7 +75,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
75 75
76 useraddr = disp2; 76 useraddr = disp2;
77 if (base2) 77 if (base2)
78 useraddr += vcpu->arch.guest_gprs[base2]; 78 useraddr += vcpu->run->s.regs.gprs[base2];
79 79
80 if (useraddr & 3) 80 if (useraddr & 3)
81 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 81 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -133,13 +133,6 @@ static int handle_stop(struct kvm_vcpu *vcpu)
133 133
134 vcpu->stat.exit_stop_request++; 134 vcpu->stat.exit_stop_request++;
135 spin_lock_bh(&vcpu->arch.local_int.lock); 135 spin_lock_bh(&vcpu->arch.local_int.lock);
136 if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) {
137 vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP;
138 rc = kvm_s390_vcpu_store_status(vcpu,
139 KVM_S390_STORE_STATUS_NOADDR);
140 if (rc >= 0)
141 rc = -EOPNOTSUPP;
142 }
143 136
144 if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) { 137 if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) {
145 vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP; 138 vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP;
@@ -155,7 +148,18 @@ static int handle_stop(struct kvm_vcpu *vcpu)
155 rc = -EOPNOTSUPP; 148 rc = -EOPNOTSUPP;
156 } 149 }
157 150
158 spin_unlock_bh(&vcpu->arch.local_int.lock); 151 if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) {
152 vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP;
153 /* store status must be called unlocked. Since local_int.lock
154 * only protects local_int.* and not guest memory we can give
155 * up the lock here */
156 spin_unlock_bh(&vcpu->arch.local_int.lock);
157 rc = kvm_s390_vcpu_store_status(vcpu,
158 KVM_S390_STORE_STATUS_NOADDR);
159 if (rc >= 0)
160 rc = -EOPNOTSUPP;
161 } else
162 spin_unlock_bh(&vcpu->arch.local_int.lock);
159 return rc; 163 return rc;
160} 164}
161 165
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f0647ce6da21..2d9f9a72bb81 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -236,8 +236,7 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
236 VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x", 236 VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x",
237 inti->prefix.address); 237 inti->prefix.address);
238 vcpu->stat.deliver_prefix_signal++; 238 vcpu->stat.deliver_prefix_signal++;
239 vcpu->arch.sie_block->prefix = inti->prefix.address; 239 kvm_s390_set_prefix(vcpu, inti->prefix.address);
240 vcpu->arch.sie_block->ihcpu = 0xffff;
241 break; 240 break;
242 241
243 case KVM_S390_RESTART: 242 case KVM_S390_RESTART:
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d1c445732451..17ad69d596fd 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -129,6 +129,10 @@ int kvm_dev_ioctl_check_extension(long ext)
129 case KVM_CAP_S390_PSW: 129 case KVM_CAP_S390_PSW:
130 case KVM_CAP_S390_GMAP: 130 case KVM_CAP_S390_GMAP:
131 case KVM_CAP_SYNC_MMU: 131 case KVM_CAP_SYNC_MMU:
132#ifdef CONFIG_KVM_S390_UCONTROL
133 case KVM_CAP_S390_UCONTROL:
134#endif
135 case KVM_CAP_SYNC_REGS:
132 r = 1; 136 r = 1;
133 break; 137 break;
134 default: 138 default:
@@ -171,11 +175,22 @@ long kvm_arch_vm_ioctl(struct file *filp,
171 return r; 175 return r;
172} 176}
173 177
174int kvm_arch_init_vm(struct kvm *kvm) 178int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
175{ 179{
176 int rc; 180 int rc;
177 char debug_name[16]; 181 char debug_name[16];
178 182
183 rc = -EINVAL;
184#ifdef CONFIG_KVM_S390_UCONTROL
185 if (type & ~KVM_VM_S390_UCONTROL)
186 goto out_err;
187 if ((type & KVM_VM_S390_UCONTROL) && (!capable(CAP_SYS_ADMIN)))
188 goto out_err;
189#else
190 if (type)
191 goto out_err;
192#endif
193
179 rc = s390_enable_sie(); 194 rc = s390_enable_sie();
180 if (rc) 195 if (rc)
181 goto out_err; 196 goto out_err;
@@ -198,10 +213,13 @@ int kvm_arch_init_vm(struct kvm *kvm)
198 debug_register_view(kvm->arch.dbf, &debug_sprintf_view); 213 debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
199 VM_EVENT(kvm, 3, "%s", "vm created"); 214 VM_EVENT(kvm, 3, "%s", "vm created");
200 215
201 kvm->arch.gmap = gmap_alloc(current->mm); 216 if (type & KVM_VM_S390_UCONTROL) {
202 if (!kvm->arch.gmap) 217 kvm->arch.gmap = NULL;
203 goto out_nogmap; 218 } else {
204 219 kvm->arch.gmap = gmap_alloc(current->mm);
220 if (!kvm->arch.gmap)
221 goto out_nogmap;
222 }
205 return 0; 223 return 0;
206out_nogmap: 224out_nogmap:
207 debug_unregister(kvm->arch.dbf); 225 debug_unregister(kvm->arch.dbf);
@@ -214,11 +232,18 @@ out_err:
214void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 232void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
215{ 233{
216 VCPU_EVENT(vcpu, 3, "%s", "free cpu"); 234 VCPU_EVENT(vcpu, 3, "%s", "free cpu");
217 clear_bit(63 - vcpu->vcpu_id, (unsigned long *) &vcpu->kvm->arch.sca->mcn); 235 if (!kvm_is_ucontrol(vcpu->kvm)) {
218 if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda == 236 clear_bit(63 - vcpu->vcpu_id,
219 (__u64) vcpu->arch.sie_block) 237 (unsigned long *) &vcpu->kvm->arch.sca->mcn);
220 vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0; 238 if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda ==
239 (__u64) vcpu->arch.sie_block)
240 vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
241 }
221 smp_mb(); 242 smp_mb();
243
244 if (kvm_is_ucontrol(vcpu->kvm))
245 gmap_free(vcpu->arch.gmap);
246
222 free_page((unsigned long)(vcpu->arch.sie_block)); 247 free_page((unsigned long)(vcpu->arch.sie_block));
223 kvm_vcpu_uninit(vcpu); 248 kvm_vcpu_uninit(vcpu);
224 kfree(vcpu); 249 kfree(vcpu);
@@ -249,13 +274,25 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
249 kvm_free_vcpus(kvm); 274 kvm_free_vcpus(kvm);
250 free_page((unsigned long)(kvm->arch.sca)); 275 free_page((unsigned long)(kvm->arch.sca));
251 debug_unregister(kvm->arch.dbf); 276 debug_unregister(kvm->arch.dbf);
252 gmap_free(kvm->arch.gmap); 277 if (!kvm_is_ucontrol(kvm))
278 gmap_free(kvm->arch.gmap);
253} 279}
254 280
255/* Section: vcpu related */ 281/* Section: vcpu related */
256int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 282int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
257{ 283{
284 if (kvm_is_ucontrol(vcpu->kvm)) {
285 vcpu->arch.gmap = gmap_alloc(current->mm);
286 if (!vcpu->arch.gmap)
287 return -ENOMEM;
288 return 0;
289 }
290
258 vcpu->arch.gmap = vcpu->kvm->arch.gmap; 291 vcpu->arch.gmap = vcpu->kvm->arch.gmap;
292 vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX |
293 KVM_SYNC_GPRS |
294 KVM_SYNC_ACRS |
295 KVM_SYNC_CRS;
259 return 0; 296 return 0;
260} 297}
261 298
@@ -270,7 +307,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
270 save_access_regs(vcpu->arch.host_acrs); 307 save_access_regs(vcpu->arch.host_acrs);
271 vcpu->arch.guest_fpregs.fpc &= FPC_VALID_MASK; 308 vcpu->arch.guest_fpregs.fpc &= FPC_VALID_MASK;
272 restore_fp_regs(&vcpu->arch.guest_fpregs); 309 restore_fp_regs(&vcpu->arch.guest_fpregs);
273 restore_access_regs(vcpu->arch.guest_acrs); 310 restore_access_regs(vcpu->run->s.regs.acrs);
274 gmap_enable(vcpu->arch.gmap); 311 gmap_enable(vcpu->arch.gmap);
275 atomic_set_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); 312 atomic_set_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
276} 313}
@@ -280,7 +317,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
280 atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); 317 atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
281 gmap_disable(vcpu->arch.gmap); 318 gmap_disable(vcpu->arch.gmap);
282 save_fp_regs(&vcpu->arch.guest_fpregs); 319 save_fp_regs(&vcpu->arch.guest_fpregs);
283 save_access_regs(vcpu->arch.guest_acrs); 320 save_access_regs(vcpu->run->s.regs.acrs);
284 restore_fp_regs(&vcpu->arch.host_fpregs); 321 restore_fp_regs(&vcpu->arch.host_fpregs);
285 restore_access_regs(vcpu->arch.host_acrs); 322 restore_access_regs(vcpu->arch.host_acrs);
286} 323}
@@ -290,8 +327,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
290 /* this equals initial cpu reset in pop, but we don't switch to ESA */ 327 /* this equals initial cpu reset in pop, but we don't switch to ESA */
291 vcpu->arch.sie_block->gpsw.mask = 0UL; 328 vcpu->arch.sie_block->gpsw.mask = 0UL;
292 vcpu->arch.sie_block->gpsw.addr = 0UL; 329 vcpu->arch.sie_block->gpsw.addr = 0UL;
293 vcpu->arch.sie_block->prefix = 0UL; 330 kvm_s390_set_prefix(vcpu, 0);
294 vcpu->arch.sie_block->ihcpu = 0xffff;
295 vcpu->arch.sie_block->cputm = 0UL; 331 vcpu->arch.sie_block->cputm = 0UL;
296 vcpu->arch.sie_block->ckc = 0UL; 332 vcpu->arch.sie_block->ckc = 0UL;
297 vcpu->arch.sie_block->todpr = 0; 333 vcpu->arch.sie_block->todpr = 0;
@@ -342,12 +378,19 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
342 goto out_free_cpu; 378 goto out_free_cpu;
343 379
344 vcpu->arch.sie_block->icpua = id; 380 vcpu->arch.sie_block->icpua = id;
345 BUG_ON(!kvm->arch.sca); 381 if (!kvm_is_ucontrol(kvm)) {
346 if (!kvm->arch.sca->cpu[id].sda) 382 if (!kvm->arch.sca) {
347 kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block; 383 WARN_ON_ONCE(1);
348 vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32); 384 goto out_free_cpu;
349 vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca; 385 }
350 set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn); 386 if (!kvm->arch.sca->cpu[id].sda)
387 kvm->arch.sca->cpu[id].sda =
388 (__u64) vcpu->arch.sie_block;
389 vcpu->arch.sie_block->scaoh =
390 (__u32)(((__u64)kvm->arch.sca) >> 32);
391 vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
392 set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn);
393 }
351 394
352 spin_lock_init(&vcpu->arch.local_int.lock); 395 spin_lock_init(&vcpu->arch.local_int.lock);
353 INIT_LIST_HEAD(&vcpu->arch.local_int.list); 396 INIT_LIST_HEAD(&vcpu->arch.local_int.list);
@@ -388,29 +431,29 @@ static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
388 431
389int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 432int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
390{ 433{
391 memcpy(&vcpu->arch.guest_gprs, &regs->gprs, sizeof(regs->gprs)); 434 memcpy(&vcpu->run->s.regs.gprs, &regs->gprs, sizeof(regs->gprs));
392 return 0; 435 return 0;
393} 436}
394 437
395int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 438int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
396{ 439{
397 memcpy(&regs->gprs, &vcpu->arch.guest_gprs, sizeof(regs->gprs)); 440 memcpy(&regs->gprs, &vcpu->run->s.regs.gprs, sizeof(regs->gprs));
398 return 0; 441 return 0;
399} 442}
400 443
401int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 444int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
402 struct kvm_sregs *sregs) 445 struct kvm_sregs *sregs)
403{ 446{
404 memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs)); 447 memcpy(&vcpu->run->s.regs.acrs, &sregs->acrs, sizeof(sregs->acrs));
405 memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs)); 448 memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
406 restore_access_regs(vcpu->arch.guest_acrs); 449 restore_access_regs(vcpu->run->s.regs.acrs);
407 return 0; 450 return 0;
408} 451}
409 452
410int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 453int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
411 struct kvm_sregs *sregs) 454 struct kvm_sregs *sregs)
412{ 455{
413 memcpy(&sregs->acrs, &vcpu->arch.guest_acrs, sizeof(sregs->acrs)); 456 memcpy(&sregs->acrs, &vcpu->run->s.regs.acrs, sizeof(sregs->acrs));
414 memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs)); 457 memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs));
415 return 0; 458 return 0;
416} 459}
@@ -418,7 +461,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
418int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 461int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
419{ 462{
420 memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs)); 463 memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
421 vcpu->arch.guest_fpregs.fpc = fpu->fpc; 464 vcpu->arch.guest_fpregs.fpc = fpu->fpc & FPC_VALID_MASK;
422 restore_fp_regs(&vcpu->arch.guest_fpregs); 465 restore_fp_regs(&vcpu->arch.guest_fpregs);
423 return 0; 466 return 0;
424} 467}
@@ -467,9 +510,11 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
467 return -EINVAL; /* not implemented yet */ 510 return -EINVAL; /* not implemented yet */
468} 511}
469 512
470static void __vcpu_run(struct kvm_vcpu *vcpu) 513static int __vcpu_run(struct kvm_vcpu *vcpu)
471{ 514{
472 memcpy(&vcpu->arch.sie_block->gg14, &vcpu->arch.guest_gprs[14], 16); 515 int rc;
516
517 memcpy(&vcpu->arch.sie_block->gg14, &vcpu->run->s.regs.gprs[14], 16);
473 518
474 if (need_resched()) 519 if (need_resched())
475 schedule(); 520 schedule();
@@ -477,7 +522,8 @@ static void __vcpu_run(struct kvm_vcpu *vcpu)
477 if (test_thread_flag(TIF_MCCK_PENDING)) 522 if (test_thread_flag(TIF_MCCK_PENDING))
478 s390_handle_mcck(); 523 s390_handle_mcck();
479 524
480 kvm_s390_deliver_pending_interrupts(vcpu); 525 if (!kvm_is_ucontrol(vcpu->kvm))
526 kvm_s390_deliver_pending_interrupts(vcpu);
481 527
482 vcpu->arch.sie_block->icptcode = 0; 528 vcpu->arch.sie_block->icptcode = 0;
483 local_irq_disable(); 529 local_irq_disable();
@@ -485,9 +531,15 @@ static void __vcpu_run(struct kvm_vcpu *vcpu)
485 local_irq_enable(); 531 local_irq_enable();
486 VCPU_EVENT(vcpu, 6, "entering sie flags %x", 532 VCPU_EVENT(vcpu, 6, "entering sie flags %x",
487 atomic_read(&vcpu->arch.sie_block->cpuflags)); 533 atomic_read(&vcpu->arch.sie_block->cpuflags));
488 if (sie64a(vcpu->arch.sie_block, vcpu->arch.guest_gprs)) { 534 rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
489 VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); 535 if (rc) {
490 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 536 if (kvm_is_ucontrol(vcpu->kvm)) {
537 rc = SIE_INTERCEPT_UCONTROL;
538 } else {
539 VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
540 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
541 rc = 0;
542 }
491 } 543 }
492 VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", 544 VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
493 vcpu->arch.sie_block->icptcode); 545 vcpu->arch.sie_block->icptcode);
@@ -495,7 +547,8 @@ static void __vcpu_run(struct kvm_vcpu *vcpu)
495 kvm_guest_exit(); 547 kvm_guest_exit();
496 local_irq_enable(); 548 local_irq_enable();
497 549
498 memcpy(&vcpu->arch.guest_gprs[14], &vcpu->arch.sie_block->gg14, 16); 550 memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16);
551 return rc;
499} 552}
500 553
501int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 554int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -516,6 +569,7 @@ rerun_vcpu:
516 case KVM_EXIT_UNKNOWN: 569 case KVM_EXIT_UNKNOWN:
517 case KVM_EXIT_INTR: 570 case KVM_EXIT_INTR:
518 case KVM_EXIT_S390_RESET: 571 case KVM_EXIT_S390_RESET:
572 case KVM_EXIT_S390_UCONTROL:
519 break; 573 break;
520 default: 574 default:
521 BUG(); 575 BUG();
@@ -523,12 +577,26 @@ rerun_vcpu:
523 577
524 vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; 578 vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
525 vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; 579 vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
580 if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) {
581 kvm_run->kvm_dirty_regs &= ~KVM_SYNC_PREFIX;
582 kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix);
583 }
584 if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) {
585 kvm_run->kvm_dirty_regs &= ~KVM_SYNC_CRS;
586 memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128);
587 kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix);
588 }
526 589
527 might_fault(); 590 might_fault();
528 591
529 do { 592 do {
530 __vcpu_run(vcpu); 593 rc = __vcpu_run(vcpu);
531 rc = kvm_handle_sie_intercept(vcpu); 594 if (rc)
595 break;
596 if (kvm_is_ucontrol(vcpu->kvm))
597 rc = -EOPNOTSUPP;
598 else
599 rc = kvm_handle_sie_intercept(vcpu);
532 } while (!signal_pending(current) && !rc); 600 } while (!signal_pending(current) && !rc);
533 601
534 if (rc == SIE_INTERCEPT_RERUNVCPU) 602 if (rc == SIE_INTERCEPT_RERUNVCPU)
@@ -539,6 +607,16 @@ rerun_vcpu:
539 rc = -EINTR; 607 rc = -EINTR;
540 } 608 }
541 609
610#ifdef CONFIG_KVM_S390_UCONTROL
611 if (rc == SIE_INTERCEPT_UCONTROL) {
612 kvm_run->exit_reason = KVM_EXIT_S390_UCONTROL;
613 kvm_run->s390_ucontrol.trans_exc_code =
614 current->thread.gmap_addr;
615 kvm_run->s390_ucontrol.pgm_code = 0x10;
616 rc = 0;
617 }
618#endif
619
542 if (rc == -EOPNOTSUPP) { 620 if (rc == -EOPNOTSUPP) {
543 /* intercept cannot be handled in-kernel, prepare kvm-run */ 621 /* intercept cannot be handled in-kernel, prepare kvm-run */
544 kvm_run->exit_reason = KVM_EXIT_S390_SIEIC; 622 kvm_run->exit_reason = KVM_EXIT_S390_SIEIC;
@@ -556,6 +634,8 @@ rerun_vcpu:
556 634
557 kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; 635 kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask;
558 kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; 636 kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr;
637 kvm_run->s.regs.prefix = vcpu->arch.sie_block->prefix;
638 memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128);
559 639
560 if (vcpu->sigset_active) 640 if (vcpu->sigset_active)
561 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 641 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
@@ -602,7 +682,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
602 return -EFAULT; 682 return -EFAULT;
603 683
604 if (__guestcopy(vcpu, addr + offsetof(struct save_area, gp_regs), 684 if (__guestcopy(vcpu, addr + offsetof(struct save_area, gp_regs),
605 vcpu->arch.guest_gprs, 128, prefix)) 685 vcpu->run->s.regs.gprs, 128, prefix))
606 return -EFAULT; 686 return -EFAULT;
607 687
608 if (__guestcopy(vcpu, addr + offsetof(struct save_area, psw), 688 if (__guestcopy(vcpu, addr + offsetof(struct save_area, psw),
@@ -631,7 +711,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
631 return -EFAULT; 711 return -EFAULT;
632 712
633 if (__guestcopy(vcpu, addr + offsetof(struct save_area, acc_regs), 713 if (__guestcopy(vcpu, addr + offsetof(struct save_area, acc_regs),
634 &vcpu->arch.guest_acrs, 64, prefix)) 714 &vcpu->run->s.regs.acrs, 64, prefix))
635 return -EFAULT; 715 return -EFAULT;
636 716
637 if (__guestcopy(vcpu, 717 if (__guestcopy(vcpu,
@@ -673,12 +753,77 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
673 case KVM_S390_INITIAL_RESET: 753 case KVM_S390_INITIAL_RESET:
674 r = kvm_arch_vcpu_ioctl_initial_reset(vcpu); 754 r = kvm_arch_vcpu_ioctl_initial_reset(vcpu);
675 break; 755 break;
756#ifdef CONFIG_KVM_S390_UCONTROL
757 case KVM_S390_UCAS_MAP: {
758 struct kvm_s390_ucas_mapping ucasmap;
759
760 if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) {
761 r = -EFAULT;
762 break;
763 }
764
765 if (!kvm_is_ucontrol(vcpu->kvm)) {
766 r = -EINVAL;
767 break;
768 }
769
770 r = gmap_map_segment(vcpu->arch.gmap, ucasmap.user_addr,
771 ucasmap.vcpu_addr, ucasmap.length);
772 break;
773 }
774 case KVM_S390_UCAS_UNMAP: {
775 struct kvm_s390_ucas_mapping ucasmap;
776
777 if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) {
778 r = -EFAULT;
779 break;
780 }
781
782 if (!kvm_is_ucontrol(vcpu->kvm)) {
783 r = -EINVAL;
784 break;
785 }
786
787 r = gmap_unmap_segment(vcpu->arch.gmap, ucasmap.vcpu_addr,
788 ucasmap.length);
789 break;
790 }
791#endif
792 case KVM_S390_VCPU_FAULT: {
793 r = gmap_fault(arg, vcpu->arch.gmap);
794 if (!IS_ERR_VALUE(r))
795 r = 0;
796 break;
797 }
676 default: 798 default:
677 r = -EINVAL; 799 r = -ENOTTY;
678 } 800 }
679 return r; 801 return r;
680} 802}
681 803
804int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
805{
806#ifdef CONFIG_KVM_S390_UCONTROL
807 if ((vmf->pgoff == KVM_S390_SIE_PAGE_OFFSET)
808 && (kvm_is_ucontrol(vcpu->kvm))) {
809 vmf->page = virt_to_page(vcpu->arch.sie_block);
810 get_page(vmf->page);
811 return 0;
812 }
813#endif
814 return VM_FAULT_SIGBUS;
815}
816
817void kvm_arch_free_memslot(struct kvm_memory_slot *free,
818 struct kvm_memory_slot *dont)
819{
820}
821
822int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
823{
824 return 0;
825}
826
682/* Section: memory related */ 827/* Section: memory related */
683int kvm_arch_prepare_memory_region(struct kvm *kvm, 828int kvm_arch_prepare_memory_region(struct kvm *kvm,
684 struct kvm_memory_slot *memslot, 829 struct kvm_memory_slot *memslot,
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 99b0b7597115..ff28f9d1c9eb 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -26,6 +26,7 @@ typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
26 26
27/* negativ values are error codes, positive values for internal conditions */ 27/* negativ values are error codes, positive values for internal conditions */
28#define SIE_INTERCEPT_RERUNVCPU (1<<0) 28#define SIE_INTERCEPT_RERUNVCPU (1<<0)
29#define SIE_INTERCEPT_UCONTROL (1<<1)
29int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu); 30int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu);
30 31
31#define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ 32#define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\
@@ -47,6 +48,23 @@ static inline int __cpu_is_stopped(struct kvm_vcpu *vcpu)
47 return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOP_INT; 48 return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOP_INT;
48} 49}
49 50
51static inline int kvm_is_ucontrol(struct kvm *kvm)
52{
53#ifdef CONFIG_KVM_S390_UCONTROL
54 if (kvm->arch.gmap)
55 return 0;
56 return 1;
57#else
58 return 0;
59#endif
60}
61
62static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
63{
64 vcpu->arch.sie_block->prefix = prefix & 0x7fffe000u;
65 vcpu->arch.sie_block->ihcpu = 0xffff;
66}
67
50int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); 68int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
51enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); 69enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
52void kvm_s390_tasklet(unsigned long parm); 70void kvm_s390_tasklet(unsigned long parm);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index d02638959922..e5a45dbd26ac 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -33,7 +33,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
33 33
34 operand2 = disp2; 34 operand2 = disp2;
35 if (base2) 35 if (base2)
36 operand2 += vcpu->arch.guest_gprs[base2]; 36 operand2 += vcpu->run->s.regs.gprs[base2];
37 37
38 /* must be word boundary */ 38 /* must be word boundary */
39 if (operand2 & 3) { 39 if (operand2 & 3) {
@@ -56,8 +56,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
56 goto out; 56 goto out;
57 } 57 }
58 58
59 vcpu->arch.sie_block->prefix = address; 59 kvm_s390_set_prefix(vcpu, address);
60 vcpu->arch.sie_block->ihcpu = 0xffff;
61 60
62 VCPU_EVENT(vcpu, 5, "setting prefix to %x", address); 61 VCPU_EVENT(vcpu, 5, "setting prefix to %x", address);
63out: 62out:
@@ -74,7 +73,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
74 vcpu->stat.instruction_stpx++; 73 vcpu->stat.instruction_stpx++;
75 operand2 = disp2; 74 operand2 = disp2;
76 if (base2) 75 if (base2)
77 operand2 += vcpu->arch.guest_gprs[base2]; 76 operand2 += vcpu->run->s.regs.gprs[base2];
78 77
79 /* must be word boundary */ 78 /* must be word boundary */
80 if (operand2 & 3) { 79 if (operand2 & 3) {
@@ -106,7 +105,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
106 vcpu->stat.instruction_stap++; 105 vcpu->stat.instruction_stap++;
107 useraddr = disp2; 106 useraddr = disp2;
108 if (base2) 107 if (base2)
109 useraddr += vcpu->arch.guest_gprs[base2]; 108 useraddr += vcpu->run->s.regs.gprs[base2];
110 109
111 if (useraddr & 1) { 110 if (useraddr & 1) {
112 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 111 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -181,7 +180,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu)
181 vcpu->stat.instruction_stidp++; 180 vcpu->stat.instruction_stidp++;
182 operand2 = disp2; 181 operand2 = disp2;
183 if (base2) 182 if (base2)
184 operand2 += vcpu->arch.guest_gprs[base2]; 183 operand2 += vcpu->run->s.regs.gprs[base2];
185 184
186 if (operand2 & 7) { 185 if (operand2 & 7) {
187 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 186 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -232,9 +231,9 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
232 231
233static int handle_stsi(struct kvm_vcpu *vcpu) 232static int handle_stsi(struct kvm_vcpu *vcpu)
234{ 233{
235 int fc = (vcpu->arch.guest_gprs[0] & 0xf0000000) >> 28; 234 int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28;
236 int sel1 = vcpu->arch.guest_gprs[0] & 0xff; 235 int sel1 = vcpu->run->s.regs.gprs[0] & 0xff;
237 int sel2 = vcpu->arch.guest_gprs[1] & 0xffff; 236 int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff;
238 int base2 = vcpu->arch.sie_block->ipb >> 28; 237 int base2 = vcpu->arch.sie_block->ipb >> 28;
239 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); 238 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
240 u64 operand2; 239 u64 operand2;
@@ -245,14 +244,14 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
245 244
246 operand2 = disp2; 245 operand2 = disp2;
247 if (base2) 246 if (base2)
248 operand2 += vcpu->arch.guest_gprs[base2]; 247 operand2 += vcpu->run->s.regs.gprs[base2];
249 248
250 if (operand2 & 0xfff && fc > 0) 249 if (operand2 & 0xfff && fc > 0)
251 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 250 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
252 251
253 switch (fc) { 252 switch (fc) {
254 case 0: 253 case 0:
255 vcpu->arch.guest_gprs[0] = 3 << 28; 254 vcpu->run->s.regs.gprs[0] = 3 << 28;
256 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 255 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
257 return 0; 256 return 0;
258 case 1: /* same handling for 1 and 2 */ 257 case 1: /* same handling for 1 and 2 */
@@ -281,7 +280,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
281 } 280 }
282 free_page(mem); 281 free_page(mem);
283 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 282 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
284 vcpu->arch.guest_gprs[0] = 0; 283 vcpu->run->s.regs.gprs[0] = 0;
285 return 0; 284 return 0;
286out_mem: 285out_mem:
287 free_page(mem); 286 free_page(mem);
@@ -333,8 +332,8 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
333 int disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16; 332 int disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
334 int base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12; 333 int base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12;
335 int disp2 = vcpu->arch.sie_block->ipb & 0x0fff; 334 int disp2 = vcpu->arch.sie_block->ipb & 0x0fff;
336 u64 address1 = disp1 + base1 ? vcpu->arch.guest_gprs[base1] : 0; 335 u64 address1 = disp1 + base1 ? vcpu->run->s.regs.gprs[base1] : 0;
337 u64 address2 = disp2 + base2 ? vcpu->arch.guest_gprs[base2] : 0; 336 u64 address2 = disp2 + base2 ? vcpu->run->s.regs.gprs[base2] : 0;
338 struct vm_area_struct *vma; 337 struct vm_area_struct *vma;
339 unsigned long user_address; 338 unsigned long user_address;
340 339
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 0a7941d74bc6..0ad4cf238391 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -48,7 +48,7 @@
48 48
49 49
50static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, 50static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
51 unsigned long *reg) 51 u64 *reg)
52{ 52{
53 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; 53 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
54 int rc; 54 int rc;
@@ -160,12 +160,15 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
160 inti->type = KVM_S390_SIGP_STOP; 160 inti->type = KVM_S390_SIGP_STOP;
161 161
162 spin_lock_bh(&li->lock); 162 spin_lock_bh(&li->lock);
163 if ((atomic_read(li->cpuflags) & CPUSTAT_STOPPED))
164 goto out;
163 list_add_tail(&inti->list, &li->list); 165 list_add_tail(&inti->list, &li->list);
164 atomic_set(&li->active, 1); 166 atomic_set(&li->active, 1);
165 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); 167 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
166 li->action_bits |= action; 168 li->action_bits |= action;
167 if (waitqueue_active(&li->wq)) 169 if (waitqueue_active(&li->wq))
168 wake_up_interruptible(&li->wq); 170 wake_up_interruptible(&li->wq);
171out:
169 spin_unlock_bh(&li->lock); 172 spin_unlock_bh(&li->lock);
170 173
171 return 0; /* order accepted */ 174 return 0; /* order accepted */
@@ -220,7 +223,7 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
220} 223}
221 224
222static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, 225static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
223 unsigned long *reg) 226 u64 *reg)
224{ 227{
225 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; 228 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
226 struct kvm_s390_local_interrupt *li = NULL; 229 struct kvm_s390_local_interrupt *li = NULL;
@@ -278,7 +281,7 @@ out_fi:
278} 281}
279 282
280static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr, 283static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
281 unsigned long *reg) 284 u64 *reg)
282{ 285{
283 int rc; 286 int rc;
284 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; 287 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
@@ -309,6 +312,34 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
309 return rc; 312 return rc;
310} 313}
311 314
315static int __sigp_restart(struct kvm_vcpu *vcpu, u16 cpu_addr)
316{
317 int rc = 0;
318 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
319 struct kvm_s390_local_interrupt *li;
320
321 if (cpu_addr >= KVM_MAX_VCPUS)
322 return 3; /* not operational */
323
324 spin_lock(&fi->lock);
325 li = fi->local_int[cpu_addr];
326 if (li == NULL) {
327 rc = 3; /* not operational */
328 goto out;
329 }
330
331 spin_lock_bh(&li->lock);
332 if (li->action_bits & ACTION_STOP_ON_STOP)
333 rc = 2; /* busy */
334 else
335 VCPU_EVENT(vcpu, 4, "sigp restart %x to handle userspace",
336 cpu_addr);
337 spin_unlock_bh(&li->lock);
338out:
339 spin_unlock(&fi->lock);
340 return rc;
341}
342
312int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) 343int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
313{ 344{
314 int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; 345 int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
@@ -316,7 +347,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
316 int base2 = vcpu->arch.sie_block->ipb >> 28; 347 int base2 = vcpu->arch.sie_block->ipb >> 28;
317 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); 348 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
318 u32 parameter; 349 u32 parameter;
319 u16 cpu_addr = vcpu->arch.guest_gprs[r3]; 350 u16 cpu_addr = vcpu->run->s.regs.gprs[r3];
320 u8 order_code; 351 u8 order_code;
321 int rc; 352 int rc;
322 353
@@ -327,18 +358,18 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
327 358
328 order_code = disp2; 359 order_code = disp2;
329 if (base2) 360 if (base2)
330 order_code += vcpu->arch.guest_gprs[base2]; 361 order_code += vcpu->run->s.regs.gprs[base2];
331 362
332 if (r1 % 2) 363 if (r1 % 2)
333 parameter = vcpu->arch.guest_gprs[r1]; 364 parameter = vcpu->run->s.regs.gprs[r1];
334 else 365 else
335 parameter = vcpu->arch.guest_gprs[r1 + 1]; 366 parameter = vcpu->run->s.regs.gprs[r1 + 1];
336 367
337 switch (order_code) { 368 switch (order_code) {
338 case SIGP_SENSE: 369 case SIGP_SENSE:
339 vcpu->stat.instruction_sigp_sense++; 370 vcpu->stat.instruction_sigp_sense++;
340 rc = __sigp_sense(vcpu, cpu_addr, 371 rc = __sigp_sense(vcpu, cpu_addr,
341 &vcpu->arch.guest_gprs[r1]); 372 &vcpu->run->s.regs.gprs[r1]);
342 break; 373 break;
343 case SIGP_EXTERNAL_CALL: 374 case SIGP_EXTERNAL_CALL:
344 vcpu->stat.instruction_sigp_external_call++; 375 vcpu->stat.instruction_sigp_external_call++;
@@ -354,7 +385,8 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
354 break; 385 break;
355 case SIGP_STOP_STORE_STATUS: 386 case SIGP_STOP_STORE_STATUS:
356 vcpu->stat.instruction_sigp_stop++; 387 vcpu->stat.instruction_sigp_stop++;
357 rc = __sigp_stop(vcpu, cpu_addr, ACTION_STORE_ON_STOP); 388 rc = __sigp_stop(vcpu, cpu_addr, ACTION_STORE_ON_STOP |
389 ACTION_STOP_ON_STOP);
358 break; 390 break;
359 case SIGP_SET_ARCH: 391 case SIGP_SET_ARCH:
360 vcpu->stat.instruction_sigp_arch++; 392 vcpu->stat.instruction_sigp_arch++;
@@ -363,15 +395,18 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
363 case SIGP_SET_PREFIX: 395 case SIGP_SET_PREFIX:
364 vcpu->stat.instruction_sigp_prefix++; 396 vcpu->stat.instruction_sigp_prefix++;
365 rc = __sigp_set_prefix(vcpu, cpu_addr, parameter, 397 rc = __sigp_set_prefix(vcpu, cpu_addr, parameter,
366 &vcpu->arch.guest_gprs[r1]); 398 &vcpu->run->s.regs.gprs[r1]);
367 break; 399 break;
368 case SIGP_SENSE_RUNNING: 400 case SIGP_SENSE_RUNNING:
369 vcpu->stat.instruction_sigp_sense_running++; 401 vcpu->stat.instruction_sigp_sense_running++;
370 rc = __sigp_sense_running(vcpu, cpu_addr, 402 rc = __sigp_sense_running(vcpu, cpu_addr,
371 &vcpu->arch.guest_gprs[r1]); 403 &vcpu->run->s.regs.gprs[r1]);
372 break; 404 break;
373 case SIGP_RESTART: 405 case SIGP_RESTART:
374 vcpu->stat.instruction_sigp_restart++; 406 vcpu->stat.instruction_sigp_restart++;
407 rc = __sigp_restart(vcpu, cpu_addr);
408 if (rc == 2) /* busy */
409 break;
375 /* user space must know about restart */ 410 /* user space must know about restart */
376 default: 411 default:
377 return -EOPNOTSUPP; 412 return -EOPNOTSUPP;
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4d8dcbdfc120..e7d1c194d272 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -321,4 +321,8 @@ struct kvm_xcrs {
321 __u64 padding[16]; 321 __u64 padding[16];
322}; 322};
323 323
324/* definition of registers in kvm_run */
325struct kvm_sync_regs {
326};
327
324#endif /* _ASM_X86_KVM_H */ 328#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7b9cfc4878af..c222e1a1b12a 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -176,6 +176,7 @@ struct x86_emulate_ops {
176 void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); 176 void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
177 ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); 177 ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
178 int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); 178 int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
179 void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val);
179 int (*cpl)(struct x86_emulate_ctxt *ctxt); 180 int (*cpl)(struct x86_emulate_ctxt *ctxt);
180 int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); 181 int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
181 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); 182 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
@@ -388,7 +389,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
388#define EMULATION_INTERCEPTED 2 389#define EMULATION_INTERCEPTED 2
389int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); 390int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
390int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 391int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
391 u16 tss_selector, int reason, 392 u16 tss_selector, int idt_index, int reason,
392 bool has_error_code, u32 error_code); 393 bool has_error_code, u32 error_code);
393int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); 394int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
394#endif /* _ASM_X86_KVM_X86_EMULATE_H */ 395#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 52d6640a5ca1..e216ba066e79 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -29,7 +29,7 @@
29#include <asm/msr-index.h> 29#include <asm/msr-index.h>
30 30
31#define KVM_MAX_VCPUS 254 31#define KVM_MAX_VCPUS 254
32#define KVM_SOFT_MAX_VCPUS 64 32#define KVM_SOFT_MAX_VCPUS 160
33#define KVM_MEMORY_SLOTS 32 33#define KVM_MEMORY_SLOTS 32
34/* memory slots that does not exposed to userspace */ 34/* memory slots that does not exposed to userspace */
35#define KVM_PRIVATE_MEM_SLOTS 4 35#define KVM_PRIVATE_MEM_SLOTS 4
@@ -181,13 +181,6 @@ struct kvm_mmu_memory_cache {
181 void *objects[KVM_NR_MEM_OBJS]; 181 void *objects[KVM_NR_MEM_OBJS];
182}; 182};
183 183
184#define NR_PTE_CHAIN_ENTRIES 5
185
186struct kvm_pte_chain {
187 u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
188 struct hlist_node link;
189};
190
191/* 184/*
192 * kvm_mmu_page_role, below, is defined as: 185 * kvm_mmu_page_role, below, is defined as:
193 * 186 *
@@ -427,12 +420,16 @@ struct kvm_vcpu_arch {
427 420
428 u64 last_guest_tsc; 421 u64 last_guest_tsc;
429 u64 last_kernel_ns; 422 u64 last_kernel_ns;
430 u64 last_tsc_nsec; 423 u64 last_host_tsc;
431 u64 last_tsc_write; 424 u64 tsc_offset_adjustment;
432 u32 virtual_tsc_khz; 425 u64 this_tsc_nsec;
426 u64 this_tsc_write;
427 u8 this_tsc_generation;
433 bool tsc_catchup; 428 bool tsc_catchup;
434 u32 tsc_catchup_mult; 429 bool tsc_always_catchup;
435 s8 tsc_catchup_shift; 430 s8 virtual_tsc_shift;
431 u32 virtual_tsc_mult;
432 u32 virtual_tsc_khz;
436 433
437 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ 434 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
438 unsigned nmi_pending; /* NMI queued after currently running handler */ 435 unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -478,6 +475,21 @@ struct kvm_vcpu_arch {
478 u32 id; 475 u32 id;
479 bool send_user_only; 476 bool send_user_only;
480 } apf; 477 } apf;
478
479 /* OSVW MSRs (AMD only) */
480 struct {
481 u64 length;
482 u64 status;
483 } osvw;
484};
485
486struct kvm_lpage_info {
487 unsigned long rmap_pde;
488 int write_count;
489};
490
491struct kvm_arch_memory_slot {
492 struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
481}; 493};
482 494
483struct kvm_arch { 495struct kvm_arch {
@@ -511,8 +523,12 @@ struct kvm_arch {
511 s64 kvmclock_offset; 523 s64 kvmclock_offset;
512 raw_spinlock_t tsc_write_lock; 524 raw_spinlock_t tsc_write_lock;
513 u64 last_tsc_nsec; 525 u64 last_tsc_nsec;
514 u64 last_tsc_offset;
515 u64 last_tsc_write; 526 u64 last_tsc_write;
527 u32 last_tsc_khz;
528 u64 cur_tsc_nsec;
529 u64 cur_tsc_write;
530 u64 cur_tsc_offset;
531 u8 cur_tsc_generation;
516 532
517 struct kvm_xen_hvm_config xen_hvm_config; 533 struct kvm_xen_hvm_config xen_hvm_config;
518 534
@@ -644,7 +660,7 @@ struct kvm_x86_ops {
644 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 660 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
645 int (*get_lpage_level)(void); 661 int (*get_lpage_level)(void);
646 bool (*rdtscp_supported)(void); 662 bool (*rdtscp_supported)(void);
647 void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); 663 void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
648 664
649 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 665 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
650 666
@@ -652,7 +668,7 @@ struct kvm_x86_ops {
652 668
653 bool (*has_wbinvd_exit)(void); 669 bool (*has_wbinvd_exit)(void);
654 670
655 void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz); 671 void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
656 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 672 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
657 673
658 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); 674 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
@@ -674,6 +690,17 @@ struct kvm_arch_async_pf {
674 690
675extern struct kvm_x86_ops *kvm_x86_ops; 691extern struct kvm_x86_ops *kvm_x86_ops;
676 692
693static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
694 s64 adjustment)
695{
696 kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false);
697}
698
699static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
700{
701 kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true);
702}
703
677int kvm_mmu_module_init(void); 704int kvm_mmu_module_init(void);
678void kvm_mmu_module_exit(void); 705void kvm_mmu_module_exit(void);
679 706
@@ -741,8 +768,8 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
741void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 768void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
742int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); 769int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
743 770
744int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 771int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
745 bool has_error_code, u32 error_code); 772 int reason, bool has_error_code, u32 error_code);
746 773
747int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 774int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
748int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 775int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index e8fb2c7a5f4f..2291895b1836 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -23,6 +23,7 @@
23#define ARCH_PERFMON_EVENTSEL_USR (1ULL << 16) 23#define ARCH_PERFMON_EVENTSEL_USR (1ULL << 16)
24#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17) 24#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17)
25#define ARCH_PERFMON_EVENTSEL_EDGE (1ULL << 18) 25#define ARCH_PERFMON_EVENTSEL_EDGE (1ULL << 18)
26#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL (1ULL << 19)
26#define ARCH_PERFMON_EVENTSEL_INT (1ULL << 20) 27#define ARCH_PERFMON_EVENTSEL_INT (1ULL << 20)
27#define ARCH_PERFMON_EVENTSEL_ANY (1ULL << 21) 28#define ARCH_PERFMON_EVENTSEL_ANY (1ULL << 21)
28#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) 29#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 15d99153a96d..c91e8b9d588b 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -61,7 +61,7 @@ extern void check_tsc_sync_source(int cpu);
61extern void check_tsc_sync_target(void); 61extern void check_tsc_sync_target(void);
62 62
63extern int notsc_setup(char *); 63extern int notsc_setup(char *);
64extern void save_sched_clock_state(void); 64extern void tsc_save_sched_clock_state(void);
65extern void restore_sched_clock_state(void); 65extern void tsc_restore_sched_clock_state(void);
66 66
67#endif /* _ASM_X86_TSC_H */ 67#endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 517d4767ffdd..baaca8defec8 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -145,9 +145,11 @@ struct x86_init_ops {
145/** 145/**
146 * struct x86_cpuinit_ops - platform specific cpu hotplug setups 146 * struct x86_cpuinit_ops - platform specific cpu hotplug setups
147 * @setup_percpu_clockev: set up the per cpu clock event device 147 * @setup_percpu_clockev: set up the per cpu clock event device
148 * @early_percpu_clock_init: early init of the per cpu clock event device
148 */ 149 */
149struct x86_cpuinit_ops { 150struct x86_cpuinit_ops {
150 void (*setup_percpu_clockev)(void); 151 void (*setup_percpu_clockev)(void);
152 void (*early_percpu_clock_init)(void);
151 void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); 153 void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
152}; 154};
153 155
@@ -160,6 +162,8 @@ struct x86_cpuinit_ops {
160 * @is_untracked_pat_range exclude from PAT logic 162 * @is_untracked_pat_range exclude from PAT logic
161 * @nmi_init enable NMI on cpus 163 * @nmi_init enable NMI on cpus
162 * @i8042_detect pre-detect if i8042 controller exists 164 * @i8042_detect pre-detect if i8042 controller exists
165 * @save_sched_clock_state: save state for sched_clock() on suspend
166 * @restore_sched_clock_state: restore state for sched_clock() on resume
163 */ 167 */
164struct x86_platform_ops { 168struct x86_platform_ops {
165 unsigned long (*calibrate_tsc)(void); 169 unsigned long (*calibrate_tsc)(void);
@@ -171,6 +175,8 @@ struct x86_platform_ops {
171 void (*nmi_init)(void); 175 void (*nmi_init)(void);
172 unsigned char (*get_nmi_reason)(void); 176 unsigned char (*get_nmi_reason)(void);
173 int (*i8042_detect)(void); 177 int (*i8042_detect)(void);
178 void (*save_sched_clock_state)(void);
179 void (*restore_sched_clock_state)(void);
174}; 180};
175 181
176struct pci_dev; 182struct pci_dev;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 44842d756b29..f8492da65bfc 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -136,6 +136,15 @@ int kvm_register_clock(char *txt)
136 return ret; 136 return ret;
137} 137}
138 138
139static void kvm_save_sched_clock_state(void)
140{
141}
142
143static void kvm_restore_sched_clock_state(void)
144{
145 kvm_register_clock("primary cpu clock, resume");
146}
147
139#ifdef CONFIG_X86_LOCAL_APIC 148#ifdef CONFIG_X86_LOCAL_APIC
140static void __cpuinit kvm_setup_secondary_clock(void) 149static void __cpuinit kvm_setup_secondary_clock(void)
141{ 150{
@@ -144,8 +153,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
144 * we shouldn't fail. 153 * we shouldn't fail.
145 */ 154 */
146 WARN_ON(kvm_register_clock("secondary cpu clock")); 155 WARN_ON(kvm_register_clock("secondary cpu clock"));
147 /* ok, done with our trickery, call native */
148 setup_secondary_APIC_clock();
149} 156}
150#endif 157#endif
151 158
@@ -194,9 +201,11 @@ void __init kvmclock_init(void)
194 x86_platform.get_wallclock = kvm_get_wallclock; 201 x86_platform.get_wallclock = kvm_get_wallclock;
195 x86_platform.set_wallclock = kvm_set_wallclock; 202 x86_platform.set_wallclock = kvm_set_wallclock;
196#ifdef CONFIG_X86_LOCAL_APIC 203#ifdef CONFIG_X86_LOCAL_APIC
197 x86_cpuinit.setup_percpu_clockev = 204 x86_cpuinit.early_percpu_clock_init =
198 kvm_setup_secondary_clock; 205 kvm_setup_secondary_clock;
199#endif 206#endif
207 x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
208 x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
200 machine_ops.shutdown = kvm_shutdown; 209 machine_ops.shutdown = kvm_shutdown;
201#ifdef CONFIG_KEXEC 210#ifdef CONFIG_KEXEC
202 machine_ops.crash_shutdown = kvm_crash_shutdown; 211 machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e578a79a3093..5104a2b685cf 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -255,6 +255,7 @@ notrace static void __cpuinit start_secondary(void *unused)
255 * most necessary things. 255 * most necessary things.
256 */ 256 */
257 cpu_init(); 257 cpu_init();
258 x86_cpuinit.early_percpu_clock_init();
258 preempt_disable(); 259 preempt_disable();
259 smp_callin(); 260 smp_callin();
260 261
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 183c5925a9fe..899a03f2d181 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -630,7 +630,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
630 630
631static unsigned long long cyc2ns_suspend; 631static unsigned long long cyc2ns_suspend;
632 632
633void save_sched_clock_state(void) 633void tsc_save_sched_clock_state(void)
634{ 634{
635 if (!sched_clock_stable) 635 if (!sched_clock_stable)
636 return; 636 return;
@@ -646,7 +646,7 @@ void save_sched_clock_state(void)
646 * that sched_clock() continues from the point where it was left off during 646 * that sched_clock() continues from the point where it was left off during
647 * suspend. 647 * suspend.
648 */ 648 */
649void restore_sched_clock_state(void) 649void tsc_restore_sched_clock_state(void)
650{ 650{
651 unsigned long long offset; 651 unsigned long long offset;
652 unsigned long flags; 652 unsigned long flags;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 947a06ccc673..e9f265fd79ae 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = {
91}; 91};
92 92
93struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 93struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
94 .early_percpu_clock_init = x86_init_noop,
94 .setup_percpu_clockev = setup_secondary_APIC_clock, 95 .setup_percpu_clockev = setup_secondary_APIC_clock,
95 .fixup_cpu_id = x86_default_fixup_cpu_id, 96 .fixup_cpu_id = x86_default_fixup_cpu_id,
96}; 97};
@@ -107,7 +108,9 @@ struct x86_platform_ops x86_platform = {
107 .is_untracked_pat_range = is_ISA_range, 108 .is_untracked_pat_range = is_ISA_range,
108 .nmi_init = default_nmi_init, 109 .nmi_init = default_nmi_init,
109 .get_nmi_reason = default_get_nmi_reason, 110 .get_nmi_reason = default_get_nmi_reason,
110 .i8042_detect = default_i8042_detect 111 .i8042_detect = default_i8042_detect,
112 .save_sched_clock_state = tsc_save_sched_clock_state,
113 .restore_sched_clock_state = tsc_restore_sched_clock_state,
111}; 114};
112 115
113EXPORT_SYMBOL_GPL(x86_platform); 116EXPORT_SYMBOL_GPL(x86_platform);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 89b02bfaaca5..9fed5bedaad6 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -236,7 +236,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
236 const u32 kvm_supported_word6_x86_features = 236 const u32 kvm_supported_word6_x86_features =
237 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | 237 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
238 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 238 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
239 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 239 F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
240 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); 240 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
241 241
242 /* cpuid 0xC0000001.edx */ 242 /* cpuid 0xC0000001.edx */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 5b97e1797a6d..26d1fb437eb5 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -43,4 +43,12 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
43 return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); 43 return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
44} 44}
45 45
46static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
47{
48 struct kvm_cpuid_entry2 *best;
49
50 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
51 return best && (best->ecx & bit(X86_FEATURE_OSVW));
52}
53
46#endif 54#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0982507b962a..83756223f8aa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -57,6 +57,7 @@
57#define OpDS 23ull /* DS */ 57#define OpDS 23ull /* DS */
58#define OpFS 24ull /* FS */ 58#define OpFS 24ull /* FS */
59#define OpGS 25ull /* GS */ 59#define OpGS 25ull /* GS */
60#define OpMem8 26ull /* 8-bit zero extended memory operand */
60 61
61#define OpBits 5 /* Width of operand field */ 62#define OpBits 5 /* Width of operand field */
62#define OpMask ((1ull << OpBits) - 1) 63#define OpMask ((1ull << OpBits) - 1)
@@ -101,6 +102,7 @@
101#define SrcAcc (OpAcc << SrcShift) 102#define SrcAcc (OpAcc << SrcShift)
102#define SrcImmU16 (OpImmU16 << SrcShift) 103#define SrcImmU16 (OpImmU16 << SrcShift)
103#define SrcDX (OpDX << SrcShift) 104#define SrcDX (OpDX << SrcShift)
105#define SrcMem8 (OpMem8 << SrcShift)
104#define SrcMask (OpMask << SrcShift) 106#define SrcMask (OpMask << SrcShift)
105#define BitOp (1<<11) 107#define BitOp (1<<11)
106#define MemAbs (1<<12) /* Memory operand is absolute displacement */ 108#define MemAbs (1<<12) /* Memory operand is absolute displacement */
@@ -858,8 +860,7 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
858} 860}
859 861
860static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 862static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
861 struct operand *op, 863 struct operand *op)
862 int inhibit_bytereg)
863{ 864{
864 unsigned reg = ctxt->modrm_reg; 865 unsigned reg = ctxt->modrm_reg;
865 int highbyte_regs = ctxt->rex_prefix == 0; 866 int highbyte_regs = ctxt->rex_prefix == 0;
@@ -876,7 +877,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
876 } 877 }
877 878
878 op->type = OP_REG; 879 op->type = OP_REG;
879 if ((ctxt->d & ByteOp) && !inhibit_bytereg) { 880 if (ctxt->d & ByteOp) {
880 op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs); 881 op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
881 op->bytes = 1; 882 op->bytes = 1;
882 } else { 883 } else {
@@ -1151,6 +1152,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1151 return 1; 1152 return 1;
1152} 1153}
1153 1154
1155static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
1156 u16 index, struct desc_struct *desc)
1157{
1158 struct desc_ptr dt;
1159 ulong addr;
1160
1161 ctxt->ops->get_idt(ctxt, &dt);
1162
1163 if (dt.size < index * 8 + 7)
1164 return emulate_gp(ctxt, index << 3 | 0x2);
1165
1166 addr = dt.address + index * 8;
1167 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
1168 &ctxt->exception);
1169}
1170
1154static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, 1171static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1155 u16 selector, struct desc_ptr *dt) 1172 u16 selector, struct desc_ptr *dt)
1156{ 1173{
@@ -1227,6 +1244,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1227 seg_desc.type = 3; 1244 seg_desc.type = 3;
1228 seg_desc.p = 1; 1245 seg_desc.p = 1;
1229 seg_desc.s = 1; 1246 seg_desc.s = 1;
1247 if (ctxt->mode == X86EMUL_MODE_VM86)
1248 seg_desc.dpl = 3;
1230 goto load; 1249 goto load;
1231 } 1250 }
1232 1251
@@ -1891,6 +1910,17 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1891 ss->p = 1; 1910 ss->p = 1;
1892} 1911}
1893 1912
1913static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
1914{
1915 u32 eax, ebx, ecx, edx;
1916
1917 eax = ecx = 0;
1918 return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)
1919 && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
1920 && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
1921 && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
1922}
1923
1894static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) 1924static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
1895{ 1925{
1896 struct x86_emulate_ops *ops = ctxt->ops; 1926 struct x86_emulate_ops *ops = ctxt->ops;
@@ -2007,6 +2037,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
2007 if (ctxt->mode == X86EMUL_MODE_REAL) 2037 if (ctxt->mode == X86EMUL_MODE_REAL)
2008 return emulate_gp(ctxt, 0); 2038 return emulate_gp(ctxt, 0);
2009 2039
2040 /*
2041 * Not recognized on AMD in compat mode (but is recognized in legacy
2042 * mode).
2043 */
2044 if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA)
2045 && !vendor_intel(ctxt))
2046 return emulate_ud(ctxt);
2047
2010 /* XXX sysenter/sysexit have not been tested in 64bit mode. 2048 /* XXX sysenter/sysexit have not been tested in 64bit mode.
2011 * Therefore, we inject an #UD. 2049 * Therefore, we inject an #UD.
2012 */ 2050 */
@@ -2306,6 +2344,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2306 return emulate_gp(ctxt, 0); 2344 return emulate_gp(ctxt, 0);
2307 ctxt->_eip = tss->eip; 2345 ctxt->_eip = tss->eip;
2308 ctxt->eflags = tss->eflags | 2; 2346 ctxt->eflags = tss->eflags | 2;
2347
2348 /* General purpose registers */
2309 ctxt->regs[VCPU_REGS_RAX] = tss->eax; 2349 ctxt->regs[VCPU_REGS_RAX] = tss->eax;
2310 ctxt->regs[VCPU_REGS_RCX] = tss->ecx; 2350 ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
2311 ctxt->regs[VCPU_REGS_RDX] = tss->edx; 2351 ctxt->regs[VCPU_REGS_RDX] = tss->edx;
@@ -2328,6 +2368,24 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2328 set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); 2368 set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
2329 2369
2330 /* 2370 /*
2371 * If we're switching between Protected Mode and VM86, we need to make
2372 * sure to update the mode before loading the segment descriptors so
2373 * that the selectors are interpreted correctly.
2374 *
2375 * Need to get rflags to the vcpu struct immediately because it
2376 * influences the CPL which is checked at least when loading the segment
2377 * descriptors and when pushing an error code to the new kernel stack.
2378 *
2379 * TODO Introduce a separate ctxt->ops->set_cpl callback
2380 */
2381 if (ctxt->eflags & X86_EFLAGS_VM)
2382 ctxt->mode = X86EMUL_MODE_VM86;
2383 else
2384 ctxt->mode = X86EMUL_MODE_PROT32;
2385
2386 ctxt->ops->set_rflags(ctxt, ctxt->eflags);
2387
2388 /*
2331 * Now load segment descriptors. If fault happenes at this stage 2389 * Now load segment descriptors. If fault happenes at this stage
2332 * it is handled in a context of new task 2390 * it is handled in a context of new task
2333 */ 2391 */
@@ -2401,7 +2459,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2401} 2459}
2402 2460
2403static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, 2461static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2404 u16 tss_selector, int reason, 2462 u16 tss_selector, int idt_index, int reason,
2405 bool has_error_code, u32 error_code) 2463 bool has_error_code, u32 error_code)
2406{ 2464{
2407 struct x86_emulate_ops *ops = ctxt->ops; 2465 struct x86_emulate_ops *ops = ctxt->ops;
@@ -2423,12 +2481,35 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2423 2481
2424 /* FIXME: check that next_tss_desc is tss */ 2482 /* FIXME: check that next_tss_desc is tss */
2425 2483
2426 if (reason != TASK_SWITCH_IRET) { 2484 /*
2427 if ((tss_selector & 3) > next_tss_desc.dpl || 2485 * Check privileges. The three cases are task switch caused by...
2428 ops->cpl(ctxt) > next_tss_desc.dpl) 2486 *
2429 return emulate_gp(ctxt, 0); 2487 * 1. jmp/call/int to task gate: Check against DPL of the task gate
2488 * 2. Exception/IRQ/iret: No check is performed
2489 * 3. jmp/call to TSS: Check agains DPL of the TSS
2490 */
2491 if (reason == TASK_SWITCH_GATE) {
2492 if (idt_index != -1) {
2493 /* Software interrupts */
2494 struct desc_struct task_gate_desc;
2495 int dpl;
2496
2497 ret = read_interrupt_descriptor(ctxt, idt_index,
2498 &task_gate_desc);
2499 if (ret != X86EMUL_CONTINUE)
2500 return ret;
2501
2502 dpl = task_gate_desc.dpl;
2503 if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
2504 return emulate_gp(ctxt, (idt_index << 3) | 0x2);
2505 }
2506 } else if (reason != TASK_SWITCH_IRET) {
2507 int dpl = next_tss_desc.dpl;
2508 if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
2509 return emulate_gp(ctxt, tss_selector);
2430 } 2510 }
2431 2511
2512
2432 desc_limit = desc_limit_scaled(&next_tss_desc); 2513 desc_limit = desc_limit_scaled(&next_tss_desc);
2433 if (!next_tss_desc.p || 2514 if (!next_tss_desc.p ||
2434 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || 2515 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
@@ -2481,7 +2562,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2481} 2562}
2482 2563
2483int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 2564int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2484 u16 tss_selector, int reason, 2565 u16 tss_selector, int idt_index, int reason,
2485 bool has_error_code, u32 error_code) 2566 bool has_error_code, u32 error_code)
2486{ 2567{
2487 int rc; 2568 int rc;
@@ -2489,7 +2570,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2489 ctxt->_eip = ctxt->eip; 2570 ctxt->_eip = ctxt->eip;
2490 ctxt->dst.type = OP_NONE; 2571 ctxt->dst.type = OP_NONE;
2491 2572
2492 rc = emulator_do_task_switch(ctxt, tss_selector, reason, 2573 rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
2493 has_error_code, error_code); 2574 has_error_code, error_code);
2494 2575
2495 if (rc == X86EMUL_CONTINUE) 2576 if (rc == X86EMUL_CONTINUE)
@@ -3514,13 +3595,13 @@ static struct opcode twobyte_table[256] = {
3514 I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), 3595 I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
3515 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), 3596 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
3516 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), 3597 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
3517 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 3598 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3518 /* 0xB8 - 0xBF */ 3599 /* 0xB8 - 0xBF */
3519 N, N, 3600 N, N,
3520 G(BitOp, group8), 3601 G(BitOp, group8),
3521 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), 3602 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
3522 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), 3603 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
3523 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 3604 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3524 /* 0xC0 - 0xCF */ 3605 /* 0xC0 - 0xCF */
3525 D2bv(DstMem | SrcReg | ModRM | Lock), 3606 D2bv(DstMem | SrcReg | ModRM | Lock),
3526 N, D(DstMem | SrcReg | ModRM | Mov), 3607 N, D(DstMem | SrcReg | ModRM | Mov),
@@ -3602,9 +3683,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
3602 3683
3603 switch (d) { 3684 switch (d) {
3604 case OpReg: 3685 case OpReg:
3605 decode_register_operand(ctxt, op, 3686 decode_register_operand(ctxt, op);
3606 op == &ctxt->dst &&
3607 ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
3608 break; 3687 break;
3609 case OpImmUByte: 3688 case OpImmUByte:
3610 rc = decode_imm(ctxt, op, 1, false); 3689 rc = decode_imm(ctxt, op, 1, false);
@@ -3656,6 +3735,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
3656 case OpImm: 3735 case OpImm:
3657 rc = decode_imm(ctxt, op, imm_size(ctxt), true); 3736 rc = decode_imm(ctxt, op, imm_size(ctxt), true);
3658 break; 3737 break;
3738 case OpMem8:
3739 ctxt->memop.bytes = 1;
3740 goto mem_common;
3659 case OpMem16: 3741 case OpMem16:
3660 ctxt->memop.bytes = 2; 3742 ctxt->memop.bytes = 2;
3661 goto mem_common; 3743 goto mem_common;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index b6a73537e1ef..81cf4fa4a2be 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -307,6 +307,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
307 if (val & 0x10) { 307 if (val & 0x10) {
308 s->init4 = val & 1; 308 s->init4 = val & 1;
309 s->last_irr = 0; 309 s->last_irr = 0;
310 s->irr &= s->elcr;
310 s->imr = 0; 311 s->imr = 0;
311 s->priority_add = 0; 312 s->priority_add = 0;
312 s->special_mask = 0; 313 s->special_mask = 0;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 31bfc6927bc0..858432287ab6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -433,7 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
433 break; 433 break;
434 434
435 case APIC_DM_INIT: 435 case APIC_DM_INIT:
436 if (level) { 436 if (!trig_mode || level) {
437 result = 1; 437 result = 1;
438 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 438 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
439 kvm_make_request(KVM_REQ_EVENT, vcpu); 439 kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -731,7 +731,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
731 u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; 731 u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
732 u64 ns = 0; 732 u64 ns = 0;
733 struct kvm_vcpu *vcpu = apic->vcpu; 733 struct kvm_vcpu *vcpu = apic->vcpu;
734 unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); 734 unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
735 unsigned long flags; 735 unsigned long flags;
736 736
737 if (unlikely(!tscdeadline || !this_tsc_khz)) 737 if (unlikely(!tscdeadline || !this_tsc_khz))
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 224b02c3cda9..4cb164268846 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -688,9 +688,8 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
688{ 688{
689 unsigned long idx; 689 unsigned long idx;
690 690
691 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 691 idx = gfn_to_index(gfn, slot->base_gfn, level);
692 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 692 return &slot->arch.lpage_info[level - 2][idx];
693 return &slot->lpage_info[level - 2][idx];
694} 693}
695 694
696static void account_shadowed(struct kvm *kvm, gfn_t gfn) 695static void account_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -946,7 +945,7 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
946 } 945 }
947} 946}
948 947
949static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level, 948static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
950 struct kvm_memory_slot *slot) 949 struct kvm_memory_slot *slot)
951{ 950{
952 struct kvm_lpage_info *linfo; 951 struct kvm_lpage_info *linfo;
@@ -966,7 +965,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
966 struct kvm_memory_slot *slot; 965 struct kvm_memory_slot *slot;
967 966
968 slot = gfn_to_memslot(kvm, gfn); 967 slot = gfn_to_memslot(kvm, gfn);
969 return __gfn_to_rmap(kvm, gfn, level, slot); 968 return __gfn_to_rmap(gfn, level, slot);
970} 969}
971 970
972static bool rmap_can_add(struct kvm_vcpu *vcpu) 971static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -988,7 +987,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
988 return pte_list_add(vcpu, spte, rmapp); 987 return pte_list_add(vcpu, spte, rmapp);
989} 988}
990 989
991static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 990static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
992{ 991{
993 return pte_list_next(rmapp, spte); 992 return pte_list_next(rmapp, spte);
994} 993}
@@ -1018,8 +1017,8 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
1018 u64 *spte; 1017 u64 *spte;
1019 int i, write_protected = 0; 1018 int i, write_protected = 0;
1020 1019
1021 rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot); 1020 rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
1022 spte = rmap_next(kvm, rmapp, NULL); 1021 spte = rmap_next(rmapp, NULL);
1023 while (spte) { 1022 while (spte) {
1024 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1023 BUG_ON(!(*spte & PT_PRESENT_MASK));
1025 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 1024 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
@@ -1027,14 +1026,14 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
1027 mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); 1026 mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
1028 write_protected = 1; 1027 write_protected = 1;
1029 } 1028 }
1030 spte = rmap_next(kvm, rmapp, spte); 1029 spte = rmap_next(rmapp, spte);
1031 } 1030 }
1032 1031
1033 /* check for huge page mappings */ 1032 /* check for huge page mappings */
1034 for (i = PT_DIRECTORY_LEVEL; 1033 for (i = PT_DIRECTORY_LEVEL;
1035 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 1034 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
1036 rmapp = __gfn_to_rmap(kvm, gfn, i, slot); 1035 rmapp = __gfn_to_rmap(gfn, i, slot);
1037 spte = rmap_next(kvm, rmapp, NULL); 1036 spte = rmap_next(rmapp, NULL);
1038 while (spte) { 1037 while (spte) {
1039 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1038 BUG_ON(!(*spte & PT_PRESENT_MASK));
1040 BUG_ON(!is_large_pte(*spte)); 1039 BUG_ON(!is_large_pte(*spte));
@@ -1045,7 +1044,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
1045 spte = NULL; 1044 spte = NULL;
1046 write_protected = 1; 1045 write_protected = 1;
1047 } 1046 }
1048 spte = rmap_next(kvm, rmapp, spte); 1047 spte = rmap_next(rmapp, spte);
1049 } 1048 }
1050 } 1049 }
1051 1050
@@ -1066,7 +1065,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1066 u64 *spte; 1065 u64 *spte;
1067 int need_tlb_flush = 0; 1066 int need_tlb_flush = 0;
1068 1067
1069 while ((spte = rmap_next(kvm, rmapp, NULL))) { 1068 while ((spte = rmap_next(rmapp, NULL))) {
1070 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1069 BUG_ON(!(*spte & PT_PRESENT_MASK));
1071 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 1070 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
1072 drop_spte(kvm, spte); 1071 drop_spte(kvm, spte);
@@ -1085,14 +1084,14 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1085 1084
1086 WARN_ON(pte_huge(*ptep)); 1085 WARN_ON(pte_huge(*ptep));
1087 new_pfn = pte_pfn(*ptep); 1086 new_pfn = pte_pfn(*ptep);
1088 spte = rmap_next(kvm, rmapp, NULL); 1087 spte = rmap_next(rmapp, NULL);
1089 while (spte) { 1088 while (spte) {
1090 BUG_ON(!is_shadow_present_pte(*spte)); 1089 BUG_ON(!is_shadow_present_pte(*spte));
1091 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 1090 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
1092 need_flush = 1; 1091 need_flush = 1;
1093 if (pte_write(*ptep)) { 1092 if (pte_write(*ptep)) {
1094 drop_spte(kvm, spte); 1093 drop_spte(kvm, spte);
1095 spte = rmap_next(kvm, rmapp, NULL); 1094 spte = rmap_next(rmapp, NULL);
1096 } else { 1095 } else {
1097 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); 1096 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
1098 new_spte |= (u64)new_pfn << PAGE_SHIFT; 1097 new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1102,7 +1101,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1102 new_spte &= ~shadow_accessed_mask; 1101 new_spte &= ~shadow_accessed_mask;
1103 mmu_spte_clear_track_bits(spte); 1102 mmu_spte_clear_track_bits(spte);
1104 mmu_spte_set(spte, new_spte); 1103 mmu_spte_set(spte, new_spte);
1105 spte = rmap_next(kvm, rmapp, spte); 1104 spte = rmap_next(rmapp, spte);
1106 } 1105 }
1107 } 1106 }
1108 if (need_flush) 1107 if (need_flush)
@@ -1176,7 +1175,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1176 if (!shadow_accessed_mask) 1175 if (!shadow_accessed_mask)
1177 return kvm_unmap_rmapp(kvm, rmapp, data); 1176 return kvm_unmap_rmapp(kvm, rmapp, data);
1178 1177
1179 spte = rmap_next(kvm, rmapp, NULL); 1178 spte = rmap_next(rmapp, NULL);
1180 while (spte) { 1179 while (spte) {
1181 int _young; 1180 int _young;
1182 u64 _spte = *spte; 1181 u64 _spte = *spte;
@@ -1186,7 +1185,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1186 young = 1; 1185 young = 1;
1187 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); 1186 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
1188 } 1187 }
1189 spte = rmap_next(kvm, rmapp, spte); 1188 spte = rmap_next(rmapp, spte);
1190 } 1189 }
1191 return young; 1190 return young;
1192} 1191}
@@ -1205,7 +1204,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1205 if (!shadow_accessed_mask) 1204 if (!shadow_accessed_mask)
1206 goto out; 1205 goto out;
1207 1206
1208 spte = rmap_next(kvm, rmapp, NULL); 1207 spte = rmap_next(rmapp, NULL);
1209 while (spte) { 1208 while (spte) {
1210 u64 _spte = *spte; 1209 u64 _spte = *spte;
1211 BUG_ON(!(_spte & PT_PRESENT_MASK)); 1210 BUG_ON(!(_spte & PT_PRESENT_MASK));
@@ -1214,7 +1213,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1214 young = 1; 1213 young = 1;
1215 break; 1214 break;
1216 } 1215 }
1217 spte = rmap_next(kvm, rmapp, spte); 1216 spte = rmap_next(rmapp, spte);
1218 } 1217 }
1219out: 1218out:
1220 return young; 1219 return young;
@@ -1391,11 +1390,6 @@ struct kvm_mmu_pages {
1391 unsigned int nr; 1390 unsigned int nr;
1392}; 1391};
1393 1392
1394#define for_each_unsync_children(bitmap, idx) \
1395 for (idx = find_first_bit(bitmap, 512); \
1396 idx < 512; \
1397 idx = find_next_bit(bitmap, 512, idx+1))
1398
1399static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, 1393static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1400 int idx) 1394 int idx)
1401{ 1395{
@@ -1417,7 +1411,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1417{ 1411{
1418 int i, ret, nr_unsync_leaf = 0; 1412 int i, ret, nr_unsync_leaf = 0;
1419 1413
1420 for_each_unsync_children(sp->unsync_child_bitmap, i) { 1414 for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
1421 struct kvm_mmu_page *child; 1415 struct kvm_mmu_page *child;
1422 u64 ent = sp->spt[i]; 1416 u64 ent = sp->spt[i];
1423 1417
@@ -1803,6 +1797,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1803{ 1797{
1804 if (is_large_pte(*sptep)) { 1798 if (is_large_pte(*sptep)) {
1805 drop_spte(vcpu->kvm, sptep); 1799 drop_spte(vcpu->kvm, sptep);
1800 --vcpu->kvm->stat.lpages;
1806 kvm_flush_remote_tlbs(vcpu->kvm); 1801 kvm_flush_remote_tlbs(vcpu->kvm);
1807 } 1802 }
1808} 1803}
@@ -3190,15 +3185,14 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
3190#undef PTTYPE 3185#undef PTTYPE
3191 3186
3192static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, 3187static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3193 struct kvm_mmu *context, 3188 struct kvm_mmu *context)
3194 int level)
3195{ 3189{
3196 int maxphyaddr = cpuid_maxphyaddr(vcpu); 3190 int maxphyaddr = cpuid_maxphyaddr(vcpu);
3197 u64 exb_bit_rsvd = 0; 3191 u64 exb_bit_rsvd = 0;
3198 3192
3199 if (!context->nx) 3193 if (!context->nx)
3200 exb_bit_rsvd = rsvd_bits(63, 63); 3194 exb_bit_rsvd = rsvd_bits(63, 63);
3201 switch (level) { 3195 switch (context->root_level) {
3202 case PT32_ROOT_LEVEL: 3196 case PT32_ROOT_LEVEL:
3203 /* no rsvd bits for 2 level 4K page table entries */ 3197 /* no rsvd bits for 2 level 4K page table entries */
3204 context->rsvd_bits_mask[0][1] = 0; 3198 context->rsvd_bits_mask[0][1] = 0;
@@ -3256,8 +3250,9 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3256 int level) 3250 int level)
3257{ 3251{
3258 context->nx = is_nx(vcpu); 3252 context->nx = is_nx(vcpu);
3253 context->root_level = level;
3259 3254
3260 reset_rsvds_bits_mask(vcpu, context, level); 3255 reset_rsvds_bits_mask(vcpu, context);
3261 3256
3262 ASSERT(is_pae(vcpu)); 3257 ASSERT(is_pae(vcpu));
3263 context->new_cr3 = paging_new_cr3; 3258 context->new_cr3 = paging_new_cr3;
@@ -3267,7 +3262,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3267 context->invlpg = paging64_invlpg; 3262 context->invlpg = paging64_invlpg;
3268 context->update_pte = paging64_update_pte; 3263 context->update_pte = paging64_update_pte;
3269 context->free = paging_free; 3264 context->free = paging_free;
3270 context->root_level = level;
3271 context->shadow_root_level = level; 3265 context->shadow_root_level = level;
3272 context->root_hpa = INVALID_PAGE; 3266 context->root_hpa = INVALID_PAGE;
3273 context->direct_map = false; 3267 context->direct_map = false;
@@ -3284,8 +3278,9 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
3284 struct kvm_mmu *context) 3278 struct kvm_mmu *context)
3285{ 3279{
3286 context->nx = false; 3280 context->nx = false;
3281 context->root_level = PT32_ROOT_LEVEL;
3287 3282
3288 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); 3283 reset_rsvds_bits_mask(vcpu, context);
3289 3284
3290 context->new_cr3 = paging_new_cr3; 3285 context->new_cr3 = paging_new_cr3;
3291 context->page_fault = paging32_page_fault; 3286 context->page_fault = paging32_page_fault;
@@ -3294,7 +3289,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
3294 context->sync_page = paging32_sync_page; 3289 context->sync_page = paging32_sync_page;
3295 context->invlpg = paging32_invlpg; 3290 context->invlpg = paging32_invlpg;
3296 context->update_pte = paging32_update_pte; 3291 context->update_pte = paging32_update_pte;
3297 context->root_level = PT32_ROOT_LEVEL;
3298 context->shadow_root_level = PT32E_ROOT_LEVEL; 3292 context->shadow_root_level = PT32E_ROOT_LEVEL;
3299 context->root_hpa = INVALID_PAGE; 3293 context->root_hpa = INVALID_PAGE;
3300 context->direct_map = false; 3294 context->direct_map = false;
@@ -3325,7 +3319,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3325 context->get_cr3 = get_cr3; 3319 context->get_cr3 = get_cr3;
3326 context->get_pdptr = kvm_pdptr_read; 3320 context->get_pdptr = kvm_pdptr_read;
3327 context->inject_page_fault = kvm_inject_page_fault; 3321 context->inject_page_fault = kvm_inject_page_fault;
3328 context->nx = is_nx(vcpu);
3329 3322
3330 if (!is_paging(vcpu)) { 3323 if (!is_paging(vcpu)) {
3331 context->nx = false; 3324 context->nx = false;
@@ -3333,19 +3326,19 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3333 context->root_level = 0; 3326 context->root_level = 0;
3334 } else if (is_long_mode(vcpu)) { 3327 } else if (is_long_mode(vcpu)) {
3335 context->nx = is_nx(vcpu); 3328 context->nx = is_nx(vcpu);
3336 reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
3337 context->gva_to_gpa = paging64_gva_to_gpa;
3338 context->root_level = PT64_ROOT_LEVEL; 3329 context->root_level = PT64_ROOT_LEVEL;
3330 reset_rsvds_bits_mask(vcpu, context);
3331 context->gva_to_gpa = paging64_gva_to_gpa;
3339 } else if (is_pae(vcpu)) { 3332 } else if (is_pae(vcpu)) {
3340 context->nx = is_nx(vcpu); 3333 context->nx = is_nx(vcpu);
3341 reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
3342 context->gva_to_gpa = paging64_gva_to_gpa;
3343 context->root_level = PT32E_ROOT_LEVEL; 3334 context->root_level = PT32E_ROOT_LEVEL;
3335 reset_rsvds_bits_mask(vcpu, context);
3336 context->gva_to_gpa = paging64_gva_to_gpa;
3344 } else { 3337 } else {
3345 context->nx = false; 3338 context->nx = false;
3346 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
3347 context->gva_to_gpa = paging32_gva_to_gpa;
3348 context->root_level = PT32_ROOT_LEVEL; 3339 context->root_level = PT32_ROOT_LEVEL;
3340 reset_rsvds_bits_mask(vcpu, context);
3341 context->gva_to_gpa = paging32_gva_to_gpa;
3349 } 3342 }
3350 3343
3351 return 0; 3344 return 0;
@@ -3408,18 +3401,18 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3408 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; 3401 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
3409 } else if (is_long_mode(vcpu)) { 3402 } else if (is_long_mode(vcpu)) {
3410 g_context->nx = is_nx(vcpu); 3403 g_context->nx = is_nx(vcpu);
3411 reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
3412 g_context->root_level = PT64_ROOT_LEVEL; 3404 g_context->root_level = PT64_ROOT_LEVEL;
3405 reset_rsvds_bits_mask(vcpu, g_context);
3413 g_context->gva_to_gpa = paging64_gva_to_gpa_nested; 3406 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3414 } else if (is_pae(vcpu)) { 3407 } else if (is_pae(vcpu)) {
3415 g_context->nx = is_nx(vcpu); 3408 g_context->nx = is_nx(vcpu);
3416 reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
3417 g_context->root_level = PT32E_ROOT_LEVEL; 3409 g_context->root_level = PT32E_ROOT_LEVEL;
3410 reset_rsvds_bits_mask(vcpu, g_context);
3418 g_context->gva_to_gpa = paging64_gva_to_gpa_nested; 3411 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3419 } else { 3412 } else {
3420 g_context->nx = false; 3413 g_context->nx = false;
3421 reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
3422 g_context->root_level = PT32_ROOT_LEVEL; 3414 g_context->root_level = PT32_ROOT_LEVEL;
3415 reset_rsvds_bits_mask(vcpu, g_context);
3423 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; 3416 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3424 } 3417 }
3425 3418
@@ -3555,7 +3548,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
3555 * If we're seeing too many writes to a page, it may no longer be a page table, 3548 * If we're seeing too many writes to a page, it may no longer be a page table,
3556 * or we may be forking, in which case it is better to unmap the page. 3549 * or we may be forking, in which case it is better to unmap the page.
3557 */ 3550 */
3558static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte) 3551static bool detect_write_flooding(struct kvm_mmu_page *sp)
3559{ 3552{
3560 /* 3553 /*
3561 * Skip write-flooding detected for the sp whose level is 1, because 3554 * Skip write-flooding detected for the sp whose level is 1, because
@@ -3664,10 +3657,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3664 3657
3665 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; 3658 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
3666 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { 3659 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
3667 spte = get_written_sptes(sp, gpa, &npte);
3668
3669 if (detect_write_misaligned(sp, gpa, bytes) || 3660 if (detect_write_misaligned(sp, gpa, bytes) ||
3670 detect_write_flooding(sp, spte)) { 3661 detect_write_flooding(sp)) {
3671 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 3662 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
3672 &invalid_list); 3663 &invalid_list);
3673 ++vcpu->kvm->stat.mmu_flooded; 3664 ++vcpu->kvm->stat.mmu_flooded;
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ea7b4fd34676..715da5a19a5b 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -200,13 +200,13 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
200 slot = gfn_to_memslot(kvm, sp->gfn); 200 slot = gfn_to_memslot(kvm, sp->gfn);
201 rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; 201 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
202 202
203 spte = rmap_next(kvm, rmapp, NULL); 203 spte = rmap_next(rmapp, NULL);
204 while (spte) { 204 while (spte) {
205 if (is_writable_pte(*spte)) 205 if (is_writable_pte(*spte))
206 audit_printk(kvm, "shadow page has writable " 206 audit_printk(kvm, "shadow page has writable "
207 "mappings: gfn %llx role %x\n", 207 "mappings: gfn %llx role %x\n",
208 sp->gfn, sp->role.word); 208 sp->gfn, sp->role.word);
209 spte = rmap_next(kvm, rmapp, spte); 209 spte = rmap_next(rmapp, spte);
210 } 210 }
211} 211}
212 212
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 7aad5446f393..a73f0c104813 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -33,10 +33,11 @@ static struct kvm_arch_event_perf_mapping {
33 [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, 33 [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
34 [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, 34 [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
35 [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, 35 [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
36 [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
36}; 37};
37 38
38/* mapping between fixed pmc index and arch_events array */ 39/* mapping between fixed pmc index and arch_events array */
39int fixed_pmc_events[] = {1, 0, 2}; 40int fixed_pmc_events[] = {1, 0, 7};
40 41
41static bool pmc_is_gp(struct kvm_pmc *pmc) 42static bool pmc_is_gp(struct kvm_pmc *pmc)
42{ 43{
@@ -210,6 +211,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
210 unsigned config, type = PERF_TYPE_RAW; 211 unsigned config, type = PERF_TYPE_RAW;
211 u8 event_select, unit_mask; 212 u8 event_select, unit_mask;
212 213
214 if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
215 printk_once("kvm pmu: pin control bit is ignored\n");
216
213 pmc->eventsel = eventsel; 217 pmc->eventsel = eventsel;
214 218
215 stop_counter(pmc); 219 stop_counter(pmc);
@@ -220,7 +224,7 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
220 event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; 224 event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
221 unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; 225 unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
222 226
223 if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE | 227 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
224 ARCH_PERFMON_EVENTSEL_INV | 228 ARCH_PERFMON_EVENTSEL_INV |
225 ARCH_PERFMON_EVENTSEL_CMASK))) { 229 ARCH_PERFMON_EVENTSEL_CMASK))) {
226 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, 230 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
@@ -413,7 +417,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
413 struct kvm_pmc *counters; 417 struct kvm_pmc *counters;
414 u64 ctr; 418 u64 ctr;
415 419
416 pmc &= (3u << 30) - 1; 420 pmc &= ~(3u << 30);
417 if (!fixed && pmc >= pmu->nr_arch_gp_counters) 421 if (!fixed && pmc >= pmu->nr_arch_gp_counters)
418 return 1; 422 return 1;
419 if (fixed && pmc >= pmu->nr_arch_fixed_counters) 423 if (fixed && pmc >= pmu->nr_arch_fixed_counters)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e385214711cb..e334389e1c75 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -111,6 +111,12 @@ struct nested_state {
111#define MSRPM_OFFSETS 16 111#define MSRPM_OFFSETS 16
112static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; 112static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
113 113
114/*
115 * Set osvw_len to higher value when updated Revision Guides
116 * are published and we know what the new status bits are
117 */
118static uint64_t osvw_len = 4, osvw_status;
119
114struct vcpu_svm { 120struct vcpu_svm {
115 struct kvm_vcpu vcpu; 121 struct kvm_vcpu vcpu;
116 struct vmcb *vmcb; 122 struct vmcb *vmcb;
@@ -177,11 +183,13 @@ static bool npt_enabled = true;
177#else 183#else
178static bool npt_enabled; 184static bool npt_enabled;
179#endif 185#endif
180static int npt = 1;
181 186
187/* allow nested paging (virtualized MMU) for all guests */
188static int npt = true;
182module_param(npt, int, S_IRUGO); 189module_param(npt, int, S_IRUGO);
183 190
184static int nested = 1; 191/* allow nested virtualization in KVM/SVM */
192static int nested = true;
185module_param(nested, int, S_IRUGO); 193module_param(nested, int, S_IRUGO);
186 194
187static void svm_flush_tlb(struct kvm_vcpu *vcpu); 195static void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@ -557,6 +565,27 @@ static void svm_init_erratum_383(void)
557 erratum_383_found = true; 565 erratum_383_found = true;
558} 566}
559 567
568static void svm_init_osvw(struct kvm_vcpu *vcpu)
569{
570 /*
571 * Guests should see errata 400 and 415 as fixed (assuming that
572 * HLT and IO instructions are intercepted).
573 */
574 vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
575 vcpu->arch.osvw.status = osvw_status & ~(6ULL);
576
577 /*
578 * By increasing VCPU's osvw.length to 3 we are telling the guest that
579 * all osvw.status bits inside that length, including bit 0 (which is
580 * reserved for erratum 298), are valid. However, if host processor's
581 * osvw_len is 0 then osvw_status[0] carries no information. We need to
582 * be conservative here and therefore we tell the guest that erratum 298
583 * is present (because we really don't know).
584 */
585 if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
586 vcpu->arch.osvw.status |= 1;
587}
588
560static int has_svm(void) 589static int has_svm(void)
561{ 590{
562 const char *msg; 591 const char *msg;
@@ -623,6 +652,36 @@ static int svm_hardware_enable(void *garbage)
623 __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; 652 __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
624 } 653 }
625 654
655
656 /*
657 * Get OSVW bits.
658 *
659 * Note that it is possible to have a system with mixed processor
660 * revisions and therefore different OSVW bits. If bits are not the same
661 * on different processors then choose the worst case (i.e. if erratum
662 * is present on one processor and not on another then assume that the
663 * erratum is present everywhere).
664 */
665 if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
666 uint64_t len, status = 0;
667 int err;
668
669 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
670 if (!err)
671 status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
672 &err);
673
674 if (err)
675 osvw_status = osvw_len = 0;
676 else {
677 if (len < osvw_len)
678 osvw_len = len;
679 osvw_status |= status;
680 osvw_status &= (1ULL << osvw_len) - 1;
681 }
682 } else
683 osvw_status = osvw_len = 0;
684
626 svm_init_erratum_383(); 685 svm_init_erratum_383();
627 686
628 amd_pmu_enable_virt(); 687 amd_pmu_enable_virt();
@@ -910,20 +969,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
910 return _tsc; 969 return _tsc;
911} 970}
912 971
913static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) 972static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
914{ 973{
915 struct vcpu_svm *svm = to_svm(vcpu); 974 struct vcpu_svm *svm = to_svm(vcpu);
916 u64 ratio; 975 u64 ratio;
917 u64 khz; 976 u64 khz;
918 977
919 /* TSC scaling supported? */ 978 /* Guest TSC same frequency as host TSC? */
920 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) 979 if (!scale) {
980 svm->tsc_ratio = TSC_RATIO_DEFAULT;
921 return; 981 return;
982 }
922 983
923 /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ 984 /* TSC scaling supported? */
924 if (user_tsc_khz == 0) { 985 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
925 vcpu->arch.virtual_tsc_khz = 0; 986 if (user_tsc_khz > tsc_khz) {
926 svm->tsc_ratio = TSC_RATIO_DEFAULT; 987 vcpu->arch.tsc_catchup = 1;
988 vcpu->arch.tsc_always_catchup = 1;
989 } else
990 WARN(1, "user requested TSC rate below hardware speed\n");
927 return; 991 return;
928 } 992 }
929 993
@@ -938,7 +1002,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
938 user_tsc_khz); 1002 user_tsc_khz);
939 return; 1003 return;
940 } 1004 }
941 vcpu->arch.virtual_tsc_khz = user_tsc_khz;
942 svm->tsc_ratio = ratio; 1005 svm->tsc_ratio = ratio;
943} 1006}
944 1007
@@ -958,10 +1021,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
958 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1021 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
959} 1022}
960 1023
961static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 1024static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
962{ 1025{
963 struct vcpu_svm *svm = to_svm(vcpu); 1026 struct vcpu_svm *svm = to_svm(vcpu);
964 1027
1028 WARN_ON(adjustment < 0);
1029 if (host)
1030 adjustment = svm_scale_tsc(vcpu, adjustment);
1031
965 svm->vmcb->control.tsc_offset += adjustment; 1032 svm->vmcb->control.tsc_offset += adjustment;
966 if (is_guest_mode(vcpu)) 1033 if (is_guest_mode(vcpu))
967 svm->nested.hsave->control.tsc_offset += adjustment; 1034 svm->nested.hsave->control.tsc_offset += adjustment;
@@ -1191,6 +1258,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1191 if (kvm_vcpu_is_bsp(&svm->vcpu)) 1258 if (kvm_vcpu_is_bsp(&svm->vcpu))
1192 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 1259 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1193 1260
1261 svm_init_osvw(&svm->vcpu);
1262
1194 return &svm->vcpu; 1263 return &svm->vcpu;
1195 1264
1196free_page4: 1265free_page4:
@@ -1268,6 +1337,21 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1268 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1337 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1269} 1338}
1270 1339
1340static void svm_update_cpl(struct kvm_vcpu *vcpu)
1341{
1342 struct vcpu_svm *svm = to_svm(vcpu);
1343 int cpl;
1344
1345 if (!is_protmode(vcpu))
1346 cpl = 0;
1347 else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
1348 cpl = 3;
1349 else
1350 cpl = svm->vmcb->save.cs.selector & 0x3;
1351
1352 svm->vmcb->save.cpl = cpl;
1353}
1354
1271static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 1355static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1272{ 1356{
1273 return to_svm(vcpu)->vmcb->save.rflags; 1357 return to_svm(vcpu)->vmcb->save.rflags;
@@ -1275,7 +1359,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1275 1359
1276static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1360static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1277{ 1361{
1362 unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
1363
1278 to_svm(vcpu)->vmcb->save.rflags = rflags; 1364 to_svm(vcpu)->vmcb->save.rflags = rflags;
1365 if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
1366 svm_update_cpl(vcpu);
1279} 1367}
1280 1368
1281static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 1369static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
@@ -1543,9 +1631,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1543 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; 1631 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1544 } 1632 }
1545 if (seg == VCPU_SREG_CS) 1633 if (seg == VCPU_SREG_CS)
1546 svm->vmcb->save.cpl 1634 svm_update_cpl(vcpu);
1547 = (svm->vmcb->save.cs.attrib
1548 >> SVM_SELECTOR_DPL_SHIFT) & 3;
1549 1635
1550 mark_dirty(svm->vmcb, VMCB_SEG); 1636 mark_dirty(svm->vmcb, VMCB_SEG);
1551} 1637}
@@ -2735,7 +2821,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
2735 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) 2821 (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
2736 skip_emulated_instruction(&svm->vcpu); 2822 skip_emulated_instruction(&svm->vcpu);
2737 2823
2738 if (kvm_task_switch(&svm->vcpu, tss_selector, reason, 2824 if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2825 int_vec = -1;
2826
2827 if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
2739 has_error_code, error_code) == EMULATE_FAIL) { 2828 has_error_code, error_code) == EMULATE_FAIL) {
2740 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2829 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2741 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 2830 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 246490f643b6..280751c84724 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -70,9 +70,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
70static bool __read_mostly vmm_exclusive = 1; 70static bool __read_mostly vmm_exclusive = 1;
71module_param(vmm_exclusive, bool, S_IRUGO); 71module_param(vmm_exclusive, bool, S_IRUGO);
72 72
73static bool __read_mostly yield_on_hlt = 1;
74module_param(yield_on_hlt, bool, S_IRUGO);
75
76static bool __read_mostly fasteoi = 1; 73static bool __read_mostly fasteoi = 1;
77module_param(fasteoi, bool, S_IRUGO); 74module_param(fasteoi, bool, S_IRUGO);
78 75
@@ -1655,17 +1652,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1655 vmx_set_interrupt_shadow(vcpu, 0); 1652 vmx_set_interrupt_shadow(vcpu, 0);
1656} 1653}
1657 1654
1658static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1659{
1660 /* Ensure that we clear the HLT state in the VMCS. We don't need to
1661 * explicitly skip the instruction because if the HLT state is set, then
1662 * the instruction is already executing and RIP has already been
1663 * advanced. */
1664 if (!yield_on_hlt &&
1665 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1666 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1667}
1668
1669/* 1655/*
1670 * KVM wants to inject page-faults which it got to the guest. This function 1656 * KVM wants to inject page-faults which it got to the guest. This function
1671 * checks whether in a nested guest, we need to inject them to L1 or L2. 1657 * checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -1678,7 +1664,7 @@ static int nested_pf_handled(struct kvm_vcpu *vcpu)
1678 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1664 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1679 1665
1680 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ 1666 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
1681 if (!(vmcs12->exception_bitmap & PF_VECTOR)) 1667 if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
1682 return 0; 1668 return 0;
1683 1669
1684 nested_vmx_vmexit(vcpu); 1670 nested_vmx_vmexit(vcpu);
@@ -1718,7 +1704,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1718 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1704 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1719 1705
1720 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1706 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1721 vmx_clear_hlt(vcpu);
1722} 1707}
1723 1708
1724static bool vmx_rdtscp_supported(void) 1709static bool vmx_rdtscp_supported(void)
@@ -1817,13 +1802,19 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
1817} 1802}
1818 1803
1819/* 1804/*
1820 * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ 1805 * Engage any workarounds for mis-matched TSC rates. Currently limited to
1821 * ioctl. In this case the call-back should update internal vmx state to make 1806 * software catchup for faster rates on slower CPUs.
1822 * the changes effective.
1823 */ 1807 */
1824static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) 1808static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1825{ 1809{
1826 /* Nothing to do here */ 1810 if (!scale)
1811 return;
1812
1813 if (user_tsc_khz > tsc_khz) {
1814 vcpu->arch.tsc_catchup = 1;
1815 vcpu->arch.tsc_always_catchup = 1;
1816 } else
1817 WARN(1, "user requested TSC rate below hardware speed\n");
1827} 1818}
1828 1819
1829/* 1820/*
@@ -1850,7 +1841,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1850 } 1841 }
1851} 1842}
1852 1843
1853static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 1844static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
1854{ 1845{
1855 u64 offset = vmcs_read64(TSC_OFFSET); 1846 u64 offset = vmcs_read64(TSC_OFFSET);
1856 vmcs_write64(TSC_OFFSET, offset + adjustment); 1847 vmcs_write64(TSC_OFFSET, offset + adjustment);
@@ -2219,6 +2210,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2219 msr = find_msr_entry(vmx, msr_index); 2210 msr = find_msr_entry(vmx, msr_index);
2220 if (msr) { 2211 if (msr) {
2221 msr->data = data; 2212 msr->data = data;
2213 if (msr - vmx->guest_msrs < vmx->save_nmsrs)
2214 kvm_set_shared_msr(msr->index, msr->data,
2215 msr->mask);
2222 break; 2216 break;
2223 } 2217 }
2224 ret = kvm_set_msr_common(vcpu, msr_index, data); 2218 ret = kvm_set_msr_common(vcpu, msr_index, data);
@@ -2399,7 +2393,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2399 &_pin_based_exec_control) < 0) 2393 &_pin_based_exec_control) < 0)
2400 return -EIO; 2394 return -EIO;
2401 2395
2402 min = 2396 min = CPU_BASED_HLT_EXITING |
2403#ifdef CONFIG_X86_64 2397#ifdef CONFIG_X86_64
2404 CPU_BASED_CR8_LOAD_EXITING | 2398 CPU_BASED_CR8_LOAD_EXITING |
2405 CPU_BASED_CR8_STORE_EXITING | 2399 CPU_BASED_CR8_STORE_EXITING |
@@ -2414,9 +2408,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2414 CPU_BASED_INVLPG_EXITING | 2408 CPU_BASED_INVLPG_EXITING |
2415 CPU_BASED_RDPMC_EXITING; 2409 CPU_BASED_RDPMC_EXITING;
2416 2410
2417 if (yield_on_hlt)
2418 min |= CPU_BASED_HLT_EXITING;
2419
2420 opt = CPU_BASED_TPR_SHADOW | 2411 opt = CPU_BASED_TPR_SHADOW |
2421 CPU_BASED_USE_MSR_BITMAPS | 2412 CPU_BASED_USE_MSR_BITMAPS |
2422 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2413 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -4003,7 +3994,6 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
4003 } else 3994 } else
4004 intr |= INTR_TYPE_EXT_INTR; 3995 intr |= INTR_TYPE_EXT_INTR;
4005 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 3996 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4006 vmx_clear_hlt(vcpu);
4007} 3997}
4008 3998
4009static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 3999static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4035,7 +4025,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4035 } 4025 }
4036 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 4026 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4037 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4027 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4038 vmx_clear_hlt(vcpu);
4039} 4028}
4040 4029
4041static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 4030static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -4672,9 +4661,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
4672 bool has_error_code = false; 4661 bool has_error_code = false;
4673 u32 error_code = 0; 4662 u32 error_code = 0;
4674 u16 tss_selector; 4663 u16 tss_selector;
4675 int reason, type, idt_v; 4664 int reason, type, idt_v, idt_index;
4676 4665
4677 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 4666 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
4667 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
4678 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 4668 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
4679 4669
4680 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4670 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4712,8 +4702,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
4712 type != INTR_TYPE_NMI_INTR)) 4702 type != INTR_TYPE_NMI_INTR))
4713 skip_emulated_instruction(vcpu); 4703 skip_emulated_instruction(vcpu);
4714 4704
4715 if (kvm_task_switch(vcpu, tss_selector, reason, 4705 if (kvm_task_switch(vcpu, tss_selector,
4716 has_error_code, error_code) == EMULATE_FAIL) { 4706 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
4707 has_error_code, error_code) == EMULATE_FAIL) {
4717 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4708 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4718 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 4709 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4719 vcpu->run->internal.ndata = 0; 4710 vcpu->run->internal.ndata = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54696b5f8443..4044ce0bf7c1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -97,6 +97,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
97u32 kvm_max_guest_tsc_khz; 97u32 kvm_max_guest_tsc_khz;
98EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); 98EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
99 99
100/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
101static u32 tsc_tolerance_ppm = 250;
102module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
103
100#define KVM_NR_SHARED_MSRS 16 104#define KVM_NR_SHARED_MSRS 16
101 105
102struct kvm_shared_msrs_global { 106struct kvm_shared_msrs_global {
@@ -969,50 +973,51 @@ static inline u64 get_kernel_ns(void)
969static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 973static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
970unsigned long max_tsc_khz; 974unsigned long max_tsc_khz;
971 975
972static inline int kvm_tsc_changes_freq(void) 976static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
973{ 977{
974 int cpu = get_cpu(); 978 return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
975 int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && 979 vcpu->arch.virtual_tsc_shift);
976 cpufreq_quick_get(cpu) != 0;
977 put_cpu();
978 return ret;
979} 980}
980 981
981u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) 982static u32 adjust_tsc_khz(u32 khz, s32 ppm)
982{ 983{
983 if (vcpu->arch.virtual_tsc_khz) 984 u64 v = (u64)khz * (1000000 + ppm);
984 return vcpu->arch.virtual_tsc_khz; 985 do_div(v, 1000000);
985 else 986 return v;
986 return __this_cpu_read(cpu_tsc_khz);
987} 987}
988 988
989static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) 989static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
990{ 990{
991 u64 ret; 991 u32 thresh_lo, thresh_hi;
992 992 int use_scaling = 0;
993 WARN_ON(preemptible());
994 if (kvm_tsc_changes_freq())
995 printk_once(KERN_WARNING
996 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
997 ret = nsec * vcpu_tsc_khz(vcpu);
998 do_div(ret, USEC_PER_SEC);
999 return ret;
1000}
1001 993
1002static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1003{
1004 /* Compute a scale to convert nanoseconds in TSC cycles */ 994 /* Compute a scale to convert nanoseconds in TSC cycles */
1005 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, 995 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1006 &vcpu->arch.tsc_catchup_shift, 996 &vcpu->arch.virtual_tsc_shift,
1007 &vcpu->arch.tsc_catchup_mult); 997 &vcpu->arch.virtual_tsc_mult);
998 vcpu->arch.virtual_tsc_khz = this_tsc_khz;
999
1000 /*
1001 * Compute the variation in TSC rate which is acceptable
1002 * within the range of tolerance and decide if the
1003 * rate being applied is within that bounds of the hardware
1004 * rate being applied is within the bounds of the hardware
1005 */
1006 thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1007 thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
1008 if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
1009 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
1010 use_scaling = 1;
1011 }
1012 kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
1008} 1013}
1009 1014
1010static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) 1015static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1011{ 1016{
1012 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, 1017 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1013 vcpu->arch.tsc_catchup_mult, 1018 vcpu->arch.virtual_tsc_mult,
1014 vcpu->arch.tsc_catchup_shift); 1019 vcpu->arch.virtual_tsc_shift);
1015 tsc += vcpu->arch.last_tsc_write; 1020 tsc += vcpu->arch.this_tsc_write;
1016 return tsc; 1021 return tsc;
1017} 1022}
1018 1023
@@ -1021,48 +1026,88 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1021 struct kvm *kvm = vcpu->kvm; 1026 struct kvm *kvm = vcpu->kvm;
1022 u64 offset, ns, elapsed; 1027 u64 offset, ns, elapsed;
1023 unsigned long flags; 1028 unsigned long flags;
1024 s64 sdiff; 1029 s64 usdiff;
1025 1030
1026 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1031 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1027 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1032 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1028 ns = get_kernel_ns(); 1033 ns = get_kernel_ns();
1029 elapsed = ns - kvm->arch.last_tsc_nsec; 1034 elapsed = ns - kvm->arch.last_tsc_nsec;
1030 sdiff = data - kvm->arch.last_tsc_write; 1035
1031 if (sdiff < 0) 1036 /* n.b - signed multiplication and division required */
1032 sdiff = -sdiff; 1037 usdiff = data - kvm->arch.last_tsc_write;
1038#ifdef CONFIG_X86_64
1039 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
1040#else
1041 /* do_div() only does unsigned */
1042 asm("idivl %2; xor %%edx, %%edx"
1043 : "=A"(usdiff)
1044 : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
1045#endif
1046 do_div(elapsed, 1000);
1047 usdiff -= elapsed;
1048 if (usdiff < 0)
1049 usdiff = -usdiff;
1033 1050
1034 /* 1051 /*
1035 * Special case: close write to TSC within 5 seconds of 1052 * Special case: TSC write with a small delta (1 second) of virtual
1036 * another CPU is interpreted as an attempt to synchronize 1053 * cycle time against real time is interpreted as an attempt to
1037 * The 5 seconds is to accommodate host load / swapping as 1054 * synchronize the CPU.
1038 * well as any reset of TSC during the boot process. 1055 *
1039 * 1056 * For a reliable TSC, we can match TSC offsets, and for an unstable
1040 * In that case, for a reliable TSC, we can match TSC offsets, 1057 * TSC, we add elapsed time in this computation. We could let the
1041 * or make a best guest using elapsed value. 1058 * compensation code attempt to catch up if we fall behind, but
1042 */ 1059 * it's better to try to match offsets from the beginning.
1043 if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && 1060 */
1044 elapsed < 5ULL * NSEC_PER_SEC) { 1061 if (usdiff < USEC_PER_SEC &&
1062 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1045 if (!check_tsc_unstable()) { 1063 if (!check_tsc_unstable()) {
1046 offset = kvm->arch.last_tsc_offset; 1064 offset = kvm->arch.cur_tsc_offset;
1047 pr_debug("kvm: matched tsc offset for %llu\n", data); 1065 pr_debug("kvm: matched tsc offset for %llu\n", data);
1048 } else { 1066 } else {
1049 u64 delta = nsec_to_cycles(vcpu, elapsed); 1067 u64 delta = nsec_to_cycles(vcpu, elapsed);
1050 offset += delta; 1068 data += delta;
1069 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1051 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1070 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1052 } 1071 }
1053 ns = kvm->arch.last_tsc_nsec; 1072 } else {
1073 /*
1074 * We split periods of matched TSC writes into generations.
1075 * For each generation, we track the original measured
1076 * nanosecond time, offset, and write, so if TSCs are in
1077 * sync, we can match exact offset, and if not, we can match
1078 * exact software computation in compute_guest_tsc()
1079 *
1080 * These values are tracked in kvm->arch.cur_xxx variables.
1081 */
1082 kvm->arch.cur_tsc_generation++;
1083 kvm->arch.cur_tsc_nsec = ns;
1084 kvm->arch.cur_tsc_write = data;
1085 kvm->arch.cur_tsc_offset = offset;
1086 pr_debug("kvm: new tsc generation %u, clock %llu\n",
1087 kvm->arch.cur_tsc_generation, data);
1054 } 1088 }
1089
1090 /*
1091 * We also track the most recent recorded KHZ, write and time to
1092 * allow the matching interval to be extended at each write.
1093 */
1055 kvm->arch.last_tsc_nsec = ns; 1094 kvm->arch.last_tsc_nsec = ns;
1056 kvm->arch.last_tsc_write = data; 1095 kvm->arch.last_tsc_write = data;
1057 kvm->arch.last_tsc_offset = offset; 1096 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1058 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1059 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1060 1097
1061 /* Reset of TSC must disable overshoot protection below */ 1098 /* Reset of TSC must disable overshoot protection below */
1062 vcpu->arch.hv_clock.tsc_timestamp = 0; 1099 vcpu->arch.hv_clock.tsc_timestamp = 0;
1063 vcpu->arch.last_tsc_write = data; 1100 vcpu->arch.last_guest_tsc = data;
1064 vcpu->arch.last_tsc_nsec = ns; 1101
1102 /* Keep track of which generation this VCPU has synchronized to */
1103 vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1104 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1105 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1106
1107 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1108 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1065} 1109}
1110
1066EXPORT_SYMBOL_GPL(kvm_write_tsc); 1111EXPORT_SYMBOL_GPL(kvm_write_tsc);
1067 1112
1068static int kvm_guest_time_update(struct kvm_vcpu *v) 1113static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -1078,7 +1123,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1078 local_irq_save(flags); 1123 local_irq_save(flags);
1079 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); 1124 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
1080 kernel_ns = get_kernel_ns(); 1125 kernel_ns = get_kernel_ns();
1081 this_tsc_khz = vcpu_tsc_khz(v); 1126 this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
1082 if (unlikely(this_tsc_khz == 0)) { 1127 if (unlikely(this_tsc_khz == 0)) {
1083 local_irq_restore(flags); 1128 local_irq_restore(flags);
1084 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 1129 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1098,7 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1098 if (vcpu->tsc_catchup) { 1143 if (vcpu->tsc_catchup) {
1099 u64 tsc = compute_guest_tsc(v, kernel_ns); 1144 u64 tsc = compute_guest_tsc(v, kernel_ns);
1100 if (tsc > tsc_timestamp) { 1145 if (tsc > tsc_timestamp) {
1101 kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); 1146 adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
1102 tsc_timestamp = tsc; 1147 tsc_timestamp = tsc;
1103 } 1148 }
1104 } 1149 }
@@ -1130,7 +1175,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1130 * observed by the guest and ensure the new system time is greater. 1175 * observed by the guest and ensure the new system time is greater.
1131 */ 1176 */
1132 max_kernel_ns = 0; 1177 max_kernel_ns = 0;
1133 if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { 1178 if (vcpu->hv_clock.tsc_timestamp) {
1134 max_kernel_ns = vcpu->last_guest_tsc - 1179 max_kernel_ns = vcpu->last_guest_tsc -
1135 vcpu->hv_clock.tsc_timestamp; 1180 vcpu->hv_clock.tsc_timestamp;
1136 max_kernel_ns = pvclock_scale_delta(max_kernel_ns, 1181 max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
@@ -1504,6 +1549,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1504 case MSR_K7_HWCR: 1549 case MSR_K7_HWCR:
1505 data &= ~(u64)0x40; /* ignore flush filter disable */ 1550 data &= ~(u64)0x40; /* ignore flush filter disable */
1506 data &= ~(u64)0x100; /* ignore ignne emulation enable */ 1551 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1552 data &= ~(u64)0x8; /* ignore TLB cache disable */
1507 if (data != 0) { 1553 if (data != 0) {
1508 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1554 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1509 data); 1555 data);
@@ -1676,6 +1722,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1676 */ 1722 */
1677 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); 1723 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1678 break; 1724 break;
1725 case MSR_AMD64_OSVW_ID_LENGTH:
1726 if (!guest_cpuid_has_osvw(vcpu))
1727 return 1;
1728 vcpu->arch.osvw.length = data;
1729 break;
1730 case MSR_AMD64_OSVW_STATUS:
1731 if (!guest_cpuid_has_osvw(vcpu))
1732 return 1;
1733 vcpu->arch.osvw.status = data;
1734 break;
1679 default: 1735 default:
1680 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1736 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1681 return xen_hvm_config(vcpu, data); 1737 return xen_hvm_config(vcpu, data);
@@ -1960,6 +2016,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1960 */ 2016 */
1961 data = 0xbe702111; 2017 data = 0xbe702111;
1962 break; 2018 break;
2019 case MSR_AMD64_OSVW_ID_LENGTH:
2020 if (!guest_cpuid_has_osvw(vcpu))
2021 return 1;
2022 data = vcpu->arch.osvw.length;
2023 break;
2024 case MSR_AMD64_OSVW_STATUS:
2025 if (!guest_cpuid_has_osvw(vcpu))
2026 return 1;
2027 data = vcpu->arch.osvw.status;
2028 break;
1963 default: 2029 default:
1964 if (kvm_pmu_msr(vcpu, msr)) 2030 if (kvm_pmu_msr(vcpu, msr))
1965 return kvm_pmu_get_msr(vcpu, msr, pdata); 2031 return kvm_pmu_get_msr(vcpu, msr, pdata);
@@ -2080,6 +2146,7 @@ int kvm_dev_ioctl_check_extension(long ext)
2080 case KVM_CAP_XSAVE: 2146 case KVM_CAP_XSAVE:
2081 case KVM_CAP_ASYNC_PF: 2147 case KVM_CAP_ASYNC_PF:
2082 case KVM_CAP_GET_TSC_KHZ: 2148 case KVM_CAP_GET_TSC_KHZ:
2149 case KVM_CAP_PCI_2_3:
2083 r = 1; 2150 r = 1;
2084 break; 2151 break;
2085 case KVM_CAP_COALESCED_MMIO: 2152 case KVM_CAP_COALESCED_MMIO:
@@ -2214,19 +2281,23 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2214 } 2281 }
2215 2282
2216 kvm_x86_ops->vcpu_load(vcpu, cpu); 2283 kvm_x86_ops->vcpu_load(vcpu, cpu);
2217 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2218 /* Make sure TSC doesn't go backwards */
2219 s64 tsc_delta;
2220 u64 tsc;
2221 2284
2222 tsc = kvm_x86_ops->read_l1_tsc(vcpu); 2285 /* Apply any externally detected TSC adjustments (due to suspend) */
2223 tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : 2286 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
2224 tsc - vcpu->arch.last_guest_tsc; 2287 adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
2288 vcpu->arch.tsc_offset_adjustment = 0;
2289 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
2290 }
2225 2291
2292 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2293 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
2294 native_read_tsc() - vcpu->arch.last_host_tsc;
2226 if (tsc_delta < 0) 2295 if (tsc_delta < 0)
2227 mark_tsc_unstable("KVM discovered backwards TSC"); 2296 mark_tsc_unstable("KVM discovered backwards TSC");
2228 if (check_tsc_unstable()) { 2297 if (check_tsc_unstable()) {
2229 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); 2298 u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
2299 vcpu->arch.last_guest_tsc);
2300 kvm_x86_ops->write_tsc_offset(vcpu, offset);
2230 vcpu->arch.tsc_catchup = 1; 2301 vcpu->arch.tsc_catchup = 1;
2231 } 2302 }
2232 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2303 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2243,7 +2314,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2243{ 2314{
2244 kvm_x86_ops->vcpu_put(vcpu); 2315 kvm_x86_ops->vcpu_put(vcpu);
2245 kvm_put_guest_fpu(vcpu); 2316 kvm_put_guest_fpu(vcpu);
2246 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); 2317 vcpu->arch.last_host_tsc = native_read_tsc();
2247} 2318}
2248 2319
2249static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2320static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -2785,26 +2856,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2785 u32 user_tsc_khz; 2856 u32 user_tsc_khz;
2786 2857
2787 r = -EINVAL; 2858 r = -EINVAL;
2788 if (!kvm_has_tsc_control)
2789 break;
2790
2791 user_tsc_khz = (u32)arg; 2859 user_tsc_khz = (u32)arg;
2792 2860
2793 if (user_tsc_khz >= kvm_max_guest_tsc_khz) 2861 if (user_tsc_khz >= kvm_max_guest_tsc_khz)
2794 goto out; 2862 goto out;
2795 2863
2796 kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); 2864 if (user_tsc_khz == 0)
2865 user_tsc_khz = tsc_khz;
2866
2867 kvm_set_tsc_khz(vcpu, user_tsc_khz);
2797 2868
2798 r = 0; 2869 r = 0;
2799 goto out; 2870 goto out;
2800 } 2871 }
2801 case KVM_GET_TSC_KHZ: { 2872 case KVM_GET_TSC_KHZ: {
2802 r = -EIO; 2873 r = vcpu->arch.virtual_tsc_khz;
2803 if (check_tsc_unstable())
2804 goto out;
2805
2806 r = vcpu_tsc_khz(vcpu);
2807
2808 goto out; 2874 goto out;
2809 } 2875 }
2810 default: 2876 default:
@@ -2815,6 +2881,11 @@ out:
2815 return r; 2881 return r;
2816} 2882}
2817 2883
2884int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
2885{
2886 return VM_FAULT_SIGBUS;
2887}
2888
2818static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 2889static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
2819{ 2890{
2820 int ret; 2891 int ret;
@@ -2998,6 +3069,8 @@ static void write_protect_slot(struct kvm *kvm,
2998 unsigned long *dirty_bitmap, 3069 unsigned long *dirty_bitmap,
2999 unsigned long nr_dirty_pages) 3070 unsigned long nr_dirty_pages)
3000{ 3071{
3072 spin_lock(&kvm->mmu_lock);
3073
3001 /* Not many dirty pages compared to # of shadow pages. */ 3074 /* Not many dirty pages compared to # of shadow pages. */
3002 if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { 3075 if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
3003 unsigned long gfn_offset; 3076 unsigned long gfn_offset;
@@ -3005,16 +3078,13 @@ static void write_protect_slot(struct kvm *kvm,
3005 for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { 3078 for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
3006 unsigned long gfn = memslot->base_gfn + gfn_offset; 3079 unsigned long gfn = memslot->base_gfn + gfn_offset;
3007 3080
3008 spin_lock(&kvm->mmu_lock);
3009 kvm_mmu_rmap_write_protect(kvm, gfn, memslot); 3081 kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
3010 spin_unlock(&kvm->mmu_lock);
3011 } 3082 }
3012 kvm_flush_remote_tlbs(kvm); 3083 kvm_flush_remote_tlbs(kvm);
3013 } else { 3084 } else
3014 spin_lock(&kvm->mmu_lock);
3015 kvm_mmu_slot_remove_write_access(kvm, memslot->id); 3085 kvm_mmu_slot_remove_write_access(kvm, memslot->id);
3016 spin_unlock(&kvm->mmu_lock); 3086
3017 } 3087 spin_unlock(&kvm->mmu_lock);
3018} 3088}
3019 3089
3020/* 3090/*
@@ -3133,6 +3203,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
3133 r = -EEXIST; 3203 r = -EEXIST;
3134 if (kvm->arch.vpic) 3204 if (kvm->arch.vpic)
3135 goto create_irqchip_unlock; 3205 goto create_irqchip_unlock;
3206 r = -EINVAL;
3207 if (atomic_read(&kvm->online_vcpus))
3208 goto create_irqchip_unlock;
3136 r = -ENOMEM; 3209 r = -ENOMEM;
3137 vpic = kvm_create_pic(kvm); 3210 vpic = kvm_create_pic(kvm);
3138 if (vpic) { 3211 if (vpic) {
@@ -4063,6 +4136,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4063 return res; 4136 return res;
4064} 4137}
4065 4138
4139static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
4140{
4141 kvm_set_rflags(emul_to_vcpu(ctxt), val);
4142}
4143
4066static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) 4144static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
4067{ 4145{
4068 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); 4146 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4244,6 +4322,7 @@ static struct x86_emulate_ops emulate_ops = {
4244 .set_idt = emulator_set_idt, 4322 .set_idt = emulator_set_idt,
4245 .get_cr = emulator_get_cr, 4323 .get_cr = emulator_get_cr,
4246 .set_cr = emulator_set_cr, 4324 .set_cr = emulator_set_cr,
4325 .set_rflags = emulator_set_rflags,
4247 .cpl = emulator_get_cpl, 4326 .cpl = emulator_get_cpl,
4248 .get_dr = emulator_get_dr, 4327 .get_dr = emulator_get_dr,
4249 .set_dr = emulator_set_dr, 4328 .set_dr = emulator_set_dr,
@@ -5288,6 +5367,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5288 profile_hit(KVM_PROFILING, (void *)rip); 5367 profile_hit(KVM_PROFILING, (void *)rip);
5289 } 5368 }
5290 5369
5370 if (unlikely(vcpu->arch.tsc_always_catchup))
5371 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5291 5372
5292 kvm_lapic_sync_from_vapic(vcpu); 5373 kvm_lapic_sync_from_vapic(vcpu);
5293 5374
@@ -5587,15 +5668,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
5587 return 0; 5668 return 0;
5588} 5669}
5589 5670
5590int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 5671int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
5591 bool has_error_code, u32 error_code) 5672 int reason, bool has_error_code, u32 error_code)
5592{ 5673{
5593 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 5674 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5594 int ret; 5675 int ret;
5595 5676
5596 init_emulate_ctxt(vcpu); 5677 init_emulate_ctxt(vcpu);
5597 5678
5598 ret = emulator_task_switch(ctxt, tss_selector, reason, 5679 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
5599 has_error_code, error_code); 5680 has_error_code, error_code);
5600 5681
5601 if (ret) 5682 if (ret)
@@ -5928,13 +6009,88 @@ int kvm_arch_hardware_enable(void *garbage)
5928 struct kvm *kvm; 6009 struct kvm *kvm;
5929 struct kvm_vcpu *vcpu; 6010 struct kvm_vcpu *vcpu;
5930 int i; 6011 int i;
6012 int ret;
6013 u64 local_tsc;
6014 u64 max_tsc = 0;
6015 bool stable, backwards_tsc = false;
5931 6016
5932 kvm_shared_msr_cpu_online(); 6017 kvm_shared_msr_cpu_online();
5933 list_for_each_entry(kvm, &vm_list, vm_list) 6018 ret = kvm_x86_ops->hardware_enable(garbage);
5934 kvm_for_each_vcpu(i, vcpu, kvm) 6019 if (ret != 0)
5935 if (vcpu->cpu == smp_processor_id()) 6020 return ret;
5936 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 6021
5937 return kvm_x86_ops->hardware_enable(garbage); 6022 local_tsc = native_read_tsc();
6023 stable = !check_tsc_unstable();
6024 list_for_each_entry(kvm, &vm_list, vm_list) {
6025 kvm_for_each_vcpu(i, vcpu, kvm) {
6026 if (!stable && vcpu->cpu == smp_processor_id())
6027 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
6028 if (stable && vcpu->arch.last_host_tsc > local_tsc) {
6029 backwards_tsc = true;
6030 if (vcpu->arch.last_host_tsc > max_tsc)
6031 max_tsc = vcpu->arch.last_host_tsc;
6032 }
6033 }
6034 }
6035
6036 /*
6037 * Sometimes, even reliable TSCs go backwards. This happens on
6038 * platforms that reset TSC during suspend or hibernate actions, but
6039 * maintain synchronization. We must compensate. Fortunately, we can
6040 * detect that condition here, which happens early in CPU bringup,
6041 * before any KVM threads can be running. Unfortunately, we can't
6042 * bring the TSCs fully up to date with real time, as we aren't yet far
6043 * enough into CPU bringup that we know how much real time has actually
6044 * elapsed; our helper function, get_kernel_ns() will be using boot
6045 * variables that haven't been updated yet.
6046 *
6047 * So we simply find the maximum observed TSC above, then record the
6048 * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
6049 * the adjustment will be applied. Note that we accumulate
6050 * adjustments, in case multiple suspend cycles happen before some VCPU
6051 * gets a chance to run again. In the event that no KVM threads get a
6052 * chance to run, we will miss the entire elapsed period, as we'll have
6053 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
6054 * lose cycle time. This isn't too big a deal, since the loss will be
6055 * uniform across all VCPUs (not to mention the scenario is extremely
6056 * unlikely). It is possible that a second hibernate recovery happens
6057 * much faster than a first, causing the observed TSC here to be
6058 * smaller; this would require additional padding adjustment, which is
6059 * why we set last_host_tsc to the local tsc observed here.
6060 *
6061 * N.B. - this code below runs only on platforms with reliable TSC,
6062 * as that is the only way backwards_tsc is set above. Also note
6063 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
6064 * have the same delta_cyc adjustment applied if backwards_tsc
6065 * is detected. Note further, this adjustment is only done once,
6066 * as we reset last_host_tsc on all VCPUs to stop this from being
6067 * called multiple times (one for each physical CPU bringup).
6068 *
6069 * Platforms with unreliable TSCs don't have to deal with this, they
6070 * will be compensated by the logic in vcpu_load, which sets the TSC to
6071 * catchup mode. This will catchup all VCPUs to real time, but cannot
6072 * guarantee that they stay in perfect synchronization.
6073 */
6074 if (backwards_tsc) {
6075 u64 delta_cyc = max_tsc - local_tsc;
6076 list_for_each_entry(kvm, &vm_list, vm_list) {
6077 kvm_for_each_vcpu(i, vcpu, kvm) {
6078 vcpu->arch.tsc_offset_adjustment += delta_cyc;
6079 vcpu->arch.last_host_tsc = local_tsc;
6080 }
6081
6082 /*
6083 * We have to disable TSC offset matching.. if you were
6084 * booting a VM while issuing an S4 host suspend....
6085 * you may have some problem. Solving this issue is
6086 * left as an exercise to the reader.
6087 */
6088 kvm->arch.last_tsc_nsec = 0;
6089 kvm->arch.last_tsc_write = 0;
6090 }
6091
6092 }
6093 return 0;
5938} 6094}
5939 6095
5940void kvm_arch_hardware_disable(void *garbage) 6096void kvm_arch_hardware_disable(void *garbage)
@@ -5958,6 +6114,11 @@ void kvm_arch_check_processor_compat(void *rtn)
5958 kvm_x86_ops->check_processor_compatibility(rtn); 6114 kvm_x86_ops->check_processor_compatibility(rtn);
5959} 6115}
5960 6116
6117bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
6118{
6119 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
6120}
6121
5961int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 6122int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5962{ 6123{
5963 struct page *page; 6124 struct page *page;
@@ -5980,7 +6141,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5980 } 6141 }
5981 vcpu->arch.pio_data = page_address(page); 6142 vcpu->arch.pio_data = page_address(page);
5982 6143
5983 kvm_init_tsc_catchup(vcpu, max_tsc_khz); 6144 kvm_set_tsc_khz(vcpu, max_tsc_khz);
5984 6145
5985 r = kvm_mmu_create(vcpu); 6146 r = kvm_mmu_create(vcpu);
5986 if (r < 0) 6147 if (r < 0)
@@ -6032,8 +6193,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
6032 free_page((unsigned long)vcpu->arch.pio_data); 6193 free_page((unsigned long)vcpu->arch.pio_data);
6033} 6194}
6034 6195
6035int kvm_arch_init_vm(struct kvm *kvm) 6196int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
6036{ 6197{
6198 if (type)
6199 return -EINVAL;
6200
6037 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6201 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
6038 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6202 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
6039 6203
@@ -6093,6 +6257,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
6093 put_page(kvm->arch.ept_identity_pagetable); 6257 put_page(kvm->arch.ept_identity_pagetable);
6094} 6258}
6095 6259
6260void kvm_arch_free_memslot(struct kvm_memory_slot *free,
6261 struct kvm_memory_slot *dont)
6262{
6263 int i;
6264
6265 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6266 if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
6267 vfree(free->arch.lpage_info[i]);
6268 free->arch.lpage_info[i] = NULL;
6269 }
6270 }
6271}
6272
6273int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6274{
6275 int i;
6276
6277 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6278 unsigned long ugfn;
6279 int lpages;
6280 int level = i + 2;
6281
6282 lpages = gfn_to_index(slot->base_gfn + npages - 1,
6283 slot->base_gfn, level) + 1;
6284
6285 slot->arch.lpage_info[i] =
6286 vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
6287 if (!slot->arch.lpage_info[i])
6288 goto out_free;
6289
6290 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
6291 slot->arch.lpage_info[i][0].write_count = 1;
6292 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
6293 slot->arch.lpage_info[i][lpages - 1].write_count = 1;
6294 ugfn = slot->userspace_addr >> PAGE_SHIFT;
6295 /*
6296 * If the gfn and userspace address are not aligned wrt each
6297 * other, or if explicitly asked to, disable large page
6298 * support for this slot
6299 */
6300 if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
6301 !kvm_largepages_enabled()) {
6302 unsigned long j;
6303
6304 for (j = 0; j < lpages; ++j)
6305 slot->arch.lpage_info[i][j].write_count = 1;
6306 }
6307 }
6308
6309 return 0;
6310
6311out_free:
6312 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
6313 vfree(slot->arch.lpage_info[i]);
6314 slot->arch.lpage_info[i] = NULL;
6315 }
6316 return -ENOMEM;
6317}
6318
6096int kvm_arch_prepare_memory_region(struct kvm *kvm, 6319int kvm_arch_prepare_memory_region(struct kvm *kvm,
6097 struct kvm_memory_slot *memslot, 6320 struct kvm_memory_slot *memslot,
6098 struct kvm_memory_slot old, 6321 struct kvm_memory_slot old,
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 4889655ba784..47936830968c 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -115,7 +115,7 @@ static void __save_processor_state(struct saved_context *ctxt)
115void save_processor_state(void) 115void save_processor_state(void)
116{ 116{
117 __save_processor_state(&saved_context); 117 __save_processor_state(&saved_context);
118 save_sched_clock_state(); 118 x86_platform.save_sched_clock_state();
119} 119}
120#ifdef CONFIG_X86_32 120#ifdef CONFIG_X86_32
121EXPORT_SYMBOL(save_processor_state); 121EXPORT_SYMBOL(save_processor_state);
@@ -231,8 +231,8 @@ static void __restore_processor_state(struct saved_context *ctxt)
231/* Needed by apm.c */ 231/* Needed by apm.c */
232void restore_processor_state(void) 232void restore_processor_state(void)
233{ 233{
234 x86_platform.restore_sched_clock_state();
234 __restore_processor_state(&saved_context); 235 __restore_processor_state(&saved_context);
235 restore_sched_clock_state();
236} 236}
237#ifdef CONFIG_X86_32 237#ifdef CONFIG_X86_32
238EXPORT_SYMBOL(restore_processor_state); 238EXPORT_SYMBOL(restore_processor_state);
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 68e67e50d028..6c322a90b92f 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -162,6 +162,7 @@ struct kvm_pit_config {
162#define KVM_EXIT_INTERNAL_ERROR 17 162#define KVM_EXIT_INTERNAL_ERROR 17
163#define KVM_EXIT_OSI 18 163#define KVM_EXIT_OSI 18
164#define KVM_EXIT_PAPR_HCALL 19 164#define KVM_EXIT_PAPR_HCALL 19
165#define KVM_EXIT_S390_UCONTROL 20
165 166
166/* For KVM_EXIT_INTERNAL_ERROR */ 167/* For KVM_EXIT_INTERNAL_ERROR */
167#define KVM_INTERNAL_ERROR_EMULATION 1 168#define KVM_INTERNAL_ERROR_EMULATION 1
@@ -249,6 +250,11 @@ struct kvm_run {
249#define KVM_S390_RESET_CPU_INIT 8 250#define KVM_S390_RESET_CPU_INIT 8
250#define KVM_S390_RESET_IPL 16 251#define KVM_S390_RESET_IPL 16
251 __u64 s390_reset_flags; 252 __u64 s390_reset_flags;
253 /* KVM_EXIT_S390_UCONTROL */
254 struct {
255 __u64 trans_exc_code;
256 __u32 pgm_code;
257 } s390_ucontrol;
252 /* KVM_EXIT_DCR */ 258 /* KVM_EXIT_DCR */
253 struct { 259 struct {
254 __u32 dcrn; 260 __u32 dcrn;
@@ -273,6 +279,20 @@ struct kvm_run {
273 /* Fix the size of the union. */ 279 /* Fix the size of the union. */
274 char padding[256]; 280 char padding[256];
275 }; 281 };
282
283 /*
284 * shared registers between kvm and userspace.
285 * kvm_valid_regs specifies the register classes set by the host
286 * kvm_dirty_regs specifies the register classes dirtied by userspace
287 * struct kvm_sync_regs is architecture specific, as well as the
288 * bits for kvm_valid_regs and kvm_dirty_regs
289 */
290 __u64 kvm_valid_regs;
291 __u64 kvm_dirty_regs;
292 union {
293 struct kvm_sync_regs regs;
294 char padding[1024];
295 } s;
276}; 296};
277 297
278/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */ 298/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
@@ -431,6 +451,11 @@ struct kvm_ppc_pvinfo {
431 451
432#define KVMIO 0xAE 452#define KVMIO 0xAE
433 453
454/* machine type bits, to be used as argument to KVM_CREATE_VM */
455#define KVM_VM_S390_UCONTROL 1
456
457#define KVM_S390_SIE_PAGE_OFFSET 1
458
434/* 459/*
435 * ioctls for /dev/kvm fds: 460 * ioctls for /dev/kvm fds:
436 */ 461 */
@@ -555,9 +580,15 @@ struct kvm_ppc_pvinfo {
555#define KVM_CAP_PPC_SMT 64 580#define KVM_CAP_PPC_SMT 64
556#define KVM_CAP_PPC_RMA 65 581#define KVM_CAP_PPC_RMA 65
557#define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ 582#define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */
583#define KVM_CAP_PPC_HIOR 67
558#define KVM_CAP_PPC_PAPR 68 584#define KVM_CAP_PPC_PAPR 68
585#define KVM_CAP_SW_TLB 69
586#define KVM_CAP_ONE_REG 70
559#define KVM_CAP_S390_GMAP 71 587#define KVM_CAP_S390_GMAP 71
560#define KVM_CAP_TSC_DEADLINE_TIMER 72 588#define KVM_CAP_TSC_DEADLINE_TIMER 72
589#define KVM_CAP_S390_UCONTROL 73
590#define KVM_CAP_SYNC_REGS 74
591#define KVM_CAP_PCI_2_3 75
561 592
562#ifdef KVM_CAP_IRQ_ROUTING 593#ifdef KVM_CAP_IRQ_ROUTING
563 594
@@ -637,6 +668,52 @@ struct kvm_clock_data {
637 __u32 pad[9]; 668 __u32 pad[9];
638}; 669};
639 670
671#define KVM_MMU_FSL_BOOKE_NOHV 0
672#define KVM_MMU_FSL_BOOKE_HV 1
673
674struct kvm_config_tlb {
675 __u64 params;
676 __u64 array;
677 __u32 mmu_type;
678 __u32 array_len;
679};
680
681struct kvm_dirty_tlb {
682 __u64 bitmap;
683 __u32 num_dirty;
684};
685
686/* Available with KVM_CAP_ONE_REG */
687
688#define KVM_REG_ARCH_MASK 0xff00000000000000ULL
689#define KVM_REG_GENERIC 0x0000000000000000ULL
690
691/*
692 * Architecture specific registers are to be defined in arch headers and
693 * ORed with the arch identifier.
694 */
695#define KVM_REG_PPC 0x1000000000000000ULL
696#define KVM_REG_X86 0x2000000000000000ULL
697#define KVM_REG_IA64 0x3000000000000000ULL
698#define KVM_REG_ARM 0x4000000000000000ULL
699#define KVM_REG_S390 0x5000000000000000ULL
700
701#define KVM_REG_SIZE_SHIFT 52
702#define KVM_REG_SIZE_MASK 0x00f0000000000000ULL
703#define KVM_REG_SIZE_U8 0x0000000000000000ULL
704#define KVM_REG_SIZE_U16 0x0010000000000000ULL
705#define KVM_REG_SIZE_U32 0x0020000000000000ULL
706#define KVM_REG_SIZE_U64 0x0030000000000000ULL
707#define KVM_REG_SIZE_U128 0x0040000000000000ULL
708#define KVM_REG_SIZE_U256 0x0050000000000000ULL
709#define KVM_REG_SIZE_U512 0x0060000000000000ULL
710#define KVM_REG_SIZE_U1024 0x0070000000000000ULL
711
712struct kvm_one_reg {
713 __u64 id;
714 __u64 addr;
715};
716
640/* 717/*
641 * ioctls for VM fds 718 * ioctls for VM fds
642 */ 719 */
@@ -655,6 +732,17 @@ struct kvm_clock_data {
655 struct kvm_userspace_memory_region) 732 struct kvm_userspace_memory_region)
656#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) 733#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
657#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) 734#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64)
735
736/* enable ucontrol for s390 */
737struct kvm_s390_ucas_mapping {
738 __u64 user_addr;
739 __u64 vcpu_addr;
740 __u64 length;
741};
742#define KVM_S390_UCAS_MAP _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping)
743#define KVM_S390_UCAS_UNMAP _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping)
744#define KVM_S390_VCPU_FAULT _IOW(KVMIO, 0x52, unsigned long)
745
658/* Device model IOC */ 746/* Device model IOC */
659#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) 747#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60)
660#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) 748#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level)
@@ -697,6 +785,9 @@ struct kvm_clock_data {
697/* Available with KVM_CAP_TSC_CONTROL */ 785/* Available with KVM_CAP_TSC_CONTROL */
698#define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2) 786#define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2)
699#define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3) 787#define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3)
788/* Available with KVM_CAP_PCI_2_3 */
789#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \
790 struct kvm_assigned_pci_dev)
700 791
701/* 792/*
702 * ioctls for vcpu fds 793 * ioctls for vcpu fds
@@ -763,8 +854,15 @@ struct kvm_clock_data {
763#define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) 854#define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce)
764/* Available with KVM_CAP_RMA */ 855/* Available with KVM_CAP_RMA */
765#define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) 856#define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
857/* Available with KVM_CAP_SW_TLB */
858#define KVM_DIRTY_TLB _IOW(KVMIO, 0xaa, struct kvm_dirty_tlb)
859/* Available with KVM_CAP_ONE_REG */
860#define KVM_GET_ONE_REG _IOW(KVMIO, 0xab, struct kvm_one_reg)
861#define KVM_SET_ONE_REG _IOW(KVMIO, 0xac, struct kvm_one_reg)
766 862
767#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) 863#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
864#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
865#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
768 866
769struct kvm_assigned_pci_dev { 867struct kvm_assigned_pci_dev {
770 __u32 assigned_dev_id; 868 __u32 assigned_dev_id;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ca1b153585d3..665a260c7e09 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -172,11 +172,6 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
172 */ 172 */
173#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) 173#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
174 174
175struct kvm_lpage_info {
176 unsigned long rmap_pde;
177 int write_count;
178};
179
180struct kvm_memory_slot { 175struct kvm_memory_slot {
181 gfn_t base_gfn; 176 gfn_t base_gfn;
182 unsigned long npages; 177 unsigned long npages;
@@ -185,7 +180,7 @@ struct kvm_memory_slot {
185 unsigned long *dirty_bitmap; 180 unsigned long *dirty_bitmap;
186 unsigned long *dirty_bitmap_head; 181 unsigned long *dirty_bitmap_head;
187 unsigned long nr_dirty_pages; 182 unsigned long nr_dirty_pages;
188 struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; 183 struct kvm_arch_memory_slot arch;
189 unsigned long userspace_addr; 184 unsigned long userspace_addr;
190 int user_alloc; 185 int user_alloc;
191 int id; 186 int id;
@@ -377,6 +372,9 @@ int kvm_set_memory_region(struct kvm *kvm,
377int __kvm_set_memory_region(struct kvm *kvm, 372int __kvm_set_memory_region(struct kvm *kvm,
378 struct kvm_userspace_memory_region *mem, 373 struct kvm_userspace_memory_region *mem,
379 int user_alloc); 374 int user_alloc);
375void kvm_arch_free_memslot(struct kvm_memory_slot *free,
376 struct kvm_memory_slot *dont);
377int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
380int kvm_arch_prepare_memory_region(struct kvm *kvm, 378int kvm_arch_prepare_memory_region(struct kvm *kvm,
381 struct kvm_memory_slot *memslot, 379 struct kvm_memory_slot *memslot,
382 struct kvm_memory_slot old, 380 struct kvm_memory_slot old,
@@ -386,6 +384,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
386 struct kvm_userspace_memory_region *mem, 384 struct kvm_userspace_memory_region *mem,
387 struct kvm_memory_slot old, 385 struct kvm_memory_slot old,
388 int user_alloc); 386 int user_alloc);
387bool kvm_largepages_enabled(void);
389void kvm_disable_largepages(void); 388void kvm_disable_largepages(void);
390void kvm_arch_flush_shadow(struct kvm *kvm); 389void kvm_arch_flush_shadow(struct kvm *kvm);
391 390
@@ -451,6 +450,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
451 unsigned int ioctl, unsigned long arg); 450 unsigned int ioctl, unsigned long arg);
452long kvm_arch_vcpu_ioctl(struct file *filp, 451long kvm_arch_vcpu_ioctl(struct file *filp,
453 unsigned int ioctl, unsigned long arg); 452 unsigned int ioctl, unsigned long arg);
453int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf);
454 454
455int kvm_dev_ioctl_check_extension(long ext); 455int kvm_dev_ioctl_check_extension(long ext);
456 456
@@ -521,7 +521,7 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
521} 521}
522#endif 522#endif
523 523
524int kvm_arch_init_vm(struct kvm *kvm); 524int kvm_arch_init_vm(struct kvm *kvm, unsigned long type);
525void kvm_arch_destroy_vm(struct kvm *kvm); 525void kvm_arch_destroy_vm(struct kvm *kvm);
526void kvm_free_all_assigned_devices(struct kvm *kvm); 526void kvm_free_all_assigned_devices(struct kvm *kvm);
527void kvm_arch_sync_events(struct kvm *kvm); 527void kvm_arch_sync_events(struct kvm *kvm);
@@ -547,6 +547,7 @@ struct kvm_assigned_dev_kernel {
547 unsigned int entries_nr; 547 unsigned int entries_nr;
548 int host_irq; 548 int host_irq;
549 bool host_irq_disabled; 549 bool host_irq_disabled;
550 bool pci_2_3;
550 struct msix_entry *host_msix_entries; 551 struct msix_entry *host_msix_entries;
551 int guest_irq; 552 int guest_irq;
552 struct msix_entry *guest_msix_entries; 553 struct msix_entry *guest_msix_entries;
@@ -556,6 +557,7 @@ struct kvm_assigned_dev_kernel {
556 struct pci_dev *dev; 557 struct pci_dev *dev;
557 struct kvm *kvm; 558 struct kvm *kvm;
558 spinlock_t intx_lock; 559 spinlock_t intx_lock;
560 spinlock_t intx_mask_lock;
559 char irq_name[32]; 561 char irq_name[32];
560 struct pci_saved_state *pci_saved_state; 562 struct pci_saved_state *pci_saved_state;
561}; 563};
@@ -651,11 +653,43 @@ static inline void kvm_guest_exit(void)
651 current->flags &= ~PF_VCPU; 653 current->flags &= ~PF_VCPU;
652} 654}
653 655
656/*
657 * search_memslots() and __gfn_to_memslot() are here because they are
658 * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
659 * gfn_to_memslot() itself isn't here as an inline because that would
660 * bloat other code too much.
661 */
662static inline struct kvm_memory_slot *
663search_memslots(struct kvm_memslots *slots, gfn_t gfn)
664{
665 struct kvm_memory_slot *memslot;
666
667 kvm_for_each_memslot(memslot, slots)
668 if (gfn >= memslot->base_gfn &&
669 gfn < memslot->base_gfn + memslot->npages)
670 return memslot;
671
672 return NULL;
673}
674
675static inline struct kvm_memory_slot *
676__gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
677{
678 return search_memslots(slots, gfn);
679}
680
654static inline int memslot_id(struct kvm *kvm, gfn_t gfn) 681static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
655{ 682{
656 return gfn_to_memslot(kvm, gfn)->id; 683 return gfn_to_memslot(kvm, gfn)->id;
657} 684}
658 685
686static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
687{
688 /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
689 return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
690 (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
691}
692
659static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, 693static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
660 gfn_t gfn) 694 gfn_t gfn)
661{ 695{
@@ -702,12 +736,16 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se
702 if (unlikely(vcpu->kvm->mmu_notifier_count)) 736 if (unlikely(vcpu->kvm->mmu_notifier_count))
703 return 1; 737 return 1;
704 /* 738 /*
705 * Both reads happen under the mmu_lock and both values are 739 * Ensure the read of mmu_notifier_count happens before the read
706 * modified under mmu_lock, so there's no need of smb_rmb() 740 * of mmu_notifier_seq. This interacts with the smp_wmb() in
707 * here in between, otherwise mmu_notifier_count should be 741 * mmu_notifier_invalidate_range_end to make sure that the caller
708 * read before mmu_notifier_seq, see 742 * either sees the old (non-zero) value of mmu_notifier_count or
709 * mmu_notifier_invalidate_range_end write side. 743 * the new (incremented) value of mmu_notifier_seq.
744 * PowerPC Book3s HV KVM calls this under a per-page lock
745 * rather than under kvm->mmu_lock, for scalability, so
746 * can't rely on kvm->mmu_lock to keep things ordered.
710 */ 747 */
748 smp_rmb();
711 if (vcpu->kvm->mmu_notifier_seq != mmu_seq) 749 if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
712 return 1; 750 return 1;
713 return 0; 751 return 0;
@@ -770,6 +808,13 @@ static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
770{ 808{
771 return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; 809 return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
772} 810}
811
812bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu);
813
814#else
815
816static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; }
817
773#endif 818#endif
774 819
775#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT 820#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 758e3b36d4cf..01f572c10c71 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -49,31 +49,73 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
49 index = i; 49 index = i;
50 break; 50 break;
51 } 51 }
52 if (index < 0) { 52 if (index < 0)
53 printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); 53 printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
54 return 0;
55 }
56 54
57 return index; 55 return index;
58} 56}
59 57
60static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) 58static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
61{ 59{
62 struct kvm_assigned_dev_kernel *assigned_dev = dev_id; 60 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
61 int ret;
62
63 spin_lock(&assigned_dev->intx_lock);
64 if (pci_check_and_mask_intx(assigned_dev->dev)) {
65 assigned_dev->host_irq_disabled = true;
66 ret = IRQ_WAKE_THREAD;
67 } else
68 ret = IRQ_NONE;
69 spin_unlock(&assigned_dev->intx_lock);
70
71 return ret;
72}
63 73
64 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { 74static void
65 spin_lock(&assigned_dev->intx_lock); 75kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
76 int vector)
77{
78 if (unlikely(assigned_dev->irq_requested_type &
79 KVM_DEV_IRQ_GUEST_INTX)) {
80 spin_lock(&assigned_dev->intx_mask_lock);
81 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
82 kvm_set_irq(assigned_dev->kvm,
83 assigned_dev->irq_source_id, vector, 1);
84 spin_unlock(&assigned_dev->intx_mask_lock);
85 } else
86 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
87 vector, 1);
88}
89
90static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
91{
92 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
93
94 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
95 spin_lock_irq(&assigned_dev->intx_lock);
66 disable_irq_nosync(irq); 96 disable_irq_nosync(irq);
67 assigned_dev->host_irq_disabled = true; 97 assigned_dev->host_irq_disabled = true;
68 spin_unlock(&assigned_dev->intx_lock); 98 spin_unlock_irq(&assigned_dev->intx_lock);
69 } 99 }
70 100
71 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 101 kvm_assigned_dev_raise_guest_irq(assigned_dev,
72 assigned_dev->guest_irq, 1); 102 assigned_dev->guest_irq);
73 103
74 return IRQ_HANDLED; 104 return IRQ_HANDLED;
75} 105}
76 106
107#ifdef __KVM_HAVE_MSI
108static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
109{
110 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
111
112 kvm_assigned_dev_raise_guest_irq(assigned_dev,
113 assigned_dev->guest_irq);
114
115 return IRQ_HANDLED;
116}
117#endif
118
77#ifdef __KVM_HAVE_MSIX 119#ifdef __KVM_HAVE_MSIX
78static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) 120static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
79{ 121{
@@ -83,8 +125,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
83 125
84 if (index >= 0) { 126 if (index >= 0) {
85 vector = assigned_dev->guest_msix_entries[index].vector; 127 vector = assigned_dev->guest_msix_entries[index].vector;
86 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 128 kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
87 vector, 1);
88 } 129 }
89 130
90 return IRQ_HANDLED; 131 return IRQ_HANDLED;
@@ -100,15 +141,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
100 141
101 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); 142 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
102 143
103 /* The guest irq may be shared so this ack may be 144 spin_lock(&dev->intx_mask_lock);
104 * from another device. 145
105 */ 146 if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
106 spin_lock(&dev->intx_lock); 147 bool reassert = false;
107 if (dev->host_irq_disabled) { 148
108 enable_irq(dev->host_irq); 149 spin_lock_irq(&dev->intx_lock);
109 dev->host_irq_disabled = false; 150 /*
151 * The guest IRQ may be shared so this ack can come from an
152 * IRQ for another guest device.
153 */
154 if (dev->host_irq_disabled) {
155 if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
156 enable_irq(dev->host_irq);
157 else if (!pci_check_and_unmask_intx(dev->dev))
158 reassert = true;
159 dev->host_irq_disabled = reassert;
160 }
161 spin_unlock_irq(&dev->intx_lock);
162
163 if (reassert)
164 kvm_set_irq(dev->kvm, dev->irq_source_id,
165 dev->guest_irq, 1);
110 } 166 }
111 spin_unlock(&dev->intx_lock); 167
168 spin_unlock(&dev->intx_mask_lock);
112} 169}
113 170
114static void deassign_guest_irq(struct kvm *kvm, 171static void deassign_guest_irq(struct kvm *kvm,
@@ -156,7 +213,15 @@ static void deassign_host_irq(struct kvm *kvm,
156 pci_disable_msix(assigned_dev->dev); 213 pci_disable_msix(assigned_dev->dev);
157 } else { 214 } else {
158 /* Deal with MSI and INTx */ 215 /* Deal with MSI and INTx */
159 disable_irq(assigned_dev->host_irq); 216 if ((assigned_dev->irq_requested_type &
217 KVM_DEV_IRQ_HOST_INTX) &&
218 (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
219 spin_lock_irq(&assigned_dev->intx_lock);
220 pci_intx(assigned_dev->dev, false);
221 spin_unlock_irq(&assigned_dev->intx_lock);
222 synchronize_irq(assigned_dev->host_irq);
223 } else
224 disable_irq(assigned_dev->host_irq);
160 225
161 free_irq(assigned_dev->host_irq, assigned_dev); 226 free_irq(assigned_dev->host_irq, assigned_dev);
162 227
@@ -237,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
237static int assigned_device_enable_host_intx(struct kvm *kvm, 302static int assigned_device_enable_host_intx(struct kvm *kvm,
238 struct kvm_assigned_dev_kernel *dev) 303 struct kvm_assigned_dev_kernel *dev)
239{ 304{
305 irq_handler_t irq_handler;
306 unsigned long flags;
307
240 dev->host_irq = dev->dev->irq; 308 dev->host_irq = dev->dev->irq;
241 /* Even though this is PCI, we don't want to use shared 309
242 * interrupts. Sharing host devices with guest-assigned devices 310 /*
243 * on the same interrupt line is not a happy situation: there 311 * We can only share the IRQ line with other host devices if we are
244 * are going to be long delays in accepting, acking, etc. 312 * able to disable the IRQ source at device-level - independently of
313 * the guest driver. Otherwise host devices may suffer from unbounded
314 * IRQ latencies when the guest keeps the line asserted.
245 */ 315 */
246 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, 316 if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
247 IRQF_ONESHOT, dev->irq_name, dev)) 317 irq_handler = kvm_assigned_dev_intx;
318 flags = IRQF_SHARED;
319 } else {
320 irq_handler = NULL;
321 flags = IRQF_ONESHOT;
322 }
323 if (request_threaded_irq(dev->host_irq, irq_handler,
324 kvm_assigned_dev_thread_intx, flags,
325 dev->irq_name, dev))
248 return -EIO; 326 return -EIO;
327
328 if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
329 spin_lock_irq(&dev->intx_lock);
330 pci_intx(dev->dev, true);
331 spin_unlock_irq(&dev->intx_lock);
332 }
249 return 0; 333 return 0;
250} 334}
251 335
@@ -262,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
262 } 346 }
263 347
264 dev->host_irq = dev->dev->irq; 348 dev->host_irq = dev->dev->irq;
265 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, 349 if (request_threaded_irq(dev->host_irq, NULL,
266 0, dev->irq_name, dev)) { 350 kvm_assigned_dev_thread_msi, 0,
351 dev->irq_name, dev)) {
267 pci_disable_msi(dev->dev); 352 pci_disable_msi(dev->dev);
268 return -EIO; 353 return -EIO;
269 } 354 }
@@ -321,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
321{ 406{
322 dev->guest_irq = irq->guest_irq; 407 dev->guest_irq = irq->guest_irq;
323 dev->ack_notifier.gsi = -1; 408 dev->ack_notifier.gsi = -1;
324 dev->host_irq_disabled = false;
325 return 0; 409 return 0;
326} 410}
327#endif 411#endif
@@ -333,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
333{ 417{
334 dev->guest_irq = irq->guest_irq; 418 dev->guest_irq = irq->guest_irq;
335 dev->ack_notifier.gsi = -1; 419 dev->ack_notifier.gsi = -1;
336 dev->host_irq_disabled = false;
337 return 0; 420 return 0;
338} 421}
339#endif 422#endif
@@ -367,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm,
367 default: 450 default:
368 r = -EINVAL; 451 r = -EINVAL;
369 } 452 }
453 dev->host_irq_disabled = false;
370 454
371 if (!r) 455 if (!r)
372 dev->irq_requested_type |= host_irq_type; 456 dev->irq_requested_type |= host_irq_type;
@@ -468,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
468{ 552{
469 int r = -ENODEV; 553 int r = -ENODEV;
470 struct kvm_assigned_dev_kernel *match; 554 struct kvm_assigned_dev_kernel *match;
555 unsigned long irq_type;
471 556
472 mutex_lock(&kvm->lock); 557 mutex_lock(&kvm->lock);
473 558
@@ -476,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
476 if (!match) 561 if (!match)
477 goto out; 562 goto out;
478 563
479 r = kvm_deassign_irq(kvm, match, assigned_irq->flags); 564 irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
565 KVM_DEV_IRQ_GUEST_MASK);
566 r = kvm_deassign_irq(kvm, match, irq_type);
480out: 567out:
481 mutex_unlock(&kvm->lock); 568 mutex_unlock(&kvm->lock);
482 return r; 569 return r;
@@ -609,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
609 if (!match->pci_saved_state) 696 if (!match->pci_saved_state)
610 printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", 697 printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
611 __func__, dev_name(&dev->dev)); 698 __func__, dev_name(&dev->dev));
699
700 if (!pci_intx_mask_supported(dev))
701 assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
702
612 match->assigned_dev_id = assigned_dev->assigned_dev_id; 703 match->assigned_dev_id = assigned_dev->assigned_dev_id;
613 match->host_segnr = assigned_dev->segnr; 704 match->host_segnr = assigned_dev->segnr;
614 match->host_busnr = assigned_dev->busnr; 705 match->host_busnr = assigned_dev->busnr;
@@ -616,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
616 match->flags = assigned_dev->flags; 707 match->flags = assigned_dev->flags;
617 match->dev = dev; 708 match->dev = dev;
618 spin_lock_init(&match->intx_lock); 709 spin_lock_init(&match->intx_lock);
710 spin_lock_init(&match->intx_mask_lock);
619 match->irq_source_id = -1; 711 match->irq_source_id = -1;
620 match->kvm = kvm; 712 match->kvm = kvm;
621 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; 713 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
@@ -761,6 +853,55 @@ msix_entry_out:
761} 853}
762#endif 854#endif
763 855
856static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
857 struct kvm_assigned_pci_dev *assigned_dev)
858{
859 int r = 0;
860 struct kvm_assigned_dev_kernel *match;
861
862 mutex_lock(&kvm->lock);
863
864 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
865 assigned_dev->assigned_dev_id);
866 if (!match) {
867 r = -ENODEV;
868 goto out;
869 }
870
871 spin_lock(&match->intx_mask_lock);
872
873 match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
874 match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
875
876 if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
877 if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
878 kvm_set_irq(match->kvm, match->irq_source_id,
879 match->guest_irq, 0);
880 /*
881 * Masking at hardware-level is performed on demand,
882 * i.e. when an IRQ actually arrives at the host.
883 */
884 } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
885 /*
886 * Unmask the IRQ line if required. Unmasking at
887 * device level will be performed by user space.
888 */
889 spin_lock_irq(&match->intx_lock);
890 if (match->host_irq_disabled) {
891 enable_irq(match->host_irq);
892 match->host_irq_disabled = false;
893 }
894 spin_unlock_irq(&match->intx_lock);
895 }
896 }
897
898 spin_unlock(&match->intx_mask_lock);
899
900out:
901 mutex_unlock(&kvm->lock);
902 return r;
903}
904
764long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, 905long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
765 unsigned long arg) 906 unsigned long arg)
766{ 907{
@@ -868,6 +1009,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
868 break; 1009 break;
869 } 1010 }
870#endif 1011#endif
1012 case KVM_ASSIGN_SET_INTX_MASK: {
1013 struct kvm_assigned_pci_dev assigned_dev;
1014
1015 r = -EFAULT;
1016 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
1017 goto out;
1018 r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
1019 break;
1020 }
871 default: 1021 default:
872 r = -ENOTTY; 1022 r = -ENOTTY;
873 break; 1023 break;
@@ -875,4 +1025,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
875out: 1025out:
876 return r; 1026 return r;
877} 1027}
878
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a91f980077d8..42b73930a6de 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -203,7 +203,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
203 203
204void kvm_flush_remote_tlbs(struct kvm *kvm) 204void kvm_flush_remote_tlbs(struct kvm *kvm)
205{ 205{
206 int dirty_count = kvm->tlbs_dirty; 206 long dirty_count = kvm->tlbs_dirty;
207 207
208 smp_mb(); 208 smp_mb();
209 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 209 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
@@ -289,15 +289,15 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
289 */ 289 */
290 idx = srcu_read_lock(&kvm->srcu); 290 idx = srcu_read_lock(&kvm->srcu);
291 spin_lock(&kvm->mmu_lock); 291 spin_lock(&kvm->mmu_lock);
292
292 kvm->mmu_notifier_seq++; 293 kvm->mmu_notifier_seq++;
293 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; 294 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
294 spin_unlock(&kvm->mmu_lock);
295 srcu_read_unlock(&kvm->srcu, idx);
296
297 /* we've to flush the tlb before the pages can be freed */ 295 /* we've to flush the tlb before the pages can be freed */
298 if (need_tlb_flush) 296 if (need_tlb_flush)
299 kvm_flush_remote_tlbs(kvm); 297 kvm_flush_remote_tlbs(kvm);
300 298
299 spin_unlock(&kvm->mmu_lock);
300 srcu_read_unlock(&kvm->srcu, idx);
301} 301}
302 302
303static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 303static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
@@ -335,12 +335,12 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
335 for (; start < end; start += PAGE_SIZE) 335 for (; start < end; start += PAGE_SIZE)
336 need_tlb_flush |= kvm_unmap_hva(kvm, start); 336 need_tlb_flush |= kvm_unmap_hva(kvm, start);
337 need_tlb_flush |= kvm->tlbs_dirty; 337 need_tlb_flush |= kvm->tlbs_dirty;
338 spin_unlock(&kvm->mmu_lock);
339 srcu_read_unlock(&kvm->srcu, idx);
340
341 /* we've to flush the tlb before the pages can be freed */ 338 /* we've to flush the tlb before the pages can be freed */
342 if (need_tlb_flush) 339 if (need_tlb_flush)
343 kvm_flush_remote_tlbs(kvm); 340 kvm_flush_remote_tlbs(kvm);
341
342 spin_unlock(&kvm->mmu_lock);
343 srcu_read_unlock(&kvm->srcu, idx);
344} 344}
345 345
346static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 346static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -357,11 +357,11 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
357 * been freed. 357 * been freed.
358 */ 358 */
359 kvm->mmu_notifier_seq++; 359 kvm->mmu_notifier_seq++;
360 smp_wmb();
360 /* 361 /*
361 * The above sequence increase must be visible before the 362 * The above sequence increase must be visible before the
362 * below count decrease but both values are read by the kvm 363 * below count decrease, which is ensured by the smp_wmb above
363 * page fault under mmu_lock spinlock so we don't need to add 364 * in conjunction with the smp_rmb in mmu_notifier_retry().
364 * a smb_wmb() here in between the two.
365 */ 365 */
366 kvm->mmu_notifier_count--; 366 kvm->mmu_notifier_count--;
367 spin_unlock(&kvm->mmu_lock); 367 spin_unlock(&kvm->mmu_lock);
@@ -378,13 +378,14 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
378 378
379 idx = srcu_read_lock(&kvm->srcu); 379 idx = srcu_read_lock(&kvm->srcu);
380 spin_lock(&kvm->mmu_lock); 380 spin_lock(&kvm->mmu_lock);
381 young = kvm_age_hva(kvm, address);
382 spin_unlock(&kvm->mmu_lock);
383 srcu_read_unlock(&kvm->srcu, idx);
384 381
382 young = kvm_age_hva(kvm, address);
385 if (young) 383 if (young)
386 kvm_flush_remote_tlbs(kvm); 384 kvm_flush_remote_tlbs(kvm);
387 385
386 spin_unlock(&kvm->mmu_lock);
387 srcu_read_unlock(&kvm->srcu, idx);
388
388 return young; 389 return young;
389} 390}
390 391
@@ -449,7 +450,7 @@ static void kvm_init_memslots_id(struct kvm *kvm)
449 slots->id_to_index[i] = slots->memslots[i].id = i; 450 slots->id_to_index[i] = slots->memslots[i].id = i;
450} 451}
451 452
452static struct kvm *kvm_create_vm(void) 453static struct kvm *kvm_create_vm(unsigned long type)
453{ 454{
454 int r, i; 455 int r, i;
455 struct kvm *kvm = kvm_arch_alloc_vm(); 456 struct kvm *kvm = kvm_arch_alloc_vm();
@@ -457,7 +458,7 @@ static struct kvm *kvm_create_vm(void)
457 if (!kvm) 458 if (!kvm)
458 return ERR_PTR(-ENOMEM); 459 return ERR_PTR(-ENOMEM);
459 460
460 r = kvm_arch_init_vm(kvm); 461 r = kvm_arch_init_vm(kvm, type);
461 if (r) 462 if (r)
462 goto out_err_nodisable; 463 goto out_err_nodisable;
463 464
@@ -535,21 +536,13 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
535static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 536static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
536 struct kvm_memory_slot *dont) 537 struct kvm_memory_slot *dont)
537{ 538{
538 int i;
539
540 if (!dont || free->rmap != dont->rmap) 539 if (!dont || free->rmap != dont->rmap)
541 vfree(free->rmap); 540 vfree(free->rmap);
542 541
543 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 542 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
544 kvm_destroy_dirty_bitmap(free); 543 kvm_destroy_dirty_bitmap(free);
545 544
546 545 kvm_arch_free_memslot(free, dont);
547 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
548 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
549 vfree(free->lpage_info[i]);
550 free->lpage_info[i] = NULL;
551 }
552 }
553 546
554 free->npages = 0; 547 free->npages = 0;
555 free->rmap = NULL; 548 free->rmap = NULL;
@@ -616,7 +609,6 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
616 return 0; 609 return 0;
617} 610}
618 611
619#ifndef CONFIG_S390
620/* 612/*
621 * Allocation size is twice as large as the actual dirty bitmap size. 613 * Allocation size is twice as large as the actual dirty bitmap size.
622 * This makes it possible to do double buffering: see x86's 614 * This makes it possible to do double buffering: see x86's
@@ -624,6 +616,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
624 */ 616 */
625static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 617static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
626{ 618{
619#ifndef CONFIG_S390
627 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 620 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
628 621
629 if (dirty_bytes > PAGE_SIZE) 622 if (dirty_bytes > PAGE_SIZE)
@@ -636,21 +629,8 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
636 629
637 memslot->dirty_bitmap_head = memslot->dirty_bitmap; 630 memslot->dirty_bitmap_head = memslot->dirty_bitmap;
638 memslot->nr_dirty_pages = 0; 631 memslot->nr_dirty_pages = 0;
639 return 0;
640}
641#endif /* !CONFIG_S390 */ 632#endif /* !CONFIG_S390 */
642 633 return 0;
643static struct kvm_memory_slot *
644search_memslots(struct kvm_memslots *slots, gfn_t gfn)
645{
646 struct kvm_memory_slot *memslot;
647
648 kvm_for_each_memslot(memslot, slots)
649 if (gfn >= memslot->base_gfn &&
650 gfn < memslot->base_gfn + memslot->npages)
651 return memslot;
652
653 return NULL;
654} 634}
655 635
656static int cmp_memslot(const void *slot1, const void *slot2) 636static int cmp_memslot(const void *slot1, const void *slot2)
@@ -778,69 +758,24 @@ int __kvm_set_memory_region(struct kvm *kvm,
778 r = -ENOMEM; 758 r = -ENOMEM;
779 759
780 /* Allocate if a slot is being created */ 760 /* Allocate if a slot is being created */
761 if (npages && !old.npages) {
762 new.user_alloc = user_alloc;
763 new.userspace_addr = mem->userspace_addr;
781#ifndef CONFIG_S390 764#ifndef CONFIG_S390
782 if (npages && !new.rmap) {
783 new.rmap = vzalloc(npages * sizeof(*new.rmap)); 765 new.rmap = vzalloc(npages * sizeof(*new.rmap));
784
785 if (!new.rmap) 766 if (!new.rmap)
786 goto out_free; 767 goto out_free;
787 768#endif /* not defined CONFIG_S390 */
788 new.user_alloc = user_alloc; 769 if (kvm_arch_create_memslot(&new, npages))
789 new.userspace_addr = mem->userspace_addr;
790 }
791 if (!npages)
792 goto skip_lpage;
793
794 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
795 unsigned long ugfn;
796 unsigned long j;
797 int lpages;
798 int level = i + 2;
799
800 /* Avoid unused variable warning if no large pages */
801 (void)level;
802
803 if (new.lpage_info[i])
804 continue;
805
806 lpages = 1 + ((base_gfn + npages - 1)
807 >> KVM_HPAGE_GFN_SHIFT(level));
808 lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
809
810 new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
811
812 if (!new.lpage_info[i])
813 goto out_free; 770 goto out_free;
814
815 if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
816 new.lpage_info[i][0].write_count = 1;
817 if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
818 new.lpage_info[i][lpages - 1].write_count = 1;
819 ugfn = new.userspace_addr >> PAGE_SHIFT;
820 /*
821 * If the gfn and userspace address are not aligned wrt each
822 * other, or if explicitly asked to, disable large page
823 * support for this slot
824 */
825 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
826 !largepages_enabled)
827 for (j = 0; j < lpages; ++j)
828 new.lpage_info[i][j].write_count = 1;
829 } 771 }
830 772
831skip_lpage:
832
833 /* Allocate page dirty bitmap if needed */ 773 /* Allocate page dirty bitmap if needed */
834 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 774 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
835 if (kvm_create_dirty_bitmap(&new) < 0) 775 if (kvm_create_dirty_bitmap(&new) < 0)
836 goto out_free; 776 goto out_free;
837 /* destroy any largepage mappings for dirty tracking */ 777 /* destroy any largepage mappings for dirty tracking */
838 } 778 }
839#else /* not defined CONFIG_S390 */
840 new.user_alloc = user_alloc;
841 if (user_alloc)
842 new.userspace_addr = mem->userspace_addr;
843#endif /* not defined CONFIG_S390 */
844 779
845 if (!npages) { 780 if (!npages) {
846 struct kvm_memory_slot *slot; 781 struct kvm_memory_slot *slot;
@@ -890,8 +825,7 @@ skip_lpage:
890 if (!npages) { 825 if (!npages) {
891 new.rmap = NULL; 826 new.rmap = NULL;
892 new.dirty_bitmap = NULL; 827 new.dirty_bitmap = NULL;
893 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) 828 memset(&new.arch, 0, sizeof(new.arch));
894 new.lpage_info[i] = NULL;
895 } 829 }
896 830
897 update_memslots(slots, &new); 831 update_memslots(slots, &new);
@@ -978,6 +912,11 @@ out:
978 return r; 912 return r;
979} 913}
980 914
915bool kvm_largepages_enabled(void)
916{
917 return largepages_enabled;
918}
919
981void kvm_disable_largepages(void) 920void kvm_disable_largepages(void)
982{ 921{
983 largepages_enabled = false; 922 largepages_enabled = false;
@@ -1031,12 +970,6 @@ int kvm_is_error_hva(unsigned long addr)
1031} 970}
1032EXPORT_SYMBOL_GPL(kvm_is_error_hva); 971EXPORT_SYMBOL_GPL(kvm_is_error_hva);
1033 972
1034static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
1035 gfn_t gfn)
1036{
1037 return search_memslots(slots, gfn);
1038}
1039
1040struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 973struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1041{ 974{
1042 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 975 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -1459,7 +1392,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1459 1392
1460 ghc->gpa = gpa; 1393 ghc->gpa = gpa;
1461 ghc->generation = slots->generation; 1394 ghc->generation = slots->generation;
1462 ghc->memslot = __gfn_to_memslot(slots, gfn); 1395 ghc->memslot = gfn_to_memslot(kvm, gfn);
1463 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); 1396 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
1464 if (!kvm_is_error_hva(ghc->hva)) 1397 if (!kvm_is_error_hva(ghc->hva))
1465 ghc->hva += offset; 1398 ghc->hva += offset;
@@ -1657,7 +1590,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1657 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1590 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
1658#endif 1591#endif
1659 else 1592 else
1660 return VM_FAULT_SIGBUS; 1593 return kvm_arch_vcpu_fault(vcpu, vmf);
1661 get_page(page); 1594 get_page(page);
1662 vmf->page = page; 1595 vmf->page = page;
1663 return 0; 1596 return 0;
@@ -1718,6 +1651,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1718 goto vcpu_destroy; 1651 goto vcpu_destroy;
1719 1652
1720 mutex_lock(&kvm->lock); 1653 mutex_lock(&kvm->lock);
1654 if (!kvm_vcpu_compatible(vcpu)) {
1655 r = -EINVAL;
1656 goto unlock_vcpu_destroy;
1657 }
1721 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { 1658 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1722 r = -EINVAL; 1659 r = -EINVAL;
1723 goto unlock_vcpu_destroy; 1660 goto unlock_vcpu_destroy;
@@ -2198,12 +2135,12 @@ static struct file_operations kvm_vm_fops = {
2198 .llseek = noop_llseek, 2135 .llseek = noop_llseek,
2199}; 2136};
2200 2137
2201static int kvm_dev_ioctl_create_vm(void) 2138static int kvm_dev_ioctl_create_vm(unsigned long type)
2202{ 2139{
2203 int r; 2140 int r;
2204 struct kvm *kvm; 2141 struct kvm *kvm;
2205 2142
2206 kvm = kvm_create_vm(); 2143 kvm = kvm_create_vm(type);
2207 if (IS_ERR(kvm)) 2144 if (IS_ERR(kvm))
2208 return PTR_ERR(kvm); 2145 return PTR_ERR(kvm);
2209#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2146#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
@@ -2254,10 +2191,7 @@ static long kvm_dev_ioctl(struct file *filp,
2254 r = KVM_API_VERSION; 2191 r = KVM_API_VERSION;
2255 break; 2192 break;
2256 case KVM_CREATE_VM: 2193 case KVM_CREATE_VM:
2257 r = -EINVAL; 2194 r = kvm_dev_ioctl_create_vm(arg);
2258 if (arg)
2259 goto out;
2260 r = kvm_dev_ioctl_create_vm();
2261 break; 2195 break;
2262 case KVM_CHECK_EXTENSION: 2196 case KVM_CHECK_EXTENSION:
2263 r = kvm_dev_ioctl_check_extension_generic(arg); 2197 r = kvm_dev_ioctl_check_extension_generic(arg);