author     Linus Torvalds <torvalds@linux-foundation.org>  2013-02-24 16:07:18 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-02-24 16:07:18 -0500
commit     89f883372fa60f604d136924baf3e89ff1870e9e (patch)
tree       cb69b0a14957945ba00d3d392bf9ccbbef56f3b8
parent     9e2d59ad580d590134285f361a0e80f0e98c0207 (diff)
parent     6b73a96065e89dc9fa75ba4f78b1aa3a3bbd0470 (diff)
Merge tag 'kvm-3.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Marcelo Tosatti:
 "KVM updates for the 3.9 merge window, including x86 real mode
  emulation fixes, stronger memory slot interface restrictions, mmu_lock
  spinlock hold time reduction, improved handling of large page faults
  on shadow, initial APICv HW acceleration support, s390 channel IO
  based virtio, amongst others"

* tag 'kvm-3.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits)
  Revert "KVM: MMU: lazily drop large spte"
  x86: pvclock kvm: align allocation size to page size
  KVM: nVMX: Remove redundant get_vmcs12 from nested_vmx_exit_handled_msr
  x86 emulator: fix parity calculation for AAD instruction
  KVM: PPC: BookE: Handle alignment interrupts
  booke: Added DBCR4 SPR number
  KVM: PPC: booke: Allow multiple exception types
  KVM: PPC: booke: use vcpu reference from thread_struct
  KVM: Remove user_alloc from struct kvm_memory_slot
  KVM: VMX: disable apicv by default
  KVM: s390: Fix handling of iscs.
  KVM: MMU: cleanup __direct_map
  KVM: MMU: remove pt_access in mmu_set_spte
  KVM: MMU: cleanup mapping-level
  KVM: MMU: lazily drop large spte
  KVM: VMX: cleanup vmx_set_cr0().
  KVM: VMX: add missing exit names to VMX_EXIT_REASONS array
  KVM: VMX: disable SMEP feature when guest is in non-paging mode
  KVM: Remove duplicate text in api.txt
  Revert "KVM: MMU: split kvm_mmu_free_page"
  ...
-rw-r--r--  Documentation/virtual/kvm/api.txt    108
-rw-r--r--  Documentation/virtual/kvm/mmu.txt      7
-rw-r--r--  arch/ia64/include/asm/kvm_host.h       4
-rw-r--r--  arch/ia64/kvm/kvm-ia64.c               8
-rw-r--r--  arch/ia64/kvm/lapic.h                  6
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h    8
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h    12
-rw-r--r--  arch/powerpc/include/asm/reg.h         2
-rw-r--r--  arch/powerpc/include/asm/reg_booke.h   1
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm.h    6
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c      2
-rw-r--r--  arch/powerpc/kvm/Makefile              9
-rw-r--r--  arch/powerpc/kvm/book3s_emulate.c     30
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c           2
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c           5
-rw-r--r--  arch/powerpc/kvm/booke.c              70
-rw-r--r--  arch/powerpc/kvm/booke.h               1
-rw-r--r--  arch/powerpc/kvm/booke_emulate.c       3
-rw-r--r--  arch/powerpc/kvm/booke_interrupts.S   49
-rw-r--r--  arch/powerpc/kvm/e500.c               16
-rw-r--r--  arch/powerpc/kvm/e500.h                1
-rw-r--r--  arch/powerpc/kvm/e500_mmu.c (renamed from arch/powerpc/kvm/e500_tlb.c)  659
-rw-r--r--  arch/powerpc/kvm/e500_mmu_host.c     699
-rw-r--r--  arch/powerpc/kvm/e500_mmu_host.h      18
-rw-r--r--  arch/powerpc/kvm/emulate.c             5
-rw-r--r--  arch/powerpc/kvm/powerpc.c            17
-rw-r--r--  arch/s390/include/asm/irq.h            1
-rw-r--r--  arch/s390/include/asm/kvm_host.h      15
-rw-r--r--  arch/s390/kernel/irq.c                 1
-rw-r--r--  arch/s390/kvm/intercept.c             45
-rw-r--r--  arch/s390/kvm/interrupt.c            264
-rw-r--r--  arch/s390/kvm/kvm-s390.c              50
-rw-r--r--  arch/s390/kvm/kvm-s390.h              46
-rw-r--r--  arch/s390/kvm/priv.c                 316
-rw-r--r--  arch/s390/kvm/sigp.c                  10
-rw-r--r--  arch/s390/kvm/trace-s390.h            26
-rw-r--r--  arch/x86/include/asm/kvm_host.h       26
-rw-r--r--  arch/x86/include/asm/kvm_para.h        2
-rw-r--r--  arch/x86/include/asm/vmx.h            18
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h        9
-rw-r--r--  arch/x86/kernel/kvmclock.c            11
-rw-r--r--  arch/x86/kvm/emulate.c               673
-rw-r--r--  arch/x86/kvm/i8254.c                   1
-rw-r--r--  arch/x86/kvm/i8259.c                   2
-rw-r--r--  arch/x86/kvm/irq.c                    74
-rw-r--r--  arch/x86/kvm/lapic.c                 140
-rw-r--r--  arch/x86/kvm/lapic.h                  34
-rw-r--r--  arch/x86/kvm/mmu.c                   168
-rw-r--r--  arch/x86/kvm/mmutrace.h                6
-rw-r--r--  arch/x86/kvm/paging_tmpl.h           106
-rw-r--r--  arch/x86/kvm/svm.c                    24
-rw-r--r--  arch/x86/kvm/vmx.c                   714
-rw-r--r--  arch/x86/kvm/x86.c                   168
-rw-r--r--  drivers/s390/kvm/Makefile              2
-rw-r--r--  drivers/s390/kvm/kvm_virtio.c         38
-rw-r--r--  drivers/s390/kvm/virtio_ccw.c        926
-rw-r--r--  include/linux/kvm_host.h              30
-rw-r--r--  include/trace/events/kvm.h             2
-rw-r--r--  include/uapi/linux/kvm.h              27
-rw-r--r--  kernel/sched/core.c                   25
-rw-r--r--  virt/kvm/ioapic.c                     48
-rw-r--r--  virt/kvm/ioapic.h                      4
-rw-r--r--  virt/kvm/iommu.c                       4
-rw-r--r--  virt/kvm/irq_comm.c                   25
-rw-r--r--  virt/kvm/kvm_main.c                  201
65 files changed, 4359 insertions, 1671 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e0fa0ea2b187..119358dfb742 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -219,19 +219,6 @@ allocation of vcpu ids. For example, if userspace wants
219single-threaded guest vcpus, it should make all vcpu ids be a multiple 219single-threaded guest vcpus, it should make all vcpu ids be a multiple
220of the number of vcpus per vcore. 220of the number of vcpus per vcore.
221 221
222On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
223threads in one or more virtual CPU cores. (This is because the
224hardware requires all the hardware threads in a CPU core to be in the
225same partition.) The KVM_CAP_PPC_SMT capability indicates the number
226of vcpus per virtual core (vcore). The vcore id is obtained by
227dividing the vcpu id by the number of vcpus per vcore. The vcpus in a
228given vcore will always be in the same physical core as each other
229(though that might be a different physical core from time to time).
230Userspace can control the threading (SMT) mode of the guest by its
231allocation of vcpu ids. For example, if userspace wants
232single-threaded guest vcpus, it should make all vcpu ids be a multiple
233of the number of vcpus per vcore.
234
235For virtual cpus that have been created with S390 user controlled virtual 222For virtual cpus that have been created with S390 user controlled virtual
236machines, the resulting vcpu fd can be memory mapped at page offset 223machines, the resulting vcpu fd can be memory mapped at page offset
237KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual 224KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual
@@ -345,7 +332,7 @@ struct kvm_sregs {
345 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; 332 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
346}; 333};
347 334
348/* ppc -- see arch/powerpc/include/asm/kvm.h */ 335/* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */
349 336
350interrupt_bitmap is a bitmap of pending external interrupts. At most 337interrupt_bitmap is a bitmap of pending external interrupts. At most
351one bit may be set. This interrupt has been acknowledged by the APIC 338one bit may be set. This interrupt has been acknowledged by the APIC
@@ -892,12 +879,12 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
892be identical. This allows large pages in the guest to be backed by large 879be identical. This allows large pages in the guest to be backed by large
893pages in the host. 880pages in the host.
894 881
895The flags field supports two flag, KVM_MEM_LOG_DIRTY_PAGES, which instructs 882The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and
896kvm to keep track of writes to memory within the slot. See KVM_GET_DIRTY_LOG 883KVM_MEM_READONLY. The former can be set to instruct KVM to keep track of
897ioctl. The KVM_CAP_READONLY_MEM capability indicates the availability of the 884writes to memory within the slot. See KVM_GET_DIRTY_LOG ioctl to know how to
898KVM_MEM_READONLY flag. When this flag is set for a memory region, KVM only 885use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it,
899allows read accesses. Writes will be posted to userspace as KVM_EXIT_MMIO 886to make a new slot read-only. In this case, writes to this memory will be
900exits. 887posted to userspace as KVM_EXIT_MMIO exits.
901 888
902When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of 889When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of
903the memory region are automatically reflected into the guest. For example, an 890the memory region are automatically reflected into the guest. For example, an
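A minimal userspace sketch of the flag usage described above (a hedged illustration, not part of the patch; vm_fd, slot and the sizes are assumed to come from the usual KVM setup, and error handling is trimmed):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

/* Register anonymous host memory as a read-only guest slot; guest writes to
 * this range will then surface as KVM_EXIT_MMIO exits. */
static int set_readonly_slot(int vm_fd, __u32 slot, __u64 gpa, size_t size)
{
	struct kvm_userspace_memory_region region;
	void *host_mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (host_mem == MAP_FAILED)
		return -1;

	memset(&region, 0, sizeof(region));
	region.slot = slot;
	region.flags = KVM_MEM_READONLY;  /* assumes KVM_CAP_READONLY_MEM is available */
	region.guest_phys_addr = gpa;
	region.memory_size = size;
	region.userspace_addr = (__u64)(unsigned long)host_mem;

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}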
@@ -931,7 +918,7 @@ documentation when it pops into existence).
9314.37 KVM_ENABLE_CAP 9184.37 KVM_ENABLE_CAP
932 919
933Capability: KVM_CAP_ENABLE_CAP 920Capability: KVM_CAP_ENABLE_CAP
934Architectures: ppc 921Architectures: ppc, s390
935Type: vcpu ioctl 922Type: vcpu ioctl
936Parameters: struct kvm_enable_cap (in) 923Parameters: struct kvm_enable_cap (in)
937Returns: 0 on success; -1 on error 924Returns: 0 on success; -1 on error
@@ -1792,6 +1779,7 @@ registers, find a list below:
1792 PPC | KVM_REG_PPC_VPA_SLB | 128 1779 PPC | KVM_REG_PPC_VPA_SLB | 128
1793 PPC | KVM_REG_PPC_VPA_DTL | 128 1780 PPC | KVM_REG_PPC_VPA_DTL | 128
1794 PPC | KVM_REG_PPC_EPCR | 32 1781 PPC | KVM_REG_PPC_EPCR | 32
1782 PPC | KVM_REG_PPC_EPR | 32
1795 1783
1796ARM registers are mapped using the lower 32 bits. The upper 16 of that 1784ARM registers are mapped using the lower 32 bits. The upper 16 of that
1797is the register group type, or coprocessor number: 1785is the register group type, or coprocessor number:
@@ -2108,6 +2096,14 @@ KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt
2108KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm 2096KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm
2109KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm 2097KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm
2110KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm 2098KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm
2099KVM_S390_INT_IO(ai,cssid,ssid,schid) (vm) - compound value to indicate an
2100 I/O interrupt (ai - adapter interrupt; cssid,ssid,schid - subchannel);
2101 I/O interruption parameters in parm (subchannel) and parm64 (intparm,
2102 interruption subclass)
2103KVM_S390_MCHK (vm, vcpu) - machine check interrupt; cr 14 bits in parm,
2104 machine check interrupt code in parm64 (note that
2105 machine checks needing further payload are not
2106 supported by this ioctl)
2111 2107
2112Note that the vcpu ioctl is asynchronous to vcpu execution. 2108Note that the vcpu ioctl is asynchronous to vcpu execution.
2113 2109
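As a hedged illustration of the compound I/O type described above (not from the patch; vm_fd and the subchannel values are placeholders, and parm/parm64 are simply passed through as the parameters described above without claiming a specific bit layout):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Inject a floating I/O interrupt for an illustrative subchannel into the VM. */
static int inject_io_interrupt(int vm_fd, __u32 parm, __u64 parm64)
{
	struct kvm_s390_interrupt irq = {
		/* ai=0, cssid=0, ssid=1, schid=0x0001 -- illustrative values */
		.type   = KVM_S390_INT_IO(0, 0, 1, 0x0001),
		.parm   = parm,    /* subchannel, as described above */
		.parm64 = parm64,  /* intparm and interruption subclass, as described above */
	};

	return ioctl(vm_fd, KVM_S390_INTERRUPT, &irq);
}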
@@ -2359,8 +2355,8 @@ executed a memory-mapped I/O instruction which could not be satisfied
2359by kvm. The 'data' member contains the written data if 'is_write' is 2355by kvm. The 'data' member contains the written data if 'is_write' is
2360true, and should be filled by application code otherwise. 2356true, and should be filled by application code otherwise.
2361 2357
2362NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR 2358NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR,
2363 and KVM_EXIT_PAPR the corresponding 2359 KVM_EXIT_PAPR and KVM_EXIT_EPR the corresponding
2364operations are complete (and guest state is consistent) only after userspace 2360operations are complete (and guest state is consistent) only after userspace
2365has re-entered the kernel with KVM_RUN. The kernel side will first finish 2361has re-entered the kernel with KVM_RUN. The kernel side will first finish
2366incomplete operations and then check for pending signals. Userspace 2362incomplete operations and then check for pending signals. Userspace
@@ -2463,6 +2459,41 @@ The possible hypercalls are defined in the Power Architecture Platform
2463Requirements (PAPR) document available from www.power.org (free 2459Requirements (PAPR) document available from www.power.org (free
2464developer registration required to access it). 2460developer registration required to access it).
2465 2461
2462 /* KVM_EXIT_S390_TSCH */
2463 struct {
2464 __u16 subchannel_id;
2465 __u16 subchannel_nr;
2466 __u32 io_int_parm;
2467 __u32 io_int_word;
2468 __u32 ipb;
2469 __u8 dequeued;
2470 } s390_tsch;
2471
2472s390 specific. This exit occurs when KVM_CAP_S390_CSS_SUPPORT has been enabled
2473and TEST SUBCHANNEL was intercepted. If dequeued is set, a pending I/O
2474interrupt for the target subchannel has been dequeued and subchannel_id,
2475subchannel_nr, io_int_parm and io_int_word contain the parameters for that
2476interrupt. ipb is needed for instruction parameter decoding.
2477
2478 /* KVM_EXIT_EPR */
2479 struct {
2480 __u32 epr;
2481 } epr;
2482
2483On FSL BookE PowerPC chips, the interrupt controller has a fast patch
2484interrupt acknowledge path to the core. When the core successfully
2485delivers an interrupt, it automatically populates the EPR register with
2486the interrupt vector number and acknowledges the interrupt inside
2487the interrupt controller.
2488
2489In case the interrupt controller lives in user space, we need to do
2490the interrupt acknowledge cycle through it to fetch the next to be
2491delivered interrupt vector using this exit.
2492
2493It gets triggered whenever both KVM_CAP_PPC_EPR are enabled and an
2494external interrupt has just been delivered into the guest. User space
2495should put the acknowledged interrupt vector into the 'epr' field.
2496
2466 /* Fix the size of the union. */ 2497 /* Fix the size of the union. */
2467 char padding[256]; 2498 char padding[256];
2468 }; 2499 };
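A hedged sketch of the userspace side of the KVM_EXIT_EPR handshake described above (vcpu_fd, mmap_size and ack_interrupt_in_user_pic() are assumptions standing in for the usual vcpu setup and the user space interrupt controller model):

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

/* Hypothetical user space interrupt controller model. */
extern __u32 ack_interrupt_in_user_pic(void);

static void run_vcpu(int vcpu_fd, size_t mmap_size)
{
	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED, vcpu_fd, 0);

	if (run == MAP_FAILED)
		return;

	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			break;

		switch (run->exit_reason) {
		case KVM_EXIT_EPR:
			/* Acknowledge the interrupt in the emulated controller
			 * and hand the vector back; KVM picks it up when we
			 * re-enter with the next KVM_RUN. */
			run->epr.epr = ack_interrupt_in_user_pic();
			break;
		default:
			/* handle other exit reasons here */
			break;
		}
	}
}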
@@ -2584,3 +2615,34 @@ For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV:
2584 where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value. 2615 where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value.
2585 - The tsize field of mas1 shall be set to 4K on TLB0, even though the 2616 - The tsize field of mas1 shall be set to 4K on TLB0, even though the
2586 hardware ignores this value for TLB0. 2617 hardware ignores this value for TLB0.
2618
26196.4 KVM_CAP_S390_CSS_SUPPORT
2620
2621Architectures: s390
2622Parameters: none
2623Returns: 0 on success; -1 on error
2624
2625This capability enables support for handling of channel I/O instructions.
2626
2627TEST PENDING INTERRUPTION and the interrupt portion of TEST SUBCHANNEL are
2628handled in-kernel, while the other I/O instructions are passed to userspace.
2629
2630When this capability is enabled, KVM_EXIT_S390_TSCH will occur on TEST
2631SUBCHANNEL intercepts.
2632
26336.5 KVM_CAP_PPC_EPR
2634
2635Architectures: ppc
2636Parameters: args[0] defines whether the proxy facility is active
2637Returns: 0 on success; -1 on error
2638
2639This capability enables or disables the delivery of interrupts through the
2640external proxy facility.
2641
2642When enabled (args[0] != 0), every time the guest gets an external interrupt
2643delivered, it automatically exits into user space with a KVM_EXIT_EPR exit
2644to receive the topmost interrupt vector.
2645
2646When disabled (args[0] == 0), behavior is as if this facility is unsupported.
2647
2648When this capability is enabled, KVM_EXIT_EPR can occur.
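A hedged sketch of enabling this capability from userspace (vcpu_fd is assumed to be an already created vcpu; error handling kept minimal):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Turn on the external proxy facility so external interrupts trigger
 * KVM_EXIT_EPR, as described in section 6.5 above. */
static int enable_epr(int vcpu_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_PPC_EPR;
	cap.args[0] = 1;  /* non-zero: activate the proxy facility */

	return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}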
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index fa5f1dbc6b23..43fcb761ed16 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -187,13 +187,6 @@ Shadow pages contain the following information:
187 perform a reverse map from a pte to a gfn. When role.direct is set, any 187 perform a reverse map from a pte to a gfn. When role.direct is set, any
188 element of this array can be calculated from the gfn field when used, in 188 element of this array can be calculated from the gfn field when used, in
189 this case, the array of gfns is not allocated. See role.direct and gfn. 189 this case, the array of gfns is not allocated. See role.direct and gfn.
190 slot_bitmap:
191 A bitmap containing one bit per memory slot. If the page contains a pte
192 mapping a page from memory slot n, then bit n of slot_bitmap will be set
193 (if a page is aliased among several slots, then it is not guaranteed that
194 all slots will be marked).
195 Used during dirty logging to avoid scanning a shadow page if none if its
196 pages need tracking.
197 root_count: 190 root_count:
198 A counter keeping track of how many hardware registers (guest cr3 or 191 A counter keeping track of how many hardware registers (guest cr3 or
199 pdptrs) are now pointing at the page. While this counter is nonzero, the 192 pdptrs) are now pointing at the page. While this counter is nonzero, the
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 6d6a5ac48d85..cfa74983c675 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -23,9 +23,7 @@
23#ifndef __ASM_KVM_HOST_H 23#ifndef __ASM_KVM_HOST_H
24#define __ASM_KVM_HOST_H 24#define __ASM_KVM_HOST_H
25 25
26#define KVM_MEMORY_SLOTS 32 26#define KVM_USER_MEM_SLOTS 32
27/* memory slots that does not exposed to userspace */
28#define KVM_PRIVATE_MEM_SLOTS 4
29 27
30#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 28#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
31 29
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index bd1c51555038..ad3126a58644 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -955,7 +955,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
955 kvm_mem.guest_phys_addr; 955 kvm_mem.guest_phys_addr;
956 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 956 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
957 r = kvm_vm_ioctl_set_memory_region(kvm, 957 r = kvm_vm_ioctl_set_memory_region(kvm,
958 &kvm_userspace_mem, 0); 958 &kvm_userspace_mem, false);
959 if (r) 959 if (r)
960 goto out; 960 goto out;
961 break; 961 break;
@@ -1580,7 +1580,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
1580 struct kvm_memory_slot *memslot, 1580 struct kvm_memory_slot *memslot,
1581 struct kvm_memory_slot old, 1581 struct kvm_memory_slot old,
1582 struct kvm_userspace_memory_region *mem, 1582 struct kvm_userspace_memory_region *mem,
1583 int user_alloc) 1583 bool user_alloc)
1584{ 1584{
1585 unsigned long i; 1585 unsigned long i;
1586 unsigned long pfn; 1586 unsigned long pfn;
@@ -1611,7 +1611,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
1611void kvm_arch_commit_memory_region(struct kvm *kvm, 1611void kvm_arch_commit_memory_region(struct kvm *kvm,
1612 struct kvm_userspace_memory_region *mem, 1612 struct kvm_userspace_memory_region *mem,
1613 struct kvm_memory_slot old, 1613 struct kvm_memory_slot old,
1614 int user_alloc) 1614 bool user_alloc)
1615{ 1615{
1616 return; 1616 return;
1617} 1617}
@@ -1834,7 +1834,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1834 mutex_lock(&kvm->slots_lock); 1834 mutex_lock(&kvm->slots_lock);
1835 1835
1836 r = -EINVAL; 1836 r = -EINVAL;
1837 if (log->slot >= KVM_MEMORY_SLOTS) 1837 if (log->slot >= KVM_USER_MEM_SLOTS)
1838 goto out; 1838 goto out;
1839 1839
1840 memslot = id_to_memslot(kvm->memslots, log->slot); 1840 memslot = id_to_memslot(kvm->memslots, log->slot);
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index c5f92a926a9a..c3e2935b6db4 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -27,4 +27,10 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
27#define kvm_apic_present(x) (true) 27#define kvm_apic_present(x) (true)
28#define kvm_lapic_enabled(x) (true) 28#define kvm_lapic_enabled(x) (true)
29 29
30static inline bool kvm_apic_vid_enabled(void)
31{
32 /* IA64 has no apicv supporting, do nothing here */
33 return false;
34}
35
30#endif 36#endif
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 03d7beae89a0..d1bb86074721 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -37,10 +37,8 @@
37 37
38#define KVM_MAX_VCPUS NR_CPUS 38#define KVM_MAX_VCPUS NR_CPUS
39#define KVM_MAX_VCORES NR_CPUS 39#define KVM_MAX_VCORES NR_CPUS
40#define KVM_MEMORY_SLOTS 32 40#define KVM_USER_MEM_SLOTS 32
41/* memory slots that does not exposed to userspace */ 41#define KVM_MEM_SLOTS_NUM KVM_USER_MEM_SLOTS
42#define KVM_PRIVATE_MEM_SLOTS 4
43#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
44 42
45#ifdef CONFIG_KVM_MMIO 43#ifdef CONFIG_KVM_MMIO
46#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 44#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@ -523,6 +521,8 @@ struct kvm_vcpu_arch {
523 u8 sane; 521 u8 sane;
524 u8 cpu_type; 522 u8 cpu_type;
525 u8 hcall_needed; 523 u8 hcall_needed;
524 u8 epr_enabled;
525 u8 epr_needed;
526 526
527 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ 527 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
528 528
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 572aa7530619..44a657adf416 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -44,12 +44,11 @@ enum emulation_result {
44 EMULATE_DO_DCR, /* kvm_run filled with DCR request */ 44 EMULATE_DO_DCR, /* kvm_run filled with DCR request */
45 EMULATE_FAIL, /* can't emulate this instruction */ 45 EMULATE_FAIL, /* can't emulate this instruction */
46 EMULATE_AGAIN, /* something went wrong. go again */ 46 EMULATE_AGAIN, /* something went wrong. go again */
47 EMULATE_DO_PAPR, /* kvm_run filled with PAPR request */
47}; 48};
48 49
49extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); 50extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
50extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); 51extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
51extern char kvmppc_handlers_start[];
52extern unsigned long kvmppc_handler_len;
53extern void kvmppc_handler_highmem(void); 52extern void kvmppc_handler_highmem(void);
54 53
55extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu); 54extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu);
@@ -263,6 +262,15 @@ static inline void kvm_linear_init(void)
263{} 262{}
264#endif 263#endif
265 264
265static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
266{
267#ifdef CONFIG_KVM_BOOKE_HV
268 mtspr(SPRN_GEPR, epr);
269#elif defined(CONFIG_BOOKE)
270 vcpu->arch.epr = epr;
271#endif
272}
273
266int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, 274int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
267 struct kvm_config_tlb *cfg); 275 struct kvm_config_tlb *cfg);
268int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, 276int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 7035e608f3fa..e66586122030 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -956,8 +956,6 @@
956#define SPRN_SPRG_RSCRATCH_DBG SPRN_SPRG9 956#define SPRN_SPRG_RSCRATCH_DBG SPRN_SPRG9
957#define SPRN_SPRG_WSCRATCH_DBG SPRN_SPRG9 957#define SPRN_SPRG_WSCRATCH_DBG SPRN_SPRG9
958#endif 958#endif
959#define SPRN_SPRG_RVCPU SPRN_SPRG1
960#define SPRN_SPRG_WVCPU SPRN_SPRG1
961#endif 959#endif
962 960
963#ifdef CONFIG_8xx 961#ifdef CONFIG_8xx
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index e07e6af5e1ff..b417de3cc2c4 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -56,6 +56,7 @@
56#define SPRN_SPRG7W 0x117 /* Special Purpose Register General 7 Write */ 56#define SPRN_SPRG7W 0x117 /* Special Purpose Register General 7 Write */
57#define SPRN_EPCR 0x133 /* Embedded Processor Control Register */ 57#define SPRN_EPCR 0x133 /* Embedded Processor Control Register */
58#define SPRN_DBCR2 0x136 /* Debug Control Register 2 */ 58#define SPRN_DBCR2 0x136 /* Debug Control Register 2 */
59#define SPRN_DBCR4 0x233 /* Debug Control Register 4 */
59#define SPRN_MSRP 0x137 /* MSR Protect Register */ 60#define SPRN_MSRP 0x137 /* MSR Protect Register */
60#define SPRN_IAC3 0x13A /* Instruction Address Compare 3 */ 61#define SPRN_IAC3 0x13A /* Instruction Address Compare 3 */
61#define SPRN_IAC4 0x13B /* Instruction Address Compare 4 */ 62#define SPRN_IAC4 0x13B /* Instruction Address Compare 4 */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 2fba8a66fb10..16064d00adb9 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -114,7 +114,10 @@ struct kvm_regs {
114/* Embedded Floating Point (SPE) -- IVOR32-34 if KVM_SREGS_E_IVOR */ 114/* Embedded Floating Point (SPE) -- IVOR32-34 if KVM_SREGS_E_IVOR */
115#define KVM_SREGS_E_SPE (1 << 9) 115#define KVM_SREGS_E_SPE (1 << 9)
116 116
117/* External Proxy (EXP) -- EPR */ 117/*
118 * DEPRECATED! USE ONE_REG FOR THIS ONE!
119 * External Proxy (EXP) -- EPR
120 */
118#define KVM_SREGS_EXP (1 << 10) 121#define KVM_SREGS_EXP (1 << 10)
119 122
120/* External PID (E.PD) -- EPSC/EPLC */ 123/* External PID (E.PD) -- EPSC/EPLC */
@@ -412,5 +415,6 @@ struct kvm_get_htab_header {
412#define KVM_REG_PPC_VPA_DTL (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84) 415#define KVM_REG_PPC_VPA_DTL (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84)
413 416
414#define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85) 417#define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85)
418#define KVM_REG_PPC_EPR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86)
415 419
416#endif /* __LINUX_KVM_POWERPC_H */ 420#endif /* __LINUX_KVM_POWERPC_H */
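Since the comment above deprecates the sregs path for EPR, a hedged sketch of setting the register through ONE_REG instead (vcpu_fd is an assumption; the vector value is illustrative):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Write the guest EPR via the ONE_REG interface using the new
 * KVM_REG_PPC_EPR id defined above. */
static int set_epr_via_one_reg(int vcpu_fd, __u32 vector)
{
	__u32 epr = vector;
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_EPR,
		.addr = (__u64)(unsigned long)&epr,
	};

	return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}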
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 781190367292..b6c17ec9b169 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -118,7 +118,7 @@ int main(void)
118#ifdef CONFIG_KVM_BOOK3S_32_HANDLER 118#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
119 DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu)); 119 DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
120#endif 120#endif
121#ifdef CONFIG_KVM_BOOKE_HV 121#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
122 DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu)); 122 DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu));
123#endif 123#endif
124 124
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 1e473d46322c..b772eded8c26 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -10,7 +10,8 @@ common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \
10 eventfd.o) 10 eventfd.o)
11 11
12CFLAGS_44x_tlb.o := -I. 12CFLAGS_44x_tlb.o := -I.
13CFLAGS_e500_tlb.o := -I. 13CFLAGS_e500_mmu.o := -I.
14CFLAGS_e500_mmu_host.o := -I.
14CFLAGS_emulate.o := -I. 15CFLAGS_emulate.o := -I.
15 16
16common-objs-y += powerpc.o emulate.o 17common-objs-y += powerpc.o emulate.o
@@ -35,7 +36,8 @@ kvm-e500-objs := \
35 booke_emulate.o \ 36 booke_emulate.o \
36 booke_interrupts.o \ 37 booke_interrupts.o \
37 e500.o \ 38 e500.o \
38 e500_tlb.o \ 39 e500_mmu.o \
40 e500_mmu_host.o \
39 e500_emulate.o 41 e500_emulate.o
40kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs) 42kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs)
41 43
@@ -45,7 +47,8 @@ kvm-e500mc-objs := \
45 booke_emulate.o \ 47 booke_emulate.o \
46 bookehv_interrupts.o \ 48 bookehv_interrupts.o \
47 e500mc.o \ 49 e500mc.o \
48 e500_tlb.o \ 50 e500_mmu.o \
51 e500_mmu_host.o \
49 e500_emulate.o 52 e500_emulate.o
50kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) 53kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
51 54
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index d31a716f7f2b..836c56975e21 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -34,6 +34,8 @@
34#define OP_31_XOP_MTSRIN 242 34#define OP_31_XOP_MTSRIN 242
35#define OP_31_XOP_TLBIEL 274 35#define OP_31_XOP_TLBIEL 274
36#define OP_31_XOP_TLBIE 306 36#define OP_31_XOP_TLBIE 306
37/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
38#define OP_31_XOP_FAKE_SC1 308
37#define OP_31_XOP_SLBMTE 402 39#define OP_31_XOP_SLBMTE 402
38#define OP_31_XOP_SLBIE 434 40#define OP_31_XOP_SLBIE 434
39#define OP_31_XOP_SLBIA 498 41#define OP_31_XOP_SLBIA 498
@@ -170,6 +172,32 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
170 vcpu->arch.mmu.tlbie(vcpu, addr, large); 172 vcpu->arch.mmu.tlbie(vcpu, addr, large);
171 break; 173 break;
172 } 174 }
175#ifdef CONFIG_KVM_BOOK3S_64_PR
176 case OP_31_XOP_FAKE_SC1:
177 {
178 /* SC 1 papr hypercalls */
179 ulong cmd = kvmppc_get_gpr(vcpu, 3);
180 int i;
181
182 if ((vcpu->arch.shared->msr & MSR_PR) ||
183 !vcpu->arch.papr_enabled) {
184 emulated = EMULATE_FAIL;
185 break;
186 }
187
188 if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE)
189 break;
190
191 run->papr_hcall.nr = cmd;
192 for (i = 0; i < 9; ++i) {
193 ulong gpr = kvmppc_get_gpr(vcpu, 4 + i);
194 run->papr_hcall.args[i] = gpr;
195 }
196
197 emulated = EMULATE_DO_PAPR;
198 break;
199 }
200#endif
173 case OP_31_XOP_EIOIO: 201 case OP_31_XOP_EIOIO:
174 break; 202 break;
175 case OP_31_XOP_SLBMTE: 203 case OP_31_XOP_SLBMTE:
@@ -427,6 +455,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
427 case SPRN_PMC3_GEKKO: 455 case SPRN_PMC3_GEKKO:
428 case SPRN_PMC4_GEKKO: 456 case SPRN_PMC4_GEKKO:
429 case SPRN_WPAR_GEKKO: 457 case SPRN_WPAR_GEKKO:
458 case SPRN_MSSSR0:
430 break; 459 break;
431unprivileged: 460unprivileged:
432 default: 461 default:
@@ -523,6 +552,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
523 case SPRN_PMC3_GEKKO: 552 case SPRN_PMC3_GEKKO:
524 case SPRN_PMC4_GEKKO: 553 case SPRN_PMC4_GEKKO:
525 case SPRN_WPAR_GEKKO: 554 case SPRN_WPAR_GEKKO:
555 case SPRN_MSSSR0:
526 *spr_val = 0; 556 *spr_val = 0;
527 break; 557 break;
528 default: 558 default:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 71d0c90b62bf..80dcc53a1aba 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1549,7 +1549,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
1549 mutex_lock(&kvm->slots_lock); 1549 mutex_lock(&kvm->slots_lock);
1550 1550
1551 r = -EINVAL; 1551 r = -EINVAL;
1552 if (log->slot >= KVM_MEMORY_SLOTS) 1552 if (log->slot >= KVM_USER_MEM_SLOTS)
1553 goto out; 1553 goto out;
1554 1554
1555 memslot = id_to_memslot(kvm->memslots, log->slot); 1555 memslot = id_to_memslot(kvm->memslots, log->slot);
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 6702442ca818..5e93438afb06 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -762,6 +762,11 @@ program_interrupt:
762 run->exit_reason = KVM_EXIT_MMIO; 762 run->exit_reason = KVM_EXIT_MMIO;
763 r = RESUME_HOST_NV; 763 r = RESUME_HOST_NV;
764 break; 764 break;
765 case EMULATE_DO_PAPR:
766 run->exit_reason = KVM_EXIT_PAPR_HCALL;
767 vcpu->arch.hcall_needed = 1;
768 r = RESUME_HOST_NV;
769 break;
765 default: 770 default:
766 BUG(); 771 BUG();
767 } 772 }
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 69f114015780..020923e43134 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -182,6 +182,14 @@ static void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu,
182 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE); 182 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
183} 183}
184 184
185static void kvmppc_core_queue_alignment(struct kvm_vcpu *vcpu, ulong dear_flags,
186 ulong esr_flags)
187{
188 vcpu->arch.queued_dear = dear_flags;
189 vcpu->arch.queued_esr = esr_flags;
190 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALIGNMENT);
191}
192
185void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags) 193void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags)
186{ 194{
187 vcpu->arch.queued_esr = esr_flags; 195 vcpu->arch.queued_esr = esr_flags;
@@ -300,13 +308,22 @@ static void set_guest_esr(struct kvm_vcpu *vcpu, u32 esr)
300#endif 308#endif
301} 309}
302 310
311static unsigned long get_guest_epr(struct kvm_vcpu *vcpu)
312{
313#ifdef CONFIG_KVM_BOOKE_HV
314 return mfspr(SPRN_GEPR);
315#else
316 return vcpu->arch.epr;
317#endif
318}
319
303/* Deliver the interrupt of the corresponding priority, if possible. */ 320/* Deliver the interrupt of the corresponding priority, if possible. */
304static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, 321static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
305 unsigned int priority) 322 unsigned int priority)
306{ 323{
307 int allowed = 0; 324 int allowed = 0;
308 ulong msr_mask = 0; 325 ulong msr_mask = 0;
309 bool update_esr = false, update_dear = false; 326 bool update_esr = false, update_dear = false, update_epr = false;
310 ulong crit_raw = vcpu->arch.shared->critical; 327 ulong crit_raw = vcpu->arch.shared->critical;
311 ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); 328 ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
312 bool crit; 329 bool crit;
@@ -330,9 +347,13 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
330 keep_irq = true; 347 keep_irq = true;
331 } 348 }
332 349
350 if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_enabled)
351 update_epr = true;
352
333 switch (priority) { 353 switch (priority) {
334 case BOOKE_IRQPRIO_DTLB_MISS: 354 case BOOKE_IRQPRIO_DTLB_MISS:
335 case BOOKE_IRQPRIO_DATA_STORAGE: 355 case BOOKE_IRQPRIO_DATA_STORAGE:
356 case BOOKE_IRQPRIO_ALIGNMENT:
336 update_dear = true; 357 update_dear = true;
337 /* fall through */ 358 /* fall through */
338 case BOOKE_IRQPRIO_INST_STORAGE: 359 case BOOKE_IRQPRIO_INST_STORAGE:
@@ -346,7 +367,6 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
346 case BOOKE_IRQPRIO_SPE_FP_DATA: 367 case BOOKE_IRQPRIO_SPE_FP_DATA:
347 case BOOKE_IRQPRIO_SPE_FP_ROUND: 368 case BOOKE_IRQPRIO_SPE_FP_ROUND:
348 case BOOKE_IRQPRIO_AP_UNAVAIL: 369 case BOOKE_IRQPRIO_AP_UNAVAIL:
349 case BOOKE_IRQPRIO_ALIGNMENT:
350 allowed = 1; 370 allowed = 1;
351 msr_mask = MSR_CE | MSR_ME | MSR_DE; 371 msr_mask = MSR_CE | MSR_ME | MSR_DE;
352 int_class = INT_CLASS_NONCRIT; 372 int_class = INT_CLASS_NONCRIT;
@@ -408,6 +428,8 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
408 set_guest_esr(vcpu, vcpu->arch.queued_esr); 428 set_guest_esr(vcpu, vcpu->arch.queued_esr);
409 if (update_dear == true) 429 if (update_dear == true)
410 set_guest_dear(vcpu, vcpu->arch.queued_dear); 430 set_guest_dear(vcpu, vcpu->arch.queued_dear);
431 if (update_epr == true)
432 kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
411 433
412 new_msr &= msr_mask; 434 new_msr &= msr_mask;
413#if defined(CONFIG_64BIT) 435#if defined(CONFIG_64BIT)
@@ -581,6 +603,11 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
581 603
582 kvmppc_core_check_exceptions(vcpu); 604 kvmppc_core_check_exceptions(vcpu);
583 605
606 if (vcpu->requests) {
607 /* Exception delivery raised request; start over */
608 return 1;
609 }
610
584 if (vcpu->arch.shared->msr & MSR_WE) { 611 if (vcpu->arch.shared->msr & MSR_WE) {
585 local_irq_enable(); 612 local_irq_enable();
586 kvm_vcpu_block(vcpu); 613 kvm_vcpu_block(vcpu);
@@ -610,6 +637,13 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
610 r = 0; 637 r = 0;
611 } 638 }
612 639
640 if (kvm_check_request(KVM_REQ_EPR_EXIT, vcpu)) {
641 vcpu->run->epr.epr = 0;
642 vcpu->arch.epr_needed = true;
643 vcpu->run->exit_reason = KVM_EXIT_EPR;
644 r = 0;
645 }
646
613 return r; 647 return r;
614} 648}
615 649
@@ -945,6 +979,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
945 r = RESUME_GUEST; 979 r = RESUME_GUEST;
946 break; 980 break;
947 981
982 case BOOKE_INTERRUPT_ALIGNMENT:
983 kvmppc_core_queue_alignment(vcpu, vcpu->arch.fault_dear,
984 vcpu->arch.fault_esr);
985 r = RESUME_GUEST;
986 break;
987
948#ifdef CONFIG_KVM_BOOKE_HV 988#ifdef CONFIG_KVM_BOOKE_HV
949 case BOOKE_INTERRUPT_HV_SYSCALL: 989 case BOOKE_INTERRUPT_HV_SYSCALL:
950 if (!(vcpu->arch.shared->msr & MSR_PR)) { 990 if (!(vcpu->arch.shared->msr & MSR_PR)) {
@@ -1388,6 +1428,11 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
1388 &vcpu->arch.dbg_reg.dac[dac], sizeof(u64)); 1428 &vcpu->arch.dbg_reg.dac[dac], sizeof(u64));
1389 break; 1429 break;
1390 } 1430 }
1431 case KVM_REG_PPC_EPR: {
1432 u32 epr = get_guest_epr(vcpu);
1433 r = put_user(epr, (u32 __user *)(long)reg->addr);
1434 break;
1435 }
1391#if defined(CONFIG_64BIT) 1436#if defined(CONFIG_64BIT)
1392 case KVM_REG_PPC_EPCR: 1437 case KVM_REG_PPC_EPCR:
1393 r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr); 1438 r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr);
@@ -1420,6 +1465,13 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
1420 (u64 __user *)(long)reg->addr, sizeof(u64)); 1465 (u64 __user *)(long)reg->addr, sizeof(u64));
1421 break; 1466 break;
1422 } 1467 }
1468 case KVM_REG_PPC_EPR: {
1469 u32 new_epr;
1470 r = get_user(new_epr, (u32 __user *)(long)reg->addr);
1471 if (!r)
1472 kvmppc_set_epr(vcpu, new_epr);
1473 break;
1474 }
1423#if defined(CONFIG_64BIT) 1475#if defined(CONFIG_64BIT)
1424 case KVM_REG_PPC_EPCR: { 1476 case KVM_REG_PPC_EPCR: {
1425 u32 new_epcr; 1477 u32 new_epcr;
@@ -1556,7 +1608,9 @@ int __init kvmppc_booke_init(void)
1556{ 1608{
1557#ifndef CONFIG_KVM_BOOKE_HV 1609#ifndef CONFIG_KVM_BOOKE_HV
1558 unsigned long ivor[16]; 1610 unsigned long ivor[16];
1611 unsigned long *handler = kvmppc_booke_handler_addr;
1559 unsigned long max_ivor = 0; 1612 unsigned long max_ivor = 0;
1613 unsigned long handler_len;
1560 int i; 1614 int i;
1561 1615
1562 /* We install our own exception handlers by hijacking IVPR. IVPR must 1616 /* We install our own exception handlers by hijacking IVPR. IVPR must
@@ -1589,14 +1643,16 @@ int __init kvmppc_booke_init(void)
1589 1643
1590 for (i = 0; i < 16; i++) { 1644 for (i = 0; i < 16; i++) {
1591 if (ivor[i] > max_ivor) 1645 if (ivor[i] > max_ivor)
1592 max_ivor = ivor[i]; 1646 max_ivor = i;
1593 1647
1648 handler_len = handler[i + 1] - handler[i];
1594 memcpy((void *)kvmppc_booke_handlers + ivor[i], 1649 memcpy((void *)kvmppc_booke_handlers + ivor[i],
1595 kvmppc_handlers_start + i * kvmppc_handler_len, 1650 (void *)handler[i], handler_len);
1596 kvmppc_handler_len);
1597 } 1651 }
1598 flush_icache_range(kvmppc_booke_handlers, 1652
1599 kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); 1653 handler_len = handler[max_ivor + 1] - handler[max_ivor];
1654 flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers +
1655 ivor[max_ivor] + handler_len);
1600#endif /* !BOOKE_HV */ 1656#endif /* !BOOKE_HV */
1601 return 0; 1657 return 0;
1602} 1658}
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index e9b88e433f64..5fd1ba693579 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -65,6 +65,7 @@
65 (1 << BOOKE_IRQPRIO_CRITICAL)) 65 (1 << BOOKE_IRQPRIO_CRITICAL))
66 66
67extern unsigned long kvmppc_booke_handlers; 67extern unsigned long kvmppc_booke_handlers;
68extern unsigned long kvmppc_booke_handler_addr[];
68 69
69void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); 70void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
70void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); 71void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 4685b8cf2249..27a4b2877c10 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -269,6 +269,9 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
269 case SPRN_ESR: 269 case SPRN_ESR:
270 *spr_val = vcpu->arch.shared->esr; 270 *spr_val = vcpu->arch.shared->esr;
271 break; 271 break;
272 case SPRN_EPR:
273 *spr_val = vcpu->arch.epr;
274 break;
272 case SPRN_CSRR0: 275 case SPRN_CSRR0:
273 *spr_val = vcpu->arch.csrr0; 276 *spr_val = vcpu->arch.csrr0;
274 break; 277 break;
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index bb46b32f9813..f4bb55c96517 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -45,18 +45,21 @@
45 (1<<BOOKE_INTERRUPT_DEBUG)) 45 (1<<BOOKE_INTERRUPT_DEBUG))
46 46
47#define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \ 47#define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
48 (1<<BOOKE_INTERRUPT_DTLB_MISS)) 48 (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
49 (1<<BOOKE_INTERRUPT_ALIGNMENT))
49 50
50#define NEED_ESR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \ 51#define NEED_ESR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
51 (1<<BOOKE_INTERRUPT_INST_STORAGE) | \ 52 (1<<BOOKE_INTERRUPT_INST_STORAGE) | \
52 (1<<BOOKE_INTERRUPT_PROGRAM) | \ 53 (1<<BOOKE_INTERRUPT_PROGRAM) | \
53 (1<<BOOKE_INTERRUPT_DTLB_MISS)) 54 (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
55 (1<<BOOKE_INTERRUPT_ALIGNMENT))
54 56
55.macro KVM_HANDLER ivor_nr scratch srr0 57.macro KVM_HANDLER ivor_nr scratch srr0
56_GLOBAL(kvmppc_handler_\ivor_nr) 58_GLOBAL(kvmppc_handler_\ivor_nr)
57 /* Get pointer to vcpu and record exit number. */ 59 /* Get pointer to vcpu and record exit number. */
58 mtspr \scratch , r4 60 mtspr \scratch , r4
59 mfspr r4, SPRN_SPRG_RVCPU 61 mfspr r4, SPRN_SPRG_THREAD
62 lwz r4, THREAD_KVM_VCPU(r4)
60 stw r3, VCPU_GPR(R3)(r4) 63 stw r3, VCPU_GPR(R3)(r4)
61 stw r5, VCPU_GPR(R5)(r4) 64 stw r5, VCPU_GPR(R5)(r4)
62 stw r6, VCPU_GPR(R6)(r4) 65 stw r6, VCPU_GPR(R6)(r4)
@@ -73,6 +76,14 @@ _GLOBAL(kvmppc_handler_\ivor_nr)
73 bctr 76 bctr
74.endm 77.endm
75 78
79.macro KVM_HANDLER_ADDR ivor_nr
80 .long kvmppc_handler_\ivor_nr
81.endm
82
83.macro KVM_HANDLER_END
84 .long kvmppc_handlers_end
85.endm
86
76_GLOBAL(kvmppc_handlers_start) 87_GLOBAL(kvmppc_handlers_start)
77KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 88KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
78KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0 89KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0
@@ -93,9 +104,7 @@ KVM_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
93KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 104KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
94KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0 105KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0
95KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0 106KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0
96 107_GLOBAL(kvmppc_handlers_end)
97_GLOBAL(kvmppc_handler_len)
98 .long kvmppc_handler_1 - kvmppc_handler_0
99 108
100/* Registers: 109/* Registers:
101 * SPRG_SCRATCH0: guest r4 110 * SPRG_SCRATCH0: guest r4
@@ -402,9 +411,6 @@ lightweight_exit:
402 lwz r8, kvmppc_booke_handlers@l(r8) 411 lwz r8, kvmppc_booke_handlers@l(r8)
403 mtspr SPRN_IVPR, r8 412 mtspr SPRN_IVPR, r8
404 413
405 /* Save vcpu pointer for the exception handlers. */
406 mtspr SPRN_SPRG_WVCPU, r4
407
408 lwz r5, VCPU_SHARED(r4) 414 lwz r5, VCPU_SHARED(r4)
409 415
410 /* Can't switch the stack pointer until after IVPR is switched, 416 /* Can't switch the stack pointer until after IVPR is switched,
@@ -463,6 +469,31 @@ lightweight_exit:
463 lwz r4, VCPU_GPR(R4)(r4) 469 lwz r4, VCPU_GPR(R4)(r4)
464 rfi 470 rfi
465 471
472 .data
473 .align 4
474 .globl kvmppc_booke_handler_addr
475kvmppc_booke_handler_addr:
476KVM_HANDLER_ADDR BOOKE_INTERRUPT_CRITICAL
477KVM_HANDLER_ADDR BOOKE_INTERRUPT_MACHINE_CHECK
478KVM_HANDLER_ADDR BOOKE_INTERRUPT_DATA_STORAGE
479KVM_HANDLER_ADDR BOOKE_INTERRUPT_INST_STORAGE
480KVM_HANDLER_ADDR BOOKE_INTERRUPT_EXTERNAL
481KVM_HANDLER_ADDR BOOKE_INTERRUPT_ALIGNMENT
482KVM_HANDLER_ADDR BOOKE_INTERRUPT_PROGRAM
483KVM_HANDLER_ADDR BOOKE_INTERRUPT_FP_UNAVAIL
484KVM_HANDLER_ADDR BOOKE_INTERRUPT_SYSCALL
485KVM_HANDLER_ADDR BOOKE_INTERRUPT_AP_UNAVAIL
486KVM_HANDLER_ADDR BOOKE_INTERRUPT_DECREMENTER
487KVM_HANDLER_ADDR BOOKE_INTERRUPT_FIT
488KVM_HANDLER_ADDR BOOKE_INTERRUPT_WATCHDOG
489KVM_HANDLER_ADDR BOOKE_INTERRUPT_DTLB_MISS
490KVM_HANDLER_ADDR BOOKE_INTERRUPT_ITLB_MISS
491KVM_HANDLER_ADDR BOOKE_INTERRUPT_DEBUG
492KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_UNAVAIL
493KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_DATA
494KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_ROUND
495KVM_HANDLER_END /*Always keep this in end*/
496
466#ifdef CONFIG_SPE 497#ifdef CONFIG_SPE
467_GLOBAL(kvmppc_save_guest_spe) 498_GLOBAL(kvmppc_save_guest_spe)
468 cmpi 0,r3,0 499 cmpi 0,r3,0
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index b479ed77c515..6dd4de7802bf 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -491,6 +491,9 @@ static int __init kvmppc_e500_init(void)
491{ 491{
492 int r, i; 492 int r, i;
493 unsigned long ivor[3]; 493 unsigned long ivor[3];
494 /* Process remaining handlers above the generic first 16 */
495 unsigned long *handler = &kvmppc_booke_handler_addr[16];
496 unsigned long handler_len;
494 unsigned long max_ivor = 0; 497 unsigned long max_ivor = 0;
495 498
496 r = kvmppc_core_check_processor_compat(); 499 r = kvmppc_core_check_processor_compat();
@@ -506,15 +509,16 @@ static int __init kvmppc_e500_init(void)
506 ivor[1] = mfspr(SPRN_IVOR33); 509 ivor[1] = mfspr(SPRN_IVOR33);
507 ivor[2] = mfspr(SPRN_IVOR34); 510 ivor[2] = mfspr(SPRN_IVOR34);
508 for (i = 0; i < 3; i++) { 511 for (i = 0; i < 3; i++) {
509 if (ivor[i] > max_ivor) 512 if (ivor[i] > ivor[max_ivor])
510 max_ivor = ivor[i]; 513 max_ivor = i;
511 514
515 handler_len = handler[i + 1] - handler[i];
512 memcpy((void *)kvmppc_booke_handlers + ivor[i], 516 memcpy((void *)kvmppc_booke_handlers + ivor[i],
513 kvmppc_handlers_start + (i + 16) * kvmppc_handler_len, 517 (void *)handler[i], handler_len);
514 kvmppc_handler_len);
515 } 518 }
516 flush_icache_range(kvmppc_booke_handlers, 519 handler_len = handler[max_ivor + 1] - handler[max_ivor];
517 kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); 520 flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers +
521 ivor[max_ivor] + handler_len);
518 522
519 return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); 523 return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
520} 524}
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index c70d37ed770a..41cefd43655f 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -28,6 +28,7 @@
28 28
29#define E500_TLB_VALID 1 29#define E500_TLB_VALID 1
30#define E500_TLB_BITMAP 2 30#define E500_TLB_BITMAP 2
31#define E500_TLB_TLB0 (1 << 2)
31 32
32struct tlbe_ref { 33struct tlbe_ref {
33 pfn_t pfn; 34 pfn_t pfn;
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_mmu.c
index cf3f18012371..5c4475983f78 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -1,10 +1,11 @@
1/* 1/*
2 * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. 2 * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
3 * 3 *
4 * Author: Yu Liu, yu.liu@freescale.com 4 * Author: Yu Liu, yu.liu@freescale.com
5 * Scott Wood, scottwood@freescale.com 5 * Scott Wood, scottwood@freescale.com
6 * Ashish Kalra, ashish.kalra@freescale.com 6 * Ashish Kalra, ashish.kalra@freescale.com
7 * Varun Sethi, varun.sethi@freescale.com 7 * Varun Sethi, varun.sethi@freescale.com
8 * Alexander Graf, agraf@suse.de
8 * 9 *
9 * Description: 10 * Description:
10 * This file is based on arch/powerpc/kvm/44x_tlb.c, 11 * This file is based on arch/powerpc/kvm/44x_tlb.c,
@@ -33,10 +34,7 @@
33#include "e500.h" 34#include "e500.h"
34#include "trace.h" 35#include "trace.h"
35#include "timing.h" 36#include "timing.h"
36 37#include "e500_mmu_host.h"
37#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1)
38
39static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
40 38
41static inline unsigned int gtlb0_get_next_victim( 39static inline unsigned int gtlb0_get_next_victim(
42 struct kvmppc_vcpu_e500 *vcpu_e500) 40 struct kvmppc_vcpu_e500 *vcpu_e500)
@@ -50,174 +48,6 @@ static inline unsigned int gtlb0_get_next_victim(
50 return victim; 48 return victim;
51} 49}
52 50
53static inline unsigned int tlb1_max_shadow_size(void)
54{
55 /* reserve one entry for magic page */
56 return host_tlb_params[1].entries - tlbcam_index - 1;
57}
58
59static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
60{
61 return tlbe->mas7_3 & (MAS3_SW|MAS3_UW);
62}
63
64static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
65{
66 /* Mask off reserved bits. */
67 mas3 &= MAS3_ATTRIB_MASK;
68
69#ifndef CONFIG_KVM_BOOKE_HV
70 if (!usermode) {
71 /* Guest is in supervisor mode,
72 * so we need to translate guest
73 * supervisor permissions into user permissions. */
74 mas3 &= ~E500_TLB_USER_PERM_MASK;
75 mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;
76 }
77 mas3 |= E500_TLB_SUPER_PERM_MASK;
78#endif
79 return mas3;
80}
81
82static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
83{
84#ifdef CONFIG_SMP
85 return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M;
86#else
87 return mas2 & MAS2_ATTRIB_MASK;
88#endif
89}
90
91/*
92 * writing shadow tlb entry to host TLB
93 */
94static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
95 uint32_t mas0)
96{
97 unsigned long flags;
98
99 local_irq_save(flags);
100 mtspr(SPRN_MAS0, mas0);
101 mtspr(SPRN_MAS1, stlbe->mas1);
102 mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2);
103 mtspr(SPRN_MAS3, (u32)stlbe->mas7_3);
104 mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32));
105#ifdef CONFIG_KVM_BOOKE_HV
106 mtspr(SPRN_MAS8, stlbe->mas8);
107#endif
108 asm volatile("isync; tlbwe" : : : "memory");
109
110#ifdef CONFIG_KVM_BOOKE_HV
111 /* Must clear mas8 for other host tlbwe's */
112 mtspr(SPRN_MAS8, 0);
113 isync();
114#endif
115 local_irq_restore(flags);
116
117 trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1,
118 stlbe->mas2, stlbe->mas7_3);
119}
120
121/*
122 * Acquire a mas0 with victim hint, as if we just took a TLB miss.
123 *
124 * We don't care about the address we're searching for, other than that it's
125 * in the right set and is not present in the TLB. Using a zero PID and a
126 * userspace address means we don't have to set and then restore MAS5, or
127 * calculate a proper MAS6 value.
128 */
129static u32 get_host_mas0(unsigned long eaddr)
130{
131 unsigned long flags;
132 u32 mas0;
133
134 local_irq_save(flags);
135 mtspr(SPRN_MAS6, 0);
136 asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET));
137 mas0 = mfspr(SPRN_MAS0);
138 local_irq_restore(flags);
139
140 return mas0;
141}
142
143/* sesel is for tlb1 only */
144static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
145 int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe)
146{
147 u32 mas0;
148
149 if (tlbsel == 0) {
150 mas0 = get_host_mas0(stlbe->mas2);
151 __write_host_tlbe(stlbe, mas0);
152 } else {
153 __write_host_tlbe(stlbe,
154 MAS0_TLBSEL(1) |
155 MAS0_ESEL(to_htlb1_esel(sesel)));
156 }
157}
158
159#ifdef CONFIG_KVM_E500V2
160void kvmppc_map_magic(struct kvm_vcpu *vcpu)
161{
162 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
163 struct kvm_book3e_206_tlb_entry magic;
164 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
165 unsigned int stid;
166 pfn_t pfn;
167
168 pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
169 get_page(pfn_to_page(pfn));
170
171 preempt_disable();
172 stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0);
173
174 magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
175 MAS1_TSIZE(BOOK3E_PAGESZ_4K);
176 magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
177 magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) |
178 MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
179 magic.mas8 = 0;
180
181 __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
182 preempt_enable();
183}
184#endif
185
186static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500,
187 int tlbsel, int esel)
188{
189 struct kvm_book3e_206_tlb_entry *gtlbe =
190 get_entry(vcpu_e500, tlbsel, esel);
191
192 if (tlbsel == 1 &&
193 vcpu_e500->gtlb_priv[1][esel].ref.flags & E500_TLB_BITMAP) {
194 u64 tmp = vcpu_e500->g2h_tlb1_map[esel];
195 int hw_tlb_indx;
196 unsigned long flags;
197
198 local_irq_save(flags);
199 while (tmp) {
200 hw_tlb_indx = __ilog2_u64(tmp & -tmp);
201 mtspr(SPRN_MAS0,
202 MAS0_TLBSEL(1) |
203 MAS0_ESEL(to_htlb1_esel(hw_tlb_indx)));
204 mtspr(SPRN_MAS1, 0);
205 asm volatile("tlbwe");
206 vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0;
207 tmp &= tmp - 1;
208 }
209 mb();
210 vcpu_e500->g2h_tlb1_map[esel] = 0;
211 vcpu_e500->gtlb_priv[1][esel].ref.flags &= ~E500_TLB_BITMAP;
212 local_irq_restore(flags);
213
214 return;
215 }
216
217 /* Guest tlbe is backed by at most one host tlbe per shadow pid. */
218 kvmppc_e500_tlbil_one(vcpu_e500, gtlbe);
219}
220
221static int tlb0_set_base(gva_t addr, int sets, int ways) 51static int tlb0_set_base(gva_t addr, int sets, int ways)
222{ 52{
223 int set_base; 53 int set_base;
@@ -296,70 +126,6 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
296 return -1; 126 return -1;
297} 127}
298 128
299static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
300 struct kvm_book3e_206_tlb_entry *gtlbe,
301 pfn_t pfn)
302{
303 ref->pfn = pfn;
304 ref->flags = E500_TLB_VALID;
305
306 if (tlbe_is_writable(gtlbe))
307 kvm_set_pfn_dirty(pfn);
308}
309
310static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
311{
312 if (ref->flags & E500_TLB_VALID) {
313 trace_kvm_booke206_ref_release(ref->pfn, ref->flags);
314 ref->flags = 0;
315 }
316}
317
318static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500)
319{
320 if (vcpu_e500->g2h_tlb1_map)
321 memset(vcpu_e500->g2h_tlb1_map, 0,
322 sizeof(u64) * vcpu_e500->gtlb_params[1].entries);
323 if (vcpu_e500->h2g_tlb1_rmap)
324 memset(vcpu_e500->h2g_tlb1_rmap, 0,
325 sizeof(unsigned int) * host_tlb_params[1].entries);
326}
327
328static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
329{
330 int tlbsel = 0;
331 int i;
332
333 for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) {
334 struct tlbe_ref *ref =
335 &vcpu_e500->gtlb_priv[tlbsel][i].ref;
336 kvmppc_e500_ref_release(ref);
337 }
338}
339
340static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
341{
342 int stlbsel = 1;
343 int i;
344
345 kvmppc_e500_tlbil_all(vcpu_e500);
346
347 for (i = 0; i < host_tlb_params[stlbsel].entries; i++) {
348 struct tlbe_ref *ref =
349 &vcpu_e500->tlb_refs[stlbsel][i];
350 kvmppc_e500_ref_release(ref);
351 }
352
353 clear_tlb_privs(vcpu_e500);
354}
355
356void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu)
357{
358 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
359 clear_tlb_refs(vcpu_e500);
360 clear_tlb1_bitmap(vcpu_e500);
361}
362
363static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, 129static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
364 unsigned int eaddr, int as) 130 unsigned int eaddr, int as)
365{ 131{
@@ -385,216 +151,6 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
385 | (as ? MAS6_SAS : 0); 151 | (as ? MAS6_SAS : 0);
386} 152}
387 153
388/* TID must be supplied by the caller */
389static inline void kvmppc_e500_setup_stlbe(
390 struct kvm_vcpu *vcpu,
391 struct kvm_book3e_206_tlb_entry *gtlbe,
392 int tsize, struct tlbe_ref *ref, u64 gvaddr,
393 struct kvm_book3e_206_tlb_entry *stlbe)
394{
395 pfn_t pfn = ref->pfn;
396 u32 pr = vcpu->arch.shared->msr & MSR_PR;
397
398 BUG_ON(!(ref->flags & E500_TLB_VALID));
399
400 /* Force IPROT=0 for all guest mappings. */
401 stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
402 stlbe->mas2 = (gvaddr & MAS2_EPN) |
403 e500_shadow_mas2_attrib(gtlbe->mas2, pr);
404 stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
405 e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
406
407#ifdef CONFIG_KVM_BOOKE_HV
408 stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid;
409#endif
410}
411
412static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
413 u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
414 int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe,
415 struct tlbe_ref *ref)
416{
417 struct kvm_memory_slot *slot;
418 unsigned long pfn = 0; /* silence GCC warning */
419 unsigned long hva;
420 int pfnmap = 0;
421 int tsize = BOOK3E_PAGESZ_4K;
422
423 /*
424 * Translate guest physical to true physical, acquiring
425 * a page reference if it is normal, non-reserved memory.
426 *
427 * gfn_to_memslot() must succeed because otherwise we wouldn't
428 * have gotten this far. Eventually we should just pass the slot
429 * pointer through from the first lookup.
430 */
431 slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
432 hva = gfn_to_hva_memslot(slot, gfn);
433
434 if (tlbsel == 1) {
435 struct vm_area_struct *vma;
436 down_read(&current->mm->mmap_sem);
437
438 vma = find_vma(current->mm, hva);
439 if (vma && hva >= vma->vm_start &&
440 (vma->vm_flags & VM_PFNMAP)) {
441 /*
442 * This VMA is a physically contiguous region (e.g.
443 * /dev/mem) that bypasses normal Linux page
444 * management. Find the overlap between the
445 * vma and the memslot.
446 */
447
448 unsigned long start, end;
449 unsigned long slot_start, slot_end;
450
451 pfnmap = 1;
452
453 start = vma->vm_pgoff;
454 end = start +
455 ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
456
457 pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
458
459 slot_start = pfn - (gfn - slot->base_gfn);
460 slot_end = slot_start + slot->npages;
461
462 if (start < slot_start)
463 start = slot_start;
464 if (end > slot_end)
465 end = slot_end;
466
467 tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
468 MAS1_TSIZE_SHIFT;
469
470 /*
471 * e500 doesn't implement the lowest tsize bit,
472 * or 1K pages.
473 */
474 tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
475
476 /*
477 * Now find the largest tsize (up to what the guest
478 * requested) that will cover gfn, stay within the
479 * range, and for which gfn and pfn are mutually
480 * aligned.
481 */
482
483 for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
484 unsigned long gfn_start, gfn_end, tsize_pages;
485 tsize_pages = 1 << (tsize - 2);
486
487 gfn_start = gfn & ~(tsize_pages - 1);
488 gfn_end = gfn_start + tsize_pages;
489
490 if (gfn_start + pfn - gfn < start)
491 continue;
492 if (gfn_end + pfn - gfn > end)
493 continue;
494 if ((gfn & (tsize_pages - 1)) !=
495 (pfn & (tsize_pages - 1)))
496 continue;
497
498 gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
499 pfn &= ~(tsize_pages - 1);
500 break;
501 }
502 } else if (vma && hva >= vma->vm_start &&
503 (vma->vm_flags & VM_HUGETLB)) {
504 unsigned long psize = vma_kernel_pagesize(vma);
505
506 tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
507 MAS1_TSIZE_SHIFT;
508
509 /*
510 * Take the largest page size that satisfies both host
511 * and guest mapping
512 */
513 tsize = min(__ilog2(psize) - 10, tsize);
514
515 /*
516 * e500 doesn't implement the lowest tsize bit,
517 * or 1K pages.
518 */
519 tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
520 }
521
522 up_read(&current->mm->mmap_sem);
523 }
524
525 if (likely(!pfnmap)) {
526 unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
527 pfn = gfn_to_pfn_memslot(slot, gfn);
528 if (is_error_noslot_pfn(pfn)) {
529 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
530 (long)gfn);
531 return;
532 }
533
534 /* Align guest and physical address to page map boundaries */
535 pfn &= ~(tsize_pages - 1);
536 gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
537 }
538
539 /* Drop old ref and setup new one. */
540 kvmppc_e500_ref_release(ref);
541 kvmppc_e500_ref_setup(ref, gtlbe, pfn);
542
543 kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
544 ref, gvaddr, stlbe);
545
546 /* Clear i-cache for new pages */
547 kvmppc_mmu_flush_icache(pfn);
548
549 /* Drop refcount on page, so that mmu notifiers can clear it */
550 kvm_release_pfn_clean(pfn);
551}
552
553/* XXX only map the one-one case, for now use TLB0 */
554static void kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500,
555 int esel,
556 struct kvm_book3e_206_tlb_entry *stlbe)
557{
558 struct kvm_book3e_206_tlb_entry *gtlbe;
559 struct tlbe_ref *ref;
560
561 gtlbe = get_entry(vcpu_e500, 0, esel);
562 ref = &vcpu_e500->gtlb_priv[0][esel].ref;
563
564 kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
565 get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
566 gtlbe, 0, stlbe, ref);
567}
568
569/* Caller must ensure that the specified guest TLB entry is safe to insert into
570 * the shadow TLB. */
571/* XXX for both one-one and one-to-many , for now use TLB1 */
572static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
573 u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
574 struct kvm_book3e_206_tlb_entry *stlbe, int esel)
575{
576 struct tlbe_ref *ref;
577 unsigned int victim;
578
579 victim = vcpu_e500->host_tlb1_nv++;
580
581 if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size()))
582 vcpu_e500->host_tlb1_nv = 0;
583
584 ref = &vcpu_e500->tlb_refs[1][victim];
585 kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, ref);
586
587 vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << victim;
588 vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP;
589 if (vcpu_e500->h2g_tlb1_rmap[victim]) {
590 unsigned int idx = vcpu_e500->h2g_tlb1_rmap[victim];
591 vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << victim);
592 }
593 vcpu_e500->h2g_tlb1_rmap[victim] = esel;
594
595 return victim;
596}
597
598static void kvmppc_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500) 154static void kvmppc_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500)
599{ 155{
600 int size = vcpu_e500->gtlb_params[1].entries; 156 int size = vcpu_e500->gtlb_params[1].entries;
@@ -683,8 +239,8 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
683 for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++) 239 for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++)
684 kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); 240 kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
685 241
686 /* Invalidate all vcpu id mappings */ 242 /* Invalidate all host shadow mappings */
687 kvmppc_e500_tlbil_all(vcpu_e500); 243 kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
688 244
689 return EMULATE_DONE; 245 return EMULATE_DONE;
690} 246}
@@ -713,8 +269,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea)
713 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); 269 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
714 } 270 }
715 271
716 /* Invalidate all vcpu id mappings */ 272 /* Invalidate all host shadow mappings */
717 kvmppc_e500_tlbil_all(vcpu_e500); 273 kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
718 274
719 return EMULATE_DONE; 275 return EMULATE_DONE;
720} 276}
@@ -834,27 +390,11 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea)
834 return EMULATE_DONE; 390 return EMULATE_DONE;
835} 391}
836 392
837/* sesel is for tlb1 only */
838static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
839 struct kvm_book3e_206_tlb_entry *gtlbe,
840 struct kvm_book3e_206_tlb_entry *stlbe,
841 int stlbsel, int sesel)
842{
843 int stid;
844
845 preempt_disable();
846 stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe);
847
848 stlbe->mas1 |= MAS1_TID(stid);
849 write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe);
850 preempt_enable();
851}
852
853int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) 393int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
854{ 394{
855 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 395 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
856 struct kvm_book3e_206_tlb_entry *gtlbe, stlbe; 396 struct kvm_book3e_206_tlb_entry *gtlbe;
857 int tlbsel, esel, stlbsel, sesel; 397 int tlbsel, esel;
858 int recal = 0; 398 int recal = 0;
859 399
860 tlbsel = get_tlb_tlbsel(vcpu); 400 tlbsel = get_tlb_tlbsel(vcpu);
@@ -892,40 +432,16 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
892 432
893 /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ 433 /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
894 if (tlbe_is_host_safe(vcpu, gtlbe)) { 434 if (tlbe_is_host_safe(vcpu, gtlbe)) {
895 u64 eaddr; 435 u64 eaddr = get_tlb_eaddr(gtlbe);
896 u64 raddr; 436 u64 raddr = get_tlb_raddr(gtlbe);
897 437
898 switch (tlbsel) { 438 if (tlbsel == 0) {
899 case 0:
900 /* TLB0 */
901 gtlbe->mas1 &= ~MAS1_TSIZE(~0); 439 gtlbe->mas1 &= ~MAS1_TSIZE(~0);
902 gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); 440 gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
903
904 stlbsel = 0;
905 kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
906 sesel = 0; /* unused */
907
908 break;
909
910 case 1:
911 /* TLB1 */
912 eaddr = get_tlb_eaddr(gtlbe);
913 raddr = get_tlb_raddr(gtlbe);
914
915 /* Create a 4KB mapping on the host.
916 * If the guest wanted a large page,
917 * only the first 4KB is mapped here and the rest
918 * are mapped on the fly. */
919 stlbsel = 1;
920 sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr,
921 raddr >> PAGE_SHIFT, gtlbe, &stlbe, esel);
922 break;
923
924 default:
925 BUG();
926 } 441 }
927 442
928 write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); 443 /* Premap the faulting page */
444 kvmppc_mmu_map(vcpu, eaddr, raddr, index_of(tlbsel, esel));
929 } 445 }
930 446
931 kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); 447 kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
@@ -1019,100 +535,14 @@ void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
1019{ 535{
1020} 536}
1021 537
1022void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
1023 unsigned int index)
1024{
1025 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
1026 struct tlbe_priv *priv;
1027 struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
1028 int tlbsel = tlbsel_of(index);
1029 int esel = esel_of(index);
1030 int stlbsel, sesel;
1031
1032 gtlbe = get_entry(vcpu_e500, tlbsel, esel);
1033
1034 switch (tlbsel) {
1035 case 0:
1036 stlbsel = 0;
1037 sesel = 0; /* unused */
1038 priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
1039
1040 /* Only triggers after clear_tlb_refs */
1041 if (unlikely(!(priv->ref.flags & E500_TLB_VALID)))
1042 kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
1043 else
1044 kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
1045 &priv->ref, eaddr, &stlbe);
1046 break;
1047
1048 case 1: {
1049 gfn_t gfn = gpaddr >> PAGE_SHIFT;
1050
1051 stlbsel = 1;
1052 sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn,
1053 gtlbe, &stlbe, esel);
1054 break;
1055 }
1056
1057 default:
1058 BUG();
1059 break;
1060 }
1061
1062 write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
1063}
1064
1065/************* MMU Notifiers *************/
1066
1067int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1068{
1069 trace_kvm_unmap_hva(hva);
1070
1071 /*
1072 * Flush all shadow tlb entries everywhere. This is slow, but
1073 * we are 100% sure that we catch the to-be-unmapped page
1074 */
1075 kvm_flush_remote_tlbs(kvm);
1076
1077 return 0;
1078}
1079
1080int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1081{
1082 /* kvm_unmap_hva flushes everything anyway */
1083 kvm_unmap_hva(kvm, start);
1084
1085 return 0;
1086}
1087
1088int kvm_age_hva(struct kvm *kvm, unsigned long hva)
1089{
1090 /* XXX could be more clever ;) */
1091 return 0;
1092}
1093
1094int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1095{
1096 /* XXX could be more clever ;) */
1097 return 0;
1098}
1099
1100void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1101{
1102 /* The page will get remapped properly on its next fault */
1103 kvm_unmap_hva(kvm, hva);
1104}
1105
1106/*****************************************/ 538/*****************************************/
1107 539
1108static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) 540static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
1109{ 541{
1110 int i; 542 int i;
1111 543
1112 clear_tlb1_bitmap(vcpu_e500); 544 kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
1113 kfree(vcpu_e500->g2h_tlb1_map); 545 kfree(vcpu_e500->g2h_tlb1_map);
1114
1115 clear_tlb_refs(vcpu_e500);
1116 kfree(vcpu_e500->gtlb_priv[0]); 546 kfree(vcpu_e500->gtlb_priv[0]);
1117 kfree(vcpu_e500->gtlb_priv[1]); 547 kfree(vcpu_e500->gtlb_priv[1]);
1118 548
@@ -1303,7 +733,7 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
1303{ 733{
1304 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 734 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
1305 kvmppc_recalc_tlb1map_range(vcpu_e500); 735 kvmppc_recalc_tlb1map_range(vcpu_e500);
1306 clear_tlb_refs(vcpu_e500); 736 kvmppc_core_flush_tlb(vcpu);
1307 return 0; 737 return 0;
1308} 738}
1309 739
@@ -1313,37 +743,8 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
1313 int entry_size = sizeof(struct kvm_book3e_206_tlb_entry); 743 int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
1314 int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE; 744 int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
1315 745
1316 host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY; 746 if (e500_mmu_host_init(vcpu_e500))
1317 host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; 747 goto err;
1318
1319 /*
1320 * This should never happen on real e500 hardware, but is
1321 * architecturally possible -- e.g. in some weird nested
1322 * virtualization case.
1323 */
1324 if (host_tlb_params[0].entries == 0 ||
1325 host_tlb_params[1].entries == 0) {
1326 pr_err("%s: need to know host tlb size\n", __func__);
1327 return -ENODEV;
1328 }
1329
1330 host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >>
1331 TLBnCFG_ASSOC_SHIFT;
1332 host_tlb_params[1].ways = host_tlb_params[1].entries;
1333
1334 if (!is_power_of_2(host_tlb_params[0].entries) ||
1335 !is_power_of_2(host_tlb_params[0].ways) ||
1336 host_tlb_params[0].entries < host_tlb_params[0].ways ||
1337 host_tlb_params[0].ways == 0) {
1338 pr_err("%s: bad tlb0 host config: %u entries %u ways\n",
1339 __func__, host_tlb_params[0].entries,
1340 host_tlb_params[0].ways);
1341 return -ENODEV;
1342 }
1343
1344 host_tlb_params[0].sets =
1345 host_tlb_params[0].entries / host_tlb_params[0].ways;
1346 host_tlb_params[1].sets = 1;
1347 748
1348 vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE; 749 vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE;
1349 vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE; 750 vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE;
@@ -1362,18 +763,6 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
1362 vcpu_e500->gtlb_offset[0] = 0; 763 vcpu_e500->gtlb_offset[0] = 0;
1363 vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE; 764 vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE;
1364 765
1365 vcpu_e500->tlb_refs[0] =
1366 kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries,
1367 GFP_KERNEL);
1368 if (!vcpu_e500->tlb_refs[0])
1369 goto err;
1370
1371 vcpu_e500->tlb_refs[1] =
1372 kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[1].entries,
1373 GFP_KERNEL);
1374 if (!vcpu_e500->tlb_refs[1])
1375 goto err;
1376
1377 vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) * 766 vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) *
1378 vcpu_e500->gtlb_params[0].entries, 767 vcpu_e500->gtlb_params[0].entries,
1379 GFP_KERNEL); 768 GFP_KERNEL);
@@ -1392,12 +781,6 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
1392 if (!vcpu_e500->g2h_tlb1_map) 781 if (!vcpu_e500->g2h_tlb1_map)
1393 goto err; 782 goto err;
1394 783
1395 vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) *
1396 host_tlb_params[1].entries,
1397 GFP_KERNEL);
1398 if (!vcpu_e500->h2g_tlb1_rmap)
1399 goto err;
1400
1401 /* Init TLB configuration register */ 784 /* Init TLB configuration register */
1402 vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) & 785 vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
1403 ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); 786 ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
@@ -1416,15 +799,11 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
1416 799
1417err: 800err:
1418 free_gtlb(vcpu_e500); 801 free_gtlb(vcpu_e500);
1419 kfree(vcpu_e500->tlb_refs[0]);
1420 kfree(vcpu_e500->tlb_refs[1]);
1421 return -1; 802 return -1;
1422} 803}
1423 804
1424void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) 805void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
1425{ 806{
1426 free_gtlb(vcpu_e500); 807 free_gtlb(vcpu_e500);
1427 kfree(vcpu_e500->h2g_tlb1_rmap); 808 e500_mmu_host_uninit(vcpu_e500);
1428 kfree(vcpu_e500->tlb_refs[0]);
1429 kfree(vcpu_e500->tlb_refs[1]);
1430} 809}
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
new file mode 100644
index 000000000000..a222edfb9a9b
--- /dev/null
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -0,0 +1,699 @@
1/*
2 * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
3 *
4 * Author: Yu Liu, yu.liu@freescale.com
5 * Scott Wood, scottwood@freescale.com
6 * Ashish Kalra, ashish.kalra@freescale.com
7 * Varun Sethi, varun.sethi@freescale.com
8 * Alexander Graf, agraf@suse.de
9 *
10 * Description:
11 * This file is based on arch/powerpc/kvm/44x_tlb.c,
12 * by Hollis Blanchard <hollisb@us.ibm.com>.
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License, version 2, as
16 * published by the Free Software Foundation.
17 */
18
19#include <linux/kernel.h>
20#include <linux/types.h>
21#include <linux/slab.h>
22#include <linux/string.h>
23#include <linux/kvm.h>
24#include <linux/kvm_host.h>
25#include <linux/highmem.h>
26#include <linux/log2.h>
27#include <linux/uaccess.h>
28#include <linux/sched.h>
29#include <linux/rwsem.h>
30#include <linux/vmalloc.h>
31#include <linux/hugetlb.h>
32#include <asm/kvm_ppc.h>
33
34#include "e500.h"
35#include "trace.h"
36#include "timing.h"
37#include "e500_mmu_host.h"
38
39#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1)
40
41static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
42
43static inline unsigned int tlb1_max_shadow_size(void)
44{
45 /* reserve one entry for magic page */
46 return host_tlb_params[1].entries - tlbcam_index - 1;
47}
48
49static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
50{
51 /* Mask off reserved bits. */
52 mas3 &= MAS3_ATTRIB_MASK;
53
54#ifndef CONFIG_KVM_BOOKE_HV
55 if (!usermode) {
56 /* Guest is in supervisor mode,
57 * so we need to translate guest
58 * supervisor permissions into user permissions. */
59 mas3 &= ~E500_TLB_USER_PERM_MASK;
60 mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;
61 }
62 mas3 |= E500_TLB_SUPER_PERM_MASK;
63#endif
64 return mas3;
65}
66
67static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
68{
69#ifdef CONFIG_SMP
70 return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M;
71#else
72 return mas2 & MAS2_ATTRIB_MASK;
73#endif
74}
75
76/*
77 * writing shadow tlb entry to host TLB
78 */
79static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
80 uint32_t mas0)
81{
82 unsigned long flags;
83
84 local_irq_save(flags);
85 mtspr(SPRN_MAS0, mas0);
86 mtspr(SPRN_MAS1, stlbe->mas1);
87 mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2);
88 mtspr(SPRN_MAS3, (u32)stlbe->mas7_3);
89 mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32));
90#ifdef CONFIG_KVM_BOOKE_HV
91 mtspr(SPRN_MAS8, stlbe->mas8);
92#endif
93 asm volatile("isync; tlbwe" : : : "memory");
94
95#ifdef CONFIG_KVM_BOOKE_HV
96 /* Must clear mas8 for other host tlbwe's */
97 mtspr(SPRN_MAS8, 0);
98 isync();
99#endif
100 local_irq_restore(flags);
101
102 trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1,
103 stlbe->mas2, stlbe->mas7_3);
104}
105
106/*
107 * Acquire a mas0 with victim hint, as if we just took a TLB miss.
108 *
109 * We don't care about the address we're searching for, other than that it's
110 * in the right set and is not present in the TLB. Using a zero PID and a
111 * userspace address means we don't have to set and then restore MAS5, or
112 * calculate a proper MAS6 value.
113 */
114static u32 get_host_mas0(unsigned long eaddr)
115{
116 unsigned long flags;
117 u32 mas0;
118
119 local_irq_save(flags);
120 mtspr(SPRN_MAS6, 0);
121 asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET));
122 mas0 = mfspr(SPRN_MAS0);
123 local_irq_restore(flags);
124
125 return mas0;
126}
127
128/* sesel is for tlb1 only */
129static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
130 int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe)
131{
132 u32 mas0;
133
134 if (tlbsel == 0) {
135 mas0 = get_host_mas0(stlbe->mas2);
136 __write_host_tlbe(stlbe, mas0);
137 } else {
138 __write_host_tlbe(stlbe,
139 MAS0_TLBSEL(1) |
140 MAS0_ESEL(to_htlb1_esel(sesel)));
141 }
142}
143
144/* sesel is for tlb1 only */
145static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
146 struct kvm_book3e_206_tlb_entry *gtlbe,
147 struct kvm_book3e_206_tlb_entry *stlbe,
148 int stlbsel, int sesel)
149{
150 int stid;
151
152 preempt_disable();
153 stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe);
154
155 stlbe->mas1 |= MAS1_TID(stid);
156 write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe);
157 preempt_enable();
158}
159
160#ifdef CONFIG_KVM_E500V2
161/* XXX should be a hook in the gva2hpa translation */
162void kvmppc_map_magic(struct kvm_vcpu *vcpu)
163{
164 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
165 struct kvm_book3e_206_tlb_entry magic;
166 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
167 unsigned int stid;
168 pfn_t pfn;
169
170 pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
171 get_page(pfn_to_page(pfn));
172
173 preempt_disable();
174 stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0);
175
176 magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
177 MAS1_TSIZE(BOOK3E_PAGESZ_4K);
178 magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
179 magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) |
180 MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
181 magic.mas8 = 0;
182
183 __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
184 preempt_enable();
185}
186#endif
187
188void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
189 int esel)
190{
191 struct kvm_book3e_206_tlb_entry *gtlbe =
192 get_entry(vcpu_e500, tlbsel, esel);
193 struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[tlbsel][esel].ref;
194
195 /* Don't bother with unmapped entries */
196 if (!(ref->flags & E500_TLB_VALID))
197 return;
198
199 if (tlbsel == 1 && ref->flags & E500_TLB_BITMAP) {
200 u64 tmp = vcpu_e500->g2h_tlb1_map[esel];
201 int hw_tlb_indx;
202 unsigned long flags;
203
204 local_irq_save(flags);
205 while (tmp) {
206 hw_tlb_indx = __ilog2_u64(tmp & -tmp);
207 mtspr(SPRN_MAS0,
208 MAS0_TLBSEL(1) |
209 MAS0_ESEL(to_htlb1_esel(hw_tlb_indx)));
210 mtspr(SPRN_MAS1, 0);
211 asm volatile("tlbwe");
212 vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0;
213 tmp &= tmp - 1;
214 }
215 mb();
216 vcpu_e500->g2h_tlb1_map[esel] = 0;
217 ref->flags &= ~(E500_TLB_BITMAP | E500_TLB_VALID);
218 local_irq_restore(flags);
219 }
220
221 if (tlbsel == 1 && ref->flags & E500_TLB_TLB0) {
222 /*
223 * TLB1 entry is backed by 4k pages. This should happen
224 * rarely and is not worth optimizing. Invalidate everything.
225 */
226 kvmppc_e500_tlbil_all(vcpu_e500);
227 ref->flags &= ~(E500_TLB_TLB0 | E500_TLB_VALID);
228 }
229
230 /* Already invalidated in between */
231 if (!(ref->flags & E500_TLB_VALID))
232 return;
233
234 /* Guest tlbe is backed by at most one host tlbe per shadow pid. */
235 kvmppc_e500_tlbil_one(vcpu_e500, gtlbe);
236
237 /* Mark the TLB as not backed by the host anymore */
238 ref->flags &= ~E500_TLB_VALID;
239}
240
241static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
242{
243 return tlbe->mas7_3 & (MAS3_SW|MAS3_UW);
244}
245
246static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
247 struct kvm_book3e_206_tlb_entry *gtlbe,
248 pfn_t pfn)
249{
250 ref->pfn = pfn;
251 ref->flags = E500_TLB_VALID;
252
253 if (tlbe_is_writable(gtlbe))
254 kvm_set_pfn_dirty(pfn);
255}
256
257static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
258{
259 if (ref->flags & E500_TLB_VALID) {
260 trace_kvm_booke206_ref_release(ref->pfn, ref->flags);
261 ref->flags = 0;
262 }
263}
264
265static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500)
266{
267 if (vcpu_e500->g2h_tlb1_map)
268 memset(vcpu_e500->g2h_tlb1_map, 0,
269 sizeof(u64) * vcpu_e500->gtlb_params[1].entries);
270 if (vcpu_e500->h2g_tlb1_rmap)
271 memset(vcpu_e500->h2g_tlb1_rmap, 0,
272 sizeof(unsigned int) * host_tlb_params[1].entries);
273}
274
275static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
276{
277 int tlbsel = 0;
278 int i;
279
280 for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) {
281 struct tlbe_ref *ref =
282 &vcpu_e500->gtlb_priv[tlbsel][i].ref;
283 kvmppc_e500_ref_release(ref);
284 }
285}
286
287static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
288{
289 int stlbsel = 1;
290 int i;
291
292 kvmppc_e500_tlbil_all(vcpu_e500);
293
294 for (i = 0; i < host_tlb_params[stlbsel].entries; i++) {
295 struct tlbe_ref *ref =
296 &vcpu_e500->tlb_refs[stlbsel][i];
297 kvmppc_e500_ref_release(ref);
298 }
299
300 clear_tlb_privs(vcpu_e500);
301}
302
303void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu)
304{
305 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
306 clear_tlb_refs(vcpu_e500);
307 clear_tlb1_bitmap(vcpu_e500);
308}
309
310/* TID must be supplied by the caller */
311static void kvmppc_e500_setup_stlbe(
312 struct kvm_vcpu *vcpu,
313 struct kvm_book3e_206_tlb_entry *gtlbe,
314 int tsize, struct tlbe_ref *ref, u64 gvaddr,
315 struct kvm_book3e_206_tlb_entry *stlbe)
316{
317 pfn_t pfn = ref->pfn;
318 u32 pr = vcpu->arch.shared->msr & MSR_PR;
319
320 BUG_ON(!(ref->flags & E500_TLB_VALID));
321
322 /* Force IPROT=0 for all guest mappings. */
323 stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
324 stlbe->mas2 = (gvaddr & MAS2_EPN) |
325 e500_shadow_mas2_attrib(gtlbe->mas2, pr);
326 stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
327 e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
328
329#ifdef CONFIG_KVM_BOOKE_HV
330 stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid;
331#endif
332}
333
334static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
335 u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
336 int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe,
337 struct tlbe_ref *ref)
338{
339 struct kvm_memory_slot *slot;
340 unsigned long pfn = 0; /* silence GCC warning */
341 unsigned long hva;
342 int pfnmap = 0;
343 int tsize = BOOK3E_PAGESZ_4K;
344
345 /*
346 * Translate guest physical to true physical, acquiring
347 * a page reference if it is normal, non-reserved memory.
348 *
349 * gfn_to_memslot() must succeed because otherwise we wouldn't
350 * have gotten this far. Eventually we should just pass the slot
351 * pointer through from the first lookup.
352 */
353 slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
354 hva = gfn_to_hva_memslot(slot, gfn);
355
356 if (tlbsel == 1) {
357 struct vm_area_struct *vma;
358 down_read(&current->mm->mmap_sem);
359
360 vma = find_vma(current->mm, hva);
361 if (vma && hva >= vma->vm_start &&
362 (vma->vm_flags & VM_PFNMAP)) {
363 /*
364 * This VMA is a physically contiguous region (e.g.
365 * /dev/mem) that bypasses normal Linux page
366 * management. Find the overlap between the
367 * vma and the memslot.
368 */
369
370 unsigned long start, end;
371 unsigned long slot_start, slot_end;
372
373 pfnmap = 1;
374
375 start = vma->vm_pgoff;
376 end = start +
377 ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
378
379 pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
380
381 slot_start = pfn - (gfn - slot->base_gfn);
382 slot_end = slot_start + slot->npages;
383
384 if (start < slot_start)
385 start = slot_start;
386 if (end > slot_end)
387 end = slot_end;
388
389 tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
390 MAS1_TSIZE_SHIFT;
391
392 /*
393 * e500 doesn't implement the lowest tsize bit,
394 * or 1K pages.
395 */
396 tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
397
398 /*
399 * Now find the largest tsize (up to what the guest
400 * requested) that will cover gfn, stay within the
401 * range, and for which gfn and pfn are mutually
402 * aligned.
403 */
404
405 for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
406 unsigned long gfn_start, gfn_end, tsize_pages;
407 tsize_pages = 1 << (tsize - 2);
408
409 gfn_start = gfn & ~(tsize_pages - 1);
410 gfn_end = gfn_start + tsize_pages;
411
412 if (gfn_start + pfn - gfn < start)
413 continue;
414 if (gfn_end + pfn - gfn > end)
415 continue;
416 if ((gfn & (tsize_pages - 1)) !=
417 (pfn & (tsize_pages - 1)))
418 continue;
419
420 gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
421 pfn &= ~(tsize_pages - 1);
422 break;
423 }
424 } else if (vma && hva >= vma->vm_start &&
425 (vma->vm_flags & VM_HUGETLB)) {
426 unsigned long psize = vma_kernel_pagesize(vma);
427
428 tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
429 MAS1_TSIZE_SHIFT;
430
431 /*
432 * Take the largest page size that satisfies both host
433 * and guest mapping
434 */
435 tsize = min(__ilog2(psize) - 10, tsize);
436
437 /*
438 * e500 doesn't implement the lowest tsize bit,
439 * or 1K pages.
440 */
441 tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
442 }
443
444 up_read(&current->mm->mmap_sem);
445 }
446
447 if (likely(!pfnmap)) {
448 unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
449 pfn = gfn_to_pfn_memslot(slot, gfn);
450 if (is_error_noslot_pfn(pfn)) {
451 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
452 (long)gfn);
453 return -EINVAL;
454 }
455
456 /* Align guest and physical address to page map boundaries */
457 pfn &= ~(tsize_pages - 1);
458 gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
459 }
460
461 /* Drop old ref and setup new one. */
462 kvmppc_e500_ref_release(ref);
463 kvmppc_e500_ref_setup(ref, gtlbe, pfn);
464
465 kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
466 ref, gvaddr, stlbe);
467
468 /* Clear i-cache for new pages */
469 kvmppc_mmu_flush_icache(pfn);
470
471 /* Drop refcount on page, so that mmu notifiers can clear it */
472 kvm_release_pfn_clean(pfn);
473
474 return 0;
475}
476
477/* XXX only map the one-one case, for now use TLB0 */
478static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, int esel,
479 struct kvm_book3e_206_tlb_entry *stlbe)
480{
481 struct kvm_book3e_206_tlb_entry *gtlbe;
482 struct tlbe_ref *ref;
483 int stlbsel = 0;
484 int sesel = 0;
485 int r;
486
487 gtlbe = get_entry(vcpu_e500, 0, esel);
488 ref = &vcpu_e500->gtlb_priv[0][esel].ref;
489
490 r = kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
491 get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
492 gtlbe, 0, stlbe, ref);
493 if (r)
494 return r;
495
496 write_stlbe(vcpu_e500, gtlbe, stlbe, stlbsel, sesel);
497
498 return 0;
499}
500
501static int kvmppc_e500_tlb1_map_tlb1(struct kvmppc_vcpu_e500 *vcpu_e500,
502 struct tlbe_ref *ref,
503 int esel)
504{
505 unsigned int sesel = vcpu_e500->host_tlb1_nv++;
506
507 if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size()))
508 vcpu_e500->host_tlb1_nv = 0;
509
510 vcpu_e500->tlb_refs[1][sesel] = *ref;
511 vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << sesel;
512 vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP;
513 if (vcpu_e500->h2g_tlb1_rmap[sesel]) {
514 unsigned int idx = vcpu_e500->h2g_tlb1_rmap[sesel];
515 vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << sesel);
516 }
517 vcpu_e500->h2g_tlb1_rmap[sesel] = esel;
518
519 return sesel;
520}
521
522/* Caller must ensure that the specified guest TLB entry is safe to insert into
523 * the shadow TLB. */
524/* For both one-one and one-to-many */
525static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
526 u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
527 struct kvm_book3e_206_tlb_entry *stlbe, int esel)
528{
529 struct tlbe_ref ref;
530 int sesel;
531 int r;
532
533 ref.flags = 0;
534 r = kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe,
535 &ref);
536 if (r)
537 return r;
538
539 /* Use TLB0 when we can only map a page with 4k */
540 if (get_tlb_tsize(stlbe) == BOOK3E_PAGESZ_4K) {
541 vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_TLB0;
542 write_stlbe(vcpu_e500, gtlbe, stlbe, 0, 0);
543 return 0;
544 }
545
546 /* Otherwise map into TLB1 */
547 sesel = kvmppc_e500_tlb1_map_tlb1(vcpu_e500, &ref, esel);
548 write_stlbe(vcpu_e500, gtlbe, stlbe, 1, sesel);
549
550 return 0;
551}
552
553void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
554 unsigned int index)
555{
556 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
557 struct tlbe_priv *priv;
558 struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
559 int tlbsel = tlbsel_of(index);
560 int esel = esel_of(index);
561
562 gtlbe = get_entry(vcpu_e500, tlbsel, esel);
563
564 switch (tlbsel) {
565 case 0:
566 priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
567
568 /* Triggers after clear_tlb_refs or on initial mapping */
569 if (!(priv->ref.flags & E500_TLB_VALID)) {
570 kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
571 } else {
572 kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
573 &priv->ref, eaddr, &stlbe);
574 write_stlbe(vcpu_e500, gtlbe, &stlbe, 0, 0);
575 }
576 break;
577
578 case 1: {
579 gfn_t gfn = gpaddr >> PAGE_SHIFT;
580 kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe, &stlbe,
581 esel);
582 break;
583 }
584
585 default:
586 BUG();
587 break;
588 }
589}
590
591/************* MMU Notifiers *************/
592
593int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
594{
595 trace_kvm_unmap_hva(hva);
596
597 /*
598 * Flush all shadow tlb entries everywhere. This is slow, but
599 * we are 100% sure that we catch the to-be-unmapped page
600 */
601 kvm_flush_remote_tlbs(kvm);
602
603 return 0;
604}
605
606int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
607{
608 /* kvm_unmap_hva flushes everything anyway */
609 kvm_unmap_hva(kvm, start);
610
611 return 0;
612}
613
614int kvm_age_hva(struct kvm *kvm, unsigned long hva)
615{
616 /* XXX could be more clever ;) */
617 return 0;
618}
619
620int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
621{
622 /* XXX could be more clever ;) */
623 return 0;
624}
625
626void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
627{
628 /* The page will get remapped properly on its next fault */
629 kvm_unmap_hva(kvm, hva);
630}
631
632/*****************************************/
633
634int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500)
635{
636 host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY;
637 host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
638
639 /*
640 * This should never happen on real e500 hardware, but is
641 * architecturally possible -- e.g. in some weird nested
642 * virtualization case.
643 */
644 if (host_tlb_params[0].entries == 0 ||
645 host_tlb_params[1].entries == 0) {
646 pr_err("%s: need to know host tlb size\n", __func__);
647 return -ENODEV;
648 }
649
650 host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >>
651 TLBnCFG_ASSOC_SHIFT;
652 host_tlb_params[1].ways = host_tlb_params[1].entries;
653
654 if (!is_power_of_2(host_tlb_params[0].entries) ||
655 !is_power_of_2(host_tlb_params[0].ways) ||
656 host_tlb_params[0].entries < host_tlb_params[0].ways ||
657 host_tlb_params[0].ways == 0) {
658 pr_err("%s: bad tlb0 host config: %u entries %u ways\n",
659 __func__, host_tlb_params[0].entries,
660 host_tlb_params[0].ways);
661 return -ENODEV;
662 }
663
664 host_tlb_params[0].sets =
665 host_tlb_params[0].entries / host_tlb_params[0].ways;
666 host_tlb_params[1].sets = 1;
667
668 vcpu_e500->tlb_refs[0] =
669 kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries,
670 GFP_KERNEL);
671 if (!vcpu_e500->tlb_refs[0])
672 goto err;
673
674 vcpu_e500->tlb_refs[1] =
675 kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[1].entries,
676 GFP_KERNEL);
677 if (!vcpu_e500->tlb_refs[1])
678 goto err;
679
680 vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) *
681 host_tlb_params[1].entries,
682 GFP_KERNEL);
683 if (!vcpu_e500->h2g_tlb1_rmap)
684 goto err;
685
686 return 0;
687
688err:
689 kfree(vcpu_e500->tlb_refs[0]);
690 kfree(vcpu_e500->tlb_refs[1]);
691 return -EINVAL;
692}
693
694void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
695{
696 kfree(vcpu_e500->h2g_tlb1_rmap);
697 kfree(vcpu_e500->tlb_refs[0]);
698 kfree(vcpu_e500->tlb_refs[1]);
699}
diff --git a/arch/powerpc/kvm/e500_mmu_host.h b/arch/powerpc/kvm/e500_mmu_host.h
new file mode 100644
index 000000000000..7624835b76c7
--- /dev/null
+++ b/arch/powerpc/kvm/e500_mmu_host.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License, version 2, as
6 * published by the Free Software Foundation.
7 */
8
9#ifndef KVM_E500_MMU_HOST_H
10#define KVM_E500_MMU_HOST_H
11
12void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
13 int esel);
14
15int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500);
16void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
17
18#endif /* KVM_E500_MMU_HOST_H */
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 9d9cddc5b346..7a73b6f72a8b 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -150,8 +150,6 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
150 case SPRN_TBWL: break; 150 case SPRN_TBWL: break;
151 case SPRN_TBWU: break; 151 case SPRN_TBWU: break;
152 152
153 case SPRN_MSSSR0: break;
154
155 case SPRN_DEC: 153 case SPRN_DEC:
156 vcpu->arch.dec = spr_val; 154 vcpu->arch.dec = spr_val;
157 kvmppc_emulate_dec(vcpu); 155 kvmppc_emulate_dec(vcpu);
@@ -202,9 +200,6 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
202 case SPRN_PIR: 200 case SPRN_PIR:
203 spr_val = vcpu->vcpu_id; 201 spr_val = vcpu->vcpu_id;
204 break; 202 break;
205 case SPRN_MSSSR0:
206 spr_val = 0;
207 break;
208 203
209 /* Note: mftb and TBRL/TBWL are user-accessible, so 204 /* Note: mftb and TBRL/TBWL are user-accessible, so
210 * the guest can always access the real TB anyways. 205 * the guest can always access the real TB anyways.
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 70739a089560..934413cd3a1b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -237,7 +237,8 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
237 r = RESUME_HOST; 237 r = RESUME_HOST;
238 break; 238 break;
239 default: 239 default:
240 BUG(); 240 WARN_ON(1);
241 r = RESUME_GUEST;
241 } 242 }
242 243
243 return r; 244 return r;
@@ -305,6 +306,7 @@ int kvm_dev_ioctl_check_extension(long ext)
305#ifdef CONFIG_BOOKE 306#ifdef CONFIG_BOOKE
306 case KVM_CAP_PPC_BOOKE_SREGS: 307 case KVM_CAP_PPC_BOOKE_SREGS:
307 case KVM_CAP_PPC_BOOKE_WATCHDOG: 308 case KVM_CAP_PPC_BOOKE_WATCHDOG:
309 case KVM_CAP_PPC_EPR:
308#else 310#else
309 case KVM_CAP_PPC_SEGSTATE: 311 case KVM_CAP_PPC_SEGSTATE:
310 case KVM_CAP_PPC_HIOR: 312 case KVM_CAP_PPC_HIOR:
@@ -412,7 +414,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
412 struct kvm_memory_slot *memslot, 414 struct kvm_memory_slot *memslot,
413 struct kvm_memory_slot old, 415 struct kvm_memory_slot old,
414 struct kvm_userspace_memory_region *mem, 416 struct kvm_userspace_memory_region *mem,
415 int user_alloc) 417 bool user_alloc)
416{ 418{
417 return kvmppc_core_prepare_memory_region(kvm, memslot, mem); 419 return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
418} 420}
@@ -420,7 +422,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
420void kvm_arch_commit_memory_region(struct kvm *kvm, 422void kvm_arch_commit_memory_region(struct kvm *kvm,
421 struct kvm_userspace_memory_region *mem, 423 struct kvm_userspace_memory_region *mem,
422 struct kvm_memory_slot old, 424 struct kvm_memory_slot old,
423 int user_alloc) 425 bool user_alloc)
424{ 426{
425 kvmppc_core_commit_memory_region(kvm, mem, old); 427 kvmppc_core_commit_memory_region(kvm, mem, old);
426} 428}
@@ -720,6 +722,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
720 for (i = 0; i < 9; ++i) 722 for (i = 0; i < 9; ++i)
721 kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]); 723 kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
722 vcpu->arch.hcall_needed = 0; 724 vcpu->arch.hcall_needed = 0;
725#ifdef CONFIG_BOOKE
726 } else if (vcpu->arch.epr_needed) {
727 kvmppc_set_epr(vcpu, run->epr.epr);
728 vcpu->arch.epr_needed = 0;
729#endif
723 } 730 }
724 731
725 r = kvmppc_vcpu_run(run, vcpu); 732 r = kvmppc_vcpu_run(run, vcpu);
@@ -761,6 +768,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
761 r = 0; 768 r = 0;
762 vcpu->arch.papr_enabled = true; 769 vcpu->arch.papr_enabled = true;
763 break; 770 break;
771 case KVM_CAP_PPC_EPR:
772 r = 0;
773 vcpu->arch.epr_enabled = cap->args[0];
774 break;
764#ifdef CONFIG_BOOKE 775#ifdef CONFIG_BOOKE
765 case KVM_CAP_PPC_BOOKE_WATCHDOG: 776 case KVM_CAP_PPC_BOOKE_WATCHDOG:
766 r = 0; 777 r = 0;
diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h
index 7def77302d63..87c17bfb2968 100644
--- a/arch/s390/include/asm/irq.h
+++ b/arch/s390/include/asm/irq.h
@@ -41,6 +41,7 @@ enum interruption_class {
41 IRQIO_CSC, 41 IRQIO_CSC,
42 IRQIO_PCI, 42 IRQIO_PCI,
43 IRQIO_MSI, 43 IRQIO_MSI,
44 IRQIO_VIR,
44 NMI_NMI, 45 NMI_NMI,
45 CPU_RST, 46 CPU_RST,
46 NR_ARCH_IRQS 47 NR_ARCH_IRQS
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index b7841546991f..16bd5d169cdb 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -20,9 +20,7 @@
20#include <asm/cpu.h> 20#include <asm/cpu.h>
21 21
22#define KVM_MAX_VCPUS 64 22#define KVM_MAX_VCPUS 64
23#define KVM_MEMORY_SLOTS 32 23#define KVM_USER_MEM_SLOTS 32
24/* memory slots that does not exposed to userspace */
25#define KVM_PRIVATE_MEM_SLOTS 4
26 24
27struct sca_entry { 25struct sca_entry {
28 atomic_t scn; 26 atomic_t scn;
@@ -76,8 +74,11 @@ struct kvm_s390_sie_block {
76 __u64 epoch; /* 0x0038 */ 74 __u64 epoch; /* 0x0038 */
77 __u8 reserved40[4]; /* 0x0040 */ 75 __u8 reserved40[4]; /* 0x0040 */
78#define LCTL_CR0 0x8000 76#define LCTL_CR0 0x8000
77#define LCTL_CR6 0x0200
78#define LCTL_CR14 0x0002
79 __u16 lctl; /* 0x0044 */ 79 __u16 lctl; /* 0x0044 */
80 __s16 icpua; /* 0x0046 */ 80 __s16 icpua; /* 0x0046 */
81#define ICTL_LPSW 0x00400000
81 __u32 ictl; /* 0x0048 */ 82 __u32 ictl; /* 0x0048 */
82 __u32 eca; /* 0x004c */ 83 __u32 eca; /* 0x004c */
83 __u8 icptcode; /* 0x0050 */ 84 __u8 icptcode; /* 0x0050 */
@@ -127,6 +128,7 @@ struct kvm_vcpu_stat {
127 u32 deliver_prefix_signal; 128 u32 deliver_prefix_signal;
128 u32 deliver_restart_signal; 129 u32 deliver_restart_signal;
129 u32 deliver_program_int; 130 u32 deliver_program_int;
131 u32 deliver_io_int;
130 u32 exit_wait_state; 132 u32 exit_wait_state;
131 u32 instruction_stidp; 133 u32 instruction_stidp;
132 u32 instruction_spx; 134 u32 instruction_spx;
@@ -187,6 +189,11 @@ struct kvm_s390_emerg_info {
187 __u16 code; 189 __u16 code;
188}; 190};
189 191
192struct kvm_s390_mchk_info {
193 __u64 cr14;
194 __u64 mcic;
195};
196
190struct kvm_s390_interrupt_info { 197struct kvm_s390_interrupt_info {
191 struct list_head list; 198 struct list_head list;
192 u64 type; 199 u64 type;
@@ -197,6 +204,7 @@ struct kvm_s390_interrupt_info {
197 struct kvm_s390_emerg_info emerg; 204 struct kvm_s390_emerg_info emerg;
198 struct kvm_s390_extcall_info extcall; 205 struct kvm_s390_extcall_info extcall;
199 struct kvm_s390_prefix_info prefix; 206 struct kvm_s390_prefix_info prefix;
207 struct kvm_s390_mchk_info mchk;
200 }; 208 };
201}; 209};
202 210
@@ -254,6 +262,7 @@ struct kvm_arch{
254 debug_info_t *dbf; 262 debug_info_t *dbf;
255 struct kvm_s390_float_interrupt float_int; 263 struct kvm_s390_float_interrupt float_int;
256 struct gmap *gmap; 264 struct gmap *gmap;
265 int css_support;
257}; 266};
258 267
259extern int sie64a(struct kvm_s390_sie_block *, u64 *); 268extern int sie64a(struct kvm_s390_sie_block *, u64 *);
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 9df824ea1667..1630f439cd2a 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -81,6 +81,7 @@ static const struct irq_class irqclass_sub_desc[NR_ARCH_IRQS] = {
81 [IRQIO_CSC] = {.name = "CSC", .desc = "[I/O] CHSC Subchannel"}, 81 [IRQIO_CSC] = {.name = "CSC", .desc = "[I/O] CHSC Subchannel"},
82 [IRQIO_PCI] = {.name = "PCI", .desc = "[I/O] PCI Interrupt" }, 82 [IRQIO_PCI] = {.name = "PCI", .desc = "[I/O] PCI Interrupt" },
83 [IRQIO_MSI] = {.name = "MSI", .desc = "[I/O] MSI Interrupt" }, 83 [IRQIO_MSI] = {.name = "MSI", .desc = "[I/O] MSI Interrupt" },
84 [IRQIO_VIR] = {.name = "VIR", .desc = "[I/O] Virtual I/O Devices"},
84 [NMI_NMI] = {.name = "NMI", .desc = "[NMI] Machine Check"}, 85 [NMI_NMI] = {.name = "NMI", .desc = "[NMI] Machine Check"},
85 [CPU_RST] = {.name = "RST", .desc = "[CPU] CPU Restart"}, 86 [CPU_RST] = {.name = "RST", .desc = "[CPU] CPU Restart"},
86}; 87};
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 22798ec33fd1..f26ff1e31bdb 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -26,27 +26,20 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
26{ 26{
27 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; 27 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
28 int reg3 = vcpu->arch.sie_block->ipa & 0x000f; 28 int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
29 int base2 = vcpu->arch.sie_block->ipb >> 28;
30 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
31 ((vcpu->arch.sie_block->ipb & 0xff00) << 4);
32 u64 useraddr; 29 u64 useraddr;
33 int reg, rc; 30 int reg, rc;
34 31
35 vcpu->stat.instruction_lctlg++; 32 vcpu->stat.instruction_lctlg++;
36 if ((vcpu->arch.sie_block->ipb & 0xff) != 0x2f)
37 return -EOPNOTSUPP;
38 33
39 useraddr = disp2; 34 useraddr = kvm_s390_get_base_disp_rsy(vcpu);
40 if (base2)
41 useraddr += vcpu->run->s.regs.gprs[base2];
42 35
43 if (useraddr & 7) 36 if (useraddr & 7)
44 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 37 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
45 38
46 reg = reg1; 39 reg = reg1;
47 40
48 VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2, 41 VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3,
49 disp2); 42 useraddr);
50 trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr); 43 trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
51 44
52 do { 45 do {
@@ -68,23 +61,19 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
68{ 61{
69 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; 62 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
70 int reg3 = vcpu->arch.sie_block->ipa & 0x000f; 63 int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
71 int base2 = vcpu->arch.sie_block->ipb >> 28;
72 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
73 u64 useraddr; 64 u64 useraddr;
74 u32 val = 0; 65 u32 val = 0;
75 int reg, rc; 66 int reg, rc;
76 67
77 vcpu->stat.instruction_lctl++; 68 vcpu->stat.instruction_lctl++;
78 69
79 useraddr = disp2; 70 useraddr = kvm_s390_get_base_disp_rs(vcpu);
80 if (base2)
81 useraddr += vcpu->run->s.regs.gprs[base2];
82 71
83 if (useraddr & 3) 72 if (useraddr & 3)
84 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 73 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
85 74
86 VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2, 75 VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3,
87 disp2); 76 useraddr);
88 trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr); 77 trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
89 78
90 reg = reg1; 79 reg = reg1;
@@ -104,14 +93,31 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
104 return 0; 93 return 0;
105} 94}
106 95
107static intercept_handler_t instruction_handlers[256] = { 96static const intercept_handler_t eb_handlers[256] = {
97 [0x2f] = handle_lctlg,
98 [0x8a] = kvm_s390_handle_priv_eb,
99};
100
101static int handle_eb(struct kvm_vcpu *vcpu)
102{
103 intercept_handler_t handler;
104
105 handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff];
106 if (handler)
107 return handler(vcpu);
108 return -EOPNOTSUPP;
109}
110
111static const intercept_handler_t instruction_handlers[256] = {
108 [0x01] = kvm_s390_handle_01, 112 [0x01] = kvm_s390_handle_01,
113 [0x82] = kvm_s390_handle_lpsw,
109 [0x83] = kvm_s390_handle_diag, 114 [0x83] = kvm_s390_handle_diag,
110 [0xae] = kvm_s390_handle_sigp, 115 [0xae] = kvm_s390_handle_sigp,
111 [0xb2] = kvm_s390_handle_b2, 116 [0xb2] = kvm_s390_handle_b2,
112 [0xb7] = handle_lctl, 117 [0xb7] = handle_lctl,
118 [0xb9] = kvm_s390_handle_b9,
113 [0xe5] = kvm_s390_handle_e5, 119 [0xe5] = kvm_s390_handle_e5,
114 [0xeb] = handle_lctlg, 120 [0xeb] = handle_eb,
115}; 121};
116 122
117static int handle_noop(struct kvm_vcpu *vcpu) 123static int handle_noop(struct kvm_vcpu *vcpu)
@@ -258,6 +264,7 @@ static const intercept_handler_t intercept_funcs[] = {
258 [0x0C >> 2] = handle_instruction_and_prog, 264 [0x0C >> 2] = handle_instruction_and_prog,
259 [0x10 >> 2] = handle_noop, 265 [0x10 >> 2] = handle_noop,
260 [0x14 >> 2] = handle_noop, 266 [0x14 >> 2] = handle_noop,
267 [0x18 >> 2] = handle_noop,
261 [0x1C >> 2] = kvm_s390_handle_wait, 268 [0x1C >> 2] = kvm_s390_handle_wait,
262 [0x20 >> 2] = handle_validity, 269 [0x20 >> 2] = handle_validity,
263 [0x28 >> 2] = handle_stop, 270 [0x28 >> 2] = handle_stop,
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 87418b50f21c..37116a77cb4b 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -21,11 +21,31 @@
21#include "gaccess.h" 21#include "gaccess.h"
22#include "trace-s390.h" 22#include "trace-s390.h"
23 23
24#define IOINT_SCHID_MASK 0x0000ffff
25#define IOINT_SSID_MASK 0x00030000
26#define IOINT_CSSID_MASK 0x03fc0000
27#define IOINT_AI_MASK 0x04000000
28
29static int is_ioint(u64 type)
30{
31 return ((type & 0xfffe0000u) != 0xfffe0000u);
32}
33
24static int psw_extint_disabled(struct kvm_vcpu *vcpu) 34static int psw_extint_disabled(struct kvm_vcpu *vcpu)
25{ 35{
26 return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT); 36 return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT);
27} 37}
28 38
39static int psw_ioint_disabled(struct kvm_vcpu *vcpu)
40{
41 return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO);
42}
43
44static int psw_mchk_disabled(struct kvm_vcpu *vcpu)
45{
46 return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_MCHECK);
47}
48
29static int psw_interrupts_disabled(struct kvm_vcpu *vcpu) 49static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
30{ 50{
31 if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) || 51 if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) ||
@@ -35,6 +55,13 @@ static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
35 return 1; 55 return 1;
36} 56}
37 57
58static u64 int_word_to_isc_bits(u32 int_word)
59{
60 u8 isc = (int_word & 0x38000000) >> 27;
61
62 return (0x80 >> isc) << 24;
63}
64
38static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu, 65static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
39 struct kvm_s390_interrupt_info *inti) 66 struct kvm_s390_interrupt_info *inti)
40{ 67{
@@ -67,7 +94,22 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
67 case KVM_S390_SIGP_SET_PREFIX: 94 case KVM_S390_SIGP_SET_PREFIX:
68 case KVM_S390_RESTART: 95 case KVM_S390_RESTART:
69 return 1; 96 return 1;
97 case KVM_S390_MCHK:
98 if (psw_mchk_disabled(vcpu))
99 return 0;
100 if (vcpu->arch.sie_block->gcr[14] & inti->mchk.cr14)
101 return 1;
102 return 0;
103 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
104 if (psw_ioint_disabled(vcpu))
105 return 0;
106 if (vcpu->arch.sie_block->gcr[6] &
107 int_word_to_isc_bits(inti->io.io_int_word))
108 return 1;
109 return 0;
70 default: 110 default:
111 printk(KERN_WARNING "illegal interrupt type %llx\n",
112 inti->type);
71 BUG(); 113 BUG();
72 } 114 }
73 return 0; 115 return 0;
@@ -93,6 +135,7 @@ static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
93 CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT, 135 CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT,
94 &vcpu->arch.sie_block->cpuflags); 136 &vcpu->arch.sie_block->cpuflags);
95 vcpu->arch.sie_block->lctl = 0x0000; 137 vcpu->arch.sie_block->lctl = 0x0000;
138 vcpu->arch.sie_block->ictl &= ~ICTL_LPSW;
96} 139}
97 140
98static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag) 141static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
@@ -116,6 +159,18 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
116 case KVM_S390_SIGP_STOP: 159 case KVM_S390_SIGP_STOP:
117 __set_cpuflag(vcpu, CPUSTAT_STOP_INT); 160 __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
118 break; 161 break;
162 case KVM_S390_MCHK:
163 if (psw_mchk_disabled(vcpu))
164 vcpu->arch.sie_block->ictl |= ICTL_LPSW;
165 else
166 vcpu->arch.sie_block->lctl |= LCTL_CR14;
167 break;
168 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
169 if (psw_ioint_disabled(vcpu))
170 __set_cpuflag(vcpu, CPUSTAT_IO_INT);
171 else
172 vcpu->arch.sie_block->lctl |= LCTL_CR6;
173 break;
119 default: 174 default:
120 BUG(); 175 BUG();
121 } 176 }
@@ -297,6 +352,73 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
297 exception = 1; 352 exception = 1;
298 break; 353 break;
299 354
355 case KVM_S390_MCHK:
356 VCPU_EVENT(vcpu, 4, "interrupt: machine check mcic=%llx",
357 inti->mchk.mcic);
358 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
359 inti->mchk.cr14,
360 inti->mchk.mcic);
361 rc = kvm_s390_vcpu_store_status(vcpu,
362 KVM_S390_STORE_STATUS_PREFIXED);
363 if (rc == -EFAULT)
364 exception = 1;
365
366 rc = put_guest_u64(vcpu, __LC_MCCK_CODE, inti->mchk.mcic);
367 if (rc == -EFAULT)
368 exception = 1;
369
370 rc = copy_to_guest(vcpu, __LC_MCK_OLD_PSW,
371 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
372 if (rc == -EFAULT)
373 exception = 1;
374
375 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
376 __LC_MCK_NEW_PSW, sizeof(psw_t));
377 if (rc == -EFAULT)
378 exception = 1;
379 break;
380
381 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
382 {
383 __u32 param0 = ((__u32)inti->io.subchannel_id << 16) |
384 inti->io.subchannel_nr;
385 __u64 param1 = ((__u64)inti->io.io_int_parm << 32) |
386 inti->io.io_int_word;
387 VCPU_EVENT(vcpu, 4, "interrupt: I/O %llx", inti->type);
388 vcpu->stat.deliver_io_int++;
389 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
390 param0, param1);
391 rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_ID,
392 inti->io.subchannel_id);
393 if (rc == -EFAULT)
394 exception = 1;
395
396 rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_NR,
397 inti->io.subchannel_nr);
398 if (rc == -EFAULT)
399 exception = 1;
400
401 rc = put_guest_u32(vcpu, __LC_IO_INT_PARM,
402 inti->io.io_int_parm);
403 if (rc == -EFAULT)
404 exception = 1;
405
406 rc = put_guest_u32(vcpu, __LC_IO_INT_WORD,
407 inti->io.io_int_word);
408 if (rc == -EFAULT)
409 exception = 1;
410
411 rc = copy_to_guest(vcpu, __LC_IO_OLD_PSW,
412 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
413 if (rc == -EFAULT)
414 exception = 1;
415
416 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
417 __LC_IO_NEW_PSW, sizeof(psw_t));
418 if (rc == -EFAULT)
419 exception = 1;
420 break;
421 }
300 default: 422 default:
301 BUG(); 423 BUG();
302 } 424 }
@@ -518,6 +640,61 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
518 } 640 }
519} 641}
520 642
643void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu)
644{
645 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
646 struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
647 struct kvm_s390_interrupt_info *n, *inti = NULL;
648 int deliver;
649
650 __reset_intercept_indicators(vcpu);
651 if (atomic_read(&li->active)) {
652 do {
653 deliver = 0;
654 spin_lock_bh(&li->lock);
655 list_for_each_entry_safe(inti, n, &li->list, list) {
656 if ((inti->type == KVM_S390_MCHK) &&
657 __interrupt_is_deliverable(vcpu, inti)) {
658 list_del(&inti->list);
659 deliver = 1;
660 break;
661 }
662 __set_intercept_indicator(vcpu, inti);
663 }
664 if (list_empty(&li->list))
665 atomic_set(&li->active, 0);
666 spin_unlock_bh(&li->lock);
667 if (deliver) {
668 __do_deliver_interrupt(vcpu, inti);
669 kfree(inti);
670 }
671 } while (deliver);
672 }
673
674 if (atomic_read(&fi->active)) {
675 do {
676 deliver = 0;
677 spin_lock(&fi->lock);
678 list_for_each_entry_safe(inti, n, &fi->list, list) {
679 if ((inti->type == KVM_S390_MCHK) &&
680 __interrupt_is_deliverable(vcpu, inti)) {
681 list_del(&inti->list);
682 deliver = 1;
683 break;
684 }
685 __set_intercept_indicator(vcpu, inti);
686 }
687 if (list_empty(&fi->list))
688 atomic_set(&fi->active, 0);
689 spin_unlock(&fi->lock);
690 if (deliver) {
691 __do_deliver_interrupt(vcpu, inti);
692 kfree(inti);
693 }
694 } while (deliver);
695 }
696}
697
521int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) 698int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
522{ 699{
523 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 700 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -540,12 +717,50 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
540 return 0; 717 return 0;
541} 718}
542 719
720struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
721 u64 cr6, u64 schid)
722{
723 struct kvm_s390_float_interrupt *fi;
724 struct kvm_s390_interrupt_info *inti, *iter;
725
726 if ((!schid && !cr6) || (schid && cr6))
727 return NULL;
728 mutex_lock(&kvm->lock);
729 fi = &kvm->arch.float_int;
730 spin_lock(&fi->lock);
731 inti = NULL;
732 list_for_each_entry(iter, &fi->list, list) {
733 if (!is_ioint(iter->type))
734 continue;
735 if (cr6 &&
736 ((cr6 & int_word_to_isc_bits(iter->io.io_int_word)) == 0))
737 continue;
738 if (schid) {
739 if (((schid & 0x00000000ffff0000) >> 16) !=
740 iter->io.subchannel_id)
741 continue;
742 if ((schid & 0x000000000000ffff) !=
743 iter->io.subchannel_nr)
744 continue;
745 }
746 inti = iter;
747 break;
748 }
749 if (inti)
750 list_del_init(&inti->list);
751 if (list_empty(&fi->list))
752 atomic_set(&fi->active, 0);
753 spin_unlock(&fi->lock);
754 mutex_unlock(&kvm->lock);
755 return inti;
756}
757
543int kvm_s390_inject_vm(struct kvm *kvm, 758int kvm_s390_inject_vm(struct kvm *kvm,
544 struct kvm_s390_interrupt *s390int) 759 struct kvm_s390_interrupt *s390int)
545{ 760{
546 struct kvm_s390_local_interrupt *li; 761 struct kvm_s390_local_interrupt *li;
547 struct kvm_s390_float_interrupt *fi; 762 struct kvm_s390_float_interrupt *fi;
548 struct kvm_s390_interrupt_info *inti; 763 struct kvm_s390_interrupt_info *inti, *iter;
549 int sigcpu; 764 int sigcpu;
550 765
551 inti = kzalloc(sizeof(*inti), GFP_KERNEL); 766 inti = kzalloc(sizeof(*inti), GFP_KERNEL);
@@ -569,6 +784,29 @@ int kvm_s390_inject_vm(struct kvm *kvm,
569 case KVM_S390_SIGP_STOP: 784 case KVM_S390_SIGP_STOP:
570 case KVM_S390_INT_EXTERNAL_CALL: 785 case KVM_S390_INT_EXTERNAL_CALL:
571 case KVM_S390_INT_EMERGENCY: 786 case KVM_S390_INT_EMERGENCY:
787 kfree(inti);
788 return -EINVAL;
789 case KVM_S390_MCHK:
790 VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
791 s390int->parm64);
792 inti->type = s390int->type;
793 inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
794 inti->mchk.mcic = s390int->parm64;
795 break;
796 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
797 if (s390int->type & IOINT_AI_MASK)
798 VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
799 else
800 VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
801 s390int->type & IOINT_CSSID_MASK,
802 s390int->type & IOINT_SSID_MASK,
803 s390int->type & IOINT_SCHID_MASK);
804 inti->type = s390int->type;
805 inti->io.subchannel_id = s390int->parm >> 16;
806 inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
807 inti->io.io_int_parm = s390int->parm64 >> 32;
808 inti->io.io_int_word = s390int->parm64 & 0x00000000ffffffffull;
809 break;
572 default: 810 default:
573 kfree(inti); 811 kfree(inti);
574 return -EINVAL; 812 return -EINVAL;
@@ -579,7 +817,22 @@ int kvm_s390_inject_vm(struct kvm *kvm,
579 mutex_lock(&kvm->lock); 817 mutex_lock(&kvm->lock);
580 fi = &kvm->arch.float_int; 818 fi = &kvm->arch.float_int;
581 spin_lock(&fi->lock); 819 spin_lock(&fi->lock);
582 list_add_tail(&inti->list, &fi->list); 820 if (!is_ioint(inti->type))
821 list_add_tail(&inti->list, &fi->list);
822 else {
823 u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
824
825 /* Keep I/O interrupts sorted in isc order. */
826 list_for_each_entry(iter, &fi->list, list) {
827 if (!is_ioint(iter->type))
828 continue;
829 if (int_word_to_isc_bits(iter->io.io_int_word)
830 <= isc_bits)
831 continue;
832 break;
833 }
834 list_add_tail(&inti->list, &iter->list);
835 }
583 atomic_set(&fi->active, 1); 836 atomic_set(&fi->active, 1);
584 sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS); 837 sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
585 if (sigcpu == KVM_MAX_VCPUS) { 838 if (sigcpu == KVM_MAX_VCPUS) {
@@ -651,8 +904,15 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
651 inti->type = s390int->type; 904 inti->type = s390int->type;
652 inti->emerg.code = s390int->parm; 905 inti->emerg.code = s390int->parm;
653 break; 906 break;
907 case KVM_S390_MCHK:
908 VCPU_EVENT(vcpu, 5, "inject: machine check parm64:%llx",
909 s390int->parm64);
910 inti->type = s390int->type;
911 inti->mchk.mcic = s390int->parm64;
912 break;
654 case KVM_S390_INT_VIRTIO: 913 case KVM_S390_INT_VIRTIO:
655 case KVM_S390_INT_SERVICE: 914 case KVM_S390_INT_SERVICE:
915 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
656 default: 916 default:
657 kfree(inti); 917 kfree(inti);
658 return -EINVAL; 918 return -EINVAL;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 2923781590a6..4cf35a0a79e7 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -140,6 +140,8 @@ int kvm_dev_ioctl_check_extension(long ext)
140#endif 140#endif
141 case KVM_CAP_SYNC_REGS: 141 case KVM_CAP_SYNC_REGS:
142 case KVM_CAP_ONE_REG: 142 case KVM_CAP_ONE_REG:
143 case KVM_CAP_ENABLE_CAP:
144 case KVM_CAP_S390_CSS_SUPPORT:
143 r = 1; 145 r = 1;
144 break; 146 break;
145 case KVM_CAP_NR_VCPUS: 147 case KVM_CAP_NR_VCPUS:
@@ -234,6 +236,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
234 if (!kvm->arch.gmap) 236 if (!kvm->arch.gmap)
235 goto out_nogmap; 237 goto out_nogmap;
236 } 238 }
239
240 kvm->arch.css_support = 0;
241
237 return 0; 242 return 0;
238out_nogmap: 243out_nogmap:
239 debug_unregister(kvm->arch.dbf); 244 debug_unregister(kvm->arch.dbf);
@@ -659,6 +664,7 @@ rerun_vcpu:
659 case KVM_EXIT_INTR: 664 case KVM_EXIT_INTR:
660 case KVM_EXIT_S390_RESET: 665 case KVM_EXIT_S390_RESET:
661 case KVM_EXIT_S390_UCONTROL: 666 case KVM_EXIT_S390_UCONTROL:
667 case KVM_EXIT_S390_TSCH:
662 break; 668 break;
663 default: 669 default:
664 BUG(); 670 BUG();
@@ -766,6 +772,14 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
766 } else 772 } else
767 prefix = 0; 773 prefix = 0;
768 774
775 /*
776 * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy
777 * copying in vcpu load/put. Let's update our copies before we save
778 * them into the save area.
779 */
780 save_fp_regs(&vcpu->arch.guest_fpregs);
781 save_access_regs(vcpu->run->s.regs.acrs);
782
769 if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs), 783 if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs),
770 vcpu->arch.guest_fpregs.fprs, 128, prefix)) 784 vcpu->arch.guest_fpregs.fprs, 128, prefix))
771 return -EFAULT; 785 return -EFAULT;
@@ -810,6 +824,29 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
810 return 0; 824 return 0;
811} 825}
812 826
827static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
828 struct kvm_enable_cap *cap)
829{
830 int r;
831
832 if (cap->flags)
833 return -EINVAL;
834
835 switch (cap->cap) {
836 case KVM_CAP_S390_CSS_SUPPORT:
837 if (!vcpu->kvm->arch.css_support) {
838 vcpu->kvm->arch.css_support = 1;
839 trace_kvm_s390_enable_css(vcpu->kvm);
840 }
841 r = 0;
842 break;
843 default:
844 r = -EINVAL;
845 break;
846 }
847 return r;
848}
849
813long kvm_arch_vcpu_ioctl(struct file *filp, 850long kvm_arch_vcpu_ioctl(struct file *filp,
814 unsigned int ioctl, unsigned long arg) 851 unsigned int ioctl, unsigned long arg)
815{ 852{
@@ -896,6 +933,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
896 r = 0; 933 r = 0;
897 break; 934 break;
898 } 935 }
936 case KVM_ENABLE_CAP:
937 {
938 struct kvm_enable_cap cap;
939 r = -EFAULT;
940 if (copy_from_user(&cap, argp, sizeof(cap)))
941 break;
942 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
943 break;
944 }
899 default: 945 default:
900 r = -ENOTTY; 946 r = -ENOTTY;
901 } 947 }
@@ -930,7 +976,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
930 struct kvm_memory_slot *memslot, 976 struct kvm_memory_slot *memslot,
931 struct kvm_memory_slot old, 977 struct kvm_memory_slot old,
932 struct kvm_userspace_memory_region *mem, 978 struct kvm_userspace_memory_region *mem,
933 int user_alloc) 979 bool user_alloc)
934{ 980{
935 /* A few sanity checks. We can have exactly one memory slot which has 981 /* A few sanity checks. We can have exactly one memory slot which has
936 to start at guest virtual zero and which has to be located at a 982 to start at guest virtual zero and which has to be located at a
@@ -960,7 +1006,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
960void kvm_arch_commit_memory_region(struct kvm *kvm, 1006void kvm_arch_commit_memory_region(struct kvm *kvm,
961 struct kvm_userspace_memory_region *mem, 1007 struct kvm_userspace_memory_region *mem,
962 struct kvm_memory_slot old, 1008 struct kvm_memory_slot old,
963 int user_alloc) 1009 bool user_alloc)
964{ 1010{
965 int rc; 1011 int rc;
966 1012
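
The KVM_ENABLE_CAP plumbing above is how userspace opts a VM into KVM_CAP_S390_CSS_SUPPORT: the first vcpu that enables the capability sets kvm->arch.css_support, after which most channel I/O instructions exit to userspace instead of getting condition code 3. A rough userspace sketch, assuming vcpu_fd is an already-created vcpu file descriptor and leaving out error handling:

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int enable_css_support(int vcpu_fd)
    {
            struct kvm_enable_cap cap;

            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_S390_CSS_SUPPORT;
            cap.flags = 0;   /* kvm_vcpu_ioctl_enable_cap() rejects nonzero flags */

            return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
    }
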
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index d75bc5e92c5b..4d89d64a8161 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -65,21 +65,67 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
65 vcpu->arch.sie_block->ihcpu = 0xffff; 65 vcpu->arch.sie_block->ihcpu = 0xffff;
66} 66}
67 67
68static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu)
69{
70 u32 base2 = vcpu->arch.sie_block->ipb >> 28;
71 u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
72
73 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
74}
75
76static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,
77 u64 *address1, u64 *address2)
78{
79 u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
80 u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
81 u32 base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12;
82 u32 disp2 = vcpu->arch.sie_block->ipb & 0x0fff;
83
84 *address1 = (base1 ? vcpu->run->s.regs.gprs[base1] : 0) + disp1;
85 *address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
86}
87
88static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu)
89{
90 u32 base2 = vcpu->arch.sie_block->ipb >> 28;
91 u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
92 ((vcpu->arch.sie_block->ipb & 0xff00) << 4);
93 /* The displacement is a 20-bit _SIGNED_ value */
94 if (disp2 & 0x80000)
95 disp2+=0xfff00000;
96
97 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2;
98}
99
100static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu)
101{
102 u32 base2 = vcpu->arch.sie_block->ipb >> 28;
103 u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
104
105 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
106}
107
68int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); 108int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
69enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); 109enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
70void kvm_s390_tasklet(unsigned long parm); 110void kvm_s390_tasklet(unsigned long parm);
71void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu); 111void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
112void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu);
72int kvm_s390_inject_vm(struct kvm *kvm, 113int kvm_s390_inject_vm(struct kvm *kvm,
73 struct kvm_s390_interrupt *s390int); 114 struct kvm_s390_interrupt *s390int);
74int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, 115int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
75 struct kvm_s390_interrupt *s390int); 116 struct kvm_s390_interrupt *s390int);
76int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); 117int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
77int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); 118int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action);
119struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
120 u64 cr6, u64 schid);
78 121
79/* implemented in priv.c */ 122/* implemented in priv.c */
80int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); 123int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
81int kvm_s390_handle_e5(struct kvm_vcpu *vcpu); 124int kvm_s390_handle_e5(struct kvm_vcpu *vcpu);
82int kvm_s390_handle_01(struct kvm_vcpu *vcpu); 125int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
126int kvm_s390_handle_b9(struct kvm_vcpu *vcpu);
127int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu);
128int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu);
83 129
84/* implemented in sigp.c */ 130/* implemented in sigp.c */
85int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); 131int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
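
Among the new base/displacement decoders above, kvm_s390_get_base_disp_rsy() has to widen a 20-bit signed displacement by hand: if bit 19 is set it fills in the upper twelve bits and then relies on the (long)(int) cast for the final sign extension. A standalone illustration of that arithmetic (sext20() is a made-up name; only the math mirrors the helper):

    #include <stdio.h>

    static long sext20(unsigned int disp2)
    {
            /* Same steps as kvm_s390_get_base_disp_rsy(): patch in the upper
             * bits when the 20-bit sign bit is set, then widen via (long)(int). */
            if (disp2 & 0x80000)
                    disp2 += 0xfff00000;
            return (long)(int)disp2;
    }

    int main(void)
    {
            printf("%ld\n", sext20(0x00001));  /* 1 */
            printf("%ld\n", sext20(0xfffff));  /* -1 */
            printf("%ld\n", sext20(0x80000));  /* -524288 */
            return 0;
    }
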
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index d768906f15c8..0ef9894606e5 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -18,23 +18,21 @@
18#include <asm/debug.h> 18#include <asm/debug.h>
19#include <asm/ebcdic.h> 19#include <asm/ebcdic.h>
20#include <asm/sysinfo.h> 20#include <asm/sysinfo.h>
21#include <asm/ptrace.h>
22#include <asm/compat.h>
21#include "gaccess.h" 23#include "gaccess.h"
22#include "kvm-s390.h" 24#include "kvm-s390.h"
23#include "trace.h" 25#include "trace.h"
24 26
25static int handle_set_prefix(struct kvm_vcpu *vcpu) 27static int handle_set_prefix(struct kvm_vcpu *vcpu)
26{ 28{
27 int base2 = vcpu->arch.sie_block->ipb >> 28;
28 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
29 u64 operand2; 29 u64 operand2;
30 u32 address = 0; 30 u32 address = 0;
31 u8 tmp; 31 u8 tmp;
32 32
33 vcpu->stat.instruction_spx++; 33 vcpu->stat.instruction_spx++;
34 34
35 operand2 = disp2; 35 operand2 = kvm_s390_get_base_disp_s(vcpu);
36 if (base2)
37 operand2 += vcpu->run->s.regs.gprs[base2];
38 36
39 /* must be word boundary */ 37 /* must be word boundary */
40 if (operand2 & 3) { 38 if (operand2 & 3) {
@@ -67,15 +65,12 @@ out:
67 65
68static int handle_store_prefix(struct kvm_vcpu *vcpu) 66static int handle_store_prefix(struct kvm_vcpu *vcpu)
69{ 67{
70 int base2 = vcpu->arch.sie_block->ipb >> 28;
71 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
72 u64 operand2; 68 u64 operand2;
73 u32 address; 69 u32 address;
74 70
75 vcpu->stat.instruction_stpx++; 71 vcpu->stat.instruction_stpx++;
76 operand2 = disp2; 72
77 if (base2) 73 operand2 = kvm_s390_get_base_disp_s(vcpu);
78 operand2 += vcpu->run->s.regs.gprs[base2];
79 74
80 /* must be word boundary */ 75 /* must be word boundary */
81 if (operand2 & 3) { 76 if (operand2 & 3) {
@@ -100,15 +95,12 @@ out:
100 95
101static int handle_store_cpu_address(struct kvm_vcpu *vcpu) 96static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
102{ 97{
103 int base2 = vcpu->arch.sie_block->ipb >> 28;
104 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
105 u64 useraddr; 98 u64 useraddr;
106 int rc; 99 int rc;
107 100
108 vcpu->stat.instruction_stap++; 101 vcpu->stat.instruction_stap++;
109 useraddr = disp2; 102
110 if (base2) 103 useraddr = kvm_s390_get_base_disp_s(vcpu);
111 useraddr += vcpu->run->s.regs.gprs[base2];
112 104
113 if (useraddr & 1) { 105 if (useraddr & 1) {
114 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 106 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -135,24 +127,96 @@ static int handle_skey(struct kvm_vcpu *vcpu)
135 return 0; 127 return 0;
136} 128}
137 129
138static int handle_stsch(struct kvm_vcpu *vcpu) 130static int handle_tpi(struct kvm_vcpu *vcpu)
139{ 131{
140 vcpu->stat.instruction_stsch++; 132 u64 addr;
141 VCPU_EVENT(vcpu, 4, "%s", "store subchannel - CC3"); 133 struct kvm_s390_interrupt_info *inti;
142 /* condition code 3 */ 134 int cc;
135
136 addr = kvm_s390_get_base_disp_s(vcpu);
137
138 inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->run->s.regs.crs[6], 0);
139 if (inti) {
140 if (addr) {
141 /*
142 * Store the two-word I/O interruption code into the
143 * provided area.
144 */
145 put_guest_u16(vcpu, addr, inti->io.subchannel_id);
146 put_guest_u16(vcpu, addr + 2, inti->io.subchannel_nr);
147 put_guest_u32(vcpu, addr + 4, inti->io.io_int_parm);
148 } else {
149 /*
150 * Store the three-word I/O interruption code into
151 * the appropriate lowcore area.
152 */
153 put_guest_u16(vcpu, 184, inti->io.subchannel_id);
154 put_guest_u16(vcpu, 186, inti->io.subchannel_nr);
155 put_guest_u32(vcpu, 188, inti->io.io_int_parm);
156 put_guest_u32(vcpu, 192, inti->io.io_int_word);
157 }
158 cc = 1;
159 } else
160 cc = 0;
161 kfree(inti);
162 /* Set condition code and we're done. */
143 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 163 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
144 vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44; 164 vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44;
145 return 0; 165 return 0;
146} 166}
147 167
148static int handle_chsc(struct kvm_vcpu *vcpu) 168static int handle_tsch(struct kvm_vcpu *vcpu)
149{ 169{
150 vcpu->stat.instruction_chsc++; 170 struct kvm_s390_interrupt_info *inti;
151 VCPU_EVENT(vcpu, 4, "%s", "channel subsystem call - CC3"); 171
152 /* condition code 3 */ 172 inti = kvm_s390_get_io_int(vcpu->kvm, 0,
153 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 173 vcpu->run->s.regs.gprs[1]);
154 vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44; 174
155 return 0; 175 /*
176 * Prepare exit to userspace.
177 * We indicate whether we dequeued a pending I/O interrupt
178 * so that userspace can re-inject it if the instruction gets
179 * a program check. While this may re-order the pending I/O
180 * interrupts, this is no problem since the priority is kept
181 * intact.
182 */
183 vcpu->run->exit_reason = KVM_EXIT_S390_TSCH;
184 vcpu->run->s390_tsch.dequeued = !!inti;
185 if (inti) {
186 vcpu->run->s390_tsch.subchannel_id = inti->io.subchannel_id;
187 vcpu->run->s390_tsch.subchannel_nr = inti->io.subchannel_nr;
188 vcpu->run->s390_tsch.io_int_parm = inti->io.io_int_parm;
189 vcpu->run->s390_tsch.io_int_word = inti->io.io_int_word;
190 }
191 vcpu->run->s390_tsch.ipb = vcpu->arch.sie_block->ipb;
192 kfree(inti);
193 return -EREMOTE;
194}
195
196static int handle_io_inst(struct kvm_vcpu *vcpu)
197{
198 VCPU_EVENT(vcpu, 4, "%s", "I/O instruction");
199
200 if (vcpu->kvm->arch.css_support) {
201 /*
202 * Most I/O instructions will be handled by userspace.
203 * Exceptions are tpi and the interrupt portion of tsch.
204 */
205 if (vcpu->arch.sie_block->ipa == 0xb236)
206 return handle_tpi(vcpu);
207 if (vcpu->arch.sie_block->ipa == 0xb235)
208 return handle_tsch(vcpu);
209 /* Handle in userspace. */
210 return -EOPNOTSUPP;
211 } else {
212 /*
213 * Set condition code 3 to stop the guest from issuing channel
214 * I/O instructions.
215 */
216 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
217 vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44;
218 return 0;
219 }
156} 220}
157 221
158static int handle_stfl(struct kvm_vcpu *vcpu) 222static int handle_stfl(struct kvm_vcpu *vcpu)
@@ -176,17 +240,107 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
176 return 0; 240 return 0;
177} 241}
178 242
243static void handle_new_psw(struct kvm_vcpu *vcpu)
244{
245 /* Check whether the new psw is enabled for machine checks. */
246 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_MCHECK)
247 kvm_s390_deliver_pending_machine_checks(vcpu);
248}
249
250#define PSW_MASK_ADDR_MODE (PSW_MASK_EA | PSW_MASK_BA)
251#define PSW_MASK_UNASSIGNED 0xb80800fe7fffffffUL
252#define PSW_ADDR_24 0x00000000000fffffUL
253#define PSW_ADDR_31 0x000000007fffffffUL
254
255int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
256{
257 u64 addr;
258 psw_compat_t new_psw;
259
260 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
261 return kvm_s390_inject_program_int(vcpu,
262 PGM_PRIVILEGED_OPERATION);
263
264 addr = kvm_s390_get_base_disp_s(vcpu);
265
266 if (addr & 7) {
267 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
268 goto out;
269 }
270
271 if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) {
272 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
273 goto out;
274 }
275
276 if (!(new_psw.mask & PSW32_MASK_BASE)) {
277 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
278 goto out;
279 }
280
281 vcpu->arch.sie_block->gpsw.mask =
282 (new_psw.mask & ~PSW32_MASK_BASE) << 32;
283 vcpu->arch.sie_block->gpsw.addr = new_psw.addr;
284
285 if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) ||
286 (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) &&
287 (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) ||
288 ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
289 PSW_MASK_EA)) {
290 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
291 goto out;
292 }
293
294 handle_new_psw(vcpu);
295out:
296 return 0;
297}
298
299static int handle_lpswe(struct kvm_vcpu *vcpu)
300{
301 u64 addr;
302 psw_t new_psw;
303
304 addr = kvm_s390_get_base_disp_s(vcpu);
305
306 if (addr & 7) {
307 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
308 goto out;
309 }
310
311 if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) {
312 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
313 goto out;
314 }
315
316 vcpu->arch.sie_block->gpsw.mask = new_psw.mask;
317 vcpu->arch.sie_block->gpsw.addr = new_psw.addr;
318
319 if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) ||
320 (((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
321 PSW_MASK_BA) &&
322 (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_31)) ||
323 (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) &&
324 (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) ||
325 ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
326 PSW_MASK_EA)) {
327 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
328 goto out;
329 }
330
331 handle_new_psw(vcpu);
332out:
333 return 0;
334}
335
179static int handle_stidp(struct kvm_vcpu *vcpu) 336static int handle_stidp(struct kvm_vcpu *vcpu)
180{ 337{
181 int base2 = vcpu->arch.sie_block->ipb >> 28;
182 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
183 u64 operand2; 338 u64 operand2;
184 int rc; 339 int rc;
185 340
186 vcpu->stat.instruction_stidp++; 341 vcpu->stat.instruction_stidp++;
187 operand2 = disp2; 342
188 if (base2) 343 operand2 = kvm_s390_get_base_disp_s(vcpu);
189 operand2 += vcpu->run->s.regs.gprs[base2];
190 344
191 if (operand2 & 7) { 345 if (operand2 & 7) {
192 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 346 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -240,17 +394,13 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
240 int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28; 394 int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28;
241 int sel1 = vcpu->run->s.regs.gprs[0] & 0xff; 395 int sel1 = vcpu->run->s.regs.gprs[0] & 0xff;
242 int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff; 396 int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff;
243 int base2 = vcpu->arch.sie_block->ipb >> 28;
244 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
245 u64 operand2; 397 u64 operand2;
246 unsigned long mem; 398 unsigned long mem;
247 399
248 vcpu->stat.instruction_stsi++; 400 vcpu->stat.instruction_stsi++;
249 VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2); 401 VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2);
250 402
251 operand2 = disp2; 403 operand2 = kvm_s390_get_base_disp_s(vcpu);
252 if (base2)
253 operand2 += vcpu->run->s.regs.gprs[base2];
254 404
255 if (operand2 & 0xfff && fc > 0) 405 if (operand2 & 0xfff && fc > 0)
256 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 406 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -297,7 +447,7 @@ out_fail:
297 return 0; 447 return 0;
298} 448}
299 449
300static intercept_handler_t priv_handlers[256] = { 450static const intercept_handler_t b2_handlers[256] = {
301 [0x02] = handle_stidp, 451 [0x02] = handle_stidp,
302 [0x10] = handle_set_prefix, 452 [0x10] = handle_set_prefix,
303 [0x11] = handle_store_prefix, 453 [0x11] = handle_store_prefix,
@@ -305,10 +455,25 @@ static intercept_handler_t priv_handlers[256] = {
305 [0x29] = handle_skey, 455 [0x29] = handle_skey,
306 [0x2a] = handle_skey, 456 [0x2a] = handle_skey,
307 [0x2b] = handle_skey, 457 [0x2b] = handle_skey,
308 [0x34] = handle_stsch, 458 [0x30] = handle_io_inst,
309 [0x5f] = handle_chsc, 459 [0x31] = handle_io_inst,
460 [0x32] = handle_io_inst,
461 [0x33] = handle_io_inst,
462 [0x34] = handle_io_inst,
463 [0x35] = handle_io_inst,
464 [0x36] = handle_io_inst,
465 [0x37] = handle_io_inst,
466 [0x38] = handle_io_inst,
467 [0x39] = handle_io_inst,
468 [0x3a] = handle_io_inst,
469 [0x3b] = handle_io_inst,
470 [0x3c] = handle_io_inst,
471 [0x5f] = handle_io_inst,
472 [0x74] = handle_io_inst,
473 [0x76] = handle_io_inst,
310 [0x7d] = handle_stsi, 474 [0x7d] = handle_stsi,
311 [0xb1] = handle_stfl, 475 [0xb1] = handle_stfl,
476 [0xb2] = handle_lpswe,
312}; 477};
313 478
314int kvm_s390_handle_b2(struct kvm_vcpu *vcpu) 479int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
@@ -322,7 +487,7 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
322 * state bit and (a) handle the instruction or (b) send a code 2 487 * state bit and (a) handle the instruction or (b) send a code 2
323 * program check. 488 * program check.
324 * Anything else goes to userspace.*/ 489 * Anything else goes to userspace.*/
325 handler = priv_handlers[vcpu->arch.sie_block->ipa & 0x00ff]; 490 handler = b2_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
326 if (handler) { 491 if (handler) {
327 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 492 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
328 return kvm_s390_inject_program_int(vcpu, 493 return kvm_s390_inject_program_int(vcpu,
@@ -333,19 +498,74 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
333 return -EOPNOTSUPP; 498 return -EOPNOTSUPP;
334} 499}
335 500
501static int handle_epsw(struct kvm_vcpu *vcpu)
502{
503 int reg1, reg2;
504
505 reg1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 20;
506 reg2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
507
508 /* This basically extracts the mask half of the psw. */
509 vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000;
510 vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32;
511 if (reg2) {
512 vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000;
513 vcpu->run->s.regs.gprs[reg2] |=
514 vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffff;
515 }
516 return 0;
517}
518
519static const intercept_handler_t b9_handlers[256] = {
520 [0x8d] = handle_epsw,
521 [0x9c] = handle_io_inst,
522};
523
524int kvm_s390_handle_b9(struct kvm_vcpu *vcpu)
525{
526 intercept_handler_t handler;
527
528 /* This is handled just as for the B2 instructions. */
529 handler = b9_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
530 if (handler) {
531 if ((handler != handle_epsw) &&
532 (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE))
533 return kvm_s390_inject_program_int(vcpu,
534 PGM_PRIVILEGED_OPERATION);
535 else
536 return handler(vcpu);
537 }
538 return -EOPNOTSUPP;
539}
540
541static const intercept_handler_t eb_handlers[256] = {
542 [0x8a] = handle_io_inst,
543};
544
545int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu)
546{
547 intercept_handler_t handler;
548
549 /* All eb instructions that end up here are privileged. */
550 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
551 return kvm_s390_inject_program_int(vcpu,
552 PGM_PRIVILEGED_OPERATION);
553 handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff];
554 if (handler)
555 return handler(vcpu);
556 return -EOPNOTSUPP;
557}
558
336static int handle_tprot(struct kvm_vcpu *vcpu) 559static int handle_tprot(struct kvm_vcpu *vcpu)
337{ 560{
338 int base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; 561 u64 address1, address2;
339 int disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
340 int base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12;
341 int disp2 = vcpu->arch.sie_block->ipb & 0x0fff;
342 u64 address1 = disp1 + base1 ? vcpu->run->s.regs.gprs[base1] : 0;
343 u64 address2 = disp2 + base2 ? vcpu->run->s.regs.gprs[base2] : 0;
344 struct vm_area_struct *vma; 562 struct vm_area_struct *vma;
345 unsigned long user_address; 563 unsigned long user_address;
346 564
347 vcpu->stat.instruction_tprot++; 565 vcpu->stat.instruction_tprot++;
348 566
567 kvm_s390_get_base_disp_sse(vcpu, &address1, &address2);
568
349 /* we only handle the Linux memory detection case: 569 /* we only handle the Linux memory detection case:
350 * access key == 0 570 * access key == 0
351 * guest DAT == off 571 * guest DAT == off
@@ -405,7 +625,7 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
405 return 0; 625 return 0;
406} 626}
407 627
408static intercept_handler_t x01_handlers[256] = { 628static const intercept_handler_t x01_handlers[256] = {
409 [0x07] = handle_sckpf, 629 [0x07] = handle_sckpf,
410}; 630};
411 631
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 566ddf6e8dfb..1c48ab2845e0 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -137,8 +137,10 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
137 inti->type = KVM_S390_SIGP_STOP; 137 inti->type = KVM_S390_SIGP_STOP;
138 138
139 spin_lock_bh(&li->lock); 139 spin_lock_bh(&li->lock);
140 if ((atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) 140 if ((atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) {
141 kfree(inti);
141 goto out; 142 goto out;
143 }
142 list_add_tail(&inti->list, &li->list); 144 list_add_tail(&inti->list, &li->list);
143 atomic_set(&li->active, 1); 145 atomic_set(&li->active, 1);
144 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); 146 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
@@ -324,8 +326,6 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
324{ 326{
325 int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; 327 int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
326 int r3 = vcpu->arch.sie_block->ipa & 0x000f; 328 int r3 = vcpu->arch.sie_block->ipa & 0x000f;
327 int base2 = vcpu->arch.sie_block->ipb >> 28;
328 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
329 u32 parameter; 329 u32 parameter;
330 u16 cpu_addr = vcpu->run->s.regs.gprs[r3]; 330 u16 cpu_addr = vcpu->run->s.regs.gprs[r3];
331 u8 order_code; 331 u8 order_code;
@@ -336,9 +336,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
336 return kvm_s390_inject_program_int(vcpu, 336 return kvm_s390_inject_program_int(vcpu,
337 PGM_PRIVILEGED_OPERATION); 337 PGM_PRIVILEGED_OPERATION);
338 338
339 order_code = disp2; 339 order_code = kvm_s390_get_base_disp_rs(vcpu);
340 if (base2)
341 order_code += vcpu->run->s.regs.gprs[base2];
342 340
343 if (r1 % 2) 341 if (r1 % 2)
344 parameter = vcpu->run->s.regs.gprs[r1]; 342 parameter = vcpu->run->s.regs.gprs[r1];
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
index 90fdf85b5ff7..13f30f58a2df 100644
--- a/arch/s390/kvm/trace-s390.h
+++ b/arch/s390/kvm/trace-s390.h
@@ -141,13 +141,13 @@ TRACE_EVENT(kvm_s390_inject_vcpu,
141 * Trace point for the actual delivery of interrupts. 141 * Trace point for the actual delivery of interrupts.
142 */ 142 */
143TRACE_EVENT(kvm_s390_deliver_interrupt, 143TRACE_EVENT(kvm_s390_deliver_interrupt,
144 TP_PROTO(unsigned int id, __u64 type, __u32 data0, __u64 data1), 144 TP_PROTO(unsigned int id, __u64 type, __u64 data0, __u64 data1),
145 TP_ARGS(id, type, data0, data1), 145 TP_ARGS(id, type, data0, data1),
146 146
147 TP_STRUCT__entry( 147 TP_STRUCT__entry(
148 __field(int, id) 148 __field(int, id)
149 __field(__u32, inttype) 149 __field(__u32, inttype)
150 __field(__u32, data0) 150 __field(__u64, data0)
151 __field(__u64, data1) 151 __field(__u64, data1)
152 ), 152 ),
153 153
@@ -159,7 +159,7 @@ TRACE_EVENT(kvm_s390_deliver_interrupt,
159 ), 159 ),
160 160
161 TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \ 161 TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \
162 "data:%08x %016llx", 162 "data:%08llx %016llx",
163 __entry->id, __entry->inttype, 163 __entry->id, __entry->inttype,
164 __print_symbolic(__entry->inttype, kvm_s390_int_type), 164 __print_symbolic(__entry->inttype, kvm_s390_int_type),
165 __entry->data0, __entry->data1) 165 __entry->data0, __entry->data1)
@@ -204,6 +204,26 @@ TRACE_EVENT(kvm_s390_stop_request,
204 ); 204 );
205 205
206 206
207/*
208 * Trace point for enabling channel I/O instruction support.
209 */
210TRACE_EVENT(kvm_s390_enable_css,
211 TP_PROTO(void *kvm),
212 TP_ARGS(kvm),
213
214 TP_STRUCT__entry(
215 __field(void *, kvm)
216 ),
217
218 TP_fast_assign(
219 __entry->kvm = kvm;
220 ),
221
222 TP_printk("enabling channel I/O support (kvm @ %p)\n",
223 __entry->kvm)
224 );
225
226
207#endif /* _TRACE_KVMS390_H */ 227#endif /* _TRACE_KVMS390_H */
208 228
209/* This part must be outside protection */ 229/* This part must be outside protection */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dc87b65e9c3a..635a74d22409 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -33,10 +33,10 @@
33 33
34#define KVM_MAX_VCPUS 254 34#define KVM_MAX_VCPUS 254
35#define KVM_SOFT_MAX_VCPUS 160 35#define KVM_SOFT_MAX_VCPUS 160
36#define KVM_MEMORY_SLOTS 32 36#define KVM_USER_MEM_SLOTS 125
37/* memory slots that does not exposed to userspace */ 37/* memory slots that are not exposed to userspace */
38#define KVM_PRIVATE_MEM_SLOTS 4 38#define KVM_PRIVATE_MEM_SLOTS 3
39#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 39#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
40 40
41#define KVM_MMIO_SIZE 16 41#define KVM_MMIO_SIZE 16
42 42
@@ -219,11 +219,6 @@ struct kvm_mmu_page {
219 u64 *spt; 219 u64 *spt;
220 /* hold the gfn of each spte inside spt */ 220 /* hold the gfn of each spte inside spt */
221 gfn_t *gfns; 221 gfn_t *gfns;
222 /*
223 * One bit set per slot which has memory
224 * in this shadow page.
225 */
226 DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM);
227 bool unsync; 222 bool unsync;
228 int root_count; /* Currently serving as active root */ 223 int root_count; /* Currently serving as active root */
229 unsigned int unsync_children; 224 unsigned int unsync_children;
@@ -502,6 +497,13 @@ struct kvm_vcpu_arch {
502 u64 msr_val; 497 u64 msr_val;
503 struct gfn_to_hva_cache data; 498 struct gfn_to_hva_cache data;
504 } pv_eoi; 499 } pv_eoi;
500
501 /*
502 * Indicate whether the access faults on its page table in guest
503 * which is set when fix page fault and used to detect unhandeable
504 * instruction.
505 */
506 bool write_fault_to_shadow_pgtable;
505}; 507};
506 508
507struct kvm_lpage_info { 509struct kvm_lpage_info {
@@ -697,6 +699,11 @@ struct kvm_x86_ops {
697 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 699 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
698 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 700 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
699 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 701 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
702 int (*vm_has_apicv)(struct kvm *kvm);
703 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
704 void (*hwapic_isr_update)(struct kvm *kvm, int isr);
705 void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
706 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
700 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 707 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
701 int (*get_tdp_level)(void); 708 int (*get_tdp_level)(void);
702 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 709 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
@@ -991,6 +998,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva);
991int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); 998int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
992void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 999void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
993int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 1000int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
1001int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
994int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 1002int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
995int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 1003int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
996int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 1004int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 65231e173baf..695399f2d5eb 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -27,7 +27,7 @@ static inline bool kvm_check_and_clear_guest_paused(void)
27 * 27 *
28 * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. 28 * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
29 * The hypercall number should be placed in rax and the return value will be 29 * The hypercall number should be placed in rax and the return value will be
30 * placed in rax. No other registers will be clobbered unless explicited 30 * placed in rax. No other registers will be clobbered unless explicitly
31 * noted by the particular hypercall. 31 * noted by the particular hypercall.
32 */ 32 */
33 33
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 235b49fa554b..b6fbf860e398 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -57,9 +57,12 @@
57#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 57#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
58#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 58#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
59#define SECONDARY_EXEC_RDTSCP 0x00000008 59#define SECONDARY_EXEC_RDTSCP 0x00000008
60#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
60#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 61#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
61#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 62#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
62#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 63#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
64#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
65#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
63#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 66#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
64#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 67#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
65 68
@@ -97,6 +100,7 @@ enum vmcs_field {
97 GUEST_GS_SELECTOR = 0x0000080a, 100 GUEST_GS_SELECTOR = 0x0000080a,
98 GUEST_LDTR_SELECTOR = 0x0000080c, 101 GUEST_LDTR_SELECTOR = 0x0000080c,
99 GUEST_TR_SELECTOR = 0x0000080e, 102 GUEST_TR_SELECTOR = 0x0000080e,
103 GUEST_INTR_STATUS = 0x00000810,
100 HOST_ES_SELECTOR = 0x00000c00, 104 HOST_ES_SELECTOR = 0x00000c00,
101 HOST_CS_SELECTOR = 0x00000c02, 105 HOST_CS_SELECTOR = 0x00000c02,
102 HOST_SS_SELECTOR = 0x00000c04, 106 HOST_SS_SELECTOR = 0x00000c04,
@@ -124,6 +128,14 @@ enum vmcs_field {
124 APIC_ACCESS_ADDR_HIGH = 0x00002015, 128 APIC_ACCESS_ADDR_HIGH = 0x00002015,
125 EPT_POINTER = 0x0000201a, 129 EPT_POINTER = 0x0000201a,
126 EPT_POINTER_HIGH = 0x0000201b, 130 EPT_POINTER_HIGH = 0x0000201b,
131 EOI_EXIT_BITMAP0 = 0x0000201c,
132 EOI_EXIT_BITMAP0_HIGH = 0x0000201d,
133 EOI_EXIT_BITMAP1 = 0x0000201e,
134 EOI_EXIT_BITMAP1_HIGH = 0x0000201f,
135 EOI_EXIT_BITMAP2 = 0x00002020,
136 EOI_EXIT_BITMAP2_HIGH = 0x00002021,
137 EOI_EXIT_BITMAP3 = 0x00002022,
138 EOI_EXIT_BITMAP3_HIGH = 0x00002023,
127 GUEST_PHYSICAL_ADDRESS = 0x00002400, 139 GUEST_PHYSICAL_ADDRESS = 0x00002400,
128 GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, 140 GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
129 VMCS_LINK_POINTER = 0x00002800, 141 VMCS_LINK_POINTER = 0x00002800,
@@ -346,9 +358,9 @@ enum vmcs_field {
346 358
347#define AR_RESERVD_MASK 0xfffe0f00 359#define AR_RESERVD_MASK 0xfffe0f00
348 360
349#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0) 361#define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0)
350#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1) 362#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1)
351#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2) 363#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2)
352 364
353#define VMX_NR_VPIDS (1 << 16) 365#define VMX_NR_VPIDS (1 << 16)
354#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 366#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 979d03bce135..2871fccfee68 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -62,10 +62,12 @@
62#define EXIT_REASON_MCE_DURING_VMENTRY 41 62#define EXIT_REASON_MCE_DURING_VMENTRY 41
63#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 63#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
64#define EXIT_REASON_APIC_ACCESS 44 64#define EXIT_REASON_APIC_ACCESS 44
65#define EXIT_REASON_EOI_INDUCED 45
65#define EXIT_REASON_EPT_VIOLATION 48 66#define EXIT_REASON_EPT_VIOLATION 48
66#define EXIT_REASON_EPT_MISCONFIG 49 67#define EXIT_REASON_EPT_MISCONFIG 49
67#define EXIT_REASON_WBINVD 54 68#define EXIT_REASON_WBINVD 54
68#define EXIT_REASON_XSETBV 55 69#define EXIT_REASON_XSETBV 55
70#define EXIT_REASON_APIC_WRITE 56
69#define EXIT_REASON_INVPCID 58 71#define EXIT_REASON_INVPCID 58
70 72
71#define VMX_EXIT_REASONS \ 73#define VMX_EXIT_REASONS \
@@ -103,7 +105,12 @@
103 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ 105 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
104 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ 106 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
105 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ 107 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
106 { EXIT_REASON_WBINVD, "WBINVD" } 108 { EXIT_REASON_WBINVD, "WBINVD" }, \
109 { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
110 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
111 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
112 { EXIT_REASON_INVD, "INVD" }, \
113 { EXIT_REASON_INVPCID, "INVPCID" }
107 114
108 115
109#endif /* _UAPIVMX_H */ 116#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 9f966dc0b9e4..0732f0089a3d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -218,6 +218,9 @@ static void kvm_shutdown(void)
218void __init kvmclock_init(void) 218void __init kvmclock_init(void)
219{ 219{
220 unsigned long mem; 220 unsigned long mem;
221 int size;
222
223 size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
221 224
222 if (!kvm_para_available()) 225 if (!kvm_para_available())
223 return; 226 return;
@@ -231,16 +234,14 @@ void __init kvmclock_init(void)
231 printk(KERN_INFO "kvm-clock: Using msrs %x and %x", 234 printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
232 msr_kvm_system_time, msr_kvm_wall_clock); 235 msr_kvm_system_time, msr_kvm_wall_clock);
233 236
234 mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS, 237 mem = memblock_alloc(size, PAGE_SIZE);
235 PAGE_SIZE);
236 if (!mem) 238 if (!mem)
237 return; 239 return;
238 hv_clock = __va(mem); 240 hv_clock = __va(mem);
239 241
240 if (kvm_register_clock("boot clock")) { 242 if (kvm_register_clock("boot clock")) {
241 hv_clock = NULL; 243 hv_clock = NULL;
242 memblock_free(mem, 244 memblock_free(mem, size);
243 sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
244 return; 245 return;
245 } 246 }
246 pv_time_ops.sched_clock = kvm_clock_read; 247 pv_time_ops.sched_clock = kvm_clock_read;
@@ -275,7 +276,7 @@ int __init kvm_setup_vsyscall_timeinfo(void)
275 struct pvclock_vcpu_time_info *vcpu_time; 276 struct pvclock_vcpu_time_info *vcpu_time;
276 unsigned int size; 277 unsigned int size;
277 278
278 size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS; 279 size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
279 280
280 preempt_disable(); 281 preempt_disable();
281 cpu = smp_processor_id(); 282 cpu = smp_processor_id();
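
The kvmclock hunk above computes the per-cpu pvclock allocation size once, rounded up with PAGE_ALIGN(), so that memblock_alloc(), the error-path memblock_free() and kvm_setup_vsyscall_timeinfo() all agree on the same page-granular size. A minimal illustration of that rounding, with PAGE_SIZE hard-coded for the example:

    #include <stdio.h>

    #define PAGE_SIZE   4096UL
    /* Same round-up-to-a-page-multiple arithmetic as the kernel's PAGE_ALIGN(). */
    #define PAGE_ALIGN(x)  (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

    int main(void)
    {
            printf("%lu\n", PAGE_ALIGN(4096UL));   /* 4096: already page sized */
            printf("%lu\n", PAGE_ALIGN(4097UL));   /* 8192: rounds up to 2 pages */
            return 0;
    }
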
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a27e76371108..a335cc6cde72 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -24,6 +24,7 @@
24#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include <linux/module.h> 25#include <linux/module.h>
26#include <asm/kvm_emulate.h> 26#include <asm/kvm_emulate.h>
27#include <linux/stringify.h>
27 28
28#include "x86.h" 29#include "x86.h"
29#include "tss.h" 30#include "tss.h"
@@ -43,7 +44,7 @@
43#define OpCL 9ull /* CL register (for shifts) */ 44#define OpCL 9ull /* CL register (for shifts) */
44#define OpImmByte 10ull /* 8-bit sign extended immediate */ 45#define OpImmByte 10ull /* 8-bit sign extended immediate */
45#define OpOne 11ull /* Implied 1 */ 46#define OpOne 11ull /* Implied 1 */
46#define OpImm 12ull /* Sign extended immediate */ 47#define OpImm 12ull /* Sign extended up to 32-bit immediate */
47#define OpMem16 13ull /* Memory operand (16-bit). */ 48#define OpMem16 13ull /* Memory operand (16-bit). */
48#define OpMem32 14ull /* Memory operand (32-bit). */ 49#define OpMem32 14ull /* Memory operand (32-bit). */
49#define OpImmU 15ull /* Immediate operand, zero extended */ 50#define OpImmU 15ull /* Immediate operand, zero extended */
@@ -58,6 +59,7 @@
58#define OpFS 24ull /* FS */ 59#define OpFS 24ull /* FS */
59#define OpGS 25ull /* GS */ 60#define OpGS 25ull /* GS */
60#define OpMem8 26ull /* 8-bit zero extended memory operand */ 61#define OpMem8 26ull /* 8-bit zero extended memory operand */
62#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */
61 63
62#define OpBits 5 /* Width of operand field */ 64#define OpBits 5 /* Width of operand field */
63#define OpMask ((1ull << OpBits) - 1) 65#define OpMask ((1ull << OpBits) - 1)
@@ -101,6 +103,7 @@
101#define SrcMemFAddr (OpMemFAddr << SrcShift) 103#define SrcMemFAddr (OpMemFAddr << SrcShift)
102#define SrcAcc (OpAcc << SrcShift) 104#define SrcAcc (OpAcc << SrcShift)
103#define SrcImmU16 (OpImmU16 << SrcShift) 105#define SrcImmU16 (OpImmU16 << SrcShift)
106#define SrcImm64 (OpImm64 << SrcShift)
104#define SrcDX (OpDX << SrcShift) 107#define SrcDX (OpDX << SrcShift)
105#define SrcMem8 (OpMem8 << SrcShift) 108#define SrcMem8 (OpMem8 << SrcShift)
106#define SrcMask (OpMask << SrcShift) 109#define SrcMask (OpMask << SrcShift)
@@ -113,6 +116,7 @@
113#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */ 116#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
114#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ 117#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
115#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ 118#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
119#define Escape (5<<15) /* Escape to coprocessor instruction */
116#define Sse (1<<18) /* SSE Vector instruction */ 120#define Sse (1<<18) /* SSE Vector instruction */
117/* Generic ModRM decode. */ 121/* Generic ModRM decode. */
118#define ModRM (1<<19) 122#define ModRM (1<<19)
@@ -146,6 +150,8 @@
146#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ 150#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
147#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ 151#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */
148#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ 152#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */
153#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
154#define NoWrite ((u64)1 << 45) /* No writeback */
149 155
150#define X2(x...) x, x 156#define X2(x...) x, x
151#define X3(x...) X2(x), x 157#define X3(x...) X2(x), x
@@ -156,6 +162,27 @@
156#define X8(x...) X4(x), X4(x) 162#define X8(x...) X4(x), X4(x)
157#define X16(x...) X8(x), X8(x) 163#define X16(x...) X8(x), X8(x)
158 164
165#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
166#define FASTOP_SIZE 8
167
168/*
169 * fastop functions have a special calling convention:
170 *
171 * dst: [rdx]:rax (in/out)
172 * src: rbx (in/out)
173 * src2: rcx (in)
174 * flags: rflags (in/out)
175 *
176 * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
177 * different operand sizes can be reached by calculation, rather than a jump
178 * table (which would be bigger than the code).
179 *
180 * fastop functions are declared as taking a never-defined fastop parameter,
181 * so they can't be called from C directly.
182 */
183
184struct fastop;
185
159struct opcode { 186struct opcode {
160 u64 flags : 56; 187 u64 flags : 56;
161 u64 intercept : 8; 188 u64 intercept : 8;
@@ -164,6 +191,8 @@ struct opcode {
164 const struct opcode *group; 191 const struct opcode *group;
165 const struct group_dual *gdual; 192 const struct group_dual *gdual;
166 const struct gprefix *gprefix; 193 const struct gprefix *gprefix;
194 const struct escape *esc;
195 void (*fastop)(struct fastop *fake);
167 } u; 196 } u;
168 int (*check_perm)(struct x86_emulate_ctxt *ctxt); 197 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
169}; 198};
@@ -180,6 +209,11 @@ struct gprefix {
180 struct opcode pfx_f3; 209 struct opcode pfx_f3;
181}; 210};
182 211
212struct escape {
213 struct opcode op[8];
214 struct opcode high[64];
215};
216
183/* EFLAGS bit definitions. */ 217/* EFLAGS bit definitions. */
184#define EFLG_ID (1<<21) 218#define EFLG_ID (1<<21)
185#define EFLG_VIP (1<<20) 219#define EFLG_VIP (1<<20)
@@ -407,6 +441,97 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
407 } \ 441 } \
408 } while (0) 442 } while (0)
409 443
444static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
445
446#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
447#define FOP_RET "ret \n\t"
448
449#define FOP_START(op) \
450 extern void em_##op(struct fastop *fake); \
451 asm(".pushsection .text, \"ax\" \n\t" \
452 ".global em_" #op " \n\t" \
453 FOP_ALIGN \
454 "em_" #op ": \n\t"
455
456#define FOP_END \
457 ".popsection")
458
459#define FOPNOP() FOP_ALIGN FOP_RET
460
461#define FOP1E(op, dst) \
462 FOP_ALIGN #op " %" #dst " \n\t" FOP_RET
463
464#define FASTOP1(op) \
465 FOP_START(op) \
466 FOP1E(op##b, al) \
467 FOP1E(op##w, ax) \
468 FOP1E(op##l, eax) \
469 ON64(FOP1E(op##q, rax)) \
470 FOP_END
471
472#define FOP2E(op, dst, src) \
473 FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
474
475#define FASTOP2(op) \
476 FOP_START(op) \
477 FOP2E(op##b, al, bl) \
478 FOP2E(op##w, ax, bx) \
479 FOP2E(op##l, eax, ebx) \
480 ON64(FOP2E(op##q, rax, rbx)) \
481 FOP_END
482
483/* 2 operand, word only */
484#define FASTOP2W(op) \
485 FOP_START(op) \
486 FOPNOP() \
487 FOP2E(op##w, ax, bx) \
488 FOP2E(op##l, eax, ebx) \
489 ON64(FOP2E(op##q, rax, rbx)) \
490 FOP_END
491
492/* 2 operand, src is CL */
493#define FASTOP2CL(op) \
494 FOP_START(op) \
495 FOP2E(op##b, al, cl) \
496 FOP2E(op##w, ax, cl) \
497 FOP2E(op##l, eax, cl) \
498 ON64(FOP2E(op##q, rax, cl)) \
499 FOP_END
500
501#define FOP3E(op, dst, src, src2) \
502 FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
503
504/* 3-operand, word-only, src2=cl */
505#define FASTOP3WCL(op) \
506 FOP_START(op) \
507 FOPNOP() \
508 FOP3E(op##w, ax, bx, cl) \
509 FOP3E(op##l, eax, ebx, cl) \
510 ON64(FOP3E(op##q, rax, rbx, cl)) \
511 FOP_END
512
513/* Special case for SETcc - 1 instruction per cc */
514#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
515
516FOP_START(setcc)
517FOP_SETCC(seto)
518FOP_SETCC(setno)
519FOP_SETCC(setc)
520FOP_SETCC(setnc)
521FOP_SETCC(setz)
522FOP_SETCC(setnz)
523FOP_SETCC(setbe)
524FOP_SETCC(setnbe)
525FOP_SETCC(sets)
526FOP_SETCC(setns)
527FOP_SETCC(setp)
528FOP_SETCC(setnp)
529FOP_SETCC(setl)
530FOP_SETCC(setnl)
531FOP_SETCC(setle)
532FOP_SETCC(setnle)
533FOP_END;
534
410#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ 535#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
411 do { \ 536 do { \
412 unsigned long _tmp; \ 537 unsigned long _tmp; \
@@ -663,7 +788,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
663 ulong la; 788 ulong la;
664 u32 lim; 789 u32 lim;
665 u16 sel; 790 u16 sel;
666 unsigned cpl, rpl; 791 unsigned cpl;
667 792
668 la = seg_base(ctxt, addr.seg) + addr.ea; 793 la = seg_base(ctxt, addr.seg) + addr.ea;
669 switch (ctxt->mode) { 794 switch (ctxt->mode) {
@@ -697,11 +822,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
697 goto bad; 822 goto bad;
698 } 823 }
699 cpl = ctxt->ops->cpl(ctxt); 824 cpl = ctxt->ops->cpl(ctxt);
700 if (ctxt->mode == X86EMUL_MODE_REAL)
701 rpl = 0;
702 else
703 rpl = sel & 3;
704 cpl = max(cpl, rpl);
705 if (!(desc.type & 8)) { 825 if (!(desc.type & 8)) {
706 /* data segment */ 826 /* data segment */
707 if (cpl > desc.dpl) 827 if (cpl > desc.dpl)
@@ -852,39 +972,50 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
852 return rc; 972 return rc;
853} 973}
854 974
855static int test_cc(unsigned int condition, unsigned int flags) 975FASTOP2(add);
856{ 976FASTOP2(or);
857 int rc = 0; 977FASTOP2(adc);
858 978FASTOP2(sbb);
859 switch ((condition & 15) >> 1) { 979FASTOP2(and);
860 case 0: /* o */ 980FASTOP2(sub);
861 rc |= (flags & EFLG_OF); 981FASTOP2(xor);
862 break; 982FASTOP2(cmp);
863 case 1: /* b/c/nae */ 983FASTOP2(test);
864 rc |= (flags & EFLG_CF); 984
865 break; 985FASTOP3WCL(shld);
866 case 2: /* z/e */ 986FASTOP3WCL(shrd);
867 rc |= (flags & EFLG_ZF); 987
868 break; 988FASTOP2W(imul);
869 case 3: /* be/na */ 989
870 rc |= (flags & (EFLG_CF|EFLG_ZF)); 990FASTOP1(not);
871 break; 991FASTOP1(neg);
872 case 4: /* s */ 992FASTOP1(inc);
873 rc |= (flags & EFLG_SF); 993FASTOP1(dec);
874 break; 994
875 case 5: /* p/pe */ 995FASTOP2CL(rol);
876 rc |= (flags & EFLG_PF); 996FASTOP2CL(ror);
877 break; 997FASTOP2CL(rcl);
878 case 7: /* le/ng */ 998FASTOP2CL(rcr);
879 rc |= (flags & EFLG_ZF); 999FASTOP2CL(shl);
880 /* fall through */ 1000FASTOP2CL(shr);
881 case 6: /* l/nge */ 1001FASTOP2CL(sar);
882 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); 1002
883 break; 1003FASTOP2W(bsf);
884 } 1004FASTOP2W(bsr);
885 1005FASTOP2W(bt);
886 /* Odd condition identifiers (lsb == 1) have inverted sense. */ 1006FASTOP2W(bts);
887 return (!!rc ^ (condition & 1)); 1007FASTOP2W(btr);
1008FASTOP2W(btc);
1009
1010static u8 test_cc(unsigned int condition, unsigned long flags)
1011{
1012 u8 rc;
1013 void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
1014
1015 flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
1016 asm("push %[flags]; popf; call *%[fastop]"
1017 : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags));
1018 return rc;
888} 1019}
889 1020
890static void fetch_register_operand(struct operand *op) 1021static void fetch_register_operand(struct operand *op)
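
The rewritten test_cc() above drops the old switch over condition codes and instead indexes into the em_setcc stub block emitted by FOP_SETCC(): every stub is aligned to 4 bytes and laid out in x86 condition-code order (seto, setno, setc, setnc, setz, ...), so the handler for a given condition nibble lives at em_setcc + 4 * (condition & 0xf). A small worked example of that offset calculation; the names[] table exists only to label the offsets, the real dispatch is the pointer arithmetic in test_cc():

    #include <stdio.h>

    static const char *names[16] = {
            "seto", "setno", "setc", "setnc", "setz", "setnz", "setbe", "setnbe",
            "sets", "setns", "setp", "setnp", "setl", "setnl", "setle", "setnle",
    };

    int main(void)
    {
            unsigned int condition = 0x4;              /* SETE/SETZ: taken if ZF */
            unsigned int offset = 4 * (condition & 0xf);

            printf("em_setcc+%u -> %s\n", offset, names[condition & 0xf]);
            /* prints: em_setcc+16 -> setz */
            return 0;
    }
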
@@ -994,6 +1125,53 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
994 ctxt->ops->put_fpu(ctxt); 1125 ctxt->ops->put_fpu(ctxt);
995} 1126}
996 1127
1128static int em_fninit(struct x86_emulate_ctxt *ctxt)
1129{
1130 if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
1131 return emulate_nm(ctxt);
1132
1133 ctxt->ops->get_fpu(ctxt);
1134 asm volatile("fninit");
1135 ctxt->ops->put_fpu(ctxt);
1136 return X86EMUL_CONTINUE;
1137}
1138
1139static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
1140{
1141 u16 fcw;
1142
1143 if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
1144 return emulate_nm(ctxt);
1145
1146 ctxt->ops->get_fpu(ctxt);
1147 asm volatile("fnstcw %0": "+m"(fcw));
1148 ctxt->ops->put_fpu(ctxt);
1149
1150 /* force 2 byte destination */
1151 ctxt->dst.bytes = 2;
1152 ctxt->dst.val = fcw;
1153
1154 return X86EMUL_CONTINUE;
1155}
1156
1157static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
1158{
1159 u16 fsw;
1160
1161 if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
1162 return emulate_nm(ctxt);
1163
1164 ctxt->ops->get_fpu(ctxt);
1165 asm volatile("fnstsw %0": "+m"(fsw));
1166 ctxt->ops->put_fpu(ctxt);
1167
1168 /* force 2 byte destination */
1169 ctxt->dst.bytes = 2;
1170 ctxt->dst.val = fsw;
1171
1172 return X86EMUL_CONTINUE;
1173}
1174
997static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 1175static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
998 struct operand *op) 1176 struct operand *op)
999{ 1177{
@@ -1534,6 +1712,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1534{ 1712{
1535 int rc; 1713 int rc;
1536 1714
1715 if (ctxt->d & NoWrite)
1716 return X86EMUL_CONTINUE;
1717
1537 switch (ctxt->dst.type) { 1718 switch (ctxt->dst.type) {
1538 case OP_REG: 1719 case OP_REG:
1539 write_register_operand(&ctxt->dst); 1720 write_register_operand(&ctxt->dst);
@@ -1918,47 +2099,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
1918 return X86EMUL_CONTINUE; 2099 return X86EMUL_CONTINUE;
1919} 2100}
1920 2101
1921static int em_grp2(struct x86_emulate_ctxt *ctxt)
1922{
1923 switch (ctxt->modrm_reg) {
1924 case 0: /* rol */
1925 emulate_2op_SrcB(ctxt, "rol");
1926 break;
1927 case 1: /* ror */
1928 emulate_2op_SrcB(ctxt, "ror");
1929 break;
1930 case 2: /* rcl */
1931 emulate_2op_SrcB(ctxt, "rcl");
1932 break;
1933 case 3: /* rcr */
1934 emulate_2op_SrcB(ctxt, "rcr");
1935 break;
1936 case 4: /* sal/shl */
1937 case 6: /* sal/shl */
1938 emulate_2op_SrcB(ctxt, "sal");
1939 break;
1940 case 5: /* shr */
1941 emulate_2op_SrcB(ctxt, "shr");
1942 break;
1943 case 7: /* sar */
1944 emulate_2op_SrcB(ctxt, "sar");
1945 break;
1946 }
1947 return X86EMUL_CONTINUE;
1948}
1949
1950static int em_not(struct x86_emulate_ctxt *ctxt)
1951{
1952 ctxt->dst.val = ~ctxt->dst.val;
1953 return X86EMUL_CONTINUE;
1954}
1955
1956static int em_neg(struct x86_emulate_ctxt *ctxt)
1957{
1958 emulate_1op(ctxt, "neg");
1959 return X86EMUL_CONTINUE;
1960}
1961
1962static int em_mul_ex(struct x86_emulate_ctxt *ctxt) 2102static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
1963{ 2103{
1964 u8 ex = 0; 2104 u8 ex = 0;
@@ -2000,12 +2140,6 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
2000 int rc = X86EMUL_CONTINUE; 2140 int rc = X86EMUL_CONTINUE;
2001 2141
2002 switch (ctxt->modrm_reg) { 2142 switch (ctxt->modrm_reg) {
2003 case 0: /* inc */
2004 emulate_1op(ctxt, "inc");
2005 break;
2006 case 1: /* dec */
2007 emulate_1op(ctxt, "dec");
2008 break;
2009 case 2: /* call near abs */ { 2143 case 2: /* call near abs */ {
2010 long int old_eip; 2144 long int old_eip;
2011 old_eip = ctxt->_eip; 2145 old_eip = ctxt->_eip;
@@ -2075,7 +2209,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
2075 /* Save real source value, then compare EAX against destination. */ 2209 /* Save real source value, then compare EAX against destination. */
2076 ctxt->src.orig_val = ctxt->src.val; 2210 ctxt->src.orig_val = ctxt->src.val;
2077 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX); 2211 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
2078 emulate_2op_SrcV(ctxt, "cmp"); 2212 fastop(ctxt, em_cmp);
2079 2213
2080 if (ctxt->eflags & EFLG_ZF) { 2214 if (ctxt->eflags & EFLG_ZF) {
2081 /* Success: write back to memory. */ 2215 /* Success: write back to memory. */
@@ -2843,7 +2977,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2843 ctxt->src.type = OP_IMM; 2977 ctxt->src.type = OP_IMM;
2844 ctxt->src.val = 0; 2978 ctxt->src.val = 0;
2845 ctxt->src.bytes = 1; 2979 ctxt->src.bytes = 1;
2846 emulate_2op_SrcV(ctxt, "or"); 2980 fastop(ctxt, em_or);
2847 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); 2981 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
2848 if (cf) 2982 if (cf)
2849 ctxt->eflags |= X86_EFLAGS_CF; 2983 ctxt->eflags |= X86_EFLAGS_CF;
@@ -2852,6 +2986,24 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2852 return X86EMUL_CONTINUE; 2986 return X86EMUL_CONTINUE;
2853} 2987}
2854 2988
2989static int em_aad(struct x86_emulate_ctxt *ctxt)
2990{
2991 u8 al = ctxt->dst.val & 0xff;
2992 u8 ah = (ctxt->dst.val >> 8) & 0xff;
2993
2994 al = (al + (ah * ctxt->src.val)) & 0xff;
2995
2996 ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al;
2997
2998 /* Set PF, ZF, SF */
2999 ctxt->src.type = OP_IMM;
3000 ctxt->src.val = 0;
3001 ctxt->src.bytes = 1;
3002 fastop(ctxt, em_or);
3003
3004 return X86EMUL_CONTINUE;
3005}
3006
2855static int em_call(struct x86_emulate_ctxt *ctxt) 3007static int em_call(struct x86_emulate_ctxt *ctxt)
2856{ 3008{
2857 long rel = ctxt->src.val; 3009 long rel = ctxt->src.val;
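em_aad() computes AL = (AL + AH * imm8) & 0xff, clears AH, and then reuses the em_or fastop with a zero source only to refresh PF, ZF and SF from the result. A worked example of the arithmetic with the default base (imm8 = 0x0A):

/* Unpacked BCD 3,7 in AH:AL becomes binary 37 in AL. */
u8 al = 0x07, ah = 0x03, imm = 0x0a;

al = (al + ah * imm) & 0xff;	/* 7 + 3 * 10 = 37 = 0x25 */
ah = 0;				/* so AX ends up as 0x0025 */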
@@ -2900,64 +3052,6 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2900 return X86EMUL_CONTINUE; 3052 return X86EMUL_CONTINUE;
2901} 3053}
2902 3054
2903static int em_add(struct x86_emulate_ctxt *ctxt)
2904{
2905 emulate_2op_SrcV(ctxt, "add");
2906 return X86EMUL_CONTINUE;
2907}
2908
2909static int em_or(struct x86_emulate_ctxt *ctxt)
2910{
2911 emulate_2op_SrcV(ctxt, "or");
2912 return X86EMUL_CONTINUE;
2913}
2914
2915static int em_adc(struct x86_emulate_ctxt *ctxt)
2916{
2917 emulate_2op_SrcV(ctxt, "adc");
2918 return X86EMUL_CONTINUE;
2919}
2920
2921static int em_sbb(struct x86_emulate_ctxt *ctxt)
2922{
2923 emulate_2op_SrcV(ctxt, "sbb");
2924 return X86EMUL_CONTINUE;
2925}
2926
2927static int em_and(struct x86_emulate_ctxt *ctxt)
2928{
2929 emulate_2op_SrcV(ctxt, "and");
2930 return X86EMUL_CONTINUE;
2931}
2932
2933static int em_sub(struct x86_emulate_ctxt *ctxt)
2934{
2935 emulate_2op_SrcV(ctxt, "sub");
2936 return X86EMUL_CONTINUE;
2937}
2938
2939static int em_xor(struct x86_emulate_ctxt *ctxt)
2940{
2941 emulate_2op_SrcV(ctxt, "xor");
2942 return X86EMUL_CONTINUE;
2943}
2944
2945static int em_cmp(struct x86_emulate_ctxt *ctxt)
2946{
2947 emulate_2op_SrcV(ctxt, "cmp");
2948 /* Disable writeback. */
2949 ctxt->dst.type = OP_NONE;
2950 return X86EMUL_CONTINUE;
2951}
2952
2953static int em_test(struct x86_emulate_ctxt *ctxt)
2954{
2955 emulate_2op_SrcV(ctxt, "test");
2956 /* Disable writeback. */
2957 ctxt->dst.type = OP_NONE;
2958 return X86EMUL_CONTINUE;
2959}
2960
2961static int em_xchg(struct x86_emulate_ctxt *ctxt) 3055static int em_xchg(struct x86_emulate_ctxt *ctxt)
2962{ 3056{
2963 /* Write back the register source. */ 3057 /* Write back the register source. */
@@ -2970,16 +3064,10 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt)
2970 return X86EMUL_CONTINUE; 3064 return X86EMUL_CONTINUE;
2971} 3065}
2972 3066
2973static int em_imul(struct x86_emulate_ctxt *ctxt)
2974{
2975 emulate_2op_SrcV_nobyte(ctxt, "imul");
2976 return X86EMUL_CONTINUE;
2977}
2978
2979static int em_imul_3op(struct x86_emulate_ctxt *ctxt) 3067static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
2980{ 3068{
2981 ctxt->dst.val = ctxt->src2.val; 3069 ctxt->dst.val = ctxt->src2.val;
2982 return em_imul(ctxt); 3070 return fastop(ctxt, em_imul);
2983} 3071}
2984 3072
2985static int em_cwd(struct x86_emulate_ctxt *ctxt) 3073static int em_cwd(struct x86_emulate_ctxt *ctxt)
@@ -3300,47 +3388,6 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
3300 return X86EMUL_CONTINUE; 3388 return X86EMUL_CONTINUE;
3301} 3389}
3302 3390
3303static int em_bt(struct x86_emulate_ctxt *ctxt)
3304{
3305 /* Disable writeback. */
3306 ctxt->dst.type = OP_NONE;
3307 /* only subword offset */
3308 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
3309
3310 emulate_2op_SrcV_nobyte(ctxt, "bt");
3311 return X86EMUL_CONTINUE;
3312}
3313
3314static int em_bts(struct x86_emulate_ctxt *ctxt)
3315{
3316 emulate_2op_SrcV_nobyte(ctxt, "bts");
3317 return X86EMUL_CONTINUE;
3318}
3319
3320static int em_btr(struct x86_emulate_ctxt *ctxt)
3321{
3322 emulate_2op_SrcV_nobyte(ctxt, "btr");
3323 return X86EMUL_CONTINUE;
3324}
3325
3326static int em_btc(struct x86_emulate_ctxt *ctxt)
3327{
3328 emulate_2op_SrcV_nobyte(ctxt, "btc");
3329 return X86EMUL_CONTINUE;
3330}
3331
3332static int em_bsf(struct x86_emulate_ctxt *ctxt)
3333{
3334 emulate_2op_SrcV_nobyte(ctxt, "bsf");
3335 return X86EMUL_CONTINUE;
3336}
3337
3338static int em_bsr(struct x86_emulate_ctxt *ctxt)
3339{
3340 emulate_2op_SrcV_nobyte(ctxt, "bsr");
3341 return X86EMUL_CONTINUE;
3342}
3343
3344static int em_cpuid(struct x86_emulate_ctxt *ctxt) 3391static int em_cpuid(struct x86_emulate_ctxt *ctxt)
3345{ 3392{
3346 u32 eax, ebx, ecx, edx; 3393 u32 eax, ebx, ecx, edx;
@@ -3572,7 +3619,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3572#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } 3619#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
3573#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } 3620#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
3574#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } 3621#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
3622#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
3575#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } 3623#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
3624#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
3576#define II(_f, _e, _i) \ 3625#define II(_f, _e, _i) \
3577 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } 3626 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
3578#define IIP(_f, _e, _i, _p) \ 3627#define IIP(_f, _e, _i, _p) \
@@ -3583,12 +3632,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3583#define D2bv(_f) D((_f) | ByteOp), D(_f) 3632#define D2bv(_f) D((_f) | ByteOp), D(_f)
3584#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) 3633#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
3585#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) 3634#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
3635#define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e)
3586#define I2bvIP(_f, _e, _i, _p) \ 3636#define I2bvIP(_f, _e, _i, _p) \
3587 IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p) 3637 IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
3588 3638
3589#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ 3639#define F6ALU(_f, _e) F2bv((_f) | DstMem | SrcReg | ModRM, _e), \
3590 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ 3640 F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
3591 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) 3641 F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
3592 3642
3593static const struct opcode group7_rm1[] = { 3643static const struct opcode group7_rm1[] = {
3594 DI(SrcNone | Priv, monitor), 3644 DI(SrcNone | Priv, monitor),
@@ -3614,25 +3664,36 @@ static const struct opcode group7_rm7[] = {
3614}; 3664};
3615 3665
3616static const struct opcode group1[] = { 3666static const struct opcode group1[] = {
3617 I(Lock, em_add), 3667 F(Lock, em_add),
3618 I(Lock | PageTable, em_or), 3668 F(Lock | PageTable, em_or),
3619 I(Lock, em_adc), 3669 F(Lock, em_adc),
3620 I(Lock, em_sbb), 3670 F(Lock, em_sbb),
3621 I(Lock | PageTable, em_and), 3671 F(Lock | PageTable, em_and),
3622 I(Lock, em_sub), 3672 F(Lock, em_sub),
3623 I(Lock, em_xor), 3673 F(Lock, em_xor),
3624 I(0, em_cmp), 3674 F(NoWrite, em_cmp),
3625}; 3675};
3626 3676
3627static const struct opcode group1A[] = { 3677static const struct opcode group1A[] = {
3628 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, 3678 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
3629}; 3679};
3630 3680
3681static const struct opcode group2[] = {
3682 F(DstMem | ModRM, em_rol),
3683 F(DstMem | ModRM, em_ror),
3684 F(DstMem | ModRM, em_rcl),
3685 F(DstMem | ModRM, em_rcr),
3686 F(DstMem | ModRM, em_shl),
3687 F(DstMem | ModRM, em_shr),
3688 F(DstMem | ModRM, em_shl),
3689 F(DstMem | ModRM, em_sar),
3690};
3691
3631static const struct opcode group3[] = { 3692static const struct opcode group3[] = {
3632 I(DstMem | SrcImm, em_test), 3693 F(DstMem | SrcImm | NoWrite, em_test),
3633 I(DstMem | SrcImm, em_test), 3694 F(DstMem | SrcImm | NoWrite, em_test),
3634 I(DstMem | SrcNone | Lock, em_not), 3695 F(DstMem | SrcNone | Lock, em_not),
3635 I(DstMem | SrcNone | Lock, em_neg), 3696 F(DstMem | SrcNone | Lock, em_neg),
3636 I(SrcMem, em_mul_ex), 3697 I(SrcMem, em_mul_ex),
3637 I(SrcMem, em_imul_ex), 3698 I(SrcMem, em_imul_ex),
3638 I(SrcMem, em_div_ex), 3699 I(SrcMem, em_div_ex),
@@ -3640,14 +3701,14 @@ static const struct opcode group3[] = {
3640}; 3701};
3641 3702
3642static const struct opcode group4[] = { 3703static const struct opcode group4[] = {
3643 I(ByteOp | DstMem | SrcNone | Lock, em_grp45), 3704 F(ByteOp | DstMem | SrcNone | Lock, em_inc),
3644 I(ByteOp | DstMem | SrcNone | Lock, em_grp45), 3705 F(ByteOp | DstMem | SrcNone | Lock, em_dec),
3645 N, N, N, N, N, N, 3706 N, N, N, N, N, N,
3646}; 3707};
3647 3708
3648static const struct opcode group5[] = { 3709static const struct opcode group5[] = {
3649 I(DstMem | SrcNone | Lock, em_grp45), 3710 F(DstMem | SrcNone | Lock, em_inc),
3650 I(DstMem | SrcNone | Lock, em_grp45), 3711 F(DstMem | SrcNone | Lock, em_dec),
3651 I(SrcMem | Stack, em_grp45), 3712 I(SrcMem | Stack, em_grp45),
3652 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), 3713 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
3653 I(SrcMem | Stack, em_grp45), 3714 I(SrcMem | Stack, em_grp45),
@@ -3682,10 +3743,10 @@ static const struct group_dual group7 = { {
3682 3743
3683static const struct opcode group8[] = { 3744static const struct opcode group8[] = {
3684 N, N, N, N, 3745 N, N, N, N,
3685 I(DstMem | SrcImmByte, em_bt), 3746 F(DstMem | SrcImmByte | NoWrite, em_bt),
3686 I(DstMem | SrcImmByte | Lock | PageTable, em_bts), 3747 F(DstMem | SrcImmByte | Lock | PageTable, em_bts),
3687 I(DstMem | SrcImmByte | Lock, em_btr), 3748 F(DstMem | SrcImmByte | Lock, em_btr),
3688 I(DstMem | SrcImmByte | Lock | PageTable, em_btc), 3749 F(DstMem | SrcImmByte | Lock | PageTable, em_btc),
3689}; 3750};
3690 3751
3691static const struct group_dual group9 = { { 3752static const struct group_dual group9 = { {
@@ -3707,33 +3768,96 @@ static const struct gprefix pfx_vmovntpx = {
3707 I(0, em_mov), N, N, N, 3768 I(0, em_mov), N, N, N,
3708}; 3769};
3709 3770
3771static const struct escape escape_d9 = { {
3772 N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
3773}, {
3774 /* 0xC0 - 0xC7 */
3775 N, N, N, N, N, N, N, N,
3776 /* 0xC8 - 0xCF */
3777 N, N, N, N, N, N, N, N,
 3778 /* 0xD0 - 0xD7 */
3779 N, N, N, N, N, N, N, N,
3780 /* 0xD8 - 0xDF */
3781 N, N, N, N, N, N, N, N,
3782 /* 0xE0 - 0xE7 */
3783 N, N, N, N, N, N, N, N,
3784 /* 0xE8 - 0xEF */
3785 N, N, N, N, N, N, N, N,
3786 /* 0xF0 - 0xF7 */
3787 N, N, N, N, N, N, N, N,
3788 /* 0xF8 - 0xFF */
3789 N, N, N, N, N, N, N, N,
3790} };
3791
3792static const struct escape escape_db = { {
3793 N, N, N, N, N, N, N, N,
3794}, {
3795 /* 0xC0 - 0xC7 */
3796 N, N, N, N, N, N, N, N,
3797 /* 0xC8 - 0xCF */
3798 N, N, N, N, N, N, N, N,
 3799 /* 0xD0 - 0xD7 */
3800 N, N, N, N, N, N, N, N,
3801 /* 0xD8 - 0xDF */
3802 N, N, N, N, N, N, N, N,
3803 /* 0xE0 - 0xE7 */
3804 N, N, N, I(ImplicitOps, em_fninit), N, N, N, N,
3805 /* 0xE8 - 0xEF */
3806 N, N, N, N, N, N, N, N,
3807 /* 0xF0 - 0xF7 */
3808 N, N, N, N, N, N, N, N,
3809 /* 0xF8 - 0xFF */
3810 N, N, N, N, N, N, N, N,
3811} };
3812
3813static const struct escape escape_dd = { {
3814 N, N, N, N, N, N, N, I(DstMem, em_fnstsw),
3815}, {
3816 /* 0xC0 - 0xC7 */
3817 N, N, N, N, N, N, N, N,
3818 /* 0xC8 - 0xCF */
3819 N, N, N, N, N, N, N, N,
 3820 /* 0xD0 - 0xD7 */
3821 N, N, N, N, N, N, N, N,
3822 /* 0xD8 - 0xDF */
3823 N, N, N, N, N, N, N, N,
3824 /* 0xE0 - 0xE7 */
3825 N, N, N, N, N, N, N, N,
3826 /* 0xE8 - 0xEF */
3827 N, N, N, N, N, N, N, N,
3828 /* 0xF0 - 0xF7 */
3829 N, N, N, N, N, N, N, N,
3830 /* 0xF8 - 0xFF */
3831 N, N, N, N, N, N, N, N,
3832} };
3833
3710static const struct opcode opcode_table[256] = { 3834static const struct opcode opcode_table[256] = {
3711 /* 0x00 - 0x07 */ 3835 /* 0x00 - 0x07 */
3712 I6ALU(Lock, em_add), 3836 F6ALU(Lock, em_add),
3713 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), 3837 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
3714 I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), 3838 I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
3715 /* 0x08 - 0x0F */ 3839 /* 0x08 - 0x0F */
3716 I6ALU(Lock | PageTable, em_or), 3840 F6ALU(Lock | PageTable, em_or),
3717 I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), 3841 I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
3718 N, 3842 N,
3719 /* 0x10 - 0x17 */ 3843 /* 0x10 - 0x17 */
3720 I6ALU(Lock, em_adc), 3844 F6ALU(Lock, em_adc),
3721 I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg), 3845 I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
3722 I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg), 3846 I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
3723 /* 0x18 - 0x1F */ 3847 /* 0x18 - 0x1F */
3724 I6ALU(Lock, em_sbb), 3848 F6ALU(Lock, em_sbb),
3725 I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), 3849 I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
3726 I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), 3850 I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
3727 /* 0x20 - 0x27 */ 3851 /* 0x20 - 0x27 */
3728 I6ALU(Lock | PageTable, em_and), N, N, 3852 F6ALU(Lock | PageTable, em_and), N, N,
3729 /* 0x28 - 0x2F */ 3853 /* 0x28 - 0x2F */
3730 I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), 3854 F6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
3731 /* 0x30 - 0x37 */ 3855 /* 0x30 - 0x37 */
3732 I6ALU(Lock, em_xor), N, N, 3856 F6ALU(Lock, em_xor), N, N,
3733 /* 0x38 - 0x3F */ 3857 /* 0x38 - 0x3F */
3734 I6ALU(0, em_cmp), N, N, 3858 F6ALU(NoWrite, em_cmp), N, N,
3735 /* 0x40 - 0x4F */ 3859 /* 0x40 - 0x4F */
3736 X16(D(DstReg)), 3860 X8(F(DstReg, em_inc)), X8(F(DstReg, em_dec)),
3737 /* 0x50 - 0x57 */ 3861 /* 0x50 - 0x57 */
3738 X8(I(SrcReg | Stack, em_push)), 3862 X8(I(SrcReg | Stack, em_push)),
3739 /* 0x58 - 0x5F */ 3863 /* 0x58 - 0x5F */
@@ -3757,7 +3881,7 @@ static const struct opcode opcode_table[256] = {
3757 G(DstMem | SrcImm, group1), 3881 G(DstMem | SrcImm, group1),
3758 G(ByteOp | DstMem | SrcImm | No64, group1), 3882 G(ByteOp | DstMem | SrcImm | No64, group1),
3759 G(DstMem | SrcImmByte, group1), 3883 G(DstMem | SrcImmByte, group1),
3760 I2bv(DstMem | SrcReg | ModRM, em_test), 3884 F2bv(DstMem | SrcReg | ModRM | NoWrite, em_test),
3761 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), 3885 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
3762 /* 0x88 - 0x8F */ 3886 /* 0x88 - 0x8F */
3763 I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov), 3887 I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
@@ -3777,18 +3901,18 @@ static const struct opcode opcode_table[256] = {
3777 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), 3901 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
3778 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), 3902 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
3779 I2bv(SrcSI | DstDI | Mov | String, em_mov), 3903 I2bv(SrcSI | DstDI | Mov | String, em_mov),
3780 I2bv(SrcSI | DstDI | String, em_cmp), 3904 F2bv(SrcSI | DstDI | String | NoWrite, em_cmp),
3781 /* 0xA8 - 0xAF */ 3905 /* 0xA8 - 0xAF */
3782 I2bv(DstAcc | SrcImm, em_test), 3906 F2bv(DstAcc | SrcImm | NoWrite, em_test),
3783 I2bv(SrcAcc | DstDI | Mov | String, em_mov), 3907 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
3784 I2bv(SrcSI | DstAcc | Mov | String, em_mov), 3908 I2bv(SrcSI | DstAcc | Mov | String, em_mov),
3785 I2bv(SrcAcc | DstDI | String, em_cmp), 3909 F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp),
3786 /* 0xB0 - 0xB7 */ 3910 /* 0xB0 - 0xB7 */
3787 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), 3911 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
3788 /* 0xB8 - 0xBF */ 3912 /* 0xB8 - 0xBF */
3789 X8(I(DstReg | SrcImm | Mov, em_mov)), 3913 X8(I(DstReg | SrcImm64 | Mov, em_mov)),
3790 /* 0xC0 - 0xC7 */ 3914 /* 0xC0 - 0xC7 */
3791 D2bv(DstMem | SrcImmByte | ModRM), 3915 G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
3792 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), 3916 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
3793 I(ImplicitOps | Stack, em_ret), 3917 I(ImplicitOps | Stack, em_ret),
3794 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), 3918 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
@@ -3800,10 +3924,11 @@ static const struct opcode opcode_table[256] = {
3800 D(ImplicitOps), DI(SrcImmByte, intn), 3924 D(ImplicitOps), DI(SrcImmByte, intn),
3801 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), 3925 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
3802 /* 0xD0 - 0xD7 */ 3926 /* 0xD0 - 0xD7 */
3803 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), 3927 G(Src2One | ByteOp, group2), G(Src2One, group2),
3804 N, N, N, N, 3928 G(Src2CL | ByteOp, group2), G(Src2CL, group2),
3929 N, I(DstAcc | SrcImmByte | No64, em_aad), N, N,
3805 /* 0xD8 - 0xDF */ 3930 /* 0xD8 - 0xDF */
3806 N, N, N, N, N, N, N, N, 3931 N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
3807 /* 0xE0 - 0xE7 */ 3932 /* 0xE0 - 0xE7 */
3808 X3(I(SrcImmByte, em_loop)), 3933 X3(I(SrcImmByte, em_loop)),
3809 I(SrcImmByte, em_jcxz), 3934 I(SrcImmByte, em_jcxz),
@@ -3870,28 +3995,29 @@ static const struct opcode twobyte_table[256] = {
3870 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), 3995 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
3871 /* 0xA0 - 0xA7 */ 3996 /* 0xA0 - 0xA7 */
3872 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), 3997 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
3873 II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), 3998 II(ImplicitOps, em_cpuid, cpuid),
3874 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3999 F(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt),
3875 D(DstMem | SrcReg | Src2CL | ModRM), N, N, 4000 F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld),
4001 F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
3876 /* 0xA8 - 0xAF */ 4002 /* 0xA8 - 0xAF */
3877 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), 4003 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
3878 DI(ImplicitOps, rsm), 4004 DI(ImplicitOps, rsm),
3879 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), 4005 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
3880 D(DstMem | SrcReg | Src2ImmByte | ModRM), 4006 F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
3881 D(DstMem | SrcReg | Src2CL | ModRM), 4007 F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
3882 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), 4008 D(ModRM), F(DstReg | SrcMem | ModRM, em_imul),
3883 /* 0xB0 - 0xB7 */ 4009 /* 0xB0 - 0xB7 */
3884 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), 4010 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
3885 I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), 4011 I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
3886 I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), 4012 F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
3887 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), 4013 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
3888 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), 4014 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
3889 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 4015 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3890 /* 0xB8 - 0xBF */ 4016 /* 0xB8 - 0xBF */
3891 N, N, 4017 N, N,
3892 G(BitOp, group8), 4018 G(BitOp, group8),
3893 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), 4019 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
3894 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), 4020 F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
3895 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 4021 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3896 /* 0xC0 - 0xC7 */ 4022 /* 0xC0 - 0xC7 */
3897 D2bv(DstMem | SrcReg | ModRM | Lock), 4023 D2bv(DstMem | SrcReg | ModRM | Lock),
@@ -3950,6 +4076,9 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
3950 case 4: 4076 case 4:
3951 op->val = insn_fetch(s32, ctxt); 4077 op->val = insn_fetch(s32, ctxt);
3952 break; 4078 break;
4079 case 8:
4080 op->val = insn_fetch(s64, ctxt);
4081 break;
3953 } 4082 }
3954 if (!sign_extension) { 4083 if (!sign_extension) {
3955 switch (op->bytes) { 4084 switch (op->bytes) {
@@ -4028,6 +4157,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
4028 case OpImm: 4157 case OpImm:
4029 rc = decode_imm(ctxt, op, imm_size(ctxt), true); 4158 rc = decode_imm(ctxt, op, imm_size(ctxt), true);
4030 break; 4159 break;
4160 case OpImm64:
4161 rc = decode_imm(ctxt, op, ctxt->op_bytes, true);
4162 break;
4031 case OpMem8: 4163 case OpMem8:
4032 ctxt->memop.bytes = 1; 4164 ctxt->memop.bytes = 1;
4033 goto mem_common; 4165 goto mem_common;
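OpImm64 and the new 8-byte case in decode_imm() exist for the one instruction family that really carries a 64-bit immediate: mov r64, imm64 (opcodes B8+r with REX.W), which the main table now decodes as SrcImm64 instead of SrcImm. A worked encoding example (assumed instruction bytes, shown for illustration):

/*
 *   48 b8 88 77 66 55 44 33 22 11    movabs $0x1122334455667788, %rax
 *
 * With op_bytes == 8 the SrcImm64 path fetches all eight immediate
 * bytes; the generic OpImm path would have stopped after four and
 * sign-extended them.
 */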
@@ -4222,6 +4354,12 @@ done_prefixes:
4222 case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; 4354 case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
4223 } 4355 }
4224 break; 4356 break;
4357 case Escape:
4358 if (ctxt->modrm > 0xbf)
4359 opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
4360 else
4361 opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
4362 break;
4225 default: 4363 default:
4226 return EMULATION_FAILED; 4364 return EMULATION_FAILED;
4227 } 4365 }
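The new Escape case routes the x87 opcodes 0xD8-0xDF through the escape tables above: a memory-form ModRM byte (up to 0xbf) is indexed by its /reg field into the 8-entry op[] array, while a register-form byte (0xc0 and above) indexes the 64-entry high[] array directly. The same selection as a stand-alone sketch (hypothetical helper, not in the patch):

static struct opcode pick_escape(const struct escape *esc, u8 modrm)
{
	if (modrm > 0xbf)			/* register form, e.g. D9 E8 */
		return esc->high[modrm - 0xc0];
	return esc->op[(modrm >> 3) & 7];	/* memory form: use the /reg field */
}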
@@ -4354,6 +4492,16 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
4354 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); 4492 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
4355} 4493}
4356 4494
4495static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
4496{
4497 ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
4498 fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
4499 asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
4500 : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags)
4501 : "c"(ctxt->src2.val), [fastop]"S"(fop));
4502 ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
4503 return X86EMUL_CONTINUE;
4504}
4357 4505
4358int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) 4506int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4359{ 4507{
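fastop() is the runtime half of the Fastop flag used throughout the new tables: a fastop "handler" is not a C function but a block of size-specific assembly stubs spaced FASTOP_SIZE bytes apart. fastop() selects the stub for the destination width with __ffs(dst.bytes), puts the operands where the stubs expect them (dst in RAX, src in RBX, src2 in RCX), loads the guest's arithmetic flags, calls the stub, and merges the resulting flags back into ctxt->eflags. A rough hand-written sketch of one such stub block, assuming 8-byte slots and a 64-bit build (the real stubs are generated by FASTOP-style macros that this hunk does not show):

asm(".pushsection .text, \"ax\"\n"
    ".align 8\n"
    "em_add_sketch:\n"
    "addb %bl, %al; ret\n"	/* slot 0: byte operands,  __ffs(1) == 0 */
    ".align 8\n"
    "addw %bx, %ax; ret\n"	/* slot 1: word operands,  __ffs(2) == 1 */
    ".align 8\n"
    "addl %ebx, %eax; ret\n"	/* slot 2: dword operands, __ffs(4) == 2 */
    ".align 8\n"
    "addq %rbx, %rax; ret\n"	/* slot 3: qword operands, __ffs(8) == 3 */
    ".popsection");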
@@ -4483,6 +4631,13 @@ special_insn:
4483 } 4631 }
4484 4632
4485 if (ctxt->execute) { 4633 if (ctxt->execute) {
4634 if (ctxt->d & Fastop) {
4635 void (*fop)(struct fastop *) = (void *)ctxt->execute;
4636 rc = fastop(ctxt, fop);
4637 if (rc != X86EMUL_CONTINUE)
4638 goto done;
4639 goto writeback;
4640 }
4486 rc = ctxt->execute(ctxt); 4641 rc = ctxt->execute(ctxt);
4487 if (rc != X86EMUL_CONTINUE) 4642 if (rc != X86EMUL_CONTINUE)
4488 goto done; 4643 goto done;
@@ -4493,12 +4648,6 @@ special_insn:
4493 goto twobyte_insn; 4648 goto twobyte_insn;
4494 4649
4495 switch (ctxt->b) { 4650 switch (ctxt->b) {
4496 case 0x40 ... 0x47: /* inc r16/r32 */
4497 emulate_1op(ctxt, "inc");
4498 break;
4499 case 0x48 ... 0x4f: /* dec r16/r32 */
4500 emulate_1op(ctxt, "dec");
4501 break;
4502 case 0x63: /* movsxd */ 4651 case 0x63: /* movsxd */
4503 if (ctxt->mode != X86EMUL_MODE_PROT64) 4652 if (ctxt->mode != X86EMUL_MODE_PROT64)
4504 goto cannot_emulate; 4653 goto cannot_emulate;
@@ -4523,9 +4672,6 @@ special_insn:
4523 case 8: ctxt->dst.val = (s32)ctxt->dst.val; break; 4672 case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
4524 } 4673 }
4525 break; 4674 break;
4526 case 0xc0 ... 0xc1:
4527 rc = em_grp2(ctxt);
4528 break;
4529 case 0xcc: /* int3 */ 4675 case 0xcc: /* int3 */
4530 rc = emulate_int(ctxt, 3); 4676 rc = emulate_int(ctxt, 3);
4531 break; 4677 break;
@@ -4536,13 +4682,6 @@ special_insn:
4536 if (ctxt->eflags & EFLG_OF) 4682 if (ctxt->eflags & EFLG_OF)
4537 rc = emulate_int(ctxt, 4); 4683 rc = emulate_int(ctxt, 4);
4538 break; 4684 break;
4539 case 0xd0 ... 0xd1: /* Grp2 */
4540 rc = em_grp2(ctxt);
4541 break;
4542 case 0xd2 ... 0xd3: /* Grp2 */
4543 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX);
4544 rc = em_grp2(ctxt);
4545 break;
4546 case 0xe9: /* jmp rel */ 4685 case 0xe9: /* jmp rel */
4547 case 0xeb: /* jmp rel short */ 4686 case 0xeb: /* jmp rel short */
4548 jmp_rel(ctxt, ctxt->src.val); 4687 jmp_rel(ctxt, ctxt->src.val);
@@ -4661,14 +4800,6 @@ twobyte_insn:
4661 case 0x90 ... 0x9f: /* setcc r/m8 */ 4800 case 0x90 ... 0x9f: /* setcc r/m8 */
4662 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); 4801 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
4663 break; 4802 break;
4664 case 0xa4: /* shld imm8, r, r/m */
4665 case 0xa5: /* shld cl, r, r/m */
4666 emulate_2op_cl(ctxt, "shld");
4667 break;
4668 case 0xac: /* shrd imm8, r, r/m */
4669 case 0xad: /* shrd cl, r, r/m */
4670 emulate_2op_cl(ctxt, "shrd");
4671 break;
4672 case 0xae: /* clflush */ 4803 case 0xae: /* clflush */
4673 break; 4804 break;
4674 case 0xb6 ... 0xb7: /* movzx */ 4805 case 0xb6 ... 0xb7: /* movzx */
@@ -4682,7 +4813,7 @@ twobyte_insn:
4682 (s16) ctxt->src.val; 4813 (s16) ctxt->src.val;
4683 break; 4814 break;
4684 case 0xc0 ... 0xc1: /* xadd */ 4815 case 0xc0 ... 0xc1: /* xadd */
4685 emulate_2op_SrcV(ctxt, "add"); 4816 fastop(ctxt, em_add);
4686 /* Write back the register source. */ 4817 /* Write back the register source. */
4687 ctxt->src.val = ctxt->dst.orig_val; 4818 ctxt->src.val = ctxt->dst.orig_val;
4688 write_register_operand(&ctxt->src); 4819 write_register_operand(&ctxt->src);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 11300d2fa714..c1d30b2fc9bb 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -122,7 +122,6 @@ static s64 __kpit_elapsed(struct kvm *kvm)
122 */ 122 */
123 remaining = hrtimer_get_remaining(&ps->timer); 123 remaining = hrtimer_get_remaining(&ps->timer);
124 elapsed = ps->period - ktime_to_ns(remaining); 124 elapsed = ps->period - ktime_to_ns(remaining);
125 elapsed = mod_64(elapsed, ps->period);
126 125
127 return elapsed; 126 return elapsed;
128} 127}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 848206df0967..cc31f7c06d3d 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -241,6 +241,8 @@ int kvm_pic_read_irq(struct kvm *kvm)
241 int irq, irq2, intno; 241 int irq, irq2, intno;
242 struct kvm_pic *s = pic_irqchip(kvm); 242 struct kvm_pic *s = pic_irqchip(kvm);
243 243
244 s->output = 0;
245
244 pic_lock(s); 246 pic_lock(s);
245 irq = pic_get_irq(&s->pics[0]); 247 irq = pic_get_irq(&s->pics[0]);
246 if (irq >= 0) { 248 if (irq >= 0) {
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 7e06ba1618bd..484bc874688b 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -38,49 +38,81 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
38EXPORT_SYMBOL(kvm_cpu_has_pending_timer); 38EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
39 39
40/* 40/*
41 * check if there is pending interrupt from
42 * non-APIC source without intack.
43 */
44static int kvm_cpu_has_extint(struct kvm_vcpu *v)
45{
46 if (kvm_apic_accept_pic_intr(v))
47 return pic_irqchip(v->kvm)->output; /* PIC */
48 else
49 return 0;
50}
51
52/*
53 * check if there is injectable interrupt:
54 * when virtual interrupt delivery enabled,
55 * interrupt from apic will handled by hardware,
56 * we don't need to check it here.
57 */
58int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
59{
60 if (!irqchip_in_kernel(v->kvm))
61 return v->arch.interrupt.pending;
62
63 if (kvm_cpu_has_extint(v))
64 return 1;
65
66 if (kvm_apic_vid_enabled(v->kvm))
67 return 0;
68
69 return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
70}
71
72/*
41 * check if there is pending interrupt without 73 * check if there is pending interrupt without
42 * intack. 74 * intack.
43 */ 75 */
44int kvm_cpu_has_interrupt(struct kvm_vcpu *v) 76int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
45{ 77{
46 struct kvm_pic *s;
47
48 if (!irqchip_in_kernel(v->kvm)) 78 if (!irqchip_in_kernel(v->kvm))
49 return v->arch.interrupt.pending; 79 return v->arch.interrupt.pending;
50 80
51 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ 81 if (kvm_cpu_has_extint(v))
52 if (kvm_apic_accept_pic_intr(v)) { 82 return 1;
53 s = pic_irqchip(v->kvm); /* PIC */ 83
54 return s->output; 84 return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
55 } else
56 return 0;
57 }
58 return 1;
59} 85}
60EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); 86EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
61 87
62/* 88/*
 89 * Read pending interrupt (from non-APIC source)
90 * vector and intack.
91 */
92static int kvm_cpu_get_extint(struct kvm_vcpu *v)
93{
94 if (kvm_cpu_has_extint(v))
95 return kvm_pic_read_irq(v->kvm); /* PIC */
96 return -1;
97}
98
99/*
63 * Read pending interrupt vector and intack. 100 * Read pending interrupt vector and intack.
64 */ 101 */
65int kvm_cpu_get_interrupt(struct kvm_vcpu *v) 102int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
66{ 103{
67 struct kvm_pic *s;
68 int vector; 104 int vector;
69 105
70 if (!irqchip_in_kernel(v->kvm)) 106 if (!irqchip_in_kernel(v->kvm))
71 return v->arch.interrupt.nr; 107 return v->arch.interrupt.nr;
72 108
73 vector = kvm_get_apic_interrupt(v); /* APIC */ 109 vector = kvm_cpu_get_extint(v);
74 if (vector == -1) { 110
75 if (kvm_apic_accept_pic_intr(v)) { 111 if (kvm_apic_vid_enabled(v->kvm) || vector != -1)
76 s = pic_irqchip(v->kvm); 112 return vector; /* PIC */
77 s->output = 0; /* PIC */ 113
78 vector = kvm_pic_read_irq(v->kvm); 114 return kvm_get_apic_interrupt(v); /* APIC */
79 }
80 }
81 return vector;
82} 115}
83EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
84 116
85void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 117void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
86{ 118{
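The irq.c rework splits the PIC path into kvm_cpu_has_extint()/kvm_cpu_get_extint() and adds kvm_cpu_has_injectable_intr() for APICv: once virtual interrupt delivery is enabled, LAPIC interrupts are delivered by hardware, so software injection only has to consider ExtINT (PIC) sources. A summary of the resulting behaviour (illustrative comment, not code from the patch):

/*
 *   interrupt source    kvm_cpu_has_interrupt()   kvm_cpu_has_injectable_intr()
 *                                                 (virtual intr delivery on)
 *   userspace-injected        pending                   pending
 *   PIC ExtINT                yes                       yes (KVM injects)
 *   in-kernel LAPIC           yes                       no  (hardware injects)
 */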
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9392f527f107..02b51dd4e4ad 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -140,31 +140,56 @@ static inline int apic_enabled(struct kvm_lapic *apic)
140 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ 140 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
141 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) 141 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
142 142
143static inline int apic_x2apic_mode(struct kvm_lapic *apic)
144{
145 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
146}
147
148static inline int kvm_apic_id(struct kvm_lapic *apic) 143static inline int kvm_apic_id(struct kvm_lapic *apic)
149{ 144{
150 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; 145 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
151} 146}
152 147
153static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) 148void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
149 struct kvm_lapic_irq *irq,
150 u64 *eoi_exit_bitmap)
154{ 151{
155 u16 cid; 152 struct kvm_lapic **dst;
156 ldr >>= 32 - map->ldr_bits; 153 struct kvm_apic_map *map;
157 cid = (ldr >> map->cid_shift) & map->cid_mask; 154 unsigned long bitmap = 1;
155 int i;
158 156
159 BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); 157 rcu_read_lock();
158 map = rcu_dereference(vcpu->kvm->arch.apic_map);
160 159
161 return cid; 160 if (unlikely(!map)) {
162} 161 __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap);
162 goto out;
163 }
163 164
164static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) 165 if (irq->dest_mode == 0) { /* physical mode */
165{ 166 if (irq->delivery_mode == APIC_DM_LOWEST ||
166 ldr >>= (32 - map->ldr_bits); 167 irq->dest_id == 0xff) {
167 return ldr & map->lid_mask; 168 __set_bit(irq->vector,
169 (unsigned long *)eoi_exit_bitmap);
170 goto out;
171 }
172 dst = &map->phys_map[irq->dest_id & 0xff];
173 } else {
174 u32 mda = irq->dest_id << (32 - map->ldr_bits);
175
176 dst = map->logical_map[apic_cluster_id(map, mda)];
177
178 bitmap = apic_logical_id(map, mda);
179 }
180
181 for_each_set_bit(i, &bitmap, 16) {
182 if (!dst[i])
183 continue;
184 if (dst[i]->vcpu == vcpu) {
185 __set_bit(irq->vector,
186 (unsigned long *)eoi_exit_bitmap);
187 break;
188 }
189 }
190
191out:
192 rcu_read_unlock();
168} 193}
169 194
170static void recalculate_apic_map(struct kvm *kvm) 195static void recalculate_apic_map(struct kvm *kvm)
@@ -230,6 +255,8 @@ out:
230 255
231 if (old) 256 if (old)
232 kfree_rcu(old, rcu); 257 kfree_rcu(old, rcu);
258
259 kvm_ioapic_make_eoibitmap_request(kvm);
233} 260}
234 261
235static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) 262static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
@@ -345,6 +372,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
345{ 372{
346 int result; 373 int result;
347 374
375 /*
 376 * Note that irr_pending is just a hint. It will always be
 377 * true when virtual interrupt delivery is enabled.
378 */
348 if (!apic->irr_pending) 379 if (!apic->irr_pending)
349 return -1; 380 return -1;
350 381
@@ -461,6 +492,8 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
461static inline int apic_find_highest_isr(struct kvm_lapic *apic) 492static inline int apic_find_highest_isr(struct kvm_lapic *apic)
462{ 493{
463 int result; 494 int result;
495
496 /* Note that isr_count is always 1 with vid enabled */
464 if (!apic->isr_count) 497 if (!apic->isr_count)
465 return -1; 498 return -1;
466 if (likely(apic->highest_isr_cache != -1)) 499 if (likely(apic->highest_isr_cache != -1))
@@ -740,6 +773,19 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
740 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; 773 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
741} 774}
742 775
776static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
777{
778 if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
779 kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
780 int trigger_mode;
781 if (apic_test_vector(vector, apic->regs + APIC_TMR))
782 trigger_mode = IOAPIC_LEVEL_TRIG;
783 else
784 trigger_mode = IOAPIC_EDGE_TRIG;
785 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
786 }
787}
788
743static int apic_set_eoi(struct kvm_lapic *apic) 789static int apic_set_eoi(struct kvm_lapic *apic)
744{ 790{
745 int vector = apic_find_highest_isr(apic); 791 int vector = apic_find_highest_isr(apic);
@@ -756,19 +802,26 @@ static int apic_set_eoi(struct kvm_lapic *apic)
756 apic_clear_isr(vector, apic); 802 apic_clear_isr(vector, apic);
757 apic_update_ppr(apic); 803 apic_update_ppr(apic);
758 804
759 if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && 805 kvm_ioapic_send_eoi(apic, vector);
760 kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
761 int trigger_mode;
762 if (apic_test_vector(vector, apic->regs + APIC_TMR))
763 trigger_mode = IOAPIC_LEVEL_TRIG;
764 else
765 trigger_mode = IOAPIC_EDGE_TRIG;
766 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
767 }
768 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 806 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
769 return vector; 807 return vector;
770} 808}
771 809
810/*
811 * this interface assumes a trap-like exit, which has already finished
812 * desired side effect including vISR and vPPR update.
813 */
814void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
815{
816 struct kvm_lapic *apic = vcpu->arch.apic;
817
818 trace_kvm_eoi(apic, vector);
819
820 kvm_ioapic_send_eoi(apic, vector);
821 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
822}
823EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
824
772static void apic_send_ipi(struct kvm_lapic *apic) 825static void apic_send_ipi(struct kvm_lapic *apic)
773{ 826{
774 u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR); 827 u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
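kvm_apic_set_eoi_accelerated() is the trap-style twin of apic_set_eoi(): by the time the exit is taken, hardware has already updated vISR and vPPR, so KVM only forwards the EOI to the in-kernel IOAPIC (for level-triggered vectors) and requests an event check. Roughly how a VMX "EOI induced" exit handler would call it, under the assumption that the vector sits in the low byte of the exit qualification (sketch only; the VMX side is not in this hunk):

static int handle_apic_eoi_induced_sketch(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	int vector = exit_qualification & 0xff;		/* assumed layout */

	kvm_apic_set_eoi_accelerated(vcpu, vector);
	return 1;	/* exit handled, resume the guest */
}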
@@ -1212,6 +1265,21 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
1212} 1265}
1213EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); 1266EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
1214 1267
1268/* emulate an APIC access in a trap-like manner */
1269void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
1270{
1271 u32 val = 0;
1272
1273 /* hw has done the conditional check and inst decode */
1274 offset &= 0xff0;
1275
1276 apic_reg_read(vcpu->arch.apic, offset, 4, &val);
1277
1278 /* TODO: optimize to just emulate side effect w/o one more write */
1279 apic_reg_write(vcpu->arch.apic, offset, val);
1280}
1281EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
1282
1215void kvm_free_lapic(struct kvm_vcpu *vcpu) 1283void kvm_free_lapic(struct kvm_vcpu *vcpu)
1216{ 1284{
1217 struct kvm_lapic *apic = vcpu->arch.apic; 1285 struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1288,6 +1356,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
1288 1356
1289void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) 1357void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1290{ 1358{
1359 u64 old_value = vcpu->arch.apic_base;
1291 struct kvm_lapic *apic = vcpu->arch.apic; 1360 struct kvm_lapic *apic = vcpu->arch.apic;
1292 1361
1293 if (!apic) { 1362 if (!apic) {
@@ -1309,11 +1378,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1309 value &= ~MSR_IA32_APICBASE_BSP; 1378 value &= ~MSR_IA32_APICBASE_BSP;
1310 1379
1311 vcpu->arch.apic_base = value; 1380 vcpu->arch.apic_base = value;
1312 if (apic_x2apic_mode(apic)) { 1381 if ((old_value ^ value) & X2APIC_ENABLE) {
1313 u32 id = kvm_apic_id(apic); 1382 if (value & X2APIC_ENABLE) {
1314 u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); 1383 u32 id = kvm_apic_id(apic);
1315 kvm_apic_set_ldr(apic, ldr); 1384 u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
1385 kvm_apic_set_ldr(apic, ldr);
1386 kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
1387 } else
1388 kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
1316 } 1389 }
1390
1317 apic->base_address = apic->vcpu->arch.apic_base & 1391 apic->base_address = apic->vcpu->arch.apic_base &
1318 MSR_IA32_APICBASE_BASE; 1392 MSR_IA32_APICBASE_BASE;
1319 1393
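kvm_lapic_set_base() now reacts only when the X2APIC_ENABLE bit actually changes and notifies the vendor module through set_virtual_x2apic_mode(), so APIC register virtualization can be reconfigured for MSR-based x2APIC access. The XOR test is the usual edge-detect idiom; spelled out as a tiny helper (illustration only):

static bool x2apic_mode_changed(u64 old_base, u64 new_base)
{
	/* true only when X2APIC_ENABLE flips, in either direction */
	return (old_base ^ new_base) & X2APIC_ENABLE;
}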
@@ -1359,8 +1433,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1359 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); 1433 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
1360 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 1434 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
1361 } 1435 }
1362 apic->irr_pending = false; 1436 apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
1363 apic->isr_count = 0; 1437 apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm);
1364 apic->highest_isr_cache = -1; 1438 apic->highest_isr_cache = -1;
1365 update_divide_count(apic); 1439 update_divide_count(apic);
1366 atomic_set(&apic->lapic_timer.pending, 0); 1440 atomic_set(&apic->lapic_timer.pending, 0);
@@ -1575,8 +1649,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
1575 update_divide_count(apic); 1649 update_divide_count(apic);
1576 start_apic_timer(apic); 1650 start_apic_timer(apic);
1577 apic->irr_pending = true; 1651 apic->irr_pending = true;
1578 apic->isr_count = count_vectors(apic->regs + APIC_ISR); 1652 apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ?
1653 1 : count_vectors(apic->regs + APIC_ISR);
1579 apic->highest_isr_cache = -1; 1654 apic->highest_isr_cache = -1;
1655 kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
1580 kvm_make_request(KVM_REQ_EVENT, vcpu); 1656 kvm_make_request(KVM_REQ_EVENT, vcpu);
1581} 1657}
1582 1658
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e5ebf9f3571f..1676d34ddb4e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -64,6 +64,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
64u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); 64u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
65void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); 65void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
66 66
67void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
68void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
69
67void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); 70void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
68void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); 71void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
69void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); 72void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
@@ -124,4 +127,35 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
124 return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic); 127 return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
125} 128}
126 129
130static inline int apic_x2apic_mode(struct kvm_lapic *apic)
131{
132 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
133}
134
135static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
136{
137 return kvm_x86_ops->vm_has_apicv(kvm);
138}
139
140static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
141{
142 u16 cid;
143 ldr >>= 32 - map->ldr_bits;
144 cid = (ldr >> map->cid_shift) & map->cid_mask;
145
146 BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
147
148 return cid;
149}
150
151static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
152{
153 ldr >>= (32 - map->ldr_bits);
154 return ldr & map->lid_mask;
155}
156
157void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
158 struct kvm_lapic_irq *irq,
159 u64 *eoi_bitmap);
160
127#endif 161#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01d7c2ad05f5..4ed3edbe06bd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
448 448
449static bool spte_is_locklessly_modifiable(u64 spte) 449static bool spte_is_locklessly_modifiable(u64 spte)
450{ 450{
451 return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); 451 return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
452 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
452} 453}
453 454
454static bool spte_has_volatile_bits(u64 spte) 455static bool spte_has_volatile_bits(u64 spte)
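The spte_is_locklessly_modifiable() change is purely cosmetic: both forms require every bit of the mask to be set, but the new one states directly that the SPTE must be both host-writable and MMU-writable. The equivalence, written out with the same SPTE_* masks (illustration only):

#define LOCKLESS_MASK	(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)	/* local name for the sketch */

static bool old_form(u64 spte)
{
	return !(~spte & LOCKLESS_MASK);		/* no mask bit may be missing */
}

static bool new_form(u64 spte)
{
	return (spte & LOCKLESS_MASK) == LOCKLESS_MASK;	/* all mask bits set */
}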
@@ -831,8 +832,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
831 if (host_level == PT_PAGE_TABLE_LEVEL) 832 if (host_level == PT_PAGE_TABLE_LEVEL)
832 return host_level; 833 return host_level;
833 834
834 max_level = kvm_x86_ops->get_lpage_level() < host_level ? 835 max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
835 kvm_x86_ops->get_lpage_level() : host_level;
836 836
837 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) 837 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
838 if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) 838 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
@@ -1142,7 +1142,7 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
1142} 1142}
1143 1143
1144static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, 1144static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1145 int level, bool pt_protect) 1145 bool pt_protect)
1146{ 1146{
1147 u64 *sptep; 1147 u64 *sptep;
1148 struct rmap_iterator iter; 1148 struct rmap_iterator iter;
@@ -1180,7 +1180,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1180 while (mask) { 1180 while (mask) {
1181 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), 1181 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1182 PT_PAGE_TABLE_LEVEL, slot); 1182 PT_PAGE_TABLE_LEVEL, slot);
1183 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); 1183 __rmap_write_protect(kvm, rmapp, false);
1184 1184
1185 /* clear the first set bit */ 1185 /* clear the first set bit */
1186 mask &= mask - 1; 1186 mask &= mask - 1;
@@ -1199,7 +1199,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
1199 for (i = PT_PAGE_TABLE_LEVEL; 1199 for (i = PT_PAGE_TABLE_LEVEL;
1200 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 1200 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
1201 rmapp = __gfn_to_rmap(gfn, i, slot); 1201 rmapp = __gfn_to_rmap(gfn, i, slot);
1202 write_protected |= __rmap_write_protect(kvm, rmapp, i, true); 1202 write_protected |= __rmap_write_protect(kvm, rmapp, true);
1203 } 1203 }
1204 1204
1205 return write_protected; 1205 return write_protected;
@@ -1460,28 +1460,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1460 percpu_counter_add(&kvm_total_used_mmu_pages, nr); 1460 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1461} 1461}
1462 1462
1463/* 1463static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1464 * Remove the sp from shadow page cache, after call it,
1465 * we can not find this sp from the cache, and the shadow
1466 * page table is still valid.
1467 * It should be under the protection of mmu lock.
1468 */
1469static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
1470{ 1464{
1471 ASSERT(is_empty_shadow_page(sp->spt)); 1465 ASSERT(is_empty_shadow_page(sp->spt));
1472 hlist_del(&sp->hash_link); 1466 hlist_del(&sp->hash_link);
1473 if (!sp->role.direct)
1474 free_page((unsigned long)sp->gfns);
1475}
1476
1477/*
1478 * Free the shadow page table and the sp, we can do it
1479 * out of the protection of mmu lock.
1480 */
1481static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1482{
1483 list_del(&sp->link); 1467 list_del(&sp->link);
1484 free_page((unsigned long)sp->spt); 1468 free_page((unsigned long)sp->spt);
1469 if (!sp->role.direct)
1470 free_page((unsigned long)sp->gfns);
1485 kmem_cache_free(mmu_page_header_cache, sp); 1471 kmem_cache_free(mmu_page_header_cache, sp);
1486} 1472}
1487 1473
@@ -1522,7 +1508,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1522 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); 1508 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1523 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 1509 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1524 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 1510 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1525 bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
1526 sp->parent_ptes = 0; 1511 sp->parent_ptes = 0;
1527 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1512 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1528 kvm_mod_used_mmu_pages(vcpu->kvm, +1); 1513 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
@@ -1973,9 +1958,9 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1973{ 1958{
1974 u64 spte; 1959 u64 spte;
1975 1960
1976 spte = __pa(sp->spt) 1961 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
1977 | PT_PRESENT_MASK | PT_ACCESSED_MASK 1962 shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
1978 | PT_WRITABLE_MASK | PT_USER_MASK; 1963
1979 mmu_spte_set(sptep, spte); 1964 mmu_spte_set(sptep, spte);
1980} 1965}
1981 1966
@@ -2126,7 +2111,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2126 do { 2111 do {
2127 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 2112 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
2128 WARN_ON(!sp->role.invalid || sp->root_count); 2113 WARN_ON(!sp->role.invalid || sp->root_count);
2129 kvm_mmu_isolate_page(sp);
2130 kvm_mmu_free_page(sp); 2114 kvm_mmu_free_page(sp);
2131 } while (!list_empty(invalid_list)); 2115 } while (!list_empty(invalid_list));
2132} 2116}
@@ -2144,6 +2128,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
2144 * change the value 2128 * change the value
2145 */ 2129 */
2146 2130
2131 spin_lock(&kvm->mmu_lock);
2132
2147 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { 2133 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2148 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && 2134 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
2149 !list_empty(&kvm->arch.active_mmu_pages)) { 2135 !list_empty(&kvm->arch.active_mmu_pages)) {
@@ -2158,6 +2144,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
2158 } 2144 }
2159 2145
2160 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; 2146 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2147
2148 spin_unlock(&kvm->mmu_lock);
2161} 2149}
2162 2150
2163int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 2151int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -2183,14 +2171,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2183} 2171}
2184EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); 2172EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2185 2173
2186static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
2187{
2188 int slot = memslot_id(kvm, gfn);
2189 struct kvm_mmu_page *sp = page_header(__pa(pte));
2190
2191 __set_bit(slot, sp->slot_bitmap);
2192}
2193
2194/* 2174/*
2195 * The function is based on mtrr_type_lookup() in 2175 * The function is based on mtrr_type_lookup() in
2196 * arch/x86/kernel/cpu/mtrr/generic.c 2176 * arch/x86/kernel/cpu/mtrr/generic.c
@@ -2332,9 +2312,8 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2332 if (s->role.level != PT_PAGE_TABLE_LEVEL) 2312 if (s->role.level != PT_PAGE_TABLE_LEVEL)
2333 return 1; 2313 return 1;
2334 2314
2335 if (!need_unsync && !s->unsync) { 2315 if (!s->unsync)
2336 need_unsync = true; 2316 need_unsync = true;
2337 }
2338 } 2317 }
2339 if (need_unsync) 2318 if (need_unsync)
2340 kvm_unsync_pages(vcpu, gfn); 2319 kvm_unsync_pages(vcpu, gfn);
@@ -2342,8 +2321,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2342} 2321}
2343 2322
2344static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2323static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2345 unsigned pte_access, int user_fault, 2324 unsigned pte_access, int level,
2346 int write_fault, int level,
2347 gfn_t gfn, pfn_t pfn, bool speculative, 2325 gfn_t gfn, pfn_t pfn, bool speculative,
2348 bool can_unsync, bool host_writable) 2326 bool can_unsync, bool host_writable)
2349{ 2327{
@@ -2378,20 +2356,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2378 2356
2379 spte |= (u64)pfn << PAGE_SHIFT; 2357 spte |= (u64)pfn << PAGE_SHIFT;
2380 2358
2381 if ((pte_access & ACC_WRITE_MASK) 2359 if (pte_access & ACC_WRITE_MASK) {
2382 || (!vcpu->arch.mmu.direct_map && write_fault
2383 && !is_write_protection(vcpu) && !user_fault)) {
2384 2360
2385 /* 2361 /*
2386 * There are two cases: 2362 * Other vcpu creates new sp in the window between
2387 * - the one is other vcpu creates new sp in the window 2363 * mapping_level() and acquiring mmu-lock. We can
2388 * between mapping_level() and acquiring mmu-lock. 2364 * allow the guest to retry the access; the mapping can
2389 * - the another case is the new sp is created by itself 2365 * be fixed when the guest refaults.
2390 * (page-fault path) when guest uses the target gfn as
2391 * its page table.
2392 * Both of these cases can be fixed by allowing guest to
2393 * retry the access, it will refault, then we can establish
2394 * the mapping by using small page.
2395 */ 2366 */
2396 if (level > PT_PAGE_TABLE_LEVEL && 2367 if (level > PT_PAGE_TABLE_LEVEL &&
2397 has_wrprotected_page(vcpu->kvm, gfn, level)) 2368 has_wrprotected_page(vcpu->kvm, gfn, level))
@@ -2399,19 +2370,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2399 2370
2400 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; 2371 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
2401 2372
2402 if (!vcpu->arch.mmu.direct_map
2403 && !(pte_access & ACC_WRITE_MASK)) {
2404 spte &= ~PT_USER_MASK;
2405 /*
2406 * If we converted a user page to a kernel page,
2407 * so that the kernel can write to it when cr0.wp=0,
2408 * then we should prevent the kernel from executing it
2409 * if SMEP is enabled.
2410 */
2411 if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
2412 spte |= PT64_NX_MASK;
2413 }
2414
2415 /* 2373 /*
2416 * Optimization: for pte sync, if spte was writable the hash 2374 * Optimization: for pte sync, if spte was writable the hash
2417 * lookup is unnecessary (and expensive). Write protection 2375 * lookup is unnecessary (and expensive). Write protection
@@ -2441,19 +2399,15 @@ done:
2441} 2399}
2442 2400
2443static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2401static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2444 unsigned pt_access, unsigned pte_access, 2402 unsigned pte_access, int write_fault, int *emulate,
2445 int user_fault, int write_fault, 2403 int level, gfn_t gfn, pfn_t pfn, bool speculative,
2446 int *emulate, int level, gfn_t gfn,
2447 pfn_t pfn, bool speculative,
2448 bool host_writable) 2404 bool host_writable)
2449{ 2405{
2450 int was_rmapped = 0; 2406 int was_rmapped = 0;
2451 int rmap_count; 2407 int rmap_count;
2452 2408
2453 pgprintk("%s: spte %llx access %x write_fault %d" 2409 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
2454 " user_fault %d gfn %llx\n", 2410 *sptep, write_fault, gfn);
2455 __func__, *sptep, pt_access,
2456 write_fault, user_fault, gfn);
2457 2411
2458 if (is_rmap_spte(*sptep)) { 2412 if (is_rmap_spte(*sptep)) {
2459 /* 2413 /*
@@ -2477,9 +2431,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2477 was_rmapped = 1; 2431 was_rmapped = 1;
2478 } 2432 }
2479 2433
2480 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2434 if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
2481 level, gfn, pfn, speculative, true, 2435 true, host_writable)) {
2482 host_writable)) {
2483 if (write_fault) 2436 if (write_fault)
2484 *emulate = 1; 2437 *emulate = 1;
2485 kvm_mmu_flush_tlb(vcpu); 2438 kvm_mmu_flush_tlb(vcpu);
@@ -2497,7 +2450,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2497 ++vcpu->kvm->stat.lpages; 2450 ++vcpu->kvm->stat.lpages;
2498 2451
2499 if (is_shadow_present_pte(*sptep)) { 2452 if (is_shadow_present_pte(*sptep)) {
2500 page_header_update_slot(vcpu->kvm, sptep, gfn);
2501 if (!was_rmapped) { 2453 if (!was_rmapped) {
2502 rmap_count = rmap_add(vcpu, sptep, gfn); 2454 rmap_count = rmap_add(vcpu, sptep, gfn);
2503 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2455 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
@@ -2571,10 +2523,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2571 return -1; 2523 return -1;
2572 2524
2573 for (i = 0; i < ret; i++, gfn++, start++) 2525 for (i = 0; i < ret; i++, gfn++, start++)
2574 mmu_set_spte(vcpu, start, ACC_ALL, 2526 mmu_set_spte(vcpu, start, access, 0, NULL,
2575 access, 0, 0, NULL, 2527 sp->role.level, gfn, page_to_pfn(pages[i]),
2576 sp->role.level, gfn, 2528 true, true);
2577 page_to_pfn(pages[i]), true, true);
2578 2529
2579 return 0; 2530 return 0;
2580} 2531}
@@ -2633,11 +2584,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2633 2584
2634 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2585 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2635 if (iterator.level == level) { 2586 if (iterator.level == level) {
2636 unsigned pte_access = ACC_ALL; 2587 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
2637 2588 write, &emulate, level, gfn, pfn,
2638 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, 2589 prefault, map_writable);
2639 0, write, &emulate,
2640 level, gfn, pfn, prefault, map_writable);
2641 direct_pte_prefetch(vcpu, iterator.sptep); 2590 direct_pte_prefetch(vcpu, iterator.sptep);
2642 ++vcpu->stat.pf_fixed; 2591 ++vcpu->stat.pf_fixed;
2643 break; 2592 break;
@@ -2652,11 +2601,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2652 iterator.level - 1, 2601 iterator.level - 1,
2653 1, ACC_ALL, iterator.sptep); 2602 1, ACC_ALL, iterator.sptep);
2654 2603
2655 mmu_spte_set(iterator.sptep, 2604 link_shadow_page(iterator.sptep, sp);
2656 __pa(sp->spt)
2657 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2658 | shadow_user_mask | shadow_x_mask
2659 | shadow_accessed_mask);
2660 } 2605 }
2661 } 2606 }
2662 return emulate; 2607 return emulate;
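[Editor's note] The removed open-coded mmu_spte_set() shows exactly which bits the new link_shadow_page() helper is expected to install in the parent entry. A sketch reconstructed only from the bits visible in this hunk; the real helper may differ:

	/* Sketch based on the removed lines above, not the actual definition. */
	static void link_shadow_page_sketch(u64 *sptep, struct kvm_mmu_page *sp)
	{
		u64 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
			   shadow_user_mask | shadow_x_mask | shadow_accessed_mask;

		mmu_spte_set(sptep, spte);
	}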
@@ -3719,6 +3664,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3719 else 3664 else
3720 r = paging32_init_context(vcpu, context); 3665 r = paging32_init_context(vcpu, context);
3721 3666
3667 vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
3722 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3668 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
3723 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3669 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
3724 vcpu->arch.mmu.base_role.smep_andnot_wp 3670 vcpu->arch.mmu.base_role.smep_andnot_wp
@@ -3885,7 +3831,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
3885 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 3831 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
3886 *gpa &= ~(gpa_t)7; 3832 *gpa &= ~(gpa_t)7;
3887 *bytes = 8; 3833 *bytes = 8;
3888 r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8)); 3834 r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
3889 if (r) 3835 if (r)
3890 gentry = 0; 3836 gentry = 0;
3891 new = (const u8 *)&gentry; 3837 new = (const u8 *)&gentry;
@@ -4039,7 +3985,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
4039 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3985 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
4040 & mask.word) && rmap_can_add(vcpu)) 3986 & mask.word) && rmap_can_add(vcpu))
4041 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 3987 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
4042 if (!remote_flush && need_remote_flush(entry, *spte)) 3988 if (need_remote_flush(entry, *spte))
4043 remote_flush = true; 3989 remote_flush = true;
4044 ++spte; 3990 ++spte;
4045 } 3991 }
@@ -4198,26 +4144,36 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
4198 4144
4199void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 4145void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
4200{ 4146{
4201 struct kvm_mmu_page *sp; 4147 struct kvm_memory_slot *memslot;
4202 bool flush = false; 4148 gfn_t last_gfn;
4149 int i;
4203 4150
4204 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 4151 memslot = id_to_memslot(kvm->memslots, slot);
4205 int i; 4152 last_gfn = memslot->base_gfn + memslot->npages - 1;
4206 u64 *pt;
4207 4153
4208 if (!test_bit(slot, sp->slot_bitmap)) 4154 spin_lock(&kvm->mmu_lock);
4209 continue;
4210 4155
4211 pt = sp->spt; 4156 for (i = PT_PAGE_TABLE_LEVEL;
4212 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 4157 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
4213 if (!is_shadow_present_pte(pt[i]) || 4158 unsigned long *rmapp;
4214 !is_last_spte(pt[i], sp->role.level)) 4159 unsigned long last_index, index;
4215 continue;
4216 4160
4217 spte_write_protect(kvm, &pt[i], &flush, false); 4161 rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
4162 last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
4163
4164 for (index = 0; index <= last_index; ++index, ++rmapp) {
4165 if (*rmapp)
4166 __rmap_write_protect(kvm, rmapp, false);
4167
4168 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
4169 kvm_flush_remote_tlbs(kvm);
4170 cond_resched_lock(&kvm->mmu_lock);
4171 }
4218 } 4172 }
4219 } 4173 }
4174
4220 kvm_flush_remote_tlbs(kvm); 4175 kvm_flush_remote_tlbs(kvm);
4176 spin_unlock(&kvm->mmu_lock);
4221} 4177}
4222 4178
4223void kvm_mmu_zap_all(struct kvm *kvm) 4179void kvm_mmu_zap_all(struct kvm *kvm)
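[Editor's note] kvm_mmu_slot_remove_write_access() now walks the memslot's per-level rmap arrays instead of scanning every shadow page, and periodically drops mmu_lock via cond_resched_lock() to bound hold time. The rmap index for a gfn is its large-page-sized offset within the slot at the given level; a rough illustration of what gfn_to_index() computes (the real helper lives in the KVM headers and may differ in detail):

	/* Approximation only: one rmap slot per large-page-sized chunk of the
	 * memslot at the given level (9 gfn bits per level on x86). */
	static unsigned long gfn_to_index_sketch(gfn_t gfn, gfn_t base_gfn, int level)
	{
		unsigned int shift = (level - 1) * 9;	/* 0 for 4K, 9 for 2M, 18 for 1G */

		return (gfn >> shift) - (base_gfn >> shift);
	}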
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index cd6e98333ba3..b8f6172f4174 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,12 +195,6 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
195 TP_ARGS(sp) 195 TP_ARGS(sp)
196); 196);
197 197
198DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
199 TP_PROTO(struct kvm_mmu_page *sp),
200
201 TP_ARGS(sp)
202);
203
204TRACE_EVENT( 198TRACE_EVENT(
205 mark_mmio_spte, 199 mark_mmio_spte,
206 TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), 200 TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 891eb6d93b8b..105dd5bd550e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -151,7 +151,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
151 pt_element_t pte; 151 pt_element_t pte;
152 pt_element_t __user *uninitialized_var(ptep_user); 152 pt_element_t __user *uninitialized_var(ptep_user);
153 gfn_t table_gfn; 153 gfn_t table_gfn;
154 unsigned index, pt_access, pte_access, accessed_dirty, shift; 154 unsigned index, pt_access, pte_access, accessed_dirty;
155 gpa_t pte_gpa; 155 gpa_t pte_gpa;
156 int offset; 156 int offset;
157 const int write_fault = access & PFERR_WRITE_MASK; 157 const int write_fault = access & PFERR_WRITE_MASK;
@@ -249,16 +249,12 @@ retry_walk:
249 249
250 if (!write_fault) 250 if (!write_fault)
251 protect_clean_gpte(&pte_access, pte); 251 protect_clean_gpte(&pte_access, pte);
252 252 else
253 /* 253 /*
254 * On a write fault, fold the dirty bit into accessed_dirty by shifting it one 254 * On a write fault, fold the dirty bit into accessed_dirty by
255 * place right. 255 * shifting it one place right.
256 * 256 */
257 * On a read fault, do nothing. 257 accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
258 */
259 shift = write_fault >> ilog2(PFERR_WRITE_MASK);
260 shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
261 accessed_dirty &= pte >> shift;
262 258
263 if (unlikely(!accessed_dirty)) { 259 if (unlikely(!accessed_dirty)) {
264 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); 260 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
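[Editor's note] The rewritten walker only folds the dirty bit on the write-fault path, and the shift amount becomes the constant distance between the dirty and accessed bit positions. A small illustration of why a single right shift lines the bits up, assuming the usual x86 PTE layout (A = bit 5, D = bit 6):

	/* Illustration only: fold D into the A position on a write fault. */
	#define PT_ACCESSED_SHIFT	5
	#define PT_DIRTY_SHIFT		6

	static unsigned fold_dirty_into_accessed(unsigned accessed_dirty, u64 pte)
	{
		/* pte >> 1 moves the dirty bit (bit 6) down to bit 5, where it
		 * masks against the accumulated accessed bits. */
		return accessed_dirty & (pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT));
	}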
@@ -330,8 +326,8 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
330 * we call mmu_set_spte() with host_writable = true because 326 * we call mmu_set_spte() with host_writable = true because
331 * pte_prefetch_gfn_to_pfn always gets a writable pfn. 327 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
332 */ 328 */
333 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 329 mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL,
334 NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); 330 gfn, pfn, true, true);
335 331
336 return true; 332 return true;
337} 333}
@@ -405,7 +401,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
405 */ 401 */
406static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 402static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
407 struct guest_walker *gw, 403 struct guest_walker *gw,
408 int user_fault, int write_fault, int hlevel, 404 int write_fault, int hlevel,
409 pfn_t pfn, bool map_writable, bool prefault) 405 pfn_t pfn, bool map_writable, bool prefault)
410{ 406{
411 struct kvm_mmu_page *sp = NULL; 407 struct kvm_mmu_page *sp = NULL;
@@ -413,9 +409,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
413 unsigned direct_access, access = gw->pt_access; 409 unsigned direct_access, access = gw->pt_access;
414 int top_level, emulate = 0; 410 int top_level, emulate = 0;
415 411
416 if (!is_present_gpte(gw->ptes[gw->level - 1]))
417 return 0;
418
419 direct_access = gw->pte_access; 412 direct_access = gw->pte_access;
420 413
421 top_level = vcpu->arch.mmu.root_level; 414 top_level = vcpu->arch.mmu.root_level;
@@ -477,9 +470,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
477 } 470 }
478 471
479 clear_sp_write_flooding_count(it.sptep); 472 clear_sp_write_flooding_count(it.sptep);
480 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, 473 mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate,
481 user_fault, write_fault, &emulate, it.level, 474 it.level, gw->gfn, pfn, prefault, map_writable);
482 gw->gfn, pfn, prefault, map_writable);
483 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 475 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
484 476
485 return emulate; 477 return emulate;
@@ -491,6 +483,46 @@ out_gpte_changed:
491 return 0; 483 return 0;
492} 484}
493 485
486 /*
487 * Check whether, in the current mapping, the mapped gfn can write one of
488 * its own page tables.
489 *
490 * This is a helper for FNAME(page_fault). When the guest uses a large page
491 * to map a writable gfn that is currently used as a page table, we should
492 * force kvm to map it with a small page, because the new shadow page created
493 * when kvm establishes the shadow page table stops kvm from using a large
494 * page anyway. Doing this early avoids unnecessary #PFs and emulation.
495 *
496 * @write_fault_to_shadow_pgtable will return true if the fault gfn is
497 * currently used as its page table.
498 *
499 * Note: the PDPT page table is not checked for a PAE 32-bit guest. That is
500 * ok since the PDPT is always shadowed, which means we cannot use a large
501 * page to map the gfn that is used as the PDPT.
502 */
503static bool
504FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
505 struct guest_walker *walker, int user_fault,
506 bool *write_fault_to_shadow_pgtable)
507{
508 int level;
509 gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
510 bool self_changed = false;
511
512 if (!(walker->pte_access & ACC_WRITE_MASK ||
513 (!is_write_protection(vcpu) && !user_fault)))
514 return false;
515
516 for (level = walker->level; level <= walker->max_level; level++) {
517 gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
518
519 self_changed |= !(gfn & mask);
520 *write_fault_to_shadow_pgtable |= !gfn;
521 }
522
523 return self_changed;
524}
525
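[Editor's note] The helper above decides whether the faulting gfn falls inside the same large-page frame as any of the guest page tables used to translate it. A short worked form of the mask test, with the page count passed in for clarity (e.g. 512 gfns per 2MB frame):

	/* Illustration of the (gfn ^ table_gfn) & mask test used above. */
	static int same_large_frame(gfn_t fault_gfn, gfn_t table_gfn,
				    unsigned long pages_per_hpage)
	{
		gfn_t mask = ~(pages_per_hpage - 1);	/* e.g. ~(512 - 1) for 2MB */

		/* XOR clears the bits the two gfns share; if nothing above the
		 * in-frame offset differs, they live in the same large frame. */
		return !((fault_gfn ^ table_gfn) & mask);
	}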
494/* 526/*
495 * Page fault handler. There are several causes for a page fault: 527 * Page fault handler. There are several causes for a page fault:
496 * - there is no shadow pte for the guest pte 528 * - there is no shadow pte for the guest pte
@@ -516,7 +548,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
516 int level = PT_PAGE_TABLE_LEVEL; 548 int level = PT_PAGE_TABLE_LEVEL;
517 int force_pt_level; 549 int force_pt_level;
518 unsigned long mmu_seq; 550 unsigned long mmu_seq;
519 bool map_writable; 551 bool map_writable, is_self_change_mapping;
520 552
521 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 553 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
522 554
@@ -544,8 +576,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
544 return 0; 576 return 0;
545 } 577 }
546 578
579 vcpu->arch.write_fault_to_shadow_pgtable = false;
580
581 is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
582 &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
583
547 if (walker.level >= PT_DIRECTORY_LEVEL) 584 if (walker.level >= PT_DIRECTORY_LEVEL)
548 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); 585 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
586 || is_self_change_mapping;
549 else 587 else
550 force_pt_level = 1; 588 force_pt_level = 1;
551 if (!force_pt_level) { 589 if (!force_pt_level) {
@@ -564,6 +602,26 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
564 walker.gfn, pfn, walker.pte_access, &r)) 602 walker.gfn, pfn, walker.pte_access, &r))
565 return r; 603 return r;
566 604
605 /*
606 * Do not change pte_access if the pfn is a mmio page, otherwise
607 * we will cache the incorrect access into mmio spte.
608 */
609 if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
610 !is_write_protection(vcpu) && !user_fault &&
611 !is_noslot_pfn(pfn)) {
612 walker.pte_access |= ACC_WRITE_MASK;
613 walker.pte_access &= ~ACC_USER_MASK;
614
615 /*
616 * If we converted a user page to a kernel page so that
617 * the kernel can write to it when cr0.wp=0, then we should
618 * prevent the kernel from executing it if SMEP is
619 * enabled.
620 */
621 if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
622 walker.pte_access &= ~ACC_EXEC_MASK;
623 }
624
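[Editor's note] With CR0.WP clear, supervisor accesses may write pages whose gpte is read-only, so the block above grants ACC_WRITE_MASK but strips ACC_USER_MASK, and additionally clears ACC_EXEC_MASK when SMEP is on. A condensed sketch of that adjustment, with the helper name invented for illustration:

	/* Hypothetical condensation of the pte_access fixup shown above. */
	static unsigned adjust_access_for_wp0(unsigned pte_access, bool smep)
	{
		pte_access |= ACC_WRITE_MASK;	/* kernel may write despite R/O gpte */
		pte_access &= ~ACC_USER_MASK;	/* ...but user mode must not */
		if (smep)
			pte_access &= ~ACC_EXEC_MASK; /* kernel page: no exec under SMEP */
		return pte_access;
	}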
567 spin_lock(&vcpu->kvm->mmu_lock); 625 spin_lock(&vcpu->kvm->mmu_lock);
568 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 626 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
569 goto out_unlock; 627 goto out_unlock;
@@ -572,7 +630,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
572 kvm_mmu_free_some_pages(vcpu); 630 kvm_mmu_free_some_pages(vcpu);
573 if (!force_pt_level) 631 if (!force_pt_level)
574 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 632 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
575 r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 633 r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
576 level, pfn, map_writable, prefault); 634 level, pfn, map_writable, prefault);
577 ++vcpu->stat.pf_fixed; 635 ++vcpu->stat.pf_fixed;
578 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); 636 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
@@ -747,7 +805,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
747 805
748 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; 806 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
749 807
750 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 808 set_spte(vcpu, &sp->spt[i], pte_access,
751 PT_PAGE_TABLE_LEVEL, gfn, 809 PT_PAGE_TABLE_LEVEL, gfn,
752 spte_to_pfn(sp->spt[i]), true, false, 810 spte_to_pfn(sp->spt[i]), true, false,
753 host_writable); 811 host_writable);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d29d3cd1c156..e1b1ce21bc00 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3571,6 +3571,26 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3571 set_cr_intercept(svm, INTERCEPT_CR8_WRITE); 3571 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3572} 3572}
3573 3573
3574static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
3575{
3576 return;
3577}
3578
3579static int svm_vm_has_apicv(struct kvm *kvm)
3580{
3581 return 0;
3582}
3583
3584static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
3585{
3586 return;
3587}
3588
3589static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
3590{
3591 return;
3592}
3593
3574static int svm_nmi_allowed(struct kvm_vcpu *vcpu) 3594static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
3575{ 3595{
3576 struct vcpu_svm *svm = to_svm(vcpu); 3596 struct vcpu_svm *svm = to_svm(vcpu);
@@ -4290,6 +4310,10 @@ static struct kvm_x86_ops svm_x86_ops = {
4290 .enable_nmi_window = enable_nmi_window, 4310 .enable_nmi_window = enable_nmi_window,
4291 .enable_irq_window = enable_irq_window, 4311 .enable_irq_window = enable_irq_window,
4292 .update_cr8_intercept = update_cr8_intercept, 4312 .update_cr8_intercept = update_cr8_intercept,
4313 .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
4314 .vm_has_apicv = svm_vm_has_apicv,
4315 .load_eoi_exitmap = svm_load_eoi_exitmap,
4316 .hwapic_isr_update = svm_hwapic_isr_update,
4293 4317
4294 .set_tss_addr = svm_set_tss_addr, 4318 .set_tss_addr = svm_set_tss_addr,
4295 .get_tdp_level = get_npt_level, 4319 .get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9120ae1901e4..6667042714cc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,6 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO);
84static bool __read_mostly fasteoi = 1; 84static bool __read_mostly fasteoi = 1;
85module_param(fasteoi, bool, S_IRUGO); 85module_param(fasteoi, bool, S_IRUGO);
86 86
87static bool __read_mostly enable_apicv_reg_vid;
88
87/* 89/*
88 * If nested=1, nested virtualization is supported, i.e., guests may use 90 * If nested=1, nested virtualization is supported, i.e., guests may use
89 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 91 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -92,12 +94,8 @@ module_param(fasteoi, bool, S_IRUGO);
92static bool __read_mostly nested = 0; 94static bool __read_mostly nested = 0;
93module_param(nested, bool, S_IRUGO); 95module_param(nested, bool, S_IRUGO);
94 96
95#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 97#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
96 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 98#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
97#define KVM_GUEST_CR0_MASK \
98 (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
99#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
100 (X86_CR0_WP | X86_CR0_NE)
101#define KVM_VM_CR0_ALWAYS_ON \ 99#define KVM_VM_CR0_ALWAYS_ON \
102 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) 100 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
103#define KVM_CR4_GUEST_OWNED_BITS \ 101#define KVM_CR4_GUEST_OWNED_BITS \
@@ -624,6 +622,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
624 struct kvm_segment *var, int seg); 622 struct kvm_segment *var, int seg);
625static void vmx_get_segment(struct kvm_vcpu *vcpu, 623static void vmx_get_segment(struct kvm_vcpu *vcpu,
626 struct kvm_segment *var, int seg); 624 struct kvm_segment *var, int seg);
625static bool guest_state_valid(struct kvm_vcpu *vcpu);
626static u32 vmx_segment_access_rights(struct kvm_segment *var);
627 627
628static DEFINE_PER_CPU(struct vmcs *, vmxarea); 628static DEFINE_PER_CPU(struct vmcs *, vmxarea);
629static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 629static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -638,6 +638,8 @@ static unsigned long *vmx_io_bitmap_a;
638static unsigned long *vmx_io_bitmap_b; 638static unsigned long *vmx_io_bitmap_b;
639static unsigned long *vmx_msr_bitmap_legacy; 639static unsigned long *vmx_msr_bitmap_legacy;
640static unsigned long *vmx_msr_bitmap_longmode; 640static unsigned long *vmx_msr_bitmap_longmode;
641static unsigned long *vmx_msr_bitmap_legacy_x2apic;
642static unsigned long *vmx_msr_bitmap_longmode_x2apic;
641 643
642static bool cpu_has_load_ia32_efer; 644static bool cpu_has_load_ia32_efer;
643static bool cpu_has_load_perf_global_ctrl; 645static bool cpu_has_load_perf_global_ctrl;
@@ -762,6 +764,24 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
762 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 764 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
763} 765}
764 766
767static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
768{
769 return vmcs_config.cpu_based_2nd_exec_ctrl &
770 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
771}
772
773static inline bool cpu_has_vmx_apic_register_virt(void)
774{
775 return vmcs_config.cpu_based_2nd_exec_ctrl &
776 SECONDARY_EXEC_APIC_REGISTER_VIRT;
777}
778
779static inline bool cpu_has_vmx_virtual_intr_delivery(void)
780{
781 return vmcs_config.cpu_based_2nd_exec_ctrl &
782 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
783}
784
765static inline bool cpu_has_vmx_flexpriority(void) 785static inline bool cpu_has_vmx_flexpriority(void)
766{ 786{
767 return cpu_has_vmx_tpr_shadow() && 787 return cpu_has_vmx_tpr_shadow() &&
@@ -1694,7 +1714,6 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1694static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1714static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1695{ 1715{
1696 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); 1716 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1697 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
1698 to_vmx(vcpu)->rflags = rflags; 1717 to_vmx(vcpu)->rflags = rflags;
1699 if (to_vmx(vcpu)->rmode.vm86_active) { 1718 if (to_vmx(vcpu)->rmode.vm86_active) {
1700 to_vmx(vcpu)->rmode.save_rflags = rflags; 1719 to_vmx(vcpu)->rmode.save_rflags = rflags;
@@ -1820,6 +1839,25 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
1820 vmx->guest_msrs[from] = tmp; 1839 vmx->guest_msrs[from] = tmp;
1821} 1840}
1822 1841
1842static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
1843{
1844 unsigned long *msr_bitmap;
1845
1846 if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
1847 if (is_long_mode(vcpu))
1848 msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
1849 else
1850 msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
1851 } else {
1852 if (is_long_mode(vcpu))
1853 msr_bitmap = vmx_msr_bitmap_longmode;
1854 else
1855 msr_bitmap = vmx_msr_bitmap_legacy;
1856 }
1857
1858 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
1859}
1860
1823/* 1861/*
1824 * Set up the vmcs to automatically save and restore system 1862 * Set up the vmcs to automatically save and restore system
1825 * msrs. Don't touch the 64-bit msrs if the guest is in legacy 1863 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
@@ -1828,7 +1866,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
1828static void setup_msrs(struct vcpu_vmx *vmx) 1866static void setup_msrs(struct vcpu_vmx *vmx)
1829{ 1867{
1830 int save_nmsrs, index; 1868 int save_nmsrs, index;
1831 unsigned long *msr_bitmap;
1832 1869
1833 save_nmsrs = 0; 1870 save_nmsrs = 0;
1834#ifdef CONFIG_X86_64 1871#ifdef CONFIG_X86_64
@@ -1860,14 +1897,8 @@ static void setup_msrs(struct vcpu_vmx *vmx)
1860 1897
1861 vmx->save_nmsrs = save_nmsrs; 1898 vmx->save_nmsrs = save_nmsrs;
1862 1899
1863 if (cpu_has_vmx_msr_bitmap()) { 1900 if (cpu_has_vmx_msr_bitmap())
1864 if (is_long_mode(&vmx->vcpu)) 1901 vmx_set_msr_bitmap(&vmx->vcpu);
1865 msr_bitmap = vmx_msr_bitmap_longmode;
1866 else
1867 msr_bitmap = vmx_msr_bitmap_legacy;
1868
1869 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
1870 }
1871} 1902}
1872 1903
1873/* 1904/*
@@ -2533,13 +2564,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2533 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2564 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2534 min2 = 0; 2565 min2 = 0;
2535 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2566 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2567 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2536 SECONDARY_EXEC_WBINVD_EXITING | 2568 SECONDARY_EXEC_WBINVD_EXITING |
2537 SECONDARY_EXEC_ENABLE_VPID | 2569 SECONDARY_EXEC_ENABLE_VPID |
2538 SECONDARY_EXEC_ENABLE_EPT | 2570 SECONDARY_EXEC_ENABLE_EPT |
2539 SECONDARY_EXEC_UNRESTRICTED_GUEST | 2571 SECONDARY_EXEC_UNRESTRICTED_GUEST |
2540 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 2572 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2541 SECONDARY_EXEC_RDTSCP | 2573 SECONDARY_EXEC_RDTSCP |
2542 SECONDARY_EXEC_ENABLE_INVPCID; 2574 SECONDARY_EXEC_ENABLE_INVPCID |
2575 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2576 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
2543 if (adjust_vmx_controls(min2, opt2, 2577 if (adjust_vmx_controls(min2, opt2,
2544 MSR_IA32_VMX_PROCBASED_CTLS2, 2578 MSR_IA32_VMX_PROCBASED_CTLS2,
2545 &_cpu_based_2nd_exec_control) < 0) 2579 &_cpu_based_2nd_exec_control) < 0)
@@ -2550,6 +2584,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2550 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2584 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2551 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2585 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2552#endif 2586#endif
2587
2588 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2589 _cpu_based_2nd_exec_control &= ~(
2590 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2591 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2592 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2593
2553 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 2594 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
2554 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT 2595 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
2555 enabled */ 2596 enabled */
@@ -2747,6 +2788,15 @@ static __init int hardware_setup(void)
2747 if (!cpu_has_vmx_ple()) 2788 if (!cpu_has_vmx_ple())
2748 ple_gap = 0; 2789 ple_gap = 0;
2749 2790
2791 if (!cpu_has_vmx_apic_register_virt() ||
2792 !cpu_has_vmx_virtual_intr_delivery())
2793 enable_apicv_reg_vid = 0;
2794
2795 if (enable_apicv_reg_vid)
2796 kvm_x86_ops->update_cr8_intercept = NULL;
2797 else
2798 kvm_x86_ops->hwapic_irr_update = NULL;
2799
2750 if (nested) 2800 if (nested)
2751 nested_vmx_setup_ctls_msrs(); 2801 nested_vmx_setup_ctls_msrs();
2752 2802
@@ -2758,18 +2808,28 @@ static __exit void hardware_unsetup(void)
2758 free_kvm_area(); 2808 free_kvm_area();
2759} 2809}
2760 2810
2761static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) 2811static bool emulation_required(struct kvm_vcpu *vcpu)
2762{ 2812{
2763 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2813 return emulate_invalid_guest_state && !guest_state_valid(vcpu);
2764 struct kvm_segment tmp = *save; 2814}
2765 2815
2766 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { 2816static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
2767 tmp.base = vmcs_readl(sf->base); 2817 struct kvm_segment *save)
2768 tmp.selector = vmcs_read16(sf->selector); 2818{
2769 tmp.dpl = tmp.selector & SELECTOR_RPL_MASK; 2819 if (!emulate_invalid_guest_state) {
2770 tmp.s = 1; 2820 /*
2821 * CS and SS RPL should be equal during guest entry according
2822 * to VMX spec, but in reality it is not always so. Since vcpu
2823 * is in the middle of the transition from real mode to
2824 * protected mode it is safe to assume that RPL 0 is a good
2825 * default value.
2826 */
2827 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
2828 save->selector &= ~SELECTOR_RPL_MASK;
2829 save->dpl = save->selector & SELECTOR_RPL_MASK;
2830 save->s = 1;
2771 } 2831 }
2772 vmx_set_segment(vcpu, &tmp, seg); 2832 vmx_set_segment(vcpu, save, seg);
2773} 2833}
2774 2834
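[Editor's note] The RPL lives in the low two bits of a selector (SELECTOR_RPL_MASK), so forcing RPL 0 for CS/SS during the real-to-protected transition is just a bit mask; a trivial illustration:

	/* Illustration: RPL occupies selector bits 1:0. */
	static u16 selector_with_rpl0(u16 selector)
	{
		return selector & ~0x3;		/* e.g. 0xf00b -> 0xf008 */
	}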
2775static void enter_pmode(struct kvm_vcpu *vcpu) 2835static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2777,7 +2837,17 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
2777 unsigned long flags; 2837 unsigned long flags;
2778 struct vcpu_vmx *vmx = to_vmx(vcpu); 2838 struct vcpu_vmx *vmx = to_vmx(vcpu);
2779 2839
2780 vmx->emulation_required = 1; 2840 /*
2841 * Update the real mode segment cache. It may not be up to date if a segment
2842 * register was written while the vcpu was in guest mode.
2843 */
2844 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2845 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2846 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2847 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2848 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2849 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2850
2781 vmx->rmode.vm86_active = 0; 2851 vmx->rmode.vm86_active = 0;
2782 2852
2783 vmx_segment_cache_clear(vmx); 2853 vmx_segment_cache_clear(vmx);
@@ -2794,22 +2864,16 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
2794 2864
2795 update_exception_bitmap(vcpu); 2865 update_exception_bitmap(vcpu);
2796 2866
2797 if (emulate_invalid_guest_state) 2867 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2798 return; 2868 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2799 2869 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2800 fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 2870 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2801 fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 2871 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2802 fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 2872 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2803 fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2804
2805 vmx_segment_cache_clear(vmx);
2806 2873
2807 vmcs_write16(GUEST_SS_SELECTOR, 0); 2874 /* CPL is always 0 when CPU enters protected mode */
2808 vmcs_write32(GUEST_SS_AR_BYTES, 0x93); 2875 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2809 2876 vmx->cpl = 0;
2810 vmcs_write16(GUEST_CS_SELECTOR,
2811 vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
2812 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
2813} 2877}
2814 2878
2815static gva_t rmode_tss_base(struct kvm *kvm) 2879static gva_t rmode_tss_base(struct kvm *kvm)
@@ -2831,36 +2895,51 @@ static gva_t rmode_tss_base(struct kvm *kvm)
2831static void fix_rmode_seg(int seg, struct kvm_segment *save) 2895static void fix_rmode_seg(int seg, struct kvm_segment *save)
2832{ 2896{
2833 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2897 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2834 2898 struct kvm_segment var = *save;
2835 vmcs_write16(sf->selector, save->base >> 4); 2899
2836 vmcs_write32(sf->base, save->base & 0xffff0); 2900 var.dpl = 0x3;
2837 vmcs_write32(sf->limit, 0xffff); 2901 if (seg == VCPU_SREG_CS)
2838 vmcs_write32(sf->ar_bytes, 0xf3); 2902 var.type = 0x3;
2839 if (save->base & 0xf) 2903
2840 printk_once(KERN_WARNING "kvm: segment base is not paragraph" 2904 if (!emulate_invalid_guest_state) {
2841 " aligned when entering protected mode (seg=%d)", 2905 var.selector = var.base >> 4;
2842 seg); 2906 var.base = var.base & 0xffff0;
2907 var.limit = 0xffff;
2908 var.g = 0;
2909 var.db = 0;
2910 var.present = 1;
2911 var.s = 1;
2912 var.l = 0;
2913 var.unusable = 0;
2914 var.type = 0x3;
2915 var.avl = 0;
2916 if (save->base & 0xf)
2917 printk_once(KERN_WARNING "kvm: segment base is not "
2918 "paragraph aligned when entering "
2919 "protected mode (seg=%d)", seg);
2920 }
2921
2922 vmcs_write16(sf->selector, var.selector);
2923 vmcs_write32(sf->base, var.base);
2924 vmcs_write32(sf->limit, var.limit);
2925 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
2843} 2926}
2844 2927
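[Editor's note] fix_rmode_seg() derives vm86-compatible state from the protected-mode segment: in real mode the linear base is selector * 16, so the selector is recovered as base >> 4 and the base is truncated to a paragraph boundary. A quick worked example with made-up values:

	/* Illustration of the selector/base relationship used in fix_rmode_seg(). */
	static void rmode_seg_example(void)
	{
		unsigned long base = 0x000f1230;
		u16 selector = base >> 4;		/* 0xf123 */
		unsigned long aligned = base & 0xffff0;	/* 0xf1230, paragraph aligned */

		(void)selector;
		(void)aligned;
		/* A base such as 0x000f1234 would trigger the "not paragraph
		 * aligned" warning printed above. */
	}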
2845static void enter_rmode(struct kvm_vcpu *vcpu) 2928static void enter_rmode(struct kvm_vcpu *vcpu)
2846{ 2929{
2847 unsigned long flags; 2930 unsigned long flags;
2848 struct vcpu_vmx *vmx = to_vmx(vcpu); 2931 struct vcpu_vmx *vmx = to_vmx(vcpu);
2849 struct kvm_segment var;
2850
2851 if (enable_unrestricted_guest)
2852 return;
2853 2932
2854 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 2933 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2855 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 2934 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2856 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 2935 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2857 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 2936 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2858 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 2937 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2938 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2939 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2859 2940
2860 vmx->emulation_required = 1;
2861 vmx->rmode.vm86_active = 1; 2941 vmx->rmode.vm86_active = 1;
2862 2942
2863
2864 /* 2943 /*
2865 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 2944 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2866 * vcpu. Call it here with phys address pointing 16M below 4G. 2945 * vcpu. Call it here with phys address pointing 16M below 4G.
@@ -2888,28 +2967,13 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2888 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 2967 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
2889 update_exception_bitmap(vcpu); 2968 update_exception_bitmap(vcpu);
2890 2969
2891 if (emulate_invalid_guest_state) 2970 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2892 goto continue_rmode; 2971 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2893 2972 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2894 vmx_get_segment(vcpu, &var, VCPU_SREG_SS); 2973 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2895 vmx_set_segment(vcpu, &var, VCPU_SREG_SS); 2974 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2896 2975 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2897 vmx_get_segment(vcpu, &var, VCPU_SREG_CS);
2898 vmx_set_segment(vcpu, &var, VCPU_SREG_CS);
2899
2900 vmx_get_segment(vcpu, &var, VCPU_SREG_ES);
2901 vmx_set_segment(vcpu, &var, VCPU_SREG_ES);
2902
2903 vmx_get_segment(vcpu, &var, VCPU_SREG_DS);
2904 vmx_set_segment(vcpu, &var, VCPU_SREG_DS);
2905 2976
2906 vmx_get_segment(vcpu, &var, VCPU_SREG_GS);
2907 vmx_set_segment(vcpu, &var, VCPU_SREG_GS);
2908
2909 vmx_get_segment(vcpu, &var, VCPU_SREG_FS);
2910 vmx_set_segment(vcpu, &var, VCPU_SREG_FS);
2911
2912continue_rmode:
2913 kvm_mmu_reset_context(vcpu); 2977 kvm_mmu_reset_context(vcpu);
2914} 2978}
2915 2979
@@ -3068,17 +3132,18 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3068 struct vcpu_vmx *vmx = to_vmx(vcpu); 3132 struct vcpu_vmx *vmx = to_vmx(vcpu);
3069 unsigned long hw_cr0; 3133 unsigned long hw_cr0;
3070 3134
3135 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
3071 if (enable_unrestricted_guest) 3136 if (enable_unrestricted_guest)
3072 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) 3137 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3073 | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3138 else {
3074 else 3139 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3075 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
3076 3140
3077 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3141 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3078 enter_pmode(vcpu); 3142 enter_pmode(vcpu);
3079 3143
3080 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3144 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3081 enter_rmode(vcpu); 3145 enter_rmode(vcpu);
3146 }
3082 3147
3083#ifdef CONFIG_X86_64 3148#ifdef CONFIG_X86_64
3084 if (vcpu->arch.efer & EFER_LME) { 3149 if (vcpu->arch.efer & EFER_LME) {
@@ -3098,7 +3163,9 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3098 vmcs_writel(CR0_READ_SHADOW, cr0); 3163 vmcs_writel(CR0_READ_SHADOW, cr0);
3099 vmcs_writel(GUEST_CR0, hw_cr0); 3164 vmcs_writel(GUEST_CR0, hw_cr0);
3100 vcpu->arch.cr0 = cr0; 3165 vcpu->arch.cr0 = cr0;
3101 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3166
3167 /* depends on vcpu->arch.cr0 to be set to a new value */
3168 vmx->emulation_required = emulation_required(vcpu);
3102} 3169}
3103 3170
3104static u64 construct_eptp(unsigned long root_hpa) 3171static u64 construct_eptp(unsigned long root_hpa)
@@ -3155,6 +3222,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3155 if (!is_paging(vcpu)) { 3222 if (!is_paging(vcpu)) {
3156 hw_cr4 &= ~X86_CR4_PAE; 3223 hw_cr4 &= ~X86_CR4_PAE;
3157 hw_cr4 |= X86_CR4_PSE; 3224 hw_cr4 |= X86_CR4_PSE;
3225 /*
3226 * SMEP is disabled if the CPU is in non-paging mode in
3227 * hardware. However, KVM always uses paging mode to
3228 * emulate guest non-paging mode with TDP.
3229 * To emulate this behavior, SMEP needs to be manually
3230 * disabled when the guest switches to non-paging mode.
3231 */
3232 hw_cr4 &= ~X86_CR4_SMEP;
3158 } else if (!(cr4 & X86_CR4_PAE)) { 3233 } else if (!(cr4 & X86_CR4_PAE)) {
3159 hw_cr4 &= ~X86_CR4_PAE; 3234 hw_cr4 &= ~X86_CR4_PAE;
3160 } 3235 }
@@ -3171,10 +3246,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
3171 struct vcpu_vmx *vmx = to_vmx(vcpu); 3246 struct vcpu_vmx *vmx = to_vmx(vcpu);
3172 u32 ar; 3247 u32 ar;
3173 3248
3174 if (vmx->rmode.vm86_active 3249 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3175 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
3176 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
3177 || seg == VCPU_SREG_GS)) {
3178 *var = vmx->rmode.segs[seg]; 3250 *var = vmx->rmode.segs[seg];
3179 if (seg == VCPU_SREG_TR 3251 if (seg == VCPU_SREG_TR
3180 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3252 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
@@ -3187,8 +3259,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
3187 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3259 var->limit = vmx_read_guest_seg_limit(vmx, seg);
3188 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3260 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3189 ar = vmx_read_guest_seg_ar(vmx, seg); 3261 ar = vmx_read_guest_seg_ar(vmx, seg);
3190 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
3191 ar = 0;
3192 var->type = ar & 15; 3262 var->type = ar & 15;
3193 var->s = (ar >> 4) & 1; 3263 var->s = (ar >> 4) & 1;
3194 var->dpl = (ar >> 5) & 3; 3264 var->dpl = (ar >> 5) & 3;
@@ -3211,8 +3281,10 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3211 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3281 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3212} 3282}
3213 3283
3214static int __vmx_get_cpl(struct kvm_vcpu *vcpu) 3284static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3215{ 3285{
3286 struct vcpu_vmx *vmx = to_vmx(vcpu);
3287
3216 if (!is_protmode(vcpu)) 3288 if (!is_protmode(vcpu))
3217 return 0; 3289 return 0;
3218 3290
@@ -3220,24 +3292,9 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
3220 && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ 3292 && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
3221 return 3; 3293 return 3;
3222 3294
3223 return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
3224}
3225
3226static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3227{
3228 struct vcpu_vmx *vmx = to_vmx(vcpu);
3229
3230 /*
3231 * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations
3232 * fail; use the cache instead.
3233 */
3234 if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) {
3235 return vmx->cpl;
3236 }
3237
3238 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { 3295 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
3239 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3296 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3240 vmx->cpl = __vmx_get_cpl(vcpu); 3297 vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3;
3241 } 3298 }
3242 3299
3243 return vmx->cpl; 3300 return vmx->cpl;
@@ -3269,28 +3326,23 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3269{ 3326{
3270 struct vcpu_vmx *vmx = to_vmx(vcpu); 3327 struct vcpu_vmx *vmx = to_vmx(vcpu);
3271 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3328 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3272 u32 ar;
3273 3329
3274 vmx_segment_cache_clear(vmx); 3330 vmx_segment_cache_clear(vmx);
3331 if (seg == VCPU_SREG_CS)
3332 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3275 3333
3276 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 3334 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3277 vmcs_write16(sf->selector, var->selector); 3335 vmx->rmode.segs[seg] = *var;
3278 vmx->rmode.segs[VCPU_SREG_TR] = *var; 3336 if (seg == VCPU_SREG_TR)
3279 return; 3337 vmcs_write16(sf->selector, var->selector);
3338 else if (var->s)
3339 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3340 goto out;
3280 } 3341 }
3342
3281 vmcs_writel(sf->base, var->base); 3343 vmcs_writel(sf->base, var->base);
3282 vmcs_write32(sf->limit, var->limit); 3344 vmcs_write32(sf->limit, var->limit);
3283 vmcs_write16(sf->selector, var->selector); 3345 vmcs_write16(sf->selector, var->selector);
3284 if (vmx->rmode.vm86_active && var->s) {
3285 vmx->rmode.segs[seg] = *var;
3286 /*
3287 * Hack real-mode segments into vm86 compatibility.
3288 */
3289 if (var->base == 0xffff0000 && var->selector == 0xf000)
3290 vmcs_writel(sf->base, 0xf0000);
3291 ar = 0xf3;
3292 } else
3293 ar = vmx_segment_access_rights(var);
3294 3346
3295 /* 3347 /*
3296 * Fix the "Accessed" bit in AR field of segment registers for older 3348 * Fix the "Accessed" bit in AR field of segment registers for older
@@ -3304,42 +3356,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3304 * kvm hack. 3356 * kvm hack.
3305 */ 3357 */
3306 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) 3358 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
3307 ar |= 0x1; /* Accessed */ 3359 var->type |= 0x1; /* Accessed */
3308 3360
3309 vmcs_write32(sf->ar_bytes, ar); 3361 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3310 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3311 3362
3312 /* 3363out:
3313 * Fix segments for real mode guest in hosts that don't have 3364 vmx->emulation_required |= emulation_required(vcpu);
3314 * "unrestricted_mode" or it was disabled.
3315 * This is done to allow migration of the guests from hosts with
3316 * unrestricted guest like Westmere to older host that don't have
3317 * unrestricted guest like Nehelem.
3318 */
3319 if (vmx->rmode.vm86_active) {
3320 switch (seg) {
3321 case VCPU_SREG_CS:
3322 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
3323 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
3324 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
3325 vmcs_writel(GUEST_CS_BASE, 0xf0000);
3326 vmcs_write16(GUEST_CS_SELECTOR,
3327 vmcs_readl(GUEST_CS_BASE) >> 4);
3328 break;
3329 case VCPU_SREG_ES:
3330 case VCPU_SREG_DS:
3331 case VCPU_SREG_GS:
3332 case VCPU_SREG_FS:
3333 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3334 break;
3335 case VCPU_SREG_SS:
3336 vmcs_write16(GUEST_SS_SELECTOR,
3337 vmcs_readl(GUEST_SS_BASE) >> 4);
3338 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
3339 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
3340 break;
3341 }
3342 }
3343} 3365}
3344 3366
3345static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3367static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3380,13 +3402,16 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3380 u32 ar; 3402 u32 ar;
3381 3403
3382 vmx_get_segment(vcpu, &var, seg); 3404 vmx_get_segment(vcpu, &var, seg);
3405 var.dpl = 0x3;
3406 if (seg == VCPU_SREG_CS)
3407 var.type = 0x3;
3383 ar = vmx_segment_access_rights(&var); 3408 ar = vmx_segment_access_rights(&var);
3384 3409
3385 if (var.base != (var.selector << 4)) 3410 if (var.base != (var.selector << 4))
3386 return false; 3411 return false;
3387 if (var.limit < 0xffff) 3412 if (var.limit != 0xffff)
3388 return false; 3413 return false;
3389 if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3) 3414 if (ar != 0xf3)
3390 return false; 3415 return false;
3391 3416
3392 return true; 3417 return true;
@@ -3521,6 +3546,9 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3521 */ 3546 */
3522static bool guest_state_valid(struct kvm_vcpu *vcpu) 3547static bool guest_state_valid(struct kvm_vcpu *vcpu)
3523{ 3548{
3549 if (enable_unrestricted_guest)
3550 return true;
3551
3524 /* real mode guest state checks */ 3552 /* real mode guest state checks */
3525 if (!is_protmode(vcpu)) { 3553 if (!is_protmode(vcpu)) {
3526 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3554 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
@@ -3644,12 +3672,9 @@ static void seg_setup(int seg)
3644 vmcs_write16(sf->selector, 0); 3672 vmcs_write16(sf->selector, 0);
3645 vmcs_writel(sf->base, 0); 3673 vmcs_writel(sf->base, 0);
3646 vmcs_write32(sf->limit, 0xffff); 3674 vmcs_write32(sf->limit, 0xffff);
3647 if (enable_unrestricted_guest) { 3675 ar = 0x93;
3648 ar = 0x93; 3676 if (seg == VCPU_SREG_CS)
3649 if (seg == VCPU_SREG_CS) 3677 ar |= 0x08; /* code segment */
3650 ar |= 0x08; /* code segment */
3651 } else
3652 ar = 0xf3;
3653 3678
3654 vmcs_write32(sf->ar_bytes, ar); 3679 vmcs_write32(sf->ar_bytes, ar);
3655} 3680}
@@ -3667,7 +3692,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
3667 kvm_userspace_mem.flags = 0; 3692 kvm_userspace_mem.flags = 0;
3668 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; 3693 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
3669 kvm_userspace_mem.memory_size = PAGE_SIZE; 3694 kvm_userspace_mem.memory_size = PAGE_SIZE;
3670 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); 3695 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
3671 if (r) 3696 if (r)
3672 goto out; 3697 goto out;
3673 3698
@@ -3697,7 +3722,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
3697 kvm_userspace_mem.guest_phys_addr = 3722 kvm_userspace_mem.guest_phys_addr =
3698 kvm->arch.ept_identity_map_addr; 3723 kvm->arch.ept_identity_map_addr;
3699 kvm_userspace_mem.memory_size = PAGE_SIZE; 3724 kvm_userspace_mem.memory_size = PAGE_SIZE;
3700 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); 3725 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
3701 if (r) 3726 if (r)
3702 goto out; 3727 goto out;
3703 3728
@@ -3739,7 +3764,10 @@ static void free_vpid(struct vcpu_vmx *vmx)
3739 spin_unlock(&vmx_vpid_lock); 3764 spin_unlock(&vmx_vpid_lock);
3740} 3765}
3741 3766
3742static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) 3767#define MSR_TYPE_R 1
3768#define MSR_TYPE_W 2
3769static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
3770 u32 msr, int type)
3743{ 3771{
3744 int f = sizeof(unsigned long); 3772 int f = sizeof(unsigned long);
3745 3773
@@ -3752,20 +3780,93 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
3752 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 3780 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3753 */ 3781 */
3754 if (msr <= 0x1fff) { 3782 if (msr <= 0x1fff) {
3755 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ 3783 if (type & MSR_TYPE_R)
3756 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ 3784 /* read-low */
3785 __clear_bit(msr, msr_bitmap + 0x000 / f);
3786
3787 if (type & MSR_TYPE_W)
3788 /* write-low */
3789 __clear_bit(msr, msr_bitmap + 0x800 / f);
3790
3757 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 3791 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3758 msr &= 0x1fff; 3792 msr &= 0x1fff;
3759 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ 3793 if (type & MSR_TYPE_R)
3760 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ 3794 /* read-high */
3795 __clear_bit(msr, msr_bitmap + 0x400 / f);
3796
3797 if (type & MSR_TYPE_W)
3798 /* write-high */
3799 __clear_bit(msr, msr_bitmap + 0xc00 / f);
3800
3801 }
3802}
3803
3804static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
3805 u32 msr, int type)
3806{
3807 int f = sizeof(unsigned long);
3808
3809 if (!cpu_has_vmx_msr_bitmap())
3810 return;
3811
3812 /*
3813 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
3814 * have the write-low and read-high bitmap offsets the wrong way round.
3815 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3816 */
3817 if (msr <= 0x1fff) {
3818 if (type & MSR_TYPE_R)
3819 /* read-low */
3820 __set_bit(msr, msr_bitmap + 0x000 / f);
3821
3822 if (type & MSR_TYPE_W)
3823 /* write-low */
3824 __set_bit(msr, msr_bitmap + 0x800 / f);
3825
3826 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3827 msr &= 0x1fff;
3828 if (type & MSR_TYPE_R)
3829 /* read-high */
3830 __set_bit(msr, msr_bitmap + 0x400 / f);
3831
3832 if (type & MSR_TYPE_W)
3833 /* write-high */
3834 __set_bit(msr, msr_bitmap + 0xc00 / f);
3835
3761 } 3836 }
3762} 3837}
3763 3838
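[Editor's note] The enable/disable helpers above encode the VMX MSR-bitmap layout described in the comment: four 1KB regions at offsets 0x000 (read-low), 0x400 (read-high), 0x800 (write-low) and 0xc00 (write-high), one bit per MSR. A small illustrative sketch of where a given MSR's intercept bit lives:

	/* Illustration only: locate the intercept bit for an MSR in the bitmap. */
	static void msr_bitmap_offsets(u32 msr, bool write,
				       unsigned long *region, u32 *bit)
	{
		if (msr <= 0x1fff) {			/* 0x00000000 - 0x00001fff */
			*region = write ? 0x800 : 0x000;
			*bit = msr;
		} else {				/* 0xc0000000 - 0xc0001fff */
			*region = write ? 0xc00 : 0x400;
			*bit = msr & 0x1fff;
		}
	}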
3764static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) 3839static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
3765{ 3840{
3766 if (!longmode_only) 3841 if (!longmode_only)
3767 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); 3842 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
3768 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); 3843 msr, MSR_TYPE_R | MSR_TYPE_W);
3844 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
3845 msr, MSR_TYPE_R | MSR_TYPE_W);
3846}
3847
3848static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
3849{
3850 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
3851 msr, MSR_TYPE_R);
3852 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
3853 msr, MSR_TYPE_R);
3854}
3855
3856static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
3857{
3858 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
3859 msr, MSR_TYPE_R);
3860 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
3861 msr, MSR_TYPE_R);
3862}
3863
3864static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
3865{
3866 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
3867 msr, MSR_TYPE_W);
3868 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
3869 msr, MSR_TYPE_W);
3769} 3870}
3770 3871
3771/* 3872/*
@@ -3844,6 +3945,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
3844 return exec_control; 3945 return exec_control;
3845} 3946}
3846 3947
3948static int vmx_vm_has_apicv(struct kvm *kvm)
3949{
3950 return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
3951}
3952
3847static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 3953static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3848{ 3954{
3849 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 3955 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
@@ -3861,6 +3967,10 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3861 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 3967 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
3862 if (!ple_gap) 3968 if (!ple_gap)
3863 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 3969 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
3970 if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
3971 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
3972 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3973 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
3864 return exec_control; 3974 return exec_control;
3865} 3975}
3866 3976
@@ -3905,6 +4015,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3905 vmx_secondary_exec_control(vmx)); 4015 vmx_secondary_exec_control(vmx));
3906 } 4016 }
3907 4017
4018 if (enable_apicv_reg_vid) {
4019 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4020 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4021 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4022 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4023
4024 vmcs_write16(GUEST_INTR_STATUS, 0);
4025 }
4026
3908 if (ple_gap) { 4027 if (ple_gap) {
3909 vmcs_write32(PLE_GAP, ple_gap); 4028 vmcs_write32(PLE_GAP, ple_gap);
3910 vmcs_write32(PLE_WINDOW, ple_window); 4029 vmcs_write32(PLE_WINDOW, ple_window);
@@ -3990,14 +4109,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3990 vmx_segment_cache_clear(vmx); 4109 vmx_segment_cache_clear(vmx);
3991 4110
3992 seg_setup(VCPU_SREG_CS); 4111 seg_setup(VCPU_SREG_CS);
3993 /* 4112 if (kvm_vcpu_is_bsp(&vmx->vcpu))
3994 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
3995 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
3996 */
3997 if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
3998 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4113 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
3999 vmcs_writel(GUEST_CS_BASE, 0x000f0000); 4114 else {
4000 } else {
4001 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); 4115 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
4002 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); 4116 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
4003 } 4117 }
@@ -4073,9 +4187,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4073 4187
4074 ret = 0; 4188 ret = 0;
4075 4189
4076 /* HACK: Don't enable emulation on guest boot/reset */
4077 vmx->emulation_required = 0;
4078
4079 return ret; 4190 return ret;
4080} 4191}
4081 4192
@@ -4251,7 +4362,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4251 .flags = 0, 4362 .flags = 0,
4252 }; 4363 };
4253 4364
4254 ret = kvm_set_memory_region(kvm, &tss_mem, 0); 4365 ret = kvm_set_memory_region(kvm, &tss_mem, false);
4255 if (ret) 4366 if (ret)
4256 return ret; 4367 return ret;
4257 kvm->arch.tss_addr = addr; 4368 kvm->arch.tss_addr = addr;
@@ -4261,28 +4372,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4261 return 0; 4372 return 0;
4262} 4373}
4263 4374
4264static int handle_rmode_exception(struct kvm_vcpu *vcpu, 4375static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
4265 int vec, u32 err_code)
4266{ 4376{
4267 /*
4268 * Instruction with address size override prefix opcode 0x67
4269 * Cause the #SS fault with 0 error code in VM86 mode.
4270 */
4271 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
4272 if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
4273 return 1;
4274 /*
4275 * Forward all other exceptions that are valid in real mode.
4276 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4277 * the required debugging infrastructure rework.
4278 */
4279 switch (vec) { 4377 switch (vec) {
4280 case DB_VECTOR:
4281 if (vcpu->guest_debug &
4282 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
4283 return 0;
4284 kvm_queue_exception(vcpu, vec);
4285 return 1;
4286 case BP_VECTOR: 4378 case BP_VECTOR:
4287 /* 4379 /*
4288 * Update instruction length as we may reinject the exception 4380 * Update instruction length as we may reinject the exception
@@ -4291,7 +4383,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4291 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 4383 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
4292 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4384 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4293 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 4385 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
4294 return 0; 4386 return false;
4387 /* fall through */
4388 case DB_VECTOR:
4389 if (vcpu->guest_debug &
4390 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
4391 return false;
4295 /* fall through */ 4392 /* fall through */
4296 case DE_VECTOR: 4393 case DE_VECTOR:
4297 case OF_VECTOR: 4394 case OF_VECTOR:
@@ -4301,10 +4398,37 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4301 case SS_VECTOR: 4398 case SS_VECTOR:
4302 case GP_VECTOR: 4399 case GP_VECTOR:
4303 case MF_VECTOR: 4400 case MF_VECTOR:
4304 kvm_queue_exception(vcpu, vec); 4401 return true;
4305 return 1; 4402 break;
4306 } 4403 }
4307 return 0; 4404 return false;
4405}
4406
4407static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4408 int vec, u32 err_code)
4409{
4410 /*
4411 * An instruction with the address size override prefix, opcode 0x67,
4412 * causes a #SS fault with error code 0 in VM86 mode.
4413 */
4414 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
4415 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
4416 if (vcpu->arch.halt_request) {
4417 vcpu->arch.halt_request = 0;
4418 return kvm_emulate_halt(vcpu);
4419 }
4420 return 1;
4421 }
4422 return 0;
4423 }
4424
4425 /*
4426 * Forward all other exceptions that are valid in real mode.
4427 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4428 * the required debugging infrastructure rework.
4429 */
4430 kvm_queue_exception(vcpu, vec);
4431 return 1;
4308} 4432}
4309 4433
4310/* 4434/*
@@ -4392,17 +4516,11 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4392 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); 4516 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
4393 } 4517 }
4394 4518
4395 if (vmx->rmode.vm86_active &&
4396 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
4397 error_code)) {
4398 if (vcpu->arch.halt_request) {
4399 vcpu->arch.halt_request = 0;
4400 return kvm_emulate_halt(vcpu);
4401 }
4402 return 1;
4403 }
4404
4405 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 4519 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
4520
4521 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
4522 return handle_rmode_exception(vcpu, ex_no, error_code);
4523
4406 switch (ex_no) { 4524 switch (ex_no) {
4407 case DB_VECTOR: 4525 case DB_VECTOR:
4408 dr6 = vmcs_readl(EXIT_QUALIFICATION); 4526 dr6 = vmcs_readl(EXIT_QUALIFICATION);
@@ -4820,6 +4938,26 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
4820 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 4938 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
4821} 4939}
4822 4940
4941static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
4942{
4943 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4944 int vector = exit_qualification & 0xff;
4945
4946 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
4947 kvm_apic_set_eoi_accelerated(vcpu, vector);
4948 return 1;
4949}
4950
4951static int handle_apic_write(struct kvm_vcpu *vcpu)
4952{
4953 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4954 u32 offset = exit_qualification & 0xfff;
4955
4956 /* APIC-write VM exit is trap-like and thus no need to adjust IP */
4957 kvm_apic_write_nodecode(vcpu, offset);
4958 return 1;
4959}
4960
4823static int handle_task_switch(struct kvm_vcpu *vcpu) 4961static int handle_task_switch(struct kvm_vcpu *vcpu)
4824{ 4962{
4825 struct vcpu_vmx *vmx = to_vmx(vcpu); 4963 struct vcpu_vmx *vmx = to_vmx(vcpu);
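
Both new handlers decode the exit qualification directly: an EOI-induced exit carries the serviced vector in bits 7:0, an APIC-write exit carries the accessed APIC-page offset in bits 11:0, and both are trap-like, so the instruction pointer needs no adjustment. A standalone sketch of that decoding:

#include <stdint.h>
#include <stdio.h>

/* EOI-induced exit: vector in bits 7:0 of the exit qualification. */
static unsigned int eoi_exit_vector(uint64_t exit_qualification)
{
    return exit_qualification & 0xff;
}

/* APIC-write exit: APIC-page offset in bits 11:0. */
static unsigned int apic_write_offset(uint64_t exit_qualification)
{
    return exit_qualification & 0xfff;
}

int main(void)
{
    uint64_t eoi_q = 0x31;  /* EOI-induced exit for vector 0x31 */
    uint64_t wr_q  = 0xb0;  /* APIC-write exit touching the EOI register */

    printf("EOI vector: 0x%x\n", eoi_exit_vector(eoi_q));
    printf("APIC write offset: 0x%x\n", apic_write_offset(wr_q));
    return 0;
}
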
@@ -5065,7 +5203,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5065 schedule(); 5203 schedule();
5066 } 5204 }
5067 5205
5068 vmx->emulation_required = !guest_state_valid(vcpu); 5206 vmx->emulation_required = emulation_required(vcpu);
5069out: 5207out:
5070 return ret; 5208 return ret;
5071} 5209}
@@ -5754,6 +5892,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5754 [EXIT_REASON_VMON] = handle_vmon, 5892 [EXIT_REASON_VMON] = handle_vmon,
5755 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 5893 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
5756 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 5894 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
5895 [EXIT_REASON_APIC_WRITE] = handle_apic_write,
5896 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
5757 [EXIT_REASON_WBINVD] = handle_wbinvd, 5897 [EXIT_REASON_WBINVD] = handle_wbinvd,
5758 [EXIT_REASON_XSETBV] = handle_xsetbv, 5898 [EXIT_REASON_XSETBV] = handle_xsetbv,
5759 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 5899 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
@@ -5780,7 +5920,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5780 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; 5920 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
5781 gpa_t bitmap; 5921 gpa_t bitmap;
5782 5922
5783 if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS)) 5923 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5784 return 1; 5924 return 1;
5785 5925
5786 /* 5926 /*
@@ -6008,7 +6148,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
6008 u32 vectoring_info = vmx->idt_vectoring_info; 6148 u32 vectoring_info = vmx->idt_vectoring_info;
6009 6149
6010 /* If guest state is invalid, start emulating */ 6150 /* If guest state is invalid, start emulating */
6011 if (vmx->emulation_required && emulate_invalid_guest_state) 6151 if (vmx->emulation_required)
6012 return handle_invalid_guest_state(vcpu); 6152 return handle_invalid_guest_state(vcpu);
6013 6153
6014 /* 6154 /*
@@ -6103,6 +6243,85 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6103 vmcs_write32(TPR_THRESHOLD, irr); 6243 vmcs_write32(TPR_THRESHOLD, irr);
6104} 6244}
6105 6245
6246static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
6247{
6248 u32 sec_exec_control;
6249
6250 /*
6251	 * There is no point in enabling virtualized x2APIC mode without
6252	 * APICv enabled.
6253 */
6254 if (!cpu_has_vmx_virtualize_x2apic_mode() ||
6255 !vmx_vm_has_apicv(vcpu->kvm))
6256 return;
6257
6258 if (!vm_need_tpr_shadow(vcpu->kvm))
6259 return;
6260
6261 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6262
6263 if (set) {
6264 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6265 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6266 } else {
6267 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6268 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6269 }
6270 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
6271
6272 vmx_set_msr_bitmap(vcpu);
6273}
6274
6275static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
6276{
6277 u16 status;
6278 u8 old;
6279
6280 if (!vmx_vm_has_apicv(kvm))
6281 return;
6282
6283 if (isr == -1)
6284 isr = 0;
6285
6286 status = vmcs_read16(GUEST_INTR_STATUS);
6287 old = status >> 8;
6288 if (isr != old) {
6289 status &= 0xff;
6290 status |= isr << 8;
6291 vmcs_write16(GUEST_INTR_STATUS, status);
6292 }
6293}
6294
6295static void vmx_set_rvi(int vector)
6296{
6297 u16 status;
6298 u8 old;
6299
6300 status = vmcs_read16(GUEST_INTR_STATUS);
6301 old = (u8)status & 0xff;
6302 if ((u8)vector != old) {
6303 status &= ~0xff;
6304 status |= (u8)vector;
6305 vmcs_write16(GUEST_INTR_STATUS, status);
6306 }
6307}
6308
6309static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6310{
6311 if (max_irr == -1)
6312 return;
6313
6314 vmx_set_rvi(max_irr);
6315}
6316
6317static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6318{
6319 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6320 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6321 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6322 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6323}
6324
6106static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) 6325static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
6107{ 6326{
6108 u32 exit_intr_info; 6327 u32 exit_intr_info;
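
vmx_set_rvi() and vmx_hwapic_isr_update() both edit the 16-bit guest interrupt status field, which packs the requesting virtual interrupt (RVI) into the low byte and the servicing virtual interrupt (SVI) into the high byte. A self-contained sketch of that packing, with a plain variable standing in for the vmcs_read16()/vmcs_write16() accessors:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the VMCS GUEST_INTR_STATUS field: SVI << 8 | RVI. */
static uint16_t guest_intr_status;

static void set_rvi(int vector)
{
    uint8_t old = guest_intr_status & 0xff;

    if ((uint8_t)vector != old) {
        guest_intr_status &= ~0xff;
        guest_intr_status |= (uint8_t)vector;
    }
}

static void set_svi(int isr)
{
    uint8_t old = guest_intr_status >> 8;

    if (isr == -1)
        isr = 0;                        /* no interrupt in service */
    if ((uint8_t)isr != old) {
        guest_intr_status &= 0xff;
        guest_intr_status |= (uint16_t)isr << 8;
    }
}

int main(void)
{
    set_rvi(0x31);
    set_svi(0x41);
    printf("GUEST_INTR_STATUS = 0x%04x\n", guest_intr_status); /* 0x4131 */
    return 0;
}
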
@@ -6291,7 +6510,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6291 6510
6292 /* Don't enter VMX if guest state is invalid, let the exit handler 6511 /* Don't enter VMX if guest state is invalid, let the exit handler
6293 start emulation until we arrive back to a valid state */ 6512 start emulation until we arrive back to a valid state */
6294 if (vmx->emulation_required && emulate_invalid_guest_state) 6513 if (vmx->emulation_required)
6295 return; 6514 return;
6296 6515
6297 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 6516 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
@@ -7366,6 +7585,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
7366 .enable_nmi_window = enable_nmi_window, 7585 .enable_nmi_window = enable_nmi_window,
7367 .enable_irq_window = enable_irq_window, 7586 .enable_irq_window = enable_irq_window,
7368 .update_cr8_intercept = update_cr8_intercept, 7587 .update_cr8_intercept = update_cr8_intercept,
7588 .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
7589 .vm_has_apicv = vmx_vm_has_apicv,
7590 .load_eoi_exitmap = vmx_load_eoi_exitmap,
7591 .hwapic_irr_update = vmx_hwapic_irr_update,
7592 .hwapic_isr_update = vmx_hwapic_isr_update,
7369 7593
7370 .set_tss_addr = vmx_set_tss_addr, 7594 .set_tss_addr = vmx_set_tss_addr,
7371 .get_tdp_level = get_ept_level, 7595 .get_tdp_level = get_ept_level,
@@ -7398,7 +7622,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7398 7622
7399static int __init vmx_init(void) 7623static int __init vmx_init(void)
7400{ 7624{
7401 int r, i; 7625 int r, i, msr;
7402 7626
7403 rdmsrl_safe(MSR_EFER, &host_efer); 7627 rdmsrl_safe(MSR_EFER, &host_efer);
7404 7628
@@ -7419,11 +7643,19 @@ static int __init vmx_init(void)
7419 if (!vmx_msr_bitmap_legacy) 7643 if (!vmx_msr_bitmap_legacy)
7420 goto out1; 7644 goto out1;
7421 7645
7646 vmx_msr_bitmap_legacy_x2apic =
7647 (unsigned long *)__get_free_page(GFP_KERNEL);
7648 if (!vmx_msr_bitmap_legacy_x2apic)
7649 goto out2;
7422 7650
7423 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 7651 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
7424 if (!vmx_msr_bitmap_longmode) 7652 if (!vmx_msr_bitmap_longmode)
7425 goto out2; 7653 goto out3;
7426 7654
7655 vmx_msr_bitmap_longmode_x2apic =
7656 (unsigned long *)__get_free_page(GFP_KERNEL);
7657 if (!vmx_msr_bitmap_longmode_x2apic)
7658 goto out4;
7427 7659
7428 /* 7660 /*
7429 * Allow direct access to the PC debug port (it is often used for I/O 7661 * Allow direct access to the PC debug port (it is often used for I/O
@@ -7455,6 +7687,28 @@ static int __init vmx_init(void)
7455 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); 7687 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
7456 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); 7688 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
7457 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 7689 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
7690 memcpy(vmx_msr_bitmap_legacy_x2apic,
7691 vmx_msr_bitmap_legacy, PAGE_SIZE);
7692 memcpy(vmx_msr_bitmap_longmode_x2apic,
7693 vmx_msr_bitmap_longmode, PAGE_SIZE);
7694
7695 if (enable_apicv_reg_vid) {
7696 for (msr = 0x800; msr <= 0x8ff; msr++)
7697 vmx_disable_intercept_msr_read_x2apic(msr);
7698
7699		/* According to the SDM, in x2APIC mode the whole ID register
7700		 * is used, but KVM only uses the highest eight bits, so reads
7701		 * of it still need to be intercepted. */
7702 vmx_enable_intercept_msr_read_x2apic(0x802);
7703 /* TMCCT */
7704 vmx_enable_intercept_msr_read_x2apic(0x839);
7705 /* TPR */
7706 vmx_disable_intercept_msr_write_x2apic(0x808);
7707 /* EOI */
7708 vmx_disable_intercept_msr_write_x2apic(0x80b);
7709 /* SELF-IPI */
7710 vmx_disable_intercept_msr_write_x2apic(0x83f);
7711 }
7458 7712
7459 if (enable_ept) { 7713 if (enable_ept) {
7460 kvm_mmu_set_mask_ptes(0ull, 7714 kvm_mmu_set_mask_ptes(0ull,
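
The 0x800-0x8ff loop works because each x2APIC register is exposed at MSR 0x800 plus its xAPIC MMIO offset shifted right by four, which is exactly how the ID (offset 0x20), TPR (0x80), EOI (0xB0), TMCCT (0x390) and SELF-IPI (0x3F0) registers map to the MSR indices handled specially above. A quick sketch of the mapping:

#include <stdio.h>

/* x2APIC MSR index for a given xAPIC MMIO register offset. */
static unsigned int x2apic_msr(unsigned int apic_mmio_offset)
{
    return 0x800 + (apic_mmio_offset >> 4);
}

int main(void)
{
    printf("ID       -> 0x%x\n", x2apic_msr(0x020)); /* 0x802 */
    printf("TPR      -> 0x%x\n", x2apic_msr(0x080)); /* 0x808 */
    printf("EOI      -> 0x%x\n", x2apic_msr(0x0b0)); /* 0x80b */
    printf("TMCCT    -> 0x%x\n", x2apic_msr(0x390)); /* 0x839 */
    printf("SELF-IPI -> 0x%x\n", x2apic_msr(0x3f0)); /* 0x83f */
    return 0;
}
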
@@ -7468,8 +7722,10 @@ static int __init vmx_init(void)
7468 7722
7469 return 0; 7723 return 0;
7470 7724
7471out3: 7725out4:
7472 free_page((unsigned long)vmx_msr_bitmap_longmode); 7726 free_page((unsigned long)vmx_msr_bitmap_longmode);
7727out3:
7728 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
7473out2: 7729out2:
7474 free_page((unsigned long)vmx_msr_bitmap_legacy); 7730 free_page((unsigned long)vmx_msr_bitmap_legacy);
7475out1: 7731out1:
@@ -7481,6 +7737,8 @@ out:
7481 7737
7482static void __exit vmx_exit(void) 7738static void __exit vmx_exit(void)
7483{ 7739{
7740 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
7741 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
7484 free_page((unsigned long)vmx_msr_bitmap_legacy); 7742 free_page((unsigned long)vmx_msr_bitmap_legacy);
7485 free_page((unsigned long)vmx_msr_bitmap_longmode); 7743 free_page((unsigned long)vmx_msr_bitmap_longmode);
7486 free_page((unsigned long)vmx_io_bitmap_b); 7744 free_page((unsigned long)vmx_io_bitmap_b);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 37040079cd6b..f71500af1f81 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -872,8 +872,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
872 872
873 kvm_x86_ops->set_efer(vcpu, efer); 873 kvm_x86_ops->set_efer(vcpu, efer);
874 874
875 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
876
877 /* Update reserved bits */ 875 /* Update reserved bits */
878 if ((efer ^ old_efer) & EFER_NX) 876 if ((efer ^ old_efer) & EFER_NX)
879 kvm_mmu_reset_context(vcpu); 877 kvm_mmu_reset_context(vcpu);
@@ -2522,7 +2520,7 @@ int kvm_dev_ioctl_check_extension(long ext)
2522 r = KVM_MAX_VCPUS; 2520 r = KVM_MAX_VCPUS;
2523 break; 2521 break;
2524 case KVM_CAP_NR_MEMSLOTS: 2522 case KVM_CAP_NR_MEMSLOTS:
2525 r = KVM_MEMORY_SLOTS; 2523 r = KVM_USER_MEM_SLOTS;
2526 break; 2524 break;
2527 case KVM_CAP_PV_MMU: /* obsolete */ 2525 case KVM_CAP_PV_MMU: /* obsolete */
2528 r = 0; 2526 r = 0;
@@ -3274,12 +3272,10 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
3274 return -EINVAL; 3272 return -EINVAL;
3275 3273
3276 mutex_lock(&kvm->slots_lock); 3274 mutex_lock(&kvm->slots_lock);
3277 spin_lock(&kvm->mmu_lock);
3278 3275
3279 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 3276 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
3280 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 3277 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
3281 3278
3282 spin_unlock(&kvm->mmu_lock);
3283 mutex_unlock(&kvm->slots_lock); 3279 mutex_unlock(&kvm->slots_lock);
3284 return 0; 3280 return 0;
3285} 3281}
@@ -3439,7 +3435,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3439 mutex_lock(&kvm->slots_lock); 3435 mutex_lock(&kvm->slots_lock);
3440 3436
3441 r = -EINVAL; 3437 r = -EINVAL;
3442 if (log->slot >= KVM_MEMORY_SLOTS) 3438 if (log->slot >= KVM_USER_MEM_SLOTS)
3443 goto out; 3439 goto out;
3444 3440
3445 memslot = id_to_memslot(kvm->memslots, log->slot); 3441 memslot = id_to_memslot(kvm->memslots, log->slot);
@@ -4495,8 +4491,10 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
4495 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); 4491 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
4496 *selector = var.selector; 4492 *selector = var.selector;
4497 4493
4498 if (var.unusable) 4494 if (var.unusable) {
4495 memset(desc, 0, sizeof(*desc));
4499 return false; 4496 return false;
4497 }
4500 4498
4501 if (var.g) 4499 if (var.g)
4502 var.limit >>= 12; 4500 var.limit >>= 12;
@@ -4757,26 +4755,26 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4757 return r; 4755 return r;
4758} 4756}
4759 4757
4760static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4758static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
4759 bool write_fault_to_shadow_pgtable)
4761{ 4760{
4762 gpa_t gpa; 4761 gpa_t gpa = cr2;
4763 pfn_t pfn; 4762 pfn_t pfn;
4764 4763
4765 if (tdp_enabled) 4764 if (!vcpu->arch.mmu.direct_map) {
4766 return false; 4765 /*
4767 4766 * Write permission should be allowed since only
4768	/* 4767	 * write accesses need to be emulated.
4769 * if emulation was due to access to shadowed page table 4768 */
4770 * and it failed try to unshadow page and re-enter the 4769 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
4771 * guest to let CPU execute the instruction.
4772 */
4773 if (kvm_mmu_unprotect_page_virt(vcpu, gva))
4774 return true;
4775
4776 gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
4777 4770
4778 if (gpa == UNMAPPED_GVA) 4771 /*
4779 return true; /* let cpu generate fault */ 4772 * If the mapping is invalid in guest, let cpu retry
4773 * it to generate fault.
4774 */
4775 if (gpa == UNMAPPED_GVA)
4776 return true;
4777 }
4780 4778
4781 /* 4779 /*
4782 * Do not retry the unhandleable instruction if it faults on the 4780 * Do not retry the unhandleable instruction if it faults on the
@@ -4785,12 +4783,43 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4785 * instruction -> ... 4783 * instruction -> ...
4786 */ 4784 */
4787 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); 4785 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
4788 if (!is_error_noslot_pfn(pfn)) { 4786
4789 kvm_release_pfn_clean(pfn); 4787 /*
4788	 * If the instruction failed on an error pfn, it cannot be fixed;
4789	 * report the error to userspace.
4790 */
4791 if (is_error_noslot_pfn(pfn))
4792 return false;
4793
4794 kvm_release_pfn_clean(pfn);
4795
4796 /* The instructions are well-emulated on direct mmu. */
4797 if (vcpu->arch.mmu.direct_map) {
4798 unsigned int indirect_shadow_pages;
4799
4800 spin_lock(&vcpu->kvm->mmu_lock);
4801 indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
4802 spin_unlock(&vcpu->kvm->mmu_lock);
4803
4804 if (indirect_shadow_pages)
4805 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
4806
4790 return true; 4807 return true;
4791 } 4808 }
4792 4809
4793 return false; 4810 /*
4811	 * If emulation was due to an access to a shadowed page table and it
4812	 * failed, try to unshadow the page and re-enter the guest so the
4813	 * CPU can execute the instruction again.
4814 */
4815 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
4816
4817 /*
4818	 * If the access faults on its own page table, it cannot be fixed
4819	 * by unprotecting the shadow page and should be reported to
4820	 * userspace.
4821 */
4822 return !write_fault_to_shadow_pgtable;
4794} 4823}
4795 4824
4796static bool retry_instruction(struct x86_emulate_ctxt *ctxt, 4825static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
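
The rewritten reexecute_instruction() boils down to a short decision tree: an error pfn means the failure is reported to userspace, a direct-map MMU unprotects any shadowed gfn and retries, and shadow paging retries only when the write did not fault on the guest's own page tables. A compressed model of that flow, with plain booleans standing in for the KVM lookups:

#include <stdbool.h>
#include <stdio.h>

/* true = re-enter the guest and retry, false = report the emulation failure. */
static bool should_reexecute(bool pfn_is_error, bool direct_map,
                             bool write_fault_to_shadow_pgtable)
{
    if (pfn_is_error)
        return false;   /* nothing to fix; report the failure to userspace */

    if (direct_map)
        return true;    /* unprotect any shadowed gfn, then simply retry */

    /*
     * Shadow paging: unprotecting the page normally lets the guest re-run
     * the instruction, unless the write faulted on the guest's own page
     * tables.
     */
    return !write_fault_to_shadow_pgtable;
}

int main(void)
{
    printf("%d\n", should_reexecute(false, false, true));  /* 0 */
    printf("%d\n", should_reexecute(false, true,  false)); /* 1 */
    return 0;
}
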
@@ -4832,7 +4861,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4832 if (!vcpu->arch.mmu.direct_map) 4861 if (!vcpu->arch.mmu.direct_map)
4833 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); 4862 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
4834 4863
4835 kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 4864 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
4836 4865
4837 return true; 4866 return true;
4838} 4867}
@@ -4849,7 +4878,13 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4849 int r; 4878 int r;
4850 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4879 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4851 bool writeback = true; 4880 bool writeback = true;
4881 bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
4852 4882
4883 /*
4884 * Clear write_fault_to_shadow_pgtable here to ensure it is
4885 * never reused.
4886 */
4887 vcpu->arch.write_fault_to_shadow_pgtable = false;
4853 kvm_clear_exception_queue(vcpu); 4888 kvm_clear_exception_queue(vcpu);
4854 4889
4855 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4890 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
@@ -4868,7 +4903,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4868 if (r != EMULATION_OK) { 4903 if (r != EMULATION_OK) {
4869 if (emulation_type & EMULTYPE_TRAP_UD) 4904 if (emulation_type & EMULTYPE_TRAP_UD)
4870 return EMULATE_FAIL; 4905 return EMULATE_FAIL;
4871 if (reexecute_instruction(vcpu, cr2)) 4906 if (reexecute_instruction(vcpu, cr2,
4907 write_fault_to_spt))
4872 return EMULATE_DONE; 4908 return EMULATE_DONE;
4873 if (emulation_type & EMULTYPE_SKIP) 4909 if (emulation_type & EMULTYPE_SKIP)
4874 return EMULATE_FAIL; 4910 return EMULATE_FAIL;
@@ -4898,7 +4934,7 @@ restart:
4898 return EMULATE_DONE; 4934 return EMULATE_DONE;
4899 4935
4900 if (r == EMULATION_FAILED) { 4936 if (r == EMULATION_FAILED) {
4901 if (reexecute_instruction(vcpu, cr2)) 4937 if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
4902 return EMULATE_DONE; 4938 return EMULATE_DONE;
4903 4939
4904 return handle_emulation_failure(vcpu); 4940 return handle_emulation_failure(vcpu);
@@ -5541,7 +5577,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
5541 vcpu->arch.nmi_injected = true; 5577 vcpu->arch.nmi_injected = true;
5542 kvm_x86_ops->set_nmi(vcpu); 5578 kvm_x86_ops->set_nmi(vcpu);
5543 } 5579 }
5544 } else if (kvm_cpu_has_interrupt(vcpu)) { 5580 } else if (kvm_cpu_has_injectable_intr(vcpu)) {
5545 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 5581 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
5546 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 5582 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
5547 false); 5583 false);
@@ -5609,6 +5645,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
5609#endif 5645#endif
5610} 5646}
5611 5647
5648static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
5649{
5650 u64 eoi_exit_bitmap[4];
5651
5652 memset(eoi_exit_bitmap, 0, 32);
5653
5654 kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
5655 kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
5656}
5657
5612static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5658static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5613{ 5659{
5614 int r; 5660 int r;
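
update_eoi_exitmap() fills a 256-bit bitmap (four u64 words, hence the 32-byte memset), one bit per interrupt vector, which load_eoi_exitmap() then writes into the four EOI_EXIT_BITMAPn VMCS fields. A small sketch of indexing such a bitmap:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 256 vectors -> four 64-bit words, as in eoi_exit_bitmap[4]. */
static void set_eoi_exit(uint64_t bitmap[4], unsigned int vector)
{
    bitmap[vector / 64] |= 1ULL << (vector % 64);
}

static int test_eoi_exit(const uint64_t bitmap[4], unsigned int vector)
{
    return (bitmap[vector / 64] >> (vector % 64)) & 1;
}

int main(void)
{
    uint64_t eoi_exit_bitmap[4];

    memset(eoi_exit_bitmap, 0, sizeof(eoi_exit_bitmap));   /* 32 bytes */
    set_eoi_exit(eoi_exit_bitmap, 0x21);                   /* trap EOI for vector 0x21 */
    printf("vector 0x21 trapped: %d\n", test_eoi_exit(eoi_exit_bitmap, 0x21));
    return 0;
}
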
@@ -5662,6 +5708,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5662 kvm_handle_pmu_event(vcpu); 5708 kvm_handle_pmu_event(vcpu);
5663 if (kvm_check_request(KVM_REQ_PMI, vcpu)) 5709 if (kvm_check_request(KVM_REQ_PMI, vcpu))
5664 kvm_deliver_pmi(vcpu); 5710 kvm_deliver_pmi(vcpu);
5711 if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
5712 update_eoi_exitmap(vcpu);
5665 } 5713 }
5666 5714
5667 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5715 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -5670,10 +5718,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5670 /* enable NMI/IRQ window open exits if needed */ 5718 /* enable NMI/IRQ window open exits if needed */
5671 if (vcpu->arch.nmi_pending) 5719 if (vcpu->arch.nmi_pending)
5672 kvm_x86_ops->enable_nmi_window(vcpu); 5720 kvm_x86_ops->enable_nmi_window(vcpu);
5673 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 5721 else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
5674 kvm_x86_ops->enable_irq_window(vcpu); 5722 kvm_x86_ops->enable_irq_window(vcpu);
5675 5723
5676 if (kvm_lapic_enabled(vcpu)) { 5724 if (kvm_lapic_enabled(vcpu)) {
5725 /*
5726 * Update architecture specific hints for APIC
5727 * virtual interrupt delivery.
5728 */
5729 if (kvm_x86_ops->hwapic_irr_update)
5730 kvm_x86_ops->hwapic_irr_update(vcpu,
5731 kvm_lapic_find_highest_irr(vcpu));
5677 update_cr8_intercept(vcpu); 5732 update_cr8_intercept(vcpu);
5678 kvm_lapic_sync_to_vapic(vcpu); 5733 kvm_lapic_sync_to_vapic(vcpu);
5679 } 5734 }
@@ -6853,48 +6908,43 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
6853 struct kvm_memory_slot *memslot, 6908 struct kvm_memory_slot *memslot,
6854 struct kvm_memory_slot old, 6909 struct kvm_memory_slot old,
6855 struct kvm_userspace_memory_region *mem, 6910 struct kvm_userspace_memory_region *mem,
6856 int user_alloc) 6911 bool user_alloc)
6857{ 6912{
6858 int npages = memslot->npages; 6913 int npages = memslot->npages;
6859 int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
6860 6914
6861 /* Prevent internal slot pages from being moved by fork()/COW. */ 6915 /*
6862 if (memslot->id >= KVM_MEMORY_SLOTS) 6916 * Only private memory slots need to be mapped here since
6863 map_flags = MAP_SHARED | MAP_ANONYMOUS; 6917 * KVM_SET_MEMORY_REGION ioctl is no longer supported.
6864
6865 /*To keep backward compatibility with older userspace,
6866 *x86 needs to handle !user_alloc case.
6867 */ 6918 */
6868 if (!user_alloc) { 6919 if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) {
6869 if (npages && !old.npages) { 6920 unsigned long userspace_addr;
6870 unsigned long userspace_addr;
6871 6921
6872 userspace_addr = vm_mmap(NULL, 0, 6922 /*
6873 npages * PAGE_SIZE, 6923 * MAP_SHARED to prevent internal slot pages from being moved
6874 PROT_READ | PROT_WRITE, 6924 * by fork()/COW.
6875 map_flags, 6925 */
6876 0); 6926 userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE,
6927 PROT_READ | PROT_WRITE,
6928 MAP_SHARED | MAP_ANONYMOUS, 0);
6877 6929
6878 if (IS_ERR((void *)userspace_addr)) 6930 if (IS_ERR((void *)userspace_addr))
6879 return PTR_ERR((void *)userspace_addr); 6931 return PTR_ERR((void *)userspace_addr);
6880 6932
6881 memslot->userspace_addr = userspace_addr; 6933 memslot->userspace_addr = userspace_addr;
6882 }
6883 } 6934 }
6884 6935
6885
6886 return 0; 6936 return 0;
6887} 6937}
6888 6938
6889void kvm_arch_commit_memory_region(struct kvm *kvm, 6939void kvm_arch_commit_memory_region(struct kvm *kvm,
6890 struct kvm_userspace_memory_region *mem, 6940 struct kvm_userspace_memory_region *mem,
6891 struct kvm_memory_slot old, 6941 struct kvm_memory_slot old,
6892 int user_alloc) 6942 bool user_alloc)
6893{ 6943{
6894 6944
6895 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; 6945 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
6896 6946
6897 if (!user_alloc && !old.user_alloc && old.npages && !npages) { 6947 if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) {
6898 int ret; 6948 int ret;
6899 6949
6900 ret = vm_munmap(old.userspace_addr, 6950 ret = vm_munmap(old.userspace_addr,
@@ -6908,11 +6958,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6908 if (!kvm->arch.n_requested_mmu_pages) 6958 if (!kvm->arch.n_requested_mmu_pages)
6909 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 6959 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6910 6960
6911 spin_lock(&kvm->mmu_lock);
6912 if (nr_mmu_pages) 6961 if (nr_mmu_pages)
6913 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6962 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
6914 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6963 /*
6915 spin_unlock(&kvm->mmu_lock); 6964 * Write protect all pages for dirty logging.
6965 * Existing largepage mappings are destroyed here and new ones will
6966 * not be created until the end of the logging.
6967 */
6968 if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
6969 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
6916 /* 6970 /*
6917 * If memory slot is created, or moved, we need to clear all 6971 * If memory slot is created, or moved, we need to clear all
6918 * mmio sptes. 6972 * mmio sptes.
diff --git a/drivers/s390/kvm/Makefile b/drivers/s390/kvm/Makefile
index 0815690ac1e0..241891a57caf 100644
--- a/drivers/s390/kvm/Makefile
+++ b/drivers/s390/kvm/Makefile
@@ -6,4 +6,4 @@
6# it under the terms of the GNU General Public License (version 2 only) 6# it under the terms of the GNU General Public License (version 2 only)
7# as published by the Free Software Foundation. 7# as published by the Free Software Foundation.
8 8
9obj-$(CONFIG_S390_GUEST) += kvm_virtio.o 9obj-$(CONFIG_S390_GUEST) += kvm_virtio.o virtio_ccw.o
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
index 8491111aec12..03a15e016778 100644
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -422,6 +422,26 @@ static void kvm_extint_handler(struct ext_code ext_code,
422} 422}
423 423
424/* 424/*
425 * For s390-virtio, we expect a page above main storage containing
426 * the virtio configuration. Try to actually load from this area
427 * in order to figure out if the host provides this page.
428 */
429static int __init test_devices_support(unsigned long addr)
430{
431 int ret = -EIO;
432
433 asm volatile(
434 "0: lura 0,%1\n"
435 "1: xgr %0,%0\n"
436 "2:\n"
437 EX_TABLE(0b,2b)
438 EX_TABLE(1b,2b)
439 : "+d" (ret)
440 : "a" (addr)
441 : "0", "cc");
442 return ret;
443}
444/*
425 * Init function for virtio 445 * Init function for virtio
426 * devices are in a single page above top of "normal" mem 446 * devices are in a single page above top of "normal" mem
427 */ 447 */
@@ -432,21 +452,23 @@ static int __init kvm_devices_init(void)
432 if (!MACHINE_IS_KVM) 452 if (!MACHINE_IS_KVM)
433 return -ENODEV; 453 return -ENODEV;
434 454
455 if (test_devices_support(real_memory_size) < 0)
456 return -ENODEV;
457
458 rc = vmem_add_mapping(real_memory_size, PAGE_SIZE);
459 if (rc)
460 return rc;
461
462 kvm_devices = (void *) real_memory_size;
463
435 kvm_root = root_device_register("kvm_s390"); 464 kvm_root = root_device_register("kvm_s390");
436 if (IS_ERR(kvm_root)) { 465 if (IS_ERR(kvm_root)) {
437 rc = PTR_ERR(kvm_root); 466 rc = PTR_ERR(kvm_root);
438 printk(KERN_ERR "Could not register kvm_s390 root device"); 467 printk(KERN_ERR "Could not register kvm_s390 root device");
468 vmem_remove_mapping(real_memory_size, PAGE_SIZE);
439 return rc; 469 return rc;
440 } 470 }
441 471
442 rc = vmem_add_mapping(real_memory_size, PAGE_SIZE);
443 if (rc) {
444 root_device_unregister(kvm_root);
445 return rc;
446 }
447
448 kvm_devices = (void *) real_memory_size;
449
450 INIT_WORK(&hotplug_work, hotplug_devices); 472 INIT_WORK(&hotplug_work, hotplug_devices);
451 473
452 service_subclass_irq_register(); 474 service_subclass_irq_register();
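
The reordering in kvm_devices_init() probes for the config page and maps it before registering the root device, so the registration failure path now has to undo the mapping instead of the other way around. A hedged userspace sketch of that acquire-in-order, release-in-reverse shape (all helpers below are toy stand-ins):

#include <stdio.h>

static int probe_config_page(void)    { return 0; }
static int map_config_page(void)      { return 0; }
static void unmap_config_page(void)   { }
static int register_root_device(void) { return -1; /* simulate a failure */ }

static int devices_init(void)
{
    int rc;

    if (probe_config_page() < 0)
        return -1;                      /* host does not provide the page */

    rc = map_config_page();
    if (rc)
        return rc;

    rc = register_root_device();
    if (rc) {
        unmap_config_page();            /* undo the earlier mapping */
        return rc;
    }
    return 0;
}

int main(void)
{
    printf("init: %d\n", devices_init());
    return 0;
}
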
diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c
new file mode 100644
index 000000000000..2029b6caa595
--- /dev/null
+++ b/drivers/s390/kvm/virtio_ccw.c
@@ -0,0 +1,926 @@
1/*
2 * ccw based virtio transport
3 *
4 * Copyright IBM Corp. 2012
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2 only)
8 * as published by the Free Software Foundation.
9 *
10 * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
11 */
12
13#include <linux/kernel_stat.h>
14#include <linux/init.h>
15#include <linux/bootmem.h>
16#include <linux/err.h>
17#include <linux/virtio.h>
18#include <linux/virtio_config.h>
19#include <linux/slab.h>
20#include <linux/interrupt.h>
21#include <linux/virtio_ring.h>
22#include <linux/pfn.h>
23#include <linux/async.h>
24#include <linux/wait.h>
25#include <linux/list.h>
26#include <linux/bitops.h>
27#include <linux/module.h>
28#include <linux/io.h>
29#include <linux/kvm_para.h>
30#include <asm/setup.h>
31#include <asm/irq.h>
32#include <asm/cio.h>
33#include <asm/ccwdev.h>
34
35/*
36 * virtio related functions
37 */
38
39struct vq_config_block {
40 __u16 index;
41 __u16 num;
42} __packed;
43
44#define VIRTIO_CCW_CONFIG_SIZE 0x100
45/* same as PCI config space size, should be enough for all drivers */
46
47struct virtio_ccw_device {
48 struct virtio_device vdev;
49 __u8 *status;
50 __u8 config[VIRTIO_CCW_CONFIG_SIZE];
51 struct ccw_device *cdev;
52 __u32 curr_io;
53 int err;
54 wait_queue_head_t wait_q;
55 spinlock_t lock;
56 struct list_head virtqueues;
57 unsigned long indicators;
58 unsigned long indicators2;
59 struct vq_config_block *config_block;
60};
61
62struct vq_info_block {
63 __u64 queue;
64 __u32 align;
65 __u16 index;
66 __u16 num;
67} __packed;
68
69struct virtio_feature_desc {
70 __u32 features;
71 __u8 index;
72} __packed;
73
74struct virtio_ccw_vq_info {
75 struct virtqueue *vq;
76 int num;
77 void *queue;
78 struct vq_info_block *info_block;
79 struct list_head node;
80};
81
82#define KVM_VIRTIO_CCW_RING_ALIGN 4096
83
84#define KVM_S390_VIRTIO_CCW_NOTIFY 3
85
86#define CCW_CMD_SET_VQ 0x13
87#define CCW_CMD_VDEV_RESET 0x33
88#define CCW_CMD_SET_IND 0x43
89#define CCW_CMD_SET_CONF_IND 0x53
90#define CCW_CMD_READ_FEAT 0x12
91#define CCW_CMD_WRITE_FEAT 0x11
92#define CCW_CMD_READ_CONF 0x22
93#define CCW_CMD_WRITE_CONF 0x21
94#define CCW_CMD_WRITE_STATUS 0x31
95#define CCW_CMD_READ_VQ_CONF 0x32
96
97#define VIRTIO_CCW_DOING_SET_VQ 0x00010000
98#define VIRTIO_CCW_DOING_RESET 0x00040000
99#define VIRTIO_CCW_DOING_READ_FEAT 0x00080000
100#define VIRTIO_CCW_DOING_WRITE_FEAT 0x00100000
101#define VIRTIO_CCW_DOING_READ_CONFIG 0x00200000
102#define VIRTIO_CCW_DOING_WRITE_CONFIG 0x00400000
103#define VIRTIO_CCW_DOING_WRITE_STATUS 0x00800000
104#define VIRTIO_CCW_DOING_SET_IND 0x01000000
105#define VIRTIO_CCW_DOING_READ_VQ_CONF 0x02000000
106#define VIRTIO_CCW_DOING_SET_CONF_IND 0x04000000
107#define VIRTIO_CCW_INTPARM_MASK 0xffff0000
108
109static struct virtio_ccw_device *to_vc_device(struct virtio_device *vdev)
110{
111 return container_of(vdev, struct virtio_ccw_device, vdev);
112}
113
114static int doing_io(struct virtio_ccw_device *vcdev, __u32 flag)
115{
116 unsigned long flags;
117 __u32 ret;
118
119 spin_lock_irqsave(get_ccwdev_lock(vcdev->cdev), flags);
120 if (vcdev->err)
121 ret = 0;
122 else
123 ret = vcdev->curr_io & flag;
124 spin_unlock_irqrestore(get_ccwdev_lock(vcdev->cdev), flags);
125 return ret;
126}
127
128static int ccw_io_helper(struct virtio_ccw_device *vcdev,
129 struct ccw1 *ccw, __u32 intparm)
130{
131 int ret;
132 unsigned long flags;
133 int flag = intparm & VIRTIO_CCW_INTPARM_MASK;
134
135 do {
136 spin_lock_irqsave(get_ccwdev_lock(vcdev->cdev), flags);
137 ret = ccw_device_start(vcdev->cdev, ccw, intparm, 0, 0);
138 if (!ret)
139 vcdev->curr_io |= flag;
140 spin_unlock_irqrestore(get_ccwdev_lock(vcdev->cdev), flags);
141 cpu_relax();
142 } while (ret == -EBUSY);
143 wait_event(vcdev->wait_q, doing_io(vcdev, flag) == 0);
144 return ret ? ret : vcdev->err;
145}
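
ccw_io_helper() starts the channel program, records the operation as in flight in curr_io under the ccw device lock, and sleeps in wait_event() until virtio_ccw_int_handler() clears the flag and wakes the waiter. The same start/flag/wait/complete shape, sketched in userspace with a mutex and condition variable (names are illustrative, not the kernel API):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static unsigned int curr_io;            /* bit set while an op is in flight */

#define DOING_READ_VQ_CONF 0x02000000u

/* Models the interrupt handler: clear the flag and wake the waiter. */
static void *irq_handler(void *arg)
{
    usleep(1000);                       /* pretend the channel I/O takes time */
    pthread_mutex_lock(&lock);
    curr_io &= ~DOING_READ_VQ_CONF;
    pthread_cond_broadcast(&done);
    pthread_mutex_unlock(&lock);
    return arg;
}

/* Models ccw_io_helper(): mark the op in flight, then wait for completion. */
static void io_helper(unsigned int flag)
{
    pthread_t irq;

    pthread_mutex_lock(&lock);
    curr_io |= flag;                    /* "ccw_device_start() succeeded" */
    pthread_mutex_unlock(&lock);

    pthread_create(&irq, NULL, irq_handler, NULL);

    pthread_mutex_lock(&lock);
    while (curr_io & flag)              /* wait_event(wait_q, !doing_io()) */
        pthread_cond_wait(&done, &lock);
    pthread_mutex_unlock(&lock);
    pthread_join(irq, NULL);
}

int main(void)
{
    io_helper(DOING_READ_VQ_CONF);
    printf("I/O complete\n");
    return 0;
}
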
146
147static inline long do_kvm_notify(struct subchannel_id schid,
148 unsigned long queue_index)
149{
150 register unsigned long __nr asm("1") = KVM_S390_VIRTIO_CCW_NOTIFY;
151 register struct subchannel_id __schid asm("2") = schid;
152 register unsigned long __index asm("3") = queue_index;
153 register long __rc asm("2");
154
155 asm volatile ("diag 2,4,0x500\n"
156 : "=d" (__rc) : "d" (__nr), "d" (__schid), "d" (__index)
157 : "memory", "cc");
158 return __rc;
159}
160
161static void virtio_ccw_kvm_notify(struct virtqueue *vq)
162{
163 struct virtio_ccw_vq_info *info = vq->priv;
164 struct virtio_ccw_device *vcdev;
165 struct subchannel_id schid;
166
167 vcdev = to_vc_device(info->vq->vdev);
168 ccw_device_get_schid(vcdev->cdev, &schid);
169 do_kvm_notify(schid, virtqueue_get_queue_index(vq));
170}
171
172static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev,
173 struct ccw1 *ccw, int index)
174{
175 vcdev->config_block->index = index;
176 ccw->cmd_code = CCW_CMD_READ_VQ_CONF;
177 ccw->flags = 0;
178 ccw->count = sizeof(struct vq_config_block);
179 ccw->cda = (__u32)(unsigned long)(vcdev->config_block);
180 ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_READ_VQ_CONF);
181 return vcdev->config_block->num;
182}
183
184static void virtio_ccw_del_vq(struct virtqueue *vq, struct ccw1 *ccw)
185{
186 struct virtio_ccw_device *vcdev = to_vc_device(vq->vdev);
187 struct virtio_ccw_vq_info *info = vq->priv;
188 unsigned long flags;
189 unsigned long size;
190 int ret;
191 unsigned int index = virtqueue_get_queue_index(vq);
192
193 /* Remove from our list. */
194 spin_lock_irqsave(&vcdev->lock, flags);
195 list_del(&info->node);
196 spin_unlock_irqrestore(&vcdev->lock, flags);
197
198 /* Release from host. */
199 info->info_block->queue = 0;
200 info->info_block->align = 0;
201 info->info_block->index = index;
202 info->info_block->num = 0;
203 ccw->cmd_code = CCW_CMD_SET_VQ;
204 ccw->flags = 0;
205 ccw->count = sizeof(*info->info_block);
206 ccw->cda = (__u32)(unsigned long)(info->info_block);
207 ret = ccw_io_helper(vcdev, ccw,
208 VIRTIO_CCW_DOING_SET_VQ | index);
209 /*
210 * -ENODEV isn't considered an error: The device is gone anyway.
211 * This may happen on device detach.
212 */
213 if (ret && (ret != -ENODEV))
214 dev_warn(&vq->vdev->dev, "Error %d while deleting queue %d",
215 ret, index);
216
217 vring_del_virtqueue(vq);
218 size = PAGE_ALIGN(vring_size(info->num, KVM_VIRTIO_CCW_RING_ALIGN));
219 free_pages_exact(info->queue, size);
220 kfree(info->info_block);
221 kfree(info);
222}
223
224static void virtio_ccw_del_vqs(struct virtio_device *vdev)
225{
226 struct virtqueue *vq, *n;
227 struct ccw1 *ccw;
228
229 ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
230 if (!ccw)
231 return;
232
233
234 list_for_each_entry_safe(vq, n, &vdev->vqs, list)
235 virtio_ccw_del_vq(vq, ccw);
236
237 kfree(ccw);
238}
239
240static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
241 int i, vq_callback_t *callback,
242 const char *name,
243 struct ccw1 *ccw)
244{
245 struct virtio_ccw_device *vcdev = to_vc_device(vdev);
246 int err;
247 struct virtqueue *vq = NULL;
248 struct virtio_ccw_vq_info *info;
249 unsigned long size = 0; /* silence the compiler */
250 unsigned long flags;
251
252 /* Allocate queue. */
253 info = kzalloc(sizeof(struct virtio_ccw_vq_info), GFP_KERNEL);
254 if (!info) {
255 dev_warn(&vcdev->cdev->dev, "no info\n");
256 err = -ENOMEM;
257 goto out_err;
258 }
259 info->info_block = kzalloc(sizeof(*info->info_block),
260 GFP_DMA | GFP_KERNEL);
261 if (!info->info_block) {
262 dev_warn(&vcdev->cdev->dev, "no info block\n");
263 err = -ENOMEM;
264 goto out_err;
265 }
266 info->num = virtio_ccw_read_vq_conf(vcdev, ccw, i);
267 size = PAGE_ALIGN(vring_size(info->num, KVM_VIRTIO_CCW_RING_ALIGN));
268 info->queue = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
269 if (info->queue == NULL) {
270 dev_warn(&vcdev->cdev->dev, "no queue\n");
271 err = -ENOMEM;
272 goto out_err;
273 }
274
275 vq = vring_new_virtqueue(i, info->num, KVM_VIRTIO_CCW_RING_ALIGN, vdev,
276 true, info->queue, virtio_ccw_kvm_notify,
277 callback, name);
278 if (!vq) {
279 /* For now, we fail if we can't get the requested size. */
280 dev_warn(&vcdev->cdev->dev, "no vq\n");
281 err = -ENOMEM;
282 goto out_err;
283 }
284
285 /* Register it with the host. */
286 info->info_block->queue = (__u64)info->queue;
287 info->info_block->align = KVM_VIRTIO_CCW_RING_ALIGN;
288 info->info_block->index = i;
289 info->info_block->num = info->num;
290 ccw->cmd_code = CCW_CMD_SET_VQ;
291 ccw->flags = 0;
292 ccw->count = sizeof(*info->info_block);
293 ccw->cda = (__u32)(unsigned long)(info->info_block);
294 err = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_VQ | i);
295 if (err) {
296 dev_warn(&vcdev->cdev->dev, "SET_VQ failed\n");
297 goto out_err;
298 }
299
300 info->vq = vq;
301 vq->priv = info;
302
303 /* Save it to our list. */
304 spin_lock_irqsave(&vcdev->lock, flags);
305 list_add(&info->node, &vcdev->virtqueues);
306 spin_unlock_irqrestore(&vcdev->lock, flags);
307
308 return vq;
309
310out_err:
311 if (vq)
312 vring_del_virtqueue(vq);
313 if (info) {
314 if (info->queue)
315 free_pages_exact(info->queue, size);
316 kfree(info->info_block);
317 }
318 kfree(info);
319 return ERR_PTR(err);
320}
321
322static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
323 struct virtqueue *vqs[],
324 vq_callback_t *callbacks[],
325 const char *names[])
326{
327 struct virtio_ccw_device *vcdev = to_vc_device(vdev);
328 unsigned long *indicatorp = NULL;
329 int ret, i;
330 struct ccw1 *ccw;
331
332 ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
333 if (!ccw)
334 return -ENOMEM;
335
336 for (i = 0; i < nvqs; ++i) {
337 vqs[i] = virtio_ccw_setup_vq(vdev, i, callbacks[i], names[i],
338 ccw);
339 if (IS_ERR(vqs[i])) {
340 ret = PTR_ERR(vqs[i]);
341 vqs[i] = NULL;
342 goto out;
343 }
344 }
345 ret = -ENOMEM;
346 /* We need a data area under 2G to communicate. */
347 indicatorp = kmalloc(sizeof(&vcdev->indicators), GFP_DMA | GFP_KERNEL);
348 if (!indicatorp)
349 goto out;
350 *indicatorp = (unsigned long) &vcdev->indicators;
351 /* Register queue indicators with host. */
352 vcdev->indicators = 0;
353 ccw->cmd_code = CCW_CMD_SET_IND;
354 ccw->flags = 0;
355 ccw->count = sizeof(vcdev->indicators);
356 ccw->cda = (__u32)(unsigned long) indicatorp;
357 ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND);
358 if (ret)
359 goto out;
360 /* Register indicators2 with host for config changes */
361 *indicatorp = (unsigned long) &vcdev->indicators2;
362 vcdev->indicators2 = 0;
363 ccw->cmd_code = CCW_CMD_SET_CONF_IND;
364 ccw->flags = 0;
365 ccw->count = sizeof(vcdev->indicators2);
366 ccw->cda = (__u32)(unsigned long) indicatorp;
367 ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_CONF_IND);
368 if (ret)
369 goto out;
370
371 kfree(indicatorp);
372 kfree(ccw);
373 return 0;
374out:
375 kfree(indicatorp);
376 kfree(ccw);
377 virtio_ccw_del_vqs(vdev);
378 return ret;
379}
380
381static void virtio_ccw_reset(struct virtio_device *vdev)
382{
383 struct virtio_ccw_device *vcdev = to_vc_device(vdev);
384 struct ccw1 *ccw;
385
386 ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
387 if (!ccw)
388 return;
389
390 /* Zero status bits. */
391 *vcdev->status = 0;
392
393 /* Send a reset ccw on device. */
394 ccw->cmd_code = CCW_CMD_VDEV_RESET;
395 ccw->flags = 0;
396 ccw->count = 0;
397 ccw->cda = 0;
398 ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_RESET);
399 kfree(ccw);
400}
401
402static u32 virtio_ccw_get_features(struct virtio_device *vdev)
403{
404 struct virtio_ccw_device *vcdev = to_vc_device(vdev);
405 struct virtio_feature_desc *features;
406 int ret, rc;
407 struct ccw1 *ccw;
408
409 ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
410 if (!ccw)
411 return 0;
412
413 features = kzalloc(sizeof(*features), GFP_DMA | GFP_KERNEL);
414 if (!features) {
415 rc = 0;
416 goto out_free;
417 }
418 /* Read the feature bits from the host. */
419 /* TODO: Features > 32 bits */
420 features->index = 0;
421 ccw->cmd_code = CCW_CMD_READ_FEAT;
422 ccw->flags = 0;
423 ccw->count = sizeof(*features);
424 ccw->cda = (__u32)(unsigned long)features;
425 ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_READ_FEAT);
426 if (ret) {
427 rc = 0;
428 goto out_free;
429 }
430
431 rc = le32_to_cpu(features->features);
432
433out_free:
434 kfree(features);
435 kfree(ccw);
436 return rc;
437}
438
439static void virtio_ccw_finalize_features(struct virtio_device *vdev)
440{
441 struct virtio_ccw_device *vcdev = to_vc_device(vdev);
442 struct virtio_feature_desc *features;
443 int i;
444 struct ccw1 *ccw;
445
446 ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
447 if (!ccw)
448 return;
449
450 features = kzalloc(sizeof(*features), GFP_DMA | GFP_KERNEL);
451 if (!features)
452 goto out_free;
453
454 /* Give virtio_ring a chance to accept features. */
455 vring_transport_features(vdev);
456
457 for (i = 0; i < sizeof(*vdev->features) / sizeof(features->features);
458 i++) {
459 int highbits = i % 2 ? 32 : 0;
460 features->index = i;
461 features->features = cpu_to_le32(vdev->features[i / 2]
462 >> highbits);
463 /* Write the feature bits to the host. */
464 ccw->cmd_code = CCW_CMD_WRITE_FEAT;
465 ccw->flags = 0;
466 ccw->count = sizeof(*features);
467 ccw->cda = (__u32)(unsigned long)features;
468 ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_WRITE_FEAT);
469 }
470out_free:
471 kfree(features);
472 kfree(ccw);
473}
474
475static void virtio_ccw_get_config(struct virtio_device *vdev,
476 unsigned int offset, void *buf, unsigned len)
477{
478 struct virtio_ccw_device *vcdev = to_vc_device(vdev);
479 int ret;
480 struct ccw1 *ccw;
481 void *config_area;
482
483 ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
484 if (!ccw)
485 return;
486
487 config_area = kzalloc(VIRTIO_CCW_CONFIG_SIZE, GFP_DMA | GFP_KERNEL);
488 if (!config_area)
489 goto out_free;
490
491 /* Read the config area from the host. */
492 ccw->cmd_code = CCW_CMD_READ_CONF;
493 ccw->flags = 0;
494 ccw->count = offset + len;
495 ccw->cda = (__u32)(unsigned long)config_area;
496 ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_READ_CONFIG);
497 if (ret)
498 goto out_free;
499
500 memcpy(vcdev->config, config_area, sizeof(vcdev->config));
501 memcpy(buf, &vcdev->config[offset], len);
502
503out_free:
504 kfree(config_area);
505 kfree(ccw);
506}
507
508static void virtio_ccw_set_config(struct virtio_device *vdev,
509 unsigned int offset, const void *buf,
510 unsigned len)
511{
512 struct virtio_ccw_device *vcdev = to_vc_device(vdev);
513 struct ccw1 *ccw;
514 void *config_area;
515
516 ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
517 if (!ccw)
518 return;
519
520 config_area = kzalloc(VIRTIO_CCW_CONFIG_SIZE, GFP_DMA | GFP_KERNEL);
521 if (!config_area)
522 goto out_free;
523
524 memcpy(&vcdev->config[offset], buf, len);
525 /* Write the config area to the host. */
526 memcpy(config_area, vcdev->config, sizeof(vcdev->config));
527 ccw->cmd_code = CCW_CMD_WRITE_CONF;
528 ccw->flags = 0;
529 ccw->count = offset + len;
530 ccw->cda = (__u32)(unsigned long)config_area;
531 ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_WRITE_CONFIG);
532
533out_free:
534 kfree(config_area);
535 kfree(ccw);
536}
537
538static u8 virtio_ccw_get_status(struct virtio_device *vdev)
539{
540 struct virtio_ccw_device *vcdev = to_vc_device(vdev);
541
542 return *vcdev->status;
543}
544
545static void virtio_ccw_set_status(struct virtio_device *vdev, u8 status)
546{
547 struct virtio_ccw_device *vcdev = to_vc_device(vdev);
548 struct ccw1 *ccw;
549
550 ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
551 if (!ccw)
552 return;
553
554 /* Write the status to the host. */
555 *vcdev->status = status;
556 ccw->cmd_code = CCW_CMD_WRITE_STATUS;
557 ccw->flags = 0;
558 ccw->count = sizeof(status);
559 ccw->cda = (__u32)(unsigned long)vcdev->status;
560 ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_WRITE_STATUS);
561 kfree(ccw);
562}
563
564static struct virtio_config_ops virtio_ccw_config_ops = {
565 .get_features = virtio_ccw_get_features,
566 .finalize_features = virtio_ccw_finalize_features,
567 .get = virtio_ccw_get_config,
568 .set = virtio_ccw_set_config,
569 .get_status = virtio_ccw_get_status,
570 .set_status = virtio_ccw_set_status,
571 .reset = virtio_ccw_reset,
572 .find_vqs = virtio_ccw_find_vqs,
573 .del_vqs = virtio_ccw_del_vqs,
574};
575
576
577/*
578 * ccw bus driver related functions
579 */
580
581static void virtio_ccw_release_dev(struct device *_d)
582{
583 struct virtio_device *dev = container_of(_d, struct virtio_device,
584 dev);
585 struct virtio_ccw_device *vcdev = to_vc_device(dev);
586
587 kfree(vcdev->status);
588 kfree(vcdev->config_block);
589 kfree(vcdev);
590}
591
592static int irb_is_error(struct irb *irb)
593{
594 if (scsw_cstat(&irb->scsw) != 0)
595 return 1;
596 if (scsw_dstat(&irb->scsw) & ~(DEV_STAT_CHN_END | DEV_STAT_DEV_END))
597 return 1;
598 if (scsw_cc(&irb->scsw) != 0)
599 return 1;
600 return 0;
601}
602
603static struct virtqueue *virtio_ccw_vq_by_ind(struct virtio_ccw_device *vcdev,
604 int index)
605{
606 struct virtio_ccw_vq_info *info;
607 unsigned long flags;
608 struct virtqueue *vq;
609
610 vq = NULL;
611 spin_lock_irqsave(&vcdev->lock, flags);
612 list_for_each_entry(info, &vcdev->virtqueues, node) {
613 if (virtqueue_get_queue_index(info->vq) == index) {
614 vq = info->vq;
615 break;
616 }
617 }
618 spin_unlock_irqrestore(&vcdev->lock, flags);
619 return vq;
620}
621
622static void virtio_ccw_int_handler(struct ccw_device *cdev,
623 unsigned long intparm,
624 struct irb *irb)
625{
626 __u32 activity = intparm & VIRTIO_CCW_INTPARM_MASK;
627 struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
628 int i;
629 struct virtqueue *vq;
630 struct virtio_driver *drv;
631
632 /* Check if it's a notification from the host. */
633 if ((intparm == 0) &&
634 (scsw_stctl(&irb->scsw) ==
635 (SCSW_STCTL_ALERT_STATUS | SCSW_STCTL_STATUS_PEND))) {
636 /* OK */
637 }
638 if (irb_is_error(irb))
639 vcdev->err = -EIO; /* XXX - use real error */
640 if (vcdev->curr_io & activity) {
641 switch (activity) {
642 case VIRTIO_CCW_DOING_READ_FEAT:
643 case VIRTIO_CCW_DOING_WRITE_FEAT:
644 case VIRTIO_CCW_DOING_READ_CONFIG:
645 case VIRTIO_CCW_DOING_WRITE_CONFIG:
646 case VIRTIO_CCW_DOING_WRITE_STATUS:
647 case VIRTIO_CCW_DOING_SET_VQ:
648 case VIRTIO_CCW_DOING_SET_IND:
649 case VIRTIO_CCW_DOING_SET_CONF_IND:
650 case VIRTIO_CCW_DOING_RESET:
651 case VIRTIO_CCW_DOING_READ_VQ_CONF:
652 vcdev->curr_io &= ~activity;
653 wake_up(&vcdev->wait_q);
654 break;
655 default:
656 /* don't know what to do... */
657 dev_warn(&cdev->dev, "Suspicious activity '%08x'\n",
658 activity);
659 WARN_ON(1);
660 break;
661 }
662 }
663 for_each_set_bit(i, &vcdev->indicators,
664 sizeof(vcdev->indicators) * BITS_PER_BYTE) {
665 /* The bit clear must happen before the vring kick. */
666 clear_bit(i, &vcdev->indicators);
667 barrier();
668 vq = virtio_ccw_vq_by_ind(vcdev, i);
669 vring_interrupt(0, vq);
670 }
671 if (test_bit(0, &vcdev->indicators2)) {
672 drv = container_of(vcdev->vdev.dev.driver,
673 struct virtio_driver, driver);
674
675 if (drv && drv->config_changed)
676 drv->config_changed(&vcdev->vdev);
677 clear_bit(0, &vcdev->indicators2);
678 }
679}
680
681/*
682 * We usually want to autoonline all devices, but give the admin
683 * a way to exempt devices from this.
684 */
685#define __DEV_WORDS ((__MAX_SUBCHANNEL + (8*sizeof(long) - 1)) / \
686 (8*sizeof(long)))
687static unsigned long devs_no_auto[__MAX_SSID + 1][__DEV_WORDS];
688
689static char *no_auto = "";
690
691module_param(no_auto, charp, 0444);
692MODULE_PARM_DESC(no_auto, "list of ccw bus id ranges not to be auto-onlined");
693
694static int virtio_ccw_check_autoonline(struct ccw_device *cdev)
695{
696 struct ccw_dev_id id;
697
698 ccw_device_get_id(cdev, &id);
699 if (test_bit(id.devno, devs_no_auto[id.ssid]))
700 return 0;
701 return 1;
702}
703
704static void virtio_ccw_auto_online(void *data, async_cookie_t cookie)
705{
706 struct ccw_device *cdev = data;
707 int ret;
708
709 ret = ccw_device_set_online(cdev);
710 if (ret)
711 dev_warn(&cdev->dev, "Failed to set online: %d\n", ret);
712}
713
714static int virtio_ccw_probe(struct ccw_device *cdev)
715{
716 cdev->handler = virtio_ccw_int_handler;
717
718 if (virtio_ccw_check_autoonline(cdev))
719 async_schedule(virtio_ccw_auto_online, cdev);
720 return 0;
721}
722
723static void virtio_ccw_remove(struct ccw_device *cdev)
724{
725 struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
726
727 if (cdev->online) {
728 unregister_virtio_device(&vcdev->vdev);
729 dev_set_drvdata(&cdev->dev, NULL);
730 }
731 cdev->handler = NULL;
732}
733
734static int virtio_ccw_offline(struct ccw_device *cdev)
735{
736 struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
737
738 unregister_virtio_device(&vcdev->vdev);
739 dev_set_drvdata(&cdev->dev, NULL);
740 return 0;
741}
742
743
744static int virtio_ccw_online(struct ccw_device *cdev)
745{
746 int ret;
747 struct virtio_ccw_device *vcdev;
748
749 vcdev = kzalloc(sizeof(*vcdev), GFP_KERNEL);
750 if (!vcdev) {
751 dev_warn(&cdev->dev, "Could not get memory for virtio\n");
752 ret = -ENOMEM;
753 goto out_free;
754 }
755 vcdev->config_block = kzalloc(sizeof(*vcdev->config_block),
756 GFP_DMA | GFP_KERNEL);
757 if (!vcdev->config_block) {
758 ret = -ENOMEM;
759 goto out_free;
760 }
761 vcdev->status = kzalloc(sizeof(*vcdev->status), GFP_DMA | GFP_KERNEL);
762 if (!vcdev->status) {
763 ret = -ENOMEM;
764 goto out_free;
765 }
766
767 vcdev->vdev.dev.parent = &cdev->dev;
768 vcdev->vdev.dev.release = virtio_ccw_release_dev;
769 vcdev->vdev.config = &virtio_ccw_config_ops;
770 vcdev->cdev = cdev;
771 init_waitqueue_head(&vcdev->wait_q);
772 INIT_LIST_HEAD(&vcdev->virtqueues);
773 spin_lock_init(&vcdev->lock);
774
775 dev_set_drvdata(&cdev->dev, vcdev);
776 vcdev->vdev.id.vendor = cdev->id.cu_type;
777 vcdev->vdev.id.device = cdev->id.cu_model;
778 ret = register_virtio_device(&vcdev->vdev);
779 if (ret) {
780 dev_warn(&cdev->dev, "Failed to register virtio device: %d\n",
781 ret);
782 goto out_put;
783 }
784 return 0;
785out_put:
786 dev_set_drvdata(&cdev->dev, NULL);
787 put_device(&vcdev->vdev.dev);
788 return ret;
789out_free:
790 if (vcdev) {
791 kfree(vcdev->status);
792 kfree(vcdev->config_block);
793 }
794 kfree(vcdev);
795 return ret;
796}
797
798static int virtio_ccw_cio_notify(struct ccw_device *cdev, int event)
799{
800 /* TODO: Check whether we need special handling here. */
801 return 0;
802}
803
804static struct ccw_device_id virtio_ids[] = {
805 { CCW_DEVICE(0x3832, 0) },
806 {},
807};
808MODULE_DEVICE_TABLE(ccw, virtio_ids);
809
810static struct ccw_driver virtio_ccw_driver = {
811 .driver = {
812 .owner = THIS_MODULE,
813 .name = "virtio_ccw",
814 },
815 .ids = virtio_ids,
816 .probe = virtio_ccw_probe,
817 .remove = virtio_ccw_remove,
818 .set_offline = virtio_ccw_offline,
819 .set_online = virtio_ccw_online,
820 .notify = virtio_ccw_cio_notify,
821 .int_class = IRQIO_VIR,
822};
823
824static int __init pure_hex(char **cp, unsigned int *val, int min_digit,
825 int max_digit, int max_val)
826{
827 int diff;
828
829 diff = 0;
830 *val = 0;
831
832 while (diff <= max_digit) {
833 int value = hex_to_bin(**cp);
834
835 if (value < 0)
836 break;
837 *val = *val * 16 + value;
838 (*cp)++;
839 diff++;
840 }
841
842 if ((diff < min_digit) || (diff > max_digit) || (*val > max_val))
843 return 1;
844
845 return 0;
846}
847
848static int __init parse_busid(char *str, unsigned int *cssid,
849 unsigned int *ssid, unsigned int *devno)
850{
851 char *str_work;
852 int rc, ret;
853
854 rc = 1;
855
856 if (*str == '\0')
857 goto out;
858
859 str_work = str;
860 ret = pure_hex(&str_work, cssid, 1, 2, __MAX_CSSID);
861 if (ret || (str_work[0] != '.'))
862 goto out;
863 str_work++;
864 ret = pure_hex(&str_work, ssid, 1, 1, __MAX_SSID);
865 if (ret || (str_work[0] != '.'))
866 goto out;
867 str_work++;
868 ret = pure_hex(&str_work, devno, 4, 4, __MAX_SUBCHANNEL);
869 if (ret || (str_work[0] != '\0'))
870 goto out;
871
872 rc = 0;
873out:
874 return rc;
875}
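
parse_busid() accepts the usual s390 bus id form cssid.ssid.devno, all hexadecimal (for example 0.0.1234), with pure_hex() consuming a bounded number of hex digits per component. A standalone sketch of the same parsing that uses sscanf for brevity, accepts one to four devno digits rather than the kernel's exactly four, and assumes the limits shown below for __MAX_CSSID/__MAX_SSID/__MAX_SUBCHANNEL:

#include <stdio.h>

/* Upper bounds as assumed here, mirroring __MAX_CSSID/__MAX_SSID/__MAX_SUBCHANNEL. */
#define MAX_CSSID      0xff
#define MAX_SSID       0x3
#define MAX_SUBCHANNEL 0xffff

/* Parse "c.s.dddd" (all hex); returns 0 on success, 1 on malformed input. */
static int parse_busid(const char *str, unsigned int *cssid,
                       unsigned int *ssid, unsigned int *devno)
{
    char extra;

    if (sscanf(str, "%2x.%1x.%4x%c", cssid, ssid, devno, &extra) != 3)
        return 1;                       /* wrong shape or trailing junk */
    if (*cssid > MAX_CSSID || *ssid > MAX_SSID || *devno > MAX_SUBCHANNEL)
        return 1;
    return 0;
}

int main(void)
{
    unsigned int cssid, ssid, devno;

    if (!parse_busid("0.0.1234", &cssid, &ssid, &devno))
        printf("cssid=%x ssid=%x devno=%04x\n", cssid, ssid, devno);
    return 0;
}
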
876
877static void __init no_auto_parse(void)
878{
879 unsigned int from_cssid, to_cssid, from_ssid, to_ssid, from, to;
880 char *parm, *str;
881 int rc;
882
883 str = no_auto;
884 while ((parm = strsep(&str, ","))) {
885 rc = parse_busid(strsep(&parm, "-"), &from_cssid,
886 &from_ssid, &from);
887 if (rc)
888 continue;
889 if (parm != NULL) {
890 rc = parse_busid(parm, &to_cssid,
891 &to_ssid, &to);
892 if ((from_ssid > to_ssid) ||
893 ((from_ssid == to_ssid) && (from > to)))
894 rc = -EINVAL;
895 } else {
896 to_cssid = from_cssid;
897 to_ssid = from_ssid;
898 to = from;
899 }
900 if (rc)
901 continue;
902 while ((from_ssid < to_ssid) ||
903 ((from_ssid == to_ssid) && (from <= to))) {
904 set_bit(from, devs_no_auto[from_ssid]);
905 from++;
906 if (from > __MAX_SUBCHANNEL) {
907 from_ssid++;
908 from = 0;
909 }
910 }
911 }
912}
913
914static int __init virtio_ccw_init(void)
915{
916 /* parse no_auto string before we do anything further */
917 no_auto_parse();
918 return ccw_driver_register(&virtio_ccw_driver);
919}
920module_init(virtio_ccw_init);
921
922static void __exit virtio_ccw_exit(void)
923{
924 ccw_driver_unregister(&virtio_ccw_driver);
925}
926module_exit(virtio_ccw_exit);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b7996a768eb2..cad77fe09d77 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -123,6 +123,8 @@ static inline bool is_error_page(struct page *page)
123#define KVM_REQ_WATCHDOG 18 123#define KVM_REQ_WATCHDOG 18
124#define KVM_REQ_MASTERCLOCK_UPDATE 19 124#define KVM_REQ_MASTERCLOCK_UPDATE 19
125#define KVM_REQ_MCLOCK_INPROGRESS 20 125#define KVM_REQ_MCLOCK_INPROGRESS 20
126#define KVM_REQ_EPR_EXIT 21
127#define KVM_REQ_EOIBITMAP 22
126 128
127#define KVM_USERSPACE_IRQ_SOURCE_ID 0 129#define KVM_USERSPACE_IRQ_SOURCE_ID 0
128#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 130#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
@@ -267,12 +269,11 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
267struct kvm_memory_slot { 269struct kvm_memory_slot {
268 gfn_t base_gfn; 270 gfn_t base_gfn;
269 unsigned long npages; 271 unsigned long npages;
270 unsigned long flags;
271 unsigned long *dirty_bitmap; 272 unsigned long *dirty_bitmap;
272 struct kvm_arch_memory_slot arch; 273 struct kvm_arch_memory_slot arch;
273 unsigned long userspace_addr; 274 unsigned long userspace_addr;
274 int user_alloc; 275 u32 flags;
275 int id; 276 short id;
276}; 277};
277 278
278static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) 279static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
@@ -314,8 +315,12 @@ struct kvm_irq_routing_table {};
314 315
315#endif 316#endif
316 317
318#ifndef KVM_PRIVATE_MEM_SLOTS
319#define KVM_PRIVATE_MEM_SLOTS 0
320#endif
321
317#ifndef KVM_MEM_SLOTS_NUM 322#ifndef KVM_MEM_SLOTS_NUM
318#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 323#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
319#endif 324#endif
320 325
321/* 326/*
@@ -327,7 +332,7 @@ struct kvm_memslots {
327 u64 generation; 332 u64 generation;
328 struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM]; 333 struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM];
329 /* The mapping table from slot id to the index in memslots[]. */ 334 /* The mapping table from slot id to the index in memslots[]. */
330 int id_to_index[KVM_MEM_SLOTS_NUM]; 335 short id_to_index[KVM_MEM_SLOTS_NUM];
331}; 336};
332 337
333struct kvm { 338struct kvm {
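
Shrinking id and id_to_index[] to short works because KVM_MEM_SLOTS_NUM stays far below SHRT_MAX; id_to_index[] simply maps a slot id to the slot's current position in the sorted memslots[] array. A reduced sketch of that indirection (the slot counts are illustrative, not the real configuration):

#include <stdio.h>

#define USER_MEM_SLOTS    125           /* illustrative sizes only */
#define PRIVATE_MEM_SLOTS 3
#define MEM_SLOTS_NUM     (USER_MEM_SLOTS + PRIVATE_MEM_SLOTS)

struct memory_slot {
    unsigned long base_gfn;
    unsigned long npages;
    short id;
};

struct memslots {
    struct memory_slot memslots[MEM_SLOTS_NUM]; /* kept sorted by KVM */
    short id_to_index[MEM_SLOTS_NUM];           /* slot id -> array position */
};

static struct memory_slot *id_to_memslot(struct memslots *slots, int id)
{
    return &slots->memslots[slots->id_to_index[id]];
}

int main(void)
{
    static struct memslots slots;

    slots.memslots[0].id = 7;           /* slot 7 happens to sort first */
    slots.id_to_index[7] = 0;
    printf("slot 7 -> index %d\n",
           (int)(id_to_memslot(&slots, 7) - slots.memslots));
    return 0;
}
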
@@ -425,7 +430,8 @@ void kvm_exit(void);
425 430
426void kvm_get_kvm(struct kvm *kvm); 431void kvm_get_kvm(struct kvm *kvm);
427void kvm_put_kvm(struct kvm *kvm); 432void kvm_put_kvm(struct kvm *kvm);
428void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new); 433void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new,
434 u64 last_generation);
429 435
430static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) 436static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
431{ 437{
@@ -448,10 +454,10 @@ id_to_memslot(struct kvm_memslots *slots, int id)
448 454
449int kvm_set_memory_region(struct kvm *kvm, 455int kvm_set_memory_region(struct kvm *kvm,
450 struct kvm_userspace_memory_region *mem, 456 struct kvm_userspace_memory_region *mem,
451 int user_alloc); 457 bool user_alloc);
452int __kvm_set_memory_region(struct kvm *kvm, 458int __kvm_set_memory_region(struct kvm *kvm,
453 struct kvm_userspace_memory_region *mem, 459 struct kvm_userspace_memory_region *mem,
454 int user_alloc); 460 bool user_alloc);
455void kvm_arch_free_memslot(struct kvm_memory_slot *free, 461void kvm_arch_free_memslot(struct kvm_memory_slot *free,
456 struct kvm_memory_slot *dont); 462 struct kvm_memory_slot *dont);
457int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); 463int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
@@ -459,11 +465,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
459 struct kvm_memory_slot *memslot, 465 struct kvm_memory_slot *memslot,
460 struct kvm_memory_slot old, 466 struct kvm_memory_slot old,
461 struct kvm_userspace_memory_region *mem, 467 struct kvm_userspace_memory_region *mem,
462 int user_alloc); 468 bool user_alloc);
463void kvm_arch_commit_memory_region(struct kvm *kvm, 469void kvm_arch_commit_memory_region(struct kvm *kvm,
464 struct kvm_userspace_memory_region *mem, 470 struct kvm_userspace_memory_region *mem,
465 struct kvm_memory_slot old, 471 struct kvm_memory_slot old,
466 int user_alloc); 472 bool user_alloc);
467bool kvm_largepages_enabled(void); 473bool kvm_largepages_enabled(void);
468void kvm_disable_largepages(void); 474void kvm_disable_largepages(void);
469/* flush all memory translations */ 475/* flush all memory translations */
@@ -533,6 +539,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
533void kvm_flush_remote_tlbs(struct kvm *kvm); 539void kvm_flush_remote_tlbs(struct kvm *kvm);
534void kvm_reload_remote_mmus(struct kvm *kvm); 540void kvm_reload_remote_mmus(struct kvm *kvm);
535void kvm_make_mclock_inprogress_request(struct kvm *kvm); 541void kvm_make_mclock_inprogress_request(struct kvm *kvm);
542void kvm_make_update_eoibitmap_request(struct kvm *kvm);
536 543
537long kvm_arch_dev_ioctl(struct file *filp, 544long kvm_arch_dev_ioctl(struct file *filp,
538 unsigned int ioctl, unsigned long arg); 545 unsigned int ioctl, unsigned long arg);
@@ -550,7 +557,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
550int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 557int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
551 struct 558 struct
552 kvm_userspace_memory_region *mem, 559 kvm_userspace_memory_region *mem,
553 int user_alloc); 560 bool user_alloc);
554int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level); 561int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level);
555long kvm_arch_vm_ioctl(struct file *filp, 562long kvm_arch_vm_ioctl(struct file *filp,
556 unsigned int ioctl, unsigned long arg); 563 unsigned int ioctl, unsigned long arg);
@@ -686,6 +693,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
686int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); 693int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
687int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, 694int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
688 int irq_source_id, int level); 695 int irq_source_id, int level);
696bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
689void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); 697void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
690void kvm_register_irq_ack_notifier(struct kvm *kvm, 698void kvm_register_irq_ack_notifier(struct kvm *kvm,
691 struct kvm_irq_ack_notifier *kian); 699 struct kvm_irq_ack_notifier *kian);
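The kvm_host.h changes above narrow struct kvm_memslots::id_to_index from int to short, thread a last_generation argument through update_memslots(), and convert user_alloc from int to bool across the memory-slot API. A minimal standalone sketch of the narrowing, assuming a made-up KVM_MEM_SLOTS_NUM of 512 (the real value is KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS from the arch headers), not the kernel structure itself:

#include <assert.h>
#include <limits.h>
#include <stdio.h>

#define KVM_MEM_SLOTS_NUM 512   /* made-up stand-in for the arch-defined total */

struct memslots_model {
    unsigned long long generation;
    short id_to_index[KVM_MEM_SLOTS_NUM];   /* was int; short suffices once ids are bounded */
};

int main(void)
{
    /* mirrors the BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX) added later in kvm_create_vm() */
    assert(KVM_MEM_SLOTS_NUM <= SHRT_MAX);

    struct memslots_model m = { .generation = 0 };
    for (int i = 0; i < KVM_MEM_SLOTS_NUM; i++)
        m.id_to_index[i] = (short)i;        /* identity mapping before any sort by size */

    printf("last id maps to index %d; table shrinks by %zu bytes\n",
           m.id_to_index[KVM_MEM_SLOTS_NUM - 1],
           (sizeof(int) - sizeof(short)) * KVM_MEM_SLOTS_NUM);
    return 0;
}
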
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 7ef9e759f499..19911dddaeb7 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -14,7 +14,7 @@
14 ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ 14 ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \
15 ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ 15 ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\
16 ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ 16 ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \
17 ERSN(S390_UCONTROL) 17 ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH)
18 18
19TRACE_EVENT(kvm_userspace_exit, 19TRACE_EVENT(kvm_userspace_exit,
20 TP_PROTO(__u32 reason, int errno), 20 TP_PROTO(__u32 reason, int errno),
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index c70577cf67bc..3c56ba3d80c1 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -169,6 +169,8 @@ struct kvm_pit_config {
169#define KVM_EXIT_PAPR_HCALL 19 169#define KVM_EXIT_PAPR_HCALL 19
170#define KVM_EXIT_S390_UCONTROL 20 170#define KVM_EXIT_S390_UCONTROL 20
171#define KVM_EXIT_WATCHDOG 21 171#define KVM_EXIT_WATCHDOG 21
172#define KVM_EXIT_S390_TSCH 22
173#define KVM_EXIT_EPR 23
172 174
173/* For KVM_EXIT_INTERNAL_ERROR */ 175/* For KVM_EXIT_INTERNAL_ERROR */
174/* Emulate instruction failed. */ 176/* Emulate instruction failed. */
@@ -286,6 +288,19 @@ struct kvm_run {
286 __u64 ret; 288 __u64 ret;
287 __u64 args[9]; 289 __u64 args[9];
288 } papr_hcall; 290 } papr_hcall;
291 /* KVM_EXIT_S390_TSCH */
292 struct {
293 __u16 subchannel_id;
294 __u16 subchannel_nr;
295 __u32 io_int_parm;
296 __u32 io_int_word;
297 __u32 ipb;
298 __u8 dequeued;
299 } s390_tsch;
300 /* KVM_EXIT_EPR */
301 struct {
302 __u32 epr;
303 } epr;
289 /* Fix the size of the union. */ 304 /* Fix the size of the union. */
290 char padding[256]; 305 char padding[256];
291 }; 306 };
@@ -398,10 +413,20 @@ struct kvm_s390_psw {
398#define KVM_S390_PROGRAM_INT 0xfffe0001u 413#define KVM_S390_PROGRAM_INT 0xfffe0001u
399#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u 414#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u
400#define KVM_S390_RESTART 0xfffe0003u 415#define KVM_S390_RESTART 0xfffe0003u
416#define KVM_S390_MCHK 0xfffe1000u
401#define KVM_S390_INT_VIRTIO 0xffff2603u 417#define KVM_S390_INT_VIRTIO 0xffff2603u
402#define KVM_S390_INT_SERVICE 0xffff2401u 418#define KVM_S390_INT_SERVICE 0xffff2401u
403#define KVM_S390_INT_EMERGENCY 0xffff1201u 419#define KVM_S390_INT_EMERGENCY 0xffff1201u
404#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u 420#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u
421/* Anything below 0xfffe0000u is taken by INT_IO */
422#define KVM_S390_INT_IO(ai,cssid,ssid,schid) \
423 (((schid)) | \
424 ((ssid) << 16) | \
425 ((cssid) << 18) | \
426 ((ai) << 26))
427#define KVM_S390_INT_IO_MIN 0x00000000u
428#define KVM_S390_INT_IO_MAX 0xfffdffffu
429
405 430
406struct kvm_s390_interrupt { 431struct kvm_s390_interrupt {
407 __u32 type; 432 __u32 type;
@@ -636,6 +661,8 @@ struct kvm_ppc_smmu_info {
636#define KVM_CAP_IRQFD_RESAMPLE 82 661#define KVM_CAP_IRQFD_RESAMPLE 82
637#define KVM_CAP_PPC_BOOKE_WATCHDOG 83 662#define KVM_CAP_PPC_BOOKE_WATCHDOG 83
638#define KVM_CAP_PPC_HTAB_FD 84 663#define KVM_CAP_PPC_HTAB_FD 84
664#define KVM_CAP_S390_CSS_SUPPORT 85
665#define KVM_CAP_PPC_EPR 86
639#define KVM_CAP_ARM_PSCI 87 666#define KVM_CAP_ARM_PSCI 87
640#define KVM_CAP_ARM_SET_DEVICE_ADDR 88 667#define KVM_CAP_ARM_SET_DEVICE_ADDR 88
641 668
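The new KVM_S390_INT_IO() macro packs the adapter-interrupt indicator, css id, subchannel set id and subchannel number into one interrupt type word that stays below 0xfffe0000u, where the fixed interrupt types begin. A standalone sketch of the packing, with made-up subchannel values:

#include <stdint.h>
#include <stdio.h>

#define KVM_S390_INT_IO(ai, cssid, ssid, schid) \
        (((schid)) | ((ssid) << 16) | ((cssid) << 18) | ((ai) << 26))

int main(void)
{
    uint32_t type = KVM_S390_INT_IO(1u, 0u, 0u, 0x0001u);   /* sample values only */

    /* must land in [KVM_S390_INT_IO_MIN, KVM_S390_INT_IO_MAX] */
    printf("type=0x%08x, valid range [0x%08x, 0x%08x]\n",
           (unsigned)type, 0x00000000u, 0xfffdffffu);
    return 0;
}
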
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 053dfd7692d1..f1bdecf09afb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4374,7 +4374,10 @@ EXPORT_SYMBOL(yield);
4374 * It's the caller's job to ensure that the target task struct 4374 * It's the caller's job to ensure that the target task struct
4375 * can't go away on us before we can do any checks. 4375 * can't go away on us before we can do any checks.
4376 * 4376 *
4377 * Returns true if we indeed boosted the target task. 4377 * Returns:
4378 * true (>0) if we indeed boosted the target task.
4379 * false (0) if we failed to boost the target.
4380 * -ESRCH if there's no task to yield to.
4378 */ 4381 */
4379bool __sched yield_to(struct task_struct *p, bool preempt) 4382bool __sched yield_to(struct task_struct *p, bool preempt)
4380{ 4383{
@@ -4388,6 +4391,15 @@ bool __sched yield_to(struct task_struct *p, bool preempt)
4388 4391
4389again: 4392again:
4390 p_rq = task_rq(p); 4393 p_rq = task_rq(p);
4394 /*
4395 * If we're the only runnable task on the rq and target rq also
4396 * has only one task, there's absolutely no point in yielding.
4397 */
4398 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4399 yielded = -ESRCH;
4400 goto out_irq;
4401 }
4402
4391 double_rq_lock(rq, p_rq); 4403 double_rq_lock(rq, p_rq);
4392 while (task_rq(p) != p_rq) { 4404 while (task_rq(p) != p_rq) {
4393 double_rq_unlock(rq, p_rq); 4405 double_rq_unlock(rq, p_rq);
@@ -4395,13 +4407,13 @@ again:
4395 } 4407 }
4396 4408
4397 if (!curr->sched_class->yield_to_task) 4409 if (!curr->sched_class->yield_to_task)
4398 goto out; 4410 goto out_unlock;
4399 4411
4400 if (curr->sched_class != p->sched_class) 4412 if (curr->sched_class != p->sched_class)
4401 goto out; 4413 goto out_unlock;
4402 4414
4403 if (task_running(p_rq, p) || p->state) 4415 if (task_running(p_rq, p) || p->state)
4404 goto out; 4416 goto out_unlock;
4405 4417
4406 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4418 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4407 if (yielded) { 4419 if (yielded) {
@@ -4414,11 +4426,12 @@ again:
4414 resched_task(p_rq->curr); 4426 resched_task(p_rq->curr);
4415 } 4427 }
4416 4428
4417out: 4429out_unlock:
4418 double_rq_unlock(rq, p_rq); 4430 double_rq_unlock(rq, p_rq);
4431out_irq:
4419 local_irq_restore(flags); 4432 local_irq_restore(flags);
4420 4433
4421 if (yielded) 4434 if (yielded > 0)
4422 schedule(); 4435 schedule();
4423 4436
4424 return yielded; 4437 return yielded;
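yield_to() now reports three outcomes instead of a boolean: a positive value when the target was boosted, 0 when the boost failed, and -ESRCH when there is nothing to yield to because both runqueues hold a single task. A rough standalone model of the new caller contract, where yield_to_model() is a stand-in and not the scheduler function:

#include <errno.h>
#include <stdio.h>

static int yield_to_model(int my_nr_running, int target_nr_running, int boost_ok)
{
    /* new early exit: no point yielding when both rqs have one runnable task */
    if (my_nr_running == 1 && target_nr_running == 1)
        return -ESRCH;
    return boost_ok ? 1 : 0;
}

static void report(int r)
{
    if (r > 0)
        printf("boosted target, caller should schedule()\n");
    else if (r == 0)
        printf("yield attempt failed\n");
    else
        printf("nothing to yield to (%d)\n", r);
}

int main(void)
{
    report(yield_to_model(1, 1, 1));    /* -ESRCH: lone task on each rq */
    report(yield_to_model(2, 3, 1));    /* 1: boost succeeded */
    return 0;
}
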
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index cfb7e4d52dc2..ce82b9401958 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -35,6 +35,7 @@
35#include <linux/hrtimer.h> 35#include <linux/hrtimer.h>
36#include <linux/io.h> 36#include <linux/io.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/export.h>
38#include <asm/processor.h> 39#include <asm/processor.h>
39#include <asm/page.h> 40#include <asm/page.h>
40#include <asm/current.h> 41#include <asm/current.h>
@@ -115,6 +116,42 @@ static void update_handled_vectors(struct kvm_ioapic *ioapic)
115 smp_wmb(); 116 smp_wmb();
116} 117}
117 118
119void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
120 u64 *eoi_exit_bitmap)
121{
122 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
123 union kvm_ioapic_redirect_entry *e;
124 struct kvm_lapic_irq irqe;
125 int index;
126
127 spin_lock(&ioapic->lock);
128 /* traverse ioapic entry to set eoi exit bitmap*/
129 for (index = 0; index < IOAPIC_NUM_PINS; index++) {
130 e = &ioapic->redirtbl[index];
131 if (!e->fields.mask &&
132 (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
133 kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC,
134 index))) {
135 irqe.dest_id = e->fields.dest_id;
136 irqe.vector = e->fields.vector;
137 irqe.dest_mode = e->fields.dest_mode;
138 irqe.delivery_mode = e->fields.delivery_mode << 8;
139 kvm_calculate_eoi_exitmap(vcpu, &irqe, eoi_exit_bitmap);
140 }
141 }
142 spin_unlock(&ioapic->lock);
143}
144EXPORT_SYMBOL_GPL(kvm_ioapic_calculate_eoi_exitmap);
145
146void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm)
147{
148 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
149
150 if (!kvm_apic_vid_enabled(kvm) || !ioapic)
151 return;
152 kvm_make_update_eoibitmap_request(kvm);
153}
154
118static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) 155static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
119{ 156{
120 unsigned index; 157 unsigned index;
@@ -156,6 +193,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
156 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG 193 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
157 && ioapic->irr & (1 << index)) 194 && ioapic->irr & (1 << index))
158 ioapic_service(ioapic, index); 195 ioapic_service(ioapic, index);
196 kvm_ioapic_make_eoibitmap_request(ioapic->kvm);
159 break; 197 break;
160 } 198 }
161} 199}
@@ -179,15 +217,6 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
179 irqe.level = 1; 217 irqe.level = 1;
180 irqe.shorthand = 0; 218 irqe.shorthand = 0;
181 219
182#ifdef CONFIG_X86
183 /* Always delivery PIT interrupt to vcpu 0 */
184 if (irq == 0) {
185 irqe.dest_mode = 0; /* Physical mode. */
186 /* need to read apic_id from apic regiest since
187 * it can be rewritten */
188 irqe.dest_id = ioapic->kvm->bsp_vcpu_id;
189 }
190#endif
191 return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); 220 return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
192} 221}
193 222
@@ -464,6 +493,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
464 spin_lock(&ioapic->lock); 493 spin_lock(&ioapic->lock);
465 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); 494 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
466 update_handled_vectors(ioapic); 495 update_handled_vectors(ioapic);
496 kvm_ioapic_make_eoibitmap_request(kvm);
467 spin_unlock(&ioapic->lock); 497 spin_unlock(&ioapic->lock);
468 return 0; 498 return 0;
469} 499}
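kvm_ioapic_calculate_eoi_exitmap() walks the redirection table and records the vector of every unmasked entry that is level-triggered or has an ack notifier, so APICv knows which EOIs must still cause an exit. A simplified standalone sketch of that walk, with trimmed stand-in types, no locking, and the ack-notifier and per-destination matching done by kvm_calculate_eoi_exitmap() omitted:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define IOAPIC_NUM_PINS   24
#define IOAPIC_LEVEL_TRIG 1

struct redir_entry { bool mask; int trig_mode; uint8_t vector; };

static void set_eoi_exit_bit(uint64_t bitmap[4], uint8_t vector)
{
    bitmap[vector / 64] |= 1ULL << (vector % 64);
}

int main(void)
{
    struct redir_entry redirtbl[IOAPIC_NUM_PINS] = {
        [2]  = { .mask = false, .trig_mode = IOAPIC_LEVEL_TRIG, .vector = 0x31 },
        [10] = { .mask = true,  .trig_mode = IOAPIC_LEVEL_TRIG, .vector = 0x40 },
    };
    uint64_t eoi_exit_bitmap[4] = { 0 };

    for (int pin = 0; pin < IOAPIC_NUM_PINS; pin++) {
        struct redir_entry *e = &redirtbl[pin];
        if (!e->mask && e->trig_mode == IOAPIC_LEVEL_TRIG)
            set_eoi_exit_bit(eoi_exit_bitmap, e->vector);   /* masked pin 10 is skipped */
    }
    printf("bitmap[0]=%#llx\n", (unsigned long long)eoi_exit_bitmap[0]);
    return 0;
}
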
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index a30abfe6ed16..0400a466c50c 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -82,5 +82,9 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
82 struct kvm_lapic_irq *irq); 82 struct kvm_lapic_irq *irq);
83int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 83int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
84int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 84int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
85void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm);
86void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
87 u64 *eoi_exit_bitmap);
88
85 89
86#endif 90#endif
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 4a340cb23013..72a130bc448a 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -76,7 +76,9 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
76 gfn = slot->base_gfn; 76 gfn = slot->base_gfn;
77 end_gfn = gfn + slot->npages; 77 end_gfn = gfn + slot->npages;
78 78
79 flags = IOMMU_READ | IOMMU_WRITE; 79 flags = IOMMU_READ;
80 if (!(slot->flags & KVM_MEM_READONLY))
81 flags |= IOMMU_WRITE;
80 if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY) 82 if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
81 flags |= IOMMU_CACHE; 83 flags |= IOMMU_CACHE;
82 84
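The iommu.c hunk stops granting IOMMU_WRITE for read-only memslots. A standalone sketch of the flag selection, using illustrative stand-in flag values rather than the kernel's definitions:

#include <stdio.h>

#define IOMMU_READ                (1u << 0)   /* stand-in values for illustration */
#define IOMMU_WRITE               (1u << 1)
#define IOMMU_CACHE               (1u << 2)
#define KVM_MEM_READONLY          (1u << 1)
#define KVM_IOMMU_CACHE_COHERENCY (1u << 0)

static unsigned iommu_flags_for_slot(unsigned slot_flags, unsigned iommu_arch_flags)
{
    unsigned flags = IOMMU_READ;

    if (!(slot_flags & KVM_MEM_READONLY))       /* new: writable slots only */
        flags |= IOMMU_WRITE;
    if (iommu_arch_flags & KVM_IOMMU_CACHE_COHERENCY)
        flags |= IOMMU_CACHE;
    return flags;
}

int main(void)
{
    printf("rw slot: %#x\n", iommu_flags_for_slot(0, 0));
    printf("ro slot: %#x\n", iommu_flags_for_slot(KVM_MEM_READONLY, 0));
    return 0;
}
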
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 656fa455e154..ff6d40e2c06d 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/export.h>
25#include <trace/events/kvm.h> 26#include <trace/events/kvm.h>
26 27
27#include <asm/msidef.h> 28#include <asm/msidef.h>
@@ -237,6 +238,28 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
237 return ret; 238 return ret;
238} 239}
239 240
241bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
242{
243 struct kvm_irq_ack_notifier *kian;
244 struct hlist_node *n;
245 int gsi;
246
247 rcu_read_lock();
248 gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
249 if (gsi != -1)
250 hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list,
251 link)
252 if (kian->gsi == gsi) {
253 rcu_read_unlock();
254 return true;
255 }
256
257 rcu_read_unlock();
258
259 return false;
260}
261EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
262
240void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) 263void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
241{ 264{
242 struct kvm_irq_ack_notifier *kian; 265 struct kvm_irq_ack_notifier *kian;
@@ -261,6 +284,7 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
261 mutex_lock(&kvm->irq_lock); 284 mutex_lock(&kvm->irq_lock);
262 hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); 285 hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
263 mutex_unlock(&kvm->irq_lock); 286 mutex_unlock(&kvm->irq_lock);
287 kvm_ioapic_make_eoibitmap_request(kvm);
264} 288}
265 289
266void kvm_unregister_irq_ack_notifier(struct kvm *kvm, 290void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
@@ -270,6 +294,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
270 hlist_del_init_rcu(&kian->link); 294 hlist_del_init_rcu(&kian->link);
271 mutex_unlock(&kvm->irq_lock); 295 mutex_unlock(&kvm->irq_lock);
272 synchronize_rcu(); 296 synchronize_rcu();
297 kvm_ioapic_make_eoibitmap_request(kvm);
273} 298}
274 299
275int kvm_request_irq_source_id(struct kvm *kvm) 300int kvm_request_irq_source_id(struct kvm *kvm)
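kvm_irq_has_notifier() resolves an (irqchip, pin) pair to a GSI through the irq routing table and then scans the ack-notifier list for a match, all under RCU. A plain-C standalone sketch of the same lookup, without the RCU hlist machinery:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct ack_notifier { int gsi; struct ack_notifier *next; };

static bool irq_has_notifier(int gsi, const struct ack_notifier *list)
{
    if (gsi == -1)                      /* pin not routed to any GSI */
        return false;
    for (const struct ack_notifier *n = list; n; n = n->next)
        if (n->gsi == gsi)
            return true;
    return false;
}

int main(void)
{
    struct ack_notifier a = { .gsi = 10, .next = NULL };
    struct ack_notifier b = { .gsi = 3,  .next = &a };

    printf("gsi 10 has notifier: %d\n", irq_has_notifier(10, &b));  /* 1 */
    printf("gsi 11 has notifier: %d\n", irq_has_notifier(11, &b));  /* 0 */
    return 0;
}
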
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1cd693a76a51..adc68feb5c5a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -217,6 +217,11 @@ void kvm_make_mclock_inprogress_request(struct kvm *kvm)
217 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); 217 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
218} 218}
219 219
220void kvm_make_update_eoibitmap_request(struct kvm *kvm)
221{
222 make_all_cpus_request(kvm, KVM_REQ_EOIBITMAP);
223}
224
220int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 225int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
221{ 226{
222 struct page *page; 227 struct page *page;
@@ -474,6 +479,8 @@ static struct kvm *kvm_create_vm(unsigned long type)
474 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 479 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
475#endif 480#endif
476 481
482 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
483
477 r = -ENOMEM; 484 r = -ENOMEM;
478 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 485 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
479 if (!kvm->memslots) 486 if (!kvm->memslots)
@@ -670,7 +677,8 @@ static void sort_memslots(struct kvm_memslots *slots)
670 slots->id_to_index[slots->memslots[i].id] = i; 677 slots->id_to_index[slots->memslots[i].id] = i;
671} 678}
672 679
673void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new) 680void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new,
681 u64 last_generation)
674{ 682{
675 if (new) { 683 if (new) {
676 int id = new->id; 684 int id = new->id;
@@ -682,7 +690,7 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
682 sort_memslots(slots); 690 sort_memslots(slots);
683 } 691 }
684 692
685 slots->generation++; 693 slots->generation = last_generation + 1;
686} 694}
687 695
688static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) 696static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
@@ -699,6 +707,35 @@ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
699 return 0; 707 return 0;
700} 708}
701 709
710static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
711 struct kvm_memslots *slots, struct kvm_memory_slot *new)
712{
713 struct kvm_memslots *old_memslots = kvm->memslots;
714
715 update_memslots(slots, new, kvm->memslots->generation);
716 rcu_assign_pointer(kvm->memslots, slots);
717 synchronize_srcu_expedited(&kvm->srcu);
718 return old_memslots;
719}
720
721/*
722 * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations:
723 * - create a new memory slot
724 * - delete an existing memory slot
725 * - modify an existing memory slot
726 * -- move it in the guest physical memory space
727 * -- just change its flags
728 *
729 * Since flags can be changed by some of these operations, the following
730 * differentiation is the best we can do for __kvm_set_memory_region():
731 */
732enum kvm_mr_change {
733 KVM_MR_CREATE,
734 KVM_MR_DELETE,
735 KVM_MR_MOVE,
736 KVM_MR_FLAGS_ONLY,
737};
738
702/* 739/*
703 * Allocate some memory and give it an address in the guest physical address 740 * Allocate some memory and give it an address in the guest physical address
704 * space. 741 * space.
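The hunk above factors the publish-and-wait sequence into install_new_memslots(): bump the generation from the currently installed copy, swap the pointer, wait out SRCU readers, and hand the old array back to the caller for reuse or freeing. A rough standalone model, with the RCU/SRCU calls reduced to comments and trimmed stand-in types:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct memslots_model { unsigned long long generation; };
struct kvm_model { struct memslots_model *memslots; };

static struct memslots_model *
install_new_memslots_model(struct kvm_model *kvm, struct memslots_model *slots)
{
    struct memslots_model *old = kvm->memslots;

    /* update_memslots(slots, new, kvm->memslots->generation) in the kernel */
    slots->generation = old->generation + 1;

    /* rcu_assign_pointer(kvm->memslots, slots); */
    kvm->memslots = slots;
    /* synchronize_srcu_expedited(&kvm->srcu) would wait out readers of 'old' */

    return old;
}

int main(void)
{
    struct kvm_model kvm = { .memslots = calloc(1, sizeof(struct memslots_model)) };
    struct memslots_model *copy = malloc(sizeof(*copy));

    memcpy(copy, kvm.memslots, sizeof(*copy));          /* kmemdup() stand-in */
    struct memslots_model *old = install_new_memslots_model(&kvm, copy);

    printf("published generation %llu (was %llu)\n",
           kvm.memslots->generation, old->generation);
    free(old);
    free(kvm.memslots);
    return 0;
}
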
@@ -709,14 +746,15 @@ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
709 */ 746 */
710int __kvm_set_memory_region(struct kvm *kvm, 747int __kvm_set_memory_region(struct kvm *kvm,
711 struct kvm_userspace_memory_region *mem, 748 struct kvm_userspace_memory_region *mem,
712 int user_alloc) 749 bool user_alloc)
713{ 750{
714 int r; 751 int r;
715 gfn_t base_gfn; 752 gfn_t base_gfn;
716 unsigned long npages; 753 unsigned long npages;
717 struct kvm_memory_slot *memslot, *slot; 754 struct kvm_memory_slot *slot;
718 struct kvm_memory_slot old, new; 755 struct kvm_memory_slot old, new;
719 struct kvm_memslots *slots, *old_memslots; 756 struct kvm_memslots *slots = NULL, *old_memslots;
757 enum kvm_mr_change change;
720 758
721 r = check_memory_region_flags(mem); 759 r = check_memory_region_flags(mem);
722 if (r) 760 if (r)
@@ -740,7 +778,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
740 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 778 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
741 goto out; 779 goto out;
742 780
743 memslot = id_to_memslot(kvm->memslots, mem->slot); 781 slot = id_to_memslot(kvm->memslots, mem->slot);
744 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 782 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
745 npages = mem->memory_size >> PAGE_SHIFT; 783 npages = mem->memory_size >> PAGE_SHIFT;
746 784
@@ -751,26 +789,48 @@ int __kvm_set_memory_region(struct kvm *kvm,
751 if (!npages) 789 if (!npages)
752 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 790 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
753 791
754 new = old = *memslot; 792 new = old = *slot;
755 793
756 new.id = mem->slot; 794 new.id = mem->slot;
757 new.base_gfn = base_gfn; 795 new.base_gfn = base_gfn;
758 new.npages = npages; 796 new.npages = npages;
759 new.flags = mem->flags; 797 new.flags = mem->flags;
760 798
761 /* Disallow changing a memory slot's size. */
762 r = -EINVAL; 799 r = -EINVAL;
763 if (npages && old.npages && npages != old.npages) 800 if (npages) {
764 goto out_free; 801 if (!old.npages)
802 change = KVM_MR_CREATE;
803 else { /* Modify an existing slot. */
804 if ((mem->userspace_addr != old.userspace_addr) ||
805 (npages != old.npages) ||
806 ((new.flags ^ old.flags) & KVM_MEM_READONLY))
807 goto out;
765 808
766 /* Check for overlaps */ 809 if (base_gfn != old.base_gfn)
767 r = -EEXIST; 810 change = KVM_MR_MOVE;
768 kvm_for_each_memslot(slot, kvm->memslots) { 811 else if (new.flags != old.flags)
769 if (slot->id >= KVM_MEMORY_SLOTS || slot == memslot) 812 change = KVM_MR_FLAGS_ONLY;
770 continue; 813 else { /* Nothing to change. */
771 if (!((base_gfn + npages <= slot->base_gfn) || 814 r = 0;
772 (base_gfn >= slot->base_gfn + slot->npages))) 815 goto out;
773 goto out_free; 816 }
817 }
818 } else if (old.npages) {
819 change = KVM_MR_DELETE;
820 } else /* Modify a non-existent slot: disallowed. */
821 goto out;
822
823 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
824 /* Check for overlaps */
825 r = -EEXIST;
826 kvm_for_each_memslot(slot, kvm->memslots) {
827 if ((slot->id >= KVM_USER_MEM_SLOTS) ||
828 (slot->id == mem->slot))
829 continue;
830 if (!((base_gfn + npages <= slot->base_gfn) ||
831 (base_gfn >= slot->base_gfn + slot->npages)))
832 goto out;
833 }
774 } 834 }
775 835
776 /* Free page dirty bitmap if unneeded */ 836 /* Free page dirty bitmap if unneeded */
@@ -778,10 +838,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
778 new.dirty_bitmap = NULL; 838 new.dirty_bitmap = NULL;
779 839
780 r = -ENOMEM; 840 r = -ENOMEM;
781 841 if (change == KVM_MR_CREATE) {
782 /* Allocate if a slot is being created */
783 if (npages && !old.npages) {
784 new.user_alloc = user_alloc;
785 new.userspace_addr = mem->userspace_addr; 842 new.userspace_addr = mem->userspace_addr;
786 843
787 if (kvm_arch_create_memslot(&new, npages)) 844 if (kvm_arch_create_memslot(&new, npages))
@@ -792,12 +849,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
792 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 849 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
793 if (kvm_create_dirty_bitmap(&new) < 0) 850 if (kvm_create_dirty_bitmap(&new) < 0)
794 goto out_free; 851 goto out_free;
795 /* destroy any largepage mappings for dirty tracking */
796 } 852 }
797 853
798 if (!npages || base_gfn != old.base_gfn) { 854 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
799 struct kvm_memory_slot *slot;
800
801 r = -ENOMEM; 855 r = -ENOMEM;
802 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), 856 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
803 GFP_KERNEL); 857 GFP_KERNEL);
@@ -806,11 +860,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
806 slot = id_to_memslot(slots, mem->slot); 860 slot = id_to_memslot(slots, mem->slot);
807 slot->flags |= KVM_MEMSLOT_INVALID; 861 slot->flags |= KVM_MEMSLOT_INVALID;
808 862
809 update_memslots(slots, NULL); 863 old_memslots = install_new_memslots(kvm, slots, NULL);
810 864
811 old_memslots = kvm->memslots; 865 /* slot was deleted or moved, clear iommu mapping */
812 rcu_assign_pointer(kvm->memslots, slots); 866 kvm_iommu_unmap_pages(kvm, &old);
813 synchronize_srcu_expedited(&kvm->srcu);
814 /* From this point no new shadow pages pointing to a deleted, 867 /* From this point no new shadow pages pointing to a deleted,
815 * or moved, memslot will be created. 868 * or moved, memslot will be created.
816 * 869 *
@@ -819,37 +872,48 @@ int __kvm_set_memory_region(struct kvm *kvm,
819 * - kvm_is_visible_gfn (mmu_check_roots) 872 * - kvm_is_visible_gfn (mmu_check_roots)
820 */ 873 */
821 kvm_arch_flush_shadow_memslot(kvm, slot); 874 kvm_arch_flush_shadow_memslot(kvm, slot);
822 kfree(old_memslots); 875 slots = old_memslots;
823 } 876 }
824 877
825 r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); 878 r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
826 if (r) 879 if (r)
827 goto out_free; 880 goto out_slots;
828 881
829 /* map/unmap the pages in iommu page table */ 882 r = -ENOMEM;
830 if (npages) { 883 /*
831 r = kvm_iommu_map_pages(kvm, &new); 884 * We can re-use the old_memslots from above, the only difference
832 if (r) 885 * from the currently installed memslots is the invalid flag. This
886 * will get overwritten by update_memslots anyway.
887 */
888 if (!slots) {
889 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
890 GFP_KERNEL);
891 if (!slots)
833 goto out_free; 892 goto out_free;
834 } else 893 }
835 kvm_iommu_unmap_pages(kvm, &old);
836 894
837 r = -ENOMEM; 895 /*
838 slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), 896 * IOMMU mapping: New slots need to be mapped. Old slots need to be
839 GFP_KERNEL); 897 * un-mapped and re-mapped if their base changes. Since base change
840 if (!slots) 898 * unmapping is handled above with slot deletion, mapping alone is
841 goto out_free; 899 * needed here. Anything else the iommu might care about for existing
900 * slots (size changes, userspace addr changes and read-only flag
901 * changes) is disallowed above, so any other attribute changes getting
902 * here can be skipped.
903 */
904 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
905 r = kvm_iommu_map_pages(kvm, &new);
906 if (r)
907 goto out_slots;
908 }
842 909
843 /* actual memory is freed via old in kvm_free_physmem_slot below */ 910 /* actual memory is freed via old in kvm_free_physmem_slot below */
844 if (!npages) { 911 if (change == KVM_MR_DELETE) {
845 new.dirty_bitmap = NULL; 912 new.dirty_bitmap = NULL;
846 memset(&new.arch, 0, sizeof(new.arch)); 913 memset(&new.arch, 0, sizeof(new.arch));
847 } 914 }
848 915
849 update_memslots(slots, &new); 916 old_memslots = install_new_memslots(kvm, slots, &new);
850 old_memslots = kvm->memslots;
851 rcu_assign_pointer(kvm->memslots, slots);
852 synchronize_srcu_expedited(&kvm->srcu);
853 917
854 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); 918 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
855 919
@@ -858,17 +922,18 @@ int __kvm_set_memory_region(struct kvm *kvm,
858 922
859 return 0; 923 return 0;
860 924
925out_slots:
926 kfree(slots);
861out_free: 927out_free:
862 kvm_free_physmem_slot(&new, &old); 928 kvm_free_physmem_slot(&new, &old);
863out: 929out:
864 return r; 930 return r;
865
866} 931}
867EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 932EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
868 933
869int kvm_set_memory_region(struct kvm *kvm, 934int kvm_set_memory_region(struct kvm *kvm,
870 struct kvm_userspace_memory_region *mem, 935 struct kvm_userspace_memory_region *mem,
871 int user_alloc) 936 bool user_alloc)
872{ 937{
873 int r; 938 int r;
874 939
@@ -882,9 +947,9 @@ EXPORT_SYMBOL_GPL(kvm_set_memory_region);
882int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 947int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
883 struct 948 struct
884 kvm_userspace_memory_region *mem, 949 kvm_userspace_memory_region *mem,
885 int user_alloc) 950 bool user_alloc)
886{ 951{
887 if (mem->slot >= KVM_MEMORY_SLOTS) 952 if (mem->slot >= KVM_USER_MEM_SLOTS)
888 return -EINVAL; 953 return -EINVAL;
889 return kvm_set_memory_region(kvm, mem, user_alloc); 954 return kvm_set_memory_region(kvm, mem, user_alloc);
890} 955}
@@ -898,7 +963,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
898 unsigned long any = 0; 963 unsigned long any = 0;
899 964
900 r = -EINVAL; 965 r = -EINVAL;
901 if (log->slot >= KVM_MEMORY_SLOTS) 966 if (log->slot >= KVM_USER_MEM_SLOTS)
902 goto out; 967 goto out;
903 968
904 memslot = id_to_memslot(kvm->memslots, log->slot); 969 memslot = id_to_memslot(kvm->memslots, log->slot);
@@ -944,7 +1009,7 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
944{ 1009{
945 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); 1010 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
946 1011
947 if (!memslot || memslot->id >= KVM_MEMORY_SLOTS || 1012 if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
948 memslot->flags & KVM_MEMSLOT_INVALID) 1013 memslot->flags & KVM_MEMSLOT_INVALID)
949 return 0; 1014 return 0;
950 1015
@@ -1641,6 +1706,7 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
1641{ 1706{
1642 struct pid *pid; 1707 struct pid *pid;
1643 struct task_struct *task = NULL; 1708 struct task_struct *task = NULL;
1709 bool ret = false;
1644 1710
1645 rcu_read_lock(); 1711 rcu_read_lock();
1646 pid = rcu_dereference(target->pid); 1712 pid = rcu_dereference(target->pid);
@@ -1648,17 +1714,15 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
1648 task = get_pid_task(target->pid, PIDTYPE_PID); 1714 task = get_pid_task(target->pid, PIDTYPE_PID);
1649 rcu_read_unlock(); 1715 rcu_read_unlock();
1650 if (!task) 1716 if (!task)
1651 return false; 1717 return ret;
1652 if (task->flags & PF_VCPU) { 1718 if (task->flags & PF_VCPU) {
1653 put_task_struct(task); 1719 put_task_struct(task);
1654 return false; 1720 return ret;
1655 }
1656 if (yield_to(task, 1)) {
1657 put_task_struct(task);
1658 return true;
1659 } 1721 }
1722 ret = yield_to(task, 1);
1660 put_task_struct(task); 1723 put_task_struct(task);
1661 return false; 1724
1725 return ret;
1662} 1726}
1663EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 1727EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
1664 1728
@@ -1699,12 +1763,14 @@ bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
1699 return eligible; 1763 return eligible;
1700} 1764}
1701#endif 1765#endif
1766
1702void kvm_vcpu_on_spin(struct kvm_vcpu *me) 1767void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1703{ 1768{
1704 struct kvm *kvm = me->kvm; 1769 struct kvm *kvm = me->kvm;
1705 struct kvm_vcpu *vcpu; 1770 struct kvm_vcpu *vcpu;
1706 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 1771 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
1707 int yielded = 0; 1772 int yielded = 0;
1773 int try = 3;
1708 int pass; 1774 int pass;
1709 int i; 1775 int i;
1710 1776
@@ -1716,7 +1782,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1716 * VCPU is holding the lock that we need and will release it. 1782 * VCPU is holding the lock that we need and will release it.
1717 * We approximate round-robin by starting at the last boosted VCPU. 1783 * We approximate round-robin by starting at the last boosted VCPU.
1718 */ 1784 */
1719 for (pass = 0; pass < 2 && !yielded; pass++) { 1785 for (pass = 0; pass < 2 && !yielded && try; pass++) {
1720 kvm_for_each_vcpu(i, vcpu, kvm) { 1786 kvm_for_each_vcpu(i, vcpu, kvm) {
1721 if (!pass && i <= last_boosted_vcpu) { 1787 if (!pass && i <= last_boosted_vcpu) {
1722 i = last_boosted_vcpu; 1788 i = last_boosted_vcpu;
@@ -1729,10 +1795,15 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1729 continue; 1795 continue;
1730 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 1796 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
1731 continue; 1797 continue;
1732 if (kvm_vcpu_yield_to(vcpu)) { 1798
1799 yielded = kvm_vcpu_yield_to(vcpu);
1800 if (yielded > 0) {
1733 kvm->last_boosted_vcpu = i; 1801 kvm->last_boosted_vcpu = i;
1734 yielded = 1;
1735 break; 1802 break;
1803 } else if (yielded < 0) {
1804 try--;
1805 if (!try)
1806 break;
1736 } 1807 }
1737 } 1808 }
1738 } 1809 }
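kvm_vcpu_on_spin() now treats the propagated yield result as tri-state and gives up after three -ESRCH failures instead of scanning every vcpu. A standalone sketch of the bounded retry policy, driven by canned yield results rather than real vcpus:

#include <stdio.h>

int main(void)
{
    /* canned outcomes per candidate: 0 = failed boost, <0 = -ESRCH, >0 = boosted */
    int results[] = { 0, -3, -3, 0, 1 };
    int n = sizeof(results) / sizeof(results[0]);
    int yielded = 0, try = 3;

    for (int pass = 0; pass < 2 && !yielded && try; pass++) {
        for (int i = 0; i < n; i++) {
            yielded = results[i];
            if (yielded > 0) {
                printf("boosted candidate %d on pass %d\n", i, pass);
                break;
            } else if (yielded < 0 && --try == 0) {
                printf("gave up after three failed yields\n");
                break;
            }
        }
    }
    return 0;
}
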
@@ -2127,7 +2198,7 @@ static long kvm_vm_ioctl(struct file *filp,
2127 sizeof kvm_userspace_mem)) 2198 sizeof kvm_userspace_mem))
2128 goto out; 2199 goto out;
2129 2200
2130 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 2201 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, true);
2131 break; 2202 break;
2132 } 2203 }
2133 case KVM_GET_DIRTY_LOG: { 2204 case KVM_GET_DIRTY_LOG: {