diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-24 15:01:20 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-24 15:01:20 -0400 |
| commit | 5fecc9d8f59e765c2a48379dd7c6f5cf88c7d75a (patch) | |
| tree | d1fc25d9650d3ac24591bba6f5e2e7a1afc54796 | |
| parent | 3c4cfadef6a1665d9cd02a543782d03d3e6740c6 (diff) | |
| parent | 1a577b72475d161b6677c05abe57301362023bb2 (diff) | |
Merge tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Avi Kivity:
"Highlights include
- full big real mode emulation on pre-Westmere Intel hosts (can be
disabled with emulate_invalid_guest_state=0)
- relatively small ppc and s390 updates
- PCID/INVPCID support in guests
- EOI avoidance; 3.6 guests should perform better on 3.6 hosts on
interrupt intensive workloads
- Lockless write faults during live migration
- EPT accessed/dirty bits support for new Intel processors"
Fix up conflicts in:
- Documentation/virtual/kvm/api.txt:
Stupid subchapter numbering, added next to each other.
- arch/powerpc/kvm/booke_interrupts.S:
PPC asm changes clashing with the KVM fixes
- arch/s390/include/asm/sigp.h, arch/s390/kvm/sigp.c:
Duplicated commits through the kvm tree and the s390 tree, with
subsequent edits in the KVM tree.
* tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (93 commits)
KVM: fix race with level interrupts
x86, hyper: fix build with !CONFIG_KVM_GUEST
Revert "apic: fix kvm build on UP without IOAPIC"
KVM guest: switch to apic_set_eoi_write, apic_write
apic: add apic_set_eoi_write for PV use
KVM: VMX: Implement PCID/INVPCID for guests with EPT
KVM: Add x86_hyper_kvm to complete detect_hypervisor_platform check
KVM: PPC: Critical interrupt emulation support
KVM: PPC: e500mc: Fix tlbilx emulation for 64-bit guests
KVM: PPC64: booke: Set interrupt computation mode for 64-bit host
KVM: PPC: bookehv: Add ESR flag to Data Storage Interrupt
KVM: PPC: bookehv64: Add support for std/ld emulation.
booke: Added crit/mc exception handler for e500v2
booke/bookehv: Add host crit-watchdog exception support
KVM: MMU: document mmu-lock and fast page fault
KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint
KVM: MMU: trace fast page fault
KVM: MMU: fast path of handling guest page fault
KVM: MMU: introduce SPTE_MMU_WRITEABLE bit
KVM: MMU: fold tlb flush judgement into mmu_spte_update
...
71 files changed, 1914 insertions, 519 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 2c9948379469..bf33aaa4c59f 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt | |||
| @@ -1946,6 +1946,40 @@ the guest using the specified gsi pin. The irqfd is removed using | |||
| 1946 | the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd | 1946 | the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd |
| 1947 | and kvm_irqfd.gsi. | 1947 | and kvm_irqfd.gsi. |
| 1948 | 1948 | ||
| 1949 | 4.76 KVM_PPC_ALLOCATE_HTAB | ||
| 1950 | |||
| 1951 | Capability: KVM_CAP_PPC_ALLOC_HTAB | ||
| 1952 | Architectures: powerpc | ||
| 1953 | Type: vm ioctl | ||
| 1954 | Parameters: Pointer to u32 containing hash table order (in/out) | ||
| 1955 | Returns: 0 on success, -1 on error | ||
| 1956 | |||
| 1957 | This requests the host kernel to allocate an MMU hash table for a | ||
| 1958 | guest using the PAPR paravirtualization interface. This only does | ||
| 1959 | anything if the kernel is configured to use the Book 3S HV style of | ||
| 1960 | virtualization. Otherwise the capability doesn't exist and the ioctl | ||
| 1961 | returns an ENOTTY error. The rest of this description assumes Book 3S | ||
| 1962 | HV. | ||
| 1963 | |||
| 1964 | There must be no vcpus running when this ioctl is called; if there | ||
| 1965 | are, it will do nothing and return an EBUSY error. | ||
| 1966 | |||
| 1967 | The parameter is a pointer to a 32-bit unsigned integer variable | ||
| 1968 | containing the order (log base 2) of the desired size of the hash | ||
| 1969 | table, which must be between 18 and 46. On successful return from the | ||
| 1970 | ioctl, it will have been updated with the order of the hash table that | ||
| 1971 | was allocated. | ||
| 1972 | |||
| 1973 | If no hash table has been allocated when any vcpu is asked to run | ||
| 1974 | (with the KVM_RUN ioctl), the host kernel will allocate a | ||
| 1975 | default-sized hash table (16 MB). | ||
| 1976 | |||
| 1977 | If this ioctl is called when a hash table has already been allocated, | ||
| 1978 | the kernel will clear out the existing hash table (zero all HPTEs) and | ||
| 1979 | return the hash table order in the parameter. (If the guest is using | ||
| 1980 | the virtualized real-mode area (VRMA) facility, the kernel will | ||
| 1981 | re-create the VRMA HPTEs on the next KVM_RUN of any vcpu.) | ||
| 1982 | |||
| 1949 | 1983 | ||
| 1950 | 5. The kvm_run structure | 1984 | 5. The kvm_run structure |
| 1951 | ------------------------ | 1985 | ------------------------ |
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index 3b4cd3bf5631..41b7ac9884b5 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt | |||
| @@ -6,7 +6,129 @@ KVM Lock Overview | |||
| 6 | 6 | ||
| 7 | (to be written) | 7 | (to be written) |
| 8 | 8 | ||
| 9 | 2. Reference | 9 | 2: Exception |
| 10 | ------------ | ||
| 11 | |||
| 12 | Fast page fault: | ||
| 13 | |||
| 14 | Fast page fault is the fast path which fixes the guest page fault out of | ||
| 15 | the mmu-lock on x86. Currently, the page fault can be fast only if the | ||
| 16 | shadow page table is present and it is caused by write-protect, that means | ||
| 17 | we just need change the W bit of the spte. | ||
| 18 | |||
| 19 | What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and | ||
| 20 | SPTE_MMU_WRITEABLE bit on the spte: | ||
| 21 | - SPTE_HOST_WRITEABLE means the gfn is writable on host. | ||
| 22 | - SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when | ||
| 23 | the gfn is writable on guest mmu and it is not write-protected by shadow | ||
| 24 | page write-protection. | ||
| 25 | |||
| 26 | On fast page fault path, we will use cmpxchg to atomically set the spte W | ||
| 27 | bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_MMU_WRITEABLE = 1, this | ||
| 28 | is safe because whenever changing these bits can be detected by cmpxchg. | ||
| 29 | |||
| 30 | But we need carefully check these cases: | ||
| 31 | 1): The mapping from gfn to pfn | ||
| 32 | The mapping from gfn to pfn may be changed since we can only ensure the pfn | ||
| 33 | is not changed during cmpxchg. This is an ABA problem, for example, below case | ||
| 34 | will happen: | ||
| 35 | |||
| 36 | At the beginning: | ||
| 37 | gpte = gfn1 | ||
| 38 | gfn1 is mapped to pfn1 on host | ||
| 39 | spte is the shadow page table entry corresponding with gpte and | ||
| 40 | spte = pfn1 | ||
| 41 | |||
| 42 | VCPU 0 VCPU 1 | ||
| 43 | on fast page fault path: | ||
| 44 | |||
| 45 | old_spte = *spte; | ||
| 46 | pfn1 is swapped out: | ||
| 47 | spte = 0; | ||
| 48 | |||
| 49 | pfn1 is re-alloced for gfn2. | ||
| 50 | |||
| 51 | gpte is changed to point to | ||
| 52 | gfn2 by the guest: | ||
| 53 | spte = pfn1; | ||
| 54 | |||
| 55 | if (cmpxchg(spte, old_spte, old_spte+W) | ||
| 56 | mark_page_dirty(vcpu->kvm, gfn1) | ||
| 57 | OOPS!!! | ||
| 58 | |||
| 59 | We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap. | ||
| 60 | |||
| 61 | For direct sp, we can easily avoid it since the spte of direct sp is fixed | ||
| 62 | to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic() | ||
| 63 | to pin gfn to pfn, because after gfn_to_pfn_atomic(): | ||
| 64 | - We have held the refcount of pfn that means the pfn can not be freed and | ||
| 65 | be reused for another gfn. | ||
| 66 | - The pfn is writable that means it can not be shared between different gfns | ||
| 67 | by KSM. | ||
| 68 | |||
| 69 | Then, we can ensure the dirty bitmaps is correctly set for a gfn. | ||
| 70 | |||
| 71 | Currently, to simplify the whole things, we disable fast page fault for | ||
| 72 | indirect shadow page. | ||
| 73 | |||
| 74 | 2): Dirty bit tracking | ||
| 75 | In the origin code, the spte can be fast updated (non-atomically) if the | ||
| 76 | spte is read-only and the Accessed bit has already been set since the | ||
| 77 | Accessed bit and Dirty bit can not be lost. | ||
| 78 | |||
| 79 | But it is not true after fast page fault since the spte can be marked | ||
| 80 | writable between reading spte and updating spte. Like below case: | ||
| 81 | |||
| 82 | At the beginning: | ||
| 83 | spte.W = 0 | ||
| 84 | spte.Accessed = 1 | ||
| 85 | |||
| 86 | VCPU 0 VCPU 1 | ||
| 87 | In mmu_spte_clear_track_bits(): | ||
| 88 | |||
| 89 | old_spte = *spte; | ||
| 90 | |||
| 91 | /* 'if' condition is satisfied. */ | ||
| 92 | if (old_spte.Accessed == 1 && | ||
| 93 | old_spte.W == 0) | ||
| 94 | spte = 0ull; | ||
| 95 | on fast page fault path: | ||
| 96 | spte.W = 1 | ||
| 97 | memory write on the spte: | ||
| 98 | spte.Dirty = 1 | ||
| 99 | |||
| 100 | |||
| 101 | else | ||
| 102 | old_spte = xchg(spte, 0ull) | ||
| 103 | |||
| 104 | |||
| 105 | if (old_spte.Accessed == 1) | ||
| 106 | kvm_set_pfn_accessed(spte.pfn); | ||
| 107 | if (old_spte.Dirty == 1) | ||
| 108 | kvm_set_pfn_dirty(spte.pfn); | ||
| 109 | OOPS!!! | ||
| 110 | |||
| 111 | The Dirty bit is lost in this case. | ||
| 112 | |||
| 113 | In order to avoid this kind of issue, we always treat the spte as "volatile" | ||
| 114 | if it can be updated out of mmu-lock, see spte_has_volatile_bits(), it means, | ||
| 115 | the spte is always atomically updated in this case. | ||
| 116 | |||
| 117 | 3): flush tlbs due to spte updated | ||
| 118 | If the spte is updated from writable to readonly, we should flush all TLBs, | ||
| 119 | otherwise rmap_write_protect will find a read-only spte, even though the | ||
| 120 | writable spte might be cached on a CPU's TLB. | ||
| 121 | |||
| 122 | As mentioned before, the spte can be updated to writable out of mmu-lock on | ||
| 123 | fast page fault path, in order to easily audit the path, we see if TLBs need | ||
| 124 | be flushed caused by this reason in mmu_spte_update() since this is a common | ||
| 125 | function to update spte (present -> present). | ||
| 126 | |||
| 127 | Since the spte is "volatile" if it can be updated out of mmu-lock, we always | ||
| 128 | atomically update the spte, the race caused by fast page fault can be avoided, | ||
| 129 | See the comments in spte_has_volatile_bits() and mmu_spte_update(). | ||
| 130 | |||
| 131 | 3. Reference | ||
| 10 | ------------ | 132 | ------------ |
| 11 | 133 | ||
| 12 | Name: kvm_lock | 134 | Name: kvm_lock |
| @@ -23,3 +145,9 @@ Arch: x86 | |||
| 23 | Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} | 145 | Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} |
| 24 | - tsc offset in vmcb | 146 | - tsc offset in vmcb |
| 25 | Comment: 'raw' because updating the tsc offsets must not be preempted. | 147 | Comment: 'raw' because updating the tsc offsets must not be preempted. |
| 148 | |||
| 149 | Name: kvm->mmu_lock | ||
| 150 | Type: spinlock_t | ||
| 151 | Arch: any | ||
| 152 | Protects: -shadow page/shadow tlb entry | ||
| 153 | Comment: it is a spinlock since it is used in mmu notifier. | ||
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index 96b41bd97523..730471048583 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt | |||
| @@ -223,3 +223,36 @@ MSR_KVM_STEAL_TIME: 0x4b564d03 | |||
| 223 | steal: the amount of time in which this vCPU did not run, in | 223 | steal: the amount of time in which this vCPU did not run, in |
| 224 | nanoseconds. Time during which the vcpu is idle, will not be | 224 | nanoseconds. Time during which the vcpu is idle, will not be |
| 225 | reported as steal time. | 225 | reported as steal time. |
| 226 | |||
| 227 | MSR_KVM_EOI_EN: 0x4b564d04 | ||
| 228 | data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0 | ||
| 229 | when disabled. Bit 1 is reserved and must be zero. When PV end of | ||
| 230 | interrupt is enabled (bit 0 set), bits 63-2 hold a 4-byte aligned | ||
| 231 | physical address of a 4 byte memory area which must be in guest RAM and | ||
| 232 | must be zeroed. | ||
| 233 | |||
| 234 | The first, least significant bit of 4 byte memory location will be | ||
| 235 | written to by the hypervisor, typically at the time of interrupt | ||
| 236 | injection. Value of 1 means that guest can skip writing EOI to the apic | ||
| 237 | (using MSR or MMIO write); instead, it is sufficient to signal | ||
| 238 | EOI by clearing the bit in guest memory - this location will | ||
| 239 | later be polled by the hypervisor. | ||
| 240 | Value of 0 means that the EOI write is required. | ||
| 241 | |||
| 242 | It is always safe for the guest to ignore the optimization and perform | ||
| 243 | the APIC EOI write anyway. | ||
| 244 | |||
| 245 | Hypervisor is guaranteed to only modify this least | ||
| 246 | significant bit while in the current VCPU context, this means that | ||
| 247 | guest does not need to use either lock prefix or memory ordering | ||
| 248 | primitives to synchronise with the hypervisor. | ||
| 249 | |||
| 250 | However, hypervisor can set and clear this memory bit at any time: | ||
| 251 | therefore to make sure hypervisor does not interrupt the | ||
| 252 | guest and clear the least significant bit in the memory area | ||
| 253 | in the window between guest testing it to detect | ||
| 254 | whether it can skip EOI apic write and between guest | ||
| 255 | clearing it to signal EOI to the hypervisor, | ||
| 256 | guest must both read the least significant bit in the memory area and | ||
| 257 | clear it using a single CPU instruction, such as test and clear, or | ||
| 258 | compare and exchange. | ||
diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt index 6e7c37050930..4911cf95c67e 100644 --- a/Documentation/virtual/kvm/ppc-pv.txt +++ b/Documentation/virtual/kvm/ppc-pv.txt | |||
| @@ -109,8 +109,6 @@ The following bits are safe to be set inside the guest: | |||
| 109 | 109 | ||
| 110 | MSR_EE | 110 | MSR_EE |
| 111 | MSR_RI | 111 | MSR_RI |
| 112 | MSR_CR | ||
| 113 | MSR_ME | ||
| 114 | 112 | ||
| 115 | If any other bit changes in the MSR, please still use mtmsr(d). | 113 | If any other bit changes in the MSR, please still use mtmsr(d). |
| 116 | 114 | ||
diff --git a/MAINTAINERS b/MAINTAINERS index 7316ab62e5af..cda045337a9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
| @@ -4002,8 +4002,8 @@ F: arch/ia64/include/asm/kvm* | |||
| 4002 | F: arch/ia64/kvm/ | 4002 | F: arch/ia64/kvm/ |
| 4003 | 4003 | ||
| 4004 | KERNEL VIRTUAL MACHINE for s390 (KVM/s390) | 4004 | KERNEL VIRTUAL MACHINE for s390 (KVM/s390) |
| 4005 | M: Carsten Otte <cotte@de.ibm.com> | ||
| 4006 | M: Christian Borntraeger <borntraeger@de.ibm.com> | 4005 | M: Christian Borntraeger <borntraeger@de.ibm.com> |
| 4006 | M: Cornelia Huck <cornelia.huck@de.ibm.com> | ||
| 4007 | M: linux390@de.ibm.com | 4007 | M: linux390@de.ibm.com |
| 4008 | L: linux-s390@vger.kernel.org | 4008 | L: linux-s390@vger.kernel.org |
| 4009 | W: http://www.ibm.com/developerworks/linux/linux390/ | 4009 | W: http://www.ibm.com/developerworks/linux/linux390/ |
diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index b9f82c84f093..ec6c6b301238 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h | |||
| @@ -26,6 +26,7 @@ | |||
| 26 | 26 | ||
| 27 | /* Select x86 specific features in <linux/kvm.h> */ | 27 | /* Select x86 specific features in <linux/kvm.h> */ |
| 28 | #define __KVM_HAVE_IOAPIC | 28 | #define __KVM_HAVE_IOAPIC |
| 29 | #define __KVM_HAVE_IRQ_LINE | ||
| 29 | #define __KVM_HAVE_DEVICE_ASSIGNMENT | 30 | #define __KVM_HAVE_DEVICE_ASSIGNMENT |
| 30 | 31 | ||
| 31 | /* Architectural interrupt line count. */ | 32 | /* Architectural interrupt line count. */ |
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 9806e55f91be..df5351e3eed7 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig | |||
| @@ -19,6 +19,7 @@ if VIRTUALIZATION | |||
| 19 | 19 | ||
| 20 | config KVM | 20 | config KVM |
| 21 | tristate "Kernel-based Virtual Machine (KVM) support" | 21 | tristate "Kernel-based Virtual Machine (KVM) support" |
| 22 | depends on BROKEN | ||
| 22 | depends on HAVE_KVM && MODULES && EXPERIMENTAL | 23 | depends on HAVE_KVM && MODULES && EXPERIMENTAL |
| 23 | # for device assignment: | 24 | # for device assignment: |
| 24 | depends on PCI | 25 | depends on PCI |
diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index 976835d8f22e..bf2c06c33871 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h | |||
| @@ -153,6 +153,8 @@ | |||
| 153 | #define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5" | 153 | #define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5" |
| 154 | #define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4" | 154 | #define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4" |
| 155 | 155 | ||
| 156 | extern bool epapr_paravirt_enabled; | ||
| 157 | extern u32 epapr_hypercall_start[]; | ||
| 156 | 158 | ||
| 157 | /* | 159 | /* |
| 158 | * We use "uintptr_t" to define a register because it's guaranteed to be a | 160 | * We use "uintptr_t" to define a register because it's guaranteed to be a |
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 0554ab062bdc..e45c4947a772 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h | |||
| @@ -34,6 +34,8 @@ extern void __replay_interrupt(unsigned int vector); | |||
| 34 | 34 | ||
| 35 | extern void timer_interrupt(struct pt_regs *); | 35 | extern void timer_interrupt(struct pt_regs *); |
| 36 | extern void performance_monitor_exception(struct pt_regs *regs); | 36 | extern void performance_monitor_exception(struct pt_regs *regs); |
| 37 | extern void WatchdogException(struct pt_regs *regs); | ||
| 38 | extern void unknown_exception(struct pt_regs *regs); | ||
| 37 | 39 | ||
| 38 | #ifdef CONFIG_PPC64 | 40 | #ifdef CONFIG_PPC64 |
| 39 | #include <asm/paca.h> | 41 | #include <asm/paca.h> |
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index b0c08b142770..0dd1d86d3e31 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h | |||
| @@ -36,11 +36,8 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) | |||
| 36 | #define SPAPR_TCE_SHIFT 12 | 36 | #define SPAPR_TCE_SHIFT 12 |
| 37 | 37 | ||
| 38 | #ifdef CONFIG_KVM_BOOK3S_64_HV | 38 | #ifdef CONFIG_KVM_BOOK3S_64_HV |
| 39 | /* For now use fixed-size 16MB page table */ | 39 | #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ |
| 40 | #define HPT_ORDER 24 | 40 | extern int kvm_hpt_order; /* order of preallocated HPTs */ |
| 41 | #define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ | ||
| 42 | #define HPT_NPTE (HPT_NPTEG << 3) /* 8 PTEs per PTEG */ | ||
| 43 | #define HPT_HASH_MASK (HPT_NPTEG - 1) | ||
| 44 | #endif | 41 | #endif |
| 45 | 42 | ||
| 46 | #define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ | 43 | #define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ |
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d848cdc49715..50ea12fd7bf5 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h | |||
| @@ -237,6 +237,10 @@ struct kvm_arch { | |||
| 237 | unsigned long vrma_slb_v; | 237 | unsigned long vrma_slb_v; |
| 238 | int rma_setup_done; | 238 | int rma_setup_done; |
| 239 | int using_mmu_notifiers; | 239 | int using_mmu_notifiers; |
| 240 | u32 hpt_order; | ||
| 241 | atomic_t vcpus_running; | ||
| 242 | unsigned long hpt_npte; | ||
| 243 | unsigned long hpt_mask; | ||
| 240 | spinlock_t slot_phys_lock; | 244 | spinlock_t slot_phys_lock; |
| 241 | unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; | 245 | unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; |
| 242 | int slot_npages[KVM_MEM_SLOTS_NUM]; | 246 | int slot_npages[KVM_MEM_SLOTS_NUM]; |
| @@ -414,7 +418,9 @@ struct kvm_vcpu_arch { | |||
| 414 | ulong mcsrr1; | 418 | ulong mcsrr1; |
| 415 | ulong mcsr; | 419 | ulong mcsr; |
| 416 | u32 dec; | 420 | u32 dec; |
| 421 | #ifdef CONFIG_BOOKE | ||
| 417 | u32 decar; | 422 | u32 decar; |
| 423 | #endif | ||
| 418 | u32 tbl; | 424 | u32 tbl; |
| 419 | u32 tbu; | 425 | u32 tbu; |
| 420 | u32 tcr; | 426 | u32 tcr; |
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index f68c22fa2fce..0124937a23b9 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h | |||
| @@ -119,7 +119,8 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); | |||
| 119 | extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); | 119 | extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); |
| 120 | extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); | 120 | extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); |
| 121 | 121 | ||
| 122 | extern long kvmppc_alloc_hpt(struct kvm *kvm); | 122 | extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp); |
| 123 | extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp); | ||
| 123 | extern void kvmppc_free_hpt(struct kvm *kvm); | 124 | extern void kvmppc_free_hpt(struct kvm *kvm); |
| 124 | extern long kvmppc_prepare_vrma(struct kvm *kvm, | 125 | extern long kvmppc_prepare_vrma(struct kvm *kvm, |
| 125 | struct kvm_userspace_memory_region *mem); | 126 | struct kvm_userspace_memory_region *mem); |
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 83afacd3ba7b..bb282dd81612 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile | |||
| @@ -128,6 +128,7 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),) | |||
| 128 | obj-y += ppc_save_regs.o | 128 | obj-y += ppc_save_regs.o |
| 129 | endif | 129 | endif |
| 130 | 130 | ||
| 131 | obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o | ||
| 131 | obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o | 132 | obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o |
| 132 | 133 | ||
| 133 | # Disable GCOV in odd or sensitive code | 134 | # Disable GCOV in odd or sensitive code |
diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S new file mode 100644 index 000000000000..697b390ebfd8 --- /dev/null +++ b/arch/powerpc/kernel/epapr_hcalls.S | |||
| @@ -0,0 +1,25 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Freescale Semiconductor, Inc. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public License | ||
| 6 | * as published by the Free Software Foundation; either version | ||
| 7 | * 2 of the License, or (at your option) any later version. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/threads.h> | ||
| 11 | #include <asm/reg.h> | ||
| 12 | #include <asm/page.h> | ||
| 13 | #include <asm/cputable.h> | ||
| 14 | #include <asm/thread_info.h> | ||
| 15 | #include <asm/ppc_asm.h> | ||
| 16 | #include <asm/asm-offsets.h> | ||
| 17 | |||
| 18 | /* Hypercall entry point. Will be patched with device tree instructions. */ | ||
| 19 | .global epapr_hypercall_start | ||
| 20 | epapr_hypercall_start: | ||
| 21 | li r3, -1 | ||
| 22 | nop | ||
| 23 | nop | ||
| 24 | nop | ||
| 25 | blr | ||
diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c new file mode 100644 index 000000000000..028aeae370b6 --- /dev/null +++ b/arch/powerpc/kernel/epapr_paravirt.c | |||
| @@ -0,0 +1,52 @@ | |||
| 1 | /* | ||
| 2 | * ePAPR para-virtualization support. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License, version 2, as | ||
| 6 | * published by the Free Software Foundation. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, | ||
| 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 11 | * GNU General Public License for more details. | ||
| 12 | * | ||
| 13 | * You should have received a copy of the GNU General Public License | ||
| 14 | * along with this program; if not, write to the Free Software | ||
| 15 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
| 16 | * | ||
| 17 | * Copyright (C) 2012 Freescale Semiconductor, Inc. | ||
| 18 | */ | ||
| 19 | |||
| 20 | #include <linux/of.h> | ||
| 21 | #include <asm/epapr_hcalls.h> | ||
| 22 | #include <asm/cacheflush.h> | ||
| 23 | #include <asm/code-patching.h> | ||
| 24 | |||
| 25 | bool epapr_paravirt_enabled; | ||
| 26 | |||
| 27 | static int __init epapr_paravirt_init(void) | ||
| 28 | { | ||
| 29 | struct device_node *hyper_node; | ||
| 30 | const u32 *insts; | ||
| 31 | int len, i; | ||
| 32 | |||
| 33 | hyper_node = of_find_node_by_path("/hypervisor"); | ||
| 34 | if (!hyper_node) | ||
| 35 | return -ENODEV; | ||
| 36 | |||
| 37 | insts = of_get_property(hyper_node, "hcall-instructions", &len); | ||
| 38 | if (!insts) | ||
| 39 | return -ENODEV; | ||
| 40 | |||
| 41 | if (len % 4 || len > (4 * 4)) | ||
| 42 | return -ENODEV; | ||
| 43 | |||
| 44 | for (i = 0; i < (len / 4); i++) | ||
| 45 | patch_instruction(epapr_hypercall_start + i, insts[i]); | ||
| 46 | |||
| 47 | epapr_paravirt_enabled = true; | ||
| 48 | |||
| 49 | return 0; | ||
| 50 | } | ||
| 51 | |||
| 52 | early_initcall(epapr_paravirt_init); | ||
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 02c167db6ba0..867db1de8949 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <asm/cacheflush.h> | 31 | #include <asm/cacheflush.h> |
| 32 | #include <asm/disassemble.h> | 32 | #include <asm/disassemble.h> |
| 33 | #include <asm/ppc-opcode.h> | 33 | #include <asm/ppc-opcode.h> |
| 34 | #include <asm/epapr_hcalls.h> | ||
| 34 | 35 | ||
| 35 | #define KVM_MAGIC_PAGE (-4096L) | 36 | #define KVM_MAGIC_PAGE (-4096L) |
| 36 | #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x) | 37 | #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x) |
| @@ -726,7 +727,7 @@ unsigned long kvm_hypercall(unsigned long *in, | |||
| 726 | unsigned long register r11 asm("r11") = nr; | 727 | unsigned long register r11 asm("r11") = nr; |
| 727 | unsigned long register r12 asm("r12"); | 728 | unsigned long register r12 asm("r12"); |
| 728 | 729 | ||
| 729 | asm volatile("bl kvm_hypercall_start" | 730 | asm volatile("bl epapr_hypercall_start" |
| 730 | : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6), | 731 | : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6), |
| 731 | "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11), | 732 | "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11), |
| 732 | "=r"(r12) | 733 | "=r"(r12) |
| @@ -747,29 +748,6 @@ unsigned long kvm_hypercall(unsigned long *in, | |||
| 747 | } | 748 | } |
| 748 | EXPORT_SYMBOL_GPL(kvm_hypercall); | 749 | EXPORT_SYMBOL_GPL(kvm_hypercall); |
| 749 | 750 | ||
| 750 | static int kvm_para_setup(void) | ||
| 751 | { | ||
| 752 | extern u32 kvm_hypercall_start; | ||
| 753 | struct device_node *hyper_node; | ||
| 754 | u32 *insts; | ||
| 755 | int len, i; | ||
| 756 | |||
| 757 | hyper_node = of_find_node_by_path("/hypervisor"); | ||
| 758 | if (!hyper_node) | ||
| 759 | return -1; | ||
| 760 | |||
| 761 | insts = (u32*)of_get_property(hyper_node, "hcall-instructions", &len); | ||
| 762 | if (len % 4) | ||
| 763 | return -1; | ||
| 764 | if (len > (4 * 4)) | ||
| 765 | return -1; | ||
| 766 | |||
| 767 | for (i = 0; i < (len / 4); i++) | ||
| 768 | kvm_patch_ins(&(&kvm_hypercall_start)[i], insts[i]); | ||
| 769 | |||
| 770 | return 0; | ||
| 771 | } | ||
| 772 | |||
| 773 | static __init void kvm_free_tmp(void) | 751 | static __init void kvm_free_tmp(void) |
| 774 | { | 752 | { |
| 775 | unsigned long start, end; | 753 | unsigned long start, end; |
| @@ -791,7 +769,7 @@ static int __init kvm_guest_init(void) | |||
| 791 | if (!kvm_para_available()) | 769 | if (!kvm_para_available()) |
| 792 | goto free_tmp; | 770 | goto free_tmp; |
| 793 | 771 | ||
| 794 | if (kvm_para_setup()) | 772 | if (!epapr_paravirt_enabled) |
| 795 | goto free_tmp; | 773 | goto free_tmp; |
| 796 | 774 | ||
| 797 | if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE)) | 775 | if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE)) |
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S index e291cf3cf954..e100ff324a85 100644 --- a/arch/powerpc/kernel/kvm_emul.S +++ b/arch/powerpc/kernel/kvm_emul.S | |||
| @@ -24,16 +24,6 @@ | |||
| 24 | #include <asm/page.h> | 24 | #include <asm/page.h> |
| 25 | #include <asm/asm-offsets.h> | 25 | #include <asm/asm-offsets.h> |
| 26 | 26 | ||
| 27 | /* Hypercall entry point. Will be patched with device tree instructions. */ | ||
| 28 | |||
| 29 | .global kvm_hypercall_start | ||
| 30 | kvm_hypercall_start: | ||
| 31 | li r3, -1 | ||
| 32 | nop | ||
| 33 | nop | ||
| 34 | nop | ||
| 35 | blr | ||
| 36 | |||
| 37 | #define KVM_MAGIC_PAGE (-4096) | 27 | #define KVM_MAGIC_PAGE (-4096) |
| 38 | 28 | ||
| 39 | #ifdef CONFIG_64BIT | 29 | #ifdef CONFIG_64BIT |
| @@ -132,7 +122,7 @@ kvm_emulate_mtmsrd_len: | |||
| 132 | .long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4 | 122 | .long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4 |
| 133 | 123 | ||
| 134 | 124 | ||
| 135 | #define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI) | 125 | #define MSR_SAFE_BITS (MSR_EE | MSR_RI) |
| 136 | #define MSR_CRITICAL_BITS ~MSR_SAFE_BITS | 126 | #define MSR_CRITICAL_BITS ~MSR_SAFE_BITS |
| 137 | 127 | ||
| 138 | .global kvm_emulate_mtmsr | 128 | .global kvm_emulate_mtmsr |
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 80a577517584..d03eb6f7b058 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c | |||
| @@ -37,56 +37,121 @@ | |||
| 37 | /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ | 37 | /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ |
| 38 | #define MAX_LPID_970 63 | 38 | #define MAX_LPID_970 63 |
| 39 | 39 | ||
| 40 | long kvmppc_alloc_hpt(struct kvm *kvm) | 40 | /* Power architecture requires HPT is at least 256kB */ |
| 41 | #define PPC_MIN_HPT_ORDER 18 | ||
| 42 | |||
| 43 | long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) | ||
| 41 | { | 44 | { |
| 42 | unsigned long hpt; | 45 | unsigned long hpt; |
| 43 | long lpid; | ||
| 44 | struct revmap_entry *rev; | 46 | struct revmap_entry *rev; |
| 45 | struct kvmppc_linear_info *li; | 47 | struct kvmppc_linear_info *li; |
| 48 | long order = kvm_hpt_order; | ||
| 46 | 49 | ||
| 47 | /* Allocate guest's hashed page table */ | 50 | if (htab_orderp) { |
| 48 | li = kvm_alloc_hpt(); | 51 | order = *htab_orderp; |
| 49 | if (li) { | 52 | if (order < PPC_MIN_HPT_ORDER) |
| 50 | /* using preallocated memory */ | 53 | order = PPC_MIN_HPT_ORDER; |
| 51 | hpt = (ulong)li->base_virt; | 54 | } |
| 52 | kvm->arch.hpt_li = li; | 55 | |
| 53 | } else { | 56 | /* |
| 54 | /* using dynamic memory */ | 57 | * If the user wants a different size from default, |
| 58 | * try first to allocate it from the kernel page allocator. | ||
| 59 | */ | ||
| 60 | hpt = 0; | ||
| 61 | if (order != kvm_hpt_order) { | ||
| 55 | hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| | 62 | hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| |
| 56 | __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT); | 63 | __GFP_NOWARN, order - PAGE_SHIFT); |
| 64 | if (!hpt) | ||
| 65 | --order; | ||
| 57 | } | 66 | } |
| 58 | 67 | ||
| 68 | /* Next try to allocate from the preallocated pool */ | ||
| 59 | if (!hpt) { | 69 | if (!hpt) { |
| 60 | pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n"); | 70 | li = kvm_alloc_hpt(); |
| 61 | return -ENOMEM; | 71 | if (li) { |
| 72 | hpt = (ulong)li->base_virt; | ||
| 73 | kvm->arch.hpt_li = li; | ||
| 74 | order = kvm_hpt_order; | ||
| 75 | } | ||
| 62 | } | 76 | } |
| 77 | |||
| 78 | /* Lastly try successively smaller sizes from the page allocator */ | ||
| 79 | while (!hpt && order > PPC_MIN_HPT_ORDER) { | ||
| 80 | hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| | ||
| 81 | __GFP_NOWARN, order - PAGE_SHIFT); | ||
| 82 | if (!hpt) | ||
| 83 | --order; | ||
| 84 | } | ||
| 85 | |||
| 86 | if (!hpt) | ||
| 87 | return -ENOMEM; | ||
| 88 | |||
| 63 | kvm->arch.hpt_virt = hpt; | 89 | kvm->arch.hpt_virt = hpt; |
| 90 | kvm->arch.hpt_order = order; | ||
| 91 | /* HPTEs are 2**4 bytes long */ | ||
| 92 | kvm->arch.hpt_npte = 1ul << (order - 4); | ||
| 93 | /* 128 (2**7) bytes in each HPTEG */ | ||
| 94 | kvm->arch.hpt_mask = (1ul << (order - 7)) - 1; | ||
| 64 | 95 | ||
| 65 | /* Allocate reverse map array */ | 96 | /* Allocate reverse map array */ |
| 66 | rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE); | 97 | rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); |
| 67 | if (!rev) { | 98 | if (!rev) { |
| 68 | pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); | 99 | pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); |
| 69 | goto out_freehpt; | 100 | goto out_freehpt; |
| 70 | } | 101 | } |
| 71 | kvm->arch.revmap = rev; | 102 | kvm->arch.revmap = rev; |
| 103 | kvm->arch.sdr1 = __pa(hpt) | (order - 18); | ||
| 72 | 104 | ||
| 73 | lpid = kvmppc_alloc_lpid(); | 105 | pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", |
| 74 | if (lpid < 0) | 106 | hpt, order, kvm->arch.lpid); |
| 75 | goto out_freeboth; | ||
| 76 | 107 | ||
| 77 | kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18); | 108 | if (htab_orderp) |
| 78 | kvm->arch.lpid = lpid; | 109 | *htab_orderp = order; |
| 79 | |||
| 80 | pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid); | ||
| 81 | return 0; | 110 | return 0; |
| 82 | 111 | ||
| 83 | out_freeboth: | ||
| 84 | vfree(rev); | ||
| 85 | out_freehpt: | 112 | out_freehpt: |
| 86 | free_pages(hpt, HPT_ORDER - PAGE_SHIFT); | 113 | if (kvm->arch.hpt_li) |
| 114 | kvm_release_hpt(kvm->arch.hpt_li); | ||
| 115 | else | ||
| 116 | free_pages(hpt, order - PAGE_SHIFT); | ||
| 87 | return -ENOMEM; | 117 | return -ENOMEM; |
| 88 | } | 118 | } |
| 89 | 119 | ||
| 120 | long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) | ||
| 121 | { | ||
| 122 | long err = -EBUSY; | ||
| 123 | long order; | ||
| 124 | |||
| 125 | mutex_lock(&kvm->lock); | ||
| 126 | if (kvm->arch.rma_setup_done) { | ||
| 127 | kvm->arch.rma_setup_done = 0; | ||
| 128 | /* order rma_setup_done vs. vcpus_running */ | ||
| 129 | smp_mb(); | ||
| 130 | if (atomic_read(&kvm->arch.vcpus_running)) { | ||
| 131 | kvm->arch.rma_setup_done = 1; | ||
| 132 | goto out; | ||
| 133 | } | ||
| 134 | } | ||
| 135 | if (kvm->arch.hpt_virt) { | ||
| 136 | order = kvm->arch.hpt_order; | ||
| 137 | /* Set the entire HPT to 0, i.e. invalid HPTEs */ | ||
| 138 | memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); | ||
| 139 | /* | ||
| 140 | * Set the whole last_vcpu array to an invalid vcpu number. | ||
| 141 | * This ensures that each vcpu will flush its TLB on next entry. | ||
| 142 | */ | ||
| 143 | memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu)); | ||
| 144 | *htab_orderp = order; | ||
| 145 | err = 0; | ||
| 146 | } else { | ||
| 147 | err = kvmppc_alloc_hpt(kvm, htab_orderp); | ||
| 148 | order = *htab_orderp; | ||
| 149 | } | ||
| 150 | out: | ||
| 151 | mutex_unlock(&kvm->lock); | ||
| 152 | return err; | ||
| 153 | } | ||
| 154 | |||
| 90 | void kvmppc_free_hpt(struct kvm *kvm) | 155 | void kvmppc_free_hpt(struct kvm *kvm) |
| 91 | { | 156 | { |
| 92 | kvmppc_free_lpid(kvm->arch.lpid); | 157 | kvmppc_free_lpid(kvm->arch.lpid); |
| @@ -94,7 +159,8 @@ void kvmppc_free_hpt(struct kvm *kvm) | |||
| 94 | if (kvm->arch.hpt_li) | 159 | if (kvm->arch.hpt_li) |
| 95 | kvm_release_hpt(kvm->arch.hpt_li); | 160 | kvm_release_hpt(kvm->arch.hpt_li); |
| 96 | else | 161 | else |
| 97 | free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); | 162 | free_pages(kvm->arch.hpt_virt, |
| 163 | kvm->arch.hpt_order - PAGE_SHIFT); | ||
| 98 | } | 164 | } |
| 99 | 165 | ||
| 100 | /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ | 166 | /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ |
| @@ -119,6 +185,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, | |||
| 119 | unsigned long psize; | 185 | unsigned long psize; |
| 120 | unsigned long hp0, hp1; | 186 | unsigned long hp0, hp1; |
| 121 | long ret; | 187 | long ret; |
| 188 | struct kvm *kvm = vcpu->kvm; | ||
| 122 | 189 | ||
| 123 | psize = 1ul << porder; | 190 | psize = 1ul << porder; |
| 124 | npages = memslot->npages >> (porder - PAGE_SHIFT); | 191 | npages = memslot->npages >> (porder - PAGE_SHIFT); |
| @@ -127,8 +194,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, | |||
| 127 | if (npages > 1ul << (40 - porder)) | 194 | if (npages > 1ul << (40 - porder)) |
| 128 | npages = 1ul << (40 - porder); | 195 | npages = 1ul << (40 - porder); |
| 129 | /* Can't use more than 1 HPTE per HPTEG */ | 196 | /* Can't use more than 1 HPTE per HPTEG */ |
| 130 | if (npages > HPT_NPTEG) | 197 | if (npages > kvm->arch.hpt_mask + 1) |
| 131 | npages = HPT_NPTEG; | 198 | npages = kvm->arch.hpt_mask + 1; |
| 132 | 199 | ||
| 133 | hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | | 200 | hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | |
| 134 | HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); | 201 | HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); |
| @@ -138,7 +205,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, | |||
| 138 | for (i = 0; i < npages; ++i) { | 205 | for (i = 0; i < npages; ++i) { |
| 139 | addr = i << porder; | 206 | addr = i << porder; |
| 140 | /* can't use hpt_hash since va > 64 bits */ | 207 | /* can't use hpt_hash since va > 64 bits */ |
| 141 | hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; | 208 | hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask; |
| 142 | /* | 209 | /* |
| 143 | * We assume that the hash table is empty and no | 210 | * We assume that the hash table is empty and no |
| 144 | * vcpus are using it at this stage. Since we create | 211 | * vcpus are using it at this stage. Since we create |
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3abe1b86e583..83e929e66f9d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c | |||
| @@ -56,7 +56,7 @@ | |||
| 56 | /* #define EXIT_DEBUG_INT */ | 56 | /* #define EXIT_DEBUG_INT */ |
| 57 | 57 | ||
| 58 | static void kvmppc_end_cede(struct kvm_vcpu *vcpu); | 58 | static void kvmppc_end_cede(struct kvm_vcpu *vcpu); |
| 59 | static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu); | 59 | static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); |
| 60 | 60 | ||
| 61 | void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | 61 | void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) |
| 62 | { | 62 | { |
| @@ -1104,11 +1104,15 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) | |||
| 1104 | return -EINTR; | 1104 | return -EINTR; |
| 1105 | } | 1105 | } |
| 1106 | 1106 | ||
| 1107 | /* On the first time here, set up VRMA or RMA */ | 1107 | atomic_inc(&vcpu->kvm->arch.vcpus_running); |
| 1108 | /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */ | ||
| 1109 | smp_mb(); | ||
| 1110 | |||
| 1111 | /* On the first time here, set up HTAB and VRMA or RMA */ | ||
| 1108 | if (!vcpu->kvm->arch.rma_setup_done) { | 1112 | if (!vcpu->kvm->arch.rma_setup_done) { |
| 1109 | r = kvmppc_hv_setup_rma(vcpu); | 1113 | r = kvmppc_hv_setup_htab_rma(vcpu); |
| 1110 | if (r) | 1114 | if (r) |
| 1111 | return r; | 1115 | goto out; |
| 1112 | } | 1116 | } |
| 1113 | 1117 | ||
| 1114 | flush_fp_to_thread(current); | 1118 | flush_fp_to_thread(current); |
| @@ -1126,6 +1130,9 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) | |||
| 1126 | kvmppc_core_prepare_to_enter(vcpu); | 1130 | kvmppc_core_prepare_to_enter(vcpu); |
| 1127 | } | 1131 | } |
| 1128 | } while (r == RESUME_GUEST); | 1132 | } while (r == RESUME_GUEST); |
| 1133 | |||
| 1134 | out: | ||
| 1135 | atomic_dec(&vcpu->kvm->arch.vcpus_running); | ||
| 1129 | return r; | 1136 | return r; |
| 1130 | } | 1137 | } |
| 1131 | 1138 | ||
| @@ -1341,7 +1348,7 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, | |||
| 1341 | { | 1348 | { |
| 1342 | } | 1349 | } |
| 1343 | 1350 | ||
| 1344 | static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) | 1351 | static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) |
| 1345 | { | 1352 | { |
| 1346 | int err = 0; | 1353 | int err = 0; |
| 1347 | struct kvm *kvm = vcpu->kvm; | 1354 | struct kvm *kvm = vcpu->kvm; |
| @@ -1360,6 +1367,15 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) | |||
| 1360 | if (kvm->arch.rma_setup_done) | 1367 | if (kvm->arch.rma_setup_done) |
| 1361 | goto out; /* another vcpu beat us to it */ | 1368 | goto out; /* another vcpu beat us to it */ |
| 1362 | 1369 | ||
| 1370 | /* Allocate hashed page table (if not done already) and reset it */ | ||
| 1371 | if (!kvm->arch.hpt_virt) { | ||
| 1372 | err = kvmppc_alloc_hpt(kvm, NULL); | ||
| 1373 | if (err) { | ||
| 1374 | pr_err("KVM: Couldn't alloc HPT\n"); | ||
| 1375 | goto out; | ||
| 1376 | } | ||
| 1377 | } | ||
| 1378 | |||
| 1363 | /* Look up the memslot for guest physical address 0 */ | 1379 | /* Look up the memslot for guest physical address 0 */ |
| 1364 | memslot = gfn_to_memslot(kvm, 0); | 1380 | memslot = gfn_to_memslot(kvm, 0); |
| 1365 | 1381 | ||
| @@ -1471,13 +1487,14 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) | |||
| 1471 | 1487 | ||
| 1472 | int kvmppc_core_init_vm(struct kvm *kvm) | 1488 | int kvmppc_core_init_vm(struct kvm *kvm) |
| 1473 | { | 1489 | { |
| 1474 | long r; | 1490 | unsigned long lpcr, lpid; |
| 1475 | unsigned long lpcr; | ||
| 1476 | 1491 | ||
| 1477 | /* Allocate hashed page table */ | 1492 | /* Allocate the guest's logical partition ID */ |
| 1478 | r = kvmppc_alloc_hpt(kvm); | 1493 | |
| 1479 | if (r) | 1494 | lpid = kvmppc_alloc_lpid(); |
| 1480 | return r; | 1495 | if (lpid < 0) |
| 1496 | return -ENOMEM; | ||
| 1497 | kvm->arch.lpid = lpid; | ||
| 1481 | 1498 | ||
| 1482 | INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); | 1499 | INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); |
| 1483 | 1500 | ||
| @@ -1487,7 +1504,6 @@ int kvmppc_core_init_vm(struct kvm *kvm) | |||
| 1487 | 1504 | ||
| 1488 | if (cpu_has_feature(CPU_FTR_ARCH_201)) { | 1505 | if (cpu_has_feature(CPU_FTR_ARCH_201)) { |
| 1489 | /* PPC970; HID4 is effectively the LPCR */ | 1506 | /* PPC970; HID4 is effectively the LPCR */ |
| 1490 | unsigned long lpid = kvm->arch.lpid; | ||
| 1491 | kvm->arch.host_lpid = 0; | 1507 | kvm->arch.host_lpid = 0; |
| 1492 | kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4); | 1508 | kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4); |
| 1493 | lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH)); | 1509 | lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH)); |
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index e1b60f56f2a1..fb4eac290fef 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c | |||
| @@ -25,6 +25,9 @@ static void __init kvm_linear_init_one(ulong size, int count, int type); | |||
| 25 | static struct kvmppc_linear_info *kvm_alloc_linear(int type); | 25 | static struct kvmppc_linear_info *kvm_alloc_linear(int type); |
| 26 | static void kvm_release_linear(struct kvmppc_linear_info *ri); | 26 | static void kvm_release_linear(struct kvmppc_linear_info *ri); |
| 27 | 27 | ||
| 28 | int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER; | ||
| 29 | EXPORT_SYMBOL_GPL(kvm_hpt_order); | ||
| 30 | |||
| 28 | /*************** RMA *************/ | 31 | /*************** RMA *************/ |
| 29 | 32 | ||
| 30 | /* | 33 | /* |
| @@ -209,7 +212,7 @@ static void kvm_release_linear(struct kvmppc_linear_info *ri) | |||
| 209 | void __init kvm_linear_init(void) | 212 | void __init kvm_linear_init(void) |
| 210 | { | 213 | { |
| 211 | /* HPT */ | 214 | /* HPT */ |
| 212 | kvm_linear_init_one(1 << HPT_ORDER, kvm_hpt_count, KVM_LINEAR_HPT); | 215 | kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT); |
| 213 | 216 | ||
| 214 | /* RMA */ | 217 | /* RMA */ |
| 215 | /* Only do this on PPC970 in HV mode */ | 218 | /* Only do this on PPC970 in HV mode */ |
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index cec4daddbf31..5c70d19494f9 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c | |||
| @@ -237,7 +237,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, | |||
| 237 | 237 | ||
| 238 | /* Find and lock the HPTEG slot to use */ | 238 | /* Find and lock the HPTEG slot to use */ |
| 239 | do_insert: | 239 | do_insert: |
| 240 | if (pte_index >= HPT_NPTE) | 240 | if (pte_index >= kvm->arch.hpt_npte) |
| 241 | return H_PARAMETER; | 241 | return H_PARAMETER; |
| 242 | if (likely((flags & H_EXACT) == 0)) { | 242 | if (likely((flags & H_EXACT) == 0)) { |
| 243 | pte_index &= ~7UL; | 243 | pte_index &= ~7UL; |
| @@ -352,7 +352,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, | |||
| 352 | unsigned long v, r, rb; | 352 | unsigned long v, r, rb; |
| 353 | struct revmap_entry *rev; | 353 | struct revmap_entry *rev; |
| 354 | 354 | ||
| 355 | if (pte_index >= HPT_NPTE) | 355 | if (pte_index >= kvm->arch.hpt_npte) |
| 356 | return H_PARAMETER; | 356 | return H_PARAMETER; |
| 357 | hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); | 357 | hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); |
| 358 | while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) | 358 | while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) |
| @@ -419,7 +419,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) | |||
| 419 | i = 4; | 419 | i = 4; |
| 420 | break; | 420 | break; |
| 421 | } | 421 | } |
| 422 | if (req != 1 || flags == 3 || pte_index >= HPT_NPTE) { | 422 | if (req != 1 || flags == 3 || |
| 423 | pte_index >= kvm->arch.hpt_npte) { | ||
| 423 | /* parameter error */ | 424 | /* parameter error */ |
| 424 | args[j] = ((0xa0 | flags) << 56) + pte_index; | 425 | args[j] = ((0xa0 | flags) << 56) + pte_index; |
| 425 | ret = H_PARAMETER; | 426 | ret = H_PARAMETER; |
| @@ -521,7 +522,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, | |||
| 521 | struct revmap_entry *rev; | 522 | struct revmap_entry *rev; |
| 522 | unsigned long v, r, rb, mask, bits; | 523 | unsigned long v, r, rb, mask, bits; |
| 523 | 524 | ||
| 524 | if (pte_index >= HPT_NPTE) | 525 | if (pte_index >= kvm->arch.hpt_npte) |
| 525 | return H_PARAMETER; | 526 | return H_PARAMETER; |
| 526 | 527 | ||
| 527 | hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); | 528 | hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); |
| @@ -583,7 +584,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, | |||
| 583 | int i, n = 1; | 584 | int i, n = 1; |
| 584 | struct revmap_entry *rev = NULL; | 585 | struct revmap_entry *rev = NULL; |
| 585 | 586 | ||
| 586 | if (pte_index >= HPT_NPTE) | 587 | if (pte_index >= kvm->arch.hpt_npte) |
| 587 | return H_PARAMETER; | 588 | return H_PARAMETER; |
| 588 | if (flags & H_READ_4) { | 589 | if (flags & H_READ_4) { |
| 589 | pte_index &= ~3; | 590 | pte_index &= ~3; |
| @@ -678,7 +679,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, | |||
| 678 | somask = (1UL << 28) - 1; | 679 | somask = (1UL << 28) - 1; |
| 679 | vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; | 680 | vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; |
| 680 | } | 681 | } |
| 681 | hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK; | 682 | hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask; |
| 682 | avpn = slb_v & ~(somask >> 16); /* also includes B */ | 683 | avpn = slb_v & ~(somask >> 16); /* also includes B */ |
| 683 | avpn |= (eaddr & somask) >> 16; | 684 | avpn |= (eaddr & somask) >> 16; |
| 684 | 685 | ||
| @@ -723,7 +724,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, | |||
| 723 | if (val & HPTE_V_SECONDARY) | 724 | if (val & HPTE_V_SECONDARY) |
| 724 | break; | 725 | break; |
| 725 | val |= HPTE_V_SECONDARY; | 726 | val |= HPTE_V_SECONDARY; |
| 726 | hash = hash ^ HPT_HASH_MASK; | 727 | hash = hash ^ kvm->arch.hpt_mask; |
| 727 | } | 728 | } |
| 728 | return -1; | 729 | return -1; |
| 729 | } | 730 | } |
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 72f13f4a06e0..d25a097c852b 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c | |||
| @@ -612,6 +612,12 @@ static void kvmppc_fill_pt_regs(struct pt_regs *regs) | |||
| 612 | regs->link = lr; | 612 | regs->link = lr; |
| 613 | } | 613 | } |
| 614 | 614 | ||
| 615 | /* | ||
| 616 | * For interrupts needed to be handled by host interrupt handlers, | ||
| 617 | * corresponding host handler are called from here in similar way | ||
| 618 | * (but not exact) as they are called from low level handler | ||
| 619 | * (such as from arch/powerpc/kernel/head_fsl_booke.S). | ||
| 620 | */ | ||
| 615 | static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, | 621 | static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, |
| 616 | unsigned int exit_nr) | 622 | unsigned int exit_nr) |
| 617 | { | 623 | { |
| @@ -639,6 +645,17 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, | |||
| 639 | kvmppc_fill_pt_regs(®s); | 645 | kvmppc_fill_pt_regs(®s); |
| 640 | performance_monitor_exception(®s); | 646 | performance_monitor_exception(®s); |
| 641 | break; | 647 | break; |
| 648 | case BOOKE_INTERRUPT_WATCHDOG: | ||
| 649 | kvmppc_fill_pt_regs(®s); | ||
| 650 | #ifdef CONFIG_BOOKE_WDT | ||
| 651 | WatchdogException(®s); | ||
| 652 | #else | ||
| 653 | unknown_exception(®s); | ||
| 654 | #endif | ||
| 655 | break; | ||
| 656 | case BOOKE_INTERRUPT_CRITICAL: | ||
| 657 | unknown_exception(®s); | ||
| 658 | break; | ||
| 642 | } | 659 | } |
| 643 | } | 660 | } |
| 644 | 661 | ||
| @@ -683,6 +700,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, | |||
| 683 | r = RESUME_GUEST; | 700 | r = RESUME_GUEST; |
| 684 | break; | 701 | break; |
| 685 | 702 | ||
| 703 | case BOOKE_INTERRUPT_WATCHDOG: | ||
| 704 | r = RESUME_GUEST; | ||
| 705 | break; | ||
| 706 | |||
| 686 | case BOOKE_INTERRUPT_DOORBELL: | 707 | case BOOKE_INTERRUPT_DOORBELL: |
| 687 | kvmppc_account_exit(vcpu, DBELL_EXITS); | 708 | kvmppc_account_exit(vcpu, DBELL_EXITS); |
| 688 | r = RESUME_GUEST; | 709 | r = RESUME_GUEST; |
| @@ -1267,6 +1288,11 @@ void kvmppc_decrementer_func(unsigned long data) | |||
| 1267 | { | 1288 | { |
| 1268 | struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; | 1289 | struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; |
| 1269 | 1290 | ||
| 1291 | if (vcpu->arch.tcr & TCR_ARE) { | ||
| 1292 | vcpu->arch.dec = vcpu->arch.decar; | ||
| 1293 | kvmppc_emulate_dec(vcpu); | ||
| 1294 | } | ||
| 1295 | |||
| 1270 | kvmppc_set_tsr_bits(vcpu, TSR_DIS); | 1296 | kvmppc_set_tsr_bits(vcpu, TSR_DIS); |
| 1271 | } | 1297 | } |
| 1272 | 1298 | ||
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 6c76397f2af4..12834bb608ab 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c | |||
| @@ -24,6 +24,7 @@ | |||
| 24 | #include "booke.h" | 24 | #include "booke.h" |
| 25 | 25 | ||
| 26 | #define OP_19_XOP_RFI 50 | 26 | #define OP_19_XOP_RFI 50 |
| 27 | #define OP_19_XOP_RFCI 51 | ||
| 27 | 28 | ||
| 28 | #define OP_31_XOP_MFMSR 83 | 29 | #define OP_31_XOP_MFMSR 83 |
| 29 | #define OP_31_XOP_WRTEE 131 | 30 | #define OP_31_XOP_WRTEE 131 |
| @@ -36,6 +37,12 @@ static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu) | |||
| 36 | kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1); | 37 | kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1); |
| 37 | } | 38 | } |
| 38 | 39 | ||
| 40 | static void kvmppc_emul_rfci(struct kvm_vcpu *vcpu) | ||
| 41 | { | ||
| 42 | vcpu->arch.pc = vcpu->arch.csrr0; | ||
| 43 | kvmppc_set_msr(vcpu, vcpu->arch.csrr1); | ||
| 44 | } | ||
| 45 | |||
| 39 | int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, | 46 | int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, |
| 40 | unsigned int inst, int *advance) | 47 | unsigned int inst, int *advance) |
| 41 | { | 48 | { |
| @@ -52,6 +59,12 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, | |||
| 52 | *advance = 0; | 59 | *advance = 0; |
| 53 | break; | 60 | break; |
| 54 | 61 | ||
| 62 | case OP_19_XOP_RFCI: | ||
| 63 | kvmppc_emul_rfci(vcpu); | ||
| 64 | kvmppc_set_exit_type(vcpu, EMULATED_RFCI_EXITS); | ||
| 65 | *advance = 0; | ||
| 66 | break; | ||
| 67 | |||
| 55 | default: | 68 | default: |
| 56 | emulated = EMULATE_FAIL; | 69 | emulated = EMULATE_FAIL; |
| 57 | break; | 70 | break; |
| @@ -113,6 +126,12 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) | |||
| 113 | case SPRN_ESR: | 126 | case SPRN_ESR: |
| 114 | vcpu->arch.shared->esr = spr_val; | 127 | vcpu->arch.shared->esr = spr_val; |
| 115 | break; | 128 | break; |
| 129 | case SPRN_CSRR0: | ||
| 130 | vcpu->arch.csrr0 = spr_val; | ||
| 131 | break; | ||
| 132 | case SPRN_CSRR1: | ||
| 133 | vcpu->arch.csrr1 = spr_val; | ||
| 134 | break; | ||
| 116 | case SPRN_DBCR0: | 135 | case SPRN_DBCR0: |
| 117 | vcpu->arch.dbcr0 = spr_val; | 136 | vcpu->arch.dbcr0 = spr_val; |
| 118 | break; | 137 | break; |
| @@ -129,6 +148,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) | |||
| 129 | kvmppc_set_tcr(vcpu, spr_val); | 148 | kvmppc_set_tcr(vcpu, spr_val); |
| 130 | break; | 149 | break; |
| 131 | 150 | ||
| 151 | case SPRN_DECAR: | ||
| 152 | vcpu->arch.decar = spr_val; | ||
| 153 | break; | ||
| 132 | /* | 154 | /* |
| 133 | * Note: SPRG4-7 are user-readable. | 155 | * Note: SPRG4-7 are user-readable. |
| 134 | * These values are loaded into the real SPRGs when resuming the | 156 | * These values are loaded into the real SPRGs when resuming the |
| @@ -229,6 +251,12 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) | |||
| 229 | case SPRN_ESR: | 251 | case SPRN_ESR: |
| 230 | *spr_val = vcpu->arch.shared->esr; | 252 | *spr_val = vcpu->arch.shared->esr; |
| 231 | break; | 253 | break; |
| 254 | case SPRN_CSRR0: | ||
| 255 | *spr_val = vcpu->arch.csrr0; | ||
| 256 | break; | ||
| 257 | case SPRN_CSRR1: | ||
| 258 | *spr_val = vcpu->arch.csrr1; | ||
| 259 | break; | ||
| 232 | case SPRN_DBCR0: | 260 | case SPRN_DBCR0: |
| 233 | *spr_val = vcpu->arch.dbcr0; | 261 | *spr_val = vcpu->arch.dbcr0; |
| 234 | break; | 262 | break; |
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index 8fd4b2a0911b..bb46b32f9813 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S | |||
| @@ -52,16 +52,21 @@ | |||
| 52 | (1<<BOOKE_INTERRUPT_PROGRAM) | \ | 52 | (1<<BOOKE_INTERRUPT_PROGRAM) | \ |
| 53 | (1<<BOOKE_INTERRUPT_DTLB_MISS)) | 53 | (1<<BOOKE_INTERRUPT_DTLB_MISS)) |
| 54 | 54 | ||
| 55 | .macro KVM_HANDLER ivor_nr | 55 | .macro KVM_HANDLER ivor_nr scratch srr0 |
| 56 | _GLOBAL(kvmppc_handler_\ivor_nr) | 56 | _GLOBAL(kvmppc_handler_\ivor_nr) |
| 57 | /* Get pointer to vcpu and record exit number. */ | 57 | /* Get pointer to vcpu and record exit number. */ |
| 58 | mtspr SPRN_SPRG_WSCRATCH0, r4 | 58 | mtspr \scratch , r4 |
| 59 | mfspr r4, SPRN_SPRG_RVCPU | 59 | mfspr r4, SPRN_SPRG_RVCPU |
| 60 | stw r3, VCPU_GPR(R3)(r4) | ||
| 60 | stw r5, VCPU_GPR(R5)(r4) | 61 | stw r5, VCPU_GPR(R5)(r4) |
| 61 | stw r6, VCPU_GPR(R6)(r4) | 62 | stw r6, VCPU_GPR(R6)(r4) |
| 63 | mfspr r3, \scratch | ||
| 62 | mfctr r5 | 64 | mfctr r5 |
| 63 | lis r6, kvmppc_resume_host@h | 65 | stw r3, VCPU_GPR(R4)(r4) |
| 64 | stw r5, VCPU_CTR(r4) | 66 | stw r5, VCPU_CTR(r4) |
| 67 | mfspr r3, \srr0 | ||
| 68 | lis r6, kvmppc_resume_host@h | ||
| 69 | stw r3, VCPU_PC(r4) | ||
| 65 | li r5, \ivor_nr | 70 | li r5, \ivor_nr |
| 66 | ori r6, r6, kvmppc_resume_host@l | 71 | ori r6, r6, kvmppc_resume_host@l |
| 67 | mtctr r6 | 72 | mtctr r6 |
| @@ -69,37 +74,35 @@ _GLOBAL(kvmppc_handler_\ivor_nr) | |||
| 69 | .endm | 74 | .endm |
| 70 | 75 | ||
| 71 | _GLOBAL(kvmppc_handlers_start) | 76 | _GLOBAL(kvmppc_handlers_start) |
| 72 | KVM_HANDLER BOOKE_INTERRUPT_CRITICAL | 77 | KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 |
| 73 | KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK | 78 | KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0 |
| 74 | KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE | 79 | KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 75 | KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE | 80 | KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 76 | KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL | 81 | KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 77 | KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT | 82 | KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 78 | KVM_HANDLER BOOKE_INTERRUPT_PROGRAM | 83 | KVM_HANDLER BOOKE_INTERRUPT_PROGRAM SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 79 | KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL | 84 | KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 80 | KVM_HANDLER BOOKE_INTERRUPT_SYSCALL | 85 | KVM_HANDLER BOOKE_INTERRUPT_SYSCALL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 81 | KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL | 86 | KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 82 | KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER | 87 | KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 83 | KVM_HANDLER BOOKE_INTERRUPT_FIT | 88 | KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 84 | KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG | 89 | KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 |
| 85 | KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS | 90 | KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 86 | KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS | 91 | KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 87 | KVM_HANDLER BOOKE_INTERRUPT_DEBUG | 92 | KVM_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 |
| 88 | KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL | 93 | KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 89 | KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA | 94 | KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 90 | KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND | 95 | KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0 |
| 91 | 96 | ||
| 92 | _GLOBAL(kvmppc_handler_len) | 97 | _GLOBAL(kvmppc_handler_len) |
| 93 | .long kvmppc_handler_1 - kvmppc_handler_0 | 98 | .long kvmppc_handler_1 - kvmppc_handler_0 |
| 94 | 99 | ||
| 95 | |||
| 96 | /* Registers: | 100 | /* Registers: |
| 97 | * SPRG_SCRATCH0: guest r4 | 101 | * SPRG_SCRATCH0: guest r4 |
| 98 | * r4: vcpu pointer | 102 | * r4: vcpu pointer |
| 99 | * r5: KVM exit number | 103 | * r5: KVM exit number |
| 100 | */ | 104 | */ |
| 101 | _GLOBAL(kvmppc_resume_host) | 105 | _GLOBAL(kvmppc_resume_host) |
| 102 | stw r3, VCPU_GPR(R3)(r4) | ||
| 103 | mfcr r3 | 106 | mfcr r3 |
| 104 | stw r3, VCPU_CR(r4) | 107 | stw r3, VCPU_CR(r4) |
| 105 | stw r7, VCPU_GPR(R7)(r4) | 108 | stw r7, VCPU_GPR(R7)(r4) |
| @@ -180,10 +183,6 @@ _GLOBAL(kvmppc_resume_host) | |||
| 180 | stw r3, VCPU_LR(r4) | 183 | stw r3, VCPU_LR(r4) |
| 181 | mfxer r3 | 184 | mfxer r3 |
| 182 | stw r3, VCPU_XER(r4) | 185 | stw r3, VCPU_XER(r4) |
| 183 | mfspr r3, SPRN_SPRG_RSCRATCH0 | ||
| 184 | stw r3, VCPU_GPR(R4)(r4) | ||
| 185 | mfspr r3, SPRN_SRR0 | ||
| 186 | stw r3, VCPU_PC(r4) | ||
| 187 | 186 | ||
| 188 | /* Restore host stack pointer and PID before IVPR, since the host | 187 | /* Restore host stack pointer and PID before IVPR, since the host |
| 189 | * exception handlers use them. */ | 188 | * exception handlers use them. */ |
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S index 1685dc43bcf2..d28c2d43ac1b 100644 --- a/arch/powerpc/kvm/bookehv_interrupts.S +++ b/arch/powerpc/kvm/bookehv_interrupts.S | |||
| @@ -262,7 +262,7 @@ kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \ | |||
| 262 | kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \ | 262 | kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \ |
| 263 | SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0 | 263 | SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0 |
| 264 | kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \ | 264 | kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \ |
| 265 | SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR) | 265 | SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) |
| 266 | kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR | 266 | kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR |
| 267 | kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0 | 267 | kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0 |
| 268 | kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \ | 268 | kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \ |
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 8b99e076dc81..e04b0ef55ce0 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c | |||
| @@ -269,6 +269,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) | |||
| 269 | *spr_val = vcpu->arch.shared->mas7_3 >> 32; | 269 | *spr_val = vcpu->arch.shared->mas7_3 >> 32; |
| 270 | break; | 270 | break; |
| 271 | #endif | 271 | #endif |
| 272 | case SPRN_DECAR: | ||
| 273 | *spr_val = vcpu->arch.decar; | ||
| 274 | break; | ||
| 272 | case SPRN_TLB0CFG: | 275 | case SPRN_TLB0CFG: |
| 273 | *spr_val = vcpu->arch.tlbcfg[0]; | 276 | *spr_val = vcpu->arch.tlbcfg[0]; |
| 274 | break; | 277 | break; |
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index fe6c1de6b701..1f89d26e65fb 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) 2010 Freescale Semiconductor, Inc. All rights reserved. | 2 | * Copyright (C) 2010,2012 Freescale Semiconductor, Inc. All rights reserved. |
| 3 | * | 3 | * |
| 4 | * Author: Varun Sethi, <varun.sethi@freescale.com> | 4 | * Author: Varun Sethi, <varun.sethi@freescale.com> |
| 5 | * | 5 | * |
| @@ -57,7 +57,8 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, | |||
| 57 | struct kvm_book3e_206_tlb_entry *gtlbe) | 57 | struct kvm_book3e_206_tlb_entry *gtlbe) |
| 58 | { | 58 | { |
| 59 | unsigned int tid, ts; | 59 | unsigned int tid, ts; |
| 60 | u32 val, eaddr, lpid; | 60 | gva_t eaddr; |
| 61 | u32 val, lpid; | ||
| 61 | unsigned long flags; | 62 | unsigned long flags; |
| 62 | 63 | ||
| 63 | ts = get_tlb_ts(gtlbe); | 64 | ts = get_tlb_ts(gtlbe); |
| @@ -183,6 +184,9 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) | |||
| 183 | 184 | ||
| 184 | vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \ | 185 | vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \ |
| 185 | SPRN_EPCR_DUVD; | 186 | SPRN_EPCR_DUVD; |
| 187 | #ifdef CONFIG_64BIT | ||
| 188 | vcpu->arch.shadow_epcr |= SPRN_EPCR_ICM; | ||
| 189 | #endif | ||
| 186 | vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_DEP | MSRP_PMMP; | 190 | vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_DEP | MSRP_PMMP; |
| 187 | vcpu->arch.eplc = EPC_EGS | (vcpu->kvm->arch.lpid << EPC_ELPID_SHIFT); | 191 | vcpu->arch.eplc = EPC_EGS | (vcpu->kvm->arch.lpid << EPC_ELPID_SHIFT); |
| 188 | vcpu->arch.epsc = vcpu->arch.eplc; | 192 | vcpu->arch.epsc = vcpu->arch.eplc; |
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index f90e86dea7a2..ee04abaefe23 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c | |||
| @@ -59,11 +59,13 @@ | |||
| 59 | #define OP_31_XOP_STHBRX 918 | 59 | #define OP_31_XOP_STHBRX 918 |
| 60 | 60 | ||
| 61 | #define OP_LWZ 32 | 61 | #define OP_LWZ 32 |
| 62 | #define OP_LD 58 | ||
| 62 | #define OP_LWZU 33 | 63 | #define OP_LWZU 33 |
| 63 | #define OP_LBZ 34 | 64 | #define OP_LBZ 34 |
| 64 | #define OP_LBZU 35 | 65 | #define OP_LBZU 35 |
| 65 | #define OP_STW 36 | 66 | #define OP_STW 36 |
| 66 | #define OP_STWU 37 | 67 | #define OP_STWU 37 |
| 68 | #define OP_STD 62 | ||
| 67 | #define OP_STB 38 | 69 | #define OP_STB 38 |
| 68 | #define OP_STBU 39 | 70 | #define OP_STBU 39 |
| 69 | #define OP_LHZ 40 | 71 | #define OP_LHZ 40 |
| @@ -392,6 +394,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) | |||
| 392 | emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); | 394 | emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); |
| 393 | break; | 395 | break; |
| 394 | 396 | ||
| 397 | /* TBD: Add support for other 64 bit load variants like ldu, ldux, ldx etc. */ | ||
| 398 | case OP_LD: | ||
| 399 | rt = get_rt(inst); | ||
| 400 | emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); | ||
| 401 | break; | ||
| 402 | |||
| 395 | case OP_LWZU: | 403 | case OP_LWZU: |
| 396 | emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); | 404 | emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); |
| 397 | kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); | 405 | kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); |
| @@ -412,6 +420,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) | |||
| 412 | 4, 1); | 420 | 4, 1); |
| 413 | break; | 421 | break; |
| 414 | 422 | ||
| 423 | /* TBD: Add support for other 64 bit store variants like stdu, stdux, stdx etc. */ | ||
| 424 | case OP_STD: | ||
| 425 | rs = get_rs(inst); | ||
| 426 | emulated = kvmppc_handle_store(run, vcpu, | ||
| 427 | kvmppc_get_gpr(vcpu, rs), | ||
| 428 | 8, 1); | ||
| 429 | break; | ||
| 430 | |||
| 415 | case OP_STWU: | 431 | case OP_STWU: |
| 416 | emulated = kvmppc_handle_store(run, vcpu, | 432 | emulated = kvmppc_handle_store(run, vcpu, |
| 417 | kvmppc_get_gpr(vcpu, rs), | 433 | kvmppc_get_gpr(vcpu, rs), |
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1493c8de947b..87f4dc886076 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c | |||
| @@ -246,6 +246,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
| 246 | #endif | 246 | #endif |
| 247 | #ifdef CONFIG_PPC_BOOK3S_64 | 247 | #ifdef CONFIG_PPC_BOOK3S_64 |
| 248 | case KVM_CAP_SPAPR_TCE: | 248 | case KVM_CAP_SPAPR_TCE: |
| 249 | case KVM_CAP_PPC_ALLOC_HTAB: | ||
| 249 | r = 1; | 250 | r = 1; |
| 250 | break; | 251 | break; |
| 251 | #endif /* CONFIG_PPC_BOOK3S_64 */ | 252 | #endif /* CONFIG_PPC_BOOK3S_64 */ |
| @@ -802,6 +803,23 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
| 802 | r = -EFAULT; | 803 | r = -EFAULT; |
| 803 | break; | 804 | break; |
| 804 | } | 805 | } |
| 806 | |||
| 807 | case KVM_PPC_ALLOCATE_HTAB: { | ||
| 808 | struct kvm *kvm = filp->private_data; | ||
| 809 | u32 htab_order; | ||
| 810 | |||
| 811 | r = -EFAULT; | ||
| 812 | if (get_user(htab_order, (u32 __user *)argp)) | ||
| 813 | break; | ||
| 814 | r = kvmppc_alloc_reset_hpt(kvm, &htab_order); | ||
| 815 | if (r) | ||
| 816 | break; | ||
| 817 | r = -EFAULT; | ||
| 818 | if (put_user(htab_order, (u32 __user *)argp)) | ||
| 819 | break; | ||
| 820 | r = 0; | ||
| 821 | break; | ||
| 822 | } | ||
| 805 | #endif /* CONFIG_KVM_BOOK3S_64_HV */ | 823 | #endif /* CONFIG_KVM_BOOK3S_64_HV */ |
| 806 | 824 | ||
| 807 | #ifdef CONFIG_PPC_BOOK3S_64 | 825 | #ifdef CONFIG_PPC_BOOK3S_64 |
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index a35ca44ade66..e7a896acd982 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig | |||
| @@ -25,6 +25,7 @@ source "arch/powerpc/platforms/wsp/Kconfig" | |||
| 25 | config KVM_GUEST | 25 | config KVM_GUEST |
| 26 | bool "KVM Guest support" | 26 | bool "KVM Guest support" |
| 27 | default n | 27 | default n |
| 28 | select EPAPR_PARAVIRT | ||
| 28 | ---help--- | 29 | ---help--- |
| 29 | This option enables various optimizations for running under the KVM | 30 | This option enables various optimizations for running under the KVM |
| 30 | hypervisor. Overhead for the kernel when not running inside KVM should | 31 | hypervisor. Overhead for the kernel when not running inside KVM should |
| @@ -32,6 +33,14 @@ config KVM_GUEST | |||
| 32 | 33 | ||
| 33 | In case of doubt, say Y | 34 | In case of doubt, say Y |
| 34 | 35 | ||
| 36 | config EPAPR_PARAVIRT | ||
| 37 | bool "ePAPR para-virtualization support" | ||
| 38 | default n | ||
| 39 | help | ||
| 40 | Enables ePAPR para-virtualization support for guests. | ||
| 41 | |||
| 42 | In case of doubt, say Y | ||
| 43 | |||
| 35 | config PPC_NATIVE | 44 | config PPC_NATIVE |
| 36 | bool | 45 | bool |
| 37 | depends on 6xx || PPC64 | 46 | depends on 6xx || PPC64 |
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 8685d1fb8b75..e62a555557ee 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h | |||
| @@ -53,5 +53,7 @@ int sclp_chp_configure(struct chp_id chpid); | |||
| 53 | int sclp_chp_deconfigure(struct chp_id chpid); | 53 | int sclp_chp_deconfigure(struct chp_id chpid); |
| 54 | int sclp_chp_read_info(struct sclp_chp_info *info); | 54 | int sclp_chp_read_info(struct sclp_chp_info *info); |
| 55 | void sclp_get_ipl_info(struct sclp_ipl_info *info); | 55 | void sclp_get_ipl_info(struct sclp_ipl_info *info); |
| 56 | bool sclp_has_linemode(void); | ||
| 57 | bool sclp_has_vt220(void); | ||
| 56 | 58 | ||
| 57 | #endif /* _ASM_S390_SCLP_H */ | 59 | #endif /* _ASM_S390_SCLP_H */ |
diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h index 7306270b5b84..5a87d16d3e7c 100644 --- a/arch/s390/include/asm/sigp.h +++ b/arch/s390/include/asm/sigp.h | |||
| @@ -24,6 +24,7 @@ | |||
| 24 | 24 | ||
| 25 | #define SIGP_STATUS_CHECK_STOP 0x00000010UL | 25 | #define SIGP_STATUS_CHECK_STOP 0x00000010UL |
| 26 | #define SIGP_STATUS_STOPPED 0x00000040UL | 26 | #define SIGP_STATUS_STOPPED 0x00000040UL |
| 27 | #define SIGP_STATUS_EXT_CALL_PENDING 0x00000080UL | ||
| 27 | #define SIGP_STATUS_INVALID_PARAMETER 0x00000100UL | 28 | #define SIGP_STATUS_INVALID_PARAMETER 0x00000100UL |
| 28 | #define SIGP_STATUS_INCORRECT_STATE 0x00000200UL | 29 | #define SIGP_STATUS_INCORRECT_STATE 0x00000200UL |
| 29 | #define SIGP_STATUS_NOT_RUNNING 0x00000400UL | 30 | #define SIGP_STATUS_NOT_RUNNING 0x00000400UL |
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 34d75b50526c..743c0f32fe3b 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c | |||
| @@ -61,6 +61,7 @@ | |||
| 61 | #include <asm/kvm_virtio.h> | 61 | #include <asm/kvm_virtio.h> |
| 62 | #include <asm/diag.h> | 62 | #include <asm/diag.h> |
| 63 | #include <asm/os_info.h> | 63 | #include <asm/os_info.h> |
| 64 | #include <asm/sclp.h> | ||
| 64 | #include "entry.h" | 65 | #include "entry.h" |
| 65 | 66 | ||
| 66 | long psw_kernel_bits = PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_PRIMARY | | 67 | long psw_kernel_bits = PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_PRIMARY | |
| @@ -136,9 +137,14 @@ __setup("condev=", condev_setup); | |||
| 136 | 137 | ||
| 137 | static void __init set_preferred_console(void) | 138 | static void __init set_preferred_console(void) |
| 138 | { | 139 | { |
| 139 | if (MACHINE_IS_KVM) | 140 | if (MACHINE_IS_KVM) { |
| 140 | add_preferred_console("hvc", 0, NULL); | 141 | if (sclp_has_vt220()) |
| 141 | else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) | 142 | add_preferred_console("ttyS", 1, NULL); |
| 143 | else if (sclp_has_linemode()) | ||
| 144 | add_preferred_console("ttyS", 0, NULL); | ||
| 145 | else | ||
| 146 | add_preferred_console("hvc", 0, NULL); | ||
| 147 | } else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) | ||
| 142 | add_preferred_console("ttyS", 0, NULL); | 148 | add_preferred_console("ttyS", 0, NULL); |
| 143 | else if (CONSOLE_IS_3270) | 149 | else if (CONSOLE_IS_3270) |
| 144 | add_preferred_console("tty3270", 0, NULL); | 150 | add_preferred_console("tty3270", 0, NULL); |
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index c552d1f4103f..d470ccbfabae 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c | |||
| @@ -347,6 +347,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) | |||
| 347 | vcpu->arch.guest_fpregs.fpc = 0; | 347 | vcpu->arch.guest_fpregs.fpc = 0; |
| 348 | asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc)); | 348 | asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc)); |
| 349 | vcpu->arch.sie_block->gbea = 1; | 349 | vcpu->arch.sie_block->gbea = 1; |
| 350 | atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); | ||
| 350 | } | 351 | } |
| 351 | 352 | ||
| 352 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | 353 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) |
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 1ab2ce1611c5..56f80e1f98f7 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c | |||
| @@ -26,19 +26,23 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, | |||
| 26 | int rc; | 26 | int rc; |
| 27 | 27 | ||
| 28 | if (cpu_addr >= KVM_MAX_VCPUS) | 28 | if (cpu_addr >= KVM_MAX_VCPUS) |
| 29 | return 3; /* not operational */ | 29 | return SIGP_CC_NOT_OPERATIONAL; |
| 30 | 30 | ||
| 31 | spin_lock(&fi->lock); | 31 | spin_lock(&fi->lock); |
| 32 | if (fi->local_int[cpu_addr] == NULL) | 32 | if (fi->local_int[cpu_addr] == NULL) |
| 33 | rc = 3; /* not operational */ | 33 | rc = SIGP_CC_NOT_OPERATIONAL; |
| 34 | else if (!(atomic_read(fi->local_int[cpu_addr]->cpuflags) | 34 | else if (!(atomic_read(fi->local_int[cpu_addr]->cpuflags) |
| 35 | & CPUSTAT_STOPPED)) { | 35 | & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED))) |
| 36 | *reg &= 0xffffffff00000000UL; | 36 | rc = SIGP_CC_ORDER_CODE_ACCEPTED; |
| 37 | rc = 1; /* status stored */ | 37 | else { |
| 38 | } else { | ||
| 39 | *reg &= 0xffffffff00000000UL; | 38 | *reg &= 0xffffffff00000000UL; |
| 40 | *reg |= SIGP_STATUS_STOPPED; | 39 | if (atomic_read(fi->local_int[cpu_addr]->cpuflags) |
| 41 | rc = 1; /* status stored */ | 40 | & CPUSTAT_ECALL_PEND) |
| 41 | *reg |= SIGP_STATUS_EXT_CALL_PENDING; | ||
| 42 | if (atomic_read(fi->local_int[cpu_addr]->cpuflags) | ||
| 43 | & CPUSTAT_STOPPED) | ||
| 44 | *reg |= SIGP_STATUS_STOPPED; | ||
| 45 | rc = SIGP_CC_STATUS_STORED; | ||
| 42 | } | 46 | } |
| 43 | spin_unlock(&fi->lock); | 47 | spin_unlock(&fi->lock); |
| 44 | 48 | ||
| @@ -54,7 +58,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) | |||
| 54 | int rc; | 58 | int rc; |
| 55 | 59 | ||
| 56 | if (cpu_addr >= KVM_MAX_VCPUS) | 60 | if (cpu_addr >= KVM_MAX_VCPUS) |
| 57 | return 3; /* not operational */ | 61 | return SIGP_CC_NOT_OPERATIONAL; |
| 58 | 62 | ||
| 59 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); | 63 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); |
| 60 | if (!inti) | 64 | if (!inti) |
| @@ -66,7 +70,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) | |||
| 66 | spin_lock(&fi->lock); | 70 | spin_lock(&fi->lock); |
| 67 | li = fi->local_int[cpu_addr]; | 71 | li = fi->local_int[cpu_addr]; |
| 68 | if (li == NULL) { | 72 | if (li == NULL) { |
| 69 | rc = 3; /* not operational */ | 73 | rc = SIGP_CC_NOT_OPERATIONAL; |
| 70 | kfree(inti); | 74 | kfree(inti); |
| 71 | goto unlock; | 75 | goto unlock; |
| 72 | } | 76 | } |
| @@ -77,7 +81,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) | |||
| 77 | if (waitqueue_active(&li->wq)) | 81 | if (waitqueue_active(&li->wq)) |
| 78 | wake_up_interruptible(&li->wq); | 82 | wake_up_interruptible(&li->wq); |
| 79 | spin_unlock_bh(&li->lock); | 83 | spin_unlock_bh(&li->lock); |
| 80 | rc = 0; /* order accepted */ | 84 | rc = SIGP_CC_ORDER_CODE_ACCEPTED; |
| 81 | VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); | 85 | VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); |
| 82 | unlock: | 86 | unlock: |
| 83 | spin_unlock(&fi->lock); | 87 | spin_unlock(&fi->lock); |
| @@ -92,7 +96,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) | |||
| 92 | int rc; | 96 | int rc; |
| 93 | 97 | ||
| 94 | if (cpu_addr >= KVM_MAX_VCPUS) | 98 | if (cpu_addr >= KVM_MAX_VCPUS) |
| 95 | return 3; /* not operational */ | 99 | return SIGP_CC_NOT_OPERATIONAL; |
| 96 | 100 | ||
| 97 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); | 101 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); |
| 98 | if (!inti) | 102 | if (!inti) |
| @@ -104,7 +108,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) | |||
| 104 | spin_lock(&fi->lock); | 108 | spin_lock(&fi->lock); |
| 105 | li = fi->local_int[cpu_addr]; | 109 | li = fi->local_int[cpu_addr]; |
| 106 | if (li == NULL) { | 110 | if (li == NULL) { |
| 107 | rc = 3; /* not operational */ | 111 | rc = SIGP_CC_NOT_OPERATIONAL; |
| 108 | kfree(inti); | 112 | kfree(inti); |
| 109 | goto unlock; | 113 | goto unlock; |
| 110 | } | 114 | } |
| @@ -115,7 +119,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) | |||
| 115 | if (waitqueue_active(&li->wq)) | 119 | if (waitqueue_active(&li->wq)) |
| 116 | wake_up_interruptible(&li->wq); | 120 | wake_up_interruptible(&li->wq); |
| 117 | spin_unlock_bh(&li->lock); | 121 | spin_unlock_bh(&li->lock); |
| 118 | rc = 0; /* order accepted */ | 122 | rc = SIGP_CC_ORDER_CODE_ACCEPTED; |
| 119 | VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); | 123 | VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); |
| 120 | unlock: | 124 | unlock: |
| 121 | spin_unlock(&fi->lock); | 125 | spin_unlock(&fi->lock); |
| @@ -143,7 +147,7 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action) | |||
| 143 | out: | 147 | out: |
| 144 | spin_unlock_bh(&li->lock); | 148 | spin_unlock_bh(&li->lock); |
| 145 | 149 | ||
| 146 | return 0; /* order accepted */ | 150 | return SIGP_CC_ORDER_CODE_ACCEPTED; |
| 147 | } | 151 | } |
| 148 | 152 | ||
| 149 | static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) | 153 | static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) |
| @@ -153,12 +157,12 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) | |||
| 153 | int rc; | 157 | int rc; |
| 154 | 158 | ||
| 155 | if (cpu_addr >= KVM_MAX_VCPUS) | 159 | if (cpu_addr >= KVM_MAX_VCPUS) |
| 156 | return 3; /* not operational */ | 160 | return SIGP_CC_NOT_OPERATIONAL; |
| 157 | 161 | ||
| 158 | spin_lock(&fi->lock); | 162 | spin_lock(&fi->lock); |
| 159 | li = fi->local_int[cpu_addr]; | 163 | li = fi->local_int[cpu_addr]; |
| 160 | if (li == NULL) { | 164 | if (li == NULL) { |
| 161 | rc = 3; /* not operational */ | 165 | rc = SIGP_CC_NOT_OPERATIONAL; |
| 162 | goto unlock; | 166 | goto unlock; |
| 163 | } | 167 | } |
| 164 | 168 | ||
| @@ -182,11 +186,11 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) | |||
| 182 | 186 | ||
| 183 | switch (parameter & 0xff) { | 187 | switch (parameter & 0xff) { |
| 184 | case 0: | 188 | case 0: |
| 185 | rc = 3; /* not operational */ | 189 | rc = SIGP_CC_NOT_OPERATIONAL; |
| 186 | break; | 190 | break; |
| 187 | case 1: | 191 | case 1: |
| 188 | case 2: | 192 | case 2: |
| 189 | rc = 0; /* order accepted */ | 193 | rc = SIGP_CC_ORDER_CODE_ACCEPTED; |
| 190 | break; | 194 | break; |
| 191 | default: | 195 | default: |
| 192 | rc = -EOPNOTSUPP; | 196 | rc = -EOPNOTSUPP; |
| @@ -207,21 +211,23 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, | |||
| 207 | address = address & 0x7fffe000u; | 211 | address = address & 0x7fffe000u; |
| 208 | if (copy_from_guest_absolute(vcpu, &tmp, address, 1) || | 212 | if (copy_from_guest_absolute(vcpu, &tmp, address, 1) || |
| 209 | copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)) { | 213 | copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)) { |
| 214 | *reg &= 0xffffffff00000000UL; | ||
| 210 | *reg |= SIGP_STATUS_INVALID_PARAMETER; | 215 | *reg |= SIGP_STATUS_INVALID_PARAMETER; |
| 211 | return 1; /* invalid parameter */ | 216 | return SIGP_CC_STATUS_STORED; |
| 212 | } | 217 | } |
| 213 | 218 | ||
| 214 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); | 219 | inti = kzalloc(sizeof(*inti), GFP_KERNEL); |
| 215 | if (!inti) | 220 | if (!inti) |
| 216 | return 2; /* busy */ | 221 | return SIGP_CC_BUSY; |
| 217 | 222 | ||
| 218 | spin_lock(&fi->lock); | 223 | spin_lock(&fi->lock); |
| 219 | if (cpu_addr < KVM_MAX_VCPUS) | 224 | if (cpu_addr < KVM_MAX_VCPUS) |
| 220 | li = fi->local_int[cpu_addr]; | 225 | li = fi->local_int[cpu_addr]; |
| 221 | 226 | ||
| 222 | if (li == NULL) { | 227 | if (li == NULL) { |
| 223 | rc = 1; /* incorrect state */ | 228 | *reg &= 0xffffffff00000000UL; |
| 224 | *reg &= SIGP_STATUS_INCORRECT_STATE; | 229 | *reg |= SIGP_STATUS_INCORRECT_STATE; |
| 230 | rc = SIGP_CC_STATUS_STORED; | ||
| 225 | kfree(inti); | 231 | kfree(inti); |
| 226 | goto out_fi; | 232 | goto out_fi; |
| 227 | } | 233 | } |
| @@ -229,8 +235,9 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, | |||
| 229 | spin_lock_bh(&li->lock); | 235 | spin_lock_bh(&li->lock); |
| 230 | /* cpu must be in stopped state */ | 236 | /* cpu must be in stopped state */ |
| 231 | if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) { | 237 | if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) { |
| 232 | rc = 1; /* incorrect state */ | 238 | *reg &= 0xffffffff00000000UL; |
| 233 | *reg &= SIGP_STATUS_INCORRECT_STATE; | 239 | *reg |= SIGP_STATUS_INCORRECT_STATE; |
| 240 | rc = SIGP_CC_STATUS_STORED; | ||
| 234 | kfree(inti); | 241 | kfree(inti); |
| 235 | goto out_li; | 242 | goto out_li; |
| 236 | } | 243 | } |
| @@ -242,7 +249,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, | |||
| 242 | atomic_set(&li->active, 1); | 249 | atomic_set(&li->active, 1); |
| 243 | if (waitqueue_active(&li->wq)) | 250 | if (waitqueue_active(&li->wq)) |
| 244 | wake_up_interruptible(&li->wq); | 251 | wake_up_interruptible(&li->wq); |
| 245 | rc = 0; /* order accepted */ | 252 | rc = SIGP_CC_ORDER_CODE_ACCEPTED; |
| 246 | 253 | ||
| 247 | VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address); | 254 | VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address); |
| 248 | out_li: | 255 | out_li: |
| @@ -259,21 +266,21 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr, | |||
| 259 | struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; | 266 | struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; |
| 260 | 267 | ||
| 261 | if (cpu_addr >= KVM_MAX_VCPUS) | 268 | if (cpu_addr >= KVM_MAX_VCPUS) |
| 262 | return 3; /* not operational */ | 269 | return SIGP_CC_NOT_OPERATIONAL; |
| 263 | 270 | ||
| 264 | spin_lock(&fi->lock); | 271 | spin_lock(&fi->lock); |
| 265 | if (fi->local_int[cpu_addr] == NULL) | 272 | if (fi->local_int[cpu_addr] == NULL) |
| 266 | rc = 3; /* not operational */ | 273 | rc = SIGP_CC_NOT_OPERATIONAL; |
| 267 | else { | 274 | else { |
| 268 | if (atomic_read(fi->local_int[cpu_addr]->cpuflags) | 275 | if (atomic_read(fi->local_int[cpu_addr]->cpuflags) |
| 269 | & CPUSTAT_RUNNING) { | 276 | & CPUSTAT_RUNNING) { |
| 270 | /* running */ | 277 | /* running */ |
| 271 | rc = 1; | 278 | rc = SIGP_CC_ORDER_CODE_ACCEPTED; |
| 272 | } else { | 279 | } else { |
| 273 | /* not running */ | 280 | /* not running */ |
| 274 | *reg &= 0xffffffff00000000UL; | 281 | *reg &= 0xffffffff00000000UL; |
| 275 | *reg |= SIGP_STATUS_NOT_RUNNING; | 282 | *reg |= SIGP_STATUS_NOT_RUNNING; |
| 276 | rc = 0; | 283 | rc = SIGP_CC_STATUS_STORED; |
| 277 | } | 284 | } |
| 278 | } | 285 | } |
| 279 | spin_unlock(&fi->lock); | 286 | spin_unlock(&fi->lock); |
| @@ -286,23 +293,23 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr, | |||
| 286 | 293 | ||
| 287 | static int __sigp_restart(struct kvm_vcpu *vcpu, u16 cpu_addr) | 294 | static int __sigp_restart(struct kvm_vcpu *vcpu, u16 cpu_addr) |
| 288 | { | 295 | { |
| 289 | int rc = 0; | ||
| 290 | struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; | 296 | struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; |
| 291 | struct kvm_s390_local_interrupt *li; | 297 | struct kvm_s390_local_interrupt *li; |
| 298 | int rc = SIGP_CC_ORDER_CODE_ACCEPTED; | ||
| 292 | 299 | ||
| 293 | if (cpu_addr >= KVM_MAX_VCPUS) | 300 | if (cpu_addr >= KVM_MAX_VCPUS) |
| 294 | return 3; /* not operational */ | 301 | return SIGP_CC_NOT_OPERATIONAL; |
| 295 | 302 | ||
| 296 | spin_lock(&fi->lock); | 303 | spin_lock(&fi->lock); |
| 297 | li = fi->local_int[cpu_addr]; | 304 | li = fi->local_int[cpu_addr]; |
| 298 | if (li == NULL) { | 305 | if (li == NULL) { |
| 299 | rc = 3; /* not operational */ | 306 | rc = SIGP_CC_NOT_OPERATIONAL; |
| 300 | goto out; | 307 | goto out; |
| 301 | } | 308 | } |
| 302 | 309 | ||
| 303 | spin_lock_bh(&li->lock); | 310 | spin_lock_bh(&li->lock); |
| 304 | if (li->action_bits & ACTION_STOP_ON_STOP) | 311 | if (li->action_bits & ACTION_STOP_ON_STOP) |
| 305 | rc = 2; /* busy */ | 312 | rc = SIGP_CC_BUSY; |
| 306 | else | 313 | else |
| 307 | VCPU_EVENT(vcpu, 4, "sigp restart %x to handle userspace", | 314 | VCPU_EVENT(vcpu, 4, "sigp restart %x to handle userspace", |
| 308 | cpu_addr); | 315 | cpu_addr); |
| @@ -377,7 +384,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) | |||
| 377 | case SIGP_RESTART: | 384 | case SIGP_RESTART: |
| 378 | vcpu->stat.instruction_sigp_restart++; | 385 | vcpu->stat.instruction_sigp_restart++; |
| 379 | rc = __sigp_restart(vcpu, cpu_addr); | 386 | rc = __sigp_restart(vcpu, cpu_addr); |
| 380 | if (rc == 2) /* busy */ | 387 | if (rc == SIGP_CC_BUSY) |
| 381 | break; | 388 | break; |
| 382 | /* user space must know about restart */ | 389 | /* user space must know about restart */ |
| 383 | default: | 390 | default: |
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 88093c1d44fd..3ea51a84a0e4 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h | |||
| @@ -465,6 +465,8 @@ static inline u32 safe_apic_wait_icr_idle(void) | |||
| 465 | return apic->safe_wait_icr_idle(); | 465 | return apic->safe_wait_icr_idle(); |
| 466 | } | 466 | } |
| 467 | 467 | ||
| 468 | extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)); | ||
| 469 | |||
| 468 | #else /* CONFIG_X86_LOCAL_APIC */ | 470 | #else /* CONFIG_X86_LOCAL_APIC */ |
| 469 | 471 | ||
| 470 | static inline u32 apic_read(u32 reg) { return 0; } | 472 | static inline u32 apic_read(u32 reg) { return 0; } |
| @@ -474,6 +476,7 @@ static inline u64 apic_icr_read(void) { return 0; } | |||
| 474 | static inline void apic_icr_write(u32 low, u32 high) { } | 476 | static inline void apic_icr_write(u32 low, u32 high) { } |
| 475 | static inline void apic_wait_icr_idle(void) { } | 477 | static inline void apic_wait_icr_idle(void) { } |
| 476 | static inline u32 safe_apic_wait_icr_idle(void) { return 0; } | 478 | static inline u32 safe_apic_wait_icr_idle(void) { return 0; } |
| 479 | static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {} | ||
| 477 | 480 | ||
| 478 | #endif /* CONFIG_X86_LOCAL_APIC */ | 481 | #endif /* CONFIG_X86_LOCAL_APIC */ |
| 479 | 482 | ||
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index a6983b277220..72f5009deb5a 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h | |||
| @@ -264,6 +264,13 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) | |||
| 264 | * This operation is non-atomic and can be reordered. | 264 | * This operation is non-atomic and can be reordered. |
| 265 | * If two examples of this operation race, one can appear to succeed | 265 | * If two examples of this operation race, one can appear to succeed |
| 266 | * but actually fail. You must protect multiple accesses with a lock. | 266 | * but actually fail. You must protect multiple accesses with a lock. |
| 267 | * | ||
| 268 | * Note: the operation is performed atomically with respect to | ||
| 269 | * the local CPU, but not other CPUs. Portable code should not | ||
| 270 | * rely on this behaviour. | ||
| 271 | * KVM relies on this behaviour on x86 for modifying memory that is also | ||
| 272 | * accessed from a hypervisor on the same CPU if running in a VM: don't change | ||
| 273 | * this without also updating arch/x86/kernel/kvm.c | ||
| 267 | */ | 274 | */ |
| 268 | static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) | 275 | static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) |
| 269 | { | 276 | { |
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 7a15153c675d..b518c7509933 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h | |||
| @@ -49,6 +49,7 @@ extern const struct hypervisor_x86 *x86_hyper; | |||
| 49 | extern const struct hypervisor_x86 x86_hyper_vmware; | 49 | extern const struct hypervisor_x86 x86_hyper_vmware; |
| 50 | extern const struct hypervisor_x86 x86_hyper_ms_hyperv; | 50 | extern const struct hypervisor_x86 x86_hyper_ms_hyperv; |
| 51 | extern const struct hypervisor_x86 x86_hyper_xen_hvm; | 51 | extern const struct hypervisor_x86 x86_hyper_xen_hvm; |
| 52 | extern const struct hypervisor_x86 x86_hyper_kvm; | ||
| 52 | 53 | ||
| 53 | static inline bool hypervisor_x2apic_available(void) | 54 | static inline bool hypervisor_x2apic_available(void) |
| 54 | { | 55 | { |
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index e7d1c194d272..246617efd67f 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | /* Select x86 specific features in <linux/kvm.h> */ | 12 | /* Select x86 specific features in <linux/kvm.h> */ |
| 13 | #define __KVM_HAVE_PIT | 13 | #define __KVM_HAVE_PIT |
| 14 | #define __KVM_HAVE_IOAPIC | 14 | #define __KVM_HAVE_IOAPIC |
| 15 | #define __KVM_HAVE_IRQ_LINE | ||
| 15 | #define __KVM_HAVE_DEVICE_ASSIGNMENT | 16 | #define __KVM_HAVE_DEVICE_ASSIGNMENT |
| 16 | #define __KVM_HAVE_MSI | 17 | #define __KVM_HAVE_MSI |
| 17 | #define __KVM_HAVE_USER_NMI | 18 | #define __KVM_HAVE_USER_NMI |
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1ac46c22dd50..c764f43b71c5 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h | |||
| @@ -192,8 +192,8 @@ struct x86_emulate_ops { | |||
| 192 | struct x86_instruction_info *info, | 192 | struct x86_instruction_info *info, |
| 193 | enum x86_intercept_stage stage); | 193 | enum x86_intercept_stage stage); |
| 194 | 194 | ||
| 195 | bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, | 195 | void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, |
| 196 | u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); | 196 | u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); |
| 197 | }; | 197 | }; |
| 198 | 198 | ||
| 199 | typedef u32 __attribute__((vector_size(16))) sse128_t; | 199 | typedef u32 __attribute__((vector_size(16))) sse128_t; |
| @@ -280,9 +280,9 @@ struct x86_emulate_ctxt { | |||
| 280 | u8 modrm_seg; | 280 | u8 modrm_seg; |
| 281 | bool rip_relative; | 281 | bool rip_relative; |
| 282 | unsigned long _eip; | 282 | unsigned long _eip; |
| 283 | struct operand memop; | ||
| 283 | /* Fields above regs are cleared together. */ | 284 | /* Fields above regs are cleared together. */ |
| 284 | unsigned long regs[NR_VCPU_REGS]; | 285 | unsigned long regs[NR_VCPU_REGS]; |
| 285 | struct operand memop; | ||
| 286 | struct operand *memopp; | 286 | struct operand *memopp; |
| 287 | struct fetch_cache fetch; | 287 | struct fetch_cache fetch; |
| 288 | struct read_cache io_read; | 288 | struct read_cache io_read; |
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2da88c0cda14..09155d64cf7e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h | |||
| @@ -48,12 +48,13 @@ | |||
| 48 | 48 | ||
| 49 | #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) | 49 | #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) |
| 50 | #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) | 50 | #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) |
| 51 | #define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL | ||
| 51 | #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ | 52 | #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ |
| 52 | 0xFFFFFF0000000000ULL) | 53 | 0xFFFFFF0000000000ULL) |
| 53 | #define CR4_RESERVED_BITS \ | 54 | #define CR4_RESERVED_BITS \ |
| 54 | (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | 55 | (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ |
| 55 | | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | 56 | | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ |
| 56 | | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ | 57 | | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ |
| 57 | | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ | 58 | | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ |
| 58 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) | 59 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) |
| 59 | 60 | ||
| @@ -175,6 +176,13 @@ enum { | |||
| 175 | 176 | ||
| 176 | /* apic attention bits */ | 177 | /* apic attention bits */ |
| 177 | #define KVM_APIC_CHECK_VAPIC 0 | 178 | #define KVM_APIC_CHECK_VAPIC 0 |
| 179 | /* | ||
| 180 | * The following bit is set with PV-EOI, unset on EOI. | ||
| 181 | * We detect PV-EOI changes by guest by comparing | ||
| 182 | * this bit with PV-EOI in guest memory. | ||
| 183 | * See the implementation in apic_update_pv_eoi. | ||
| 184 | */ | ||
| 185 | #define KVM_APIC_PV_EOI_PENDING 1 | ||
| 178 | 186 | ||
| 179 | /* | 187 | /* |
| 180 | * We don't want allocation failures within the mmu code, so we preallocate | 188 | * We don't want allocation failures within the mmu code, so we preallocate |
| @@ -484,6 +492,11 @@ struct kvm_vcpu_arch { | |||
| 484 | u64 length; | 492 | u64 length; |
| 485 | u64 status; | 493 | u64 status; |
| 486 | } osvw; | 494 | } osvw; |
| 495 | |||
| 496 | struct { | ||
| 497 | u64 msr_val; | ||
| 498 | struct gfn_to_hva_cache data; | ||
| 499 | } pv_eoi; | ||
| 487 | }; | 500 | }; |
| 488 | 501 | ||
| 489 | struct kvm_lpage_info { | 502 | struct kvm_lpage_info { |
| @@ -661,6 +674,7 @@ struct kvm_x86_ops { | |||
| 661 | u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); | 674 | u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); |
| 662 | int (*get_lpage_level)(void); | 675 | int (*get_lpage_level)(void); |
| 663 | bool (*rdtscp_supported)(void); | 676 | bool (*rdtscp_supported)(void); |
| 677 | bool (*invpcid_supported)(void); | ||
| 664 | void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); | 678 | void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); |
| 665 | 679 | ||
| 666 | void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); | 680 | void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); |
| @@ -802,7 +816,20 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | |||
| 802 | void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); | 816 | void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); |
| 803 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); | 817 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); |
| 804 | 818 | ||
| 805 | int kvm_pic_set_irq(void *opaque, int irq, int level); | 819 | static inline int __kvm_irq_line_state(unsigned long *irq_state, |
| 820 | int irq_source_id, int level) | ||
| 821 | { | ||
| 822 | /* Logical OR for level trig interrupt */ | ||
| 823 | if (level) | ||
| 824 | __set_bit(irq_source_id, irq_state); | ||
| 825 | else | ||
| 826 | __clear_bit(irq_source_id, irq_state); | ||
| 827 | |||
| 828 | return !!(*irq_state); | ||
| 829 | } | ||
| 830 | |||
| 831 | int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); | ||
| 832 | void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); | ||
| 806 | 833 | ||
| 807 | void kvm_inject_nmi(struct kvm_vcpu *vcpu); | 834 | void kvm_inject_nmi(struct kvm_vcpu *vcpu); |
| 808 | 835 | ||
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 63ab1661d00e..2f7712e08b1e 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #define KVM_FEATURE_CLOCKSOURCE2 3 | 22 | #define KVM_FEATURE_CLOCKSOURCE2 3 |
| 23 | #define KVM_FEATURE_ASYNC_PF 4 | 23 | #define KVM_FEATURE_ASYNC_PF 4 |
| 24 | #define KVM_FEATURE_STEAL_TIME 5 | 24 | #define KVM_FEATURE_STEAL_TIME 5 |
| 25 | #define KVM_FEATURE_PV_EOI 6 | ||
| 25 | 26 | ||
| 26 | /* The last 8 bits are used to indicate how to interpret the flags field | 27 | /* The last 8 bits are used to indicate how to interpret the flags field |
| 27 | * in pvclock structure. If no bits are set, all flags are ignored. | 28 | * in pvclock structure. If no bits are set, all flags are ignored. |
| @@ -37,6 +38,7 @@ | |||
| 37 | #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 | 38 | #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 |
| 38 | #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 | 39 | #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 |
| 39 | #define MSR_KVM_STEAL_TIME 0x4b564d03 | 40 | #define MSR_KVM_STEAL_TIME 0x4b564d03 |
| 41 | #define MSR_KVM_PV_EOI_EN 0x4b564d04 | ||
| 40 | 42 | ||
| 41 | struct kvm_steal_time { | 43 | struct kvm_steal_time { |
| 42 | __u64 steal; | 44 | __u64 steal; |
| @@ -89,6 +91,11 @@ struct kvm_vcpu_pv_apf_data { | |||
| 89 | __u32 enabled; | 91 | __u32 enabled; |
| 90 | }; | 92 | }; |
| 91 | 93 | ||
| 94 | #define KVM_PV_EOI_BIT 0 | ||
| 95 | #define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) | ||
| 96 | #define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK | ||
| 97 | #define KVM_PV_EOI_DISABLED 0x0 | ||
| 98 | |||
| 92 | #ifdef __KERNEL__ | 99 | #ifdef __KERNEL__ |
| 93 | #include <asm/processor.h> | 100 | #include <asm/processor.h> |
| 94 | 101 | ||
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index f8ab3eaad128..aea1d1d848c7 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h | |||
| @@ -44,6 +44,7 @@ | |||
| 44 | */ | 44 | */ |
| 45 | #define X86_CR3_PWT 0x00000008 /* Page Write Through */ | 45 | #define X86_CR3_PWT 0x00000008 /* Page Write Through */ |
| 46 | #define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ | 46 | #define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ |
| 47 | #define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */ | ||
| 47 | 48 | ||
| 48 | /* | 49 | /* |
| 49 | * Intel CPU features in CR4 | 50 | * Intel CPU features in CR4 |
| @@ -61,6 +62,7 @@ | |||
| 61 | #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ | 62 | #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ |
| 62 | #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ | 63 | #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ |
| 63 | #define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ | 64 | #define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ |
| 65 | #define X86_CR4_PCIDE 0x00020000 /* enable PCID support */ | ||
| 64 | #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ | 66 | #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ |
| 65 | #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ | 67 | #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ |
| 66 | 68 | ||
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 31f180c21ce9..74fcb963595b 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h | |||
| @@ -60,6 +60,7 @@ | |||
| 60 | #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 | 60 | #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 |
| 61 | #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 | 61 | #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 |
| 62 | #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 | 62 | #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 |
| 63 | #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 | ||
| 63 | 64 | ||
| 64 | 65 | ||
| 65 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 | 66 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 |
| @@ -281,6 +282,7 @@ enum vmcs_field { | |||
| 281 | #define EXIT_REASON_EPT_MISCONFIG 49 | 282 | #define EXIT_REASON_EPT_MISCONFIG 49 |
| 282 | #define EXIT_REASON_WBINVD 54 | 283 | #define EXIT_REASON_WBINVD 54 |
| 283 | #define EXIT_REASON_XSETBV 55 | 284 | #define EXIT_REASON_XSETBV 55 |
| 285 | #define EXIT_REASON_INVPCID 58 | ||
| 284 | 286 | ||
| 285 | /* | 287 | /* |
| 286 | * Interruption-information format | 288 | * Interruption-information format |
| @@ -404,6 +406,7 @@ enum vmcs_field { | |||
| 404 | #define VMX_EPTP_WB_BIT (1ull << 14) | 406 | #define VMX_EPTP_WB_BIT (1ull << 14) |
| 405 | #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) | 407 | #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) |
| 406 | #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) | 408 | #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) |
| 409 | #define VMX_EPT_AD_BIT (1ull << 21) | ||
| 407 | #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) | 410 | #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) |
| 408 | #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) | 411 | #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) |
| 409 | #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) | 412 | #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) |
| @@ -415,11 +418,14 @@ enum vmcs_field { | |||
| 415 | #define VMX_EPT_MAX_GAW 0x4 | 418 | #define VMX_EPT_MAX_GAW 0x4 |
| 416 | #define VMX_EPT_MT_EPTE_SHIFT 3 | 419 | #define VMX_EPT_MT_EPTE_SHIFT 3 |
| 417 | #define VMX_EPT_GAW_EPTP_SHIFT 3 | 420 | #define VMX_EPT_GAW_EPTP_SHIFT 3 |
| 421 | #define VMX_EPT_AD_ENABLE_BIT (1ull << 6) | ||
| 418 | #define VMX_EPT_DEFAULT_MT 0x6ull | 422 | #define VMX_EPT_DEFAULT_MT 0x6ull |
| 419 | #define VMX_EPT_READABLE_MASK 0x1ull | 423 | #define VMX_EPT_READABLE_MASK 0x1ull |
| 420 | #define VMX_EPT_WRITABLE_MASK 0x2ull | 424 | #define VMX_EPT_WRITABLE_MASK 0x2ull |
| 421 | #define VMX_EPT_EXECUTABLE_MASK 0x4ull | 425 | #define VMX_EPT_EXECUTABLE_MASK 0x4ull |
| 422 | #define VMX_EPT_IPAT_BIT (1ull << 6) | 426 | #define VMX_EPT_IPAT_BIT (1ull << 6) |
| 427 | #define VMX_EPT_ACCESS_BIT (1ull << 8) | ||
| 428 | #define VMX_EPT_DIRTY_BIT (1ull << 9) | ||
| 423 | 429 | ||
| 424 | #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul | 430 | #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul |
| 425 | 431 | ||
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c421512ca5eb..98e24131ff3a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
| @@ -2143,6 +2143,23 @@ int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, | |||
| 2143 | } | 2143 | } |
| 2144 | 2144 | ||
| 2145 | /* | 2145 | /* |
| 2146 | * Override the generic EOI implementation with an optimized version. | ||
| 2147 | * Only called during early boot when only one CPU is active and with | ||
| 2148 | * interrupts disabled, so we know this does not race with actual APIC driver | ||
| 2149 | * use. | ||
| 2150 | */ | ||
| 2151 | void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) | ||
| 2152 | { | ||
| 2153 | struct apic **drv; | ||
| 2154 | |||
| 2155 | for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { | ||
| 2156 | /* Should happen once for each apic */ | ||
| 2157 | WARN_ON((*drv)->eoi_write == eoi_write); | ||
| 2158 | (*drv)->eoi_write = eoi_write; | ||
| 2159 | } | ||
| 2160 | } | ||
| 2161 | |||
| 2162 | /* | ||
| 2146 | * Power management | 2163 | * Power management |
| 2147 | */ | 2164 | */ |
| 2148 | #ifdef CONFIG_PM | 2165 | #ifdef CONFIG_PM |
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 755f64fb0743..a8f8fa9769d6 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c | |||
| @@ -37,6 +37,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = | |||
| 37 | #endif | 37 | #endif |
| 38 | &x86_hyper_vmware, | 38 | &x86_hyper_vmware, |
| 39 | &x86_hyper_ms_hyperv, | 39 | &x86_hyper_ms_hyperv, |
| 40 | #ifdef CONFIG_KVM_GUEST | ||
| 41 | &x86_hyper_kvm, | ||
| 42 | #endif | ||
| 40 | }; | 43 | }; |
| 41 | 44 | ||
| 42 | const struct hypervisor_x86 *x86_hyper; | 45 | const struct hypervisor_x86 *x86_hyper; |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e554e5ad2fe8..c1d61ee4b4f1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
| @@ -39,6 +39,9 @@ | |||
| 39 | #include <asm/desc.h> | 39 | #include <asm/desc.h> |
| 40 | #include <asm/tlbflush.h> | 40 | #include <asm/tlbflush.h> |
| 41 | #include <asm/idle.h> | 41 | #include <asm/idle.h> |
| 42 | #include <asm/apic.h> | ||
| 43 | #include <asm/apicdef.h> | ||
| 44 | #include <asm/hypervisor.h> | ||
| 42 | 45 | ||
| 43 | static int kvmapf = 1; | 46 | static int kvmapf = 1; |
| 44 | 47 | ||
| @@ -283,6 +286,22 @@ static void kvm_register_steal_time(void) | |||
| 283 | cpu, __pa(st)); | 286 | cpu, __pa(st)); |
| 284 | } | 287 | } |
| 285 | 288 | ||
| 289 | static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; | ||
| 290 | |||
| 291 | static void kvm_guest_apic_eoi_write(u32 reg, u32 val) | ||
| 292 | { | ||
| 293 | /** | ||
| 294 | * This relies on __test_and_clear_bit to modify the memory | ||
| 295 | * in a way that is atomic with respect to the local CPU. | ||
| 296 | * The hypervisor only accesses this memory from the local CPU so | ||
| 297 | * there's no need for lock or memory barriers. | ||
| 298 | * An optimization barrier is implied in apic write. | ||
| 299 | */ | ||
| 300 | if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) | ||
| 301 | return; | ||
| 302 | apic_write(APIC_EOI, APIC_EOI_ACK); | ||
| 303 | } | ||
| 304 | |||
| 286 | void __cpuinit kvm_guest_cpu_init(void) | 305 | void __cpuinit kvm_guest_cpu_init(void) |
| 287 | { | 306 | { |
| 288 | if (!kvm_para_available()) | 307 | if (!kvm_para_available()) |
| @@ -300,11 +319,20 @@ void __cpuinit kvm_guest_cpu_init(void) | |||
| 300 | smp_processor_id()); | 319 | smp_processor_id()); |
| 301 | } | 320 | } |
| 302 | 321 | ||
| 322 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { | ||
| 323 | unsigned long pa; | ||
| 324 | /* Size alignment is implied but just to make it explicit. */ | ||
| 325 | BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); | ||
| 326 | __get_cpu_var(kvm_apic_eoi) = 0; | ||
| 327 | pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; | ||
| 328 | wrmsrl(MSR_KVM_PV_EOI_EN, pa); | ||
| 329 | } | ||
| 330 | |||
| 303 | if (has_steal_clock) | 331 | if (has_steal_clock) |
| 304 | kvm_register_steal_time(); | 332 | kvm_register_steal_time(); |
| 305 | } | 333 | } |
| 306 | 334 | ||
| 307 | static void kvm_pv_disable_apf(void *unused) | 335 | static void kvm_pv_disable_apf(void) |
| 308 | { | 336 | { |
| 309 | if (!__get_cpu_var(apf_reason).enabled) | 337 | if (!__get_cpu_var(apf_reason).enabled) |
| 310 | return; | 338 | return; |
| @@ -316,11 +344,23 @@ static void kvm_pv_disable_apf(void *unused) | |||
| 316 | smp_processor_id()); | 344 | smp_processor_id()); |
| 317 | } | 345 | } |
| 318 | 346 | ||
| 347 | static void kvm_pv_guest_cpu_reboot(void *unused) | ||
| 348 | { | ||
| 349 | /* | ||
| 350 | * We disable PV EOI before we load a new kernel by kexec, | ||
| 351 | * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. | ||
| 352 | * New kernel can re-enable when it boots. | ||
| 353 | */ | ||
| 354 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) | ||
| 355 | wrmsrl(MSR_KVM_PV_EOI_EN, 0); | ||
| 356 | kvm_pv_disable_apf(); | ||
| 357 | } | ||
| 358 | |||
| 319 | static int kvm_pv_reboot_notify(struct notifier_block *nb, | 359 | static int kvm_pv_reboot_notify(struct notifier_block *nb, |
| 320 | unsigned long code, void *unused) | 360 | unsigned long code, void *unused) |
| 321 | { | 361 | { |
| 322 | if (code == SYS_RESTART) | 362 | if (code == SYS_RESTART) |
| 323 | on_each_cpu(kvm_pv_disable_apf, NULL, 1); | 363 | on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); |
| 324 | return NOTIFY_DONE; | 364 | return NOTIFY_DONE; |
| 325 | } | 365 | } |
| 326 | 366 | ||
| @@ -371,7 +411,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy) | |||
| 371 | static void kvm_guest_cpu_offline(void *dummy) | 411 | static void kvm_guest_cpu_offline(void *dummy) |
| 372 | { | 412 | { |
| 373 | kvm_disable_steal_time(); | 413 | kvm_disable_steal_time(); |
| 374 | kvm_pv_disable_apf(NULL); | 414 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) |
| 415 | wrmsrl(MSR_KVM_PV_EOI_EN, 0); | ||
| 416 | kvm_pv_disable_apf(); | ||
| 375 | apf_task_wake_all(); | 417 | apf_task_wake_all(); |
| 376 | } | 418 | } |
| 377 | 419 | ||
| @@ -424,6 +466,9 @@ void __init kvm_guest_init(void) | |||
| 424 | pv_time_ops.steal_clock = kvm_steal_clock; | 466 | pv_time_ops.steal_clock = kvm_steal_clock; |
| 425 | } | 467 | } |
| 426 | 468 | ||
| 469 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) | ||
| 470 | apic_set_eoi_write(kvm_guest_apic_eoi_write); | ||
| 471 | |||
| 427 | #ifdef CONFIG_SMP | 472 | #ifdef CONFIG_SMP |
| 428 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | 473 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; |
| 429 | register_cpu_notifier(&kvm_cpu_notifier); | 474 | register_cpu_notifier(&kvm_cpu_notifier); |
| @@ -432,6 +477,19 @@ void __init kvm_guest_init(void) | |||
| 432 | #endif | 477 | #endif |
| 433 | } | 478 | } |
| 434 | 479 | ||
| 480 | static bool __init kvm_detect(void) | ||
| 481 | { | ||
| 482 | if (!kvm_para_available()) | ||
| 483 | return false; | ||
| 484 | return true; | ||
| 485 | } | ||
| 486 | |||
| 487 | const struct hypervisor_x86 x86_hyper_kvm __refconst = { | ||
| 488 | .name = "KVM", | ||
| 489 | .detect = kvm_detect, | ||
| 490 | }; | ||
| 491 | EXPORT_SYMBOL_GPL(x86_hyper_kvm); | ||
| 492 | |||
| 435 | static __init int activate_jump_labels(void) | 493 | static __init int activate_jump_labels(void) |
| 436 | { | 494 | { |
| 437 | if (has_steal_clock) { | 495 | if (has_steal_clock) { |
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7df1c6d839fb..0595f1397b7c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c | |||
| @@ -201,6 +201,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 201 | unsigned f_lm = 0; | 201 | unsigned f_lm = 0; |
| 202 | #endif | 202 | #endif |
| 203 | unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; | 203 | unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; |
| 204 | unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; | ||
| 204 | 205 | ||
| 205 | /* cpuid 1.edx */ | 206 | /* cpuid 1.edx */ |
| 206 | const u32 kvm_supported_word0_x86_features = | 207 | const u32 kvm_supported_word0_x86_features = |
| @@ -228,7 +229,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 228 | 0 /* DS-CPL, VMX, SMX, EST */ | | 229 | 0 /* DS-CPL, VMX, SMX, EST */ | |
| 229 | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | | 230 | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | |
| 230 | F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | | 231 | F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | |
| 231 | 0 /* Reserved, DCA */ | F(XMM4_1) | | 232 | F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | |
| 232 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | | 233 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | |
| 233 | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | | 234 | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | |
| 234 | F(F16C) | F(RDRAND); | 235 | F(F16C) | F(RDRAND); |
| @@ -248,7 +249,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 248 | /* cpuid 7.0.ebx */ | 249 | /* cpuid 7.0.ebx */ |
| 249 | const u32 kvm_supported_word9_x86_features = | 250 | const u32 kvm_supported_word9_x86_features = |
| 250 | F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | | 251 | F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | |
| 251 | F(BMI2) | F(ERMS) | F(RTM); | 252 | F(BMI2) | F(ERMS) | f_invpcid | F(RTM); |
| 252 | 253 | ||
| 253 | /* all calls to cpuid_count() should be made on the same cpu */ | 254 | /* all calls to cpuid_count() should be made on the same cpu */ |
| 254 | get_cpu(); | 255 | get_cpu(); |
| @@ -409,6 +410,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
| 409 | (1 << KVM_FEATURE_NOP_IO_DELAY) | | 410 | (1 << KVM_FEATURE_NOP_IO_DELAY) | |
| 410 | (1 << KVM_FEATURE_CLOCKSOURCE2) | | 411 | (1 << KVM_FEATURE_CLOCKSOURCE2) | |
| 411 | (1 << KVM_FEATURE_ASYNC_PF) | | 412 | (1 << KVM_FEATURE_ASYNC_PF) | |
| 413 | (1 << KVM_FEATURE_PV_EOI) | | ||
| 412 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); | 414 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); |
| 413 | 415 | ||
| 414 | if (sched_info_on()) | 416 | if (sched_info_on()) |
| @@ -639,33 +641,37 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, | |||
| 639 | return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); | 641 | return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); |
| 640 | } | 642 | } |
| 641 | 643 | ||
| 642 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | 644 | void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) |
| 643 | { | 645 | { |
| 644 | u32 function, index; | 646 | u32 function = *eax, index = *ecx; |
| 645 | struct kvm_cpuid_entry2 *best; | 647 | struct kvm_cpuid_entry2 *best; |
| 646 | 648 | ||
| 647 | function = kvm_register_read(vcpu, VCPU_REGS_RAX); | ||
| 648 | index = kvm_register_read(vcpu, VCPU_REGS_RCX); | ||
| 649 | kvm_register_write(vcpu, VCPU_REGS_RAX, 0); | ||
| 650 | kvm_register_write(vcpu, VCPU_REGS_RBX, 0); | ||
| 651 | kvm_register_write(vcpu, VCPU_REGS_RCX, 0); | ||
| 652 | kvm_register_write(vcpu, VCPU_REGS_RDX, 0); | ||
| 653 | best = kvm_find_cpuid_entry(vcpu, function, index); | 649 | best = kvm_find_cpuid_entry(vcpu, function, index); |
| 654 | 650 | ||
| 655 | if (!best) | 651 | if (!best) |
| 656 | best = check_cpuid_limit(vcpu, function, index); | 652 | best = check_cpuid_limit(vcpu, function, index); |
| 657 | 653 | ||
| 658 | if (best) { | 654 | if (best) { |
| 659 | kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); | 655 | *eax = best->eax; |
| 660 | kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); | 656 | *ebx = best->ebx; |
| 661 | kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); | 657 | *ecx = best->ecx; |
| 662 | kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); | 658 | *edx = best->edx; |
| 663 | } | 659 | } else |
| 660 | *eax = *ebx = *ecx = *edx = 0; | ||
| 661 | } | ||
| 662 | |||
| 663 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | ||
| 664 | { | ||
| 665 | u32 function, eax, ebx, ecx, edx; | ||
| 666 | |||
| 667 | function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX); | ||
| 668 | ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); | ||
| 669 | kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); | ||
| 670 | kvm_register_write(vcpu, VCPU_REGS_RAX, eax); | ||
| 671 | kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); | ||
| 672 | kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); | ||
| 673 | kvm_register_write(vcpu, VCPU_REGS_RDX, edx); | ||
| 664 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 674 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
| 665 | trace_kvm_cpuid(function, | 675 | trace_kvm_cpuid(function, eax, ebx, ecx, edx); |
| 666 | kvm_register_read(vcpu, VCPU_REGS_RAX), | ||
| 667 | kvm_register_read(vcpu, VCPU_REGS_RBX), | ||
| 668 | kvm_register_read(vcpu, VCPU_REGS_RCX), | ||
| 669 | kvm_register_read(vcpu, VCPU_REGS_RDX)); | ||
| 670 | } | 676 | } |
| 671 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | 677 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); |
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 26d1fb437eb5..a10e46016851 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h | |||
| @@ -17,6 +17,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, | |||
| 17 | int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, | 17 | int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, |
| 18 | struct kvm_cpuid2 *cpuid, | 18 | struct kvm_cpuid2 *cpuid, |
| 19 | struct kvm_cpuid_entry2 __user *entries); | 19 | struct kvm_cpuid_entry2 __user *entries); |
| 20 | void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); | ||
| 20 | 21 | ||
| 21 | 22 | ||
| 22 | static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) | 23 | static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) |
| @@ -51,4 +52,12 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) | |||
| 51 | return best && (best->ecx & bit(X86_FEATURE_OSVW)); | 52 | return best && (best->ecx & bit(X86_FEATURE_OSVW)); |
| 52 | } | 53 | } |
| 53 | 54 | ||
| 55 | static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) | ||
| 56 | { | ||
| 57 | struct kvm_cpuid_entry2 *best; | ||
| 58 | |||
| 59 | best = kvm_find_cpuid_entry(vcpu, 1, 0); | ||
| 60 | return best && (best->ecx & bit(X86_FEATURE_PCID)); | ||
| 61 | } | ||
| 62 | |||
| 54 | #endif | 63 | #endif |
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f95d242ee9f7..97d9a9914ba8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c | |||
| @@ -433,11 +433,32 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, | |||
| 433 | return ctxt->ops->intercept(ctxt, &info, stage); | 433 | return ctxt->ops->intercept(ctxt, &info, stage); |
| 434 | } | 434 | } |
| 435 | 435 | ||
| 436 | static void assign_masked(ulong *dest, ulong src, ulong mask) | ||
| 437 | { | ||
| 438 | *dest = (*dest & ~mask) | (src & mask); | ||
| 439 | } | ||
| 440 | |||
| 436 | static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) | 441 | static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) |
| 437 | { | 442 | { |
| 438 | return (1UL << (ctxt->ad_bytes << 3)) - 1; | 443 | return (1UL << (ctxt->ad_bytes << 3)) - 1; |
| 439 | } | 444 | } |
| 440 | 445 | ||
| 446 | static ulong stack_mask(struct x86_emulate_ctxt *ctxt) | ||
| 447 | { | ||
| 448 | u16 sel; | ||
| 449 | struct desc_struct ss; | ||
| 450 | |||
| 451 | if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
| 452 | return ~0UL; | ||
| 453 | ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS); | ||
| 454 | return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */ | ||
| 455 | } | ||
| 456 | |||
| 457 | static int stack_size(struct x86_emulate_ctxt *ctxt) | ||
| 458 | { | ||
| 459 | return (__fls(stack_mask(ctxt)) + 1) >> 3; | ||
| 460 | } | ||
| 461 | |||
| 441 | /* Access/update address held in a register, based on addressing mode. */ | 462 | /* Access/update address held in a register, based on addressing mode. */ |
| 442 | static inline unsigned long | 463 | static inline unsigned long |
| 443 | address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) | 464 | address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) |
| @@ -958,6 +979,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, | |||
| 958 | op->orig_val = op->val; | 979 | op->orig_val = op->val; |
| 959 | } | 980 | } |
| 960 | 981 | ||
| 982 | static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) | ||
| 983 | { | ||
| 984 | if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP) | ||
| 985 | ctxt->modrm_seg = VCPU_SREG_SS; | ||
| 986 | } | ||
| 987 | |||
| 961 | static int decode_modrm(struct x86_emulate_ctxt *ctxt, | 988 | static int decode_modrm(struct x86_emulate_ctxt *ctxt, |
| 962 | struct operand *op) | 989 | struct operand *op) |
| 963 | { | 990 | { |
| @@ -1061,15 +1088,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
| 1061 | 1088 | ||
| 1062 | if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) | 1089 | if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) |
| 1063 | modrm_ea += insn_fetch(s32, ctxt); | 1090 | modrm_ea += insn_fetch(s32, ctxt); |
| 1064 | else | 1091 | else { |
| 1065 | modrm_ea += ctxt->regs[base_reg]; | 1092 | modrm_ea += ctxt->regs[base_reg]; |
| 1093 | adjust_modrm_seg(ctxt, base_reg); | ||
| 1094 | } | ||
| 1066 | if (index_reg != 4) | 1095 | if (index_reg != 4) |
| 1067 | modrm_ea += ctxt->regs[index_reg] << scale; | 1096 | modrm_ea += ctxt->regs[index_reg] << scale; |
| 1068 | } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { | 1097 | } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { |
| 1069 | if (ctxt->mode == X86EMUL_MODE_PROT64) | 1098 | if (ctxt->mode == X86EMUL_MODE_PROT64) |
| 1070 | ctxt->rip_relative = 1; | 1099 | ctxt->rip_relative = 1; |
| 1071 | } else | 1100 | } else { |
| 1072 | modrm_ea += ctxt->regs[ctxt->modrm_rm]; | 1101 | base_reg = ctxt->modrm_rm; |
| 1102 | modrm_ea += ctxt->regs[base_reg]; | ||
| 1103 | adjust_modrm_seg(ctxt, base_reg); | ||
| 1104 | } | ||
| 1073 | switch (ctxt->modrm_mod) { | 1105 | switch (ctxt->modrm_mod) { |
| 1074 | case 0: | 1106 | case 0: |
| 1075 | if (ctxt->modrm_rm == 5) | 1107 | if (ctxt->modrm_rm == 5) |
| @@ -1264,7 +1296,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, | |||
| 1264 | 1296 | ||
| 1265 | /* allowed just for 8 bytes segments */ | 1297 | /* allowed just for 8 bytes segments */ |
| 1266 | static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | 1298 | static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, |
| 1267 | u16 selector, struct desc_struct *desc) | 1299 | u16 selector, struct desc_struct *desc, |
| 1300 | ulong *desc_addr_p) | ||
| 1268 | { | 1301 | { |
| 1269 | struct desc_ptr dt; | 1302 | struct desc_ptr dt; |
| 1270 | u16 index = selector >> 3; | 1303 | u16 index = selector >> 3; |
| @@ -1275,7 +1308,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
| 1275 | if (dt.size < index * 8 + 7) | 1308 | if (dt.size < index * 8 + 7) |
| 1276 | return emulate_gp(ctxt, selector & 0xfffc); | 1309 | return emulate_gp(ctxt, selector & 0xfffc); |
| 1277 | 1310 | ||
| 1278 | addr = dt.address + index * 8; | 1311 | *desc_addr_p = addr = dt.address + index * 8; |
| 1279 | return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, | 1312 | return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, |
| 1280 | &ctxt->exception); | 1313 | &ctxt->exception); |
| 1281 | } | 1314 | } |
| @@ -1302,11 +1335,12 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
| 1302 | static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | 1335 | static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, |
| 1303 | u16 selector, int seg) | 1336 | u16 selector, int seg) |
| 1304 | { | 1337 | { |
| 1305 | struct desc_struct seg_desc; | 1338 | struct desc_struct seg_desc, old_desc; |
| 1306 | u8 dpl, rpl, cpl; | 1339 | u8 dpl, rpl, cpl; |
| 1307 | unsigned err_vec = GP_VECTOR; | 1340 | unsigned err_vec = GP_VECTOR; |
| 1308 | u32 err_code = 0; | 1341 | u32 err_code = 0; |
| 1309 | bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ | 1342 | bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ |
| 1343 | ulong desc_addr; | ||
| 1310 | int ret; | 1344 | int ret; |
| 1311 | 1345 | ||
| 1312 | memset(&seg_desc, 0, sizeof seg_desc); | 1346 | memset(&seg_desc, 0, sizeof seg_desc); |
| @@ -1324,8 +1358,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
| 1324 | goto load; | 1358 | goto load; |
| 1325 | } | 1359 | } |
| 1326 | 1360 | ||
| 1327 | /* NULL selector is not valid for TR, CS and SS */ | 1361 | rpl = selector & 3; |
| 1328 | if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) | 1362 | cpl = ctxt->ops->cpl(ctxt); |
| 1363 | |||
| 1364 | /* NULL selector is not valid for TR, CS and SS (except for long mode) */ | ||
| 1365 | if ((seg == VCPU_SREG_CS | ||
| 1366 | || (seg == VCPU_SREG_SS | ||
| 1367 | && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) | ||
| 1368 | || seg == VCPU_SREG_TR) | ||
| 1329 | && null_selector) | 1369 | && null_selector) |
| 1330 | goto exception; | 1370 | goto exception; |
| 1331 | 1371 | ||
| @@ -1336,7 +1376,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
| 1336 | if (null_selector) /* for NULL selector skip all following checks */ | 1376 | if (null_selector) /* for NULL selector skip all following checks */ |
| 1337 | goto load; | 1377 | goto load; |
| 1338 | 1378 | ||
| 1339 | ret = read_segment_descriptor(ctxt, selector, &seg_desc); | 1379 | ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); |
| 1340 | if (ret != X86EMUL_CONTINUE) | 1380 | if (ret != X86EMUL_CONTINUE) |
| 1341 | return ret; | 1381 | return ret; |
| 1342 | 1382 | ||
| @@ -1352,9 +1392,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
| 1352 | goto exception; | 1392 | goto exception; |
| 1353 | } | 1393 | } |
| 1354 | 1394 | ||
| 1355 | rpl = selector & 3; | ||
| 1356 | dpl = seg_desc.dpl; | 1395 | dpl = seg_desc.dpl; |
| 1357 | cpl = ctxt->ops->cpl(ctxt); | ||
| 1358 | 1396 | ||
| 1359 | switch (seg) { | 1397 | switch (seg) { |
| 1360 | case VCPU_SREG_SS: | 1398 | case VCPU_SREG_SS: |
| @@ -1384,6 +1422,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
| 1384 | case VCPU_SREG_TR: | 1422 | case VCPU_SREG_TR: |
| 1385 | if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) | 1423 | if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) |
| 1386 | goto exception; | 1424 | goto exception; |
| 1425 | old_desc = seg_desc; | ||
| 1426 | seg_desc.type |= 2; /* busy */ | ||
| 1427 | ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, | ||
| 1428 | sizeof(seg_desc), &ctxt->exception); | ||
| 1429 | if (ret != X86EMUL_CONTINUE) | ||
| 1430 | return ret; | ||
| 1387 | break; | 1431 | break; |
| 1388 | case VCPU_SREG_LDTR: | 1432 | case VCPU_SREG_LDTR: |
| 1389 | if (seg_desc.s || seg_desc.type != 2) | 1433 | if (seg_desc.s || seg_desc.type != 2) |
| @@ -1474,17 +1518,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt) | |||
| 1474 | return X86EMUL_CONTINUE; | 1518 | return X86EMUL_CONTINUE; |
| 1475 | } | 1519 | } |
| 1476 | 1520 | ||
| 1477 | static int em_push(struct x86_emulate_ctxt *ctxt) | 1521 | static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) |
| 1478 | { | 1522 | { |
| 1479 | struct segmented_address addr; | 1523 | struct segmented_address addr; |
| 1480 | 1524 | ||
| 1481 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); | 1525 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes); |
| 1482 | addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); | 1526 | addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); |
| 1483 | addr.seg = VCPU_SREG_SS; | 1527 | addr.seg = VCPU_SREG_SS; |
| 1484 | 1528 | ||
| 1529 | return segmented_write(ctxt, addr, data, bytes); | ||
| 1530 | } | ||
| 1531 | |||
| 1532 | static int em_push(struct x86_emulate_ctxt *ctxt) | ||
| 1533 | { | ||
| 1485 | /* Disable writeback. */ | 1534 | /* Disable writeback. */ |
| 1486 | ctxt->dst.type = OP_NONE; | 1535 | ctxt->dst.type = OP_NONE; |
| 1487 | return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); | 1536 | return push(ctxt, &ctxt->src.val, ctxt->op_bytes); |
| 1488 | } | 1537 | } |
| 1489 | 1538 | ||
| 1490 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, | 1539 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, |
| @@ -1556,6 +1605,33 @@ static int em_popf(struct x86_emulate_ctxt *ctxt) | |||
| 1556 | return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); | 1605 | return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); |
| 1557 | } | 1606 | } |
| 1558 | 1607 | ||
| 1608 | static int em_enter(struct x86_emulate_ctxt *ctxt) | ||
| 1609 | { | ||
| 1610 | int rc; | ||
| 1611 | unsigned frame_size = ctxt->src.val; | ||
| 1612 | unsigned nesting_level = ctxt->src2.val & 31; | ||
| 1613 | |||
| 1614 | if (nesting_level) | ||
| 1615 | return X86EMUL_UNHANDLEABLE; | ||
| 1616 | |||
| 1617 | rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt)); | ||
| 1618 | if (rc != X86EMUL_CONTINUE) | ||
| 1619 | return rc; | ||
| 1620 | assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP], | ||
| 1621 | stack_mask(ctxt)); | ||
| 1622 | assign_masked(&ctxt->regs[VCPU_REGS_RSP], | ||
| 1623 | ctxt->regs[VCPU_REGS_RSP] - frame_size, | ||
| 1624 | stack_mask(ctxt)); | ||
| 1625 | return X86EMUL_CONTINUE; | ||
| 1626 | } | ||
| 1627 | |||
| 1628 | static int em_leave(struct x86_emulate_ctxt *ctxt) | ||
| 1629 | { | ||
| 1630 | assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP], | ||
| 1631 | stack_mask(ctxt)); | ||
| 1632 | return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes); | ||
| 1633 | } | ||
| 1634 | |||
| 1559 | static int em_push_sreg(struct x86_emulate_ctxt *ctxt) | 1635 | static int em_push_sreg(struct x86_emulate_ctxt *ctxt) |
| 1560 | { | 1636 | { |
| 1561 | int seg = ctxt->src2.val; | 1637 | int seg = ctxt->src2.val; |
| @@ -1993,8 +2069,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt) | |||
| 1993 | u32 eax, ebx, ecx, edx; | 2069 | u32 eax, ebx, ecx, edx; |
| 1994 | 2070 | ||
| 1995 | eax = ecx = 0; | 2071 | eax = ecx = 0; |
| 1996 | return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) | 2072 | ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); |
| 1997 | && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx | 2073 | return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx |
| 1998 | && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx | 2074 | && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx |
| 1999 | && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; | 2075 | && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; |
| 2000 | } | 2076 | } |
| @@ -2013,32 +2089,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) | |||
| 2013 | 2089 | ||
| 2014 | eax = 0x00000000; | 2090 | eax = 0x00000000; |
| 2015 | ecx = 0x00000000; | 2091 | ecx = 0x00000000; |
| 2016 | if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { | 2092 | ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); |
| 2017 | /* | 2093 | /* |
| 2018 | * Intel ("GenuineIntel") | 2094 | * Intel ("GenuineIntel") |
| 2019 | * remark: Intel CPUs only support "syscall" in 64bit | 2095 | * remark: Intel CPUs only support "syscall" in 64bit |
| 2020 | * longmode. Also an 64bit guest with a | 2096 | * longmode. Also an 64bit guest with a |
| 2021 | * 32bit compat-app running will #UD !! While this | 2097 | * 32bit compat-app running will #UD !! While this |
| 2022 | * behaviour can be fixed (by emulating) into AMD | 2098 | * behaviour can be fixed (by emulating) into AMD |
| 2023 | * response - CPUs of AMD can't behave like Intel. | 2099 | * response - CPUs of AMD can't behave like Intel. |
| 2024 | */ | 2100 | */ |
| 2025 | if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && | 2101 | if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && |
| 2026 | ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && | 2102 | ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && |
| 2027 | edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) | 2103 | edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) |
| 2028 | return false; | 2104 | return false; |
| 2029 | 2105 | ||
| 2030 | /* AMD ("AuthenticAMD") */ | 2106 | /* AMD ("AuthenticAMD") */ |
| 2031 | if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && | 2107 | if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && |
| 2032 | ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && | 2108 | ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && |
| 2033 | edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) | 2109 | edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) |
| 2034 | return true; | 2110 | return true; |
| 2035 | 2111 | ||
| 2036 | /* AMD ("AMDisbetter!") */ | 2112 | /* AMD ("AMDisbetter!") */ |
| 2037 | if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && | 2113 | if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && |
| 2038 | ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && | 2114 | ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && |
| 2039 | edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) | 2115 | edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) |
| 2040 | return true; | 2116 | return true; |
| 2041 | } | ||
| 2042 | 2117 | ||
| 2043 | /* default: (not Intel, not AMD), apply Intel's stricter rules... */ | 2118 | /* default: (not Intel, not AMD), apply Intel's stricter rules... */ |
| 2044 | return false; | 2119 | return false; |
| @@ -2547,13 +2622,14 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
| 2547 | ulong old_tss_base = | 2622 | ulong old_tss_base = |
| 2548 | ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); | 2623 | ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); |
| 2549 | u32 desc_limit; | 2624 | u32 desc_limit; |
| 2625 | ulong desc_addr; | ||
| 2550 | 2626 | ||
| 2551 | /* FIXME: old_tss_base == ~0 ? */ | 2627 | /* FIXME: old_tss_base == ~0 ? */ |
| 2552 | 2628 | ||
| 2553 | ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); | 2629 | ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr); |
| 2554 | if (ret != X86EMUL_CONTINUE) | 2630 | if (ret != X86EMUL_CONTINUE) |
| 2555 | return ret; | 2631 | return ret; |
| 2556 | ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); | 2632 | ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr); |
| 2557 | if (ret != X86EMUL_CONTINUE) | 2633 | if (ret != X86EMUL_CONTINUE) |
| 2558 | return ret; | 2634 | return ret; |
| 2559 | 2635 | ||
| @@ -2948,6 +3024,24 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) | |||
| 2948 | return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); | 3024 | return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); |
| 2949 | } | 3025 | } |
| 2950 | 3026 | ||
| 3027 | static int em_lldt(struct x86_emulate_ctxt *ctxt) | ||
| 3028 | { | ||
| 3029 | u16 sel = ctxt->src.val; | ||
| 3030 | |||
| 3031 | /* Disable writeback. */ | ||
| 3032 | ctxt->dst.type = OP_NONE; | ||
| 3033 | return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR); | ||
| 3034 | } | ||
| 3035 | |||
| 3036 | static int em_ltr(struct x86_emulate_ctxt *ctxt) | ||
| 3037 | { | ||
| 3038 | u16 sel = ctxt->src.val; | ||
| 3039 | |||
| 3040 | /* Disable writeback. */ | ||
| 3041 | ctxt->dst.type = OP_NONE; | ||
| 3042 | return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR); | ||
| 3043 | } | ||
| 3044 | |||
| 2951 | static int em_invlpg(struct x86_emulate_ctxt *ctxt) | 3045 | static int em_invlpg(struct x86_emulate_ctxt *ctxt) |
| 2952 | { | 3046 | { |
| 2953 | int rc; | 3047 | int rc; |
| @@ -2989,11 +3083,42 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt) | |||
| 2989 | return X86EMUL_CONTINUE; | 3083 | return X86EMUL_CONTINUE; |
| 2990 | } | 3084 | } |
| 2991 | 3085 | ||
| 3086 | static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, | ||
| 3087 | void (*get)(struct x86_emulate_ctxt *ctxt, | ||
| 3088 | struct desc_ptr *ptr)) | ||
| 3089 | { | ||
| 3090 | struct desc_ptr desc_ptr; | ||
| 3091 | |||
| 3092 | if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
| 3093 | ctxt->op_bytes = 8; | ||
| 3094 | get(ctxt, &desc_ptr); | ||
| 3095 | if (ctxt->op_bytes == 2) { | ||
| 3096 | ctxt->op_bytes = 4; | ||
| 3097 | desc_ptr.address &= 0x00ffffff; | ||
| 3098 | } | ||
| 3099 | /* Disable writeback. */ | ||
| 3100 | ctxt->dst.type = OP_NONE; | ||
| 3101 | return segmented_write(ctxt, ctxt->dst.addr.mem, | ||
| 3102 | &desc_ptr, 2 + ctxt->op_bytes); | ||
| 3103 | } | ||
| 3104 | |||
| 3105 | static int em_sgdt(struct x86_emulate_ctxt *ctxt) | ||
| 3106 | { | ||
| 3107 | return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt); | ||
| 3108 | } | ||
| 3109 | |||
| 3110 | static int em_sidt(struct x86_emulate_ctxt *ctxt) | ||
| 3111 | { | ||
| 3112 | return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt); | ||
| 3113 | } | ||
| 3114 | |||
| 2992 | static int em_lgdt(struct x86_emulate_ctxt *ctxt) | 3115 | static int em_lgdt(struct x86_emulate_ctxt *ctxt) |
| 2993 | { | 3116 | { |
| 2994 | struct desc_ptr desc_ptr; | 3117 | struct desc_ptr desc_ptr; |
| 2995 | int rc; | 3118 | int rc; |
| 2996 | 3119 | ||
| 3120 | if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
| 3121 | ctxt->op_bytes = 8; | ||
| 2997 | rc = read_descriptor(ctxt, ctxt->src.addr.mem, | 3122 | rc = read_descriptor(ctxt, ctxt->src.addr.mem, |
| 2998 | &desc_ptr.size, &desc_ptr.address, | 3123 | &desc_ptr.size, &desc_ptr.address, |
| 2999 | ctxt->op_bytes); | 3124 | ctxt->op_bytes); |
| @@ -3021,6 +3146,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt) | |||
| 3021 | struct desc_ptr desc_ptr; | 3146 | struct desc_ptr desc_ptr; |
| 3022 | int rc; | 3147 | int rc; |
| 3023 | 3148 | ||
| 3149 | if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
| 3150 | ctxt->op_bytes = 8; | ||
| 3024 | rc = read_descriptor(ctxt, ctxt->src.addr.mem, | 3151 | rc = read_descriptor(ctxt, ctxt->src.addr.mem, |
| 3025 | &desc_ptr.size, &desc_ptr.address, | 3152 | &desc_ptr.size, &desc_ptr.address, |
| 3026 | ctxt->op_bytes); | 3153 | ctxt->op_bytes); |
| @@ -3143,6 +3270,42 @@ static int em_bsr(struct x86_emulate_ctxt *ctxt) | |||
| 3143 | return X86EMUL_CONTINUE; | 3270 | return X86EMUL_CONTINUE; |
| 3144 | } | 3271 | } |
| 3145 | 3272 | ||
| 3273 | static int em_cpuid(struct x86_emulate_ctxt *ctxt) | ||
| 3274 | { | ||
| 3275 | u32 eax, ebx, ecx, edx; | ||
| 3276 | |||
| 3277 | eax = ctxt->regs[VCPU_REGS_RAX]; | ||
| 3278 | ecx = ctxt->regs[VCPU_REGS_RCX]; | ||
| 3279 | ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); | ||
| 3280 | ctxt->regs[VCPU_REGS_RAX] = eax; | ||
| 3281 | ctxt->regs[VCPU_REGS_RBX] = ebx; | ||
| 3282 | ctxt->regs[VCPU_REGS_RCX] = ecx; | ||
| 3283 | ctxt->regs[VCPU_REGS_RDX] = edx; | ||
| 3284 | return X86EMUL_CONTINUE; | ||
| 3285 | } | ||
| 3286 | |||
| 3287 | static int em_lahf(struct x86_emulate_ctxt *ctxt) | ||
| 3288 | { | ||
| 3289 | ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL; | ||
| 3290 | ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8; | ||
| 3291 | return X86EMUL_CONTINUE; | ||
| 3292 | } | ||
| 3293 | |||
| 3294 | static int em_bswap(struct x86_emulate_ctxt *ctxt) | ||
| 3295 | { | ||
| 3296 | switch (ctxt->op_bytes) { | ||
| 3297 | #ifdef CONFIG_X86_64 | ||
| 3298 | case 8: | ||
| 3299 | asm("bswap %0" : "+r"(ctxt->dst.val)); | ||
| 3300 | break; | ||
| 3301 | #endif | ||
| 3302 | default: | ||
| 3303 | asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val)); | ||
| 3304 | break; | ||
| 3305 | } | ||
| 3306 | return X86EMUL_CONTINUE; | ||
| 3307 | } | ||
| 3308 | |||
| 3146 | static bool valid_cr(int nr) | 3309 | static bool valid_cr(int nr) |
| 3147 | { | 3310 | { |
| 3148 | switch (nr) { | 3311 | switch (nr) { |
| @@ -3424,14 +3587,14 @@ static struct opcode group5[] = { | |||
| 3424 | static struct opcode group6[] = { | 3587 | static struct opcode group6[] = { |
| 3425 | DI(Prot, sldt), | 3588 | DI(Prot, sldt), |
| 3426 | DI(Prot, str), | 3589 | DI(Prot, str), |
| 3427 | DI(Prot | Priv, lldt), | 3590 | II(Prot | Priv | SrcMem16, em_lldt, lldt), |
| 3428 | DI(Prot | Priv, ltr), | 3591 | II(Prot | Priv | SrcMem16, em_ltr, ltr), |
| 3429 | N, N, N, N, | 3592 | N, N, N, N, |
| 3430 | }; | 3593 | }; |
| 3431 | 3594 | ||
| 3432 | static struct group_dual group7 = { { | 3595 | static struct group_dual group7 = { { |
| 3433 | DI(Mov | DstMem | Priv, sgdt), | 3596 | II(Mov | DstMem | Priv, em_sgdt, sgdt), |
| 3434 | DI(Mov | DstMem | Priv, sidt), | 3597 | II(Mov | DstMem | Priv, em_sidt, sidt), |
| 3435 | II(SrcMem | Priv, em_lgdt, lgdt), | 3598 | II(SrcMem | Priv, em_lgdt, lgdt), |
| 3436 | II(SrcMem | Priv, em_lidt, lidt), | 3599 | II(SrcMem | Priv, em_lidt, lidt), |
| 3437 | II(SrcNone | DstMem | Mov, em_smsw, smsw), N, | 3600 | II(SrcNone | DstMem | Mov, em_smsw, smsw), N, |
| @@ -3538,7 +3701,7 @@ static struct opcode opcode_table[256] = { | |||
| 3538 | D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), | 3701 | D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), |
| 3539 | I(SrcImmFAddr | No64, em_call_far), N, | 3702 | I(SrcImmFAddr | No64, em_call_far), N, |
| 3540 | II(ImplicitOps | Stack, em_pushf, pushf), | 3703 | II(ImplicitOps | Stack, em_pushf, pushf), |
| 3541 | II(ImplicitOps | Stack, em_popf, popf), N, N, | 3704 | II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf), |
| 3542 | /* 0xA0 - 0xA7 */ | 3705 | /* 0xA0 - 0xA7 */ |
| 3543 | I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), | 3706 | I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), |
| 3544 | I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), | 3707 | I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), |
| @@ -3561,7 +3724,8 @@ static struct opcode opcode_table[256] = { | |||
| 3561 | I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), | 3724 | I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), |
| 3562 | G(ByteOp, group11), G(0, group11), | 3725 | G(ByteOp, group11), G(0, group11), |
| 3563 | /* 0xC8 - 0xCF */ | 3726 | /* 0xC8 - 0xCF */ |
| 3564 | N, N, N, I(ImplicitOps | Stack, em_ret_far), | 3727 | I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), |
| 3728 | N, I(ImplicitOps | Stack, em_ret_far), | ||
| 3565 | D(ImplicitOps), DI(SrcImmByte, intn), | 3729 | D(ImplicitOps), DI(SrcImmByte, intn), |
| 3566 | D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), | 3730 | D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), |
| 3567 | /* 0xD0 - 0xD7 */ | 3731 | /* 0xD0 - 0xD7 */ |
| @@ -3635,7 +3799,7 @@ static struct opcode twobyte_table[256] = { | |||
| 3635 | X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), | 3799 | X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), |
| 3636 | /* 0xA0 - 0xA7 */ | 3800 | /* 0xA0 - 0xA7 */ |
| 3637 | I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), | 3801 | I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), |
| 3638 | DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), | 3802 | II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), |
| 3639 | D(DstMem | SrcReg | Src2ImmByte | ModRM), | 3803 | D(DstMem | SrcReg | Src2ImmByte | ModRM), |
| 3640 | D(DstMem | SrcReg | Src2CL | ModRM), N, N, | 3804 | D(DstMem | SrcReg | Src2CL | ModRM), N, N, |
| 3641 | /* 0xA8 - 0xAF */ | 3805 | /* 0xA8 - 0xAF */ |
| @@ -3658,11 +3822,12 @@ static struct opcode twobyte_table[256] = { | |||
| 3658 | I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), | 3822 | I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), |
| 3659 | I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), | 3823 | I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), |
| 3660 | D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), | 3824 | D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), |
| 3661 | /* 0xC0 - 0xCF */ | 3825 | /* 0xC0 - 0xC7 */ |
| 3662 | D2bv(DstMem | SrcReg | ModRM | Lock), | 3826 | D2bv(DstMem | SrcReg | ModRM | Lock), |
| 3663 | N, D(DstMem | SrcReg | ModRM | Mov), | 3827 | N, D(DstMem | SrcReg | ModRM | Mov), |
| 3664 | N, N, N, GD(0, &group9), | 3828 | N, N, N, GD(0, &group9), |
| 3665 | N, N, N, N, N, N, N, N, | 3829 | /* 0xC8 - 0xCF */ |
| 3830 | X8(I(DstReg, em_bswap)), | ||
| 3666 | /* 0xD0 - 0xDF */ | 3831 | /* 0xD0 - 0xDF */ |
| 3667 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, | 3832 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, |
| 3668 | /* 0xE0 - 0xEF */ | 3833 | /* 0xE0 - 0xEF */ |
| @@ -4426,12 +4591,12 @@ twobyte_insn: | |||
| 4426 | break; | 4591 | break; |
| 4427 | case 0xb6 ... 0xb7: /* movzx */ | 4592 | case 0xb6 ... 0xb7: /* movzx */ |
| 4428 | ctxt->dst.bytes = ctxt->op_bytes; | 4593 | ctxt->dst.bytes = ctxt->op_bytes; |
| 4429 | ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val | 4594 | ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val |
| 4430 | : (u16) ctxt->src.val; | 4595 | : (u16) ctxt->src.val; |
| 4431 | break; | 4596 | break; |
| 4432 | case 0xbe ... 0xbf: /* movsx */ | 4597 | case 0xbe ... 0xbf: /* movsx */ |
| 4433 | ctxt->dst.bytes = ctxt->op_bytes; | 4598 | ctxt->dst.bytes = ctxt->op_bytes; |
| 4434 | ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : | 4599 | ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : |
| 4435 | (s16) ctxt->src.val; | 4600 | (s16) ctxt->src.val; |
| 4436 | break; | 4601 | break; |
| 4437 | case 0xc0 ... 0xc1: /* xadd */ | 4602 | case 0xc0 ... 0xc1: /* xadd */ |
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 81cf4fa4a2be..1df8fb9e1d5d 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
| @@ -188,14 +188,15 @@ void kvm_pic_update_irq(struct kvm_pic *s) | |||
| 188 | pic_unlock(s); | 188 | pic_unlock(s); |
| 189 | } | 189 | } |
| 190 | 190 | ||
| 191 | int kvm_pic_set_irq(void *opaque, int irq, int level) | 191 | int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) |
| 192 | { | 192 | { |
| 193 | struct kvm_pic *s = opaque; | ||
| 194 | int ret = -1; | 193 | int ret = -1; |
| 195 | 194 | ||
| 196 | pic_lock(s); | 195 | pic_lock(s); |
| 197 | if (irq >= 0 && irq < PIC_NUM_PINS) { | 196 | if (irq >= 0 && irq < PIC_NUM_PINS) { |
| 198 | ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); | 197 | int irq_level = __kvm_irq_line_state(&s->irq_states[irq], |
| 198 | irq_source_id, level); | ||
| 199 | ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); | ||
| 199 | pic_update_irq(s); | 200 | pic_update_irq(s); |
| 200 | trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, | 201 | trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, |
| 201 | s->pics[irq >> 3].imr, ret == 0); | 202 | s->pics[irq >> 3].imr, ret == 0); |
| @@ -205,6 +206,16 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) | |||
| 205 | return ret; | 206 | return ret; |
| 206 | } | 207 | } |
| 207 | 208 | ||
| 209 | void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) | ||
| 210 | { | ||
| 211 | int i; | ||
| 212 | |||
| 213 | pic_lock(s); | ||
| 214 | for (i = 0; i < PIC_NUM_PINS; i++) | ||
| 215 | __clear_bit(irq_source_id, &s->irq_states[i]); | ||
| 216 | pic_unlock(s); | ||
| 217 | } | ||
| 218 | |||
| 208 | /* | 219 | /* |
| 209 | * acknowledge interrupt 'irq' | 220 | * acknowledge interrupt 'irq' |
| 210 | */ | 221 | */ |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93c15743f1ee..ce878788a39f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
| @@ -107,6 +107,16 @@ static inline void apic_clear_vector(int vec, void *bitmap) | |||
| 107 | clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | 107 | clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); |
| 108 | } | 108 | } |
| 109 | 109 | ||
| 110 | static inline int __apic_test_and_set_vector(int vec, void *bitmap) | ||
| 111 | { | ||
| 112 | return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
| 113 | } | ||
| 114 | |||
| 115 | static inline int __apic_test_and_clear_vector(int vec, void *bitmap) | ||
| 116 | { | ||
| 117 | return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
| 118 | } | ||
| 119 | |||
| 110 | static inline int apic_hw_enabled(struct kvm_lapic *apic) | 120 | static inline int apic_hw_enabled(struct kvm_lapic *apic) |
| 111 | { | 121 | { |
| 112 | return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; | 122 | return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; |
| @@ -210,6 +220,16 @@ static int find_highest_vector(void *bitmap) | |||
| 210 | return fls(word[word_offset << 2]) - 1 + (word_offset << 5); | 220 | return fls(word[word_offset << 2]) - 1 + (word_offset << 5); |
| 211 | } | 221 | } |
| 212 | 222 | ||
| 223 | static u8 count_vectors(void *bitmap) | ||
| 224 | { | ||
| 225 | u32 *word = bitmap; | ||
| 226 | int word_offset; | ||
| 227 | u8 count = 0; | ||
| 228 | for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) | ||
| 229 | count += hweight32(word[word_offset << 2]); | ||
| 230 | return count; | ||
| 231 | } | ||
| 232 | |||
| 213 | static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) | 233 | static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) |
| 214 | { | 234 | { |
| 215 | apic->irr_pending = true; | 235 | apic->irr_pending = true; |
| @@ -242,6 +262,27 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) | |||
| 242 | apic->irr_pending = true; | 262 | apic->irr_pending = true; |
| 243 | } | 263 | } |
| 244 | 264 | ||
| 265 | static inline void apic_set_isr(int vec, struct kvm_lapic *apic) | ||
| 266 | { | ||
| 267 | if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) | ||
| 268 | ++apic->isr_count; | ||
| 269 | BUG_ON(apic->isr_count > MAX_APIC_VECTOR); | ||
| 270 | /* | ||
| 271 | * ISR (in service register) bit is set when injecting an interrupt. | ||
| 272 | * The highest vector is injected. Thus the latest bit set matches | ||
| 273 | * the highest bit in ISR. | ||
| 274 | */ | ||
| 275 | apic->highest_isr_cache = vec; | ||
| 276 | } | ||
| 277 | |||
| 278 | static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) | ||
| 279 | { | ||
| 280 | if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) | ||
| 281 | --apic->isr_count; | ||
| 282 | BUG_ON(apic->isr_count < 0); | ||
| 283 | apic->highest_isr_cache = -1; | ||
| 284 | } | ||
| 285 | |||
| 245 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) | 286 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) |
| 246 | { | 287 | { |
| 247 | struct kvm_lapic *apic = vcpu->arch.apic; | 288 | struct kvm_lapic *apic = vcpu->arch.apic; |
| @@ -270,9 +311,61 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) | |||
| 270 | irq->level, irq->trig_mode); | 311 | irq->level, irq->trig_mode); |
| 271 | } | 312 | } |
| 272 | 313 | ||
| 314 | static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) | ||
| 315 | { | ||
| 316 | |||
| 317 | return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, | ||
| 318 | sizeof(val)); | ||
| 319 | } | ||
| 320 | |||
| 321 | static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) | ||
| 322 | { | ||
| 323 | |||
| 324 | return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, | ||
| 325 | sizeof(*val)); | ||
| 326 | } | ||
| 327 | |||
| 328 | static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) | ||
| 329 | { | ||
| 330 | return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; | ||
| 331 | } | ||
| 332 | |||
| 333 | static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) | ||
| 334 | { | ||
| 335 | u8 val; | ||
| 336 | if (pv_eoi_get_user(vcpu, &val) < 0) | ||
| 337 | apic_debug("Can't read EOI MSR value: 0x%llx\n", | ||
| 338 | (unsigned long long)vcpu->arch.pv_eoi.msr_val); | ||
| 339 | return val & 0x1; | ||
| 340 | } | ||
| 341 | |||
| 342 | static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) | ||
| 343 | { | ||
| 344 | if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { | ||
| 345 | apic_debug("Can't set EOI MSR value: 0x%llx\n", | ||
| 346 | (unsigned long long)vcpu->arch.pv_eoi.msr_val); | ||
| 347 | return; | ||
| 348 | } | ||
| 349 | __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); | ||
| 350 | } | ||
| 351 | |||
| 352 | static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) | ||
| 353 | { | ||
| 354 | if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { | ||
| 355 | apic_debug("Can't clear EOI MSR value: 0x%llx\n", | ||
| 356 | (unsigned long long)vcpu->arch.pv_eoi.msr_val); | ||
| 357 | return; | ||
| 358 | } | ||
| 359 | __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); | ||
| 360 | } | ||
| 361 | |||
| 273 | static inline int apic_find_highest_isr(struct kvm_lapic *apic) | 362 | static inline int apic_find_highest_isr(struct kvm_lapic *apic) |
| 274 | { | 363 | { |
| 275 | int result; | 364 | int result; |
| 365 | if (!apic->isr_count) | ||
| 366 | return -1; | ||
| 367 | if (likely(apic->highest_isr_cache != -1)) | ||
| 368 | return apic->highest_isr_cache; | ||
| 276 | 369 | ||
| 277 | result = find_highest_vector(apic->regs + APIC_ISR); | 370 | result = find_highest_vector(apic->regs + APIC_ISR); |
| 278 | ASSERT(result == -1 || result >= 16); | 371 | ASSERT(result == -1 || result >= 16); |
| @@ -482,17 +575,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) | |||
| 482 | return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; | 575 | return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; |
| 483 | } | 576 | } |
| 484 | 577 | ||
| 485 | static void apic_set_eoi(struct kvm_lapic *apic) | 578 | static int apic_set_eoi(struct kvm_lapic *apic) |
| 486 | { | 579 | { |
| 487 | int vector = apic_find_highest_isr(apic); | 580 | int vector = apic_find_highest_isr(apic); |
| 581 | |||
| 582 | trace_kvm_eoi(apic, vector); | ||
| 583 | |||
| 488 | /* | 584 | /* |
| 489 | * Not every write EOI will has corresponding ISR, | 585 | * Not every write EOI will has corresponding ISR, |
| 490 | * one example is when Kernel check timer on setup_IO_APIC | 586 | * one example is when Kernel check timer on setup_IO_APIC |
| 491 | */ | 587 | */ |
| 492 | if (vector == -1) | 588 | if (vector == -1) |
| 493 | return; | 589 | return vector; |
| 494 | 590 | ||
| 495 | apic_clear_vector(vector, apic->regs + APIC_ISR); | 591 | apic_clear_isr(vector, apic); |
| 496 | apic_update_ppr(apic); | 592 | apic_update_ppr(apic); |
| 497 | 593 | ||
| 498 | if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && | 594 | if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && |
| @@ -505,6 +601,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) | |||
| 505 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); | 601 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); |
| 506 | } | 602 | } |
| 507 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); | 603 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); |
| 604 | return vector; | ||
| 508 | } | 605 | } |
| 509 | 606 | ||
| 510 | static void apic_send_ipi(struct kvm_lapic *apic) | 607 | static void apic_send_ipi(struct kvm_lapic *apic) |
| @@ -1081,10 +1178,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
| 1081 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); | 1178 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); |
| 1082 | } | 1179 | } |
| 1083 | apic->irr_pending = false; | 1180 | apic->irr_pending = false; |
| 1181 | apic->isr_count = 0; | ||
| 1182 | apic->highest_isr_cache = -1; | ||
| 1084 | update_divide_count(apic); | 1183 | update_divide_count(apic); |
| 1085 | atomic_set(&apic->lapic_timer.pending, 0); | 1184 | atomic_set(&apic->lapic_timer.pending, 0); |
| 1086 | if (kvm_vcpu_is_bsp(vcpu)) | 1185 | if (kvm_vcpu_is_bsp(vcpu)) |
| 1087 | vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; | 1186 | vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; |
| 1187 | vcpu->arch.pv_eoi.msr_val = 0; | ||
| 1088 | apic_update_ppr(apic); | 1188 | apic_update_ppr(apic); |
| 1089 | 1189 | ||
| 1090 | vcpu->arch.apic_arb_prio = 0; | 1190 | vcpu->arch.apic_arb_prio = 0; |
| @@ -1248,7 +1348,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) | |||
| 1248 | if (vector == -1) | 1348 | if (vector == -1) |
| 1249 | return -1; | 1349 | return -1; |
| 1250 | 1350 | ||
| 1251 | apic_set_vector(vector, apic->regs + APIC_ISR); | 1351 | apic_set_isr(vector, apic); |
| 1252 | apic_update_ppr(apic); | 1352 | apic_update_ppr(apic); |
| 1253 | apic_clear_irr(vector, apic); | 1353 | apic_clear_irr(vector, apic); |
| 1254 | return vector; | 1354 | return vector; |
| @@ -1267,6 +1367,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) | |||
| 1267 | update_divide_count(apic); | 1367 | update_divide_count(apic); |
| 1268 | start_apic_timer(apic); | 1368 | start_apic_timer(apic); |
| 1269 | apic->irr_pending = true; | 1369 | apic->irr_pending = true; |
| 1370 | apic->isr_count = count_vectors(apic->regs + APIC_ISR); | ||
| 1371 | apic->highest_isr_cache = -1; | ||
| 1270 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 1372 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
| 1271 | } | 1373 | } |
| 1272 | 1374 | ||
| @@ -1283,11 +1385,51 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) | |||
| 1283 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); | 1385 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); |
| 1284 | } | 1386 | } |
| 1285 | 1387 | ||
| 1388 | /* | ||
| 1389 | * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt | ||
| 1390 | * | ||
| 1391 | * Detect whether guest triggered PV EOI since the | ||
| 1392 | * last entry. If yes, set EOI on guests's behalf. | ||
| 1393 | * Clear PV EOI in guest memory in any case. | ||
| 1394 | */ | ||
| 1395 | static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, | ||
| 1396 | struct kvm_lapic *apic) | ||
| 1397 | { | ||
| 1398 | bool pending; | ||
| 1399 | int vector; | ||
| 1400 | /* | ||
| 1401 | * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host | ||
| 1402 | * and KVM_PV_EOI_ENABLED in guest memory as follows: | ||
| 1403 | * | ||
| 1404 | * KVM_APIC_PV_EOI_PENDING is unset: | ||
| 1405 | * -> host disabled PV EOI. | ||
| 1406 | * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: | ||
| 1407 | * -> host enabled PV EOI, guest did not execute EOI yet. | ||
| 1408 | * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: | ||
| 1409 | * -> host enabled PV EOI, guest executed EOI. | ||
| 1410 | */ | ||
| 1411 | BUG_ON(!pv_eoi_enabled(vcpu)); | ||
| 1412 | pending = pv_eoi_get_pending(vcpu); | ||
| 1413 | /* | ||
| 1414 | * Clear pending bit in any case: it will be set again on vmentry. | ||
| 1415 | * While this might not be ideal from performance point of view, | ||
| 1416 | * this makes sure pv eoi is only enabled when we know it's safe. | ||
| 1417 | */ | ||
| 1418 | pv_eoi_clr_pending(vcpu); | ||
| 1419 | if (pending) | ||
| 1420 | return; | ||
| 1421 | vector = apic_set_eoi(apic); | ||
| 1422 | trace_kvm_pv_eoi(apic, vector); | ||
| 1423 | } | ||
| 1424 | |||
| 1286 | void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) | 1425 | void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) |
| 1287 | { | 1426 | { |
| 1288 | u32 data; | 1427 | u32 data; |
| 1289 | void *vapic; | 1428 | void *vapic; |
| 1290 | 1429 | ||
| 1430 | if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) | ||
| 1431 | apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); | ||
| 1432 | |||
| 1291 | if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) | 1433 | if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) |
| 1292 | return; | 1434 | return; |
| 1293 | 1435 | ||
| @@ -1298,17 +1440,44 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) | |||
| 1298 | apic_set_tpr(vcpu->arch.apic, data & 0xff); | 1440 | apic_set_tpr(vcpu->arch.apic, data & 0xff); |
| 1299 | } | 1441 | } |
| 1300 | 1442 | ||
| 1443 | /* | ||
| 1444 | * apic_sync_pv_eoi_to_guest - called before vmentry | ||
| 1445 | * | ||
| 1446 | * Detect whether it's safe to enable PV EOI and | ||
| 1447 | * if yes do so. | ||
| 1448 | */ | ||
| 1449 | static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, | ||
| 1450 | struct kvm_lapic *apic) | ||
| 1451 | { | ||
| 1452 | if (!pv_eoi_enabled(vcpu) || | ||
| 1453 | /* IRR set or many bits in ISR: could be nested. */ | ||
| 1454 | apic->irr_pending || | ||
| 1455 | /* Cache not set: could be safe but we don't bother. */ | ||
| 1456 | apic->highest_isr_cache == -1 || | ||
| 1457 | /* Need EOI to update ioapic. */ | ||
| 1458 | kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { | ||
| 1459 | /* | ||
| 1460 | * PV EOI was disabled by apic_sync_pv_eoi_from_guest | ||
| 1461 | * so we need not do anything here. | ||
| 1462 | */ | ||
| 1463 | return; | ||
| 1464 | } | ||
| 1465 | |||
| 1466 | pv_eoi_set_pending(apic->vcpu); | ||
| 1467 | } | ||
| 1468 | |||
| 1301 | void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) | 1469 | void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) |
| 1302 | { | 1470 | { |
| 1303 | u32 data, tpr; | 1471 | u32 data, tpr; |
| 1304 | int max_irr, max_isr; | 1472 | int max_irr, max_isr; |
| 1305 | struct kvm_lapic *apic; | 1473 | struct kvm_lapic *apic = vcpu->arch.apic; |
| 1306 | void *vapic; | 1474 | void *vapic; |
| 1307 | 1475 | ||
| 1476 | apic_sync_pv_eoi_to_guest(vcpu, apic); | ||
| 1477 | |||
| 1308 | if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) | 1478 | if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) |
| 1309 | return; | 1479 | return; |
| 1310 | 1480 | ||
| 1311 | apic = vcpu->arch.apic; | ||
| 1312 | tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; | 1481 | tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; |
| 1313 | max_irr = apic_find_highest_irr(apic); | 1482 | max_irr = apic_find_highest_irr(apic); |
| 1314 | if (max_irr < 0) | 1483 | if (max_irr < 0) |
| @@ -1394,3 +1563,16 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) | |||
| 1394 | 1563 | ||
| 1395 | return 0; | 1564 | return 0; |
| 1396 | } | 1565 | } |
| 1566 | |||
| 1567 | int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) | ||
| 1568 | { | ||
| 1569 | u64 addr = data & ~KVM_MSR_ENABLED; | ||
| 1570 | if (!IS_ALIGNED(addr, 4)) | ||
| 1571 | return 1; | ||
| 1572 | |||
| 1573 | vcpu->arch.pv_eoi.msr_val = data; | ||
| 1574 | if (!pv_eoi_enabled(vcpu)) | ||
| 1575 | return 0; | ||
| 1576 | return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, | ||
| 1577 | addr); | ||
| 1578 | } | ||
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 6f4ce2575d09..4af5405ae1e2 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h | |||
| @@ -13,6 +13,15 @@ struct kvm_lapic { | |||
| 13 | u32 divide_count; | 13 | u32 divide_count; |
| 14 | struct kvm_vcpu *vcpu; | 14 | struct kvm_vcpu *vcpu; |
| 15 | bool irr_pending; | 15 | bool irr_pending; |
| 16 | /* Number of bits set in ISR. */ | ||
| 17 | s16 isr_count; | ||
| 18 | /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */ | ||
| 19 | int highest_isr_cache; | ||
| 20 | /** | ||
| 21 | * APIC register page. The layout matches the register layout seen by | ||
| 22 | * the guest 1:1, because it is accessed by the vmx microcode. | ||
| 23 | * Note: Only one register, the TPR, is used by the microcode. | ||
| 24 | */ | ||
| 16 | void *regs; | 25 | void *regs; |
| 17 | gpa_t vapic_addr; | 26 | gpa_t vapic_addr; |
| 18 | struct page *vapic_page; | 27 | struct page *vapic_page; |
| @@ -60,4 +69,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) | |||
| 60 | { | 69 | { |
| 61 | return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; | 70 | return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; |
| 62 | } | 71 | } |
| 72 | |||
| 73 | int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); | ||
| 63 | #endif | 74 | #endif |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 57e168e27b5b..01ca00423938 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
| @@ -90,7 +90,7 @@ module_param(dbg, bool, 0644); | |||
| 90 | 90 | ||
| 91 | #define PTE_PREFETCH_NUM 8 | 91 | #define PTE_PREFETCH_NUM 8 |
| 92 | 92 | ||
| 93 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | 93 | #define PT_FIRST_AVAIL_BITS_SHIFT 10 |
| 94 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | 94 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 |
| 95 | 95 | ||
| 96 | #define PT64_LEVEL_BITS 9 | 96 | #define PT64_LEVEL_BITS 9 |
| @@ -145,7 +145,8 @@ module_param(dbg, bool, 0644); | |||
| 145 | #define CREATE_TRACE_POINTS | 145 | #define CREATE_TRACE_POINTS |
| 146 | #include "mmutrace.h" | 146 | #include "mmutrace.h" |
| 147 | 147 | ||
| 148 | #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) | 148 | #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) |
| 149 | #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) | ||
| 149 | 150 | ||
| 150 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | 151 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
| 151 | 152 | ||
| @@ -188,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask; | |||
| 188 | static u64 __read_mostly shadow_mmio_mask; | 189 | static u64 __read_mostly shadow_mmio_mask; |
| 189 | 190 | ||
| 190 | static void mmu_spte_set(u64 *sptep, u64 spte); | 191 | static void mmu_spte_set(u64 *sptep, u64 spte); |
| 192 | static void mmu_free_roots(struct kvm_vcpu *vcpu); | ||
| 191 | 193 | ||
| 192 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) | 194 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) |
| 193 | { | 195 | { |
| @@ -444,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte) | |||
| 444 | } | 446 | } |
| 445 | #endif | 447 | #endif |
| 446 | 448 | ||
| 449 | static bool spte_is_locklessly_modifiable(u64 spte) | ||
| 450 | { | ||
| 451 | return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); | ||
| 452 | } | ||
| 453 | |||
| 447 | static bool spte_has_volatile_bits(u64 spte) | 454 | static bool spte_has_volatile_bits(u64 spte) |
| 448 | { | 455 | { |
| 456 | /* | ||
| 457 | * Always atomicly update spte if it can be updated | ||
| 458 | * out of mmu-lock, it can ensure dirty bit is not lost, | ||
| 459 | * also, it can help us to get a stable is_writable_pte() | ||
| 460 | * to ensure tlb flush is not missed. | ||
| 461 | */ | ||
| 462 | if (spte_is_locklessly_modifiable(spte)) | ||
| 463 | return true; | ||
| 464 | |||
| 449 | if (!shadow_accessed_mask) | 465 | if (!shadow_accessed_mask) |
| 450 | return false; | 466 | return false; |
| 451 | 467 | ||
| @@ -478,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) | |||
| 478 | 494 | ||
| 479 | /* Rules for using mmu_spte_update: | 495 | /* Rules for using mmu_spte_update: |
| 480 | * Update the state bits, it means the mapped pfn is not changged. | 496 | * Update the state bits, it means the mapped pfn is not changged. |
| 497 | * | ||
| 498 | * Whenever we overwrite a writable spte with a read-only one we | ||
| 499 | * should flush remote TLBs. Otherwise rmap_write_protect | ||
| 500 | * will find a read-only spte, even though the writable spte | ||
| 501 | * might be cached on a CPU's TLB, the return value indicates this | ||
| 502 | * case. | ||
| 481 | */ | 503 | */ |
| 482 | static void mmu_spte_update(u64 *sptep, u64 new_spte) | 504 | static bool mmu_spte_update(u64 *sptep, u64 new_spte) |
| 483 | { | 505 | { |
| 484 | u64 mask, old_spte = *sptep; | 506 | u64 old_spte = *sptep; |
| 507 | bool ret = false; | ||
| 485 | 508 | ||
| 486 | WARN_ON(!is_rmap_spte(new_spte)); | 509 | WARN_ON(!is_rmap_spte(new_spte)); |
| 487 | 510 | ||
| 488 | if (!is_shadow_present_pte(old_spte)) | 511 | if (!is_shadow_present_pte(old_spte)) { |
| 489 | return mmu_spte_set(sptep, new_spte); | 512 | mmu_spte_set(sptep, new_spte); |
| 490 | 513 | return ret; | |
| 491 | new_spte |= old_spte & shadow_dirty_mask; | 514 | } |
| 492 | |||
| 493 | mask = shadow_accessed_mask; | ||
| 494 | if (is_writable_pte(old_spte)) | ||
| 495 | mask |= shadow_dirty_mask; | ||
| 496 | 515 | ||
| 497 | if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) | 516 | if (!spte_has_volatile_bits(old_spte)) |
| 498 | __update_clear_spte_fast(sptep, new_spte); | 517 | __update_clear_spte_fast(sptep, new_spte); |
| 499 | else | 518 | else |
| 500 | old_spte = __update_clear_spte_slow(sptep, new_spte); | 519 | old_spte = __update_clear_spte_slow(sptep, new_spte); |
| 501 | 520 | ||
| 521 | /* | ||
| 522 | * For the spte updated out of mmu-lock is safe, since | ||
| 523 | * we always atomicly update it, see the comments in | ||
| 524 | * spte_has_volatile_bits(). | ||
| 525 | */ | ||
| 526 | if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) | ||
| 527 | ret = true; | ||
| 528 | |||
| 502 | if (!shadow_accessed_mask) | 529 | if (!shadow_accessed_mask) |
| 503 | return; | 530 | return ret; |
| 504 | 531 | ||
| 505 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) | 532 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) |
| 506 | kvm_set_pfn_accessed(spte_to_pfn(old_spte)); | 533 | kvm_set_pfn_accessed(spte_to_pfn(old_spte)); |
| 507 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) | 534 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) |
| 508 | kvm_set_pfn_dirty(spte_to_pfn(old_spte)); | 535 | kvm_set_pfn_dirty(spte_to_pfn(old_spte)); |
| 536 | |||
| 537 | return ret; | ||
| 509 | } | 538 | } |
| 510 | 539 | ||
| 511 | /* | 540 | /* |
| @@ -652,8 +681,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) | |||
| 652 | mmu_page_header_cache); | 681 | mmu_page_header_cache); |
| 653 | } | 682 | } |
| 654 | 683 | ||
| 655 | static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, | 684 | static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) |
| 656 | size_t size) | ||
| 657 | { | 685 | { |
| 658 | void *p; | 686 | void *p; |
| 659 | 687 | ||
| @@ -664,8 +692,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, | |||
| 664 | 692 | ||
| 665 | static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) | 693 | static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) |
| 666 | { | 694 | { |
| 667 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, | 695 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); |
| 668 | sizeof(struct pte_list_desc)); | ||
| 669 | } | 696 | } |
| 670 | 697 | ||
| 671 | static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) | 698 | static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) |
| @@ -1051,35 +1078,82 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) | |||
| 1051 | rmap_remove(kvm, sptep); | 1078 | rmap_remove(kvm, sptep); |
| 1052 | } | 1079 | } |
| 1053 | 1080 | ||
| 1054 | static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) | 1081 | |
| 1082 | static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) | ||
| 1083 | { | ||
| 1084 | if (is_large_pte(*sptep)) { | ||
| 1085 | WARN_ON(page_header(__pa(sptep))->role.level == | ||
| 1086 | PT_PAGE_TABLE_LEVEL); | ||
| 1087 | drop_spte(kvm, sptep); | ||
| 1088 | --kvm->stat.lpages; | ||
| 1089 | return true; | ||
| 1090 | } | ||
| 1091 | |||
| 1092 | return false; | ||
| 1093 | } | ||
| 1094 | |||
| 1095 | static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) | ||
| 1096 | { | ||
| 1097 | if (__drop_large_spte(vcpu->kvm, sptep)) | ||
| 1098 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
| 1099 | } | ||
| 1100 | |||
| 1101 | /* | ||
| 1102 | * Write-protect on the specified @sptep, @pt_protect indicates whether | ||
| 1103 | * spte writ-protection is caused by protecting shadow page table. | ||
| 1104 | * @flush indicates whether tlb need be flushed. | ||
| 1105 | * | ||
| 1106 | * Note: write protection is difference between drity logging and spte | ||
| 1107 | * protection: | ||
| 1108 | * - for dirty logging, the spte can be set to writable at anytime if | ||
| 1109 | * its dirty bitmap is properly set. | ||
| 1110 | * - for spte protection, the spte can be writable only after unsync-ing | ||
| 1111 | * shadow page. | ||
| 1112 | * | ||
| 1113 | * Return true if the spte is dropped. | ||
| 1114 | */ | ||
| 1115 | static bool | ||
| 1116 | spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) | ||
| 1117 | { | ||
| 1118 | u64 spte = *sptep; | ||
| 1119 | |||
| 1120 | if (!is_writable_pte(spte) && | ||
| 1121 | !(pt_protect && spte_is_locklessly_modifiable(spte))) | ||
| 1122 | return false; | ||
| 1123 | |||
| 1124 | rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); | ||
| 1125 | |||
| 1126 | if (__drop_large_spte(kvm, sptep)) { | ||
| 1127 | *flush |= true; | ||
| 1128 | return true; | ||
| 1129 | } | ||
| 1130 | |||
| 1131 | if (pt_protect) | ||
| 1132 | spte &= ~SPTE_MMU_WRITEABLE; | ||
| 1133 | spte = spte & ~PT_WRITABLE_MASK; | ||
| 1134 | |||
| 1135 | *flush |= mmu_spte_update(sptep, spte); | ||
| 1136 | return false; | ||
| 1137 | } | ||
| 1138 | |||
| 1139 | static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, | ||
| 1140 | int level, bool pt_protect) | ||
| 1055 | { | 1141 | { |
| 1056 | u64 *sptep; | 1142 | u64 *sptep; |
| 1057 | struct rmap_iterator iter; | 1143 | struct rmap_iterator iter; |
| 1058 | int write_protected = 0; | 1144 | bool flush = false; |
| 1059 | 1145 | ||
| 1060 | for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { | 1146 | for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { |
| 1061 | BUG_ON(!(*sptep & PT_PRESENT_MASK)); | 1147 | BUG_ON(!(*sptep & PT_PRESENT_MASK)); |
| 1062 | rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); | 1148 | if (spte_write_protect(kvm, sptep, &flush, pt_protect)) { |
| 1063 | |||
| 1064 | if (!is_writable_pte(*sptep)) { | ||
| 1065 | sptep = rmap_get_next(&iter); | ||
| 1066 | continue; | ||
| 1067 | } | ||
| 1068 | |||
| 1069 | if (level == PT_PAGE_TABLE_LEVEL) { | ||
| 1070 | mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK); | ||
| 1071 | sptep = rmap_get_next(&iter); | ||
| 1072 | } else { | ||
| 1073 | BUG_ON(!is_large_pte(*sptep)); | ||
| 1074 | drop_spte(kvm, sptep); | ||
| 1075 | --kvm->stat.lpages; | ||
| 1076 | sptep = rmap_get_first(*rmapp, &iter); | 1149 | sptep = rmap_get_first(*rmapp, &iter); |
| 1150 | continue; | ||
| 1077 | } | 1151 | } |
| 1078 | 1152 | ||
| 1079 | write_protected = 1; | 1153 | sptep = rmap_get_next(&iter); |
| 1080 | } | 1154 | } |
| 1081 | 1155 | ||
| 1082 | return write_protected; | 1156 | return flush; |
| 1083 | } | 1157 | } |
| 1084 | 1158 | ||
| 1085 | /** | 1159 | /** |
| @@ -1100,26 +1174,26 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, | |||
| 1100 | 1174 | ||
| 1101 | while (mask) { | 1175 | while (mask) { |
| 1102 | rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; | 1176 | rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; |
| 1103 | __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); | 1177 | __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); |
| 1104 | 1178 | ||
| 1105 | /* clear the first set bit */ | 1179 | /* clear the first set bit */ |
| 1106 | mask &= mask - 1; | 1180 | mask &= mask - 1; |
| 1107 | } | 1181 | } |
| 1108 | } | 1182 | } |
| 1109 | 1183 | ||
| 1110 | static int rmap_write_protect(struct kvm *kvm, u64 gfn) | 1184 | static bool rmap_write_protect(struct kvm *kvm, u64 gfn) |
| 1111 | { | 1185 | { |
| 1112 | struct kvm_memory_slot *slot; | 1186 | struct kvm_memory_slot *slot; |
| 1113 | unsigned long *rmapp; | 1187 | unsigned long *rmapp; |
| 1114 | int i; | 1188 | int i; |
| 1115 | int write_protected = 0; | 1189 | bool write_protected = false; |
| 1116 | 1190 | ||
| 1117 | slot = gfn_to_memslot(kvm, gfn); | 1191 | slot = gfn_to_memslot(kvm, gfn); |
| 1118 | 1192 | ||
| 1119 | for (i = PT_PAGE_TABLE_LEVEL; | 1193 | for (i = PT_PAGE_TABLE_LEVEL; |
| 1120 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 1194 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
| 1121 | rmapp = __gfn_to_rmap(gfn, i, slot); | 1195 | rmapp = __gfn_to_rmap(gfn, i, slot); |
| 1122 | write_protected |= __rmap_write_protect(kvm, rmapp, i); | 1196 | write_protected |= __rmap_write_protect(kvm, rmapp, i, true); |
| 1123 | } | 1197 | } |
| 1124 | 1198 | ||
| 1125 | return write_protected; | 1199 | return write_protected; |
| @@ -1238,11 +1312,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
| 1238 | unsigned long data) | 1312 | unsigned long data) |
| 1239 | { | 1313 | { |
| 1240 | u64 *sptep; | 1314 | u64 *sptep; |
| 1241 | struct rmap_iterator iter; | 1315 | struct rmap_iterator uninitialized_var(iter); |
| 1242 | int young = 0; | 1316 | int young = 0; |
| 1243 | 1317 | ||
| 1244 | /* | 1318 | /* |
| 1245 | * Emulate the accessed bit for EPT, by checking if this page has | 1319 | * In case of absence of EPT Access and Dirty Bits supports, |
| 1320 | * emulate the accessed bit for EPT, by checking if this page has | ||
| 1246 | * an EPT mapping, and clearing it if it does. On the next access, | 1321 | * an EPT mapping, and clearing it if it does. On the next access, |
| 1247 | * a new EPT mapping will be established. | 1322 | * a new EPT mapping will be established. |
| 1248 | * This has some overhead, but not as much as the cost of swapping | 1323 | * This has some overhead, but not as much as the cost of swapping |
| @@ -1253,11 +1328,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
| 1253 | 1328 | ||
| 1254 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; | 1329 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; |
| 1255 | sptep = rmap_get_next(&iter)) { | 1330 | sptep = rmap_get_next(&iter)) { |
| 1256 | BUG_ON(!(*sptep & PT_PRESENT_MASK)); | 1331 | BUG_ON(!is_shadow_present_pte(*sptep)); |
| 1257 | 1332 | ||
| 1258 | if (*sptep & PT_ACCESSED_MASK) { | 1333 | if (*sptep & shadow_accessed_mask) { |
| 1259 | young = 1; | 1334 | young = 1; |
| 1260 | clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); | 1335 | clear_bit((ffs(shadow_accessed_mask) - 1), |
| 1336 | (unsigned long *)sptep); | ||
| 1261 | } | 1337 | } |
| 1262 | } | 1338 | } |
| 1263 | 1339 | ||
| @@ -1281,9 +1357,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
| 1281 | 1357 | ||
| 1282 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; | 1358 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; |
| 1283 | sptep = rmap_get_next(&iter)) { | 1359 | sptep = rmap_get_next(&iter)) { |
| 1284 | BUG_ON(!(*sptep & PT_PRESENT_MASK)); | 1360 | BUG_ON(!is_shadow_present_pte(*sptep)); |
| 1285 | 1361 | ||
| 1286 | if (*sptep & PT_ACCESSED_MASK) { | 1362 | if (*sptep & shadow_accessed_mask) { |
| 1287 | young = 1; | 1363 | young = 1; |
| 1288 | break; | 1364 | break; |
| 1289 | } | 1365 | } |
| @@ -1401,12 +1477,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | |||
| 1401 | u64 *parent_pte, int direct) | 1477 | u64 *parent_pte, int direct) |
| 1402 | { | 1478 | { |
| 1403 | struct kvm_mmu_page *sp; | 1479 | struct kvm_mmu_page *sp; |
| 1404 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, | 1480 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); |
| 1405 | sizeof *sp); | 1481 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); |
| 1406 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
| 1407 | if (!direct) | 1482 | if (!direct) |
| 1408 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, | 1483 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); |
| 1409 | PAGE_SIZE); | ||
| 1410 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | 1484 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); |
| 1411 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | 1485 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); |
| 1412 | bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); | 1486 | bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); |
| @@ -1701,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, | |||
| 1701 | 1775 | ||
| 1702 | kvm_mmu_pages_init(parent, &parents, &pages); | 1776 | kvm_mmu_pages_init(parent, &parents, &pages); |
| 1703 | while (mmu_unsync_walk(parent, &pages)) { | 1777 | while (mmu_unsync_walk(parent, &pages)) { |
| 1704 | int protected = 0; | 1778 | bool protected = false; |
| 1705 | 1779 | ||
| 1706 | for_each_sp(pages, sp, parents, i) | 1780 | for_each_sp(pages, sp, parents, i) |
| 1707 | protected |= rmap_write_protect(vcpu->kvm, sp->gfn); | 1781 | protected |= rmap_write_protect(vcpu->kvm, sp->gfn); |
| @@ -1866,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) | |||
| 1866 | mmu_spte_set(sptep, spte); | 1940 | mmu_spte_set(sptep, spte); |
| 1867 | } | 1941 | } |
| 1868 | 1942 | ||
| 1869 | static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) | ||
| 1870 | { | ||
| 1871 | if (is_large_pte(*sptep)) { | ||
| 1872 | drop_spte(vcpu->kvm, sptep); | ||
| 1873 | --vcpu->kvm->stat.lpages; | ||
| 1874 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
| 1875 | } | ||
| 1876 | } | ||
| 1877 | |||
| 1878 | static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, | 1943 | static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
| 1879 | unsigned direct_access) | 1944 | unsigned direct_access) |
| 1880 | { | 1945 | { |
| @@ -2243,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
| 2243 | gfn_t gfn, pfn_t pfn, bool speculative, | 2308 | gfn_t gfn, pfn_t pfn, bool speculative, |
| 2244 | bool can_unsync, bool host_writable) | 2309 | bool can_unsync, bool host_writable) |
| 2245 | { | 2310 | { |
| 2246 | u64 spte, entry = *sptep; | 2311 | u64 spte; |
| 2247 | int ret = 0; | 2312 | int ret = 0; |
| 2248 | 2313 | ||
| 2249 | if (set_mmio_spte(sptep, gfn, pfn, pte_access)) | 2314 | if (set_mmio_spte(sptep, gfn, pfn, pte_access)) |
| @@ -2257,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
| 2257 | spte |= shadow_x_mask; | 2322 | spte |= shadow_x_mask; |
| 2258 | else | 2323 | else |
| 2259 | spte |= shadow_nx_mask; | 2324 | spte |= shadow_nx_mask; |
| 2325 | |||
| 2260 | if (pte_access & ACC_USER_MASK) | 2326 | if (pte_access & ACC_USER_MASK) |
| 2261 | spte |= shadow_user_mask; | 2327 | spte |= shadow_user_mask; |
| 2328 | |||
| 2262 | if (level > PT_PAGE_TABLE_LEVEL) | 2329 | if (level > PT_PAGE_TABLE_LEVEL) |
| 2263 | spte |= PT_PAGE_SIZE_MASK; | 2330 | spte |= PT_PAGE_SIZE_MASK; |
| 2264 | if (tdp_enabled) | 2331 | if (tdp_enabled) |
| @@ -2283,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
| 2283 | goto done; | 2350 | goto done; |
| 2284 | } | 2351 | } |
| 2285 | 2352 | ||
| 2286 | spte |= PT_WRITABLE_MASK; | 2353 | spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; |
| 2287 | 2354 | ||
| 2288 | if (!vcpu->arch.mmu.direct_map | 2355 | if (!vcpu->arch.mmu.direct_map |
| 2289 | && !(pte_access & ACC_WRITE_MASK)) { | 2356 | && !(pte_access & ACC_WRITE_MASK)) { |
| @@ -2312,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
| 2312 | __func__, gfn); | 2379 | __func__, gfn); |
| 2313 | ret = 1; | 2380 | ret = 1; |
| 2314 | pte_access &= ~ACC_WRITE_MASK; | 2381 | pte_access &= ~ACC_WRITE_MASK; |
| 2315 | if (is_writable_pte(spte)) | 2382 | spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); |
| 2316 | spte &= ~PT_WRITABLE_MASK; | ||
| 2317 | } | 2383 | } |
| 2318 | } | 2384 | } |
| 2319 | 2385 | ||
| @@ -2321,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
| 2321 | mark_page_dirty(vcpu->kvm, gfn); | 2387 | mark_page_dirty(vcpu->kvm, gfn); |
| 2322 | 2388 | ||
| 2323 | set_pte: | 2389 | set_pte: |
| 2324 | mmu_spte_update(sptep, spte); | 2390 | if (mmu_spte_update(sptep, spte)) |
| 2325 | /* | ||
| 2326 | * If we overwrite a writable spte with a read-only one we | ||
| 2327 | * should flush remote TLBs. Otherwise rmap_write_protect | ||
| 2328 | * will find a read-only spte, even though the writable spte | ||
| 2329 | * might be cached on a CPU's TLB. | ||
| 2330 | */ | ||
| 2331 | if (is_writable_pte(entry) && !is_writable_pte(*sptep)) | ||
| 2332 | kvm_flush_remote_tlbs(vcpu->kvm); | 2391 | kvm_flush_remote_tlbs(vcpu->kvm); |
| 2333 | done: | 2392 | done: |
| 2334 | return ret; | 2393 | return ret; |
| @@ -2403,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
| 2403 | 2462 | ||
| 2404 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | 2463 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) |
| 2405 | { | 2464 | { |
| 2465 | mmu_free_roots(vcpu); | ||
| 2406 | } | 2466 | } |
| 2407 | 2467 | ||
| 2408 | static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, | 2468 | static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, |
| @@ -2625,18 +2685,116 @@ exit: | |||
| 2625 | return ret; | 2685 | return ret; |
| 2626 | } | 2686 | } |
| 2627 | 2687 | ||
| 2688 | static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) | ||
| 2689 | { | ||
| 2690 | /* | ||
| 2691 | * #PF can be fast only if the shadow page table is present and it | ||
| 2692 | * is caused by write-protect, that means we just need change the | ||
| 2693 | * W bit of the spte which can be done out of mmu-lock. | ||
| 2694 | */ | ||
| 2695 | if (!(error_code & PFERR_PRESENT_MASK) || | ||
| 2696 | !(error_code & PFERR_WRITE_MASK)) | ||
| 2697 | return false; | ||
| 2698 | |||
| 2699 | return true; | ||
| 2700 | } | ||
| 2701 | |||
| 2702 | static bool | ||
| 2703 | fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) | ||
| 2704 | { | ||
| 2705 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | ||
| 2706 | gfn_t gfn; | ||
| 2707 | |||
| 2708 | WARN_ON(!sp->role.direct); | ||
| 2709 | |||
| 2710 | /* | ||
| 2711 | * The gfn of direct spte is stable since it is calculated | ||
| 2712 | * by sp->gfn. | ||
| 2713 | */ | ||
| 2714 | gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); | ||
| 2715 | |||
| 2716 | if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) | ||
| 2717 | mark_page_dirty(vcpu->kvm, gfn); | ||
| 2718 | |||
| 2719 | return true; | ||
| 2720 | } | ||
| 2721 | |||
| 2722 | /* | ||
| 2723 | * Return value: | ||
| 2724 | * - true: let the vcpu to access on the same address again. | ||
| 2725 | * - false: let the real page fault path to fix it. | ||
| 2726 | */ | ||
| 2727 | static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, | ||
| 2728 | u32 error_code) | ||
| 2729 | { | ||
| 2730 | struct kvm_shadow_walk_iterator iterator; | ||
| 2731 | bool ret = false; | ||
| 2732 | u64 spte = 0ull; | ||
| 2733 | |||
| 2734 | if (!page_fault_can_be_fast(vcpu, error_code)) | ||
| 2735 | return false; | ||
| 2736 | |||
| 2737 | walk_shadow_page_lockless_begin(vcpu); | ||
| 2738 | for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) | ||
| 2739 | if (!is_shadow_present_pte(spte) || iterator.level < level) | ||
| 2740 | break; | ||
| 2741 | |||
| 2742 | /* | ||
| 2743 | * If the mapping has been changed, let the vcpu fault on the | ||
| 2744 | * same address again. | ||
| 2745 | */ | ||
| 2746 | if (!is_rmap_spte(spte)) { | ||
| 2747 | ret = true; | ||
| 2748 | goto exit; | ||
| 2749 | } | ||
| 2750 | |||
| 2751 | if (!is_last_spte(spte, level)) | ||
| 2752 | goto exit; | ||
| 2753 | |||
| 2754 | /* | ||
| 2755 | * Check if it is a spurious fault caused by TLB lazily flushed. | ||
| 2756 | * | ||
| 2757 | * Need not check the access of upper level table entries since | ||
| 2758 | * they are always ACC_ALL. | ||
| 2759 | */ | ||
| 2760 | if (is_writable_pte(spte)) { | ||
| 2761 | ret = true; | ||
| 2762 | goto exit; | ||
| 2763 | } | ||
| 2764 | |||
| 2765 | /* | ||
| 2766 | * Currently, to simplify the code, only the spte write-protected | ||
| 2767 | * by dirty-log can be fast fixed. | ||
| 2768 | */ | ||
| 2769 | if (!spte_is_locklessly_modifiable(spte)) | ||
| 2770 | goto exit; | ||
| 2771 | |||
| 2772 | /* | ||
| 2773 | * Currently, fast page fault only works for direct mapping since | ||
| 2774 | * the gfn is not stable for indirect shadow page. | ||
| 2775 | * See Documentation/virtual/kvm/locking.txt to get more detail. | ||
| 2776 | */ | ||
| 2777 | ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); | ||
| 2778 | exit: | ||
| 2779 | trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, | ||
| 2780 | spte, ret); | ||
| 2781 | walk_shadow_page_lockless_end(vcpu); | ||
| 2782 | |||
| 2783 | return ret; | ||
| 2784 | } | ||
| 2785 | |||
| 2628 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | 2786 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, |
| 2629 | gva_t gva, pfn_t *pfn, bool write, bool *writable); | 2787 | gva_t gva, pfn_t *pfn, bool write, bool *writable); |
| 2630 | 2788 | ||
| 2631 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, | 2789 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, |
| 2632 | bool prefault) | 2790 | gfn_t gfn, bool prefault) |
| 2633 | { | 2791 | { |
| 2634 | int r; | 2792 | int r; |
| 2635 | int level; | 2793 | int level; |
| 2636 | int force_pt_level; | 2794 | int force_pt_level; |
| 2637 | pfn_t pfn; | 2795 | pfn_t pfn; |
| 2638 | unsigned long mmu_seq; | 2796 | unsigned long mmu_seq; |
| 2639 | bool map_writable; | 2797 | bool map_writable, write = error_code & PFERR_WRITE_MASK; |
| 2640 | 2798 | ||
| 2641 | force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); | 2799 | force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); |
| 2642 | if (likely(!force_pt_level)) { | 2800 | if (likely(!force_pt_level)) { |
| @@ -2653,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, | |||
| 2653 | } else | 2811 | } else |
| 2654 | level = PT_PAGE_TABLE_LEVEL; | 2812 | level = PT_PAGE_TABLE_LEVEL; |
| 2655 | 2813 | ||
| 2814 | if (fast_page_fault(vcpu, v, level, error_code)) | ||
| 2815 | return 0; | ||
| 2816 | |||
| 2656 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2817 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
| 2657 | smp_rmb(); | 2818 | smp_rmb(); |
| 2658 | 2819 | ||
| @@ -3041,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |||
| 3041 | gfn = gva >> PAGE_SHIFT; | 3202 | gfn = gva >> PAGE_SHIFT; |
| 3042 | 3203 | ||
| 3043 | return nonpaging_map(vcpu, gva & PAGE_MASK, | 3204 | return nonpaging_map(vcpu, gva & PAGE_MASK, |
| 3044 | error_code & PFERR_WRITE_MASK, gfn, prefault); | 3205 | error_code, gfn, prefault); |
| 3045 | } | 3206 | } |
| 3046 | 3207 | ||
| 3047 | static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) | 3208 | static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) |
| @@ -3121,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | |||
| 3121 | } else | 3282 | } else |
| 3122 | level = PT_PAGE_TABLE_LEVEL; | 3283 | level = PT_PAGE_TABLE_LEVEL; |
| 3123 | 3284 | ||
| 3285 | if (fast_page_fault(vcpu, gpa, level, error_code)) | ||
| 3286 | return 0; | ||
| 3287 | |||
| 3124 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 3288 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
| 3125 | smp_rmb(); | 3289 | smp_rmb(); |
| 3126 | 3290 | ||
| @@ -3885,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) | |||
| 3885 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | 4049 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) |
| 3886 | { | 4050 | { |
| 3887 | struct kvm_mmu_page *sp; | 4051 | struct kvm_mmu_page *sp; |
| 4052 | bool flush = false; | ||
| 3888 | 4053 | ||
| 3889 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { | 4054 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { |
| 3890 | int i; | 4055 | int i; |
| @@ -3899,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
| 3899 | !is_last_spte(pt[i], sp->role.level)) | 4064 | !is_last_spte(pt[i], sp->role.level)) |
| 3900 | continue; | 4065 | continue; |
| 3901 | 4066 | ||
| 3902 | if (is_large_pte(pt[i])) { | 4067 | spte_write_protect(kvm, &pt[i], &flush, false); |
| 3903 | drop_spte(kvm, &pt[i]); | ||
| 3904 | --kvm->stat.lpages; | ||
| 3905 | continue; | ||
| 3906 | } | ||
| 3907 | |||
| 3908 | /* avoid RMW */ | ||
| 3909 | if (is_writable_pte(pt[i])) | ||
| 3910 | mmu_spte_update(&pt[i], | ||
| 3911 | pt[i] & ~PT_WRITABLE_MASK); | ||
| 3912 | } | 4068 | } |
| 3913 | } | 4069 | } |
| 3914 | kvm_flush_remote_tlbs(kvm); | 4070 | kvm_flush_remote_tlbs(kvm); |
| @@ -3945,7 +4101,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, | |||
| 3945 | static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) | 4101 | static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) |
| 3946 | { | 4102 | { |
| 3947 | struct kvm *kvm; | 4103 | struct kvm *kvm; |
| 3948 | struct kvm *kvm_freed = NULL; | ||
| 3949 | int nr_to_scan = sc->nr_to_scan; | 4104 | int nr_to_scan = sc->nr_to_scan; |
| 3950 | 4105 | ||
| 3951 | if (nr_to_scan == 0) | 4106 | if (nr_to_scan == 0) |
| @@ -3957,22 +4112,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) | |||
| 3957 | int idx; | 4112 | int idx; |
| 3958 | LIST_HEAD(invalid_list); | 4113 | LIST_HEAD(invalid_list); |
| 3959 | 4114 | ||
| 4115 | /* | ||
| 4116 | * n_used_mmu_pages is accessed without holding kvm->mmu_lock | ||
| 4117 | * here. We may skip a VM instance errorneosly, but we do not | ||
| 4118 | * want to shrink a VM that only started to populate its MMU | ||
| 4119 | * anyway. | ||
| 4120 | */ | ||
| 4121 | if (kvm->arch.n_used_mmu_pages > 0) { | ||
| 4122 | if (!nr_to_scan--) | ||
| 4123 | break; | ||
| 4124 | continue; | ||
| 4125 | } | ||
| 4126 | |||
| 3960 | idx = srcu_read_lock(&kvm->srcu); | 4127 | idx = srcu_read_lock(&kvm->srcu); |
| 3961 | spin_lock(&kvm->mmu_lock); | 4128 | spin_lock(&kvm->mmu_lock); |
| 3962 | if (!kvm_freed && nr_to_scan > 0 && | ||
| 3963 | kvm->arch.n_used_mmu_pages > 0) { | ||
| 3964 | kvm_mmu_remove_some_alloc_mmu_pages(kvm, | ||
| 3965 | &invalid_list); | ||
| 3966 | kvm_freed = kvm; | ||
| 3967 | } | ||
| 3968 | nr_to_scan--; | ||
| 3969 | 4129 | ||
| 4130 | kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); | ||
| 3970 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | 4131 | kvm_mmu_commit_zap_page(kvm, &invalid_list); |
| 4132 | |||
| 3971 | spin_unlock(&kvm->mmu_lock); | 4133 | spin_unlock(&kvm->mmu_lock); |
| 3972 | srcu_read_unlock(&kvm->srcu, idx); | 4134 | srcu_read_unlock(&kvm->srcu, idx); |
| 4135 | |||
| 4136 | list_move_tail(&kvm->vm_list, &vm_list); | ||
| 4137 | break; | ||
| 3973 | } | 4138 | } |
| 3974 | if (kvm_freed) | ||
| 3975 | list_move_tail(&kvm_freed->vm_list, &vm_list); | ||
| 3976 | 4139 | ||
| 3977 | raw_spin_unlock(&kvm_lock); | 4140 | raw_spin_unlock(&kvm_lock); |
| 3978 | 4141 | ||
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 89fb0e81322a..cd6e98333ba3 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h | |||
| @@ -54,8 +54,8 @@ | |||
| 54 | */ | 54 | */ |
| 55 | TRACE_EVENT( | 55 | TRACE_EVENT( |
| 56 | kvm_mmu_pagetable_walk, | 56 | kvm_mmu_pagetable_walk, |
| 57 | TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), | 57 | TP_PROTO(u64 addr, u32 pferr), |
| 58 | TP_ARGS(addr, write_fault, user_fault, fetch_fault), | 58 | TP_ARGS(addr, pferr), |
| 59 | 59 | ||
| 60 | TP_STRUCT__entry( | 60 | TP_STRUCT__entry( |
| 61 | __field(__u64, addr) | 61 | __field(__u64, addr) |
| @@ -64,8 +64,7 @@ TRACE_EVENT( | |||
| 64 | 64 | ||
| 65 | TP_fast_assign( | 65 | TP_fast_assign( |
| 66 | __entry->addr = addr; | 66 | __entry->addr = addr; |
| 67 | __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) | 67 | __entry->pferr = pferr; |
| 68 | | (!!fetch_fault << 4); | ||
| 69 | ), | 68 | ), |
| 70 | 69 | ||
| 71 | TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, | 70 | TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, |
| @@ -243,6 +242,44 @@ TRACE_EVENT( | |||
| 243 | TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, | 242 | TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, |
| 244 | __entry->access) | 243 | __entry->access) |
| 245 | ); | 244 | ); |
| 245 | |||
| 246 | #define __spte_satisfied(__spte) \ | ||
| 247 | (__entry->retry && is_writable_pte(__entry->__spte)) | ||
| 248 | |||
| 249 | TRACE_EVENT( | ||
| 250 | fast_page_fault, | ||
| 251 | TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, | ||
| 252 | u64 *sptep, u64 old_spte, bool retry), | ||
| 253 | TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), | ||
| 254 | |||
| 255 | TP_STRUCT__entry( | ||
| 256 | __field(int, vcpu_id) | ||
| 257 | __field(gva_t, gva) | ||
| 258 | __field(u32, error_code) | ||
| 259 | __field(u64 *, sptep) | ||
| 260 | __field(u64, old_spte) | ||
| 261 | __field(u64, new_spte) | ||
| 262 | __field(bool, retry) | ||
| 263 | ), | ||
| 264 | |||
| 265 | TP_fast_assign( | ||
| 266 | __entry->vcpu_id = vcpu->vcpu_id; | ||
| 267 | __entry->gva = gva; | ||
| 268 | __entry->error_code = error_code; | ||
| 269 | __entry->sptep = sptep; | ||
| 270 | __entry->old_spte = old_spte; | ||
| 271 | __entry->new_spte = *sptep; | ||
| 272 | __entry->retry = retry; | ||
| 273 | ), | ||
| 274 | |||
| 275 | TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" | ||
| 276 | " new %llx spurious %d fixed %d", __entry->vcpu_id, | ||
| 277 | __entry->gva, __print_flags(__entry->error_code, "|", | ||
| 278 | kvm_mmu_trace_pferr_flags), __entry->sptep, | ||
| 279 | __entry->old_spte, __entry->new_spte, | ||
| 280 | __spte_satisfied(old_spte), __spte_satisfied(new_spte) | ||
| 281 | ) | ||
| 282 | ); | ||
| 246 | #endif /* _TRACE_KVMMMU_H */ | 283 | #endif /* _TRACE_KVMMMU_H */ |
| 247 | 284 | ||
| 248 | #undef TRACE_INCLUDE_PATH | 285 | #undef TRACE_INCLUDE_PATH |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 34f970937ef1..bb7cf01cae76 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
| @@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, | |||
| 154 | const int fetch_fault = access & PFERR_FETCH_MASK; | 154 | const int fetch_fault = access & PFERR_FETCH_MASK; |
| 155 | u16 errcode = 0; | 155 | u16 errcode = 0; |
| 156 | 156 | ||
| 157 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, | 157 | trace_kvm_mmu_pagetable_walk(addr, access); |
| 158 | fetch_fault); | ||
| 159 | retry_walk: | 158 | retry_walk: |
| 160 | eperm = false; | 159 | eperm = false; |
| 161 | walker->level = mmu->root_level; | 160 | walker->level = mmu->root_level; |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f75af406b268..baead950d6c8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
| @@ -3185,8 +3185,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
| 3185 | break; | 3185 | break; |
| 3186 | case MSR_IA32_DEBUGCTLMSR: | 3186 | case MSR_IA32_DEBUGCTLMSR: |
| 3187 | if (!boot_cpu_has(X86_FEATURE_LBRV)) { | 3187 | if (!boot_cpu_has(X86_FEATURE_LBRV)) { |
| 3188 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", | 3188 | vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", |
| 3189 | __func__, data); | 3189 | __func__, data); |
| 3190 | break; | 3190 | break; |
| 3191 | } | 3191 | } |
| 3192 | if (data & DEBUGCTL_RESERVED_BITS) | 3192 | if (data & DEBUGCTL_RESERVED_BITS) |
| @@ -3205,7 +3205,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
| 3205 | case MSR_VM_CR: | 3205 | case MSR_VM_CR: |
| 3206 | return svm_set_vm_cr(vcpu, data); | 3206 | return svm_set_vm_cr(vcpu, data); |
| 3207 | case MSR_VM_IGNNE: | 3207 | case MSR_VM_IGNNE: |
| 3208 | pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); | 3208 | vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); |
| 3209 | break; | 3209 | break; |
| 3210 | default: | 3210 | default: |
| 3211 | return kvm_set_msr_common(vcpu, ecx, data); | 3211 | return kvm_set_msr_common(vcpu, ecx, data); |
| @@ -4044,6 +4044,11 @@ static bool svm_rdtscp_supported(void) | |||
| 4044 | return false; | 4044 | return false; |
| 4045 | } | 4045 | } |
| 4046 | 4046 | ||
| 4047 | static bool svm_invpcid_supported(void) | ||
| 4048 | { | ||
| 4049 | return false; | ||
| 4050 | } | ||
| 4051 | |||
| 4047 | static bool svm_has_wbinvd_exit(void) | 4052 | static bool svm_has_wbinvd_exit(void) |
| 4048 | { | 4053 | { |
| 4049 | return true; | 4054 | return true; |
| @@ -4312,6 +4317,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
| 4312 | .cpuid_update = svm_cpuid_update, | 4317 | .cpuid_update = svm_cpuid_update, |
| 4313 | 4318 | ||
| 4314 | .rdtscp_supported = svm_rdtscp_supported, | 4319 | .rdtscp_supported = svm_rdtscp_supported, |
| 4320 | .invpcid_supported = svm_invpcid_supported, | ||
| 4315 | 4321 | ||
| 4316 | .set_supported_cpuid = svm_set_supported_cpuid, | 4322 | .set_supported_cpuid = svm_set_supported_cpuid, |
| 4317 | 4323 | ||
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 62d02e3c3ed6..a71faf727ff3 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h | |||
| @@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq, | |||
| 517 | __entry->coalesced ? " (coalesced)" : "") | 517 | __entry->coalesced ? " (coalesced)" : "") |
| 518 | ); | 518 | ); |
| 519 | 519 | ||
| 520 | TRACE_EVENT(kvm_eoi, | ||
| 521 | TP_PROTO(struct kvm_lapic *apic, int vector), | ||
| 522 | TP_ARGS(apic, vector), | ||
| 523 | |||
| 524 | TP_STRUCT__entry( | ||
| 525 | __field( __u32, apicid ) | ||
| 526 | __field( int, vector ) | ||
| 527 | ), | ||
| 528 | |||
| 529 | TP_fast_assign( | ||
| 530 | __entry->apicid = apic->vcpu->vcpu_id; | ||
| 531 | __entry->vector = vector; | ||
| 532 | ), | ||
| 533 | |||
| 534 | TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) | ||
| 535 | ); | ||
| 536 | |||
| 537 | TRACE_EVENT(kvm_pv_eoi, | ||
| 538 | TP_PROTO(struct kvm_lapic *apic, int vector), | ||
| 539 | TP_ARGS(apic, vector), | ||
| 540 | |||
| 541 | TP_STRUCT__entry( | ||
| 542 | __field( __u32, apicid ) | ||
| 543 | __field( int, vector ) | ||
| 544 | ), | ||
| 545 | |||
| 546 | TP_fast_assign( | ||
| 547 | __entry->apicid = apic->vcpu->vcpu_id; | ||
| 548 | __entry->vector = vector; | ||
| 549 | ), | ||
| 550 | |||
| 551 | TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) | ||
| 552 | ); | ||
| 553 | |||
| 520 | /* | 554 | /* |
| 521 | * Tracepoint for nested VMRUN | 555 | * Tracepoint for nested VMRUN |
| 522 | */ | 556 | */ |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32eb58866292..c39b60707e02 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
| @@ -71,7 +71,10 @@ static bool __read_mostly enable_unrestricted_guest = 1; | |||
| 71 | module_param_named(unrestricted_guest, | 71 | module_param_named(unrestricted_guest, |
| 72 | enable_unrestricted_guest, bool, S_IRUGO); | 72 | enable_unrestricted_guest, bool, S_IRUGO); |
| 73 | 73 | ||
| 74 | static bool __read_mostly emulate_invalid_guest_state = 0; | 74 | static bool __read_mostly enable_ept_ad_bits = 1; |
| 75 | module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); | ||
| 76 | |||
| 77 | static bool __read_mostly emulate_invalid_guest_state = true; | ||
| 75 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); | 78 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); |
| 76 | 79 | ||
| 77 | static bool __read_mostly vmm_exclusive = 1; | 80 | static bool __read_mostly vmm_exclusive = 1; |
| @@ -615,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr); | |||
| 615 | static void kvm_cpu_vmxoff(void); | 618 | static void kvm_cpu_vmxoff(void); |
| 616 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); | 619 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); |
| 617 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); | 620 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); |
| 621 | static void vmx_set_segment(struct kvm_vcpu *vcpu, | ||
| 622 | struct kvm_segment *var, int seg); | ||
| 623 | static void vmx_get_segment(struct kvm_vcpu *vcpu, | ||
| 624 | struct kvm_segment *var, int seg); | ||
| 618 | 625 | ||
| 619 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | 626 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); |
| 620 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | 627 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); |
| @@ -789,6 +796,11 @@ static inline bool cpu_has_vmx_ept_4levels(void) | |||
| 789 | return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; | 796 | return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; |
| 790 | } | 797 | } |
| 791 | 798 | ||
| 799 | static inline bool cpu_has_vmx_ept_ad_bits(void) | ||
| 800 | { | ||
| 801 | return vmx_capability.ept & VMX_EPT_AD_BIT; | ||
| 802 | } | ||
| 803 | |||
| 792 | static inline bool cpu_has_vmx_invept_individual_addr(void) | 804 | static inline bool cpu_has_vmx_invept_individual_addr(void) |
| 793 | { | 805 | { |
| 794 | return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; | 806 | return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; |
| @@ -849,6 +861,12 @@ static inline bool cpu_has_vmx_rdtscp(void) | |||
| 849 | SECONDARY_EXEC_RDTSCP; | 861 | SECONDARY_EXEC_RDTSCP; |
| 850 | } | 862 | } |
| 851 | 863 | ||
| 864 | static inline bool cpu_has_vmx_invpcid(void) | ||
| 865 | { | ||
| 866 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
| 867 | SECONDARY_EXEC_ENABLE_INVPCID; | ||
| 868 | } | ||
| 869 | |||
| 852 | static inline bool cpu_has_virtual_nmis(void) | 870 | static inline bool cpu_has_virtual_nmis(void) |
| 853 | { | 871 | { |
| 854 | return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; | 872 | return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; |
| @@ -1739,6 +1757,11 @@ static bool vmx_rdtscp_supported(void) | |||
| 1739 | return cpu_has_vmx_rdtscp(); | 1757 | return cpu_has_vmx_rdtscp(); |
| 1740 | } | 1758 | } |
| 1741 | 1759 | ||
| 1760 | static bool vmx_invpcid_supported(void) | ||
| 1761 | { | ||
| 1762 | return cpu_has_vmx_invpcid() && enable_ept; | ||
| 1763 | } | ||
| 1764 | |||
| 1742 | /* | 1765 | /* |
| 1743 | * Swap MSR entry in host/guest MSR entry array. | 1766 | * Swap MSR entry in host/guest MSR entry array. |
| 1744 | */ | 1767 | */ |
| @@ -2458,7 +2481,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
| 2458 | SECONDARY_EXEC_ENABLE_EPT | | 2481 | SECONDARY_EXEC_ENABLE_EPT | |
| 2459 | SECONDARY_EXEC_UNRESTRICTED_GUEST | | 2482 | SECONDARY_EXEC_UNRESTRICTED_GUEST | |
| 2460 | SECONDARY_EXEC_PAUSE_LOOP_EXITING | | 2483 | SECONDARY_EXEC_PAUSE_LOOP_EXITING | |
| 2461 | SECONDARY_EXEC_RDTSCP; | 2484 | SECONDARY_EXEC_RDTSCP | |
| 2485 | SECONDARY_EXEC_ENABLE_INVPCID; | ||
| 2462 | if (adjust_vmx_controls(min2, opt2, | 2486 | if (adjust_vmx_controls(min2, opt2, |
| 2463 | MSR_IA32_VMX_PROCBASED_CTLS2, | 2487 | MSR_IA32_VMX_PROCBASED_CTLS2, |
| 2464 | &_cpu_based_2nd_exec_control) < 0) | 2488 | &_cpu_based_2nd_exec_control) < 0) |
| @@ -2645,8 +2669,12 @@ static __init int hardware_setup(void) | |||
| 2645 | !cpu_has_vmx_ept_4levels()) { | 2669 | !cpu_has_vmx_ept_4levels()) { |
| 2646 | enable_ept = 0; | 2670 | enable_ept = 0; |
| 2647 | enable_unrestricted_guest = 0; | 2671 | enable_unrestricted_guest = 0; |
| 2672 | enable_ept_ad_bits = 0; | ||
| 2648 | } | 2673 | } |
| 2649 | 2674 | ||
| 2675 | if (!cpu_has_vmx_ept_ad_bits()) | ||
| 2676 | enable_ept_ad_bits = 0; | ||
| 2677 | |||
| 2650 | if (!cpu_has_vmx_unrestricted_guest()) | 2678 | if (!cpu_has_vmx_unrestricted_guest()) |
| 2651 | enable_unrestricted_guest = 0; | 2679 | enable_unrestricted_guest = 0; |
| 2652 | 2680 | ||
| @@ -2770,6 +2798,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
| 2770 | { | 2798 | { |
| 2771 | unsigned long flags; | 2799 | unsigned long flags; |
| 2772 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2800 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
| 2801 | struct kvm_segment var; | ||
| 2773 | 2802 | ||
| 2774 | if (enable_unrestricted_guest) | 2803 | if (enable_unrestricted_guest) |
| 2775 | return; | 2804 | return; |
| @@ -2813,20 +2842,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
| 2813 | if (emulate_invalid_guest_state) | 2842 | if (emulate_invalid_guest_state) |
| 2814 | goto continue_rmode; | 2843 | goto continue_rmode; |
| 2815 | 2844 | ||
| 2816 | vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); | 2845 | vmx_get_segment(vcpu, &var, VCPU_SREG_SS); |
| 2817 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); | 2846 | vmx_set_segment(vcpu, &var, VCPU_SREG_SS); |
| 2818 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); | 2847 | |
| 2848 | vmx_get_segment(vcpu, &var, VCPU_SREG_CS); | ||
| 2849 | vmx_set_segment(vcpu, &var, VCPU_SREG_CS); | ||
| 2850 | |||
| 2851 | vmx_get_segment(vcpu, &var, VCPU_SREG_ES); | ||
| 2852 | vmx_set_segment(vcpu, &var, VCPU_SREG_ES); | ||
| 2853 | |||
| 2854 | vmx_get_segment(vcpu, &var, VCPU_SREG_DS); | ||
| 2855 | vmx_set_segment(vcpu, &var, VCPU_SREG_DS); | ||
| 2819 | 2856 | ||
| 2820 | vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); | 2857 | vmx_get_segment(vcpu, &var, VCPU_SREG_GS); |
| 2821 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | 2858 | vmx_set_segment(vcpu, &var, VCPU_SREG_GS); |
| 2822 | if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) | ||
| 2823 | vmcs_writel(GUEST_CS_BASE, 0xf0000); | ||
| 2824 | vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); | ||
| 2825 | 2859 | ||
| 2826 | fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); | 2860 | vmx_get_segment(vcpu, &var, VCPU_SREG_FS); |
| 2827 | fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); | 2861 | vmx_set_segment(vcpu, &var, VCPU_SREG_FS); |
| 2828 | fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); | ||
| 2829 | fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); | ||
| 2830 | 2862 | ||
| 2831 | continue_rmode: | 2863 | continue_rmode: |
| 2832 | kvm_mmu_reset_context(vcpu); | 2864 | kvm_mmu_reset_context(vcpu); |
| @@ -3027,6 +3059,8 @@ static u64 construct_eptp(unsigned long root_hpa) | |||
| 3027 | /* TODO write the value reading from MSR */ | 3059 | /* TODO write the value reading from MSR */ |
| 3028 | eptp = VMX_EPT_DEFAULT_MT | | 3060 | eptp = VMX_EPT_DEFAULT_MT | |
| 3029 | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; | 3061 | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; |
| 3062 | if (enable_ept_ad_bits) | ||
| 3063 | eptp |= VMX_EPT_AD_ENABLE_BIT; | ||
| 3030 | eptp |= (root_hpa & PAGE_MASK); | 3064 | eptp |= (root_hpa & PAGE_MASK); |
| 3031 | 3065 | ||
| 3032 | return eptp; | 3066 | return eptp; |
| @@ -3153,11 +3187,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu) | |||
| 3153 | 3187 | ||
| 3154 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) | 3188 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) |
| 3155 | { | 3189 | { |
| 3190 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
| 3191 | |||
| 3192 | /* | ||
| 3193 | * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations | ||
| 3194 | * fail; use the cache instead. | ||
| 3195 | */ | ||
| 3196 | if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) { | ||
| 3197 | return vmx->cpl; | ||
| 3198 | } | ||
| 3199 | |||
| 3156 | if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { | 3200 | if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { |
| 3157 | __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | 3201 | __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); |
| 3158 | to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); | 3202 | vmx->cpl = __vmx_get_cpl(vcpu); |
| 3159 | } | 3203 | } |
| 3160 | return to_vmx(vcpu)->cpl; | 3204 | |
| 3205 | return vmx->cpl; | ||
| 3161 | } | 3206 | } |
| 3162 | 3207 | ||
| 3163 | 3208 | ||
| @@ -3165,7 +3210,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) | |||
| 3165 | { | 3210 | { |
| 3166 | u32 ar; | 3211 | u32 ar; |
| 3167 | 3212 | ||
| 3168 | if (var->unusable) | 3213 | if (var->unusable || !var->present) |
| 3169 | ar = 1 << 16; | 3214 | ar = 1 << 16; |
| 3170 | else { | 3215 | else { |
| 3171 | ar = var->type & 15; | 3216 | ar = var->type & 15; |
| @@ -3177,8 +3222,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) | |||
| 3177 | ar |= (var->db & 1) << 14; | 3222 | ar |= (var->db & 1) << 14; |
| 3178 | ar |= (var->g & 1) << 15; | 3223 | ar |= (var->g & 1) << 15; |
| 3179 | } | 3224 | } |
| 3180 | if (ar == 0) /* a 0 value means unusable */ | ||
| 3181 | ar = AR_UNUSABLE_MASK; | ||
| 3182 | 3225 | ||
| 3183 | return ar; | 3226 | return ar; |
| 3184 | } | 3227 | } |
| @@ -3229,6 +3272,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
| 3229 | 3272 | ||
| 3230 | vmcs_write32(sf->ar_bytes, ar); | 3273 | vmcs_write32(sf->ar_bytes, ar); |
| 3231 | __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | 3274 | __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); |
| 3275 | |||
| 3276 | /* | ||
| 3277 | * Fix segments for real mode guest in hosts that don't have | ||
| 3278 | * "unrestricted_mode" or it was disabled. | ||
| 3279 | * This is done to allow migration of the guests from hosts with | ||
| 3280 | * unrestricted guest like Westmere to older host that don't have | ||
| 3281 | * unrestricted guest like Nehelem. | ||
| 3282 | */ | ||
| 3283 | if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { | ||
| 3284 | switch (seg) { | ||
| 3285 | case VCPU_SREG_CS: | ||
| 3286 | vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); | ||
| 3287 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
| 3288 | if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) | ||
| 3289 | vmcs_writel(GUEST_CS_BASE, 0xf0000); | ||
| 3290 | vmcs_write16(GUEST_CS_SELECTOR, | ||
| 3291 | vmcs_readl(GUEST_CS_BASE) >> 4); | ||
| 3292 | break; | ||
| 3293 | case VCPU_SREG_ES: | ||
| 3294 | fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); | ||
| 3295 | break; | ||
| 3296 | case VCPU_SREG_DS: | ||
| 3297 | fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); | ||
| 3298 | break; | ||
| 3299 | case VCPU_SREG_GS: | ||
| 3300 | fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); | ||
| 3301 | break; | ||
| 3302 | case VCPU_SREG_FS: | ||
| 3303 | fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); | ||
| 3304 | break; | ||
| 3305 | case VCPU_SREG_SS: | ||
| 3306 | vmcs_write16(GUEST_SS_SELECTOR, | ||
| 3307 | vmcs_readl(GUEST_SS_BASE) >> 4); | ||
| 3308 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); | ||
| 3309 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); | ||
| 3310 | break; | ||
| 3311 | } | ||
| 3312 | } | ||
| 3232 | } | 3313 | } |
| 3233 | 3314 | ||
| 3234 | static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | 3315 | static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) |
| @@ -3731,6 +3812,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) | |||
| 3731 | if (!enable_ept) { | 3812 | if (!enable_ept) { |
| 3732 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | 3813 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; |
| 3733 | enable_unrestricted_guest = 0; | 3814 | enable_unrestricted_guest = 0; |
| 3815 | /* Enable INVPCID for non-ept guests may cause performance regression. */ | ||
| 3816 | exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; | ||
| 3734 | } | 3817 | } |
| 3735 | if (!enable_unrestricted_guest) | 3818 | if (!enable_unrestricted_guest) |
| 3736 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | 3819 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; |
| @@ -4489,7 +4572,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
| 4489 | break; | 4572 | break; |
| 4490 | } | 4573 | } |
| 4491 | vcpu->run->exit_reason = 0; | 4574 | vcpu->run->exit_reason = 0; |
| 4492 | pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", | 4575 | vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", |
| 4493 | (int)(exit_qualification >> 4) & 3, cr); | 4576 | (int)(exit_qualification >> 4) & 3, cr); |
| 4494 | return 0; | 4577 | return 0; |
| 4495 | } | 4578 | } |
| @@ -4769,6 +4852,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) | |||
| 4769 | { | 4852 | { |
| 4770 | unsigned long exit_qualification; | 4853 | unsigned long exit_qualification; |
| 4771 | gpa_t gpa; | 4854 | gpa_t gpa; |
| 4855 | u32 error_code; | ||
| 4772 | int gla_validity; | 4856 | int gla_validity; |
| 4773 | 4857 | ||
| 4774 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 4858 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
| @@ -4793,7 +4877,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) | |||
| 4793 | 4877 | ||
| 4794 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | 4878 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); |
| 4795 | trace_kvm_page_fault(gpa, exit_qualification); | 4879 | trace_kvm_page_fault(gpa, exit_qualification); |
| 4796 | return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); | 4880 | |
| 4881 | /* It is a write fault? */ | ||
| 4882 | error_code = exit_qualification & (1U << 1); | ||
| 4883 | /* ept page table is present? */ | ||
| 4884 | error_code |= (exit_qualification >> 3) & 0x1; | ||
| 4885 | |||
| 4886 | return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); | ||
| 4797 | } | 4887 | } |
| 4798 | 4888 | ||
| 4799 | static u64 ept_rsvd_mask(u64 spte, int level) | 4889 | static u64 ept_rsvd_mask(u64 spte, int level) |
| @@ -4908,15 +4998,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
| 4908 | int ret = 1; | 4998 | int ret = 1; |
| 4909 | u32 cpu_exec_ctrl; | 4999 | u32 cpu_exec_ctrl; |
| 4910 | bool intr_window_requested; | 5000 | bool intr_window_requested; |
| 5001 | unsigned count = 130; | ||
| 4911 | 5002 | ||
| 4912 | cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 5003 | cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
| 4913 | intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; | 5004 | intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; |
| 4914 | 5005 | ||
| 4915 | while (!guest_state_valid(vcpu)) { | 5006 | while (!guest_state_valid(vcpu) && count-- != 0) { |
| 4916 | if (intr_window_requested | 5007 | if (intr_window_requested && vmx_interrupt_allowed(vcpu)) |
| 4917 | && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) | ||
| 4918 | return handle_interrupt_window(&vmx->vcpu); | 5008 | return handle_interrupt_window(&vmx->vcpu); |
| 4919 | 5009 | ||
| 5010 | if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) | ||
| 5011 | return 1; | ||
| 5012 | |||
| 4920 | err = emulate_instruction(vcpu, 0); | 5013 | err = emulate_instruction(vcpu, 0); |
| 4921 | 5014 | ||
| 4922 | if (err == EMULATE_DO_MMIO) { | 5015 | if (err == EMULATE_DO_MMIO) { |
| @@ -4924,8 +5017,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
| 4924 | goto out; | 5017 | goto out; |
| 4925 | } | 5018 | } |
| 4926 | 5019 | ||
| 4927 | if (err != EMULATE_DONE) | 5020 | if (err != EMULATE_DONE) { |
| 5021 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | ||
| 5022 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; | ||
| 5023 | vcpu->run->internal.ndata = 0; | ||
| 4928 | return 0; | 5024 | return 0; |
| 5025 | } | ||
| 4929 | 5026 | ||
| 4930 | if (signal_pending(current)) | 5027 | if (signal_pending(current)) |
| 4931 | goto out; | 5028 | goto out; |
| @@ -4933,7 +5030,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
| 4933 | schedule(); | 5030 | schedule(); |
| 4934 | } | 5031 | } |
| 4935 | 5032 | ||
| 4936 | vmx->emulation_required = 0; | 5033 | vmx->emulation_required = !guest_state_valid(vcpu); |
| 4937 | out: | 5034 | out: |
| 4938 | return ret; | 5035 | return ret; |
| 4939 | } | 5036 | } |
| @@ -6467,6 +6564,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) | |||
| 6467 | } | 6564 | } |
| 6468 | } | 6565 | } |
| 6469 | } | 6566 | } |
| 6567 | |||
| 6568 | exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); | ||
| 6569 | /* Exposing INVPCID only when PCID is exposed */ | ||
| 6570 | best = kvm_find_cpuid_entry(vcpu, 0x7, 0); | ||
| 6571 | if (vmx_invpcid_supported() && | ||
| 6572 | best && (best->ecx & bit(X86_FEATURE_INVPCID)) && | ||
| 6573 | guest_cpuid_has_pcid(vcpu)) { | ||
| 6574 | exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; | ||
| 6575 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, | ||
| 6576 | exec_control); | ||
| 6577 | } else { | ||
| 6578 | exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; | ||
| 6579 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, | ||
| 6580 | exec_control); | ||
| 6581 | if (best) | ||
| 6582 | best->ecx &= ~bit(X86_FEATURE_INVPCID); | ||
| 6583 | } | ||
| 6470 | } | 6584 | } |
| 6471 | 6585 | ||
| 6472 | static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | 6586 | static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) |
| @@ -7201,6 +7315,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
| 7201 | .cpuid_update = vmx_cpuid_update, | 7315 | .cpuid_update = vmx_cpuid_update, |
| 7202 | 7316 | ||
| 7203 | .rdtscp_supported = vmx_rdtscp_supported, | 7317 | .rdtscp_supported = vmx_rdtscp_supported, |
| 7318 | .invpcid_supported = vmx_invpcid_supported, | ||
| 7204 | 7319 | ||
| 7205 | .set_supported_cpuid = vmx_set_supported_cpuid, | 7320 | .set_supported_cpuid = vmx_set_supported_cpuid, |
| 7206 | 7321 | ||
| @@ -7230,23 +7345,21 @@ static int __init vmx_init(void) | |||
| 7230 | if (!vmx_io_bitmap_a) | 7345 | if (!vmx_io_bitmap_a) |
| 7231 | return -ENOMEM; | 7346 | return -ENOMEM; |
| 7232 | 7347 | ||
| 7348 | r = -ENOMEM; | ||
| 7349 | |||
| 7233 | vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); | 7350 | vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); |
| 7234 | if (!vmx_io_bitmap_b) { | 7351 | if (!vmx_io_bitmap_b) |
| 7235 | r = -ENOMEM; | ||
| 7236 | goto out; | 7352 | goto out; |
| 7237 | } | ||
| 7238 | 7353 | ||
| 7239 | vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); | 7354 | vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); |
| 7240 | if (!vmx_msr_bitmap_legacy) { | 7355 | if (!vmx_msr_bitmap_legacy) |
| 7241 | r = -ENOMEM; | ||
| 7242 | goto out1; | 7356 | goto out1; |
| 7243 | } | 7357 | |
| 7244 | 7358 | ||
| 7245 | vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); | 7359 | vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); |
| 7246 | if (!vmx_msr_bitmap_longmode) { | 7360 | if (!vmx_msr_bitmap_longmode) |
| 7247 | r = -ENOMEM; | ||
| 7248 | goto out2; | 7361 | goto out2; |
| 7249 | } | 7362 | |
| 7250 | 7363 | ||
| 7251 | /* | 7364 | /* |
| 7252 | * Allow direct access to the PC debug port (it is often used for I/O | 7365 | * Allow direct access to the PC debug port (it is often used for I/O |
| @@ -7275,8 +7388,10 @@ static int __init vmx_init(void) | |||
| 7275 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); | 7388 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); |
| 7276 | 7389 | ||
| 7277 | if (enable_ept) { | 7390 | if (enable_ept) { |
| 7278 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, | 7391 | kvm_mmu_set_mask_ptes(0ull, |
| 7279 | VMX_EPT_EXECUTABLE_MASK); | 7392 | (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, |
| 7393 | (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, | ||
| 7394 | 0ull, VMX_EPT_EXECUTABLE_MASK); | ||
| 7280 | ept_set_mmio_spte_mask(); | 7395 | ept_set_mmio_spte_mask(); |
| 7281 | kvm_enable_tdp(); | 7396 | kvm_enable_tdp(); |
| 7282 | } else | 7397 | } else |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be6d54929fa7..59b59508ff07 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
| @@ -528,6 +528,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
| 528 | return 1; | 528 | return 1; |
| 529 | } | 529 | } |
| 530 | 530 | ||
| 531 | if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) | ||
| 532 | return 1; | ||
| 533 | |||
| 531 | kvm_x86_ops->set_cr0(vcpu, cr0); | 534 | kvm_x86_ops->set_cr0(vcpu, cr0); |
| 532 | 535 | ||
| 533 | if ((cr0 ^ old_cr0) & X86_CR0_PG) { | 536 | if ((cr0 ^ old_cr0) & X86_CR0_PG) { |
| @@ -604,10 +607,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
| 604 | kvm_read_cr3(vcpu))) | 607 | kvm_read_cr3(vcpu))) |
| 605 | return 1; | 608 | return 1; |
| 606 | 609 | ||
| 610 | if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { | ||
| 611 | if (!guest_cpuid_has_pcid(vcpu)) | ||
| 612 | return 1; | ||
| 613 | |||
| 614 | /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ | ||
| 615 | if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) | ||
| 616 | return 1; | ||
| 617 | } | ||
| 618 | |||
| 607 | if (kvm_x86_ops->set_cr4(vcpu, cr4)) | 619 | if (kvm_x86_ops->set_cr4(vcpu, cr4)) |
| 608 | return 1; | 620 | return 1; |
| 609 | 621 | ||
| 610 | if ((cr4 ^ old_cr4) & pdptr_bits) | 622 | if (((cr4 ^ old_cr4) & pdptr_bits) || |
| 623 | (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) | ||
| 611 | kvm_mmu_reset_context(vcpu); | 624 | kvm_mmu_reset_context(vcpu); |
| 612 | 625 | ||
| 613 | if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) | 626 | if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) |
| @@ -626,8 +639,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
| 626 | } | 639 | } |
| 627 | 640 | ||
| 628 | if (is_long_mode(vcpu)) { | 641 | if (is_long_mode(vcpu)) { |
| 629 | if (cr3 & CR3_L_MODE_RESERVED_BITS) | 642 | if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { |
| 630 | return 1; | 643 | if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) |
| 644 | return 1; | ||
| 645 | } else | ||
| 646 | if (cr3 & CR3_L_MODE_RESERVED_BITS) | ||
| 647 | return 1; | ||
| 631 | } else { | 648 | } else { |
| 632 | if (is_pae(vcpu)) { | 649 | if (is_pae(vcpu)) { |
| 633 | if (cr3 & CR3_PAE_RESERVED_BITS) | 650 | if (cr3 & CR3_PAE_RESERVED_BITS) |
| @@ -795,6 +812,7 @@ static u32 msrs_to_save[] = { | |||
| 795 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, | 812 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, |
| 796 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, | 813 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, |
| 797 | HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, | 814 | HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, |
| 815 | MSR_KVM_PV_EOI_EN, | ||
| 798 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | 816 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, |
| 799 | MSR_STAR, | 817 | MSR_STAR, |
| 800 | #ifdef CONFIG_X86_64 | 818 | #ifdef CONFIG_X86_64 |
| @@ -1437,8 +1455,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
| 1437 | break; | 1455 | break; |
| 1438 | } | 1456 | } |
| 1439 | default: | 1457 | default: |
| 1440 | pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " | 1458 | vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " |
| 1441 | "data 0x%llx\n", msr, data); | 1459 | "data 0x%llx\n", msr, data); |
| 1442 | return 1; | 1460 | return 1; |
| 1443 | } | 1461 | } |
| 1444 | return 0; | 1462 | return 0; |
| @@ -1470,8 +1488,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
| 1470 | case HV_X64_MSR_TPR: | 1488 | case HV_X64_MSR_TPR: |
| 1471 | return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); | 1489 | return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); |
| 1472 | default: | 1490 | default: |
| 1473 | pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " | 1491 | vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " |
| 1474 | "data 0x%llx\n", msr, data); | 1492 | "data 0x%llx\n", msr, data); |
| 1475 | return 1; | 1493 | return 1; |
| 1476 | } | 1494 | } |
| 1477 | 1495 | ||
| @@ -1551,15 +1569,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
| 1551 | data &= ~(u64)0x100; /* ignore ignne emulation enable */ | 1569 | data &= ~(u64)0x100; /* ignore ignne emulation enable */ |
| 1552 | data &= ~(u64)0x8; /* ignore TLB cache disable */ | 1570 | data &= ~(u64)0x8; /* ignore TLB cache disable */ |
| 1553 | if (data != 0) { | 1571 | if (data != 0) { |
| 1554 | pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", | 1572 | vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", |
| 1555 | data); | 1573 | data); |
| 1556 | return 1; | 1574 | return 1; |
| 1557 | } | 1575 | } |
| 1558 | break; | 1576 | break; |
| 1559 | case MSR_FAM10H_MMIO_CONF_BASE: | 1577 | case MSR_FAM10H_MMIO_CONF_BASE: |
| 1560 | if (data != 0) { | 1578 | if (data != 0) { |
| 1561 | pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " | 1579 | vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " |
| 1562 | "0x%llx\n", data); | 1580 | "0x%llx\n", data); |
| 1563 | return 1; | 1581 | return 1; |
| 1564 | } | 1582 | } |
| 1565 | break; | 1583 | break; |
| @@ -1574,8 +1592,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
| 1574 | thus reserved and should throw a #GP */ | 1592 | thus reserved and should throw a #GP */ |
| 1575 | return 1; | 1593 | return 1; |
| 1576 | } | 1594 | } |
| 1577 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", | 1595 | vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", |
| 1578 | __func__, data); | 1596 | __func__, data); |
| 1579 | break; | 1597 | break; |
| 1580 | case MSR_IA32_UCODE_REV: | 1598 | case MSR_IA32_UCODE_REV: |
| 1581 | case MSR_IA32_UCODE_WRITE: | 1599 | case MSR_IA32_UCODE_WRITE: |
| @@ -1653,6 +1671,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
| 1653 | kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); | 1671 | kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); |
| 1654 | 1672 | ||
| 1655 | break; | 1673 | break; |
| 1674 | case MSR_KVM_PV_EOI_EN: | ||
| 1675 | if (kvm_lapic_enable_pv_eoi(vcpu, data)) | ||
| 1676 | return 1; | ||
| 1677 | break; | ||
| 1656 | 1678 | ||
| 1657 | case MSR_IA32_MCG_CTL: | 1679 | case MSR_IA32_MCG_CTL: |
| 1658 | case MSR_IA32_MCG_STATUS: | 1680 | case MSR_IA32_MCG_STATUS: |
| @@ -1671,8 +1693,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
| 1671 | case MSR_K7_EVNTSEL2: | 1693 | case MSR_K7_EVNTSEL2: |
| 1672 | case MSR_K7_EVNTSEL3: | 1694 | case MSR_K7_EVNTSEL3: |
| 1673 | if (data != 0) | 1695 | if (data != 0) |
| 1674 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " | 1696 | vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " |
| 1675 | "0x%x data 0x%llx\n", msr, data); | 1697 | "0x%x data 0x%llx\n", msr, data); |
| 1676 | break; | 1698 | break; |
| 1677 | /* at least RHEL 4 unconditionally writes to the perfctr registers, | 1699 | /* at least RHEL 4 unconditionally writes to the perfctr registers, |
| 1678 | * so we ignore writes to make it happy. | 1700 | * so we ignore writes to make it happy. |
| @@ -1681,8 +1703,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
| 1681 | case MSR_K7_PERFCTR1: | 1703 | case MSR_K7_PERFCTR1: |
| 1682 | case MSR_K7_PERFCTR2: | 1704 | case MSR_K7_PERFCTR2: |
| 1683 | case MSR_K7_PERFCTR3: | 1705 | case MSR_K7_PERFCTR3: |
| 1684 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " | 1706 | vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " |
| 1685 | "0x%x data 0x%llx\n", msr, data); | 1707 | "0x%x data 0x%llx\n", msr, data); |
| 1686 | break; | 1708 | break; |
| 1687 | case MSR_P6_PERFCTR0: | 1709 | case MSR_P6_PERFCTR0: |
| 1688 | case MSR_P6_PERFCTR1: | 1710 | case MSR_P6_PERFCTR1: |
| @@ -1693,8 +1715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
| 1693 | return kvm_pmu_set_msr(vcpu, msr, data); | 1715 | return kvm_pmu_set_msr(vcpu, msr, data); |
| 1694 | 1716 | ||
| 1695 | if (pr || data != 0) | 1717 | if (pr || data != 0) |
| 1696 | pr_unimpl(vcpu, "disabled perfctr wrmsr: " | 1718 | vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " |
| 1697 | "0x%x data 0x%llx\n", msr, data); | 1719 | "0x%x data 0x%llx\n", msr, data); |
| 1698 | break; | 1720 | break; |
| 1699 | case MSR_K7_CLK_CTL: | 1721 | case MSR_K7_CLK_CTL: |
| 1700 | /* | 1722 | /* |
| @@ -1720,7 +1742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
| 1720 | /* Drop writes to this legacy MSR -- see rdmsr | 1742 | /* Drop writes to this legacy MSR -- see rdmsr |
| 1721 | * counterpart for further detail. | 1743 | * counterpart for further detail. |
| 1722 | */ | 1744 | */ |
| 1723 | pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); | 1745 | vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); |
| 1724 | break; | 1746 | break; |
| 1725 | case MSR_AMD64_OSVW_ID_LENGTH: | 1747 | case MSR_AMD64_OSVW_ID_LENGTH: |
| 1726 | if (!guest_cpuid_has_osvw(vcpu)) | 1748 | if (!guest_cpuid_has_osvw(vcpu)) |
| @@ -1738,12 +1760,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
| 1738 | if (kvm_pmu_msr(vcpu, msr)) | 1760 | if (kvm_pmu_msr(vcpu, msr)) |
| 1739 | return kvm_pmu_set_msr(vcpu, msr, data); | 1761 | return kvm_pmu_set_msr(vcpu, msr, data); |
| 1740 | if (!ignore_msrs) { | 1762 | if (!ignore_msrs) { |
| 1741 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", | 1763 | vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", |
| 1742 | msr, data); | 1764 | msr, data); |
| 1743 | return 1; | 1765 | return 1; |
| 1744 | } else { | 1766 | } else { |
| 1745 | pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", | 1767 | vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", |
| 1746 | msr, data); | 1768 | msr, data); |
| 1747 | break; | 1769 | break; |
| 1748 | } | 1770 | } |
| 1749 | } | 1771 | } |
| @@ -1846,7 +1868,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
| 1846 | data = kvm->arch.hv_hypercall; | 1868 | data = kvm->arch.hv_hypercall; |
| 1847 | break; | 1869 | break; |
| 1848 | default: | 1870 | default: |
| 1849 | pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); | 1871 | vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); |
| 1850 | return 1; | 1872 | return 1; |
| 1851 | } | 1873 | } |
| 1852 | 1874 | ||
| @@ -1877,7 +1899,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
| 1877 | data = vcpu->arch.hv_vapic; | 1899 | data = vcpu->arch.hv_vapic; |
| 1878 | break; | 1900 | break; |
| 1879 | default: | 1901 | default: |
| 1880 | pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); | 1902 | vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); |
| 1881 | return 1; | 1903 | return 1; |
| 1882 | } | 1904 | } |
| 1883 | *pdata = data; | 1905 | *pdata = data; |
| @@ -2030,10 +2052,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
| 2030 | if (kvm_pmu_msr(vcpu, msr)) | 2052 | if (kvm_pmu_msr(vcpu, msr)) |
| 2031 | return kvm_pmu_get_msr(vcpu, msr, pdata); | 2053 | return kvm_pmu_get_msr(vcpu, msr, pdata); |
| 2032 | if (!ignore_msrs) { | 2054 | if (!ignore_msrs) { |
| 2033 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); | 2055 | vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); |
| 2034 | return 1; | 2056 | return 1; |
| 2035 | } else { | 2057 | } else { |
| 2036 | pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); | 2058 | vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); |
| 2037 | data = 0; | 2059 | data = 0; |
| 2038 | } | 2060 | } |
| 2039 | break; | 2061 | break; |
| @@ -4116,7 +4138,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) | |||
| 4116 | value = kvm_get_cr8(vcpu); | 4138 | value = kvm_get_cr8(vcpu); |
| 4117 | break; | 4139 | break; |
| 4118 | default: | 4140 | default: |
| 4119 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); | 4141 | kvm_err("%s: unexpected cr %u\n", __func__, cr); |
| 4120 | return 0; | 4142 | return 0; |
| 4121 | } | 4143 | } |
| 4122 | 4144 | ||
| @@ -4145,7 +4167,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) | |||
| 4145 | res = kvm_set_cr8(vcpu, val); | 4167 | res = kvm_set_cr8(vcpu, val); |
| 4146 | break; | 4168 | break; |
| 4147 | default: | 4169 | default: |
| 4148 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); | 4170 | kvm_err("%s: unexpected cr %u\n", __func__, cr); |
| 4149 | res = -1; | 4171 | res = -1; |
| 4150 | } | 4172 | } |
| 4151 | 4173 | ||
| @@ -4297,26 +4319,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, | |||
| 4297 | return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); | 4319 | return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); |
| 4298 | } | 4320 | } |
| 4299 | 4321 | ||
| 4300 | static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, | 4322 | static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, |
| 4301 | u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) | 4323 | u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) |
| 4302 | { | 4324 | { |
| 4303 | struct kvm_cpuid_entry2 *cpuid = NULL; | 4325 | kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); |
| 4304 | |||
| 4305 | if (eax && ecx) | ||
| 4306 | cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt), | ||
| 4307 | *eax, *ecx); | ||
| 4308 | |||
| 4309 | if (cpuid) { | ||
| 4310 | *eax = cpuid->eax; | ||
| 4311 | *ecx = cpuid->ecx; | ||
| 4312 | if (ebx) | ||
| 4313 | *ebx = cpuid->ebx; | ||
| 4314 | if (edx) | ||
| 4315 | *edx = cpuid->edx; | ||
| 4316 | return true; | ||
| 4317 | } | ||
| 4318 | |||
| 4319 | return false; | ||
| 4320 | } | 4326 | } |
| 4321 | 4327 | ||
| 4322 | static struct x86_emulate_ops emulate_ops = { | 4328 | static struct x86_emulate_ops emulate_ops = { |
| @@ -5296,8 +5302,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
| 5296 | 5302 | ||
| 5297 | r = kvm_mmu_reload(vcpu); | 5303 | r = kvm_mmu_reload(vcpu); |
| 5298 | if (unlikely(r)) { | 5304 | if (unlikely(r)) { |
| 5299 | kvm_x86_ops->cancel_injection(vcpu); | 5305 | goto cancel_injection; |
| 5300 | goto out; | ||
| 5301 | } | 5306 | } |
| 5302 | 5307 | ||
| 5303 | preempt_disable(); | 5308 | preempt_disable(); |
| @@ -5322,9 +5327,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
| 5322 | smp_wmb(); | 5327 | smp_wmb(); |
| 5323 | local_irq_enable(); | 5328 | local_irq_enable(); |
| 5324 | preempt_enable(); | 5329 | preempt_enable(); |
| 5325 | kvm_x86_ops->cancel_injection(vcpu); | ||
| 5326 | r = 1; | 5330 | r = 1; |
| 5327 | goto out; | 5331 | goto cancel_injection; |
| 5328 | } | 5332 | } |
| 5329 | 5333 | ||
| 5330 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | 5334 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); |
| @@ -5388,9 +5392,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
| 5388 | if (unlikely(vcpu->arch.tsc_always_catchup)) | 5392 | if (unlikely(vcpu->arch.tsc_always_catchup)) |
| 5389 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | 5393 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
| 5390 | 5394 | ||
| 5391 | kvm_lapic_sync_from_vapic(vcpu); | 5395 | if (vcpu->arch.apic_attention) |
| 5396 | kvm_lapic_sync_from_vapic(vcpu); | ||
| 5392 | 5397 | ||
| 5393 | r = kvm_x86_ops->handle_exit(vcpu); | 5398 | r = kvm_x86_ops->handle_exit(vcpu); |
| 5399 | return r; | ||
| 5400 | |||
| 5401 | cancel_injection: | ||
| 5402 | kvm_x86_ops->cancel_injection(vcpu); | ||
| 5403 | if (unlikely(vcpu->arch.apic_attention)) | ||
| 5404 | kvm_lapic_sync_from_vapic(vcpu); | ||
| 5394 | out: | 5405 | out: |
| 5395 | return r; | 5406 | return r; |
| 5396 | } | 5407 | } |
| @@ -6304,7 +6315,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, | |||
| 6304 | 6315 | ||
| 6305 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | 6316 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { |
| 6306 | if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { | 6317 | if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { |
| 6307 | vfree(free->arch.lpage_info[i]); | 6318 | kvm_kvfree(free->arch.lpage_info[i]); |
| 6308 | free->arch.lpage_info[i] = NULL; | 6319 | free->arch.lpage_info[i] = NULL; |
| 6309 | } | 6320 | } |
| 6310 | } | 6321 | } |
| @@ -6323,7 +6334,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) | |||
| 6323 | slot->base_gfn, level) + 1; | 6334 | slot->base_gfn, level) + 1; |
| 6324 | 6335 | ||
| 6325 | slot->arch.lpage_info[i] = | 6336 | slot->arch.lpage_info[i] = |
| 6326 | vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); | 6337 | kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); |
| 6327 | if (!slot->arch.lpage_info[i]) | 6338 | if (!slot->arch.lpage_info[i]) |
| 6328 | goto out_free; | 6339 | goto out_free; |
| 6329 | 6340 | ||
| @@ -6350,7 +6361,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) | |||
| 6350 | 6361 | ||
| 6351 | out_free: | 6362 | out_free: |
| 6352 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | 6363 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { |
| 6353 | vfree(slot->arch.lpage_info[i]); | 6364 | kvm_kvfree(slot->arch.lpage_info[i]); |
| 6354 | slot->arch.lpage_info[i] = NULL; | 6365 | slot->arch.lpage_info[i] = NULL; |
| 6355 | } | 6366 | } |
| 6356 | return -ENOMEM; | 6367 | return -ENOMEM; |
diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c index 30f29a0020a1..3fcc000efc53 100644 --- a/drivers/s390/char/sclp.c +++ b/drivers/s390/char/sclp.c | |||
| @@ -654,16 +654,6 @@ sclp_remove_processed(struct sccb_header *sccb) | |||
| 654 | 654 | ||
| 655 | EXPORT_SYMBOL(sclp_remove_processed); | 655 | EXPORT_SYMBOL(sclp_remove_processed); |
| 656 | 656 | ||
| 657 | struct init_sccb { | ||
| 658 | struct sccb_header header; | ||
| 659 | u16 _reserved; | ||
| 660 | u16 mask_length; | ||
| 661 | sccb_mask_t receive_mask; | ||
| 662 | sccb_mask_t send_mask; | ||
| 663 | sccb_mask_t sclp_receive_mask; | ||
| 664 | sccb_mask_t sclp_send_mask; | ||
| 665 | } __attribute__((packed)); | ||
| 666 | |||
| 667 | /* Prepare init mask request. Called while sclp_lock is locked. */ | 657 | /* Prepare init mask request. Called while sclp_lock is locked. */ |
| 668 | static inline void | 658 | static inline void |
| 669 | __sclp_make_init_req(u32 receive_mask, u32 send_mask) | 659 | __sclp_make_init_req(u32 receive_mask, u32 send_mask) |
diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h index 49a1bb52bc87..d7e97ae9ef6d 100644 --- a/drivers/s390/char/sclp.h +++ b/drivers/s390/char/sclp.h | |||
| @@ -88,6 +88,16 @@ struct sccb_header { | |||
| 88 | u16 response_code; | 88 | u16 response_code; |
| 89 | } __attribute__((packed)); | 89 | } __attribute__((packed)); |
| 90 | 90 | ||
| 91 | struct init_sccb { | ||
| 92 | struct sccb_header header; | ||
| 93 | u16 _reserved; | ||
| 94 | u16 mask_length; | ||
| 95 | sccb_mask_t receive_mask; | ||
| 96 | sccb_mask_t send_mask; | ||
| 97 | sccb_mask_t sclp_receive_mask; | ||
| 98 | sccb_mask_t sclp_send_mask; | ||
| 99 | } __attribute__((packed)); | ||
| 100 | |||
| 91 | extern u64 sclp_facilities; | 101 | extern u64 sclp_facilities; |
| 92 | #define SCLP_HAS_CHP_INFO (sclp_facilities & 0x8000000000000000ULL) | 102 | #define SCLP_HAS_CHP_INFO (sclp_facilities & 0x8000000000000000ULL) |
| 93 | #define SCLP_HAS_CHP_RECONFIG (sclp_facilities & 0x2000000000000000ULL) | 103 | #define SCLP_HAS_CHP_RECONFIG (sclp_facilities & 0x2000000000000000ULL) |
diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 766cb7b19b40..71ea923c322d 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c | |||
| @@ -48,6 +48,7 @@ struct read_info_sccb { | |||
| 48 | u8 _reserved5[4096 - 112]; /* 112-4095 */ | 48 | u8 _reserved5[4096 - 112]; /* 112-4095 */ |
| 49 | } __attribute__((packed, aligned(PAGE_SIZE))); | 49 | } __attribute__((packed, aligned(PAGE_SIZE))); |
| 50 | 50 | ||
| 51 | static struct init_sccb __initdata early_event_mask_sccb __aligned(PAGE_SIZE); | ||
| 51 | static struct read_info_sccb __initdata early_read_info_sccb; | 52 | static struct read_info_sccb __initdata early_read_info_sccb; |
| 52 | static int __initdata early_read_info_sccb_valid; | 53 | static int __initdata early_read_info_sccb_valid; |
| 53 | 54 | ||
| @@ -104,6 +105,19 @@ static void __init sclp_read_info_early(void) | |||
| 104 | } | 105 | } |
| 105 | } | 106 | } |
| 106 | 107 | ||
| 108 | static void __init sclp_event_mask_early(void) | ||
| 109 | { | ||
| 110 | struct init_sccb *sccb = &early_event_mask_sccb; | ||
| 111 | int rc; | ||
| 112 | |||
| 113 | do { | ||
| 114 | memset(sccb, 0, sizeof(*sccb)); | ||
| 115 | sccb->header.length = sizeof(*sccb); | ||
| 116 | sccb->mask_length = sizeof(sccb_mask_t); | ||
| 117 | rc = sclp_cmd_sync_early(SCLP_CMDW_WRITE_EVENT_MASK, sccb); | ||
| 118 | } while (rc == -EBUSY); | ||
| 119 | } | ||
| 120 | |||
| 107 | void __init sclp_facilities_detect(void) | 121 | void __init sclp_facilities_detect(void) |
| 108 | { | 122 | { |
| 109 | struct read_info_sccb *sccb; | 123 | struct read_info_sccb *sccb; |
| @@ -119,6 +133,30 @@ void __init sclp_facilities_detect(void) | |||
| 119 | rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2; | 133 | rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2; |
| 120 | rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2; | 134 | rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2; |
| 121 | rzm <<= 20; | 135 | rzm <<= 20; |
| 136 | |||
| 137 | sclp_event_mask_early(); | ||
| 138 | } | ||
| 139 | |||
| 140 | bool __init sclp_has_linemode(void) | ||
| 141 | { | ||
| 142 | struct init_sccb *sccb = &early_event_mask_sccb; | ||
| 143 | |||
| 144 | if (sccb->header.response_code != 0x20) | ||
| 145 | return 0; | ||
| 146 | if (sccb->sclp_send_mask & (EVTYP_MSG_MASK | EVTYP_PMSGCMD_MASK)) | ||
| 147 | return 1; | ||
| 148 | return 0; | ||
| 149 | } | ||
| 150 | |||
| 151 | bool __init sclp_has_vt220(void) | ||
| 152 | { | ||
| 153 | struct init_sccb *sccb = &early_event_mask_sccb; | ||
| 154 | |||
| 155 | if (sccb->header.response_code != 0x20) | ||
| 156 | return 0; | ||
| 157 | if (sccb->sclp_send_mask & EVTYP_VT220MSG_MASK) | ||
| 158 | return 1; | ||
| 159 | return 0; | ||
| 122 | } | 160 | } |
| 123 | 161 | ||
| 124 | unsigned long long sclp_get_rnmax(void) | 162 | unsigned long long sclp_get_rnmax(void) |
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c index b67ee0408267..47cccd52aae8 100644 --- a/drivers/s390/kvm/kvm_virtio.c +++ b/drivers/s390/kvm/kvm_virtio.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <asm/io.h> | 25 | #include <asm/io.h> |
| 26 | #include <asm/kvm_para.h> | 26 | #include <asm/kvm_para.h> |
| 27 | #include <asm/kvm_virtio.h> | 27 | #include <asm/kvm_virtio.h> |
| 28 | #include <asm/sclp.h> | ||
| 28 | #include <asm/setup.h> | 29 | #include <asm/setup.h> |
| 29 | #include <asm/irq.h> | 30 | #include <asm/irq.h> |
| 30 | 31 | ||
| @@ -468,7 +469,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) | |||
| 468 | 469 | ||
| 469 | static int __init s390_virtio_console_init(void) | 470 | static int __init s390_virtio_console_init(void) |
| 470 | { | 471 | { |
| 471 | if (!MACHINE_IS_KVM) | 472 | if (sclp_has_vt220() || sclp_has_linemode()) |
| 472 | return -ENODEV; | 473 | return -ENODEV; |
| 473 | return virtio_cons_early_init(early_put_chars); | 474 | return virtio_cons_early_init(early_put_chars); |
| 474 | } | 475 | } |
diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 09f2b3aa2da7..2ce09aa7d3b3 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h | |||
| @@ -617,6 +617,7 @@ struct kvm_ppc_smmu_info { | |||
| 617 | #define KVM_CAP_SIGNAL_MSI 77 | 617 | #define KVM_CAP_SIGNAL_MSI 77 |
| 618 | #define KVM_CAP_PPC_GET_SMMU_INFO 78 | 618 | #define KVM_CAP_PPC_GET_SMMU_INFO 78 |
| 619 | #define KVM_CAP_S390_COW 79 | 619 | #define KVM_CAP_S390_COW 79 |
| 620 | #define KVM_CAP_PPC_ALLOC_HTAB 80 | ||
| 620 | 621 | ||
| 621 | #ifdef KVM_CAP_IRQ_ROUTING | 622 | #ifdef KVM_CAP_IRQ_ROUTING |
| 622 | 623 | ||
| @@ -828,6 +829,8 @@ struct kvm_s390_ucas_mapping { | |||
| 828 | #define KVM_SIGNAL_MSI _IOW(KVMIO, 0xa5, struct kvm_msi) | 829 | #define KVM_SIGNAL_MSI _IOW(KVMIO, 0xa5, struct kvm_msi) |
| 829 | /* Available with KVM_CAP_PPC_GET_SMMU_INFO */ | 830 | /* Available with KVM_CAP_PPC_GET_SMMU_INFO */ |
| 830 | #define KVM_PPC_GET_SMMU_INFO _IOR(KVMIO, 0xa6, struct kvm_ppc_smmu_info) | 831 | #define KVM_PPC_GET_SMMU_INFO _IOR(KVMIO, 0xa6, struct kvm_ppc_smmu_info) |
| 832 | /* Available with KVM_CAP_PPC_ALLOC_HTAB */ | ||
| 833 | #define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32) | ||
| 831 | 834 | ||
| 832 | /* | 835 | /* |
| 833 | * ioctls for vcpu fds | 836 | * ioctls for vcpu fds |
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 96c158a37d3e..b70b48b01098 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
| @@ -306,7 +306,7 @@ struct kvm { | |||
| 306 | struct hlist_head irq_ack_notifier_list; | 306 | struct hlist_head irq_ack_notifier_list; |
| 307 | #endif | 307 | #endif |
| 308 | 308 | ||
| 309 | #ifdef KVM_ARCH_WANT_MMU_NOTIFIER | 309 | #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) |
| 310 | struct mmu_notifier mmu_notifier; | 310 | struct mmu_notifier mmu_notifier; |
| 311 | unsigned long mmu_notifier_seq; | 311 | unsigned long mmu_notifier_seq; |
| 312 | long mmu_notifier_count; | 312 | long mmu_notifier_count; |
| @@ -314,13 +314,19 @@ struct kvm { | |||
| 314 | long tlbs_dirty; | 314 | long tlbs_dirty; |
| 315 | }; | 315 | }; |
| 316 | 316 | ||
| 317 | /* The guest did something we don't support. */ | 317 | #define kvm_err(fmt, ...) \ |
| 318 | #define pr_unimpl(vcpu, fmt, ...) \ | 318 | pr_err("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) |
| 319 | pr_err_ratelimited("kvm: %i: cpu%i " fmt, \ | 319 | #define kvm_info(fmt, ...) \ |
| 320 | current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__) | 320 | pr_info("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) |
| 321 | #define kvm_debug(fmt, ...) \ | ||
| 322 | pr_debug("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) | ||
| 323 | #define kvm_pr_unimpl(fmt, ...) \ | ||
| 324 | pr_err_ratelimited("kvm [%i]: " fmt, \ | ||
| 325 | task_tgid_nr(current), ## __VA_ARGS__) | ||
| 321 | 326 | ||
| 322 | #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) | 327 | /* The guest did something we don't support. */ |
| 323 | #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) | 328 | #define vcpu_unimpl(vcpu, fmt, ...) \ |
| 329 | kvm_pr_unimpl("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) | ||
| 324 | 330 | ||
| 325 | static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) | 331 | static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) |
| 326 | { | 332 | { |
| @@ -535,6 +541,9 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); | |||
| 535 | 541 | ||
| 536 | void kvm_free_physmem(struct kvm *kvm); | 542 | void kvm_free_physmem(struct kvm *kvm); |
| 537 | 543 | ||
| 544 | void *kvm_kvzalloc(unsigned long size); | ||
| 545 | void kvm_kvfree(const void *addr); | ||
| 546 | |||
| 538 | #ifndef __KVM_HAVE_ARCH_VM_ALLOC | 547 | #ifndef __KVM_HAVE_ARCH_VM_ALLOC |
| 539 | static inline struct kvm *kvm_arch_alloc_vm(void) | 548 | static inline struct kvm *kvm_arch_alloc_vm(void) |
| 540 | { | 549 | { |
| @@ -771,7 +780,7 @@ struct kvm_stats_debugfs_item { | |||
| 771 | extern struct kvm_stats_debugfs_item debugfs_entries[]; | 780 | extern struct kvm_stats_debugfs_item debugfs_entries[]; |
| 772 | extern struct dentry *kvm_debugfs_dir; | 781 | extern struct dentry *kvm_debugfs_dir; |
| 773 | 782 | ||
| 774 | #ifdef KVM_ARCH_WANT_MMU_NOTIFIER | 783 | #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) |
| 775 | static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) | 784 | static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) |
| 776 | { | 785 | { |
| 777 | if (unlikely(vcpu->kvm->mmu_notifier_count)) | 786 | if (unlikely(vcpu->kvm->mmu_notifier_count)) |
| @@ -793,7 +802,7 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se | |||
| 793 | } | 802 | } |
| 794 | #endif | 803 | #endif |
| 795 | 804 | ||
| 796 | #ifdef CONFIG_HAVE_KVM_IRQCHIP | 805 | #ifdef KVM_CAP_IRQ_ROUTING |
| 797 | 806 | ||
| 798 | #define KVM_MAX_IRQ_ROUTES 1024 | 807 | #define KVM_MAX_IRQ_ROUTES 1024 |
| 799 | 808 | ||
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 46e3cd8e197a..7ef9e759f499 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h | |||
| @@ -13,7 +13,8 @@ | |||
| 13 | ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \ | 13 | ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \ |
| 14 | ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ | 14 | ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ |
| 15 | ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ | 15 | ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ |
| 16 | ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI) | 16 | ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ |
| 17 | ERSN(S390_UCONTROL) | ||
| 17 | 18 | ||
| 18 | TRACE_EVENT(kvm_userspace_exit, | 19 | TRACE_EVENT(kvm_userspace_exit, |
| 19 | TP_PROTO(__u32 reason, int errno), | 20 | TP_PROTO(__u32 reason, int errno), |
| @@ -36,7 +37,7 @@ TRACE_EVENT(kvm_userspace_exit, | |||
| 36 | __entry->errno < 0 ? -__entry->errno : __entry->reason) | 37 | __entry->errno < 0 ? -__entry->errno : __entry->reason) |
| 37 | ); | 38 | ); |
| 38 | 39 | ||
| 39 | #if defined(__KVM_HAVE_IOAPIC) | 40 | #if defined(__KVM_HAVE_IRQ_LINE) |
| 40 | TRACE_EVENT(kvm_set_irq, | 41 | TRACE_EVENT(kvm_set_irq, |
| 41 | TP_PROTO(unsigned int gsi, int level, int irq_source_id), | 42 | TP_PROTO(unsigned int gsi, int level, int irq_source_id), |
| 42 | TP_ARGS(gsi, level, irq_source_id), | 43 | TP_ARGS(gsi, level, irq_source_id), |
| @@ -56,7 +57,9 @@ TRACE_EVENT(kvm_set_irq, | |||
| 56 | TP_printk("gsi %u level %d source %d", | 57 | TP_printk("gsi %u level %d source %d", |
| 57 | __entry->gsi, __entry->level, __entry->irq_source_id) | 58 | __entry->gsi, __entry->level, __entry->irq_source_id) |
| 58 | ); | 59 | ); |
| 60 | #endif | ||
| 59 | 61 | ||
| 62 | #if defined(__KVM_HAVE_IOAPIC) | ||
| 60 | #define kvm_deliver_mode \ | 63 | #define kvm_deliver_mode \ |
| 61 | {0x0, "Fixed"}, \ | 64 | {0x0, "Fixed"}, \ |
| 62 | {0x1, "LowPrio"}, \ | 65 | {0x1, "LowPrio"}, \ |
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 26fd54dc459e..ef61d529a6c4 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c | |||
| @@ -191,7 +191,8 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) | |||
| 191 | return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); | 191 | return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); |
| 192 | } | 192 | } |
| 193 | 193 | ||
| 194 | int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) | 194 | int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, |
| 195 | int level) | ||
| 195 | { | 196 | { |
| 196 | u32 old_irr; | 197 | u32 old_irr; |
| 197 | u32 mask = 1 << irq; | 198 | u32 mask = 1 << irq; |
| @@ -201,9 +202,11 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) | |||
| 201 | spin_lock(&ioapic->lock); | 202 | spin_lock(&ioapic->lock); |
| 202 | old_irr = ioapic->irr; | 203 | old_irr = ioapic->irr; |
| 203 | if (irq >= 0 && irq < IOAPIC_NUM_PINS) { | 204 | if (irq >= 0 && irq < IOAPIC_NUM_PINS) { |
| 205 | int irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], | ||
| 206 | irq_source_id, level); | ||
| 204 | entry = ioapic->redirtbl[irq]; | 207 | entry = ioapic->redirtbl[irq]; |
| 205 | level ^= entry.fields.polarity; | 208 | irq_level ^= entry.fields.polarity; |
| 206 | if (!level) | 209 | if (!irq_level) |
| 207 | ioapic->irr &= ~mask; | 210 | ioapic->irr &= ~mask; |
| 208 | else { | 211 | else { |
| 209 | int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); | 212 | int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); |
| @@ -221,6 +224,16 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) | |||
| 221 | return ret; | 224 | return ret; |
| 222 | } | 225 | } |
| 223 | 226 | ||
| 227 | void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) | ||
| 228 | { | ||
| 229 | int i; | ||
| 230 | |||
| 231 | spin_lock(&ioapic->lock); | ||
| 232 | for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) | ||
| 233 | __clear_bit(irq_source_id, &ioapic->irq_states[i]); | ||
| 234 | spin_unlock(&ioapic->lock); | ||
| 235 | } | ||
| 236 | |||
| 224 | static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, | 237 | static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, |
| 225 | int trigger_mode) | 238 | int trigger_mode) |
| 226 | { | 239 | { |
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 32872a09b63f..a30abfe6ed16 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h | |||
| @@ -74,7 +74,9 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); | |||
| 74 | bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector); | 74 | bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector); |
| 75 | int kvm_ioapic_init(struct kvm *kvm); | 75 | int kvm_ioapic_init(struct kvm *kvm); |
| 76 | void kvm_ioapic_destroy(struct kvm *kvm); | 76 | void kvm_ioapic_destroy(struct kvm *kvm); |
| 77 | int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); | 77 | int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, |
| 78 | int level); | ||
| 79 | void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); | ||
| 78 | void kvm_ioapic_reset(struct kvm_ioapic *ioapic); | 80 | void kvm_ioapic_reset(struct kvm_ioapic *ioapic); |
| 79 | int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, | 81 | int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, |
| 80 | struct kvm_lapic_irq *irq); | 82 | struct kvm_lapic_irq *irq); |
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 5afb43114020..83402d74a767 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c | |||
| @@ -33,26 +33,12 @@ | |||
| 33 | 33 | ||
| 34 | #include "ioapic.h" | 34 | #include "ioapic.h" |
| 35 | 35 | ||
| 36 | static inline int kvm_irq_line_state(unsigned long *irq_state, | ||
| 37 | int irq_source_id, int level) | ||
| 38 | { | ||
| 39 | /* Logical OR for level trig interrupt */ | ||
| 40 | if (level) | ||
| 41 | set_bit(irq_source_id, irq_state); | ||
| 42 | else | ||
| 43 | clear_bit(irq_source_id, irq_state); | ||
| 44 | |||
| 45 | return !!(*irq_state); | ||
| 46 | } | ||
| 47 | |||
| 48 | static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, | 36 | static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, |
| 49 | struct kvm *kvm, int irq_source_id, int level) | 37 | struct kvm *kvm, int irq_source_id, int level) |
| 50 | { | 38 | { |
| 51 | #ifdef CONFIG_X86 | 39 | #ifdef CONFIG_X86 |
| 52 | struct kvm_pic *pic = pic_irqchip(kvm); | 40 | struct kvm_pic *pic = pic_irqchip(kvm); |
| 53 | level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin], | 41 | return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); |
| 54 | irq_source_id, level); | ||
| 55 | return kvm_pic_set_irq(pic, e->irqchip.pin, level); | ||
| 56 | #else | 42 | #else |
| 57 | return -1; | 43 | return -1; |
| 58 | #endif | 44 | #endif |
| @@ -62,10 +48,7 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, | |||
| 62 | struct kvm *kvm, int irq_source_id, int level) | 48 | struct kvm *kvm, int irq_source_id, int level) |
| 63 | { | 49 | { |
| 64 | struct kvm_ioapic *ioapic = kvm->arch.vioapic; | 50 | struct kvm_ioapic *ioapic = kvm->arch.vioapic; |
| 65 | level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin], | 51 | return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level); |
| 66 | irq_source_id, level); | ||
| 67 | |||
| 68 | return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level); | ||
| 69 | } | 52 | } |
| 70 | 53 | ||
| 71 | inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) | 54 | inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) |
| @@ -249,8 +232,6 @@ unlock: | |||
| 249 | 232 | ||
| 250 | void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) | 233 | void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) |
| 251 | { | 234 | { |
| 252 | int i; | ||
| 253 | |||
| 254 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); | 235 | ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); |
| 255 | 236 | ||
| 256 | mutex_lock(&kvm->irq_lock); | 237 | mutex_lock(&kvm->irq_lock); |
| @@ -263,14 +244,10 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) | |||
| 263 | if (!irqchip_in_kernel(kvm)) | 244 | if (!irqchip_in_kernel(kvm)) |
| 264 | goto unlock; | 245 | goto unlock; |
| 265 | 246 | ||
| 266 | for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) { | 247 | kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); |
| 267 | clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]); | ||
| 268 | if (i >= 16) | ||
| 269 | continue; | ||
| 270 | #ifdef CONFIG_X86 | 248 | #ifdef CONFIG_X86 |
| 271 | clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]); | 249 | kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id); |
| 272 | #endif | 250 | #endif |
| 273 | } | ||
| 274 | unlock: | 251 | unlock: |
| 275 | mutex_unlock(&kvm->irq_lock); | 252 | mutex_unlock(&kvm->irq_lock); |
| 276 | } | 253 | } |
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 44ee7124b16d..246852397e30 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
| @@ -516,16 +516,32 @@ out_err_nodisable: | |||
| 516 | return ERR_PTR(r); | 516 | return ERR_PTR(r); |
| 517 | } | 517 | } |
| 518 | 518 | ||
| 519 | /* | ||
| 520 | * Avoid using vmalloc for a small buffer. | ||
| 521 | * Should not be used when the size is statically known. | ||
| 522 | */ | ||
| 523 | void *kvm_kvzalloc(unsigned long size) | ||
| 524 | { | ||
| 525 | if (size > PAGE_SIZE) | ||
| 526 | return vzalloc(size); | ||
| 527 | else | ||
| 528 | return kzalloc(size, GFP_KERNEL); | ||
| 529 | } | ||
| 530 | |||
| 531 | void kvm_kvfree(const void *addr) | ||
| 532 | { | ||
| 533 | if (is_vmalloc_addr(addr)) | ||
| 534 | vfree(addr); | ||
| 535 | else | ||
| 536 | kfree(addr); | ||
| 537 | } | ||
| 538 | |||
| 519 | static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) | 539 | static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) |
| 520 | { | 540 | { |
| 521 | if (!memslot->dirty_bitmap) | 541 | if (!memslot->dirty_bitmap) |
| 522 | return; | 542 | return; |
| 523 | 543 | ||
| 524 | if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) | 544 | kvm_kvfree(memslot->dirty_bitmap); |
| 525 | vfree(memslot->dirty_bitmap); | ||
| 526 | else | ||
| 527 | kfree(memslot->dirty_bitmap); | ||
| 528 | |||
| 529 | memslot->dirty_bitmap = NULL; | 545 | memslot->dirty_bitmap = NULL; |
| 530 | } | 546 | } |
| 531 | 547 | ||
| @@ -617,11 +633,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) | |||
| 617 | #ifndef CONFIG_S390 | 633 | #ifndef CONFIG_S390 |
| 618 | unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); | 634 | unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); |
| 619 | 635 | ||
| 620 | if (dirty_bytes > PAGE_SIZE) | 636 | memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); |
| 621 | memslot->dirty_bitmap = vzalloc(dirty_bytes); | ||
| 622 | else | ||
| 623 | memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL); | ||
| 624 | |||
| 625 | if (!memslot->dirty_bitmap) | 637 | if (!memslot->dirty_bitmap) |
| 626 | return -ENOMEM; | 638 | return -ENOMEM; |
| 627 | 639 | ||
| @@ -1586,7 +1598,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) | |||
| 1586 | */ | 1598 | */ |
| 1587 | for (pass = 0; pass < 2 && !yielded; pass++) { | 1599 | for (pass = 0; pass < 2 && !yielded; pass++) { |
| 1588 | kvm_for_each_vcpu(i, vcpu, kvm) { | 1600 | kvm_for_each_vcpu(i, vcpu, kvm) { |
| 1589 | if (!pass && i < last_boosted_vcpu) { | 1601 | if (!pass && i <= last_boosted_vcpu) { |
| 1590 | i = last_boosted_vcpu; | 1602 | i = last_boosted_vcpu; |
| 1591 | continue; | 1603 | continue; |
| 1592 | } else if (pass && i > last_boosted_vcpu) | 1604 | } else if (pass && i > last_boosted_vcpu) |
| @@ -2213,7 +2225,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) | |||
| 2213 | case KVM_CAP_SIGNAL_MSI: | 2225 | case KVM_CAP_SIGNAL_MSI: |
| 2214 | #endif | 2226 | #endif |
| 2215 | return 1; | 2227 | return 1; |
| 2216 | #ifdef CONFIG_HAVE_KVM_IRQCHIP | 2228 | #ifdef KVM_CAP_IRQ_ROUTING |
| 2217 | case KVM_CAP_IRQ_ROUTING: | 2229 | case KVM_CAP_IRQ_ROUTING: |
| 2218 | return KVM_MAX_IRQ_ROUTES; | 2230 | return KVM_MAX_IRQ_ROUTES; |
| 2219 | #endif | 2231 | #endif |
