aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-08-04 13:43:01 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-08-04 13:43:01 -0400
commit5e83f6fbdb020b70c0e413312801424d13c58d68 (patch)
treeca270178fa891813dbc47751c331fed975d3766c /arch
parentfe445c6e2cb62a566e1a89f8798de11459975710 (diff)
parent3444d7da1839b851eefedd372978d8a982316c36 (diff)
Merge branch 'kvm-updates/2.6.36' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/2.6.36' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (198 commits) KVM: VMX: Fix host GDT.LIMIT corruption KVM: MMU: using __xchg_spte more smarter KVM: MMU: cleanup spte set and accssed/dirty tracking KVM: MMU: don't atomicly set spte if it's not present KVM: MMU: fix page dirty tracking lost while sync page KVM: MMU: fix broken page accessed tracking with ept enabled KVM: MMU: add missing reserved bits check in speculative path KVM: MMU: fix mmu notifier invalidate handler for huge spte KVM: x86 emulator: fix xchg instruction emulation KVM: x86: Call mask notifiers from pic KVM: x86: never re-execute instruction with enabled tdp KVM: Document KVM_GET_SUPPORTED_CPUID2 ioctl KVM: x86: emulator: inc/dec can have lock prefix KVM: MMU: Eliminate redundant temporaries in FNAME(fetch) KVM: MMU: Validate all gptes during fetch, not just those used for new pages KVM: MMU: Simplify spte fetch() function KVM: MMU: Add gpte_valid() helper KVM: MMU: Add validate_direct_spte() helper KVM: MMU: Add drop_large_spte() helper KVM: MMU: Use __set_spte to link shadow pages ...
Diffstat (limited to 'arch')
-rw-r--r--arch/ia64/include/asm/kvm_host.h1
-rw-r--r--arch/ia64/kvm/kvm-ia64.c50
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h10
-rw-r--r--arch/powerpc/include/asm/kvm_fpu.h27
-rw-r--r--arch/powerpc/include/asm/kvm_host.h18
-rw-r--r--arch/powerpc/kernel/ppc_ksyms.c4
-rw-r--r--arch/powerpc/kvm/44x_tlb.c3
-rw-r--r--arch/powerpc/kvm/Makefile2
-rw-r--r--arch/powerpc/kvm/book3s.c79
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu.c8
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu_host.c134
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_host.c129
-rw-r--r--arch/powerpc/kvm/book3s_mmu_hpte.c277
-rw-r--r--arch/powerpc/kvm/book3s_paired_singles.c94
-rw-r--r--arch/powerpc/kvm/booke.c12
-rw-r--r--arch/powerpc/kvm/fpu.S18
-rw-r--r--arch/powerpc/kvm/powerpc.c14
-rw-r--r--arch/s390/include/asm/kvm_host.h5
-rw-r--r--arch/s390/kvm/intercept.c2
-rw-r--r--arch/s390/kvm/kvm-s390.c64
-rw-r--r--arch/s390/kvm/kvm-s390.h2
-rw-r--r--arch/x86/include/asm/i387.h2
-rw-r--r--arch/x86/include/asm/kvm.h22
-rw-r--r--arch/x86/include/asm/kvm_emulate.h30
-rw-r--r--arch/x86/include/asm/kvm_host.h70
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/vmx.h5
-rw-r--r--arch/x86/include/asm/xsave.h6
-rw-r--r--arch/x86/kernel/i387.c3
-rw-r--r--arch/x86/kernel/process.c1
-rw-r--r--arch/x86/kvm/emulate.c749
-rw-r--r--arch/x86/kvm/i8254.c146
-rw-r--r--arch/x86/kvm/i8254.h4
-rw-r--r--arch/x86/kvm/i8259.c48
-rw-r--r--arch/x86/kvm/irq.c2
-rw-r--r--arch/x86/kvm/irq.h4
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h8
-rw-r--r--arch/x86/kvm/lapic.c17
-rw-r--r--arch/x86/kvm/mmu.c807
-rw-r--r--arch/x86/kvm/mmutrace.h2
-rw-r--r--arch/x86/kvm/paging_tmpl.h252
-rw-r--r--arch/x86/kvm/svm.c138
-rw-r--r--arch/x86/kvm/timer.c16
-rw-r--r--arch/x86/kvm/vmx.c253
-rw-r--r--arch/x86/kvm/x86.c1174
-rw-r--r--arch/x86/kvm/x86.h7
46 files changed, 2729 insertions, 1992 deletions
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index a362e67e0ca6..2f229e5de498 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -235,6 +235,7 @@ struct kvm_vm_data {
235#define KVM_REQ_PTC_G 32 235#define KVM_REQ_PTC_G 32
236#define KVM_REQ_RESUME 33 236#define KVM_REQ_RESUME 33
237 237
238#define KVM_HPAGE_GFN_SHIFT(x) 0
238#define KVM_NR_PAGE_SIZES 1 239#define KVM_NR_PAGE_SIZES 1
239#define KVM_PAGES_PER_HPAGE(x) 1 240#define KVM_PAGES_PER_HPAGE(x) 1
240 241
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 21b701374f72..5cb58655cd5f 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -725,8 +725,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
725 int r; 725 int r;
726 sigset_t sigsaved; 726 sigset_t sigsaved;
727 727
728 vcpu_load(vcpu);
729
730 if (vcpu->sigset_active) 728 if (vcpu->sigset_active)
731 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 729 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
732 730
@@ -748,7 +746,6 @@ out:
748 if (vcpu->sigset_active) 746 if (vcpu->sigset_active)
749 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 747 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
750 748
751 vcpu_put(vcpu);
752 return r; 749 return r;
753} 750}
754 751
@@ -883,8 +880,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
883 struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd); 880 struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
884 int i; 881 int i;
885 882
886 vcpu_load(vcpu);
887
888 for (i = 0; i < 16; i++) { 883 for (i = 0; i < 16; i++) {
889 vpd->vgr[i] = regs->vpd.vgr[i]; 884 vpd->vgr[i] = regs->vpd.vgr[i];
890 vpd->vbgr[i] = regs->vpd.vbgr[i]; 885 vpd->vbgr[i] = regs->vpd.vbgr[i];
@@ -931,8 +926,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
931 vcpu->arch.itc_offset = regs->saved_itc - kvm_get_itc(vcpu); 926 vcpu->arch.itc_offset = regs->saved_itc - kvm_get_itc(vcpu);
932 set_bit(KVM_REQ_RESUME, &vcpu->requests); 927 set_bit(KVM_REQ_RESUME, &vcpu->requests);
933 928
934 vcpu_put(vcpu);
935
936 return 0; 929 return 0;
937} 930}
938 931
@@ -1802,35 +1795,24 @@ void kvm_arch_exit(void)
1802 kvm_vmm_info = NULL; 1795 kvm_vmm_info = NULL;
1803} 1796}
1804 1797
1805static int kvm_ia64_sync_dirty_log(struct kvm *kvm, 1798static void kvm_ia64_sync_dirty_log(struct kvm *kvm,
1806 struct kvm_dirty_log *log) 1799 struct kvm_memory_slot *memslot)
1807{ 1800{
1808 struct kvm_memory_slot *memslot; 1801 int i;
1809 int r, i;
1810 long base; 1802 long base;
1811 unsigned long n; 1803 unsigned long n;
1812 unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base + 1804 unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base +
1813 offsetof(struct kvm_vm_data, kvm_mem_dirty_log)); 1805 offsetof(struct kvm_vm_data, kvm_mem_dirty_log));
1814 1806
1815 r = -EINVAL;
1816 if (log->slot >= KVM_MEMORY_SLOTS)
1817 goto out;
1818
1819 memslot = &kvm->memslots->memslots[log->slot];
1820 r = -ENOENT;
1821 if (!memslot->dirty_bitmap)
1822 goto out;
1823
1824 n = kvm_dirty_bitmap_bytes(memslot); 1807 n = kvm_dirty_bitmap_bytes(memslot);
1825 base = memslot->base_gfn / BITS_PER_LONG; 1808 base = memslot->base_gfn / BITS_PER_LONG;
1826 1809
1810 spin_lock(&kvm->arch.dirty_log_lock);
1827 for (i = 0; i < n/sizeof(long); ++i) { 1811 for (i = 0; i < n/sizeof(long); ++i) {
1828 memslot->dirty_bitmap[i] = dirty_bitmap[base + i]; 1812 memslot->dirty_bitmap[i] = dirty_bitmap[base + i];
1829 dirty_bitmap[base + i] = 0; 1813 dirty_bitmap[base + i] = 0;
1830 } 1814 }
1831 r = 0; 1815 spin_unlock(&kvm->arch.dirty_log_lock);
1832out:
1833 return r;
1834} 1816}
1835 1817
1836int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 1818int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
@@ -1842,12 +1824,17 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1842 int is_dirty = 0; 1824 int is_dirty = 0;
1843 1825
1844 mutex_lock(&kvm->slots_lock); 1826 mutex_lock(&kvm->slots_lock);
1845 spin_lock(&kvm->arch.dirty_log_lock);
1846 1827
1847 r = kvm_ia64_sync_dirty_log(kvm, log); 1828 r = -EINVAL;
1848 if (r) 1829 if (log->slot >= KVM_MEMORY_SLOTS)
1830 goto out;
1831
1832 memslot = &kvm->memslots->memslots[log->slot];
1833 r = -ENOENT;
1834 if (!memslot->dirty_bitmap)
1849 goto out; 1835 goto out;
1850 1836
1837 kvm_ia64_sync_dirty_log(kvm, memslot);
1851 r = kvm_get_dirty_log(kvm, log, &is_dirty); 1838 r = kvm_get_dirty_log(kvm, log, &is_dirty);
1852 if (r) 1839 if (r)
1853 goto out; 1840 goto out;
@@ -1855,14 +1842,12 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1855 /* If nothing is dirty, don't bother messing with page tables. */ 1842 /* If nothing is dirty, don't bother messing with page tables. */
1856 if (is_dirty) { 1843 if (is_dirty) {
1857 kvm_flush_remote_tlbs(kvm); 1844 kvm_flush_remote_tlbs(kvm);
1858 memslot = &kvm->memslots->memslots[log->slot];
1859 n = kvm_dirty_bitmap_bytes(memslot); 1845 n = kvm_dirty_bitmap_bytes(memslot);
1860 memset(memslot->dirty_bitmap, 0, n); 1846 memset(memslot->dirty_bitmap, 0, n);
1861 } 1847 }
1862 r = 0; 1848 r = 0;
1863out: 1849out:
1864 mutex_unlock(&kvm->slots_lock); 1850 mutex_unlock(&kvm->slots_lock);
1865 spin_unlock(&kvm->arch.dirty_log_lock);
1866 return r; 1851 return r;
1867} 1852}
1868 1853
@@ -1953,11 +1938,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
1953 return vcpu->arch.timer_fired; 1938 return vcpu->arch.timer_fired;
1954} 1939}
1955 1940
1956gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1957{
1958 return gfn;
1959}
1960
1961int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 1941int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
1962{ 1942{
1963 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) || 1943 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) ||
@@ -1967,9 +1947,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
1967int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 1947int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
1968 struct kvm_mp_state *mp_state) 1948 struct kvm_mp_state *mp_state)
1969{ 1949{
1970 vcpu_load(vcpu);
1971 mp_state->mp_state = vcpu->arch.mp_state; 1950 mp_state->mp_state = vcpu->arch.mp_state;
1972 vcpu_put(vcpu);
1973 return 0; 1951 return 0;
1974} 1952}
1975 1953
@@ -2000,10 +1978,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
2000{ 1978{
2001 int r = 0; 1979 int r = 0;
2002 1980
2003 vcpu_load(vcpu);
2004 vcpu->arch.mp_state = mp_state->mp_state; 1981 vcpu->arch.mp_state = mp_state->mp_state;
2005 if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) 1982 if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
2006 r = vcpu_reset(vcpu); 1983 r = vcpu_reset(vcpu);
2007 vcpu_put(vcpu);
2008 return r; 1984 return r;
2009} 1985}
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 6f74d93725a0..8274a2d43925 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -115,7 +115,15 @@ extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
115extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); 115extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
116extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr); 116extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
117extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu); 117extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
118extern struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data); 118
119extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
120extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
121extern void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu);
122extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
123extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
124extern int kvmppc_mmu_hpte_sysinit(void);
125extern void kvmppc_mmu_hpte_sysexit(void);
126
119extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); 127extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
120extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); 128extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
121extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec); 129extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
diff --git a/arch/powerpc/include/asm/kvm_fpu.h b/arch/powerpc/include/asm/kvm_fpu.h
index 94f05de9ad04..c3d4f0518a67 100644
--- a/arch/powerpc/include/asm/kvm_fpu.h
+++ b/arch/powerpc/include/asm/kvm_fpu.h
@@ -22,24 +22,24 @@
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24 24
25extern void fps_fres(struct thread_struct *t, u32 *dst, u32 *src1); 25extern void fps_fres(u64 *fpscr, u32 *dst, u32 *src1);
26extern void fps_frsqrte(struct thread_struct *t, u32 *dst, u32 *src1); 26extern void fps_frsqrte(u64 *fpscr, u32 *dst, u32 *src1);
27extern void fps_fsqrts(struct thread_struct *t, u32 *dst, u32 *src1); 27extern void fps_fsqrts(u64 *fpscr, u32 *dst, u32 *src1);
28 28
29extern void fps_fadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2); 29extern void fps_fadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
30extern void fps_fdivs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2); 30extern void fps_fdivs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
31extern void fps_fmuls(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2); 31extern void fps_fmuls(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
32extern void fps_fsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2); 32extern void fps_fsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
33 33
34extern void fps_fmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2, 34extern void fps_fmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
35 u32 *src3); 35 u32 *src3);
36extern void fps_fmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2, 36extern void fps_fmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
37 u32 *src3); 37 u32 *src3);
38extern void fps_fnmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2, 38extern void fps_fnmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
39 u32 *src3); 39 u32 *src3);
40extern void fps_fnmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2, 40extern void fps_fnmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
41 u32 *src3); 41 u32 *src3);
42extern void fps_fsel(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2, 42extern void fps_fsel(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
43 u32 *src3); 43 u32 *src3);
44 44
45#define FPD_ONE_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \ 45#define FPD_ONE_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \
@@ -82,4 +82,7 @@ FPD_THREE_IN(fmadd)
82FPD_THREE_IN(fnmsub) 82FPD_THREE_IN(fnmsub)
83FPD_THREE_IN(fnmadd) 83FPD_THREE_IN(fnmadd)
84 84
85extern void kvm_cvt_fd(u32 *from, u64 *to, u64 *fpscr);
86extern void kvm_cvt_df(u64 *from, u32 *to, u64 *fpscr);
87
85#endif 88#endif
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 0c9ad869decd..b0b23c007d6e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -35,10 +35,17 @@
35#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 35#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
36 36
37/* We don't currently support large pages. */ 37/* We don't currently support large pages. */
38#define KVM_HPAGE_GFN_SHIFT(x) 0
38#define KVM_NR_PAGE_SIZES 1 39#define KVM_NR_PAGE_SIZES 1
39#define KVM_PAGES_PER_HPAGE(x) (1UL<<31) 40#define KVM_PAGES_PER_HPAGE(x) (1UL<<31)
40 41
41#define HPTEG_CACHE_NUM 1024 42#define HPTEG_CACHE_NUM (1 << 15)
43#define HPTEG_HASH_BITS_PTE 13
44#define HPTEG_HASH_BITS_VPTE 13
45#define HPTEG_HASH_BITS_VPTE_LONG 5
46#define HPTEG_HASH_NUM_PTE (1 << HPTEG_HASH_BITS_PTE)
47#define HPTEG_HASH_NUM_VPTE (1 << HPTEG_HASH_BITS_VPTE)
48#define HPTEG_HASH_NUM_VPTE_LONG (1 << HPTEG_HASH_BITS_VPTE_LONG)
42 49
43struct kvm; 50struct kvm;
44struct kvm_run; 51struct kvm_run;
@@ -151,6 +158,9 @@ struct kvmppc_mmu {
151}; 158};
152 159
153struct hpte_cache { 160struct hpte_cache {
161 struct hlist_node list_pte;
162 struct hlist_node list_vpte;
163 struct hlist_node list_vpte_long;
154 u64 host_va; 164 u64 host_va;
155 u64 pfn; 165 u64 pfn;
156 ulong slot; 166 ulong slot;
@@ -282,8 +292,10 @@ struct kvm_vcpu_arch {
282 unsigned long pending_exceptions; 292 unsigned long pending_exceptions;
283 293
284#ifdef CONFIG_PPC_BOOK3S 294#ifdef CONFIG_PPC_BOOK3S
285 struct hpte_cache hpte_cache[HPTEG_CACHE_NUM]; 295 struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
286 int hpte_cache_offset; 296 struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
297 struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
298 int hpte_cache_count;
287#endif 299#endif
288}; 300};
289 301
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 3b4dcc82a4c1..ab3e392ac63c 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -101,10 +101,6 @@ EXPORT_SYMBOL(pci_dram_offset);
101EXPORT_SYMBOL(start_thread); 101EXPORT_SYMBOL(start_thread);
102EXPORT_SYMBOL(kernel_thread); 102EXPORT_SYMBOL(kernel_thread);
103 103
104#ifdef CONFIG_PPC_FPU
105EXPORT_SYMBOL_GPL(cvt_df);
106EXPORT_SYMBOL_GPL(cvt_fd);
107#endif
108EXPORT_SYMBOL(giveup_fpu); 104EXPORT_SYMBOL(giveup_fpu);
109#ifdef CONFIG_ALTIVEC 105#ifdef CONFIG_ALTIVEC
110EXPORT_SYMBOL(giveup_altivec); 106EXPORT_SYMBOL(giveup_altivec);
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 812312542e50..9b9b5cdea840 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -316,7 +316,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
316 gfn = gpaddr >> PAGE_SHIFT; 316 gfn = gpaddr >> PAGE_SHIFT;
317 new_page = gfn_to_page(vcpu->kvm, gfn); 317 new_page = gfn_to_page(vcpu->kvm, gfn);
318 if (is_error_page(new_page)) { 318 if (is_error_page(new_page)) {
319 printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn); 319 printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n",
320 (unsigned long long)gfn);
320 kvm_release_page_clean(new_page); 321 kvm_release_page_clean(new_page);
321 return; 322 return;
322 } 323 }
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index ff436066bf77..d45c818a384c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -45,6 +45,7 @@ kvm-book3s_64-objs := \
45 book3s.o \ 45 book3s.o \
46 book3s_emulate.o \ 46 book3s_emulate.o \
47 book3s_interrupts.o \ 47 book3s_interrupts.o \
48 book3s_mmu_hpte.o \
48 book3s_64_mmu_host.o \ 49 book3s_64_mmu_host.o \
49 book3s_64_mmu.o \ 50 book3s_64_mmu.o \
50 book3s_32_mmu.o 51 book3s_32_mmu.o
@@ -57,6 +58,7 @@ kvm-book3s_32-objs := \
57 book3s.o \ 58 book3s.o \
58 book3s_emulate.o \ 59 book3s_emulate.o \
59 book3s_interrupts.o \ 60 book3s_interrupts.o \
61 book3s_mmu_hpte.o \
60 book3s_32_mmu_host.o \ 62 book3s_32_mmu_host.o \
61 book3s_32_mmu.o 63 book3s_32_mmu.o
62kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs) 64kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b998abf1a63d..a3cef30d1d42 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1047,8 +1047,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
1047{ 1047{
1048 int i; 1048 int i;
1049 1049
1050 vcpu_load(vcpu);
1051
1052 regs->pc = kvmppc_get_pc(vcpu); 1050 regs->pc = kvmppc_get_pc(vcpu);
1053 regs->cr = kvmppc_get_cr(vcpu); 1051 regs->cr = kvmppc_get_cr(vcpu);
1054 regs->ctr = kvmppc_get_ctr(vcpu); 1052 regs->ctr = kvmppc_get_ctr(vcpu);
@@ -1069,8 +1067,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
1069 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 1067 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
1070 regs->gpr[i] = kvmppc_get_gpr(vcpu, i); 1068 regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
1071 1069
1072 vcpu_put(vcpu);
1073
1074 return 0; 1070 return 0;
1075} 1071}
1076 1072
@@ -1078,8 +1074,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
1078{ 1074{
1079 int i; 1075 int i;
1080 1076
1081 vcpu_load(vcpu);
1082
1083 kvmppc_set_pc(vcpu, regs->pc); 1077 kvmppc_set_pc(vcpu, regs->pc);
1084 kvmppc_set_cr(vcpu, regs->cr); 1078 kvmppc_set_cr(vcpu, regs->cr);
1085 kvmppc_set_ctr(vcpu, regs->ctr); 1079 kvmppc_set_ctr(vcpu, regs->ctr);
@@ -1099,8 +1093,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
1099 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 1093 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
1100 kvmppc_set_gpr(vcpu, i, regs->gpr[i]); 1094 kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
1101 1095
1102 vcpu_put(vcpu);
1103
1104 return 0; 1096 return 0;
1105} 1097}
1106 1098
@@ -1110,8 +1102,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
1110 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); 1102 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
1111 int i; 1103 int i;
1112 1104
1113 vcpu_load(vcpu);
1114
1115 sregs->pvr = vcpu->arch.pvr; 1105 sregs->pvr = vcpu->arch.pvr;
1116 1106
1117 sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1; 1107 sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
@@ -1131,8 +1121,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
1131 } 1121 }
1132 } 1122 }
1133 1123
1134 vcpu_put(vcpu);
1135
1136 return 0; 1124 return 0;
1137} 1125}
1138 1126
@@ -1142,8 +1130,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
1142 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); 1130 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
1143 int i; 1131 int i;
1144 1132
1145 vcpu_load(vcpu);
1146
1147 kvmppc_set_pvr(vcpu, sregs->pvr); 1133 kvmppc_set_pvr(vcpu, sregs->pvr);
1148 1134
1149 vcpu3s->sdr1 = sregs->u.s.sdr1; 1135 vcpu3s->sdr1 = sregs->u.s.sdr1;
@@ -1171,8 +1157,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
1171 /* Flush the MMU after messing with the segments */ 1157 /* Flush the MMU after messing with the segments */
1172 kvmppc_mmu_pte_flush(vcpu, 0, 0); 1158 kvmppc_mmu_pte_flush(vcpu, 0, 0);
1173 1159
1174 vcpu_put(vcpu);
1175
1176 return 0; 1160 return 0;
1177} 1161}
1178 1162
@@ -1309,12 +1293,17 @@ extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
1309int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 1293int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1310{ 1294{
1311 int ret; 1295 int ret;
1312 struct thread_struct ext_bkp; 1296 double fpr[32][TS_FPRWIDTH];
1297 unsigned int fpscr;
1298 int fpexc_mode;
1313#ifdef CONFIG_ALTIVEC 1299#ifdef CONFIG_ALTIVEC
1314 bool save_vec = current->thread.used_vr; 1300 vector128 vr[32];
1301 vector128 vscr;
1302 unsigned long uninitialized_var(vrsave);
1303 int used_vr;
1315#endif 1304#endif
1316#ifdef CONFIG_VSX 1305#ifdef CONFIG_VSX
1317 bool save_vsx = current->thread.used_vsr; 1306 int used_vsr;
1318#endif 1307#endif
1319 ulong ext_msr; 1308 ulong ext_msr;
1320 1309
@@ -1327,27 +1316,27 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1327 /* Save FPU state in stack */ 1316 /* Save FPU state in stack */
1328 if (current->thread.regs->msr & MSR_FP) 1317 if (current->thread.regs->msr & MSR_FP)
1329 giveup_fpu(current); 1318 giveup_fpu(current);
1330 memcpy(ext_bkp.fpr, current->thread.fpr, sizeof(current->thread.fpr)); 1319 memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
1331 ext_bkp.fpscr = current->thread.fpscr; 1320 fpscr = current->thread.fpscr.val;
1332 ext_bkp.fpexc_mode = current->thread.fpexc_mode; 1321 fpexc_mode = current->thread.fpexc_mode;
1333 1322
1334#ifdef CONFIG_ALTIVEC 1323#ifdef CONFIG_ALTIVEC
1335 /* Save Altivec state in stack */ 1324 /* Save Altivec state in stack */
1336 if (save_vec) { 1325 used_vr = current->thread.used_vr;
1326 if (used_vr) {
1337 if (current->thread.regs->msr & MSR_VEC) 1327 if (current->thread.regs->msr & MSR_VEC)
1338 giveup_altivec(current); 1328 giveup_altivec(current);
1339 memcpy(ext_bkp.vr, current->thread.vr, sizeof(ext_bkp.vr)); 1329 memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
1340 ext_bkp.vscr = current->thread.vscr; 1330 vscr = current->thread.vscr;
1341 ext_bkp.vrsave = current->thread.vrsave; 1331 vrsave = current->thread.vrsave;
1342 } 1332 }
1343 ext_bkp.used_vr = current->thread.used_vr;
1344#endif 1333#endif
1345 1334
1346#ifdef CONFIG_VSX 1335#ifdef CONFIG_VSX
1347 /* Save VSX state in stack */ 1336 /* Save VSX state in stack */
1348 if (save_vsx && (current->thread.regs->msr & MSR_VSX)) 1337 used_vsr = current->thread.used_vsr;
1338 if (used_vsr && (current->thread.regs->msr & MSR_VSX))
1349 __giveup_vsx(current); 1339 __giveup_vsx(current);
1350 ext_bkp.used_vsr = current->thread.used_vsr;
1351#endif 1340#endif
1352 1341
1353 /* Remember the MSR with disabled extensions */ 1342 /* Remember the MSR with disabled extensions */
@@ -1372,22 +1361,22 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1372 kvmppc_giveup_ext(vcpu, MSR_VSX); 1361 kvmppc_giveup_ext(vcpu, MSR_VSX);
1373 1362
1374 /* Restore FPU state from stack */ 1363 /* Restore FPU state from stack */
1375 memcpy(current->thread.fpr, ext_bkp.fpr, sizeof(ext_bkp.fpr)); 1364 memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
1376 current->thread.fpscr = ext_bkp.fpscr; 1365 current->thread.fpscr.val = fpscr;
1377 current->thread.fpexc_mode = ext_bkp.fpexc_mode; 1366 current->thread.fpexc_mode = fpexc_mode;
1378 1367
1379#ifdef CONFIG_ALTIVEC 1368#ifdef CONFIG_ALTIVEC
1380 /* Restore Altivec state from stack */ 1369 /* Restore Altivec state from stack */
1381 if (save_vec && current->thread.used_vr) { 1370 if (used_vr && current->thread.used_vr) {
1382 memcpy(current->thread.vr, ext_bkp.vr, sizeof(ext_bkp.vr)); 1371 memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
1383 current->thread.vscr = ext_bkp.vscr; 1372 current->thread.vscr = vscr;
1384 current->thread.vrsave= ext_bkp.vrsave; 1373 current->thread.vrsave = vrsave;
1385 } 1374 }
1386 current->thread.used_vr = ext_bkp.used_vr; 1375 current->thread.used_vr = used_vr;
1387#endif 1376#endif
1388 1377
1389#ifdef CONFIG_VSX 1378#ifdef CONFIG_VSX
1390 current->thread.used_vsr = ext_bkp.used_vsr; 1379 current->thread.used_vsr = used_vsr;
1391#endif 1380#endif
1392 1381
1393 return ret; 1382 return ret;
@@ -1395,12 +1384,22 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1395 1384
1396static int kvmppc_book3s_init(void) 1385static int kvmppc_book3s_init(void)
1397{ 1386{
1398 return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0, 1387 int r;
1399 THIS_MODULE); 1388
1389 r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
1390 THIS_MODULE);
1391
1392 if (r)
1393 return r;
1394
1395 r = kvmppc_mmu_hpte_sysinit();
1396
1397 return r;
1400} 1398}
1401 1399
1402static void kvmppc_book3s_exit(void) 1400static void kvmppc_book3s_exit(void)
1403{ 1401{
1402 kvmppc_mmu_hpte_sysexit();
1404 kvm_exit(); 1403 kvm_exit();
1405} 1404}
1406 1405
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index 0b10503c8a4a..3292d76101d2 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -354,10 +354,10 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
354 *vsid = VSID_REAL_DR | gvsid; 354 *vsid = VSID_REAL_DR | gvsid;
355 break; 355 break;
356 case MSR_DR|MSR_IR: 356 case MSR_DR|MSR_IR:
357 if (!sr->valid) 357 if (sr->valid)
358 return -1; 358 *vsid = sr->vsid;
359 359 else
360 *vsid = sr->vsid; 360 *vsid = VSID_BAT | gvsid;
361 break; 361 break;
362 default: 362 default:
363 BUG(); 363 BUG();
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 0bb66005338f..0b51ef872c1e 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -19,6 +19,7 @@
19 */ 19 */
20 20
21#include <linux/kvm_host.h> 21#include <linux/kvm_host.h>
22#include <linux/hash.h>
22 23
23#include <asm/kvm_ppc.h> 24#include <asm/kvm_ppc.h>
24#include <asm/kvm_book3s.h> 25#include <asm/kvm_book3s.h>
@@ -57,139 +58,26 @@
57static ulong htab; 58static ulong htab;
58static u32 htabmask; 59static u32 htabmask;
59 60
60static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) 61void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
61{ 62{
62 volatile u32 *pteg; 63 volatile u32 *pteg;
63 64
64 dprintk_mmu("KVM: Flushing SPTE: 0x%llx (0x%llx) -> 0x%llx\n", 65 /* Remove from host HTAB */
65 pte->pte.eaddr, pte->pte.vpage, pte->host_va);
66
67 pteg = (u32*)pte->slot; 66 pteg = (u32*)pte->slot;
68
69 pteg[0] = 0; 67 pteg[0] = 0;
68
69 /* And make sure it's gone from the TLB too */
70 asm volatile ("sync"); 70 asm volatile ("sync");
71 asm volatile ("tlbie %0" : : "r" (pte->pte.eaddr) : "memory"); 71 asm volatile ("tlbie %0" : : "r" (pte->pte.eaddr) : "memory");
72 asm volatile ("sync"); 72 asm volatile ("sync");
73 asm volatile ("tlbsync"); 73 asm volatile ("tlbsync");
74
75 pte->host_va = 0;
76
77 if (pte->pte.may_write)
78 kvm_release_pfn_dirty(pte->pfn);
79 else
80 kvm_release_pfn_clean(pte->pfn);
81}
82
83void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
84{
85 int i;
86
87 dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%x & 0x%x\n",
88 vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
89 BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
90
91 guest_ea &= ea_mask;
92 for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
93 struct hpte_cache *pte;
94
95 pte = &vcpu->arch.hpte_cache[i];
96 if (!pte->host_va)
97 continue;
98
99 if ((pte->pte.eaddr & ea_mask) == guest_ea) {
100 invalidate_pte(vcpu, pte);
101 }
102 }
103
104 /* Doing a complete flush -> start from scratch */
105 if (!ea_mask)
106 vcpu->arch.hpte_cache_offset = 0;
107}
108
109void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
110{
111 int i;
112
113 dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
114 vcpu->arch.hpte_cache_offset, guest_vp, vp_mask);
115 BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
116
117 guest_vp &= vp_mask;
118 for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
119 struct hpte_cache *pte;
120
121 pte = &vcpu->arch.hpte_cache[i];
122 if (!pte->host_va)
123 continue;
124
125 if ((pte->pte.vpage & vp_mask) == guest_vp) {
126 invalidate_pte(vcpu, pte);
127 }
128 }
129}
130
131void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
132{
133 int i;
134
135 dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%llx & 0x%llx\n",
136 vcpu->arch.hpte_cache_offset, pa_start, pa_end);
137 BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
138
139 for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
140 struct hpte_cache *pte;
141
142 pte = &vcpu->arch.hpte_cache[i];
143 if (!pte->host_va)
144 continue;
145
146 if ((pte->pte.raddr >= pa_start) &&
147 (pte->pte.raddr < pa_end)) {
148 invalidate_pte(vcpu, pte);
149 }
150 }
151}
152
153struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data)
154{
155 int i;
156 u64 guest_vp;
157
158 guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false);
159 for (i=0; i<vcpu->arch.hpte_cache_offset; i++) {
160 struct hpte_cache *pte;
161
162 pte = &vcpu->arch.hpte_cache[i];
163 if (!pte->host_va)
164 continue;
165
166 if (pte->pte.vpage == guest_vp)
167 return &pte->pte;
168 }
169
170 return NULL;
171}
172
173static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
174{
175 if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
176 kvmppc_mmu_pte_flush(vcpu, 0, 0);
177
178 return vcpu->arch.hpte_cache_offset++;
179} 74}
180 75
181/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using 76/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
182 * a hash, so we don't waste cycles on looping */ 77 * a hash, so we don't waste cycles on looping */
183static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid) 78static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
184{ 79{
185 return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^ 80 return hash_64(gvsid, SID_MAP_BITS);
186 ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
187 ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
188 ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
189 ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
190 ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
191 ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
192 ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
193} 81}
194 82
195 83
@@ -256,7 +144,6 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
256 register int rr = 0; 144 register int rr = 0;
257 bool primary = false; 145 bool primary = false;
258 bool evict = false; 146 bool evict = false;
259 int hpte_id;
260 struct hpte_cache *pte; 147 struct hpte_cache *pte;
261 148
262 /* Get host physical address for gpa */ 149 /* Get host physical address for gpa */
@@ -341,8 +228,7 @@ next_pteg:
341 228
342 /* Now tell our Shadow PTE code about the new page */ 229 /* Now tell our Shadow PTE code about the new page */
343 230
344 hpte_id = kvmppc_mmu_hpte_cache_next(vcpu); 231 pte = kvmppc_mmu_hpte_cache_next(vcpu);
345 pte = &vcpu->arch.hpte_cache[hpte_id];
346 232
347 dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n", 233 dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n",
348 orig_pte->may_write ? 'w' : '-', 234 orig_pte->may_write ? 'w' : '-',
@@ -355,6 +241,8 @@ next_pteg:
355 pte->pte = *orig_pte; 241 pte->pte = *orig_pte;
356 pte->pfn = hpaddr >> PAGE_SHIFT; 242 pte->pfn = hpaddr >> PAGE_SHIFT;
357 243
244 kvmppc_mmu_hpte_cache_map(vcpu, pte);
245
358 return 0; 246 return 0;
359} 247}
360 248
@@ -439,7 +327,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
439 327
440void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) 328void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
441{ 329{
442 kvmppc_mmu_pte_flush(vcpu, 0, 0); 330 kvmppc_mmu_hpte_destroy(vcpu);
443 preempt_disable(); 331 preempt_disable();
444 __destroy_context(to_book3s(vcpu)->context_id); 332 __destroy_context(to_book3s(vcpu)->context_id);
445 preempt_enable(); 333 preempt_enable();
@@ -479,5 +367,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
479 htabmask = ((sdr1 & 0x1FF) << 16) | 0xFFC0; 367 htabmask = ((sdr1 & 0x1FF) << 16) | 0xFFC0;
480 htab = (ulong)__va(sdr1 & 0xffff0000); 368 htab = (ulong)__va(sdr1 & 0xffff0000);
481 369
370 kvmppc_mmu_hpte_init(vcpu);
371
482 return 0; 372 return 0;
483} 373}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index e4b5744977f6..384179a5002b 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kvm_host.h> 22#include <linux/kvm_host.h>
23#include <linux/hash.h>
23 24
24#include <asm/kvm_ppc.h> 25#include <asm/kvm_ppc.h>
25#include <asm/kvm_book3s.h> 26#include <asm/kvm_book3s.h>
@@ -46,135 +47,20 @@
46#define dprintk_slb(a, ...) do { } while(0) 47#define dprintk_slb(a, ...) do { } while(0)
47#endif 48#endif
48 49
49static void invalidate_pte(struct hpte_cache *pte) 50void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
50{ 51{
51 dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
52 pte->pte.eaddr, pte->pte.vpage, pte->host_va);
53
54 ppc_md.hpte_invalidate(pte->slot, pte->host_va, 52 ppc_md.hpte_invalidate(pte->slot, pte->host_va,
55 MMU_PAGE_4K, MMU_SEGSIZE_256M, 53 MMU_PAGE_4K, MMU_SEGSIZE_256M,
56 false); 54 false);
57 pte->host_va = 0;
58
59 if (pte->pte.may_write)
60 kvm_release_pfn_dirty(pte->pfn);
61 else
62 kvm_release_pfn_clean(pte->pfn);
63}
64
65void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
66{
67 int i;
68
69 dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
70 vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
71 BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
72
73 guest_ea &= ea_mask;
74 for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
75 struct hpte_cache *pte;
76
77 pte = &vcpu->arch.hpte_cache[i];
78 if (!pte->host_va)
79 continue;
80
81 if ((pte->pte.eaddr & ea_mask) == guest_ea) {
82 invalidate_pte(pte);
83 }
84 }
85
86 /* Doing a complete flush -> start from scratch */
87 if (!ea_mask)
88 vcpu->arch.hpte_cache_offset = 0;
89}
90
91void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
92{
93 int i;
94
95 dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
96 vcpu->arch.hpte_cache_offset, guest_vp, vp_mask);
97 BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
98
99 guest_vp &= vp_mask;
100 for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
101 struct hpte_cache *pte;
102
103 pte = &vcpu->arch.hpte_cache[i];
104 if (!pte->host_va)
105 continue;
106
107 if ((pte->pte.vpage & vp_mask) == guest_vp) {
108 invalidate_pte(pte);
109 }
110 }
111}
112
113void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
114{
115 int i;
116
117 dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx & 0x%lx\n",
118 vcpu->arch.hpte_cache_offset, pa_start, pa_end);
119 BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
120
121 for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
122 struct hpte_cache *pte;
123
124 pte = &vcpu->arch.hpte_cache[i];
125 if (!pte->host_va)
126 continue;
127
128 if ((pte->pte.raddr >= pa_start) &&
129 (pte->pte.raddr < pa_end)) {
130 invalidate_pte(pte);
131 }
132 }
133}
134
135struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data)
136{
137 int i;
138 u64 guest_vp;
139
140 guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false);
141 for (i=0; i<vcpu->arch.hpte_cache_offset; i++) {
142 struct hpte_cache *pte;
143
144 pte = &vcpu->arch.hpte_cache[i];
145 if (!pte->host_va)
146 continue;
147
148 if (pte->pte.vpage == guest_vp)
149 return &pte->pte;
150 }
151
152 return NULL;
153}
154
155static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
156{
157 if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
158 kvmppc_mmu_pte_flush(vcpu, 0, 0);
159
160 return vcpu->arch.hpte_cache_offset++;
161} 55}
162 56
163/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using 57/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
164 * a hash, so we don't waste cycles on looping */ 58 * a hash, so we don't waste cycles on looping */
165static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid) 59static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
166{ 60{
167 return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^ 61 return hash_64(gvsid, SID_MAP_BITS);
168 ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
169 ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
170 ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
171 ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
172 ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
173 ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
174 ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
175} 62}
176 63
177
178static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) 64static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
179{ 65{
180 struct kvmppc_sid_map *map; 66 struct kvmppc_sid_map *map;
@@ -273,8 +159,7 @@ map_again:
273 attempt++; 159 attempt++;
274 goto map_again; 160 goto map_again;
275 } else { 161 } else {
276 int hpte_id = kvmppc_mmu_hpte_cache_next(vcpu); 162 struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu);
277 struct hpte_cache *pte = &vcpu->arch.hpte_cache[hpte_id];
278 163
279 dprintk_mmu("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx\n", 164 dprintk_mmu("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx\n",
280 ((rflags & HPTE_R_PP) == 3) ? '-' : 'w', 165 ((rflags & HPTE_R_PP) == 3) ? '-' : 'w',
@@ -292,6 +177,8 @@ map_again:
292 pte->host_va = va; 177 pte->host_va = va;
293 pte->pte = *orig_pte; 178 pte->pte = *orig_pte;
294 pte->pfn = hpaddr >> PAGE_SHIFT; 179 pte->pfn = hpaddr >> PAGE_SHIFT;
180
181 kvmppc_mmu_hpte_cache_map(vcpu, pte);
295 } 182 }
296 183
297 return 0; 184 return 0;
@@ -418,7 +305,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
418 305
419void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) 306void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
420{ 307{
421 kvmppc_mmu_pte_flush(vcpu, 0, 0); 308 kvmppc_mmu_hpte_destroy(vcpu);
422 __destroy_context(to_book3s(vcpu)->context_id); 309 __destroy_context(to_book3s(vcpu)->context_id);
423} 310}
424 311
@@ -436,5 +323,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
436 vcpu3s->vsid_first = vcpu3s->context_id << USER_ESID_BITS; 323 vcpu3s->vsid_first = vcpu3s->context_id << USER_ESID_BITS;
437 vcpu3s->vsid_next = vcpu3s->vsid_first; 324 vcpu3s->vsid_next = vcpu3s->vsid_first;
438 325
326 kvmppc_mmu_hpte_init(vcpu);
327
439 return 0; 328 return 0;
440} 329}
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
new file mode 100644
index 000000000000..4868d4a7ebc5
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -0,0 +1,277 @@
1/*
2 * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
3 *
4 * Authors:
5 * Alexander Graf <agraf@suse.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License, version 2, as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 */
20
21#include <linux/kvm_host.h>
22#include <linux/hash.h>
23#include <linux/slab.h>
24
25#include <asm/kvm_ppc.h>
26#include <asm/kvm_book3s.h>
27#include <asm/machdep.h>
28#include <asm/mmu_context.h>
29#include <asm/hw_irq.h>
30
31#define PTE_SIZE 12
32
33/* #define DEBUG_MMU */
34
35#ifdef DEBUG_MMU
36#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
37#else
38#define dprintk_mmu(a, ...) do { } while(0)
39#endif
40
41static struct kmem_cache *hpte_cache;
42
43static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
44{
45 return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE);
46}
47
48static inline u64 kvmppc_mmu_hash_vpte(u64 vpage)
49{
50 return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
51}
52
53static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)
54{
55 return hash_64((vpage & 0xffffff000ULL) >> 12,
56 HPTEG_HASH_BITS_VPTE_LONG);
57}
58
59void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
60{
61 u64 index;
62
63 /* Add to ePTE list */
64 index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
65 hlist_add_head(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
66
67 /* Add to vPTE list */
68 index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
69 hlist_add_head(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
70
71 /* Add to vPTE_long list */
72 index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
73 hlist_add_head(&pte->list_vpte_long,
74 &vcpu->arch.hpte_hash_vpte_long[index]);
75}
76
77static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
78{
79 dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
80 pte->pte.eaddr, pte->pte.vpage, pte->host_va);
81
82 /* Different for 32 and 64 bit */
83 kvmppc_mmu_invalidate_pte(vcpu, pte);
84
85 if (pte->pte.may_write)
86 kvm_release_pfn_dirty(pte->pfn);
87 else
88 kvm_release_pfn_clean(pte->pfn);
89
90 hlist_del(&pte->list_pte);
91 hlist_del(&pte->list_vpte);
92 hlist_del(&pte->list_vpte_long);
93
94 vcpu->arch.hpte_cache_count--;
95 kmem_cache_free(hpte_cache, pte);
96}
97
98static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
99{
100 struct hpte_cache *pte;
101 struct hlist_node *node, *tmp;
102 int i;
103
104 for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
105 struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
106
107 hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
108 invalidate_pte(vcpu, pte);
109 }
110}
111
112static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
113{
114 struct hlist_head *list;
115 struct hlist_node *node, *tmp;
116 struct hpte_cache *pte;
117
118 /* Find the list of entries in the map */
119 list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
120
121 /* Check the list for matching entries and invalidate */
122 hlist_for_each_entry_safe(pte, node, tmp, list, list_pte)
123 if ((pte->pte.eaddr & ~0xfffUL) == guest_ea)
124 invalidate_pte(vcpu, pte);
125}
126
127void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
128{
129 u64 i;
130
131 dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
132 vcpu->arch.hpte_cache_count, guest_ea, ea_mask);
133
134 guest_ea &= ea_mask;
135
136 switch (ea_mask) {
137 case ~0xfffUL:
138 kvmppc_mmu_pte_flush_page(vcpu, guest_ea);
139 break;
140 case 0x0ffff000:
141 /* 32-bit flush w/o segment, go through all possible segments */
142 for (i = 0; i < 0x100000000ULL; i += 0x10000000ULL)
143 kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL);
144 break;
145 case 0:
146 /* Doing a complete flush -> start from scratch */
147 kvmppc_mmu_pte_flush_all(vcpu);
148 break;
149 default:
150 WARN_ON(1);
151 break;
152 }
153}
154
155/* Flush with mask 0xfffffffff */
156static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
157{
158 struct hlist_head *list;
159 struct hlist_node *node, *tmp;
160 struct hpte_cache *pte;
161 u64 vp_mask = 0xfffffffffULL;
162
163 list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
164
165 /* Check the list for matching entries and invalidate */
166 hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte)
167 if ((pte->pte.vpage & vp_mask) == guest_vp)
168 invalidate_pte(vcpu, pte);
169}
170
171/* Flush with mask 0xffffff000 */
172static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
173{
174 struct hlist_head *list;
175 struct hlist_node *node, *tmp;
176 struct hpte_cache *pte;
177 u64 vp_mask = 0xffffff000ULL;
178
179 list = &vcpu->arch.hpte_hash_vpte_long[
180 kvmppc_mmu_hash_vpte_long(guest_vp)];
181
182 /* Check the list for matching entries and invalidate */
183 hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
184 if ((pte->pte.vpage & vp_mask) == guest_vp)
185 invalidate_pte(vcpu, pte);
186}
187
188void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
189{
190 dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
191 vcpu->arch.hpte_cache_count, guest_vp, vp_mask);
192 guest_vp &= vp_mask;
193
194 switch(vp_mask) {
195 case 0xfffffffffULL:
196 kvmppc_mmu_pte_vflush_short(vcpu, guest_vp);
197 break;
198 case 0xffffff000ULL:
199 kvmppc_mmu_pte_vflush_long(vcpu, guest_vp);
200 break;
201 default:
202 WARN_ON(1);
203 return;
204 }
205}
206
207void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
208{
209 struct hlist_node *node, *tmp;
210 struct hpte_cache *pte;
211 int i;
212
213 dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n",
214 vcpu->arch.hpte_cache_count, pa_start, pa_end);
215
216 for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
217 struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
218
219 hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
220 if ((pte->pte.raddr >= pa_start) &&
221 (pte->pte.raddr < pa_end))
222 invalidate_pte(vcpu, pte);
223 }
224}
225
226struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
227{
228 struct hpte_cache *pte;
229
230 pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
231 vcpu->arch.hpte_cache_count++;
232
233 if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM)
234 kvmppc_mmu_pte_flush_all(vcpu);
235
236 return pte;
237}
238
239void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu)
240{
241 kvmppc_mmu_pte_flush(vcpu, 0, 0);
242}
243
244static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len)
245{
246 int i;
247
248 for (i = 0; i < len; i++)
249 INIT_HLIST_HEAD(&hash_list[i]);
250}
251
252int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
253{
254 /* init hpte lookup hashes */
255 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
256 ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
257 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
258 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
259 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
260 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
261
262 return 0;
263}
264
265int kvmppc_mmu_hpte_sysinit(void)
266{
267 /* init hpte slab cache */
268 hpte_cache = kmem_cache_create("kvm-spt", sizeof(struct hpte_cache),
269 sizeof(struct hpte_cache), 0, NULL);
270
271 return 0;
272}
273
274void kvmppc_mmu_hpte_sysexit(void)
275{
276 kmem_cache_destroy(hpte_cache);
277}
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
index a9f66abafcb3..474f2e24050a 100644
--- a/arch/powerpc/kvm/book3s_paired_singles.c
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -159,10 +159,7 @@
159 159
160static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt) 160static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
161{ 161{
162 struct thread_struct t; 162 kvm_cvt_df(&vcpu->arch.fpr[rt], &vcpu->arch.qpr[rt], &vcpu->arch.fpscr);
163
164 t.fpscr.val = vcpu->arch.fpscr;
165 cvt_df((double*)&vcpu->arch.fpr[rt], (float*)&vcpu->arch.qpr[rt], &t);
166} 163}
167 164
168static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store) 165static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
@@ -183,7 +180,6 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
183 int rs, ulong addr, int ls_type) 180 int rs, ulong addr, int ls_type)
184{ 181{
185 int emulated = EMULATE_FAIL; 182 int emulated = EMULATE_FAIL;
186 struct thread_struct t;
187 int r; 183 int r;
188 char tmp[8]; 184 char tmp[8];
189 int len = sizeof(u32); 185 int len = sizeof(u32);
@@ -191,8 +187,6 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
191 if (ls_type == FPU_LS_DOUBLE) 187 if (ls_type == FPU_LS_DOUBLE)
192 len = sizeof(u64); 188 len = sizeof(u64);
193 189
194 t.fpscr.val = vcpu->arch.fpscr;
195
196 /* read from memory */ 190 /* read from memory */
197 r = kvmppc_ld(vcpu, &addr, len, tmp, true); 191 r = kvmppc_ld(vcpu, &addr, len, tmp, true);
198 vcpu->arch.paddr_accessed = addr; 192 vcpu->arch.paddr_accessed = addr;
@@ -210,7 +204,7 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
210 /* put in registers */ 204 /* put in registers */
211 switch (ls_type) { 205 switch (ls_type) {
212 case FPU_LS_SINGLE: 206 case FPU_LS_SINGLE:
213 cvt_fd((float*)tmp, (double*)&vcpu->arch.fpr[rs], &t); 207 kvm_cvt_fd((u32*)tmp, &vcpu->arch.fpr[rs], &vcpu->arch.fpscr);
214 vcpu->arch.qpr[rs] = *((u32*)tmp); 208 vcpu->arch.qpr[rs] = *((u32*)tmp);
215 break; 209 break;
216 case FPU_LS_DOUBLE: 210 case FPU_LS_DOUBLE:
@@ -229,17 +223,14 @@ static int kvmppc_emulate_fpr_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
229 int rs, ulong addr, int ls_type) 223 int rs, ulong addr, int ls_type)
230{ 224{
231 int emulated = EMULATE_FAIL; 225 int emulated = EMULATE_FAIL;
232 struct thread_struct t;
233 int r; 226 int r;
234 char tmp[8]; 227 char tmp[8];
235 u64 val; 228 u64 val;
236 int len; 229 int len;
237 230
238 t.fpscr.val = vcpu->arch.fpscr;
239
240 switch (ls_type) { 231 switch (ls_type) {
241 case FPU_LS_SINGLE: 232 case FPU_LS_SINGLE:
242 cvt_df((double*)&vcpu->arch.fpr[rs], (float*)tmp, &t); 233 kvm_cvt_df(&vcpu->arch.fpr[rs], (u32*)tmp, &vcpu->arch.fpscr);
243 val = *((u32*)tmp); 234 val = *((u32*)tmp);
244 len = sizeof(u32); 235 len = sizeof(u32);
245 break; 236 break;
@@ -278,13 +269,10 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
278 int rs, ulong addr, bool w, int i) 269 int rs, ulong addr, bool w, int i)
279{ 270{
280 int emulated = EMULATE_FAIL; 271 int emulated = EMULATE_FAIL;
281 struct thread_struct t;
282 int r; 272 int r;
283 float one = 1.0; 273 float one = 1.0;
284 u32 tmp[2]; 274 u32 tmp[2];
285 275
286 t.fpscr.val = vcpu->arch.fpscr;
287
288 /* read from memory */ 276 /* read from memory */
289 if (w) { 277 if (w) {
290 r = kvmppc_ld(vcpu, &addr, sizeof(u32), tmp, true); 278 r = kvmppc_ld(vcpu, &addr, sizeof(u32), tmp, true);
@@ -308,7 +296,7 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
308 emulated = EMULATE_DONE; 296 emulated = EMULATE_DONE;
309 297
310 /* put in registers */ 298 /* put in registers */
311 cvt_fd((float*)&tmp[0], (double*)&vcpu->arch.fpr[rs], &t); 299 kvm_cvt_fd(&tmp[0], &vcpu->arch.fpr[rs], &vcpu->arch.fpscr);
312 vcpu->arch.qpr[rs] = tmp[1]; 300 vcpu->arch.qpr[rs] = tmp[1];
313 301
314 dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0], 302 dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0],
@@ -322,14 +310,11 @@ static int kvmppc_emulate_psq_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
322 int rs, ulong addr, bool w, int i) 310 int rs, ulong addr, bool w, int i)
323{ 311{
324 int emulated = EMULATE_FAIL; 312 int emulated = EMULATE_FAIL;
325 struct thread_struct t;
326 int r; 313 int r;
327 u32 tmp[2]; 314 u32 tmp[2];
328 int len = w ? sizeof(u32) : sizeof(u64); 315 int len = w ? sizeof(u32) : sizeof(u64);
329 316
330 t.fpscr.val = vcpu->arch.fpscr; 317 kvm_cvt_df(&vcpu->arch.fpr[rs], &tmp[0], &vcpu->arch.fpscr);
331
332 cvt_df((double*)&vcpu->arch.fpr[rs], (float*)&tmp[0], &t);
333 tmp[1] = vcpu->arch.qpr[rs]; 318 tmp[1] = vcpu->arch.qpr[rs];
334 319
335 r = kvmppc_st(vcpu, &addr, len, tmp, true); 320 r = kvmppc_st(vcpu, &addr, len, tmp, true);
@@ -517,7 +502,7 @@ static int get_d_signext(u32 inst)
517static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc, 502static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
518 int reg_out, int reg_in1, int reg_in2, 503 int reg_out, int reg_in1, int reg_in2,
519 int reg_in3, int scalar, 504 int reg_in3, int scalar,
520 void (*func)(struct thread_struct *t, 505 void (*func)(u64 *fpscr,
521 u32 *dst, u32 *src1, 506 u32 *dst, u32 *src1,
522 u32 *src2, u32 *src3)) 507 u32 *src2, u32 *src3))
523{ 508{
@@ -526,27 +511,25 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
526 u32 ps0_out; 511 u32 ps0_out;
527 u32 ps0_in1, ps0_in2, ps0_in3; 512 u32 ps0_in1, ps0_in2, ps0_in3;
528 u32 ps1_in1, ps1_in2, ps1_in3; 513 u32 ps1_in1, ps1_in2, ps1_in3;
529 struct thread_struct t;
530 t.fpscr.val = vcpu->arch.fpscr;
531 514
532 /* RC */ 515 /* RC */
533 WARN_ON(rc); 516 WARN_ON(rc);
534 517
535 /* PS0 */ 518 /* PS0 */
536 cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t); 519 kvm_cvt_df(&fpr[reg_in1], &ps0_in1, &vcpu->arch.fpscr);
537 cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t); 520 kvm_cvt_df(&fpr[reg_in2], &ps0_in2, &vcpu->arch.fpscr);
538 cvt_df((double*)&fpr[reg_in3], (float*)&ps0_in3, &t); 521 kvm_cvt_df(&fpr[reg_in3], &ps0_in3, &vcpu->arch.fpscr);
539 522
540 if (scalar & SCALAR_LOW) 523 if (scalar & SCALAR_LOW)
541 ps0_in2 = qpr[reg_in2]; 524 ps0_in2 = qpr[reg_in2];
542 525
543 func(&t, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3); 526 func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);
544 527
545 dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n", 528 dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
546 ps0_in1, ps0_in2, ps0_in3, ps0_out); 529 ps0_in1, ps0_in2, ps0_in3, ps0_out);
547 530
548 if (!(scalar & SCALAR_NO_PS0)) 531 if (!(scalar & SCALAR_NO_PS0))
549 cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t); 532 kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
550 533
551 /* PS1 */ 534 /* PS1 */
552 ps1_in1 = qpr[reg_in1]; 535 ps1_in1 = qpr[reg_in1];
@@ -557,7 +540,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
557 ps1_in2 = ps0_in2; 540 ps1_in2 = ps0_in2;
558 541
559 if (!(scalar & SCALAR_NO_PS1)) 542 if (!(scalar & SCALAR_NO_PS1))
560 func(&t, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3); 543 func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);
561 544
562 dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n", 545 dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
563 ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]); 546 ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]);
@@ -568,7 +551,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
568static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc, 551static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
569 int reg_out, int reg_in1, int reg_in2, 552 int reg_out, int reg_in1, int reg_in2,
570 int scalar, 553 int scalar,
571 void (*func)(struct thread_struct *t, 554 void (*func)(u64 *fpscr,
572 u32 *dst, u32 *src1, 555 u32 *dst, u32 *src1,
573 u32 *src2)) 556 u32 *src2))
574{ 557{
@@ -578,27 +561,25 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
578 u32 ps0_in1, ps0_in2; 561 u32 ps0_in1, ps0_in2;
579 u32 ps1_out; 562 u32 ps1_out;
580 u32 ps1_in1, ps1_in2; 563 u32 ps1_in1, ps1_in2;
581 struct thread_struct t;
582 t.fpscr.val = vcpu->arch.fpscr;
583 564
584 /* RC */ 565 /* RC */
585 WARN_ON(rc); 566 WARN_ON(rc);
586 567
587 /* PS0 */ 568 /* PS0 */
588 cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t); 569 kvm_cvt_df(&fpr[reg_in1], &ps0_in1, &vcpu->arch.fpscr);
589 570
590 if (scalar & SCALAR_LOW) 571 if (scalar & SCALAR_LOW)
591 ps0_in2 = qpr[reg_in2]; 572 ps0_in2 = qpr[reg_in2];
592 else 573 else
593 cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t); 574 kvm_cvt_df(&fpr[reg_in2], &ps0_in2, &vcpu->arch.fpscr);
594 575
595 func(&t, &ps0_out, &ps0_in1, &ps0_in2); 576 func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2);
596 577
597 if (!(scalar & SCALAR_NO_PS0)) { 578 if (!(scalar & SCALAR_NO_PS0)) {
598 dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n", 579 dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n",
599 ps0_in1, ps0_in2, ps0_out); 580 ps0_in1, ps0_in2, ps0_out);
600 581
601 cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t); 582 kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
602 } 583 }
603 584
604 /* PS1 */ 585 /* PS1 */
@@ -608,7 +589,7 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
608 if (scalar & SCALAR_HIGH) 589 if (scalar & SCALAR_HIGH)
609 ps1_in2 = ps0_in2; 590 ps1_in2 = ps0_in2;
610 591
611 func(&t, &ps1_out, &ps1_in1, &ps1_in2); 592 func(&vcpu->arch.fpscr, &ps1_out, &ps1_in1, &ps1_in2);
612 593
613 if (!(scalar & SCALAR_NO_PS1)) { 594 if (!(scalar & SCALAR_NO_PS1)) {
614 qpr[reg_out] = ps1_out; 595 qpr[reg_out] = ps1_out;
@@ -622,31 +603,29 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
622 603
623static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc, 604static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc,
624 int reg_out, int reg_in, 605 int reg_out, int reg_in,
625 void (*func)(struct thread_struct *t, 606 void (*func)(u64 *t,
626 u32 *dst, u32 *src1)) 607 u32 *dst, u32 *src1))
627{ 608{
628 u32 *qpr = vcpu->arch.qpr; 609 u32 *qpr = vcpu->arch.qpr;
629 u64 *fpr = vcpu->arch.fpr; 610 u64 *fpr = vcpu->arch.fpr;
630 u32 ps0_out, ps0_in; 611 u32 ps0_out, ps0_in;
631 u32 ps1_in; 612 u32 ps1_in;
632 struct thread_struct t;
633 t.fpscr.val = vcpu->arch.fpscr;
634 613
635 /* RC */ 614 /* RC */
636 WARN_ON(rc); 615 WARN_ON(rc);
637 616
638 /* PS0 */ 617 /* PS0 */
639 cvt_df((double*)&fpr[reg_in], (float*)&ps0_in, &t); 618 kvm_cvt_df(&fpr[reg_in], &ps0_in, &vcpu->arch.fpscr);
640 func(&t, &ps0_out, &ps0_in); 619 func(&vcpu->arch.fpscr, &ps0_out, &ps0_in);
641 620
642 dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n", 621 dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n",
643 ps0_in, ps0_out); 622 ps0_in, ps0_out);
644 623
645 cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t); 624 kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
646 625
647 /* PS1 */ 626 /* PS1 */
648 ps1_in = qpr[reg_in]; 627 ps1_in = qpr[reg_in];
649 func(&t, &qpr[reg_out], &ps1_in); 628 func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in);
650 629
651 dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n", 630 dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n",
652 ps1_in, qpr[reg_out]); 631 ps1_in, qpr[reg_out]);
@@ -672,13 +651,10 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
672 651
673 bool rcomp = (inst & 1) ? true : false; 652 bool rcomp = (inst & 1) ? true : false;
674 u32 cr = kvmppc_get_cr(vcpu); 653 u32 cr = kvmppc_get_cr(vcpu);
675 struct thread_struct t;
676#ifdef DEBUG 654#ifdef DEBUG
677 int i; 655 int i;
678#endif 656#endif
679 657
680 t.fpscr.val = vcpu->arch.fpscr;
681
682 if (!kvmppc_inst_is_paired_single(vcpu, inst)) 658 if (!kvmppc_inst_is_paired_single(vcpu, inst))
683 return EMULATE_FAIL; 659 return EMULATE_FAIL;
684 660
@@ -695,7 +671,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
695#ifdef DEBUG 671#ifdef DEBUG
696 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) { 672 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
697 u32 f; 673 u32 f;
698 cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t); 674 kvm_cvt_df(&vcpu->arch.fpr[i], &f, &vcpu->arch.fpscr);
699 dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx QPR[%d] = 0x%x\n", 675 dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx QPR[%d] = 0x%x\n",
700 i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]); 676 i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]);
701 } 677 }
@@ -819,8 +795,9 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
819 WARN_ON(rcomp); 795 WARN_ON(rcomp);
820 vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra]; 796 vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra];
821 /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */ 797 /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
822 cvt_df((double*)&vcpu->arch.fpr[ax_rb], 798 kvm_cvt_df(&vcpu->arch.fpr[ax_rb],
823 (float*)&vcpu->arch.qpr[ax_rd], &t); 799 &vcpu->arch.qpr[ax_rd],
800 &vcpu->arch.fpscr);
824 break; 801 break;
825 case OP_4X_PS_MERGE01: 802 case OP_4X_PS_MERGE01:
826 WARN_ON(rcomp); 803 WARN_ON(rcomp);
@@ -830,17 +807,20 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
830 case OP_4X_PS_MERGE10: 807 case OP_4X_PS_MERGE10:
831 WARN_ON(rcomp); 808 WARN_ON(rcomp);
832 /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */ 809 /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
833 cvt_fd((float*)&vcpu->arch.qpr[ax_ra], 810 kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
834 (double*)&vcpu->arch.fpr[ax_rd], &t); 811 &vcpu->arch.fpr[ax_rd],
812 &vcpu->arch.fpscr);
835 /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */ 813 /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
836 cvt_df((double*)&vcpu->arch.fpr[ax_rb], 814 kvm_cvt_df(&vcpu->arch.fpr[ax_rb],
837 (float*)&vcpu->arch.qpr[ax_rd], &t); 815 &vcpu->arch.qpr[ax_rd],
816 &vcpu->arch.fpscr);
838 break; 817 break;
839 case OP_4X_PS_MERGE11: 818 case OP_4X_PS_MERGE11:
840 WARN_ON(rcomp); 819 WARN_ON(rcomp);
841 /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */ 820 /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
842 cvt_fd((float*)&vcpu->arch.qpr[ax_ra], 821 kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
843 (double*)&vcpu->arch.fpr[ax_rd], &t); 822 &vcpu->arch.fpr[ax_rd],
823 &vcpu->arch.fpscr);
844 vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; 824 vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
845 break; 825 break;
846 } 826 }
@@ -1275,7 +1255,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
1275#ifdef DEBUG 1255#ifdef DEBUG
1276 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) { 1256 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
1277 u32 f; 1257 u32 f;
1278 cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t); 1258 kvm_cvt_df(&vcpu->arch.fpr[i], &f, &vcpu->arch.fpscr);
1279 dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f); 1259 dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f);
1280 } 1260 }
1281#endif 1261#endif
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index a33ab8cc2ccc..8d4e35f5372c 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -144,7 +144,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
144 unsigned int priority) 144 unsigned int priority)
145{ 145{
146 int allowed = 0; 146 int allowed = 0;
147 ulong msr_mask; 147 ulong uninitialized_var(msr_mask);
148 bool update_esr = false, update_dear = false; 148 bool update_esr = false, update_dear = false;
149 149
150 switch (priority) { 150 switch (priority) {
@@ -485,8 +485,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
485{ 485{
486 int i; 486 int i;
487 487
488 vcpu_load(vcpu);
489
490 regs->pc = vcpu->arch.pc; 488 regs->pc = vcpu->arch.pc;
491 regs->cr = kvmppc_get_cr(vcpu); 489 regs->cr = kvmppc_get_cr(vcpu);
492 regs->ctr = vcpu->arch.ctr; 490 regs->ctr = vcpu->arch.ctr;
@@ -507,8 +505,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
507 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 505 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
508 regs->gpr[i] = kvmppc_get_gpr(vcpu, i); 506 regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
509 507
510 vcpu_put(vcpu);
511
512 return 0; 508 return 0;
513} 509}
514 510
@@ -516,8 +512,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
516{ 512{
517 int i; 513 int i;
518 514
519 vcpu_load(vcpu);
520
521 vcpu->arch.pc = regs->pc; 515 vcpu->arch.pc = regs->pc;
522 kvmppc_set_cr(vcpu, regs->cr); 516 kvmppc_set_cr(vcpu, regs->cr);
523 vcpu->arch.ctr = regs->ctr; 517 vcpu->arch.ctr = regs->ctr;
@@ -537,8 +531,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
537 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 531 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
538 kvmppc_set_gpr(vcpu, i, regs->gpr[i]); 532 kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
539 533
540 vcpu_put(vcpu);
541
542 return 0; 534 return 0;
543} 535}
544 536
@@ -569,9 +561,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
569{ 561{
570 int r; 562 int r;
571 563
572 vcpu_load(vcpu);
573 r = kvmppc_core_vcpu_translate(vcpu, tr); 564 r = kvmppc_core_vcpu_translate(vcpu, tr);
574 vcpu_put(vcpu);
575 return r; 565 return r;
576} 566}
577 567
diff --git a/arch/powerpc/kvm/fpu.S b/arch/powerpc/kvm/fpu.S
index 2b340a3eee90..cb34bbe16113 100644
--- a/arch/powerpc/kvm/fpu.S
+++ b/arch/powerpc/kvm/fpu.S
@@ -271,3 +271,21 @@ FPD_THREE_IN(fmsub)
271FPD_THREE_IN(fmadd) 271FPD_THREE_IN(fmadd)
272FPD_THREE_IN(fnmsub) 272FPD_THREE_IN(fnmsub)
273FPD_THREE_IN(fnmadd) 273FPD_THREE_IN(fnmadd)
274
275_GLOBAL(kvm_cvt_fd)
276 lfd 0,0(r5) /* load up fpscr value */
277 MTFSF_L(0)
278 lfs 0,0(r3)
279 stfd 0,0(r4)
280 mffs 0
281 stfd 0,0(r5) /* save new fpscr value */
282 blr
283
284_GLOBAL(kvm_cvt_df)
285 lfd 0,0(r5) /* load up fpscr value */
286 MTFSF_L(0)
287 lfd 0,0(r3)
288 stfs 0,0(r4)
289 mffs 0
290 stfd 0,0(r5) /* save new fpscr value */
291 blr
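
The two assembly routines above replace the thread_struct-based cvt_fd()/cvt_df() calls in the paired-single emulation earlier in this diff. A minimal sketch of the C prototypes implied by the call sites (kvm_cvt_df(&fpr[rs], &tmp, &vcpu->arch.fpscr) and so on), assuming they are declared alongside the other FPD helpers, e.g. in asm/kvm_fpu.h; inferred, not quoted from the patch:

        /*
         * Sketch only: prototypes inferred from the call sites above.
         * Both helpers load the guest FPSCR image before the conversion
         * and store the updated value back afterwards.
         */
        extern void kvm_cvt_fd(u32 *from, u64 *to, u64 *fpscr); /* lfs + stfd: single to double */
        extern void kvm_cvt_df(u64 *from, u32 *to, u64 *fpscr); /* lfd + stfs: double to single */
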
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 9b8683f39e05..72a4ad86ee91 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -36,11 +36,6 @@
36#define CREATE_TRACE_POINTS 36#define CREATE_TRACE_POINTS
37#include "trace.h" 37#include "trace.h"
38 38
39gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
40{
41 return gfn;
42}
43
44int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 39int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
45{ 40{
46 return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions); 41 return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions);
@@ -287,7 +282,7 @@ static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
287static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, 282static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
288 struct kvm_run *run) 283 struct kvm_run *run)
289{ 284{
290 u64 gpr; 285 u64 uninitialized_var(gpr);
291 286
292 if (run->mmio.len > sizeof(gpr)) { 287 if (run->mmio.len > sizeof(gpr)) {
293 printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len); 288 printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);
@@ -423,8 +418,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
423 int r; 418 int r;
424 sigset_t sigsaved; 419 sigset_t sigsaved;
425 420
426 vcpu_load(vcpu);
427
428 if (vcpu->sigset_active) 421 if (vcpu->sigset_active)
429 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 422 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
430 423
@@ -456,8 +449,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
456 if (vcpu->sigset_active) 449 if (vcpu->sigset_active)
457 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 450 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
458 451
459 vcpu_put(vcpu);
460
461 return r; 452 return r;
462} 453}
463 454
@@ -523,8 +514,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
523 if (copy_from_user(&irq, argp, sizeof(irq))) 514 if (copy_from_user(&irq, argp, sizeof(irq)))
524 goto out; 515 goto out;
525 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 516 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
526 break; 517 goto out;
527 } 518 }
519
528 case KVM_ENABLE_CAP: 520 case KVM_ENABLE_CAP:
529 { 521 {
530 struct kvm_enable_cap cap; 522 struct kvm_enable_cap cap;
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 27605b62b980..cef7dbf69dfc 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -26,7 +26,7 @@
26 26
27struct sca_entry { 27struct sca_entry {
28 atomic_t scn; 28 atomic_t scn;
29 __u64 reserved; 29 __u32 reserved;
30 __u64 sda; 30 __u64 sda;
31 __u64 reserved2[2]; 31 __u64 reserved2[2];
32} __attribute__((packed)); 32} __attribute__((packed));
@@ -41,7 +41,8 @@ struct sca_block {
41} __attribute__((packed)); 41} __attribute__((packed));
42 42
43#define KVM_NR_PAGE_SIZES 2 43#define KVM_NR_PAGE_SIZES 2
44#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8) 44#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 8)
45#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
45#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) 46#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
46#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) 47#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
47#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) 48#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
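
The new KVM_HPAGE_GFN_SHIFT() simply factors the gfn-level shift out of KVM_HPAGE_SHIFT(). A quick expansion for the second page-size level, assuming the usual s390 PAGE_SHIFT of 12:

        /* illustrative expansion only, assuming PAGE_SHIFT == 12 */
        KVM_HPAGE_GFN_SHIFT(2) = (2 - 1) * 8              =   8
        KVM_HPAGE_SHIFT(2)     = 12 + 8                   =  20    /* 1 MiB segments */
        KVM_PAGES_PER_HPAGE(2) = (1 << 20) / (1 << 12)    = 256
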
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 3ddc30895e31..f7b6df45d8be 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -135,7 +135,7 @@ static int handle_stop(struct kvm_vcpu *vcpu)
135 spin_lock_bh(&vcpu->arch.local_int.lock); 135 spin_lock_bh(&vcpu->arch.local_int.lock);
136 if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) { 136 if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) {
137 vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP; 137 vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP;
138 rc = __kvm_s390_vcpu_store_status(vcpu, 138 rc = kvm_s390_vcpu_store_status(vcpu,
139 KVM_S390_STORE_STATUS_NOADDR); 139 KVM_S390_STORE_STATUS_NOADDR);
140 if (rc >= 0) 140 if (rc >= 0)
141 rc = -EOPNOTSUPP; 141 rc = -EOPNOTSUPP;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ae3705816878..4fe68650535c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -207,6 +207,7 @@ out_nokvm:
207void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 207void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
208{ 208{
209 VCPU_EVENT(vcpu, 3, "%s", "free cpu"); 209 VCPU_EVENT(vcpu, 3, "%s", "free cpu");
210 clear_bit(63 - vcpu->vcpu_id, (unsigned long *) &vcpu->kvm->arch.sca->mcn);
210 if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda == 211 if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda ==
211 (__u64) vcpu->arch.sie_block) 212 (__u64) vcpu->arch.sie_block)
212 vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0; 213 vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
@@ -296,7 +297,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
296{ 297{
297 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH); 298 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH);
298 set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests); 299 set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests);
299 vcpu->arch.sie_block->ecb = 2; 300 vcpu->arch.sie_block->ecb = 6;
300 vcpu->arch.sie_block->eca = 0xC1002001U; 301 vcpu->arch.sie_block->eca = 0xC1002001U;
301 vcpu->arch.sie_block->fac = (int) (long) facilities; 302 vcpu->arch.sie_block->fac = (int) (long) facilities;
302 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 303 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
@@ -329,6 +330,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
329 kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block; 330 kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
330 vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32); 331 vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
331 vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca; 332 vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
333 set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn);
332 334
333 spin_lock_init(&vcpu->arch.local_int.lock); 335 spin_lock_init(&vcpu->arch.local_int.lock);
334 INIT_LIST_HEAD(&vcpu->arch.local_int.list); 336 INIT_LIST_HEAD(&vcpu->arch.local_int.list);
@@ -363,63 +365,49 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
363 365
364static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) 366static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
365{ 367{
366 vcpu_load(vcpu);
367 kvm_s390_vcpu_initial_reset(vcpu); 368 kvm_s390_vcpu_initial_reset(vcpu);
368 vcpu_put(vcpu);
369 return 0; 369 return 0;
370} 370}
371 371
372int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 372int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
373{ 373{
374 vcpu_load(vcpu);
375 memcpy(&vcpu->arch.guest_gprs, &regs->gprs, sizeof(regs->gprs)); 374 memcpy(&vcpu->arch.guest_gprs, &regs->gprs, sizeof(regs->gprs));
376 vcpu_put(vcpu);
377 return 0; 375 return 0;
378} 376}
379 377
380int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 378int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
381{ 379{
382 vcpu_load(vcpu);
383 memcpy(&regs->gprs, &vcpu->arch.guest_gprs, sizeof(regs->gprs)); 380 memcpy(&regs->gprs, &vcpu->arch.guest_gprs, sizeof(regs->gprs));
384 vcpu_put(vcpu);
385 return 0; 381 return 0;
386} 382}
387 383
388int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 384int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
389 struct kvm_sregs *sregs) 385 struct kvm_sregs *sregs)
390{ 386{
391 vcpu_load(vcpu);
392 memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs)); 387 memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs));
393 memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs)); 388 memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
394 vcpu_put(vcpu);
395 return 0; 389 return 0;
396} 390}
397 391
398int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 392int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
399 struct kvm_sregs *sregs) 393 struct kvm_sregs *sregs)
400{ 394{
401 vcpu_load(vcpu);
402 memcpy(&sregs->acrs, &vcpu->arch.guest_acrs, sizeof(sregs->acrs)); 395 memcpy(&sregs->acrs, &vcpu->arch.guest_acrs, sizeof(sregs->acrs));
403 memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs)); 396 memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs));
404 vcpu_put(vcpu);
405 return 0; 397 return 0;
406} 398}
407 399
408int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 400int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
409{ 401{
410 vcpu_load(vcpu);
411 memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs)); 402 memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
412 vcpu->arch.guest_fpregs.fpc = fpu->fpc; 403 vcpu->arch.guest_fpregs.fpc = fpu->fpc;
413 vcpu_put(vcpu);
414 return 0; 404 return 0;
415} 405}
416 406
417int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 407int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
418{ 408{
419 vcpu_load(vcpu);
420 memcpy(&fpu->fprs, &vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs)); 409 memcpy(&fpu->fprs, &vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs));
421 fpu->fpc = vcpu->arch.guest_fpregs.fpc; 410 fpu->fpc = vcpu->arch.guest_fpregs.fpc;
422 vcpu_put(vcpu);
423 return 0; 411 return 0;
424} 412}
425 413
@@ -427,14 +415,12 @@ static int kvm_arch_vcpu_ioctl_set_initial_psw(struct kvm_vcpu *vcpu, psw_t psw)
427{ 415{
428 int rc = 0; 416 int rc = 0;
429 417
430 vcpu_load(vcpu);
431 if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING) 418 if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
432 rc = -EBUSY; 419 rc = -EBUSY;
433 else { 420 else {
434 vcpu->run->psw_mask = psw.mask; 421 vcpu->run->psw_mask = psw.mask;
435 vcpu->run->psw_addr = psw.addr; 422 vcpu->run->psw_addr = psw.addr;
436 } 423 }
437 vcpu_put(vcpu);
438 return rc; 424 return rc;
439} 425}
440 426
@@ -498,8 +484,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
498 int rc; 484 int rc;
499 sigset_t sigsaved; 485 sigset_t sigsaved;
500 486
501 vcpu_load(vcpu);
502
503rerun_vcpu: 487rerun_vcpu:
504 if (vcpu->requests) 488 if (vcpu->requests)
505 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 489 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -568,8 +552,6 @@ rerun_vcpu:
568 if (vcpu->sigset_active) 552 if (vcpu->sigset_active)
569 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 553 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
570 554
571 vcpu_put(vcpu);
572
573 vcpu->stat.exit_userspace++; 555 vcpu->stat.exit_userspace++;
574 return rc; 556 return rc;
575} 557}
@@ -589,7 +571,7 @@ static int __guestcopy(struct kvm_vcpu *vcpu, u64 guestdest, const void *from,
589 * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit 571 * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit
590 * KVM_S390_STORE_STATUS_PREFIXED: -> prefix 572 * KVM_S390_STORE_STATUS_PREFIXED: -> prefix
591 */ 573 */
592int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) 574int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
593{ 575{
594 const unsigned char archmode = 1; 576 const unsigned char archmode = 1;
595 int prefix; 577 int prefix;
@@ -651,45 +633,42 @@ int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
651 return 0; 633 return 0;
652} 634}
653 635
654static int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
655{
656 int rc;
657
658 vcpu_load(vcpu);
659 rc = __kvm_s390_vcpu_store_status(vcpu, addr);
660 vcpu_put(vcpu);
661 return rc;
662}
663
664long kvm_arch_vcpu_ioctl(struct file *filp, 636long kvm_arch_vcpu_ioctl(struct file *filp,
665 unsigned int ioctl, unsigned long arg) 637 unsigned int ioctl, unsigned long arg)
666{ 638{
667 struct kvm_vcpu *vcpu = filp->private_data; 639 struct kvm_vcpu *vcpu = filp->private_data;
668 void __user *argp = (void __user *)arg; 640 void __user *argp = (void __user *)arg;
641 long r;
669 642
670 switch (ioctl) { 643 switch (ioctl) {
671 case KVM_S390_INTERRUPT: { 644 case KVM_S390_INTERRUPT: {
672 struct kvm_s390_interrupt s390int; 645 struct kvm_s390_interrupt s390int;
673 646
647 r = -EFAULT;
674 if (copy_from_user(&s390int, argp, sizeof(s390int))) 648 if (copy_from_user(&s390int, argp, sizeof(s390int)))
675 return -EFAULT; 649 break;
676 return kvm_s390_inject_vcpu(vcpu, &s390int); 650 r = kvm_s390_inject_vcpu(vcpu, &s390int);
651 break;
677 } 652 }
678 case KVM_S390_STORE_STATUS: 653 case KVM_S390_STORE_STATUS:
679 return kvm_s390_vcpu_store_status(vcpu, arg); 654 r = kvm_s390_vcpu_store_status(vcpu, arg);
655 break;
680 case KVM_S390_SET_INITIAL_PSW: { 656 case KVM_S390_SET_INITIAL_PSW: {
681 psw_t psw; 657 psw_t psw;
682 658
659 r = -EFAULT;
683 if (copy_from_user(&psw, argp, sizeof(psw))) 660 if (copy_from_user(&psw, argp, sizeof(psw)))
684 return -EFAULT; 661 break;
685 return kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw); 662 r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw);
663 break;
686 } 664 }
687 case KVM_S390_INITIAL_RESET: 665 case KVM_S390_INITIAL_RESET:
688 return kvm_arch_vcpu_ioctl_initial_reset(vcpu); 666 r = kvm_arch_vcpu_ioctl_initial_reset(vcpu);
667 break;
689 default: 668 default:
690 ; 669 r = -EINVAL;
691 } 670 }
692 return -EINVAL; 671 return r;
693} 672}
694 673
695/* Section: memory related */ 674/* Section: memory related */
@@ -744,11 +723,6 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
744{ 723{
745} 724}
746 725
747gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
748{
749 return gfn;
750}
751
752static int __init kvm_s390_init(void) 726static int __init kvm_s390_init(void)
753{ 727{
754 int ret; 728 int ret;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index cfa9d1777457..a7b7586626db 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -92,7 +92,7 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
92int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); 92int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
93 93
94/* implemented in kvm-s390.c */ 94/* implemented in kvm-s390.c */
95int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, 95int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu,
96 unsigned long addr); 96 unsigned long addr);
97/* implemented in diag.c */ 97/* implemented in diag.c */
98int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); 98int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c991b3a7b904..815c5b2b9f57 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -482,6 +482,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src)
482 memcpy(dst->state, src->state, xstate_size); 482 memcpy(dst->state, src->state, xstate_size);
483} 483}
484 484
485extern void fpu_finit(struct fpu *fpu);
486
485#endif /* __ASSEMBLY__ */ 487#endif /* __ASSEMBLY__ */
486 488
487#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 489#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index ff90055c7f0b..4d8dcbdfc120 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -22,6 +22,8 @@
22#define __KVM_HAVE_XEN_HVM 22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS 23#define __KVM_HAVE_VCPU_EVENTS
24#define __KVM_HAVE_DEBUGREGS 24#define __KVM_HAVE_DEBUGREGS
25#define __KVM_HAVE_XSAVE
26#define __KVM_HAVE_XCRS
25 27
26/* Architectural interrupt line count. */ 28/* Architectural interrupt line count. */
27#define KVM_NR_INTERRUPTS 256 29#define KVM_NR_INTERRUPTS 256
@@ -299,4 +301,24 @@ struct kvm_debugregs {
299 __u64 reserved[9]; 301 __u64 reserved[9];
300}; 302};
301 303
304/* for KVM_CAP_XSAVE */
305struct kvm_xsave {
306 __u32 region[1024];
307};
308
309#define KVM_MAX_XCRS 16
310
311struct kvm_xcr {
312 __u32 xcr;
313 __u32 reserved;
314 __u64 value;
315};
316
317struct kvm_xcrs {
318 __u32 nr_xcrs;
319 __u32 flags;
320 struct kvm_xcr xcrs[KVM_MAX_XCRS];
321 __u64 padding[16];
322};
323
302#endif /* _ASM_X86_KVM_H */ 324#endif /* _ASM_X86_KVM_H */
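
The kvm_xsave and kvm_xcrs structures above define the userspace view of the extended FPU state. A minimal sketch of how a VMM might read them, assuming the companion KVM_CAP_XSAVE/KVM_CAP_XCRS capabilities and the KVM_GET_XSAVE/KVM_GET_XCRS vcpu ioctls introduced alongside these structures; error handling omitted:

        #include <linux/kvm.h>
        #include <sys/ioctl.h>

        static void dump_xstate(int kvm_fd, int vcpu_fd)
        {
                struct kvm_xsave xsave;   /* 4 KiB: region[1024] of __u32 */
                struct kvm_xcrs xcrs;

                /* capabilities are queried on the /dev/kvm fd, state on the vcpu fd */
                if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XSAVE) > 0)
                        ioctl(vcpu_fd, KVM_GET_XSAVE, &xsave);
                if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XCRS) > 0)
                        ioctl(vcpu_fd, KVM_GET_XCRS, &xcrs);  /* xcrs.nr_xcrs entries valid */
        }
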
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0b2729bf2070..51cfd730ac5d 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -51,8 +51,10 @@ struct x86_emulate_ctxt;
51#define X86EMUL_UNHANDLEABLE 1 51#define X86EMUL_UNHANDLEABLE 1
52/* Terminate emulation but return success to the caller. */ 52/* Terminate emulation but return success to the caller. */
53#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ 53#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
54#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ 54#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
55#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ 55#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
56#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
57
56struct x86_emulate_ops { 58struct x86_emulate_ops {
57 /* 59 /*
58 * read_std: Read bytes of standard (non-emulated/special) memory. 60 * read_std: Read bytes of standard (non-emulated/special) memory.
@@ -92,6 +94,7 @@ struct x86_emulate_ops {
92 int (*read_emulated)(unsigned long addr, 94 int (*read_emulated)(unsigned long addr,
93 void *val, 95 void *val,
94 unsigned int bytes, 96 unsigned int bytes,
97 unsigned int *error,
95 struct kvm_vcpu *vcpu); 98 struct kvm_vcpu *vcpu);
96 99
97 /* 100 /*
@@ -104,6 +107,7 @@ struct x86_emulate_ops {
104 int (*write_emulated)(unsigned long addr, 107 int (*write_emulated)(unsigned long addr,
105 const void *val, 108 const void *val,
106 unsigned int bytes, 109 unsigned int bytes,
110 unsigned int *error,
107 struct kvm_vcpu *vcpu); 111 struct kvm_vcpu *vcpu);
108 112
109 /* 113 /*
@@ -118,6 +122,7 @@ struct x86_emulate_ops {
118 const void *old, 122 const void *old,
119 const void *new, 123 const void *new,
120 unsigned int bytes, 124 unsigned int bytes,
125 unsigned int *error,
121 struct kvm_vcpu *vcpu); 126 struct kvm_vcpu *vcpu);
122 127
123 int (*pio_in_emulated)(int size, unsigned short port, void *val, 128 int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -132,18 +137,26 @@ struct x86_emulate_ops {
132 int seg, struct kvm_vcpu *vcpu); 137 int seg, struct kvm_vcpu *vcpu);
133 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); 138 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
134 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 139 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
140 unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
135 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); 141 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
136 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); 142 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
137 void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); 143 int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
138 int (*cpl)(struct kvm_vcpu *vcpu); 144 int (*cpl)(struct kvm_vcpu *vcpu);
139 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 145 int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
146 int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
147 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
148 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
140}; 149};
141 150
142/* Type, address-of, and value of an instruction's operand. */ 151/* Type, address-of, and value of an instruction's operand. */
143struct operand { 152struct operand {
144 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; 153 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
145 unsigned int bytes; 154 unsigned int bytes;
146 unsigned long val, orig_val, *ptr; 155 unsigned long orig_val, *ptr;
156 union {
157 unsigned long val;
158 char valptr[sizeof(unsigned long) + 2];
159 };
147}; 160};
148 161
149struct fetch_cache { 162struct fetch_cache {
@@ -186,6 +199,7 @@ struct decode_cache {
186 unsigned long modrm_val; 199 unsigned long modrm_val;
187 struct fetch_cache fetch; 200 struct fetch_cache fetch;
188 struct read_cache io_read; 201 struct read_cache io_read;
202 struct read_cache mem_read;
189}; 203};
190 204
191struct x86_emulate_ctxt { 205struct x86_emulate_ctxt {
@@ -202,6 +216,12 @@ struct x86_emulate_ctxt {
202 int interruptibility; 216 int interruptibility;
203 217
204 bool restart; /* restart string instruction after writeback */ 218 bool restart; /* restart string instruction after writeback */
219
220 int exception; /* exception that happens during emulation or -1 */
221 u32 error_code; /* error code for exception */
222 bool error_code_valid;
223 unsigned long cr2; /* faulted address in case of #PF */
224
205 /* decode cache */ 225 /* decode cache */
206 struct decode_cache decode; 226 struct decode_cache decode;
207}; 227};
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 76f5483cffec..502e53f999cf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/mmu_notifier.h> 16#include <linux/mmu_notifier.h>
17#include <linux/tracepoint.h> 17#include <linux/tracepoint.h>
18#include <linux/cpumask.h>
18 19
19#include <linux/kvm.h> 20#include <linux/kvm.h>
20#include <linux/kvm_para.h> 21#include <linux/kvm_para.h>
@@ -39,11 +40,14 @@
39 0xFFFFFF0000000000ULL) 40 0xFFFFFF0000000000ULL)
40 41
41#define INVALID_PAGE (~(hpa_t)0) 42#define INVALID_PAGE (~(hpa_t)0)
43#define VALID_PAGE(x) ((x) != INVALID_PAGE)
44
42#define UNMAPPED_GVA (~(gpa_t)0) 45#define UNMAPPED_GVA (~(gpa_t)0)
43 46
44/* KVM Hugepage definitions for x86 */ 47/* KVM Hugepage definitions for x86 */
45#define KVM_NR_PAGE_SIZES 3 48#define KVM_NR_PAGE_SIZES 3
46#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) 49#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
50#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
47#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) 51#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
48#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) 52#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
49#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) 53#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
@@ -69,8 +73,6 @@
69 73
70#define IOPL_SHIFT 12 74#define IOPL_SHIFT 12
71 75
72#define KVM_ALIAS_SLOTS 4
73
74#define KVM_PERMILLE_MMU_PAGES 20 76#define KVM_PERMILLE_MMU_PAGES 20
75#define KVM_MIN_ALLOC_MMU_PAGES 64 77#define KVM_MIN_ALLOC_MMU_PAGES 64
76#define KVM_MMU_HASH_SHIFT 10 78#define KVM_MMU_HASH_SHIFT 10
@@ -241,7 +243,7 @@ struct kvm_mmu {
241 void (*prefetch_page)(struct kvm_vcpu *vcpu, 243 void (*prefetch_page)(struct kvm_vcpu *vcpu,
242 struct kvm_mmu_page *page); 244 struct kvm_mmu_page *page);
243 int (*sync_page)(struct kvm_vcpu *vcpu, 245 int (*sync_page)(struct kvm_vcpu *vcpu,
244 struct kvm_mmu_page *sp); 246 struct kvm_mmu_page *sp, bool clear_unsync);
245 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 247 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
246 hpa_t root_hpa; 248 hpa_t root_hpa;
247 int root_level; 249 int root_level;
@@ -301,8 +303,8 @@ struct kvm_vcpu_arch {
301 unsigned long mmu_seq; 303 unsigned long mmu_seq;
302 } update_pte; 304 } update_pte;
303 305
304 struct i387_fxsave_struct host_fx_image; 306 struct fpu guest_fpu;
305 struct i387_fxsave_struct guest_fx_image; 307 u64 xcr0;
306 308
307 gva_t mmio_fault_cr2; 309 gva_t mmio_fault_cr2;
308 struct kvm_pio_request pio; 310 struct kvm_pio_request pio;
@@ -360,26 +362,11 @@ struct kvm_vcpu_arch {
360 362
361 /* fields used by HYPER-V emulation */ 363 /* fields used by HYPER-V emulation */
362 u64 hv_vapic; 364 u64 hv_vapic;
363};
364
365struct kvm_mem_alias {
366 gfn_t base_gfn;
367 unsigned long npages;
368 gfn_t target_gfn;
369#define KVM_ALIAS_INVALID 1UL
370 unsigned long flags;
371};
372 365
373#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION 366 cpumask_var_t wbinvd_dirty_mask;
374
375struct kvm_mem_aliases {
376 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
377 int naliases;
378}; 367};
379 368
380struct kvm_arch { 369struct kvm_arch {
381 struct kvm_mem_aliases *aliases;
382
383 unsigned int n_free_mmu_pages; 370 unsigned int n_free_mmu_pages;
384 unsigned int n_requested_mmu_pages; 371 unsigned int n_requested_mmu_pages;
385 unsigned int n_alloc_mmu_pages; 372 unsigned int n_alloc_mmu_pages;
@@ -533,6 +520,8 @@ struct kvm_x86_ops {
533 520
534 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); 521 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
535 522
523 bool (*has_wbinvd_exit)(void);
524
536 const struct trace_print_flags *exit_reasons_str; 525 const struct trace_print_flags *exit_reasons_str;
537}; 526};
538 527
@@ -576,7 +565,6 @@ enum emulation_result {
576#define EMULTYPE_SKIP (1 << 2) 565#define EMULTYPE_SKIP (1 << 2)
577int emulate_instruction(struct kvm_vcpu *vcpu, 566int emulate_instruction(struct kvm_vcpu *vcpu,
578 unsigned long cr2, u16 error_code, int emulation_type); 567 unsigned long cr2, u16 error_code, int emulation_type);
579void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
580void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 568void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
581void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 569void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
582 570
@@ -591,10 +579,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
591int kvm_emulate_halt(struct kvm_vcpu *vcpu); 579int kvm_emulate_halt(struct kvm_vcpu *vcpu);
592int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 580int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
593int emulate_clts(struct kvm_vcpu *vcpu); 581int emulate_clts(struct kvm_vcpu *vcpu);
594int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, 582int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
595 unsigned long *dest);
596int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
597 unsigned long value);
598 583
599void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 584void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
600int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); 585int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
@@ -602,15 +587,16 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
602int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 587int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
603 bool has_error_code, u32 error_code); 588 bool has_error_code, u32 error_code);
604 589
605void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 590int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
606void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 591int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
607void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 592int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
608void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 593void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
609int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); 594int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
610int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); 595int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
611unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 596unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
612void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); 597void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
613void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); 598void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
599int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
614 600
615int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 601int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
616int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 602int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
@@ -630,12 +616,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level);
630 616
631void kvm_inject_nmi(struct kvm_vcpu *vcpu); 617void kvm_inject_nmi(struct kvm_vcpu *vcpu);
632 618
633void fx_init(struct kvm_vcpu *vcpu); 619int fx_init(struct kvm_vcpu *vcpu);
634
635int emulator_write_emulated(unsigned long addr,
636 const void *val,
637 unsigned int bytes,
638 struct kvm_vcpu *vcpu);
639 620
640void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 621void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
641void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 622void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -664,8 +645,6 @@ void kvm_disable_tdp(void);
664int complete_pio(struct kvm_vcpu *vcpu); 645int complete_pio(struct kvm_vcpu *vcpu);
665bool kvm_check_iopl(struct kvm_vcpu *vcpu); 646bool kvm_check_iopl(struct kvm_vcpu *vcpu);
666 647
667struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
668
669static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 648static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
670{ 649{
671 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); 650 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -719,21 +698,6 @@ static inline unsigned long read_msr(unsigned long msr)
719} 698}
720#endif 699#endif
721 700
722static inline void kvm_fx_save(struct i387_fxsave_struct *image)
723{
724 asm("fxsave (%0)":: "r" (image));
725}
726
727static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
728{
729 asm("fxrstor (%0)":: "r" (image));
730}
731
732static inline void kvm_fx_finit(void)
733{
734 asm("finit");
735}
736
737static inline u32 get_rdx_init_val(void) 701static inline u32 get_rdx_init_val(void)
738{ 702{
739 return 0x600; /* P6 family */ 703 return 0x600; /* P6 family */
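
With the raw fxsave/fxrstor/finit wrappers and the i387_fxsave_struct images gone, the guest FPU state above is a plain struct fpu, and fx_init() now returns int. A sketch of the allocation path this implies, using the generic fpu_alloc()/fpu_finit() helpers; this is an inference about the x86.c side, not a hunk from the patch:

        int fx_init(struct kvm_vcpu *vcpu)
        {
                /* fpu_alloc() draws from task_xstate_cachep, exported later in this diff */
                int err = fpu_alloc(&vcpu->arch.guest_fpu);
                if (err)
                        return err;             /* hence the new int return type */

                fpu_finit(&vcpu->arch.guest_fpu);  /* fpu_finit() is exported in this series */
                return 0;
        }
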
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8c7ae4318629..509a42187dc2 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -20,6 +20,7 @@
20#define _EFER_LMA 10 /* Long mode active (read-only) */ 20#define _EFER_LMA 10 /* Long mode active (read-only) */
21#define _EFER_NX 11 /* No execute enable */ 21#define _EFER_NX 11 /* No execute enable */
22#define _EFER_SVME 12 /* Enable virtualization */ 22#define _EFER_SVME 12 /* Enable virtualization */
23#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
23#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ 24#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
24 25
25#define EFER_SCE (1<<_EFER_SCE) 26#define EFER_SCE (1<<_EFER_SCE)
@@ -27,6 +28,7 @@
27#define EFER_LMA (1<<_EFER_LMA) 28#define EFER_LMA (1<<_EFER_LMA)
28#define EFER_NX (1<<_EFER_NX) 29#define EFER_NX (1<<_EFER_NX)
29#define EFER_SVME (1<<_EFER_SVME) 30#define EFER_SVME (1<<_EFER_SVME)
31#define EFER_LMSLE (1<<_EFER_LMSLE)
30#define EFER_FFXSR (1<<_EFER_FFXSR) 32#define EFER_FFXSR (1<<_EFER_FFXSR)
31 33
32/* Intel MSRs. Some also available on other CPUs */ 34/* Intel MSRs. Some also available on other CPUs */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9e6779f7cf2d..9f0cbd987d50 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -257,6 +257,7 @@ enum vmcs_field {
257#define EXIT_REASON_IO_INSTRUCTION 30 257#define EXIT_REASON_IO_INSTRUCTION 30
258#define EXIT_REASON_MSR_READ 31 258#define EXIT_REASON_MSR_READ 31
259#define EXIT_REASON_MSR_WRITE 32 259#define EXIT_REASON_MSR_WRITE 32
260#define EXIT_REASON_INVALID_STATE 33
260#define EXIT_REASON_MWAIT_INSTRUCTION 36 261#define EXIT_REASON_MWAIT_INSTRUCTION 36
261#define EXIT_REASON_MONITOR_INSTRUCTION 39 262#define EXIT_REASON_MONITOR_INSTRUCTION 39
262#define EXIT_REASON_PAUSE_INSTRUCTION 40 263#define EXIT_REASON_PAUSE_INSTRUCTION 40
@@ -266,6 +267,7 @@ enum vmcs_field {
266#define EXIT_REASON_EPT_VIOLATION 48 267#define EXIT_REASON_EPT_VIOLATION 48
267#define EXIT_REASON_EPT_MISCONFIG 49 268#define EXIT_REASON_EPT_MISCONFIG 49
268#define EXIT_REASON_WBINVD 54 269#define EXIT_REASON_WBINVD 54
270#define EXIT_REASON_XSETBV 55
269 271
270/* 272/*
271 * Interruption-information format 273 * Interruption-information format
@@ -375,6 +377,9 @@ enum vmcs_field {
375#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 377#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
376#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 378#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
377 379
380#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
381#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
382
378#define VMX_EPT_DEFAULT_GAW 3 383#define VMX_EPT_DEFAULT_GAW 3
379#define VMX_EPT_MAX_GAW 0x4 384#define VMX_EPT_MAX_GAW 0x4
380#define VMX_EPT_MT_EPTE_SHIFT 3 385#define VMX_EPT_MT_EPTE_SHIFT 3
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 2c4390cae228..32c36668fa7b 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -13,6 +13,12 @@
13 13
14#define FXSAVE_SIZE 512 14#define FXSAVE_SIZE 512
15 15
16#define XSAVE_HDR_SIZE 64
17#define XSAVE_HDR_OFFSET FXSAVE_SIZE
18
19#define XSAVE_YMM_SIZE 256
20#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
21
16/* 22/*
17 * These are the features that the OS can handle currently. 23 * These are the features that the OS can handle currently.
18 */ 24 */
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 86cef6b32253..c4444bce8469 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -107,7 +107,7 @@ void __cpuinit fpu_init(void)
107} 107}
108#endif /* CONFIG_X86_64 */ 108#endif /* CONFIG_X86_64 */
109 109
110static void fpu_finit(struct fpu *fpu) 110void fpu_finit(struct fpu *fpu)
111{ 111{
112#ifdef CONFIG_X86_32 112#ifdef CONFIG_X86_32
113 if (!HAVE_HWFP) { 113 if (!HAVE_HWFP) {
@@ -132,6 +132,7 @@ static void fpu_finit(struct fpu *fpu)
132 fp->fos = 0xffff0000u; 132 fp->fos = 0xffff0000u;
133 } 133 }
134} 134}
135EXPORT_SYMBOL_GPL(fpu_finit);
135 136
136/* 137/*
137 * The _current_ task is using the FPU for the first time 138 * The _current_ task is using the FPU for the first time
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e7e35219b32f..ebcfcceccc72 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@ unsigned long idle_nomwait;
28EXPORT_SYMBOL(idle_nomwait); 28EXPORT_SYMBOL(idle_nomwait);
29 29
30struct kmem_cache *task_xstate_cachep; 30struct kmem_cache *task_xstate_cachep;
31EXPORT_SYMBOL_GPL(task_xstate_cachep);
31 32
32int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 33int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
33{ 34{
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5ac0bb465ed6..b38bd8b92aa6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,6 +9,7 @@
9 * privileged instructions: 9 * privileged instructions:
10 * 10 *
11 * Copyright (C) 2006 Qumranet 11 * Copyright (C) 2006 Qumranet
 12 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
12 * 13 *
13 * Avi Kivity <avi@qumranet.com> 14 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com> 15 * Yaniv Kamay <yaniv@qumranet.com>
@@ -67,6 +68,9 @@
67#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 68#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
68#define SrcImmU (9<<4) /* Immediate operand, unsigned */ 69#define SrcImmU (9<<4) /* Immediate operand, unsigned */
69#define SrcSI (0xa<<4) /* Source is in the DS:RSI */ 70#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
71#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
72#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
73#define SrcAcc (0xd<<4) /* Source Accumulator */
70#define SrcMask (0xf<<4) 74#define SrcMask (0xf<<4)
71/* Generic ModRM decode. */ 75/* Generic ModRM decode. */
72#define ModRM (1<<8) 76#define ModRM (1<<8)
@@ -88,10 +92,6 @@
88#define Src2CL (1<<29) 92#define Src2CL (1<<29)
89#define Src2ImmByte (2<<29) 93#define Src2ImmByte (2<<29)
90#define Src2One (3<<29) 94#define Src2One (3<<29)
91#define Src2Imm16 (4<<29)
92#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
93 in memory and second argument is located
94 immediately after the first one in memory. */
95#define Src2Mask (7<<29) 95#define Src2Mask (7<<29)
96 96
97enum { 97enum {
@@ -124,15 +124,15 @@ static u32 opcode_table[256] = {
124 /* 0x20 - 0x27 */ 124 /* 0x20 - 0x27 */
125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
127 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 127 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
128 /* 0x28 - 0x2F */ 128 /* 0x28 - 0x2F */
129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
131 0, 0, 0, 0, 131 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
132 /* 0x30 - 0x37 */ 132 /* 0x30 - 0x37 */
133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
135 0, 0, 0, 0, 135 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
136 /* 0x38 - 0x3F */ 136 /* 0x38 - 0x3F */
137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -170,20 +170,20 @@ static u32 opcode_table[256] = {
170 /* 0x88 - 0x8F */ 170 /* 0x88 - 0x8F */
171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
173 DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, 173 DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
174 DstReg | SrcMem | ModRM | Mov, Group | Group1A, 174 ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
175 /* 0x90 - 0x97 */ 175 /* 0x90 - 0x97 */
176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
177 /* 0x98 - 0x9F */ 177 /* 0x98 - 0x9F */
178 0, 0, SrcImm | Src2Imm16 | No64, 0, 178 0, 0, SrcImmFAddr | No64, 0,
179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
180 /* 0xA0 - 0xA7 */ 180 /* 0xA0 - 0xA7 */
181 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 181 ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
182 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 182 ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, 183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, 184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
185 /* 0xA8 - 0xAF */ 185 /* 0xA8 - 0xAF */
186 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, 186 DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, 187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
188 ByteOp | DstDI | String, DstDI | String, 188 ByteOp | DstDI | String, DstDI | String,
189 /* 0xB0 - 0xB7 */ 189 /* 0xB0 - 0xB7 */
@@ -215,7 +215,7 @@ static u32 opcode_table[256] = {
215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, 215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
216 /* 0xE8 - 0xEF */ 216 /* 0xE8 - 0xEF */
217 SrcImm | Stack, SrcImm | ImplicitOps, 217 SrcImm | Stack, SrcImm | ImplicitOps,
218 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, 218 SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, 219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, 220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
221 /* 0xF0 - 0xF7 */ 221 /* 0xF0 - 0xF7 */
@@ -337,20 +337,20 @@ static u32 group_table[] = {
337 [Group1A*8] = 337 [Group1A*8] =
338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, 338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
339 [Group3_Byte*8] = 339 [Group3_Byte*8] =
340 ByteOp | SrcImm | DstMem | ModRM, 0, 340 ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
342 0, 0, 0, 0, 342 0, 0, 0, 0,
343 [Group3*8] = 343 [Group3*8] =
344 DstMem | SrcImm | ModRM, 0, 344 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
346 0, 0, 0, 0, 346 0, 0, 0, 0,
347 [Group4*8] = 347 [Group4*8] =
348 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 348 ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
349 0, 0, 0, 0, 0, 0, 349 0, 0, 0, 0, 0, 0,
350 [Group5*8] = 350 [Group5*8] =
351 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 351 DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
352 SrcMem | ModRM | Stack, 0, 352 SrcMem | ModRM | Stack, 0,
353 SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, 353 SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
354 SrcMem | ModRM | Stack, 0, 354 SrcMem | ModRM | Stack, 0,
355 [Group7*8] = 355 [Group7*8] =
356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, 356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
@@ -576,6 +576,13 @@ static u32 group2_table[] = {
576 (_type)_x; \ 576 (_type)_x; \
577}) 577})
578 578
579#define insn_fetch_arr(_arr, _size, _eip) \
580({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
581 if (rc != X86EMUL_CONTINUE) \
582 goto done; \
583 (_eip) += (_size); \
584})
585
579static inline unsigned long ad_mask(struct decode_cache *c) 586static inline unsigned long ad_mask(struct decode_cache *c)
580{ 587{
581 return (1UL << (c->ad_bytes << 3)) - 1; 588 return (1UL << (c->ad_bytes << 3)) - 1;
@@ -617,31 +624,66 @@ static void set_seg_override(struct decode_cache *c, int seg)
617 c->seg_override = seg; 624 c->seg_override = seg;
618} 625}
619 626
620static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) 627static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
628 struct x86_emulate_ops *ops, int seg)
621{ 629{
622 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) 630 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
623 return 0; 631 return 0;
624 632
625 return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); 633 return ops->get_cached_segment_base(seg, ctxt->vcpu);
626} 634}
627 635
628static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, 636static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
637 struct x86_emulate_ops *ops,
629 struct decode_cache *c) 638 struct decode_cache *c)
630{ 639{
631 if (!c->has_seg_override) 640 if (!c->has_seg_override)
632 return 0; 641 return 0;
633 642
634 return seg_base(ctxt, c->seg_override); 643 return seg_base(ctxt, ops, c->seg_override);
644}
645
646static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
647 struct x86_emulate_ops *ops)
648{
649 return seg_base(ctxt, ops, VCPU_SREG_ES);
650}
651
652static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
653 struct x86_emulate_ops *ops)
654{
655 return seg_base(ctxt, ops, VCPU_SREG_SS);
656}
657
658static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
659 u32 error, bool valid)
660{
661 ctxt->exception = vec;
662 ctxt->error_code = error;
663 ctxt->error_code_valid = valid;
664 ctxt->restart = false;
665}
666
667static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
668{
669 emulate_exception(ctxt, GP_VECTOR, err, true);
635} 670}
636 671
637static unsigned long es_base(struct x86_emulate_ctxt *ctxt) 672static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
673 int err)
638{ 674{
639 return seg_base(ctxt, VCPU_SREG_ES); 675 ctxt->cr2 = addr;
676 emulate_exception(ctxt, PF_VECTOR, err, true);
640} 677}
641 678
642static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) 679static void emulate_ud(struct x86_emulate_ctxt *ctxt)
643{ 680{
644 return seg_base(ctxt, VCPU_SREG_SS); 681 emulate_exception(ctxt, UD_VECTOR, 0, false);
682}
683
684static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
685{
686 emulate_exception(ctxt, TS_VECTOR, err, true);
645} 687}
646 688
647static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 689static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
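
The emulate_*() helpers above record a pending exception in the emulation context (ctxt->exception, ctxt->error_code, ctxt->cr2) instead of injecting it into the vcpu directly; the caller picks it up once emulation stops. A representative call pattern, written here as an illustration rather than quoted from a later hunk:

        if (ops->cpl(ctxt->vcpu) != 0) {        /* e.g. a privilege check fails */
                emulate_gp(ctxt, 0);            /* record #GP(0) in the context */
                goto done;                      /* x86.c injects it after emulation ends */
        }
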
@@ -932,12 +974,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
932 /* we cannot decode insn before we complete previous rep insn */ 974 /* we cannot decode insn before we complete previous rep insn */
933 WARN_ON(ctxt->restart); 975 WARN_ON(ctxt->restart);
934 976
935 /* Shadow copy of register state. Committed on successful emulation. */
936 memset(c, 0, sizeof(struct decode_cache));
937 c->eip = ctxt->eip; 977 c->eip = ctxt->eip;
938 c->fetch.start = c->fetch.end = c->eip; 978 c->fetch.start = c->fetch.end = c->eip;
939 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 979 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
940 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
941 980
942 switch (mode) { 981 switch (mode) {
943 case X86EMUL_MODE_REAL: 982 case X86EMUL_MODE_REAL:
@@ -1060,7 +1099,7 @@ done_prefixes:
1060 set_seg_override(c, VCPU_SREG_DS); 1099 set_seg_override(c, VCPU_SREG_DS);
1061 1100
1062 if (!(!c->twobyte && c->b == 0x8d)) 1101 if (!(!c->twobyte && c->b == 0x8d))
1063 c->modrm_ea += seg_override_base(ctxt, c); 1102 c->modrm_ea += seg_override_base(ctxt, ops, c);
1064 1103
1065 if (c->ad_bytes != 8) 1104 if (c->ad_bytes != 8)
1066 c->modrm_ea = (u32)c->modrm_ea; 1105 c->modrm_ea = (u32)c->modrm_ea;
@@ -1148,6 +1187,25 @@ done_prefixes:
1148 else 1187 else
1149 c->src.val = insn_fetch(u8, 1, c->eip); 1188 c->src.val = insn_fetch(u8, 1, c->eip);
1150 break; 1189 break;
1190 case SrcAcc:
1191 c->src.type = OP_REG;
1192 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1193 c->src.ptr = &c->regs[VCPU_REGS_RAX];
1194 switch (c->src.bytes) {
1195 case 1:
1196 c->src.val = *(u8 *)c->src.ptr;
1197 break;
1198 case 2:
1199 c->src.val = *(u16 *)c->src.ptr;
1200 break;
1201 case 4:
1202 c->src.val = *(u32 *)c->src.ptr;
1203 break;
1204 case 8:
1205 c->src.val = *(u64 *)c->src.ptr;
1206 break;
1207 }
1208 break;
1151 case SrcOne: 1209 case SrcOne:
1152 c->src.bytes = 1; 1210 c->src.bytes = 1;
1153 c->src.val = 1; 1211 c->src.val = 1;
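[Editor's note] SrcAcc is a new source-operand type for instructions whose source is implicitly the accumulator: the decoder points src.ptr at RAX and reads it at the current operand width. The width-sized read on its own, using memcpy rather than the narrowed pointer casts above (both assume a little-endian host, which x86 is):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Return the low 'bytes' bytes of a 64-bit register value.  On a
 * little-endian host this matches reading through a narrower pointer,
 * which is what the decoder does with c->src.ptr. */
static uint64_t read_reg_sized(const uint64_t *reg, unsigned int bytes)
{
	uint64_t v = 0;

	memcpy(&v, reg, bytes);
	return v;
}

int main(void)
{
	uint64_t rax = 0x1122334455667788ull;

	for (unsigned int b = 1; b <= 8; b *= 2)
		printf("%u byte(s): %#llx\n", b,
		       (unsigned long long)read_reg_sized(&rax, b));
	return 0;
}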
@@ -1156,10 +1214,21 @@ done_prefixes:
1156 c->src.type = OP_MEM; 1214 c->src.type = OP_MEM;
1157 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1215 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1158 c->src.ptr = (unsigned long *) 1216 c->src.ptr = (unsigned long *)
1159 register_address(c, seg_override_base(ctxt, c), 1217 register_address(c, seg_override_base(ctxt, ops, c),
1160 c->regs[VCPU_REGS_RSI]); 1218 c->regs[VCPU_REGS_RSI]);
1161 c->src.val = 0; 1219 c->src.val = 0;
1162 break; 1220 break;
1221 case SrcImmFAddr:
1222 c->src.type = OP_IMM;
1223 c->src.ptr = (unsigned long *)c->eip;
1224 c->src.bytes = c->op_bytes + 2;
1225 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
1226 break;
1227 case SrcMemFAddr:
1228 c->src.type = OP_MEM;
1229 c->src.ptr = (unsigned long *)c->modrm_ea;
1230 c->src.bytes = c->op_bytes + 2;
1231 break;
1163 } 1232 }
1164 1233
1165 /* 1234 /*
@@ -1179,22 +1248,10 @@ done_prefixes:
1179 c->src2.bytes = 1; 1248 c->src2.bytes = 1;
1180 c->src2.val = insn_fetch(u8, 1, c->eip); 1249 c->src2.val = insn_fetch(u8, 1, c->eip);
1181 break; 1250 break;
1182 case Src2Imm16:
1183 c->src2.type = OP_IMM;
1184 c->src2.ptr = (unsigned long *)c->eip;
1185 c->src2.bytes = 2;
1186 c->src2.val = insn_fetch(u16, 2, c->eip);
1187 break;
1188 case Src2One: 1251 case Src2One:
1189 c->src2.bytes = 1; 1252 c->src2.bytes = 1;
1190 c->src2.val = 1; 1253 c->src2.val = 1;
1191 break; 1254 break;
1192 case Src2Mem16:
1193 c->src2.type = OP_MEM;
1194 c->src2.bytes = 2;
1195 c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
1196 c->src2.val = 0;
1197 break;
1198 } 1255 }
1199 1256
1200 /* Decode and fetch the destination operand: register or memory. */ 1257 /* Decode and fetch the destination operand: register or memory. */
@@ -1253,7 +1310,7 @@ done_prefixes:
1253 c->dst.type = OP_MEM; 1310 c->dst.type = OP_MEM;
1254 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1311 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1255 c->dst.ptr = (unsigned long *) 1312 c->dst.ptr = (unsigned long *)
1256 register_address(c, es_base(ctxt), 1313 register_address(c, es_base(ctxt, ops),
1257 c->regs[VCPU_REGS_RDI]); 1314 c->regs[VCPU_REGS_RDI]);
1258 c->dst.val = 0; 1315 c->dst.val = 0;
1259 break; 1316 break;
@@ -1263,6 +1320,37 @@ done:
1263 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1320 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1264} 1321}
1265 1322
1323static int read_emulated(struct x86_emulate_ctxt *ctxt,
1324 struct x86_emulate_ops *ops,
1325 unsigned long addr, void *dest, unsigned size)
1326{
1327 int rc;
1328 struct read_cache *mc = &ctxt->decode.mem_read;
1329 u32 err;
1330
1331 while (size) {
1332 int n = min(size, 8u);
1333 size -= n;
1334 if (mc->pos < mc->end)
1335 goto read_cached;
1336
1337 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
1338 ctxt->vcpu);
1339 if (rc == X86EMUL_PROPAGATE_FAULT)
1340 emulate_pf(ctxt, addr, err);
1341 if (rc != X86EMUL_CONTINUE)
1342 return rc;
1343 mc->end += n;
1344
1345 read_cached:
1346 memcpy(dest, mc->data + mc->pos, n);
1347 mc->pos += n;
1348 dest += n;
1349 addr += n;
1350 }
1351 return X86EMUL_CONTINUE;
1352}
1353
1266static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, 1354static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1267 struct x86_emulate_ops *ops, 1355 struct x86_emulate_ops *ops,
1268 unsigned int size, unsigned short port, 1356 unsigned int size, unsigned short port,
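[Editor's note] read_emulated() wraps ops->read_emulated() in a small per-instruction cache: data is fetched in chunks of at most eight bytes, appended to decode.mem_read, and replayed from the cache when a faulting or I/O-bound instruction is restarted (pos is rewound at the top of x86_emulate_insn() and end is cleared once the instruction completes, per the hunks further down); faults are recorded with emulate_pf(). A self-contained sketch of the chunk-and-cache loop, with a stub standing in for the MMIO read callback (names and sizes are illustrative, not the kernel's):

#include <string.h>
#include <stdio.h>

#define CACHE_SIZE 1024

struct read_cache {
	unsigned char data[CACHE_SIZE];
	unsigned int pos, end;
};

static int backend_read(unsigned long addr, void *dst, unsigned int n)
{
	for (unsigned int i = 0; i < n; i++)     /* fake guest memory */
		((unsigned char *)dst)[i] = (unsigned char)(addr + i);
	return 0;                                /* nonzero would be a fault */
}

/* Read 'size' bytes at 'addr' in chunks of at most 8 bytes, filling the
 * cache on a miss and replaying from it otherwise (requests are assumed
 * to repeat identically on restart, as in the emulator). */
static int cached_read(struct read_cache *mc, unsigned long addr,
		       void *dest, unsigned int size)
{
	unsigned char *out = dest;

	while (size) {
		unsigned int n = size < 8 ? size : 8;

		if (mc->pos >= mc->end) {        /* miss: fill the cache */
			int rc;

			if (mc->end + n > CACHE_SIZE)
				return -1;
			rc = backend_read(addr, mc->data + mc->end, n);
			if (rc)
				return rc;
			mc->end += n;
		}
		memcpy(out, mc->data + mc->pos, n);   /* serve from cache */
		mc->pos += n;
		out += n;
		addr += n;
		size -= n;
	}
	return 0;
}

int main(void)
{
	struct read_cache mc = { .pos = 0, .end = 0 };
	unsigned char buf[16];

	cached_read(&mc, 0x100, buf, sizeof(buf));   /* hits the backend */
	mc.pos = 0;                                  /* instruction restart */
	cached_read(&mc, 0x100, buf, sizeof(buf));   /* replayed from cache */
	printf("first byte %#x, bytes cached %u\n", buf[0], mc.end);
	return 0;
}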
@@ -1330,13 +1418,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1330 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1418 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1331 1419
1332 if (dt.size < index * 8 + 7) { 1420 if (dt.size < index * 8 + 7) {
1333 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); 1421 emulate_gp(ctxt, selector & 0xfffc);
1334 return X86EMUL_PROPAGATE_FAULT; 1422 return X86EMUL_PROPAGATE_FAULT;
1335 } 1423 }
1336 addr = dt.address + index * 8; 1424 addr = dt.address + index * 8;
1337 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1425 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1338 if (ret == X86EMUL_PROPAGATE_FAULT) 1426 if (ret == X86EMUL_PROPAGATE_FAULT)
1339 kvm_inject_page_fault(ctxt->vcpu, addr, err); 1427 emulate_pf(ctxt, addr, err);
1340 1428
1341 return ret; 1429 return ret;
1342} 1430}
@@ -1355,14 +1443,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1355 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1443 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1356 1444
1357 if (dt.size < index * 8 + 7) { 1445 if (dt.size < index * 8 + 7) {
1358 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); 1446 emulate_gp(ctxt, selector & 0xfffc);
1359 return X86EMUL_PROPAGATE_FAULT; 1447 return X86EMUL_PROPAGATE_FAULT;
1360 } 1448 }
1361 1449
1362 addr = dt.address + index * 8; 1450 addr = dt.address + index * 8;
1363 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1451 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1364 if (ret == X86EMUL_PROPAGATE_FAULT) 1452 if (ret == X86EMUL_PROPAGATE_FAULT)
1365 kvm_inject_page_fault(ctxt->vcpu, addr, err); 1453 emulate_pf(ctxt, addr, err);
1366 1454
1367 return ret; 1455 return ret;
1368} 1456}
@@ -1481,11 +1569,70 @@ load:
1481 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); 1569 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
1482 return X86EMUL_CONTINUE; 1570 return X86EMUL_CONTINUE;
1483exception: 1571exception:
1484 kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); 1572 emulate_exception(ctxt, err_vec, err_code, true);
1485 return X86EMUL_PROPAGATE_FAULT; 1573 return X86EMUL_PROPAGATE_FAULT;
1486} 1574}
1487 1575
1488static inline void emulate_push(struct x86_emulate_ctxt *ctxt) 1576static inline int writeback(struct x86_emulate_ctxt *ctxt,
1577 struct x86_emulate_ops *ops)
1578{
1579 int rc;
1580 struct decode_cache *c = &ctxt->decode;
1581 u32 err;
1582
1583 switch (c->dst.type) {
1584 case OP_REG:
1585 /* The 4-byte case *is* correct:
1586 * in 64-bit mode we zero-extend.
1587 */
1588 switch (c->dst.bytes) {
1589 case 1:
1590 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1591 break;
1592 case 2:
1593 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1594 break;
1595 case 4:
1596 *c->dst.ptr = (u32)c->dst.val;
1597 break; /* 64b: zero-ext */
1598 case 8:
1599 *c->dst.ptr = c->dst.val;
1600 break;
1601 }
1602 break;
1603 case OP_MEM:
1604 if (c->lock_prefix)
1605 rc = ops->cmpxchg_emulated(
1606 (unsigned long)c->dst.ptr,
1607 &c->dst.orig_val,
1608 &c->dst.val,
1609 c->dst.bytes,
1610 &err,
1611 ctxt->vcpu);
1612 else
1613 rc = ops->write_emulated(
1614 (unsigned long)c->dst.ptr,
1615 &c->dst.val,
1616 c->dst.bytes,
1617 &err,
1618 ctxt->vcpu);
1619 if (rc == X86EMUL_PROPAGATE_FAULT)
1620 emulate_pf(ctxt,
1621 (unsigned long)c->dst.ptr, err);
1622 if (rc != X86EMUL_CONTINUE)
1623 return rc;
1624 break;
1625 case OP_NONE:
1626 /* no writeback */
1627 break;
1628 default:
1629 break;
1630 }
1631 return X86EMUL_CONTINUE;
1632}
1633
1634static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
1635 struct x86_emulate_ops *ops)
1489{ 1636{
1490 struct decode_cache *c = &ctxt->decode; 1637 struct decode_cache *c = &ctxt->decode;
1491 1638
@@ -1493,7 +1640,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1493 c->dst.bytes = c->op_bytes; 1640 c->dst.bytes = c->op_bytes;
1494 c->dst.val = c->src.val; 1641 c->dst.val = c->src.val;
1495 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1642 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1496 c->dst.ptr = (void *) register_address(c, ss_base(ctxt), 1643 c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
1497 c->regs[VCPU_REGS_RSP]); 1644 c->regs[VCPU_REGS_RSP]);
1498} 1645}
1499 1646
@@ -1504,9 +1651,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1504 struct decode_cache *c = &ctxt->decode; 1651 struct decode_cache *c = &ctxt->decode;
1505 int rc; 1652 int rc;
1506 1653
1507 rc = ops->read_emulated(register_address(c, ss_base(ctxt), 1654 rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
1508 c->regs[VCPU_REGS_RSP]), 1655 c->regs[VCPU_REGS_RSP]),
1509 dest, len, ctxt->vcpu); 1656 dest, len);
1510 if (rc != X86EMUL_CONTINUE) 1657 if (rc != X86EMUL_CONTINUE)
1511 return rc; 1658 return rc;
1512 1659
@@ -1541,7 +1688,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1541 break; 1688 break;
1542 case X86EMUL_MODE_VM86: 1689 case X86EMUL_MODE_VM86:
1543 if (iopl < 3) { 1690 if (iopl < 3) {
1544 kvm_inject_gp(ctxt->vcpu, 0); 1691 emulate_gp(ctxt, 0);
1545 return X86EMUL_PROPAGATE_FAULT; 1692 return X86EMUL_PROPAGATE_FAULT;
1546 } 1693 }
1547 change_mask |= EFLG_IF; 1694 change_mask |= EFLG_IF;
@@ -1557,15 +1704,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1557 return rc; 1704 return rc;
1558} 1705}
1559 1706
1560static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) 1707static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
1708 struct x86_emulate_ops *ops, int seg)
1561{ 1709{
1562 struct decode_cache *c = &ctxt->decode; 1710 struct decode_cache *c = &ctxt->decode;
1563 struct kvm_segment segment;
1564 1711
1565 kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); 1712 c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
1566 1713
1567 c->src.val = segment.selector; 1714 emulate_push(ctxt, ops);
1568 emulate_push(ctxt);
1569} 1715}
1570 1716
1571static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, 1717static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1583,19 +1729,31 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1583 return rc; 1729 return rc;
1584} 1730}
1585 1731
1586static void emulate_pusha(struct x86_emulate_ctxt *ctxt) 1732static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
1733 struct x86_emulate_ops *ops)
1587{ 1734{
1588 struct decode_cache *c = &ctxt->decode; 1735 struct decode_cache *c = &ctxt->decode;
1589 unsigned long old_esp = c->regs[VCPU_REGS_RSP]; 1736 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1737 int rc = X86EMUL_CONTINUE;
1590 int reg = VCPU_REGS_RAX; 1738 int reg = VCPU_REGS_RAX;
1591 1739
1592 while (reg <= VCPU_REGS_RDI) { 1740 while (reg <= VCPU_REGS_RDI) {
1593 (reg == VCPU_REGS_RSP) ? 1741 (reg == VCPU_REGS_RSP) ?
1594 (c->src.val = old_esp) : (c->src.val = c->regs[reg]); 1742 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1595 1743
1596 emulate_push(ctxt); 1744 emulate_push(ctxt, ops);
1745
1746 rc = writeback(ctxt, ops);
1747 if (rc != X86EMUL_CONTINUE)
1748 return rc;
1749
1597 ++reg; 1750 ++reg;
1598 } 1751 }
1752
1753 /* Disable writeback. */
1754 c->dst.type = OP_NONE;
1755
1756 return rc;
1599} 1757}
1600 1758
1601static int emulate_popa(struct x86_emulate_ctxt *ctxt, 1759static int emulate_popa(struct x86_emulate_ctxt *ctxt,
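[Editor's note] Previously emulate_pusha() queued eight pushes but only the last one was committed, since writeback() runs once per emulated instruction; calling writeback() inside the loop writes every stack slot, and the final common writeback is suppressed with OP_NONE. The PUSHA ordering itself (AX, CX, DX, BX, original SP, BP, SI, DI, stack growing downward) as a plain loop, purely illustrative:

#include <stdint.h>
#include <stdio.h>

enum { RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI, NR_REGS };

int main(void)
{
	uint32_t regs[NR_REGS] = { 1, 2, 3, 4, 0x1000, 6, 7, 8 };
	uint32_t stack[NR_REGS];
	uint32_t old_esp = regs[RSP];

	for (int reg = RAX; reg <= RDI; reg++) {
		uint32_t val = (reg == RSP) ? old_esp : regs[reg];

		regs[RSP] -= sizeof(val);                   /* grows downward */
		stack[(old_esp - regs[RSP]) / 4 - 1] = val; /* commit now */
	}

	for (int i = 0; i < NR_REGS; i++)
		printf("slot %d (esp-%u): %u\n", i, (i + 1) * 4, stack[i]);
	return 0;
}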
@@ -1695,14 +1853,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1695 old_eip = c->eip; 1853 old_eip = c->eip;
1696 c->eip = c->src.val; 1854 c->eip = c->src.val;
1697 c->src.val = old_eip; 1855 c->src.val = old_eip;
1698 emulate_push(ctxt); 1856 emulate_push(ctxt, ops);
1699 break; 1857 break;
1700 } 1858 }
1701 case 4: /* jmp abs */ 1859 case 4: /* jmp abs */
1702 c->eip = c->src.val; 1860 c->eip = c->src.val;
1703 break; 1861 break;
1704 case 6: /* push */ 1862 case 6: /* push */
1705 emulate_push(ctxt); 1863 emulate_push(ctxt, ops);
1706 break; 1864 break;
1707 } 1865 }
1708 return X86EMUL_CONTINUE; 1866 return X86EMUL_CONTINUE;
@@ -1748,145 +1906,82 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1748 return rc; 1906 return rc;
1749} 1907}
1750 1908
1751static inline int writeback(struct x86_emulate_ctxt *ctxt,
1752 struct x86_emulate_ops *ops)
1753{
1754 int rc;
1755 struct decode_cache *c = &ctxt->decode;
1756
1757 switch (c->dst.type) {
1758 case OP_REG:
1759 /* The 4-byte case *is* correct:
1760 * in 64-bit mode we zero-extend.
1761 */
1762 switch (c->dst.bytes) {
1763 case 1:
1764 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1765 break;
1766 case 2:
1767 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1768 break;
1769 case 4:
1770 *c->dst.ptr = (u32)c->dst.val;
1771 break; /* 64b: zero-ext */
1772 case 8:
1773 *c->dst.ptr = c->dst.val;
1774 break;
1775 }
1776 break;
1777 case OP_MEM:
1778 if (c->lock_prefix)
1779 rc = ops->cmpxchg_emulated(
1780 (unsigned long)c->dst.ptr,
1781 &c->dst.orig_val,
1782 &c->dst.val,
1783 c->dst.bytes,
1784 ctxt->vcpu);
1785 else
1786 rc = ops->write_emulated(
1787 (unsigned long)c->dst.ptr,
1788 &c->dst.val,
1789 c->dst.bytes,
1790 ctxt->vcpu);
1791 if (rc != X86EMUL_CONTINUE)
1792 return rc;
1793 break;
1794 case OP_NONE:
1795 /* no writeback */
1796 break;
1797 default:
1798 break;
1799 }
1800 return X86EMUL_CONTINUE;
1801}
1802
1803static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
1804{
1805 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
1806 /*
1807 * an sti; sti; sequence only disable interrupts for the first
1808 * instruction. So, if the last instruction, be it emulated or
1809 * not, left the system with the INT_STI flag enabled, it
1810 * means that the last instruction is an sti. We should not
1811 * leave the flag on in this case. The same goes for mov ss
1812 */
1813 if (!(int_shadow & mask))
1814 ctxt->interruptibility = mask;
1815}
1816
1817static inline void 1909static inline void
1818setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1910setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1819 struct kvm_segment *cs, struct kvm_segment *ss) 1911 struct x86_emulate_ops *ops, struct desc_struct *cs,
1912 struct desc_struct *ss)
1820{ 1913{
1821 memset(cs, 0, sizeof(struct kvm_segment)); 1914 memset(cs, 0, sizeof(struct desc_struct));
1822 kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); 1915 ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
1823 memset(ss, 0, sizeof(struct kvm_segment)); 1916 memset(ss, 0, sizeof(struct desc_struct));
1824 1917
1825 cs->l = 0; /* will be adjusted later */ 1918 cs->l = 0; /* will be adjusted later */
1826 cs->base = 0; /* flat segment */ 1919 set_desc_base(cs, 0); /* flat segment */
1827 cs->g = 1; /* 4kb granularity */ 1920 cs->g = 1; /* 4kb granularity */
1828 cs->limit = 0xffffffff; /* 4GB limit */ 1921 set_desc_limit(cs, 0xfffff); /* 4GB limit */
1829 cs->type = 0x0b; /* Read, Execute, Accessed */ 1922 cs->type = 0x0b; /* Read, Execute, Accessed */
1830 cs->s = 1; 1923 cs->s = 1;
1831 cs->dpl = 0; /* will be adjusted later */ 1924 cs->dpl = 0; /* will be adjusted later */
1832 cs->present = 1; 1925 cs->p = 1;
1833 cs->db = 1; 1926 cs->d = 1;
1834 1927
1835 ss->unusable = 0; 1928 set_desc_base(ss, 0); /* flat segment */
1836 ss->base = 0; /* flat segment */ 1929 set_desc_limit(ss, 0xfffff); /* 4GB limit */
1837 ss->limit = 0xffffffff; /* 4GB limit */
1838 ss->g = 1; /* 4kb granularity */ 1930 ss->g = 1; /* 4kb granularity */
1839 ss->s = 1; 1931 ss->s = 1;
1840 ss->type = 0x03; /* Read/Write, Accessed */ 1932 ss->type = 0x03; /* Read/Write, Accessed */
1841 ss->db = 1; /* 32bit stack segment */ 1933 ss->d = 1; /* 32bit stack segment */
1842 ss->dpl = 0; 1934 ss->dpl = 0;
1843 ss->present = 1; 1935 ss->p = 1;
1844} 1936}
1845 1937
1846static int 1938static int
1847emulate_syscall(struct x86_emulate_ctxt *ctxt) 1939emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1848{ 1940{
1849 struct decode_cache *c = &ctxt->decode; 1941 struct decode_cache *c = &ctxt->decode;
1850 struct kvm_segment cs, ss; 1942 struct desc_struct cs, ss;
1851 u64 msr_data; 1943 u64 msr_data;
1944 u16 cs_sel, ss_sel;
1852 1945
1853 /* syscall is not available in real mode */ 1946 /* syscall is not available in real mode */
1854 if (ctxt->mode == X86EMUL_MODE_REAL || 1947 if (ctxt->mode == X86EMUL_MODE_REAL ||
1855 ctxt->mode == X86EMUL_MODE_VM86) { 1948 ctxt->mode == X86EMUL_MODE_VM86) {
1856 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 1949 emulate_ud(ctxt);
1857 return X86EMUL_PROPAGATE_FAULT; 1950 return X86EMUL_PROPAGATE_FAULT;
1858 } 1951 }
1859 1952
1860 setup_syscalls_segments(ctxt, &cs, &ss); 1953 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1861 1954
1862 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1955 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1863 msr_data >>= 32; 1956 msr_data >>= 32;
1864 cs.selector = (u16)(msr_data & 0xfffc); 1957 cs_sel = (u16)(msr_data & 0xfffc);
1865 ss.selector = (u16)(msr_data + 8); 1958 ss_sel = (u16)(msr_data + 8);
1866 1959
1867 if (is_long_mode(ctxt->vcpu)) { 1960 if (is_long_mode(ctxt->vcpu)) {
1868 cs.db = 0; 1961 cs.d = 0;
1869 cs.l = 1; 1962 cs.l = 1;
1870 } 1963 }
1871 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 1964 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
1872 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 1965 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
1966 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
1967 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1873 1968
1874 c->regs[VCPU_REGS_RCX] = c->eip; 1969 c->regs[VCPU_REGS_RCX] = c->eip;
1875 if (is_long_mode(ctxt->vcpu)) { 1970 if (is_long_mode(ctxt->vcpu)) {
1876#ifdef CONFIG_X86_64 1971#ifdef CONFIG_X86_64
1877 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 1972 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1878 1973
1879 kvm_x86_ops->get_msr(ctxt->vcpu, 1974 ops->get_msr(ctxt->vcpu,
1880 ctxt->mode == X86EMUL_MODE_PROT64 ? 1975 ctxt->mode == X86EMUL_MODE_PROT64 ?
1881 MSR_LSTAR : MSR_CSTAR, &msr_data); 1976 MSR_LSTAR : MSR_CSTAR, &msr_data);
1882 c->eip = msr_data; 1977 c->eip = msr_data;
1883 1978
1884 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); 1979 ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
1885 ctxt->eflags &= ~(msr_data | EFLG_RF); 1980 ctxt->eflags &= ~(msr_data | EFLG_RF);
1886#endif 1981#endif
1887 } else { 1982 } else {
1888 /* legacy mode */ 1983 /* legacy mode */
1889 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1984 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1890 c->eip = (u32)msr_data; 1985 c->eip = (u32)msr_data;
1891 1986
1892 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1987 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
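[Editor's note] emulate_syscall() now keeps the selectors (cs_sel/ss_sel) apart from the cached descriptors and goes through the ops table (get_msr, set_cached_descriptor, set_segment_selector) rather than kvm_x86_ops. The selector arithmetic is unchanged: SYSCALL takes its CS selector from bits 47:32 of MSR_STAR with the RPL bits cleared, and SS from the same field plus 8. In isolation (example STAR value only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Example only: a STAR value with 0x0010 in bits 47:32. */
	uint64_t star = (uint64_t)0x0010 << 32;
	uint16_t sel = (uint16_t)(star >> 32);
	uint16_t cs_sel = sel & 0xfffc;      /* force RPL 0 */
	uint16_t ss_sel = sel + 8;           /* SS follows CS in the GDT */

	printf("SYSCALL: cs=%#06x ss=%#06x\n", cs_sel, ss_sel);
	return 0;
}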
@@ -1896,15 +1991,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1896} 1991}
1897 1992
1898static int 1993static int
1899emulate_sysenter(struct x86_emulate_ctxt *ctxt) 1994emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1900{ 1995{
1901 struct decode_cache *c = &ctxt->decode; 1996 struct decode_cache *c = &ctxt->decode;
1902 struct kvm_segment cs, ss; 1997 struct desc_struct cs, ss;
1903 u64 msr_data; 1998 u64 msr_data;
1999 u16 cs_sel, ss_sel;
1904 2000
1905 /* inject #GP if in real mode */ 2001 /* inject #GP if in real mode */
1906 if (ctxt->mode == X86EMUL_MODE_REAL) { 2002 if (ctxt->mode == X86EMUL_MODE_REAL) {
1907 kvm_inject_gp(ctxt->vcpu, 0); 2003 emulate_gp(ctxt, 0);
1908 return X86EMUL_PROPAGATE_FAULT; 2004 return X86EMUL_PROPAGATE_FAULT;
1909 } 2005 }
1910 2006
@@ -1912,67 +2008,70 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1912 * Therefore, we inject an #UD. 2008 * Therefore, we inject an #UD.
1913 */ 2009 */
1914 if (ctxt->mode == X86EMUL_MODE_PROT64) { 2010 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1915 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2011 emulate_ud(ctxt);
1916 return X86EMUL_PROPAGATE_FAULT; 2012 return X86EMUL_PROPAGATE_FAULT;
1917 } 2013 }
1918 2014
1919 setup_syscalls_segments(ctxt, &cs, &ss); 2015 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1920 2016
1921 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2017 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1922 switch (ctxt->mode) { 2018 switch (ctxt->mode) {
1923 case X86EMUL_MODE_PROT32: 2019 case X86EMUL_MODE_PROT32:
1924 if ((msr_data & 0xfffc) == 0x0) { 2020 if ((msr_data & 0xfffc) == 0x0) {
1925 kvm_inject_gp(ctxt->vcpu, 0); 2021 emulate_gp(ctxt, 0);
1926 return X86EMUL_PROPAGATE_FAULT; 2022 return X86EMUL_PROPAGATE_FAULT;
1927 } 2023 }
1928 break; 2024 break;
1929 case X86EMUL_MODE_PROT64: 2025 case X86EMUL_MODE_PROT64:
1930 if (msr_data == 0x0) { 2026 if (msr_data == 0x0) {
1931 kvm_inject_gp(ctxt->vcpu, 0); 2027 emulate_gp(ctxt, 0);
1932 return X86EMUL_PROPAGATE_FAULT; 2028 return X86EMUL_PROPAGATE_FAULT;
1933 } 2029 }
1934 break; 2030 break;
1935 } 2031 }
1936 2032
1937 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 2033 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1938 cs.selector = (u16)msr_data; 2034 cs_sel = (u16)msr_data;
1939 cs.selector &= ~SELECTOR_RPL_MASK; 2035 cs_sel &= ~SELECTOR_RPL_MASK;
1940 ss.selector = cs.selector + 8; 2036 ss_sel = cs_sel + 8;
1941 ss.selector &= ~SELECTOR_RPL_MASK; 2037 ss_sel &= ~SELECTOR_RPL_MASK;
1942 if (ctxt->mode == X86EMUL_MODE_PROT64 2038 if (ctxt->mode == X86EMUL_MODE_PROT64
1943 || is_long_mode(ctxt->vcpu)) { 2039 || is_long_mode(ctxt->vcpu)) {
1944 cs.db = 0; 2040 cs.d = 0;
1945 cs.l = 1; 2041 cs.l = 1;
1946 } 2042 }
1947 2043
1948 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 2044 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
1949 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 2045 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
2046 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2047 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1950 2048
1951 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 2049 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
1952 c->eip = msr_data; 2050 c->eip = msr_data;
1953 2051
1954 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 2052 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
1955 c->regs[VCPU_REGS_RSP] = msr_data; 2053 c->regs[VCPU_REGS_RSP] = msr_data;
1956 2054
1957 return X86EMUL_CONTINUE; 2055 return X86EMUL_CONTINUE;
1958} 2056}
1959 2057
1960static int 2058static int
1961emulate_sysexit(struct x86_emulate_ctxt *ctxt) 2059emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1962{ 2060{
1963 struct decode_cache *c = &ctxt->decode; 2061 struct decode_cache *c = &ctxt->decode;
1964 struct kvm_segment cs, ss; 2062 struct desc_struct cs, ss;
1965 u64 msr_data; 2063 u64 msr_data;
1966 int usermode; 2064 int usermode;
2065 u16 cs_sel, ss_sel;
1967 2066
1968 /* inject #GP if in real mode or Virtual 8086 mode */ 2067 /* inject #GP if in real mode or Virtual 8086 mode */
1969 if (ctxt->mode == X86EMUL_MODE_REAL || 2068 if (ctxt->mode == X86EMUL_MODE_REAL ||
1970 ctxt->mode == X86EMUL_MODE_VM86) { 2069 ctxt->mode == X86EMUL_MODE_VM86) {
1971 kvm_inject_gp(ctxt->vcpu, 0); 2070 emulate_gp(ctxt, 0);
1972 return X86EMUL_PROPAGATE_FAULT; 2071 return X86EMUL_PROPAGATE_FAULT;
1973 } 2072 }
1974 2073
1975 setup_syscalls_segments(ctxt, &cs, &ss); 2074 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1976 2075
1977 if ((c->rex_prefix & 0x8) != 0x0) 2076 if ((c->rex_prefix & 0x8) != 0x0)
1978 usermode = X86EMUL_MODE_PROT64; 2077 usermode = X86EMUL_MODE_PROT64;
@@ -1981,35 +2080,37 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1981 2080
1982 cs.dpl = 3; 2081 cs.dpl = 3;
1983 ss.dpl = 3; 2082 ss.dpl = 3;
1984 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2083 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1985 switch (usermode) { 2084 switch (usermode) {
1986 case X86EMUL_MODE_PROT32: 2085 case X86EMUL_MODE_PROT32:
1987 cs.selector = (u16)(msr_data + 16); 2086 cs_sel = (u16)(msr_data + 16);
1988 if ((msr_data & 0xfffc) == 0x0) { 2087 if ((msr_data & 0xfffc) == 0x0) {
1989 kvm_inject_gp(ctxt->vcpu, 0); 2088 emulate_gp(ctxt, 0);
1990 return X86EMUL_PROPAGATE_FAULT; 2089 return X86EMUL_PROPAGATE_FAULT;
1991 } 2090 }
1992 ss.selector = (u16)(msr_data + 24); 2091 ss_sel = (u16)(msr_data + 24);
1993 break; 2092 break;
1994 case X86EMUL_MODE_PROT64: 2093 case X86EMUL_MODE_PROT64:
1995 cs.selector = (u16)(msr_data + 32); 2094 cs_sel = (u16)(msr_data + 32);
1996 if (msr_data == 0x0) { 2095 if (msr_data == 0x0) {
1997 kvm_inject_gp(ctxt->vcpu, 0); 2096 emulate_gp(ctxt, 0);
1998 return X86EMUL_PROPAGATE_FAULT; 2097 return X86EMUL_PROPAGATE_FAULT;
1999 } 2098 }
2000 ss.selector = cs.selector + 8; 2099 ss_sel = cs_sel + 8;
2001 cs.db = 0; 2100 cs.d = 0;
2002 cs.l = 1; 2101 cs.l = 1;
2003 break; 2102 break;
2004 } 2103 }
2005 cs.selector |= SELECTOR_RPL_MASK; 2104 cs_sel |= SELECTOR_RPL_MASK;
2006 ss.selector |= SELECTOR_RPL_MASK; 2105 ss_sel |= SELECTOR_RPL_MASK;
2007 2106
2008 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 2107 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
2009 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 2108 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
2109 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2110 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
2010 2111
2011 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; 2112 c->eip = c->regs[VCPU_REGS_RDX];
2012 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; 2113 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
2013 2114
2014 return X86EMUL_CONTINUE; 2115 return X86EMUL_CONTINUE;
2015} 2116}
@@ -2030,25 +2131,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2030 struct x86_emulate_ops *ops, 2131 struct x86_emulate_ops *ops,
2031 u16 port, u16 len) 2132 u16 port, u16 len)
2032{ 2133{
2033 struct kvm_segment tr_seg; 2134 struct desc_struct tr_seg;
2034 int r; 2135 int r;
2035 u16 io_bitmap_ptr; 2136 u16 io_bitmap_ptr;
2036 u8 perm, bit_idx = port & 0x7; 2137 u8 perm, bit_idx = port & 0x7;
2037 unsigned mask = (1 << len) - 1; 2138 unsigned mask = (1 << len) - 1;
2038 2139
2039 kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); 2140 ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
2040 if (tr_seg.unusable) 2141 if (!tr_seg.p)
2041 return false; 2142 return false;
2042 if (tr_seg.limit < 103) 2143 if (desc_limit_scaled(&tr_seg) < 103)
2043 return false; 2144 return false;
2044 r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, 2145 r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
2045 NULL); 2146 ctxt->vcpu, NULL);
2046 if (r != X86EMUL_CONTINUE) 2147 if (r != X86EMUL_CONTINUE)
2047 return false; 2148 return false;
2048 if (io_bitmap_ptr + port/8 > tr_seg.limit) 2149 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
2049 return false; 2150 return false;
2050 r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, 2151 r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
2051 ctxt->vcpu, NULL); 2152 &perm, 1, ctxt->vcpu, NULL);
2052 if (r != X86EMUL_CONTINUE) 2153 if (r != X86EMUL_CONTINUE)
2053 return false; 2154 return false;
2054 if ((perm >> bit_idx) & mask) 2155 if ((perm >> bit_idx) & mask)
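[Editor's note] emulator_io_port_access_allowed() now consults the cached TR descriptor (desc_struct) instead of a kvm_segment, but the TSS I/O-bitmap walk is unchanged: require a present TR with a sufficient limit, read the 16-bit I/O map base at byte 102 of the TSS, then test the bits covering the port; any set bit denies the access. The per-port bit test on its own (single-byte check, as above):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* One byte of a TSS I/O permission bitmap covers ports 8*i .. 8*i+7;
 * a set bit means "access denied".  This mirrors the single-byte test
 * done above: all 'len' bits starting at port%8 must be clear. */
static bool io_allowed(const uint8_t *bitmap, uint16_t port, unsigned int len)
{
	uint8_t perm = bitmap[port / 8];
	unsigned int bit = port & 7;
	unsigned int mask = (1u << len) - 1;

	return ((perm >> bit) & mask) == 0;
}

int main(void)
{
	uint8_t bitmap[8192] = { 0 };		/* all ports allowed */

	bitmap[0x3f8 / 8] |= 1 << (0x3f8 & 7);	/* deny COM1 data port */

	printf("port 0x3f8, 1 byte: %s\n",
	       io_allowed(bitmap, 0x3f8, 1) ? "allowed" : "denied");
	printf("port 0x3f9, 1 byte: %s\n",
	       io_allowed(bitmap, 0x3f9, 1) ? "allowed" : "denied");
	return 0;
}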
@@ -2066,17 +2167,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2066 return true; 2167 return true;
2067} 2168}
2068 2169
2069static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
2070 struct x86_emulate_ops *ops,
2071 int seg)
2072{
2073 struct desc_struct desc;
2074 if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
2075 return get_desc_base(&desc);
2076 else
2077 return ~0;
2078}
2079
2080static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, 2170static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2081 struct x86_emulate_ops *ops, 2171 struct x86_emulate_ops *ops,
2082 struct tss_segment_16 *tss) 2172 struct tss_segment_16 *tss)
@@ -2165,7 +2255,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2165 &err); 2255 &err);
2166 if (ret == X86EMUL_PROPAGATE_FAULT) { 2256 if (ret == X86EMUL_PROPAGATE_FAULT) {
2167 /* FIXME: need to provide precise fault address */ 2257 /* FIXME: need to provide precise fault address */
2168 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2258 emulate_pf(ctxt, old_tss_base, err);
2169 return ret; 2259 return ret;
2170 } 2260 }
2171 2261
@@ -2175,7 +2265,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2175 &err); 2265 &err);
2176 if (ret == X86EMUL_PROPAGATE_FAULT) { 2266 if (ret == X86EMUL_PROPAGATE_FAULT) {
2177 /* FIXME: need to provide precise fault address */ 2267 /* FIXME: need to provide precise fault address */
2178 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2268 emulate_pf(ctxt, old_tss_base, err);
2179 return ret; 2269 return ret;
2180 } 2270 }
2181 2271
@@ -2183,7 +2273,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2183 &err); 2273 &err);
2184 if (ret == X86EMUL_PROPAGATE_FAULT) { 2274 if (ret == X86EMUL_PROPAGATE_FAULT) {
2185 /* FIXME: need to provide precise fault address */ 2275 /* FIXME: need to provide precise fault address */
2186 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2276 emulate_pf(ctxt, new_tss_base, err);
2187 return ret; 2277 return ret;
2188 } 2278 }
2189 2279
@@ -2196,7 +2286,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2196 ctxt->vcpu, &err); 2286 ctxt->vcpu, &err);
2197 if (ret == X86EMUL_PROPAGATE_FAULT) { 2287 if (ret == X86EMUL_PROPAGATE_FAULT) {
2198 /* FIXME: need to provide precise fault address */ 2288 /* FIXME: need to provide precise fault address */
2199 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2289 emulate_pf(ctxt, new_tss_base, err);
2200 return ret; 2290 return ret;
2201 } 2291 }
2202 } 2292 }
@@ -2238,7 +2328,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2238 struct decode_cache *c = &ctxt->decode; 2328 struct decode_cache *c = &ctxt->decode;
2239 int ret; 2329 int ret;
2240 2330
2241 ops->set_cr(3, tss->cr3, ctxt->vcpu); 2331 if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
2332 emulate_gp(ctxt, 0);
2333 return X86EMUL_PROPAGATE_FAULT;
2334 }
2242 c->eip = tss->eip; 2335 c->eip = tss->eip;
2243 ctxt->eflags = tss->eflags | 2; 2336 ctxt->eflags = tss->eflags | 2;
2244 c->regs[VCPU_REGS_RAX] = tss->eax; 2337 c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2304,7 +2397,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2304 &err); 2397 &err);
2305 if (ret == X86EMUL_PROPAGATE_FAULT) { 2398 if (ret == X86EMUL_PROPAGATE_FAULT) {
2306 /* FIXME: need to provide precise fault address */ 2399 /* FIXME: need to provide precise fault address */
2307 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2400 emulate_pf(ctxt, old_tss_base, err);
2308 return ret; 2401 return ret;
2309 } 2402 }
2310 2403
@@ -2314,7 +2407,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2314 &err); 2407 &err);
2315 if (ret == X86EMUL_PROPAGATE_FAULT) { 2408 if (ret == X86EMUL_PROPAGATE_FAULT) {
2316 /* FIXME: need to provide precise fault address */ 2409 /* FIXME: need to provide precise fault address */
2317 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2410 emulate_pf(ctxt, old_tss_base, err);
2318 return ret; 2411 return ret;
2319 } 2412 }
2320 2413
@@ -2322,7 +2415,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2322 &err); 2415 &err);
2323 if (ret == X86EMUL_PROPAGATE_FAULT) { 2416 if (ret == X86EMUL_PROPAGATE_FAULT) {
2324 /* FIXME: need to provide precise fault address */ 2417 /* FIXME: need to provide precise fault address */
2325 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2418 emulate_pf(ctxt, new_tss_base, err);
2326 return ret; 2419 return ret;
2327 } 2420 }
2328 2421
@@ -2335,7 +2428,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2335 ctxt->vcpu, &err); 2428 ctxt->vcpu, &err);
2336 if (ret == X86EMUL_PROPAGATE_FAULT) { 2429 if (ret == X86EMUL_PROPAGATE_FAULT) {
2337 /* FIXME: need to provide precise fault address */ 2430 /* FIXME: need to provide precise fault address */
2338 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2431 emulate_pf(ctxt, new_tss_base, err);
2339 return ret; 2432 return ret;
2340 } 2433 }
2341 } 2434 }
@@ -2352,7 +2445,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2352 int ret; 2445 int ret;
2353 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); 2446 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
2354 ulong old_tss_base = 2447 ulong old_tss_base =
2355 get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); 2448 ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
2356 u32 desc_limit; 2449 u32 desc_limit;
2357 2450
2358 /* FIXME: old_tss_base == ~0 ? */ 2451 /* FIXME: old_tss_base == ~0 ? */
@@ -2369,7 +2462,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2369 if (reason != TASK_SWITCH_IRET) { 2462 if (reason != TASK_SWITCH_IRET) {
2370 if ((tss_selector & 3) > next_tss_desc.dpl || 2463 if ((tss_selector & 3) > next_tss_desc.dpl ||
2371 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { 2464 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
2372 kvm_inject_gp(ctxt->vcpu, 0); 2465 emulate_gp(ctxt, 0);
2373 return X86EMUL_PROPAGATE_FAULT; 2466 return X86EMUL_PROPAGATE_FAULT;
2374 } 2467 }
2375 } 2468 }
@@ -2378,8 +2471,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2378 if (!next_tss_desc.p || 2471 if (!next_tss_desc.p ||
2379 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || 2472 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
2380 desc_limit < 0x2b)) { 2473 desc_limit < 0x2b)) {
2381 kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, 2474 emulate_ts(ctxt, tss_selector & 0xfffc);
2382 tss_selector & 0xfffc);
2383 return X86EMUL_PROPAGATE_FAULT; 2475 return X86EMUL_PROPAGATE_FAULT;
2384 } 2476 }
2385 2477
@@ -2425,7 +2517,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2425 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; 2517 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2426 c->lock_prefix = 0; 2518 c->lock_prefix = 0;
2427 c->src.val = (unsigned long) error_code; 2519 c->src.val = (unsigned long) error_code;
2428 emulate_push(ctxt); 2520 emulate_push(ctxt, ops);
2429 } 2521 }
2430 2522
2431 return ret; 2523 return ret;
@@ -2439,18 +2531,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2439 struct decode_cache *c = &ctxt->decode; 2531 struct decode_cache *c = &ctxt->decode;
2440 int rc; 2532 int rc;
2441 2533
2442 memset(c, 0, sizeof(struct decode_cache));
2443 c->eip = ctxt->eip; 2534 c->eip = ctxt->eip;
2444 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2445 c->dst.type = OP_NONE; 2535 c->dst.type = OP_NONE;
2446 2536
2447 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, 2537 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2448 has_error_code, error_code); 2538 has_error_code, error_code);
2449 2539
2450 if (rc == X86EMUL_CONTINUE) { 2540 if (rc == X86EMUL_CONTINUE) {
2451 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2452 kvm_rip_write(ctxt->vcpu, c->eip);
2453 rc = writeback(ctxt, ops); 2541 rc = writeback(ctxt, ops);
2542 if (rc == X86EMUL_CONTINUE)
2543 ctxt->eip = c->eip;
2454 } 2544 }
2455 2545
2456 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 2546 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -2474,29 +2564,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2474 int rc = X86EMUL_CONTINUE; 2564 int rc = X86EMUL_CONTINUE;
2475 int saved_dst_type = c->dst.type; 2565 int saved_dst_type = c->dst.type;
2476 2566
2477 ctxt->interruptibility = 0; 2567 ctxt->decode.mem_read.pos = 0;
2478
2479 /* Shadow copy of register state. Committed on successful emulation.
2480 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
2481 * modify them.
2482 */
2483
2484 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2485 2568
2486 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 2569 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
2487 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2570 emulate_ud(ctxt);
2488 goto done; 2571 goto done;
2489 } 2572 }
2490 2573
2491 /* LOCK prefix is allowed only with some instructions */ 2574 /* LOCK prefix is allowed only with some instructions */
2492 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 2575 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
2493 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2576 emulate_ud(ctxt);
2494 goto done; 2577 goto done;
2495 } 2578 }
2496 2579
2497 /* Privileged instruction can be executed only in CPL=0 */ 2580 /* Privileged instruction can be executed only in CPL=0 */
2498 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 2581 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
2499 kvm_inject_gp(ctxt->vcpu, 0); 2582 emulate_gp(ctxt, 0);
2500 goto done; 2583 goto done;
2501 } 2584 }
2502 2585
@@ -2506,7 +2589,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2506 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 2589 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2507 string_done: 2590 string_done:
2508 ctxt->restart = false; 2591 ctxt->restart = false;
2509 kvm_rip_write(ctxt->vcpu, c->eip); 2592 ctxt->eip = c->eip;
2510 goto done; 2593 goto done;
2511 } 2594 }
2512 /* The second termination condition only applies for REPE 2595 /* The second termination condition only applies for REPE
@@ -2529,20 +2612,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2529 } 2612 }
2530 2613
2531 if (c->src.type == OP_MEM) { 2614 if (c->src.type == OP_MEM) {
2532 rc = ops->read_emulated((unsigned long)c->src.ptr, 2615 rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr,
2533 &c->src.val, 2616 c->src.valptr, c->src.bytes);
2534 c->src.bytes,
2535 ctxt->vcpu);
2536 if (rc != X86EMUL_CONTINUE) 2617 if (rc != X86EMUL_CONTINUE)
2537 goto done; 2618 goto done;
2538 c->src.orig_val = c->src.val; 2619 c->src.orig_val = c->src.val;
2539 } 2620 }
2540 2621
2541 if (c->src2.type == OP_MEM) { 2622 if (c->src2.type == OP_MEM) {
2542 rc = ops->read_emulated((unsigned long)c->src2.ptr, 2623 rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr,
2543 &c->src2.val, 2624 &c->src2.val, c->src2.bytes);
2544 c->src2.bytes,
2545 ctxt->vcpu);
2546 if (rc != X86EMUL_CONTINUE) 2625 if (rc != X86EMUL_CONTINUE)
2547 goto done; 2626 goto done;
2548 } 2627 }
@@ -2553,8 +2632,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2553 2632
2554 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 2633 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
2555 /* optimisation - avoid slow emulated read if Mov */ 2634 /* optimisation - avoid slow emulated read if Mov */
2556 rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, 2635 rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr,
2557 c->dst.bytes, ctxt->vcpu); 2636 &c->dst.val, c->dst.bytes);
2558 if (rc != X86EMUL_CONTINUE) 2637 if (rc != X86EMUL_CONTINUE)
2559 goto done; 2638 goto done;
2560 } 2639 }
@@ -2571,7 +2650,7 @@ special_insn:
2571 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 2650 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2572 break; 2651 break;
2573 case 0x06: /* push es */ 2652 case 0x06: /* push es */
2574 emulate_push_sreg(ctxt, VCPU_SREG_ES); 2653 emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
2575 break; 2654 break;
2576 case 0x07: /* pop es */ 2655 case 0x07: /* pop es */
2577 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 2656 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
@@ -2583,14 +2662,14 @@ special_insn:
2583 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 2662 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2584 break; 2663 break;
2585 case 0x0e: /* push cs */ 2664 case 0x0e: /* push cs */
2586 emulate_push_sreg(ctxt, VCPU_SREG_CS); 2665 emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
2587 break; 2666 break;
2588 case 0x10 ... 0x15: 2667 case 0x10 ... 0x15:
2589 adc: /* adc */ 2668 adc: /* adc */
2590 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 2669 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2591 break; 2670 break;
2592 case 0x16: /* push ss */ 2671 case 0x16: /* push ss */
2593 emulate_push_sreg(ctxt, VCPU_SREG_SS); 2672 emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
2594 break; 2673 break;
2595 case 0x17: /* pop ss */ 2674 case 0x17: /* pop ss */
2596 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 2675 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
@@ -2602,7 +2681,7 @@ special_insn:
2602 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 2681 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2603 break; 2682 break;
2604 case 0x1e: /* push ds */ 2683 case 0x1e: /* push ds */
2605 emulate_push_sreg(ctxt, VCPU_SREG_DS); 2684 emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
2606 break; 2685 break;
2607 case 0x1f: /* pop ds */ 2686 case 0x1f: /* pop ds */
2608 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 2687 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
@@ -2632,7 +2711,7 @@ special_insn:
2632 emulate_1op("dec", c->dst, ctxt->eflags); 2711 emulate_1op("dec", c->dst, ctxt->eflags);
2633 break; 2712 break;
2634 case 0x50 ... 0x57: /* push reg */ 2713 case 0x50 ... 0x57: /* push reg */
2635 emulate_push(ctxt); 2714 emulate_push(ctxt, ops);
2636 break; 2715 break;
2637 case 0x58 ... 0x5f: /* pop reg */ 2716 case 0x58 ... 0x5f: /* pop reg */
2638 pop_instruction: 2717 pop_instruction:
@@ -2641,7 +2720,9 @@ special_insn:
2641 goto done; 2720 goto done;
2642 break; 2721 break;
2643 case 0x60: /* pusha */ 2722 case 0x60: /* pusha */
2644 emulate_pusha(ctxt); 2723 rc = emulate_pusha(ctxt, ops);
2724 if (rc != X86EMUL_CONTINUE)
2725 goto done;
2645 break; 2726 break;
2646 case 0x61: /* popa */ 2727 case 0x61: /* popa */
2647 rc = emulate_popa(ctxt, ops); 2728 rc = emulate_popa(ctxt, ops);
@@ -2655,14 +2736,14 @@ special_insn:
2655 break; 2736 break;
2656 case 0x68: /* push imm */ 2737 case 0x68: /* push imm */
2657 case 0x6a: /* push imm8 */ 2738 case 0x6a: /* push imm8 */
2658 emulate_push(ctxt); 2739 emulate_push(ctxt, ops);
2659 break; 2740 break;
2660 case 0x6c: /* insb */ 2741 case 0x6c: /* insb */
2661 case 0x6d: /* insw/insd */ 2742 case 0x6d: /* insw/insd */
2662 c->dst.bytes = min(c->dst.bytes, 4u); 2743 c->dst.bytes = min(c->dst.bytes, 4u);
2663 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2744 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2664 c->dst.bytes)) { 2745 c->dst.bytes)) {
2665 kvm_inject_gp(ctxt->vcpu, 0); 2746 emulate_gp(ctxt, 0);
2666 goto done; 2747 goto done;
2667 } 2748 }
2668 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, 2749 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
@@ -2674,7 +2755,7 @@ special_insn:
2674 c->src.bytes = min(c->src.bytes, 4u); 2755 c->src.bytes = min(c->src.bytes, 4u);
2675 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2756 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2676 c->src.bytes)) { 2757 c->src.bytes)) {
2677 kvm_inject_gp(ctxt->vcpu, 0); 2758 emulate_gp(ctxt, 0);
2678 goto done; 2759 goto done;
2679 } 2760 }
2680 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], 2761 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
@@ -2707,6 +2788,7 @@ special_insn:
2707 } 2788 }
2708 break; 2789 break;
2709 case 0x84 ... 0x85: 2790 case 0x84 ... 0x85:
2791 test:
2710 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 2792 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
2711 break; 2793 break;
2712 case 0x86 ... 0x87: /* xchg */ 2794 case 0x86 ... 0x87: /* xchg */
@@ -2735,18 +2817,13 @@ special_insn:
2735 break; 2817 break;
2736 case 0x88 ... 0x8b: /* mov */ 2818 case 0x88 ... 0x8b: /* mov */
2737 goto mov; 2819 goto mov;
2738 case 0x8c: { /* mov r/m, sreg */ 2820 case 0x8c: /* mov r/m, sreg */
2739 struct kvm_segment segreg; 2821 if (c->modrm_reg > VCPU_SREG_GS) {
2740 2822 emulate_ud(ctxt);
2741 if (c->modrm_reg <= VCPU_SREG_GS)
2742 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
2743 else {
2744 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2745 goto done; 2823 goto done;
2746 } 2824 }
2747 c->dst.val = segreg.selector; 2825 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
2748 break; 2826 break;
2749 }
2750 case 0x8d: /* lea r16/r32, m */ 2827 case 0x8d: /* lea r16/r32, m */
2751 c->dst.val = c->modrm_ea; 2828 c->dst.val = c->modrm_ea;
2752 break; 2829 break;
@@ -2757,12 +2834,12 @@ special_insn:
2757 2834
2758 if (c->modrm_reg == VCPU_SREG_CS || 2835 if (c->modrm_reg == VCPU_SREG_CS ||
2759 c->modrm_reg > VCPU_SREG_GS) { 2836 c->modrm_reg > VCPU_SREG_GS) {
2760 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2837 emulate_ud(ctxt);
2761 goto done; 2838 goto done;
2762 } 2839 }
2763 2840
2764 if (c->modrm_reg == VCPU_SREG_SS) 2841 if (c->modrm_reg == VCPU_SREG_SS)
2765 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); 2842 ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
2766 2843
2767 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); 2844 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
2768 2845
@@ -2775,19 +2852,19 @@ special_insn:
2775 goto done; 2852 goto done;
2776 break; 2853 break;
2777 case 0x90: /* nop / xchg r8,rax */ 2854 case 0x90: /* nop / xchg r8,rax */
2778 if (!(c->rex_prefix & 1)) { /* nop */ 2855 if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) {
2779 c->dst.type = OP_NONE; 2856 c->dst.type = OP_NONE; /* nop */
2780 break; 2857 break;
2781 } 2858 }
2782 case 0x91 ... 0x97: /* xchg reg,rax */ 2859 case 0x91 ... 0x97: /* xchg reg,rax */
2783 c->src.type = c->dst.type = OP_REG; 2860 c->src.type = OP_REG;
2784 c->src.bytes = c->dst.bytes = c->op_bytes; 2861 c->src.bytes = c->op_bytes;
2785 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; 2862 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
2786 c->src.val = *(c->src.ptr); 2863 c->src.val = *(c->src.ptr);
2787 goto xchg; 2864 goto xchg;
2788 case 0x9c: /* pushf */ 2865 case 0x9c: /* pushf */
2789 c->src.val = (unsigned long) ctxt->eflags; 2866 c->src.val = (unsigned long) ctxt->eflags;
2790 emulate_push(ctxt); 2867 emulate_push(ctxt, ops);
2791 break; 2868 break;
2792 case 0x9d: /* popf */ 2869 case 0x9d: /* popf */
2793 c->dst.type = OP_REG; 2870 c->dst.type = OP_REG;
@@ -2797,19 +2874,15 @@ special_insn:
2797 if (rc != X86EMUL_CONTINUE) 2874 if (rc != X86EMUL_CONTINUE)
2798 goto done; 2875 goto done;
2799 break; 2876 break;
2800 case 0xa0 ... 0xa1: /* mov */ 2877 case 0xa0 ... 0xa3: /* mov */
2801 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2802 c->dst.val = c->src.val;
2803 break;
2804 case 0xa2 ... 0xa3: /* mov */
2805 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
2806 break;
2807 case 0xa4 ... 0xa5: /* movs */ 2878 case 0xa4 ... 0xa5: /* movs */
2808 goto mov; 2879 goto mov;
2809 case 0xa6 ... 0xa7: /* cmps */ 2880 case 0xa6 ... 0xa7: /* cmps */
2810 c->dst.type = OP_NONE; /* Disable writeback. */ 2881 c->dst.type = OP_NONE; /* Disable writeback. */
2811 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2882 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
2812 goto cmp; 2883 goto cmp;
2884 case 0xa8 ... 0xa9: /* test ax, imm */
2885 goto test;
2813 case 0xaa ... 0xab: /* stos */ 2886 case 0xaa ... 0xab: /* stos */
2814 c->dst.val = c->regs[VCPU_REGS_RAX]; 2887 c->dst.val = c->regs[VCPU_REGS_RAX];
2815 break; 2888 break;
@@ -2855,19 +2928,23 @@ special_insn:
2855 long int rel = c->src.val; 2928 long int rel = c->src.val;
2856 c->src.val = (unsigned long) c->eip; 2929 c->src.val = (unsigned long) c->eip;
2857 jmp_rel(c, rel); 2930 jmp_rel(c, rel);
2858 emulate_push(ctxt); 2931 emulate_push(ctxt, ops);
2859 break; 2932 break;
2860 } 2933 }
2861 case 0xe9: /* jmp rel */ 2934 case 0xe9: /* jmp rel */
2862 goto jmp; 2935 goto jmp;
2863 case 0xea: /* jmp far */ 2936 case 0xea: { /* jmp far */
2937 unsigned short sel;
2864 jump_far: 2938 jump_far:
2865 if (load_segment_descriptor(ctxt, ops, c->src2.val, 2939 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
2866 VCPU_SREG_CS)) 2940
2941 if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
2867 goto done; 2942 goto done;
2868 2943
2869 c->eip = c->src.val; 2944 c->eip = 0;
2945 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2870 break; 2946 break;
2947 }
2871 case 0xeb: 2948 case 0xeb:
2872 jmp: /* jmp rel short */ 2949 jmp: /* jmp rel short */
2873 jmp_rel(c, c->src.val); 2950 jmp_rel(c, c->src.val);
@@ -2879,20 +2956,20 @@ special_insn:
2879 do_io_in: 2956 do_io_in:
2880 c->dst.bytes = min(c->dst.bytes, 4u); 2957 c->dst.bytes = min(c->dst.bytes, 4u);
2881 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 2958 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2882 kvm_inject_gp(ctxt->vcpu, 0); 2959 emulate_gp(ctxt, 0);
2883 goto done; 2960 goto done;
2884 } 2961 }
2885 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 2962 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
2886 &c->dst.val)) 2963 &c->dst.val))
2887 goto done; /* IO is needed */ 2964 goto done; /* IO is needed */
2888 break; 2965 break;
2889 case 0xee: /* out al,dx */ 2966 case 0xee: /* out dx,al */
2890 case 0xef: /* out (e/r)ax,dx */ 2967 case 0xef: /* out dx,(e/r)ax */
2891 c->src.val = c->regs[VCPU_REGS_RDX]; 2968 c->src.val = c->regs[VCPU_REGS_RDX];
2892 do_io_out: 2969 do_io_out:
2893 c->dst.bytes = min(c->dst.bytes, 4u); 2970 c->dst.bytes = min(c->dst.bytes, 4u);
2894 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 2971 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2895 kvm_inject_gp(ctxt->vcpu, 0); 2972 emulate_gp(ctxt, 0);
2896 goto done; 2973 goto done;
2897 } 2974 }
2898 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, 2975 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
@@ -2916,18 +2993,20 @@ special_insn:
2916 c->dst.type = OP_NONE; /* Disable writeback. */ 2993 c->dst.type = OP_NONE; /* Disable writeback. */
2917 break; 2994 break;
2918 case 0xfa: /* cli */ 2995 case 0xfa: /* cli */
2919 if (emulator_bad_iopl(ctxt, ops)) 2996 if (emulator_bad_iopl(ctxt, ops)) {
2920 kvm_inject_gp(ctxt->vcpu, 0); 2997 emulate_gp(ctxt, 0);
2921 else { 2998 goto done;
2999 } else {
2922 ctxt->eflags &= ~X86_EFLAGS_IF; 3000 ctxt->eflags &= ~X86_EFLAGS_IF;
2923 c->dst.type = OP_NONE; /* Disable writeback. */ 3001 c->dst.type = OP_NONE; /* Disable writeback. */
2924 } 3002 }
2925 break; 3003 break;
2926 case 0xfb: /* sti */ 3004 case 0xfb: /* sti */
2927 if (emulator_bad_iopl(ctxt, ops)) 3005 if (emulator_bad_iopl(ctxt, ops)) {
2928 kvm_inject_gp(ctxt->vcpu, 0); 3006 emulate_gp(ctxt, 0);
2929 else { 3007 goto done;
2930 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); 3008 } else {
3009 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
2931 ctxt->eflags |= X86_EFLAGS_IF; 3010 ctxt->eflags |= X86_EFLAGS_IF;
2932 c->dst.type = OP_NONE; /* Disable writeback. */ 3011 c->dst.type = OP_NONE; /* Disable writeback. */
2933 } 3012 }
@@ -2964,11 +3043,12 @@ writeback:
2964 c->dst.type = saved_dst_type; 3043 c->dst.type = saved_dst_type;
2965 3044
2966 if ((c->d & SrcMask) == SrcSI) 3045 if ((c->d & SrcMask) == SrcSI)
2967 string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, 3046 string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
2968 &c->src); 3047 VCPU_REGS_RSI, &c->src);
2969 3048
2970 if ((c->d & DstMask) == DstDI) 3049 if ((c->d & DstMask) == DstDI)
2971 string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); 3050 string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
3051 &c->dst);
2972 3052
2973 if (c->rep_prefix && (c->d & String)) { 3053 if (c->rep_prefix && (c->d & String)) {
2974 struct read_cache *rc = &ctxt->decode.io_read; 3054 struct read_cache *rc = &ctxt->decode.io_read;
@@ -2981,11 +3061,12 @@ writeback:
2981 (rc->end != 0 && rc->end == rc->pos)) 3061 (rc->end != 0 && rc->end == rc->pos))
2982 ctxt->restart = false; 3062 ctxt->restart = false;
2983 } 3063 }
2984 3064 /*
2985 /* Commit shadow register state. */ 3065 * reset read cache here in case string instruction is restarted
2986 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 3066 * without decoding
2987 kvm_rip_write(ctxt->vcpu, c->eip); 3067 */
2988 ops->set_rflags(ctxt->vcpu, ctxt->eflags); 3068 ctxt->decode.mem_read.end = 0;
3069 ctxt->eip = c->eip;
2989 3070
2990done: 3071done:
2991 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 3072 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -3051,7 +3132,7 @@ twobyte_insn:
3051 c->dst.type = OP_NONE; 3132 c->dst.type = OP_NONE;
3052 break; 3133 break;
3053 case 5: /* not defined */ 3134 case 5: /* not defined */
3054 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3135 emulate_ud(ctxt);
3055 goto done; 3136 goto done;
3056 case 7: /* invlpg*/ 3137 case 7: /* invlpg*/
3057 emulate_invlpg(ctxt->vcpu, c->modrm_ea); 3138 emulate_invlpg(ctxt->vcpu, c->modrm_ea);
@@ -3063,7 +3144,7 @@ twobyte_insn:
3063 } 3144 }
3064 break; 3145 break;
3065 case 0x05: /* syscall */ 3146 case 0x05: /* syscall */
3066 rc = emulate_syscall(ctxt); 3147 rc = emulate_syscall(ctxt, ops);
3067 if (rc != X86EMUL_CONTINUE) 3148 if (rc != X86EMUL_CONTINUE)
3068 goto done; 3149 goto done;
3069 else 3150 else
@@ -3073,8 +3154,11 @@ twobyte_insn:
3073 emulate_clts(ctxt->vcpu); 3154 emulate_clts(ctxt->vcpu);
3074 c->dst.type = OP_NONE; 3155 c->dst.type = OP_NONE;
3075 break; 3156 break;
3076 case 0x08: /* invd */
3077 case 0x09: /* wbinvd */ 3157 case 0x09: /* wbinvd */
3158 kvm_emulate_wbinvd(ctxt->vcpu);
3159 c->dst.type = OP_NONE;
3160 break;
3161 case 0x08: /* invd */
3078 case 0x0d: /* GrpP (prefetch) */ 3162 case 0x0d: /* GrpP (prefetch) */
3079 case 0x18: /* Grp16 (prefetch/nop) */ 3163 case 0x18: /* Grp16 (prefetch/nop) */
3080 c->dst.type = OP_NONE; 3164 c->dst.type = OP_NONE;
@@ -3084,7 +3168,7 @@ twobyte_insn:
3084 case 1: 3168 case 1:
3085 case 5 ... 7: 3169 case 5 ... 7:
3086 case 9 ... 15: 3170 case 9 ... 15:
3087 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3171 emulate_ud(ctxt);
3088 goto done; 3172 goto done;
3089 } 3173 }
3090 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); 3174 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3093,31 +3177,42 @@ twobyte_insn:
3093 case 0x21: /* mov from dr to reg */ 3177 case 0x21: /* mov from dr to reg */
3094 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3178 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3095 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3179 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3096 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3180 emulate_ud(ctxt);
3097 goto done; 3181 goto done;
3098 } 3182 }
3099 emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); 3183 ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
3100 c->dst.type = OP_NONE; /* no writeback */ 3184 c->dst.type = OP_NONE; /* no writeback */
3101 break; 3185 break;
3102 case 0x22: /* mov reg, cr */ 3186 case 0x22: /* mov reg, cr */
3103 ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); 3187 if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) {
3188 emulate_gp(ctxt, 0);
3189 goto done;
3190 }
3104 c->dst.type = OP_NONE; 3191 c->dst.type = OP_NONE;
3105 break; 3192 break;
3106 case 0x23: /* mov from reg to dr */ 3193 case 0x23: /* mov from reg to dr */
3107 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3194 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
3108 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3195 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3109 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3196 emulate_ud(ctxt);
3197 goto done;
3198 }
3199
3200 if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
3201 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
3202 ~0ULL : ~0U), ctxt->vcpu) < 0) {
3203 /* #UD condition is already handled by the code above */
3204 emulate_gp(ctxt, 0);
3110 goto done; 3205 goto done;
3111 } 3206 }
3112 emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); 3207
3113 c->dst.type = OP_NONE; /* no writeback */ 3208 c->dst.type = OP_NONE; /* no writeback */
3114 break; 3209 break;
3115 case 0x30: 3210 case 0x30:
3116 /* wrmsr */ 3211 /* wrmsr */
3117 msr_data = (u32)c->regs[VCPU_REGS_RAX] 3212 msr_data = (u32)c->regs[VCPU_REGS_RAX]
3118 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3213 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3119 if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 3214 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
3120 kvm_inject_gp(ctxt->vcpu, 0); 3215 emulate_gp(ctxt, 0);
3121 goto done; 3216 goto done;
3122 } 3217 }
3123 rc = X86EMUL_CONTINUE; 3218 rc = X86EMUL_CONTINUE;
@@ -3125,8 +3220,8 @@ twobyte_insn:
3125 break; 3220 break;
3126 case 0x32: 3221 case 0x32:
3127 /* rdmsr */ 3222 /* rdmsr */
3128 if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 3223 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
3129 kvm_inject_gp(ctxt->vcpu, 0); 3224 emulate_gp(ctxt, 0);
3130 goto done; 3225 goto done;
3131 } else { 3226 } else {
3132 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3227 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3136,14 +3231,14 @@ twobyte_insn:
3136 c->dst.type = OP_NONE; 3231 c->dst.type = OP_NONE;
3137 break; 3232 break;
3138 case 0x34: /* sysenter */ 3233 case 0x34: /* sysenter */
3139 rc = emulate_sysenter(ctxt); 3234 rc = emulate_sysenter(ctxt, ops);
3140 if (rc != X86EMUL_CONTINUE) 3235 if (rc != X86EMUL_CONTINUE)
3141 goto done; 3236 goto done;
3142 else 3237 else
3143 goto writeback; 3238 goto writeback;
3144 break; 3239 break;
3145 case 0x35: /* sysexit */ 3240 case 0x35: /* sysexit */
3146 rc = emulate_sysexit(ctxt); 3241 rc = emulate_sysexit(ctxt, ops);
3147 if (rc != X86EMUL_CONTINUE) 3242 if (rc != X86EMUL_CONTINUE)
3148 goto done; 3243 goto done;
3149 else 3244 else
@@ -3160,7 +3255,7 @@ twobyte_insn:
3160 c->dst.type = OP_NONE; 3255 c->dst.type = OP_NONE;
3161 break; 3256 break;
3162 case 0xa0: /* push fs */ 3257 case 0xa0: /* push fs */
3163 emulate_push_sreg(ctxt, VCPU_SREG_FS); 3258 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
3164 break; 3259 break;
3165 case 0xa1: /* pop fs */ 3260 case 0xa1: /* pop fs */
3166 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 3261 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
@@ -3179,7 +3274,7 @@ twobyte_insn:
3179 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 3274 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
3180 break; 3275 break;
3181 case 0xa8: /* push gs */ 3276 case 0xa8: /* push gs */
3182 emulate_push_sreg(ctxt, VCPU_SREG_GS); 3277 emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
3183 break; 3278 break;
3184 case 0xa9: /* pop gs */ 3279 case 0xa9: /* pop gs */
3185 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 3280 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
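For reference outside the diff: a minimal user-space sketch of the EDX:EAX packing that the wrmsr/rdmsr cases above rely on. The helper names and test values are invented for illustration; only the bit layout (RAX carries the low 32 bits, RDX the high 32 bits) comes from the code above.

/*
 * Stand-alone sketch (not kernel code) of the EDX:EAX <-> u64 packing
 * used by the wrmsr/rdmsr emulation above.
 */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

static uint64_t pack_edx_eax(uint64_t rax, uint64_t rdx)
{
	/* Only the low 32 bits of each register are significant. */
	return (uint32_t)rax | ((uint64_t)(uint32_t)rdx << 32);
}

static void unpack_edx_eax(uint64_t msr, uint64_t *rax, uint64_t *rdx)
{
	*rax = (uint32_t)msr;
	*rdx = msr >> 32;
}

int main(void)
{
	uint64_t rax = 0xdeadbeefcafef00dULL, rdx = 0x123456789abcdef0ULL;
	uint64_t msr = pack_edx_eax(rax, rdx);

	assert(msr == 0x9abcdef0cafef00dULL);
	unpack_edx_eax(msr, &rax, &rdx);
	printf("msr=%#llx rax=%#llx rdx=%#llx\n",
	       (unsigned long long)msr, (unsigned long long)rax,
	       (unsigned long long)rdx);
	return 0;
}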
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 0150affad25d..0fd6378981f4 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,6 +5,7 @@
5 * Copyright (c) 2006 Intel Corporation 5 * Copyright (c) 2006 Intel Corporation
6 * Copyright (c) 2007 Keir Fraser, XenSource Inc 6 * Copyright (c) 2007 Keir Fraser, XenSource Inc
7 * Copyright (c) 2008 Intel Corporation 7 * Copyright (c) 2008 Intel Corporation
8 * Copyright 2009 Red Hat, Inc. and/or its affilates.
8 * 9 *
9 * Permission is hereby granted, free of charge, to any person obtaining a copy 10 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this software and associated documentation files (the "Software"), to deal 11 * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,7 @@
33 34
34#include <linux/kvm_host.h> 35#include <linux/kvm_host.h>
35#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/workqueue.h>
36 38
37#include "irq.h" 39#include "irq.h"
38#include "i8254.h" 40#include "i8254.h"
@@ -243,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
243{ 245{
244 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 246 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
245 irq_ack_notifier); 247 irq_ack_notifier);
246 raw_spin_lock(&ps->inject_lock); 248 int value;
247 if (atomic_dec_return(&ps->pit_timer.pending) < 0) 249
250 spin_lock(&ps->inject_lock);
251 value = atomic_dec_return(&ps->pit_timer.pending);
252 if (value < 0)
253 /* spurious acks can be generated if, for example, the
254 * PIC is being reset. Handle it gracefully here
255 */
248 atomic_inc(&ps->pit_timer.pending); 256 atomic_inc(&ps->pit_timer.pending);
257 else if (value > 0)
258 /* in this case, we had multiple outstanding pit interrupts
259 * that we needed to inject. Reinject
260 */
261 queue_work(ps->pit->wq, &ps->pit->expired);
249 ps->irq_ack = 1; 262 ps->irq_ack = 1;
250 raw_spin_unlock(&ps->inject_lock); 263 spin_unlock(&ps->inject_lock);
251} 264}
252 265
253void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 266void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -263,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
263 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 276 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
264} 277}
265 278
266static void destroy_pit_timer(struct kvm_timer *pt) 279static void destroy_pit_timer(struct kvm_pit *pit)
267{ 280{
268 pr_debug("execute del timer!\n"); 281 hrtimer_cancel(&pit->pit_state.pit_timer.timer);
269 hrtimer_cancel(&pt->timer); 282 cancel_work_sync(&pit->expired);
270} 283}
271 284
272static bool kpit_is_periodic(struct kvm_timer *ktimer) 285static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -280,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = {
280 .is_periodic = kpit_is_periodic, 293 .is_periodic = kpit_is_periodic,
281}; 294};
282 295
296static void pit_do_work(struct work_struct *work)
297{
298 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
299 struct kvm *kvm = pit->kvm;
300 struct kvm_vcpu *vcpu;
301 int i;
302 struct kvm_kpit_state *ps = &pit->pit_state;
303 int inject = 0;
304
305 /* Try to inject pending interrupts when
306 * last one has been acked.
307 */
308 spin_lock(&ps->inject_lock);
309 if (ps->irq_ack) {
310 ps->irq_ack = 0;
311 inject = 1;
312 }
313 spin_unlock(&ps->inject_lock);
314 if (inject) {
315 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
316 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
317
318 /*
319 * Provides NMI watchdog support via Virtual Wire mode.
320 * The route is: PIT -> PIC -> LVT0 in NMI mode.
321 *
322 * Note: Our Virtual Wire implementation is simplified, only
323 * propagating PIT interrupts to all VCPUs when they have set
324 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
325 * VCPU0, and only if its LVT0 is in EXTINT mode.
326 */
327 if (kvm->arch.vapics_in_nmi_mode > 0)
328 kvm_for_each_vcpu(i, vcpu, kvm)
329 kvm_apic_nmi_wd_deliver(vcpu);
330 }
331}
332
333static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
334{
335 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
336 struct kvm_pit *pt = ktimer->kvm->arch.vpit;
337
338 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
339 atomic_inc(&ktimer->pending);
340 queue_work(pt->wq, &pt->expired);
341 }
342
343 if (ktimer->t_ops->is_periodic(ktimer)) {
344 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
345 return HRTIMER_RESTART;
346 } else
347 return HRTIMER_NORESTART;
348}
349
283static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) 350static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
284{ 351{
285 struct kvm_timer *pt = &ps->pit_timer; 352 struct kvm_timer *pt = &ps->pit_timer;
@@ -291,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
291 358
292 /* TODO The new value only affected after the retriggered */ 359 /* TODO The new value only affected after the retriggered */
293 hrtimer_cancel(&pt->timer); 360 hrtimer_cancel(&pt->timer);
361 cancel_work_sync(&ps->pit->expired);
294 pt->period = interval; 362 pt->period = interval;
295 ps->is_periodic = is_period; 363 ps->is_periodic = is_period;
296 364
297 pt->timer.function = kvm_timer_fn; 365 pt->timer.function = pit_timer_fn;
298 pt->t_ops = &kpit_ops; 366 pt->t_ops = &kpit_ops;
299 pt->kvm = ps->pit->kvm; 367 pt->kvm = ps->pit->kvm;
300 pt->vcpu = pt->kvm->bsp_vcpu;
301 368
302 atomic_set(&pt->pending, 0); 369 atomic_set(&pt->pending, 0);
303 ps->irq_ack = 1; 370 ps->irq_ack = 1;
@@ -346,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
346 } 413 }
347 break; 414 break;
348 default: 415 default:
349 destroy_pit_timer(&ps->pit_timer); 416 destroy_pit_timer(kvm->arch.vpit);
350 } 417 }
351} 418}
352 419
@@ -625,7 +692,15 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
625 692
626 mutex_init(&pit->pit_state.lock); 693 mutex_init(&pit->pit_state.lock);
627 mutex_lock(&pit->pit_state.lock); 694 mutex_lock(&pit->pit_state.lock);
628 raw_spin_lock_init(&pit->pit_state.inject_lock); 695 spin_lock_init(&pit->pit_state.inject_lock);
696
697 pit->wq = create_singlethread_workqueue("kvm-pit-wq");
698 if (!pit->wq) {
699 mutex_unlock(&pit->pit_state.lock);
700 kfree(pit);
701 return NULL;
702 }
703 INIT_WORK(&pit->expired, pit_do_work);
629 704
630 kvm->arch.vpit = pit; 705 kvm->arch.vpit = pit;
631 pit->kvm = kvm; 706 pit->kvm = kvm;
@@ -677,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm)
677 struct hrtimer *timer; 752 struct hrtimer *timer;
678 753
679 if (kvm->arch.vpit) { 754 if (kvm->arch.vpit) {
755 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
756 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
757 &kvm->arch.vpit->speaker_dev);
680 kvm_unregister_irq_mask_notifier(kvm, 0, 758 kvm_unregister_irq_mask_notifier(kvm, 0,
681 &kvm->arch.vpit->mask_notifier); 759 &kvm->arch.vpit->mask_notifier);
682 kvm_unregister_irq_ack_notifier(kvm, 760 kvm_unregister_irq_ack_notifier(kvm,
@@ -684,54 +762,10 @@ void kvm_free_pit(struct kvm *kvm)
684 mutex_lock(&kvm->arch.vpit->pit_state.lock); 762 mutex_lock(&kvm->arch.vpit->pit_state.lock);
685 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 763 timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
686 hrtimer_cancel(timer); 764 hrtimer_cancel(timer);
765 cancel_work_sync(&kvm->arch.vpit->expired);
687 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); 766 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
688 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 767 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
768 destroy_workqueue(kvm->arch.vpit->wq);
689 kfree(kvm->arch.vpit); 769 kfree(kvm->arch.vpit);
690 } 770 }
691} 771}
692
693static void __inject_pit_timer_intr(struct kvm *kvm)
694{
695 struct kvm_vcpu *vcpu;
696 int i;
697
698 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
699 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
700
701 /*
702 * Provides NMI watchdog support via Virtual Wire mode.
703 * The route is: PIT -> PIC -> LVT0 in NMI mode.
704 *
705 * Note: Our Virtual Wire implementation is simplified, only
706 * propagating PIT interrupts to all VCPUs when they have set
707 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
708 * VCPU0, and only if its LVT0 is in EXTINT mode.
709 */
710 if (kvm->arch.vapics_in_nmi_mode > 0)
711 kvm_for_each_vcpu(i, vcpu, kvm)
712 kvm_apic_nmi_wd_deliver(vcpu);
713}
714
715void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
716{
717 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
718 struct kvm *kvm = vcpu->kvm;
719 struct kvm_kpit_state *ps;
720
721 if (pit) {
722 int inject = 0;
723 ps = &pit->pit_state;
724
725 /* Try to inject pending interrupts when
726 * last one has been acked.
727 */
728 raw_spin_lock(&ps->inject_lock);
729 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
730 ps->irq_ack = 0;
731 inject = 1;
732 }
733 raw_spin_unlock(&ps->inject_lock);
734 if (inject)
735 __inject_pit_timer_intr(kvm);
736 }
737}
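The ack-side accounting introduced above can be sketched in user space with C11 atomics. Everything here (the counter, the reinject flag) is a stand-in for the kernel objects; the point is only the decrement-and-check pattern: a negative result is a spurious ack and is undone, a still-positive result means another tick is owed and reinjection is requeued.

/*
 * User-space sketch (hypothetical names) of the PIT ack accounting above;
 * reinject_queued stands in for queue_work(pit->wq, &pit->expired).
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int pending = 2;		/* two timer ticks outstanding */
static bool reinject_queued;

static void pit_ack(void)
{
	/* atomic_fetch_sub returns the old value; subtract 1 to mimic
	 * the kernel's atomic_dec_return(). */
	int value = atomic_fetch_sub(&pending, 1) - 1;

	if (value < 0)
		atomic_fetch_add(&pending, 1);	/* spurious ack: undo */
	else if (value > 0)
		reinject_queued = true;		/* more ticks to reinject */
}

int main(void)
{
	pit_ack();	/* 2 -> 1: one tick still pending, reinject */
	printf("pending=%d reinject=%d\n", atomic_load(&pending), reinject_queued);
	pit_ack();	/* 1 -> 0: nothing left */
	pit_ack();	/* 0 -> -1: spurious, restored to 0 */
	printf("pending=%d\n", atomic_load(&pending));
	return 0;
}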
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 900d6b0ba7c2..46d08ca0b48f 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
27 u32 speaker_data_on; 27 u32 speaker_data_on;
28 struct mutex lock; 28 struct mutex lock;
29 struct kvm_pit *pit; 29 struct kvm_pit *pit;
30 raw_spinlock_t inject_lock; 30 spinlock_t inject_lock;
31 unsigned long irq_ack; 31 unsigned long irq_ack;
32 struct kvm_irq_ack_notifier irq_ack_notifier; 32 struct kvm_irq_ack_notifier irq_ack_notifier;
33}; 33};
@@ -40,6 +40,8 @@ struct kvm_pit {
40 struct kvm_kpit_state pit_state; 40 struct kvm_kpit_state pit_state;
41 int irq_source_id; 41 int irq_source_id;
42 struct kvm_irq_mask_notifier mask_notifier; 42 struct kvm_irq_mask_notifier mask_notifier;
43 struct workqueue_struct *wq;
44 struct work_struct expired;
43}; 45};
44 46
45#define KVM_PIT_BASE_ADDRESS 0x40 47#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 93825ff3338f..8d10c063d7f2 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard 4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation 5 * Copyright (c) 2007 Intel Corporation
6 * Copyright 2009 Red Hat, Inc. and/or its affilates.
6 * 7 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal 9 * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,8 @@
33#include <linux/kvm_host.h> 34#include <linux/kvm_host.h>
34#include "trace.h" 35#include "trace.h"
35 36
37static void pic_irq_request(struct kvm *kvm, int level);
38
36static void pic_lock(struct kvm_pic *s) 39static void pic_lock(struct kvm_pic *s)
37 __acquires(&s->lock) 40 __acquires(&s->lock)
38{ 41{
@@ -43,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s)
43 __releases(&s->lock) 46 __releases(&s->lock)
44{ 47{
45 bool wakeup = s->wakeup_needed; 48 bool wakeup = s->wakeup_needed;
46 struct kvm_vcpu *vcpu; 49 struct kvm_vcpu *vcpu, *found = NULL;
50 int i;
47 51
48 s->wakeup_needed = false; 52 s->wakeup_needed = false;
49 53
50 raw_spin_unlock(&s->lock); 54 raw_spin_unlock(&s->lock);
51 55
52 if (wakeup) { 56 if (wakeup) {
53 vcpu = s->kvm->bsp_vcpu; 57 kvm_for_each_vcpu(i, vcpu, s->kvm) {
54 if (vcpu) 58 if (kvm_apic_accept_pic_intr(vcpu)) {
55 kvm_vcpu_kick(vcpu); 59 found = vcpu;
60 break;
61 }
62 }
63
64 if (!found)
65 found = s->kvm->bsp_vcpu;
66
67 kvm_vcpu_kick(found);
56 } 68 }
57} 69}
58 70
@@ -173,10 +185,7 @@ static void pic_update_irq(struct kvm_pic *s)
173 pic_set_irq1(&s->pics[0], 2, 0); 185 pic_set_irq1(&s->pics[0], 2, 0);
174 } 186 }
175 irq = pic_get_irq(&s->pics[0]); 187 irq = pic_get_irq(&s->pics[0]);
176 if (irq >= 0) 188 pic_irq_request(s->kvm, irq >= 0);
177 s->irq_request(s->irq_request_opaque, 1);
178 else
179 s->irq_request(s->irq_request_opaque, 0);
180} 189}
181 190
182void kvm_pic_update_irq(struct kvm_pic *s) 191void kvm_pic_update_irq(struct kvm_pic *s)
@@ -261,8 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
261void kvm_pic_reset(struct kvm_kpic_state *s) 270void kvm_pic_reset(struct kvm_kpic_state *s)
262{ 271{
263 int irq; 272 int irq;
264 struct kvm *kvm = s->pics_state->irq_request_opaque; 273 struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
265 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
266 u8 irr = s->irr, isr = s->imr; 274 u8 irr = s->irr, isr = s->imr;
267 275
268 s->last_irr = 0; 276 s->last_irr = 0;
@@ -301,8 +309,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
301 /* 309 /*
302 * deassert a pending interrupt 310 * deassert a pending interrupt
303 */ 311 */
304 s->pics_state->irq_request(s->pics_state-> 312 pic_irq_request(s->pics_state->kvm, 0);
305 irq_request_opaque, 0);
306 s->init_state = 1; 313 s->init_state = 1;
307 s->init4 = val & 1; 314 s->init4 = val & 1;
308 if (val & 0x02) 315 if (val & 0x02)
@@ -356,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
356 } 363 }
357 } else 364 } else
358 switch (s->init_state) { 365 switch (s->init_state) {
359 case 0: /* normal mode */ 366 case 0: { /* normal mode */
367 u8 imr_diff = s->imr ^ val,
368 off = (s == &s->pics_state->pics[0]) ? 0 : 8;
360 s->imr = val; 369 s->imr = val;
370 for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
371 if (imr_diff & (1 << irq))
372 kvm_fire_mask_notifiers(
373 s->pics_state->kvm,
374 SELECT_PIC(irq + off),
375 irq + off,
376 !!(s->imr & (1 << irq)));
361 pic_update_irq(s->pics_state); 377 pic_update_irq(s->pics_state);
362 break; 378 break;
379 }
363 case 1: 380 case 1:
364 s->irq_base = val & 0xf8; 381 s->irq_base = val & 0xf8;
365 s->init_state = 2; 382 s->init_state = 2;
@@ -518,9 +535,8 @@ static int picdev_read(struct kvm_io_device *this,
518/* 535/*
519 * callback when PIC0 irq status changed 536 * callback when PIC0 irq status changed
520 */ 537 */
521static void pic_irq_request(void *opaque, int level) 538static void pic_irq_request(struct kvm *kvm, int level)
522{ 539{
523 struct kvm *kvm = opaque;
524 struct kvm_vcpu *vcpu = kvm->bsp_vcpu; 540 struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
525 struct kvm_pic *s = pic_irqchip(kvm); 541 struct kvm_pic *s = pic_irqchip(kvm);
526 int irq = pic_get_irq(&s->pics[0]); 542 int irq = pic_get_irq(&s->pics[0]);
@@ -549,8 +565,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
549 s->kvm = kvm; 565 s->kvm = kvm;
550 s->pics[0].elcr_mask = 0xf8; 566 s->pics[0].elcr_mask = 0xf8;
551 s->pics[1].elcr_mask = 0xde; 567 s->pics[1].elcr_mask = 0xde;
552 s->irq_request = pic_irq_request;
553 s->irq_request_opaque = kvm;
554 s->pics[0].pics_state = s; 568 s->pics[0].pics_state = s;
555 s->pics[1].pics_state = s; 569 s->pics[1].pics_state = s;
556 570
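A small stand-alone sketch of the IMR-diff loop added above: XOR the old and new interrupt mask registers and notify only the pins whose mask bit actually changed. fire_mask_notifier() is a made-up stand-in for kvm_fire_mask_notifiers().

#include <stdint.h>
#include <stdio.h>

#define PIC_PINS_PER_CHIP 8

static void fire_mask_notifier(int irq, int masked)
{
	printf("irq %d is now %s\n", irq, masked ? "masked" : "unmasked");
}

static void write_imr(uint8_t *imr, uint8_t val, int base_irq)
{
	uint8_t diff = *imr ^ val;	/* bits that changed state */
	int irq;

	*imr = val;
	for (irq = 0; irq < PIC_PINS_PER_CHIP; irq++)
		if (diff & (1 << irq))
			fire_mask_notifier(irq + base_irq, !!(val & (1 << irq)));
}

int main(void)
{
	uint8_t master_imr = 0xff;		/* all lines masked */

	write_imr(&master_imr, 0xfe, 0);	/* unmask IRQ0 only */
	write_imr(&master_imr, 0xfe, 0);	/* no change: no notification */
	return 0;
}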
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 96dfbb6ad2a9..2095a049835e 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * irq.c: API for in kernel interrupt controller 2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation. 3 * Copyright (c) 2007, Intel Corporation.
4 * Copyright 2009 Red Hat, Inc. and/or its affilates.
4 * 5 *
5 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -89,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
89void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 90void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
90{ 91{
91 kvm_inject_apic_timer_irqs(vcpu); 92 kvm_inject_apic_timer_irqs(vcpu);
92 kvm_inject_pit_timer_irqs(vcpu);
93 /* TODO: PIT, RTC etc. */ 93 /* TODO: PIT, RTC etc. */
94} 94}
95EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); 95EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index cd1f362f413d..ffed06871c5c 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -38,8 +38,6 @@
38struct kvm; 38struct kvm;
39struct kvm_vcpu; 39struct kvm_vcpu;
40 40
41typedef void irq_request_func(void *opaque, int level);
42
43struct kvm_kpic_state { 41struct kvm_kpic_state {
44 u8 last_irr; /* edge detection */ 42 u8 last_irr; /* edge detection */
45 u8 irr; /* interrupt request register */ 43 u8 irr; /* interrupt request register */
@@ -67,8 +65,6 @@ struct kvm_pic {
67 unsigned pending_acks; 65 unsigned pending_acks;
68 struct kvm *kvm; 66 struct kvm *kvm;
69 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 67 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
70 irq_request_func *irq_request;
71 void *irq_request_opaque;
72 int output; /* intr from master PIC */ 68 int output; /* intr from master PIC */
73 struct kvm_io_device dev; 69 struct kvm_io_device dev;
74 void (*ack_notifier)(void *opaque, int irq); 70 void (*ack_notifier)(void *opaque, int irq);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index cff851cf5322..6491ac8e755b 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
36 36
37static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) 37static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
38{ 38{
39 might_sleep(); /* on svm */
40
39 if (!test_bit(VCPU_EXREG_PDPTR, 41 if (!test_bit(VCPU_EXREG_PDPTR,
40 (unsigned long *)&vcpu->arch.regs_avail)) 42 (unsigned long *)&vcpu->arch.regs_avail))
41 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); 43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
@@ -69,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
69 return kvm_read_cr4_bits(vcpu, ~0UL); 71 return kvm_read_cr4_bits(vcpu, ~0UL);
70} 72}
71 73
74static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
75{
76 return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
77 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
78}
79
72#endif 80#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1eb7a4ae0c9c..77d8c0f4817d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,6 +5,7 @@
5 * Copyright (C) 2006 Qumranet, Inc. 5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell 6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel 7 * Copyright (C) 2007 Intel
8 * Copyright 2009 Red Hat, Inc. and/or its affilates.
8 * 9 *
9 * Authors: 10 * Authors:
10 * Dor Laor <dor.laor@qumranet.com> 11 * Dor Laor <dor.laor@qumranet.com>
@@ -328,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
328 "dest_mode 0x%x, short_hand 0x%x\n", 329 "dest_mode 0x%x, short_hand 0x%x\n",
329 target, source, dest, dest_mode, short_hand); 330 target, source, dest, dest_mode, short_hand);
330 331
331 ASSERT(!target); 332 ASSERT(target);
332 switch (short_hand) { 333 switch (short_hand) {
333 case APIC_DEST_NOSHORT: 334 case APIC_DEST_NOSHORT:
334 if (dest_mode == 0) 335 if (dest_mode == 0)
@@ -533,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
533 struct kvm_vcpu *vcpu = apic->vcpu; 534 struct kvm_vcpu *vcpu = apic->vcpu;
534 struct kvm_run *run = vcpu->run; 535 struct kvm_run *run = vcpu->run;
535 536
536 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); 537 kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
537 run->tpr_access.rip = kvm_rip_read(vcpu); 538 run->tpr_access.rip = kvm_rip_read(vcpu);
538 run->tpr_access.is_write = write; 539 run->tpr_access.is_write = write;
539} 540}
@@ -1106,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1106 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); 1107 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1107 int r = 0; 1108 int r = 0;
1108 1109
1109 if (kvm_vcpu_is_bsp(vcpu)) { 1110 if (!apic_hw_enabled(vcpu->arch.apic))
1110 if (!apic_hw_enabled(vcpu->arch.apic)) 1111 r = 1;
1111 r = 1; 1112 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1112 if ((lvt0 & APIC_LVT_MASKED) == 0 && 1113 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
1113 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 1114 r = 1;
1114 r = 1;
1115 }
1116 return r; 1115 return r;
1117} 1116}
1118 1117
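A toy model of the acceptance test after this change, assuming a simplified LVT0 encoding (delivery mode in bits 8-10, mask in bit 16): any vcpu, not just the BSP, now accepts a PIC interrupt when its local APIC is hardware-disabled or when LVT0 is unmasked and programmed for ExtINT. Names and constants below are local to the sketch.

#include <stdbool.h>
#include <stdio.h>

#define LVT_MASKED	(1u << 16)
#define MODE_EXTINT	0x7u

/* Hypothetical stand-in for kvm_apic_accept_pic_intr() after the change. */
static bool accepts_pic_intr(bool apic_hw_enabled, unsigned lvt0)
{
	if (!apic_hw_enabled)
		return true;
	return !(lvt0 & LVT_MASKED) && ((lvt0 >> 8) & 0x7) == MODE_EXTINT;
}

int main(void)
{
	printf("%d\n", accepts_pic_intr(true, MODE_EXTINT << 8));	/* 1 */
	printf("%d\n", accepts_pic_intr(true, LVT_MASKED));		/* 0 */
	printf("%d\n", accepts_pic_intr(false, LVT_MASKED));		/* 1 */
	return 0;
}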
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b1ed0a1a5913..0dcc95e09876 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,6 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates.
10 * 11 *
11 * Authors: 12 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -32,6 +33,7 @@
32#include <linux/compiler.h> 33#include <linux/compiler.h>
33#include <linux/srcu.h> 34#include <linux/srcu.h>
34#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/uaccess.h>
35 37
36#include <asm/page.h> 38#include <asm/page.h>
37#include <asm/cmpxchg.h> 39#include <asm/cmpxchg.h>
@@ -90,8 +92,6 @@ module_param(oos_shadow, bool, 0644);
90#define PT_FIRST_AVAIL_BITS_SHIFT 9 92#define PT_FIRST_AVAIL_BITS_SHIFT 9
91#define PT64_SECOND_AVAIL_BITS_SHIFT 52 93#define PT64_SECOND_AVAIL_BITS_SHIFT 52
92 94
93#define VALID_PAGE(x) ((x) != INVALID_PAGE)
94
95#define PT64_LEVEL_BITS 9 95#define PT64_LEVEL_BITS 9
96 96
97#define PT64_LEVEL_SHIFT(level) \ 97#define PT64_LEVEL_SHIFT(level) \
@@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator {
173 shadow_walk_okay(&(_walker)); \ 173 shadow_walk_okay(&(_walker)); \
174 shadow_walk_next(&(_walker))) 174 shadow_walk_next(&(_walker)))
175 175
176typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); 176typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
177 177
178static struct kmem_cache *pte_chain_cache; 178static struct kmem_cache *pte_chain_cache;
179static struct kmem_cache *rmap_desc_cache; 179static struct kmem_cache *rmap_desc_cache;
@@ -288,6 +288,35 @@ static void __set_spte(u64 *sptep, u64 spte)
288#endif 288#endif
289} 289}
290 290
291static u64 __xchg_spte(u64 *sptep, u64 new_spte)
292{
293#ifdef CONFIG_X86_64
294 return xchg(sptep, new_spte);
295#else
296 u64 old_spte;
297
298 do {
299 old_spte = *sptep;
300 } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
301
302 return old_spte;
303#endif
304}
305
306static void update_spte(u64 *sptep, u64 new_spte)
307{
308 u64 old_spte;
309
310 if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
311 !is_rmap_spte(*sptep))
312 __set_spte(sptep, new_spte);
313 else {
314 old_spte = __xchg_spte(sptep, new_spte);
315 if (old_spte & shadow_accessed_mask)
316 mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
317 }
318}
319
291static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 320static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
292 struct kmem_cache *base_cache, int min) 321 struct kmem_cache *base_cache, int min)
293{ 322{
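The 32-bit fallback of __xchg_spte() above emulates a 64-bit exchange with a compare-and-swap loop. A portable sketch of the same pattern, using C11 atomics in place of the kernel's cmpxchg64():

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t xchg_spte(_Atomic uint64_t *sptep, uint64_t new_spte)
{
	uint64_t old_spte = atomic_load(sptep);

	/* Retry until no other writer slipped in between load and swap;
	 * on failure old_spte is refreshed with the current value. */
	while (!atomic_compare_exchange_weak(sptep, &old_spte, new_spte))
		;
	return old_spte;
}

int main(void)
{
	_Atomic uint64_t spte = 0x1234;
	uint64_t old = xchg_spte(&spte, 0x5678);

	printf("old=%#llx new=%#llx\n",
	       (unsigned long long)old,
	       (unsigned long long)atomic_load(&spte));
	return 0;
}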
@@ -304,10 +333,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
304 return 0; 333 return 0;
305} 334}
306 335
307static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) 336static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
337 struct kmem_cache *cache)
308{ 338{
309 while (mc->nobjs) 339 while (mc->nobjs)
310 kfree(mc->objects[--mc->nobjs]); 340 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
311} 341}
312 342
313static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 343static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -355,10 +385,11 @@ out:
355 385
356static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 386static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
357{ 387{
358 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); 388 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
359 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); 389 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
360 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); 390 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
361 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); 391 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
392 mmu_page_header_cache);
362} 393}
363 394
364static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, 395static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -379,7 +410,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
379 410
380static void mmu_free_pte_chain(struct kvm_pte_chain *pc) 411static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
381{ 412{
382 kfree(pc); 413 kmem_cache_free(pte_chain_cache, pc);
383} 414}
384 415
385static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) 416static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -390,7 +421,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
390 421
391static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) 422static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
392{ 423{
393 kfree(rd); 424 kmem_cache_free(rmap_desc_cache, rd);
425}
426
427static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
428{
429 if (!sp->role.direct)
430 return sp->gfns[index];
431
432 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
433}
434
435static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
436{
437 if (sp->role.direct)
438 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
439 else
440 sp->gfns[index] = gfn;
394} 441}
395 442
396/* 443/*
@@ -403,8 +450,8 @@ static int *slot_largepage_idx(gfn_t gfn,
403{ 450{
404 unsigned long idx; 451 unsigned long idx;
405 452
406 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - 453 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
407 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); 454 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
408 return &slot->lpage_info[level - 2][idx].write_count; 455 return &slot->lpage_info[level - 2][idx].write_count;
409} 456}
410 457
@@ -414,9 +461,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
414 int *write_count; 461 int *write_count;
415 int i; 462 int i;
416 463
417 gfn = unalias_gfn(kvm, gfn); 464 slot = gfn_to_memslot(kvm, gfn);
418
419 slot = gfn_to_memslot_unaliased(kvm, gfn);
420 for (i = PT_DIRECTORY_LEVEL; 465 for (i = PT_DIRECTORY_LEVEL;
421 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 466 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
422 write_count = slot_largepage_idx(gfn, slot, i); 467 write_count = slot_largepage_idx(gfn, slot, i);
@@ -430,8 +475,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
430 int *write_count; 475 int *write_count;
431 int i; 476 int i;
432 477
433 gfn = unalias_gfn(kvm, gfn); 478 slot = gfn_to_memslot(kvm, gfn);
434 slot = gfn_to_memslot_unaliased(kvm, gfn);
435 for (i = PT_DIRECTORY_LEVEL; 479 for (i = PT_DIRECTORY_LEVEL;
436 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 480 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
437 write_count = slot_largepage_idx(gfn, slot, i); 481 write_count = slot_largepage_idx(gfn, slot, i);
@@ -447,8 +491,7 @@ static int has_wrprotected_page(struct kvm *kvm,
447 struct kvm_memory_slot *slot; 491 struct kvm_memory_slot *slot;
448 int *largepage_idx; 492 int *largepage_idx;
449 493
450 gfn = unalias_gfn(kvm, gfn); 494 slot = gfn_to_memslot(kvm, gfn);
451 slot = gfn_to_memslot_unaliased(kvm, gfn);
452 if (slot) { 495 if (slot) {
453 largepage_idx = slot_largepage_idx(gfn, slot, level); 496 largepage_idx = slot_largepage_idx(gfn, slot, level);
454 return *largepage_idx; 497 return *largepage_idx;
@@ -501,7 +544,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
501 544
502/* 545/*
503 * Take gfn and return the reverse mapping to it. 546 * Take gfn and return the reverse mapping to it.
504 * Note: gfn must be unaliased before this function get called
505 */ 547 */
506 548
507static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 549static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
@@ -513,8 +555,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
513 if (likely(level == PT_PAGE_TABLE_LEVEL)) 555 if (likely(level == PT_PAGE_TABLE_LEVEL))
514 return &slot->rmap[gfn - slot->base_gfn]; 556 return &slot->rmap[gfn - slot->base_gfn];
515 557
516 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - 558 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
517 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); 559 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
518 560
519 return &slot->lpage_info[level - 2][idx].rmap_pde; 561 return &slot->lpage_info[level - 2][idx].rmap_pde;
520} 562}
@@ -541,9 +583,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
541 583
542 if (!is_rmap_spte(*spte)) 584 if (!is_rmap_spte(*spte))
543 return count; 585 return count;
544 gfn = unalias_gfn(vcpu->kvm, gfn);
545 sp = page_header(__pa(spte)); 586 sp = page_header(__pa(spte));
546 sp->gfns[spte - sp->spt] = gfn; 587 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
547 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 588 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
548 if (!*rmapp) { 589 if (!*rmapp) {
549 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 590 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
@@ -600,19 +641,13 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
600 struct kvm_rmap_desc *desc; 641 struct kvm_rmap_desc *desc;
601 struct kvm_rmap_desc *prev_desc; 642 struct kvm_rmap_desc *prev_desc;
602 struct kvm_mmu_page *sp; 643 struct kvm_mmu_page *sp;
603 pfn_t pfn; 644 gfn_t gfn;
604 unsigned long *rmapp; 645 unsigned long *rmapp;
605 int i; 646 int i;
606 647
607 if (!is_rmap_spte(*spte))
608 return;
609 sp = page_header(__pa(spte)); 648 sp = page_header(__pa(spte));
610 pfn = spte_to_pfn(*spte); 649 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
611 if (*spte & shadow_accessed_mask) 650 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
612 kvm_set_pfn_accessed(pfn);
613 if (is_writable_pte(*spte))
614 kvm_set_pfn_dirty(pfn);
615 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
616 if (!*rmapp) { 651 if (!*rmapp) {
617 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 652 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
618 BUG(); 653 BUG();
@@ -644,6 +679,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
644 } 679 }
645} 680}
646 681
682static void set_spte_track_bits(u64 *sptep, u64 new_spte)
683{
684 pfn_t pfn;
685 u64 old_spte = *sptep;
686
687 if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
688 old_spte & shadow_accessed_mask) {
689 __set_spte(sptep, new_spte);
690 } else
691 old_spte = __xchg_spte(sptep, new_spte);
692
693 if (!is_rmap_spte(old_spte))
694 return;
695 pfn = spte_to_pfn(old_spte);
696 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
697 kvm_set_pfn_accessed(pfn);
698 if (is_writable_pte(old_spte))
699 kvm_set_pfn_dirty(pfn);
700}
701
702static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
703{
704 set_spte_track_bits(sptep, new_spte);
705 rmap_remove(kvm, sptep);
706}
707
647static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 708static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
648{ 709{
649 struct kvm_rmap_desc *desc; 710 struct kvm_rmap_desc *desc;
@@ -676,7 +737,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
676 u64 *spte; 737 u64 *spte;
677 int i, write_protected = 0; 738 int i, write_protected = 0;
678 739
679 gfn = unalias_gfn(kvm, gfn);
680 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); 740 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
681 741
682 spte = rmap_next(kvm, rmapp, NULL); 742 spte = rmap_next(kvm, rmapp, NULL);
@@ -685,7 +745,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
685 BUG_ON(!(*spte & PT_PRESENT_MASK)); 745 BUG_ON(!(*spte & PT_PRESENT_MASK));
686 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 746 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
687 if (is_writable_pte(*spte)) { 747 if (is_writable_pte(*spte)) {
688 __set_spte(spte, *spte & ~PT_WRITABLE_MASK); 748 update_spte(spte, *spte & ~PT_WRITABLE_MASK);
689 write_protected = 1; 749 write_protected = 1;
690 } 750 }
691 spte = rmap_next(kvm, rmapp, spte); 751 spte = rmap_next(kvm, rmapp, spte);
@@ -709,9 +769,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
709 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 769 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
710 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 770 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
711 if (is_writable_pte(*spte)) { 771 if (is_writable_pte(*spte)) {
712 rmap_remove(kvm, spte); 772 drop_spte(kvm, spte,
773 shadow_trap_nonpresent_pte);
713 --kvm->stat.lpages; 774 --kvm->stat.lpages;
714 __set_spte(spte, shadow_trap_nonpresent_pte);
715 spte = NULL; 775 spte = NULL;
716 write_protected = 1; 776 write_protected = 1;
717 } 777 }
@@ -731,8 +791,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
731 while ((spte = rmap_next(kvm, rmapp, NULL))) { 791 while ((spte = rmap_next(kvm, rmapp, NULL))) {
732 BUG_ON(!(*spte & PT_PRESENT_MASK)); 792 BUG_ON(!(*spte & PT_PRESENT_MASK));
733 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 793 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
734 rmap_remove(kvm, spte); 794 drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
735 __set_spte(spte, shadow_trap_nonpresent_pte);
736 need_tlb_flush = 1; 795 need_tlb_flush = 1;
737 } 796 }
738 return need_tlb_flush; 797 return need_tlb_flush;
@@ -754,8 +813,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
754 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 813 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
755 need_flush = 1; 814 need_flush = 1;
756 if (pte_write(*ptep)) { 815 if (pte_write(*ptep)) {
757 rmap_remove(kvm, spte); 816 drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
758 __set_spte(spte, shadow_trap_nonpresent_pte);
759 spte = rmap_next(kvm, rmapp, NULL); 817 spte = rmap_next(kvm, rmapp, NULL);
760 } else { 818 } else {
761 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); 819 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -763,9 +821,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
763 821
764 new_spte &= ~PT_WRITABLE_MASK; 822 new_spte &= ~PT_WRITABLE_MASK;
765 new_spte &= ~SPTE_HOST_WRITEABLE; 823 new_spte &= ~SPTE_HOST_WRITEABLE;
766 if (is_writable_pte(*spte)) 824 new_spte &= ~shadow_accessed_mask;
767 kvm_set_pfn_dirty(spte_to_pfn(*spte)); 825 set_spte_track_bits(spte, new_spte);
768 __set_spte(spte, new_spte);
769 spte = rmap_next(kvm, rmapp, spte); 826 spte = rmap_next(kvm, rmapp, spte);
770 } 827 }
771 } 828 }
@@ -799,8 +856,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
799 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 856 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
800 857
801 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 858 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
802 int idx = gfn_offset; 859 unsigned long idx;
803 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); 860 int sh;
861
862 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
863 idx = ((memslot->base_gfn+gfn_offset) >> sh) -
864 (memslot->base_gfn >> sh);
804 ret |= handler(kvm, 865 ret |= handler(kvm,
805 &memslot->lpage_info[j][idx].rmap_pde, 866 &memslot->lpage_info[j][idx].rmap_pde,
806 data); 867 data);
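The hunk above recomputes the lpage_info index from absolute gfns rather than from the offset into the slot. A small sketch contrasting the two formulas for a memslot whose base_gfn is not huge-page aligned (values invented, shift macro reproduced locally):

#include <stdint.h>
#include <stdio.h>

#define PT_PAGE_TABLE_LEVEL	1
#define PT_DIRECTORY_LEVEL	2
#define KVM_HPAGE_GFN_SHIFT(x)	(((x) - PT_PAGE_TABLE_LEVEL) * 9)

int main(void)
{
	uint64_t base_gfn = 0x1f8;	/* slot base, not 512-frame aligned */
	uint64_t gfn_offset = 0x10;
	int sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL);

	unsigned long old_idx = gfn_offset >> sh;	/* pre-fix formula */
	unsigned long new_idx = ((base_gfn + gfn_offset) >> sh) -
				(base_gfn >> sh);	/* formula above */

	/* gfn 0x208 lies in the slot's second 512-frame block, so only
	 * the new formula selects index 1. */
	printf("old_idx=%lu new_idx=%lu\n", old_idx, new_idx);
	return 0;
}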
@@ -863,7 +924,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
863 924
864 sp = page_header(__pa(spte)); 925 sp = page_header(__pa(spte));
865 926
866 gfn = unalias_gfn(vcpu->kvm, gfn);
867 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 927 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
868 928
869 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); 929 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -894,10 +954,12 @@ static int is_empty_shadow_page(u64 *spt)
894static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 954static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
895{ 955{
896 ASSERT(is_empty_shadow_page(sp->spt)); 956 ASSERT(is_empty_shadow_page(sp->spt));
957 hlist_del(&sp->hash_link);
897 list_del(&sp->link); 958 list_del(&sp->link);
898 __free_page(virt_to_page(sp->spt)); 959 __free_page(virt_to_page(sp->spt));
899 __free_page(virt_to_page(sp->gfns)); 960 if (!sp->role.direct)
900 kfree(sp); 961 __free_page(virt_to_page(sp->gfns));
962 kmem_cache_free(mmu_page_header_cache, sp);
901 ++kvm->arch.n_free_mmu_pages; 963 ++kvm->arch.n_free_mmu_pages;
902} 964}
903 965
@@ -907,13 +969,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
907} 969}
908 970
909static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 971static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
910 u64 *parent_pte) 972 u64 *parent_pte, int direct)
911{ 973{
912 struct kvm_mmu_page *sp; 974 struct kvm_mmu_page *sp;
913 975
914 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); 976 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
915 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 977 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
916 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 978 if (!direct)
979 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
980 PAGE_SIZE);
917 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 981 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
918 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 982 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
919 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 983 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
@@ -998,7 +1062,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
998 BUG(); 1062 BUG();
999} 1063}
1000 1064
1001
1002static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) 1065static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1003{ 1066{
1004 struct kvm_pte_chain *pte_chain; 1067 struct kvm_pte_chain *pte_chain;
@@ -1008,63 +1071,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1008 1071
1009 if (!sp->multimapped && sp->parent_pte) { 1072 if (!sp->multimapped && sp->parent_pte) {
1010 parent_sp = page_header(__pa(sp->parent_pte)); 1073 parent_sp = page_header(__pa(sp->parent_pte));
1011 fn(parent_sp); 1074 fn(parent_sp, sp->parent_pte);
1012 mmu_parent_walk(parent_sp, fn);
1013 return; 1075 return;
1014 } 1076 }
1077
1015 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1078 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1016 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { 1079 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1017 if (!pte_chain->parent_ptes[i]) 1080 u64 *spte = pte_chain->parent_ptes[i];
1081
1082 if (!spte)
1018 break; 1083 break;
1019 parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); 1084 parent_sp = page_header(__pa(spte));
1020 fn(parent_sp); 1085 fn(parent_sp, spte);
1021 mmu_parent_walk(parent_sp, fn);
1022 } 1086 }
1023} 1087}
1024 1088
1025static void kvm_mmu_update_unsync_bitmap(u64 *spte) 1089static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
1090static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1026{ 1091{
1027 unsigned int index; 1092 mmu_parent_walk(sp, mark_unsync);
1028 struct kvm_mmu_page *sp = page_header(__pa(spte));
1029
1030 index = spte - sp->spt;
1031 if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
1032 sp->unsync_children++;
1033 WARN_ON(!sp->unsync_children);
1034} 1093}
1035 1094
1036static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) 1095static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1037{ 1096{
1038 struct kvm_pte_chain *pte_chain; 1097 unsigned int index;
1039 struct hlist_node *node;
1040 int i;
1041 1098
1042 if (!sp->parent_pte) 1099 index = spte - sp->spt;
1100 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1043 return; 1101 return;
1044 1102 if (sp->unsync_children++)
1045 if (!sp->multimapped) {
1046 kvm_mmu_update_unsync_bitmap(sp->parent_pte);
1047 return; 1103 return;
1048 } 1104 kvm_mmu_mark_parents_unsync(sp);
1049
1050 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1051 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1052 if (!pte_chain->parent_ptes[i])
1053 break;
1054 kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
1055 }
1056}
1057
1058static int unsync_walk_fn(struct kvm_mmu_page *sp)
1059{
1060 kvm_mmu_update_parents_unsync(sp);
1061 return 1;
1062}
1063
1064static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1065{
1066 mmu_parent_walk(sp, unsync_walk_fn);
1067 kvm_mmu_update_parents_unsync(sp);
1068} 1105}
1069 1106
1070static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, 1107static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
@@ -1077,7 +1114,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1077} 1114}
1078 1115
1079static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1116static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1080 struct kvm_mmu_page *sp) 1117 struct kvm_mmu_page *sp, bool clear_unsync)
1081{ 1118{
1082 return 1; 1119 return 1;
1083} 1120}
@@ -1123,35 +1160,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1123 int i, ret, nr_unsync_leaf = 0; 1160 int i, ret, nr_unsync_leaf = 0;
1124 1161
1125 for_each_unsync_children(sp->unsync_child_bitmap, i) { 1162 for_each_unsync_children(sp->unsync_child_bitmap, i) {
1163 struct kvm_mmu_page *child;
1126 u64 ent = sp->spt[i]; 1164 u64 ent = sp->spt[i];
1127 1165
1128 if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { 1166 if (!is_shadow_present_pte(ent) || is_large_pte(ent))
1129 struct kvm_mmu_page *child; 1167 goto clear_child_bitmap;
1130 child = page_header(ent & PT64_BASE_ADDR_MASK); 1168
1131 1169 child = page_header(ent & PT64_BASE_ADDR_MASK);
1132 if (child->unsync_children) { 1170
1133 if (mmu_pages_add(pvec, child, i)) 1171 if (child->unsync_children) {
1134 return -ENOSPC; 1172 if (mmu_pages_add(pvec, child, i))
1135 1173 return -ENOSPC;
1136 ret = __mmu_unsync_walk(child, pvec); 1174
1137 if (!ret) 1175 ret = __mmu_unsync_walk(child, pvec);
1138 __clear_bit(i, sp->unsync_child_bitmap); 1176 if (!ret)
1139 else if (ret > 0) 1177 goto clear_child_bitmap;
1140 nr_unsync_leaf += ret; 1178 else if (ret > 0)
1141 else 1179 nr_unsync_leaf += ret;
1142 return ret; 1180 else
1143 } 1181 return ret;
1182 } else if (child->unsync) {
1183 nr_unsync_leaf++;
1184 if (mmu_pages_add(pvec, child, i))
1185 return -ENOSPC;
1186 } else
1187 goto clear_child_bitmap;
1144 1188
1145 if (child->unsync) { 1189 continue;
1146 nr_unsync_leaf++; 1190
1147 if (mmu_pages_add(pvec, child, i)) 1191clear_child_bitmap:
1148 return -ENOSPC; 1192 __clear_bit(i, sp->unsync_child_bitmap);
1149 } 1193 sp->unsync_children--;
1150 } 1194 WARN_ON((int)sp->unsync_children < 0);
1151 } 1195 }
1152 1196
1153 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
1154 sp->unsync_children = 0;
1155 1197
1156 return nr_unsync_leaf; 1198 return nr_unsync_leaf;
1157} 1199}
@@ -1166,26 +1208,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1166 return __mmu_unsync_walk(sp, pvec); 1208 return __mmu_unsync_walk(sp, pvec);
1167} 1209}
1168 1210
1169static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1170{
1171 unsigned index;
1172 struct hlist_head *bucket;
1173 struct kvm_mmu_page *sp;
1174 struct hlist_node *node;
1175
1176 pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1177 index = kvm_page_table_hashfn(gfn);
1178 bucket = &kvm->arch.mmu_page_hash[index];
1179 hlist_for_each_entry(sp, node, bucket, hash_link)
1180 if (sp->gfn == gfn && !sp->role.direct
1181 && !sp->role.invalid) {
1182 pgprintk("%s: found role %x\n",
1183 __func__, sp->role.word);
1184 return sp;
1185 }
1186 return NULL;
1187}
1188
1189static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1211static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1190{ 1212{
1191 WARN_ON(!sp->unsync); 1213 WARN_ON(!sp->unsync);
@@ -1194,20 +1216,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1194 --kvm->stat.mmu_unsync; 1216 --kvm->stat.mmu_unsync;
1195} 1217}
1196 1218
1197static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); 1219static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1220 struct list_head *invalid_list);
1221static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1222 struct list_head *invalid_list);
1223
1224#define for_each_gfn_sp(kvm, sp, gfn, pos) \
1225 hlist_for_each_entry(sp, pos, \
1226 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1227 if ((sp)->gfn != (gfn)) {} else
1228
1229#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
1230 hlist_for_each_entry(sp, pos, \
1231 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
1232 if ((sp)->gfn != (gfn) || (sp)->role.direct || \
1233 (sp)->role.invalid) {} else
1198 1234
1199static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1235/* @sp->gfn should be write-protected at the call site */
1236static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1237 struct list_head *invalid_list, bool clear_unsync)
1200{ 1238{
1201 if (sp->role.cr4_pae != !!is_pae(vcpu)) { 1239 if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1202 kvm_mmu_zap_page(vcpu->kvm, sp); 1240 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1203 return 1; 1241 return 1;
1204 } 1242 }
1205 1243
1206 if (rmap_write_protect(vcpu->kvm, sp->gfn)) 1244 if (clear_unsync)
1207 kvm_flush_remote_tlbs(vcpu->kvm); 1245 kvm_unlink_unsync_page(vcpu->kvm, sp);
1208 kvm_unlink_unsync_page(vcpu->kvm, sp); 1246
1209 if (vcpu->arch.mmu.sync_page(vcpu, sp)) { 1247 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
1210 kvm_mmu_zap_page(vcpu->kvm, sp); 1248 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1211 return 1; 1249 return 1;
1212 } 1250 }
1213 1251
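The new for_each_gfn_sp()/for_each_gfn_indirect_valid_sp() macros end in "if (...) {} else" so that the caller's loop body attaches as the else-branch and filtered entries are skipped without an extra brace level. A toy version of the same trick on a plain singly linked list (all types here are local to the sketch):

#include <stdio.h>

struct node {
	int gfn;
	struct node *next;
};

/* Iterate the list but hand only matching nodes to the loop body. */
#define for_each_matching(pos, head, want_gfn)			\
	for ((pos) = (head); (pos); (pos) = (pos)->next)	\
		if ((pos)->gfn != (want_gfn)) {} else

int main(void)
{
	struct node c = { 7, NULL }, b = { 5, &c }, a = { 7, &b };
	struct node *pos;

	for_each_matching(pos, &a, 7)
		printf("found node with gfn %d\n", pos->gfn);
	return 0;
}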
@@ -1215,6 +1253,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1215 return 0; 1253 return 0;
1216} 1254}
1217 1255
1256static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1257 struct kvm_mmu_page *sp)
1258{
1259 LIST_HEAD(invalid_list);
1260 int ret;
1261
1262 ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1263 if (ret)
1264 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1265
1266 return ret;
1267}
1268
1269static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1270 struct list_head *invalid_list)
1271{
1272 return __kvm_sync_page(vcpu, sp, invalid_list, true);
1273}
1274
1275/* @gfn should be write-protected at the call site */
1276static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1277{
1278 struct kvm_mmu_page *s;
1279 struct hlist_node *node;
1280 LIST_HEAD(invalid_list);
1281 bool flush = false;
1282
1283 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1284 if (!s->unsync)
1285 continue;
1286
1287 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1288 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1289 (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
1290 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1291 continue;
1292 }
1293 kvm_unlink_unsync_page(vcpu->kvm, s);
1294 flush = true;
1295 }
1296
1297 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1298 if (flush)
1299 kvm_mmu_flush_tlb(vcpu);
1300}
1301
1218struct mmu_page_path { 1302struct mmu_page_path {
1219 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; 1303 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1220 unsigned int idx[PT64_ROOT_LEVEL-1]; 1304 unsigned int idx[PT64_ROOT_LEVEL-1];
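kvm_mmu_prepare_zap_page()/kvm_mmu_commit_zap_page() split zapping into two phases: unlink pages onto a caller-provided invalid_list, then flush TLBs once and free the whole batch. A user-space sketch of that shape (the list handling and the "flush" are stand-ins, not the kernel API); the point of the split is that each prepare call stays cheap while the expensive remote flush happens once per commit.

#include <stdio.h>
#include <stdlib.h>

struct page {
	int id;
	struct page *next;
};

static void prepare_zap(struct page *p, struct page **invalid_list)
{
	/* Unlink from live structures (elided) and park on the local list. */
	p->next = *invalid_list;
	*invalid_list = p;
}

static void commit_zap(struct page **invalid_list)
{
	if (!*invalid_list)
		return;
	printf("flush remote TLBs once for the whole batch\n");
	while (*invalid_list) {
		struct page *p = *invalid_list;

		*invalid_list = p->next;
		printf("freeing page %d\n", p->id);
		free(p);
	}
}

int main(void)
{
	struct page *invalid_list = NULL;

	for (int i = 0; i < 3; i++) {
		struct page *p = malloc(sizeof(*p));

		if (!p)
			return 1;
		p->id = i;
		prepare_zap(p, &invalid_list);
	}
	commit_zap(&invalid_list);
	return 0;
}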
@@ -1281,6 +1365,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1281 struct kvm_mmu_page *sp; 1365 struct kvm_mmu_page *sp;
1282 struct mmu_page_path parents; 1366 struct mmu_page_path parents;
1283 struct kvm_mmu_pages pages; 1367 struct kvm_mmu_pages pages;
1368 LIST_HEAD(invalid_list);
1284 1369
1285 kvm_mmu_pages_init(parent, &parents, &pages); 1370 kvm_mmu_pages_init(parent, &parents, &pages);
1286 while (mmu_unsync_walk(parent, &pages)) { 1371 while (mmu_unsync_walk(parent, &pages)) {
@@ -1293,9 +1378,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1293 kvm_flush_remote_tlbs(vcpu->kvm); 1378 kvm_flush_remote_tlbs(vcpu->kvm);
1294 1379
1295 for_each_sp(pages, sp, parents, i) { 1380 for_each_sp(pages, sp, parents, i) {
1296 kvm_sync_page(vcpu, sp); 1381 kvm_sync_page(vcpu, sp, &invalid_list);
1297 mmu_pages_clear_parents(&parents); 1382 mmu_pages_clear_parents(&parents);
1298 } 1383 }
1384 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1299 cond_resched_lock(&vcpu->kvm->mmu_lock); 1385 cond_resched_lock(&vcpu->kvm->mmu_lock);
1300 kvm_mmu_pages_init(parent, &parents, &pages); 1386 kvm_mmu_pages_init(parent, &parents, &pages);
1301 } 1387 }
@@ -1310,11 +1396,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1310 u64 *parent_pte) 1396 u64 *parent_pte)
1311{ 1397{
1312 union kvm_mmu_page_role role; 1398 union kvm_mmu_page_role role;
1313 unsigned index;
1314 unsigned quadrant; 1399 unsigned quadrant;
1315 struct hlist_head *bucket;
1316 struct kvm_mmu_page *sp; 1400 struct kvm_mmu_page *sp;
1317 struct hlist_node *node, *tmp; 1401 struct hlist_node *node;
1402 bool need_sync = false;
1318 1403
1319 role = vcpu->arch.mmu.base_role; 1404 role = vcpu->arch.mmu.base_role;
1320 role.level = level; 1405 role.level = level;
@@ -1322,40 +1407,45 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1322 if (role.direct) 1407 if (role.direct)
1323 role.cr4_pae = 0; 1408 role.cr4_pae = 0;
1324 role.access = access; 1409 role.access = access;
1325 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1410 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1326 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1411 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1327 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1412 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1328 role.quadrant = quadrant; 1413 role.quadrant = quadrant;
1329 } 1414 }
1330 index = kvm_page_table_hashfn(gfn); 1415 for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1331 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1416 if (!need_sync && sp->unsync)
1332 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) 1417 need_sync = true;
1333 if (sp->gfn == gfn) {
1334 if (sp->unsync)
1335 if (kvm_sync_page(vcpu, sp))
1336 continue;
1337 1418
1338 if (sp->role.word != role.word) 1419 if (sp->role.word != role.word)
1339 continue; 1420 continue;
1340 1421
1341 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1422 if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
1342 if (sp->unsync_children) { 1423 break;
1343 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1424
1344 kvm_mmu_mark_parents_unsync(sp); 1425 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1345 } 1426 if (sp->unsync_children) {
1346 trace_kvm_mmu_get_page(sp, false); 1427 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1347 return sp; 1428 kvm_mmu_mark_parents_unsync(sp);
1348 } 1429 } else if (sp->unsync)
1430 kvm_mmu_mark_parents_unsync(sp);
1431
1432 trace_kvm_mmu_get_page(sp, false);
1433 return sp;
1434 }
1349 ++vcpu->kvm->stat.mmu_cache_miss; 1435 ++vcpu->kvm->stat.mmu_cache_miss;
1350 sp = kvm_mmu_alloc_page(vcpu, parent_pte); 1436 sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1351 if (!sp) 1437 if (!sp)
1352 return sp; 1438 return sp;
1353 sp->gfn = gfn; 1439 sp->gfn = gfn;
1354 sp->role = role; 1440 sp->role = role;
1355 hlist_add_head(&sp->hash_link, bucket); 1441 hlist_add_head(&sp->hash_link,
1442 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1356 if (!direct) { 1443 if (!direct) {
1357 if (rmap_write_protect(vcpu->kvm, gfn)) 1444 if (rmap_write_protect(vcpu->kvm, gfn))
1358 kvm_flush_remote_tlbs(vcpu->kvm); 1445 kvm_flush_remote_tlbs(vcpu->kvm);
1446 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
1447 kvm_sync_pages(vcpu, gfn);
1448
1359 account_shadowed(vcpu->kvm, gfn); 1449 account_shadowed(vcpu->kvm, gfn);
1360 } 1450 }
1361 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1451 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1402,6 +1492,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1402 --iterator->level; 1492 --iterator->level;
1403} 1493}
1404 1494
1495static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1496{
1497 u64 spte;
1498
1499 spte = __pa(sp->spt)
1500 | PT_PRESENT_MASK | PT_ACCESSED_MASK
1501 | PT_WRITABLE_MASK | PT_USER_MASK;
1502 __set_spte(sptep, spte);
1503}
1504
1505static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1506{
1507 if (is_large_pte(*sptep)) {
1508 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1509 kvm_flush_remote_tlbs(vcpu->kvm);
1510 }
1511}
1512
1513static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1514 unsigned direct_access)
1515{
1516 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
1517 struct kvm_mmu_page *child;
1518
1519 /*
1520 * For the direct sp, if the guest pte's dirty bit
1521 * changed from clean to dirty, it will corrupt the
1522 * sp's access: allow writable in the read-only sp,
1523 * so we should update the spte at this point to get
1524 * a new sp with the correct access.
1525 */
1526 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
1527 if (child->role.access == direct_access)
1528 return;
1529
1530 mmu_page_remove_parent_pte(child, sptep);
1531 __set_spte(sptep, shadow_trap_nonpresent_pte);
1532 kvm_flush_remote_tlbs(vcpu->kvm);
1533 }
1534}
1535
1405static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1536static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1406 struct kvm_mmu_page *sp) 1537 struct kvm_mmu_page *sp)
1407{ 1538{
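link_shadow_page() above composes a non-leaf spte from the child table's physical address plus the present, accessed, writable and user bits. A small stand-alone illustration of that composition; the bit positions used are the conventional x86 page-table ones (an assumption here, the kernel takes its PT_*_MASK values from its own headers).

/* Sketch of composing a non-leaf shadow PTE as in link_shadow_page().
 * Bit positions follow the usual x86 page-table layout; treat them as
 * illustrative, the kernel uses its own PT_*_MASK definitions. */
#include <stdint.h>
#include <stdio.h>

#define PT_PRESENT_MASK   (1ULL << 0)
#define PT_WRITABLE_MASK  (1ULL << 1)
#define PT_USER_MASK      (1ULL << 2)
#define PT_ACCESSED_MASK  (1ULL << 5)

static uint64_t make_table_spte(uint64_t child_table_pa)
{
	/* physical address of the child page table plus permission bits */
	return child_table_pa | PT_PRESENT_MASK | PT_ACCESSED_MASK
			      | PT_WRITABLE_MASK | PT_USER_MASK;
}

int main(void)
{
	printf("spte = %#llx\n",
	       (unsigned long long)make_table_spte(0x12345000ULL));
	return 0;
}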
@@ -1422,7 +1553,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1422 } else { 1553 } else {
1423 if (is_large_pte(ent)) 1554 if (is_large_pte(ent))
1424 --kvm->stat.lpages; 1555 --kvm->stat.lpages;
1425 rmap_remove(kvm, &pt[i]); 1556 drop_spte(kvm, &pt[i],
1557 shadow_trap_nonpresent_pte);
1426 } 1558 }
1427 } 1559 }
1428 pt[i] = shadow_trap_nonpresent_pte; 1560 pt[i] = shadow_trap_nonpresent_pte;
@@ -1464,7 +1596,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1464} 1596}
1465 1597
1466static int mmu_zap_unsync_children(struct kvm *kvm, 1598static int mmu_zap_unsync_children(struct kvm *kvm,
1467 struct kvm_mmu_page *parent) 1599 struct kvm_mmu_page *parent,
1600 struct list_head *invalid_list)
1468{ 1601{
1469 int i, zapped = 0; 1602 int i, zapped = 0;
1470 struct mmu_page_path parents; 1603 struct mmu_page_path parents;
@@ -1478,7 +1611,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1478 struct kvm_mmu_page *sp; 1611 struct kvm_mmu_page *sp;
1479 1612
1480 for_each_sp(pages, sp, parents, i) { 1613 for_each_sp(pages, sp, parents, i) {
1481 kvm_mmu_zap_page(kvm, sp); 1614 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1482 mmu_pages_clear_parents(&parents); 1615 mmu_pages_clear_parents(&parents);
1483 zapped++; 1616 zapped++;
1484 } 1617 }
@@ -1488,32 +1621,52 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1488 return zapped; 1621 return zapped;
1489} 1622}
1490 1623
1491static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1624static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1625 struct list_head *invalid_list)
1492{ 1626{
1493 int ret; 1627 int ret;
1494 1628
1495 trace_kvm_mmu_zap_page(sp); 1629 trace_kvm_mmu_prepare_zap_page(sp);
1496 ++kvm->stat.mmu_shadow_zapped; 1630 ++kvm->stat.mmu_shadow_zapped;
1497 ret = mmu_zap_unsync_children(kvm, sp); 1631 ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1498 kvm_mmu_page_unlink_children(kvm, sp); 1632 kvm_mmu_page_unlink_children(kvm, sp);
1499 kvm_mmu_unlink_parents(kvm, sp); 1633 kvm_mmu_unlink_parents(kvm, sp);
1500 kvm_flush_remote_tlbs(kvm);
1501 if (!sp->role.invalid && !sp->role.direct) 1634 if (!sp->role.invalid && !sp->role.direct)
1502 unaccount_shadowed(kvm, sp->gfn); 1635 unaccount_shadowed(kvm, sp->gfn);
1503 if (sp->unsync) 1636 if (sp->unsync)
1504 kvm_unlink_unsync_page(kvm, sp); 1637 kvm_unlink_unsync_page(kvm, sp);
1505 if (!sp->root_count) { 1638 if (!sp->root_count) {
1506 hlist_del(&sp->hash_link); 1639 /* Count self */
1507 kvm_mmu_free_page(kvm, sp); 1640 ret++;
1641 list_move(&sp->link, invalid_list);
1508 } else { 1642 } else {
1509 sp->role.invalid = 1;
1510 list_move(&sp->link, &kvm->arch.active_mmu_pages); 1643 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1511 kvm_reload_remote_mmus(kvm); 1644 kvm_reload_remote_mmus(kvm);
1512 } 1645 }
1646
1647 sp->role.invalid = 1;
1513 kvm_mmu_reset_last_pte_updated(kvm); 1648 kvm_mmu_reset_last_pte_updated(kvm);
1514 return ret; 1649 return ret;
1515} 1650}
1516 1651
1652static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1653 struct list_head *invalid_list)
1654{
1655 struct kvm_mmu_page *sp;
1656
1657 if (list_empty(invalid_list))
1658 return;
1659
1660 kvm_flush_remote_tlbs(kvm);
1661
1662 do {
1663 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1664 WARN_ON(!sp->role.invalid || sp->root_count);
1665 kvm_mmu_free_page(kvm, sp);
1666 } while (!list_empty(invalid_list));
1667
1668}
1669
1517/* 1670/*
1518 * Changing the number of mmu pages allocated to the vm 1671 * Changing the number of mmu pages allocated to the vm
1519 * Note: if kvm_nr_mmu_pages is too small, you will get dead lock 1672 * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
@@ -1521,6 +1674,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1521void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1674void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1522{ 1675{
1523 int used_pages; 1676 int used_pages;
1677 LIST_HEAD(invalid_list);
1524 1678
1525 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; 1679 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1526 used_pages = max(0, used_pages); 1680 used_pages = max(0, used_pages);
@@ -1538,9 +1692,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1538 1692
1539 page = container_of(kvm->arch.active_mmu_pages.prev, 1693 page = container_of(kvm->arch.active_mmu_pages.prev,
1540 struct kvm_mmu_page, link); 1694 struct kvm_mmu_page, link);
1541 used_pages -= kvm_mmu_zap_page(kvm, page); 1695 used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
1542 used_pages--; 1696 &invalid_list);
1543 } 1697 }
1698 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1544 kvm_nr_mmu_pages = used_pages; 1699 kvm_nr_mmu_pages = used_pages;
1545 kvm->arch.n_free_mmu_pages = 0; 1700 kvm->arch.n_free_mmu_pages = 0;
1546 } 1701 }
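The kvm_mmu_prepare_zap_page()/kvm_mmu_commit_zap_page() split above (and its use in kvm_mmu_change_mmu_pages()) lets callers queue pages on a local invalid_list and pay for one kvm_flush_remote_tlbs() for the whole batch before freeing, rather than flushing once per zapped page. A generic, self-contained sketch of that prepare/commit pattern, with invented names and types:

/* Sketch: batch teardown behind one expensive flush, as in the
 * prepare_zap/commit_zap split. Names and types are illustrative. */
#include <stdio.h>
#include <stdlib.h>

struct page {
	int id;
	struct page *next;
};

/* phase 1: unlink the page and queue it on a caller-provided list */
static void prepare_zap(struct page *p, struct page **invalid_list)
{
	p->next = *invalid_list;
	*invalid_list = p;
}

/* phase 2: one flush for the whole batch, then free everything */
static void commit_zap(struct page **invalid_list)
{
	struct page *p = *invalid_list;

	if (!p)
		return;
	printf("remote TLB flush (once for the whole batch)\n");
	while (p) {
		struct page *next = p->next;
		printf("free page %d\n", p->id);
		free(p);
		p = next;
	}
	*invalid_list = NULL;
}

int main(void)
{
	struct page *invalid_list = NULL;

	for (int i = 0; i < 3; i++) {
		struct page *p = malloc(sizeof(*p));
		if (!p)
			break;
		p->id = i;
		prepare_zap(p, &invalid_list);
	}
	commit_zap(&invalid_list);
	return 0;
}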
@@ -1553,47 +1708,36 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1553 1708
1554static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 1709static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1555{ 1710{
1556 unsigned index;
1557 struct hlist_head *bucket;
1558 struct kvm_mmu_page *sp; 1711 struct kvm_mmu_page *sp;
1559 struct hlist_node *node, *n; 1712 struct hlist_node *node;
1713 LIST_HEAD(invalid_list);
1560 int r; 1714 int r;
1561 1715
1562 pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1716 pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
1563 r = 0; 1717 r = 0;
1564 index = kvm_page_table_hashfn(gfn); 1718
1565 bucket = &kvm->arch.mmu_page_hash[index]; 1719 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1566restart: 1720 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1567 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) 1721 sp->role.word);
1568 if (sp->gfn == gfn && !sp->role.direct) { 1722 r = 1;
1569 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1723 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1570 sp->role.word); 1724 }
1571 r = 1; 1725 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1572 if (kvm_mmu_zap_page(kvm, sp))
1573 goto restart;
1574 }
1575 return r; 1726 return r;
1576} 1727}
1577 1728
1578static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) 1729static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1579{ 1730{
1580 unsigned index;
1581 struct hlist_head *bucket;
1582 struct kvm_mmu_page *sp; 1731 struct kvm_mmu_page *sp;
1583 struct hlist_node *node, *nn; 1732 struct hlist_node *node;
1733 LIST_HEAD(invalid_list);
1584 1734
1585 index = kvm_page_table_hashfn(gfn); 1735 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1586 bucket = &kvm->arch.mmu_page_hash[index]; 1736 pgprintk("%s: zap %lx %x\n",
1587restart: 1737 __func__, gfn, sp->role.word);
1588 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { 1738 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1589 if (sp->gfn == gfn && !sp->role.direct
1590 && !sp->role.invalid) {
1591 pgprintk("%s: zap %lx %x\n",
1592 __func__, gfn, sp->role.word);
1593 if (kvm_mmu_zap_page(kvm, sp))
1594 goto restart;
1595 }
1596 } 1739 }
1740 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1597} 1741}
1598 1742
1599static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) 1743static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
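With zapping split into prepare/commit, kvm_mmu_unprotect_page() and mmu_unshadow() above lose their restart: labels: a zapped page is merely moved onto invalid_list, so the per-gfn walk never has to start over. A minimal sketch of that iterate-and-defer-removal idea on a plain singly linked list (illustrative types only):

/* Sketch: walk a hash chain and defer removals instead of restarting.
 * Simplified stand-in for the for_each_gfn_indirect_valid_sp() loops. */
#include <stdbool.h>
#include <stdint.h>

struct sp {
	uint64_t gfn;
	bool direct;
	bool invalid;
	struct sp *next;
};

/* move every matching page from *bucket onto *invalid_list */
void unprotect_gfn(struct sp **bucket, uint64_t gfn, struct sp **invalid_list)
{
	struct sp **pp = bucket;

	while (*pp) {
		struct sp *s = *pp;

		if (s->gfn == gfn && !s->direct && !s->invalid) {
			*pp = s->next;            /* unlink from the bucket ... */
			s->invalid = true;
			s->next = *invalid_list;  /* ... and queue for commit   */
			*invalid_list = s;
			continue;                 /* pp already points at next  */
		}
		pp = &s->next;
	}
}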
@@ -1723,47 +1867,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1723} 1867}
1724EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); 1868EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1725 1869
1726static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1870static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1727{ 1871{
1728 unsigned index;
1729 struct hlist_head *bucket;
1730 struct kvm_mmu_page *s;
1731 struct hlist_node *node, *n;
1732
1733 index = kvm_page_table_hashfn(sp->gfn);
1734 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1735 /* don't unsync if pagetable is shadowed with multiple roles */
1736 hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
1737 if (s->gfn != sp->gfn || s->role.direct)
1738 continue;
1739 if (s->role.word != sp->role.word)
1740 return 1;
1741 }
1742 trace_kvm_mmu_unsync_page(sp); 1872 trace_kvm_mmu_unsync_page(sp);
1743 ++vcpu->kvm->stat.mmu_unsync; 1873 ++vcpu->kvm->stat.mmu_unsync;
1744 sp->unsync = 1; 1874 sp->unsync = 1;
1745 1875
1746 kvm_mmu_mark_parents_unsync(sp); 1876 kvm_mmu_mark_parents_unsync(sp);
1747
1748 mmu_convert_notrap(sp); 1877 mmu_convert_notrap(sp);
1749 return 0; 1878}
1879
1880static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1881{
1882 struct kvm_mmu_page *s;
1883 struct hlist_node *node;
1884
1885 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1886 if (s->unsync)
1887 continue;
1888 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1889 __kvm_unsync_page(vcpu, s);
1890 }
1750} 1891}
1751 1892
1752static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, 1893static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1753 bool can_unsync) 1894 bool can_unsync)
1754{ 1895{
1755 struct kvm_mmu_page *shadow; 1896 struct kvm_mmu_page *s;
1897 struct hlist_node *node;
1898 bool need_unsync = false;
1756 1899
1757 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1900 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1758 if (shadow) { 1901 if (!can_unsync)
1759 if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
1760 return 1; 1902 return 1;
1761 if (shadow->unsync) 1903
1762 return 0; 1904 if (s->role.level != PT_PAGE_TABLE_LEVEL)
1763 if (can_unsync && oos_shadow) 1905 return 1;
1764 return kvm_unsync_page(vcpu, shadow); 1906
1765 return 1; 1907 if (!need_unsync && !s->unsync) {
1908 if (!oos_shadow)
1909 return 1;
1910 need_unsync = true;
1911 }
1766 } 1912 }
1913 if (need_unsync)
1914 kvm_unsync_pages(vcpu, gfn);
1767 return 0; 1915 return 0;
1768} 1916}
1769 1917
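The reworked mmu_need_write_protect() above makes one pass over every valid indirect shadow page for the gfn: any non-leaf page (or !can_unsync, or oos_shadow disabled) forces write protection, and otherwise the still-synced pages are unsynced together via kvm_unsync_pages(). A simplified sketch of that scan-then-decide logic, with stand-in types:

/* Sketch of the scan-then-decide logic in mmu_need_write_protect().
 * Types and the 'pages' array are stand-ins, not kernel structures. */
#include <stdbool.h>
#include <stddef.h>

struct shadow_page {
	int level;            /* 1 == last-level page table */
	bool unsync;
};

/* returns true if the gfn must stay write protected */
bool need_write_protect(struct shadow_page *pages, size_t n,
			bool can_unsync, bool oos_shadow)
{
	bool need_unsync = false;

	for (size_t i = 0; i < n; i++) {
		if (!can_unsync)
			return true;
		if (pages[i].level != 1)
			return true;            /* only leaf tables may go unsync */
		if (!pages[i].unsync) {
			if (!oos_shadow)
				return true;
			need_unsync = true;     /* remember, unsync later in one pass */
		}
	}
	if (need_unsync) {
		/* here the kernel calls kvm_unsync_pages(vcpu, gfn) */
	}
	return false;
}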
@@ -1804,13 +1952,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1804 spte |= (u64)pfn << PAGE_SHIFT; 1952 spte |= (u64)pfn << PAGE_SHIFT;
1805 1953
1806 if ((pte_access & ACC_WRITE_MASK) 1954 if ((pte_access & ACC_WRITE_MASK)
1807 || (write_fault && !is_write_protection(vcpu) && !user_fault)) { 1955 || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
1956 && !user_fault)) {
1808 1957
1809 if (level > PT_PAGE_TABLE_LEVEL && 1958 if (level > PT_PAGE_TABLE_LEVEL &&
1810 has_wrprotected_page(vcpu->kvm, gfn, level)) { 1959 has_wrprotected_page(vcpu->kvm, gfn, level)) {
1811 ret = 1; 1960 ret = 1;
1812 spte = shadow_trap_nonpresent_pte; 1961 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1813 goto set_pte; 1962 goto done;
1814 } 1963 }
1815 1964
1816 spte |= PT_WRITABLE_MASK; 1965 spte |= PT_WRITABLE_MASK;
@@ -1841,7 +1990,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1841 mark_page_dirty(vcpu->kvm, gfn); 1990 mark_page_dirty(vcpu->kvm, gfn);
1842 1991
1843set_pte: 1992set_pte:
1844 __set_spte(sptep, spte); 1993 if (is_writable_pte(*sptep) && !is_writable_pte(spte))
1994 kvm_set_pfn_dirty(pfn);
1995 update_spte(sptep, spte);
1996done:
1845 return ret; 1997 return ret;
1846} 1998}
1847 1999
@@ -1853,7 +2005,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1853 bool reset_host_protection) 2005 bool reset_host_protection)
1854{ 2006{
1855 int was_rmapped = 0; 2007 int was_rmapped = 0;
1856 int was_writable = is_writable_pte(*sptep);
1857 int rmap_count; 2008 int rmap_count;
1858 2009
1859 pgprintk("%s: spte %llx access %x write_fault %d" 2010 pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1878,8 +2029,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1878 } else if (pfn != spte_to_pfn(*sptep)) { 2029 } else if (pfn != spte_to_pfn(*sptep)) {
1879 pgprintk("hfn old %lx new %lx\n", 2030 pgprintk("hfn old %lx new %lx\n",
1880 spte_to_pfn(*sptep), pfn); 2031 spte_to_pfn(*sptep), pfn);
1881 rmap_remove(vcpu->kvm, sptep); 2032 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
1882 __set_spte(sptep, shadow_trap_nonpresent_pte);
1883 kvm_flush_remote_tlbs(vcpu->kvm); 2033 kvm_flush_remote_tlbs(vcpu->kvm);
1884 } else 2034 } else
1885 was_rmapped = 1; 2035 was_rmapped = 1;
@@ -1890,7 +2040,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1890 reset_host_protection)) { 2040 reset_host_protection)) {
1891 if (write_fault) 2041 if (write_fault)
1892 *ptwrite = 1; 2042 *ptwrite = 1;
1893 kvm_x86_ops->tlb_flush(vcpu); 2043 kvm_mmu_flush_tlb(vcpu);
1894 } 2044 }
1895 2045
1896 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2046 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
@@ -1904,15 +2054,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1904 page_header_update_slot(vcpu->kvm, sptep, gfn); 2054 page_header_update_slot(vcpu->kvm, sptep, gfn);
1905 if (!was_rmapped) { 2055 if (!was_rmapped) {
1906 rmap_count = rmap_add(vcpu, sptep, gfn); 2056 rmap_count = rmap_add(vcpu, sptep, gfn);
1907 kvm_release_pfn_clean(pfn);
1908 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2057 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1909 rmap_recycle(vcpu, sptep, gfn); 2058 rmap_recycle(vcpu, sptep, gfn);
1910 } else {
1911 if (was_writable)
1912 kvm_release_pfn_dirty(pfn);
1913 else
1914 kvm_release_pfn_clean(pfn);
1915 } 2059 }
2060 kvm_release_pfn_clean(pfn);
1916 if (speculative) { 2061 if (speculative) {
1917 vcpu->arch.last_pte_updated = sptep; 2062 vcpu->arch.last_pte_updated = sptep;
1918 vcpu->arch.last_pte_gfn = gfn; 2063 vcpu->arch.last_pte_gfn = gfn;
@@ -1941,7 +2086,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1941 } 2086 }
1942 2087
1943 if (*iterator.sptep == shadow_trap_nonpresent_pte) { 2088 if (*iterator.sptep == shadow_trap_nonpresent_pte) {
1944 pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; 2089 u64 base_addr = iterator.addr;
2090
2091 base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
2092 pseudo_gfn = base_addr >> PAGE_SHIFT;
1945 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, 2093 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
1946 iterator.level - 1, 2094 iterator.level - 1,
1947 1, ACC_ALL, iterator.sptep); 2095 1, ACC_ALL, iterator.sptep);
@@ -1960,6 +2108,29 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1960 return pt_write; 2108 return pt_write;
1961} 2109}
1962 2110
2111static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
2112{
2113 char buf[1];
2114 void __user *hva;
2115 int r;
2116
2117 /* Touch the page, so that a SIGBUS is sent */
2118 hva = (void __user *)gfn_to_hva(kvm, gfn);
2119 r = copy_from_user(buf, hva, 1);
2120}
2121
2122static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2123{
2124 kvm_release_pfn_clean(pfn);
2125 if (is_hwpoison_pfn(pfn)) {
2126 kvm_send_hwpoison_signal(kvm, gfn);
2127 return 0;
2128 } else if (is_fault_pfn(pfn))
2129 return -EFAULT;
2130
2131 return 1;
2132}
2133
1963static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2134static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1964{ 2135{
1965 int r; 2136 int r;
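The new kvm_handle_bad_page() above centralizes error-pfn handling: a hwpoisoned pfn gets a SIGBUS raised simply by touching the guest's host mapping (the copy_from_user() in kvm_send_hwpoison_signal()), a fault pfn returns -EFAULT, and anything else keeps the old "return 1" behavior. A tiny sketch of that triage, with an invented enum standing in for the is_hwpoison_pfn()/is_fault_pfn() checks:

/* Sketch of the error-pfn triage done by kvm_handle_bad_page().
 * The enum is an invented stand-in for is_hwpoison_pfn()/is_fault_pfn(). */
#include <errno.h>

enum bad_pfn { PFN_HWPOISON, PFN_FAULT, PFN_OTHER };

int handle_bad_page(enum bad_pfn kind)
{
	switch (kind) {
	case PFN_HWPOISON:
		/* kernel: touch the gfn's host va so a SIGBUS is raised */
		return 0;       /* fault handled, let the guest continue  */
	case PFN_FAULT:
		return -EFAULT; /* propagate the error to the caller      */
	default:
		return 1;       /* legacy path: treated as handled        */
	}
}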
@@ -1983,10 +2154,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1983 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2154 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1984 2155
1985 /* mmio */ 2156 /* mmio */
1986 if (is_error_pfn(pfn)) { 2157 if (is_error_pfn(pfn))
1987 kvm_release_pfn_clean(pfn); 2158 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
1988 return 1;
1989 }
1990 2159
1991 spin_lock(&vcpu->kvm->mmu_lock); 2160 spin_lock(&vcpu->kvm->mmu_lock);
1992 if (mmu_notifier_retry(vcpu, mmu_seq)) 2161 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2009,6 +2178,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2009{ 2178{
2010 int i; 2179 int i;
2011 struct kvm_mmu_page *sp; 2180 struct kvm_mmu_page *sp;
2181 LIST_HEAD(invalid_list);
2012 2182
2013 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2183 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2014 return; 2184 return;
@@ -2018,8 +2188,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2018 2188
2019 sp = page_header(root); 2189 sp = page_header(root);
2020 --sp->root_count; 2190 --sp->root_count;
2021 if (!sp->root_count && sp->role.invalid) 2191 if (!sp->root_count && sp->role.invalid) {
2022 kvm_mmu_zap_page(vcpu->kvm, sp); 2192 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2193 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2194 }
2023 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2195 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2024 spin_unlock(&vcpu->kvm->mmu_lock); 2196 spin_unlock(&vcpu->kvm->mmu_lock);
2025 return; 2197 return;
@@ -2032,10 +2204,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2032 sp = page_header(root); 2204 sp = page_header(root);
2033 --sp->root_count; 2205 --sp->root_count;
2034 if (!sp->root_count && sp->role.invalid) 2206 if (!sp->root_count && sp->role.invalid)
2035 kvm_mmu_zap_page(vcpu->kvm, sp); 2207 kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2208 &invalid_list);
2036 } 2209 }
2037 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 2210 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2038 } 2211 }
2212 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2039 spin_unlock(&vcpu->kvm->mmu_lock); 2213 spin_unlock(&vcpu->kvm->mmu_lock);
2040 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2214 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2041} 2215}
@@ -2045,7 +2219,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2045 int ret = 0; 2219 int ret = 0;
2046 2220
2047 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { 2221 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
2048 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 2222 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2049 ret = 1; 2223 ret = 1;
2050 } 2224 }
2051 2225
@@ -2073,6 +2247,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2073 root_gfn = 0; 2247 root_gfn = 0;
2074 } 2248 }
2075 spin_lock(&vcpu->kvm->mmu_lock); 2249 spin_lock(&vcpu->kvm->mmu_lock);
2250 kvm_mmu_free_some_pages(vcpu);
2076 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2251 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
2077 PT64_ROOT_LEVEL, direct, 2252 PT64_ROOT_LEVEL, direct,
2078 ACC_ALL, NULL); 2253 ACC_ALL, NULL);
@@ -2103,6 +2278,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2103 root_gfn = i << 30; 2278 root_gfn = i << 30;
2104 } 2279 }
2105 spin_lock(&vcpu->kvm->mmu_lock); 2280 spin_lock(&vcpu->kvm->mmu_lock);
2281 kvm_mmu_free_some_pages(vcpu);
2106 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2282 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2107 PT32_ROOT_LEVEL, direct, 2283 PT32_ROOT_LEVEL, direct,
2108 ACC_ALL, NULL); 2284 ACC_ALL, NULL);
@@ -2198,10 +2374,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2198 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2374 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2199 smp_rmb(); 2375 smp_rmb();
2200 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2376 pfn = gfn_to_pfn(vcpu->kvm, gfn);
2201 if (is_error_pfn(pfn)) { 2377 if (is_error_pfn(pfn))
2202 kvm_release_pfn_clean(pfn); 2378 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2203 return 1;
2204 }
2205 spin_lock(&vcpu->kvm->mmu_lock); 2379 spin_lock(&vcpu->kvm->mmu_lock);
2206 if (mmu_notifier_retry(vcpu, mmu_seq)) 2380 if (mmu_notifier_retry(vcpu, mmu_seq))
2207 goto out_unlock; 2381 goto out_unlock;
@@ -2243,7 +2417,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2243void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) 2417void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2244{ 2418{
2245 ++vcpu->stat.tlb_flush; 2419 ++vcpu->stat.tlb_flush;
2246 kvm_x86_ops->tlb_flush(vcpu); 2420 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2247} 2421}
2248 2422
2249static void paging_new_cr3(struct kvm_vcpu *vcpu) 2423static void paging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2457,10 +2631,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
2457static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) 2631static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
2458{ 2632{
2459 ASSERT(vcpu); 2633 ASSERT(vcpu);
2460 if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { 2634 if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
2635 /* mmu.free() should set root_hpa = INVALID_PAGE */
2461 vcpu->arch.mmu.free(vcpu); 2636 vcpu->arch.mmu.free(vcpu);
2462 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2463 }
2464} 2637}
2465 2638
2466int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 2639int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -2477,9 +2650,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2477 r = mmu_topup_memory_caches(vcpu); 2650 r = mmu_topup_memory_caches(vcpu);
2478 if (r) 2651 if (r)
2479 goto out; 2652 goto out;
2480 spin_lock(&vcpu->kvm->mmu_lock);
2481 kvm_mmu_free_some_pages(vcpu);
2482 spin_unlock(&vcpu->kvm->mmu_lock);
2483 r = mmu_alloc_roots(vcpu); 2653 r = mmu_alloc_roots(vcpu);
2484 spin_lock(&vcpu->kvm->mmu_lock); 2654 spin_lock(&vcpu->kvm->mmu_lock);
2485 mmu_sync_roots(vcpu); 2655 mmu_sync_roots(vcpu);
@@ -2508,7 +2678,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2508 pte = *spte; 2678 pte = *spte;
2509 if (is_shadow_present_pte(pte)) { 2679 if (is_shadow_present_pte(pte)) {
2510 if (is_last_spte(pte, sp->role.level)) 2680 if (is_last_spte(pte, sp->role.level))
2511 rmap_remove(vcpu->kvm, spte); 2681 drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
2512 else { 2682 else {
2513 child = page_header(pte & PT64_BASE_ADDR_MASK); 2683 child = page_header(pte & PT64_BASE_ADDR_MASK);
2514 mmu_page_remove_parent_pte(child, spte); 2684 mmu_page_remove_parent_pte(child, spte);
@@ -2529,6 +2699,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2529 return; 2699 return;
2530 } 2700 }
2531 2701
2702 if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
2703 return;
2704
2532 ++vcpu->kvm->stat.mmu_pte_updated; 2705 ++vcpu->kvm->stat.mmu_pte_updated;
2533 if (!sp->role.cr4_pae) 2706 if (!sp->role.cr4_pae)
2534 paging32_update_pte(vcpu, sp, spte, new); 2707 paging32_update_pte(vcpu, sp, spte, new);
@@ -2549,11 +2722,15 @@ static bool need_remote_flush(u64 old, u64 new)
2549 return (old & ~new & PT64_PERM_MASK) != 0; 2722 return (old & ~new & PT64_PERM_MASK) != 0;
2550} 2723}
2551 2724
2552static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) 2725static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
2726 bool remote_flush, bool local_flush)
2553{ 2727{
2554 if (need_remote_flush(old, new)) 2728 if (zap_page)
2729 return;
2730
2731 if (remote_flush)
2555 kvm_flush_remote_tlbs(vcpu->kvm); 2732 kvm_flush_remote_tlbs(vcpu->kvm);
2556 else 2733 else if (local_flush)
2557 kvm_mmu_flush_tlb(vcpu); 2734 kvm_mmu_flush_tlb(vcpu);
2558} 2735}
2559 2736
@@ -2603,10 +2780,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2603 bool guest_initiated) 2780 bool guest_initiated)
2604{ 2781{
2605 gfn_t gfn = gpa >> PAGE_SHIFT; 2782 gfn_t gfn = gpa >> PAGE_SHIFT;
2783 union kvm_mmu_page_role mask = { .word = 0 };
2606 struct kvm_mmu_page *sp; 2784 struct kvm_mmu_page *sp;
2607 struct hlist_node *node, *n; 2785 struct hlist_node *node;
2608 struct hlist_head *bucket; 2786 LIST_HEAD(invalid_list);
2609 unsigned index;
2610 u64 entry, gentry; 2787 u64 entry, gentry;
2611 u64 *spte; 2788 u64 *spte;
2612 unsigned offset = offset_in_page(gpa); 2789 unsigned offset = offset_in_page(gpa);
@@ -2619,6 +2796,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2619 int npte; 2796 int npte;
2620 int r; 2797 int r;
2621 int invlpg_counter; 2798 int invlpg_counter;
2799 bool remote_flush, local_flush, zap_page;
2800
2801 zap_page = remote_flush = local_flush = false;
2622 2802
2623 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2803 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2624 2804
@@ -2674,13 +2854,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2674 vcpu->arch.last_pte_updated = NULL; 2854 vcpu->arch.last_pte_updated = NULL;
2675 } 2855 }
2676 } 2856 }
2677 index = kvm_page_table_hashfn(gfn);
2678 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2679 2857
2680restart: 2858 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
2681 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2859 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
2682 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
2683 continue;
2684 pte_size = sp->role.cr4_pae ? 8 : 4; 2860 pte_size = sp->role.cr4_pae ? 8 : 4;
2685 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2861 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2686 misaligned |= bytes < 4; 2862 misaligned |= bytes < 4;
@@ -2697,8 +2873,8 @@ restart:
2697 */ 2873 */
2698 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2874 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2699 gpa, bytes, sp->role.word); 2875 gpa, bytes, sp->role.word);
2700 if (kvm_mmu_zap_page(vcpu->kvm, sp)) 2876 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2701 goto restart; 2877 &invalid_list);
2702 ++vcpu->kvm->stat.mmu_flooded; 2878 ++vcpu->kvm->stat.mmu_flooded;
2703 continue; 2879 continue;
2704 } 2880 }
@@ -2722,16 +2898,22 @@ restart:
2722 if (quadrant != sp->role.quadrant) 2898 if (quadrant != sp->role.quadrant)
2723 continue; 2899 continue;
2724 } 2900 }
2901 local_flush = true;
2725 spte = &sp->spt[page_offset / sizeof(*spte)]; 2902 spte = &sp->spt[page_offset / sizeof(*spte)];
2726 while (npte--) { 2903 while (npte--) {
2727 entry = *spte; 2904 entry = *spte;
2728 mmu_pte_write_zap_pte(vcpu, sp, spte); 2905 mmu_pte_write_zap_pte(vcpu, sp, spte);
2729 if (gentry) 2906 if (gentry &&
2907 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
2908 & mask.word))
2730 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 2909 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2731 mmu_pte_write_flush_tlb(vcpu, entry, *spte); 2910 if (!remote_flush && need_remote_flush(entry, *spte))
2911 remote_flush = true;
2732 ++spte; 2912 ++spte;
2733 } 2913 }
2734 } 2914 }
2915 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2916 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2735 kvm_mmu_audit(vcpu, "post pte write"); 2917 kvm_mmu_audit(vcpu, "post pte write");
2736 spin_unlock(&vcpu->kvm->mmu_lock); 2918 spin_unlock(&vcpu->kvm->mmu_lock);
2737 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { 2919 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
@@ -2759,15 +2941,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2759 2941
2760void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 2942void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2761{ 2943{
2762 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && 2944 int free_pages;
2945 LIST_HEAD(invalid_list);
2946
2947 free_pages = vcpu->kvm->arch.n_free_mmu_pages;
2948 while (free_pages < KVM_REFILL_PAGES &&
2763 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 2949 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2764 struct kvm_mmu_page *sp; 2950 struct kvm_mmu_page *sp;
2765 2951
2766 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 2952 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2767 struct kvm_mmu_page, link); 2953 struct kvm_mmu_page, link);
2768 kvm_mmu_zap_page(vcpu->kvm, sp); 2954 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2955 &invalid_list);
2769 ++vcpu->kvm->stat.mmu_recycled; 2956 ++vcpu->kvm->stat.mmu_recycled;
2770 } 2957 }
2958 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2771} 2959}
2772 2960
2773int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 2961int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -2795,11 +2983,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2795 return 1; 2983 return 1;
2796 case EMULATE_DO_MMIO: 2984 case EMULATE_DO_MMIO:
2797 ++vcpu->stat.mmio_exits; 2985 ++vcpu->stat.mmio_exits;
2798 return 0; 2986 /* fall through */
2799 case EMULATE_FAIL: 2987 case EMULATE_FAIL:
2800 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2801 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2802 vcpu->run->internal.ndata = 0;
2803 return 0; 2988 return 0;
2804 default: 2989 default:
2805 BUG(); 2990 BUG();
@@ -2896,7 +3081,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2896 pt = sp->spt; 3081 pt = sp->spt;
2897 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3082 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2898 /* avoid RMW */ 3083 /* avoid RMW */
2899 if (pt[i] & PT_WRITABLE_MASK) 3084 if (is_writable_pte(pt[i]))
2900 pt[i] &= ~PT_WRITABLE_MASK; 3085 pt[i] &= ~PT_WRITABLE_MASK;
2901 } 3086 }
2902 kvm_flush_remote_tlbs(kvm); 3087 kvm_flush_remote_tlbs(kvm);
@@ -2905,25 +3090,26 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2905void kvm_mmu_zap_all(struct kvm *kvm) 3090void kvm_mmu_zap_all(struct kvm *kvm)
2906{ 3091{
2907 struct kvm_mmu_page *sp, *node; 3092 struct kvm_mmu_page *sp, *node;
3093 LIST_HEAD(invalid_list);
2908 3094
2909 spin_lock(&kvm->mmu_lock); 3095 spin_lock(&kvm->mmu_lock);
2910restart: 3096restart:
2911 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 3097 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2912 if (kvm_mmu_zap_page(kvm, sp)) 3098 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
2913 goto restart; 3099 goto restart;
2914 3100
3101 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2915 spin_unlock(&kvm->mmu_lock); 3102 spin_unlock(&kvm->mmu_lock);
2916
2917 kvm_flush_remote_tlbs(kvm);
2918} 3103}
2919 3104
2920static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) 3105static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3106 struct list_head *invalid_list)
2921{ 3107{
2922 struct kvm_mmu_page *page; 3108 struct kvm_mmu_page *page;
2923 3109
2924 page = container_of(kvm->arch.active_mmu_pages.prev, 3110 page = container_of(kvm->arch.active_mmu_pages.prev,
2925 struct kvm_mmu_page, link); 3111 struct kvm_mmu_page, link);
2926 return kvm_mmu_zap_page(kvm, page) + 1; 3112 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
2927} 3113}
2928 3114
2929static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 3115static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
@@ -2936,6 +3122,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
2936 3122
2937 list_for_each_entry(kvm, &vm_list, vm_list) { 3123 list_for_each_entry(kvm, &vm_list, vm_list) {
2938 int npages, idx, freed_pages; 3124 int npages, idx, freed_pages;
3125 LIST_HEAD(invalid_list);
2939 3126
2940 idx = srcu_read_lock(&kvm->srcu); 3127 idx = srcu_read_lock(&kvm->srcu);
2941 spin_lock(&kvm->mmu_lock); 3128 spin_lock(&kvm->mmu_lock);
@@ -2943,12 +3130,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
2943 kvm->arch.n_free_mmu_pages; 3130 kvm->arch.n_free_mmu_pages;
2944 cache_count += npages; 3131 cache_count += npages;
2945 if (!kvm_freed && nr_to_scan > 0 && npages > 0) { 3132 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
2946 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); 3133 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3134 &invalid_list);
2947 cache_count -= freed_pages; 3135 cache_count -= freed_pages;
2948 kvm_freed = kvm; 3136 kvm_freed = kvm;
2949 } 3137 }
2950 nr_to_scan--; 3138 nr_to_scan--;
2951 3139
3140 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2952 spin_unlock(&kvm->mmu_lock); 3141 spin_unlock(&kvm->mmu_lock);
2953 srcu_read_unlock(&kvm->srcu, idx); 3142 srcu_read_unlock(&kvm->srcu, idx);
2954 } 3143 }
@@ -3074,7 +3263,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3074 3263
3075static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3264static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3076{ 3265{
3077 kvm_set_cr3(vcpu, vcpu->arch.cr3); 3266 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
3078 return 1; 3267 return 1;
3079} 3268}
3080 3269
@@ -3331,9 +3520,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3331 struct kvm_mmu_page *rev_sp; 3520 struct kvm_mmu_page *rev_sp;
3332 gfn_t gfn; 3521 gfn_t gfn;
3333 3522
3334 if (*sptep & PT_WRITABLE_MASK) { 3523 if (is_writable_pte(*sptep)) {
3335 rev_sp = page_header(__pa(sptep)); 3524 rev_sp = page_header(__pa(sptep));
3336 gfn = rev_sp->gfns[sptep - rev_sp->spt]; 3525 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3337 3526
3338 if (!gfn_to_memslot(kvm, gfn)) { 3527 if (!gfn_to_memslot(kvm, gfn)) {
3339 if (!printk_ratelimit()) 3528 if (!printk_ratelimit())
@@ -3347,8 +3536,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3347 return; 3536 return;
3348 } 3537 }
3349 3538
3350 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 3539 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3351 rev_sp->role.level);
3352 if (!*rmapp) { 3540 if (!*rmapp) {
3353 if (!printk_ratelimit()) 3541 if (!printk_ratelimit())
3354 return; 3542 return;
@@ -3381,7 +3569,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3381 3569
3382 if (!(ent & PT_PRESENT_MASK)) 3570 if (!(ent & PT_PRESENT_MASK))
3383 continue; 3571 continue;
3384 if (!(ent & PT_WRITABLE_MASK)) 3572 if (!is_writable_pte(ent))
3385 continue; 3573 continue;
3386 inspect_spte_has_rmap(vcpu->kvm, &pt[i]); 3574 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3387 } 3575 }
@@ -3409,13 +3597,12 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
3409 if (sp->unsync) 3597 if (sp->unsync)
3410 continue; 3598 continue;
3411 3599
3412 gfn = unalias_gfn(vcpu->kvm, sp->gfn); 3600 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3413 slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
3414 rmapp = &slot->rmap[gfn - slot->base_gfn]; 3601 rmapp = &slot->rmap[gfn - slot->base_gfn];
3415 3602
3416 spte = rmap_next(vcpu->kvm, rmapp, NULL); 3603 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3417 while (spte) { 3604 while (spte) {
3418 if (*spte & PT_WRITABLE_MASK) 3605 if (is_writable_pte(*spte))
3419 printk(KERN_ERR "%s: (%s) shadow page has " 3606 printk(KERN_ERR "%s: (%s) shadow page has "
3420 "writable mappings: gfn %lx role %x\n", 3607 "writable mappings: gfn %lx role %x\n",
3421 __func__, audit_msg, sp->gfn, 3608 __func__, audit_msg, sp->gfn,
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 42f07b1bfbc9..3aab0f0930ef 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
190 TP_ARGS(sp) 190 TP_ARGS(sp)
191); 191);
192 192
193DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page, 193DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
194 TP_PROTO(struct kvm_mmu_page *sp), 194 TP_PROTO(struct kvm_mmu_page *sp),
195 195
196 TP_ARGS(sp) 196 TP_ARGS(sp)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2331bdc2b549..51ef9097960d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,6 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 11 *
11 * Authors: 12 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -118,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
118{ 119{
119 pt_element_t pte; 120 pt_element_t pte;
120 gfn_t table_gfn; 121 gfn_t table_gfn;
121 unsigned index, pt_access, pte_access; 122 unsigned index, pt_access, uninitialized_var(pte_access);
122 gpa_t pte_gpa; 123 gpa_t pte_gpa;
123 int rsvd_fault = 0; 124 bool eperm, present, rsvd_fault;
124 125
125 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 126 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
126 fetch_fault); 127 fetch_fault);
127walk: 128walk:
129 present = true;
130 eperm = rsvd_fault = false;
128 walker->level = vcpu->arch.mmu.root_level; 131 walker->level = vcpu->arch.mmu.root_level;
129 pte = vcpu->arch.cr3; 132 pte = vcpu->arch.cr3;
130#if PTTYPE == 64 133#if PTTYPE == 64
131 if (!is_long_mode(vcpu)) { 134 if (!is_long_mode(vcpu)) {
132 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); 135 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
133 trace_kvm_mmu_paging_element(pte, walker->level); 136 trace_kvm_mmu_paging_element(pte, walker->level);
134 if (!is_present_gpte(pte)) 137 if (!is_present_gpte(pte)) {
135 goto not_present; 138 present = false;
139 goto error;
140 }
136 --walker->level; 141 --walker->level;
137 } 142 }
138#endif 143#endif
@@ -150,37 +155,42 @@ walk:
150 walker->table_gfn[walker->level - 1] = table_gfn; 155 walker->table_gfn[walker->level - 1] = table_gfn;
151 walker->pte_gpa[walker->level - 1] = pte_gpa; 156 walker->pte_gpa[walker->level - 1] = pte_gpa;
152 157
153 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) 158 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) {
154 goto not_present; 159 present = false;
160 break;
161 }
155 162
156 trace_kvm_mmu_paging_element(pte, walker->level); 163 trace_kvm_mmu_paging_element(pte, walker->level);
157 164
158 if (!is_present_gpte(pte)) 165 if (!is_present_gpte(pte)) {
159 goto not_present; 166 present = false;
167 break;
168 }
160 169
161 rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); 170 if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
162 if (rsvd_fault) 171 rsvd_fault = true;
163 goto access_error; 172 break;
173 }
164 174
165 if (write_fault && !is_writable_pte(pte)) 175 if (write_fault && !is_writable_pte(pte))
166 if (user_fault || is_write_protection(vcpu)) 176 if (user_fault || is_write_protection(vcpu))
167 goto access_error; 177 eperm = true;
168 178
169 if (user_fault && !(pte & PT_USER_MASK)) 179 if (user_fault && !(pte & PT_USER_MASK))
170 goto access_error; 180 eperm = true;
171 181
172#if PTTYPE == 64 182#if PTTYPE == 64
173 if (fetch_fault && (pte & PT64_NX_MASK)) 183 if (fetch_fault && (pte & PT64_NX_MASK))
174 goto access_error; 184 eperm = true;
175#endif 185#endif
176 186
177 if (!(pte & PT_ACCESSED_MASK)) { 187 if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
178 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 188 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
179 sizeof(pte)); 189 sizeof(pte));
180 mark_page_dirty(vcpu->kvm, table_gfn);
181 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 190 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
182 index, pte, pte|PT_ACCESSED_MASK)) 191 index, pte, pte|PT_ACCESSED_MASK))
183 goto walk; 192 goto walk;
193 mark_page_dirty(vcpu->kvm, table_gfn);
184 pte |= PT_ACCESSED_MASK; 194 pte |= PT_ACCESSED_MASK;
185 } 195 }
186 196
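The accessed-bit hunk above also moves mark_page_dirty() after the cmpxchg, so the guest page-table page is only reported dirty once the update actually lands; a failed cmpxchg restarts the walk instead. A self-contained sketch of that compare-and-swap-then-record ordering using C11 atomics (illustrative only; the kernel uses its own cmpxchg_gpte() helper):

/* Sketch: set the accessed bit with a compare-and-swap, and only
 * record the modification once the CAS has actually succeeded. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define PT_ACCESSED_MASK (1ULL << 5)

/* returns true if the caller must restart the walk (CAS lost the race) */
bool set_accessed_bit(_Atomic uint64_t *gpte_slot, uint64_t expected)
{
	uint64_t desired = expected | PT_ACCESSED_MASK;

	if (!atomic_compare_exchange_strong(gpte_slot, &expected, desired))
		return true;            /* pte changed under us: goto walk */

	/* only now is the guest page table page really modified;
	 * kernel: mark_page_dirty(vcpu->kvm, table_gfn);          */
	return false;
}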
@@ -213,15 +223,18 @@ walk:
213 --walker->level; 223 --walker->level;
214 } 224 }
215 225
226 if (!present || eperm || rsvd_fault)
227 goto error;
228
216 if (write_fault && !is_dirty_gpte(pte)) { 229 if (write_fault && !is_dirty_gpte(pte)) {
217 bool ret; 230 bool ret;
218 231
219 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 232 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
220 mark_page_dirty(vcpu->kvm, table_gfn);
221 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, 233 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
222 pte|PT_DIRTY_MASK); 234 pte|PT_DIRTY_MASK);
223 if (ret) 235 if (ret)
224 goto walk; 236 goto walk;
237 mark_page_dirty(vcpu->kvm, table_gfn);
225 pte |= PT_DIRTY_MASK; 238 pte |= PT_DIRTY_MASK;
226 walker->ptes[walker->level - 1] = pte; 239 walker->ptes[walker->level - 1] = pte;
227 } 240 }
@@ -229,22 +242,18 @@ walk:
229 walker->pt_access = pt_access; 242 walker->pt_access = pt_access;
230 walker->pte_access = pte_access; 243 walker->pte_access = pte_access;
231 pgprintk("%s: pte %llx pte_access %x pt_access %x\n", 244 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
232 __func__, (u64)pte, pt_access, pte_access); 245 __func__, (u64)pte, pte_access, pt_access);
233 return 1; 246 return 1;
234 247
235not_present: 248error:
236 walker->error_code = 0; 249 walker->error_code = 0;
237 goto err; 250 if (present)
238 251 walker->error_code |= PFERR_PRESENT_MASK;
239access_error:
240 walker->error_code = PFERR_PRESENT_MASK;
241
242err:
243 if (write_fault) 252 if (write_fault)
244 walker->error_code |= PFERR_WRITE_MASK; 253 walker->error_code |= PFERR_WRITE_MASK;
245 if (user_fault) 254 if (user_fault)
246 walker->error_code |= PFERR_USER_MASK; 255 walker->error_code |= PFERR_USER_MASK;
247 if (fetch_fault) 256 if (fetch_fault && is_nx(vcpu))
248 walker->error_code |= PFERR_FETCH_MASK; 257 walker->error_code |= PFERR_FETCH_MASK;
249 if (rsvd_fault) 258 if (rsvd_fault)
250 walker->error_code |= PFERR_RSVD_MASK; 259 walker->error_code |= PFERR_RSVD_MASK;
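The walker now accumulates present/eperm/rsvd_fault while it walks and assembles the error code in one place, with two behavioral fixes visible above: PFERR_PRESENT_MASK is only set when the translation was present, and PFERR_FETCH_MASK only when NX is enabled. A small sketch of building an x86 #PF error code from those flags; the PFERR_* values below are the architectural bit positions, stated as an assumption rather than read from the patch:

/* Sketch: build an x86 page-fault error code from walker state.
 * Bit positions are the architectural #PF error-code bits. */
#include <stdbool.h>
#include <stdint.h>

#define PFERR_PRESENT_MASK (1u << 0)
#define PFERR_WRITE_MASK   (1u << 1)
#define PFERR_USER_MASK    (1u << 2)
#define PFERR_RSVD_MASK    (1u << 3)
#define PFERR_FETCH_MASK   (1u << 4)

uint32_t build_error_code(bool present, bool write_fault, bool user_fault,
			  bool fetch_fault, bool nx_enabled, bool rsvd_fault)
{
	uint32_t ec = 0;

	if (present)                    /* fault on a present translation   */
		ec |= PFERR_PRESENT_MASK;
	if (write_fault)
		ec |= PFERR_WRITE_MASK;
	if (user_fault)
		ec |= PFERR_USER_MASK;
	if (fetch_fault && nx_enabled)  /* fetch faults only reported w/ NX */
		ec |= PFERR_FETCH_MASK;
	if (rsvd_fault)
		ec |= PFERR_RSVD_MASK;
	return ec;
}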
@@ -252,7 +261,7 @@ err:
252 return 0; 261 return 0;
253} 262}
254 263
255static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, 264static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
256 u64 *spte, const void *pte) 265 u64 *spte, const void *pte)
257{ 266{
258 pt_element_t gpte; 267 pt_element_t gpte;
@@ -263,7 +272,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
263 gpte = *(const pt_element_t *)pte; 272 gpte = *(const pt_element_t *)pte;
264 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 273 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
265 if (!is_present_gpte(gpte)) { 274 if (!is_present_gpte(gpte)) {
266 if (page->unsync) 275 if (sp->unsync)
267 new_spte = shadow_trap_nonpresent_pte; 276 new_spte = shadow_trap_nonpresent_pte;
268 else 277 else
269 new_spte = shadow_notrap_nonpresent_pte; 278 new_spte = shadow_notrap_nonpresent_pte;
@@ -272,7 +281,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
272 return; 281 return;
273 } 282 }
274 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 283 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
275 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); 284 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
276 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 285 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
277 return; 286 return;
278 pfn = vcpu->arch.update_pte.pfn; 287 pfn = vcpu->arch.update_pte.pfn;
@@ -285,11 +294,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
285 * we call mmu_set_spte() with reset_host_protection = true because that 294 * we call mmu_set_spte() with reset_host_protection = true because that
286 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 295 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
287 */ 296 */
288 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 297 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
289 gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, 298 is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
290 gpte_to_gfn(gpte), pfn, true, true); 299 gpte_to_gfn(gpte), pfn, true, true);
291} 300}
292 301
302static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
303 struct guest_walker *gw, int level)
304{
305 int r;
306 pt_element_t curr_pte;
307
308 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
309 &curr_pte, sizeof(curr_pte));
310 return r || curr_pte != gw->ptes[level - 1];
311}
312
293/* 313/*
294 * Fetch a shadow pte for a specific level in the paging hierarchy. 314 * Fetch a shadow pte for a specific level in the paging hierarchy.
295 */ 315 */
@@ -299,75 +319,86 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
299 int *ptwrite, pfn_t pfn) 319 int *ptwrite, pfn_t pfn)
300{ 320{
301 unsigned access = gw->pt_access; 321 unsigned access = gw->pt_access;
302 struct kvm_mmu_page *shadow_page; 322 struct kvm_mmu_page *sp = NULL;
303 u64 spte, *sptep = NULL; 323 bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
304 int direct; 324 int top_level;
305 gfn_t table_gfn; 325 unsigned direct_access;
306 int r; 326 struct kvm_shadow_walk_iterator it;
307 int level;
308 pt_element_t curr_pte;
309 struct kvm_shadow_walk_iterator iterator;
310 327
311 if (!is_present_gpte(gw->ptes[gw->level - 1])) 328 if (!is_present_gpte(gw->ptes[gw->level - 1]))
312 return NULL; 329 return NULL;
313 330
314 for_each_shadow_entry(vcpu, addr, iterator) { 331 direct_access = gw->pt_access & gw->pte_access;
315 level = iterator.level; 332 if (!dirty)
316 sptep = iterator.sptep; 333 direct_access &= ~ACC_WRITE_MASK;
317 if (iterator.level == hlevel) {
318 mmu_set_spte(vcpu, sptep, access,
319 gw->pte_access & access,
320 user_fault, write_fault,
321 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
322 ptwrite, level,
323 gw->gfn, pfn, false, true);
324 break;
325 }
326 334
327 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) 335 top_level = vcpu->arch.mmu.root_level;
328 continue; 336 if (top_level == PT32E_ROOT_LEVEL)
337 top_level = PT32_ROOT_LEVEL;
338 /*
339 * Verify that the top-level gpte is still there. Since the page
340 * is a root page, it is either write protected (and cannot be
341 * changed from now on) or it is invalid (in which case, we don't
342 * really care if it changes underneath us after this point).
343 */
344 if (FNAME(gpte_changed)(vcpu, gw, top_level))
345 goto out_gpte_changed;
329 346
330 if (is_large_pte(*sptep)) { 347 for (shadow_walk_init(&it, vcpu, addr);
331 rmap_remove(vcpu->kvm, sptep); 348 shadow_walk_okay(&it) && it.level > gw->level;
332 __set_spte(sptep, shadow_trap_nonpresent_pte); 349 shadow_walk_next(&it)) {
333 kvm_flush_remote_tlbs(vcpu->kvm); 350 gfn_t table_gfn;
334 }
335 351
336 if (level <= gw->level) { 352 drop_large_spte(vcpu, it.sptep);
337 int delta = level - gw->level + 1; 353
338 direct = 1; 354 sp = NULL;
339 if (!is_dirty_gpte(gw->ptes[level - delta])) 355 if (!is_shadow_present_pte(*it.sptep)) {
340 access &= ~ACC_WRITE_MASK; 356 table_gfn = gw->table_gfn[it.level - 2];
341 table_gfn = gpte_to_gfn(gw->ptes[level - delta]); 357 sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
342 /* advance table_gfn when emulating 1gb pages with 4k */ 358 false, access, it.sptep);
343 if (delta == 0)
344 table_gfn += PT_INDEX(addr, level);
345 access &= gw->pte_access;
346 } else {
347 direct = 0;
348 table_gfn = gw->table_gfn[level - 2];
349 }
350 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
351 direct, access, sptep);
352 if (!direct) {
353 r = kvm_read_guest_atomic(vcpu->kvm,
354 gw->pte_gpa[level - 2],
355 &curr_pte, sizeof(curr_pte));
356 if (r || curr_pte != gw->ptes[level - 2]) {
357 kvm_mmu_put_page(shadow_page, sptep);
358 kvm_release_pfn_clean(pfn);
359 sptep = NULL;
360 break;
361 }
362 } 359 }
363 360
364 spte = __pa(shadow_page->spt) 361 /*
365 | PT_PRESENT_MASK | PT_ACCESSED_MASK 362 * Verify that the gpte in the page we've just write
366 | PT_WRITABLE_MASK | PT_USER_MASK; 363 * protected is still there.
367 *sptep = spte; 364 */
365 if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
366 goto out_gpte_changed;
367
368 if (sp)
369 link_shadow_page(it.sptep, sp);
368 } 370 }
369 371
370 return sptep; 372 for (;
373 shadow_walk_okay(&it) && it.level > hlevel;
374 shadow_walk_next(&it)) {
375 gfn_t direct_gfn;
376
377 validate_direct_spte(vcpu, it.sptep, direct_access);
378
379 drop_large_spte(vcpu, it.sptep);
380
381 if (is_shadow_present_pte(*it.sptep))
382 continue;
383
384 direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
385
386 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
387 true, direct_access, it.sptep);
388 link_shadow_page(it.sptep, sp);
389 }
390
391 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
392 user_fault, write_fault, dirty, ptwrite, it.level,
393 gw->gfn, pfn, false, true);
394
395 return it.sptep;
396
397out_gpte_changed:
398 if (sp)
399 kvm_mmu_put_page(sp, it.sptep);
400 kvm_release_pfn_clean(pfn);
401 return NULL;
371} 402}
372 403
373/* 404/*
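The rewritten FNAME(fetch) above walks indirect levels and direct levels in two separate loops and relies on the new FNAME(gpte_changed) helper: re-read each guest PTE the walk depends on and bail out through out_gpte_changed if the guest modified it in the meantime. A self-contained sketch of that re-read-and-compare check (the read function is a placeholder for kvm_read_guest_atomic()):

/* Sketch of the gpte_changed() check: re-read the guest PTE and compare
 * it with the value cached during the walk; any mismatch (or read error)
 * means the cached walk can no longer be trusted. */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* placeholder for kvm_read_guest_atomic(); returns 0 on success */
static int read_guest(const void *guest_mem, uint64_t gpa, void *dst, size_t len)
{
	memcpy(dst, (const uint8_t *)guest_mem + gpa, len);
	return 0;
}

bool gpte_changed(const void *guest_mem, uint64_t pte_gpa, uint64_t cached_pte)
{
	uint64_t curr_pte;
	int r;

	r = read_guest(guest_mem, pte_gpa, &curr_pte, sizeof(curr_pte));
	return r || curr_pte != cached_pte;   /* treat a read failure as "changed" */
}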
@@ -431,11 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
431 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 462 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
432 463
433 /* mmio */ 464 /* mmio */
434 if (is_error_pfn(pfn)) { 465 if (is_error_pfn(pfn))
435 pgprintk("gfn %lx is mmio\n", walker.gfn); 466 return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
436 kvm_release_pfn_clean(pfn);
437 return 1;
438 }
439 467
440 spin_lock(&vcpu->kvm->mmu_lock); 468 spin_lock(&vcpu->kvm->mmu_lock);
441 if (mmu_notifier_retry(vcpu, mmu_seq)) 469 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -443,6 +471,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
443 kvm_mmu_free_some_pages(vcpu); 471 kvm_mmu_free_some_pages(vcpu);
444 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 472 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
445 level, &write_pt, pfn); 473 level, &write_pt, pfn);
474 (void)sptep;
446 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 475 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
447 sptep, *sptep, write_pt); 476 sptep, *sptep, write_pt);
448 477
@@ -464,6 +493,7 @@ out_unlock:
464static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 493static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
465{ 494{
466 struct kvm_shadow_walk_iterator iterator; 495 struct kvm_shadow_walk_iterator iterator;
496 struct kvm_mmu_page *sp;
467 gpa_t pte_gpa = -1; 497 gpa_t pte_gpa = -1;
468 int level; 498 int level;
469 u64 *sptep; 499 u64 *sptep;
@@ -475,10 +505,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
475 level = iterator.level; 505 level = iterator.level;
476 sptep = iterator.sptep; 506 sptep = iterator.sptep;
477 507
508 sp = page_header(__pa(sptep));
478 if (is_last_spte(*sptep, level)) { 509 if (is_last_spte(*sptep, level)) {
479 struct kvm_mmu_page *sp = page_header(__pa(sptep));
480 int offset, shift; 510 int offset, shift;
481 511
512 if (!sp->unsync)
513 break;
514
482 shift = PAGE_SHIFT - 515 shift = PAGE_SHIFT -
483 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; 516 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
484 offset = sp->role.quadrant << shift; 517 offset = sp->role.quadrant << shift;
@@ -487,16 +520,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
487 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); 520 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
488 521
489 if (is_shadow_present_pte(*sptep)) { 522 if (is_shadow_present_pte(*sptep)) {
490 rmap_remove(vcpu->kvm, sptep);
491 if (is_large_pte(*sptep)) 523 if (is_large_pte(*sptep))
492 --vcpu->kvm->stat.lpages; 524 --vcpu->kvm->stat.lpages;
525 drop_spte(vcpu->kvm, sptep,
526 shadow_trap_nonpresent_pte);
493 need_flush = 1; 527 need_flush = 1;
494 } 528 } else
495 __set_spte(sptep, shadow_trap_nonpresent_pte); 529 __set_spte(sptep, shadow_trap_nonpresent_pte);
496 break; 530 break;
497 } 531 }
498 532
499 if (!is_shadow_present_pte(*sptep)) 533 if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
500 break; 534 break;
501 } 535 }
502 536
@@ -570,9 +604,9 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
570 * Using the cached information from sp->gfns is safe because: 604 * Using the cached information from sp->gfns is safe because:
571 * - The spte has a reference to the struct page, so the pfn for a given gfn 605 * - The spte has a reference to the struct page, so the pfn for a given gfn
572 * can't change unless all sptes pointing to it are nuked first. 606 * can't change unless all sptes pointing to it are nuked first.
573 * - Alias changes zap the entire shadow cache.
574 */ 607 */
575static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 608static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
609 bool clear_unsync)
576{ 610{
577 int i, offset, nr_present; 611 int i, offset, nr_present;
578 bool reset_host_protection; 612 bool reset_host_protection;
@@ -580,6 +614,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
580 614
581 offset = nr_present = 0; 615 offset = nr_present = 0;
582 616
617 /* direct kvm_mmu_page can not be unsync. */
618 BUG_ON(sp->role.direct);
619
583 if (PTTYPE == 32) 620 if (PTTYPE == 32)
584 offset = sp->role.quadrant << PT64_LEVEL_BITS; 621 offset = sp->role.quadrant << PT64_LEVEL_BITS;
585 622
@@ -589,7 +626,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
589 unsigned pte_access; 626 unsigned pte_access;
590 pt_element_t gpte; 627 pt_element_t gpte;
591 gpa_t pte_gpa; 628 gpa_t pte_gpa;
592 gfn_t gfn = sp->gfns[i]; 629 gfn_t gfn;
593 630
594 if (!is_shadow_present_pte(sp->spt[i])) 631 if (!is_shadow_present_pte(sp->spt[i]))
595 continue; 632 continue;
@@ -600,16 +637,17 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
600 sizeof(pt_element_t))) 637 sizeof(pt_element_t)))
601 return -EINVAL; 638 return -EINVAL;
602 639
603 if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || 640 gfn = gpte_to_gfn(gpte);
604 !(gpte & PT_ACCESSED_MASK)) { 641 if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
642 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
643 || !(gpte & PT_ACCESSED_MASK)) {
605 u64 nonpresent; 644 u64 nonpresent;
606 645
607 rmap_remove(vcpu->kvm, &sp->spt[i]); 646 if (is_present_gpte(gpte) || !clear_unsync)
608 if (is_present_gpte(gpte))
609 nonpresent = shadow_trap_nonpresent_pte; 647 nonpresent = shadow_trap_nonpresent_pte;
610 else 648 else
611 nonpresent = shadow_notrap_nonpresent_pte; 649 nonpresent = shadow_notrap_nonpresent_pte;
612 __set_spte(&sp->spt[i], nonpresent); 650 drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
613 continue; 651 continue;
614 } 652 }
615 653
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce438e0fdd26..56c9b6bd7655 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,6 +4,7 @@
4 * AMD SVM support 4 * AMD SVM support
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
7 * 8 *
8 * Authors: 9 * Authors:
9 * Yaniv Kamay <yaniv@qumranet.com> 10 * Yaniv Kamay <yaniv@qumranet.com>
@@ -285,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
285 286
286static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 287static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
287{ 288{
289 vcpu->arch.efer = efer;
288 if (!npt_enabled && !(efer & EFER_LMA)) 290 if (!npt_enabled && !(efer & EFER_LMA))
289 efer &= ~EFER_LME; 291 efer &= ~EFER_LME;
290 292
291 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 293 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
292 vcpu->arch.efer = efer;
293} 294}
294 295
295static int is_external_interrupt(u32 info) 296static int is_external_interrupt(u32 info)
@@ -640,7 +641,7 @@ static __init int svm_hardware_setup(void)
640 641
641 if (nested) { 642 if (nested) {
642 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 643 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
643 kvm_enable_efer_bits(EFER_SVME); 644 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
644 } 645 }
645 646
646 for_each_possible_cpu(cpu) { 647 for_each_possible_cpu(cpu) {
@@ -806,7 +807,7 @@ static void init_vmcb(struct vcpu_svm *svm)
806 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 807 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
807 */ 808 */
808 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 809 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
809 kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); 810 (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
810 811
811 save->cr4 = X86_CR4_PAE; 812 save->cr4 = X86_CR4_PAE;
812 /* rdx = ?? */ 813 /* rdx = ?? */
@@ -903,13 +904,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
903 svm->asid_generation = 0; 904 svm->asid_generation = 0;
904 init_vmcb(svm); 905 init_vmcb(svm);
905 906
906 fx_init(&svm->vcpu); 907 err = fx_init(&svm->vcpu);
908 if (err)
909 goto free_page4;
910
907 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 911 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
908 if (kvm_vcpu_is_bsp(&svm->vcpu)) 912 if (kvm_vcpu_is_bsp(&svm->vcpu))
909 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 913 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
910 914
911 return &svm->vcpu; 915 return &svm->vcpu;
912 916
917free_page4:
918 __free_page(hsave_page);
913free_page3: 919free_page3:
914 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); 920 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
915free_page2: 921free_page2:
@@ -1488,7 +1494,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
1488 */ 1494 */
1489 pr_err("KVM: Guest triggered AMD Erratum 383\n"); 1495 pr_err("KVM: Guest triggered AMD Erratum 383\n");
1490 1496
1491 set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); 1497 kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
1492 1498
1493 return; 1499 return;
1494 } 1500 }
@@ -1535,7 +1541,7 @@ static int io_interception(struct vcpu_svm *svm)
1535 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1541 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1536 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1542 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1537 if (string || in) 1543 if (string || in)
1538 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); 1544 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
1539 1545
1540 port = io_info >> 16; 1546 port = io_info >> 16;
1541 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1547 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1957,7 +1963,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1957 svm->vmcb->save.cr3 = hsave->save.cr3; 1963 svm->vmcb->save.cr3 = hsave->save.cr3;
1958 svm->vcpu.arch.cr3 = hsave->save.cr3; 1964 svm->vcpu.arch.cr3 = hsave->save.cr3;
1959 } else { 1965 } else {
1960 kvm_set_cr3(&svm->vcpu, hsave->save.cr3); 1966 (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
1961 } 1967 }
1962 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); 1968 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
1963 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); 1969 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
@@ -2080,7 +2086,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2080 svm->vmcb->save.cr3 = nested_vmcb->save.cr3; 2086 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
2081 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; 2087 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
2082 } else 2088 } else
2083 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 2089 (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
2084 2090
2085 /* Guest paging mode is active - reset mmu */ 2091 /* Guest paging mode is active - reset mmu */
2086 kvm_mmu_reset_context(&svm->vcpu); 2092 kvm_mmu_reset_context(&svm->vcpu);
@@ -2386,16 +2392,12 @@ static int iret_interception(struct vcpu_svm *svm)
2386 2392
2387static int invlpg_interception(struct vcpu_svm *svm) 2393static int invlpg_interception(struct vcpu_svm *svm)
2388{ 2394{
2389 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) 2395 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
2390 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2391 return 1;
2392} 2396}
2393 2397
2394static int emulate_on_interception(struct vcpu_svm *svm) 2398static int emulate_on_interception(struct vcpu_svm *svm)
2395{ 2399{
2396 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) 2400 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
2397 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2398 return 1;
2399} 2401}
2400 2402
2401static int cr8_write_interception(struct vcpu_svm *svm) 2403static int cr8_write_interception(struct vcpu_svm *svm)
@@ -2726,6 +2728,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2726 [SVM_EXIT_NPF] = pf_interception, 2728 [SVM_EXIT_NPF] = pf_interception,
2727}; 2729};
2728 2730
2731void dump_vmcb(struct kvm_vcpu *vcpu)
2732{
2733 struct vcpu_svm *svm = to_svm(vcpu);
2734 struct vmcb_control_area *control = &svm->vmcb->control;
2735 struct vmcb_save_area *save = &svm->vmcb->save;
2736
2737 pr_err("VMCB Control Area:\n");
2738 pr_err("cr_read: %04x\n", control->intercept_cr_read);
2739 pr_err("cr_write: %04x\n", control->intercept_cr_write);
2740 pr_err("dr_read: %04x\n", control->intercept_dr_read);
2741 pr_err("dr_write: %04x\n", control->intercept_dr_write);
2742 pr_err("exceptions: %08x\n", control->intercept_exceptions);
2743 pr_err("intercepts: %016llx\n", control->intercept);
2744 pr_err("pause filter count: %d\n", control->pause_filter_count);
2745 pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa);
2746 pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa);
2747 pr_err("tsc_offset: %016llx\n", control->tsc_offset);
2748 pr_err("asid: %d\n", control->asid);
2749 pr_err("tlb_ctl: %d\n", control->tlb_ctl);
2750 pr_err("int_ctl: %08x\n", control->int_ctl);
2751 pr_err("int_vector: %08x\n", control->int_vector);
2752 pr_err("int_state: %08x\n", control->int_state);
2753 pr_err("exit_code: %08x\n", control->exit_code);
2754 pr_err("exit_info1: %016llx\n", control->exit_info_1);
2755 pr_err("exit_info2: %016llx\n", control->exit_info_2);
2756 pr_err("exit_int_info: %08x\n", control->exit_int_info);
2757 pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err);
2758 pr_err("nested_ctl: %lld\n", control->nested_ctl);
2759 pr_err("nested_cr3: %016llx\n", control->nested_cr3);
2760 pr_err("event_inj: %08x\n", control->event_inj);
2761 pr_err("event_inj_err: %08x\n", control->event_inj_err);
2762 pr_err("lbr_ctl: %lld\n", control->lbr_ctl);
2763 pr_err("next_rip: %016llx\n", control->next_rip);
2764 pr_err("VMCB State Save Area:\n");
2765 pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n",
2766 save->es.selector, save->es.attrib,
2767 save->es.limit, save->es.base);
2768 pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n",
2769 save->cs.selector, save->cs.attrib,
2770 save->cs.limit, save->cs.base);
2771 pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n",
2772 save->ss.selector, save->ss.attrib,
2773 save->ss.limit, save->ss.base);
2774 pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n",
2775 save->ds.selector, save->ds.attrib,
2776 save->ds.limit, save->ds.base);
2777 pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n",
2778 save->fs.selector, save->fs.attrib,
2779 save->fs.limit, save->fs.base);
2780 pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n",
2781 save->gs.selector, save->gs.attrib,
2782 save->gs.limit, save->gs.base);
2783 pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
2784 save->gdtr.selector, save->gdtr.attrib,
2785 save->gdtr.limit, save->gdtr.base);
2786 pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
2787 save->ldtr.selector, save->ldtr.attrib,
2788 save->ldtr.limit, save->ldtr.base);
2789 pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
2790 save->idtr.selector, save->idtr.attrib,
2791 save->idtr.limit, save->idtr.base);
2792 pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n",
2793 save->tr.selector, save->tr.attrib,
2794 save->tr.limit, save->tr.base);
2795 pr_err("cpl: %d efer: %016llx\n",
2796 save->cpl, save->efer);
2797 pr_err("cr0: %016llx cr2: %016llx\n",
2798 save->cr0, save->cr2);
2799 pr_err("cr3: %016llx cr4: %016llx\n",
2800 save->cr3, save->cr4);
2801 pr_err("dr6: %016llx dr7: %016llx\n",
2802 save->dr6, save->dr7);
2803 pr_err("rip: %016llx rflags: %016llx\n",
2804 save->rip, save->rflags);
2805 pr_err("rsp: %016llx rax: %016llx\n",
2806 save->rsp, save->rax);
2807 pr_err("star: %016llx lstar: %016llx\n",
2808 save->star, save->lstar);
2809 pr_err("cstar: %016llx sfmask: %016llx\n",
2810 save->cstar, save->sfmask);
2811 pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n",
2812 save->kernel_gs_base, save->sysenter_cs);
2813 pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n",
2814 save->sysenter_esp, save->sysenter_eip);
2815 pr_err("gpat: %016llx dbgctl: %016llx\n",
2816 save->g_pat, save->dbgctl);
2817 pr_err("br_from: %016llx br_to: %016llx\n",
2818 save->br_from, save->br_to);
2819 pr_err("excp_from: %016llx excp_to: %016llx\n",
2820 save->last_excp_from, save->last_excp_to);
2821
2822}
2823
2729static int handle_exit(struct kvm_vcpu *vcpu) 2824static int handle_exit(struct kvm_vcpu *vcpu)
2730{ 2825{
2731 struct vcpu_svm *svm = to_svm(vcpu); 2826 struct vcpu_svm *svm = to_svm(vcpu);
@@ -2770,6 +2865,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2770 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2865 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2771 kvm_run->fail_entry.hardware_entry_failure_reason 2866 kvm_run->fail_entry.hardware_entry_failure_reason
2772 = svm->vmcb->control.exit_code; 2867 = svm->vmcb->control.exit_code;
2868 pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
2869 dump_vmcb(vcpu);
2773 return 0; 2870 return 0;
2774 } 2871 }
2775 2872
@@ -2826,9 +2923,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2826{ 2923{
2827 struct vmcb_control_area *control; 2924 struct vmcb_control_area *control;
2828 2925
2829 trace_kvm_inj_virq(irq);
2830
2831 ++svm->vcpu.stat.irq_injections;
2832 control = &svm->vmcb->control; 2926 control = &svm->vmcb->control;
2833 control->int_vector = irq; 2927 control->int_vector = irq;
2834 control->int_ctl &= ~V_INTR_PRIO_MASK; 2928 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -2842,6 +2936,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
2842 2936
2843 BUG_ON(!(gif_set(svm))); 2937 BUG_ON(!(gif_set(svm)));
2844 2938
2939 trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
2940 ++vcpu->stat.irq_injections;
2941
2845 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | 2942 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
2846 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; 2943 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2847} 2944}
@@ -3327,6 +3424,11 @@ static bool svm_rdtscp_supported(void)
3327 return false; 3424 return false;
3328} 3425}
3329 3426
3427static bool svm_has_wbinvd_exit(void)
3428{
3429 return true;
3430}
3431
3330static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) 3432static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3331{ 3433{
3332 struct vcpu_svm *svm = to_svm(vcpu); 3434 struct vcpu_svm *svm = to_svm(vcpu);
@@ -3411,6 +3513,8 @@ static struct kvm_x86_ops svm_x86_ops = {
3411 .rdtscp_supported = svm_rdtscp_supported, 3513 .rdtscp_supported = svm_rdtscp_supported,
3412 3514
3413 .set_supported_cpuid = svm_set_supported_cpuid, 3515 .set_supported_cpuid = svm_set_supported_cpuid,
3516
3517 .has_wbinvd_exit = svm_has_wbinvd_exit,
3414}; 3518};
3415 3519
3416static int __init svm_init(void) 3520static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 4ddadb1a5ffe..e16a0dbe74d8 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -1,3 +1,17 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * timer support
8 *
 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory.
13 */
14
1#include <linux/kvm_host.h> 15#include <linux/kvm_host.h>
2#include <linux/kvm.h> 16#include <linux/kvm.h>
3#include <linux/hrtimer.h> 17#include <linux/hrtimer.h>
@@ -18,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
18 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 32 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
19 atomic_inc(&ktimer->pending); 33 atomic_inc(&ktimer->pending);
20 /* FIXME: this code should not know anything about vcpus */ 34 /* FIXME: this code should not know anything about vcpus */
21 set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 35 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
22 } 36 }
23 37
24 if (waitqueue_active(q)) 38 if (waitqueue_active(q))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ee03679efe78..27a0222c2946 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,6 +5,7 @@
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 * 9 *
9 * Authors: 10 * Authors:
10 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
@@ -36,6 +37,8 @@
36#include <asm/vmx.h> 37#include <asm/vmx.h>
37#include <asm/virtext.h> 38#include <asm/virtext.h>
38#include <asm/mce.h> 39#include <asm/mce.h>
40#include <asm/i387.h>
41#include <asm/xcr.h>
39 42
40#include "trace.h" 43#include "trace.h"
41 44
@@ -63,6 +66,9 @@ module_param_named(unrestricted_guest,
63static int __read_mostly emulate_invalid_guest_state = 0; 66static int __read_mostly emulate_invalid_guest_state = 0;
64module_param(emulate_invalid_guest_state, bool, S_IRUGO); 67module_param(emulate_invalid_guest_state, bool, S_IRUGO);
65 68
69static int __read_mostly vmm_exclusive = 1;
70module_param(vmm_exclusive, bool, S_IRUGO);
71
66#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 72#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
67 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 73 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
68#define KVM_GUEST_CR0_MASK \ 74#define KVM_GUEST_CR0_MASK \
@@ -173,10 +179,13 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
173 179
174static int init_rmode(struct kvm *kvm); 180static int init_rmode(struct kvm *kvm);
175static u64 construct_eptp(unsigned long root_hpa); 181static u64 construct_eptp(unsigned long root_hpa);
182static void kvm_cpu_vmxon(u64 addr);
183static void kvm_cpu_vmxoff(void);
176 184
177static DEFINE_PER_CPU(struct vmcs *, vmxarea); 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
178static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
179static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 187static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
188static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
180 189
181static unsigned long *vmx_io_bitmap_a; 190static unsigned long *vmx_io_bitmap_a;
182static unsigned long *vmx_io_bitmap_b; 191static unsigned long *vmx_io_bitmap_b;
@@ -334,6 +343,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
334 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; 343 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
335} 344}
336 345
346static inline bool cpu_has_vmx_ept_4levels(void)
347{
348 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
349}
350
337static inline bool cpu_has_vmx_invept_individual_addr(void) 351static inline bool cpu_has_vmx_invept_individual_addr(void)
338{ 352{
339 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; 353 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -349,6 +363,16 @@ static inline bool cpu_has_vmx_invept_global(void)
349 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; 363 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
350} 364}
351 365
366static inline bool cpu_has_vmx_invvpid_single(void)
367{
368 return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
369}
370
371static inline bool cpu_has_vmx_invvpid_global(void)
372{
373 return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
374}
375
352static inline bool cpu_has_vmx_ept(void) 376static inline bool cpu_has_vmx_ept(void)
353{ 377{
354 return vmcs_config.cpu_based_2nd_exec_ctrl & 378 return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -389,6 +413,12 @@ static inline bool cpu_has_virtual_nmis(void)
389 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 413 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
390} 414}
391 415
416static inline bool cpu_has_vmx_wbinvd_exit(void)
417{
418 return vmcs_config.cpu_based_2nd_exec_ctrl &
419 SECONDARY_EXEC_WBINVD_EXITING;
420}
421
392static inline bool report_flexpriority(void) 422static inline bool report_flexpriority(void)
393{ 423{
394 return flexpriority_enabled; 424 return flexpriority_enabled;
@@ -453,6 +483,19 @@ static void vmcs_clear(struct vmcs *vmcs)
453 vmcs, phys_addr); 483 vmcs, phys_addr);
454} 484}
455 485
486static void vmcs_load(struct vmcs *vmcs)
487{
488 u64 phys_addr = __pa(vmcs);
489 u8 error;
490
491 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
492 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
493 : "cc", "memory");
494 if (error)
495 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
496 vmcs, phys_addr);
497}
498
456static void __vcpu_clear(void *arg) 499static void __vcpu_clear(void *arg)
457{ 500{
458 struct vcpu_vmx *vmx = arg; 501 struct vcpu_vmx *vmx = arg;
@@ -475,12 +518,27 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
475 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); 518 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
476} 519}
477 520
478static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) 521static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
479{ 522{
480 if (vmx->vpid == 0) 523 if (vmx->vpid == 0)
481 return; 524 return;
482 525
483 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); 526 if (cpu_has_vmx_invvpid_single())
527 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
528}
529
530static inline void vpid_sync_vcpu_global(void)
531{
532 if (cpu_has_vmx_invvpid_global())
533 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
534}
535
536static inline void vpid_sync_context(struct vcpu_vmx *vmx)
537{
538 if (cpu_has_vmx_invvpid_single())
539 vpid_sync_vcpu_single(vmx);
540 else
541 vpid_sync_vcpu_global();
484} 542}
485 543
486static inline void ept_sync_global(void) 544static inline void ept_sync_global(void)
@@ -812,6 +870,9 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
812 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 870 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
813 } 871 }
814#endif 872#endif
873 if (current_thread_info()->status & TS_USEDFPU)
874 clts();
875 load_gdt(&__get_cpu_var(host_gdt));
815} 876}
816 877
817static void vmx_load_host_state(struct vcpu_vmx *vmx) 878static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -828,35 +889,30 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
828static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 889static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
829{ 890{
830 struct vcpu_vmx *vmx = to_vmx(vcpu); 891 struct vcpu_vmx *vmx = to_vmx(vcpu);
831 u64 phys_addr = __pa(vmx->vmcs);
832 u64 tsc_this, delta, new_offset; 892 u64 tsc_this, delta, new_offset;
893 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
833 894
834 if (vcpu->cpu != cpu) { 895 if (!vmm_exclusive)
896 kvm_cpu_vmxon(phys_addr);
897 else if (vcpu->cpu != cpu)
835 vcpu_clear(vmx); 898 vcpu_clear(vmx);
836 kvm_migrate_timers(vcpu);
837 set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
838 local_irq_disable();
839 list_add(&vmx->local_vcpus_link,
840 &per_cpu(vcpus_on_cpu, cpu));
841 local_irq_enable();
842 }
843 899
844 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 900 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
845 u8 error;
846
847 per_cpu(current_vmcs, cpu) = vmx->vmcs; 901 per_cpu(current_vmcs, cpu) = vmx->vmcs;
848 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 902 vmcs_load(vmx->vmcs);
849 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
850 : "cc");
851 if (error)
852 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
853 vmx->vmcs, phys_addr);
854 } 903 }
855 904
856 if (vcpu->cpu != cpu) { 905 if (vcpu->cpu != cpu) {
857 struct desc_ptr dt; 906 struct desc_ptr dt;
858 unsigned long sysenter_esp; 907 unsigned long sysenter_esp;
859 908
909 kvm_migrate_timers(vcpu);
910 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
911 local_irq_disable();
912 list_add(&vmx->local_vcpus_link,
913 &per_cpu(vcpus_on_cpu, cpu));
914 local_irq_enable();
915
860 vcpu->cpu = cpu; 916 vcpu->cpu = cpu;
861 /* 917 /*
862 * Linux uses per-cpu TSS and GDT, so set these when switching 918 * Linux uses per-cpu TSS and GDT, so set these when switching
@@ -884,6 +940,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
884static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 940static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
885{ 941{
886 __vmx_load_host_state(to_vmx(vcpu)); 942 __vmx_load_host_state(to_vmx(vcpu));
943 if (!vmm_exclusive) {
944 __vcpu_clear(to_vmx(vcpu));
945 kvm_cpu_vmxoff();
946 }
887} 947}
888 948
889static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 949static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -1286,6 +1346,13 @@ static __init int vmx_disabled_by_bios(void)
1286 /* locked but not enabled */ 1346 /* locked but not enabled */
1287} 1347}
1288 1348
1349static void kvm_cpu_vmxon(u64 addr)
1350{
1351 asm volatile (ASM_VMX_VMXON_RAX
1352 : : "a"(&addr), "m"(addr)
1353 : "memory", "cc");
1354}
1355
1289static int hardware_enable(void *garbage) 1356static int hardware_enable(void *garbage)
1290{ 1357{
1291 int cpu = raw_smp_processor_id(); 1358 int cpu = raw_smp_processor_id();
@@ -1308,11 +1375,13 @@ static int hardware_enable(void *garbage)
1308 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); 1375 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1309 } 1376 }
1310 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1377 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1311 asm volatile (ASM_VMX_VMXON_RAX
1312 : : "a"(&phys_addr), "m"(phys_addr)
1313 : "memory", "cc");
1314 1378
1315 ept_sync_global(); 1379 if (vmm_exclusive) {
1380 kvm_cpu_vmxon(phys_addr);
1381 ept_sync_global();
1382 }
1383
1384 store_gdt(&__get_cpu_var(host_gdt));
1316 1385
1317 return 0; 1386 return 0;
1318} 1387}
@@ -1334,13 +1403,15 @@ static void vmclear_local_vcpus(void)
1334static void kvm_cpu_vmxoff(void) 1403static void kvm_cpu_vmxoff(void)
1335{ 1404{
1336 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 1405 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1337 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1338} 1406}
1339 1407
1340static void hardware_disable(void *garbage) 1408static void hardware_disable(void *garbage)
1341{ 1409{
1342 vmclear_local_vcpus(); 1410 if (vmm_exclusive) {
1343 kvm_cpu_vmxoff(); 1411 vmclear_local_vcpus();
1412 kvm_cpu_vmxoff();
1413 }
1414 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1344} 1415}
1345 1416
1346static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 1417static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -1539,7 +1610,8 @@ static __init int hardware_setup(void)
1539 if (!cpu_has_vmx_vpid()) 1610 if (!cpu_has_vmx_vpid())
1540 enable_vpid = 0; 1611 enable_vpid = 0;
1541 1612
1542 if (!cpu_has_vmx_ept()) { 1613 if (!cpu_has_vmx_ept() ||
1614 !cpu_has_vmx_ept_4levels()) {
1543 enable_ept = 0; 1615 enable_ept = 0;
1544 enable_unrestricted_guest = 0; 1616 enable_unrestricted_guest = 0;
1545 } 1617 }
@@ -1628,7 +1700,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
1628 gfn_t base_gfn; 1700 gfn_t base_gfn;
1629 1701
1630 slots = kvm_memslots(kvm); 1702 slots = kvm_memslots(kvm);
1631 base_gfn = kvm->memslots->memslots[0].base_gfn + 1703 base_gfn = slots->memslots[0].base_gfn +
1632 kvm->memslots->memslots[0].npages - 3; 1704 kvm->memslots->memslots[0].npages - 3;
1633 return base_gfn << PAGE_SHIFT; 1705 return base_gfn << PAGE_SHIFT;
1634 } 1706 }
@@ -1759,9 +1831,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1759 1831
1760static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1832static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1761{ 1833{
1762 vpid_sync_vcpu_all(to_vmx(vcpu)); 1834 vpid_sync_context(to_vmx(vcpu));
1763 if (enable_ept) 1835 if (enable_ept) {
1836 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1837 return;
1764 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1838 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1839 }
1765} 1840}
1766 1841
1767static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1842static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -2507,7 +2582,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2507 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 2582 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2508 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 2583 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
2509 2584
2510 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ 2585 vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
2511 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ 2586 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
2512 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 2587 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
2513 2588
@@ -2599,21 +2674,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2599 2674
2600static int init_rmode(struct kvm *kvm) 2675static int init_rmode(struct kvm *kvm)
2601{ 2676{
2677 int idx, ret = 0;
2678
2679 idx = srcu_read_lock(&kvm->srcu);
2602 if (!init_rmode_tss(kvm)) 2680 if (!init_rmode_tss(kvm))
2603 return 0; 2681 goto exit;
2604 if (!init_rmode_identity_map(kvm)) 2682 if (!init_rmode_identity_map(kvm))
2605 return 0; 2683 goto exit;
2606 return 1; 2684
2685 ret = 1;
2686exit:
2687 srcu_read_unlock(&kvm->srcu, idx);
2688 return ret;
2607} 2689}
2608 2690
2609static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 2691static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2610{ 2692{
2611 struct vcpu_vmx *vmx = to_vmx(vcpu); 2693 struct vcpu_vmx *vmx = to_vmx(vcpu);
2612 u64 msr; 2694 u64 msr;
2613 int ret, idx; 2695 int ret;
2614 2696
2615 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2697 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2616 idx = srcu_read_lock(&vcpu->kvm->srcu);
2617 if (!init_rmode(vmx->vcpu.kvm)) { 2698 if (!init_rmode(vmx->vcpu.kvm)) {
2618 ret = -ENOMEM; 2699 ret = -ENOMEM;
2619 goto out; 2700 goto out;
@@ -2630,7 +2711,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2630 msr |= MSR_IA32_APICBASE_BSP; 2711 msr |= MSR_IA32_APICBASE_BSP;
2631 kvm_set_apic_base(&vmx->vcpu, msr); 2712 kvm_set_apic_base(&vmx->vcpu, msr);
2632 2713
2633 fx_init(&vmx->vcpu); 2714 ret = fx_init(&vmx->vcpu);
2715 if (ret != 0)
2716 goto out;
2634 2717
2635 seg_setup(VCPU_SREG_CS); 2718 seg_setup(VCPU_SREG_CS);
2636 /* 2719 /*
@@ -2713,7 +2796,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2713 vmx_fpu_activate(&vmx->vcpu); 2796 vmx_fpu_activate(&vmx->vcpu);
2714 update_exception_bitmap(&vmx->vcpu); 2797 update_exception_bitmap(&vmx->vcpu);
2715 2798
2716 vpid_sync_vcpu_all(vmx); 2799 vpid_sync_context(vmx);
2717 2800
2718 ret = 0; 2801 ret = 0;
2719 2802
@@ -2721,7 +2804,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2721 vmx->emulation_required = 0; 2804 vmx->emulation_required = 0;
2722 2805
2723out: 2806out:
2724 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2725 return ret; 2807 return ret;
2726} 2808}
2727 2809
@@ -2826,9 +2908,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2826{ 2908{
2827 if (!cpu_has_virtual_nmis()) 2909 if (!cpu_has_virtual_nmis())
2828 return to_vmx(vcpu)->soft_vnmi_blocked; 2910 return to_vmx(vcpu)->soft_vnmi_blocked;
2829 else 2911 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
2830 return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2831 GUEST_INTR_STATE_NMI);
2832} 2912}
2833 2913
2834static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 2914static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3070,7 +3150,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
3070 ++vcpu->stat.io_exits; 3150 ++vcpu->stat.io_exits;
3071 3151
3072 if (string || in) 3152 if (string || in)
3073 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); 3153 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
3074 3154
3075 port = exit_qualification >> 16; 3155 port = exit_qualification >> 16;
3076 size = (exit_qualification & 7) + 1; 3156 size = (exit_qualification & 7) + 1;
@@ -3090,11 +3170,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3090 hypercall[2] = 0xc1; 3170 hypercall[2] = 0xc1;
3091} 3171}
3092 3172
3173static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
3174{
3175 if (err)
3176 kvm_inject_gp(vcpu, 0);
3177 else
3178 skip_emulated_instruction(vcpu);
3179}
3180
3093static int handle_cr(struct kvm_vcpu *vcpu) 3181static int handle_cr(struct kvm_vcpu *vcpu)
3094{ 3182{
3095 unsigned long exit_qualification, val; 3183 unsigned long exit_qualification, val;
3096 int cr; 3184 int cr;
3097 int reg; 3185 int reg;
3186 int err;
3098 3187
3099 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3188 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3100 cr = exit_qualification & 15; 3189 cr = exit_qualification & 15;
@@ -3105,16 +3194,16 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3105 trace_kvm_cr_write(cr, val); 3194 trace_kvm_cr_write(cr, val);
3106 switch (cr) { 3195 switch (cr) {
3107 case 0: 3196 case 0:
3108 kvm_set_cr0(vcpu, val); 3197 err = kvm_set_cr0(vcpu, val);
3109 skip_emulated_instruction(vcpu); 3198 complete_insn_gp(vcpu, err);
3110 return 1; 3199 return 1;
3111 case 3: 3200 case 3:
3112 kvm_set_cr3(vcpu, val); 3201 err = kvm_set_cr3(vcpu, val);
3113 skip_emulated_instruction(vcpu); 3202 complete_insn_gp(vcpu, err);
3114 return 1; 3203 return 1;
3115 case 4: 3204 case 4:
3116 kvm_set_cr4(vcpu, val); 3205 err = kvm_set_cr4(vcpu, val);
3117 skip_emulated_instruction(vcpu); 3206 complete_insn_gp(vcpu, err);
3118 return 1; 3207 return 1;
3119 case 8: { 3208 case 8: {
3120 u8 cr8_prev = kvm_get_cr8(vcpu); 3209 u8 cr8_prev = kvm_get_cr8(vcpu);
@@ -3321,30 +3410,25 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
3321static int handle_wbinvd(struct kvm_vcpu *vcpu) 3410static int handle_wbinvd(struct kvm_vcpu *vcpu)
3322{ 3411{
3323 skip_emulated_instruction(vcpu); 3412 skip_emulated_instruction(vcpu);
3324 /* TODO: Add support for VT-d/pass-through device */ 3413 kvm_emulate_wbinvd(vcpu);
3325 return 1; 3414 return 1;
3326} 3415}
3327 3416
3328static int handle_apic_access(struct kvm_vcpu *vcpu) 3417static int handle_xsetbv(struct kvm_vcpu *vcpu)
3329{ 3418{
3330 unsigned long exit_qualification; 3419 u64 new_bv = kvm_read_edx_eax(vcpu);
3331 enum emulation_result er; 3420 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3332 unsigned long offset;
3333 3421
3334 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3422 if (kvm_set_xcr(vcpu, index, new_bv) == 0)
3335 offset = exit_qualification & 0xffful; 3423 skip_emulated_instruction(vcpu);
3336
3337 er = emulate_instruction(vcpu, 0, 0, 0);
3338
3339 if (er != EMULATE_DONE) {
3340 printk(KERN_ERR
3341 "Fail to handle apic access vmexit! Offset is 0x%lx\n",
3342 offset);
3343 return -ENOEXEC;
3344 }
3345 return 1; 3424 return 1;
3346} 3425}
3347 3426
3427static int handle_apic_access(struct kvm_vcpu *vcpu)
3428{
3429 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
3430}
3431
3348static int handle_task_switch(struct kvm_vcpu *vcpu) 3432static int handle_task_switch(struct kvm_vcpu *vcpu)
3349{ 3433{
3350 struct vcpu_vmx *vmx = to_vmx(vcpu); 3434 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3554,13 +3638,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3554 goto out; 3638 goto out;
3555 } 3639 }
3556 3640
3557 if (err != EMULATE_DONE) { 3641 if (err != EMULATE_DONE)
3558 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3642 return 0;
3559 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3560 vcpu->run->internal.ndata = 0;
3561 ret = 0;
3562 goto out;
3563 }
3564 3643
3565 if (signal_pending(current)) 3644 if (signal_pending(current))
3566 goto out; 3645 goto out;
@@ -3623,6 +3702,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3623 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 3702 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
3624 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 3703 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
3625 [EXIT_REASON_WBINVD] = handle_wbinvd, 3704 [EXIT_REASON_WBINVD] = handle_wbinvd,
3705 [EXIT_REASON_XSETBV] = handle_xsetbv,
3626 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 3706 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
3627 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3707 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3628 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3708 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
@@ -3656,6 +3736,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3656 if (enable_ept && is_paging(vcpu)) 3736 if (enable_ept && is_paging(vcpu))
3657 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3737 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3658 3738
3739 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3740 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3741 vcpu->run->fail_entry.hardware_entry_failure_reason
3742 = exit_reason;
3743 return 0;
3744 }
3745
3659 if (unlikely(vmx->fail)) { 3746 if (unlikely(vmx->fail)) {
3660 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3747 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3661 vcpu->run->fail_entry.hardware_entry_failure_reason 3748 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3861,11 +3948,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3861 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 3948 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3862 vmx_set_interrupt_shadow(vcpu, 0); 3949 vmx_set_interrupt_shadow(vcpu, 0);
3863 3950
3864 /*
3865 * Loading guest fpu may have cleared host cr0.ts
3866 */
3867 vmcs_writel(HOST_CR0, read_cr0());
3868
3869 asm( 3951 asm(
3870 /* Store host registers */ 3952 /* Store host registers */
3871 "push %%"R"dx; push %%"R"bp;" 3953 "push %%"R"dx; push %%"R"bp;"
@@ -4001,6 +4083,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
4001 kmem_cache_free(kvm_vcpu_cache, vmx); 4083 kmem_cache_free(kvm_vcpu_cache, vmx);
4002} 4084}
4003 4085
4086static inline void vmcs_init(struct vmcs *vmcs)
4087{
4088 u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
4089
4090 if (!vmm_exclusive)
4091 kvm_cpu_vmxon(phys_addr);
4092
4093 vmcs_clear(vmcs);
4094
4095 if (!vmm_exclusive)
4096 kvm_cpu_vmxoff();
4097}
4098
4004static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 4099static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4005{ 4100{
4006 int err; 4101 int err;
@@ -4026,7 +4121,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4026 if (!vmx->vmcs) 4121 if (!vmx->vmcs)
4027 goto free_msrs; 4122 goto free_msrs;
4028 4123
4029 vmcs_clear(vmx->vmcs); 4124 vmcs_init(vmx->vmcs);
4030 4125
4031 cpu = get_cpu(); 4126 cpu = get_cpu();
4032 vmx_vcpu_load(&vmx->vcpu, cpu); 4127 vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4265,6 +4360,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
4265 .rdtscp_supported = vmx_rdtscp_supported, 4360 .rdtscp_supported = vmx_rdtscp_supported,
4266 4361
4267 .set_supported_cpuid = vmx_set_supported_cpuid, 4362 .set_supported_cpuid = vmx_set_supported_cpuid,
4363
4364 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
4268}; 4365};
4269 4366
4270static int __init vmx_init(void) 4367static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7fa89c39c64f..97aab036dabf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008 8 * Copyright IBM Corporation, 2008
 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * 10 *
10 * Authors: 11 * Authors:
11 * Avi Kivity <avi@qumranet.com> 12 * Avi Kivity <avi@qumranet.com>
@@ -41,17 +42,19 @@
41#include <linux/srcu.h> 42#include <linux/srcu.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
43#include <linux/perf_event.h> 44#include <linux/perf_event.h>
45#include <linux/uaccess.h>
44#include <trace/events/kvm.h> 46#include <trace/events/kvm.h>
45 47
46#define CREATE_TRACE_POINTS 48#define CREATE_TRACE_POINTS
47#include "trace.h" 49#include "trace.h"
48 50
49#include <asm/debugreg.h> 51#include <asm/debugreg.h>
50#include <asm/uaccess.h>
51#include <asm/msr.h> 52#include <asm/msr.h>
52#include <asm/desc.h> 53#include <asm/desc.h>
53#include <asm/mtrr.h> 54#include <asm/mtrr.h>
54#include <asm/mce.h> 55#include <asm/mce.h>
56#include <asm/i387.h>
57#include <asm/xcr.h>
55 58
56#define MAX_IO_MSRS 256 59#define MAX_IO_MSRS 256
57#define CR0_RESERVED_BITS \ 60#define CR0_RESERVED_BITS \
@@ -62,6 +65,7 @@
62 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 65 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
63 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 66 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
64 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 67 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
68 | X86_CR4_OSXSAVE \
65 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 69 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
66 70
67#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 71#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -147,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
147 { NULL } 151 { NULL }
148}; 152};
149 153
154u64 __read_mostly host_xcr0;
155
156static inline u32 bit(int bitno)
157{
158 return 1 << (bitno & 31);
159}
160
150static void kvm_on_user_return(struct user_return_notifier *urn) 161static void kvm_on_user_return(struct user_return_notifier *urn)
151{ 162{
152 unsigned slot; 163 unsigned slot;
@@ -285,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
285 prev_nr = vcpu->arch.exception.nr; 296 prev_nr = vcpu->arch.exception.nr;
286 if (prev_nr == DF_VECTOR) { 297 if (prev_nr == DF_VECTOR) {
287 /* triple fault -> shutdown */ 298 /* triple fault -> shutdown */
288 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 299 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
289 return; 300 return;
290 } 301 }
291 class1 = exception_class(prev_nr); 302 class1 = exception_class(prev_nr);
@@ -414,121 +425,163 @@ out:
414 return changed; 425 return changed;
415} 426}
416 427
417void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 428int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
418{ 429{
430 unsigned long old_cr0 = kvm_read_cr0(vcpu);
431 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
432 X86_CR0_CD | X86_CR0_NW;
433
419 cr0 |= X86_CR0_ET; 434 cr0 |= X86_CR0_ET;
420 435
421#ifdef CONFIG_X86_64 436#ifdef CONFIG_X86_64
422 if (cr0 & 0xffffffff00000000UL) { 437 if (cr0 & 0xffffffff00000000UL)
423 kvm_inject_gp(vcpu, 0); 438 return 1;
424 return;
425 }
426#endif 439#endif
427 440
428 cr0 &= ~CR0_RESERVED_BITS; 441 cr0 &= ~CR0_RESERVED_BITS;
429 442
430 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 443 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
431 kvm_inject_gp(vcpu, 0); 444 return 1;
432 return;
433 }
434 445
435 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 446 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
436 kvm_inject_gp(vcpu, 0); 447 return 1;
437 return;
438 }
439 448
440 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 449 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
441#ifdef CONFIG_X86_64 450#ifdef CONFIG_X86_64
442 if ((vcpu->arch.efer & EFER_LME)) { 451 if ((vcpu->arch.efer & EFER_LME)) {
443 int cs_db, cs_l; 452 int cs_db, cs_l;
444 453
445 if (!is_pae(vcpu)) { 454 if (!is_pae(vcpu))
446 kvm_inject_gp(vcpu, 0); 455 return 1;
447 return;
448 }
449 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 456 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
450 if (cs_l) { 457 if (cs_l)
451 kvm_inject_gp(vcpu, 0); 458 return 1;
452 return;
453
454 }
455 } else 459 } else
456#endif 460#endif
457 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 461 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
458 kvm_inject_gp(vcpu, 0); 462 return 1;
459 return;
460 }
461
462 } 463 }
463 464
464 kvm_x86_ops->set_cr0(vcpu, cr0); 465 kvm_x86_ops->set_cr0(vcpu, cr0);
465 466
466 kvm_mmu_reset_context(vcpu); 467 if ((cr0 ^ old_cr0) & update_bits)
467 return; 468 kvm_mmu_reset_context(vcpu);
469 return 0;
468} 470}
469EXPORT_SYMBOL_GPL(kvm_set_cr0); 471EXPORT_SYMBOL_GPL(kvm_set_cr0);
470 472
471void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 473void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
472{ 474{
473 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); 475 (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
474} 476}
475EXPORT_SYMBOL_GPL(kvm_lmsw); 477EXPORT_SYMBOL_GPL(kvm_lmsw);
476 478
477void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 479int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
478{ 480{
479 unsigned long old_cr4 = kvm_read_cr4(vcpu); 481 u64 xcr0;
480 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
481 482
482 if (cr4 & CR4_RESERVED_BITS) { 483 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
484 if (index != XCR_XFEATURE_ENABLED_MASK)
485 return 1;
486 xcr0 = xcr;
487 if (kvm_x86_ops->get_cpl(vcpu) != 0)
488 return 1;
489 if (!(xcr0 & XSTATE_FP))
490 return 1;
491 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
492 return 1;
493 if (xcr0 & ~host_xcr0)
494 return 1;
495 vcpu->arch.xcr0 = xcr0;
496 vcpu->guest_xcr0_loaded = 0;
497 return 0;
498}
499
500int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
501{
502 if (__kvm_set_xcr(vcpu, index, xcr)) {
483 kvm_inject_gp(vcpu, 0); 503 kvm_inject_gp(vcpu, 0);
504 return 1;
505 }
506 return 0;
507}
508EXPORT_SYMBOL_GPL(kvm_set_xcr);
509
510static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
511{
512 struct kvm_cpuid_entry2 *best;
513
514 best = kvm_find_cpuid_entry(vcpu, 1, 0);
515 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
516}
517
518static void update_cpuid(struct kvm_vcpu *vcpu)
519{
520 struct kvm_cpuid_entry2 *best;
521
522 best = kvm_find_cpuid_entry(vcpu, 1, 0);
523 if (!best)
484 return; 524 return;
525
526 /* Update OSXSAVE bit */
527 if (cpu_has_xsave && best->function == 0x1) {
528 best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
529 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
530 best->ecx |= bit(X86_FEATURE_OSXSAVE);
485 } 531 }
532}
533
534int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
535{
536 unsigned long old_cr4 = kvm_read_cr4(vcpu);
537 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
538
539 if (cr4 & CR4_RESERVED_BITS)
540 return 1;
541
542 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
543 return 1;
486 544
487 if (is_long_mode(vcpu)) { 545 if (is_long_mode(vcpu)) {
488 if (!(cr4 & X86_CR4_PAE)) { 546 if (!(cr4 & X86_CR4_PAE))
489 kvm_inject_gp(vcpu, 0); 547 return 1;
490 return;
491 }
492 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 548 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
493 && ((cr4 ^ old_cr4) & pdptr_bits) 549 && ((cr4 ^ old_cr4) & pdptr_bits)
494 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 550 && !load_pdptrs(vcpu, vcpu->arch.cr3))
495 kvm_inject_gp(vcpu, 0); 551 return 1;
496 return; 552
497 } 553 if (cr4 & X86_CR4_VMXE)
554 return 1;
498 555
499 if (cr4 & X86_CR4_VMXE) {
500 kvm_inject_gp(vcpu, 0);
501 return;
502 }
503 kvm_x86_ops->set_cr4(vcpu, cr4); 556 kvm_x86_ops->set_cr4(vcpu, cr4);
504 vcpu->arch.cr4 = cr4; 557
505 kvm_mmu_reset_context(vcpu); 558 if ((cr4 ^ old_cr4) & pdptr_bits)
559 kvm_mmu_reset_context(vcpu);
560
561 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
562 update_cpuid(vcpu);
563
564 return 0;
506} 565}
507EXPORT_SYMBOL_GPL(kvm_set_cr4); 566EXPORT_SYMBOL_GPL(kvm_set_cr4);
508 567
509void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 568int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
510{ 569{
511 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 570 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
512 kvm_mmu_sync_roots(vcpu); 571 kvm_mmu_sync_roots(vcpu);
513 kvm_mmu_flush_tlb(vcpu); 572 kvm_mmu_flush_tlb(vcpu);
514 return; 573 return 0;
515 } 574 }
516 575
517 if (is_long_mode(vcpu)) { 576 if (is_long_mode(vcpu)) {
518 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 577 if (cr3 & CR3_L_MODE_RESERVED_BITS)
519 kvm_inject_gp(vcpu, 0); 578 return 1;
520 return;
521 }
522 } else { 579 } else {
523 if (is_pae(vcpu)) { 580 if (is_pae(vcpu)) {
524 if (cr3 & CR3_PAE_RESERVED_BITS) { 581 if (cr3 & CR3_PAE_RESERVED_BITS)
525 kvm_inject_gp(vcpu, 0); 582 return 1;
526 return; 583 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
527 } 584 return 1;
528 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
529 kvm_inject_gp(vcpu, 0);
530 return;
531 }
532 } 585 }
533 /* 586 /*
534 * We don't check reserved bits in nonpae mode, because 587 * We don't check reserved bits in nonpae mode, because
@@ -546,24 +599,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
546 * to debug) behavior on the guest side. 599 * to debug) behavior on the guest side.
547 */ 600 */
548 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 601 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
549 kvm_inject_gp(vcpu, 0); 602 return 1;
550 else { 603 vcpu->arch.cr3 = cr3;
551 vcpu->arch.cr3 = cr3; 604 vcpu->arch.mmu.new_cr3(vcpu);
552 vcpu->arch.mmu.new_cr3(vcpu); 605 return 0;
553 }
554} 606}
555EXPORT_SYMBOL_GPL(kvm_set_cr3); 607EXPORT_SYMBOL_GPL(kvm_set_cr3);
556 608
557void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 609int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
558{ 610{
559 if (cr8 & CR8_RESERVED_BITS) { 611 if (cr8 & CR8_RESERVED_BITS)
560 kvm_inject_gp(vcpu, 0); 612 return 1;
561 return;
562 }
563 if (irqchip_in_kernel(vcpu->kvm)) 613 if (irqchip_in_kernel(vcpu->kvm))
564 kvm_lapic_set_tpr(vcpu, cr8); 614 kvm_lapic_set_tpr(vcpu, cr8);
565 else 615 else
566 vcpu->arch.cr8 = cr8; 616 vcpu->arch.cr8 = cr8;
617 return 0;
618}
619
620void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
621{
622 if (__kvm_set_cr8(vcpu, cr8))
623 kvm_inject_gp(vcpu, 0);
567} 624}
568EXPORT_SYMBOL_GPL(kvm_set_cr8); 625EXPORT_SYMBOL_GPL(kvm_set_cr8);
569 626
@@ -576,7 +633,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
576} 633}
577EXPORT_SYMBOL_GPL(kvm_get_cr8); 634EXPORT_SYMBOL_GPL(kvm_get_cr8);
578 635
579int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 636static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
580{ 637{
581 switch (dr) { 638 switch (dr) {
582 case 0 ... 3: 639 case 0 ... 3:
@@ -585,29 +642,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
585 vcpu->arch.eff_db[dr] = val; 642 vcpu->arch.eff_db[dr] = val;
586 break; 643 break;
587 case 4: 644 case 4:
588 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 645 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
589 kvm_queue_exception(vcpu, UD_VECTOR); 646 return 1; /* #UD */
590 return 1;
591 }
592 /* fall through */ 647 /* fall through */
593 case 6: 648 case 6:
594 if (val & 0xffffffff00000000ULL) { 649 if (val & 0xffffffff00000000ULL)
595 kvm_inject_gp(vcpu, 0); 650 return -1; /* #GP */
596 return 1;
597 }
598 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 651 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
599 break; 652 break;
600 case 5: 653 case 5:
601 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 654 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
602 kvm_queue_exception(vcpu, UD_VECTOR); 655 return 1; /* #UD */
603 return 1;
604 }
605 /* fall through */ 656 /* fall through */
606 default: /* 7 */ 657 default: /* 7 */
607 if (val & 0xffffffff00000000ULL) { 658 if (val & 0xffffffff00000000ULL)
608 kvm_inject_gp(vcpu, 0); 659 return -1; /* #GP */
609 return 1;
610 }
611 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 660 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
612 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 661 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
613 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); 662 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
@@ -618,28 +667,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
618 667
619 return 0; 668 return 0;
620} 669}
670
671int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
672{
673 int res;
674
675 res = __kvm_set_dr(vcpu, dr, val);
676 if (res > 0)
677 kvm_queue_exception(vcpu, UD_VECTOR);
678 else if (res < 0)
679 kvm_inject_gp(vcpu, 0);
680
681 return res;
682}
621EXPORT_SYMBOL_GPL(kvm_set_dr); 683EXPORT_SYMBOL_GPL(kvm_set_dr);
622 684
623int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 685static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
624{ 686{
625 switch (dr) { 687 switch (dr) {
626 case 0 ... 3: 688 case 0 ... 3:
627 *val = vcpu->arch.db[dr]; 689 *val = vcpu->arch.db[dr];
628 break; 690 break;
629 case 4: 691 case 4:
630 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 692 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
631 kvm_queue_exception(vcpu, UD_VECTOR);
632 return 1; 693 return 1;
633 }
634 /* fall through */ 694 /* fall through */
635 case 6: 695 case 6:
636 *val = vcpu->arch.dr6; 696 *val = vcpu->arch.dr6;
637 break; 697 break;
638 case 5: 698 case 5:
639 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 699 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
640 kvm_queue_exception(vcpu, UD_VECTOR);
641 return 1; 700 return 1;
642 }
643 /* fall through */ 701 /* fall through */
644 default: /* 7 */ 702 default: /* 7 */
645 *val = vcpu->arch.dr7; 703 *val = vcpu->arch.dr7;
@@ -648,12 +706,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
648 706
649 return 0; 707 return 0;
650} 708}
651EXPORT_SYMBOL_GPL(kvm_get_dr);
652 709
653static inline u32 bit(int bitno) 710int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
654{ 711{
655 return 1 << (bitno & 31); 712 if (_kvm_get_dr(vcpu, dr, val)) {
713 kvm_queue_exception(vcpu, UD_VECTOR);
714 return 1;
715 }
716 return 0;
656} 717}
718EXPORT_SYMBOL_GPL(kvm_get_dr);
657 719
658/* 720/*
659 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 721 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -682,10 +744,14 @@ static unsigned num_msrs_to_save;
682 744
683static u32 emulated_msrs[] = { 745static u32 emulated_msrs[] = {
684 MSR_IA32_MISC_ENABLE, 746 MSR_IA32_MISC_ENABLE,
747 MSR_IA32_MCG_STATUS,
748 MSR_IA32_MCG_CTL,
685}; 749};
686 750
687static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 751static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
688{ 752{
753 u64 old_efer = vcpu->arch.efer;
754
689 if (efer & efer_reserved_bits) 755 if (efer & efer_reserved_bits)
690 return 1; 756 return 1;
691 757
@@ -714,11 +780,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
714 780
715 kvm_x86_ops->set_efer(vcpu, efer); 781 kvm_x86_ops->set_efer(vcpu, efer);
716 782
717 vcpu->arch.efer = efer;
718
719 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 783 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
720 kvm_mmu_reset_context(vcpu); 784 kvm_mmu_reset_context(vcpu);
721 785
786 /* Update reserved bits */
787 if ((efer ^ old_efer) & EFER_NX)
788 kvm_mmu_reset_context(vcpu);
789
722 return 0; 790 return 0;
723} 791}
724 792
@@ -882,7 +950,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v)
882 950
883 if (!vcpu->time_page) 951 if (!vcpu->time_page)
884 return 0; 952 return 0;
885 set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); 953 kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
886 return 1; 954 return 1;
887} 955}
888 956
@@ -1524,16 +1592,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1524{ 1592{
1525 int i, idx; 1593 int i, idx;
1526 1594
1527 vcpu_load(vcpu);
1528
1529 idx = srcu_read_lock(&vcpu->kvm->srcu); 1595 idx = srcu_read_lock(&vcpu->kvm->srcu);
1530 for (i = 0; i < msrs->nmsrs; ++i) 1596 for (i = 0; i < msrs->nmsrs; ++i)
1531 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1597 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1532 break; 1598 break;
1533 srcu_read_unlock(&vcpu->kvm->srcu, idx); 1599 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1534 1600
1535 vcpu_put(vcpu);
1536
1537 return i; 1601 return i;
1538} 1602}
1539 1603
@@ -1618,6 +1682,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1618 case KVM_CAP_PCI_SEGMENT: 1682 case KVM_CAP_PCI_SEGMENT:
1619 case KVM_CAP_DEBUGREGS: 1683 case KVM_CAP_DEBUGREGS:
1620 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1684 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1685 case KVM_CAP_XSAVE:
1621 r = 1; 1686 r = 1;
1622 break; 1687 break;
1623 case KVM_CAP_COALESCED_MMIO: 1688 case KVM_CAP_COALESCED_MMIO:
@@ -1641,6 +1706,9 @@ int kvm_dev_ioctl_check_extension(long ext)
1641 case KVM_CAP_MCE: 1706 case KVM_CAP_MCE:
1642 r = KVM_MAX_MCE_BANKS; 1707 r = KVM_MAX_MCE_BANKS;
1643 break; 1708 break;
1709 case KVM_CAP_XCRS:
1710 r = cpu_has_xsave;
1711 break;
1644 default: 1712 default:
1645 r = 0; 1713 r = 0;
1646 break; 1714 break;
@@ -1717,8 +1785,28 @@ out:
1717 return r; 1785 return r;
1718} 1786}
1719 1787
1788static void wbinvd_ipi(void *garbage)
1789{
1790 wbinvd();
1791}
1792
1793static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
1794{
1795 return vcpu->kvm->arch.iommu_domain &&
1796 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
1797}
1798
1720void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1799void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1721{ 1800{
1801 /* Address WBINVD may be executed by guest */
1802 if (need_emulate_wbinvd(vcpu)) {
1803 if (kvm_x86_ops->has_wbinvd_exit())
1804 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
1805 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
1806 smp_call_function_single(vcpu->cpu,
1807 wbinvd_ipi, NULL, 1);
1808 }
1809
1722 kvm_x86_ops->vcpu_load(vcpu, cpu); 1810 kvm_x86_ops->vcpu_load(vcpu, cpu);
1723 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 1811 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1724 unsigned long khz = cpufreq_quick_get(cpu); 1812 unsigned long khz = cpufreq_quick_get(cpu);
@@ -1731,8 +1819,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1731 1819
1732void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1820void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1733{ 1821{
1734 kvm_put_guest_fpu(vcpu);
1735 kvm_x86_ops->vcpu_put(vcpu); 1822 kvm_x86_ops->vcpu_put(vcpu);
1823 kvm_put_guest_fpu(vcpu);
1736} 1824}
1737 1825
1738static int is_efer_nx(void) 1826static int is_efer_nx(void)
@@ -1781,7 +1869,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1781 if (copy_from_user(cpuid_entries, entries, 1869 if (copy_from_user(cpuid_entries, entries,
1782 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1870 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1783 goto out_free; 1871 goto out_free;
1784 vcpu_load(vcpu);
1785 for (i = 0; i < cpuid->nent; i++) { 1872 for (i = 0; i < cpuid->nent; i++) {
1786 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1873 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1787 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1874 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1799,7 +1886,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1799 r = 0; 1886 r = 0;
1800 kvm_apic_set_version(vcpu); 1887 kvm_apic_set_version(vcpu);
1801 kvm_x86_ops->cpuid_update(vcpu); 1888 kvm_x86_ops->cpuid_update(vcpu);
1802 vcpu_put(vcpu); 1889 update_cpuid(vcpu);
1803 1890
1804out_free: 1891out_free:
1805 vfree(cpuid_entries); 1892 vfree(cpuid_entries);
@@ -1820,11 +1907,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1820 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1907 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1821 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1908 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1822 goto out; 1909 goto out;
1823 vcpu_load(vcpu);
1824 vcpu->arch.cpuid_nent = cpuid->nent; 1910 vcpu->arch.cpuid_nent = cpuid->nent;
1825 kvm_apic_set_version(vcpu); 1911 kvm_apic_set_version(vcpu);
1826 kvm_x86_ops->cpuid_update(vcpu); 1912 kvm_x86_ops->cpuid_update(vcpu);
1827 vcpu_put(vcpu); 1913 update_cpuid(vcpu);
1828 return 0; 1914 return 0;
1829 1915
1830out: 1916out:
@@ -1837,7 +1923,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1837{ 1923{
1838 int r; 1924 int r;
1839 1925
1840 vcpu_load(vcpu);
1841 r = -E2BIG; 1926 r = -E2BIG;
1842 if (cpuid->nent < vcpu->arch.cpuid_nent) 1927 if (cpuid->nent < vcpu->arch.cpuid_nent)
1843 goto out; 1928 goto out;
@@ -1849,7 +1934,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1849 1934
1850out: 1935out:
1851 cpuid->nent = vcpu->arch.cpuid_nent; 1936 cpuid->nent = vcpu->arch.cpuid_nent;
1852 vcpu_put(vcpu);
1853 return r; 1937 return r;
1854} 1938}
1855 1939
@@ -1901,13 +1985,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1901 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1985 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1902 /* cpuid 1.ecx */ 1986 /* cpuid 1.ecx */
1903 const u32 kvm_supported_word4_x86_features = 1987 const u32 kvm_supported_word4_x86_features =
1904 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | 1988 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
1905 0 /* DS-CPL, VMX, SMX, EST */ | 1989 0 /* DS-CPL, VMX, SMX, EST */ |
1906 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 1990 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1907 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1991 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1908 0 /* Reserved, DCA */ | F(XMM4_1) | 1992 0 /* Reserved, DCA */ | F(XMM4_1) |
1909 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 1993 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1910 0 /* Reserved, XSAVE, OSXSAVE */; 1994 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
1911 /* cpuid 0x80000001.ecx */ 1995 /* cpuid 0x80000001.ecx */
1912 const u32 kvm_supported_word6_x86_features = 1996 const u32 kvm_supported_word6_x86_features =
1913 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 1997 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
@@ -1922,7 +2006,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1922 2006
1923 switch (function) { 2007 switch (function) {
1924 case 0: 2008 case 0:
1925 entry->eax = min(entry->eax, (u32)0xb); 2009 entry->eax = min(entry->eax, (u32)0xd);
1926 break; 2010 break;
1927 case 1: 2011 case 1:
1928 entry->edx &= kvm_supported_word0_x86_features; 2012 entry->edx &= kvm_supported_word0_x86_features;
@@ -1980,6 +2064,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1980 } 2064 }
1981 break; 2065 break;
1982 } 2066 }
2067 case 0xd: {
2068 int i;
2069
2070 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2071 for (i = 1; *nent < maxnent; ++i) {
2072 if (entry[i - 1].eax == 0 && i != 2)
2073 break;
2074 do_cpuid_1_ent(&entry[i], function, i);
2075 entry[i].flags |=
2076 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2077 ++*nent;
2078 }
2079 break;
2080 }
1983 case KVM_CPUID_SIGNATURE: { 2081 case KVM_CPUID_SIGNATURE: {
1984 char signature[12] = "KVMKVMKVM\0\0"; 2082 char signature[12] = "KVMKVMKVM\0\0";
1985 u32 *sigptr = (u32 *)signature; 2083 u32 *sigptr = (u32 *)signature;
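The new case 0xd enumerates the XSAVE state components, one subleaf per index, and marks every entry index-significant. A sketch of how a VMM could list those entries through KVM_GET_SUPPORTED_CPUID (the fixed nent of 128 and the already-open /dev/kvm fd are assumptions of the example):

    #include <linux/kvm.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>

    void print_xsave_leaves(int kvm_fd)
    {
        int nent = 128;                         /* assumed big enough */
        struct kvm_cpuid2 *cpuid =
            calloc(1, sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));

        if (!cpuid)
            return;
        cpuid->nent = nent;
        if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) < 0) {
            perror("KVM_GET_SUPPORTED_CPUID");
            free(cpuid);
            return;
        }
        for (unsigned i = 0; i < cpuid->nent; i++) {
            struct kvm_cpuid_entry2 *e = &cpuid->entries[i];

            if (e->function == 0xd)             /* one entry per subleaf */
                printf("0xd.%u: eax=%#x ebx=%#x ecx=%#x edx=%#x\n",
                       e->index, e->eax, e->ebx, e->ecx, e->edx);
        }
        free(cpuid);
    }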
@@ -2081,9 +2179,7 @@ out:
2081static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2179static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2082 struct kvm_lapic_state *s) 2180 struct kvm_lapic_state *s)
2083{ 2181{
2084 vcpu_load(vcpu);
2085 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 2182 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
2086 vcpu_put(vcpu);
2087 2183
2088 return 0; 2184 return 0;
2089} 2185}
@@ -2091,11 +2187,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2091static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 2187static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2092 struct kvm_lapic_state *s) 2188 struct kvm_lapic_state *s)
2093{ 2189{
2094 vcpu_load(vcpu);
2095 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 2190 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
2096 kvm_apic_post_state_restore(vcpu); 2191 kvm_apic_post_state_restore(vcpu);
2097 update_cr8_intercept(vcpu); 2192 update_cr8_intercept(vcpu);
2098 vcpu_put(vcpu);
2099 2193
2100 return 0; 2194 return 0;
2101} 2195}
@@ -2107,20 +2201,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2107 return -EINVAL; 2201 return -EINVAL;
2108 if (irqchip_in_kernel(vcpu->kvm)) 2202 if (irqchip_in_kernel(vcpu->kvm))
2109 return -ENXIO; 2203 return -ENXIO;
2110 vcpu_load(vcpu);
2111 2204
2112 kvm_queue_interrupt(vcpu, irq->irq, false); 2205 kvm_queue_interrupt(vcpu, irq->irq, false);
2113 2206
2114 vcpu_put(vcpu);
2115
2116 return 0; 2207 return 0;
2117} 2208}
2118 2209
2119static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 2210static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
2120{ 2211{
2121 vcpu_load(vcpu);
2122 kvm_inject_nmi(vcpu); 2212 kvm_inject_nmi(vcpu);
2123 vcpu_put(vcpu);
2124 2213
2125 return 0; 2214 return 0;
2126} 2215}
@@ -2140,7 +2229,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2140 int r; 2229 int r;
2141 unsigned bank_num = mcg_cap & 0xff, bank; 2230 unsigned bank_num = mcg_cap & 0xff, bank;
2142 2231
2143 vcpu_load(vcpu);
2144 r = -EINVAL; 2232 r = -EINVAL;
2145 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 2233 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2146 goto out; 2234 goto out;
@@ -2155,7 +2243,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2155 for (bank = 0; bank < bank_num; bank++) 2243 for (bank = 0; bank < bank_num; bank++)
2156 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 2244 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2157out: 2245out:
2158 vcpu_put(vcpu);
2159 return r; 2246 return r;
2160} 2247}
2161 2248
@@ -2188,7 +2275,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2188 printk(KERN_DEBUG "kvm: set_mce: " 2275 printk(KERN_DEBUG "kvm: set_mce: "
2189 "injects mce exception while " 2276 "injects mce exception while "
2190 "previous one is in progress!\n"); 2277 "previous one is in progress!\n");
2191 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 2278 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2192 return 0; 2279 return 0;
2193 } 2280 }
2194 if (banks[1] & MCI_STATUS_VAL) 2281 if (banks[1] & MCI_STATUS_VAL)
@@ -2213,8 +2300,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2213static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 2300static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2214 struct kvm_vcpu_events *events) 2301 struct kvm_vcpu_events *events)
2215{ 2302{
2216 vcpu_load(vcpu);
2217
2218 events->exception.injected = 2303 events->exception.injected =
2219 vcpu->arch.exception.pending && 2304 vcpu->arch.exception.pending &&
2220 !kvm_exception_is_soft(vcpu->arch.exception.nr); 2305 !kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2239,8 +2324,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2239 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2324 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2240 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2325 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2241 | KVM_VCPUEVENT_VALID_SHADOW); 2326 | KVM_VCPUEVENT_VALID_SHADOW);
2242
2243 vcpu_put(vcpu);
2244} 2327}
2245 2328
2246static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 2329static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2251,8 +2334,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2251 | KVM_VCPUEVENT_VALID_SHADOW)) 2334 | KVM_VCPUEVENT_VALID_SHADOW))
2252 return -EINVAL; 2335 return -EINVAL;
2253 2336
2254 vcpu_load(vcpu);
2255
2256 vcpu->arch.exception.pending = events->exception.injected; 2337 vcpu->arch.exception.pending = events->exception.injected;
2257 vcpu->arch.exception.nr = events->exception.nr; 2338 vcpu->arch.exception.nr = events->exception.nr;
2258 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 2339 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2275,22 +2356,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2275 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2356 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2276 vcpu->arch.sipi_vector = events->sipi_vector; 2357 vcpu->arch.sipi_vector = events->sipi_vector;
2277 2358
2278 vcpu_put(vcpu);
2279
2280 return 0; 2359 return 0;
2281} 2360}
2282 2361
2283static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, 2362static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2284 struct kvm_debugregs *dbgregs) 2363 struct kvm_debugregs *dbgregs)
2285{ 2364{
2286 vcpu_load(vcpu);
2287
2288 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); 2365 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2289 dbgregs->dr6 = vcpu->arch.dr6; 2366 dbgregs->dr6 = vcpu->arch.dr6;
2290 dbgregs->dr7 = vcpu->arch.dr7; 2367 dbgregs->dr7 = vcpu->arch.dr7;
2291 dbgregs->flags = 0; 2368 dbgregs->flags = 0;
2292
2293 vcpu_put(vcpu);
2294} 2369}
2295 2370
2296static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, 2371static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -2299,40 +2374,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2299 if (dbgregs->flags) 2374 if (dbgregs->flags)
2300 return -EINVAL; 2375 return -EINVAL;
2301 2376
2302 vcpu_load(vcpu);
2303
2304 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 2377 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2305 vcpu->arch.dr6 = dbgregs->dr6; 2378 vcpu->arch.dr6 = dbgregs->dr6;
2306 vcpu->arch.dr7 = dbgregs->dr7; 2379 vcpu->arch.dr7 = dbgregs->dr7;
2307 2380
2308 vcpu_put(vcpu); 2381 return 0;
2382}
2383
2384static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
2385 struct kvm_xsave *guest_xsave)
2386{
2387 if (cpu_has_xsave)
2388 memcpy(guest_xsave->region,
2389 &vcpu->arch.guest_fpu.state->xsave,
2390 sizeof(struct xsave_struct));
2391 else {
2392 memcpy(guest_xsave->region,
2393 &vcpu->arch.guest_fpu.state->fxsave,
2394 sizeof(struct i387_fxsave_struct));
2395 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
2396 XSTATE_FPSSE;
2397 }
2398}
2399
2400static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
2401 struct kvm_xsave *guest_xsave)
2402{
2403 u64 xstate_bv =
2404 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
2309 2405
2406 if (cpu_has_xsave)
2407 memcpy(&vcpu->arch.guest_fpu.state->xsave,
2408 guest_xsave->region, sizeof(struct xsave_struct));
2409 else {
2410 if (xstate_bv & ~XSTATE_FPSSE)
2411 return -EINVAL;
2412 memcpy(&vcpu->arch.guest_fpu.state->fxsave,
2413 guest_xsave->region, sizeof(struct i387_fxsave_struct));
2414 }
2310 return 0; 2415 return 0;
2311} 2416}
2312 2417
2418static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
2419 struct kvm_xcrs *guest_xcrs)
2420{
2421 if (!cpu_has_xsave) {
2422 guest_xcrs->nr_xcrs = 0;
2423 return;
2424 }
2425
2426 guest_xcrs->nr_xcrs = 1;
2427 guest_xcrs->flags = 0;
2428 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
2429 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
2430}
2431
2432static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
2433 struct kvm_xcrs *guest_xcrs)
2434{
2435 int i, r = 0;
2436
2437 if (!cpu_has_xsave)
2438 return -EINVAL;
2439
2440 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
2441 return -EINVAL;
2442
2443 for (i = 0; i < guest_xcrs->nr_xcrs; i++)
2444 /* Only support XCR0 currently */
2445 if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
2446 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
2447 guest_xcrs->xcrs[0].value);
2448 break;
2449 }
2450 if (r)
2451 r = -EINVAL;
2452 return r;
2453}
2454
2313long kvm_arch_vcpu_ioctl(struct file *filp, 2455long kvm_arch_vcpu_ioctl(struct file *filp,
2314 unsigned int ioctl, unsigned long arg) 2456 unsigned int ioctl, unsigned long arg)
2315{ 2457{
2316 struct kvm_vcpu *vcpu = filp->private_data; 2458 struct kvm_vcpu *vcpu = filp->private_data;
2317 void __user *argp = (void __user *)arg; 2459 void __user *argp = (void __user *)arg;
2318 int r; 2460 int r;
2319 struct kvm_lapic_state *lapic = NULL; 2461 union {
2462 struct kvm_lapic_state *lapic;
2463 struct kvm_xsave *xsave;
2464 struct kvm_xcrs *xcrs;
2465 void *buffer;
2466 } u;
2320 2467
2468 u.buffer = NULL;
2321 switch (ioctl) { 2469 switch (ioctl) {
2322 case KVM_GET_LAPIC: { 2470 case KVM_GET_LAPIC: {
2323 r = -EINVAL; 2471 r = -EINVAL;
2324 if (!vcpu->arch.apic) 2472 if (!vcpu->arch.apic)
2325 goto out; 2473 goto out;
2326 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2474 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
2327 2475
2328 r = -ENOMEM; 2476 r = -ENOMEM;
2329 if (!lapic) 2477 if (!u.lapic)
2330 goto out; 2478 goto out;
2331 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 2479 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
2332 if (r) 2480 if (r)
2333 goto out; 2481 goto out;
2334 r = -EFAULT; 2482 r = -EFAULT;
2335 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 2483 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
2336 goto out; 2484 goto out;
2337 r = 0; 2485 r = 0;
2338 break; 2486 break;
@@ -2341,14 +2489,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2341 r = -EINVAL; 2489 r = -EINVAL;
2342 if (!vcpu->arch.apic) 2490 if (!vcpu->arch.apic)
2343 goto out; 2491 goto out;
2344 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2492 u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
2345 r = -ENOMEM; 2493 r = -ENOMEM;
2346 if (!lapic) 2494 if (!u.lapic)
2347 goto out; 2495 goto out;
2348 r = -EFAULT; 2496 r = -EFAULT;
2349 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 2497 if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
2350 goto out; 2498 goto out;
2351 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 2499 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
2352 if (r) 2500 if (r)
2353 goto out; 2501 goto out;
2354 r = 0; 2502 r = 0;
@@ -2464,9 +2612,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2464 r = -EFAULT; 2612 r = -EFAULT;
2465 if (copy_from_user(&mce, argp, sizeof mce)) 2613 if (copy_from_user(&mce, argp, sizeof mce))
2466 goto out; 2614 goto out;
2467 vcpu_load(vcpu);
2468 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2615 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
2469 vcpu_put(vcpu);
2470 break; 2616 break;
2471 } 2617 }
2472 case KVM_GET_VCPU_EVENTS: { 2618 case KVM_GET_VCPU_EVENTS: {
@@ -2513,11 +2659,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2513 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); 2659 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
2514 break; 2660 break;
2515 } 2661 }
2662 case KVM_GET_XSAVE: {
2663 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
2664 r = -ENOMEM;
2665 if (!u.xsave)
2666 break;
2667
2668 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
2669
2670 r = -EFAULT;
2671 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
2672 break;
2673 r = 0;
2674 break;
2675 }
2676 case KVM_SET_XSAVE: {
2677 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
2678 r = -ENOMEM;
2679 if (!u.xsave)
2680 break;
2681
2682 r = -EFAULT;
2683 if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
2684 break;
2685
2686 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
2687 break;
2688 }
2689 case KVM_GET_XCRS: {
2690 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
2691 r = -ENOMEM;
2692 if (!u.xcrs)
2693 break;
2694
2695 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
2696
2697 r = -EFAULT;
2698 if (copy_to_user(argp, u.xcrs,
2699 sizeof(struct kvm_xcrs)))
2700 break;
2701 r = 0;
2702 break;
2703 }
2704 case KVM_SET_XCRS: {
2705 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
2706 r = -ENOMEM;
2707 if (!u.xcrs)
2708 break;
2709
2710 r = -EFAULT;
2711 if (copy_from_user(u.xcrs, argp,
2712 sizeof(struct kvm_xcrs)))
2713 break;
2714
2715 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
2716 break;
2717 }
2516 default: 2718 default:
2517 r = -EINVAL; 2719 r = -EINVAL;
2518 } 2720 }
2519out: 2721out:
2520 kfree(lapic); 2722 kfree(u.buffer);
2521 return r; 2723 return r;
2522} 2724}
2523 2725
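The four cases added above give userspace access to the guest's raw XSAVE image and XCR registers, which is what lets a VMM save and restore AVX state. A hedged sketch of the read side (vcpu_fd is assumed to come from the usual KVM_CREATE_VM/KVM_CREATE_VCPU sequence, and the function name is the example's own):

    #include <linux/kvm.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    int dump_extended_state(int vcpu_fd)
    {
        struct kvm_xsave xsave;     /* 4 KiB raw XSAVE image (region[1024]) */
        struct kvm_xcrs xcrs;

        if (ioctl(vcpu_fd, KVM_GET_XSAVE, &xsave) < 0)
            return -1;
        if (ioctl(vcpu_fd, KVM_GET_XCRS, &xcrs) < 0)
            return -1;

        for (unsigned i = 0; i < xcrs.nr_xcrs; i++) /* currently only XCR0 */
            printf("xcr%u = 0x%llx\n", xcrs.xcrs[i].xcr,
                   (unsigned long long)xcrs.xcrs[i].value);

        /* The same structures can be written back with KVM_SET_XSAVE /
         * KVM_SET_XCRS, e.g. when restoring a migrated vCPU. */
        return 0;
    }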
@@ -2560,115 +2762,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2560 return kvm->arch.n_alloc_mmu_pages; 2762 return kvm->arch.n_alloc_mmu_pages;
2561} 2763}
2562 2764
2563gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2564{
2565 int i;
2566 struct kvm_mem_alias *alias;
2567 struct kvm_mem_aliases *aliases;
2568
2569 aliases = kvm_aliases(kvm);
2570
2571 for (i = 0; i < aliases->naliases; ++i) {
2572 alias = &aliases->aliases[i];
2573 if (alias->flags & KVM_ALIAS_INVALID)
2574 continue;
2575 if (gfn >= alias->base_gfn
2576 && gfn < alias->base_gfn + alias->npages)
2577 return alias->target_gfn + gfn - alias->base_gfn;
2578 }
2579 return gfn;
2580}
2581
2582gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2583{
2584 int i;
2585 struct kvm_mem_alias *alias;
2586 struct kvm_mem_aliases *aliases;
2587
2588 aliases = kvm_aliases(kvm);
2589
2590 for (i = 0; i < aliases->naliases; ++i) {
2591 alias = &aliases->aliases[i];
2592 if (gfn >= alias->base_gfn
2593 && gfn < alias->base_gfn + alias->npages)
2594 return alias->target_gfn + gfn - alias->base_gfn;
2595 }
2596 return gfn;
2597}
2598
2599/*
2600 * Set a new alias region. Aliases map a portion of physical memory into
2601 * another portion. This is useful for memory windows, for example the PC
2602 * VGA region.
2603 */
2604static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2605 struct kvm_memory_alias *alias)
2606{
2607 int r, n;
2608 struct kvm_mem_alias *p;
2609 struct kvm_mem_aliases *aliases, *old_aliases;
2610
2611 r = -EINVAL;
2612 /* General sanity checks */
2613 if (alias->memory_size & (PAGE_SIZE - 1))
2614 goto out;
2615 if (alias->guest_phys_addr & (PAGE_SIZE - 1))
2616 goto out;
2617 if (alias->slot >= KVM_ALIAS_SLOTS)
2618 goto out;
2619 if (alias->guest_phys_addr + alias->memory_size
2620 < alias->guest_phys_addr)
2621 goto out;
2622 if (alias->target_phys_addr + alias->memory_size
2623 < alias->target_phys_addr)
2624 goto out;
2625
2626 r = -ENOMEM;
2627 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2628 if (!aliases)
2629 goto out;
2630
2631 mutex_lock(&kvm->slots_lock);
2632
2633 /* invalidate any gfn reference in case of deletion/shrinking */
2634 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2635 aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
2636 old_aliases = kvm->arch.aliases;
2637 rcu_assign_pointer(kvm->arch.aliases, aliases);
2638 synchronize_srcu_expedited(&kvm->srcu);
2639 kvm_mmu_zap_all(kvm);
2640 kfree(old_aliases);
2641
2642 r = -ENOMEM;
2643 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2644 if (!aliases)
2645 goto out_unlock;
2646
2647 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2648
2649 p = &aliases->aliases[alias->slot];
2650 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2651 p->npages = alias->memory_size >> PAGE_SHIFT;
2652 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2653 p->flags &= ~(KVM_ALIAS_INVALID);
2654
2655 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2656 if (aliases->aliases[n - 1].npages)
2657 break;
2658 aliases->naliases = n;
2659
2660 old_aliases = kvm->arch.aliases;
2661 rcu_assign_pointer(kvm->arch.aliases, aliases);
2662 synchronize_srcu_expedited(&kvm->srcu);
2663 kfree(old_aliases);
2664 r = 0;
2665
2666out_unlock:
2667 mutex_unlock(&kvm->slots_lock);
2668out:
2669 return r;
2670}
2671
2672static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2765static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2673{ 2766{
2674 int r; 2767 int r;
@@ -2797,7 +2890,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2797 struct kvm_memory_slot *memslot; 2890 struct kvm_memory_slot *memslot;
2798 unsigned long n; 2891 unsigned long n;
2799 unsigned long is_dirty = 0; 2892 unsigned long is_dirty = 0;
2800 unsigned long *dirty_bitmap = NULL;
2801 2893
2802 mutex_lock(&kvm->slots_lock); 2894 mutex_lock(&kvm->slots_lock);
2803 2895
@@ -2812,27 +2904,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2812 2904
2813 n = kvm_dirty_bitmap_bytes(memslot); 2905 n = kvm_dirty_bitmap_bytes(memslot);
2814 2906
2815 r = -ENOMEM;
2816 dirty_bitmap = vmalloc(n);
2817 if (!dirty_bitmap)
2818 goto out;
2819 memset(dirty_bitmap, 0, n);
2820
2821 for (i = 0; !is_dirty && i < n/sizeof(long); i++) 2907 for (i = 0; !is_dirty && i < n/sizeof(long); i++)
2822 is_dirty = memslot->dirty_bitmap[i]; 2908 is_dirty = memslot->dirty_bitmap[i];
2823 2909
2824 /* If nothing is dirty, don't bother messing with page tables. */ 2910 /* If nothing is dirty, don't bother messing with page tables. */
2825 if (is_dirty) { 2911 if (is_dirty) {
2826 struct kvm_memslots *slots, *old_slots; 2912 struct kvm_memslots *slots, *old_slots;
2913 unsigned long *dirty_bitmap;
2827 2914
2828 spin_lock(&kvm->mmu_lock); 2915 spin_lock(&kvm->mmu_lock);
2829 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2916 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2830 spin_unlock(&kvm->mmu_lock); 2917 spin_unlock(&kvm->mmu_lock);
2831 2918
2832 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 2919 r = -ENOMEM;
2833 if (!slots) 2920 dirty_bitmap = vmalloc(n);
2834 goto out_free; 2921 if (!dirty_bitmap)
2922 goto out;
2923 memset(dirty_bitmap, 0, n);
2835 2924
2925 r = -ENOMEM;
2926 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
2927 if (!slots) {
2928 vfree(dirty_bitmap);
2929 goto out;
2930 }
2836 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 2931 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
2837 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 2932 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
2838 2933
@@ -2841,13 +2936,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2841 synchronize_srcu_expedited(&kvm->srcu); 2936 synchronize_srcu_expedited(&kvm->srcu);
2842 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; 2937 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
2843 kfree(old_slots); 2938 kfree(old_slots);
2939
2940 r = -EFAULT;
2941 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
2942 vfree(dirty_bitmap);
2943 goto out;
2944 }
2945 vfree(dirty_bitmap);
2946 } else {
2947 r = -EFAULT;
2948 if (clear_user(log->dirty_bitmap, n))
2949 goto out;
2844 } 2950 }
2845 2951
2846 r = 0; 2952 r = 0;
2847 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
2848 r = -EFAULT;
2849out_free:
2850 vfree(dirty_bitmap);
2851out: 2953out:
2852 mutex_unlock(&kvm->slots_lock); 2954 mutex_unlock(&kvm->slots_lock);
2853 return r; 2955 return r;
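After this rework, KVM_GET_DIRTY_LOG either swaps a fresh bitmap into the slot and copies the old one to userspace, or simply clears the user buffer when nothing was dirty; the caller's side is unchanged. A minimal sketch (vm_fd, the slot number, the slot size and 4 KiB pages are assumptions, and the slot must have been registered with KVM_MEM_LOG_DIRTY_PAGES):

    #include <linux/kvm.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>

    int fetch_dirty_bitmap(int vm_fd, int slot, size_t slot_bytes)
    {
        size_t pages = slot_bytes / 4096;
        size_t bitmap_bytes = (pages + 63) / 64 * 8;    /* one bit per page */
        unsigned long *bitmap = malloc(bitmap_bytes);
        struct kvm_dirty_log log;

        if (!bitmap)
            return -1;
        memset(&log, 0, sizeof(log));
        log.slot = slot;
        log.dirty_bitmap = bitmap;

        if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
            free(bitmap);
            return -1;
        }
        /* bitmap now holds the pages dirtied since the previous call */
        free(bitmap);
        return 0;
    }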
@@ -2867,7 +2969,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
2867 union { 2969 union {
2868 struct kvm_pit_state ps; 2970 struct kvm_pit_state ps;
2869 struct kvm_pit_state2 ps2; 2971 struct kvm_pit_state2 ps2;
2870 struct kvm_memory_alias alias;
2871 struct kvm_pit_config pit_config; 2972 struct kvm_pit_config pit_config;
2872 } u; 2973 } u;
2873 2974
@@ -2888,22 +2989,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
2888 goto out; 2989 goto out;
2889 break; 2990 break;
2890 } 2991 }
2891 case KVM_SET_MEMORY_REGION: {
2892 struct kvm_memory_region kvm_mem;
2893 struct kvm_userspace_memory_region kvm_userspace_mem;
2894
2895 r = -EFAULT;
2896 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2897 goto out;
2898 kvm_userspace_mem.slot = kvm_mem.slot;
2899 kvm_userspace_mem.flags = kvm_mem.flags;
2900 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
2901 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
2902 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
2903 if (r)
2904 goto out;
2905 break;
2906 }
2907 case KVM_SET_NR_MMU_PAGES: 2992 case KVM_SET_NR_MMU_PAGES:
2908 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 2993 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
2909 if (r) 2994 if (r)
@@ -2912,14 +2997,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
2912 case KVM_GET_NR_MMU_PAGES: 2997 case KVM_GET_NR_MMU_PAGES:
2913 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 2998 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
2914 break; 2999 break;
2915 case KVM_SET_MEMORY_ALIAS:
2916 r = -EFAULT;
2917 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
2918 goto out;
2919 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
2920 if (r)
2921 goto out;
2922 break;
2923 case KVM_CREATE_IRQCHIP: { 3000 case KVM_CREATE_IRQCHIP: {
2924 struct kvm_pic *vpic; 3001 struct kvm_pic *vpic;
2925 3002
@@ -3259,7 +3336,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3259 } 3336 }
3260 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3337 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3261 if (ret < 0) { 3338 if (ret < 0) {
3262 r = X86EMUL_UNHANDLEABLE; 3339 r = X86EMUL_IO_NEEDED;
3263 goto out; 3340 goto out;
3264 } 3341 }
3265 3342
@@ -3315,7 +3392,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
3315 } 3392 }
3316 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3393 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3317 if (ret < 0) { 3394 if (ret < 0) {
3318 r = X86EMUL_UNHANDLEABLE; 3395 r = X86EMUL_IO_NEEDED;
3319 goto out; 3396 goto out;
3320 } 3397 }
3321 3398
@@ -3330,10 +3407,10 @@ out:
3330static int emulator_read_emulated(unsigned long addr, 3407static int emulator_read_emulated(unsigned long addr,
3331 void *val, 3408 void *val,
3332 unsigned int bytes, 3409 unsigned int bytes,
3410 unsigned int *error_code,
3333 struct kvm_vcpu *vcpu) 3411 struct kvm_vcpu *vcpu)
3334{ 3412{
3335 gpa_t gpa; 3413 gpa_t gpa;
3336 u32 error_code;
3337 3414
3338 if (vcpu->mmio_read_completed) { 3415 if (vcpu->mmio_read_completed) {
3339 memcpy(val, vcpu->mmio_data, bytes); 3416 memcpy(val, vcpu->mmio_data, bytes);
@@ -3343,12 +3420,10 @@ static int emulator_read_emulated(unsigned long addr,
3343 return X86EMUL_CONTINUE; 3420 return X86EMUL_CONTINUE;
3344 } 3421 }
3345 3422
3346 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); 3423 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
3347 3424
3348 if (gpa == UNMAPPED_GVA) { 3425 if (gpa == UNMAPPED_GVA)
3349 kvm_inject_page_fault(vcpu, addr, error_code);
3350 return X86EMUL_PROPAGATE_FAULT; 3426 return X86EMUL_PROPAGATE_FAULT;
3351 }
3352 3427
3353 /* For APIC access vmexit */ 3428 /* For APIC access vmexit */
3354 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3429 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3370,11 +3445,12 @@ mmio:
3370 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 3445 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
3371 3446
3372 vcpu->mmio_needed = 1; 3447 vcpu->mmio_needed = 1;
3373 vcpu->mmio_phys_addr = gpa; 3448 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3374 vcpu->mmio_size = bytes; 3449 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3375 vcpu->mmio_is_write = 0; 3450 vcpu->run->mmio.len = vcpu->mmio_size = bytes;
3451 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
3376 3452
3377 return X86EMUL_UNHANDLEABLE; 3453 return X86EMUL_IO_NEEDED;
3378} 3454}
3379 3455
3380int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3456int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
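Because the emulator now fills in vcpu->run->mmio directly, an unsatisfied read reaches userspace as an ordinary KVM_EXIT_MMIO exit. A sketch of the matching handler in a VMM's run loop (run is the mmap()ed struct kvm_run of the vCPU; device_read and device_write are hypothetical stand-ins for the device model):

    #include <linux/kvm.h>

    /* hypothetical device-model hooks */
    void device_read(unsigned long long addr, unsigned char *data, unsigned len);
    void device_write(unsigned long long addr, const unsigned char *data, unsigned len);

    void handle_mmio_exit(struct kvm_run *run)
    {
        if (run->exit_reason != KVM_EXIT_MMIO)
            return;

        if (run->mmio.is_write)
            device_write(run->mmio.phys_addr, run->mmio.data, run->mmio.len);
        else
            device_read(run->mmio.phys_addr, run->mmio.data, run->mmio.len);
        /* On the next KVM_RUN, a read's mmio.data is fed back into the
         * emulated instruction (the vcpu->mmio_read_completed path above). */
    }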
@@ -3392,17 +3468,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3392static int emulator_write_emulated_onepage(unsigned long addr, 3468static int emulator_write_emulated_onepage(unsigned long addr,
3393 const void *val, 3469 const void *val,
3394 unsigned int bytes, 3470 unsigned int bytes,
3471 unsigned int *error_code,
3395 struct kvm_vcpu *vcpu) 3472 struct kvm_vcpu *vcpu)
3396{ 3473{
3397 gpa_t gpa; 3474 gpa_t gpa;
3398 u32 error_code;
3399 3475
3400 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); 3476 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
3401 3477
3402 if (gpa == UNMAPPED_GVA) { 3478 if (gpa == UNMAPPED_GVA)
3403 kvm_inject_page_fault(vcpu, addr, error_code);
3404 return X86EMUL_PROPAGATE_FAULT; 3479 return X86EMUL_PROPAGATE_FAULT;
3405 }
3406 3480
3407 /* For APIC access vmexit */ 3481 /* For APIC access vmexit */
3408 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3482 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3420,10 +3494,11 @@ mmio:
3420 return X86EMUL_CONTINUE; 3494 return X86EMUL_CONTINUE;
3421 3495
3422 vcpu->mmio_needed = 1; 3496 vcpu->mmio_needed = 1;
3423 vcpu->mmio_phys_addr = gpa; 3497 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3424 vcpu->mmio_size = bytes; 3498 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3425 vcpu->mmio_is_write = 1; 3499 vcpu->run->mmio.len = vcpu->mmio_size = bytes;
3426 memcpy(vcpu->mmio_data, val, bytes); 3500 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
3501 memcpy(vcpu->run->mmio.data, val, bytes);
3427 3502
3428 return X86EMUL_CONTINUE; 3503 return X86EMUL_CONTINUE;
3429} 3504}
@@ -3431,6 +3506,7 @@ mmio:
3431int emulator_write_emulated(unsigned long addr, 3506int emulator_write_emulated(unsigned long addr,
3432 const void *val, 3507 const void *val,
3433 unsigned int bytes, 3508 unsigned int bytes,
3509 unsigned int *error_code,
3434 struct kvm_vcpu *vcpu) 3510 struct kvm_vcpu *vcpu)
3435{ 3511{
3436 /* Crossing a page boundary? */ 3512 /* Crossing a page boundary? */
@@ -3438,16 +3514,17 @@ int emulator_write_emulated(unsigned long addr,
3438 int rc, now; 3514 int rc, now;
3439 3515
3440 now = -addr & ~PAGE_MASK; 3516 now = -addr & ~PAGE_MASK;
3441 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 3517 rc = emulator_write_emulated_onepage(addr, val, now, error_code,
3518 vcpu);
3442 if (rc != X86EMUL_CONTINUE) 3519 if (rc != X86EMUL_CONTINUE)
3443 return rc; 3520 return rc;
3444 addr += now; 3521 addr += now;
3445 val += now; 3522 val += now;
3446 bytes -= now; 3523 bytes -= now;
3447 } 3524 }
3448 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 3525 return emulator_write_emulated_onepage(addr, val, bytes, error_code,
3526 vcpu);
3449} 3527}
3450EXPORT_SYMBOL_GPL(emulator_write_emulated);
3451 3528
3452#define CMPXCHG_TYPE(t, ptr, old, new) \ 3529#define CMPXCHG_TYPE(t, ptr, old, new) \
3453 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) 3530 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
@@ -3463,6 +3540,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3463 const void *old, 3540 const void *old,
3464 const void *new, 3541 const void *new,
3465 unsigned int bytes, 3542 unsigned int bytes,
3543 unsigned int *error_code,
3466 struct kvm_vcpu *vcpu) 3544 struct kvm_vcpu *vcpu)
3467{ 3545{
3468 gpa_t gpa; 3546 gpa_t gpa;
@@ -3484,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3484 goto emul_write; 3562 goto emul_write;
3485 3563
3486 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3564 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3565 if (is_error_page(page)) {
3566 kvm_release_page_clean(page);
3567 goto emul_write;
3568 }
3487 3569
3488 kaddr = kmap_atomic(page, KM_USER0); 3570 kaddr = kmap_atomic(page, KM_USER0);
3489 kaddr += offset_in_page(gpa); 3571 kaddr += offset_in_page(gpa);
@@ -3516,7 +3598,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3516emul_write: 3598emul_write:
3517 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3599 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3518 3600
3519 return emulator_write_emulated(addr, new, bytes, vcpu); 3601 return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
3520} 3602}
3521 3603
3522static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3604static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3604,42 +3686,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
3604 return X86EMUL_CONTINUE; 3686 return X86EMUL_CONTINUE;
3605} 3687}
3606 3688
3607int emulate_clts(struct kvm_vcpu *vcpu) 3689int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
3608{ 3690{
3609 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 3691 if (!need_emulate_wbinvd(vcpu))
3610 kvm_x86_ops->fpu_activate(vcpu); 3692 return X86EMUL_CONTINUE;
3693
3694 if (kvm_x86_ops->has_wbinvd_exit()) {
3695 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
3696 wbinvd_ipi, NULL, 1);
3697 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
3698 }
3699 wbinvd();
3611 return X86EMUL_CONTINUE; 3700 return X86EMUL_CONTINUE;
3612} 3701}
3702EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
3613 3703
3614int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3704int emulate_clts(struct kvm_vcpu *vcpu)
3615{ 3705{
3616 return kvm_get_dr(ctxt->vcpu, dr, dest); 3706 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
3707 kvm_x86_ops->fpu_activate(vcpu);
3708 return X86EMUL_CONTINUE;
3617} 3709}
3618 3710
3619int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3711int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
3620{ 3712{
3621 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3713 return _kvm_get_dr(vcpu, dr, dest);
3622
3623 return kvm_set_dr(ctxt->vcpu, dr, value & mask);
3624} 3714}
3625 3715
3626void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3716int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
3627{ 3717{
3628 u8 opcodes[4];
3629 unsigned long rip = kvm_rip_read(vcpu);
3630 unsigned long rip_linear;
3631
3632 if (!printk_ratelimit())
3633 return;
3634 3718
3635 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 3719 return __kvm_set_dr(vcpu, dr, value);
3636
3637 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
3638
3639 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
3640 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
3641} 3720}
3642EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3643 3721
3644static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3722static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3645{ 3723{
@@ -3674,27 +3752,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3674 return value; 3752 return value;
3675} 3753}
3676 3754
3677static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) 3755static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3678{ 3756{
3757 int res = 0;
3758
3679 switch (cr) { 3759 switch (cr) {
3680 case 0: 3760 case 0:
3681 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); 3761 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3682 break; 3762 break;
3683 case 2: 3763 case 2:
3684 vcpu->arch.cr2 = val; 3764 vcpu->arch.cr2 = val;
3685 break; 3765 break;
3686 case 3: 3766 case 3:
3687 kvm_set_cr3(vcpu, val); 3767 res = kvm_set_cr3(vcpu, val);
3688 break; 3768 break;
3689 case 4: 3769 case 4:
3690 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 3770 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3691 break; 3771 break;
3692 case 8: 3772 case 8:
3693 kvm_set_cr8(vcpu, val & 0xfUL); 3773 res = __kvm_set_cr8(vcpu, val & 0xfUL);
3694 break; 3774 break;
3695 default: 3775 default:
3696 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3776 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3777 res = -1;
3697 } 3778 }
3779
3780 return res;
3698} 3781}
3699 3782
3700static int emulator_get_cpl(struct kvm_vcpu *vcpu) 3783static int emulator_get_cpl(struct kvm_vcpu *vcpu)
@@ -3707,6 +3790,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3707 kvm_x86_ops->get_gdt(vcpu, dt); 3790 kvm_x86_ops->get_gdt(vcpu, dt);
3708} 3791}
3709 3792
3793static unsigned long emulator_get_cached_segment_base(int seg,
3794 struct kvm_vcpu *vcpu)
3795{
3796 return get_segment_base(vcpu, seg);
3797}
3798
3710static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 3799static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3711 struct kvm_vcpu *vcpu) 3800 struct kvm_vcpu *vcpu)
3712{ 3801{
@@ -3779,11 +3868,6 @@ static void emulator_set_segment_selector(u16 sel, int seg,
3779 kvm_set_segment(vcpu, &kvm_seg, seg); 3868 kvm_set_segment(vcpu, &kvm_seg, seg);
3780} 3869}
3781 3870
3782static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3783{
3784 kvm_x86_ops->set_rflags(vcpu, rflags);
3785}
3786
3787static struct x86_emulate_ops emulate_ops = { 3871static struct x86_emulate_ops emulate_ops = {
3788 .read_std = kvm_read_guest_virt_system, 3872 .read_std = kvm_read_guest_virt_system,
3789 .write_std = kvm_write_guest_virt_system, 3873 .write_std = kvm_write_guest_virt_system,
@@ -3797,11 +3881,15 @@ static struct x86_emulate_ops emulate_ops = {
3797 .set_cached_descriptor = emulator_set_cached_descriptor, 3881 .set_cached_descriptor = emulator_set_cached_descriptor,
3798 .get_segment_selector = emulator_get_segment_selector, 3882 .get_segment_selector = emulator_get_segment_selector,
3799 .set_segment_selector = emulator_set_segment_selector, 3883 .set_segment_selector = emulator_set_segment_selector,
3884 .get_cached_segment_base = emulator_get_cached_segment_base,
3800 .get_gdt = emulator_get_gdt, 3885 .get_gdt = emulator_get_gdt,
3801 .get_cr = emulator_get_cr, 3886 .get_cr = emulator_get_cr,
3802 .set_cr = emulator_set_cr, 3887 .set_cr = emulator_set_cr,
3803 .cpl = emulator_get_cpl, 3888 .cpl = emulator_get_cpl,
3804 .set_rflags = emulator_set_rflags, 3889 .get_dr = emulator_get_dr,
3890 .set_dr = emulator_set_dr,
3891 .set_msr = kvm_set_msr,
3892 .get_msr = kvm_get_msr,
3805}; 3893};
3806 3894
3807static void cache_all_regs(struct kvm_vcpu *vcpu) 3895static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3812,14 +3900,75 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
3812 vcpu->arch.regs_dirty = ~0; 3900 vcpu->arch.regs_dirty = ~0;
3813} 3901}
3814 3902
3903static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
3904{
3905 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
3906 /*
3907 * An sti; sti sequence only disables interrupts for the first
3908 * instruction. So, if the last instruction, be it emulated or
3909 * not, left the system with the INT_STI flag enabled, it
3910 * means that the last instruction was an sti. We should not
3911 * leave the flag on in this case. The same goes for mov ss.
3912 */
3913 if (!(int_shadow & mask))
3914 kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
3915}
3916
3917static void inject_emulated_exception(struct kvm_vcpu *vcpu)
3918{
3919 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
3920 if (ctxt->exception == PF_VECTOR)
3921 kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
3922 else if (ctxt->error_code_valid)
3923 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
3924 else
3925 kvm_queue_exception(vcpu, ctxt->exception);
3926}
3927
3928static int handle_emulation_failure(struct kvm_vcpu *vcpu)
3929{
3930 ++vcpu->stat.insn_emulation_fail;
3931 trace_kvm_emulate_insn_failed(vcpu);
3932 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3933 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3934 vcpu->run->internal.ndata = 0;
3935 kvm_queue_exception(vcpu, UD_VECTOR);
3936 return EMULATE_FAIL;
3937}
3938
3939static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
3940{
3941 gpa_t gpa;
3942
3943 if (tdp_enabled)
3944 return false;
3945
3946 /*
3947 * If emulation was due to an access to a shadowed page table
3948 * and it failed, try to unshadow the page and re-enter the
3949 * guest to let the CPU execute the instruction.
3950 */
3951 if (kvm_mmu_unprotect_page_virt(vcpu, gva))
3952 return true;
3953
3954 gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
3955
3956 if (gpa == UNMAPPED_GVA)
3957 return true; /* let cpu generate fault */
3958
3959 if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
3960 return true;
3961
3962 return false;
3963}
3964
3815int emulate_instruction(struct kvm_vcpu *vcpu, 3965int emulate_instruction(struct kvm_vcpu *vcpu,
3816 unsigned long cr2, 3966 unsigned long cr2,
3817 u16 error_code, 3967 u16 error_code,
3818 int emulation_type) 3968 int emulation_type)
3819{ 3969{
3820 int r, shadow_mask; 3970 int r;
3821 struct decode_cache *c; 3971 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
3822 struct kvm_run *run = vcpu->run;
3823 3972
3824 kvm_clear_exception_queue(vcpu); 3973 kvm_clear_exception_queue(vcpu);
3825 vcpu->arch.mmio_fault_cr2 = cr2; 3974 vcpu->arch.mmio_fault_cr2 = cr2;
@@ -3831,8 +3980,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3831 */ 3980 */
3832 cache_all_regs(vcpu); 3981 cache_all_regs(vcpu);
3833 3982
3834 vcpu->mmio_is_write = 0;
3835
3836 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 3983 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3837 int cs_db, cs_l; 3984 int cs_db, cs_l;
3838 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3985 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
@@ -3846,13 +3993,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3846 ? X86EMUL_MODE_VM86 : cs_l 3993 ? X86EMUL_MODE_VM86 : cs_l
3847 ? X86EMUL_MODE_PROT64 : cs_db 3994 ? X86EMUL_MODE_PROT64 : cs_db
3848 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3995 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3996 memset(c, 0, sizeof(struct decode_cache));
3997 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
3998 vcpu->arch.emulate_ctxt.interruptibility = 0;
3999 vcpu->arch.emulate_ctxt.exception = -1;
3849 4000
3850 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4001 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3851 trace_kvm_emulate_insn_start(vcpu); 4002 trace_kvm_emulate_insn_start(vcpu);
3852 4003
3853 /* Only allow emulation of specific instructions on #UD 4004 /* Only allow emulation of specific instructions on #UD
3854 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 4005 * (namely VMMCALL, sysenter, sysexit, syscall)*/
3855 c = &vcpu->arch.emulate_ctxt.decode;
3856 if (emulation_type & EMULTYPE_TRAP_UD) { 4006 if (emulation_type & EMULTYPE_TRAP_UD) {
3857 if (!c->twobyte) 4007 if (!c->twobyte)
3858 return EMULATE_FAIL; 4008 return EMULATE_FAIL;
@@ -3880,11 +4030,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3880 4030
3881 ++vcpu->stat.insn_emulation; 4031 ++vcpu->stat.insn_emulation;
3882 if (r) { 4032 if (r) {
3883 ++vcpu->stat.insn_emulation_fail; 4033 if (reexecute_instruction(vcpu, cr2))
3884 trace_kvm_emulate_insn_failed(vcpu);
3885 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3886 return EMULATE_DONE; 4034 return EMULATE_DONE;
3887 return EMULATE_FAIL; 4035 if (emulation_type & EMULTYPE_SKIP)
4036 return EMULATE_FAIL;
4037 return handle_emulation_failure(vcpu);
3888 } 4038 }
3889 } 4039 }
3890 4040
@@ -3893,48 +4043,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3893 return EMULATE_DONE; 4043 return EMULATE_DONE;
3894 } 4044 }
3895 4045
4046 /* this is needed for the vmware backdoor interface to work since it
4047 changes register values during the IO operation */
4048 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4049
3896restart: 4050restart:
3897 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4051 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3898 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
3899 4052
3900 if (r == 0) 4053 if (r) { /* emulation failed */
3901 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 4054 if (reexecute_instruction(vcpu, cr2))
4055 return EMULATE_DONE;
3902 4056
3903 if (vcpu->arch.pio.count) { 4057 return handle_emulation_failure(vcpu);
3904 if (!vcpu->arch.pio.in)
3905 vcpu->arch.pio.count = 0;
3906 return EMULATE_DO_MMIO;
3907 } 4058 }
3908 4059
3909 if (r || vcpu->mmio_is_write) { 4060 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
3910 run->exit_reason = KVM_EXIT_MMIO; 4061 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
3911 run->mmio.phys_addr = vcpu->mmio_phys_addr; 4062 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
3912 memcpy(run->mmio.data, vcpu->mmio_data, 8); 4063 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
3913 run->mmio.len = vcpu->mmio_size; 4064
3914 run->mmio.is_write = vcpu->mmio_is_write; 4065 if (vcpu->arch.emulate_ctxt.exception >= 0) {
4066 inject_emulated_exception(vcpu);
4067 return EMULATE_DONE;
3915 } 4068 }
3916 4069
3917 if (r) { 4070 if (vcpu->arch.pio.count) {
3918 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 4071 if (!vcpu->arch.pio.in)
3919 goto done; 4072 vcpu->arch.pio.count = 0;
3920 if (!vcpu->mmio_needed) {
3921 ++vcpu->stat.insn_emulation_fail;
3922 trace_kvm_emulate_insn_failed(vcpu);
3923 kvm_report_emulation_failure(vcpu, "mmio");
3924 return EMULATE_FAIL;
3925 }
3926 return EMULATE_DO_MMIO; 4073 return EMULATE_DO_MMIO;
3927 } 4074 }
3928 4075
3929 if (vcpu->mmio_is_write) { 4076 if (vcpu->mmio_needed) {
3930 vcpu->mmio_needed = 0; 4077 if (vcpu->mmio_is_write)
4078 vcpu->mmio_needed = 0;
3931 return EMULATE_DO_MMIO; 4079 return EMULATE_DO_MMIO;
3932 } 4080 }
3933 4081
3934done:
3935 if (vcpu->arch.exception.pending)
3936 vcpu->arch.emulate_ctxt.restart = false;
3937
3938 if (vcpu->arch.emulate_ctxt.restart) 4082 if (vcpu->arch.emulate_ctxt.restart)
3939 goto restart; 4083 goto restart;
3940 4084
@@ -4108,6 +4252,9 @@ int kvm_arch_init(void *opaque)
4108 4252
4109 perf_register_guest_info_callbacks(&kvm_guest_cbs); 4253 perf_register_guest_info_callbacks(&kvm_guest_cbs);
4110 4254
4255 if (cpu_has_xsave)
4256 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
4257
4111 return 0; 4258 return 0;
4112 4259
4113out: 4260out:
@@ -4270,7 +4417,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
4270 4417
4271 kvm_x86_ops->patch_hypercall(vcpu, instruction); 4418 kvm_x86_ops->patch_hypercall(vcpu, instruction);
4272 4419
4273 return emulator_write_emulated(rip, instruction, 3, vcpu); 4420 return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
4274} 4421}
4275 4422
4276void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4423void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
@@ -4506,59 +4653,78 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
4506 } 4653 }
4507} 4654}
4508 4655
4656static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
4657{
4658 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
4659 !vcpu->guest_xcr0_loaded) {
4660 /* kvm_set_xcr() also depends on this */
4661 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
4662 vcpu->guest_xcr0_loaded = 1;
4663 }
4664}
4665
4666static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
4667{
4668 if (vcpu->guest_xcr0_loaded) {
4669 if (vcpu->arch.xcr0 != host_xcr0)
4670 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
4671 vcpu->guest_xcr0_loaded = 0;
4672 }
4673}
4674
4509static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 4675static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4510{ 4676{
4511 int r; 4677 int r;
4512 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 4678 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
4513 vcpu->run->request_interrupt_window; 4679 vcpu->run->request_interrupt_window;
4514 4680
4515 if (vcpu->requests)
4516 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
4517 kvm_mmu_unload(vcpu);
4518
4519 r = kvm_mmu_reload(vcpu);
4520 if (unlikely(r))
4521 goto out;
4522
4523 if (vcpu->requests) { 4681 if (vcpu->requests) {
4524 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 4682 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
4683 kvm_mmu_unload(vcpu);
4684 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
4525 __kvm_migrate_timers(vcpu); 4685 __kvm_migrate_timers(vcpu);
4526 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 4686 if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
4527 kvm_write_guest_time(vcpu); 4687 kvm_write_guest_time(vcpu);
4528 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 4688 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
4529 kvm_mmu_sync_roots(vcpu); 4689 kvm_mmu_sync_roots(vcpu);
4530 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 4690 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
4531 kvm_x86_ops->tlb_flush(vcpu); 4691 kvm_x86_ops->tlb_flush(vcpu);
4532 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 4692 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
4533 &vcpu->requests)) {
4534 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 4693 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
4535 r = 0; 4694 r = 0;
4536 goto out; 4695 goto out;
4537 } 4696 }
4538 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 4697 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
4539 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 4698 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
4540 r = 0; 4699 r = 0;
4541 goto out; 4700 goto out;
4542 } 4701 }
4543 if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { 4702 if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
4544 vcpu->fpu_active = 0; 4703 vcpu->fpu_active = 0;
4545 kvm_x86_ops->fpu_deactivate(vcpu); 4704 kvm_x86_ops->fpu_deactivate(vcpu);
4546 } 4705 }
4547 } 4706 }
4548 4707
4708 r = kvm_mmu_reload(vcpu);
4709 if (unlikely(r))
4710 goto out;
4711
4549 preempt_disable(); 4712 preempt_disable();
4550 4713
4551 kvm_x86_ops->prepare_guest_switch(vcpu); 4714 kvm_x86_ops->prepare_guest_switch(vcpu);
4552 if (vcpu->fpu_active) 4715 if (vcpu->fpu_active)
4553 kvm_load_guest_fpu(vcpu); 4716 kvm_load_guest_fpu(vcpu);
4717 kvm_load_guest_xcr0(vcpu);
4554 4718
4555 local_irq_disable(); 4719 atomic_set(&vcpu->guest_mode, 1);
4720 smp_wmb();
4556 4721
4557 clear_bit(KVM_REQ_KICK, &vcpu->requests); 4722 local_irq_disable();
4558 smp_mb__after_clear_bit();
4559 4723
4560 if (vcpu->requests || need_resched() || signal_pending(current)) { 4724 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
4561 set_bit(KVM_REQ_KICK, &vcpu->requests); 4725 || need_resched() || signal_pending(current)) {
4726 atomic_set(&vcpu->guest_mode, 0);
4727 smp_wmb();
4562 local_irq_enable(); 4728 local_irq_enable();
4563 preempt_enable(); 4729 preempt_enable();
4564 r = 1; 4730 r = 1;
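The KVM_REQ_KICK bit is replaced here by an explicit guest_mode flag: the vCPU publishes guest_mode = 1 before its final request check, and a requester that still sees guest_mode set sends the rescheduling IPI. A simplified userspace model of that handshake using C11 atomics (sequentially consistent operations stand in for the kernel's smp_wmb()/IPI ordering, and requests is reduced to a single flag):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int guest_mode;   /* models vcpu->guest_mode */
    static atomic_int requests;     /* models vcpu->requests, one bit */

    /* vCPU side, mirrors the tail of vcpu_enter_guest() above */
    static bool try_enter_guest(void)
    {
        atomic_store(&guest_mode, 1);       /* followed by smp_wmb() in the kernel */
        if (atomic_load(&requests)) {       /* re-check after publishing */
            atomic_store(&guest_mode, 0);
            return false;                   /* go service the request instead */
        }
        return true;                        /* safe to enter the guest */
    }

    /* requester side, mirrors kvm_make_request() plus the kick */
    static void kick_vcpu(void)
    {
        atomic_store(&requests, 1);
        if (atomic_load(&guest_mode))
            printf("send IPI\n");           /* forces the vCPU out of the guest */
    }

    int main(void)
    {
        kick_vcpu();                                        /* request first ... */
        printf("enter guest: %d\n", try_enter_guest());     /* ... so this is 0 */
        return 0;
    }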
@@ -4603,7 +4769,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4603 if (hw_breakpoint_active()) 4769 if (hw_breakpoint_active())
4604 hw_breakpoint_restore(); 4770 hw_breakpoint_restore();
4605 4771
4606 set_bit(KVM_REQ_KICK, &vcpu->requests); 4772 atomic_set(&vcpu->guest_mode, 0);
4773 smp_wmb();
4607 local_irq_enable(); 4774 local_irq_enable();
4608 4775
4609 ++vcpu->stat.exits; 4776 ++vcpu->stat.exits;
@@ -4665,7 +4832,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4665 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 4832 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4666 kvm_vcpu_block(vcpu); 4833 kvm_vcpu_block(vcpu);
4667 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 4834 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
4668 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 4835 if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
4669 { 4836 {
4670 switch(vcpu->arch.mp_state) { 4837 switch(vcpu->arch.mp_state) {
4671 case KVM_MP_STATE_HALTED: 4838 case KVM_MP_STATE_HALTED:
@@ -4717,8 +4884,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4717 int r; 4884 int r;
4718 sigset_t sigsaved; 4885 sigset_t sigsaved;
4719 4886
4720 vcpu_load(vcpu);
4721
4722 if (vcpu->sigset_active) 4887 if (vcpu->sigset_active)
4723 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 4888 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
4724 4889
@@ -4743,7 +4908,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4743 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4908 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4744 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); 4909 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
4745 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4910 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4746 if (r == EMULATE_DO_MMIO) { 4911 if (r != EMULATE_DONE) {
4747 r = 0; 4912 r = 0;
4748 goto out; 4913 goto out;
4749 } 4914 }
@@ -4759,14 +4924,11 @@ out:
4759 if (vcpu->sigset_active) 4924 if (vcpu->sigset_active)
4760 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 4925 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
4761 4926
4762 vcpu_put(vcpu);
4763 return r; 4927 return r;
4764} 4928}
4765 4929
4766int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4930int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4767{ 4931{
4768 vcpu_load(vcpu);
4769
4770 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4932 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4771 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4933 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4772 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4934 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -4789,15 +4951,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4789 regs->rip = kvm_rip_read(vcpu); 4951 regs->rip = kvm_rip_read(vcpu);
4790 regs->rflags = kvm_get_rflags(vcpu); 4952 regs->rflags = kvm_get_rflags(vcpu);
4791 4953
4792 vcpu_put(vcpu);
4793
4794 return 0; 4954 return 0;
4795} 4955}
4796 4956
4797int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4957int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4798{ 4958{
4799 vcpu_load(vcpu);
4800
4801 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 4959 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
4802 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 4960 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
4803 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 4961 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -4822,8 +4980,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4822 4980
4823 vcpu->arch.exception.pending = false; 4981 vcpu->arch.exception.pending = false;
4824 4982
4825 vcpu_put(vcpu);
4826
4827 return 0; 4983 return 0;
4828} 4984}
4829 4985
@@ -4842,8 +4998,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4842{ 4998{
4843 struct desc_ptr dt; 4999 struct desc_ptr dt;
4844 5000
4845 vcpu_load(vcpu);
4846
4847 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5001 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4848 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 5002 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4849 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 5003 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -4875,32 +5029,27 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4875 set_bit(vcpu->arch.interrupt.nr, 5029 set_bit(vcpu->arch.interrupt.nr,
4876 (unsigned long *)sregs->interrupt_bitmap); 5030 (unsigned long *)sregs->interrupt_bitmap);
4877 5031
4878 vcpu_put(vcpu);
4879
4880 return 0; 5032 return 0;
4881} 5033}
4882 5034
4883int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 5035int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
4884 struct kvm_mp_state *mp_state) 5036 struct kvm_mp_state *mp_state)
4885{ 5037{
4886 vcpu_load(vcpu);
4887 mp_state->mp_state = vcpu->arch.mp_state; 5038 mp_state->mp_state = vcpu->arch.mp_state;
4888 vcpu_put(vcpu);
4889 return 0; 5039 return 0;
4890} 5040}
4891 5041
4892int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 5042int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
4893 struct kvm_mp_state *mp_state) 5043 struct kvm_mp_state *mp_state)
4894{ 5044{
4895 vcpu_load(vcpu);
4896 vcpu->arch.mp_state = mp_state->mp_state; 5045 vcpu->arch.mp_state = mp_state->mp_state;
4897 vcpu_put(vcpu);
4898 return 0; 5046 return 0;
4899} 5047}
4900 5048
4901int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 5049int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4902 bool has_error_code, u32 error_code) 5050 bool has_error_code, u32 error_code)
4903{ 5051{
5052 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4904 int cs_db, cs_l, ret; 5053 int cs_db, cs_l, ret;
4905 cache_all_regs(vcpu); 5054 cache_all_regs(vcpu);
4906 5055
@@ -4915,6 +5064,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4915 ? X86EMUL_MODE_VM86 : cs_l 5064 ? X86EMUL_MODE_VM86 : cs_l
4916 ? X86EMUL_MODE_PROT64 : cs_db 5065 ? X86EMUL_MODE_PROT64 : cs_db
4917 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 5066 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
5067 memset(c, 0, sizeof(struct decode_cache));
5068 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4918 5069
4919 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, 5070 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
4920 tss_selector, reason, has_error_code, 5071 tss_selector, reason, has_error_code,
@@ -4923,6 +5074,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 	if (ret)
 		return EMULATE_FAIL;
 
+	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 	return EMULATE_DONE;
 }
@@ -4935,8 +5088,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	int pending_vec, max_bits;
 	struct desc_ptr dt;
 
-	vcpu_load(vcpu);
-
 	dt.size = sregs->idt.limit;
 	dt.address = sregs->idt.base;
 	kvm_x86_ops->set_idt(vcpu, &dt);
@@ -4996,8 +5147,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	    !is_protmode(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
@@ -5007,12 +5156,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	unsigned long rflags;
 	int i, r;
 
-	vcpu_load(vcpu);
-
 	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
 		r = -EBUSY;
 		if (vcpu->arch.exception.pending)
-			goto unlock_out;
+			goto out;
 		if (dbg->control & KVM_GUESTDBG_INJECT_DB)
 			kvm_queue_exception(vcpu, DB_VECTOR);
 		else
@@ -5054,34 +5201,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 
 	r = 0;
 
-unlock_out:
-	vcpu_put(vcpu);
+out:
 
 	return r;
 }
 
 /*
- * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
-	u16	cwd;
-	u16	swd;
-	u16	twd;
-	u16	fop;
-	u64	rip;
-	u64	rdp;
-	u32	mxcsr;
-	u32	mxcsr_mask;
-	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
-	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
-	u32	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-};
-
-/*
  * Translate a guest virtual address to a guest physical address.
  */
 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -5091,7 +5216,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	gpa_t gpa;
 	int idx;
 
-	vcpu_load(vcpu);
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -5099,16 +5223,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	tr->valid = gpa != UNMAPPED_GVA;
 	tr->writeable = 1;
 	tr->usermode = 0;
-	vcpu_put(vcpu);
 
 	return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
-	vcpu_load(vcpu);
+	struct i387_fxsave_struct *fxsave =
+			&vcpu->arch.guest_fpu.state->fxsave;
 
 	memcpy(fpu->fpr, fxsave->st_space, 128);
 	fpu->fcw = fxsave->cwd;
@@ -5119,16 +5241,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	fpu->last_dp = fxsave->rdp;
 	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
-	vcpu_load(vcpu);
+	struct i387_fxsave_struct *fxsave =
+			&vcpu->arch.guest_fpu.state->fxsave;
 
 	memcpy(fxsave->st_space, fpu->fpr, 128);
 	fxsave->cwd = fpu->fcw;
@@ -5139,61 +5258,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	fxsave->rdp = fpu->last_dp;
 	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
-void fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu)
 {
-	unsigned after_mxcsr_mask;
+	int err;
+
+	err = fpu_alloc(&vcpu->arch.guest_fpu);
+	if (err)
+		return err;
+
+	fpu_finit(&vcpu->arch.guest_fpu);
 
 	/*
-	 * Touch the fpu the first time in non atomic context as if
-	 * this is the first fpu instruction the exception handler
-	 * will fire before the instruction returns and it'll have to
-	 * allocate ram with GFP_KERNEL.
+	 * Ensure guest xcr0 is valid for loading
 	 */
-	if (!used_math())
-		kvm_fx_save(&vcpu->arch.host_fx_image);
-
-	/* Initialize guest FPU by resetting ours and saving into guest's */
-	preempt_disable();
-	kvm_fx_save(&vcpu->arch.host_fx_image);
-	kvm_fx_finit();
-	kvm_fx_save(&vcpu->arch.guest_fx_image);
-	kvm_fx_restore(&vcpu->arch.host_fx_image);
-	preempt_enable();
+	vcpu->arch.xcr0 = XSTATE_FP;
 
 	vcpu->arch.cr0 |= X86_CR0_ET;
-	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
-	vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
-	memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
-	       0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(fx_init);
 
+static void fx_free(struct kvm_vcpu *vcpu)
+{
+	fpu_free(&vcpu->arch.guest_fpu);
+}
+
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->guest_fpu_loaded)
 		return;
 
+	/*
+	 * Restore all possible states in the guest,
+	 * and assume host would use all available bits.
+	 * Guest xcr0 would be loaded later.
+	 */
+	kvm_put_guest_xcr0(vcpu);
 	vcpu->guest_fpu_loaded = 1;
-	kvm_fx_save(&vcpu->arch.host_fx_image);
-	kvm_fx_restore(&vcpu->arch.guest_fx_image);
+	unlazy_fpu(current);
+	fpu_restore_checking(&vcpu->arch.guest_fpu);
 	trace_kvm_fpu(1);
 }
 
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
+	kvm_put_guest_xcr0(vcpu);
+
 	if (!vcpu->guest_fpu_loaded)
 		return;
 
 	vcpu->guest_fpu_loaded = 0;
-	kvm_fx_save(&vcpu->arch.guest_fx_image);
-	kvm_fx_restore(&vcpu->arch.host_fx_image);
+	fpu_save_init(&vcpu->arch.guest_fpu);
 	++vcpu->stat.fpu_reload;
-	set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
 	trace_kvm_fpu(0);
 }
 
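With this change the guest FPU image copied by kvm_arch_vcpu_ioctl_get_fpu()/set_fpu() lives in the regular i387_fxsave_struct allocated by fpu_alloc() instead of the old private struct fxsave. Userspace reaches the same layout through the KVM_GET_FPU/KVM_SET_FPU vcpu ioctls; the snippet below is only an illustrative sketch of that path (not part of the patch), assuming vcpu_fd came from KVM_CREATE_VCPU and with error handling kept minimal.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Read the vcpu's FXSAVE image, print a few fields, and write it back
 * with the power-on default MXCSR value (0x1f80). */
static int touch_guest_fpu(int vcpu_fd)
{
	struct kvm_fpu fpu;

	if (ioctl(vcpu_fd, KVM_GET_FPU, &fpu) < 0) {
		perror("KVM_GET_FPU");
		return -1;
	}

	/* fpr[] and xmm[] mirror the fxsave st_space/xmm_space areas. */
	printf("fcw=%#x fsw=%#x mxcsr=%#x st0[0]=%#x\n",
	       fpu.fcw, fpu.fsw, fpu.mxcsr, fpu.fpr[0][0]);

	fpu.mxcsr = 0x1f80;

	if (ioctl(vcpu_fd, KVM_SET_FPU, &fpu) < 0) {
		perror("KVM_SET_FPU");
		return -1;
	}
	return 0;
}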
@@ -5204,6 +5325,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 		vcpu->arch.time_page = NULL;
 	}
 
+	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+	fx_free(vcpu);
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
@@ -5217,9 +5340,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	int r;
 
-	/* We do fxsave: this must be aligned. */
-	BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
-
 	vcpu->arch.mtrr_state.have_fixed = 1;
 	vcpu_load(vcpu);
 	r = kvm_arch_vcpu_reset(vcpu);
@@ -5241,6 +5361,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
 
+	fx_free(vcpu);
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
@@ -5334,7 +5455,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
+	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+		goto fail_free_mce_banks;
+
 	return 0;
+fail_free_mce_banks:
+	kfree(vcpu->arch.mce_banks);
 fail_free_lapic:
 	kvm_free_lapic(vcpu);
 fail_mmu_destroy:
@@ -5364,12 +5490,6 @@ struct kvm *kvm_arch_create_vm(void)
 	if (!kvm)
 		return ERR_PTR(-ENOMEM);
 
-	kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-	if (!kvm->arch.aliases) {
-		kfree(kvm);
-		return ERR_PTR(-ENOMEM);
-	}
-
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
@@ -5412,12 +5532,12 @@ static void kvm_free_vcpus(struct kvm *kvm)
 void kvm_arch_sync_events(struct kvm *kvm)
 {
 	kvm_free_all_assigned_devices(kvm);
+	kvm_free_pit(kvm);
 }
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	kvm_iommu_unmap_guest(kvm);
-	kvm_free_pit(kvm);
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
 	kvm_free_vcpus(kvm);
@@ -5427,7 +5547,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	if (kvm->arch.ept_identity_pagetable)
 		put_page(kvm->arch.ept_identity_pagetable);
 	cleanup_srcu_struct(&kvm->srcu);
-	kfree(kvm->arch.aliases);
 	kfree(kvm);
 }
 
@@ -5438,6 +5557,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				int user_alloc)
 {
 	int npages = memslot->npages;
+	int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+	/* Prevent internal slot pages from being moved by fork()/COW. */
+	if (memslot->id >= KVM_MEMORY_SLOTS)
+		map_flags = MAP_SHARED | MAP_ANONYMOUS;
 
 	/*To keep backward compatibility with older userspace,
 	 *x86 needs to hanlde !user_alloc case.
@@ -5450,7 +5574,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		userspace_addr = do_mmap(NULL, 0,
 					 npages * PAGE_SIZE,
 					 PROT_READ | PROT_WRITE,
-					 MAP_PRIVATE | MAP_ANONYMOUS,
+					 map_flags,
 					 0);
 		up_write(&current->mm->mmap_sem);
 
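The map_flags change above only affects KVM's internal slots (those with memslot->id >= KVM_MEMORY_SLOTS): mapping them MAP_SHARED keeps parent and child pointing at the same pages across fork() instead of having them split by copy-on-write. A standalone userspace sketch of that MAP_SHARED vs. MAP_PRIVATE behaviour, illustrative only and not taken from the patch:

#include <sys/mman.h>
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

/* Map one page, let a forked child write to it, and check whether the
 * parent observes the write: it does for MAP_SHARED, not for MAP_PRIVATE. */
static void demo(int flags, const char *name)
{
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       flags | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return;
	*p = 'A';
	if (fork() == 0) {
		*p = 'B';		/* child's write */
		_exit(0);
	}
	wait(NULL);
	printf("%s: parent sees '%c'\n", name, *p);
	munmap(p, 4096);
}

int main(void)
{
	demo(MAP_SHARED, "MAP_SHARED");
	demo(MAP_PRIVATE, "MAP_PRIVATE");
	return 0;
}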
@@ -5523,7 +5647,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 
 	me = get_cpu();
 	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
-		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+		if (atomic_xchg(&vcpu->guest_mode, 0))
 			smp_send_reschedule(cpu);
 	put_cpu();
 }
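The vcpu_load()/vcpu_put() pairs dropped from the ioctl handlers throughout this file are expected to be taken once in the generic vcpu ioctl entry point instead, so the per-vcpu ioctls keep their existing userspace semantics. For reference, a minimal sketch of driving two of the affected ioctls (KVM_TRANSLATE and KVM_GET_MP_STATE) against a vcpu fd obtained from KVM_CREATE_VCPU; this is illustrative only and not part of the diff.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Translate a guest-virtual address and query the vcpu's MP state. */
static void query_vcpu(int vcpu_fd, __u64 gva)
{
	struct kvm_translation tr = { .linear_address = gva };
	struct kvm_mp_state mp;

	if (ioctl(vcpu_fd, KVM_TRANSLATE, &tr) == 0)
		printf("gva %#llx -> gpa %#llx (valid=%u writeable=%u)\n",
		       (unsigned long long)gva,
		       (unsigned long long)tr.physical_address,
		       tr.valid, tr.writeable);

	if (ioctl(vcpu_fd, KVM_GET_MP_STATE, &mp) == 0)
		printf("mp_state=%u (KVM_MP_STATE_RUNNABLE=%u)\n",
		       mp.mp_state, KVM_MP_STATE_RUNNABLE);
}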
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f4b54458285b..b7a404722d2b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
 }
 
-static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
-{
-	return rcu_dereference_check(kvm->arch.aliases,
-			srcu_read_lock_held(&kvm->srcu)
-			|| lockdep_is_held(&kvm->slots_lock));
-}
-
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 