aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-10-16 18:36:00 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-10-16 18:36:00 -0400
commit08d19f51f05a68ce89a289320ce4ed96e757df72 (patch)
tree31c5d718d0aeaff5083fe533cd6e1f9fbbe846bb
parent1c95e1b69073cff5ff179e592fa1a1e182c78a17 (diff)
parent2381ad241d0bea1253a37f314b270848067640bb (diff)
Merge branch 'kvm-updates/2.6.28' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'kvm-updates/2.6.28' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (134 commits) KVM: ia64: Add intel iommu support for guests. KVM: ia64: add directed mmio range support for kvm guests KVM: ia64: Make pmt table be able to hold physical mmio entries. KVM: Move irqchip_in_kernel() from ioapic.h to irq.h KVM: Separate irq ack notification out of arch/x86/kvm/irq.c KVM: Change is_mmio_pfn to kvm_is_mmio_pfn, and make it common for all archs KVM: Move device assignment logic to common code KVM: Device Assignment: Move vtd.c from arch/x86/kvm/ to virt/kvm/ KVM: VMX: enable invlpg exiting if EPT is disabled KVM: x86: Silence various LAPIC-related host kernel messages KVM: Device Assignment: Map mmio pages into VT-d page table KVM: PIC: enhance IPI avoidance KVM: MMU: add "oos_shadow" parameter to disable oos KVM: MMU: speed up mmu_unsync_walk KVM: MMU: out of sync shadow core KVM: MMU: mmu_convert_notrap helper KVM: MMU: awareness of new kvm_mmu_zap_page behaviour KVM: MMU: mmu_parent_walk KVM: x86: trap invlpg KVM: MMU: sync roots on mmu reload ...
-rw-r--r--MAINTAINERS9
-rw-r--r--arch/ia64/include/asm/kvm_host.h6
-rw-r--r--arch/ia64/kvm/Kconfig2
-rw-r--r--arch/ia64/kvm/Makefile6
-rw-r--r--arch/ia64/kvm/irq.h31
-rw-r--r--arch/ia64/kvm/kvm-ia64.c66
-rw-r--r--arch/ia64/kvm/kvm_minstate.h23
-rw-r--r--arch/ia64/kvm/optvfault.S181
-rw-r--r--arch/ia64/kvm/process.c4
-rw-r--r--arch/ia64/kvm/vcpu.h26
-rw-r--r--arch/ia64/kvm/vmm_ivt.S39
-rw-r--r--arch/ia64/kvm/vtlb.c23
-rw-r--r--arch/powerpc/include/asm/kvm_host.h14
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h12
-rw-r--r--arch/powerpc/kernel/asm-offsets.c4
-rw-r--r--arch/powerpc/kvm/44x_tlb.c53
-rw-r--r--arch/powerpc/kvm/Kconfig11
-rw-r--r--arch/powerpc/kvm/Makefile6
-rw-r--r--arch/powerpc/kvm/booke_guest.c17
-rw-r--r--arch/powerpc/kvm/booke_interrupts.S79
-rw-r--r--arch/powerpc/kvm/emulate.c8
-rw-r--r--arch/powerpc/kvm/powerpc.c99
-rw-r--r--arch/s390/Kconfig7
-rw-r--r--arch/s390/kvm/priv.c4
-rw-r--r--arch/x86/kernel/kvmclock.c30
-rw-r--r--arch/x86/kernel/pvclock.c12
-rw-r--r--arch/x86/kvm/Makefile5
-rw-r--r--arch/x86/kvm/i8254.c81
-rw-r--r--arch/x86/kvm/i8254.h7
-rw-r--r--arch/x86/kvm/i8259.c53
-rw-r--r--arch/x86/kvm/irq.c3
-rw-r--r--arch/x86/kvm/irq.h6
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h32
-rw-r--r--arch/x86/kvm/lapic.c43
-rw-r--r--arch/x86/kvm/mmu.c680
-rw-r--r--arch/x86/kvm/paging_tmpl.h249
-rw-r--r--arch/x86/kvm/svm.c156
-rw-r--r--arch/x86/kvm/vmx.c712
-rw-r--r--arch/x86/kvm/vmx.h3
-rw-r--r--arch/x86/kvm/x86.c552
-rw-r--r--arch/x86/kvm/x86.h22
-rw-r--r--arch/x86/kvm/x86_emulate.c170
-rw-r--r--arch/x86/xen/time.c11
-rw-r--r--drivers/pci/dmar.c4
-rw-r--r--drivers/pci/intel-iommu.c116
-rw-r--r--drivers/pci/intr_remapping.c2
-rw-r--r--drivers/pci/intr_remapping.h2
-rw-r--r--drivers/pci/iova.c2
-rw-r--r--include/asm-x86/kvm.h22
-rw-r--r--include/asm-x86/kvm_host.h82
-rw-r--r--include/asm-x86/msr-index.h3
-rw-r--r--include/asm-x86/pvclock.h1
-rw-r--r--include/linux/dma_remapping.h (renamed from drivers/pci/dma_remapping.h)0
-rw-r--r--include/linux/intel-iommu.h (renamed from drivers/pci/intel-iommu.h)24
-rw-r--r--include/linux/iova.h (renamed from drivers/pci/iova.h)0
-rw-r--r--include/linux/kvm.h72
-rw-r--r--include/linux/kvm_host.h82
-rw-r--r--virt/kvm/ioapic.c22
-rw-r--r--virt/kvm/ioapic.h10
-rw-r--r--virt/kvm/irq_comm.c60
-rw-r--r--virt/kvm/kvm_main.c382
-rw-r--r--virt/kvm/kvm_trace.c30
-rw-r--r--virt/kvm/vtd.c191
63 files changed, 3432 insertions, 1232 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 57975bda9201..52702b057c02 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2448,7 +2448,14 @@ S: Supported
2448 2448
2449KERNEL VIRTUAL MACHINE (KVM) 2449KERNEL VIRTUAL MACHINE (KVM)
2450P: Avi Kivity 2450P: Avi Kivity
2451M: avi@qumranet.com 2451M: avi@redhat.com
2452L: kvm@vger.kernel.org
2453W: http://kvm.qumranet.com
2454S: Supported
2455
2456KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V
2457P: Joerg Roedel
2458M: joerg.roedel@amd.com
2452L: kvm@vger.kernel.org 2459L: kvm@vger.kernel.org
2453W: http://kvm.qumranet.com 2460W: http://kvm.qumranet.com
2454S: Supported 2461S: Supported
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 1efe513a9941..85db124d37f6 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -132,7 +132,7 @@
132#define GPFN_IOSAPIC (4UL << 60) /* IOSAPIC base */ 132#define GPFN_IOSAPIC (4UL << 60) /* IOSAPIC base */
133#define GPFN_LEGACY_IO (5UL << 60) /* Legacy I/O base */ 133#define GPFN_LEGACY_IO (5UL << 60) /* Legacy I/O base */
134#define GPFN_GFW (6UL << 60) /* Guest Firmware */ 134#define GPFN_GFW (6UL << 60) /* Guest Firmware */
135#define GPFN_HIGH_MMIO (7UL << 60) /* High MMIO range */ 135#define GPFN_PHYS_MMIO (7UL << 60) /* Directed MMIO Range */
136 136
137#define GPFN_IO_MASK (7UL << 60) /* Guest pfn is I/O type */ 137#define GPFN_IO_MASK (7UL << 60) /* Guest pfn is I/O type */
138#define GPFN_INV_MASK (1UL << 63) /* Guest pfn is invalid */ 138#define GPFN_INV_MASK (1UL << 63) /* Guest pfn is invalid */
@@ -413,6 +413,10 @@ struct kvm_arch {
413 struct kvm_ioapic *vioapic; 413 struct kvm_ioapic *vioapic;
414 struct kvm_vm_stat stat; 414 struct kvm_vm_stat stat;
415 struct kvm_sal_data rdv_sal_data; 415 struct kvm_sal_data rdv_sal_data;
416
417 struct list_head assigned_dev_head;
418 struct dmar_domain *intel_iommu_domain;
419 struct hlist_head irq_ack_notifier_list;
416}; 420};
417 421
418union cpuid3_t { 422union cpuid3_t {
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 7914e4828504..8e99fed6b3fd 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -46,4 +46,6 @@ config KVM_INTEL
46config KVM_TRACE 46config KVM_TRACE
47 bool 47 bool
48 48
49source drivers/virtio/Kconfig
50
49endif # VIRTUALIZATION 51endif # VIRTUALIZATION
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index bf22fb9e6dcf..cf37f8f490c0 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -44,7 +44,11 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
44EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/ 44EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
45 45
46common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 46common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
47 coalesced_mmio.o) 47 coalesced_mmio.o irq_comm.o)
48
49ifeq ($(CONFIG_DMAR),y)
50common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
51endif
48 52
49kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o 53kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
50obj-$(CONFIG_KVM) += kvm.o 54obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/ia64/kvm/irq.h b/arch/ia64/kvm/irq.h
new file mode 100644
index 000000000000..c6786e8b1bf4
--- /dev/null
+++ b/arch/ia64/kvm/irq.h
@@ -0,0 +1,31 @@
1/*
2 * irq.h: In-kernel interrupt controller related definitions
3 * Copyright (c) 2008, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 *
18 * Authors:
19 * Xiantao Zhang <xiantao.zhang@intel.com>
20 *
21 */
22
23#ifndef __IRQ_H
24#define __IRQ_H
25
26static inline int irqchip_in_kernel(struct kvm *kvm)
27{
28 return 1;
29}
30
31#endif
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index cd0d1a7284b7..c0699f0e35a9 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -31,6 +31,7 @@
31#include <linux/bitops.h> 31#include <linux/bitops.h>
32#include <linux/hrtimer.h> 32#include <linux/hrtimer.h>
33#include <linux/uaccess.h> 33#include <linux/uaccess.h>
34#include <linux/intel-iommu.h>
34 35
35#include <asm/pgtable.h> 36#include <asm/pgtable.h>
36#include <asm/gcc_intrin.h> 37#include <asm/gcc_intrin.h>
@@ -45,6 +46,7 @@
45#include "iodev.h" 46#include "iodev.h"
46#include "ioapic.h" 47#include "ioapic.h"
47#include "lapic.h" 48#include "lapic.h"
49#include "irq.h"
48 50
49static unsigned long kvm_vmm_base; 51static unsigned long kvm_vmm_base;
50static unsigned long kvm_vsa_base; 52static unsigned long kvm_vsa_base;
@@ -179,12 +181,16 @@ int kvm_dev_ioctl_check_extension(long ext)
179 switch (ext) { 181 switch (ext) {
180 case KVM_CAP_IRQCHIP: 182 case KVM_CAP_IRQCHIP:
181 case KVM_CAP_USER_MEMORY: 183 case KVM_CAP_USER_MEMORY:
184 case KVM_CAP_MP_STATE:
182 185
183 r = 1; 186 r = 1;
184 break; 187 break;
185 case KVM_CAP_COALESCED_MMIO: 188 case KVM_CAP_COALESCED_MMIO:
186 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 189 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
187 break; 190 break;
191 case KVM_CAP_IOMMU:
192 r = intel_iommu_found();
193 break;
188 default: 194 default:
189 r = 0; 195 r = 0;
190 } 196 }
@@ -771,6 +777,7 @@ static void kvm_init_vm(struct kvm *kvm)
771 */ 777 */
772 kvm_build_io_pmt(kvm); 778 kvm_build_io_pmt(kvm);
773 779
780 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
774} 781}
775 782
776struct kvm *kvm_arch_create_vm(void) 783struct kvm *kvm_arch_create_vm(void)
@@ -1334,6 +1341,10 @@ static void kvm_release_vm_pages(struct kvm *kvm)
1334 1341
1335void kvm_arch_destroy_vm(struct kvm *kvm) 1342void kvm_arch_destroy_vm(struct kvm *kvm)
1336{ 1343{
1344 kvm_iommu_unmap_guest(kvm);
1345#ifdef KVM_CAP_DEVICE_ASSIGNMENT
1346 kvm_free_all_assigned_devices(kvm);
1347#endif
1337 kfree(kvm->arch.vioapic); 1348 kfree(kvm->arch.vioapic);
1338 kvm_release_vm_pages(kvm); 1349 kvm_release_vm_pages(kvm);
1339 kvm_free_physmem(kvm); 1350 kvm_free_physmem(kvm);
@@ -1435,17 +1446,24 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
1435 int user_alloc) 1446 int user_alloc)
1436{ 1447{
1437 unsigned long i; 1448 unsigned long i;
1438 struct page *page; 1449 unsigned long pfn;
1439 int npages = mem->memory_size >> PAGE_SHIFT; 1450 int npages = mem->memory_size >> PAGE_SHIFT;
1440 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; 1451 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
1441 unsigned long base_gfn = memslot->base_gfn; 1452 unsigned long base_gfn = memslot->base_gfn;
1442 1453
1443 for (i = 0; i < npages; i++) { 1454 for (i = 0; i < npages; i++) {
1444 page = gfn_to_page(kvm, base_gfn + i); 1455 pfn = gfn_to_pfn(kvm, base_gfn + i);
1445 kvm_set_pmt_entry(kvm, base_gfn + i, 1456 if (!kvm_is_mmio_pfn(pfn)) {
1446 page_to_pfn(page) << PAGE_SHIFT, 1457 kvm_set_pmt_entry(kvm, base_gfn + i,
1447 _PAGE_AR_RWX|_PAGE_MA_WB); 1458 pfn << PAGE_SHIFT,
1448 memslot->rmap[i] = (unsigned long)page; 1459 _PAGE_AR_RWX | _PAGE_MA_WB);
1460 memslot->rmap[i] = (unsigned long)pfn_to_page(pfn);
1461 } else {
1462 kvm_set_pmt_entry(kvm, base_gfn + i,
1463 GPFN_PHYS_MMIO | (pfn << PAGE_SHIFT),
1464 _PAGE_MA_UC);
1465 memslot->rmap[i] = 0;
1466 }
1449 } 1467 }
1450 1468
1451 return 0; 1469 return 0;
@@ -1789,11 +1807,43 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
1789int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 1807int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
1790 struct kvm_mp_state *mp_state) 1808 struct kvm_mp_state *mp_state)
1791{ 1809{
1792 return -EINVAL; 1810 vcpu_load(vcpu);
1811 mp_state->mp_state = vcpu->arch.mp_state;
1812 vcpu_put(vcpu);
1813 return 0;
1814}
1815
1816static int vcpu_reset(struct kvm_vcpu *vcpu)
1817{
1818 int r;
1819 long psr;
1820 local_irq_save(psr);
1821 r = kvm_insert_vmm_mapping(vcpu);
1822 if (r)
1823 goto fail;
1824
1825 vcpu->arch.launched = 0;
1826 kvm_arch_vcpu_uninit(vcpu);
1827 r = kvm_arch_vcpu_init(vcpu);
1828 if (r)
1829 goto fail;
1830
1831 kvm_purge_vmm_mapping(vcpu);
1832 r = 0;
1833fail:
1834 local_irq_restore(psr);
1835 return r;
1793} 1836}
1794 1837
1795int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 1838int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
1796 struct kvm_mp_state *mp_state) 1839 struct kvm_mp_state *mp_state)
1797{ 1840{
1798 return -EINVAL; 1841 int r = 0;
1842
1843 vcpu_load(vcpu);
1844 vcpu->arch.mp_state = mp_state->mp_state;
1845 if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
1846 r = vcpu_reset(vcpu);
1847 vcpu_put(vcpu);
1848 return r;
1799} 1849}
diff --git a/arch/ia64/kvm/kvm_minstate.h b/arch/ia64/kvm/kvm_minstate.h
index 13980d9b8bcf..2cc41d17cf99 100644
--- a/arch/ia64/kvm/kvm_minstate.h
+++ b/arch/ia64/kvm/kvm_minstate.h
@@ -50,27 +50,18 @@
50 50
51#define PAL_VSA_SYNC_READ \ 51#define PAL_VSA_SYNC_READ \
52 /* begin to call pal vps sync_read */ \ 52 /* begin to call pal vps sync_read */ \
53{.mii; \
53 add r25 = VMM_VPD_BASE_OFFSET, r21; \ 54 add r25 = VMM_VPD_BASE_OFFSET, r21; \
54 adds r20 = VMM_VCPU_VSA_BASE_OFFSET, r21; /* entry point */ \ 55 nop 0x0; \
56 mov r24=ip; \
55 ;; \ 57 ;; \
58} \
59{.mmb \
60 add r24=0x20, r24; \
56 ld8 r25 = [r25]; /* read vpd base */ \ 61 ld8 r25 = [r25]; /* read vpd base */ \
57 ld8 r20 = [r20]; \ 62 br.cond.sptk kvm_vps_sync_read; /*call the service*/ \
58 ;; \
59 add r20 = PAL_VPS_SYNC_READ,r20; \
60 ;; \
61{ .mii; \
62 nop 0x0; \
63 mov r24 = ip; \
64 mov b0 = r20; \
65 ;; \ 63 ;; \
66}; \ 64}; \
67{ .mmb; \
68 add r24 = 0x20, r24; \
69 nop 0x0; \
70 br.cond.sptk b0; /* call the service */ \
71 ;; \
72};
73
74 65
75 66
76#define KVM_MINSTATE_GET_CURRENT(reg) mov reg=r21 67#define KVM_MINSTATE_GET_CURRENT(reg) mov reg=r21
diff --git a/arch/ia64/kvm/optvfault.S b/arch/ia64/kvm/optvfault.S
index e4f15d641b22..634abad979b5 100644
--- a/arch/ia64/kvm/optvfault.S
+++ b/arch/ia64/kvm/optvfault.S
@@ -1,9 +1,12 @@
1/* 1/*
2 * arch/ia64/vmx/optvfault.S 2 * arch/ia64/kvm/optvfault.S
3 * optimize virtualization fault handler 3 * optimize virtualization fault handler
4 * 4 *
5 * Copyright (C) 2006 Intel Co 5 * Copyright (C) 2006 Intel Co
6 * Xuefei Xu (Anthony Xu) <anthony.xu@intel.com> 6 * Xuefei Xu (Anthony Xu) <anthony.xu@intel.com>
7 * Copyright (C) 2008 Intel Co
8 * Add the support for Tukwila processors.
9 * Xiantao Zhang <xiantao.zhang@intel.com>
7 */ 10 */
8 11
9#include <asm/asmmacro.h> 12#include <asm/asmmacro.h>
@@ -20,6 +23,98 @@
20#define ACCE_MOV_TO_PSR 23#define ACCE_MOV_TO_PSR
21#define ACCE_THASH 24#define ACCE_THASH
22 25
26#define VMX_VPS_SYNC_READ \
27 add r16=VMM_VPD_BASE_OFFSET,r21; \
28 mov r17 = b0; \
29 mov r18 = r24; \
30 mov r19 = r25; \
31 mov r20 = r31; \
32 ;; \
33{.mii; \
34 ld8 r16 = [r16]; \
35 nop 0x0; \
36 mov r24 = ip; \
37 ;; \
38}; \
39{.mmb; \
40 add r24=0x20, r24; \
41 mov r25 =r16; \
42 br.sptk.many kvm_vps_sync_read; \
43}; \
44 mov b0 = r17; \
45 mov r24 = r18; \
46 mov r25 = r19; \
47 mov r31 = r20
48
49ENTRY(kvm_vps_entry)
50 adds r29 = VMM_VCPU_VSA_BASE_OFFSET,r21
51 ;;
52 ld8 r29 = [r29]
53 ;;
54 add r29 = r29, r30
55 ;;
56 mov b0 = r29
57 br.sptk.many b0
58END(kvm_vps_entry)
59
60/*
61 * Inputs:
62 * r24 : return address
63 * r25 : vpd
64 * r29 : scratch
65 *
66 */
67GLOBAL_ENTRY(kvm_vps_sync_read)
68 movl r30 = PAL_VPS_SYNC_READ
69 ;;
70 br.sptk.many kvm_vps_entry
71END(kvm_vps_sync_read)
72
73/*
74 * Inputs:
75 * r24 : return address
76 * r25 : vpd
77 * r29 : scratch
78 *
79 */
80GLOBAL_ENTRY(kvm_vps_sync_write)
81 movl r30 = PAL_VPS_SYNC_WRITE
82 ;;
83 br.sptk.many kvm_vps_entry
84END(kvm_vps_sync_write)
85
86/*
87 * Inputs:
88 * r23 : pr
89 * r24 : guest b0
90 * r25 : vpd
91 *
92 */
93GLOBAL_ENTRY(kvm_vps_resume_normal)
94 movl r30 = PAL_VPS_RESUME_NORMAL
95 ;;
96 mov pr=r23,-2
97 br.sptk.many kvm_vps_entry
98END(kvm_vps_resume_normal)
99
100/*
101 * Inputs:
102 * r23 : pr
103 * r24 : guest b0
104 * r25 : vpd
105 * r17 : isr
106 */
107GLOBAL_ENTRY(kvm_vps_resume_handler)
108 movl r30 = PAL_VPS_RESUME_HANDLER
109 ;;
110 ld8 r27=[r25]
111 shr r17=r17,IA64_ISR_IR_BIT
112 ;;
113 dep r27=r17,r27,63,1 // bit 63 of r27 indicate whether enable CFLE
114 mov pr=r23,-2
115 br.sptk.many kvm_vps_entry
116END(kvm_vps_resume_handler)
117
23//mov r1=ar3 118//mov r1=ar3
24GLOBAL_ENTRY(kvm_asm_mov_from_ar) 119GLOBAL_ENTRY(kvm_asm_mov_from_ar)
25#ifndef ACCE_MOV_FROM_AR 120#ifndef ACCE_MOV_FROM_AR
@@ -157,11 +252,11 @@ GLOBAL_ENTRY(kvm_asm_rsm)
157#ifndef ACCE_RSM 252#ifndef ACCE_RSM
158 br.many kvm_virtualization_fault_back 253 br.many kvm_virtualization_fault_back
159#endif 254#endif
160 add r16=VMM_VPD_BASE_OFFSET,r21 255 VMX_VPS_SYNC_READ
256 ;;
161 extr.u r26=r25,6,21 257 extr.u r26=r25,6,21
162 extr.u r27=r25,31,2 258 extr.u r27=r25,31,2
163 ;; 259 ;;
164 ld8 r16=[r16]
165 extr.u r28=r25,36,1 260 extr.u r28=r25,36,1
166 dep r26=r27,r26,21,2 261 dep r26=r27,r26,21,2
167 ;; 262 ;;
@@ -196,7 +291,7 @@ GLOBAL_ENTRY(kvm_asm_rsm)
196 tbit.nz p6,p0=r23,0 291 tbit.nz p6,p0=r23,0
197 ;; 292 ;;
198 tbit.z.or p6,p0=r26,IA64_PSR_DT_BIT 293 tbit.z.or p6,p0=r26,IA64_PSR_DT_BIT
199 (p6) br.dptk kvm_resume_to_guest 294 (p6) br.dptk kvm_resume_to_guest_with_sync
200 ;; 295 ;;
201 add r26=VMM_VCPU_META_RR0_OFFSET,r21 296 add r26=VMM_VCPU_META_RR0_OFFSET,r21
202 add r27=VMM_VCPU_META_RR0_OFFSET+8,r21 297 add r27=VMM_VCPU_META_RR0_OFFSET+8,r21
@@ -212,7 +307,7 @@ GLOBAL_ENTRY(kvm_asm_rsm)
212 mov rr[r28]=r27 307 mov rr[r28]=r27
213 ;; 308 ;;
214 srlz.d 309 srlz.d
215 br.many kvm_resume_to_guest 310 br.many kvm_resume_to_guest_with_sync
216END(kvm_asm_rsm) 311END(kvm_asm_rsm)
217 312
218 313
@@ -221,11 +316,11 @@ GLOBAL_ENTRY(kvm_asm_ssm)
221#ifndef ACCE_SSM 316#ifndef ACCE_SSM
222 br.many kvm_virtualization_fault_back 317 br.many kvm_virtualization_fault_back
223#endif 318#endif
224 add r16=VMM_VPD_BASE_OFFSET,r21 319 VMX_VPS_SYNC_READ
320 ;;
225 extr.u r26=r25,6,21 321 extr.u r26=r25,6,21
226 extr.u r27=r25,31,2 322 extr.u r27=r25,31,2
227 ;; 323 ;;
228 ld8 r16=[r16]
229 extr.u r28=r25,36,1 324 extr.u r28=r25,36,1
230 dep r26=r27,r26,21,2 325 dep r26=r27,r26,21,2
231 ;; //r26 is imm24 326 ;; //r26 is imm24
@@ -271,7 +366,7 @@ kvm_asm_ssm_1:
271 tbit.nz p6,p0=r29,IA64_PSR_I_BIT 366 tbit.nz p6,p0=r29,IA64_PSR_I_BIT
272 ;; 367 ;;
273 tbit.z.or p6,p0=r19,IA64_PSR_I_BIT 368 tbit.z.or p6,p0=r19,IA64_PSR_I_BIT
274 (p6) br.dptk kvm_resume_to_guest 369 (p6) br.dptk kvm_resume_to_guest_with_sync
275 ;; 370 ;;
276 add r29=VPD_VTPR_START_OFFSET,r16 371 add r29=VPD_VTPR_START_OFFSET,r16
277 add r30=VPD_VHPI_START_OFFSET,r16 372 add r30=VPD_VHPI_START_OFFSET,r16
@@ -286,7 +381,7 @@ kvm_asm_ssm_1:
286 ;; 381 ;;
287 cmp.gt p6,p0=r30,r17 382 cmp.gt p6,p0=r30,r17
288 (p6) br.dpnt.few kvm_asm_dispatch_vexirq 383 (p6) br.dpnt.few kvm_asm_dispatch_vexirq
289 br.many kvm_resume_to_guest 384 br.many kvm_resume_to_guest_with_sync
290END(kvm_asm_ssm) 385END(kvm_asm_ssm)
291 386
292 387
@@ -295,10 +390,9 @@ GLOBAL_ENTRY(kvm_asm_mov_to_psr)
295#ifndef ACCE_MOV_TO_PSR 390#ifndef ACCE_MOV_TO_PSR
296 br.many kvm_virtualization_fault_back 391 br.many kvm_virtualization_fault_back
297#endif 392#endif
298 add r16=VMM_VPD_BASE_OFFSET,r21 393 VMX_VPS_SYNC_READ
299 extr.u r26=r25,13,7 //r2
300 ;; 394 ;;
301 ld8 r16=[r16] 395 extr.u r26=r25,13,7 //r2
302 addl r20=@gprel(asm_mov_from_reg),gp 396 addl r20=@gprel(asm_mov_from_reg),gp
303 ;; 397 ;;
304 adds r30=kvm_asm_mov_to_psr_back-asm_mov_from_reg,r20 398 adds r30=kvm_asm_mov_to_psr_back-asm_mov_from_reg,r20
@@ -374,7 +468,7 @@ kvm_asm_mov_to_psr_1:
374 ;; 468 ;;
375 tbit.nz.or p6,p0=r17,IA64_PSR_I_BIT 469 tbit.nz.or p6,p0=r17,IA64_PSR_I_BIT
376 tbit.z.or p6,p0=r30,IA64_PSR_I_BIT 470 tbit.z.or p6,p0=r30,IA64_PSR_I_BIT
377 (p6) br.dpnt.few kvm_resume_to_guest 471 (p6) br.dpnt.few kvm_resume_to_guest_with_sync
378 ;; 472 ;;
379 add r29=VPD_VTPR_START_OFFSET,r16 473 add r29=VPD_VTPR_START_OFFSET,r16
380 add r30=VPD_VHPI_START_OFFSET,r16 474 add r30=VPD_VHPI_START_OFFSET,r16
@@ -389,13 +483,29 @@ kvm_asm_mov_to_psr_1:
389 ;; 483 ;;
390 cmp.gt p6,p0=r30,r17 484 cmp.gt p6,p0=r30,r17
391 (p6) br.dpnt.few kvm_asm_dispatch_vexirq 485 (p6) br.dpnt.few kvm_asm_dispatch_vexirq
392 br.many kvm_resume_to_guest 486 br.many kvm_resume_to_guest_with_sync
393END(kvm_asm_mov_to_psr) 487END(kvm_asm_mov_to_psr)
394 488
395 489
396ENTRY(kvm_asm_dispatch_vexirq) 490ENTRY(kvm_asm_dispatch_vexirq)
397//increment iip 491//increment iip
492 mov r17 = b0
493 mov r18 = r31
494{.mii
495 add r25=VMM_VPD_BASE_OFFSET,r21
496 nop 0x0
497 mov r24 = ip
498 ;;
499}
500{.mmb
501 add r24 = 0x20, r24
502 ld8 r25 = [r25]
503 br.sptk.many kvm_vps_sync_write
504}
505 mov b0 =r17
398 mov r16=cr.ipsr 506 mov r16=cr.ipsr
507 mov r31 = r18
508 mov r19 = 37
399 ;; 509 ;;
400 extr.u r17=r16,IA64_PSR_RI_BIT,2 510 extr.u r17=r16,IA64_PSR_RI_BIT,2
401 tbit.nz p6,p7=r16,IA64_PSR_RI_BIT+1 511 tbit.nz p6,p7=r16,IA64_PSR_RI_BIT+1
@@ -435,25 +545,31 @@ GLOBAL_ENTRY(kvm_asm_thash)
435 ;; 545 ;;
436kvm_asm_thash_back1: 546kvm_asm_thash_back1:
437 shr.u r23=r19,61 // get RR number 547 shr.u r23=r19,61 // get RR number
438 adds r25=VMM_VCPU_VRR0_OFFSET,r21 // get vcpu->arch.vrr[0]'s addr 548 adds r28=VMM_VCPU_VRR0_OFFSET,r21 // get vcpu->arch.vrr[0]'s addr
439 adds r16=VMM_VPD_VPTA_OFFSET,r16 // get vpta 549 adds r16=VMM_VPD_VPTA_OFFSET,r16 // get vpta
440 ;; 550 ;;
441 shladd r27=r23,3,r25 // get vcpu->arch.vrr[r23]'s addr 551 shladd r27=r23,3,r28 // get vcpu->arch.vrr[r23]'s addr
442 ld8 r17=[r16] // get PTA 552 ld8 r17=[r16] // get PTA
443 mov r26=1 553 mov r26=1
444 ;; 554 ;;
445 extr.u r29=r17,2,6 // get pta.size 555 extr.u r29=r17,2,6 // get pta.size
446 ld8 r25=[r27] // get vcpu->arch.vrr[r23]'s value 556 ld8 r28=[r27] // get vcpu->arch.vrr[r23]'s value
447 ;; 557 ;;
448 extr.u r25=r25,2,6 // get rr.ps 558 mov b0=r24
559 //Fallback to C if pta.vf is set
560 tbit.nz p6,p0=r17, 8
561 ;;
562 (p6) mov r24=EVENT_THASH
563 (p6) br.cond.dpnt.many kvm_virtualization_fault_back
564 extr.u r28=r28,2,6 // get rr.ps
449 shl r22=r26,r29 // 1UL << pta.size 565 shl r22=r26,r29 // 1UL << pta.size
450 ;; 566 ;;
451 shr.u r23=r19,r25 // vaddr >> rr.ps 567 shr.u r23=r19,r28 // vaddr >> rr.ps
452 adds r26=3,r29 // pta.size + 3 568 adds r26=3,r29 // pta.size + 3
453 shl r27=r17,3 // pta << 3 569 shl r27=r17,3 // pta << 3
454 ;; 570 ;;
455 shl r23=r23,3 // (vaddr >> rr.ps) << 3 571 shl r23=r23,3 // (vaddr >> rr.ps) << 3
456 shr.u r27=r27,r26 // (pta << 3) >> (pta.size+3) 572 shr.u r27=r27,r26 // (pta << 3) >> (pta.size+3)
457 movl r16=7<<61 573 movl r16=7<<61
458 ;; 574 ;;
459 adds r22=-1,r22 // (1UL << pta.size) - 1 575 adds r22=-1,r22 // (1UL << pta.size) - 1
@@ -724,6 +840,29 @@ END(asm_mov_from_reg)
724 * r31: pr 840 * r31: pr
725 * r24: b0 841 * r24: b0
726 */ 842 */
843ENTRY(kvm_resume_to_guest_with_sync)
844 adds r19=VMM_VPD_BASE_OFFSET,r21
845 mov r16 = r31
846 mov r17 = r24
847 ;;
848{.mii
849 ld8 r25 =[r19]
850 nop 0x0
851 mov r24 = ip
852 ;;
853}
854{.mmb
855 add r24 =0x20, r24
856 nop 0x0
857 br.sptk.many kvm_vps_sync_write
858}
859
860 mov r31 = r16
861 mov r24 =r17
862 ;;
863 br.sptk.many kvm_resume_to_guest
864END(kvm_resume_to_guest_with_sync)
865
727ENTRY(kvm_resume_to_guest) 866ENTRY(kvm_resume_to_guest)
728 adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21 867 adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
729 ;; 868 ;;
diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c
index 5a33f7ed29a0..3417783ae164 100644
--- a/arch/ia64/kvm/process.c
+++ b/arch/ia64/kvm/process.c
@@ -962,9 +962,9 @@ static void kvm_do_resume_op(struct kvm_vcpu *vcpu)
962void vmm_transition(struct kvm_vcpu *vcpu) 962void vmm_transition(struct kvm_vcpu *vcpu)
963{ 963{
964 ia64_call_vsa(PAL_VPS_SAVE, (unsigned long)vcpu->arch.vpd, 964 ia64_call_vsa(PAL_VPS_SAVE, (unsigned long)vcpu->arch.vpd,
965 0, 0, 0, 0, 0, 0); 965 1, 0, 0, 0, 0, 0);
966 vmm_trampoline(&vcpu->arch.guest, &vcpu->arch.host); 966 vmm_trampoline(&vcpu->arch.guest, &vcpu->arch.host);
967 ia64_call_vsa(PAL_VPS_RESTORE, (unsigned long)vcpu->arch.vpd, 967 ia64_call_vsa(PAL_VPS_RESTORE, (unsigned long)vcpu->arch.vpd,
968 0, 0, 0, 0, 0, 0); 968 1, 0, 0, 0, 0, 0);
969 kvm_do_resume_op(vcpu); 969 kvm_do_resume_op(vcpu);
970} 970}
diff --git a/arch/ia64/kvm/vcpu.h b/arch/ia64/kvm/vcpu.h
index b0fcfb62c49e..341e3fee280c 100644
--- a/arch/ia64/kvm/vcpu.h
+++ b/arch/ia64/kvm/vcpu.h
@@ -313,21 +313,21 @@ static inline void vcpu_set_tr(struct thash_data *trp, u64 pte, u64 itir,
313 trp->rid = rid; 313 trp->rid = rid;
314} 314}
315 315
316extern u64 kvm_lookup_mpa(u64 gpfn); 316extern u64 kvm_get_mpt_entry(u64 gpfn);
317extern u64 kvm_gpa_to_mpa(u64 gpa);
318
319/* Return I/O type if trye */
320#define __gpfn_is_io(gpfn) \
321 ({ \
322 u64 pte, ret = 0; \
323 pte = kvm_lookup_mpa(gpfn); \
324 if (!(pte & GPFN_INV_MASK)) \
325 ret = pte & GPFN_IO_MASK; \
326 ret; \
327 })
328 317
318/* Return I/ */
319static inline u64 __gpfn_is_io(u64 gpfn)
320{
321 u64 pte;
322 pte = kvm_get_mpt_entry(gpfn);
323 if (!(pte & GPFN_INV_MASK)) {
324 pte = pte & GPFN_IO_MASK;
325 if (pte != GPFN_PHYS_MMIO)
326 return pte;
327 }
328 return 0;
329}
329#endif 330#endif
330
331#define IA64_NO_FAULT 0 331#define IA64_NO_FAULT 0
332#define IA64_FAULT 1 332#define IA64_FAULT 1
333 333
diff --git a/arch/ia64/kvm/vmm_ivt.S b/arch/ia64/kvm/vmm_ivt.S
index 3ee5f481c06d..c1d7251a1480 100644
--- a/arch/ia64/kvm/vmm_ivt.S
+++ b/arch/ia64/kvm/vmm_ivt.S
@@ -1261,11 +1261,6 @@ kvm_rse_clear_invalid:
1261 adds r19=VMM_VPD_VPSR_OFFSET,r18 1261 adds r19=VMM_VPD_VPSR_OFFSET,r18
1262 ;; 1262 ;;
1263 ld8 r19=[r19] //vpsr 1263 ld8 r19=[r19] //vpsr
1264 adds r20=VMM_VCPU_VSA_BASE_OFFSET,r21
1265 ;;
1266 ld8 r20=[r20]
1267 ;;
1268//vsa_sync_write_start
1269 mov r25=r18 1264 mov r25=r18
1270 adds r16= VMM_VCPU_GP_OFFSET,r21 1265 adds r16= VMM_VCPU_GP_OFFSET,r21
1271 ;; 1266 ;;
@@ -1274,10 +1269,7 @@ kvm_rse_clear_invalid:
1274 ;; 1269 ;;
1275 add r24=r24,r16 1270 add r24=r24,r16
1276 ;; 1271 ;;
1277 add r16=PAL_VPS_SYNC_WRITE,r20 1272 br.sptk.many kvm_vps_sync_write // call the service
1278 ;;
1279 mov b0=r16
1280 br.cond.sptk b0 // call the service
1281 ;; 1273 ;;
1282END(ia64_leave_hypervisor) 1274END(ia64_leave_hypervisor)
1283// fall through 1275// fall through
@@ -1288,28 +1280,15 @@ GLOBAL_ENTRY(ia64_vmm_entry)
1288 * r17:cr.isr 1280 * r17:cr.isr
1289 * r18:vpd 1281 * r18:vpd
1290 * r19:vpsr 1282 * r19:vpsr
1291 * r20:__vsa_base
1292 * r22:b0 1283 * r22:b0
1293 * r23:predicate 1284 * r23:predicate
1294 */ 1285 */
1295 mov r24=r22 1286 mov r24=r22
1296 mov r25=r18 1287 mov r25=r18
1297 tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT // p1=vpsr.ic 1288 tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT // p1=vpsr.ic
1289 (p1) br.cond.sptk.few kvm_vps_resume_normal
1290 (p2) br.cond.sptk.many kvm_vps_resume_handler
1298 ;; 1291 ;;
1299 (p1) add r29=PAL_VPS_RESUME_NORMAL,r20
1300 (p1) br.sptk.many ia64_vmm_entry_out
1301 ;;
1302 tbit.nz p1,p2 = r17,IA64_ISR_IR_BIT //p1=cr.isr.ir
1303 ;;
1304 (p1) add r29=PAL_VPS_RESUME_NORMAL,r20
1305 (p2) add r29=PAL_VPS_RESUME_HANDLER,r20
1306 (p2) ld8 r26=[r25]
1307 ;;
1308ia64_vmm_entry_out:
1309 mov pr=r23,-2
1310 mov b0=r29
1311 ;;
1312 br.cond.sptk b0 // call pal service
1313END(ia64_vmm_entry) 1292END(ia64_vmm_entry)
1314 1293
1315 1294
@@ -1376,6 +1355,9 @@ GLOBAL_ENTRY(vmm_reset_entry)
1376 //set up ipsr, iip, vpd.vpsr, dcr 1355 //set up ipsr, iip, vpd.vpsr, dcr
1377 // For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1 1356 // For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1
1378 // For DCR: all bits 0 1357 // For DCR: all bits 0
1358 bsw.0
1359 ;;
1360 mov r21 =r13
1379 adds r14=-VMM_PT_REGS_SIZE, r12 1361 adds r14=-VMM_PT_REGS_SIZE, r12
1380 ;; 1362 ;;
1381 movl r6=0x501008826000 // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1 1363 movl r6=0x501008826000 // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1
@@ -1387,12 +1369,6 @@ GLOBAL_ENTRY(vmm_reset_entry)
1387 ;; 1369 ;;
1388 srlz.i 1370 srlz.i
1389 ;; 1371 ;;
1390 bsw.0
1391 ;;
1392 mov r21 =r13
1393 ;;
1394 bsw.1
1395 ;;
1396 mov ar.rsc = 0 1372 mov ar.rsc = 0
1397 ;; 1373 ;;
1398 flushrs 1374 flushrs
@@ -1406,12 +1382,9 @@ GLOBAL_ENTRY(vmm_reset_entry)
1406 ld8 r1 = [r20] 1382 ld8 r1 = [r20]
1407 ;; 1383 ;;
1408 mov cr.iip=r4 1384 mov cr.iip=r4
1409 ;;
1410 adds r16=VMM_VPD_BASE_OFFSET,r13 1385 adds r16=VMM_VPD_BASE_OFFSET,r13
1411 adds r20=VMM_VCPU_VSA_BASE_OFFSET,r13
1412 ;; 1386 ;;
1413 ld8 r18=[r16] 1387 ld8 r18=[r16]
1414 ld8 r20=[r20]
1415 ;; 1388 ;;
1416 adds r19=VMM_VPD_VPSR_OFFSET,r18 1389 adds r19=VMM_VPD_VPSR_OFFSET,r18
1417 ;; 1390 ;;
diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c
index def4576d22b1..e22b93361e08 100644
--- a/arch/ia64/kvm/vtlb.c
+++ b/arch/ia64/kvm/vtlb.c
@@ -390,7 +390,7 @@ void thash_purge_entries_remote(struct kvm_vcpu *v, u64 va, u64 ps)
390 390
391u64 translate_phy_pte(u64 *pte, u64 itir, u64 va) 391u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
392{ 392{
393 u64 ps, ps_mask, paddr, maddr; 393 u64 ps, ps_mask, paddr, maddr, io_mask;
394 union pte_flags phy_pte; 394 union pte_flags phy_pte;
395 395
396 ps = itir_ps(itir); 396 ps = itir_ps(itir);
@@ -398,8 +398,9 @@ u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
398 phy_pte.val = *pte; 398 phy_pte.val = *pte;
399 paddr = *pte; 399 paddr = *pte;
400 paddr = ((paddr & _PAGE_PPN_MASK) & ps_mask) | (va & ~ps_mask); 400 paddr = ((paddr & _PAGE_PPN_MASK) & ps_mask) | (va & ~ps_mask);
401 maddr = kvm_lookup_mpa(paddr >> PAGE_SHIFT); 401 maddr = kvm_get_mpt_entry(paddr >> PAGE_SHIFT);
402 if (maddr & GPFN_IO_MASK) { 402 io_mask = maddr & GPFN_IO_MASK;
403 if (io_mask && (io_mask != GPFN_PHYS_MMIO)) {
403 *pte |= VTLB_PTE_IO; 404 *pte |= VTLB_PTE_IO;
404 return -1; 405 return -1;
405 } 406 }
@@ -418,7 +419,7 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
418 u64 ifa, int type) 419 u64 ifa, int type)
419{ 420{
420 u64 ps; 421 u64 ps;
421 u64 phy_pte; 422 u64 phy_pte, io_mask, index;
422 union ia64_rr vrr, mrr; 423 union ia64_rr vrr, mrr;
423 int ret = 0; 424 int ret = 0;
424 425
@@ -426,13 +427,16 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
426 vrr.val = vcpu_get_rr(v, ifa); 427 vrr.val = vcpu_get_rr(v, ifa);
427 mrr.val = ia64_get_rr(ifa); 428 mrr.val = ia64_get_rr(ifa);
428 429
430 index = (pte & _PAGE_PPN_MASK) >> PAGE_SHIFT;
431 io_mask = kvm_get_mpt_entry(index) & GPFN_IO_MASK;
429 phy_pte = translate_phy_pte(&pte, itir, ifa); 432 phy_pte = translate_phy_pte(&pte, itir, ifa);
430 433
431 /* Ensure WB attribute if pte is related to a normal mem page, 434 /* Ensure WB attribute if pte is related to a normal mem page,
432 * which is required by vga acceleration since qemu maps shared 435 * which is required by vga acceleration since qemu maps shared
433 * vram buffer with WB. 436 * vram buffer with WB.
434 */ 437 */
435 if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT)) { 438 if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT) &&
439 io_mask != GPFN_PHYS_MMIO) {
436 pte &= ~_PAGE_MA_MASK; 440 pte &= ~_PAGE_MA_MASK;
437 phy_pte &= ~_PAGE_MA_MASK; 441 phy_pte &= ~_PAGE_MA_MASK;
438 } 442 }
@@ -566,12 +570,19 @@ void thash_init(struct thash_cb *hcb, u64 sz)
566 } 570 }
567} 571}
568 572
569u64 kvm_lookup_mpa(u64 gpfn) 573u64 kvm_get_mpt_entry(u64 gpfn)
570{ 574{
571 u64 *base = (u64 *) KVM_P2M_BASE; 575 u64 *base = (u64 *) KVM_P2M_BASE;
572 return *(base + gpfn); 576 return *(base + gpfn);
573} 577}
574 578
579u64 kvm_lookup_mpa(u64 gpfn)
580{
581 u64 maddr;
582 maddr = kvm_get_mpt_entry(gpfn);
583 return maddr&_PAGE_PPN_MASK;
584}
585
575u64 kvm_gpa_to_mpa(u64 gpa) 586u64 kvm_gpa_to_mpa(u64 gpa)
576{ 587{
577 u64 pte = kvm_lookup_mpa(gpa >> PAGE_SHIFT); 588 u64 pte = kvm_lookup_mpa(gpa >> PAGE_SHIFT);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 2655e2a4831e..34b52b7180cd 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -81,11 +81,17 @@ struct kvm_vcpu_arch {
81 struct tlbe shadow_tlb[PPC44x_TLB_SIZE]; 81 struct tlbe shadow_tlb[PPC44x_TLB_SIZE];
82 /* Pages which are referenced in the shadow TLB. */ 82 /* Pages which are referenced in the shadow TLB. */
83 struct page *shadow_pages[PPC44x_TLB_SIZE]; 83 struct page *shadow_pages[PPC44x_TLB_SIZE];
84 /* Copy of the host's TLB. */ 84
85 struct tlbe host_tlb[PPC44x_TLB_SIZE]; 85 /* Track which TLB entries we've modified in the current exit. */
86 u8 shadow_tlb_mod[PPC44x_TLB_SIZE];
86 87
87 u32 host_stack; 88 u32 host_stack;
88 u32 host_pid; 89 u32 host_pid;
90 u32 host_dbcr0;
91 u32 host_dbcr1;
92 u32 host_dbcr2;
93 u32 host_iac[4];
94 u32 host_msr;
89 95
90 u64 fpr[32]; 96 u64 fpr[32];
91 u32 gpr[32]; 97 u32 gpr[32];
@@ -123,7 +129,11 @@ struct kvm_vcpu_arch {
123 u32 ivor[16]; 129 u32 ivor[16];
124 u32 ivpr; 130 u32 ivpr;
125 u32 pir; 131 u32 pir;
132
133 u32 shadow_pid;
126 u32 pid; 134 u32 pid;
135 u32 swap_pid;
136
127 u32 pvr; 137 u32 pvr;
128 u32 ccr0; 138 u32 ccr0;
129 u32 ccr1; 139 u32 ccr1;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index a8b068792260..8931ba729d2b 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -64,6 +64,10 @@ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn,
64extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr, 64extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
65 gva_t eend, u32 asid); 65 gva_t eend, u32 asid);
66extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode); 66extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
67extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
68
69/* XXX Book E specific */
70extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i);
67 71
68extern void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu); 72extern void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu);
69 73
@@ -92,4 +96,12 @@ static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
92 kvm_vcpu_block(vcpu); 96 kvm_vcpu_block(vcpu);
93} 97}
94 98
99static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
100{
101 if (vcpu->arch.pid != new_pid) {
102 vcpu->arch.pid = new_pid;
103 vcpu->arch.swap_pid = 1;
104 }
105}
106
95#endif /* __POWERPC_KVM_PPC_H__ */ 107#endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 09febc582584..75c5dd0138fd 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -359,8 +359,8 @@ int main(void)
359 359
360 DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack)); 360 DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
361 DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); 361 DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
362 DEFINE(VCPU_HOST_TLB, offsetof(struct kvm_vcpu, arch.host_tlb));
363 DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb)); 362 DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb));
363 DEFINE(VCPU_SHADOW_MOD, offsetof(struct kvm_vcpu, arch.shadow_tlb_mod));
364 DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); 364 DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
365 DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); 365 DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
366 DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); 366 DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
@@ -372,7 +372,7 @@ int main(void)
372 DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5)); 372 DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
373 DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6)); 373 DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
374 DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7)); 374 DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
375 DEFINE(VCPU_PID, offsetof(struct kvm_vcpu, arch.pid)); 375 DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
376 376
377 DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); 377 DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
378 DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); 378 DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 5a5602da5091..2e227a412bc2 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/types.h> 20#include <linux/types.h>
21#include <linux/string.h> 21#include <linux/string.h>
22#include <linux/kvm.h>
22#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
23#include <linux/highmem.h> 24#include <linux/highmem.h>
24#include <asm/mmu-44x.h> 25#include <asm/mmu-44x.h>
@@ -109,7 +110,6 @@ static int kvmppc_44x_tlbe_is_writable(struct tlbe *tlbe)
109 return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW); 110 return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW);
110} 111}
111 112
112/* Must be called with mmap_sem locked for writing. */
113static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu, 113static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
114 unsigned int index) 114 unsigned int index)
115{ 115{
@@ -124,6 +124,11 @@ static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
124 } 124 }
125} 125}
126 126
127void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
128{
129 vcpu->arch.shadow_tlb_mod[i] = 1;
130}
131
127/* Caller must ensure that the specified guest TLB entry is safe to insert into 132/* Caller must ensure that the specified guest TLB entry is safe to insert into
128 * the shadow TLB. */ 133 * the shadow TLB. */
129void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid, 134void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
@@ -142,19 +147,16 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
142 stlbe = &vcpu->arch.shadow_tlb[victim]; 147 stlbe = &vcpu->arch.shadow_tlb[victim];
143 148
144 /* Get reference to new page. */ 149 /* Get reference to new page. */
145 down_read(&current->mm->mmap_sem);
146 new_page = gfn_to_page(vcpu->kvm, gfn); 150 new_page = gfn_to_page(vcpu->kvm, gfn);
147 if (is_error_page(new_page)) { 151 if (is_error_page(new_page)) {
148 printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn); 152 printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
149 kvm_release_page_clean(new_page); 153 kvm_release_page_clean(new_page);
150 up_read(&current->mm->mmap_sem);
151 return; 154 return;
152 } 155 }
153 hpaddr = page_to_phys(new_page); 156 hpaddr = page_to_phys(new_page);
154 157
155 /* Drop reference to old page. */ 158 /* Drop reference to old page. */
156 kvmppc_44x_shadow_release(vcpu, victim); 159 kvmppc_44x_shadow_release(vcpu, victim);
157 up_read(&current->mm->mmap_sem);
158 160
159 vcpu->arch.shadow_pages[victim] = new_page; 161 vcpu->arch.shadow_pages[victim] = new_page;
160 162
@@ -164,27 +166,30 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
164 166
165 /* XXX what about AS? */ 167 /* XXX what about AS? */
166 168
167 stlbe->tid = asid & 0xff; 169 stlbe->tid = !(asid & 0xff);
168 170
169 /* Force TS=1 for all guest mappings. */ 171 /* Force TS=1 for all guest mappings. */
170 /* For now we hardcode 4KB mappings, but it will be important to 172 /* For now we hardcode 4KB mappings, but it will be important to
171 * use host large pages in the future. */ 173 * use host large pages in the future. */
172 stlbe->word0 = (gvaddr & PAGE_MASK) | PPC44x_TLB_VALID | PPC44x_TLB_TS 174 stlbe->word0 = (gvaddr & PAGE_MASK) | PPC44x_TLB_VALID | PPC44x_TLB_TS
173 | PPC44x_TLB_4K; 175 | PPC44x_TLB_4K;
174
175 stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf); 176 stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
176 stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags, 177 stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags,
177 vcpu->arch.msr & MSR_PR); 178 vcpu->arch.msr & MSR_PR);
179 kvmppc_tlbe_set_modified(vcpu, victim);
180
181 KVMTRACE_5D(STLB_WRITE, vcpu, victim,
182 stlbe->tid, stlbe->word0, stlbe->word1, stlbe->word2,
183 handler);
178} 184}
179 185
180void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr, 186void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
181 gva_t eend, u32 asid) 187 gva_t eend, u32 asid)
182{ 188{
183 unsigned int pid = asid & 0xff; 189 unsigned int pid = !(asid & 0xff);
184 int i; 190 int i;
185 191
186 /* XXX Replace loop with fancy data structures. */ 192 /* XXX Replace loop with fancy data structures. */
187 down_write(&current->mm->mmap_sem);
188 for (i = 0; i <= tlb_44x_hwater; i++) { 193 for (i = 0; i <= tlb_44x_hwater; i++) {
189 struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i]; 194 struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
190 unsigned int tid; 195 unsigned int tid;
@@ -204,21 +209,35 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
204 209
205 kvmppc_44x_shadow_release(vcpu, i); 210 kvmppc_44x_shadow_release(vcpu, i);
206 stlbe->word0 = 0; 211 stlbe->word0 = 0;
212 kvmppc_tlbe_set_modified(vcpu, i);
213 KVMTRACE_5D(STLB_INVAL, vcpu, i,
214 stlbe->tid, stlbe->word0, stlbe->word1,
215 stlbe->word2, handler);
207 } 216 }
208 up_write(&current->mm->mmap_sem);
209} 217}
210 218
211/* Invalidate all mappings, so that when they fault back in they will get the 219/* Invalidate all mappings on the privilege switch after PID has been changed.
212 * proper permission bits. */ 220 * The guest always runs with PID=1, so we must clear the entire TLB when
221 * switching address spaces. */
213void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode) 222void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
214{ 223{
215 int i; 224 int i;
216 225
217 /* XXX Replace loop with fancy data structures. */ 226 if (vcpu->arch.swap_pid) {
218 down_write(&current->mm->mmap_sem); 227 /* XXX Replace loop with fancy data structures. */
219 for (i = 0; i <= tlb_44x_hwater; i++) { 228 for (i = 0; i <= tlb_44x_hwater; i++) {
220 kvmppc_44x_shadow_release(vcpu, i); 229 struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
221 vcpu->arch.shadow_tlb[i].word0 = 0; 230
231 /* Future optimization: clear only userspace mappings. */
232 kvmppc_44x_shadow_release(vcpu, i);
233 stlbe->word0 = 0;
234 kvmppc_tlbe_set_modified(vcpu, i);
235 KVMTRACE_5D(STLB_INVAL, vcpu, i,
236 stlbe->tid, stlbe->word0, stlbe->word1,
237 stlbe->word2, handler);
238 }
239 vcpu->arch.swap_pid = 0;
222 } 240 }
223 up_write(&current->mm->mmap_sem); 241
242 vcpu->arch.shadow_pid = !usermode;
224} 243}
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 6b076010213b..53aaa66b25e5 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -37,6 +37,17 @@ config KVM_BOOKE_HOST
37 Provides host support for KVM on Book E PowerPC processors. Currently 37 Provides host support for KVM on Book E PowerPC processors. Currently
38 this works on 440 processors only. 38 this works on 440 processors only.
39 39
40config KVM_TRACE
41 bool "KVM trace support"
42 depends on KVM && MARKERS && SYSFS
43 select RELAY
44 select DEBUG_FS
45 default n
46 ---help---
47 This option allows reading a trace of kvm-related events through
48 relayfs. Note the ABI is not considered stable and will be
49 modified in future updates.
50
40source drivers/virtio/Kconfig 51source drivers/virtio/Kconfig
41 52
42endif # VIRTUALIZATION 53endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 04e3449e1f42..2a5d4397ac4b 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -4,9 +4,11 @@
4 4
5EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm 5EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm
6 6
7common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) 7common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
8 8
9kvm-objs := $(common-objs) powerpc.o emulate.o booke_guest.o 9common-objs-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o)
10
11kvm-objs := $(common-objs-y) powerpc.o emulate.o booke_guest.o
10obj-$(CONFIG_KVM) += kvm.o 12obj-$(CONFIG_KVM) += kvm.o
11 13
12AFLAGS_booke_interrupts.o := -I$(obj) 14AFLAGS_booke_interrupts.o := -I$(obj)
diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c
index 9c8ad850c6e3..7b2591e26bae 100644
--- a/arch/powerpc/kvm/booke_guest.c
+++ b/arch/powerpc/kvm/booke_guest.c
@@ -410,6 +410,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
410 break; 410 break;
411 } 411 }
412 412
413 case BOOKE_INTERRUPT_DEBUG: {
414 u32 dbsr;
415
416 vcpu->arch.pc = mfspr(SPRN_CSRR0);
417
418 /* clear IAC events in DBSR register */
419 dbsr = mfspr(SPRN_DBSR);
420 dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4;
421 mtspr(SPRN_DBSR, dbsr);
422
423 run->exit_reason = KVM_EXIT_DEBUG;
424 r = RESUME_HOST;
425 break;
426 }
427
413 default: 428 default:
414 printk(KERN_EMERG "exit_nr %d\n", exit_nr); 429 printk(KERN_EMERG "exit_nr %d\n", exit_nr);
415 BUG(); 430 BUG();
@@ -471,6 +486,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
471 vcpu->arch.msr = 0; 486 vcpu->arch.msr = 0;
472 vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */ 487 vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */
473 488
489 vcpu->arch.shadow_pid = 1;
490
474 /* Eye-catching number so we know if the guest takes an interrupt 491 /* Eye-catching number so we know if the guest takes an interrupt
475 * before it's programmed its own IVPR. */ 492 * before it's programmed its own IVPR. */
476 vcpu->arch.ivpr = 0x55550000; 493 vcpu->arch.ivpr = 0x55550000;
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 3b653b5309b8..95e165baf85f 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -42,7 +42,8 @@
42#define HOST_STACK_LR (HOST_STACK_SIZE + 4) /* In caller stack frame. */ 42#define HOST_STACK_LR (HOST_STACK_SIZE + 4) /* In caller stack frame. */
43 43
44#define NEED_INST_MASK ((1<<BOOKE_INTERRUPT_PROGRAM) | \ 44#define NEED_INST_MASK ((1<<BOOKE_INTERRUPT_PROGRAM) | \
45 (1<<BOOKE_INTERRUPT_DTLB_MISS)) 45 (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
46 (1<<BOOKE_INTERRUPT_DEBUG))
46 47
47#define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \ 48#define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
48 (1<<BOOKE_INTERRUPT_DTLB_MISS)) 49 (1<<BOOKE_INTERRUPT_DTLB_MISS))
@@ -331,51 +332,57 @@ lightweight_exit:
331 332
332 mfspr r3, SPRN_PID 333 mfspr r3, SPRN_PID
333 stw r3, VCPU_HOST_PID(r4) 334 stw r3, VCPU_HOST_PID(r4)
334 lwz r3, VCPU_PID(r4) 335 lwz r3, VCPU_SHADOW_PID(r4)
335 mtspr SPRN_PID, r3 336 mtspr SPRN_PID, r3
336 337
337 /* Prevent all TLB updates. */ 338 /* Prevent all asynchronous TLB updates. */
338 mfmsr r5 339 mfmsr r5
339 lis r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@h 340 lis r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@h
340 ori r6, r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l 341 ori r6, r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
341 andc r6, r5, r6 342 andc r6, r5, r6
342 mtmsr r6 343 mtmsr r6
343 344
344 /* Save the host's non-pinned TLB mappings, and load the guest mappings 345 /* Load the guest mappings, leaving the host's "pinned" kernel mappings
345 * over them. Leave the host's "pinned" kernel mappings in place. */ 346 * in place. */
346 /* XXX optimization: use generation count to avoid swapping unmodified
347 * entries. */
348 mfspr r10, SPRN_MMUCR /* Save host MMUCR. */ 347 mfspr r10, SPRN_MMUCR /* Save host MMUCR. */
349 lis r8, tlb_44x_hwater@ha 348 li r5, PPC44x_TLB_SIZE
350 lwz r8, tlb_44x_hwater@l(r8) 349 lis r5, tlb_44x_hwater@ha
351 addi r3, r4, VCPU_HOST_TLB - 4 350 lwz r5, tlb_44x_hwater@l(r5)
352 addi r9, r4, VCPU_SHADOW_TLB - 4 351 mtctr r5
353 li r6, 0 352 addi r9, r4, VCPU_SHADOW_TLB
353 addi r5, r4, VCPU_SHADOW_MOD
354 li r3, 0
3541: 3551:
355 /* Save host entry. */ 356 lbzx r7, r3, r5
356 tlbre r7, r6, PPC44x_TLB_PAGEID 357 cmpwi r7, 0
357 mfspr r5, SPRN_MMUCR 358 beq 3f
358 stwu r5, 4(r3) 359
359 stwu r7, 4(r3)
360 tlbre r7, r6, PPC44x_TLB_XLAT
361 stwu r7, 4(r3)
362 tlbre r7, r6, PPC44x_TLB_ATTRIB
363 stwu r7, 4(r3)
364 /* Load guest entry. */ 360 /* Load guest entry. */
365 lwzu r7, 4(r9) 361 mulli r11, r3, TLBE_BYTES
362 add r11, r11, r9
363 lwz r7, 0(r11)
366 mtspr SPRN_MMUCR, r7 364 mtspr SPRN_MMUCR, r7
367 lwzu r7, 4(r9) 365 lwz r7, 4(r11)
368 tlbwe r7, r6, PPC44x_TLB_PAGEID 366 tlbwe r7, r3, PPC44x_TLB_PAGEID
369 lwzu r7, 4(r9) 367 lwz r7, 8(r11)
370 tlbwe r7, r6, PPC44x_TLB_XLAT 368 tlbwe r7, r3, PPC44x_TLB_XLAT
371 lwzu r7, 4(r9) 369 lwz r7, 12(r11)
372 tlbwe r7, r6, PPC44x_TLB_ATTRIB 370 tlbwe r7, r3, PPC44x_TLB_ATTRIB
373 /* Increment index. */ 3713:
374 addi r6, r6, 1 372 addi r3, r3, 1 /* Increment index. */
375 cmpw r6, r8 373 bdnz 1b
376 blt 1b 374
377 mtspr SPRN_MMUCR, r10 /* Restore host MMUCR. */ 375 mtspr SPRN_MMUCR, r10 /* Restore host MMUCR. */
378 376
377 /* Clear bitmap of modified TLB entries */
378 li r5, PPC44x_TLB_SIZE>>2
379 mtctr r5
380 addi r5, r4, VCPU_SHADOW_MOD - 4
381 li r6, 0
3821:
383 stwu r6, 4(r5)
384 bdnz 1b
385
379 iccci 0, 0 /* XXX hack */ 386 iccci 0, 0 /* XXX hack */
380 387
381 /* Load some guest volatiles. */ 388 /* Load some guest volatiles. */
@@ -431,6 +438,14 @@ lightweight_exit:
431 oris r3, r3, KVMPPC_MSR_MASK@h 438 oris r3, r3, KVMPPC_MSR_MASK@h
432 ori r3, r3, KVMPPC_MSR_MASK@l 439 ori r3, r3, KVMPPC_MSR_MASK@l
433 mtsrr1 r3 440 mtsrr1 r3
441
442 /* Clear any debug events which occurred since we disabled MSR[DE].
443 * XXX This gives us a 3-instruction window in which a breakpoint
444 * intended for guest context could fire in the host instead. */
445 lis r3, 0xffff
446 ori r3, r3, 0xffff
447 mtspr SPRN_DBSR, r3
448
434 lwz r3, VCPU_GPR(r3)(r4) 449 lwz r3, VCPU_GPR(r3)(r4)
435 lwz r4, VCPU_GPR(r4)(r4) 450 lwz r4, VCPU_GPR(r4)(r4)
436 rfi 451 rfi
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 8c605d0a5488..0fce4fbdc20d 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -170,6 +170,10 @@ static int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u32 inst)
170 kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags); 170 kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags);
171 } 171 }
172 172
173 KVMTRACE_5D(GTLB_WRITE, vcpu, index,
174 tlbe->tid, tlbe->word0, tlbe->word1, tlbe->word2,
175 handler);
176
173 return EMULATE_DONE; 177 return EMULATE_DONE;
174} 178}
175 179
@@ -504,7 +508,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
504 case SPRN_MMUCR: 508 case SPRN_MMUCR:
505 vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break; 509 vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
506 case SPRN_PID: 510 case SPRN_PID:
507 vcpu->arch.pid = vcpu->arch.gpr[rs]; break; 511 kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
508 case SPRN_CCR0: 512 case SPRN_CCR0:
509 vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break; 513 vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
510 case SPRN_CCR1: 514 case SPRN_CCR1:
@@ -765,6 +769,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
765 break; 769 break;
766 } 770 }
767 771
772 KVMTRACE_3D(PPC_INSTR, vcpu, inst, vcpu->arch.pc, emulated, entryexit);
773
768 if (advance) 774 if (advance)
769 vcpu->arch.pc += 4; /* Advance past emulated instruction. */ 775 vcpu->arch.pc += 4; /* Advance past emulated instruction. */
770 776
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 53826a5f6c06..90a6fc422b23 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -27,6 +27,7 @@
27#include <asm/cputable.h> 27#include <asm/cputable.h>
28#include <asm/uaccess.h> 28#include <asm/uaccess.h>
29#include <asm/kvm_ppc.h> 29#include <asm/kvm_ppc.h>
30#include <asm/tlbflush.h>
30 31
31 32
32gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 33gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
@@ -239,18 +240,114 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
239{ 240{
240} 241}
241 242
243/* Note: clearing MSR[DE] just means that the debug interrupt will not be
244 * delivered *immediately*. Instead, it simply sets the appropriate DBSR bits.
245 * If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt
246 * will be delivered as an "imprecise debug event" (which is indicated by
247 * DBSR[IDE].
248 */
249static void kvmppc_disable_debug_interrupts(void)
250{
251 mtmsr(mfmsr() & ~MSR_DE);
252}
253
254static void kvmppc_restore_host_debug_state(struct kvm_vcpu *vcpu)
255{
256 kvmppc_disable_debug_interrupts();
257
258 mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]);
259 mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]);
260 mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]);
261 mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]);
262 mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1);
263 mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2);
264 mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0);
265 mtmsr(vcpu->arch.host_msr);
266}
267
268static void kvmppc_load_guest_debug_registers(struct kvm_vcpu *vcpu)
269{
270 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
271 u32 dbcr0 = 0;
272
273 vcpu->arch.host_msr = mfmsr();
274 kvmppc_disable_debug_interrupts();
275
276 /* Save host debug register state. */
277 vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1);
278 vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2);
279 vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3);
280 vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4);
281 vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0);
282 vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1);
283 vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2);
284
285 /* set registers up for guest */
286
287 if (dbg->bp[0]) {
288 mtspr(SPRN_IAC1, dbg->bp[0]);
289 dbcr0 |= DBCR0_IAC1 | DBCR0_IDM;
290 }
291 if (dbg->bp[1]) {
292 mtspr(SPRN_IAC2, dbg->bp[1]);
293 dbcr0 |= DBCR0_IAC2 | DBCR0_IDM;
294 }
295 if (dbg->bp[2]) {
296 mtspr(SPRN_IAC3, dbg->bp[2]);
297 dbcr0 |= DBCR0_IAC3 | DBCR0_IDM;
298 }
299 if (dbg->bp[3]) {
300 mtspr(SPRN_IAC4, dbg->bp[3]);
301 dbcr0 |= DBCR0_IAC4 | DBCR0_IDM;
302 }
303
304 mtspr(SPRN_DBCR0, dbcr0);
305 mtspr(SPRN_DBCR1, 0);
306 mtspr(SPRN_DBCR2, 0);
307}
308
242void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 309void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
243{ 310{
311 int i;
312
313 if (vcpu->guest_debug.enabled)
314 kvmppc_load_guest_debug_registers(vcpu);
315
316 /* Mark every guest entry in the shadow TLB entry modified, so that they
317 * will all be reloaded on the next vcpu run (instead of being
318 * demand-faulted). */
319 for (i = 0; i <= tlb_44x_hwater; i++)
320 kvmppc_tlbe_set_modified(vcpu, i);
244} 321}
245 322
246void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 323void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
247{ 324{
325 if (vcpu->guest_debug.enabled)
326 kvmppc_restore_host_debug_state(vcpu);
327
328 /* Don't leave guest TLB entries resident when being de-scheduled. */
329 /* XXX It would be nice to differentiate between heavyweight exit and
330 * sched_out here, since we could avoid the TLB flush for heavyweight
331 * exits. */
332 _tlbia();
248} 333}
249 334
250int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, 335int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
251 struct kvm_debug_guest *dbg) 336 struct kvm_debug_guest *dbg)
252{ 337{
253 return -ENOTSUPP; 338 int i;
339
340 vcpu->guest_debug.enabled = dbg->enabled;
341 if (vcpu->guest_debug.enabled) {
342 for (i=0; i < ARRAY_SIZE(vcpu->guest_debug.bp); i++) {
343 if (dbg->breakpoints[i].enabled)
344 vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
345 else
346 vcpu->guest_debug.bp[i] = 0;
347 }
348 }
349
350 return 0;
254} 351}
255 352
256static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, 353static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 4c03049e7db9..bc581d8a7cd9 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -565,13 +565,16 @@ config ZFCPDUMP
565 Refer to <file:Documentation/s390/zfcpdump.txt> for more details on this. 565 Refer to <file:Documentation/s390/zfcpdump.txt> for more details on this.
566 566
567config S390_GUEST 567config S390_GUEST
568bool "s390 guest support (EXPERIMENTAL)" 568bool "s390 guest support for KVM (EXPERIMENTAL)"
569 depends on 64BIT && EXPERIMENTAL 569 depends on 64BIT && EXPERIMENTAL
570 select VIRTIO 570 select VIRTIO
571 select VIRTIO_RING 571 select VIRTIO_RING
572 select VIRTIO_CONSOLE 572 select VIRTIO_CONSOLE
573 help 573 help
574 Select this option if you want to run the kernel under s390 linux 574 Select this option if you want to run the kernel as a guest under
575 the KVM hypervisor. This will add detection for KVM as well as a
576 virtio transport. If KVM is detected, the virtio console will be
577 the default console.
575endmenu 578endmenu
576 579
577source "net/Kconfig" 580source "net/Kconfig"
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index d1faf5c54405..cce40ff2913b 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -157,8 +157,8 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
157 int rc; 157 int rc;
158 158
159 vcpu->stat.instruction_stfl++; 159 vcpu->stat.instruction_stfl++;
160 facility_list &= ~(1UL<<24); /* no stfle */ 160 /* only pass the facility bits, which we can handle */
161 facility_list &= ~(1UL<<23); /* no large pages */ 161 facility_list &= 0xfe00fff3;
162 162
163 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), 163 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
164 &facility_list, sizeof(facility_list)); 164 &facility_list, sizeof(facility_list));
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d02def06ca91..774ac4991568 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
78 return ret; 78 return ret;
79} 79}
80 80
81/*
82 * If we don't do that, there is the possibility that the guest
83 * will calibrate under heavy load - thus, getting a lower lpj -
84 * and execute the delays themselves without load. This is wrong,
85 * because no delay loop can finish beforehand.
86 * Any heuristics is subject to fail, because ultimately, a large
87 * poll of guests can be running and trouble each other. So we preset
88 * lpj here
89 */
90static unsigned long kvm_get_tsc_khz(void)
91{
92 return preset_lpj;
93}
94
95static void kvm_get_preset_lpj(void)
96{
97 struct pvclock_vcpu_time_info *src;
98 unsigned long khz;
99 u64 lpj;
100
101 src = &per_cpu(hv_clock, 0);
102 khz = pvclock_tsc_khz(src);
103
104 lpj = ((u64)khz * 1000);
105 do_div(lpj, HZ);
106 preset_lpj = lpj;
107}
108
81static struct clocksource kvm_clock = { 109static struct clocksource kvm_clock = {
82 .name = "kvm-clock", 110 .name = "kvm-clock",
83 .read = kvm_clock_read, 111 .read = kvm_clock_read,
@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
153 pv_time_ops.get_wallclock = kvm_get_wallclock; 181 pv_time_ops.get_wallclock = kvm_get_wallclock;
154 pv_time_ops.set_wallclock = kvm_set_wallclock; 182 pv_time_ops.set_wallclock = kvm_set_wallclock;
155 pv_time_ops.sched_clock = kvm_clock_read; 183 pv_time_ops.sched_clock = kvm_clock_read;
184 pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
156#ifdef CONFIG_X86_LOCAL_APIC 185#ifdef CONFIG_X86_LOCAL_APIC
157 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; 186 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
158#endif 187#endif
@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
163#ifdef CONFIG_KEXEC 192#ifdef CONFIG_KEXEC
164 machine_ops.crash_shutdown = kvm_crash_shutdown; 193 machine_ops.crash_shutdown = kvm_crash_shutdown;
165#endif 194#endif
195 kvm_get_preset_lpj();
166 clocksource_register(&kvm_clock); 196 clocksource_register(&kvm_clock);
167 } 197 }
168} 198}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 05fbe9a0325a..4f9c55f3a7c0 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
97 return dst->version; 97 return dst->version;
98} 98}
99 99
100unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
101{
102 u64 pv_tsc_khz = 1000000ULL << 32;
103
104 do_div(pv_tsc_khz, src->tsc_to_system_mul);
105 if (src->tsc_shift < 0)
106 pv_tsc_khz <<= -src->tsc_shift;
107 else
108 pv_tsc_khz >>= src->tsc_shift;
109 return pv_tsc_khz;
110}
111
100cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 112cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
101{ 113{
102 struct pvclock_shadow_time shadow; 114 struct pvclock_shadow_time shadow;
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d0e940bb6f40..c02343594b4d 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,10 +3,13 @@
3# 3#
4 4
5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
6 coalesced_mmio.o) 6 coalesced_mmio.o irq_comm.o)
7ifeq ($(CONFIG_KVM_TRACE),y) 7ifeq ($(CONFIG_KVM_TRACE),y)
8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) 8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
9endif 9endif
10ifeq ($(CONFIG_DMAR),y)
11common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
12endif
10 13
11EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
12 15
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c0f7872a9124..634132a9a512 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
200 200
201 if (!atomic_inc_and_test(&pt->pending)) 201 if (!atomic_inc_and_test(&pt->pending))
202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); 202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
203 if (vcpu0 && waitqueue_active(&vcpu0->wq)) { 203
204 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; 204 if (vcpu0 && waitqueue_active(&vcpu0->wq))
205 wake_up_interruptible(&vcpu0->wq); 205 wake_up_interruptible(&vcpu0->wq);
206 }
207 206
208 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 207 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
209 pt->scheduled = ktime_to_ns(pt->timer.expires); 208 pt->scheduled = ktime_to_ns(pt->timer.expires);
209 if (pt->period)
210 ps->channels[0].count_load_time = pt->timer.expires;
210 211
211 return (pt->period == 0 ? 0 : 1); 212 return (pt->period == 0 ? 0 : 1);
212} 213}
@@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
215{ 216{
216 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 217 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
217 218
218 if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending) 219 if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
219 return atomic_read(&pit->pit_state.pit_timer.pending); 220 return atomic_read(&pit->pit_state.pit_timer.pending);
220
221 return 0; 221 return 0;
222} 222}
223 223
224static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
225{
226 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
227 irq_ack_notifier);
228 spin_lock(&ps->inject_lock);
229 if (atomic_dec_return(&ps->pit_timer.pending) < 0)
230 atomic_inc(&ps->pit_timer.pending);
231 ps->irq_ack = 1;
232 spin_unlock(&ps->inject_lock);
233}
234
224static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) 235static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
225{ 236{
226 struct kvm_kpit_state *ps; 237 struct kvm_kpit_state *ps;
@@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt)
255 hrtimer_cancel(&pt->timer); 266 hrtimer_cancel(&pt->timer);
256} 267}
257 268
258static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) 269static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
259{ 270{
271 struct kvm_kpit_timer *pt = &ps->pit_timer;
260 s64 interval; 272 s64 interval;
261 273
262 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 274 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
268 pt->period = (is_period == 0) ? 0 : interval; 280 pt->period = (is_period == 0) ? 0 : interval;
269 pt->timer.function = pit_timer_fn; 281 pt->timer.function = pit_timer_fn;
270 atomic_set(&pt->pending, 0); 282 atomic_set(&pt->pending, 0);
283 ps->irq_ack = 1;
271 284
272 hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), 285 hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
273 HRTIMER_MODE_ABS); 286 HRTIMER_MODE_ABS);
@@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
302 case 1: 315 case 1:
303 /* FIXME: enhance mode 4 precision */ 316 /* FIXME: enhance mode 4 precision */
304 case 4: 317 case 4:
305 create_pit_timer(&ps->pit_timer, val, 0); 318 create_pit_timer(ps, val, 0);
306 break; 319 break;
307 case 2: 320 case 2:
308 case 3: 321 case 3:
309 create_pit_timer(&ps->pit_timer, val, 1); 322 create_pit_timer(ps, val, 1);
310 break; 323 break;
311 default: 324 default:
312 destroy_pit_timer(&ps->pit_timer); 325 destroy_pit_timer(&ps->pit_timer);
@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
520 mutex_unlock(&pit->pit_state.lock); 533 mutex_unlock(&pit->pit_state.lock);
521 534
522 atomic_set(&pit->pit_state.pit_timer.pending, 0); 535 atomic_set(&pit->pit_state.pit_timer.pending, 0);
523 pit->pit_state.inject_pending = 1; 536 pit->pit_state.irq_ack = 1;
524} 537}
525 538
526struct kvm_pit *kvm_create_pit(struct kvm *kvm) 539struct kvm_pit *kvm_create_pit(struct kvm *kvm)
@@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
534 547
535 mutex_init(&pit->pit_state.lock); 548 mutex_init(&pit->pit_state.lock);
536 mutex_lock(&pit->pit_state.lock); 549 mutex_lock(&pit->pit_state.lock);
550 spin_lock_init(&pit->pit_state.inject_lock);
537 551
538 /* Initialize PIO device */ 552 /* Initialize PIO device */
539 pit->dev.read = pit_ioport_read; 553 pit->dev.read = pit_ioport_read;
@@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
555 pit_state->pit = pit; 569 pit_state->pit = pit;
556 hrtimer_init(&pit_state->pit_timer.timer, 570 hrtimer_init(&pit_state->pit_timer.timer,
557 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 571 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
572 pit_state->irq_ack_notifier.gsi = 0;
573 pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
574 kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
558 mutex_unlock(&pit->pit_state.lock); 575 mutex_unlock(&pit->pit_state.lock);
559 576
560 kvm_pit_reset(pit); 577 kvm_pit_reset(pit);
@@ -578,10 +595,8 @@ void kvm_free_pit(struct kvm *kvm)
578static void __inject_pit_timer_intr(struct kvm *kvm) 595static void __inject_pit_timer_intr(struct kvm *kvm)
579{ 596{
580 mutex_lock(&kvm->lock); 597 mutex_lock(&kvm->lock);
581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); 598 kvm_set_irq(kvm, 0, 1);
582 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0); 599 kvm_set_irq(kvm, 0, 0);
583 kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
584 kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
585 mutex_unlock(&kvm->lock); 600 mutex_unlock(&kvm->lock);
586} 601}
587 602
@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
592 struct kvm_kpit_state *ps; 607 struct kvm_kpit_state *ps;
593 608
594 if (vcpu && pit) { 609 if (vcpu && pit) {
610 int inject = 0;
595 ps = &pit->pit_state; 611 ps = &pit->pit_state;
596 612
597 /* Try to inject pending interrupts when: 613 /* Try to inject pending interrupts when
598 * 1. Pending exists 614 * last one has been acked.
599 * 2. Last interrupt was accepted or waited for too long time*/ 615 */
600 if (atomic_read(&ps->pit_timer.pending) && 616 spin_lock(&ps->inject_lock);
601 (ps->inject_pending || 617 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
602 (jiffies - ps->last_injected_time 618 ps->irq_ack = 0;
603 >= KVM_MAX_PIT_INTR_INTERVAL))) { 619 inject = 1;
604 ps->inject_pending = 0;
605 __inject_pit_timer_intr(kvm);
606 ps->last_injected_time = jiffies;
607 }
608 }
609}
610
611void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
612{
613 struct kvm_arch *arch = &vcpu->kvm->arch;
614 struct kvm_kpit_state *ps;
615
616 if (vcpu && arch->vpit) {
617 ps = &arch->vpit->pit_state;
618 if (atomic_read(&ps->pit_timer.pending) &&
619 (((arch->vpic->pics[0].imr & 1) == 0 &&
620 arch->vpic->pics[0].irq_base == vec) ||
621 (arch->vioapic->redirtbl[0].fields.vector == vec &&
622 arch->vioapic->redirtbl[0].fields.mask != 1))) {
623 ps->inject_pending = 1;
624 atomic_dec(&ps->pit_timer.pending);
625 ps->channels[0].count_load_time = ktime_get();
626 } 620 }
621 spin_unlock(&ps->inject_lock);
622 if (inject)
623 __inject_pit_timer_intr(kvm);
627 } 624 }
628} 625}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index db25c2a6c8c4..e436d4983aa1 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -8,7 +8,6 @@ struct kvm_kpit_timer {
8 int irq; 8 int irq;
9 s64 period; /* unit: ns */ 9 s64 period; /* unit: ns */
10 s64 scheduled; 10 s64 scheduled;
11 ktime_t last_update;
12 atomic_t pending; 11 atomic_t pending;
13}; 12};
14 13
@@ -34,8 +33,9 @@ struct kvm_kpit_state {
34 u32 speaker_data_on; 33 u32 speaker_data_on;
35 struct mutex lock; 34 struct mutex lock;
36 struct kvm_pit *pit; 35 struct kvm_pit *pit;
37 bool inject_pending; /* if inject pending interrupts */ 36 spinlock_t inject_lock;
38 unsigned long last_injected_time; 37 unsigned long irq_ack;
38 struct kvm_irq_ack_notifier irq_ack_notifier;
39}; 39};
40 40
41struct kvm_pit { 41struct kvm_pit {
@@ -54,7 +54,6 @@ struct kvm_pit {
54#define KVM_PIT_CHANNEL_MASK 0x3 54#define KVM_PIT_CHANNEL_MASK 0x3
55 55
56void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); 56void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
57void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
58void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); 57void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
59struct kvm_pit *kvm_create_pit(struct kvm *kvm); 58struct kvm_pit *kvm_create_pit(struct kvm *kvm);
60void kvm_free_pit(struct kvm *kvm); 59void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index c31164e8aa46..17e41e165f1a 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,6 +30,19 @@
30 30
31#include <linux/kvm_host.h> 31#include <linux/kvm_host.h>
32 32
33static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
34{
35 s->isr &= ~(1 << irq);
36 s->isr_ack |= (1 << irq);
37}
38
39void kvm_pic_clear_isr_ack(struct kvm *kvm)
40{
41 struct kvm_pic *s = pic_irqchip(kvm);
42 s->pics[0].isr_ack = 0xff;
43 s->pics[1].isr_ack = 0xff;
44}
45
33/* 46/*
34 * set irq level. If an edge is detected, then the IRR is set to 1 47 * set irq level. If an edge is detected, then the IRR is set to 1
35 */ 48 */
@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
141 */ 154 */
142static inline void pic_intack(struct kvm_kpic_state *s, int irq) 155static inline void pic_intack(struct kvm_kpic_state *s, int irq)
143{ 156{
157 s->isr |= 1 << irq;
144 if (s->auto_eoi) { 158 if (s->auto_eoi) {
145 if (s->rotate_on_auto_eoi) 159 if (s->rotate_on_auto_eoi)
146 s->priority_add = (irq + 1) & 7; 160 s->priority_add = (irq + 1) & 7;
147 } else 161 pic_clear_isr(s, irq);
148 s->isr |= (1 << irq); 162 }
149 /* 163 /*
150 * We don't clear a level sensitive interrupt here 164 * We don't clear a level sensitive interrupt here
151 */ 165 */
@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
153 s->irr &= ~(1 << irq); 167 s->irr &= ~(1 << irq);
154} 168}
155 169
156int kvm_pic_read_irq(struct kvm_pic *s) 170int kvm_pic_read_irq(struct kvm *kvm)
157{ 171{
158 int irq, irq2, intno; 172 int irq, irq2, intno;
173 struct kvm_pic *s = pic_irqchip(kvm);
159 174
160 irq = pic_get_irq(&s->pics[0]); 175 irq = pic_get_irq(&s->pics[0]);
161 if (irq >= 0) { 176 if (irq >= 0) {
@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s)
181 intno = s->pics[0].irq_base + irq; 196 intno = s->pics[0].irq_base + irq;
182 } 197 }
183 pic_update_irq(s); 198 pic_update_irq(s);
199 kvm_notify_acked_irq(kvm, irq);
184 200
185 return intno; 201 return intno;
186} 202}
187 203
188void kvm_pic_reset(struct kvm_kpic_state *s) 204void kvm_pic_reset(struct kvm_kpic_state *s)
189{ 205{
206 int irq, irqbase;
207 struct kvm *kvm = s->pics_state->irq_request_opaque;
208 struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
209
210 if (s == &s->pics_state->pics[0])
211 irqbase = 0;
212 else
213 irqbase = 8;
214
215 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
216 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
217 if (s->irr & (1 << irq) || s->isr & (1 << irq))
218 kvm_notify_acked_irq(kvm, irq+irqbase);
219 }
190 s->last_irr = 0; 220 s->last_irr = 0;
191 s->irr = 0; 221 s->irr = 0;
192 s->imr = 0; 222 s->imr = 0;
193 s->isr = 0; 223 s->isr = 0;
224 s->isr_ack = 0xff;
194 s->priority_add = 0; 225 s->priority_add = 0;
195 s->irq_base = 0; 226 s->irq_base = 0;
196 s->read_reg_select = 0; 227 s->read_reg_select = 0;
@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
243 priority = get_priority(s, s->isr); 274 priority = get_priority(s, s->isr);
244 if (priority != 8) { 275 if (priority != 8) {
245 irq = (priority + s->priority_add) & 7; 276 irq = (priority + s->priority_add) & 7;
246 s->isr &= ~(1 << irq); 277 pic_clear_isr(s, irq);
247 if (cmd == 5) 278 if (cmd == 5)
248 s->priority_add = (irq + 1) & 7; 279 s->priority_add = (irq + 1) & 7;
249 pic_update_irq(s->pics_state); 280 pic_update_irq(s->pics_state);
@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
251 break; 282 break;
252 case 3: 283 case 3:
253 irq = val & 7; 284 irq = val & 7;
254 s->isr &= ~(1 << irq); 285 pic_clear_isr(s, irq);
255 pic_update_irq(s->pics_state); 286 pic_update_irq(s->pics_state);
256 break; 287 break;
257 case 6: 288 case 6:
@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
260 break; 291 break;
261 case 7: 292 case 7:
262 irq = val & 7; 293 irq = val & 7;
263 s->isr &= ~(1 << irq);
264 s->priority_add = (irq + 1) & 7; 294 s->priority_add = (irq + 1) & 7;
295 pic_clear_isr(s, irq);
265 pic_update_irq(s->pics_state); 296 pic_update_irq(s->pics_state);
266 break; 297 break;
267 default: 298 default:
@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
303 s->pics_state->pics[0].irr &= ~(1 << 2); 334 s->pics_state->pics[0].irr &= ~(1 << 2);
304 } 335 }
305 s->irr &= ~(1 << ret); 336 s->irr &= ~(1 << ret);
306 s->isr &= ~(1 << ret); 337 pic_clear_isr(s, ret);
307 if (addr1 >> 7 || ret != 2) 338 if (addr1 >> 7 || ret != 2)
308 pic_update_irq(s->pics_state); 339 pic_update_irq(s->pics_state);
309 } else { 340 } else {
@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level)
422{ 453{
423 struct kvm *kvm = opaque; 454 struct kvm *kvm = opaque;
424 struct kvm_vcpu *vcpu = kvm->vcpus[0]; 455 struct kvm_vcpu *vcpu = kvm->vcpus[0];
456 struct kvm_pic *s = pic_irqchip(kvm);
457 int irq = pic_get_irq(&s->pics[0]);
425 458
426 pic_irqchip(kvm)->output = level; 459 s->output = level;
427 if (vcpu) 460 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
461 s->pics[0].isr_ack &= ~(1 << irq);
428 kvm_vcpu_kick(vcpu); 462 kvm_vcpu_kick(vcpu);
463 }
429} 464}
430 465
431struct kvm_pic *kvm_create_pic(struct kvm *kvm) 466struct kvm_pic *kvm_create_pic(struct kvm *kvm)
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 76d736b5f664..c019b8edcdb7 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
72 if (kvm_apic_accept_pic_intr(v)) { 72 if (kvm_apic_accept_pic_intr(v)) {
73 s = pic_irqchip(v->kvm); 73 s = pic_irqchip(v->kvm);
74 s->output = 0; /* PIC */ 74 s->output = 0; /* PIC */
75 vector = kvm_pic_read_irq(s); 75 vector = kvm_pic_read_irq(v->kvm);
76 } 76 }
77 } 77 }
78 return vector; 78 return vector;
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
90void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) 90void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
91{ 91{
92 kvm_apic_timer_intr_post(vcpu, vec); 92 kvm_apic_timer_intr_post(vcpu, vec);
93 kvm_pit_timer_intr_post(vcpu, vec);
94 /* TODO: PIT, RTC etc. */ 93 /* TODO: PIT, RTC etc. */
95} 94}
96EXPORT_SYMBOL_GPL(kvm_timer_intr_post); 95EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7ca47cbb48bb..f17c8f5bbf31 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -42,6 +42,7 @@ struct kvm_kpic_state {
42 u8 irr; /* interrupt request register */ 42 u8 irr; /* interrupt request register */
43 u8 imr; /* interrupt mask register */ 43 u8 imr; /* interrupt mask register */
44 u8 isr; /* interrupt service register */ 44 u8 isr; /* interrupt service register */
45 u8 isr_ack; /* interrupt ack detection */
45 u8 priority_add; /* highest irq priority */ 46 u8 priority_add; /* highest irq priority */
46 u8 irq_base; 47 u8 irq_base;
47 u8 read_reg_select; 48 u8 read_reg_select;
@@ -63,12 +64,13 @@ struct kvm_pic {
63 void *irq_request_opaque; 64 void *irq_request_opaque;
64 int output; /* intr from master PIC */ 65 int output; /* intr from master PIC */
65 struct kvm_io_device dev; 66 struct kvm_io_device dev;
67 void (*ack_notifier)(void *opaque, int irq);
66}; 68};
67 69
68struct kvm_pic *kvm_create_pic(struct kvm *kvm); 70struct kvm_pic *kvm_create_pic(struct kvm *kvm);
69void kvm_pic_set_irq(void *opaque, int irq, int level); 71int kvm_pic_read_irq(struct kvm *kvm);
70int kvm_pic_read_irq(struct kvm_pic *s);
71void kvm_pic_update_irq(struct kvm_pic *s); 72void kvm_pic_update_irq(struct kvm_pic *s);
73void kvm_pic_clear_isr_ack(struct kvm *kvm);
72 74
73static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) 75static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
74{ 76{
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644
index 000000000000..1ff819dce7d3
--- /dev/null
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -0,0 +1,32 @@
1#ifndef ASM_KVM_CACHE_REGS_H
2#define ASM_KVM_CACHE_REGS_H
3
4static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
5 enum kvm_reg reg)
6{
7 if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
8 kvm_x86_ops->cache_reg(vcpu, reg);
9
10 return vcpu->arch.regs[reg];
11}
12
13static inline void kvm_register_write(struct kvm_vcpu *vcpu,
14 enum kvm_reg reg,
15 unsigned long val)
16{
17 vcpu->arch.regs[reg] = val;
18 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
19 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
20}
21
22static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
23{
24 return kvm_register_read(vcpu, VCPU_REGS_RIP);
25}
26
27static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
28{
29 kvm_register_write(vcpu, VCPU_REGS_RIP, val);
30}
31
32#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73f43de69f67..6571926bfd33 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,6 +32,7 @@
32#include <asm/current.h> 32#include <asm/current.h>
33#include <asm/apicdef.h> 33#include <asm/apicdef.h>
34#include <asm/atomic.h> 34#include <asm/atomic.h>
35#include "kvm_cache_regs.h"
35#include "irq.h" 36#include "irq.h"
36 37
37#define PRId64 "d" 38#define PRId64 "d"
@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
338 } else 339 } else
339 apic_clear_vector(vector, apic->regs + APIC_TMR); 340 apic_clear_vector(vector, apic->regs + APIC_TMR);
340 341
341 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 342 kvm_vcpu_kick(vcpu);
342 kvm_vcpu_kick(vcpu);
343 else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
344 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
345 if (waitqueue_active(&vcpu->wq))
346 wake_up_interruptible(&vcpu->wq);
347 }
348 343
349 result = (orig_irr == 0); 344 result = (orig_irr == 0);
350 break; 345 break;
@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
370 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 365 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
371 kvm_vcpu_kick(vcpu); 366 kvm_vcpu_kick(vcpu);
372 } else { 367 } else {
373 printk(KERN_DEBUG 368 apic_debug("Ignoring de-assert INIT to vcpu %d\n",
374 "Ignoring de-assert INIT to vcpu %d\n", 369 vcpu->vcpu_id);
375 vcpu->vcpu_id);
376 } 370 }
377
378 break; 371 break;
379 372
380 case APIC_DM_STARTUP: 373 case APIC_DM_STARTUP:
381 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", 374 apic_debug("SIPI to vcpu %d vector 0x%02x\n",
382 vcpu->vcpu_id, vector); 375 vcpu->vcpu_id, vector);
383 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 376 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
384 vcpu->arch.sipi_vector = vector; 377 vcpu->arch.sipi_vector = vector;
385 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 378 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
386 if (waitqueue_active(&vcpu->wq)) 379 kvm_vcpu_kick(vcpu);
387 wake_up_interruptible(&vcpu->wq);
388 } 380 }
389 break; 381 break;
390 382
@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
438static void apic_set_eoi(struct kvm_lapic *apic) 430static void apic_set_eoi(struct kvm_lapic *apic)
439{ 431{
440 int vector = apic_find_highest_isr(apic); 432 int vector = apic_find_highest_isr(apic);
441 433 int trigger_mode;
442 /* 434 /*
443 * Not every write EOI will has corresponding ISR, 435 * Not every write EOI will has corresponding ISR,
444 * one example is when Kernel check timer on setup_IO_APIC 436 * one example is when Kernel check timer on setup_IO_APIC
@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic)
450 apic_update_ppr(apic); 442 apic_update_ppr(apic);
451 443
452 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) 444 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
453 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); 445 trigger_mode = IOAPIC_LEVEL_TRIG;
446 else
447 trigger_mode = IOAPIC_EDGE_TRIG;
448 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
454} 449}
455 450
456static void apic_send_ipi(struct kvm_lapic *apic) 451static void apic_send_ipi(struct kvm_lapic *apic)
@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
558 struct kvm_run *run = vcpu->run; 553 struct kvm_run *run = vcpu->run;
559 554
560 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); 555 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
561 kvm_x86_ops->cache_regs(vcpu); 556 run->tpr_access.rip = kvm_rip_read(vcpu);
562 run->tpr_access.rip = vcpu->arch.rip;
563 run->tpr_access.is_write = write; 557 run->tpr_access.is_write = write;
564} 558}
565 559
@@ -683,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
683 * Refer SDM 8.4.1 677 * Refer SDM 8.4.1
684 */ 678 */
685 if (len != 4 || alignment) { 679 if (len != 4 || alignment) {
686 if (printk_ratelimit()) 680 /* Don't shout loud, $infamous_os would cause only noise. */
687 printk(KERN_ERR "apic write: bad size=%d %lx\n", 681 apic_debug("apic write: bad size=%d %lx\n",
688 len, (long)address); 682 len, (long)address);
689 return; 683 return;
690 } 684 }
691 685
@@ -947,10 +941,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
947 941
948 if(!atomic_inc_and_test(&apic->timer.pending)) 942 if(!atomic_inc_and_test(&apic->timer.pending))
949 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); 943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
950 if (waitqueue_active(q)) { 944 if (waitqueue_active(q))
951 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
952 wake_up_interruptible(q); 945 wake_up_interruptible(q);
953 } 946
954 if (apic_lvtt_period(apic)) { 947 if (apic_lvtt_period(apic)) {
955 result = 1; 948 result = 1;
956 apic->timer.dev.expires = ktime_add_ns( 949 apic->timer.dev.expires = ktime_add_ns(
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3da2508eb22a..99c239c5c0ac 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,6 +70,9 @@ static int dbg = 0;
70module_param(dbg, bool, 0644); 70module_param(dbg, bool, 0644);
71#endif 71#endif
72 72
73static int oos_shadow = 1;
74module_param(oos_shadow, bool, 0644);
75
73#ifndef MMU_DEBUG 76#ifndef MMU_DEBUG
74#define ASSERT(x) do { } while (0) 77#define ASSERT(x) do { } while (0)
75#else 78#else
@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644);
135#define ACC_USER_MASK PT_USER_MASK 138#define ACC_USER_MASK PT_USER_MASK
136#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 139#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
137 140
138struct kvm_pv_mmu_op_buffer { 141#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
139 void *ptr;
140 unsigned len;
141 unsigned processed;
142 char buf[512] __aligned(sizeof(long));
143};
144 142
145struct kvm_rmap_desc { 143struct kvm_rmap_desc {
146 u64 *shadow_ptes[RMAP_EXT]; 144 u64 *shadow_ptes[RMAP_EXT];
147 struct kvm_rmap_desc *more; 145 struct kvm_rmap_desc *more;
148}; 146};
149 147
148struct kvm_shadow_walk {
149 int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
150 u64 addr, u64 *spte, int level);
151};
152
153struct kvm_unsync_walk {
154 int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
155};
156
157typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
158
150static struct kmem_cache *pte_chain_cache; 159static struct kmem_cache *pte_chain_cache;
151static struct kmem_cache *rmap_desc_cache; 160static struct kmem_cache *rmap_desc_cache;
152static struct kmem_cache *mmu_page_header_cache; 161static struct kmem_cache *mmu_page_header_cache;
@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
405{ 414{
406 struct vm_area_struct *vma; 415 struct vm_area_struct *vma;
407 unsigned long addr; 416 unsigned long addr;
417 int ret = 0;
408 418
409 addr = gfn_to_hva(kvm, gfn); 419 addr = gfn_to_hva(kvm, gfn);
410 if (kvm_is_error_hva(addr)) 420 if (kvm_is_error_hva(addr))
411 return 0; 421 return ret;
412 422
423 down_read(&current->mm->mmap_sem);
413 vma = find_vma(current->mm, addr); 424 vma = find_vma(current->mm, addr);
414 if (vma && is_vm_hugetlb_page(vma)) 425 if (vma && is_vm_hugetlb_page(vma))
415 return 1; 426 ret = 1;
427 up_read(&current->mm->mmap_sem);
416 428
417 return 0; 429 return ret;
418} 430}
419 431
420static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) 432static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
649 661
650 if (write_protected) 662 if (write_protected)
651 kvm_flush_remote_tlbs(kvm); 663 kvm_flush_remote_tlbs(kvm);
652
653 account_shadowed(kvm, gfn);
654} 664}
655 665
656static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) 666static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -859,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
859 BUG(); 869 BUG();
860} 870}
861 871
872
873static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
874 mmu_parent_walk_fn fn)
875{
876 struct kvm_pte_chain *pte_chain;
877 struct hlist_node *node;
878 struct kvm_mmu_page *parent_sp;
879 int i;
880
881 if (!sp->multimapped && sp->parent_pte) {
882 parent_sp = page_header(__pa(sp->parent_pte));
883 fn(vcpu, parent_sp);
884 mmu_parent_walk(vcpu, parent_sp, fn);
885 return;
886 }
887 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
888 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
889 if (!pte_chain->parent_ptes[i])
890 break;
891 parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
892 fn(vcpu, parent_sp);
893 mmu_parent_walk(vcpu, parent_sp, fn);
894 }
895}
896
897static void kvm_mmu_update_unsync_bitmap(u64 *spte)
898{
899 unsigned int index;
900 struct kvm_mmu_page *sp = page_header(__pa(spte));
901
902 index = spte - sp->spt;
903 __set_bit(index, sp->unsync_child_bitmap);
904 sp->unsync_children = 1;
905}
906
907static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
908{
909 struct kvm_pte_chain *pte_chain;
910 struct hlist_node *node;
911 int i;
912
913 if (!sp->parent_pte)
914 return;
915
916 if (!sp->multimapped) {
917 kvm_mmu_update_unsync_bitmap(sp->parent_pte);
918 return;
919 }
920
921 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
922 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
923 if (!pte_chain->parent_ptes[i])
924 break;
925 kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
926 }
927}
928
929static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
930{
931 sp->unsync_children = 1;
932 kvm_mmu_update_parents_unsync(sp);
933 return 1;
934}
935
936static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
937 struct kvm_mmu_page *sp)
938{
939 mmu_parent_walk(vcpu, sp, unsync_walk_fn);
940 kvm_mmu_update_parents_unsync(sp);
941}
942
862static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, 943static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
863 struct kvm_mmu_page *sp) 944 struct kvm_mmu_page *sp)
864{ 945{
@@ -868,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
868 sp->spt[i] = shadow_trap_nonpresent_pte; 949 sp->spt[i] = shadow_trap_nonpresent_pte;
869} 950}
870 951
952static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
953 struct kvm_mmu_page *sp)
954{
955 return 1;
956}
957
958static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
959{
960}
961
962#define for_each_unsync_children(bitmap, idx) \
963 for (idx = find_first_bit(bitmap, 512); \
964 idx < 512; \
965 idx = find_next_bit(bitmap, 512, idx+1))
966
967static int mmu_unsync_walk(struct kvm_mmu_page *sp,
968 struct kvm_unsync_walk *walker)
969{
970 int i, ret;
971
972 if (!sp->unsync_children)
973 return 0;
974
975 for_each_unsync_children(sp->unsync_child_bitmap, i) {
976 u64 ent = sp->spt[i];
977
978 if (is_shadow_present_pte(ent)) {
979 struct kvm_mmu_page *child;
980 child = page_header(ent & PT64_BASE_ADDR_MASK);
981
982 if (child->unsync_children) {
983 ret = mmu_unsync_walk(child, walker);
984 if (ret)
985 return ret;
986 __clear_bit(i, sp->unsync_child_bitmap);
987 }
988
989 if (child->unsync) {
990 ret = walker->entry(child, walker);
991 __clear_bit(i, sp->unsync_child_bitmap);
992 if (ret)
993 return ret;
994 }
995 }
996 }
997
998 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
999 sp->unsync_children = 0;
1000
1001 return 0;
1002}
1003
871static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) 1004static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
872{ 1005{
873 unsigned index; 1006 unsigned index;
@@ -888,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
888 return NULL; 1021 return NULL;
889} 1022}
890 1023
1024static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1025{
1026 WARN_ON(!sp->unsync);
1027 sp->unsync = 0;
1028 --kvm->stat.mmu_unsync;
1029}
1030
1031static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
1032
1033static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1034{
1035 if (sp->role.glevels != vcpu->arch.mmu.root_level) {
1036 kvm_mmu_zap_page(vcpu->kvm, sp);
1037 return 1;
1038 }
1039
1040 rmap_write_protect(vcpu->kvm, sp->gfn);
1041 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1042 kvm_mmu_zap_page(vcpu->kvm, sp);
1043 return 1;
1044 }
1045
1046 kvm_mmu_flush_tlb(vcpu);
1047 kvm_unlink_unsync_page(vcpu->kvm, sp);
1048 return 0;
1049}
1050
1051struct sync_walker {
1052 struct kvm_vcpu *vcpu;
1053 struct kvm_unsync_walk walker;
1054};
1055
1056static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1057{
1058 struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
1059 walker);
1060 struct kvm_vcpu *vcpu = sync_walk->vcpu;
1061
1062 kvm_sync_page(vcpu, sp);
1063 return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
1064}
1065
1066static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1067{
1068 struct sync_walker walker = {
1069 .walker = { .entry = mmu_sync_fn, },
1070 .vcpu = vcpu,
1071 };
1072
1073 while (mmu_unsync_walk(sp, &walker.walker))
1074 cond_resched_lock(&vcpu->kvm->mmu_lock);
1075}
1076
891static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1077static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
892 gfn_t gfn, 1078 gfn_t gfn,
893 gva_t gaddr, 1079 gva_t gaddr,
@@ -901,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
901 unsigned quadrant; 1087 unsigned quadrant;
902 struct hlist_head *bucket; 1088 struct hlist_head *bucket;
903 struct kvm_mmu_page *sp; 1089 struct kvm_mmu_page *sp;
904 struct hlist_node *node; 1090 struct hlist_node *node, *tmp;
905 1091
906 role.word = 0; 1092 role.word = 0;
907 role.glevels = vcpu->arch.mmu.root_level; 1093 role.glevels = vcpu->arch.mmu.root_level;
@@ -917,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
917 gfn, role.word); 1103 gfn, role.word);
918 index = kvm_page_table_hashfn(gfn); 1104 index = kvm_page_table_hashfn(gfn);
919 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1105 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
920 hlist_for_each_entry(sp, node, bucket, hash_link) 1106 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
921 if (sp->gfn == gfn && sp->role.word == role.word) { 1107 if (sp->gfn == gfn) {
1108 if (sp->unsync)
1109 if (kvm_sync_page(vcpu, sp))
1110 continue;
1111
1112 if (sp->role.word != role.word)
1113 continue;
1114
922 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1115 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1116 if (sp->unsync_children) {
1117 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1118 kvm_mmu_mark_parents_unsync(vcpu, sp);
1119 }
923 pgprintk("%s: found\n", __func__); 1120 pgprintk("%s: found\n", __func__);
924 return sp; 1121 return sp;
925 } 1122 }
@@ -931,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
931 sp->gfn = gfn; 1128 sp->gfn = gfn;
932 sp->role = role; 1129 sp->role = role;
933 hlist_add_head(&sp->hash_link, bucket); 1130 hlist_add_head(&sp->hash_link, bucket);
934 if (!metaphysical) 1131 if (!metaphysical) {
935 rmap_write_protect(vcpu->kvm, gfn); 1132 rmap_write_protect(vcpu->kvm, gfn);
1133 account_shadowed(vcpu->kvm, gfn);
1134 }
936 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1135 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
937 vcpu->arch.mmu.prefetch_page(vcpu, sp); 1136 vcpu->arch.mmu.prefetch_page(vcpu, sp);
938 else 1137 else
@@ -940,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
940 return sp; 1139 return sp;
941} 1140}
942 1141
1142static int walk_shadow(struct kvm_shadow_walk *walker,
1143 struct kvm_vcpu *vcpu, u64 addr)
1144{
1145 hpa_t shadow_addr;
1146 int level;
1147 int r;
1148 u64 *sptep;
1149 unsigned index;
1150
1151 shadow_addr = vcpu->arch.mmu.root_hpa;
1152 level = vcpu->arch.mmu.shadow_root_level;
1153 if (level == PT32E_ROOT_LEVEL) {
1154 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1155 shadow_addr &= PT64_BASE_ADDR_MASK;
1156 --level;
1157 }
1158
1159 while (level >= PT_PAGE_TABLE_LEVEL) {
1160 index = SHADOW_PT_INDEX(addr, level);
1161 sptep = ((u64 *)__va(shadow_addr)) + index;
1162 r = walker->entry(walker, vcpu, addr, sptep, level);
1163 if (r)
1164 return r;
1165 shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
1166 --level;
1167 }
1168 return 0;
1169}
1170
943static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1171static void kvm_mmu_page_unlink_children(struct kvm *kvm,
944 struct kvm_mmu_page *sp) 1172 struct kvm_mmu_page *sp)
945{ 1173{
@@ -955,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
955 rmap_remove(kvm, &pt[i]); 1183 rmap_remove(kvm, &pt[i]);
956 pt[i] = shadow_trap_nonpresent_pte; 1184 pt[i] = shadow_trap_nonpresent_pte;
957 } 1185 }
958 kvm_flush_remote_tlbs(kvm);
959 return; 1186 return;
960 } 1187 }
961 1188
@@ -974,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
974 } 1201 }
975 pt[i] = shadow_trap_nonpresent_pte; 1202 pt[i] = shadow_trap_nonpresent_pte;
976 } 1203 }
977 kvm_flush_remote_tlbs(kvm);
978} 1204}
979 1205
980static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) 1206static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -991,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
991 kvm->vcpus[i]->arch.last_pte_updated = NULL; 1217 kvm->vcpus[i]->arch.last_pte_updated = NULL;
992} 1218}
993 1219
994static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1220static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
995{ 1221{
996 u64 *parent_pte; 1222 u64 *parent_pte;
997 1223
998 ++kvm->stat.mmu_shadow_zapped;
999 while (sp->multimapped || sp->parent_pte) { 1224 while (sp->multimapped || sp->parent_pte) {
1000 if (!sp->multimapped) 1225 if (!sp->multimapped)
1001 parent_pte = sp->parent_pte; 1226 parent_pte = sp->parent_pte;
@@ -1010,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1010 kvm_mmu_put_page(sp, parent_pte); 1235 kvm_mmu_put_page(sp, parent_pte);
1011 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); 1236 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
1012 } 1237 }
1238}
1239
1240struct zap_walker {
1241 struct kvm_unsync_walk walker;
1242 struct kvm *kvm;
1243 int zapped;
1244};
1245
1246static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1247{
1248 struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
1249 walker);
1250 kvm_mmu_zap_page(zap_walk->kvm, sp);
1251 zap_walk->zapped = 1;
1252 return 0;
1253}
1254
1255static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
1256{
1257 struct zap_walker walker = {
1258 .walker = { .entry = mmu_zap_fn, },
1259 .kvm = kvm,
1260 .zapped = 0,
1261 };
1262
1263 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1264 return 0;
1265 mmu_unsync_walk(sp, &walker.walker);
1266 return walker.zapped;
1267}
1268
1269static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1270{
1271 int ret;
1272 ++kvm->stat.mmu_shadow_zapped;
1273 ret = mmu_zap_unsync_children(kvm, sp);
1013 kvm_mmu_page_unlink_children(kvm, sp); 1274 kvm_mmu_page_unlink_children(kvm, sp);
1275 kvm_mmu_unlink_parents(kvm, sp);
1276 kvm_flush_remote_tlbs(kvm);
1277 if (!sp->role.invalid && !sp->role.metaphysical)
1278 unaccount_shadowed(kvm, sp->gfn);
1279 if (sp->unsync)
1280 kvm_unlink_unsync_page(kvm, sp);
1014 if (!sp->root_count) { 1281 if (!sp->root_count) {
1015 if (!sp->role.metaphysical && !sp->role.invalid)
1016 unaccount_shadowed(kvm, sp->gfn);
1017 hlist_del(&sp->hash_link); 1282 hlist_del(&sp->hash_link);
1018 kvm_mmu_free_page(kvm, sp); 1283 kvm_mmu_free_page(kvm, sp);
1019 } else { 1284 } else {
1020 int invalid = sp->role.invalid;
1021 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1022 sp->role.invalid = 1; 1285 sp->role.invalid = 1;
1286 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1023 kvm_reload_remote_mmus(kvm); 1287 kvm_reload_remote_mmus(kvm);
1024 if (!sp->role.metaphysical && !invalid)
1025 unaccount_shadowed(kvm, sp->gfn);
1026 } 1288 }
1027 kvm_mmu_reset_last_pte_updated(kvm); 1289 kvm_mmu_reset_last_pte_updated(kvm);
1290 return ret;
1028} 1291}
1029 1292
1030/* 1293/*
@@ -1077,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1077 if (sp->gfn == gfn && !sp->role.metaphysical) { 1340 if (sp->gfn == gfn && !sp->role.metaphysical) {
1078 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1341 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1079 sp->role.word); 1342 sp->role.word);
1080 kvm_mmu_zap_page(kvm, sp);
1081 r = 1; 1343 r = 1;
1344 if (kvm_mmu_zap_page(kvm, sp))
1345 n = bucket->first;
1082 } 1346 }
1083 return r; 1347 return r;
1084} 1348}
@@ -1101,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1101 __set_bit(slot, &sp->slot_bitmap); 1365 __set_bit(slot, &sp->slot_bitmap);
1102} 1366}
1103 1367
1368static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1369{
1370 int i;
1371 u64 *pt = sp->spt;
1372
1373 if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1374 return;
1375
1376 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1377 if (pt[i] == shadow_notrap_nonpresent_pte)
1378 set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
1379 }
1380}
1381
1104struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) 1382struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1105{ 1383{
1106 struct page *page; 1384 struct page *page;
@@ -1110,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1110 if (gpa == UNMAPPED_GVA) 1388 if (gpa == UNMAPPED_GVA)
1111 return NULL; 1389 return NULL;
1112 1390
1113 down_read(&current->mm->mmap_sem);
1114 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 1391 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1115 up_read(&current->mm->mmap_sem);
1116 1392
1117 return page; 1393 return page;
1118} 1394}
1119 1395
1120static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1396static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1121 unsigned pt_access, unsigned pte_access,
1122 int user_fault, int write_fault, int dirty,
1123 int *ptwrite, int largepage, gfn_t gfn,
1124 pfn_t pfn, bool speculative)
1125{ 1397{
1126 u64 spte; 1398 unsigned index;
1127 int was_rmapped = 0; 1399 struct hlist_head *bucket;
1128 int was_writeble = is_writeble_pte(*shadow_pte); 1400 struct kvm_mmu_page *s;
1401 struct hlist_node *node, *n;
1129 1402
1130 pgprintk("%s: spte %llx access %x write_fault %d" 1403 index = kvm_page_table_hashfn(sp->gfn);
1131 " user_fault %d gfn %lx\n", 1404 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1132 __func__, *shadow_pte, pt_access, 1405 /* don't unsync if pagetable is shadowed with multiple roles */
1133 write_fault, user_fault, gfn); 1406 hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
1407 if (s->gfn != sp->gfn || s->role.metaphysical)
1408 continue;
1409 if (s->role.word != sp->role.word)
1410 return 1;
1411 }
1412 kvm_mmu_mark_parents_unsync(vcpu, sp);
1413 ++vcpu->kvm->stat.mmu_unsync;
1414 sp->unsync = 1;
1415 mmu_convert_notrap(sp);
1416 return 0;
1417}
1134 1418
1135 if (is_rmap_pte(*shadow_pte)) { 1419static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1136 /* 1420 bool can_unsync)
1137 * If we overwrite a PTE page pointer with a 2MB PMD, unlink 1421{
1138 * the parent of the now unreachable PTE. 1422 struct kvm_mmu_page *shadow;
1139 */
1140 if (largepage && !is_large_pte(*shadow_pte)) {
1141 struct kvm_mmu_page *child;
1142 u64 pte = *shadow_pte;
1143 1423
1144 child = page_header(pte & PT64_BASE_ADDR_MASK); 1424 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1145 mmu_page_remove_parent_pte(child, shadow_pte); 1425 if (shadow) {
1146 } else if (pfn != spte_to_pfn(*shadow_pte)) { 1426 if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
1147 pgprintk("hfn old %lx new %lx\n", 1427 return 1;
1148 spte_to_pfn(*shadow_pte), pfn); 1428 if (shadow->unsync)
1149 rmap_remove(vcpu->kvm, shadow_pte); 1429 return 0;
1150 } else { 1430 if (can_unsync && oos_shadow)
1151 if (largepage) 1431 return kvm_unsync_page(vcpu, shadow);
1152 was_rmapped = is_large_pte(*shadow_pte); 1432 return 1;
1153 else
1154 was_rmapped = 1;
1155 }
1156 } 1433 }
1434 return 0;
1435}
1157 1436
1437static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1438 unsigned pte_access, int user_fault,
1439 int write_fault, int dirty, int largepage,
1440 gfn_t gfn, pfn_t pfn, bool speculative,
1441 bool can_unsync)
1442{
1443 u64 spte;
1444 int ret = 0;
1158 /* 1445 /*
1159 * We don't set the accessed bit, since we sometimes want to see 1446 * We don't set the accessed bit, since we sometimes want to see
1160 * whether the guest actually used the pte (in order to detect 1447 * whether the guest actually used the pte (in order to detect
@@ -1162,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1162 */ 1449 */
1163 spte = shadow_base_present_pte | shadow_dirty_mask; 1450 spte = shadow_base_present_pte | shadow_dirty_mask;
1164 if (!speculative) 1451 if (!speculative)
1165 pte_access |= PT_ACCESSED_MASK; 1452 spte |= shadow_accessed_mask;
1166 if (!dirty) 1453 if (!dirty)
1167 pte_access &= ~ACC_WRITE_MASK; 1454 pte_access &= ~ACC_WRITE_MASK;
1168 if (pte_access & ACC_EXEC_MASK) 1455 if (pte_access & ACC_EXEC_MASK)
@@ -1178,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1178 1465
1179 if ((pte_access & ACC_WRITE_MASK) 1466 if ((pte_access & ACC_WRITE_MASK)
1180 || (write_fault && !is_write_protection(vcpu) && !user_fault)) { 1467 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
1181 struct kvm_mmu_page *shadow; 1468
1469 if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
1470 ret = 1;
1471 spte = shadow_trap_nonpresent_pte;
1472 goto set_pte;
1473 }
1182 1474
1183 spte |= PT_WRITABLE_MASK; 1475 spte |= PT_WRITABLE_MASK;
1184 1476
1185 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1477 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1186 if (shadow ||
1187 (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
1188 pgprintk("%s: found shadow page for %lx, marking ro\n", 1478 pgprintk("%s: found shadow page for %lx, marking ro\n",
1189 __func__, gfn); 1479 __func__, gfn);
1480 ret = 1;
1190 pte_access &= ~ACC_WRITE_MASK; 1481 pte_access &= ~ACC_WRITE_MASK;
1191 if (is_writeble_pte(spte)) { 1482 if (is_writeble_pte(spte))
1192 spte &= ~PT_WRITABLE_MASK; 1483 spte &= ~PT_WRITABLE_MASK;
1193 kvm_x86_ops->tlb_flush(vcpu);
1194 }
1195 if (write_fault)
1196 *ptwrite = 1;
1197 } 1484 }
1198 } 1485 }
1199 1486
1200 if (pte_access & ACC_WRITE_MASK) 1487 if (pte_access & ACC_WRITE_MASK)
1201 mark_page_dirty(vcpu->kvm, gfn); 1488 mark_page_dirty(vcpu->kvm, gfn);
1202 1489
1203 pgprintk("%s: setting spte %llx\n", __func__, spte); 1490set_pte:
1204 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1205 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
1206 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
1207 set_shadow_pte(shadow_pte, spte); 1491 set_shadow_pte(shadow_pte, spte);
1208 if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) 1492 return ret;
1209 && (spte & PT_PRESENT_MASK)) 1493}
1494
1495static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1496 unsigned pt_access, unsigned pte_access,
1497 int user_fault, int write_fault, int dirty,
1498 int *ptwrite, int largepage, gfn_t gfn,
1499 pfn_t pfn, bool speculative)
1500{
1501 int was_rmapped = 0;
1502 int was_writeble = is_writeble_pte(*shadow_pte);
1503
1504 pgprintk("%s: spte %llx access %x write_fault %d"
1505 " user_fault %d gfn %lx\n",
1506 __func__, *shadow_pte, pt_access,
1507 write_fault, user_fault, gfn);
1508
1509 if (is_rmap_pte(*shadow_pte)) {
1510 /*
1511 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1512 * the parent of the now unreachable PTE.
1513 */
1514 if (largepage && !is_large_pte(*shadow_pte)) {
1515 struct kvm_mmu_page *child;
1516 u64 pte = *shadow_pte;
1517
1518 child = page_header(pte & PT64_BASE_ADDR_MASK);
1519 mmu_page_remove_parent_pte(child, shadow_pte);
1520 } else if (pfn != spte_to_pfn(*shadow_pte)) {
1521 pgprintk("hfn old %lx new %lx\n",
1522 spte_to_pfn(*shadow_pte), pfn);
1523 rmap_remove(vcpu->kvm, shadow_pte);
1524 } else {
1525 if (largepage)
1526 was_rmapped = is_large_pte(*shadow_pte);
1527 else
1528 was_rmapped = 1;
1529 }
1530 }
1531 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1532 dirty, largepage, gfn, pfn, speculative, true)) {
1533 if (write_fault)
1534 *ptwrite = 1;
1535 kvm_x86_ops->tlb_flush(vcpu);
1536 }
1537
1538 pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
1539 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1540 is_large_pte(*shadow_pte)? "2MB" : "4kB",
1541 is_present_pte(*shadow_pte)?"RW":"R", gfn,
1542 *shadow_pte, shadow_pte);
1543 if (!was_rmapped && is_large_pte(*shadow_pte))
1210 ++vcpu->kvm->stat.lpages; 1544 ++vcpu->kvm->stat.lpages;
1211 1545
1212 page_header_update_slot(vcpu->kvm, shadow_pte, gfn); 1546 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
@@ -1230,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1230{ 1564{
1231} 1565}
1232 1566
1233static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 1567struct direct_shadow_walk {
1234 int largepage, gfn_t gfn, pfn_t pfn, 1568 struct kvm_shadow_walk walker;
1235 int level) 1569 pfn_t pfn;
1236{ 1570 int write;
1237 hpa_t table_addr = vcpu->arch.mmu.root_hpa; 1571 int largepage;
1238 int pt_write = 0; 1572 int pt_write;
1239 1573};
1240 for (; ; level--) {
1241 u32 index = PT64_INDEX(v, level);
1242 u64 *table;
1243
1244 ASSERT(VALID_PAGE(table_addr));
1245 table = __va(table_addr);
1246 1574
1247 if (level == 1) { 1575static int direct_map_entry(struct kvm_shadow_walk *_walk,
1248 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, 1576 struct kvm_vcpu *vcpu,
1249 0, write, 1, &pt_write, 0, gfn, pfn, false); 1577 u64 addr, u64 *sptep, int level)
1250 return pt_write; 1578{
1251 } 1579 struct direct_shadow_walk *walk =
1580 container_of(_walk, struct direct_shadow_walk, walker);
1581 struct kvm_mmu_page *sp;
1582 gfn_t pseudo_gfn;
1583 gfn_t gfn = addr >> PAGE_SHIFT;
1584
1585 if (level == PT_PAGE_TABLE_LEVEL
1586 || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
1587 mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
1588 0, walk->write, 1, &walk->pt_write,
1589 walk->largepage, gfn, walk->pfn, false);
1590 ++vcpu->stat.pf_fixed;
1591 return 1;
1592 }
1252 1593
1253 if (largepage && level == 2) { 1594 if (*sptep == shadow_trap_nonpresent_pte) {
1254 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, 1595 pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
1255 0, write, 1, &pt_write, 1, gfn, pfn, false); 1596 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
1256 return pt_write; 1597 1, ACC_ALL, sptep);
1598 if (!sp) {
1599 pgprintk("nonpaging_map: ENOMEM\n");
1600 kvm_release_pfn_clean(walk->pfn);
1601 return -ENOMEM;
1257 } 1602 }
1258 1603
1259 if (table[index] == shadow_trap_nonpresent_pte) { 1604 set_shadow_pte(sptep,
1260 struct kvm_mmu_page *new_table; 1605 __pa(sp->spt)
1261 gfn_t pseudo_gfn; 1606 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1262 1607 | shadow_user_mask | shadow_x_mask);
1263 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
1264 >> PAGE_SHIFT;
1265 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
1266 v, level - 1,
1267 1, ACC_ALL, &table[index]);
1268 if (!new_table) {
1269 pgprintk("nonpaging_map: ENOMEM\n");
1270 kvm_release_pfn_clean(pfn);
1271 return -ENOMEM;
1272 }
1273
1274 set_shadow_pte(&table[index],
1275 __pa(new_table->spt)
1276 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1277 | shadow_user_mask | shadow_x_mask);
1278 }
1279 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1280 } 1608 }
1609 return 0;
1610}
1611
1612static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1613 int largepage, gfn_t gfn, pfn_t pfn)
1614{
1615 int r;
1616 struct direct_shadow_walk walker = {
1617 .walker = { .entry = direct_map_entry, },
1618 .pfn = pfn,
1619 .largepage = largepage,
1620 .write = write,
1621 .pt_write = 0,
1622 };
1623
1624 r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
1625 if (r < 0)
1626 return r;
1627 return walker.pt_write;
1281} 1628}
1282 1629
1283static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 1630static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
@@ -1287,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1287 pfn_t pfn; 1634 pfn_t pfn;
1288 unsigned long mmu_seq; 1635 unsigned long mmu_seq;
1289 1636
1290 down_read(&current->mm->mmap_sem);
1291 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1637 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1292 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1638 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1293 largepage = 1; 1639 largepage = 1;
1294 } 1640 }
1295 1641
1296 mmu_seq = vcpu->kvm->mmu_notifier_seq; 1642 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1297 /* implicit mb(), we'll read before PT lock is unlocked */ 1643 smp_rmb();
1298 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1644 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1299 up_read(&current->mm->mmap_sem);
1300 1645
1301 /* mmio */ 1646 /* mmio */
1302 if (is_error_pfn(pfn)) { 1647 if (is_error_pfn(pfn)) {
@@ -1308,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1308 if (mmu_notifier_retry(vcpu, mmu_seq)) 1653 if (mmu_notifier_retry(vcpu, mmu_seq))
1309 goto out_unlock; 1654 goto out_unlock;
1310 kvm_mmu_free_some_pages(vcpu); 1655 kvm_mmu_free_some_pages(vcpu);
1311 r = __direct_map(vcpu, v, write, largepage, gfn, pfn, 1656 r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
1312 PT32E_ROOT_LEVEL);
1313 spin_unlock(&vcpu->kvm->mmu_lock); 1657 spin_unlock(&vcpu->kvm->mmu_lock);
1314 1658
1315 1659
@@ -1405,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1405 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 1749 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1406} 1750}
1407 1751
1752static void mmu_sync_roots(struct kvm_vcpu *vcpu)
1753{
1754 int i;
1755 struct kvm_mmu_page *sp;
1756
1757 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1758 return;
1759 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1760 hpa_t root = vcpu->arch.mmu.root_hpa;
1761 sp = page_header(root);
1762 mmu_sync_children(vcpu, sp);
1763 return;
1764 }
1765 for (i = 0; i < 4; ++i) {
1766 hpa_t root = vcpu->arch.mmu.pae_root[i];
1767
1768 if (root) {
1769 root &= PT64_BASE_ADDR_MASK;
1770 sp = page_header(root);
1771 mmu_sync_children(vcpu, sp);
1772 }
1773 }
1774}
1775
1776void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
1777{
1778 spin_lock(&vcpu->kvm->mmu_lock);
1779 mmu_sync_roots(vcpu);
1780 spin_unlock(&vcpu->kvm->mmu_lock);
1781}
1782
1408static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 1783static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1409{ 1784{
1410 return vaddr; 1785 return vaddr;
@@ -1446,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1446 if (r) 1821 if (r)
1447 return r; 1822 return r;
1448 1823
1449 down_read(&current->mm->mmap_sem);
1450 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1824 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1451 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1825 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1452 largepage = 1; 1826 largepage = 1;
1453 } 1827 }
1454 mmu_seq = vcpu->kvm->mmu_notifier_seq; 1828 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1455 /* implicit mb(), we'll read before PT lock is unlocked */ 1829 smp_rmb();
1456 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1830 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1457 up_read(&current->mm->mmap_sem);
1458 if (is_error_pfn(pfn)) { 1831 if (is_error_pfn(pfn)) {
1459 kvm_release_pfn_clean(pfn); 1832 kvm_release_pfn_clean(pfn);
1460 return 1; 1833 return 1;
@@ -1464,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1464 goto out_unlock; 1837 goto out_unlock;
1465 kvm_mmu_free_some_pages(vcpu); 1838 kvm_mmu_free_some_pages(vcpu);
1466 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 1839 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1467 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); 1840 largepage, gfn, pfn);
1468 spin_unlock(&vcpu->kvm->mmu_lock); 1841 spin_unlock(&vcpu->kvm->mmu_lock);
1469 1842
1470 return r; 1843 return r;
@@ -1489,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1489 context->gva_to_gpa = nonpaging_gva_to_gpa; 1862 context->gva_to_gpa = nonpaging_gva_to_gpa;
1490 context->free = nonpaging_free; 1863 context->free = nonpaging_free;
1491 context->prefetch_page = nonpaging_prefetch_page; 1864 context->prefetch_page = nonpaging_prefetch_page;
1865 context->sync_page = nonpaging_sync_page;
1866 context->invlpg = nonpaging_invlpg;
1492 context->root_level = 0; 1867 context->root_level = 0;
1493 context->shadow_root_level = PT32E_ROOT_LEVEL; 1868 context->shadow_root_level = PT32E_ROOT_LEVEL;
1494 context->root_hpa = INVALID_PAGE; 1869 context->root_hpa = INVALID_PAGE;
@@ -1536,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1536 context->page_fault = paging64_page_fault; 1911 context->page_fault = paging64_page_fault;
1537 context->gva_to_gpa = paging64_gva_to_gpa; 1912 context->gva_to_gpa = paging64_gva_to_gpa;
1538 context->prefetch_page = paging64_prefetch_page; 1913 context->prefetch_page = paging64_prefetch_page;
1914 context->sync_page = paging64_sync_page;
1915 context->invlpg = paging64_invlpg;
1539 context->free = paging_free; 1916 context->free = paging_free;
1540 context->root_level = level; 1917 context->root_level = level;
1541 context->shadow_root_level = level; 1918 context->shadow_root_level = level;
@@ -1557,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
1557 context->gva_to_gpa = paging32_gva_to_gpa; 1934 context->gva_to_gpa = paging32_gva_to_gpa;
1558 context->free = paging_free; 1935 context->free = paging_free;
1559 context->prefetch_page = paging32_prefetch_page; 1936 context->prefetch_page = paging32_prefetch_page;
1937 context->sync_page = paging32_sync_page;
1938 context->invlpg = paging32_invlpg;
1560 context->root_level = PT32_ROOT_LEVEL; 1939 context->root_level = PT32_ROOT_LEVEL;
1561 context->shadow_root_level = PT32E_ROOT_LEVEL; 1940 context->shadow_root_level = PT32E_ROOT_LEVEL;
1562 context->root_hpa = INVALID_PAGE; 1941 context->root_hpa = INVALID_PAGE;
@@ -1576,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
1576 context->page_fault = tdp_page_fault; 1955 context->page_fault = tdp_page_fault;
1577 context->free = nonpaging_free; 1956 context->free = nonpaging_free;
1578 context->prefetch_page = nonpaging_prefetch_page; 1957 context->prefetch_page = nonpaging_prefetch_page;
1958 context->sync_page = nonpaging_sync_page;
1959 context->invlpg = nonpaging_invlpg;
1579 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 1960 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
1580 context->root_hpa = INVALID_PAGE; 1961 context->root_hpa = INVALID_PAGE;
1581 1962
@@ -1647,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
1647 spin_lock(&vcpu->kvm->mmu_lock); 2028 spin_lock(&vcpu->kvm->mmu_lock);
1648 kvm_mmu_free_some_pages(vcpu); 2029 kvm_mmu_free_some_pages(vcpu);
1649 mmu_alloc_roots(vcpu); 2030 mmu_alloc_roots(vcpu);
2031 mmu_sync_roots(vcpu);
1650 spin_unlock(&vcpu->kvm->mmu_lock); 2032 spin_unlock(&vcpu->kvm->mmu_lock);
1651 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2033 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1652 kvm_mmu_flush_tlb(vcpu); 2034 kvm_mmu_flush_tlb(vcpu);
@@ -1767,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1767 return; 2149 return;
1768 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2150 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1769 2151
1770 down_read(&current->mm->mmap_sem);
1771 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { 2152 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
1772 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 2153 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1773 vcpu->arch.update_pte.largepage = 1; 2154 vcpu->arch.update_pte.largepage = 1;
1774 } 2155 }
1775 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; 2156 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
1776 /* implicit mb(), we'll read before PT lock is unlocked */ 2157 smp_rmb();
1777 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2158 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1778 up_read(&current->mm->mmap_sem);
1779 2159
1780 if (is_error_pfn(pfn)) { 2160 if (is_error_pfn(pfn)) {
1781 kvm_release_pfn_clean(pfn); 2161 kvm_release_pfn_clean(pfn);
@@ -1837,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1837 index = kvm_page_table_hashfn(gfn); 2217 index = kvm_page_table_hashfn(gfn);
1838 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2218 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1839 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2219 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1840 if (sp->gfn != gfn || sp->role.metaphysical) 2220 if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
1841 continue; 2221 continue;
1842 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; 2222 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1843 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2223 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -1855,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1855 */ 2235 */
1856 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2236 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1857 gpa, bytes, sp->role.word); 2237 gpa, bytes, sp->role.word);
1858 kvm_mmu_zap_page(vcpu->kvm, sp); 2238 if (kvm_mmu_zap_page(vcpu->kvm, sp))
2239 n = bucket->first;
1859 ++vcpu->kvm->stat.mmu_flooded; 2240 ++vcpu->kvm->stat.mmu_flooded;
1860 continue; 2241 continue;
1861 } 2242 }
@@ -1969,6 +2350,16 @@ out:
1969} 2350}
1970EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); 2351EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1971 2352
2353void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2354{
2355 spin_lock(&vcpu->kvm->mmu_lock);
2356 vcpu->arch.mmu.invlpg(vcpu, gva);
2357 spin_unlock(&vcpu->kvm->mmu_lock);
2358 kvm_mmu_flush_tlb(vcpu);
2359 ++vcpu->stat.invlpg;
2360}
2361EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
2362
1972void kvm_enable_tdp(void) 2363void kvm_enable_tdp(void)
1973{ 2364{
1974 tdp_enabled = true; 2365 tdp_enabled = true;
@@ -2055,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2055{ 2446{
2056 struct kvm_mmu_page *sp; 2447 struct kvm_mmu_page *sp;
2057 2448
2449 spin_lock(&kvm->mmu_lock);
2058 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 2450 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
2059 int i; 2451 int i;
2060 u64 *pt; 2452 u64 *pt;
@@ -2068,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2068 if (pt[i] & PT_WRITABLE_MASK) 2460 if (pt[i] & PT_WRITABLE_MASK)
2069 pt[i] &= ~PT_WRITABLE_MASK; 2461 pt[i] &= ~PT_WRITABLE_MASK;
2070 } 2462 }
2463 kvm_flush_remote_tlbs(kvm);
2464 spin_unlock(&kvm->mmu_lock);
2071} 2465}
2072 2466
2073void kvm_mmu_zap_all(struct kvm *kvm) 2467void kvm_mmu_zap_all(struct kvm *kvm)
@@ -2076,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
2076 2470
2077 spin_lock(&kvm->mmu_lock); 2471 spin_lock(&kvm->mmu_lock);
2078 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 2472 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2079 kvm_mmu_zap_page(kvm, sp); 2473 if (kvm_mmu_zap_page(kvm, sp))
2474 node = container_of(kvm->arch.active_mmu_pages.next,
2475 struct kvm_mmu_page, link);
2080 spin_unlock(&kvm->mmu_lock); 2476 spin_unlock(&kvm->mmu_lock);
2081 2477
2082 kvm_flush_remote_tlbs(kvm); 2478 kvm_flush_remote_tlbs(kvm);
@@ -2291,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2291 gpa_t addr, unsigned long *ret) 2687 gpa_t addr, unsigned long *ret)
2292{ 2688{
2293 int r; 2689 int r;
2294 struct kvm_pv_mmu_op_buffer buffer; 2690 struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
2295 2691
2296 buffer.ptr = buffer.buf; 2692 buffer->ptr = buffer->buf;
2297 buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); 2693 buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
2298 buffer.processed = 0; 2694 buffer->processed = 0;
2299 2695
2300 r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); 2696 r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
2301 if (r) 2697 if (r)
2302 goto out; 2698 goto out;
2303 2699
2304 while (buffer.len) { 2700 while (buffer->len) {
2305 r = kvm_pv_mmu_op_one(vcpu, &buffer); 2701 r = kvm_pv_mmu_op_one(vcpu, buffer);
2306 if (r < 0) 2702 if (r < 0)
2307 goto out; 2703 goto out;
2308 if (r == 0) 2704 if (r == 0)
@@ -2311,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2311 2707
2312 r = 1; 2708 r = 1;
2313out: 2709out:
2314 *ret = buffer.processed; 2710 *ret = buffer->processed;
2315 return r; 2711 return r;
2316} 2712}
2317 2713
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4a814bff21f2..613ec9aa674a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -25,11 +25,11 @@
25#if PTTYPE == 64 25#if PTTYPE == 64
26 #define pt_element_t u64 26 #define pt_element_t u64
27 #define guest_walker guest_walker64 27 #define guest_walker guest_walker64
28 #define shadow_walker shadow_walker64
28 #define FNAME(name) paging##64_##name 29 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK 30 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK 31 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 32 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) 33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS 34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
35 #ifdef CONFIG_X86_64 35 #ifdef CONFIG_X86_64
@@ -42,11 +42,11 @@
42#elif PTTYPE == 32 42#elif PTTYPE == 32
43 #define pt_element_t u32 43 #define pt_element_t u32
44 #define guest_walker guest_walker32 44 #define guest_walker guest_walker32
45 #define shadow_walker shadow_walker32
45 #define FNAME(name) paging##32_##name 46 #define FNAME(name) paging##32_##name
46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK 47 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
47 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK 48 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
48 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
49 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) 50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
51 #define PT_LEVEL_BITS PT32_LEVEL_BITS 51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
52 #define PT_MAX_FULL_LEVELS 2 52 #define PT_MAX_FULL_LEVELS 2
@@ -73,6 +73,17 @@ struct guest_walker {
73 u32 error_code; 73 u32 error_code;
74}; 74};
75 75
76struct shadow_walker {
77 struct kvm_shadow_walk walker;
78 struct guest_walker *guest_walker;
79 int user_fault;
80 int write_fault;
81 int largepage;
82 int *ptwrite;
83 pfn_t pfn;
84 u64 *sptep;
85};
86
76static gfn_t gpte_to_gfn(pt_element_t gpte) 87static gfn_t gpte_to_gfn(pt_element_t gpte)
77{ 88{
78 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; 89 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
91 pt_element_t *table; 102 pt_element_t *table;
92 struct page *page; 103 struct page *page;
93 104
94 down_read(&current->mm->mmap_sem);
95 page = gfn_to_page(kvm, table_gfn); 105 page = gfn_to_page(kvm, table_gfn);
96 up_read(&current->mm->mmap_sem);
97 106
98 table = kmap_atomic(page, KM_USER0); 107 table = kmap_atomic(page, KM_USER0);
99
100 ret = CMPXCHG(&table[index], orig_pte, new_pte); 108 ret = CMPXCHG(&table[index], orig_pte, new_pte);
101
102 kunmap_atomic(table, KM_USER0); 109 kunmap_atomic(table, KM_USER0);
103 110
104 kvm_release_page_dirty(page); 111 kvm_release_page_dirty(page);
@@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
274/* 281/*
275 * Fetch a shadow pte for a specific level in the paging hierarchy. 282 * Fetch a shadow pte for a specific level in the paging hierarchy.
276 */ 283 */
277static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 284static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
278 struct guest_walker *walker, 285 struct kvm_vcpu *vcpu, u64 addr,
279 int user_fault, int write_fault, int largepage, 286 u64 *sptep, int level)
280 int *ptwrite, pfn_t pfn)
281{ 287{
282 hpa_t shadow_addr; 288 struct shadow_walker *sw =
283 int level; 289 container_of(_sw, struct shadow_walker, walker);
284 u64 *shadow_ent; 290 struct guest_walker *gw = sw->guest_walker;
285 unsigned access = walker->pt_access; 291 unsigned access = gw->pt_access;
286 292 struct kvm_mmu_page *shadow_page;
287 if (!is_present_pte(walker->ptes[walker->level - 1])) 293 u64 spte;
288 return NULL; 294 int metaphysical;
289 295 gfn_t table_gfn;
290 shadow_addr = vcpu->arch.mmu.root_hpa; 296 int r;
291 level = vcpu->arch.mmu.shadow_root_level; 297 pt_element_t curr_pte;
292 if (level == PT32E_ROOT_LEVEL) { 298
293 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 299 if (level == PT_PAGE_TABLE_LEVEL
294 shadow_addr &= PT64_BASE_ADDR_MASK; 300 || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
295 --level; 301 mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
302 sw->user_fault, sw->write_fault,
303 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
304 sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
305 false);
306 sw->sptep = sptep;
307 return 1;
296 } 308 }
297 309
298 for (; ; level--) { 310 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
299 u32 index = SHADOW_PT_INDEX(addr, level); 311 return 0;
300 struct kvm_mmu_page *shadow_page;
301 u64 shadow_pte;
302 int metaphysical;
303 gfn_t table_gfn;
304
305 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
306 if (level == PT_PAGE_TABLE_LEVEL)
307 break;
308
309 if (largepage && level == PT_DIRECTORY_LEVEL)
310 break;
311 312
312 if (is_shadow_present_pte(*shadow_ent) 313 if (is_large_pte(*sptep)) {
313 && !is_large_pte(*shadow_ent)) { 314 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
314 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; 315 kvm_flush_remote_tlbs(vcpu->kvm);
315 continue; 316 rmap_remove(vcpu->kvm, sptep);
316 } 317 }
317 318
318 if (is_large_pte(*shadow_ent)) 319 if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
319 rmap_remove(vcpu->kvm, shadow_ent); 320 metaphysical = 1;
320 321 if (!is_dirty_pte(gw->ptes[level - 1]))
321 if (level - 1 == PT_PAGE_TABLE_LEVEL 322 access &= ~ACC_WRITE_MASK;
322 && walker->level == PT_DIRECTORY_LEVEL) { 323 table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
323 metaphysical = 1; 324 } else {
324 if (!is_dirty_pte(walker->ptes[level - 1])) 325 metaphysical = 0;
325 access &= ~ACC_WRITE_MASK; 326 table_gfn = gw->table_gfn[level - 2];
326 table_gfn = gpte_to_gfn(walker->ptes[level - 1]); 327 }
327 } else { 328 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
328 metaphysical = 0; 329 metaphysical, access, sptep);
329 table_gfn = walker->table_gfn[level - 2]; 330 if (!metaphysical) {
330 } 331 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
331 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, 332 &curr_pte, sizeof(curr_pte));
332 metaphysical, access, 333 if (r || curr_pte != gw->ptes[level - 2]) {
333 shadow_ent); 334 kvm_release_pfn_clean(sw->pfn);
334 if (!metaphysical) { 335 sw->sptep = NULL;
335 int r; 336 return 1;
336 pt_element_t curr_pte;
337 r = kvm_read_guest_atomic(vcpu->kvm,
338 walker->pte_gpa[level - 2],
339 &curr_pte, sizeof(curr_pte));
340 if (r || curr_pte != walker->ptes[level - 2]) {
341 kvm_release_pfn_clean(pfn);
342 return NULL;
343 }
344 } 337 }
345 shadow_addr = __pa(shadow_page->spt);
346 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
347 | PT_WRITABLE_MASK | PT_USER_MASK;
348 set_shadow_pte(shadow_ent, shadow_pte);
349 } 338 }
350 339
351 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, 340 spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
352 user_fault, write_fault, 341 | PT_WRITABLE_MASK | PT_USER_MASK;
353 walker->ptes[walker->level-1] & PT_DIRTY_MASK, 342 *sptep = spte;
354 ptwrite, largepage, walker->gfn, pfn, false); 343 return 0;
344}
345
346static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
347 struct guest_walker *guest_walker,
348 int user_fault, int write_fault, int largepage,
349 int *ptwrite, pfn_t pfn)
350{
351 struct shadow_walker walker = {
352 .walker = { .entry = FNAME(shadow_walk_entry), },
353 .guest_walker = guest_walker,
354 .user_fault = user_fault,
355 .write_fault = write_fault,
356 .largepage = largepage,
357 .ptwrite = ptwrite,
358 .pfn = pfn,
359 };
360
361 if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
362 return NULL;
363
364 walk_shadow(&walker.walker, vcpu, addr);
355 365
356 return shadow_ent; 366 return walker.sptep;
357} 367}
358 368
359/* 369/*
@@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
407 return 0; 417 return 0;
408 } 418 }
409 419
410 down_read(&current->mm->mmap_sem);
411 if (walker.level == PT_DIRECTORY_LEVEL) { 420 if (walker.level == PT_DIRECTORY_LEVEL) {
412 gfn_t large_gfn; 421 gfn_t large_gfn;
413 large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); 422 large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
@@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
417 } 426 }
418 } 427 }
419 mmu_seq = vcpu->kvm->mmu_notifier_seq; 428 mmu_seq = vcpu->kvm->mmu_notifier_seq;
420 /* implicit mb(), we'll read before PT lock is unlocked */ 429 smp_rmb();
421 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 430 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
422 up_read(&current->mm->mmap_sem);
423 431
424 /* mmio */ 432 /* mmio */
425 if (is_error_pfn(pfn)) { 433 if (is_error_pfn(pfn)) {
@@ -453,6 +461,31 @@ out_unlock:
453 return 0; 461 return 0;
454} 462}
455 463
464static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
465 struct kvm_vcpu *vcpu, u64 addr,
466 u64 *sptep, int level)
467{
468
469 if (level == PT_PAGE_TABLE_LEVEL) {
470 if (is_shadow_present_pte(*sptep))
471 rmap_remove(vcpu->kvm, sptep);
472 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
473 return 1;
474 }
475 if (!is_shadow_present_pte(*sptep))
476 return 1;
477 return 0;
478}
479
480static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
481{
482 struct shadow_walker walker = {
483 .walker = { .entry = FNAME(shadow_invlpg_entry), },
484 };
485
486 walk_shadow(&walker.walker, vcpu, gva);
487}
488
456static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 489static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
457{ 490{
458 struct guest_walker walker; 491 struct guest_walker walker;
@@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
499 } 532 }
500} 533}
501 534
535/*
536 * Using the cached information from sp->gfns is safe because:
537 * - The spte has a reference to the struct page, so the pfn for a given gfn
538 * can't change unless all sptes pointing to it are nuked first.
539 * - Alias changes zap the entire shadow cache.
540 */
541static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
542{
543 int i, offset, nr_present;
544
545 offset = nr_present = 0;
546
547 if (PTTYPE == 32)
548 offset = sp->role.quadrant << PT64_LEVEL_BITS;
549
550 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
551 unsigned pte_access;
552 pt_element_t gpte;
553 gpa_t pte_gpa;
554 gfn_t gfn = sp->gfns[i];
555
556 if (!is_shadow_present_pte(sp->spt[i]))
557 continue;
558
559 pte_gpa = gfn_to_gpa(sp->gfn);
560 pte_gpa += (i+offset) * sizeof(pt_element_t);
561
562 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
563 sizeof(pt_element_t)))
564 return -EINVAL;
565
566 if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
567 !(gpte & PT_ACCESSED_MASK)) {
568 u64 nonpresent;
569
570 rmap_remove(vcpu->kvm, &sp->spt[i]);
571 if (is_present_pte(gpte))
572 nonpresent = shadow_trap_nonpresent_pte;
573 else
574 nonpresent = shadow_notrap_nonpresent_pte;
575 set_shadow_pte(&sp->spt[i], nonpresent);
576 continue;
577 }
578
579 nr_present++;
580 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
581 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
582 is_dirty_pte(gpte), 0, gfn,
583 spte_to_pfn(sp->spt[i]), true, false);
584 }
585
586 return !nr_present;
587}
588
502#undef pt_element_t 589#undef pt_element_t
503#undef guest_walker 590#undef guest_walker
591#undef shadow_walker
504#undef FNAME 592#undef FNAME
505#undef PT_BASE_ADDR_MASK 593#undef PT_BASE_ADDR_MASK
506#undef PT_INDEX 594#undef PT_INDEX
507#undef SHADOW_PT_INDEX
508#undef PT_LEVEL_MASK 595#undef PT_LEVEL_MASK
509#undef PT_DIR_BASE_ADDR_MASK 596#undef PT_DIR_BASE_ADDR_MASK
510#undef PT_LEVEL_BITS 597#undef PT_LEVEL_BITS
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8233b86c778c..9c4ce657d963 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -18,6 +18,7 @@
18#include "kvm_svm.h" 18#include "kvm_svm.h"
19#include "irq.h" 19#include "irq.h"
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h"
21 22
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/kernel.h> 24#include <linux/kernel.h>
@@ -35,10 +36,6 @@ MODULE_LICENSE("GPL");
35#define IOPM_ALLOC_ORDER 2 36#define IOPM_ALLOC_ORDER 2
36#define MSRPM_ALLOC_ORDER 1 37#define MSRPM_ALLOC_ORDER 1
37 38
38#define DB_VECTOR 1
39#define UD_VECTOR 6
40#define GP_VECTOR 13
41
42#define DR7_GD_MASK (1 << 13) 39#define DR7_GD_MASK (1 << 13)
43#define DR6_BD_MASK (1 << 13) 40#define DR6_BD_MASK (1 << 13)
44 41
@@ -47,7 +44,7 @@ MODULE_LICENSE("GPL");
47 44
48#define SVM_FEATURE_NPT (1 << 0) 45#define SVM_FEATURE_NPT (1 << 0)
49#define SVM_FEATURE_LBRV (1 << 1) 46#define SVM_FEATURE_LBRV (1 << 1)
50#define SVM_DEATURE_SVML (1 << 2) 47#define SVM_FEATURE_SVML (1 << 2)
51 48
52#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 49#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
53 50
@@ -236,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
236 printk(KERN_DEBUG "%s: NOP\n", __func__); 233 printk(KERN_DEBUG "%s: NOP\n", __func__);
237 return; 234 return;
238 } 235 }
239 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) 236 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
240 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", 237 printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
241 __func__, 238 __func__, kvm_rip_read(vcpu), svm->next_rip);
242 svm->vmcb->save.rip,
243 svm->next_rip);
244 239
245 vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; 240 kvm_rip_write(vcpu, svm->next_rip);
246 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 241 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
247 242
248 vcpu->arch.interrupt_window_open = 1; 243 vcpu->arch.interrupt_window_open = 1;
@@ -530,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm)
530 (1ULL << INTERCEPT_CPUID) | 525 (1ULL << INTERCEPT_CPUID) |
531 (1ULL << INTERCEPT_INVD) | 526 (1ULL << INTERCEPT_INVD) |
532 (1ULL << INTERCEPT_HLT) | 527 (1ULL << INTERCEPT_HLT) |
528 (1ULL << INTERCEPT_INVLPG) |
533 (1ULL << INTERCEPT_INVLPGA) | 529 (1ULL << INTERCEPT_INVLPGA) |
534 (1ULL << INTERCEPT_IOIO_PROT) | 530 (1ULL << INTERCEPT_IOIO_PROT) |
535 (1ULL << INTERCEPT_MSR_PROT) | 531 (1ULL << INTERCEPT_MSR_PROT) |
@@ -581,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
581 save->dr7 = 0x400; 577 save->dr7 = 0x400;
582 save->rflags = 2; 578 save->rflags = 2;
583 save->rip = 0x0000fff0; 579 save->rip = 0x0000fff0;
580 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
584 581
585 /* 582 /*
586 * cr0 val on cpu init should be 0x60000010, we enable cpu 583 * cr0 val on cpu init should be 0x60000010, we enable cpu
@@ -593,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm)
593 if (npt_enabled) { 590 if (npt_enabled) {
594 /* Setup VMCB for Nested Paging */ 591 /* Setup VMCB for Nested Paging */
595 control->nested_ctl = 1; 592 control->nested_ctl = 1;
596 control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); 593 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
594 (1ULL << INTERCEPT_INVLPG));
597 control->intercept_exceptions &= ~(1 << PF_VECTOR); 595 control->intercept_exceptions &= ~(1 << PF_VECTOR);
598 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| 596 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
599 INTERCEPT_CR3_MASK); 597 INTERCEPT_CR3_MASK);
@@ -615,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
615 init_vmcb(svm); 613 init_vmcb(svm);
616 614
617 if (vcpu->vcpu_id != 0) { 615 if (vcpu->vcpu_id != 0) {
618 svm->vmcb->save.rip = 0; 616 kvm_rip_write(vcpu, 0);
619 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; 617 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
620 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; 618 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
621 } 619 }
620 vcpu->arch.regs_avail = ~0;
621 vcpu->arch.regs_dirty = ~0;
622 622
623 return 0; 623 return 0;
624} 624}
@@ -721,23 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
721 rdtscll(vcpu->arch.host_tsc); 721 rdtscll(vcpu->arch.host_tsc);
722} 722}
723 723
724static void svm_cache_regs(struct kvm_vcpu *vcpu)
725{
726 struct vcpu_svm *svm = to_svm(vcpu);
727
728 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
729 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
730 vcpu->arch.rip = svm->vmcb->save.rip;
731}
732
733static void svm_decache_regs(struct kvm_vcpu *vcpu)
734{
735 struct vcpu_svm *svm = to_svm(vcpu);
736 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
737 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
738 svm->vmcb->save.rip = vcpu->arch.rip;
739}
740
741static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 724static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
742{ 725{
743 return to_svm(vcpu)->vmcb->save.rflags; 726 return to_svm(vcpu)->vmcb->save.rflags;
@@ -1040,7 +1023,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1040 if (npt_enabled) 1023 if (npt_enabled)
1041 svm_flush_tlb(&svm->vcpu); 1024 svm_flush_tlb(&svm->vcpu);
1042 1025
1043 if (event_injection) 1026 if (!npt_enabled && event_injection)
1044 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1027 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1045 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1028 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1046} 1029}
@@ -1139,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1139 1122
1140static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1123static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1141{ 1124{
1142 svm->next_rip = svm->vmcb->save.rip + 1; 1125 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1143 skip_emulated_instruction(&svm->vcpu); 1126 skip_emulated_instruction(&svm->vcpu);
1144 return kvm_emulate_halt(&svm->vcpu); 1127 return kvm_emulate_halt(&svm->vcpu);
1145} 1128}
1146 1129
1147static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1130static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1148{ 1131{
1149 svm->next_rip = svm->vmcb->save.rip + 3; 1132 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1150 skip_emulated_instruction(&svm->vcpu); 1133 skip_emulated_instruction(&svm->vcpu);
1151 kvm_emulate_hypercall(&svm->vcpu); 1134 kvm_emulate_hypercall(&svm->vcpu);
1152 return 1; 1135 return 1;
@@ -1178,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm,
1178 1161
1179static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1162static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1180{ 1163{
1181 svm->next_rip = svm->vmcb->save.rip + 2; 1164 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1182 kvm_emulate_cpuid(&svm->vcpu); 1165 kvm_emulate_cpuid(&svm->vcpu);
1183 return 1; 1166 return 1;
1184} 1167}
1185 1168
1169static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1170{
1171 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
1172 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
1173 return 1;
1174}
1175
1186static int emulate_on_interception(struct vcpu_svm *svm, 1176static int emulate_on_interception(struct vcpu_svm *svm,
1187 struct kvm_run *kvm_run) 1177 struct kvm_run *kvm_run)
1188{ 1178{
@@ -1273,9 +1263,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1273 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, 1263 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
1274 (u32)(data >> 32), handler); 1264 (u32)(data >> 32), handler);
1275 1265
1276 svm->vmcb->save.rax = data & 0xffffffff; 1266 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
1277 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 1267 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1278 svm->next_rip = svm->vmcb->save.rip + 2; 1268 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1279 skip_emulated_instruction(&svm->vcpu); 1269 skip_emulated_instruction(&svm->vcpu);
1280 } 1270 }
1281 return 1; 1271 return 1;
@@ -1359,13 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1359static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1349static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1360{ 1350{
1361 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1351 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1362 u64 data = (svm->vmcb->save.rax & -1u) 1352 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
1363 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 1353 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1364 1354
1365 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), 1355 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
1366 handler); 1356 handler);
1367 1357
1368 svm->next_rip = svm->vmcb->save.rip + 2; 1358 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1369 if (svm_set_msr(&svm->vcpu, ecx, data)) 1359 if (svm_set_msr(&svm->vcpu, ecx, data))
1370 kvm_inject_gp(&svm->vcpu, 0); 1360 kvm_inject_gp(&svm->vcpu, 0);
1371 else 1361 else
@@ -1436,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1436 [SVM_EXIT_CPUID] = cpuid_interception, 1426 [SVM_EXIT_CPUID] = cpuid_interception,
1437 [SVM_EXIT_INVD] = emulate_on_interception, 1427 [SVM_EXIT_INVD] = emulate_on_interception,
1438 [SVM_EXIT_HLT] = halt_interception, 1428 [SVM_EXIT_HLT] = halt_interception,
1439 [SVM_EXIT_INVLPG] = emulate_on_interception, 1429 [SVM_EXIT_INVLPG] = invlpg_interception,
1440 [SVM_EXIT_INVLPGA] = invalid_op_interception, 1430 [SVM_EXIT_INVLPGA] = invalid_op_interception,
1441 [SVM_EXIT_IOIO] = io_interception, 1431 [SVM_EXIT_IOIO] = io_interception,
1442 [SVM_EXIT_MSR] = msr_interception, 1432 [SVM_EXIT_MSR] = msr_interception,
@@ -1538,6 +1528,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1538 1528
1539 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); 1529 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
1540 1530
1531 ++svm->vcpu.stat.irq_injections;
1541 control = &svm->vmcb->control; 1532 control = &svm->vmcb->control;
1542 control->int_vector = irq; 1533 control->int_vector = irq;
1543 control->int_ctl &= ~V_INTR_PRIO_MASK; 1534 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -1716,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
1716 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 1707 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
1717} 1708}
1718 1709
1710#ifdef CONFIG_X86_64
1711#define R "r"
1712#else
1713#define R "e"
1714#endif
1715
1719static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1716static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1720{ 1717{
1721 struct vcpu_svm *svm = to_svm(vcpu); 1718 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1723,6 +1720,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1723 u16 gs_selector; 1720 u16 gs_selector;
1724 u16 ldt_selector; 1721 u16 ldt_selector;
1725 1722
1723 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
1724 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
1725 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
1726
1726 pre_svm_run(svm); 1727 pre_svm_run(svm);
1727 1728
1728 sync_lapic_to_cr8(vcpu); 1729 sync_lapic_to_cr8(vcpu);
@@ -1750,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1750 local_irq_enable(); 1751 local_irq_enable();
1751 1752
1752 asm volatile ( 1753 asm volatile (
1754 "push %%"R"bp; \n\t"
1755 "mov %c[rbx](%[svm]), %%"R"bx \n\t"
1756 "mov %c[rcx](%[svm]), %%"R"cx \n\t"
1757 "mov %c[rdx](%[svm]), %%"R"dx \n\t"
1758 "mov %c[rsi](%[svm]), %%"R"si \n\t"
1759 "mov %c[rdi](%[svm]), %%"R"di \n\t"
1760 "mov %c[rbp](%[svm]), %%"R"bp \n\t"
1753#ifdef CONFIG_X86_64 1761#ifdef CONFIG_X86_64
1754 "push %%rbp; \n\t"
1755#else
1756 "push %%ebp; \n\t"
1757#endif
1758
1759#ifdef CONFIG_X86_64
1760 "mov %c[rbx](%[svm]), %%rbx \n\t"
1761 "mov %c[rcx](%[svm]), %%rcx \n\t"
1762 "mov %c[rdx](%[svm]), %%rdx \n\t"
1763 "mov %c[rsi](%[svm]), %%rsi \n\t"
1764 "mov %c[rdi](%[svm]), %%rdi \n\t"
1765 "mov %c[rbp](%[svm]), %%rbp \n\t"
1766 "mov %c[r8](%[svm]), %%r8 \n\t" 1762 "mov %c[r8](%[svm]), %%r8 \n\t"
1767 "mov %c[r9](%[svm]), %%r9 \n\t" 1763 "mov %c[r9](%[svm]), %%r9 \n\t"
1768 "mov %c[r10](%[svm]), %%r10 \n\t" 1764 "mov %c[r10](%[svm]), %%r10 \n\t"
@@ -1771,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1771 "mov %c[r13](%[svm]), %%r13 \n\t" 1767 "mov %c[r13](%[svm]), %%r13 \n\t"
1772 "mov %c[r14](%[svm]), %%r14 \n\t" 1768 "mov %c[r14](%[svm]), %%r14 \n\t"
1773 "mov %c[r15](%[svm]), %%r15 \n\t" 1769 "mov %c[r15](%[svm]), %%r15 \n\t"
1774#else
1775 "mov %c[rbx](%[svm]), %%ebx \n\t"
1776 "mov %c[rcx](%[svm]), %%ecx \n\t"
1777 "mov %c[rdx](%[svm]), %%edx \n\t"
1778 "mov %c[rsi](%[svm]), %%esi \n\t"
1779 "mov %c[rdi](%[svm]), %%edi \n\t"
1780 "mov %c[rbp](%[svm]), %%ebp \n\t"
1781#endif 1770#endif
1782 1771
1783#ifdef CONFIG_X86_64
1784 /* Enter guest mode */
1785 "push %%rax \n\t"
1786 "mov %c[vmcb](%[svm]), %%rax \n\t"
1787 __ex(SVM_VMLOAD) "\n\t"
1788 __ex(SVM_VMRUN) "\n\t"
1789 __ex(SVM_VMSAVE) "\n\t"
1790 "pop %%rax \n\t"
1791#else
1792 /* Enter guest mode */ 1772 /* Enter guest mode */
1793 "push %%eax \n\t" 1773 "push %%"R"ax \n\t"
1794 "mov %c[vmcb](%[svm]), %%eax \n\t" 1774 "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
1795 __ex(SVM_VMLOAD) "\n\t" 1775 __ex(SVM_VMLOAD) "\n\t"
1796 __ex(SVM_VMRUN) "\n\t" 1776 __ex(SVM_VMRUN) "\n\t"
1797 __ex(SVM_VMSAVE) "\n\t" 1777 __ex(SVM_VMSAVE) "\n\t"
1798 "pop %%eax \n\t" 1778 "pop %%"R"ax \n\t"
1799#endif
1800 1779
1801 /* Save guest registers, load host registers */ 1780 /* Save guest registers, load host registers */
1781 "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
1782 "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
1783 "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
1784 "mov %%"R"si, %c[rsi](%[svm]) \n\t"
1785 "mov %%"R"di, %c[rdi](%[svm]) \n\t"
1786 "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
1802#ifdef CONFIG_X86_64 1787#ifdef CONFIG_X86_64
1803 "mov %%rbx, %c[rbx](%[svm]) \n\t"
1804 "mov %%rcx, %c[rcx](%[svm]) \n\t"
1805 "mov %%rdx, %c[rdx](%[svm]) \n\t"
1806 "mov %%rsi, %c[rsi](%[svm]) \n\t"
1807 "mov %%rdi, %c[rdi](%[svm]) \n\t"
1808 "mov %%rbp, %c[rbp](%[svm]) \n\t"
1809 "mov %%r8, %c[r8](%[svm]) \n\t" 1788 "mov %%r8, %c[r8](%[svm]) \n\t"
1810 "mov %%r9, %c[r9](%[svm]) \n\t" 1789 "mov %%r9, %c[r9](%[svm]) \n\t"
1811 "mov %%r10, %c[r10](%[svm]) \n\t" 1790 "mov %%r10, %c[r10](%[svm]) \n\t"
@@ -1814,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1814 "mov %%r13, %c[r13](%[svm]) \n\t" 1793 "mov %%r13, %c[r13](%[svm]) \n\t"
1815 "mov %%r14, %c[r14](%[svm]) \n\t" 1794 "mov %%r14, %c[r14](%[svm]) \n\t"
1816 "mov %%r15, %c[r15](%[svm]) \n\t" 1795 "mov %%r15, %c[r15](%[svm]) \n\t"
1817
1818 "pop %%rbp; \n\t"
1819#else
1820 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1821 "mov %%ecx, %c[rcx](%[svm]) \n\t"
1822 "mov %%edx, %c[rdx](%[svm]) \n\t"
1823 "mov %%esi, %c[rsi](%[svm]) \n\t"
1824 "mov %%edi, %c[rdi](%[svm]) \n\t"
1825 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1826
1827 "pop %%ebp; \n\t"
1828#endif 1796#endif
1797 "pop %%"R"bp"
1829 : 1798 :
1830 : [svm]"a"(svm), 1799 : [svm]"a"(svm),
1831 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 1800 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -1846,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1846 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) 1815 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
1847#endif 1816#endif
1848 : "cc", "memory" 1817 : "cc", "memory"
1818 , R"bx", R"cx", R"dx", R"si", R"di"
1849#ifdef CONFIG_X86_64 1819#ifdef CONFIG_X86_64
1850 , "rbx", "rcx", "rdx", "rsi", "rdi"
1851 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" 1820 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
1852#else
1853 , "ebx", "ecx", "edx" , "esi", "edi"
1854#endif 1821#endif
1855 ); 1822 );
1856 1823
@@ -1858,6 +1825,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1858 load_db_regs(svm->host_db_regs); 1825 load_db_regs(svm->host_db_regs);
1859 1826
1860 vcpu->arch.cr2 = svm->vmcb->save.cr2; 1827 vcpu->arch.cr2 = svm->vmcb->save.cr2;
1828 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
1829 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
1830 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
1861 1831
1862 write_dr6(svm->host_dr6); 1832 write_dr6(svm->host_dr6);
1863 write_dr7(svm->host_dr7); 1833 write_dr7(svm->host_dr7);
@@ -1879,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1879 svm->next_rip = 0; 1849 svm->next_rip = 0;
1880} 1850}
1881 1851
1852#undef R
1853
1882static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) 1854static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1883{ 1855{
1884 struct vcpu_svm *svm = to_svm(vcpu); 1856 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1977,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1977 .set_gdt = svm_set_gdt, 1949 .set_gdt = svm_set_gdt,
1978 .get_dr = svm_get_dr, 1950 .get_dr = svm_get_dr,
1979 .set_dr = svm_set_dr, 1951 .set_dr = svm_set_dr,
1980 .cache_regs = svm_cache_regs,
1981 .decache_regs = svm_decache_regs,
1982 .get_rflags = svm_get_rflags, 1952 .get_rflags = svm_get_rflags,
1983 .set_rflags = svm_set_rflags, 1953 .set_rflags = svm_set_rflags,
1984 1954
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7041cc52b562..2643b430d83a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -26,6 +26,8 @@
26#include <linux/highmem.h> 26#include <linux/highmem.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/moduleparam.h> 28#include <linux/moduleparam.h>
29#include "kvm_cache_regs.h"
30#include "x86.h"
29 31
30#include <asm/io.h> 32#include <asm/io.h>
31#include <asm/desc.h> 33#include <asm/desc.h>
@@ -47,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0);
47static int enable_ept = 1; 49static int enable_ept = 1;
48module_param(enable_ept, bool, 0); 50module_param(enable_ept, bool, 0);
49 51
52static int emulate_invalid_guest_state = 0;
53module_param(emulate_invalid_guest_state, bool, 0);
54
50struct vmcs { 55struct vmcs {
51 u32 revision_id; 56 u32 revision_id;
52 u32 abort; 57 u32 abort;
@@ -56,6 +61,7 @@ struct vmcs {
56struct vcpu_vmx { 61struct vcpu_vmx {
57 struct kvm_vcpu vcpu; 62 struct kvm_vcpu vcpu;
58 struct list_head local_vcpus_link; 63 struct list_head local_vcpus_link;
64 unsigned long host_rsp;
59 int launched; 65 int launched;
60 u8 fail; 66 u8 fail;
61 u32 idt_vectoring_info; 67 u32 idt_vectoring_info;
@@ -83,6 +89,7 @@ struct vcpu_vmx {
83 } irq; 89 } irq;
84 } rmode; 90 } rmode;
85 int vpid; 91 int vpid;
92 bool emulation_required;
86}; 93};
87 94
88static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 95static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -468,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
468 if (!vcpu->fpu_active) 475 if (!vcpu->fpu_active)
469 eb |= 1u << NM_VECTOR; 476 eb |= 1u << NM_VECTOR;
470 if (vcpu->guest_debug.enabled) 477 if (vcpu->guest_debug.enabled)
471 eb |= 1u << 1; 478 eb |= 1u << DB_VECTOR;
472 if (vcpu->arch.rmode.active) 479 if (vcpu->arch.rmode.active)
473 eb = ~0; 480 eb = ~0;
474 if (vm_need_ept()) 481 if (vm_need_ept())
@@ -715,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
715 unsigned long rip; 722 unsigned long rip;
716 u32 interruptibility; 723 u32 interruptibility;
717 724
718 rip = vmcs_readl(GUEST_RIP); 725 rip = kvm_rip_read(vcpu);
719 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 726 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
720 vmcs_writel(GUEST_RIP, rip); 727 kvm_rip_write(vcpu, rip);
721 728
722 /* 729 /*
723 * We emulated an instruction, so temporary interrupt blocking 730 * We emulated an instruction, so temporary interrupt blocking
@@ -733,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
733static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 740static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
734 bool has_error_code, u32 error_code) 741 bool has_error_code, u32 error_code)
735{ 742{
743 struct vcpu_vmx *vmx = to_vmx(vcpu);
744
745 if (has_error_code)
746 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
747
748 if (vcpu->arch.rmode.active) {
749 vmx->rmode.irq.pending = true;
750 vmx->rmode.irq.vector = nr;
751 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
752 if (nr == BP_VECTOR)
753 vmx->rmode.irq.rip++;
754 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
755 nr | INTR_TYPE_SOFT_INTR
756 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
757 | INTR_INFO_VALID_MASK);
758 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
759 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
760 return;
761 }
762
736 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 763 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
737 nr | INTR_TYPE_EXCEPTION 764 nr | INTR_TYPE_EXCEPTION
738 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) 765 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
739 | INTR_INFO_VALID_MASK); 766 | INTR_INFO_VALID_MASK);
740 if (has_error_code)
741 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
742} 767}
743 768
744static bool vmx_exception_injected(struct kvm_vcpu *vcpu) 769static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
745{ 770{
746 struct vcpu_vmx *vmx = to_vmx(vcpu); 771 return false;
747
748 return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
749} 772}
750 773
751/* 774/*
@@ -947,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
947 return ret; 970 return ret;
948} 971}
949 972
950/* 973static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
951 * Sync the rsp and rip registers into the vcpu structure. This allows
952 * registers to be accessed by indexing vcpu->arch.regs.
953 */
954static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
955{
956 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
957 vcpu->arch.rip = vmcs_readl(GUEST_RIP);
958}
959
960/*
961 * Syncs rsp and rip back into the vmcs. Should be called after possible
962 * modification.
963 */
964static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
965{ 974{
966 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 975 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
967 vmcs_writel(GUEST_RIP, vcpu->arch.rip); 976 switch (reg) {
977 case VCPU_REGS_RSP:
978 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
979 break;
980 case VCPU_REGS_RIP:
981 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
982 break;
983 default:
984 break;
985 }
968} 986}
969 987
970static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) 988static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -1007,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
1007 1025
1008static int vmx_get_irq(struct kvm_vcpu *vcpu) 1026static int vmx_get_irq(struct kvm_vcpu *vcpu)
1009{ 1027{
1010 struct vcpu_vmx *vmx = to_vmx(vcpu); 1028 if (!vcpu->arch.interrupt.pending)
1011 u32 idtv_info_field; 1029 return -1;
1012 1030 return vcpu->arch.interrupt.nr;
1013 idtv_info_field = vmx->idt_vectoring_info;
1014 if (idtv_info_field & INTR_INFO_VALID_MASK) {
1015 if (is_external_interrupt(idtv_info_field))
1016 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
1017 else
1018 printk(KERN_DEBUG "pending exception: not handled yet\n");
1019 }
1020 return -1;
1021} 1031}
1022 1032
1023static __init int cpu_has_kvm_support(void) 1033static __init int cpu_has_kvm_support(void)
@@ -1031,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void)
1031 u64 msr; 1041 u64 msr;
1032 1042
1033 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1043 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1034 return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1044 return (msr & (FEATURE_CONTROL_LOCKED |
1035 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1045 FEATURE_CONTROL_VMXON_ENABLED))
1036 == MSR_IA32_FEATURE_CONTROL_LOCKED; 1046 == FEATURE_CONTROL_LOCKED;
1037 /* locked but not enabled */ 1047 /* locked but not enabled */
1038} 1048}
1039 1049
@@ -1045,14 +1055,14 @@ static void hardware_enable(void *garbage)
1045 1055
1046 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1056 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1047 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1057 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1048 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1058 if ((old & (FEATURE_CONTROL_LOCKED |
1049 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1059 FEATURE_CONTROL_VMXON_ENABLED))
1050 != (MSR_IA32_FEATURE_CONTROL_LOCKED | 1060 != (FEATURE_CONTROL_LOCKED |
1051 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1061 FEATURE_CONTROL_VMXON_ENABLED))
1052 /* enable and lock */ 1062 /* enable and lock */
1053 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 1063 wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
1054 MSR_IA32_FEATURE_CONTROL_LOCKED | 1064 FEATURE_CONTROL_LOCKED |
1055 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); 1065 FEATURE_CONTROL_VMXON_ENABLED);
1056 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1066 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1057 asm volatile (ASM_VMX_VMXON_RAX 1067 asm volatile (ASM_VMX_VMXON_RAX
1058 : : "a"(&phys_addr), "m"(phys_addr) 1068 : : "a"(&phys_addr), "m"(phys_addr)
@@ -1120,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1120 CPU_BASED_CR3_STORE_EXITING | 1130 CPU_BASED_CR3_STORE_EXITING |
1121 CPU_BASED_USE_IO_BITMAPS | 1131 CPU_BASED_USE_IO_BITMAPS |
1122 CPU_BASED_MOV_DR_EXITING | 1132 CPU_BASED_MOV_DR_EXITING |
1123 CPU_BASED_USE_TSC_OFFSETING; 1133 CPU_BASED_USE_TSC_OFFSETING |
1134 CPU_BASED_INVLPG_EXITING;
1124 opt = CPU_BASED_TPR_SHADOW | 1135 opt = CPU_BASED_TPR_SHADOW |
1125 CPU_BASED_USE_MSR_BITMAPS | 1136 CPU_BASED_USE_MSR_BITMAPS |
1126 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1137 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1149,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1149 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 1160 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
1150#endif 1161#endif
1151 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 1162 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
1152 /* CR3 accesses don't need to cause VM Exits when EPT enabled */ 1163 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
1164 enabled */
1153 min &= ~(CPU_BASED_CR3_LOAD_EXITING | 1165 min &= ~(CPU_BASED_CR3_LOAD_EXITING |
1154 CPU_BASED_CR3_STORE_EXITING); 1166 CPU_BASED_CR3_STORE_EXITING |
1167 CPU_BASED_INVLPG_EXITING);
1155 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 1168 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1156 &_cpu_based_exec_control) < 0) 1169 &_cpu_based_exec_control) < 0)
1157 return -EIO; 1170 return -EIO;
@@ -1288,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1288static void enter_pmode(struct kvm_vcpu *vcpu) 1301static void enter_pmode(struct kvm_vcpu *vcpu)
1289{ 1302{
1290 unsigned long flags; 1303 unsigned long flags;
1304 struct vcpu_vmx *vmx = to_vmx(vcpu);
1291 1305
1306 vmx->emulation_required = 1;
1292 vcpu->arch.rmode.active = 0; 1307 vcpu->arch.rmode.active = 0;
1293 1308
1294 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); 1309 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
@@ -1305,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1305 1320
1306 update_exception_bitmap(vcpu); 1321 update_exception_bitmap(vcpu);
1307 1322
1323 if (emulate_invalid_guest_state)
1324 return;
1325
1308 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); 1326 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1309 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); 1327 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1310 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1328 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
@@ -1345,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1345static void enter_rmode(struct kvm_vcpu *vcpu) 1363static void enter_rmode(struct kvm_vcpu *vcpu)
1346{ 1364{
1347 unsigned long flags; 1365 unsigned long flags;
1366 struct vcpu_vmx *vmx = to_vmx(vcpu);
1348 1367
1368 vmx->emulation_required = 1;
1349 vcpu->arch.rmode.active = 1; 1369 vcpu->arch.rmode.active = 1;
1350 1370
1351 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1371 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
@@ -1367,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1367 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 1387 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1368 update_exception_bitmap(vcpu); 1388 update_exception_bitmap(vcpu);
1369 1389
1390 if (emulate_invalid_guest_state)
1391 goto continue_rmode;
1392
1370 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); 1393 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1371 vmcs_write32(GUEST_SS_LIMIT, 0xffff); 1394 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1372 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); 1395 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
@@ -1382,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1382 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1405 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1383 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); 1406 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1384 1407
1408continue_rmode:
1385 kvm_mmu_reset_context(vcpu); 1409 kvm_mmu_reset_context(vcpu);
1386 init_rmode(vcpu->kvm); 1410 init_rmode(vcpu->kvm);
1387} 1411}
@@ -1715,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1715 vmcs_writel(GUEST_GDTR_BASE, dt->base); 1739 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1716} 1740}
1717 1741
1742static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
1743{
1744 struct kvm_segment var;
1745 u32 ar;
1746
1747 vmx_get_segment(vcpu, &var, seg);
1748 ar = vmx_segment_access_rights(&var);
1749
1750 if (var.base != (var.selector << 4))
1751 return false;
1752 if (var.limit != 0xffff)
1753 return false;
1754 if (ar != 0xf3)
1755 return false;
1756
1757 return true;
1758}
1759
1760static bool code_segment_valid(struct kvm_vcpu *vcpu)
1761{
1762 struct kvm_segment cs;
1763 unsigned int cs_rpl;
1764
1765 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1766 cs_rpl = cs.selector & SELECTOR_RPL_MASK;
1767
1768 if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
1769 return false;
1770 if (!cs.s)
1771 return false;
1772 if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
1773 if (cs.dpl > cs_rpl)
1774 return false;
1775 } else if (cs.type & AR_TYPE_CODE_MASK) {
1776 if (cs.dpl != cs_rpl)
1777 return false;
1778 }
1779 if (!cs.present)
1780 return false;
1781
1782 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
1783 return true;
1784}
1785
1786static bool stack_segment_valid(struct kvm_vcpu *vcpu)
1787{
1788 struct kvm_segment ss;
1789 unsigned int ss_rpl;
1790
1791 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1792 ss_rpl = ss.selector & SELECTOR_RPL_MASK;
1793
1794 if ((ss.type != 3) || (ss.type != 7))
1795 return false;
1796 if (!ss.s)
1797 return false;
1798 if (ss.dpl != ss_rpl) /* DPL != RPL */
1799 return false;
1800 if (!ss.present)
1801 return false;
1802
1803 return true;
1804}
1805
1806static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
1807{
1808 struct kvm_segment var;
1809 unsigned int rpl;
1810
1811 vmx_get_segment(vcpu, &var, seg);
1812 rpl = var.selector & SELECTOR_RPL_MASK;
1813
1814 if (!var.s)
1815 return false;
1816 if (!var.present)
1817 return false;
1818 if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
1819 if (var.dpl < rpl) /* DPL < RPL */
1820 return false;
1821 }
1822
1823 /* TODO: Add other members to kvm_segment_field to allow checking for other access
1824 * rights flags
1825 */
1826 return true;
1827}
1828
1829static bool tr_valid(struct kvm_vcpu *vcpu)
1830{
1831 struct kvm_segment tr;
1832
1833 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
1834
1835 if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
1836 return false;
1837 if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */
1838 return false;
1839 if (!tr.present)
1840 return false;
1841
1842 return true;
1843}
1844
1845static bool ldtr_valid(struct kvm_vcpu *vcpu)
1846{
1847 struct kvm_segment ldtr;
1848
1849 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
1850
1851 if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
1852 return false;
1853 if (ldtr.type != 2)
1854 return false;
1855 if (!ldtr.present)
1856 return false;
1857
1858 return true;
1859}
1860
1861static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
1862{
1863 struct kvm_segment cs, ss;
1864
1865 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1866 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1867
1868 return ((cs.selector & SELECTOR_RPL_MASK) ==
1869 (ss.selector & SELECTOR_RPL_MASK));
1870}
1871
1872/*
1873 * Check if guest state is valid. Returns true if valid, false if
1874 * not.
1875 * We assume that registers are always usable
1876 */
1877static bool guest_state_valid(struct kvm_vcpu *vcpu)
1878{
1879 /* real mode guest state checks */
1880 if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
1881 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
1882 return false;
1883 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
1884 return false;
1885 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
1886 return false;
1887 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
1888 return false;
1889 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
1890 return false;
1891 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
1892 return false;
1893 } else {
1894 /* protected mode guest state checks */
1895 if (!cs_ss_rpl_check(vcpu))
1896 return false;
1897 if (!code_segment_valid(vcpu))
1898 return false;
1899 if (!stack_segment_valid(vcpu))
1900 return false;
1901 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
1902 return false;
1903 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
1904 return false;
1905 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
1906 return false;
1907 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
1908 return false;
1909 if (!tr_valid(vcpu))
1910 return false;
1911 if (!ldtr_valid(vcpu))
1912 return false;
1913 }
1914 /* TODO:
1915 * - Add checks on RIP
1916 * - Add checks on RFLAGS
1917 */
1918
1919 return true;
1920}
1921
1718static int init_rmode_tss(struct kvm *kvm) 1922static int init_rmode_tss(struct kvm *kvm)
1719{ 1923{
1720 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 1924 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
@@ -1726,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm)
1726 if (r < 0) 1930 if (r < 0)
1727 goto out; 1931 goto out;
1728 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 1932 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1729 r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); 1933 r = kvm_write_guest_page(kvm, fn++, &data,
1934 TSS_IOPB_BASE_OFFSET, sizeof(u16));
1730 if (r < 0) 1935 if (r < 0)
1731 goto out; 1936 goto out;
1732 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); 1937 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
@@ -1789,7 +1994,7 @@ static void seg_setup(int seg)
1789 vmcs_write16(sf->selector, 0); 1994 vmcs_write16(sf->selector, 0);
1790 vmcs_writel(sf->base, 0); 1995 vmcs_writel(sf->base, 0);
1791 vmcs_write32(sf->limit, 0xffff); 1996 vmcs_write32(sf->limit, 0xffff);
1792 vmcs_write32(sf->ar_bytes, 0x93); 1997 vmcs_write32(sf->ar_bytes, 0xf3);
1793} 1998}
1794 1999
1795static int alloc_apic_access_page(struct kvm *kvm) 2000static int alloc_apic_access_page(struct kvm *kvm)
@@ -1808,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
1808 if (r) 2013 if (r)
1809 goto out; 2014 goto out;
1810 2015
1811 down_read(&current->mm->mmap_sem);
1812 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); 2016 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
1813 up_read(&current->mm->mmap_sem);
1814out: 2017out:
1815 up_write(&kvm->slots_lock); 2018 up_write(&kvm->slots_lock);
1816 return r; 2019 return r;
@@ -1832,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm)
1832 if (r) 2035 if (r)
1833 goto out; 2036 goto out;
1834 2037
1835 down_read(&current->mm->mmap_sem);
1836 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, 2038 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
1837 VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); 2039 VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
1838 up_read(&current->mm->mmap_sem);
1839out: 2040out:
1840 up_write(&kvm->slots_lock); 2041 up_write(&kvm->slots_lock);
1841 return r; 2042 return r;
@@ -1917,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1917 } 2118 }
1918 if (!vm_need_ept()) 2119 if (!vm_need_ept())
1919 exec_control |= CPU_BASED_CR3_STORE_EXITING | 2120 exec_control |= CPU_BASED_CR3_STORE_EXITING |
1920 CPU_BASED_CR3_LOAD_EXITING; 2121 CPU_BASED_CR3_LOAD_EXITING |
2122 CPU_BASED_INVLPG_EXITING;
1921 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 2123 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1922 2124
1923 if (cpu_has_secondary_exec_ctrls()) { 2125 if (cpu_has_secondary_exec_ctrls()) {
@@ -2019,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2019 u64 msr; 2221 u64 msr;
2020 int ret; 2222 int ret;
2021 2223
2224 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2022 down_read(&vcpu->kvm->slots_lock); 2225 down_read(&vcpu->kvm->slots_lock);
2023 if (!init_rmode(vmx->vcpu.kvm)) { 2226 if (!init_rmode(vmx->vcpu.kvm)) {
2024 ret = -ENOMEM; 2227 ret = -ENOMEM;
@@ -2036,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2036 2239
2037 fx_init(&vmx->vcpu); 2240 fx_init(&vmx->vcpu);
2038 2241
2242 seg_setup(VCPU_SREG_CS);
2039 /* 2243 /*
2040 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode 2244 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
2041 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. 2245 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
@@ -2047,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2047 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); 2251 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
2048 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); 2252 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
2049 } 2253 }
2050 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
2051 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
2052 2254
2053 seg_setup(VCPU_SREG_DS); 2255 seg_setup(VCPU_SREG_DS);
2054 seg_setup(VCPU_SREG_ES); 2256 seg_setup(VCPU_SREG_ES);
@@ -2072,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2072 2274
2073 vmcs_writel(GUEST_RFLAGS, 0x02); 2275 vmcs_writel(GUEST_RFLAGS, 0x02);
2074 if (vmx->vcpu.vcpu_id == 0) 2276 if (vmx->vcpu.vcpu_id == 0)
2075 vmcs_writel(GUEST_RIP, 0xfff0); 2277 kvm_rip_write(vcpu, 0xfff0);
2076 else 2278 else
2077 vmcs_writel(GUEST_RIP, 0); 2279 kvm_rip_write(vcpu, 0);
2078 vmcs_writel(GUEST_RSP, 0); 2280 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
2079 2281
2080 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ 2282 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
2081 vmcs_writel(GUEST_DR7, 0x400); 2283 vmcs_writel(GUEST_DR7, 0x400);
@@ -2125,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2125 2327
2126 ret = 0; 2328 ret = 0;
2127 2329
2330 /* HACK: Don't enable emulation on guest boot/reset */
2331 vmx->emulation_required = 0;
2332
2128out: 2333out:
2129 up_read(&vcpu->kvm->slots_lock); 2334 up_read(&vcpu->kvm->slots_lock);
2130 return ret; 2335 return ret;
@@ -2136,14 +2341,15 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2136 2341
2137 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); 2342 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
2138 2343
2344 ++vcpu->stat.irq_injections;
2139 if (vcpu->arch.rmode.active) { 2345 if (vcpu->arch.rmode.active) {
2140 vmx->rmode.irq.pending = true; 2346 vmx->rmode.irq.pending = true;
2141 vmx->rmode.irq.vector = irq; 2347 vmx->rmode.irq.vector = irq;
2142 vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); 2348 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2143 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2349 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2144 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); 2350 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2145 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 2351 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2146 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); 2352 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2147 return; 2353 return;
2148 } 2354 }
2149 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2355 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -2154,7 +2360,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2154{ 2360{
2155 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2361 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2156 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2362 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2157 vcpu->arch.nmi_pending = 0;
2158} 2363}
2159 2364
2160static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 2365static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
@@ -2166,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2166 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); 2371 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
2167 if (!vcpu->arch.irq_pending[word_index]) 2372 if (!vcpu->arch.irq_pending[word_index])
2168 clear_bit(word_index, &vcpu->arch.irq_summary); 2373 clear_bit(word_index, &vcpu->arch.irq_summary);
2169 vmx_inject_irq(vcpu, irq); 2374 kvm_queue_interrupt(vcpu, irq);
2170} 2375}
2171 2376
2172 2377
@@ -2180,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
2180 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); 2385 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2181 2386
2182 if (vcpu->arch.interrupt_window_open && 2387 if (vcpu->arch.interrupt_window_open &&
2183 vcpu->arch.irq_summary && 2388 vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
2184 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
2185 /*
2186 * If interrupts enabled, and not blocked by sti or mov ss. Good.
2187 */
2188 kvm_do_inject_irq(vcpu); 2389 kvm_do_inject_irq(vcpu);
2189 2390
2391 if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
2392 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2393
2190 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 2394 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2191 if (!vcpu->arch.interrupt_window_open && 2395 if (!vcpu->arch.interrupt_window_open &&
2192 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) 2396 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
@@ -2237,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
2237static int handle_rmode_exception(struct kvm_vcpu *vcpu, 2441static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2238 int vec, u32 err_code) 2442 int vec, u32 err_code)
2239{ 2443{
2240 if (!vcpu->arch.rmode.active)
2241 return 0;
2242
2243 /* 2444 /*
2244 * Instruction with address size override prefix opcode 0x67 2445 * Instruction with address size override prefix opcode 0x67
2245 * Cause the #SS fault with 0 error code in VM86 mode. 2446 * Cause the #SS fault with 0 error code in VM86 mode.
@@ -2247,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2247 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2448 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2248 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) 2449 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
2249 return 1; 2450 return 1;
2451 /*
2452 * Forward all other exceptions that are valid in real mode.
2453 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
2454 * the required debugging infrastructure rework.
2455 */
2456 switch (vec) {
2457 case DE_VECTOR:
2458 case DB_VECTOR:
2459 case BP_VECTOR:
2460 case OF_VECTOR:
2461 case BR_VECTOR:
2462 case UD_VECTOR:
2463 case DF_VECTOR:
2464 case SS_VECTOR:
2465 case GP_VECTOR:
2466 case MF_VECTOR:
2467 kvm_queue_exception(vcpu, vec);
2468 return 1;
2469 }
2250 return 0; 2470 return 0;
2251} 2471}
2252 2472
@@ -2288,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2288 } 2508 }
2289 2509
2290 error_code = 0; 2510 error_code = 0;
2291 rip = vmcs_readl(GUEST_RIP); 2511 rip = kvm_rip_read(vcpu);
2292 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 2512 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
2293 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 2513 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2294 if (is_page_fault(intr_info)) { 2514 if (is_page_fault(intr_info)) {
@@ -2298,7 +2518,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2298 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2518 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2299 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2519 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2300 (u32)((u64)cr2 >> 32), handler); 2520 (u32)((u64)cr2 >> 32), handler);
2301 if (vect_info & VECTORING_INFO_VALID_MASK) 2521 if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
2302 kvm_mmu_unprotect_page_virt(vcpu, cr2); 2522 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2303 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2523 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2304 } 2524 }
@@ -2386,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2386 reg = (exit_qualification >> 8) & 15; 2606 reg = (exit_qualification >> 8) & 15;
2387 switch ((exit_qualification >> 4) & 3) { 2607 switch ((exit_qualification >> 4) & 3) {
2388 case 0: /* mov to cr */ 2608 case 0: /* mov to cr */
2389 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], 2609 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
2390 (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); 2610 (u32)kvm_register_read(vcpu, reg),
2611 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2612 handler);
2391 switch (cr) { 2613 switch (cr) {
2392 case 0: 2614 case 0:
2393 vcpu_load_rsp_rip(vcpu); 2615 kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
2394 kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
2395 skip_emulated_instruction(vcpu); 2616 skip_emulated_instruction(vcpu);
2396 return 1; 2617 return 1;
2397 case 3: 2618 case 3:
2398 vcpu_load_rsp_rip(vcpu); 2619 kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
2399 kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
2400 skip_emulated_instruction(vcpu); 2620 skip_emulated_instruction(vcpu);
2401 return 1; 2621 return 1;
2402 case 4: 2622 case 4:
2403 vcpu_load_rsp_rip(vcpu); 2623 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
2404 kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
2405 skip_emulated_instruction(vcpu); 2624 skip_emulated_instruction(vcpu);
2406 return 1; 2625 return 1;
2407 case 8: 2626 case 8:
2408 vcpu_load_rsp_rip(vcpu); 2627 kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
2409 kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
2410 skip_emulated_instruction(vcpu); 2628 skip_emulated_instruction(vcpu);
2411 if (irqchip_in_kernel(vcpu->kvm)) 2629 if (irqchip_in_kernel(vcpu->kvm))
2412 return 1; 2630 return 1;
@@ -2415,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2415 }; 2633 };
2416 break; 2634 break;
2417 case 2: /* clts */ 2635 case 2: /* clts */
2418 vcpu_load_rsp_rip(vcpu);
2419 vmx_fpu_deactivate(vcpu); 2636 vmx_fpu_deactivate(vcpu);
2420 vcpu->arch.cr0 &= ~X86_CR0_TS; 2637 vcpu->arch.cr0 &= ~X86_CR0_TS;
2421 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 2638 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
@@ -2426,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2426 case 1: /*mov from cr*/ 2643 case 1: /*mov from cr*/
2427 switch (cr) { 2644 switch (cr) {
2428 case 3: 2645 case 3:
2429 vcpu_load_rsp_rip(vcpu); 2646 kvm_register_write(vcpu, reg, vcpu->arch.cr3);
2430 vcpu->arch.regs[reg] = vcpu->arch.cr3;
2431 vcpu_put_rsp_rip(vcpu);
2432 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, 2647 KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
2433 (u32)vcpu->arch.regs[reg], 2648 (u32)kvm_register_read(vcpu, reg),
2434 (u32)((u64)vcpu->arch.regs[reg] >> 32), 2649 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2435 handler); 2650 handler);
2436 skip_emulated_instruction(vcpu); 2651 skip_emulated_instruction(vcpu);
2437 return 1; 2652 return 1;
2438 case 8: 2653 case 8:
2439 vcpu_load_rsp_rip(vcpu); 2654 kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
2440 vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
2441 vcpu_put_rsp_rip(vcpu);
2442 KVMTRACE_2D(CR_READ, vcpu, (u32)cr, 2655 KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
2443 (u32)vcpu->arch.regs[reg], handler); 2656 (u32)kvm_register_read(vcpu, reg), handler);
2444 skip_emulated_instruction(vcpu); 2657 skip_emulated_instruction(vcpu);
2445 return 1; 2658 return 1;
2446 } 2659 }
@@ -2472,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2472 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 2685 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2473 dr = exit_qualification & 7; 2686 dr = exit_qualification & 7;
2474 reg = (exit_qualification >> 8) & 15; 2687 reg = (exit_qualification >> 8) & 15;
2475 vcpu_load_rsp_rip(vcpu);
2476 if (exit_qualification & 16) { 2688 if (exit_qualification & 16) {
2477 /* mov from dr */ 2689 /* mov from dr */
2478 switch (dr) { 2690 switch (dr) {
@@ -2485,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2485 default: 2697 default:
2486 val = 0; 2698 val = 0;
2487 } 2699 }
2488 vcpu->arch.regs[reg] = val; 2700 kvm_register_write(vcpu, reg, val);
2489 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); 2701 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
2490 } else { 2702 } else {
2491 /* mov to dr */ 2703 /* mov to dr */
2492 } 2704 }
2493 vcpu_put_rsp_rip(vcpu);
2494 skip_emulated_instruction(vcpu); 2705 skip_emulated_instruction(vcpu);
2495 return 1; 2706 return 1;
2496} 2707}
@@ -2583,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2583 return 1; 2794 return 1;
2584} 2795}
2585 2796
2797static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2798{
2799 u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2800
2801 kvm_mmu_invlpg(vcpu, exit_qualification);
2802 skip_emulated_instruction(vcpu);
2803 return 1;
2804}
2805
2586static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2806static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2587{ 2807{
2588 skip_emulated_instruction(vcpu); 2808 skip_emulated_instruction(vcpu);
@@ -2695,6 +2915,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2695 return 1; 2915 return 1;
2696} 2916}
2697 2917
2918static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
2919 struct kvm_run *kvm_run)
2920{
2921 struct vcpu_vmx *vmx = to_vmx(vcpu);
2922 int err;
2923
2924 preempt_enable();
2925 local_irq_enable();
2926
2927 while (!guest_state_valid(vcpu)) {
2928 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2929
2930 switch (err) {
2931 case EMULATE_DONE:
2932 break;
2933 case EMULATE_DO_MMIO:
2934 kvm_report_emulation_failure(vcpu, "mmio");
2935 /* TODO: Handle MMIO */
2936 return;
2937 default:
2938 kvm_report_emulation_failure(vcpu, "emulation failure");
2939 return;
2940 }
2941
2942 if (signal_pending(current))
2943 break;
2944 if (need_resched())
2945 schedule();
2946 }
2947
2948 local_irq_disable();
2949 preempt_disable();
2950
2951 /* Guest state should be valid now, no more emulation should be needed */
2952 vmx->emulation_required = 0;
2953}
2954
2698/* 2955/*
2699 * The exit handlers return 1 if the exit was handled fully and guest execution 2956 * The exit handlers return 1 if the exit was handled fully and guest execution
2700 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 2957 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2714,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2714 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 2971 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2715 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 2972 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2716 [EXIT_REASON_HLT] = handle_halt, 2973 [EXIT_REASON_HLT] = handle_halt,
2974 [EXIT_REASON_INVLPG] = handle_invlpg,
2717 [EXIT_REASON_VMCALL] = handle_vmcall, 2975 [EXIT_REASON_VMCALL] = handle_vmcall,
2718 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 2976 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
2719 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 2977 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
@@ -2735,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2735 struct vcpu_vmx *vmx = to_vmx(vcpu); 2993 struct vcpu_vmx *vmx = to_vmx(vcpu);
2736 u32 vectoring_info = vmx->idt_vectoring_info; 2994 u32 vectoring_info = vmx->idt_vectoring_info;
2737 2995
2738 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), 2996 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
2739 (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); 2997 (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
2740 2998
2741 /* Access CR3 don't cause VMExit in paging mode, so we need 2999 /* Access CR3 don't cause VMExit in paging mode, so we need
2742 * to sync with guest real CR3. */ 3000 * to sync with guest real CR3. */
@@ -2829,88 +3087,92 @@ static void enable_intr_window(struct kvm_vcpu *vcpu)
2829 enable_irq_window(vcpu); 3087 enable_irq_window(vcpu);
2830} 3088}
2831 3089
2832static void vmx_intr_assist(struct kvm_vcpu *vcpu) 3090static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
2833{ 3091{
2834 struct vcpu_vmx *vmx = to_vmx(vcpu); 3092 u32 exit_intr_info;
2835 u32 idtv_info_field, intr_info_field, exit_intr_info_field; 3093 u32 idt_vectoring_info;
2836 int vector; 3094 bool unblock_nmi;
3095 u8 vector;
3096 int type;
3097 bool idtv_info_valid;
3098 u32 error;
2837 3099
2838 update_tpr_threshold(vcpu); 3100 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2839 3101 if (cpu_has_virtual_nmis()) {
2840 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 3102 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
2841 exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO); 3103 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
2842 idtv_info_field = vmx->idt_vectoring_info; 3104 /*
2843 if (intr_info_field & INTR_INFO_VALID_MASK) { 3105 * SDM 3: 25.7.1.2
2844 if (idtv_info_field & INTR_INFO_VALID_MASK) { 3106 * Re-set bit "block by NMI" before VM entry if vmexit caused by
2845 /* TODO: fault when IDT_Vectoring */ 3107 * a guest IRET fault.
2846 if (printk_ratelimit()) 3108 */
2847 printk(KERN_ERR "Fault when IDT_Vectoring\n"); 3109 if (unblock_nmi && vector != DF_VECTOR)
2848 } 3110 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2849 enable_intr_window(vcpu); 3111 GUEST_INTR_STATE_NMI);
2850 return;
2851 } 3112 }
2852 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2853 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2854 == INTR_TYPE_EXT_INTR
2855 && vcpu->arch.rmode.active) {
2856 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2857
2858 vmx_inject_irq(vcpu, vect);
2859 enable_intr_window(vcpu);
2860 return;
2861 }
2862
2863 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
2864 3113
3114 idt_vectoring_info = vmx->idt_vectoring_info;
3115 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3116 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3117 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3118 if (vmx->vcpu.arch.nmi_injected) {
2865 /* 3119 /*
2866 * SDM 3: 25.7.1.2 3120 * SDM 3: 25.7.1.2
2867 * Clear bit "block by NMI" before VM entry if a NMI delivery 3121 * Clear bit "block by NMI" before VM entry if a NMI delivery
2868 * faulted. 3122 * faulted.
2869 */ 3123 */
2870 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) 3124 if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
2871 == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis()) 3125 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2872 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 3126 GUEST_INTR_STATE_NMI);
2873 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3127 else
2874 ~GUEST_INTR_STATE_NMI); 3128 vmx->vcpu.arch.nmi_injected = false;
2875 3129 }
2876 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field 3130 kvm_clear_exception_queue(&vmx->vcpu);
2877 & ~INTR_INFO_RESVD_BITS_MASK); 3131 if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
2878 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 3132 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
2879 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 3133 error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
2880 3134 kvm_queue_exception_e(&vmx->vcpu, vector, error);
2881 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) 3135 } else
2882 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 3136 kvm_queue_exception(&vmx->vcpu, vector);
2883 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 3137 vmx->idt_vectoring_info = 0;
2884 enable_intr_window(vcpu);
2885 return;
2886 } 3138 }
3139 kvm_clear_interrupt_queue(&vmx->vcpu);
3140 if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
3141 kvm_queue_interrupt(&vmx->vcpu, vector);
3142 vmx->idt_vectoring_info = 0;
3143 }
3144}
3145
3146static void vmx_intr_assist(struct kvm_vcpu *vcpu)
3147{
3148 update_tpr_threshold(vcpu);
3149
2887 if (cpu_has_virtual_nmis()) { 3150 if (cpu_has_virtual_nmis()) {
2888 /* 3151 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
2889 * SDM 3: 25.7.1.2 3152 if (vmx_nmi_enabled(vcpu)) {
2890 * Re-set bit "block by NMI" before VM entry if vmexit caused by 3153 vcpu->arch.nmi_pending = false;
2891 * a guest IRET fault. 3154 vcpu->arch.nmi_injected = true;
2892 */ 3155 } else {
2893 if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) && 3156 enable_intr_window(vcpu);
2894 (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8) 3157 return;
2895 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 3158 }
2896 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | 3159 }
2897 GUEST_INTR_STATE_NMI); 3160 if (vcpu->arch.nmi_injected) {
2898 else if (vcpu->arch.nmi_pending) { 3161 vmx_inject_nmi(vcpu);
2899 if (vmx_nmi_enabled(vcpu))
2900 vmx_inject_nmi(vcpu);
2901 enable_intr_window(vcpu); 3162 enable_intr_window(vcpu);
2902 return; 3163 return;
2903 } 3164 }
2904
2905 } 3165 }
2906 if (!kvm_cpu_has_interrupt(vcpu)) 3166 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
2907 return; 3167 if (vmx_irq_enabled(vcpu))
2908 if (vmx_irq_enabled(vcpu)) { 3168 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
2909 vector = kvm_cpu_get_interrupt(vcpu); 3169 else
2910 vmx_inject_irq(vcpu, vector); 3170 enable_irq_window(vcpu);
2911 kvm_timer_intr_post(vcpu, vector); 3171 }
2912 } else 3172 if (vcpu->arch.interrupt.pending) {
2913 enable_irq_window(vcpu); 3173 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
3174 kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
3175 }
2914} 3176}
2915 3177
2916/* 3178/*
@@ -2922,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2922static void fixup_rmode_irq(struct vcpu_vmx *vmx) 3184static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2923{ 3185{
2924 vmx->rmode.irq.pending = 0; 3186 vmx->rmode.irq.pending = 0;
2925 if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) 3187 if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
2926 return; 3188 return;
2927 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); 3189 kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
2928 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { 3190 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
2929 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; 3191 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
2930 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; 3192 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
@@ -2936,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2936 | vmx->rmode.irq.vector; 3198 | vmx->rmode.irq.vector;
2937} 3199}
2938 3200
3201#ifdef CONFIG_X86_64
3202#define R "r"
3203#define Q "q"
3204#else
3205#define R "e"
3206#define Q "l"
3207#endif
3208
2939static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3209static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2940{ 3210{
2941 struct vcpu_vmx *vmx = to_vmx(vcpu); 3211 struct vcpu_vmx *vmx = to_vmx(vcpu);
2942 u32 intr_info; 3212 u32 intr_info;
2943 3213
3214 /* Handle invalid guest state instead of entering VMX */
3215 if (vmx->emulation_required && emulate_invalid_guest_state) {
3216 handle_invalid_guest_state(vcpu, kvm_run);
3217 return;
3218 }
3219
3220 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3221 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
3222 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
3223 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
3224
2944 /* 3225 /*
2945 * Loading guest fpu may have cleared host cr0.ts 3226 * Loading guest fpu may have cleared host cr0.ts
2946 */ 3227 */
@@ -2948,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2948 3229
2949 asm( 3230 asm(
2950 /* Store host registers */ 3231 /* Store host registers */
2951#ifdef CONFIG_X86_64 3232 "push %%"R"dx; push %%"R"bp;"
2952 "push %%rdx; push %%rbp;" 3233 "push %%"R"cx \n\t"
2953 "push %%rcx \n\t" 3234 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
2954#else 3235 "je 1f \n\t"
2955 "push %%edx; push %%ebp;" 3236 "mov %%"R"sp, %c[host_rsp](%0) \n\t"
2956 "push %%ecx \n\t"
2957#endif
2958 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 3237 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
3238 "1: \n\t"
2959 /* Check if vmlaunch of vmresume is needed */ 3239 /* Check if vmlaunch of vmresume is needed */
2960 "cmpl $0, %c[launched](%0) \n\t" 3240 "cmpl $0, %c[launched](%0) \n\t"
2961 /* Load guest registers. Don't clobber flags. */ 3241 /* Load guest registers. Don't clobber flags. */
3242 "mov %c[cr2](%0), %%"R"ax \n\t"
3243 "mov %%"R"ax, %%cr2 \n\t"
3244 "mov %c[rax](%0), %%"R"ax \n\t"
3245 "mov %c[rbx](%0), %%"R"bx \n\t"
3246 "mov %c[rdx](%0), %%"R"dx \n\t"
3247 "mov %c[rsi](%0), %%"R"si \n\t"
3248 "mov %c[rdi](%0), %%"R"di \n\t"
3249 "mov %c[rbp](%0), %%"R"bp \n\t"
2962#ifdef CONFIG_X86_64 3250#ifdef CONFIG_X86_64
2963 "mov %c[cr2](%0), %%rax \n\t"
2964 "mov %%rax, %%cr2 \n\t"
2965 "mov %c[rax](%0), %%rax \n\t"
2966 "mov %c[rbx](%0), %%rbx \n\t"
2967 "mov %c[rdx](%0), %%rdx \n\t"
2968 "mov %c[rsi](%0), %%rsi \n\t"
2969 "mov %c[rdi](%0), %%rdi \n\t"
2970 "mov %c[rbp](%0), %%rbp \n\t"
2971 "mov %c[r8](%0), %%r8 \n\t" 3251 "mov %c[r8](%0), %%r8 \n\t"
2972 "mov %c[r9](%0), %%r9 \n\t" 3252 "mov %c[r9](%0), %%r9 \n\t"
2973 "mov %c[r10](%0), %%r10 \n\t" 3253 "mov %c[r10](%0), %%r10 \n\t"
@@ -2976,18 +3256,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2976 "mov %c[r13](%0), %%r13 \n\t" 3256 "mov %c[r13](%0), %%r13 \n\t"
2977 "mov %c[r14](%0), %%r14 \n\t" 3257 "mov %c[r14](%0), %%r14 \n\t"
2978 "mov %c[r15](%0), %%r15 \n\t" 3258 "mov %c[r15](%0), %%r15 \n\t"
2979 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
2980#else
2981 "mov %c[cr2](%0), %%eax \n\t"
2982 "mov %%eax, %%cr2 \n\t"
2983 "mov %c[rax](%0), %%eax \n\t"
2984 "mov %c[rbx](%0), %%ebx \n\t"
2985 "mov %c[rdx](%0), %%edx \n\t"
2986 "mov %c[rsi](%0), %%esi \n\t"
2987 "mov %c[rdi](%0), %%edi \n\t"
2988 "mov %c[rbp](%0), %%ebp \n\t"
2989 "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
2990#endif 3259#endif
3260 "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
3261
2991 /* Enter guest mode */ 3262 /* Enter guest mode */
2992 "jne .Llaunched \n\t" 3263 "jne .Llaunched \n\t"
2993 __ex(ASM_VMX_VMLAUNCH) "\n\t" 3264 __ex(ASM_VMX_VMLAUNCH) "\n\t"
@@ -2995,15 +3266,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2995 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" 3266 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
2996 ".Lkvm_vmx_return: " 3267 ".Lkvm_vmx_return: "
2997 /* Save guest registers, load host registers, keep flags */ 3268 /* Save guest registers, load host registers, keep flags */
3269 "xchg %0, (%%"R"sp) \n\t"
3270 "mov %%"R"ax, %c[rax](%0) \n\t"
3271 "mov %%"R"bx, %c[rbx](%0) \n\t"
3272 "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
3273 "mov %%"R"dx, %c[rdx](%0) \n\t"
3274 "mov %%"R"si, %c[rsi](%0) \n\t"
3275 "mov %%"R"di, %c[rdi](%0) \n\t"
3276 "mov %%"R"bp, %c[rbp](%0) \n\t"
2998#ifdef CONFIG_X86_64 3277#ifdef CONFIG_X86_64
2999 "xchg %0, (%%rsp) \n\t"
3000 "mov %%rax, %c[rax](%0) \n\t"
3001 "mov %%rbx, %c[rbx](%0) \n\t"
3002 "pushq (%%rsp); popq %c[rcx](%0) \n\t"
3003 "mov %%rdx, %c[rdx](%0) \n\t"
3004 "mov %%rsi, %c[rsi](%0) \n\t"
3005 "mov %%rdi, %c[rdi](%0) \n\t"
3006 "mov %%rbp, %c[rbp](%0) \n\t"
3007 "mov %%r8, %c[r8](%0) \n\t" 3278 "mov %%r8, %c[r8](%0) \n\t"
3008 "mov %%r9, %c[r9](%0) \n\t" 3279 "mov %%r9, %c[r9](%0) \n\t"
3009 "mov %%r10, %c[r10](%0) \n\t" 3280 "mov %%r10, %c[r10](%0) \n\t"
@@ -3012,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3012 "mov %%r13, %c[r13](%0) \n\t" 3283 "mov %%r13, %c[r13](%0) \n\t"
3013 "mov %%r14, %c[r14](%0) \n\t" 3284 "mov %%r14, %c[r14](%0) \n\t"
3014 "mov %%r15, %c[r15](%0) \n\t" 3285 "mov %%r15, %c[r15](%0) \n\t"
3015 "mov %%cr2, %%rax \n\t"
3016 "mov %%rax, %c[cr2](%0) \n\t"
3017
3018 "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
3019#else
3020 "xchg %0, (%%esp) \n\t"
3021 "mov %%eax, %c[rax](%0) \n\t"
3022 "mov %%ebx, %c[rbx](%0) \n\t"
3023 "pushl (%%esp); popl %c[rcx](%0) \n\t"
3024 "mov %%edx, %c[rdx](%0) \n\t"
3025 "mov %%esi, %c[rsi](%0) \n\t"
3026 "mov %%edi, %c[rdi](%0) \n\t"
3027 "mov %%ebp, %c[rbp](%0) \n\t"
3028 "mov %%cr2, %%eax \n\t"
3029 "mov %%eax, %c[cr2](%0) \n\t"
3030
3031 "pop %%ebp; pop %%ebp; pop %%edx \n\t"
3032#endif 3286#endif
3287 "mov %%cr2, %%"R"ax \n\t"
3288 "mov %%"R"ax, %c[cr2](%0) \n\t"
3289
3290 "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
3033 "setbe %c[fail](%0) \n\t" 3291 "setbe %c[fail](%0) \n\t"
3034 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 3292 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
3035 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 3293 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
3036 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 3294 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
3295 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
3037 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), 3296 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
3038 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), 3297 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
3039 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), 3298 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
@@ -3053,14 +3312,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3053#endif 3312#endif
3054 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 3313 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
3055 : "cc", "memory" 3314 : "cc", "memory"
3315 , R"bx", R"di", R"si"
3056#ifdef CONFIG_X86_64 3316#ifdef CONFIG_X86_64
3057 , "rbx", "rdi", "rsi"
3058 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 3317 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
3059#else
3060 , "ebx", "edi", "rsi"
3061#endif 3318#endif
3062 ); 3319 );
3063 3320
3321 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
3322 vcpu->arch.regs_dirty = 0;
3323
3064 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 3324 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3065 if (vmx->rmode.irq.pending) 3325 if (vmx->rmode.irq.pending)
3066 fixup_rmode_irq(vmx); 3326 fixup_rmode_irq(vmx);
@@ -3080,8 +3340,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3080 KVMTRACE_0D(NMI, vcpu, handler); 3340 KVMTRACE_0D(NMI, vcpu, handler);
3081 asm("int $2"); 3341 asm("int $2");
3082 } 3342 }
3343
3344 vmx_complete_interrupts(vmx);
3083} 3345}
3084 3346
3347#undef R
3348#undef Q
3349
3085static void vmx_free_vmcs(struct kvm_vcpu *vcpu) 3350static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
3086{ 3351{
3087 struct vcpu_vmx *vmx = to_vmx(vcpu); 3352 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3224,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3224 .set_idt = vmx_set_idt, 3489 .set_idt = vmx_set_idt,
3225 .get_gdt = vmx_get_gdt, 3490 .get_gdt = vmx_get_gdt,
3226 .set_gdt = vmx_set_gdt, 3491 .set_gdt = vmx_set_gdt,
3227 .cache_regs = vcpu_load_rsp_rip, 3492 .cache_reg = vmx_cache_reg,
3228 .decache_regs = vcpu_put_rsp_rip,
3229 .get_rflags = vmx_get_rflags, 3493 .get_rflags = vmx_get_rflags,
3230 .set_rflags = vmx_set_rflags, 3494 .set_rflags = vmx_set_rflags,
3231 3495
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 17e25995b65b..3e010d21fdd7 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,9 +331,6 @@ enum vmcs_field {
331 331
332#define AR_RESERVD_MASK 0xfffe0f00 332#define AR_RESERVD_MASK 0xfffe0f00
333 333
334#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
335#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
336
337#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 334#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
338#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 335#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10
339 336
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 19afbb644c7f..4f0677d1eae8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4,10 +4,14 @@
4 * derived from drivers/kvm/kvm_main.c 4 * derived from drivers/kvm/kvm_main.c
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008
7 * 9 *
8 * Authors: 10 * Authors:
9 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
10 * Yaniv Kamay <yaniv@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Amit Shah <amit.shah@qumranet.com>
14 * Ben-Ami Yassour <benami@il.ibm.com>
11 * 15 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See 16 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory. 17 * the COPYING file in the top-level directory.
@@ -19,14 +23,18 @@
19#include "mmu.h" 23#include "mmu.h"
20#include "i8254.h" 24#include "i8254.h"
21#include "tss.h" 25#include "tss.h"
26#include "kvm_cache_regs.h"
27#include "x86.h"
22 28
23#include <linux/clocksource.h> 29#include <linux/clocksource.h>
30#include <linux/interrupt.h>
24#include <linux/kvm.h> 31#include <linux/kvm.h>
25#include <linux/fs.h> 32#include <linux/fs.h>
26#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
27#include <linux/module.h> 34#include <linux/module.h>
28#include <linux/mman.h> 35#include <linux/mman.h>
29#include <linux/highmem.h> 36#include <linux/highmem.h>
37#include <linux/intel-iommu.h>
30 38
31#include <asm/uaccess.h> 39#include <asm/uaccess.h>
32#include <asm/msr.h> 40#include <asm/msr.h>
@@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
61 struct kvm_cpuid_entry2 __user *entries); 69 struct kvm_cpuid_entry2 __user *entries);
62 70
63struct kvm_x86_ops *kvm_x86_ops; 71struct kvm_x86_ops *kvm_x86_ops;
72EXPORT_SYMBOL_GPL(kvm_x86_ops);
64 73
65struct kvm_stats_debugfs_item debugfs_entries[] = { 74struct kvm_stats_debugfs_item debugfs_entries[] = {
66 { "pf_fixed", VCPU_STAT(pf_fixed) }, 75 { "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -83,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
83 { "fpu_reload", VCPU_STAT(fpu_reload) }, 92 { "fpu_reload", VCPU_STAT(fpu_reload) },
84 { "insn_emulation", VCPU_STAT(insn_emulation) }, 93 { "insn_emulation", VCPU_STAT(insn_emulation) },
85 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 94 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
95 { "irq_injections", VCPU_STAT(irq_injections) },
86 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 96 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
87 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 97 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
88 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 98 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -90,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
90 { "mmu_flooded", VM_STAT(mmu_flooded) }, 100 { "mmu_flooded", VM_STAT(mmu_flooded) },
91 { "mmu_recycled", VM_STAT(mmu_recycled) }, 101 { "mmu_recycled", VM_STAT(mmu_recycled) },
92 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 102 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
103 { "mmu_unsync", VM_STAT(mmu_unsync) },
93 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 104 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
94 { "largepages", VM_STAT(lpages) }, 105 { "largepages", VM_STAT(lpages) },
95 { NULL } 106 { NULL }
96}; 107};
97 108
98
99unsigned long segment_base(u16 selector) 109unsigned long segment_base(u16 selector)
100{ 110{
101 struct descriptor_table gdt; 111 struct descriptor_table gdt;
@@ -352,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
352void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 362void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
353{ 363{
354 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 364 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
365 kvm_mmu_sync_roots(vcpu);
355 kvm_mmu_flush_tlb(vcpu); 366 kvm_mmu_flush_tlb(vcpu);
356 return; 367 return;
357 } 368 }
@@ -662,6 +673,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
662 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", 673 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
663 __func__, data); 674 __func__, data);
664 break; 675 break;
676 case MSR_IA32_DEBUGCTLMSR:
677 if (!data) {
678 /* We support the non-activated case already */
679 break;
680 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
681 /* Values other than LBR and BTF are vendor-specific,
682 thus reserved and should throw a #GP */
683 return 1;
684 }
685 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
686 __func__, data);
687 break;
665 case MSR_IA32_UCODE_REV: 688 case MSR_IA32_UCODE_REV:
666 case MSR_IA32_UCODE_WRITE: 689 case MSR_IA32_UCODE_WRITE:
667 break; 690 break;
@@ -692,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
692 /* ...but clean it before doing the actual write */ 715 /* ...but clean it before doing the actual write */
693 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 716 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
694 717
695 down_read(&current->mm->mmap_sem);
696 vcpu->arch.time_page = 718 vcpu->arch.time_page =
697 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 719 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
698 up_read(&current->mm->mmap_sem);
699 720
700 if (is_error_page(vcpu->arch.time_page)) { 721 if (is_error_page(vcpu->arch.time_page)) {
701 kvm_release_page_clean(vcpu->arch.time_page); 722 kvm_release_page_clean(vcpu->arch.time_page);
@@ -752,8 +773,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
752 case MSR_IA32_MC0_MISC+8: 773 case MSR_IA32_MC0_MISC+8:
753 case MSR_IA32_MC0_MISC+12: 774 case MSR_IA32_MC0_MISC+12:
754 case MSR_IA32_MC0_MISC+16: 775 case MSR_IA32_MC0_MISC+16:
776 case MSR_IA32_MC0_MISC+20:
755 case MSR_IA32_UCODE_REV: 777 case MSR_IA32_UCODE_REV:
756 case MSR_IA32_EBL_CR_POWERON: 778 case MSR_IA32_EBL_CR_POWERON:
779 case MSR_IA32_DEBUGCTLMSR:
780 case MSR_IA32_LASTBRANCHFROMIP:
781 case MSR_IA32_LASTBRANCHTOIP:
782 case MSR_IA32_LASTINTFROMIP:
783 case MSR_IA32_LASTINTTOIP:
757 data = 0; 784 data = 0;
758 break; 785 break;
759 case MSR_MTRRcap: 786 case MSR_MTRRcap:
@@ -901,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext)
901 case KVM_CAP_PV_MMU: 928 case KVM_CAP_PV_MMU:
902 r = !tdp_enabled; 929 r = !tdp_enabled;
903 break; 930 break;
931 case KVM_CAP_IOMMU:
932 r = intel_iommu_found();
933 break;
904 default: 934 default:
905 r = 0; 935 r = 0;
906 break; 936 break;
@@ -1303,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1303 struct kvm_vcpu *vcpu = filp->private_data; 1333 struct kvm_vcpu *vcpu = filp->private_data;
1304 void __user *argp = (void __user *)arg; 1334 void __user *argp = (void __user *)arg;
1305 int r; 1335 int r;
1336 struct kvm_lapic_state *lapic = NULL;
1306 1337
1307 switch (ioctl) { 1338 switch (ioctl) {
1308 case KVM_GET_LAPIC: { 1339 case KVM_GET_LAPIC: {
1309 struct kvm_lapic_state lapic; 1340 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1310 1341
1311 memset(&lapic, 0, sizeof lapic); 1342 r = -ENOMEM;
1312 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); 1343 if (!lapic)
1344 goto out;
1345 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1313 if (r) 1346 if (r)
1314 goto out; 1347 goto out;
1315 r = -EFAULT; 1348 r = -EFAULT;
1316 if (copy_to_user(argp, &lapic, sizeof lapic)) 1349 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1317 goto out; 1350 goto out;
1318 r = 0; 1351 r = 0;
1319 break; 1352 break;
1320 } 1353 }
1321 case KVM_SET_LAPIC: { 1354 case KVM_SET_LAPIC: {
1322 struct kvm_lapic_state lapic; 1355 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1323 1356 r = -ENOMEM;
1357 if (!lapic)
1358 goto out;
1324 r = -EFAULT; 1359 r = -EFAULT;
1325 if (copy_from_user(&lapic, argp, sizeof lapic)) 1360 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1326 goto out; 1361 goto out;
1327 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; 1362 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1328 if (r) 1363 if (r)
1329 goto out; 1364 goto out;
1330 r = 0; 1365 r = 0;
@@ -1422,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1422 r = -EINVAL; 1457 r = -EINVAL;
1423 } 1458 }
1424out: 1459out:
1460 if (lapic)
1461 kfree(lapic);
1425 return r; 1462 return r;
1426} 1463}
1427 1464
@@ -1630,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
1630 struct kvm *kvm = filp->private_data; 1667 struct kvm *kvm = filp->private_data;
1631 void __user *argp = (void __user *)arg; 1668 void __user *argp = (void __user *)arg;
1632 int r = -EINVAL; 1669 int r = -EINVAL;
1670 /*
1671 * This union makes it completely explicit to gcc-3.x
1672 * that these two variables' stack usage should be
1673 * combined, not added together.
1674 */
1675 union {
1676 struct kvm_pit_state ps;
1677 struct kvm_memory_alias alias;
1678 } u;
1633 1679
1634 switch (ioctl) { 1680 switch (ioctl) {
1635 case KVM_SET_TSS_ADDR: 1681 case KVM_SET_TSS_ADDR:
@@ -1661,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
1661 case KVM_GET_NR_MMU_PAGES: 1707 case KVM_GET_NR_MMU_PAGES:
1662 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 1708 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1663 break; 1709 break;
1664 case KVM_SET_MEMORY_ALIAS: { 1710 case KVM_SET_MEMORY_ALIAS:
1665 struct kvm_memory_alias alias;
1666
1667 r = -EFAULT; 1711 r = -EFAULT;
1668 if (copy_from_user(&alias, argp, sizeof alias)) 1712 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
1669 goto out; 1713 goto out;
1670 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); 1714 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
1671 if (r) 1715 if (r)
1672 goto out; 1716 goto out;
1673 break; 1717 break;
1674 }
1675 case KVM_CREATE_IRQCHIP: 1718 case KVM_CREATE_IRQCHIP:
1676 r = -ENOMEM; 1719 r = -ENOMEM;
1677 kvm->arch.vpic = kvm_create_pic(kvm); 1720 kvm->arch.vpic = kvm_create_pic(kvm);
@@ -1699,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
1699 goto out; 1742 goto out;
1700 if (irqchip_in_kernel(kvm)) { 1743 if (irqchip_in_kernel(kvm)) {
1701 mutex_lock(&kvm->lock); 1744 mutex_lock(&kvm->lock);
1702 if (irq_event.irq < 16) 1745 kvm_set_irq(kvm, irq_event.irq, irq_event.level);
1703 kvm_pic_set_irq(pic_irqchip(kvm),
1704 irq_event.irq,
1705 irq_event.level);
1706 kvm_ioapic_set_irq(kvm->arch.vioapic,
1707 irq_event.irq,
1708 irq_event.level);
1709 mutex_unlock(&kvm->lock); 1746 mutex_unlock(&kvm->lock);
1710 r = 0; 1747 r = 0;
1711 } 1748 }
@@ -1713,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp,
1713 } 1750 }
1714 case KVM_GET_IRQCHIP: { 1751 case KVM_GET_IRQCHIP: {
1715 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1752 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1716 struct kvm_irqchip chip; 1753 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1717 1754
1718 r = -EFAULT; 1755 r = -ENOMEM;
1719 if (copy_from_user(&chip, argp, sizeof chip)) 1756 if (!chip)
1720 goto out; 1757 goto out;
1758 r = -EFAULT;
1759 if (copy_from_user(chip, argp, sizeof *chip))
1760 goto get_irqchip_out;
1721 r = -ENXIO; 1761 r = -ENXIO;
1722 if (!irqchip_in_kernel(kvm)) 1762 if (!irqchip_in_kernel(kvm))
1723 goto out; 1763 goto get_irqchip_out;
1724 r = kvm_vm_ioctl_get_irqchip(kvm, &chip); 1764 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1725 if (r) 1765 if (r)
1726 goto out; 1766 goto get_irqchip_out;
1727 r = -EFAULT; 1767 r = -EFAULT;
1728 if (copy_to_user(argp, &chip, sizeof chip)) 1768 if (copy_to_user(argp, chip, sizeof *chip))
1729 goto out; 1769 goto get_irqchip_out;
1730 r = 0; 1770 r = 0;
1771 get_irqchip_out:
1772 kfree(chip);
1773 if (r)
1774 goto out;
1731 break; 1775 break;
1732 } 1776 }
1733 case KVM_SET_IRQCHIP: { 1777 case KVM_SET_IRQCHIP: {
1734 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1778 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1735 struct kvm_irqchip chip; 1779 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1736 1780
1737 r = -EFAULT; 1781 r = -ENOMEM;
1738 if (copy_from_user(&chip, argp, sizeof chip)) 1782 if (!chip)
1739 goto out; 1783 goto out;
1784 r = -EFAULT;
1785 if (copy_from_user(chip, argp, sizeof *chip))
1786 goto set_irqchip_out;
1740 r = -ENXIO; 1787 r = -ENXIO;
1741 if (!irqchip_in_kernel(kvm)) 1788 if (!irqchip_in_kernel(kvm))
1742 goto out; 1789 goto set_irqchip_out;
1743 r = kvm_vm_ioctl_set_irqchip(kvm, &chip); 1790 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
1744 if (r) 1791 if (r)
1745 goto out; 1792 goto set_irqchip_out;
1746 r = 0; 1793 r = 0;
1794 set_irqchip_out:
1795 kfree(chip);
1796 if (r)
1797 goto out;
1747 break; 1798 break;
1748 } 1799 }
1749 case KVM_GET_PIT: { 1800 case KVM_GET_PIT: {
1750 struct kvm_pit_state ps;
1751 r = -EFAULT; 1801 r = -EFAULT;
1752 if (copy_from_user(&ps, argp, sizeof ps)) 1802 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
1753 goto out; 1803 goto out;
1754 r = -ENXIO; 1804 r = -ENXIO;
1755 if (!kvm->arch.vpit) 1805 if (!kvm->arch.vpit)
1756 goto out; 1806 goto out;
1757 r = kvm_vm_ioctl_get_pit(kvm, &ps); 1807 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
1758 if (r) 1808 if (r)
1759 goto out; 1809 goto out;
1760 r = -EFAULT; 1810 r = -EFAULT;
1761 if (copy_to_user(argp, &ps, sizeof ps)) 1811 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
1762 goto out; 1812 goto out;
1763 r = 0; 1813 r = 0;
1764 break; 1814 break;
1765 } 1815 }
1766 case KVM_SET_PIT: { 1816 case KVM_SET_PIT: {
1767 struct kvm_pit_state ps;
1768 r = -EFAULT; 1817 r = -EFAULT;
1769 if (copy_from_user(&ps, argp, sizeof ps)) 1818 if (copy_from_user(&u.ps, argp, sizeof u.ps))
1770 goto out; 1819 goto out;
1771 r = -ENXIO; 1820 r = -ENXIO;
1772 if (!kvm->arch.vpit) 1821 if (!kvm->arch.vpit)
1773 goto out; 1822 goto out;
1774 r = kvm_vm_ioctl_set_pit(kvm, &ps); 1823 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
1775 if (r) 1824 if (r)
1776 goto out; 1825 goto out;
1777 r = 0; 1826 r = 0;
@@ -2018,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
2018 2067
2019 val = *(u64 *)new; 2068 val = *(u64 *)new;
2020 2069
2021 down_read(&current->mm->mmap_sem);
2022 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2070 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2023 up_read(&current->mm->mmap_sem);
2024 2071
2025 kaddr = kmap_atomic(page, KM_USER0); 2072 kaddr = kmap_atomic(page, KM_USER0);
2026 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2073 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
@@ -2040,6 +2087,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2040 2087
2041int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2088int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2042{ 2089{
2090 kvm_mmu_invlpg(vcpu, address);
2043 return X86EMUL_CONTINUE; 2091 return X86EMUL_CONTINUE;
2044} 2092}
2045 2093
@@ -2080,7 +2128,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2080void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2128void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2081{ 2129{
2082 u8 opcodes[4]; 2130 u8 opcodes[4];
2083 unsigned long rip = vcpu->arch.rip; 2131 unsigned long rip = kvm_rip_read(vcpu);
2084 unsigned long rip_linear; 2132 unsigned long rip_linear;
2085 2133
2086 if (!printk_ratelimit()) 2134 if (!printk_ratelimit())
@@ -2102,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = {
2102 .cmpxchg_emulated = emulator_cmpxchg_emulated, 2150 .cmpxchg_emulated = emulator_cmpxchg_emulated,
2103}; 2151};
2104 2152
2153static void cache_all_regs(struct kvm_vcpu *vcpu)
2154{
2155 kvm_register_read(vcpu, VCPU_REGS_RAX);
2156 kvm_register_read(vcpu, VCPU_REGS_RSP);
2157 kvm_register_read(vcpu, VCPU_REGS_RIP);
2158 vcpu->arch.regs_dirty = ~0;
2159}
2160
2105int emulate_instruction(struct kvm_vcpu *vcpu, 2161int emulate_instruction(struct kvm_vcpu *vcpu,
2106 struct kvm_run *run, 2162 struct kvm_run *run,
2107 unsigned long cr2, 2163 unsigned long cr2,
@@ -2111,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2111 int r; 2167 int r;
2112 struct decode_cache *c; 2168 struct decode_cache *c;
2113 2169
2170 kvm_clear_exception_queue(vcpu);
2114 vcpu->arch.mmio_fault_cr2 = cr2; 2171 vcpu->arch.mmio_fault_cr2 = cr2;
2115 kvm_x86_ops->cache_regs(vcpu); 2172 /*
2173 * TODO: fix x86_emulate.c to use guest_read/write_register
2174 * instead of direct ->regs accesses, can save hundred cycles
2175 * on Intel for instructions that don't read/change RSP, for
2176 * for example.
2177 */
2178 cache_all_regs(vcpu);
2116 2179
2117 vcpu->mmio_is_write = 0; 2180 vcpu->mmio_is_write = 0;
2118 vcpu->arch.pio.string = 0; 2181 vcpu->arch.pio.string = 0;
@@ -2172,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2172 return EMULATE_DO_MMIO; 2235 return EMULATE_DO_MMIO;
2173 } 2236 }
2174 2237
2175 kvm_x86_ops->decache_regs(vcpu);
2176 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2238 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2177 2239
2178 if (vcpu->mmio_is_write) { 2240 if (vcpu->mmio_is_write) {
@@ -2225,20 +2287,19 @@ int complete_pio(struct kvm_vcpu *vcpu)
2225 struct kvm_pio_request *io = &vcpu->arch.pio; 2287 struct kvm_pio_request *io = &vcpu->arch.pio;
2226 long delta; 2288 long delta;
2227 int r; 2289 int r;
2228 2290 unsigned long val;
2229 kvm_x86_ops->cache_regs(vcpu);
2230 2291
2231 if (!io->string) { 2292 if (!io->string) {
2232 if (io->in) 2293 if (io->in) {
2233 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, 2294 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2234 io->size); 2295 memcpy(&val, vcpu->arch.pio_data, io->size);
2296 kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2297 }
2235 } else { 2298 } else {
2236 if (io->in) { 2299 if (io->in) {
2237 r = pio_copy_data(vcpu); 2300 r = pio_copy_data(vcpu);
2238 if (r) { 2301 if (r)
2239 kvm_x86_ops->cache_regs(vcpu);
2240 return r; 2302 return r;
2241 }
2242 } 2303 }
2243 2304
2244 delta = 1; 2305 delta = 1;
@@ -2248,19 +2309,24 @@ int complete_pio(struct kvm_vcpu *vcpu)
2248 * The size of the register should really depend on 2309 * The size of the register should really depend on
2249 * current address size. 2310 * current address size.
2250 */ 2311 */
2251 vcpu->arch.regs[VCPU_REGS_RCX] -= delta; 2312 val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2313 val -= delta;
2314 kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2252 } 2315 }
2253 if (io->down) 2316 if (io->down)
2254 delta = -delta; 2317 delta = -delta;
2255 delta *= io->size; 2318 delta *= io->size;
2256 if (io->in) 2319 if (io->in) {
2257 vcpu->arch.regs[VCPU_REGS_RDI] += delta; 2320 val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2258 else 2321 val += delta;
2259 vcpu->arch.regs[VCPU_REGS_RSI] += delta; 2322 kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2323 } else {
2324 val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2325 val += delta;
2326 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2327 }
2260 } 2328 }
2261 2329
2262 kvm_x86_ops->decache_regs(vcpu);
2263
2264 io->count -= io->cur_count; 2330 io->count -= io->cur_count;
2265 io->cur_count = 0; 2331 io->cur_count = 0;
2266 2332
@@ -2313,6 +2379,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2313 int size, unsigned port) 2379 int size, unsigned port)
2314{ 2380{
2315 struct kvm_io_device *pio_dev; 2381 struct kvm_io_device *pio_dev;
2382 unsigned long val;
2316 2383
2317 vcpu->run->exit_reason = KVM_EXIT_IO; 2384 vcpu->run->exit_reason = KVM_EXIT_IO;
2318 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2385 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2333,8 +2400,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2333 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2400 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2334 handler); 2401 handler);
2335 2402
2336 kvm_x86_ops->cache_regs(vcpu); 2403 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2337 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); 2404 memcpy(vcpu->arch.pio_data, &val, 4);
2338 2405
2339 kvm_x86_ops->skip_emulated_instruction(vcpu); 2406 kvm_x86_ops->skip_emulated_instruction(vcpu);
2340 2407
@@ -2492,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2492 KVMTRACE_0D(HLT, vcpu, handler); 2559 KVMTRACE_0D(HLT, vcpu, handler);
2493 if (irqchip_in_kernel(vcpu->kvm)) { 2560 if (irqchip_in_kernel(vcpu->kvm)) {
2494 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 2561 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2495 up_read(&vcpu->kvm->slots_lock);
2496 kvm_vcpu_block(vcpu);
2497 down_read(&vcpu->kvm->slots_lock);
2498 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
2499 return -EINTR;
2500 return 1; 2562 return 1;
2501 } else { 2563 } else {
2502 vcpu->run->exit_reason = KVM_EXIT_HLT; 2564 vcpu->run->exit_reason = KVM_EXIT_HLT;
@@ -2519,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2519 unsigned long nr, a0, a1, a2, a3, ret; 2581 unsigned long nr, a0, a1, a2, a3, ret;
2520 int r = 1; 2582 int r = 1;
2521 2583
2522 kvm_x86_ops->cache_regs(vcpu); 2584 nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
2523 2585 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
2524 nr = vcpu->arch.regs[VCPU_REGS_RAX]; 2586 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
2525 a0 = vcpu->arch.regs[VCPU_REGS_RBX]; 2587 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2526 a1 = vcpu->arch.regs[VCPU_REGS_RCX]; 2588 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
2527 a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2528 a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2529 2589
2530 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 2590 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2531 2591
@@ -2548,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2548 ret = -KVM_ENOSYS; 2608 ret = -KVM_ENOSYS;
2549 break; 2609 break;
2550 } 2610 }
2551 vcpu->arch.regs[VCPU_REGS_RAX] = ret; 2611 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
2552 kvm_x86_ops->decache_regs(vcpu);
2553 ++vcpu->stat.hypercalls; 2612 ++vcpu->stat.hypercalls;
2554 return r; 2613 return r;
2555} 2614}
@@ -2559,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2559{ 2618{
2560 char instruction[3]; 2619 char instruction[3];
2561 int ret = 0; 2620 int ret = 0;
2621 unsigned long rip = kvm_rip_read(vcpu);
2562 2622
2563 2623
2564 /* 2624 /*
@@ -2568,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2568 */ 2628 */
2569 kvm_mmu_zap_all(vcpu->kvm); 2629 kvm_mmu_zap_all(vcpu->kvm);
2570 2630
2571 kvm_x86_ops->cache_regs(vcpu);
2572 kvm_x86_ops->patch_hypercall(vcpu, instruction); 2631 kvm_x86_ops->patch_hypercall(vcpu, instruction);
2573 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) 2632 if (emulator_write_emulated(rip, instruction, 3, vcpu)
2574 != X86EMUL_CONTINUE) 2633 != X86EMUL_CONTINUE)
2575 ret = -EFAULT; 2634 ret = -EFAULT;
2576 2635
@@ -2700,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2700 u32 function, index; 2759 u32 function, index;
2701 struct kvm_cpuid_entry2 *e, *best; 2760 struct kvm_cpuid_entry2 *e, *best;
2702 2761
2703 kvm_x86_ops->cache_regs(vcpu); 2762 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
2704 function = vcpu->arch.regs[VCPU_REGS_RAX]; 2763 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
2705 index = vcpu->arch.regs[VCPU_REGS_RCX]; 2764 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
2706 vcpu->arch.regs[VCPU_REGS_RAX] = 0; 2765 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
2707 vcpu->arch.regs[VCPU_REGS_RBX] = 0; 2766 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
2708 vcpu->arch.regs[VCPU_REGS_RCX] = 0; 2767 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
2709 vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2710 best = NULL; 2768 best = NULL;
2711 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 2769 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2712 e = &vcpu->arch.cpuid_entries[i]; 2770 e = &vcpu->arch.cpuid_entries[i];
@@ -2724,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2724 best = e; 2782 best = e;
2725 } 2783 }
2726 if (best) { 2784 if (best) {
2727 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; 2785 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
2728 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; 2786 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
2729 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; 2787 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
2730 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; 2788 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
2731 } 2789 }
2732 kvm_x86_ops->decache_regs(vcpu);
2733 kvm_x86_ops->skip_emulated_instruction(vcpu); 2790 kvm_x86_ops->skip_emulated_instruction(vcpu);
2734 KVMTRACE_5D(CPUID, vcpu, function, 2791 KVMTRACE_5D(CPUID, vcpu, function,
2735 (u32)vcpu->arch.regs[VCPU_REGS_RAX], 2792 (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
2736 (u32)vcpu->arch.regs[VCPU_REGS_RBX], 2793 (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
2737 (u32)vcpu->arch.regs[VCPU_REGS_RCX], 2794 (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
2738 (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); 2795 (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
2739} 2796}
2740EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 2797EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2741 2798
@@ -2776,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
2776 if (!apic || !apic->vapic_addr) 2833 if (!apic || !apic->vapic_addr)
2777 return; 2834 return;
2778 2835
2779 down_read(&current->mm->mmap_sem);
2780 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2836 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2781 up_read(&current->mm->mmap_sem);
2782 2837
2783 vcpu->arch.apic->vapic_page = page; 2838 vcpu->arch.apic->vapic_page = page;
2784} 2839}
@@ -2796,28 +2851,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
2796 up_read(&vcpu->kvm->slots_lock); 2851 up_read(&vcpu->kvm->slots_lock);
2797} 2852}
2798 2853
2799static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2854static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2800{ 2855{
2801 int r; 2856 int r;
2802 2857
2803 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
2804 pr_debug("vcpu %d received sipi with vector # %x\n",
2805 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2806 kvm_lapic_reset(vcpu);
2807 r = kvm_x86_ops->vcpu_reset(vcpu);
2808 if (r)
2809 return r;
2810 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2811 }
2812
2813 down_read(&vcpu->kvm->slots_lock);
2814 vapic_enter(vcpu);
2815
2816preempted:
2817 if (vcpu->guest_debug.enabled)
2818 kvm_x86_ops->guest_debug_pre(vcpu);
2819
2820again:
2821 if (vcpu->requests) 2858 if (vcpu->requests)
2822 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 2859 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
2823 kvm_mmu_unload(vcpu); 2860 kvm_mmu_unload(vcpu);
@@ -2829,6 +2866,8 @@ again:
2829 if (vcpu->requests) { 2866 if (vcpu->requests) {
2830 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 2867 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2831 __kvm_migrate_timers(vcpu); 2868 __kvm_migrate_timers(vcpu);
2869 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
2870 kvm_mmu_sync_roots(vcpu);
2832 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 2871 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2833 kvm_x86_ops->tlb_flush(vcpu); 2872 kvm_x86_ops->tlb_flush(vcpu);
2834 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 2873 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
@@ -2854,21 +2893,15 @@ again:
2854 2893
2855 local_irq_disable(); 2894 local_irq_disable();
2856 2895
2857 if (vcpu->requests || need_resched()) { 2896 if (vcpu->requests || need_resched() || signal_pending(current)) {
2858 local_irq_enable(); 2897 local_irq_enable();
2859 preempt_enable(); 2898 preempt_enable();
2860 r = 1; 2899 r = 1;
2861 goto out; 2900 goto out;
2862 } 2901 }
2863 2902
2864 if (signal_pending(current)) { 2903 if (vcpu->guest_debug.enabled)
2865 local_irq_enable(); 2904 kvm_x86_ops->guest_debug_pre(vcpu);
2866 preempt_enable();
2867 r = -EINTR;
2868 kvm_run->exit_reason = KVM_EXIT_INTR;
2869 ++vcpu->stat.signal_exits;
2870 goto out;
2871 }
2872 2905
2873 vcpu->guest_mode = 1; 2906 vcpu->guest_mode = 1;
2874 /* 2907 /*
@@ -2917,8 +2950,8 @@ again:
2917 * Profile KVM exit RIPs: 2950 * Profile KVM exit RIPs:
2918 */ 2951 */
2919 if (unlikely(prof_on == KVM_PROFILING)) { 2952 if (unlikely(prof_on == KVM_PROFILING)) {
2920 kvm_x86_ops->cache_regs(vcpu); 2953 unsigned long rip = kvm_rip_read(vcpu);
2921 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); 2954 profile_hit(KVM_PROFILING, (void *)rip);
2922 } 2955 }
2923 2956
2924 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) 2957 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
@@ -2927,26 +2960,63 @@ again:
2927 kvm_lapic_sync_from_vapic(vcpu); 2960 kvm_lapic_sync_from_vapic(vcpu);
2928 2961
2929 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 2962 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2963out:
2964 return r;
2965}
2930 2966
2931 if (r > 0) { 2967static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2932 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 2968{
2933 r = -EINTR; 2969 int r;
2934 kvm_run->exit_reason = KVM_EXIT_INTR; 2970
2935 ++vcpu->stat.request_irq_exits; 2971 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
2936 goto out; 2972 pr_debug("vcpu %d received sipi with vector # %x\n",
2937 } 2973 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2938 if (!need_resched()) 2974 kvm_lapic_reset(vcpu);
2939 goto again; 2975 r = kvm_x86_ops->vcpu_reset(vcpu);
2976 if (r)
2977 return r;
2978 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2940 } 2979 }
2941 2980
2942out: 2981 down_read(&vcpu->kvm->slots_lock);
2943 up_read(&vcpu->kvm->slots_lock); 2982 vapic_enter(vcpu);
2944 if (r > 0) { 2983
2945 kvm_resched(vcpu); 2984 r = 1;
2946 down_read(&vcpu->kvm->slots_lock); 2985 while (r > 0) {
2947 goto preempted; 2986 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
2987 r = vcpu_enter_guest(vcpu, kvm_run);
2988 else {
2989 up_read(&vcpu->kvm->slots_lock);
2990 kvm_vcpu_block(vcpu);
2991 down_read(&vcpu->kvm->slots_lock);
2992 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
2993 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
2994 vcpu->arch.mp_state =
2995 KVM_MP_STATE_RUNNABLE;
2996 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
2997 r = -EINTR;
2998 }
2999
3000 if (r > 0) {
3001 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3002 r = -EINTR;
3003 kvm_run->exit_reason = KVM_EXIT_INTR;
3004 ++vcpu->stat.request_irq_exits;
3005 }
3006 if (signal_pending(current)) {
3007 r = -EINTR;
3008 kvm_run->exit_reason = KVM_EXIT_INTR;
3009 ++vcpu->stat.signal_exits;
3010 }
3011 if (need_resched()) {
3012 up_read(&vcpu->kvm->slots_lock);
3013 kvm_resched(vcpu);
3014 down_read(&vcpu->kvm->slots_lock);
3015 }
3016 }
2948 } 3017 }
2949 3018
3019 up_read(&vcpu->kvm->slots_lock);
2950 post_kvm_run_save(vcpu, kvm_run); 3020 post_kvm_run_save(vcpu, kvm_run);
2951 3021
2952 vapic_exit(vcpu); 3022 vapic_exit(vcpu);
@@ -2966,6 +3036,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2966 3036
2967 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3037 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
2968 kvm_vcpu_block(vcpu); 3038 kvm_vcpu_block(vcpu);
3039 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
2969 r = -EAGAIN; 3040 r = -EAGAIN;
2970 goto out; 3041 goto out;
2971 } 3042 }
@@ -2999,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2999 } 3070 }
3000 } 3071 }
3001#endif 3072#endif
3002 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { 3073 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3003 kvm_x86_ops->cache_regs(vcpu); 3074 kvm_register_write(vcpu, VCPU_REGS_RAX,
3004 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; 3075 kvm_run->hypercall.ret);
3005 kvm_x86_ops->decache_regs(vcpu);
3006 }
3007 3076
3008 r = __vcpu_run(vcpu, kvm_run); 3077 r = __vcpu_run(vcpu, kvm_run);
3009 3078
@@ -3019,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3019{ 3088{
3020 vcpu_load(vcpu); 3089 vcpu_load(vcpu);
3021 3090
3022 kvm_x86_ops->cache_regs(vcpu); 3091 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3023 3092 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3024 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; 3093 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3025 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; 3094 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3026 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; 3095 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3027 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; 3096 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3028 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; 3097 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3029 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; 3098 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3030 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3031 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
3032#ifdef CONFIG_X86_64 3099#ifdef CONFIG_X86_64
3033 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; 3100 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3034 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; 3101 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3035 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; 3102 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3036 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; 3103 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3037 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; 3104 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3038 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; 3105 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3039 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; 3106 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3040 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; 3107 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
3041#endif 3108#endif
3042 3109
3043 regs->rip = vcpu->arch.rip; 3110 regs->rip = kvm_rip_read(vcpu);
3044 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 3111 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3045 3112
3046 /* 3113 /*
@@ -3058,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3058{ 3125{
3059 vcpu_load(vcpu); 3126 vcpu_load(vcpu);
3060 3127
3061 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; 3128 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3062 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; 3129 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3063 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; 3130 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3064 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; 3131 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3065 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; 3132 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3066 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; 3133 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3067 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; 3134 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3068 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; 3135 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
3069#ifdef CONFIG_X86_64 3136#ifdef CONFIG_X86_64
3070 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; 3137 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3071 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; 3138 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3072 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; 3139 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3073 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; 3140 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3074 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; 3141 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3075 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; 3142 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3076 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; 3143 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3077 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; 3144 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3145
3078#endif 3146#endif
3079 3147
3080 vcpu->arch.rip = regs->rip; 3148 kvm_rip_write(vcpu, regs->rip);
3081 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3149 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3082 3150
3083 kvm_x86_ops->decache_regs(vcpu);
3084 3151
3085 vcpu->arch.exception.pending = false; 3152 vcpu->arch.exception.pending = false;
3086 3153
@@ -3294,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3294 return 0; 3361 return 0;
3295} 3362}
3296 3363
3364static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
3365{
3366 struct kvm_segment segvar = {
3367 .base = selector << 4,
3368 .limit = 0xffff,
3369 .selector = selector,
3370 .type = 3,
3371 .present = 1,
3372 .dpl = 3,
3373 .db = 0,
3374 .s = 1,
3375 .l = 0,
3376 .g = 0,
3377 .avl = 0,
3378 .unusable = 0,
3379 };
3380 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
3381 return 0;
3382}
3383
3297int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3384int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3298 int type_bits, int seg) 3385 int type_bits, int seg)
3299{ 3386{
3300 struct kvm_segment kvm_seg; 3387 struct kvm_segment kvm_seg;
3301 3388
3389 if (!(vcpu->arch.cr0 & X86_CR0_PE))
3390 return kvm_load_realmode_segment(vcpu, selector, seg);
3302 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 3391 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3303 return 1; 3392 return 1;
3304 kvm_seg.type |= type_bits; 3393 kvm_seg.type |= type_bits;
@@ -3316,17 +3405,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3316 struct tss_segment_32 *tss) 3405 struct tss_segment_32 *tss)
3317{ 3406{
3318 tss->cr3 = vcpu->arch.cr3; 3407 tss->cr3 = vcpu->arch.cr3;
3319 tss->eip = vcpu->arch.rip; 3408 tss->eip = kvm_rip_read(vcpu);
3320 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 3409 tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3321 tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; 3410 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3322 tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3411 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3323 tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; 3412 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3324 tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; 3413 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3325 tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; 3414 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3326 tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; 3415 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3327 tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; 3416 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3328 tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; 3417 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3329
3330 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3418 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3331 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3419 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3332 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3420 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
@@ -3342,17 +3430,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3342{ 3430{
3343 kvm_set_cr3(vcpu, tss->cr3); 3431 kvm_set_cr3(vcpu, tss->cr3);
3344 3432
3345 vcpu->arch.rip = tss->eip; 3433 kvm_rip_write(vcpu, tss->eip);
3346 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 3434 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3347 3435
3348 vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; 3436 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
3349 vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; 3437 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
3350 vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; 3438 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
3351 vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; 3439 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
3352 vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; 3440 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
3353 vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; 3441 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
3354 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; 3442 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
3355 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; 3443 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
3356 3444
3357 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3445 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3358 return 1; 3446 return 1;
@@ -3380,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3380static void save_state_to_tss16(struct kvm_vcpu *vcpu, 3468static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3381 struct tss_segment_16 *tss) 3469 struct tss_segment_16 *tss)
3382{ 3470{
3383 tss->ip = vcpu->arch.rip; 3471 tss->ip = kvm_rip_read(vcpu);
3384 tss->flag = kvm_x86_ops->get_rflags(vcpu); 3472 tss->flag = kvm_x86_ops->get_rflags(vcpu);
3385 tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; 3473 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3386 tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; 3474 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3387 tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; 3475 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3388 tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; 3476 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3389 tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; 3477 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3390 tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; 3478 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3391 tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; 3479 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
3392 tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; 3480 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
3393 3481
3394 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3482 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3395 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3483 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
@@ -3402,16 +3490,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3402static int load_state_from_tss16(struct kvm_vcpu *vcpu, 3490static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3403 struct tss_segment_16 *tss) 3491 struct tss_segment_16 *tss)
3404{ 3492{
3405 vcpu->arch.rip = tss->ip; 3493 kvm_rip_write(vcpu, tss->ip);
3406 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 3494 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3407 vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; 3495 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
3408 vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; 3496 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
3409 vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; 3497 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
3410 vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; 3498 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
3411 vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; 3499 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
3412 vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; 3500 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
3413 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; 3501 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
3414 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; 3502 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
3415 3503
3416 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3504 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3417 return 1; 3505 return 1;
@@ -3534,7 +3622,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3534 } 3622 }
3535 3623
3536 kvm_x86_ops->skip_emulated_instruction(vcpu); 3624 kvm_x86_ops->skip_emulated_instruction(vcpu);
3537 kvm_x86_ops->cache_regs(vcpu);
3538 3625
3539 if (nseg_desc.type & 8) 3626 if (nseg_desc.type & 8)
3540 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, 3627 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
@@ -3559,7 +3646,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3559 tr_seg.type = 11; 3646 tr_seg.type = 11;
3560 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3647 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3561out: 3648out:
3562 kvm_x86_ops->decache_regs(vcpu);
3563 return ret; 3649 return ret;
3564} 3650}
3565EXPORT_SYMBOL_GPL(kvm_task_switch); 3651EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -3622,6 +3708,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3622 pr_debug("Set back pending irq %d\n", 3708 pr_debug("Set back pending irq %d\n",
3623 pending_vec); 3709 pending_vec);
3624 } 3710 }
3711 kvm_pic_clear_isr_ack(vcpu->kvm);
3625 } 3712 }
3626 3713
3627 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3714 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -3634,6 +3721,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3634 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3721 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3635 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3722 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3636 3723
3724 /* Older userspace won't unhalt the vcpu on reset. */
3725 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
3726 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
3727 !(vcpu->arch.cr0 & X86_CR0_PE))
3728 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3729
3637 vcpu_put(vcpu); 3730 vcpu_put(vcpu);
3638 3731
3639 return 0; 3732 return 0;
@@ -3918,6 +4011,7 @@ struct kvm *kvm_arch_create_vm(void)
3918 return ERR_PTR(-ENOMEM); 4011 return ERR_PTR(-ENOMEM);
3919 4012
3920 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4013 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4014 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
3921 4015
3922 return kvm; 4016 return kvm;
3923} 4017}
@@ -3950,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
3950 4044
3951void kvm_arch_destroy_vm(struct kvm *kvm) 4045void kvm_arch_destroy_vm(struct kvm *kvm)
3952{ 4046{
4047 kvm_iommu_unmap_guest(kvm);
4048 kvm_free_all_assigned_devices(kvm);
3953 kvm_free_pit(kvm); 4049 kvm_free_pit(kvm);
3954 kfree(kvm->arch.vpic); 4050 kfree(kvm->arch.vpic);
3955 kfree(kvm->arch.vioapic); 4051 kfree(kvm->arch.vioapic);
@@ -3981,7 +4077,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
3981 userspace_addr = do_mmap(NULL, 0, 4077 userspace_addr = do_mmap(NULL, 0,
3982 npages * PAGE_SIZE, 4078 npages * PAGE_SIZE,
3983 PROT_READ | PROT_WRITE, 4079 PROT_READ | PROT_WRITE,
3984 MAP_SHARED | MAP_ANONYMOUS, 4080 MAP_PRIVATE | MAP_ANONYMOUS,
3985 0); 4081 0);
3986 up_write(&current->mm->mmap_sem); 4082 up_write(&current->mm->mmap_sem);
3987 4083
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
new file mode 100644
index 000000000000..6a4be78a7384
--- /dev/null
+++ b/arch/x86/kvm/x86.h
@@ -0,0 +1,22 @@
1#ifndef ARCH_X86_KVM_X86_H
2#define ARCH_X86_KVM_X86_H
3
4#include <linux/kvm_host.h>
5
6static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
7{
8 vcpu->arch.exception.pending = false;
9}
10
11static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
12{
13 vcpu->arch.interrupt.pending = true;
14 vcpu->arch.interrupt.nr = vector;
15}
16
17static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
18{
19 vcpu->arch.interrupt.pending = false;
20}
21
22#endif
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index f2f90468f8b1..ea051173b0da 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -26,6 +26,7 @@
26#define DPRINTF(_f, _a ...) printf(_f , ## _a) 26#define DPRINTF(_f, _a ...) printf(_f , ## _a)
27#else 27#else
28#include <linux/kvm_host.h> 28#include <linux/kvm_host.h>
29#include "kvm_cache_regs.h"
29#define DPRINTF(x...) do {} while (0) 30#define DPRINTF(x...) do {} while (0)
30#endif 31#endif
31#include <linux/module.h> 32#include <linux/module.h>
@@ -46,25 +47,26 @@
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ 47#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */ 48#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */ 49#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1) 50#define DstAcc (4<<1) /* Destination Accumulator */
51#define DstMask (7<<1)
50/* Source operand type. */ 52/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */ 53#define SrcNone (0<<4) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ 54#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */ 55#define SrcReg (1<<4) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */ 56#define SrcMem (2<<4) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ 57#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ 58#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */ 59#define SrcImm (5<<4) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ 60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3) 61#define SrcMask (7<<4)
60/* Generic ModRM decode. */ 62/* Generic ModRM decode. */
61#define ModRM (1<<6) 63#define ModRM (1<<7)
62/* Destination is only written; never read. */ 64/* Destination is only written; never read. */
63#define Mov (1<<7) 65#define Mov (1<<8)
64#define BitOp (1<<8) 66#define BitOp (1<<9)
65#define MemAbs (1<<9) /* Memory operand is absolute displacement */ 67#define MemAbs (1<<10) /* Memory operand is absolute displacement */
66#define String (1<<10) /* String instruction (rep capable) */ 68#define String (1<<12) /* String instruction (rep capable) */
67#define Stack (1<<11) /* Stack instruction (push/pop) */ 69#define Stack (1<<13) /* Stack instruction (push/pop) */
68#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 70#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
69#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 71#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
70#define GroupMask 0xff /* Group number stored in bits 0:7 */ 72#define GroupMask 0xff /* Group number stored in bits 0:7 */
@@ -94,7 +96,7 @@ static u16 opcode_table[256] = {
94 /* 0x20 - 0x27 */ 96 /* 0x20 - 0x27 */
95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 97 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 98 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
97 SrcImmByte, SrcImm, 0, 0, 99 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
98 /* 0x28 - 0x2F */ 100 /* 0x28 - 0x2F */
99 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
100 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -106,7 +108,8 @@ static u16 opcode_table[256] = {
106 /* 0x38 - 0x3F */ 108 /* 0x38 - 0x3F */
107 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
108 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
109 0, 0, 0, 0, 111 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
112 0, 0,
110 /* 0x40 - 0x47 */ 113 /* 0x40 - 0x47 */
111 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 114 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
112 /* 0x48 - 0x4F */ 115 /* 0x48 - 0x4F */
@@ -153,9 +156,16 @@ static u16 opcode_table[256] = {
153 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 156 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
154 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 157 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
155 ByteOp | ImplicitOps | String, ImplicitOps | String, 158 ByteOp | ImplicitOps | String, ImplicitOps | String,
156 /* 0xB0 - 0xBF */ 159 /* 0xB0 - 0xB7 */
157 0, 0, 0, 0, 0, 0, 0, 0, 160 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
158 DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0, 161 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
162 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
163 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
164 /* 0xB8 - 0xBF */
165 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
166 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
167 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
168 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
159 /* 0xC0 - 0xC7 */ 169 /* 0xC0 - 0xC7 */
160 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 170 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
161 0, ImplicitOps | Stack, 0, 0, 171 0, ImplicitOps | Stack, 0, 0,
@@ -169,17 +179,20 @@ static u16 opcode_table[256] = {
169 /* 0xD8 - 0xDF */ 179 /* 0xD8 - 0xDF */
170 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0,
171 /* 0xE0 - 0xE7 */ 181 /* 0xE0 - 0xE7 */
172 0, 0, 0, 0, 0, 0, 0, 0, 182 0, 0, 0, 0,
183 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
184 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
173 /* 0xE8 - 0xEF */ 185 /* 0xE8 - 0xEF */
174 ImplicitOps | Stack, SrcImm | ImplicitOps, 186 ImplicitOps | Stack, SrcImm | ImplicitOps,
175 ImplicitOps, SrcImmByte | ImplicitOps, 187 ImplicitOps, SrcImmByte | ImplicitOps,
176 0, 0, 0, 0, 188 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
189 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
177 /* 0xF0 - 0xF7 */ 190 /* 0xF0 - 0xF7 */
178 0, 0, 0, 0, 191 0, 0, 0, 0,
179 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, 192 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
180 /* 0xF8 - 0xFF */ 193 /* 0xF8 - 0xFF */
181 ImplicitOps, 0, ImplicitOps, ImplicitOps, 194 ImplicitOps, 0, ImplicitOps, ImplicitOps,
182 0, 0, Group | Group4, Group | Group5, 195 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
183}; 196};
184 197
185static u16 twobyte_table[256] = { 198static u16 twobyte_table[256] = {
@@ -268,15 +281,16 @@ static u16 group_table[] = {
268 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 281 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
269 0, 0, 0, 0, 282 0, 0, 0, 0,
270 [Group3*8] = 283 [Group3*8] =
271 DstMem | SrcImm | ModRM | SrcImm, 0, 284 DstMem | SrcImm | ModRM, 0,
272 DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 285 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
273 0, 0, 0, 0, 286 0, 0, 0, 0,
274 [Group4*8] = 287 [Group4*8] =
275 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 288 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
276 0, 0, 0, 0, 0, 0, 289 0, 0, 0, 0, 0, 0,
277 [Group5*8] = 290 [Group5*8] =
278 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 291 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
279 SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, 292 SrcMem | ModRM | Stack, 0,
293 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
280 [Group7*8] = 294 [Group7*8] =
281 0, 0, ModRM | SrcMem, ModRM | SrcMem, 295 0, 0, ModRM | SrcMem, ModRM | SrcMem,
282 SrcNone | ModRM | DstMem | Mov, 0, 296 SrcNone | ModRM | DstMem | Mov, 0,
@@ -839,7 +853,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
839 /* Shadow copy of register state. Committed on successful emulation. */ 853 /* Shadow copy of register state. Committed on successful emulation. */
840 854
841 memset(c, 0, sizeof(struct decode_cache)); 855 memset(c, 0, sizeof(struct decode_cache));
842 c->eip = ctxt->vcpu->arch.rip; 856 c->eip = kvm_rip_read(ctxt->vcpu);
843 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 857 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
844 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 858 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
845 859
@@ -1048,6 +1062,23 @@ done_prefixes:
1048 } 1062 }
1049 c->dst.type = OP_MEM; 1063 c->dst.type = OP_MEM;
1050 break; 1064 break;
1065 case DstAcc:
1066 c->dst.type = OP_REG;
1067 c->dst.bytes = c->op_bytes;
1068 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1069 switch (c->op_bytes) {
1070 case 1:
1071 c->dst.val = *(u8 *)c->dst.ptr;
1072 break;
1073 case 2:
1074 c->dst.val = *(u16 *)c->dst.ptr;
1075 break;
1076 case 4:
1077 c->dst.val = *(u32 *)c->dst.ptr;
1078 break;
1079 }
1080 c->dst.orig_val = c->dst.val;
1081 break;
1051 } 1082 }
1052 1083
1053 if (c->rip_relative) 1084 if (c->rip_relative)
@@ -1151,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1151 case 1: /* dec */ 1182 case 1: /* dec */
1152 emulate_1op("dec", c->dst, ctxt->eflags); 1183 emulate_1op("dec", c->dst, ctxt->eflags);
1153 break; 1184 break;
1185 case 2: /* call near abs */ {
1186 long int old_eip;
1187 old_eip = c->eip;
1188 c->eip = c->src.val;
1189 c->src.val = old_eip;
1190 emulate_push(ctxt);
1191 break;
1192 }
1154 case 4: /* jmp abs */ 1193 case 4: /* jmp abs */
1155 c->eip = c->src.val; 1194 c->eip = c->src.val;
1156 break; 1195 break;
@@ -1251,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1251 u64 msr_data; 1290 u64 msr_data;
1252 unsigned long saved_eip = 0; 1291 unsigned long saved_eip = 0;
1253 struct decode_cache *c = &ctxt->decode; 1292 struct decode_cache *c = &ctxt->decode;
1293 unsigned int port;
1294 int io_dir_in;
1254 int rc = 0; 1295 int rc = 0;
1255 1296
1256 /* Shadow copy of register state. Committed on successful emulation. 1297 /* Shadow copy of register state. Committed on successful emulation.
@@ -1267,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1267 if (c->rep_prefix && (c->d & String)) { 1308 if (c->rep_prefix && (c->d & String)) {
1268 /* All REP prefixes have the same first termination condition */ 1309 /* All REP prefixes have the same first termination condition */
1269 if (c->regs[VCPU_REGS_RCX] == 0) { 1310 if (c->regs[VCPU_REGS_RCX] == 0) {
1270 ctxt->vcpu->arch.rip = c->eip; 1311 kvm_rip_write(ctxt->vcpu, c->eip);
1271 goto done; 1312 goto done;
1272 } 1313 }
1273 /* The second termination condition only applies for REPE 1314 /* The second termination condition only applies for REPE
@@ -1281,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1281 (c->b == 0xae) || (c->b == 0xaf)) { 1322 (c->b == 0xae) || (c->b == 0xaf)) {
1282 if ((c->rep_prefix == REPE_PREFIX) && 1323 if ((c->rep_prefix == REPE_PREFIX) &&
1283 ((ctxt->eflags & EFLG_ZF) == 0)) { 1324 ((ctxt->eflags & EFLG_ZF) == 0)) {
1284 ctxt->vcpu->arch.rip = c->eip; 1325 kvm_rip_write(ctxt->vcpu, c->eip);
1285 goto done; 1326 goto done;
1286 } 1327 }
1287 if ((c->rep_prefix == REPNE_PREFIX) && 1328 if ((c->rep_prefix == REPNE_PREFIX) &&
1288 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { 1329 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
1289 ctxt->vcpu->arch.rip = c->eip; 1330 kvm_rip_write(ctxt->vcpu, c->eip);
1290 goto done; 1331 goto done;
1291 } 1332 }
1292 } 1333 }
1293 c->regs[VCPU_REGS_RCX]--; 1334 c->regs[VCPU_REGS_RCX]--;
1294 c->eip = ctxt->vcpu->arch.rip; 1335 c->eip = kvm_rip_read(ctxt->vcpu);
1295 } 1336 }
1296 1337
1297 if (c->src.type == OP_MEM) { 1338 if (c->src.type == OP_MEM) {
@@ -1351,27 +1392,10 @@ special_insn:
1351 sbb: /* sbb */ 1392 sbb: /* sbb */
1352 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 1393 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1353 break; 1394 break;
1354 case 0x20 ... 0x23: 1395 case 0x20 ... 0x25:
1355 and: /* and */ 1396 and: /* and */
1356 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); 1397 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
1357 break; 1398 break;
1358 case 0x24: /* and al imm8 */
1359 c->dst.type = OP_REG;
1360 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1361 c->dst.val = *(u8 *)c->dst.ptr;
1362 c->dst.bytes = 1;
1363 c->dst.orig_val = c->dst.val;
1364 goto and;
1365 case 0x25: /* and ax imm16, or eax imm32 */
1366 c->dst.type = OP_REG;
1367 c->dst.bytes = c->op_bytes;
1368 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1369 if (c->op_bytes == 2)
1370 c->dst.val = *(u16 *)c->dst.ptr;
1371 else
1372 c->dst.val = *(u32 *)c->dst.ptr;
1373 c->dst.orig_val = c->dst.val;
1374 goto and;
1375 case 0x28 ... 0x2d: 1399 case 0x28 ... 0x2d:
1376 sub: /* sub */ 1400 sub: /* sub */
1377 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); 1401 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
@@ -1659,7 +1683,7 @@ special_insn:
1659 case 0xae ... 0xaf: /* scas */ 1683 case 0xae ... 0xaf: /* scas */
1660 DPRINTF("Urk! I don't handle SCAS.\n"); 1684 DPRINTF("Urk! I don't handle SCAS.\n");
1661 goto cannot_emulate; 1685 goto cannot_emulate;
1662 case 0xb8: /* mov r, imm */ 1686 case 0xb0 ... 0xbf: /* mov r, imm */
1663 goto mov; 1687 goto mov;
1664 case 0xc0 ... 0xc1: 1688 case 0xc0 ... 0xc1:
1665 emulate_grp2(ctxt); 1689 emulate_grp2(ctxt);
@@ -1679,6 +1703,16 @@ special_insn:
1679 c->src.val = c->regs[VCPU_REGS_RCX]; 1703 c->src.val = c->regs[VCPU_REGS_RCX];
1680 emulate_grp2(ctxt); 1704 emulate_grp2(ctxt);
1681 break; 1705 break;
1706 case 0xe4: /* inb */
1707 case 0xe5: /* in */
1708 port = insn_fetch(u8, 1, c->eip);
1709 io_dir_in = 1;
1710 goto do_io;
1711 case 0xe6: /* outb */
1712 case 0xe7: /* out */
1713 port = insn_fetch(u8, 1, c->eip);
1714 io_dir_in = 0;
1715 goto do_io;
1682 case 0xe8: /* call (near) */ { 1716 case 0xe8: /* call (near) */ {
1683 long int rel; 1717 long int rel;
1684 switch (c->op_bytes) { 1718 switch (c->op_bytes) {
@@ -1729,6 +1763,22 @@ special_insn:
1729 jmp_rel(c, c->src.val); 1763 jmp_rel(c, c->src.val);
1730 c->dst.type = OP_NONE; /* Disable writeback. */ 1764 c->dst.type = OP_NONE; /* Disable writeback. */
1731 break; 1765 break;
1766 case 0xec: /* in al,dx */
1767 case 0xed: /* in (e/r)ax,dx */
1768 port = c->regs[VCPU_REGS_RDX];
1769 io_dir_in = 1;
1770 goto do_io;
1771 case 0xee: /* out al,dx */
1772 case 0xef: /* out (e/r)ax,dx */
1773 port = c->regs[VCPU_REGS_RDX];
1774 io_dir_in = 0;
1775 do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
1776 (c->d & ByteOp) ? 1 : c->op_bytes,
1777 port) != 0) {
1778 c->eip = saved_eip;
1779 goto cannot_emulate;
1780 }
1781 return 0;
1732 case 0xf4: /* hlt */ 1782 case 0xf4: /* hlt */
1733 ctxt->vcpu->arch.halt_request = 1; 1783 ctxt->vcpu->arch.halt_request = 1;
1734 break; 1784 break;
@@ -1754,6 +1804,14 @@ special_insn:
1754 ctxt->eflags |= X86_EFLAGS_IF; 1804 ctxt->eflags |= X86_EFLAGS_IF;
1755 c->dst.type = OP_NONE; /* Disable writeback. */ 1805 c->dst.type = OP_NONE; /* Disable writeback. */
1756 break; 1806 break;
1807 case 0xfc: /* cld */
1808 ctxt->eflags &= ~EFLG_DF;
1809 c->dst.type = OP_NONE; /* Disable writeback. */
1810 break;
1811 case 0xfd: /* std */
1812 ctxt->eflags |= EFLG_DF;
1813 c->dst.type = OP_NONE; /* Disable writeback. */
1814 break;
1757 case 0xfe ... 0xff: /* Grp4/Grp5 */ 1815 case 0xfe ... 0xff: /* Grp4/Grp5 */
1758 rc = emulate_grp45(ctxt, ops); 1816 rc = emulate_grp45(ctxt, ops);
1759 if (rc != 0) 1817 if (rc != 0)
@@ -1768,7 +1826,7 @@ writeback:
1768 1826
1769 /* Commit shadow register state. */ 1827 /* Commit shadow register state. */
1770 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 1828 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
1771 ctxt->vcpu->arch.rip = c->eip; 1829 kvm_rip_write(ctxt->vcpu, c->eip);
1772 1830
1773done: 1831done:
1774 if (rc == X86EMUL_UNHANDLEABLE) { 1832 if (rc == X86EMUL_UNHANDLEABLE) {
@@ -1793,7 +1851,7 @@ twobyte_insn:
1793 goto done; 1851 goto done;
1794 1852
1795 /* Let the processor re-execute the fixed hypercall */ 1853 /* Let the processor re-execute the fixed hypercall */
1796 c->eip = ctxt->vcpu->arch.rip; 1854 c->eip = kvm_rip_read(ctxt->vcpu);
1797 /* Disable writeback. */ 1855 /* Disable writeback. */
1798 c->dst.type = OP_NONE; 1856 c->dst.type = OP_NONE;
1799 break; 1857 break;
@@ -1889,7 +1947,7 @@ twobyte_insn:
1889 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); 1947 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
1890 if (rc) { 1948 if (rc) {
1891 kvm_inject_gp(ctxt->vcpu, 0); 1949 kvm_inject_gp(ctxt->vcpu, 0);
1892 c->eip = ctxt->vcpu->arch.rip; 1950 c->eip = kvm_rip_read(ctxt->vcpu);
1893 } 1951 }
1894 rc = X86EMUL_CONTINUE; 1952 rc = X86EMUL_CONTINUE;
1895 c->dst.type = OP_NONE; 1953 c->dst.type = OP_NONE;
@@ -1899,7 +1957,7 @@ twobyte_insn:
1899 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); 1957 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
1900 if (rc) { 1958 if (rc) {
1901 kvm_inject_gp(ctxt->vcpu, 0); 1959 kvm_inject_gp(ctxt->vcpu, 0);
1902 c->eip = ctxt->vcpu->arch.rip; 1960 c->eip = kvm_rip_read(ctxt->vcpu);
1903 } else { 1961 } else {
1904 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 1962 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
1905 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 1963 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 004ba86326ae..c9f7cda48ed7 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void)
198/* Get the TSC speed from Xen */ 198/* Get the TSC speed from Xen */
199unsigned long xen_tsc_khz(void) 199unsigned long xen_tsc_khz(void)
200{ 200{
201 u64 xen_khz = 1000000ULL << 32; 201 struct pvclock_vcpu_time_info *info =
202 const struct pvclock_vcpu_time_info *info =
203 &HYPERVISOR_shared_info->vcpu_info[0].time; 202 &HYPERVISOR_shared_info->vcpu_info[0].time;
204 203
205 do_div(xen_khz, info->tsc_to_system_mul); 204 return pvclock_tsc_khz(info);
206 if (info->tsc_shift < 0)
207 xen_khz <<= -info->tsc_shift;
208 else
209 xen_khz >>= info->tsc_shift;
210
211 return xen_khz;
212} 205}
213 206
214cycle_t xen_clocksource_read(void) 207cycle_t xen_clocksource_read(void)
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index bd2c01674f5e..e842e756308a 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -28,9 +28,9 @@
28 28
29#include <linux/pci.h> 29#include <linux/pci.h>
30#include <linux/dmar.h> 30#include <linux/dmar.h>
31#include <linux/iova.h>
32#include <linux/intel-iommu.h>
31#include <linux/timer.h> 33#include <linux/timer.h>
32#include "iova.h"
33#include "intel-iommu.h"
34 34
35#undef PREFIX 35#undef PREFIX
36#define PREFIX "DMAR:" 36#define PREFIX "DMAR:"
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 389fdd6f4a9f..fc5f2dbf5323 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -33,8 +33,8 @@
33#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
34#include <linux/mempool.h> 34#include <linux/mempool.h>
35#include <linux/timer.h> 35#include <linux/timer.h>
36#include "iova.h" 36#include <linux/iova.h>
37#include "intel-iommu.h" 37#include <linux/intel-iommu.h>
38#include <asm/proto.h> /* force_iommu in this header in x86-64*/ 38#include <asm/proto.h> /* force_iommu in this header in x86-64*/
39#include <asm/cacheflush.h> 39#include <asm/cacheflush.h>
40#include <asm/iommu.h> 40#include <asm/iommu.h>
@@ -156,7 +156,7 @@ static inline void *alloc_domain_mem(void)
156 return iommu_kmem_cache_alloc(iommu_domain_cache); 156 return iommu_kmem_cache_alloc(iommu_domain_cache);
157} 157}
158 158
159static inline void free_domain_mem(void *vaddr) 159static void free_domain_mem(void *vaddr)
160{ 160{
161 kmem_cache_free(iommu_domain_cache, vaddr); 161 kmem_cache_free(iommu_domain_cache, vaddr);
162} 162}
@@ -1341,7 +1341,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain)
1341 * find_domain 1341 * find_domain
1342 * Note: we use struct pci_dev->dev.archdata.iommu stores the info 1342 * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1343 */ 1343 */
1344struct dmar_domain * 1344static struct dmar_domain *
1345find_domain(struct pci_dev *pdev) 1345find_domain(struct pci_dev *pdev)
1346{ 1346{
1347 struct device_domain_info *info; 1347 struct device_domain_info *info;
@@ -2318,3 +2318,111 @@ int __init intel_iommu_init(void)
2318 return 0; 2318 return 0;
2319} 2319}
2320 2320
2321void intel_iommu_domain_exit(struct dmar_domain *domain)
2322{
2323 u64 end;
2324
2325 /* Domain 0 is reserved, so dont process it */
2326 if (!domain)
2327 return;
2328
2329 end = DOMAIN_MAX_ADDR(domain->gaw);
2330 end = end & (~PAGE_MASK_4K);
2331
2332 /* clear ptes */
2333 dma_pte_clear_range(domain, 0, end);
2334
2335 /* free page tables */
2336 dma_pte_free_pagetable(domain, 0, end);
2337
2338 iommu_free_domain(domain);
2339 free_domain_mem(domain);
2340}
2341EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2342
2343struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2344{
2345 struct dmar_drhd_unit *drhd;
2346 struct dmar_domain *domain;
2347 struct intel_iommu *iommu;
2348
2349 drhd = dmar_find_matched_drhd_unit(pdev);
2350 if (!drhd) {
2351 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2352 return NULL;
2353 }
2354
2355 iommu = drhd->iommu;
2356 if (!iommu) {
2357 printk(KERN_ERR
2358 "intel_iommu_domain_alloc: iommu == NULL\n");
2359 return NULL;
2360 }
2361 domain = iommu_alloc_domain(iommu);
2362 if (!domain) {
2363 printk(KERN_ERR
2364 "intel_iommu_domain_alloc: domain == NULL\n");
2365 return NULL;
2366 }
2367 if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2368 printk(KERN_ERR
2369 "intel_iommu_domain_alloc: domain_init() failed\n");
2370 intel_iommu_domain_exit(domain);
2371 return NULL;
2372 }
2373 return domain;
2374}
2375EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2376
2377int intel_iommu_context_mapping(
2378 struct dmar_domain *domain, struct pci_dev *pdev)
2379{
2380 int rc;
2381 rc = domain_context_mapping(domain, pdev);
2382 return rc;
2383}
2384EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2385
2386int intel_iommu_page_mapping(
2387 struct dmar_domain *domain, dma_addr_t iova,
2388 u64 hpa, size_t size, int prot)
2389{
2390 int rc;
2391 rc = domain_page_mapping(domain, iova, hpa, size, prot);
2392 return rc;
2393}
2394EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2395
2396void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2397{
2398 detach_domain_for_dev(domain, bus, devfn);
2399}
2400EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2401
2402struct dmar_domain *
2403intel_iommu_find_domain(struct pci_dev *pdev)
2404{
2405 return find_domain(pdev);
2406}
2407EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2408
2409int intel_iommu_found(void)
2410{
2411 return g_num_of_iommus;
2412}
2413EXPORT_SYMBOL_GPL(intel_iommu_found);
2414
2415u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2416{
2417 struct dma_pte *pte;
2418 u64 pfn;
2419
2420 pfn = 0;
2421 pte = addr_to_dma_pte(domain, iova);
2422
2423 if (pte)
2424 pfn = dma_pte_addr(*pte);
2425
2426 return pfn >> PAGE_SHIFT_4K;
2427}
2428EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index bb642cc5e18c..738d4c89581c 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -4,7 +4,7 @@
4#include <linux/pci.h> 4#include <linux/pci.h>
5#include <linux/irq.h> 5#include <linux/irq.h>
6#include <asm/io_apic.h> 6#include <asm/io_apic.h>
7#include "intel-iommu.h" 7#include <linux/intel-iommu.h>
8#include "intr_remapping.h" 8#include "intr_remapping.h"
9 9
10static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; 10static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
diff --git a/drivers/pci/intr_remapping.h b/drivers/pci/intr_remapping.h
index 05f2635bbe4e..ca48f0df8ac9 100644
--- a/drivers/pci/intr_remapping.h
+++ b/drivers/pci/intr_remapping.h
@@ -1,4 +1,4 @@
1#include "intel-iommu.h" 1#include <linux/intel-iommu.h>
2 2
3struct ioapic_scope { 3struct ioapic_scope {
4 struct intel_iommu *iommu; 4 struct intel_iommu *iommu;
diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c
index 3ef4ac064315..2287116e9822 100644
--- a/drivers/pci/iova.c
+++ b/drivers/pci/iova.c
@@ -7,7 +7,7 @@
7 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> 7 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
8 */ 8 */
9 9
10#include "iova.h" 10#include <linux/iova.h>
11 11
12void 12void
13init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit) 13init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h
index 78e954db1e7f..ba0dd791fadf 100644
--- a/include/asm-x86/kvm.h
+++ b/include/asm-x86/kvm.h
@@ -208,26 +208,4 @@ struct kvm_pit_channel_state {
208struct kvm_pit_state { 208struct kvm_pit_state {
209 struct kvm_pit_channel_state channels[3]; 209 struct kvm_pit_channel_state channels[3];
210}; 210};
211
212#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02)
213#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03)
214#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04)
215#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05)
216#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06)
217#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07)
218#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08)
219#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09)
220#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A)
221#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B)
222#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C)
223#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D)
224#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E)
225#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F)
226#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10)
227#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11)
228#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12)
229#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13)
230#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14)
231#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15)
232
233#endif /* ASM_X86__KVM_H */ 211#endif /* ASM_X86__KVM_H */
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 69794547f514..411fb8cfb24e 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -57,6 +57,10 @@
57#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) 57#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
58 58
59#define DE_VECTOR 0 59#define DE_VECTOR 0
60#define DB_VECTOR 1
61#define BP_VECTOR 3
62#define OF_VECTOR 4
63#define BR_VECTOR 5
60#define UD_VECTOR 6 64#define UD_VECTOR 6
61#define NM_VECTOR 7 65#define NM_VECTOR 7
62#define DF_VECTOR 8 66#define DF_VECTOR 8
@@ -65,6 +69,7 @@
65#define SS_VECTOR 12 69#define SS_VECTOR 12
66#define GP_VECTOR 13 70#define GP_VECTOR 13
67#define PF_VECTOR 14 71#define PF_VECTOR 14
72#define MF_VECTOR 16
68#define MC_VECTOR 18 73#define MC_VECTOR 18
69 74
70#define SELECTOR_TI_MASK (1 << 2) 75#define SELECTOR_TI_MASK (1 << 2)
@@ -89,7 +94,7 @@ extern struct list_head vm_list;
89struct kvm_vcpu; 94struct kvm_vcpu;
90struct kvm; 95struct kvm;
91 96
92enum { 97enum kvm_reg {
93 VCPU_REGS_RAX = 0, 98 VCPU_REGS_RAX = 0,
94 VCPU_REGS_RCX = 1, 99 VCPU_REGS_RCX = 1,
95 VCPU_REGS_RDX = 2, 100 VCPU_REGS_RDX = 2,
@@ -108,6 +113,7 @@ enum {
108 VCPU_REGS_R14 = 14, 113 VCPU_REGS_R14 = 14,
109 VCPU_REGS_R15 = 15, 114 VCPU_REGS_R15 = 15,
110#endif 115#endif
116 VCPU_REGS_RIP,
111 NR_VCPU_REGS 117 NR_VCPU_REGS
112}; 118};
113 119
@@ -189,10 +195,20 @@ struct kvm_mmu_page {
189 */ 195 */
190 int multimapped; /* More than one parent_pte? */ 196 int multimapped; /* More than one parent_pte? */
191 int root_count; /* Currently serving as active root */ 197 int root_count; /* Currently serving as active root */
198 bool unsync;
199 bool unsync_children;
192 union { 200 union {
193 u64 *parent_pte; /* !multimapped */ 201 u64 *parent_pte; /* !multimapped */
194 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ 202 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
195 }; 203 };
204 DECLARE_BITMAP(unsync_child_bitmap, 512);
205};
206
207struct kvm_pv_mmu_op_buffer {
208 void *ptr;
209 unsigned len;
210 unsigned processed;
211 char buf[512] __aligned(sizeof(long));
196}; 212};
197 213
198/* 214/*
@@ -207,6 +223,9 @@ struct kvm_mmu {
207 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); 223 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
208 void (*prefetch_page)(struct kvm_vcpu *vcpu, 224 void (*prefetch_page)(struct kvm_vcpu *vcpu,
209 struct kvm_mmu_page *page); 225 struct kvm_mmu_page *page);
226 int (*sync_page)(struct kvm_vcpu *vcpu,
227 struct kvm_mmu_page *sp);
228 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
210 hpa_t root_hpa; 229 hpa_t root_hpa;
211 int root_level; 230 int root_level;
212 int shadow_root_level; 231 int shadow_root_level;
@@ -219,8 +238,13 @@ struct kvm_vcpu_arch {
219 int interrupt_window_open; 238 int interrupt_window_open;
220 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ 239 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
221 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); 240 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
222 unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ 241 /*
223 unsigned long rip; /* needs vcpu_load_rsp_rip() */ 242 * rip and regs accesses must go through
243 * kvm_{register,rip}_{read,write} functions.
244 */
245 unsigned long regs[NR_VCPU_REGS];
246 u32 regs_avail;
247 u32 regs_dirty;
224 248
225 unsigned long cr0; 249 unsigned long cr0;
226 unsigned long cr2; 250 unsigned long cr2;
@@ -237,6 +261,9 @@ struct kvm_vcpu_arch {
237 bool tpr_access_reporting; 261 bool tpr_access_reporting;
238 262
239 struct kvm_mmu mmu; 263 struct kvm_mmu mmu;
264 /* only needed in kvm_pv_mmu_op() path, but it's hot so
265 * put it here to avoid allocation */
266 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
240 267
241 struct kvm_mmu_memory_cache mmu_pte_chain_cache; 268 struct kvm_mmu_memory_cache mmu_pte_chain_cache;
242 struct kvm_mmu_memory_cache mmu_rmap_desc_cache; 269 struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
@@ -269,6 +296,11 @@ struct kvm_vcpu_arch {
269 u32 error_code; 296 u32 error_code;
270 } exception; 297 } exception;
271 298
299 struct kvm_queued_interrupt {
300 bool pending;
301 u8 nr;
302 } interrupt;
303
272 struct { 304 struct {
273 int active; 305 int active;
274 u8 save_iopl; 306 u8 save_iopl;
@@ -294,6 +326,7 @@ struct kvm_vcpu_arch {
294 struct page *time_page; 326 struct page *time_page;
295 327
296 bool nmi_pending; 328 bool nmi_pending;
329 bool nmi_injected;
297 330
298 u64 mtrr[0x100]; 331 u64 mtrr[0x100];
299}; 332};
@@ -316,9 +349,12 @@ struct kvm_arch{
316 * Hash table of struct kvm_mmu_page. 349 * Hash table of struct kvm_mmu_page.
317 */ 350 */
318 struct list_head active_mmu_pages; 351 struct list_head active_mmu_pages;
352 struct list_head assigned_dev_head;
353 struct dmar_domain *intel_iommu_domain;
319 struct kvm_pic *vpic; 354 struct kvm_pic *vpic;
320 struct kvm_ioapic *vioapic; 355 struct kvm_ioapic *vioapic;
321 struct kvm_pit *vpit; 356 struct kvm_pit *vpit;
357 struct hlist_head irq_ack_notifier_list;
322 358
323 int round_robin_prev_vcpu; 359 int round_robin_prev_vcpu;
324 unsigned int tss_addr; 360 unsigned int tss_addr;
@@ -338,6 +374,7 @@ struct kvm_vm_stat {
338 u32 mmu_flooded; 374 u32 mmu_flooded;
339 u32 mmu_recycled; 375 u32 mmu_recycled;
340 u32 mmu_cache_miss; 376 u32 mmu_cache_miss;
377 u32 mmu_unsync;
341 u32 remote_tlb_flush; 378 u32 remote_tlb_flush;
342 u32 lpages; 379 u32 lpages;
343}; 380};
@@ -364,6 +401,7 @@ struct kvm_vcpu_stat {
364 u32 insn_emulation; 401 u32 insn_emulation;
365 u32 insn_emulation_fail; 402 u32 insn_emulation_fail;
366 u32 hypercalls; 403 u32 hypercalls;
404 u32 irq_injections;
367}; 405};
368 406
369struct descriptor_table { 407struct descriptor_table {
@@ -414,8 +452,7 @@ struct kvm_x86_ops {
414 unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); 452 unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
415 void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, 453 void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
416 int *exception); 454 int *exception);
417 void (*cache_regs)(struct kvm_vcpu *vcpu); 455 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
418 void (*decache_regs)(struct kvm_vcpu *vcpu);
419 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 456 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
420 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 457 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
421 458
@@ -528,6 +565,8 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
528void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 565void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
529 u32 error_code); 566 u32 error_code);
530 567
568void kvm_pic_set_irq(void *opaque, int irq, int level);
569
531void kvm_inject_nmi(struct kvm_vcpu *vcpu); 570void kvm_inject_nmi(struct kvm_vcpu *vcpu);
532 571
533void fx_init(struct kvm_vcpu *vcpu); 572void fx_init(struct kvm_vcpu *vcpu);
@@ -550,12 +589,14 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
550void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); 589void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
551int kvm_mmu_load(struct kvm_vcpu *vcpu); 590int kvm_mmu_load(struct kvm_vcpu *vcpu);
552void kvm_mmu_unload(struct kvm_vcpu *vcpu); 591void kvm_mmu_unload(struct kvm_vcpu *vcpu);
592void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
553 593
554int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 594int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
555 595
556int kvm_fix_hypercall(struct kvm_vcpu *vcpu); 596int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
557 597
558int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); 598int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
599void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
559 600
560void kvm_enable_tdp(void); 601void kvm_enable_tdp(void);
561void kvm_disable_tdp(void); 602void kvm_disable_tdp(void);
@@ -686,33 +727,6 @@ enum {
686 TASK_SWITCH_GATE = 3, 727 TASK_SWITCH_GATE = 3,
687}; 728};
688 729
689#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \
690 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
691 vcpu, 5, d1, d2, d3, d4, d5)
692#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \
693 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
694 vcpu, 4, d1, d2, d3, d4, 0)
695#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \
696 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
697 vcpu, 3, d1, d2, d3, 0, 0)
698#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \
699 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
700 vcpu, 2, d1, d2, 0, 0, 0)
701#define KVMTRACE_1D(evt, vcpu, d1, name) \
702 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
703 vcpu, 1, d1, 0, 0, 0, 0)
704#define KVMTRACE_0D(evt, vcpu, name) \
705 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
706 vcpu, 0, 0, 0, 0, 0, 0)
707
708#ifdef CONFIG_64BIT
709# define KVM_EX_ENTRY ".quad"
710# define KVM_EX_PUSH "pushq"
711#else
712# define KVM_EX_ENTRY ".long"
713# define KVM_EX_PUSH "pushl"
714#endif
715
716/* 730/*
717 * Hardware virtualization extension instructions may fault if a 731 * Hardware virtualization extension instructions may fault if a
718 * reboot turns off virtualization while processes are running. 732 * reboot turns off virtualization while processes are running.
@@ -724,11 +738,11 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
724 "666: " insn "\n\t" \ 738 "666: " insn "\n\t" \
725 ".pushsection .fixup, \"ax\" \n" \ 739 ".pushsection .fixup, \"ax\" \n" \
726 "667: \n\t" \ 740 "667: \n\t" \
727 KVM_EX_PUSH " $666b \n\t" \ 741 __ASM_SIZE(push) " $666b \n\t" \
728 "jmp kvm_handle_fault_on_reboot \n\t" \ 742 "jmp kvm_handle_fault_on_reboot \n\t" \
729 ".popsection \n\t" \ 743 ".popsection \n\t" \
730 ".pushsection __ex_table, \"a\" \n\t" \ 744 ".pushsection __ex_table, \"a\" \n\t" \
731 KVM_EX_ENTRY " 666b, 667b \n\t" \ 745 _ASM_PTR " 666b, 667b \n\t" \
732 ".popsection" 746 ".popsection"
733 747
734#define KVM_ARCH_WANT_MMU_NOTIFIER 748#define KVM_ARCH_WANT_MMU_NOTIFIER
diff --git a/include/asm-x86/msr-index.h b/include/asm-x86/msr-index.h
index 0bb43301a202..dabd10f0bbee 100644
--- a/include/asm-x86/msr-index.h
+++ b/include/asm-x86/msr-index.h
@@ -178,6 +178,9 @@
178#define MSR_IA32_EBL_CR_POWERON 0x0000002a 178#define MSR_IA32_EBL_CR_POWERON 0x0000002a
179#define MSR_IA32_FEATURE_CONTROL 0x0000003a 179#define MSR_IA32_FEATURE_CONTROL 0x0000003a
180 180
181#define FEATURE_CONTROL_LOCKED (1<<0)
182#define FEATURE_CONTROL_VMXON_ENABLED (1<<2)
183
181#define MSR_IA32_APICBASE 0x0000001b 184#define MSR_IA32_APICBASE 0x0000001b
182#define MSR_IA32_APICBASE_BSP (1<<8) 185#define MSR_IA32_APICBASE_BSP (1<<8)
183#define MSR_IA32_APICBASE_ENABLE (1<<11) 186#define MSR_IA32_APICBASE_ENABLE (1<<11)
diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h
index 1a38f6834800..ad29e277fd6d 100644
--- a/include/asm-x86/pvclock.h
+++ b/include/asm-x86/pvclock.h
@@ -6,6 +6,7 @@
6 6
7/* some helper functions for xen and kvm pv clock sources */ 7/* some helper functions for xen and kvm pv clock sources */
8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); 8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
9unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
9void pvclock_read_wallclock(struct pvclock_wall_clock *wall, 10void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
10 struct pvclock_vcpu_time_info *vcpu, 11 struct pvclock_vcpu_time_info *vcpu,
11 struct timespec *ts); 12 struct timespec *ts);
diff --git a/drivers/pci/dma_remapping.h b/include/linux/dma_remapping.h
index bff5c65f81dc..bff5c65f81dc 100644
--- a/drivers/pci/dma_remapping.h
+++ b/include/linux/dma_remapping.h
diff --git a/drivers/pci/intel-iommu.h b/include/linux/intel-iommu.h
index 2142c01e0143..2e117f30a76c 100644
--- a/drivers/pci/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -25,10 +25,10 @@
25#include <linux/types.h> 25#include <linux/types.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <linux/sysdev.h> 27#include <linux/sysdev.h>
28#include "iova.h" 28#include <linux/iova.h>
29#include <linux/io.h> 29#include <linux/io.h>
30#include <linux/dma_remapping.h>
30#include <asm/cacheflush.h> 31#include <asm/cacheflush.h>
31#include "dma_remapping.h"
32 32
33/* 33/*
34 * Intel IOMMU register specification per version 1.0 public spec. 34 * Intel IOMMU register specification per version 1.0 public spec.
@@ -304,4 +304,24 @@ extern int dmar_enable_qi(struct intel_iommu *iommu);
304extern void qi_global_iec(struct intel_iommu *iommu); 304extern void qi_global_iec(struct intel_iommu *iommu);
305 305
306extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); 306extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
307
308void intel_iommu_domain_exit(struct dmar_domain *domain);
309struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev);
310int intel_iommu_context_mapping(struct dmar_domain *domain,
311 struct pci_dev *pdev);
312int intel_iommu_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
313 u64 hpa, size_t size, int prot);
314void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn);
315struct dmar_domain *intel_iommu_find_domain(struct pci_dev *pdev);
316u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova);
317
318#ifdef CONFIG_DMAR
319int intel_iommu_found(void);
320#else /* CONFIG_DMAR */
321static inline int intel_iommu_found(void)
322{
323 return 0;
324}
325#endif /* CONFIG_DMAR */
326
307#endif 327#endif
diff --git a/drivers/pci/iova.h b/include/linux/iova.h
index 228f6c94b69c..228f6c94b69c 100644
--- a/drivers/pci/iova.h
+++ b/include/linux/iova.h
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 70a30651cd12..797fcd781242 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -311,22 +311,33 @@ struct kvm_s390_interrupt {
311 311
312/* This structure represents a single trace buffer record. */ 312/* This structure represents a single trace buffer record. */
313struct kvm_trace_rec { 313struct kvm_trace_rec {
314 __u32 event:28; 314 /* variable rec_val
315 __u32 extra_u32:3; 315 * is split into:
316 __u32 cycle_in:1; 316 * bits 0 - 27 -> event id
317 * bits 28 -30 -> number of extra data args of size u32
318 * bits 31 -> binary indicator for if tsc is in record
319 */
320 __u32 rec_val;
317 __u32 pid; 321 __u32 pid;
318 __u32 vcpu_id; 322 __u32 vcpu_id;
319 union { 323 union {
320 struct { 324 struct {
321 __u64 cycle_u64; 325 __u64 timestamp;
322 __u32 extra_u32[KVM_TRC_EXTRA_MAX]; 326 __u32 extra_u32[KVM_TRC_EXTRA_MAX];
323 } __attribute__((packed)) cycle; 327 } __attribute__((packed)) timestamp;
324 struct { 328 struct {
325 __u32 extra_u32[KVM_TRC_EXTRA_MAX]; 329 __u32 extra_u32[KVM_TRC_EXTRA_MAX];
326 } nocycle; 330 } notimestamp;
327 } u; 331 } u;
328}; 332};
329 333
334#define TRACE_REC_EVENT_ID(val) \
335 (0x0fffffff & (val))
336#define TRACE_REC_NUM_DATA_ARGS(val) \
337 (0x70000000 & ((val) << 28))
338#define TRACE_REC_TCS(val) \
339 (0x80000000 & ((val) << 31))
340
330#define KVMIO 0xAE 341#define KVMIO 0xAE
331 342
332/* 343/*
@@ -372,6 +383,10 @@ struct kvm_trace_rec {
372#define KVM_CAP_MP_STATE 14 383#define KVM_CAP_MP_STATE 14
373#define KVM_CAP_COALESCED_MMIO 15 384#define KVM_CAP_COALESCED_MMIO 15
374#define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ 385#define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */
386#if defined(CONFIG_X86)||defined(CONFIG_IA64)
387#define KVM_CAP_DEVICE_ASSIGNMENT 17
388#endif
389#define KVM_CAP_IOMMU 18
375 390
376/* 391/*
377 * ioctls for VM fds 392 * ioctls for VM fds
@@ -401,6 +416,10 @@ struct kvm_trace_rec {
401 _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone) 416 _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone)
402#define KVM_UNREGISTER_COALESCED_MMIO \ 417#define KVM_UNREGISTER_COALESCED_MMIO \
403 _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone) 418 _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone)
419#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
420 struct kvm_assigned_pci_dev)
421#define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
422 struct kvm_assigned_irq)
404 423
405/* 424/*
406 * ioctls for vcpu fds 425 * ioctls for vcpu fds
@@ -440,4 +459,45 @@ struct kvm_trace_rec {
440#define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state) 459#define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state)
441#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) 460#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state)
442 461
462#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02)
463#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03)
464#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04)
465#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05)
466#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06)
467#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07)
468#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08)
469#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09)
470#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A)
471#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B)
472#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C)
473#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D)
474#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E)
475#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F)
476#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10)
477#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11)
478#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12)
479#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13)
480#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14)
481#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15)
482#define KVM_TRC_GTLB_WRITE (KVM_TRC_HANDLER + 0x16)
483#define KVM_TRC_STLB_WRITE (KVM_TRC_HANDLER + 0x17)
484#define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18)
485#define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19)
486
487struct kvm_assigned_pci_dev {
488 __u32 assigned_dev_id;
489 __u32 busnr;
490 __u32 devfn;
491 __u32 flags;
492};
493
494struct kvm_assigned_irq {
495 __u32 assigned_dev_id;
496 __u32 host_irq;
497 __u32 guest_irq;
498 __u32 flags;
499};
500
501#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
502
443#endif 503#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8525afc53107..3833c48fae3a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -34,6 +34,8 @@
34#define KVM_REQ_MMU_RELOAD 3 34#define KVM_REQ_MMU_RELOAD 3
35#define KVM_REQ_TRIPLE_FAULT 4 35#define KVM_REQ_TRIPLE_FAULT 4
36#define KVM_REQ_PENDING_TIMER 5 36#define KVM_REQ_PENDING_TIMER 5
37#define KVM_REQ_UNHALT 6
38#define KVM_REQ_MMU_SYNC 7
37 39
38struct kvm_vcpu; 40struct kvm_vcpu;
39extern struct kmem_cache *kvm_vcpu_cache; 41extern struct kmem_cache *kvm_vcpu_cache;
@@ -279,12 +281,68 @@ void kvm_free_physmem(struct kvm *kvm);
279 281
280struct kvm *kvm_arch_create_vm(void); 282struct kvm *kvm_arch_create_vm(void);
281void kvm_arch_destroy_vm(struct kvm *kvm); 283void kvm_arch_destroy_vm(struct kvm *kvm);
284void kvm_free_all_assigned_devices(struct kvm *kvm);
282 285
283int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 286int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
284int kvm_cpu_has_interrupt(struct kvm_vcpu *v); 287int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
285int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); 288int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
286void kvm_vcpu_kick(struct kvm_vcpu *vcpu); 289void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
287 290
291int kvm_is_mmio_pfn(pfn_t pfn);
292
293struct kvm_irq_ack_notifier {
294 struct hlist_node link;
295 unsigned gsi;
296 void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
297};
298
299struct kvm_assigned_dev_kernel {
300 struct kvm_irq_ack_notifier ack_notifier;
301 struct work_struct interrupt_work;
302 struct list_head list;
303 int assigned_dev_id;
304 int host_busnr;
305 int host_devfn;
306 int host_irq;
307 int guest_irq;
308 int irq_requested;
309 struct pci_dev *dev;
310 struct kvm *kvm;
311};
312void kvm_set_irq(struct kvm *kvm, int irq, int level);
313void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
314void kvm_register_irq_ack_notifier(struct kvm *kvm,
315 struct kvm_irq_ack_notifier *kian);
316void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
317 struct kvm_irq_ack_notifier *kian);
318
319#ifdef CONFIG_DMAR
320int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
321 unsigned long npages);
322int kvm_iommu_map_guest(struct kvm *kvm,
323 struct kvm_assigned_dev_kernel *assigned_dev);
324int kvm_iommu_unmap_guest(struct kvm *kvm);
325#else /* CONFIG_DMAR */
326static inline int kvm_iommu_map_pages(struct kvm *kvm,
327 gfn_t base_gfn,
328 unsigned long npages)
329{
330 return 0;
331}
332
333static inline int kvm_iommu_map_guest(struct kvm *kvm,
334 struct kvm_assigned_dev_kernel
335 *assigned_dev)
336{
337 return -ENODEV;
338}
339
340static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
341{
342 return 0;
343}
344#endif /* CONFIG_DMAR */
345
288static inline void kvm_guest_enter(void) 346static inline void kvm_guest_enter(void)
289{ 347{
290 account_system_vtime(current); 348 account_system_vtime(current);
@@ -307,6 +365,11 @@ static inline gpa_t gfn_to_gpa(gfn_t gfn)
307 return (gpa_t)gfn << PAGE_SHIFT; 365 return (gpa_t)gfn << PAGE_SHIFT;
308} 366}
309 367
368static inline hpa_t pfn_to_hpa(pfn_t pfn)
369{
370 return (hpa_t)pfn << PAGE_SHIFT;
371}
372
310static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) 373static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
311{ 374{
312 set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); 375 set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
@@ -326,6 +389,25 @@ struct kvm_stats_debugfs_item {
326extern struct kvm_stats_debugfs_item debugfs_entries[]; 389extern struct kvm_stats_debugfs_item debugfs_entries[];
327extern struct dentry *kvm_debugfs_dir; 390extern struct dentry *kvm_debugfs_dir;
328 391
392#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \
393 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
394 vcpu, 5, d1, d2, d3, d4, d5)
395#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \
396 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
397 vcpu, 4, d1, d2, d3, d4, 0)
398#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \
399 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
400 vcpu, 3, d1, d2, d3, 0, 0)
401#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \
402 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
403 vcpu, 2, d1, d2, 0, 0, 0)
404#define KVMTRACE_1D(evt, vcpu, d1, name) \
405 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
406 vcpu, 1, d1, 0, 0, 0, 0)
407#define KVMTRACE_0D(evt, vcpu, name) \
408 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
409 vcpu, 0, 0, 0, 0, 0, 0)
410
329#ifdef CONFIG_KVM_TRACE 411#ifdef CONFIG_KVM_TRACE
330int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg); 412int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg);
331void kvm_trace_cleanup(void); 413void kvm_trace_cleanup(void);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index c0d22870ee9c..53772bb46320 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -39,6 +39,7 @@
39 39
40#include "ioapic.h" 40#include "ioapic.h"
41#include "lapic.h" 41#include "lapic.h"
42#include "irq.h"
42 43
43#if 0 44#if 0
44#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) 45#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
@@ -285,26 +286,31 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
285 } 286 }
286} 287}
287 288
288static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi) 289static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi,
290 int trigger_mode)
289{ 291{
290 union ioapic_redir_entry *ent; 292 union ioapic_redir_entry *ent;
291 293
292 ent = &ioapic->redirtbl[gsi]; 294 ent = &ioapic->redirtbl[gsi];
293 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
294 295
295 ent->fields.remote_irr = 0; 296 kvm_notify_acked_irq(ioapic->kvm, gsi);
296 if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) 297
297 ioapic_service(ioapic, gsi); 298 if (trigger_mode == IOAPIC_LEVEL_TRIG) {
299 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
300 ent->fields.remote_irr = 0;
301 if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
302 ioapic_service(ioapic, gsi);
303 }
298} 304}
299 305
300void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) 306void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
301{ 307{
302 struct kvm_ioapic *ioapic = kvm->arch.vioapic; 308 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
303 int i; 309 int i;
304 310
305 for (i = 0; i < IOAPIC_NUM_PINS; i++) 311 for (i = 0; i < IOAPIC_NUM_PINS; i++)
306 if (ioapic->redirtbl[i].fields.vector == vector) 312 if (ioapic->redirtbl[i].fields.vector == vector)
307 __kvm_ioapic_update_eoi(ioapic, i); 313 __kvm_ioapic_update_eoi(ioapic, i, trigger_mode);
308} 314}
309 315
310static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr, 316static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr,
@@ -380,7 +386,7 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
380 break; 386 break;
381#ifdef CONFIG_IA64 387#ifdef CONFIG_IA64
382 case IOAPIC_REG_EOI: 388 case IOAPIC_REG_EOI:
383 kvm_ioapic_update_eoi(ioapic->kvm, data); 389 kvm_ioapic_update_eoi(ioapic->kvm, data, IOAPIC_LEVEL_TRIG);
384 break; 390 break;
385#endif 391#endif
386 392
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 7f16675fe783..cd7ae7691c9d 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -58,6 +58,7 @@ struct kvm_ioapic {
58 } redirtbl[IOAPIC_NUM_PINS]; 58 } redirtbl[IOAPIC_NUM_PINS];
59 struct kvm_io_device dev; 59 struct kvm_io_device dev;
60 struct kvm *kvm; 60 struct kvm *kvm;
61 void (*ack_notifier)(void *opaque, int irq);
61}; 62};
62 63
63#ifdef DEBUG 64#ifdef DEBUG
@@ -78,16 +79,9 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
78 return kvm->arch.vioapic; 79 return kvm->arch.vioapic;
79} 80}
80 81
81#ifdef CONFIG_IA64
82static inline int irqchip_in_kernel(struct kvm *kvm)
83{
84 return 1;
85}
86#endif
87
88struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, 82struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
89 unsigned long bitmap); 83 unsigned long bitmap);
90void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); 84void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
91int kvm_ioapic_init(struct kvm *kvm); 85int kvm_ioapic_init(struct kvm *kvm);
92void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); 86void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
93void kvm_ioapic_reset(struct kvm_ioapic *ioapic); 87void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
new file mode 100644
index 000000000000..d0169f5e6047
--- /dev/null
+++ b/virt/kvm/irq_comm.c
@@ -0,0 +1,60 @@
1/*
2 * irq_comm.c: Common API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#include <linux/kvm_host.h>
23#include "irq.h"
24
25#include "ioapic.h"
26
27/* This should be called with the kvm->lock mutex held */
28void kvm_set_irq(struct kvm *kvm, int irq, int level)
29{
30 /* Not possible to detect if the guest uses the PIC or the
31 * IOAPIC. So set the bit in both. The guest will ignore
32 * writes to the unused one.
33 */
34 kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level);
35#ifdef CONFIG_X86
36 kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
37#endif
38}
39
40void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
41{
42 struct kvm_irq_ack_notifier *kian;
43 struct hlist_node *n;
44
45 hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
46 if (kian->gsi == gsi)
47 kian->irq_acked(kian);
48}
49
50void kvm_register_irq_ack_notifier(struct kvm *kvm,
51 struct kvm_irq_ack_notifier *kian)
52{
53 hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
54}
55
56void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
57 struct kvm_irq_ack_notifier *kian)
58{
59 hlist_del(&kian->link);
60}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7dd9b0b85e4e..cf0ab8ed3845 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -51,6 +51,12 @@
51#include "coalesced_mmio.h" 51#include "coalesced_mmio.h"
52#endif 52#endif
53 53
54#ifdef KVM_CAP_DEVICE_ASSIGNMENT
55#include <linux/pci.h>
56#include <linux/interrupt.h>
57#include "irq.h"
58#endif
59
54MODULE_AUTHOR("Qumranet"); 60MODULE_AUTHOR("Qumranet");
55MODULE_LICENSE("GPL"); 61MODULE_LICENSE("GPL");
56 62
@@ -71,11 +77,253 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
71 77
72bool kvm_rebooting; 78bool kvm_rebooting;
73 79
80#ifdef KVM_CAP_DEVICE_ASSIGNMENT
81static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
82 int assigned_dev_id)
83{
84 struct list_head *ptr;
85 struct kvm_assigned_dev_kernel *match;
86
87 list_for_each(ptr, head) {
88 match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
89 if (match->assigned_dev_id == assigned_dev_id)
90 return match;
91 }
92 return NULL;
93}
94
95static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
96{
97 struct kvm_assigned_dev_kernel *assigned_dev;
98
99 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
100 interrupt_work);
101
102 /* This is taken to safely inject irq inside the guest. When
103 * the interrupt injection (or the ioapic code) uses a
104 * finer-grained lock, update this
105 */
106 mutex_lock(&assigned_dev->kvm->lock);
107 kvm_set_irq(assigned_dev->kvm,
108 assigned_dev->guest_irq, 1);
109 mutex_unlock(&assigned_dev->kvm->lock);
110 kvm_put_kvm(assigned_dev->kvm);
111}
112
113/* FIXME: Implement the OR logic needed to make shared interrupts on
114 * this line behave properly
115 */
116static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
117{
118 struct kvm_assigned_dev_kernel *assigned_dev =
119 (struct kvm_assigned_dev_kernel *) dev_id;
120
121 kvm_get_kvm(assigned_dev->kvm);
122 schedule_work(&assigned_dev->interrupt_work);
123 disable_irq_nosync(irq);
124 return IRQ_HANDLED;
125}
126
127/* Ack the irq line for an assigned device */
128static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
129{
130 struct kvm_assigned_dev_kernel *dev;
131
132 if (kian->gsi == -1)
133 return;
134
135 dev = container_of(kian, struct kvm_assigned_dev_kernel,
136 ack_notifier);
137 kvm_set_irq(dev->kvm, dev->guest_irq, 0);
138 enable_irq(dev->host_irq);
139}
140
141static void kvm_free_assigned_device(struct kvm *kvm,
142 struct kvm_assigned_dev_kernel
143 *assigned_dev)
144{
145 if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
146 free_irq(assigned_dev->host_irq, (void *)assigned_dev);
147
148 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
149
150 if (cancel_work_sync(&assigned_dev->interrupt_work))
151 /* We had pending work. That means we will have to take
152 * care of kvm_put_kvm.
153 */
154 kvm_put_kvm(kvm);
155
156 pci_release_regions(assigned_dev->dev);
157 pci_disable_device(assigned_dev->dev);
158 pci_dev_put(assigned_dev->dev);
159
160 list_del(&assigned_dev->list);
161 kfree(assigned_dev);
162}
163
164void kvm_free_all_assigned_devices(struct kvm *kvm)
165{
166 struct list_head *ptr, *ptr2;
167 struct kvm_assigned_dev_kernel *assigned_dev;
168
169 list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
170 assigned_dev = list_entry(ptr,
171 struct kvm_assigned_dev_kernel,
172 list);
173
174 kvm_free_assigned_device(kvm, assigned_dev);
175 }
176}
177
178static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
179 struct kvm_assigned_irq
180 *assigned_irq)
181{
182 int r = 0;
183 struct kvm_assigned_dev_kernel *match;
184
185 mutex_lock(&kvm->lock);
186
187 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
188 assigned_irq->assigned_dev_id);
189 if (!match) {
190 mutex_unlock(&kvm->lock);
191 return -EINVAL;
192 }
193
194 if (match->irq_requested) {
195 match->guest_irq = assigned_irq->guest_irq;
196 match->ack_notifier.gsi = assigned_irq->guest_irq;
197 mutex_unlock(&kvm->lock);
198 return 0;
199 }
200
201 INIT_WORK(&match->interrupt_work,
202 kvm_assigned_dev_interrupt_work_handler);
203
204 if (irqchip_in_kernel(kvm)) {
205 if (!capable(CAP_SYS_RAWIO)) {
206 r = -EPERM;
207 goto out_release;
208 }
209
210 if (assigned_irq->host_irq)
211 match->host_irq = assigned_irq->host_irq;
212 else
213 match->host_irq = match->dev->irq;
214 match->guest_irq = assigned_irq->guest_irq;
215 match->ack_notifier.gsi = assigned_irq->guest_irq;
216 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
217 kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
218
219 /* Even though this is PCI, we don't want to use shared
220 * interrupts. Sharing host devices with guest-assigned devices
221 * on the same interrupt line is not a happy situation: there
222 * are going to be long delays in accepting, acking, etc.
223 */
224 if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
225 "kvm_assigned_device", (void *)match)) {
226 r = -EIO;
227 goto out_release;
228 }
229 }
230
231 match->irq_requested = true;
232 mutex_unlock(&kvm->lock);
233 return r;
234out_release:
235 mutex_unlock(&kvm->lock);
236 kvm_free_assigned_device(kvm, match);
237 return r;
238}
239
240static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
241 struct kvm_assigned_pci_dev *assigned_dev)
242{
243 int r = 0;
244 struct kvm_assigned_dev_kernel *match;
245 struct pci_dev *dev;
246
247 mutex_lock(&kvm->lock);
248
249 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
250 assigned_dev->assigned_dev_id);
251 if (match) {
252 /* device already assigned */
253 r = -EINVAL;
254 goto out;
255 }
256
257 match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
258 if (match == NULL) {
259 printk(KERN_INFO "%s: Couldn't allocate memory\n",
260 __func__);
261 r = -ENOMEM;
262 goto out;
263 }
264 dev = pci_get_bus_and_slot(assigned_dev->busnr,
265 assigned_dev->devfn);
266 if (!dev) {
267 printk(KERN_INFO "%s: host device not found\n", __func__);
268 r = -EINVAL;
269 goto out_free;
270 }
271 if (pci_enable_device(dev)) {
272 printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
273 r = -EBUSY;
274 goto out_put;
275 }
276 r = pci_request_regions(dev, "kvm_assigned_device");
277 if (r) {
278 printk(KERN_INFO "%s: Could not get access to device regions\n",
279 __func__);
280 goto out_disable;
281 }
282 match->assigned_dev_id = assigned_dev->assigned_dev_id;
283 match->host_busnr = assigned_dev->busnr;
284 match->host_devfn = assigned_dev->devfn;
285 match->dev = dev;
286
287 match->kvm = kvm;
288
289 list_add(&match->list, &kvm->arch.assigned_dev_head);
290
291 if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
292 r = kvm_iommu_map_guest(kvm, match);
293 if (r)
294 goto out_list_del;
295 }
296
297out:
298 mutex_unlock(&kvm->lock);
299 return r;
300out_list_del:
301 list_del(&match->list);
302 pci_release_regions(dev);
303out_disable:
304 pci_disable_device(dev);
305out_put:
306 pci_dev_put(dev);
307out_free:
308 kfree(match);
309 mutex_unlock(&kvm->lock);
310 return r;
311}
312#endif
313
74static inline int valid_vcpu(int n) 314static inline int valid_vcpu(int n)
75{ 315{
76 return likely(n >= 0 && n < KVM_MAX_VCPUS); 316 return likely(n >= 0 && n < KVM_MAX_VCPUS);
77} 317}
78 318
319inline int kvm_is_mmio_pfn(pfn_t pfn)
320{
321 if (pfn_valid(pfn))
322 return PageReserved(pfn_to_page(pfn));
323
324 return true;
325}
326
79/* 327/*
80 * Switches to specified vcpu, until a matching vcpu_put() 328 * Switches to specified vcpu, until a matching vcpu_put()
81 */ 329 */
@@ -570,6 +818,12 @@ int __kvm_set_memory_region(struct kvm *kvm,
570 } 818 }
571 819
572 kvm_free_physmem_slot(&old, &new); 820 kvm_free_physmem_slot(&old, &new);
821#ifdef CONFIG_DMAR
822 /* map the pages in iommu page table */
823 r = kvm_iommu_map_pages(kvm, base_gfn, npages);
824 if (r)
825 goto out;
826#endif
573 return 0; 827 return 0;
574 828
575out_free: 829out_free:
@@ -708,9 +962,6 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
708} 962}
709EXPORT_SYMBOL_GPL(gfn_to_hva); 963EXPORT_SYMBOL_GPL(gfn_to_hva);
710 964
711/*
712 * Requires current->mm->mmap_sem to be held
713 */
714pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 965pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
715{ 966{
716 struct page *page[1]; 967 struct page *page[1];
@@ -726,21 +977,24 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
726 return page_to_pfn(bad_page); 977 return page_to_pfn(bad_page);
727 } 978 }
728 979
729 npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page, 980 npages = get_user_pages_fast(addr, 1, 1, page);
730 NULL);
731 981
732 if (unlikely(npages != 1)) { 982 if (unlikely(npages != 1)) {
733 struct vm_area_struct *vma; 983 struct vm_area_struct *vma;
734 984
985 down_read(&current->mm->mmap_sem);
735 vma = find_vma(current->mm, addr); 986 vma = find_vma(current->mm, addr);
987
736 if (vma == NULL || addr < vma->vm_start || 988 if (vma == NULL || addr < vma->vm_start ||
737 !(vma->vm_flags & VM_PFNMAP)) { 989 !(vma->vm_flags & VM_PFNMAP)) {
990 up_read(&current->mm->mmap_sem);
738 get_page(bad_page); 991 get_page(bad_page);
739 return page_to_pfn(bad_page); 992 return page_to_pfn(bad_page);
740 } 993 }
741 994
742 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 995 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
743 BUG_ON(pfn_valid(pfn)); 996 up_read(&current->mm->mmap_sem);
997 BUG_ON(!kvm_is_mmio_pfn(pfn));
744 } else 998 } else
745 pfn = page_to_pfn(page[0]); 999 pfn = page_to_pfn(page[0]);
746 1000
@@ -754,10 +1008,10 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
754 pfn_t pfn; 1008 pfn_t pfn;
755 1009
756 pfn = gfn_to_pfn(kvm, gfn); 1010 pfn = gfn_to_pfn(kvm, gfn);
757 if (pfn_valid(pfn)) 1011 if (!kvm_is_mmio_pfn(pfn))
758 return pfn_to_page(pfn); 1012 return pfn_to_page(pfn);
759 1013
760 WARN_ON(!pfn_valid(pfn)); 1014 WARN_ON(kvm_is_mmio_pfn(pfn));
761 1015
762 get_page(bad_page); 1016 get_page(bad_page);
763 return bad_page; 1017 return bad_page;
@@ -773,7 +1027,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
773 1027
774void kvm_release_pfn_clean(pfn_t pfn) 1028void kvm_release_pfn_clean(pfn_t pfn)
775{ 1029{
776 if (pfn_valid(pfn)) 1030 if (!kvm_is_mmio_pfn(pfn))
777 put_page(pfn_to_page(pfn)); 1031 put_page(pfn_to_page(pfn));
778} 1032}
779EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1033EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
@@ -799,7 +1053,7 @@ EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
799 1053
800void kvm_set_pfn_dirty(pfn_t pfn) 1054void kvm_set_pfn_dirty(pfn_t pfn)
801{ 1055{
802 if (pfn_valid(pfn)) { 1056 if (!kvm_is_mmio_pfn(pfn)) {
803 struct page *page = pfn_to_page(pfn); 1057 struct page *page = pfn_to_page(pfn);
804 if (!PageReserved(page)) 1058 if (!PageReserved(page))
805 SetPageDirty(page); 1059 SetPageDirty(page);
@@ -809,14 +1063,14 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
809 1063
810void kvm_set_pfn_accessed(pfn_t pfn) 1064void kvm_set_pfn_accessed(pfn_t pfn)
811{ 1065{
812 if (pfn_valid(pfn)) 1066 if (!kvm_is_mmio_pfn(pfn))
813 mark_page_accessed(pfn_to_page(pfn)); 1067 mark_page_accessed(pfn_to_page(pfn));
814} 1068}
815EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1069EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
816 1070
817void kvm_get_pfn(pfn_t pfn) 1071void kvm_get_pfn(pfn_t pfn)
818{ 1072{
819 if (pfn_valid(pfn)) 1073 if (!kvm_is_mmio_pfn(pfn))
820 get_page(pfn_to_page(pfn)); 1074 get_page(pfn_to_page(pfn));
821} 1075}
822EXPORT_SYMBOL_GPL(kvm_get_pfn); 1076EXPORT_SYMBOL_GPL(kvm_get_pfn);
@@ -972,12 +1226,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
972 for (;;) { 1226 for (;;) {
973 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1227 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
974 1228
975 if (kvm_cpu_has_interrupt(vcpu)) 1229 if (kvm_cpu_has_interrupt(vcpu) ||
976 break; 1230 kvm_cpu_has_pending_timer(vcpu) ||
977 if (kvm_cpu_has_pending_timer(vcpu)) 1231 kvm_arch_vcpu_runnable(vcpu)) {
978 break; 1232 set_bit(KVM_REQ_UNHALT, &vcpu->requests);
979 if (kvm_arch_vcpu_runnable(vcpu))
980 break; 1233 break;
1234 }
981 if (signal_pending(current)) 1235 if (signal_pending(current))
982 break; 1236 break;
983 1237
@@ -1074,12 +1328,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
1074 1328
1075 r = kvm_arch_vcpu_setup(vcpu); 1329 r = kvm_arch_vcpu_setup(vcpu);
1076 if (r) 1330 if (r)
1077 goto vcpu_destroy; 1331 return r;
1078 1332
1079 mutex_lock(&kvm->lock); 1333 mutex_lock(&kvm->lock);
1080 if (kvm->vcpus[n]) { 1334 if (kvm->vcpus[n]) {
1081 r = -EEXIST; 1335 r = -EEXIST;
1082 mutex_unlock(&kvm->lock);
1083 goto vcpu_destroy; 1336 goto vcpu_destroy;
1084 } 1337 }
1085 kvm->vcpus[n] = vcpu; 1338 kvm->vcpus[n] = vcpu;
@@ -1095,8 +1348,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
1095unlink: 1348unlink:
1096 mutex_lock(&kvm->lock); 1349 mutex_lock(&kvm->lock);
1097 kvm->vcpus[n] = NULL; 1350 kvm->vcpus[n] = NULL;
1098 mutex_unlock(&kvm->lock);
1099vcpu_destroy: 1351vcpu_destroy:
1352 mutex_unlock(&kvm->lock);
1100 kvm_arch_vcpu_destroy(vcpu); 1353 kvm_arch_vcpu_destroy(vcpu);
1101 return r; 1354 return r;
1102} 1355}
@@ -1118,6 +1371,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
1118 struct kvm_vcpu *vcpu = filp->private_data; 1371 struct kvm_vcpu *vcpu = filp->private_data;
1119 void __user *argp = (void __user *)arg; 1372 void __user *argp = (void __user *)arg;
1120 int r; 1373 int r;
1374 struct kvm_fpu *fpu = NULL;
1375 struct kvm_sregs *kvm_sregs = NULL;
1121 1376
1122 if (vcpu->kvm->mm != current->mm) 1377 if (vcpu->kvm->mm != current->mm)
1123 return -EIO; 1378 return -EIO;
@@ -1165,25 +1420,28 @@ out_free2:
1165 break; 1420 break;
1166 } 1421 }
1167 case KVM_GET_SREGS: { 1422 case KVM_GET_SREGS: {
1168 struct kvm_sregs kvm_sregs; 1423 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1169 1424 r = -ENOMEM;
1170 memset(&kvm_sregs, 0, sizeof kvm_sregs); 1425 if (!kvm_sregs)
1171 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); 1426 goto out;
1427 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
1172 if (r) 1428 if (r)
1173 goto out; 1429 goto out;
1174 r = -EFAULT; 1430 r = -EFAULT;
1175 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs)) 1431 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
1176 goto out; 1432 goto out;
1177 r = 0; 1433 r = 0;
1178 break; 1434 break;
1179 } 1435 }
1180 case KVM_SET_SREGS: { 1436 case KVM_SET_SREGS: {
1181 struct kvm_sregs kvm_sregs; 1437 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1182 1438 r = -ENOMEM;
1439 if (!kvm_sregs)
1440 goto out;
1183 r = -EFAULT; 1441 r = -EFAULT;
1184 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) 1442 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
1185 goto out; 1443 goto out;
1186 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs); 1444 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
1187 if (r) 1445 if (r)
1188 goto out; 1446 goto out;
1189 r = 0; 1447 r = 0;
@@ -1264,25 +1522,28 @@ out_free2:
1264 break; 1522 break;
1265 } 1523 }
1266 case KVM_GET_FPU: { 1524 case KVM_GET_FPU: {
1267 struct kvm_fpu fpu; 1525 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1268 1526 r = -ENOMEM;
1269 memset(&fpu, 0, sizeof fpu); 1527 if (!fpu)
1270 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu); 1528 goto out;
1529 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
1271 if (r) 1530 if (r)
1272 goto out; 1531 goto out;
1273 r = -EFAULT; 1532 r = -EFAULT;
1274 if (copy_to_user(argp, &fpu, sizeof fpu)) 1533 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
1275 goto out; 1534 goto out;
1276 r = 0; 1535 r = 0;
1277 break; 1536 break;
1278 } 1537 }
1279 case KVM_SET_FPU: { 1538 case KVM_SET_FPU: {
1280 struct kvm_fpu fpu; 1539 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1281 1540 r = -ENOMEM;
1541 if (!fpu)
1542 goto out;
1282 r = -EFAULT; 1543 r = -EFAULT;
1283 if (copy_from_user(&fpu, argp, sizeof fpu)) 1544 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
1284 goto out; 1545 goto out;
1285 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu); 1546 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
1286 if (r) 1547 if (r)
1287 goto out; 1548 goto out;
1288 r = 0; 1549 r = 0;
@@ -1292,6 +1553,8 @@ out_free2:
1292 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1553 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1293 } 1554 }
1294out: 1555out:
1556 kfree(fpu);
1557 kfree(kvm_sregs);
1295 return r; 1558 return r;
1296} 1559}
1297 1560
@@ -1360,6 +1623,30 @@ static long kvm_vm_ioctl(struct file *filp,
1360 break; 1623 break;
1361 } 1624 }
1362#endif 1625#endif
1626#ifdef KVM_CAP_DEVICE_ASSIGNMENT
1627 case KVM_ASSIGN_PCI_DEVICE: {
1628 struct kvm_assigned_pci_dev assigned_dev;
1629
1630 r = -EFAULT;
1631 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
1632 goto out;
1633 r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
1634 if (r)
1635 goto out;
1636 break;
1637 }
1638 case KVM_ASSIGN_IRQ: {
1639 struct kvm_assigned_irq assigned_irq;
1640
1641 r = -EFAULT;
1642 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
1643 goto out;
1644 r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
1645 if (r)
1646 goto out;
1647 break;
1648 }
1649#endif
1363 default: 1650 default:
1364 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 1651 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
1365 } 1652 }
@@ -1369,17 +1656,22 @@ out:
1369 1656
1370static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1657static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1371{ 1658{
1659 struct page *page[1];
1660 unsigned long addr;
1661 int npages;
1662 gfn_t gfn = vmf->pgoff;
1372 struct kvm *kvm = vma->vm_file->private_data; 1663 struct kvm *kvm = vma->vm_file->private_data;
1373 struct page *page;
1374 1664
1375 if (!kvm_is_visible_gfn(kvm, vmf->pgoff)) 1665 addr = gfn_to_hva(kvm, gfn);
1666 if (kvm_is_error_hva(addr))
1376 return VM_FAULT_SIGBUS; 1667 return VM_FAULT_SIGBUS;
1377 page = gfn_to_page(kvm, vmf->pgoff); 1668
1378 if (is_error_page(page)) { 1669 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
1379 kvm_release_page_clean(page); 1670 NULL);
1671 if (unlikely(npages != 1))
1380 return VM_FAULT_SIGBUS; 1672 return VM_FAULT_SIGBUS;
1381 } 1673
1382 vmf->page = page; 1674 vmf->page = page[0];
1383 return 0; 1675 return 0;
1384} 1676}
1385 1677
diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c
index 58141f31ea8f..41dcc845f78c 100644
--- a/virt/kvm/kvm_trace.c
+++ b/virt/kvm/kvm_trace.c
@@ -17,6 +17,7 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/relay.h> 18#include <linux/relay.h>
19#include <linux/debugfs.h> 19#include <linux/debugfs.h>
20#include <linux/ktime.h>
20 21
21#include <linux/kvm_host.h> 22#include <linux/kvm_host.h>
22 23
@@ -35,16 +36,16 @@ static struct kvm_trace *kvm_trace;
35struct kvm_trace_probe { 36struct kvm_trace_probe {
36 const char *name; 37 const char *name;
37 const char *format; 38 const char *format;
38 u32 cycle_in; 39 u32 timestamp_in;
39 marker_probe_func *probe_func; 40 marker_probe_func *probe_func;
40}; 41};
41 42
42static inline int calc_rec_size(int cycle, int extra) 43static inline int calc_rec_size(int timestamp, int extra)
43{ 44{
44 int rec_size = KVM_TRC_HEAD_SIZE; 45 int rec_size = KVM_TRC_HEAD_SIZE;
45 46
46 rec_size += extra; 47 rec_size += extra;
47 return cycle ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size; 48 return timestamp ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size;
48} 49}
49 50
50static void kvm_add_trace(void *probe_private, void *call_data, 51static void kvm_add_trace(void *probe_private, void *call_data,
@@ -54,12 +55,13 @@ static void kvm_add_trace(void *probe_private, void *call_data,
54 struct kvm_trace *kt = kvm_trace; 55 struct kvm_trace *kt = kvm_trace;
55 struct kvm_trace_rec rec; 56 struct kvm_trace_rec rec;
56 struct kvm_vcpu *vcpu; 57 struct kvm_vcpu *vcpu;
57 int i, extra, size; 58 int i, size;
59 u32 extra;
58 60
59 if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING)) 61 if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING))
60 return; 62 return;
61 63
62 rec.event = va_arg(*args, u32); 64 rec.rec_val = TRACE_REC_EVENT_ID(va_arg(*args, u32));
63 vcpu = va_arg(*args, struct kvm_vcpu *); 65 vcpu = va_arg(*args, struct kvm_vcpu *);
64 rec.pid = current->tgid; 66 rec.pid = current->tgid;
65 rec.vcpu_id = vcpu->vcpu_id; 67 rec.vcpu_id = vcpu->vcpu_id;
@@ -67,21 +69,21 @@ static void kvm_add_trace(void *probe_private, void *call_data,
67 extra = va_arg(*args, u32); 69 extra = va_arg(*args, u32);
68 WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX)); 70 WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX));
69 extra = min_t(u32, extra, KVM_TRC_EXTRA_MAX); 71 extra = min_t(u32, extra, KVM_TRC_EXTRA_MAX);
70 rec.extra_u32 = extra;
71 72
72 rec.cycle_in = p->cycle_in; 73 rec.rec_val |= TRACE_REC_TCS(p->timestamp_in)
74 | TRACE_REC_NUM_DATA_ARGS(extra);
73 75
74 if (rec.cycle_in) { 76 if (p->timestamp_in) {
75 rec.u.cycle.cycle_u64 = get_cycles(); 77 rec.u.timestamp.timestamp = ktime_to_ns(ktime_get());
76 78
77 for (i = 0; i < rec.extra_u32; i++) 79 for (i = 0; i < extra; i++)
78 rec.u.cycle.extra_u32[i] = va_arg(*args, u32); 80 rec.u.timestamp.extra_u32[i] = va_arg(*args, u32);
79 } else { 81 } else {
80 for (i = 0; i < rec.extra_u32; i++) 82 for (i = 0; i < extra; i++)
81 rec.u.nocycle.extra_u32[i] = va_arg(*args, u32); 83 rec.u.notimestamp.extra_u32[i] = va_arg(*args, u32);
82 } 84 }
83 85
84 size = calc_rec_size(rec.cycle_in, rec.extra_u32 * sizeof(u32)); 86 size = calc_rec_size(p->timestamp_in, extra * sizeof(u32));
85 relay_write(kt->rchan, &rec, size); 87 relay_write(kt->rchan, &rec, size);
86} 88}
87 89
diff --git a/virt/kvm/vtd.c b/virt/kvm/vtd.c
new file mode 100644
index 000000000000..a770874f3a3a
--- /dev/null
+++ b/virt/kvm/vtd.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Copyright IBM Corporation, 2008
19 * Author: Allen M. Kay <allen.m.kay@intel.com>
20 * Author: Weidong Han <weidong.han@intel.com>
21 * Author: Ben-Ami Yassour <benami@il.ibm.com>
22 */
23
24#include <linux/list.h>
25#include <linux/kvm_host.h>
26#include <linux/pci.h>
27#include <linux/dmar.h>
28#include <linux/intel-iommu.h>
29
30static int kvm_iommu_unmap_memslots(struct kvm *kvm);
31static void kvm_iommu_put_pages(struct kvm *kvm,
32 gfn_t base_gfn, unsigned long npages);
33
34int kvm_iommu_map_pages(struct kvm *kvm,
35 gfn_t base_gfn, unsigned long npages)
36{
37 gfn_t gfn = base_gfn;
38 pfn_t pfn;
39 int i, r = 0;
40 struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
41
42 /* check if iommu exists and in use */
43 if (!domain)
44 return 0;
45
46 for (i = 0; i < npages; i++) {
47 /* check if already mapped */
48 pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
49 gfn_to_gpa(gfn));
50 if (pfn)
51 continue;
52
53 pfn = gfn_to_pfn(kvm, gfn);
54 r = intel_iommu_page_mapping(domain,
55 gfn_to_gpa(gfn),
56 pfn_to_hpa(pfn),
57 PAGE_SIZE,
58 DMA_PTE_READ |
59 DMA_PTE_WRITE);
60 if (r) {
61 printk(KERN_ERR "kvm_iommu_map_pages:"
62 "iommu failed to map pfn=%lx\n", pfn);
63 goto unmap_pages;
64 }
65 gfn++;
66 }
67 return 0;
68
69unmap_pages:
70 kvm_iommu_put_pages(kvm, base_gfn, i);
71 return r;
72}
73
74static int kvm_iommu_map_memslots(struct kvm *kvm)
75{
76 int i, r;
77
78 down_read(&kvm->slots_lock);
79 for (i = 0; i < kvm->nmemslots; i++) {
80 r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn,
81 kvm->memslots[i].npages);
82 if (r)
83 break;
84 }
85 up_read(&kvm->slots_lock);
86 return r;
87}
88
89int kvm_iommu_map_guest(struct kvm *kvm,
90 struct kvm_assigned_dev_kernel *assigned_dev)
91{
92 struct pci_dev *pdev = NULL;
93 int r;
94
95 if (!intel_iommu_found()) {
96 printk(KERN_ERR "%s: intel iommu not found\n", __func__);
97 return -ENODEV;
98 }
99
100 printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n",
101 assigned_dev->host_busnr,
102 PCI_SLOT(assigned_dev->host_devfn),
103 PCI_FUNC(assigned_dev->host_devfn));
104
105 pdev = assigned_dev->dev;
106
107 if (pdev == NULL) {
108 if (kvm->arch.intel_iommu_domain) {
109 intel_iommu_domain_exit(kvm->arch.intel_iommu_domain);
110 kvm->arch.intel_iommu_domain = NULL;
111 }
112 return -ENODEV;
113 }
114
115 kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev);
116 if (!kvm->arch.intel_iommu_domain)
117 return -ENODEV;
118
119 r = kvm_iommu_map_memslots(kvm);
120 if (r)
121 goto out_unmap;
122
123 intel_iommu_detach_dev(kvm->arch.intel_iommu_domain,
124 pdev->bus->number, pdev->devfn);
125
126 r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain,
127 pdev);
128 if (r) {
129 printk(KERN_ERR "Domain context map for %s failed",
130 pci_name(pdev));
131 goto out_unmap;
132 }
133 return 0;
134
135out_unmap:
136 kvm_iommu_unmap_memslots(kvm);
137 return r;
138}
139
140static void kvm_iommu_put_pages(struct kvm *kvm,
141 gfn_t base_gfn, unsigned long npages)
142{
143 gfn_t gfn = base_gfn;
144 pfn_t pfn;
145 struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
146 int i;
147
148 for (i = 0; i < npages; i++) {
149 pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
150 gfn_to_gpa(gfn));
151 kvm_release_pfn_clean(pfn);
152 gfn++;
153 }
154}
155
156static int kvm_iommu_unmap_memslots(struct kvm *kvm)
157{
158 int i;
159 down_read(&kvm->slots_lock);
160 for (i = 0; i < kvm->nmemslots; i++) {
161 kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn,
162 kvm->memslots[i].npages);
163 }
164 up_read(&kvm->slots_lock);
165
166 return 0;
167}
168
169int kvm_iommu_unmap_guest(struct kvm *kvm)
170{
171 struct kvm_assigned_dev_kernel *entry;
172 struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
173
174 /* check if iommu exists and in use */
175 if (!domain)
176 return 0;
177
178 list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) {
179 printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n",
180 entry->host_busnr,
181 PCI_SLOT(entry->host_devfn),
182 PCI_FUNC(entry->host_devfn));
183
184 /* detach kvm dmar domain */
185 intel_iommu_detach_dev(domain, entry->host_busnr,
186 entry->host_devfn);
187 }
188 kvm_iommu_unmap_memslots(kvm);
189 intel_iommu_domain_exit(domain);
190 return 0;
191}