aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'drivers')
-rw-r--r-- drivers/kvm/Kconfig 9
-rw-r--r-- drivers/kvm/kvm.h 116
-rw-r--r-- drivers/kvm/kvm_main.c 456
-rw-r--r-- drivers/kvm/mmu.c 292
-rw-r--r-- drivers/kvm/paging_tmpl.h 273
-rw-r--r-- drivers/kvm/svm.c 59
-rw-r--r-- drivers/kvm/svm.h 3
-rw-r--r-- drivers/kvm/vmx.c 652
-rw-r--r-- drivers/kvm/x86_emulate.c 44
9 files changed, 1154 insertions, 750 deletions
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
index e8e37d826478..33fa28a8c199 100644
--- a/drivers/kvm/Kconfig
+++ b/drivers/kvm/Kconfig
@@ -1,12 +1,17 @@
1# 1#
2# KVM configuration 2# KVM configuration
3# 3#
4menu "Virtualization" 4menuconfig VIRTUALIZATION
5 bool "Virtualization"
5 depends on X86 6 depends on X86
7 default y
8
9if VIRTUALIZATION
6 10
7config KVM 11config KVM
8 tristate "Kernel-based Virtual Machine (KVM) support" 12 tristate "Kernel-based Virtual Machine (KVM) support"
9 depends on X86 && EXPERIMENTAL 13 depends on X86 && EXPERIMENTAL
14 depends on X86_CMPXCHG64 || 64BIT
10 ---help--- 15 ---help---
11 Support hosting fully virtualized guest machines using hardware 16 Support hosting fully virtualized guest machines using hardware
12 virtualization extensions. You will need a fairly recent 17 virtualization extensions. You will need a fairly recent
@@ -35,4 +40,4 @@ config KVM_AMD
35 Provides support for KVM on AMD processors equipped with the AMD-V 40 Provides support for KVM on AMD processors equipped with the AMD-V
36 (SVM) extensions. 41 (SVM) extensions.
37 42
38endmenu 43endif # VIRTUALIZATION
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 152312c1fafa..a7c5e6bee034 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -10,6 +10,8 @@
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/mutex.h> 11#include <linux/mutex.h>
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/signal.h>
14#include <linux/sched.h>
13#include <linux/mm.h> 15#include <linux/mm.h>
14#include <asm/signal.h> 16#include <asm/signal.h>
15 17
@@ -18,6 +20,7 @@
18#include <linux/kvm_para.h> 20#include <linux/kvm_para.h>
19 21
20#define CR0_PE_MASK (1ULL << 0) 22#define CR0_PE_MASK (1ULL << 0)
23#define CR0_MP_MASK (1ULL << 1)
21#define CR0_TS_MASK (1ULL << 3) 24#define CR0_TS_MASK (1ULL << 3)
22#define CR0_NE_MASK (1ULL << 5) 25#define CR0_NE_MASK (1ULL << 5)
23#define CR0_WP_MASK (1ULL << 16) 26#define CR0_WP_MASK (1ULL << 16)
@@ -42,7 +45,8 @@
42 (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \ 45 (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \
43 | CR0_NW_MASK | CR0_CD_MASK) 46 | CR0_NW_MASK | CR0_CD_MASK)
44#define KVM_VM_CR0_ALWAYS_ON \ 47#define KVM_VM_CR0_ALWAYS_ON \
45 (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK) 48 (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK \
49 | CR0_MP_MASK)
46#define KVM_GUEST_CR4_MASK \ 50#define KVM_GUEST_CR4_MASK \
47 (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK) 51 (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK)
48#define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK) 52#define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK)
@@ -51,10 +55,10 @@
51#define INVALID_PAGE (~(hpa_t)0) 55#define INVALID_PAGE (~(hpa_t)0)
52#define UNMAPPED_GVA (~(gpa_t)0) 56#define UNMAPPED_GVA (~(gpa_t)0)
53 57
54#define KVM_MAX_VCPUS 1 58#define KVM_MAX_VCPUS 4
55#define KVM_ALIAS_SLOTS 4 59#define KVM_ALIAS_SLOTS 4
56#define KVM_MEMORY_SLOTS 4 60#define KVM_MEMORY_SLOTS 4
57#define KVM_NUM_MMU_PAGES 256 61#define KVM_NUM_MMU_PAGES 1024
58#define KVM_MIN_FREE_MMU_PAGES 5 62#define KVM_MIN_FREE_MMU_PAGES 5
59#define KVM_REFILL_PAGES 25 63#define KVM_REFILL_PAGES 25
60#define KVM_MAX_CPUID_ENTRIES 40 64#define KVM_MAX_CPUID_ENTRIES 40
@@ -80,6 +84,11 @@
80#define KVM_PIO_PAGE_OFFSET 1 84#define KVM_PIO_PAGE_OFFSET 1
81 85
82/* 86/*
87 * vcpu->requests bit members
88 */
89#define KVM_TLB_FLUSH 0
90
91/*
83 * Address types: 92 * Address types:
84 * 93 *
85 * gva - guest virtual address 94 * gva - guest virtual address
@@ -137,7 +146,7 @@ struct kvm_mmu_page {
137 gfn_t gfn; 146 gfn_t gfn;
138 union kvm_mmu_page_role role; 147 union kvm_mmu_page_role role;
139 148
140 hpa_t page_hpa; 149 u64 *spt;
141 unsigned long slot_bitmap; /* One bit set per slot which has memory 150 unsigned long slot_bitmap; /* One bit set per slot which has memory
142 * in this shadow page. 151 * in this shadow page.
143 */ 152 */
@@ -232,6 +241,7 @@ struct kvm_pio_request {
232 struct page *guest_pages[2]; 241 struct page *guest_pages[2];
233 unsigned guest_page_offset; 242 unsigned guest_page_offset;
234 int in; 243 int in;
244 int port;
235 int size; 245 int size;
236 int string; 246 int string;
237 int down; 247 int down;
@@ -252,8 +262,70 @@ struct kvm_stat {
252 u32 halt_exits; 262 u32 halt_exits;
253 u32 request_irq_exits; 263 u32 request_irq_exits;
254 u32 irq_exits; 264 u32 irq_exits;
265 u32 light_exits;
266 u32 efer_reload;
267};
268
269struct kvm_io_device {
270 void (*read)(struct kvm_io_device *this,
271 gpa_t addr,
272 int len,
273 void *val);
274 void (*write)(struct kvm_io_device *this,
275 gpa_t addr,
276 int len,
277 const void *val);
278 int (*in_range)(struct kvm_io_device *this, gpa_t addr);
279 void (*destructor)(struct kvm_io_device *this);
280
281 void *private;
282};
283
284static inline void kvm_iodevice_read(struct kvm_io_device *dev,
285 gpa_t addr,
286 int len,
287 void *val)
288{
289 dev->read(dev, addr, len, val);
290}
291
292static inline void kvm_iodevice_write(struct kvm_io_device *dev,
293 gpa_t addr,
294 int len,
295 const void *val)
296{
297 dev->write(dev, addr, len, val);
298}
299
300static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
301{
302 return dev->in_range(dev, addr);
303}
304
305static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
306{
307 if (dev->destructor)
308 dev->destructor(dev);
309}
310
/*
 * It would be nice to use something smarter than a linear search, TBD...
 * Thankfully we don't expect many devices to register (famous last words :),
 * so until then it will suffice.  At least it's abstracted so we can change
 * it in one place.
 */
#define NR_IOBUS_DEVS 6

/* A fixed-capacity table of registered I/O devices. */
struct kvm_io_bus {
	int dev_count;
	struct kvm_io_device *devs[NR_IOBUS_DEVS];
};
256 322
323void kvm_io_bus_init(struct kvm_io_bus *bus);
324void kvm_io_bus_destroy(struct kvm_io_bus *bus);
325struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
326void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
327 struct kvm_io_device *dev);
328
257struct kvm_vcpu { 329struct kvm_vcpu {
258 struct kvm *kvm; 330 struct kvm *kvm;
259 union { 331 union {
@@ -266,6 +338,8 @@ struct kvm_vcpu {
266 u64 host_tsc; 338 u64 host_tsc;
267 struct kvm_run *run; 339 struct kvm_run *run;
268 int interrupt_window_open; 340 int interrupt_window_open;
341 int guest_mode;
342 unsigned long requests;
269 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ 343 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
270#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) 344#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
271 unsigned long irq_pending[NR_IRQ_WORDS]; 345 unsigned long irq_pending[NR_IRQ_WORDS];
@@ -285,15 +359,20 @@ struct kvm_vcpu {
285 u64 apic_base; 359 u64 apic_base;
286 u64 ia32_misc_enable_msr; 360 u64 ia32_misc_enable_msr;
287 int nmsrs; 361 int nmsrs;
362 int save_nmsrs;
363 int msr_offset_efer;
364#ifdef CONFIG_X86_64
365 int msr_offset_kernel_gs_base;
366#endif
288 struct vmx_msr_entry *guest_msrs; 367 struct vmx_msr_entry *guest_msrs;
289 struct vmx_msr_entry *host_msrs; 368 struct vmx_msr_entry *host_msrs;
290 369
291 struct list_head free_pages;
292 struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES];
293 struct kvm_mmu mmu; 370 struct kvm_mmu mmu;
294 371
295 struct kvm_mmu_memory_cache mmu_pte_chain_cache; 372 struct kvm_mmu_memory_cache mmu_pte_chain_cache;
296 struct kvm_mmu_memory_cache mmu_rmap_desc_cache; 373 struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
374 struct kvm_mmu_memory_cache mmu_page_cache;
375 struct kvm_mmu_memory_cache mmu_page_header_cache;
297 376
298 gfn_t last_pt_write_gfn; 377 gfn_t last_pt_write_gfn;
299 int last_pt_write_count; 378 int last_pt_write_count;
@@ -305,6 +384,11 @@ struct kvm_vcpu {
305 char *guest_fx_image; 384 char *guest_fx_image;
306 int fpu_active; 385 int fpu_active;
307 int guest_fpu_loaded; 386 int guest_fpu_loaded;
387 struct vmx_host_state {
388 int loaded;
389 u16 fs_sel, gs_sel, ldt_sel;
390 int fs_gs_ldt_reload_needed;
391 } vmx_host_state;
308 392
309 int mmio_needed; 393 int mmio_needed;
310 int mmio_read_completed; 394 int mmio_read_completed;
@@ -331,6 +415,7 @@ struct kvm_vcpu {
331 u32 ar; 415 u32 ar;
332 } tr, es, ds, fs, gs; 416 } tr, es, ds, fs, gs;
333 } rmode; 417 } rmode;
418 int halt_request; /* real mode on Intel only */
334 419
335 int cpuid_nent; 420 int cpuid_nent;
336 struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; 421 struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
@@ -362,12 +447,15 @@ struct kvm {
362 struct list_head active_mmu_pages; 447 struct list_head active_mmu_pages;
363 int n_free_mmu_pages; 448 int n_free_mmu_pages;
364 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 449 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
450 int nvcpus;
365 struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; 451 struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
366 int memory_config_version; 452 int memory_config_version;
367 int busy; 453 int busy;
368 unsigned long rmap_overflow; 454 unsigned long rmap_overflow;
369 struct list_head vm_list; 455 struct list_head vm_list;
370 struct file *filp; 456 struct file *filp;
457 struct kvm_io_bus mmio_bus;
458 struct kvm_io_bus pio_bus;
371}; 459};
372 460
373struct descriptor_table { 461struct descriptor_table {
@@ -488,6 +576,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
488 int size, unsigned long count, int string, int down, 576 int size, unsigned long count, int string, int down,
489 gva_t address, int rep, unsigned port); 577 gva_t address, int rep, unsigned port);
490void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 578void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
579int kvm_emulate_halt(struct kvm_vcpu *vcpu);
491int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 580int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
492int emulate_clts(struct kvm_vcpu *vcpu); 581int emulate_clts(struct kvm_vcpu *vcpu);
493int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, 582int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr,
@@ -511,6 +600,7 @@ void save_msrs(struct vmx_msr_entry *e, int n);
511void kvm_resched(struct kvm_vcpu *vcpu); 600void kvm_resched(struct kvm_vcpu *vcpu);
512void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); 601void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
513void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); 602void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
603void kvm_flush_remote_tlbs(struct kvm *kvm);
514 604
515int kvm_read_guest(struct kvm_vcpu *vcpu, 605int kvm_read_guest(struct kvm_vcpu *vcpu,
516 gva_t addr, 606 gva_t addr,
@@ -524,10 +614,12 @@ int kvm_write_guest(struct kvm_vcpu *vcpu,
524 614
525unsigned long segment_base(u16 selector); 615unsigned long segment_base(u16 selector);
526 616
527void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); 617void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
528void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); 618 const u8 *old, const u8 *new, int bytes);
529int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); 619int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
530void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); 620void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
621int kvm_mmu_load(struct kvm_vcpu *vcpu);
622void kvm_mmu_unload(struct kvm_vcpu *vcpu);
531 623
532int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); 624int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
533 625
@@ -539,6 +631,14 @@ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
539 return vcpu->mmu.page_fault(vcpu, gva, error_code); 631 return vcpu->mmu.page_fault(vcpu, gva, error_code);
540} 632}
541 633
634static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
635{
636 if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
637 return 0;
638
639 return kvm_mmu_load(vcpu);
640}
641
542static inline int is_long_mode(struct kvm_vcpu *vcpu) 642static inline int is_long_mode(struct kvm_vcpu *vcpu)
543{ 643{
544#ifdef CONFIG_X86_64 644#ifdef CONFIG_X86_64
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 8f1f07adb04e..1b206f197c6b 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -16,34 +16,33 @@
16 */ 16 */
17 17
18#include "kvm.h" 18#include "kvm.h"
19#include "x86_emulate.h"
20#include "segment_descriptor.h"
19 21
20#include <linux/kvm.h> 22#include <linux/kvm.h>
21#include <linux/module.h> 23#include <linux/module.h>
22#include <linux/errno.h> 24#include <linux/errno.h>
23#include <linux/magic.h>
24#include <asm/processor.h>
25#include <linux/percpu.h> 25#include <linux/percpu.h>
26#include <linux/gfp.h> 26#include <linux/gfp.h>
27#include <asm/msr.h>
28#include <linux/mm.h> 27#include <linux/mm.h>
29#include <linux/miscdevice.h> 28#include <linux/miscdevice.h>
30#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
31#include <asm/uaccess.h>
32#include <linux/reboot.h> 30#include <linux/reboot.h>
33#include <asm/io.h>
34#include <linux/debugfs.h> 31#include <linux/debugfs.h>
35#include <linux/highmem.h> 32#include <linux/highmem.h>
36#include <linux/file.h> 33#include <linux/file.h>
37#include <asm/desc.h>
38#include <linux/sysdev.h> 34#include <linux/sysdev.h>
39#include <linux/cpu.h> 35#include <linux/cpu.h>
40#include <linux/file.h>
41#include <linux/fs.h>
42#include <linux/mount.h>
43#include <linux/sched.h> 36#include <linux/sched.h>
37#include <linux/cpumask.h>
38#include <linux/smp.h>
39#include <linux/anon_inodes.h>
44 40
45#include "x86_emulate.h" 41#include <asm/processor.h>
46#include "segment_descriptor.h" 42#include <asm/msr.h>
43#include <asm/io.h>
44#include <asm/uaccess.h>
45#include <asm/desc.h>
47 46
48MODULE_AUTHOR("Qumranet"); 47MODULE_AUTHOR("Qumranet");
49MODULE_LICENSE("GPL"); 48MODULE_LICENSE("GPL");
@@ -51,8 +50,12 @@ MODULE_LICENSE("GPL");
51static DEFINE_SPINLOCK(kvm_lock); 50static DEFINE_SPINLOCK(kvm_lock);
52static LIST_HEAD(vm_list); 51static LIST_HEAD(vm_list);
53 52
53static cpumask_t cpus_hardware_enabled;
54
54struct kvm_arch_ops *kvm_arch_ops; 55struct kvm_arch_ops *kvm_arch_ops;
55 56
57static void hardware_disable(void *ignored);
58
56#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) 59#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
57 60
58static struct kvm_stats_debugfs_item { 61static struct kvm_stats_debugfs_item {
@@ -72,13 +75,13 @@ static struct kvm_stats_debugfs_item {
72 { "halt_exits", STAT_OFFSET(halt_exits) }, 75 { "halt_exits", STAT_OFFSET(halt_exits) },
73 { "request_irq", STAT_OFFSET(request_irq_exits) }, 76 { "request_irq", STAT_OFFSET(request_irq_exits) },
74 { "irq_exits", STAT_OFFSET(irq_exits) }, 77 { "irq_exits", STAT_OFFSET(irq_exits) },
78 { "light_exits", STAT_OFFSET(light_exits) },
79 { "efer_reload", STAT_OFFSET(efer_reload) },
75 { NULL } 80 { NULL }
76}; 81};
77 82
78static struct dentry *debugfs_dir; 83static struct dentry *debugfs_dir;
79 84
80struct vfsmount *kvmfs_mnt;
81
82#define MAX_IO_MSRS 256 85#define MAX_IO_MSRS 256
83 86
84#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL 87#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
@@ -100,55 +103,6 @@ struct segment_descriptor_64 {
100static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 103static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
101 unsigned long arg); 104 unsigned long arg);
102 105
103static struct inode *kvmfs_inode(struct file_operations *fops)
104{
105 int error = -ENOMEM;
106 struct inode *inode = new_inode(kvmfs_mnt->mnt_sb);
107
108 if (!inode)
109 goto eexit_1;
110
111 inode->i_fop = fops;
112
113 /*
114 * Mark the inode dirty from the very beginning,
115 * that way it will never be moved to the dirty
116 * list because mark_inode_dirty() will think
117 * that it already _is_ on the dirty list.
118 */
119 inode->i_state = I_DIRTY;
120 inode->i_mode = S_IRUSR | S_IWUSR;
121 inode->i_uid = current->fsuid;
122 inode->i_gid = current->fsgid;
123 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
124 return inode;
125
126eexit_1:
127 return ERR_PTR(error);
128}
129
130static struct file *kvmfs_file(struct inode *inode, void *private_data)
131{
132 struct file *file = get_empty_filp();
133
134 if (!file)
135 return ERR_PTR(-ENFILE);
136
137 file->f_path.mnt = mntget(kvmfs_mnt);
138 file->f_path.dentry = d_alloc_anon(inode);
139 if (!file->f_path.dentry)
140 return ERR_PTR(-ENOMEM);
141 file->f_mapping = inode->i_mapping;
142
143 file->f_pos = 0;
144 file->f_flags = O_RDWR;
145 file->f_op = inode->i_fop;
146 file->f_mode = FMODE_READ | FMODE_WRITE;
147 file->f_version = 0;
148 file->private_data = private_data;
149 return file;
150}
151
152unsigned long segment_base(u16 selector) 106unsigned long segment_base(u16 selector)
153{ 107{
154 struct descriptor_table gdt; 108 struct descriptor_table gdt;
@@ -307,6 +261,48 @@ static void vcpu_put(struct kvm_vcpu *vcpu)
307 mutex_unlock(&vcpu->mutex); 261 mutex_unlock(&vcpu->mutex);
308} 262}
309 263
264static void ack_flush(void *_completed)
265{
266 atomic_t *completed = _completed;
267
268 atomic_inc(completed);
269}
270
271void kvm_flush_remote_tlbs(struct kvm *kvm)
272{
273 int i, cpu, needed;
274 cpumask_t cpus;
275 struct kvm_vcpu *vcpu;
276 atomic_t completed;
277
278 atomic_set(&completed, 0);
279 cpus_clear(cpus);
280 needed = 0;
281 for (i = 0; i < kvm->nvcpus; ++i) {
282 vcpu = &kvm->vcpus[i];
283 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
284 continue;
285 cpu = vcpu->cpu;
286 if (cpu != -1 && cpu != raw_smp_processor_id())
287 if (!cpu_isset(cpu, cpus)) {
288 cpu_set(cpu, cpus);
289 ++needed;
290 }
291 }
292
293 /*
294 * We really want smp_call_function_mask() here. But that's not
295 * available, so ipi all cpus in parallel and wait for them
296 * to complete.
297 */
298 for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
299 smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
300 while (atomic_read(&completed) != needed) {
301 cpu_relax();
302 barrier();
303 }
304}
305
310static struct kvm *kvm_create_vm(void) 306static struct kvm *kvm_create_vm(void)
311{ 307{
312 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); 308 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
@@ -315,8 +311,13 @@ static struct kvm *kvm_create_vm(void)
315 if (!kvm) 311 if (!kvm)
316 return ERR_PTR(-ENOMEM); 312 return ERR_PTR(-ENOMEM);
317 313
314 kvm_io_bus_init(&kvm->pio_bus);
318 spin_lock_init(&kvm->lock); 315 spin_lock_init(&kvm->lock);
319 INIT_LIST_HEAD(&kvm->active_mmu_pages); 316 INIT_LIST_HEAD(&kvm->active_mmu_pages);
317 spin_lock(&kvm_lock);
318 list_add(&kvm->vm_list, &vm_list);
319 spin_unlock(&kvm_lock);
320 kvm_io_bus_init(&kvm->mmio_bus);
320 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 321 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
321 struct kvm_vcpu *vcpu = &kvm->vcpus[i]; 322 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
322 323
@@ -324,10 +325,6 @@ static struct kvm *kvm_create_vm(void)
324 vcpu->cpu = -1; 325 vcpu->cpu = -1;
325 vcpu->kvm = kvm; 326 vcpu->kvm = kvm;
326 vcpu->mmu.root_hpa = INVALID_PAGE; 327 vcpu->mmu.root_hpa = INVALID_PAGE;
327 INIT_LIST_HEAD(&vcpu->free_pages);
328 spin_lock(&kvm_lock);
329 list_add(&kvm->vm_list, &vm_list);
330 spin_unlock(&kvm_lock);
331 } 328 }
332 return kvm; 329 return kvm;
333} 330}
@@ -380,6 +377,16 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
380 } 377 }
381} 378}
382 379
380static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
381{
382 if (!vcpu->vmcs)
383 return;
384
385 vcpu_load(vcpu);
386 kvm_mmu_unload(vcpu);
387 vcpu_put(vcpu);
388}
389
383static void kvm_free_vcpu(struct kvm_vcpu *vcpu) 390static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
384{ 391{
385 if (!vcpu->vmcs) 392 if (!vcpu->vmcs)
@@ -400,6 +407,11 @@ static void kvm_free_vcpus(struct kvm *kvm)
400{ 407{
401 unsigned int i; 408 unsigned int i;
402 409
410 /*
411 * Unpin any mmu pages first.
412 */
413 for (i = 0; i < KVM_MAX_VCPUS; ++i)
414 kvm_unload_vcpu_mmu(&kvm->vcpus[i]);
403 for (i = 0; i < KVM_MAX_VCPUS; ++i) 415 for (i = 0; i < KVM_MAX_VCPUS; ++i)
404 kvm_free_vcpu(&kvm->vcpus[i]); 416 kvm_free_vcpu(&kvm->vcpus[i]);
405} 417}
@@ -414,6 +426,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
414 spin_lock(&kvm_lock); 426 spin_lock(&kvm_lock);
415 list_del(&kvm->vm_list); 427 list_del(&kvm->vm_list);
416 spin_unlock(&kvm_lock); 428 spin_unlock(&kvm_lock);
429 kvm_io_bus_destroy(&kvm->pio_bus);
430 kvm_io_bus_destroy(&kvm->mmio_bus);
417 kvm_free_vcpus(kvm); 431 kvm_free_vcpus(kvm);
418 kvm_free_physmem(kvm); 432 kvm_free_physmem(kvm);
419 kfree(kvm); 433 kfree(kvm);
@@ -969,7 +983,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page);
969void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 983void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
970{ 984{
971 int i; 985 int i;
972 struct kvm_memory_slot *memslot = NULL; 986 struct kvm_memory_slot *memslot;
973 unsigned long rel_gfn; 987 unsigned long rel_gfn;
974 988
975 for (i = 0; i < kvm->nmemslots; ++i) { 989 for (i = 0; i < kvm->nmemslots; ++i) {
@@ -978,7 +992,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
978 if (gfn >= memslot->base_gfn 992 if (gfn >= memslot->base_gfn
979 && gfn < memslot->base_gfn + memslot->npages) { 993 && gfn < memslot->base_gfn + memslot->npages) {
980 994
981 if (!memslot || !memslot->dirty_bitmap) 995 if (!memslot->dirty_bitmap)
982 return; 996 return;
983 997
984 rel_gfn = gfn - memslot->base_gfn; 998 rel_gfn = gfn - memslot->base_gfn;
@@ -1037,12 +1051,31 @@ static int emulator_write_std(unsigned long addr,
1037 return X86EMUL_UNHANDLEABLE; 1051 return X86EMUL_UNHANDLEABLE;
1038} 1052}
1039 1053
1054static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1055 gpa_t addr)
1056{
1057 /*
1058 * Note that its important to have this wrapper function because
1059 * in the very near future we will be checking for MMIOs against
1060 * the LAPIC as well as the general MMIO bus
1061 */
1062 return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1063}
1064
1065static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1066 gpa_t addr)
1067{
1068 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1069}
1070
1040static int emulator_read_emulated(unsigned long addr, 1071static int emulator_read_emulated(unsigned long addr,
1041 void *val, 1072 void *val,
1042 unsigned int bytes, 1073 unsigned int bytes,
1043 struct x86_emulate_ctxt *ctxt) 1074 struct x86_emulate_ctxt *ctxt)
1044{ 1075{
1045 struct kvm_vcpu *vcpu = ctxt->vcpu; 1076 struct kvm_vcpu *vcpu = ctxt->vcpu;
1077 struct kvm_io_device *mmio_dev;
1078 gpa_t gpa;
1046 1079
1047 if (vcpu->mmio_read_completed) { 1080 if (vcpu->mmio_read_completed) {
1048 memcpy(val, vcpu->mmio_data, bytes); 1081 memcpy(val, vcpu->mmio_data, bytes);
@@ -1051,18 +1084,26 @@ static int emulator_read_emulated(unsigned long addr,
1051 } else if (emulator_read_std(addr, val, bytes, ctxt) 1084 } else if (emulator_read_std(addr, val, bytes, ctxt)
1052 == X86EMUL_CONTINUE) 1085 == X86EMUL_CONTINUE)
1053 return X86EMUL_CONTINUE; 1086 return X86EMUL_CONTINUE;
1054 else {
1055 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1056 1087
1057 if (gpa == UNMAPPED_GVA) 1088 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1058 return X86EMUL_PROPAGATE_FAULT; 1089 if (gpa == UNMAPPED_GVA)
1059 vcpu->mmio_needed = 1; 1090 return X86EMUL_PROPAGATE_FAULT;
1060 vcpu->mmio_phys_addr = gpa;
1061 vcpu->mmio_size = bytes;
1062 vcpu->mmio_is_write = 0;
1063 1091
1064 return X86EMUL_UNHANDLEABLE; 1092 /*
1093 * Is this MMIO handled locally?
1094 */
1095 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1096 if (mmio_dev) {
1097 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1098 return X86EMUL_CONTINUE;
1065 } 1099 }
1100
1101 vcpu->mmio_needed = 1;
1102 vcpu->mmio_phys_addr = gpa;
1103 vcpu->mmio_size = bytes;
1104 vcpu->mmio_is_write = 0;
1105
1106 return X86EMUL_UNHANDLEABLE;
1066} 1107}
1067 1108
1068static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 1109static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1070,18 +1111,20 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1070{ 1111{
1071 struct page *page; 1112 struct page *page;
1072 void *virt; 1113 void *virt;
1114 unsigned offset = offset_in_page(gpa);
1073 1115
1074 if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) 1116 if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
1075 return 0; 1117 return 0;
1076 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 1118 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1077 if (!page) 1119 if (!page)
1078 return 0; 1120 return 0;
1079 kvm_mmu_pre_write(vcpu, gpa, bytes);
1080 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); 1121 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1081 virt = kmap_atomic(page, KM_USER0); 1122 virt = kmap_atomic(page, KM_USER0);
1082 memcpy(virt + offset_in_page(gpa), val, bytes); 1123 if (memcmp(virt + offset_in_page(gpa), val, bytes)) {
1124 kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes);
1125 memcpy(virt + offset_in_page(gpa), val, bytes);
1126 }
1083 kunmap_atomic(virt, KM_USER0); 1127 kunmap_atomic(virt, KM_USER0);
1084 kvm_mmu_post_write(vcpu, gpa, bytes);
1085 return 1; 1128 return 1;
1086} 1129}
1087 1130
@@ -1090,8 +1133,9 @@ static int emulator_write_emulated(unsigned long addr,
1090 unsigned int bytes, 1133 unsigned int bytes,
1091 struct x86_emulate_ctxt *ctxt) 1134 struct x86_emulate_ctxt *ctxt)
1092{ 1135{
1093 struct kvm_vcpu *vcpu = ctxt->vcpu; 1136 struct kvm_vcpu *vcpu = ctxt->vcpu;
1094 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1137 struct kvm_io_device *mmio_dev;
1138 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1095 1139
1096 if (gpa == UNMAPPED_GVA) { 1140 if (gpa == UNMAPPED_GVA) {
1097 kvm_arch_ops->inject_page_fault(vcpu, addr, 2); 1141 kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
@@ -1101,6 +1145,15 @@ static int emulator_write_emulated(unsigned long addr,
1101 if (emulator_write_phys(vcpu, gpa, val, bytes)) 1145 if (emulator_write_phys(vcpu, gpa, val, bytes))
1102 return X86EMUL_CONTINUE; 1146 return X86EMUL_CONTINUE;
1103 1147
1148 /*
1149 * Is this MMIO handled locally?
1150 */
1151 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1152 if (mmio_dev) {
1153 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1154 return X86EMUL_CONTINUE;
1155 }
1156
1104 vcpu->mmio_needed = 1; 1157 vcpu->mmio_needed = 1;
1105 vcpu->mmio_phys_addr = gpa; 1158 vcpu->mmio_phys_addr = gpa;
1106 vcpu->mmio_size = bytes; 1159 vcpu->mmio_size = bytes;
@@ -1269,6 +1322,17 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1269} 1322}
1270EXPORT_SYMBOL_GPL(emulate_instruction); 1323EXPORT_SYMBOL_GPL(emulate_instruction);
1271 1324
1325int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1326{
1327 if (vcpu->irq_summary)
1328 return 1;
1329
1330 vcpu->run->exit_reason = KVM_EXIT_HLT;
1331 ++vcpu->stat.halt_exits;
1332 return 0;
1333}
1334EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1335
1272int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) 1336int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1273{ 1337{
1274 unsigned long nr, a0, a1, a2, a3, a4, a5, ret; 1338 unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
@@ -1469,6 +1533,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1469 case MSR_IA32_MC0_MISC+16: 1533 case MSR_IA32_MC0_MISC+16:
1470 case MSR_IA32_UCODE_REV: 1534 case MSR_IA32_UCODE_REV:
1471 case MSR_IA32_PERF_STATUS: 1535 case MSR_IA32_PERF_STATUS:
1536 case MSR_IA32_EBL_CR_POWERON:
1472 /* MTRR registers */ 1537 /* MTRR registers */
1473 case 0xfe: 1538 case 0xfe:
1474 case 0x200 ... 0x2ff: 1539 case 0x200 ... 0x2ff:
@@ -1727,6 +1792,20 @@ static int complete_pio(struct kvm_vcpu *vcpu)
1727 return 0; 1792 return 0;
1728} 1793}
1729 1794
1795void kernel_pio(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu)
1796{
1797 /* TODO: String I/O for in kernel device */
1798
1799 if (vcpu->pio.in)
1800 kvm_iodevice_read(pio_dev, vcpu->pio.port,
1801 vcpu->pio.size,
1802 vcpu->pio_data);
1803 else
1804 kvm_iodevice_write(pio_dev, vcpu->pio.port,
1805 vcpu->pio.size,
1806 vcpu->pio_data);
1807}
1808
1730int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 1809int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1731 int size, unsigned long count, int string, int down, 1810 int size, unsigned long count, int string, int down,
1732 gva_t address, int rep, unsigned port) 1811 gva_t address, int rep, unsigned port)
@@ -1735,6 +1814,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1735 int i; 1814 int i;
1736 int nr_pages = 1; 1815 int nr_pages = 1;
1737 struct page *page; 1816 struct page *page;
1817 struct kvm_io_device *pio_dev;
1738 1818
1739 vcpu->run->exit_reason = KVM_EXIT_IO; 1819 vcpu->run->exit_reason = KVM_EXIT_IO;
1740 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 1820 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -1746,17 +1826,27 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1746 vcpu->pio.cur_count = count; 1826 vcpu->pio.cur_count = count;
1747 vcpu->pio.size = size; 1827 vcpu->pio.size = size;
1748 vcpu->pio.in = in; 1828 vcpu->pio.in = in;
1829 vcpu->pio.port = port;
1749 vcpu->pio.string = string; 1830 vcpu->pio.string = string;
1750 vcpu->pio.down = down; 1831 vcpu->pio.down = down;
1751 vcpu->pio.guest_page_offset = offset_in_page(address); 1832 vcpu->pio.guest_page_offset = offset_in_page(address);
1752 vcpu->pio.rep = rep; 1833 vcpu->pio.rep = rep;
1753 1834
1835 pio_dev = vcpu_find_pio_dev(vcpu, port);
1754 if (!string) { 1836 if (!string) {
1755 kvm_arch_ops->cache_regs(vcpu); 1837 kvm_arch_ops->cache_regs(vcpu);
1756 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); 1838 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1757 kvm_arch_ops->decache_regs(vcpu); 1839 kvm_arch_ops->decache_regs(vcpu);
1840 if (pio_dev) {
1841 kernel_pio(pio_dev, vcpu);
1842 complete_pio(vcpu);
1843 return 1;
1844 }
1758 return 0; 1845 return 0;
1759 } 1846 }
1847 /* TODO: String I/O for in kernel device */
1848 if (pio_dev)
1849 printk(KERN_ERR "kvm_setup_pio: no string io support\n");
1760 1850
1761 if (!count) { 1851 if (!count) {
1762 kvm_arch_ops->skip_emulated_instruction(vcpu); 1852 kvm_arch_ops->skip_emulated_instruction(vcpu);
@@ -2273,34 +2363,12 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2273 struct inode *inode; 2363 struct inode *inode;
2274 struct file *file; 2364 struct file *file;
2275 2365
2366 r = anon_inode_getfd(&fd, &inode, &file,
2367 "kvm-vcpu", &kvm_vcpu_fops, vcpu);
2368 if (r)
2369 return r;
2276 atomic_inc(&vcpu->kvm->filp->f_count); 2370 atomic_inc(&vcpu->kvm->filp->f_count);
2277 inode = kvmfs_inode(&kvm_vcpu_fops);
2278 if (IS_ERR(inode)) {
2279 r = PTR_ERR(inode);
2280 goto out1;
2281 }
2282
2283 file = kvmfs_file(inode, vcpu);
2284 if (IS_ERR(file)) {
2285 r = PTR_ERR(file);
2286 goto out2;
2287 }
2288
2289 r = get_unused_fd();
2290 if (r < 0)
2291 goto out3;
2292 fd = r;
2293 fd_install(fd, file);
2294
2295 return fd; 2371 return fd;
2296
2297out3:
2298 fput(file);
2299out2:
2300 iput(inode);
2301out1:
2302 fput(vcpu->kvm->filp);
2303 return r;
2304} 2372}
2305 2373
2306/* 2374/*
@@ -2363,6 +2431,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2363 if (r < 0) 2431 if (r < 0)
2364 goto out_free_vcpus; 2432 goto out_free_vcpus;
2365 2433
2434 spin_lock(&kvm_lock);
2435 if (n >= kvm->nvcpus)
2436 kvm->nvcpus = n + 1;
2437 spin_unlock(&kvm_lock);
2438
2366 return r; 2439 return r;
2367 2440
2368out_free_vcpus: 2441out_free_vcpus:
@@ -2376,6 +2449,27 @@ out:
2376 return r; 2449 return r;
2377} 2450}
2378 2451
2452static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2453{
2454 u64 efer;
2455 int i;
2456 struct kvm_cpuid_entry *e, *entry;
2457
2458 rdmsrl(MSR_EFER, efer);
2459 entry = NULL;
2460 for (i = 0; i < vcpu->cpuid_nent; ++i) {
2461 e = &vcpu->cpuid_entries[i];
2462 if (e->function == 0x80000001) {
2463 entry = e;
2464 break;
2465 }
2466 }
2467 if (entry && (entry->edx & EFER_NX) && !(efer & EFER_NX)) {
2468 entry->edx &= ~(1 << 20);
2469 printk(KERN_INFO ": guest NX capability removed\n");
2470 }
2471}
2472
2379static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 2473static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2380 struct kvm_cpuid *cpuid, 2474 struct kvm_cpuid *cpuid,
2381 struct kvm_cpuid_entry __user *entries) 2475 struct kvm_cpuid_entry __user *entries)
@@ -2390,6 +2484,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2390 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 2484 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2391 goto out; 2485 goto out;
2392 vcpu->cpuid_nent = cpuid->nent; 2486 vcpu->cpuid_nent = cpuid->nent;
2487 cpuid_fix_nx_cap(vcpu);
2393 return 0; 2488 return 0;
2394 2489
2395out: 2490out:
@@ -2738,41 +2833,18 @@ static int kvm_dev_ioctl_create_vm(void)
2738 struct file *file; 2833 struct file *file;
2739 struct kvm *kvm; 2834 struct kvm *kvm;
2740 2835
2741 inode = kvmfs_inode(&kvm_vm_fops);
2742 if (IS_ERR(inode)) {
2743 r = PTR_ERR(inode);
2744 goto out1;
2745 }
2746
2747 kvm = kvm_create_vm(); 2836 kvm = kvm_create_vm();
2748 if (IS_ERR(kvm)) { 2837 if (IS_ERR(kvm))
2749 r = PTR_ERR(kvm); 2838 return PTR_ERR(kvm);
2750 goto out2; 2839 r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
2840 if (r) {
2841 kvm_destroy_vm(kvm);
2842 return r;
2751 } 2843 }
2752 2844
2753 file = kvmfs_file(inode, kvm);
2754 if (IS_ERR(file)) {
2755 r = PTR_ERR(file);
2756 goto out3;
2757 }
2758 kvm->filp = file; 2845 kvm->filp = file;
2759 2846
2760 r = get_unused_fd();
2761 if (r < 0)
2762 goto out4;
2763 fd = r;
2764 fd_install(fd, file);
2765
2766 return fd; 2847 return fd;
2767
2768out4:
2769 fput(file);
2770out3:
2771 kvm_destroy_vm(kvm);
2772out2:
2773 iput(inode);
2774out1:
2775 return r;
2776} 2848}
2777 2849
2778static long kvm_dev_ioctl(struct file *filp, 2850static long kvm_dev_ioctl(struct file *filp,
@@ -2862,7 +2934,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2862 * in vmx root mode. 2934 * in vmx root mode.
2863 */ 2935 */
2864 printk(KERN_INFO "kvm: exiting hardware virtualization\n"); 2936 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2865 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); 2937 on_each_cpu(hardware_disable, NULL, 0, 1);
2866 } 2938 }
2867 return NOTIFY_OK; 2939 return NOTIFY_OK;
2868} 2940}
@@ -2905,33 +2977,88 @@ static void decache_vcpus_on_cpu(int cpu)
2905 spin_unlock(&kvm_lock); 2977 spin_unlock(&kvm_lock);
2906} 2978}
2907 2979
2980static void hardware_enable(void *junk)
2981{
2982 int cpu = raw_smp_processor_id();
2983
2984 if (cpu_isset(cpu, cpus_hardware_enabled))
2985 return;
2986 cpu_set(cpu, cpus_hardware_enabled);
2987 kvm_arch_ops->hardware_enable(NULL);
2988}
2989
2990static void hardware_disable(void *junk)
2991{
2992 int cpu = raw_smp_processor_id();
2993
2994 if (!cpu_isset(cpu, cpus_hardware_enabled))
2995 return;
2996 cpu_clear(cpu, cpus_hardware_enabled);
2997 decache_vcpus_on_cpu(cpu);
2998 kvm_arch_ops->hardware_disable(NULL);
2999}
3000
2908static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 3001static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2909 void *v) 3002 void *v)
2910{ 3003{
2911 int cpu = (long)v; 3004 int cpu = (long)v;
2912 3005
2913 switch (val) { 3006 switch (val) {
2914 case CPU_DOWN_PREPARE: 3007 case CPU_DYING:
2915 case CPU_DOWN_PREPARE_FROZEN: 3008 case CPU_DYING_FROZEN:
2916 case CPU_UP_CANCELED: 3009 case CPU_UP_CANCELED:
2917 case CPU_UP_CANCELED_FROZEN: 3010 case CPU_UP_CANCELED_FROZEN:
2918 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", 3011 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2919 cpu); 3012 cpu);
2920 decache_vcpus_on_cpu(cpu); 3013 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
2921 smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
2922 NULL, 0, 1);
2923 break; 3014 break;
2924 case CPU_ONLINE: 3015 case CPU_ONLINE:
2925 case CPU_ONLINE_FROZEN: 3016 case CPU_ONLINE_FROZEN:
2926 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", 3017 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2927 cpu); 3018 cpu);
2928 smp_call_function_single(cpu, kvm_arch_ops->hardware_enable, 3019 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
2929 NULL, 0, 1);
2930 break; 3020 break;
2931 } 3021 }
2932 return NOTIFY_OK; 3022 return NOTIFY_OK;
2933} 3023}
2934 3024
3025void kvm_io_bus_init(struct kvm_io_bus *bus)
3026{
3027 memset(bus, 0, sizeof(*bus));
3028}
3029
3030void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3031{
3032 int i;
3033
3034 for (i = 0; i < bus->dev_count; i++) {
3035 struct kvm_io_device *pos = bus->devs[i];
3036
3037 kvm_iodevice_destructor(pos);
3038 }
3039}
3040
3041struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
3042{
3043 int i;
3044
3045 for (i = 0; i < bus->dev_count; i++) {
3046 struct kvm_io_device *pos = bus->devs[i];
3047
3048 if (pos->in_range(pos, addr))
3049 return pos;
3050 }
3051
3052 return NULL;
3053}
3054
3055void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3056{
3057 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3058
3059 bus->devs[bus->dev_count++] = dev;
3060}
3061
2935static struct notifier_block kvm_cpu_notifier = { 3062static struct notifier_block kvm_cpu_notifier = {
2936 .notifier_call = kvm_cpu_hotplug, 3063 .notifier_call = kvm_cpu_hotplug,
2937 .priority = 20, /* must be > scheduler priority */ 3064 .priority = 20, /* must be > scheduler priority */
@@ -2983,14 +3110,13 @@ static void kvm_exit_debug(void)
2983 3110
2984static int kvm_suspend(struct sys_device *dev, pm_message_t state) 3111static int kvm_suspend(struct sys_device *dev, pm_message_t state)
2985{ 3112{
2986 decache_vcpus_on_cpu(raw_smp_processor_id()); 3113 hardware_disable(NULL);
2987 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2988 return 0; 3114 return 0;
2989} 3115}
2990 3116
2991static int kvm_resume(struct sys_device *dev) 3117static int kvm_resume(struct sys_device *dev)
2992{ 3118{
2993 on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1); 3119 hardware_enable(NULL);
2994 return 0; 3120 return 0;
2995} 3121}
2996 3122
@@ -3007,18 +3133,6 @@ static struct sys_device kvm_sysdev = {
3007 3133
3008hpa_t bad_page_address; 3134hpa_t bad_page_address;
3009 3135
3010static int kvmfs_get_sb(struct file_system_type *fs_type, int flags,
3011 const char *dev_name, void *data, struct vfsmount *mnt)
3012{
3013 return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt);
3014}
3015
3016static struct file_system_type kvm_fs_type = {
3017 .name = "kvmfs",
3018 .get_sb = kvmfs_get_sb,
3019 .kill_sb = kill_anon_super,
3020};
3021
3022int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) 3136int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
3023{ 3137{
3024 int r; 3138 int r;
@@ -3043,7 +3157,7 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
3043 if (r < 0) 3157 if (r < 0)
3044 goto out; 3158 goto out;
3045 3159
3046 on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1); 3160 on_each_cpu(hardware_enable, NULL, 0, 1);
3047 r = register_cpu_notifier(&kvm_cpu_notifier); 3161 r = register_cpu_notifier(&kvm_cpu_notifier);
3048 if (r) 3162 if (r)
3049 goto out_free_1; 3163 goto out_free_1;
@@ -3075,7 +3189,7 @@ out_free_2:
3075 unregister_reboot_notifier(&kvm_reboot_notifier); 3189 unregister_reboot_notifier(&kvm_reboot_notifier);
3076 unregister_cpu_notifier(&kvm_cpu_notifier); 3190 unregister_cpu_notifier(&kvm_cpu_notifier);
3077out_free_1: 3191out_free_1:
3078 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); 3192 on_each_cpu(hardware_disable, NULL, 0, 1);
3079 kvm_arch_ops->hardware_unsetup(); 3193 kvm_arch_ops->hardware_unsetup();
3080out: 3194out:
3081 kvm_arch_ops = NULL; 3195 kvm_arch_ops = NULL;
@@ -3089,7 +3203,7 @@ void kvm_exit_arch(void)
3089 sysdev_class_unregister(&kvm_sysdev_class); 3203 sysdev_class_unregister(&kvm_sysdev_class);
3090 unregister_reboot_notifier(&kvm_reboot_notifier); 3204 unregister_reboot_notifier(&kvm_reboot_notifier);
3091 unregister_cpu_notifier(&kvm_cpu_notifier); 3205 unregister_cpu_notifier(&kvm_cpu_notifier);
3092 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); 3206 on_each_cpu(hardware_disable, NULL, 0, 1);
3093 kvm_arch_ops->hardware_unsetup(); 3207 kvm_arch_ops->hardware_unsetup();
3094 kvm_arch_ops = NULL; 3208 kvm_arch_ops = NULL;
3095} 3209}
@@ -3103,14 +3217,6 @@ static __init int kvm_init(void)
3103 if (r) 3217 if (r)
3104 goto out4; 3218 goto out4;
3105 3219
3106 r = register_filesystem(&kvm_fs_type);
3107 if (r)
3108 goto out3;
3109
3110 kvmfs_mnt = kern_mount(&kvm_fs_type);
3111 r = PTR_ERR(kvmfs_mnt);
3112 if (IS_ERR(kvmfs_mnt))
3113 goto out2;
3114 kvm_init_debug(); 3220 kvm_init_debug();
3115 3221
3116 kvm_init_msr_list(); 3222 kvm_init_msr_list();
@@ -3127,10 +3233,6 @@ static __init int kvm_init(void)
3127 3233
3128out: 3234out:
3129 kvm_exit_debug(); 3235 kvm_exit_debug();
3130 mntput(kvmfs_mnt);
3131out2:
3132 unregister_filesystem(&kvm_fs_type);
3133out3:
3134 kvm_mmu_module_exit(); 3236 kvm_mmu_module_exit();
3135out4: 3237out4:
3136 return r; 3238 return r;
@@ -3140,8 +3242,6 @@ static __exit void kvm_exit(void)
3140{ 3242{
3141 kvm_exit_debug(); 3243 kvm_exit_debug();
3142 __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); 3244 __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3143 mntput(kvmfs_mnt);
3144 unregister_filesystem(&kvm_fs_type);
3145 kvm_mmu_module_exit(); 3245 kvm_mmu_module_exit();
3146} 3246}
3147 3247
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index e8e228118de9..b297a6b111ac 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -16,15 +16,18 @@
16 * the COPYING file in the top-level directory. 16 * the COPYING file in the top-level directory.
17 * 17 *
18 */ 18 */
19
20#include "vmx.h"
21#include "kvm.h"
22
19#include <linux/types.h> 23#include <linux/types.h>
20#include <linux/string.h> 24#include <linux/string.h>
21#include <asm/page.h>
22#include <linux/mm.h> 25#include <linux/mm.h>
23#include <linux/highmem.h> 26#include <linux/highmem.h>
24#include <linux/module.h> 27#include <linux/module.h>
25 28
26#include "vmx.h" 29#include <asm/page.h>
27#include "kvm.h" 30#include <asm/cmpxchg.h>
28 31
29#undef MMU_DEBUG 32#undef MMU_DEBUG
30 33
@@ -90,25 +93,11 @@ static int dbg = 1;
90#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) 93#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
91 94
92 95
93#define PT32_PTE_COPY_MASK \
94 (PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK)
95
96#define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK)
97
98#define PT_FIRST_AVAIL_BITS_SHIFT 9 96#define PT_FIRST_AVAIL_BITS_SHIFT 9
99#define PT64_SECOND_AVAIL_BITS_SHIFT 52 97#define PT64_SECOND_AVAIL_BITS_SHIFT 52
100 98
101#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
102#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) 99#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
103 100
104#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
105#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)
106
107#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
108#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT))
109
110#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)
111
112#define VALID_PAGE(x) ((x) != INVALID_PAGE) 101#define VALID_PAGE(x) ((x) != INVALID_PAGE)
113 102
114#define PT64_LEVEL_BITS 9 103#define PT64_LEVEL_BITS 9
@@ -165,6 +154,8 @@ struct kvm_rmap_desc {
165 154
166static struct kmem_cache *pte_chain_cache; 155static struct kmem_cache *pte_chain_cache;
167static struct kmem_cache *rmap_desc_cache; 156static struct kmem_cache *rmap_desc_cache;
157static struct kmem_cache *mmu_page_cache;
158static struct kmem_cache *mmu_page_header_cache;
168 159
169static int is_write_protection(struct kvm_vcpu *vcpu) 160static int is_write_protection(struct kvm_vcpu *vcpu)
170{ 161{
@@ -202,6 +193,15 @@ static int is_rmap_pte(u64 pte)
202 == (PT_WRITABLE_MASK | PT_PRESENT_MASK); 193 == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
203} 194}
204 195
196static void set_shadow_pte(u64 *sptep, u64 spte)
197{
198#ifdef CONFIG_X86_64
199 set_64bit((unsigned long *)sptep, spte);
200#else
201 set_64bit((unsigned long long *)sptep, spte);
202#endif
203}
204
205static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 205static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
206 struct kmem_cache *base_cache, int min, 206 struct kmem_cache *base_cache, int min,
207 gfp_t gfp_flags) 207 gfp_t gfp_flags)
@@ -235,6 +235,14 @@ static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags)
235 goto out; 235 goto out;
236 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, 236 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
237 rmap_desc_cache, 1, gfp_flags); 237 rmap_desc_cache, 1, gfp_flags);
238 if (r)
239 goto out;
240 r = mmu_topup_memory_cache(&vcpu->mmu_page_cache,
241 mmu_page_cache, 4, gfp_flags);
242 if (r)
243 goto out;
244 r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
245 mmu_page_header_cache, 4, gfp_flags);
238out: 246out:
239 return r; 247 return r;
240} 248}
@@ -258,6 +266,8 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
258{ 266{
259 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); 267 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
260 mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache); 268 mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
269 mmu_free_memory_cache(&vcpu->mmu_page_cache);
270 mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
261} 271}
262 272
263static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, 273static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -433,19 +443,18 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
433 BUG_ON(!(*spte & PT_WRITABLE_MASK)); 443 BUG_ON(!(*spte & PT_WRITABLE_MASK));
434 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 444 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
435 rmap_remove(vcpu, spte); 445 rmap_remove(vcpu, spte);
436 kvm_arch_ops->tlb_flush(vcpu); 446 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
437 *spte &= ~(u64)PT_WRITABLE_MASK; 447 kvm_flush_remote_tlbs(vcpu->kvm);
438 } 448 }
439} 449}
440 450
441#ifdef MMU_DEBUG 451#ifdef MMU_DEBUG
442static int is_empty_shadow_page(hpa_t page_hpa) 452static int is_empty_shadow_page(u64 *spt)
443{ 453{
444 u64 *pos; 454 u64 *pos;
445 u64 *end; 455 u64 *end;
446 456
447 for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64); 457 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
448 pos != end; pos++)
449 if (*pos != 0) { 458 if (*pos != 0) {
450 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, 459 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
451 pos, *pos); 460 pos, *pos);
@@ -455,13 +464,13 @@ static int is_empty_shadow_page(hpa_t page_hpa)
455} 464}
456#endif 465#endif
457 466
458static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) 467static void kvm_mmu_free_page(struct kvm_vcpu *vcpu,
468 struct kvm_mmu_page *page_head)
459{ 469{
460 struct kvm_mmu_page *page_head = page_header(page_hpa); 470 ASSERT(is_empty_shadow_page(page_head->spt));
461 471 list_del(&page_head->link);
462 ASSERT(is_empty_shadow_page(page_hpa)); 472 mmu_memory_cache_free(&vcpu->mmu_page_cache, page_head->spt);
463 page_head->page_hpa = page_hpa; 473 mmu_memory_cache_free(&vcpu->mmu_page_header_cache, page_head);
464 list_move(&page_head->link, &vcpu->free_pages);
465 ++vcpu->kvm->n_free_mmu_pages; 474 ++vcpu->kvm->n_free_mmu_pages;
466} 475}
467 476
@@ -475,12 +484,15 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
475{ 484{
476 struct kvm_mmu_page *page; 485 struct kvm_mmu_page *page;
477 486
478 if (list_empty(&vcpu->free_pages)) 487 if (!vcpu->kvm->n_free_mmu_pages)
479 return NULL; 488 return NULL;
480 489
481 page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); 490 page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
482 list_move(&page->link, &vcpu->kvm->active_mmu_pages); 491 sizeof *page);
483 ASSERT(is_empty_shadow_page(page->page_hpa)); 492 page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
493 set_page_private(virt_to_page(page->spt), (unsigned long)page);
494 list_add(&page->link, &vcpu->kvm->active_mmu_pages);
495 ASSERT(is_empty_shadow_page(page->spt));
484 page->slot_bitmap = 0; 496 page->slot_bitmap = 0;
485 page->multimapped = 0; 497 page->multimapped = 0;
486 page->parent_pte = parent_pte; 498 page->parent_pte = parent_pte;
@@ -638,7 +650,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
638 u64 *pt; 650 u64 *pt;
639 u64 ent; 651 u64 ent;
640 652
641 pt = __va(page->page_hpa); 653 pt = page->spt;
642 654
643 if (page->role.level == PT_PAGE_TABLE_LEVEL) { 655 if (page->role.level == PT_PAGE_TABLE_LEVEL) {
644 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 656 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
@@ -646,7 +658,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
646 rmap_remove(vcpu, &pt[i]); 658 rmap_remove(vcpu, &pt[i]);
647 pt[i] = 0; 659 pt[i] = 0;
648 } 660 }
649 kvm_arch_ops->tlb_flush(vcpu); 661 kvm_flush_remote_tlbs(vcpu->kvm);
650 return; 662 return;
651 } 663 }
652 664
@@ -659,6 +671,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
659 ent &= PT64_BASE_ADDR_MASK; 671 ent &= PT64_BASE_ADDR_MASK;
660 mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]); 672 mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]);
661 } 673 }
674 kvm_flush_remote_tlbs(vcpu->kvm);
662} 675}
663 676
664static void kvm_mmu_put_page(struct kvm_vcpu *vcpu, 677static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
@@ -685,12 +698,12 @@ static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
685 } 698 }
686 BUG_ON(!parent_pte); 699 BUG_ON(!parent_pte);
687 kvm_mmu_put_page(vcpu, page, parent_pte); 700 kvm_mmu_put_page(vcpu, page, parent_pte);
688 *parent_pte = 0; 701 set_shadow_pte(parent_pte, 0);
689 } 702 }
690 kvm_mmu_page_unlink_children(vcpu, page); 703 kvm_mmu_page_unlink_children(vcpu, page);
691 if (!page->root_count) { 704 if (!page->root_count) {
692 hlist_del(&page->hash_link); 705 hlist_del(&page->hash_link);
693 kvm_mmu_free_page(vcpu, page->page_hpa); 706 kvm_mmu_free_page(vcpu, page);
694 } else 707 } else
695 list_move(&page->link, &vcpu->kvm->active_mmu_pages); 708 list_move(&page->link, &vcpu->kvm->active_mmu_pages);
696} 709}
@@ -717,6 +730,17 @@ static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
717 return r; 730 return r;
718} 731}
719 732
733static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
734{
735 struct kvm_mmu_page *page;
736
737 while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
738 pgprintk("%s: zap %lx %x\n",
739 __FUNCTION__, gfn, page->role.word);
740 kvm_mmu_zap_page(vcpu, page);
741 }
742}
743
720static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) 744static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
721{ 745{
722 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); 746 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
@@ -805,7 +829,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
805 return -ENOMEM; 829 return -ENOMEM;
806 } 830 }
807 831
808 table[index] = new_table->page_hpa | PT_PRESENT_MASK 832 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
809 | PT_WRITABLE_MASK | PT_USER_MASK; 833 | PT_WRITABLE_MASK | PT_USER_MASK;
810 } 834 }
811 table_addr = table[index] & PT64_BASE_ADDR_MASK; 835 table_addr = table[index] & PT64_BASE_ADDR_MASK;
@@ -817,11 +841,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
817 int i; 841 int i;
818 struct kvm_mmu_page *page; 842 struct kvm_mmu_page *page;
819 843
844 if (!VALID_PAGE(vcpu->mmu.root_hpa))
845 return;
820#ifdef CONFIG_X86_64 846#ifdef CONFIG_X86_64
821 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { 847 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
822 hpa_t root = vcpu->mmu.root_hpa; 848 hpa_t root = vcpu->mmu.root_hpa;
823 849
824 ASSERT(VALID_PAGE(root));
825 page = page_header(root); 850 page = page_header(root);
826 --page->root_count; 851 --page->root_count;
827 vcpu->mmu.root_hpa = INVALID_PAGE; 852 vcpu->mmu.root_hpa = INVALID_PAGE;
@@ -832,7 +857,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
832 hpa_t root = vcpu->mmu.pae_root[i]; 857 hpa_t root = vcpu->mmu.pae_root[i];
833 858
834 if (root) { 859 if (root) {
835 ASSERT(VALID_PAGE(root));
836 root &= PT64_BASE_ADDR_MASK; 860 root &= PT64_BASE_ADDR_MASK;
837 page = page_header(root); 861 page = page_header(root);
838 --page->root_count; 862 --page->root_count;
@@ -857,7 +881,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
857 ASSERT(!VALID_PAGE(root)); 881 ASSERT(!VALID_PAGE(root));
858 page = kvm_mmu_get_page(vcpu, root_gfn, 0, 882 page = kvm_mmu_get_page(vcpu, root_gfn, 0,
859 PT64_ROOT_LEVEL, 0, 0, NULL); 883 PT64_ROOT_LEVEL, 0, 0, NULL);
860 root = page->page_hpa; 884 root = __pa(page->spt);
861 ++page->root_count; 885 ++page->root_count;
862 vcpu->mmu.root_hpa = root; 886 vcpu->mmu.root_hpa = root;
863 return; 887 return;
@@ -878,7 +902,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
878 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 902 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
879 PT32_ROOT_LEVEL, !is_paging(vcpu), 903 PT32_ROOT_LEVEL, !is_paging(vcpu),
880 0, NULL); 904 0, NULL);
881 root = page->page_hpa; 905 root = __pa(page->spt);
882 ++page->root_count; 906 ++page->root_count;
883 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; 907 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
884 } 908 }
@@ -928,9 +952,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
928 context->free = nonpaging_free; 952 context->free = nonpaging_free;
929 context->root_level = 0; 953 context->root_level = 0;
930 context->shadow_root_level = PT32E_ROOT_LEVEL; 954 context->shadow_root_level = PT32E_ROOT_LEVEL;
931 mmu_alloc_roots(vcpu); 955 context->root_hpa = INVALID_PAGE;
932 ASSERT(VALID_PAGE(context->root_hpa));
933 kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
934 return 0; 956 return 0;
935} 957}
936 958
@@ -944,59 +966,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
944{ 966{
945 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); 967 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
946 mmu_free_roots(vcpu); 968 mmu_free_roots(vcpu);
947 if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
948 kvm_mmu_free_some_pages(vcpu);
949 mmu_alloc_roots(vcpu);
950 kvm_mmu_flush_tlb(vcpu);
951 kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
952}
953
954static inline void set_pte_common(struct kvm_vcpu *vcpu,
955 u64 *shadow_pte,
956 gpa_t gaddr,
957 int dirty,
958 u64 access_bits,
959 gfn_t gfn)
960{
961 hpa_t paddr;
962
963 *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
964 if (!dirty)
965 access_bits &= ~PT_WRITABLE_MASK;
966
967 paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
968
969 *shadow_pte |= access_bits;
970
971 if (is_error_hpa(paddr)) {
972 *shadow_pte |= gaddr;
973 *shadow_pte |= PT_SHADOW_IO_MARK;
974 *shadow_pte &= ~PT_PRESENT_MASK;
975 return;
976 }
977
978 *shadow_pte |= paddr;
979
980 if (access_bits & PT_WRITABLE_MASK) {
981 struct kvm_mmu_page *shadow;
982
983 shadow = kvm_mmu_lookup_page(vcpu, gfn);
984 if (shadow) {
985 pgprintk("%s: found shadow page for %lx, marking ro\n",
986 __FUNCTION__, gfn);
987 access_bits &= ~PT_WRITABLE_MASK;
988 if (is_writeble_pte(*shadow_pte)) {
989 *shadow_pte &= ~PT_WRITABLE_MASK;
990 kvm_arch_ops->tlb_flush(vcpu);
991 }
992 }
993 }
994
995 if (access_bits & PT_WRITABLE_MASK)
996 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
997
998 page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
999 rmap_add(vcpu, shadow_pte);
1000} 969}
1001 970
1002static void inject_page_fault(struct kvm_vcpu *vcpu, 971static void inject_page_fault(struct kvm_vcpu *vcpu,
@@ -1006,23 +975,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
1006 kvm_arch_ops->inject_page_fault(vcpu, addr, err_code); 975 kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
1007} 976}
1008 977
1009static inline int fix_read_pf(u64 *shadow_ent)
1010{
1011 if ((*shadow_ent & PT_SHADOW_USER_MASK) &&
1012 !(*shadow_ent & PT_USER_MASK)) {
1013 /*
1014 * If supervisor write protect is disabled, we shadow kernel
1015 * pages as user pages so we can trap the write access.
1016 */
1017 *shadow_ent |= PT_USER_MASK;
1018 *shadow_ent &= ~PT_WRITABLE_MASK;
1019
1020 return 1;
1021
1022 }
1023 return 0;
1024}
1025
1026static void paging_free(struct kvm_vcpu *vcpu) 978static void paging_free(struct kvm_vcpu *vcpu)
1027{ 979{
1028 nonpaging_free(vcpu); 980 nonpaging_free(vcpu);
@@ -1047,10 +999,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1047 context->free = paging_free; 999 context->free = paging_free;
1048 context->root_level = level; 1000 context->root_level = level;
1049 context->shadow_root_level = level; 1001 context->shadow_root_level = level;
1050 mmu_alloc_roots(vcpu); 1002 context->root_hpa = INVALID_PAGE;
1051 ASSERT(VALID_PAGE(context->root_hpa));
1052 kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
1053 (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
1054 return 0; 1003 return 0;
1055} 1004}
1056 1005
@@ -1069,10 +1018,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
1069 context->free = paging_free; 1018 context->free = paging_free;
1070 context->root_level = PT32_ROOT_LEVEL; 1019 context->root_level = PT32_ROOT_LEVEL;
1071 context->shadow_root_level = PT32E_ROOT_LEVEL; 1020 context->shadow_root_level = PT32E_ROOT_LEVEL;
1072 mmu_alloc_roots(vcpu); 1021 context->root_hpa = INVALID_PAGE;
1073 ASSERT(VALID_PAGE(context->root_hpa));
1074 kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
1075 (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
1076 return 0; 1022 return 0;
1077} 1023}
1078 1024
@@ -1107,18 +1053,33 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1107 1053
1108int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 1054int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1109{ 1055{
1056 destroy_kvm_mmu(vcpu);
1057 return init_kvm_mmu(vcpu);
1058}
1059
1060int kvm_mmu_load(struct kvm_vcpu *vcpu)
1061{
1110 int r; 1062 int r;
1111 1063
1112 destroy_kvm_mmu(vcpu); 1064 spin_lock(&vcpu->kvm->lock);
1113 r = init_kvm_mmu(vcpu);
1114 if (r < 0)
1115 goto out;
1116 r = mmu_topup_memory_caches(vcpu); 1065 r = mmu_topup_memory_caches(vcpu);
1066 if (r)
1067 goto out;
1068 mmu_alloc_roots(vcpu);
1069 kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
1070 kvm_mmu_flush_tlb(vcpu);
1117out: 1071out:
1072 spin_unlock(&vcpu->kvm->lock);
1118 return r; 1073 return r;
1119} 1074}
1075EXPORT_SYMBOL_GPL(kvm_mmu_load);
1076
1077void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1078{
1079 mmu_free_roots(vcpu);
1080}
1120 1081
1121static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu, 1082static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1122 struct kvm_mmu_page *page, 1083 struct kvm_mmu_page *page,
1123 u64 *spte) 1084 u64 *spte)
1124{ 1085{
@@ -1135,9 +1096,25 @@ static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu,
1135 } 1096 }
1136 } 1097 }
1137 *spte = 0; 1098 *spte = 0;
1099 kvm_flush_remote_tlbs(vcpu->kvm);
1100}
1101
1102static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1103 struct kvm_mmu_page *page,
1104 u64 *spte,
1105 const void *new, int bytes)
1106{
1107 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1108 return;
1109
1110 if (page->role.glevels == PT32_ROOT_LEVEL)
1111 paging32_update_pte(vcpu, page, spte, new, bytes);
1112 else
1113 paging64_update_pte(vcpu, page, spte, new, bytes);
1138} 1114}
1139 1115
1140void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) 1116void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1117 const u8 *old, const u8 *new, int bytes)
1141{ 1118{
1142 gfn_t gfn = gpa >> PAGE_SHIFT; 1119 gfn_t gfn = gpa >> PAGE_SHIFT;
1143 struct kvm_mmu_page *page; 1120 struct kvm_mmu_page *page;
@@ -1149,6 +1126,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
1149 unsigned pte_size; 1126 unsigned pte_size;
1150 unsigned page_offset; 1127 unsigned page_offset;
1151 unsigned misaligned; 1128 unsigned misaligned;
1129 unsigned quadrant;
1152 int level; 1130 int level;
1153 int flooded = 0; 1131 int flooded = 0;
1154 int npte; 1132 int npte;
@@ -1169,6 +1147,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
1169 continue; 1147 continue;
1170 pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; 1148 pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1171 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 1149 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1150 misaligned |= bytes < 4;
1172 if (misaligned || flooded) { 1151 if (misaligned || flooded) {
1173 /* 1152 /*
1174 * Misaligned accesses are too much trouble to fix 1153 * Misaligned accesses are too much trouble to fix
@@ -1200,21 +1179,20 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
1200 page_offset <<= 1; 1179 page_offset <<= 1;
1201 npte = 2; 1180 npte = 2;
1202 } 1181 }
1182 quadrant = page_offset >> PAGE_SHIFT;
1203 page_offset &= ~PAGE_MASK; 1183 page_offset &= ~PAGE_MASK;
1184 if (quadrant != page->role.quadrant)
1185 continue;
1204 } 1186 }
1205 spte = __va(page->page_hpa); 1187 spte = &page->spt[page_offset / sizeof(*spte)];
1206 spte += page_offset / sizeof(*spte);
1207 while (npte--) { 1188 while (npte--) {
1208 mmu_pre_write_zap_pte(vcpu, page, spte); 1189 mmu_pte_write_zap_pte(vcpu, page, spte);
1190 mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
1209 ++spte; 1191 ++spte;
1210 } 1192 }
1211 } 1193 }
1212} 1194}
1213 1195
1214void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
1215{
1216}
1217
1218int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 1196int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1219{ 1197{
1220 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); 1198 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
@@ -1243,13 +1221,6 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu)
1243 struct kvm_mmu_page, link); 1221 struct kvm_mmu_page, link);
1244 kvm_mmu_zap_page(vcpu, page); 1222 kvm_mmu_zap_page(vcpu, page);
1245 } 1223 }
1246 while (!list_empty(&vcpu->free_pages)) {
1247 page = list_entry(vcpu->free_pages.next,
1248 struct kvm_mmu_page, link);
1249 list_del(&page->link);
1250 __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
1251 page->page_hpa = INVALID_PAGE;
1252 }
1253 free_page((unsigned long)vcpu->mmu.pae_root); 1224 free_page((unsigned long)vcpu->mmu.pae_root);
1254} 1225}
1255 1226
@@ -1260,18 +1231,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1260 1231
1261 ASSERT(vcpu); 1232 ASSERT(vcpu);
1262 1233
1263 for (i = 0; i < KVM_NUM_MMU_PAGES; i++) { 1234 vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
1264 struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];
1265
1266 INIT_LIST_HEAD(&page_header->link);
1267 if ((page = alloc_page(GFP_KERNEL)) == NULL)
1268 goto error_1;
1269 set_page_private(page, (unsigned long)page_header);
1270 page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
1271 memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
1272 list_add(&page_header->link, &vcpu->free_pages);
1273 ++vcpu->kvm->n_free_mmu_pages;
1274 }
1275 1235
1276 /* 1236 /*
1277 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. 1237 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
@@ -1296,7 +1256,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
1296{ 1256{
1297 ASSERT(vcpu); 1257 ASSERT(vcpu);
1298 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); 1258 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1299 ASSERT(list_empty(&vcpu->free_pages));
1300 1259
1301 return alloc_mmu_pages(vcpu); 1260 return alloc_mmu_pages(vcpu);
1302} 1261}
@@ -1305,7 +1264,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1305{ 1264{
1306 ASSERT(vcpu); 1265 ASSERT(vcpu);
1307 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); 1266 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1308 ASSERT(!list_empty(&vcpu->free_pages));
1309 1267
1310 return init_kvm_mmu(vcpu); 1268 return init_kvm_mmu(vcpu);
1311} 1269}
@@ -1331,7 +1289,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot)
1331 if (!test_bit(slot, &page->slot_bitmap)) 1289 if (!test_bit(slot, &page->slot_bitmap))
1332 continue; 1290 continue;
1333 1291
1334 pt = __va(page->page_hpa); 1292 pt = page->spt;
1335 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 1293 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1336 /* avoid RMW */ 1294 /* avoid RMW */
1337 if (pt[i] & PT_WRITABLE_MASK) { 1295 if (pt[i] & PT_WRITABLE_MASK) {
@@ -1354,7 +1312,7 @@ void kvm_mmu_zap_all(struct kvm_vcpu *vcpu)
1354 } 1312 }
1355 1313
1356 mmu_free_memory_caches(vcpu); 1314 mmu_free_memory_caches(vcpu);
1357 kvm_arch_ops->tlb_flush(vcpu); 1315 kvm_flush_remote_tlbs(vcpu->kvm);
1358 init_kvm_mmu(vcpu); 1316 init_kvm_mmu(vcpu);
1359} 1317}
1360 1318
@@ -1364,6 +1322,10 @@ void kvm_mmu_module_exit(void)
1364 kmem_cache_destroy(pte_chain_cache); 1322 kmem_cache_destroy(pte_chain_cache);
1365 if (rmap_desc_cache) 1323 if (rmap_desc_cache)
1366 kmem_cache_destroy(rmap_desc_cache); 1324 kmem_cache_destroy(rmap_desc_cache);
1325 if (mmu_page_cache)
1326 kmem_cache_destroy(mmu_page_cache);
1327 if (mmu_page_header_cache)
1328 kmem_cache_destroy(mmu_page_header_cache);
1367} 1329}
1368 1330
1369int kvm_mmu_module_init(void) 1331int kvm_mmu_module_init(void)
@@ -1379,6 +1341,18 @@ int kvm_mmu_module_init(void)
1379 if (!rmap_desc_cache) 1341 if (!rmap_desc_cache)
1380 goto nomem; 1342 goto nomem;
1381 1343
1344 mmu_page_cache = kmem_cache_create("kvm_mmu_page",
1345 PAGE_SIZE,
1346 PAGE_SIZE, 0, NULL, NULL);
1347 if (!mmu_page_cache)
1348 goto nomem;
1349
1350 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1351 sizeof(struct kvm_mmu_page),
1352 0, 0, NULL, NULL);
1353 if (!mmu_page_header_cache)
1354 goto nomem;
1355
1382 return 0; 1356 return 0;
1383 1357
1384nomem: 1358nomem:
@@ -1482,7 +1456,7 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu)
1482 int i; 1456 int i;
1483 1457
1484 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { 1458 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1485 u64 *pt = __va(page->page_hpa); 1459 u64 *pt = page->spt;
1486 1460
1487 if (page->role.level != PT_PAGE_TABLE_LEVEL) 1461 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1488 continue; 1462 continue;
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
index 73ffbffb1097..a7c5cb0319ea 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/drivers/kvm/paging_tmpl.h
@@ -31,7 +31,6 @@
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) 33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
35 #ifdef CONFIG_X86_64 34 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4 35 #define PT_MAX_FULL_LEVELS 4
37 #else 36 #else
@@ -46,7 +45,6 @@
46 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 45 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
47 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 46 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
48 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) 47 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
49 #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
50 #define PT_MAX_FULL_LEVELS 2 48 #define PT_MAX_FULL_LEVELS 2
51#else 49#else
52 #error Invalid PTTYPE value 50 #error Invalid PTTYPE value
@@ -192,40 +190,143 @@ static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
192 mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]); 190 mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
193} 191}
194 192
195static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, 193static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
196 u64 *shadow_pte, u64 access_bits, gfn_t gfn) 194 u64 *shadow_pte,
195 gpa_t gaddr,
196 pt_element_t *gpte,
197 u64 access_bits,
198 int user_fault,
199 int write_fault,
200 int *ptwrite,
201 struct guest_walker *walker,
202 gfn_t gfn)
197{ 203{
198 ASSERT(*shadow_pte == 0); 204 hpa_t paddr;
199 access_bits &= guest_pte; 205 int dirty = *gpte & PT_DIRTY_MASK;
200 *shadow_pte = (guest_pte & PT_PTE_COPY_MASK); 206 u64 spte = *shadow_pte;
201 set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, 207 int was_rmapped = is_rmap_pte(spte);
202 guest_pte & PT_DIRTY_MASK, access_bits, gfn); 208
209 pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
210 " user_fault %d gfn %lx\n",
211 __FUNCTION__, spte, (u64)*gpte, access_bits,
212 write_fault, user_fault, gfn);
213
214 if (write_fault && !dirty) {
215 *gpte |= PT_DIRTY_MASK;
216 dirty = 1;
217 FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
218 }
219
220 spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
221 spte |= *gpte & PT64_NX_MASK;
222 if (!dirty)
223 access_bits &= ~PT_WRITABLE_MASK;
224
225 paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
226
227 spte |= PT_PRESENT_MASK;
228 if (access_bits & PT_USER_MASK)
229 spte |= PT_USER_MASK;
230
231 if (is_error_hpa(paddr)) {
232 spte |= gaddr;
233 spte |= PT_SHADOW_IO_MARK;
234 spte &= ~PT_PRESENT_MASK;
235 set_shadow_pte(shadow_pte, spte);
236 return;
237 }
238
239 spte |= paddr;
240
241 if ((access_bits & PT_WRITABLE_MASK)
242 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
243 struct kvm_mmu_page *shadow;
244
245 spte |= PT_WRITABLE_MASK;
246 if (user_fault) {
247 mmu_unshadow(vcpu, gfn);
248 goto unshadowed;
249 }
250
251 shadow = kvm_mmu_lookup_page(vcpu, gfn);
252 if (shadow) {
253 pgprintk("%s: found shadow page for %lx, marking ro\n",
254 __FUNCTION__, gfn);
255 access_bits &= ~PT_WRITABLE_MASK;
256 if (is_writeble_pte(spte)) {
257 spte &= ~PT_WRITABLE_MASK;
258 kvm_arch_ops->tlb_flush(vcpu);
259 }
260 if (write_fault)
261 *ptwrite = 1;
262 }
263 }
264
265unshadowed:
266
267 if (access_bits & PT_WRITABLE_MASK)
268 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
269
270 set_shadow_pte(shadow_pte, spte);
271 page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
272 if (!was_rmapped)
273 rmap_add(vcpu, shadow_pte);
203} 274}
204 275
205static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, 276static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte,
206 u64 *shadow_pte, u64 access_bits, gfn_t gfn) 277 u64 *shadow_pte, u64 access_bits,
278 int user_fault, int write_fault, int *ptwrite,
279 struct guest_walker *walker, gfn_t gfn)
280{
281 access_bits &= *gpte;
282 FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK,
283 gpte, access_bits, user_fault, write_fault,
284 ptwrite, walker, gfn);
285}
286
287static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
288 u64 *spte, const void *pte, int bytes)
289{
290 pt_element_t gpte;
291
292 if (bytes < sizeof(pt_element_t))
293 return;
294 gpte = *(const pt_element_t *)pte;
295 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
296 return;
297 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
298 FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
299 0, NULL, NULL,
300 (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
301}
302
303static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde,
304 u64 *shadow_pte, u64 access_bits,
305 int user_fault, int write_fault, int *ptwrite,
306 struct guest_walker *walker, gfn_t gfn)
207{ 307{
208 gpa_t gaddr; 308 gpa_t gaddr;
209 309
210 ASSERT(*shadow_pte == 0); 310 access_bits &= *gpde;
211 access_bits &= guest_pde;
212 gaddr = (gpa_t)gfn << PAGE_SHIFT; 311 gaddr = (gpa_t)gfn << PAGE_SHIFT;
213 if (PTTYPE == 32 && is_cpuid_PSE36()) 312 if (PTTYPE == 32 && is_cpuid_PSE36())
214 gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) << 313 gaddr |= (*gpde & PT32_DIR_PSE36_MASK) <<
215 (32 - PT32_DIR_PSE36_SHIFT); 314 (32 - PT32_DIR_PSE36_SHIFT);
216 *shadow_pte = guest_pde & PT_PTE_COPY_MASK; 315 FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
217 set_pte_common(vcpu, shadow_pte, gaddr, 316 gpde, access_bits, user_fault, write_fault,
218 guest_pde & PT_DIRTY_MASK, access_bits, gfn); 317 ptwrite, walker, gfn);
219} 318}
220 319
221/* 320/*
222 * Fetch a shadow pte for a specific level in the paging hierarchy. 321 * Fetch a shadow pte for a specific level in the paging hierarchy.
223 */ 322 */
224static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 323static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
225 struct guest_walker *walker) 324 struct guest_walker *walker,
325 int user_fault, int write_fault, int *ptwrite)
226{ 326{
227 hpa_t shadow_addr; 327 hpa_t shadow_addr;
228 int level; 328 int level;
329 u64 *shadow_ent;
229 u64 *prev_shadow_ent = NULL; 330 u64 *prev_shadow_ent = NULL;
230 pt_element_t *guest_ent = walker->ptep; 331 pt_element_t *guest_ent = walker->ptep;
231 332
@@ -242,37 +343,23 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
242 343
243 for (; ; level--) { 344 for (; ; level--) {
244 u32 index = SHADOW_PT_INDEX(addr, level); 345 u32 index = SHADOW_PT_INDEX(addr, level);
245 u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
246 struct kvm_mmu_page *shadow_page; 346 struct kvm_mmu_page *shadow_page;
247 u64 shadow_pte; 347 u64 shadow_pte;
248 int metaphysical; 348 int metaphysical;
249 gfn_t table_gfn; 349 gfn_t table_gfn;
250 unsigned hugepage_access = 0; 350 unsigned hugepage_access = 0;
251 351
352 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
252 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { 353 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
253 if (level == PT_PAGE_TABLE_LEVEL) 354 if (level == PT_PAGE_TABLE_LEVEL)
254 return shadow_ent; 355 break;
255 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; 356 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
256 prev_shadow_ent = shadow_ent; 357 prev_shadow_ent = shadow_ent;
257 continue; 358 continue;
258 } 359 }
259 360
260 if (level == PT_PAGE_TABLE_LEVEL) { 361 if (level == PT_PAGE_TABLE_LEVEL)
261 362 break;
262 if (walker->level == PT_DIRECTORY_LEVEL) {
263 if (prev_shadow_ent)
264 *prev_shadow_ent |= PT_SHADOW_PS_MARK;
265 FNAME(set_pde)(vcpu, *guest_ent, shadow_ent,
266 walker->inherited_ar,
267 walker->gfn);
268 } else {
269 ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
270 FNAME(set_pte)(vcpu, *guest_ent, shadow_ent,
271 walker->inherited_ar,
272 walker->gfn);
273 }
274 return shadow_ent;
275 }
276 363
277 if (level - 1 == PT_PAGE_TABLE_LEVEL 364 if (level - 1 == PT_PAGE_TABLE_LEVEL
278 && walker->level == PT_DIRECTORY_LEVEL) { 365 && walker->level == PT_DIRECTORY_LEVEL) {
@@ -289,90 +376,24 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
289 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, 376 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
290 metaphysical, hugepage_access, 377 metaphysical, hugepage_access,
291 shadow_ent); 378 shadow_ent);
292 shadow_addr = shadow_page->page_hpa; 379 shadow_addr = __pa(shadow_page->spt);
293 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK 380 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
294 | PT_WRITABLE_MASK | PT_USER_MASK; 381 | PT_WRITABLE_MASK | PT_USER_MASK;
295 *shadow_ent = shadow_pte; 382 *shadow_ent = shadow_pte;
296 prev_shadow_ent = shadow_ent; 383 prev_shadow_ent = shadow_ent;
297 } 384 }
298}
299 385
300/* 386 if (walker->level == PT_DIRECTORY_LEVEL) {
301 * The guest faulted for write. We need to 387 FNAME(set_pde)(vcpu, guest_ent, shadow_ent,
302 * 388 walker->inherited_ar, user_fault, write_fault,
303 * - check write permissions 389 ptwrite, walker, walker->gfn);
304 * - update the guest pte dirty bit 390 } else {
305 * - update our own dirty page tracking structures 391 ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
306 */ 392 FNAME(set_pte)(vcpu, guest_ent, shadow_ent,
307static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, 393 walker->inherited_ar, user_fault, write_fault,
308 u64 *shadow_ent, 394 ptwrite, walker, walker->gfn);
309 struct guest_walker *walker,
310 gva_t addr,
311 int user,
312 int *write_pt)
313{
314 pt_element_t *guest_ent;
315 int writable_shadow;
316 gfn_t gfn;
317 struct kvm_mmu_page *page;
318
319 if (is_writeble_pte(*shadow_ent))
320 return !user || (*shadow_ent & PT_USER_MASK);
321
322 writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK;
323 if (user) {
324 /*
325 * User mode access. Fail if it's a kernel page or a read-only
326 * page.
327 */
328 if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow)
329 return 0;
330 ASSERT(*shadow_ent & PT_USER_MASK);
331 } else
332 /*
333 * Kernel mode access. Fail if it's a read-only page and
334 * supervisor write protection is enabled.
335 */
336 if (!writable_shadow) {
337 if (is_write_protection(vcpu))
338 return 0;
339 *shadow_ent &= ~PT_USER_MASK;
340 }
341
342 guest_ent = walker->ptep;
343
344 if (!is_present_pte(*guest_ent)) {
345 *shadow_ent = 0;
346 return 0;
347 } 395 }
348 396 return shadow_ent;
349 gfn = walker->gfn;
350
351 if (user) {
352 /*
353 * Usermode page faults won't be for page table updates.
354 */
355 while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
356 pgprintk("%s: zap %lx %x\n",
357 __FUNCTION__, gfn, page->role.word);
358 kvm_mmu_zap_page(vcpu, page);
359 }
360 } else if (kvm_mmu_lookup_page(vcpu, gfn)) {
361 pgprintk("%s: found shadow page for %lx, marking ro\n",
362 __FUNCTION__, gfn);
363 mark_page_dirty(vcpu->kvm, gfn);
364 FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
365 *guest_ent |= PT_DIRTY_MASK;
366 *write_pt = 1;
367 return 0;
368 }
369 mark_page_dirty(vcpu->kvm, gfn);
370 *shadow_ent |= PT_WRITABLE_MASK;
371 FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
372 *guest_ent |= PT_DIRTY_MASK;
373 rmap_add(vcpu, shadow_ent);
374
375 return 1;
376} 397}
377 398
378/* 399/*
@@ -397,7 +418,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
397 int fetch_fault = error_code & PFERR_FETCH_MASK; 418 int fetch_fault = error_code & PFERR_FETCH_MASK;
398 struct guest_walker walker; 419 struct guest_walker walker;
399 u64 *shadow_pte; 420 u64 *shadow_pte;
400 int fixed;
401 int write_pt = 0; 421 int write_pt = 0;
402 int r; 422 int r;
403 423
@@ -421,27 +441,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
421 pgprintk("%s: guest page fault\n", __FUNCTION__); 441 pgprintk("%s: guest page fault\n", __FUNCTION__);
422 inject_page_fault(vcpu, addr, walker.error_code); 442 inject_page_fault(vcpu, addr, walker.error_code);
423 FNAME(release_walker)(&walker); 443 FNAME(release_walker)(&walker);
444 vcpu->last_pt_write_count = 0; /* reset fork detector */
424 return 0; 445 return 0;
425 } 446 }
426 447
427 shadow_pte = FNAME(fetch)(vcpu, addr, &walker); 448 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
428 pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__, 449 &write_pt);
429 shadow_pte, *shadow_pte); 450 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
430 451 shadow_pte, *shadow_pte, write_pt);
431 /*
432 * Update the shadow pte.
433 */
434 if (write_fault)
435 fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr,
436 user_fault, &write_pt);
437 else
438 fixed = fix_read_pf(shadow_pte);
439
440 pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__,
441 shadow_pte, *shadow_pte);
442 452
443 FNAME(release_walker)(&walker); 453 FNAME(release_walker)(&walker);
444 454
455 if (!write_pt)
456 vcpu->last_pt_write_count = 0; /* reset fork detector */
457
445 /* 458 /*
446 * mmio: emulate if accessible, otherwise its a guest fault. 459 * mmio: emulate if accessible, otherwise its a guest fault.
447 */ 460 */
@@ -478,7 +491,5 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
478#undef PT_INDEX 491#undef PT_INDEX
479#undef SHADOW_PT_INDEX 492#undef SHADOW_PT_INDEX
480#undef PT_LEVEL_MASK 493#undef PT_LEVEL_MASK
481#undef PT_PTE_COPY_MASK
482#undef PT_NON_PTE_COPY_MASK
483#undef PT_DIR_BASE_ADDR_MASK 494#undef PT_DIR_BASE_ADDR_MASK
484#undef PT_MAX_FULL_LEVELS 495#undef PT_MAX_FULL_LEVELS
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index fa17d6d4f0cb..bc818cc126e3 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -14,16 +14,17 @@
14 * 14 *
15 */ 15 */
16 16
17#include "kvm_svm.h"
18#include "x86_emulate.h"
19
17#include <linux/module.h> 20#include <linux/module.h>
18#include <linux/kernel.h> 21#include <linux/kernel.h>
19#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
20#include <linux/highmem.h> 23#include <linux/highmem.h>
21#include <linux/profile.h> 24#include <linux/profile.h>
22#include <linux/sched.h> 25#include <linux/sched.h>
23#include <asm/desc.h>
24 26
25#include "kvm_svm.h" 27#include <asm/desc.h>
26#include "x86_emulate.h"
27 28
28MODULE_AUTHOR("Qumranet"); 29MODULE_AUTHOR("Qumranet");
29MODULE_LICENSE("GPL"); 30MODULE_LICENSE("GPL");
@@ -378,7 +379,7 @@ static __init int svm_hardware_setup(void)
378 int cpu; 379 int cpu;
379 struct page *iopm_pages; 380 struct page *iopm_pages;
380 struct page *msrpm_pages; 381 struct page *msrpm_pages;
381 void *msrpm_va; 382 void *iopm_va, *msrpm_va;
382 int r; 383 int r;
383 384
384 kvm_emulator_want_group7_invlpg(); 385 kvm_emulator_want_group7_invlpg();
@@ -387,8 +388,10 @@ static __init int svm_hardware_setup(void)
387 388
388 if (!iopm_pages) 389 if (!iopm_pages)
389 return -ENOMEM; 390 return -ENOMEM;
390 memset(page_address(iopm_pages), 0xff, 391
391 PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); 392 iopm_va = page_address(iopm_pages);
393 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
394 clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
392 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 395 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
393 396
394 397
@@ -579,7 +582,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
579 goto out2; 582 goto out2;
580 583
581 vcpu->svm->vmcb = page_address(page); 584 vcpu->svm->vmcb = page_address(page);
582 memset(vcpu->svm->vmcb, 0, PAGE_SIZE); 585 clear_page(vcpu->svm->vmcb);
583 vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 586 vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
584 vcpu->svm->asid_generation = 0; 587 vcpu->svm->asid_generation = 0;
585 memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs)); 588 memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs));
@@ -587,9 +590,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
587 590
588 fx_init(vcpu); 591 fx_init(vcpu);
589 vcpu->fpu_active = 1; 592 vcpu->fpu_active = 1;
590 vcpu->apic_base = 0xfee00000 | 593 vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
591 /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | 594 if (vcpu == &vcpu->kvm->vcpus[0])
592 MSR_IA32_APICBASE_ENABLE; 595 vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
593 596
594 return 0; 597 return 0;
595 598
@@ -955,7 +958,7 @@ static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
955 * VMCB is undefined after a SHUTDOWN intercept 958 * VMCB is undefined after a SHUTDOWN intercept
956 * so reinitialize it. 959 * so reinitialize it.
957 */ 960 */
958 memset(vcpu->svm->vmcb, 0, PAGE_SIZE); 961 clear_page(vcpu->svm->vmcb);
959 init_vmcb(vcpu->svm->vmcb); 962 init_vmcb(vcpu->svm->vmcb);
960 963
961 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 964 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
@@ -1113,12 +1116,7 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1113{ 1116{
1114 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; 1117 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1;
1115 skip_emulated_instruction(vcpu); 1118 skip_emulated_instruction(vcpu);
1116 if (vcpu->irq_summary) 1119 return kvm_emulate_halt(vcpu);
1117 return 1;
1118
1119 kvm_run->exit_reason = KVM_EXIT_HLT;
1120 ++vcpu->stat.halt_exits;
1121 return 0;
1122} 1120}
1123 1121
1124static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1122static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1473,6 +1471,11 @@ static void load_db_regs(unsigned long *db_regs)
1473 asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3])); 1471 asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
1474} 1472}
1475 1473
1474static void svm_flush_tlb(struct kvm_vcpu *vcpu)
1475{
1476 force_new_asid(vcpu);
1477}
1478
1476static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1479static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1477{ 1480{
1478 u16 fs_selector; 1481 u16 fs_selector;
@@ -1481,11 +1484,20 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1481 int r; 1484 int r;
1482 1485
1483again: 1486again:
1487 r = kvm_mmu_reload(vcpu);
1488 if (unlikely(r))
1489 return r;
1490
1484 if (!vcpu->mmio_read_completed) 1491 if (!vcpu->mmio_read_completed)
1485 do_interrupt_requests(vcpu, kvm_run); 1492 do_interrupt_requests(vcpu, kvm_run);
1486 1493
1487 clgi(); 1494 clgi();
1488 1495
1496 vcpu->guest_mode = 1;
1497 if (vcpu->requests)
1498 if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
1499 svm_flush_tlb(vcpu);
1500
1489 pre_svm_run(vcpu); 1501 pre_svm_run(vcpu);
1490 1502
1491 save_host_msrs(vcpu); 1503 save_host_msrs(vcpu);
@@ -1617,6 +1629,8 @@ again:
1617#endif 1629#endif
1618 : "cc", "memory" ); 1630 : "cc", "memory" );
1619 1631
1632 vcpu->guest_mode = 0;
1633
1620 if (vcpu->fpu_active) { 1634 if (vcpu->fpu_active) {
1621 fx_save(vcpu->guest_fx_image); 1635 fx_save(vcpu->guest_fx_image);
1622 fx_restore(vcpu->host_fx_image); 1636 fx_restore(vcpu->host_fx_image);
@@ -1681,11 +1695,6 @@ again:
1681 return r; 1695 return r;
1682} 1696}
1683 1697
1684static void svm_flush_tlb(struct kvm_vcpu *vcpu)
1685{
1686 force_new_asid(vcpu);
1687}
1688
1689static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) 1698static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1690{ 1699{
1691 vcpu->svm->vmcb->save.cr3 = root; 1700 vcpu->svm->vmcb->save.cr3 = root;
@@ -1727,6 +1736,12 @@ static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
1727 1736
1728static int is_disabled(void) 1737static int is_disabled(void)
1729{ 1738{
1739 u64 vm_cr;
1740
1741 rdmsrl(MSR_VM_CR, vm_cr);
1742 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
1743 return 1;
1744
1730 return 0; 1745 return 0;
1731} 1746}
1732 1747
diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h
index 5e93814400ce..3b1b0f35b6cb 100644
--- a/drivers/kvm/svm.h
+++ b/drivers/kvm/svm.h
@@ -175,8 +175,11 @@ struct __attribute__ ((__packed__)) vmcb {
175#define SVM_CPUID_FUNC 0x8000000a 175#define SVM_CPUID_FUNC 0x8000000a
176 176
177#define MSR_EFER_SVME_MASK (1ULL << 12) 177#define MSR_EFER_SVME_MASK (1ULL << 12)
178#define MSR_VM_CR 0xc0010114
178#define MSR_VM_HSAVE_PA 0xc0010117ULL 179#define MSR_VM_HSAVE_PA 0xc0010117ULL
179 180
181#define SVM_VM_CR_SVM_DISABLE 4
182
180#define SVM_SELECTOR_S_SHIFT 4 183#define SVM_SELECTOR_S_SHIFT 4
181#define SVM_SELECTOR_DPL_SHIFT 5 184#define SVM_SELECTOR_DPL_SHIFT 5
182#define SVM_SELECTOR_P_SHIFT 7 185#define SVM_SELECTOR_P_SHIFT 7
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index c1ac106ace8c..80628f69916d 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -17,28 +17,35 @@
17 17
18#include "kvm.h" 18#include "kvm.h"
19#include "vmx.h" 19#include "vmx.h"
20#include "segment_descriptor.h"
21
20#include <linux/module.h> 22#include <linux/module.h>
21#include <linux/kernel.h> 23#include <linux/kernel.h>
22#include <linux/mm.h> 24#include <linux/mm.h>
23#include <linux/highmem.h> 25#include <linux/highmem.h>
24#include <linux/profile.h> 26#include <linux/profile.h>
25#include <linux/sched.h> 27#include <linux/sched.h>
28
26#include <asm/io.h> 29#include <asm/io.h>
27#include <asm/desc.h> 30#include <asm/desc.h>
28 31
29#include "segment_descriptor.h"
30
31MODULE_AUTHOR("Qumranet"); 32MODULE_AUTHOR("Qumranet");
32MODULE_LICENSE("GPL"); 33MODULE_LICENSE("GPL");
33 34
35static int init_rmode_tss(struct kvm *kvm);
36
34static DEFINE_PER_CPU(struct vmcs *, vmxarea); 37static DEFINE_PER_CPU(struct vmcs *, vmxarea);
35static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 38static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
36 39
40static struct page *vmx_io_bitmap_a;
41static struct page *vmx_io_bitmap_b;
42
37#ifdef CONFIG_X86_64 43#ifdef CONFIG_X86_64
38#define HOST_IS_64 1 44#define HOST_IS_64 1
39#else 45#else
40#define HOST_IS_64 0 46#define HOST_IS_64 0
41#endif 47#endif
48#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
42 49
43static struct vmcs_descriptor { 50static struct vmcs_descriptor {
44 int size; 51 int size;
@@ -82,18 +89,17 @@ static const u32 vmx_msr_index[] = {
82}; 89};
83#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 90#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
84 91
85#ifdef CONFIG_X86_64 92static inline u64 msr_efer_save_restore_bits(struct vmx_msr_entry msr)
86static unsigned msr_offset_kernel_gs_base; 93{
87#define NR_64BIT_MSRS 4 94 return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
88/* 95}
89 * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt 96
90 * mechanism (cpu bug AA24) 97static inline int msr_efer_need_save_restore(struct kvm_vcpu *vcpu)
91 */ 98{
92#define NR_BAD_MSRS 2 99 int efer_offset = vcpu->msr_offset_efer;
93#else 100 return msr_efer_save_restore_bits(vcpu->host_msrs[efer_offset]) !=
94#define NR_64BIT_MSRS 0 101 msr_efer_save_restore_bits(vcpu->guest_msrs[efer_offset]);
95#define NR_BAD_MSRS 0 102}
96#endif
97 103
98static inline int is_page_fault(u32 intr_info) 104static inline int is_page_fault(u32 intr_info)
99{ 105{
@@ -115,13 +121,23 @@ static inline int is_external_interrupt(u32 intr_info)
115 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 121 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
116} 122}
117 123
118static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) 124static int __find_msr_index(struct kvm_vcpu *vcpu, u32 msr)
119{ 125{
120 int i; 126 int i;
121 127
122 for (i = 0; i < vcpu->nmsrs; ++i) 128 for (i = 0; i < vcpu->nmsrs; ++i)
123 if (vcpu->guest_msrs[i].index == msr) 129 if (vcpu->guest_msrs[i].index == msr)
124 return &vcpu->guest_msrs[i]; 130 return i;
131 return -1;
132}
133
134static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr)
135{
136 int i;
137
138 i = __find_msr_index(vcpu, msr);
139 if (i >= 0)
140 return &vcpu->guest_msrs[i];
125 return NULL; 141 return NULL;
126} 142}
127 143
@@ -147,6 +163,7 @@ static void __vcpu_clear(void *arg)
147 vmcs_clear(vcpu->vmcs); 163 vmcs_clear(vcpu->vmcs);
148 if (per_cpu(current_vmcs, cpu) == vcpu->vmcs) 164 if (per_cpu(current_vmcs, cpu) == vcpu->vmcs)
149 per_cpu(current_vmcs, cpu) = NULL; 165 per_cpu(current_vmcs, cpu) = NULL;
166 rdtscll(vcpu->host_tsc);
150} 167}
151 168
152static void vcpu_clear(struct kvm_vcpu *vcpu) 169static void vcpu_clear(struct kvm_vcpu *vcpu)
@@ -234,6 +251,127 @@ static void vmcs_set_bits(unsigned long field, u32 mask)
234 vmcs_writel(field, vmcs_readl(field) | mask); 251 vmcs_writel(field, vmcs_readl(field) | mask);
235} 252}
236 253
254static void update_exception_bitmap(struct kvm_vcpu *vcpu)
255{
256 u32 eb;
257
258 eb = 1u << PF_VECTOR;
259 if (!vcpu->fpu_active)
260 eb |= 1u << NM_VECTOR;
261 if (vcpu->guest_debug.enabled)
262 eb |= 1u << 1;
263 if (vcpu->rmode.active)
264 eb = ~0;
265 vmcs_write32(EXCEPTION_BITMAP, eb);
266}
267
268static void reload_tss(void)
269{
270#ifndef CONFIG_X86_64
271
272 /*
273 * VT restores TR but not its size. Useless.
274 */
275 struct descriptor_table gdt;
276 struct segment_descriptor *descs;
277
278 get_gdt(&gdt);
279 descs = (void *)gdt.base;
280 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
281 load_TR_desc();
282#endif
283}
284
285static void load_transition_efer(struct kvm_vcpu *vcpu)
286{
287 u64 trans_efer;
288 int efer_offset = vcpu->msr_offset_efer;
289
290 trans_efer = vcpu->host_msrs[efer_offset].data;
291 trans_efer &= ~EFER_SAVE_RESTORE_BITS;
292 trans_efer |= msr_efer_save_restore_bits(
293 vcpu->guest_msrs[efer_offset]);
294 wrmsrl(MSR_EFER, trans_efer);
295 vcpu->stat.efer_reload++;
296}
297
298static void vmx_save_host_state(struct kvm_vcpu *vcpu)
299{
300 struct vmx_host_state *hs = &vcpu->vmx_host_state;
301
302 if (hs->loaded)
303 return;
304
305 hs->loaded = 1;
306 /*
307 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
308 * allow segment selectors with cpl > 0 or ti == 1.
309 */
310 hs->ldt_sel = read_ldt();
311 hs->fs_gs_ldt_reload_needed = hs->ldt_sel;
312 hs->fs_sel = read_fs();
313 if (!(hs->fs_sel & 7))
314 vmcs_write16(HOST_FS_SELECTOR, hs->fs_sel);
315 else {
316 vmcs_write16(HOST_FS_SELECTOR, 0);
317 hs->fs_gs_ldt_reload_needed = 1;
318 }
319 hs->gs_sel = read_gs();
320 if (!(hs->gs_sel & 7))
321 vmcs_write16(HOST_GS_SELECTOR, hs->gs_sel);
322 else {
323 vmcs_write16(HOST_GS_SELECTOR, 0);
324 hs->fs_gs_ldt_reload_needed = 1;
325 }
326
327#ifdef CONFIG_X86_64
328 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
329 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
330#else
331 vmcs_writel(HOST_FS_BASE, segment_base(hs->fs_sel));
332 vmcs_writel(HOST_GS_BASE, segment_base(hs->gs_sel));
333#endif
334
335#ifdef CONFIG_X86_64
336 if (is_long_mode(vcpu)) {
337 save_msrs(vcpu->host_msrs + vcpu->msr_offset_kernel_gs_base, 1);
338 }
339#endif
340 load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs);
341 if (msr_efer_need_save_restore(vcpu))
342 load_transition_efer(vcpu);
343}
344
345static void vmx_load_host_state(struct kvm_vcpu *vcpu)
346{
347 struct vmx_host_state *hs = &vcpu->vmx_host_state;
348
349 if (!hs->loaded)
350 return;
351
352 hs->loaded = 0;
353 if (hs->fs_gs_ldt_reload_needed) {
354 load_ldt(hs->ldt_sel);
355 load_fs(hs->fs_sel);
356 /*
357 * If we have to reload gs, we must take care to
358 * preserve our gs base.
359 */
360 local_irq_disable();
361 load_gs(hs->gs_sel);
362#ifdef CONFIG_X86_64
363 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
364#endif
365 local_irq_enable();
366
367 reload_tss();
368 }
369 save_msrs(vcpu->guest_msrs, vcpu->save_nmsrs);
370 load_msrs(vcpu->host_msrs, vcpu->save_nmsrs);
371 if (msr_efer_need_save_restore(vcpu))
372 load_msrs(vcpu->host_msrs + vcpu->msr_offset_efer, 1);
373}
374
237/* 375/*
238 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 376 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
239 * vcpu mutex is already taken. 377 * vcpu mutex is already taken.
@@ -242,6 +380,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
242{ 380{
243 u64 phys_addr = __pa(vcpu->vmcs); 381 u64 phys_addr = __pa(vcpu->vmcs);
244 int cpu; 382 int cpu;
383 u64 tsc_this, delta;
245 384
246 cpu = get_cpu(); 385 cpu = get_cpu();
247 386
@@ -275,15 +414,43 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
275 414
276 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 415 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
277 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 416 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
417
418 /*
419 * Make sure the time stamp counter is monotonous.
420 */
421 rdtscll(tsc_this);
422 delta = vcpu->host_tsc - tsc_this;
423 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
278 } 424 }
279} 425}
280 426
281static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 427static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
282{ 428{
429 vmx_load_host_state(vcpu);
283 kvm_put_guest_fpu(vcpu); 430 kvm_put_guest_fpu(vcpu);
284 put_cpu(); 431 put_cpu();
285} 432}
286 433
434static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
435{
436 if (vcpu->fpu_active)
437 return;
438 vcpu->fpu_active = 1;
439 vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK);
440 if (vcpu->cr0 & CR0_TS_MASK)
441 vmcs_set_bits(GUEST_CR0, CR0_TS_MASK);
442 update_exception_bitmap(vcpu);
443}
444
445static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
446{
447 if (!vcpu->fpu_active)
448 return;
449 vcpu->fpu_active = 0;
450 vmcs_set_bits(GUEST_CR0, CR0_TS_MASK);
451 update_exception_bitmap(vcpu);
452}
453
287static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) 454static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
288{ 455{
289 vcpu_clear(vcpu); 456 vcpu_clear(vcpu);
@@ -332,41 +499,61 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
332} 499}
333 500
334/* 501/*
502 * Swap MSR entry in host/guest MSR entry array.
503 */
504void move_msr_up(struct kvm_vcpu *vcpu, int from, int to)
505{
506 struct vmx_msr_entry tmp;
507 tmp = vcpu->guest_msrs[to];
508 vcpu->guest_msrs[to] = vcpu->guest_msrs[from];
509 vcpu->guest_msrs[from] = tmp;
510 tmp = vcpu->host_msrs[to];
511 vcpu->host_msrs[to] = vcpu->host_msrs[from];
512 vcpu->host_msrs[from] = tmp;
513}
514
515/*
335 * Set up the vmcs to automatically save and restore system 516 * Set up the vmcs to automatically save and restore system
336 * msrs. Don't touch the 64-bit msrs if the guest is in legacy 517 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
337 * mode, as fiddling with msrs is very expensive. 518 * mode, as fiddling with msrs is very expensive.
338 */ 519 */
339static void setup_msrs(struct kvm_vcpu *vcpu) 520static void setup_msrs(struct kvm_vcpu *vcpu)
340{ 521{
341 int nr_skip, nr_good_msrs; 522 int save_nmsrs;
342
343 if (is_long_mode(vcpu))
344 nr_skip = NR_BAD_MSRS;
345 else
346 nr_skip = NR_64BIT_MSRS;
347 nr_good_msrs = vcpu->nmsrs - nr_skip;
348 523
349 /* 524 save_nmsrs = 0;
350 * MSR_K6_STAR is only needed on long mode guests, and only
351 * if efer.sce is enabled.
352 */
353 if (find_msr_entry(vcpu, MSR_K6_STAR)) {
354 --nr_good_msrs;
355#ifdef CONFIG_X86_64 525#ifdef CONFIG_X86_64
356 if (is_long_mode(vcpu) && (vcpu->shadow_efer & EFER_SCE)) 526 if (is_long_mode(vcpu)) {
357 ++nr_good_msrs; 527 int index;
358#endif 528
529 index = __find_msr_index(vcpu, MSR_SYSCALL_MASK);
530 if (index >= 0)
531 move_msr_up(vcpu, index, save_nmsrs++);
532 index = __find_msr_index(vcpu, MSR_LSTAR);
533 if (index >= 0)
534 move_msr_up(vcpu, index, save_nmsrs++);
535 index = __find_msr_index(vcpu, MSR_CSTAR);
536 if (index >= 0)
537 move_msr_up(vcpu, index, save_nmsrs++);
538 index = __find_msr_index(vcpu, MSR_KERNEL_GS_BASE);
539 if (index >= 0)
540 move_msr_up(vcpu, index, save_nmsrs++);
541 /*
542 * MSR_K6_STAR is only needed on long mode guests, and only
543 * if efer.sce is enabled.
544 */
545 index = __find_msr_index(vcpu, MSR_K6_STAR);
546 if ((index >= 0) && (vcpu->shadow_efer & EFER_SCE))
547 move_msr_up(vcpu, index, save_nmsrs++);
359 } 548 }
549#endif
550 vcpu->save_nmsrs = save_nmsrs;
360 551
361 vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, 552#ifdef CONFIG_X86_64
362 virt_to_phys(vcpu->guest_msrs + nr_skip)); 553 vcpu->msr_offset_kernel_gs_base =
363 vmcs_writel(VM_EXIT_MSR_STORE_ADDR, 554 __find_msr_index(vcpu, MSR_KERNEL_GS_BASE);
364 virt_to_phys(vcpu->guest_msrs + nr_skip)); 555#endif
365 vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, 556 vcpu->msr_offset_efer = __find_msr_index(vcpu, MSR_EFER);
366 virt_to_phys(vcpu->host_msrs + nr_skip));
367 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
368 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
369 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
370} 557}
371 558
372/* 559/*
@@ -394,23 +581,6 @@ static void guest_write_tsc(u64 guest_tsc)
394 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); 581 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
395} 582}
396 583
397static void reload_tss(void)
398{
399#ifndef CONFIG_X86_64
400
401 /*
402 * VT restores TR but not its size. Useless.
403 */
404 struct descriptor_table gdt;
405 struct segment_descriptor *descs;
406
407 get_gdt(&gdt);
408 descs = (void *)gdt.base;
409 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
410 load_TR_desc();
411#endif
412}
413
414/* 584/*
415 * Reads an msr value (of 'msr_index') into 'pdata'. 585 * Reads an msr value (of 'msr_index') into 'pdata'.
416 * Returns 0 on success, non-0 otherwise. 586 * Returns 0 on success, non-0 otherwise.
@@ -470,10 +640,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
470static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 640static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
471{ 641{
472 struct vmx_msr_entry *msr; 642 struct vmx_msr_entry *msr;
643 int ret = 0;
644
473 switch (msr_index) { 645 switch (msr_index) {
474#ifdef CONFIG_X86_64 646#ifdef CONFIG_X86_64
475 case MSR_EFER: 647 case MSR_EFER:
476 return kvm_set_msr_common(vcpu, msr_index, data); 648 ret = kvm_set_msr_common(vcpu, msr_index, data);
649 if (vcpu->vmx_host_state.loaded)
650 load_transition_efer(vcpu);
651 break;
477 case MSR_FS_BASE: 652 case MSR_FS_BASE:
478 vmcs_writel(GUEST_FS_BASE, data); 653 vmcs_writel(GUEST_FS_BASE, data);
479 break; 654 break;
@@ -497,14 +672,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
497 msr = find_msr_entry(vcpu, msr_index); 672 msr = find_msr_entry(vcpu, msr_index);
498 if (msr) { 673 if (msr) {
499 msr->data = data; 674 msr->data = data;
675 if (vcpu->vmx_host_state.loaded)
676 load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs);
500 break; 677 break;
501 } 678 }
502 return kvm_set_msr_common(vcpu, msr_index, data); 679 ret = kvm_set_msr_common(vcpu, msr_index, data);
503 msr->data = data;
504 break;
505 } 680 }
506 681
507 return 0; 682 return ret;
508} 683}
509 684
510/* 685/*
@@ -530,10 +705,8 @@ static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
530static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) 705static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
531{ 706{
532 unsigned long dr7 = 0x400; 707 unsigned long dr7 = 0x400;
533 u32 exception_bitmap;
534 int old_singlestep; 708 int old_singlestep;
535 709
536 exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
537 old_singlestep = vcpu->guest_debug.singlestep; 710 old_singlestep = vcpu->guest_debug.singlestep;
538 711
539 vcpu->guest_debug.enabled = dbg->enabled; 712 vcpu->guest_debug.enabled = dbg->enabled;
@@ -549,13 +722,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
549 dr7 |= 0 << (i*4+16); /* execution breakpoint */ 722 dr7 |= 0 << (i*4+16); /* execution breakpoint */
550 } 723 }
551 724
552 exception_bitmap |= (1u << 1); /* Trap debug exceptions */
553
554 vcpu->guest_debug.singlestep = dbg->singlestep; 725 vcpu->guest_debug.singlestep = dbg->singlestep;
555 } else { 726 } else
556 exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
557 vcpu->guest_debug.singlestep = 0; 727 vcpu->guest_debug.singlestep = 0;
558 }
559 728
560 if (old_singlestep && !vcpu->guest_debug.singlestep) { 729 if (old_singlestep && !vcpu->guest_debug.singlestep) {
561 unsigned long flags; 730 unsigned long flags;
@@ -565,7 +734,7 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
565 vmcs_writel(GUEST_RFLAGS, flags); 734 vmcs_writel(GUEST_RFLAGS, flags);
566 } 735 }
567 736
568 vmcs_write32(EXCEPTION_BITMAP, exception_bitmap); 737 update_exception_bitmap(vcpu);
569 vmcs_writel(GUEST_DR7, dr7); 738 vmcs_writel(GUEST_DR7, dr7);
570 739
571 return 0; 740 return 0;
@@ -679,14 +848,6 @@ static __exit void hardware_unsetup(void)
679 free_kvm_area(); 848 free_kvm_area();
680} 849}
681 850
682static void update_exception_bitmap(struct kvm_vcpu *vcpu)
683{
684 if (vcpu->rmode.active)
685 vmcs_write32(EXCEPTION_BITMAP, ~0);
686 else
687 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
688}
689
690static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) 851static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
691{ 852{
692 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 853 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -793,6 +954,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
793 fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); 954 fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
794 fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); 955 fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
795 fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); 956 fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
957
958 init_rmode_tss(vcpu->kvm);
796} 959}
797 960
798#ifdef CONFIG_X86_64 961#ifdef CONFIG_X86_64
@@ -837,6 +1000,8 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
837 1000
838static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1001static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
839{ 1002{
1003 vmx_fpu_deactivate(vcpu);
1004
840 if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) 1005 if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
841 enter_pmode(vcpu); 1006 enter_pmode(vcpu);
842 1007
@@ -852,26 +1017,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
852 } 1017 }
853#endif 1018#endif
854 1019
855 if (!(cr0 & CR0_TS_MASK)) {
856 vcpu->fpu_active = 1;
857 vmcs_clear_bits(EXCEPTION_BITMAP, CR0_TS_MASK);
858 }
859
860 vmcs_writel(CR0_READ_SHADOW, cr0); 1020 vmcs_writel(CR0_READ_SHADOW, cr0);
861 vmcs_writel(GUEST_CR0, 1021 vmcs_writel(GUEST_CR0,
862 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); 1022 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
863 vcpu->cr0 = cr0; 1023 vcpu->cr0 = cr0;
1024
1025 if (!(cr0 & CR0_TS_MASK) || !(cr0 & CR0_PE_MASK))
1026 vmx_fpu_activate(vcpu);
864} 1027}
865 1028
866static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 1029static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
867{ 1030{
868 vmcs_writel(GUEST_CR3, cr3); 1031 vmcs_writel(GUEST_CR3, cr3);
869 1032 if (vcpu->cr0 & CR0_PE_MASK)
870 if (!(vcpu->cr0 & CR0_TS_MASK)) { 1033 vmx_fpu_deactivate(vcpu);
871 vcpu->fpu_active = 0;
872 vmcs_set_bits(GUEST_CR0, CR0_TS_MASK);
873 vmcs_set_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
874 }
875} 1034}
876 1035
877static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1036static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -937,23 +1096,11 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
937 var->unusable = (ar >> 16) & 1; 1096 var->unusable = (ar >> 16) & 1;
938} 1097}
939 1098
940static void vmx_set_segment(struct kvm_vcpu *vcpu, 1099static u32 vmx_segment_access_rights(struct kvm_segment *var)
941 struct kvm_segment *var, int seg)
942{ 1100{
943 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
944 u32 ar; 1101 u32 ar;
945 1102
946 vmcs_writel(sf->base, var->base); 1103 if (var->unusable)
947 vmcs_write32(sf->limit, var->limit);
948 vmcs_write16(sf->selector, var->selector);
949 if (vcpu->rmode.active && var->s) {
950 /*
951 * Hack real-mode segments into vm86 compatibility.
952 */
953 if (var->base == 0xffff0000 && var->selector == 0xf000)
954 vmcs_writel(sf->base, 0xf0000);
955 ar = 0xf3;
956 } else if (var->unusable)
957 ar = 1 << 16; 1104 ar = 1 << 16;
958 else { 1105 else {
959 ar = var->type & 15; 1106 ar = var->type & 15;
@@ -967,6 +1114,35 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
967 } 1114 }
968 if (ar == 0) /* a 0 value means unusable */ 1115 if (ar == 0) /* a 0 value means unusable */
969 ar = AR_UNUSABLE_MASK; 1116 ar = AR_UNUSABLE_MASK;
1117
1118 return ar;
1119}
1120
1121static void vmx_set_segment(struct kvm_vcpu *vcpu,
1122 struct kvm_segment *var, int seg)
1123{
1124 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1125 u32 ar;
1126
1127 if (vcpu->rmode.active && seg == VCPU_SREG_TR) {
1128 vcpu->rmode.tr.selector = var->selector;
1129 vcpu->rmode.tr.base = var->base;
1130 vcpu->rmode.tr.limit = var->limit;
1131 vcpu->rmode.tr.ar = vmx_segment_access_rights(var);
1132 return;
1133 }
1134 vmcs_writel(sf->base, var->base);
1135 vmcs_write32(sf->limit, var->limit);
1136 vmcs_write16(sf->selector, var->selector);
1137 if (vcpu->rmode.active && var->s) {
1138 /*
1139 * Hack real-mode segments into vm86 compatibility.
1140 */
1141 if (var->base == 0xffff0000 && var->selector == 0xf000)
1142 vmcs_writel(sf->base, 0xf0000);
1143 ar = 0xf3;
1144 } else
1145 ar = vmx_segment_access_rights(var);
970 vmcs_write32(sf->ar_bytes, ar); 1146 vmcs_write32(sf->ar_bytes, ar);
971} 1147}
972 1148
@@ -1018,16 +1194,16 @@ static int init_rmode_tss(struct kvm* kvm)
1018 } 1194 }
1019 1195
1020 page = kmap_atomic(p1, KM_USER0); 1196 page = kmap_atomic(p1, KM_USER0);
1021 memset(page, 0, PAGE_SIZE); 1197 clear_page(page);
1022 *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 1198 *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1023 kunmap_atomic(page, KM_USER0); 1199 kunmap_atomic(page, KM_USER0);
1024 1200
1025 page = kmap_atomic(p2, KM_USER0); 1201 page = kmap_atomic(p2, KM_USER0);
1026 memset(page, 0, PAGE_SIZE); 1202 clear_page(page);
1027 kunmap_atomic(page, KM_USER0); 1203 kunmap_atomic(page, KM_USER0);
1028 1204
1029 page = kmap_atomic(p3, KM_USER0); 1205 page = kmap_atomic(p3, KM_USER0);
1030 memset(page, 0, PAGE_SIZE); 1206 clear_page(page);
1031 *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; 1207 *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
1032 kunmap_atomic(page, KM_USER0); 1208 kunmap_atomic(page, KM_USER0);
1033 1209
@@ -1066,7 +1242,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1066 struct descriptor_table dt; 1242 struct descriptor_table dt;
1067 int i; 1243 int i;
1068 int ret = 0; 1244 int ret = 0;
1069 extern asmlinkage void kvm_vmx_return(void); 1245 unsigned long kvm_vmx_return;
1070 1246
1071 if (!init_rmode_tss(vcpu->kvm)) { 1247 if (!init_rmode_tss(vcpu->kvm)) {
1072 ret = -ENOMEM; 1248 ret = -ENOMEM;
@@ -1076,9 +1252,9 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1076 memset(vcpu->regs, 0, sizeof(vcpu->regs)); 1252 memset(vcpu->regs, 0, sizeof(vcpu->regs));
1077 vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val(); 1253 vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1078 vcpu->cr8 = 0; 1254 vcpu->cr8 = 0;
1079 vcpu->apic_base = 0xfee00000 | 1255 vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1080 /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | 1256 if (vcpu == &vcpu->kvm->vcpus[0])
1081 MSR_IA32_APICBASE_ENABLE; 1257 vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
1082 1258
1083 fx_init(vcpu); 1259 fx_init(vcpu);
1084 1260
@@ -1129,8 +1305,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1129 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); 1305 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1130 1306
1131 /* I/O */ 1307 /* I/O */
1132 vmcs_write64(IO_BITMAP_A, 0); 1308 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
1133 vmcs_write64(IO_BITMAP_B, 0); 1309 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
1134 1310
1135 guest_write_tsc(0); 1311 guest_write_tsc(0);
1136 1312
@@ -1150,12 +1326,11 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1150 CPU_BASED_HLT_EXITING /* 20.6.2 */ 1326 CPU_BASED_HLT_EXITING /* 20.6.2 */
1151 | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ 1327 | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */
1152 | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ 1328 | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */
1153 | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ 1329 | CPU_BASED_ACTIVATE_IO_BITMAP /* 20.6.2 */
1154 | CPU_BASED_MOV_DR_EXITING 1330 | CPU_BASED_MOV_DR_EXITING
1155 | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ 1331 | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */
1156 ); 1332 );
1157 1333
1158 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1159 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 1334 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1160 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 1335 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1161 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 1336 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
@@ -1185,8 +1360,11 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1185 get_idt(&dt); 1360 get_idt(&dt);
1186 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 1361 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1187 1362
1188 1363 asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
1189 vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */ 1364 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
1365 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1366 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
1367 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
1190 1368
1191 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); 1369 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
1192 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); 1370 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
@@ -1210,10 +1388,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1210 vcpu->host_msrs[j].reserved = 0; 1388 vcpu->host_msrs[j].reserved = 0;
1211 vcpu->host_msrs[j].data = data; 1389 vcpu->host_msrs[j].data = data;
1212 vcpu->guest_msrs[j] = vcpu->host_msrs[j]; 1390 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1213#ifdef CONFIG_X86_64
1214 if (index == MSR_KERNEL_GS_BASE)
1215 msr_offset_kernel_gs_base = j;
1216#endif
1217 ++vcpu->nmsrs; 1391 ++vcpu->nmsrs;
1218 } 1392 }
1219 1393
@@ -1241,6 +1415,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1241#ifdef CONFIG_X86_64 1415#ifdef CONFIG_X86_64
1242 vmx_set_efer(vcpu, 0); 1416 vmx_set_efer(vcpu, 0);
1243#endif 1417#endif
1418 vmx_fpu_activate(vcpu);
1419 update_exception_bitmap(vcpu);
1244 1420
1245 return 0; 1421 return 0;
1246 1422
@@ -1365,7 +1541,11 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1365 if (!vcpu->rmode.active) 1541 if (!vcpu->rmode.active)
1366 return 0; 1542 return 0;
1367 1543
1368 if (vec == GP_VECTOR && err_code == 0) 1544 /*
1545 * Instruction with address size override prefix opcode 0x67
1546 * Cause the #SS fault with 0 error code in VM86 mode.
1547 */
1548 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
1369 if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) 1549 if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
1370 return 1; 1550 return 1;
1371 return 0; 1551 return 0;
@@ -1400,10 +1580,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1400 } 1580 }
1401 1581
1402 if (is_no_device(intr_info)) { 1582 if (is_no_device(intr_info)) {
1403 vcpu->fpu_active = 1; 1583 vmx_fpu_activate(vcpu);
1404 vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
1405 if (!(vcpu->cr0 & CR0_TS_MASK))
1406 vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK);
1407 return 1; 1584 return 1;
1408 } 1585 }
1409 1586
@@ -1445,8 +1622,13 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1445 1622
1446 if (vcpu->rmode.active && 1623 if (vcpu->rmode.active &&
1447 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, 1624 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1448 error_code)) 1625 error_code)) {
1626 if (vcpu->halt_request) {
1627 vcpu->halt_request = 0;
1628 return kvm_emulate_halt(vcpu);
1629 }
1449 return 1; 1630 return 1;
1631 }
1450 1632
1451 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { 1633 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
1452 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1634 kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -1595,11 +1777,10 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1595 break; 1777 break;
1596 case 2: /* clts */ 1778 case 2: /* clts */
1597 vcpu_load_rsp_rip(vcpu); 1779 vcpu_load_rsp_rip(vcpu);
1598 vcpu->fpu_active = 1; 1780 vmx_fpu_deactivate(vcpu);
1599 vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
1600 vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK);
1601 vcpu->cr0 &= ~CR0_TS_MASK; 1781 vcpu->cr0 &= ~CR0_TS_MASK;
1602 vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); 1782 vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
1783 vmx_fpu_activate(vcpu);
1603 skip_emulated_instruction(vcpu); 1784 skip_emulated_instruction(vcpu);
1604 return 1; 1785 return 1;
1605 case 1: /*mov from cr*/ 1786 case 1: /*mov from cr*/
@@ -1734,12 +1915,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
1734static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1915static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1735{ 1916{
1736 skip_emulated_instruction(vcpu); 1917 skip_emulated_instruction(vcpu);
1737 if (vcpu->irq_summary) 1918 return kvm_emulate_halt(vcpu);
1738 return 1;
1739
1740 kvm_run->exit_reason = KVM_EXIT_HLT;
1741 ++vcpu->stat.halt_exits;
1742 return 0;
1743} 1919}
1744 1920
1745static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1921static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1770,7 +1946,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
1770}; 1946};
1771 1947
1772static const int kvm_vmx_max_exit_handlers = 1948static const int kvm_vmx_max_exit_handlers =
1773 sizeof(kvm_vmx_exit_handlers) / sizeof(*kvm_vmx_exit_handlers); 1949 ARRAY_SIZE(kvm_vmx_exit_handlers);
1774 1950
1775/* 1951/*
1776 * The guest has exited. See if we can fix it or if we need userspace 1952 * The guest has exited. See if we can fix it or if we need userspace
@@ -1810,61 +1986,44 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1810 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); 1986 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
1811} 1987}
1812 1988
1989static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1990{
1991}
1992
1813static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1993static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1814{ 1994{
1815 u8 fail; 1995 u8 fail;
1816 u16 fs_sel, gs_sel, ldt_sel;
1817 int fs_gs_ldt_reload_needed;
1818 int r; 1996 int r;
1819 1997
1820again: 1998preempted:
1821 /* 1999 if (vcpu->guest_debug.enabled)
1822 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 2000 kvm_guest_debug_pre(vcpu);
1823 * allow segment selectors with cpl > 0 or ti == 1.
1824 */
1825 fs_sel = read_fs();
1826 gs_sel = read_gs();
1827 ldt_sel = read_ldt();
1828 fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
1829 if (!fs_gs_ldt_reload_needed) {
1830 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1831 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1832 } else {
1833 vmcs_write16(HOST_FS_SELECTOR, 0);
1834 vmcs_write16(HOST_GS_SELECTOR, 0);
1835 }
1836
1837#ifdef CONFIG_X86_64
1838 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1839 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1840#else
1841 vmcs_writel(HOST_FS_BASE, segment_base(fs_sel));
1842 vmcs_writel(HOST_GS_BASE, segment_base(gs_sel));
1843#endif
1844 2001
2002again:
1845 if (!vcpu->mmio_read_completed) 2003 if (!vcpu->mmio_read_completed)
1846 do_interrupt_requests(vcpu, kvm_run); 2004 do_interrupt_requests(vcpu, kvm_run);
1847 2005
1848 if (vcpu->guest_debug.enabled) 2006 vmx_save_host_state(vcpu);
1849 kvm_guest_debug_pre(vcpu);
1850
1851 kvm_load_guest_fpu(vcpu); 2007 kvm_load_guest_fpu(vcpu);
1852 2008
2009 r = kvm_mmu_reload(vcpu);
2010 if (unlikely(r))
2011 goto out;
2012
1853 /* 2013 /*
1854 * Loading guest fpu may have cleared host cr0.ts 2014 * Loading guest fpu may have cleared host cr0.ts
1855 */ 2015 */
1856 vmcs_writel(HOST_CR0, read_cr0()); 2016 vmcs_writel(HOST_CR0, read_cr0());
1857 2017
1858#ifdef CONFIG_X86_64 2018 local_irq_disable();
1859 if (is_long_mode(vcpu)) { 2019
1860 save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); 2020 vcpu->guest_mode = 1;
1861 load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); 2021 if (vcpu->requests)
1862 } 2022 if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
1863#endif 2023 vmx_flush_tlb(vcpu);
1864 2024
1865 asm ( 2025 asm (
1866 /* Store host registers */ 2026 /* Store host registers */
1867 "pushf \n\t"
1868#ifdef CONFIG_X86_64 2027#ifdef CONFIG_X86_64
1869 "push %%rax; push %%rbx; push %%rdx;" 2028 "push %%rax; push %%rbx; push %%rdx;"
1870 "push %%rsi; push %%rdi; push %%rbp;" 2029 "push %%rsi; push %%rdi; push %%rbp;"
@@ -1909,12 +2068,11 @@ again:
1909 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ 2068 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
1910#endif 2069#endif
1911 /* Enter guest mode */ 2070 /* Enter guest mode */
1912 "jne launched \n\t" 2071 "jne .Llaunched \n\t"
1913 ASM_VMX_VMLAUNCH "\n\t" 2072 ASM_VMX_VMLAUNCH "\n\t"
1914 "jmp kvm_vmx_return \n\t" 2073 "jmp .Lkvm_vmx_return \n\t"
1915 "launched: " ASM_VMX_VMRESUME "\n\t" 2074 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
1916 ".globl kvm_vmx_return \n\t" 2075 ".Lkvm_vmx_return: "
1917 "kvm_vmx_return: "
1918 /* Save guest registers, load host registers, keep flags */ 2076 /* Save guest registers, load host registers, keep flags */
1919#ifdef CONFIG_X86_64 2077#ifdef CONFIG_X86_64
1920 "xchg %3, (%%rsp) \n\t" 2078 "xchg %3, (%%rsp) \n\t"
@@ -1957,7 +2115,6 @@ again:
1957 "pop %%ecx; popa \n\t" 2115 "pop %%ecx; popa \n\t"
1958#endif 2116#endif
1959 "setbe %0 \n\t" 2117 "setbe %0 \n\t"
1960 "popf \n\t"
1961 : "=q" (fail) 2118 : "=q" (fail)
1962 : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP), 2119 : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP),
1963 "c"(vcpu), 2120 "c"(vcpu),
@@ -1981,84 +2138,61 @@ again:
1981 [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) 2138 [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
1982 : "cc", "memory" ); 2139 : "cc", "memory" );
1983 2140
1984 /* 2141 vcpu->guest_mode = 0;
1985 * Reload segment selectors ASAP. (it's needed for a functional 2142 local_irq_enable();
1986 * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64
1987 * relies on having 0 in %gs for the CPU PDA to work.)
1988 */
1989 if (fs_gs_ldt_reload_needed) {
1990 load_ldt(ldt_sel);
1991 load_fs(fs_sel);
1992 /*
1993 * If we have to reload gs, we must take care to
1994 * preserve our gs base.
1995 */
1996 local_irq_disable();
1997 load_gs(gs_sel);
1998#ifdef CONFIG_X86_64
1999 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
2000#endif
2001 local_irq_enable();
2002 2143
2003 reload_tss();
2004 }
2005 ++vcpu->stat.exits; 2144 ++vcpu->stat.exits;
2006 2145
2007#ifdef CONFIG_X86_64
2008 if (is_long_mode(vcpu)) {
2009 save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
2010 load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
2011 }
2012#endif
2013
2014 vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; 2146 vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
2015 2147
2016 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 2148 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2017 2149
2018 if (fail) { 2150 if (unlikely(fail)) {
2019 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2151 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2020 kvm_run->fail_entry.hardware_entry_failure_reason 2152 kvm_run->fail_entry.hardware_entry_failure_reason
2021 = vmcs_read32(VM_INSTRUCTION_ERROR); 2153 = vmcs_read32(VM_INSTRUCTION_ERROR);
2022 r = 0; 2154 r = 0;
2023 } else { 2155 goto out;
2024 /* 2156 }
2025 * Profile KVM exit RIPs: 2157 /*
2026 */ 2158 * Profile KVM exit RIPs:
2027 if (unlikely(prof_on == KVM_PROFILING)) 2159 */
2028 profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); 2160 if (unlikely(prof_on == KVM_PROFILING))
2029 2161 profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
2030 vcpu->launched = 1; 2162
2031 r = kvm_handle_exit(kvm_run, vcpu); 2163 vcpu->launched = 1;
2032 if (r > 0) { 2164 r = kvm_handle_exit(kvm_run, vcpu);
2033 /* Give scheduler a change to reschedule. */ 2165 if (r > 0) {
2034 if (signal_pending(current)) { 2166 /* Give scheduler a change to reschedule. */
2035 ++vcpu->stat.signal_exits; 2167 if (signal_pending(current)) {
2036 post_kvm_run_save(vcpu, kvm_run); 2168 r = -EINTR;
2037 kvm_run->exit_reason = KVM_EXIT_INTR; 2169 kvm_run->exit_reason = KVM_EXIT_INTR;
2038 return -EINTR; 2170 ++vcpu->stat.signal_exits;
2039 } 2171 goto out;
2040 2172 }
2041 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 2173
2042 ++vcpu->stat.request_irq_exits; 2174 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2043 post_kvm_run_save(vcpu, kvm_run); 2175 r = -EINTR;
2044 kvm_run->exit_reason = KVM_EXIT_INTR; 2176 kvm_run->exit_reason = KVM_EXIT_INTR;
2045 return -EINTR; 2177 ++vcpu->stat.request_irq_exits;
2046 } 2178 goto out;
2047 2179 }
2048 kvm_resched(vcpu); 2180 if (!need_resched()) {
2181 ++vcpu->stat.light_exits;
2049 goto again; 2182 goto again;
2050 } 2183 }
2051 } 2184 }
2052 2185
2186out:
2187 if (r > 0) {
2188 kvm_resched(vcpu);
2189 goto preempted;
2190 }
2191
2053 post_kvm_run_save(vcpu, kvm_run); 2192 post_kvm_run_save(vcpu, kvm_run);
2054 return r; 2193 return r;
2055} 2194}
2056 2195
2057static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
2058{
2059 vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3));
2060}
2061
2062static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, 2196static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
2063 unsigned long addr, 2197 unsigned long addr,
2064 u32 err_code) 2198 u32 err_code)
@@ -2122,7 +2256,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
2122 vmcs_clear(vmcs); 2256 vmcs_clear(vmcs);
2123 vcpu->vmcs = vmcs; 2257 vcpu->vmcs = vmcs;
2124 vcpu->launched = 0; 2258 vcpu->launched = 0;
2125 vcpu->fpu_active = 1;
2126 2259
2127 return 0; 2260 return 0;
2128 2261
@@ -2188,11 +2321,50 @@ static struct kvm_arch_ops vmx_arch_ops = {
2188 2321
2189static int __init vmx_init(void) 2322static int __init vmx_init(void)
2190{ 2323{
2191 return kvm_init_arch(&vmx_arch_ops, THIS_MODULE); 2324 void *iova;
2325 int r;
2326
2327 vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2328 if (!vmx_io_bitmap_a)
2329 return -ENOMEM;
2330
2331 vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2332 if (!vmx_io_bitmap_b) {
2333 r = -ENOMEM;
2334 goto out;
2335 }
2336
2337 /*
2338 * Allow direct access to the PC debug port (it is often used for I/O
2339 * delays, but the vmexits simply slow things down).
2340 */
2341 iova = kmap(vmx_io_bitmap_a);
2342 memset(iova, 0xff, PAGE_SIZE);
2343 clear_bit(0x80, iova);
2344 kunmap(vmx_io_bitmap_a);
2345
2346 iova = kmap(vmx_io_bitmap_b);
2347 memset(iova, 0xff, PAGE_SIZE);
2348 kunmap(vmx_io_bitmap_b);
2349
2350 r = kvm_init_arch(&vmx_arch_ops, THIS_MODULE);
2351 if (r)
2352 goto out1;
2353
2354 return 0;
2355
2356out1:
2357 __free_page(vmx_io_bitmap_b);
2358out:
2359 __free_page(vmx_io_bitmap_a);
2360 return r;
2192} 2361}
2193 2362
2194static void __exit vmx_exit(void) 2363static void __exit vmx_exit(void)
2195{ 2364{
2365 __free_page(vmx_io_bitmap_b);
2366 __free_page(vmx_io_bitmap_a);
2367
2196 kvm_exit_arch(); 2368 kvm_exit_arch();
2197} 2369}
2198 2370
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
index 7ade09086aa5..f60012d62610 100644
--- a/drivers/kvm/x86_emulate.c
+++ b/drivers/kvm/x86_emulate.c
@@ -98,8 +98,11 @@ static u8 opcode_table[256] = {
98 0, 0, 0, 0, 98 0, 0, 0, 0,
99 /* 0x40 - 0x4F */ 99 /* 0x40 - 0x4F */
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 /* 0x50 - 0x5F */ 101 /* 0x50 - 0x57 */
102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 102 0, 0, 0, 0, 0, 0, 0, 0,
103 /* 0x58 - 0x5F */
104 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
105 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
103 /* 0x60 - 0x6F */ 106 /* 0x60 - 0x6F */
104 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 107 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 108 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -128,9 +131,9 @@ static u8 opcode_table[256] = {
128 /* 0xB0 - 0xBF */ 131 /* 0xB0 - 0xBF */
129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 132 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
130 /* 0xC0 - 0xC7 */ 133 /* 0xC0 - 0xC7 */
131 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, 0, 134 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
132 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, 135 0, ImplicitOps, 0, 0,
133 DstMem | SrcImm | ModRM | Mov, 136 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
134 /* 0xC8 - 0xCF */ 137 /* 0xC8 - 0xCF */
135 0, 0, 0, 0, 0, 0, 0, 0, 138 0, 0, 0, 0, 0, 0, 0, 0,
136 /* 0xD0 - 0xD7 */ 139 /* 0xD0 - 0xD7 */
@@ -143,7 +146,8 @@ static u8 opcode_table[256] = {
143 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 146 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
144 /* 0xF0 - 0xF7 */ 147 /* 0xF0 - 0xF7 */
145 0, 0, 0, 0, 148 0, 0, 0, 0,
146 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 149 ImplicitOps, 0,
150 ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
147 /* 0xF8 - 0xFF */ 151 /* 0xF8 - 0xFF */
148 0, 0, 0, 0, 152 0, 0, 0, 0,
149 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM 153 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
@@ -152,7 +156,7 @@ static u8 opcode_table[256] = {
152static u16 twobyte_table[256] = { 156static u16 twobyte_table[256] = {
153 /* 0x00 - 0x0F */ 157 /* 0x00 - 0x0F */
154 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, 158 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
155 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 159 0, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
156 /* 0x10 - 0x1F */ 160 /* 0x10 - 0x1F */
157 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 161 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
158 /* 0x20 - 0x2F */ 162 /* 0x20 - 0x2F */
@@ -481,6 +485,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
481 int mode = ctxt->mode; 485 int mode = ctxt->mode;
482 unsigned long modrm_ea; 486 unsigned long modrm_ea;
483 int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0; 487 int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
488 int no_wb = 0;
484 489
485 /* Shadow copy of register state. Committed on successful emulation. */ 490 /* Shadow copy of register state. Committed on successful emulation. */
486 unsigned long _regs[NR_VCPU_REGS]; 491 unsigned long _regs[NR_VCPU_REGS];
@@ -1047,7 +1052,7 @@ done_prefixes:
1047 _regs[VCPU_REGS_RSP]), 1052 _regs[VCPU_REGS_RSP]),
1048 &dst.val, dst.bytes, ctxt)) != 0) 1053 &dst.val, dst.bytes, ctxt)) != 0)
1049 goto done; 1054 goto done;
1050 dst.val = dst.orig_val; /* skanky: disable writeback */ 1055 no_wb = 1;
1051 break; 1056 break;
1052 default: 1057 default:
1053 goto cannot_emulate; 1058 goto cannot_emulate;
@@ -1056,7 +1061,7 @@ done_prefixes:
1056 } 1061 }
1057 1062
1058writeback: 1063writeback:
1059 if ((d & Mov) || (dst.orig_val != dst.val)) { 1064 if (!no_wb) {
1060 switch (dst.type) { 1065 switch (dst.type) {
1061 case OP_REG: 1066 case OP_REG:
1062 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ 1067 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
@@ -1149,6 +1154,23 @@ special_insn:
1149 case 0xae ... 0xaf: /* scas */ 1154 case 0xae ... 0xaf: /* scas */
1150 DPRINTF("Urk! I don't handle SCAS.\n"); 1155 DPRINTF("Urk! I don't handle SCAS.\n");
1151 goto cannot_emulate; 1156 goto cannot_emulate;
1157 case 0xf4: /* hlt */
1158 ctxt->vcpu->halt_request = 1;
1159 goto done;
1160 case 0xc3: /* ret */
1161 dst.ptr = &_eip;
1162 goto pop_instruction;
1163 case 0x58 ... 0x5f: /* pop reg */
1164 dst.ptr = (unsigned long *)&_regs[b & 0x7];
1165
1166pop_instruction:
1167 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1168 _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt)) != 0)
1169 goto done;
1170
1171 register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
1172 no_wb = 1; /* Disable writeback. */
1173 break;
1152 } 1174 }
1153 goto writeback; 1175 goto writeback;
1154 1176
@@ -1302,8 +1324,10 @@ twobyte_insn:
1302 1324
1303twobyte_special_insn: 1325twobyte_special_insn:
1304 /* Disable writeback. */ 1326 /* Disable writeback. */
1305 dst.orig_val = dst.val; 1327 no_wb = 1;
1306 switch (b) { 1328 switch (b) {
1329 case 0x09: /* wbinvd */
1330 break;
1307 case 0x0d: /* GrpP (prefetch) */ 1331 case 0x0d: /* GrpP (prefetch) */
1308 case 0x18: /* Grp16 (prefetch/nop) */ 1332 case 0x18: /* Grp16 (prefetch/nop) */
1309 break; 1333 break;