aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--drivers/kvm/kvm.h100
-rw-r--r--drivers/kvm/kvm_main.c792
-rw-r--r--drivers/kvm/kvm_svm.h13
-rw-r--r--drivers/kvm/kvm_vmx.h14
-rw-r--r--drivers/kvm/mmu.c154
-rw-r--r--drivers/kvm/paging_tmpl.h12
-rw-r--r--drivers/kvm/svm.c197
-rw-r--r--drivers/kvm/svm.h6
-rw-r--r--drivers/kvm/vmx.c273
-rw-r--r--drivers/kvm/x86_emulate.c51
-rw-r--r--drivers/kvm/x86_emulate.h32
-rw-r--r--include/linux/Kbuild1
-rw-r--r--include/linux/kvm.h133
-rw-r--r--include/linux/miscdevice.h1
14 files changed, 1301 insertions, 478 deletions
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 0d122bf889db..41634fde8e13 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -51,16 +51,19 @@
51#define UNMAPPED_GVA (~(gpa_t)0) 51#define UNMAPPED_GVA (~(gpa_t)0)
52 52
53#define KVM_MAX_VCPUS 1 53#define KVM_MAX_VCPUS 1
54#define KVM_ALIAS_SLOTS 4
54#define KVM_MEMORY_SLOTS 4 55#define KVM_MEMORY_SLOTS 4
55#define KVM_NUM_MMU_PAGES 256 56#define KVM_NUM_MMU_PAGES 256
56#define KVM_MIN_FREE_MMU_PAGES 5 57#define KVM_MIN_FREE_MMU_PAGES 5
57#define KVM_REFILL_PAGES 25 58#define KVM_REFILL_PAGES 25
59#define KVM_MAX_CPUID_ENTRIES 40
58 60
59#define FX_IMAGE_SIZE 512 61#define FX_IMAGE_SIZE 512
60#define FX_IMAGE_ALIGN 16 62#define FX_IMAGE_ALIGN 16
61#define FX_BUF_SIZE (2 * FX_IMAGE_SIZE + FX_IMAGE_ALIGN) 63#define FX_BUF_SIZE (2 * FX_IMAGE_SIZE + FX_IMAGE_ALIGN)
62 64
63#define DE_VECTOR 0 65#define DE_VECTOR 0
66#define NM_VECTOR 7
64#define DF_VECTOR 8 67#define DF_VECTOR 8
65#define TS_VECTOR 10 68#define TS_VECTOR 10
66#define NP_VECTOR 11 69#define NP_VECTOR 11
@@ -73,6 +76,8 @@
73 76
74#define IOPL_SHIFT 12 77#define IOPL_SHIFT 12
75 78
79#define KVM_PIO_PAGE_OFFSET 1
80
76/* 81/*
77 * Address types: 82 * Address types:
78 * 83 *
@@ -106,6 +111,7 @@ struct kvm_pte_chain {
106 * bits 4:7 - page table level for this shadow (1-4) 111 * bits 4:7 - page table level for this shadow (1-4)
107 * bits 8:9 - page table quadrant for 2-level guests 112 * bits 8:9 - page table quadrant for 2-level guests
108 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) 113 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
114 * bits 17:18 - "access" - the user and writable bits of a huge page pde
109 */ 115 */
110union kvm_mmu_page_role { 116union kvm_mmu_page_role {
111 unsigned word; 117 unsigned word;
@@ -115,6 +121,7 @@ union kvm_mmu_page_role {
115 unsigned quadrant : 2; 121 unsigned quadrant : 2;
116 unsigned pad_for_nice_hex_output : 6; 122 unsigned pad_for_nice_hex_output : 6;
117 unsigned metaphysical : 1; 123 unsigned metaphysical : 1;
124 unsigned hugepage_access : 2;
118 }; 125 };
119}; 126};
120 127
@@ -133,7 +140,6 @@ struct kvm_mmu_page {
133 unsigned long slot_bitmap; /* One bit set per slot which has memory 140 unsigned long slot_bitmap; /* One bit set per slot which has memory
134 * in this shadow page. 141 * in this shadow page.
135 */ 142 */
136 int global; /* Set if all ptes in this page are global */
137 int multimapped; /* More than one parent_pte? */ 143 int multimapped; /* More than one parent_pte? */
138 int root_count; /* Currently serving as active root */ 144 int root_count; /* Currently serving as active root */
139 union { 145 union {
@@ -219,6 +225,34 @@ enum {
219 VCPU_SREG_LDTR, 225 VCPU_SREG_LDTR,
220}; 226};
221 227
228struct kvm_pio_request {
229 unsigned long count;
230 int cur_count;
231 struct page *guest_pages[2];
232 unsigned guest_page_offset;
233 int in;
234 int size;
235 int string;
236 int down;
237 int rep;
238};
239
240struct kvm_stat {
241 u32 pf_fixed;
242 u32 pf_guest;
243 u32 tlb_flush;
244 u32 invlpg;
245
246 u32 exits;
247 u32 io_exits;
248 u32 mmio_exits;
249 u32 signal_exits;
250 u32 irq_window_exits;
251 u32 halt_exits;
252 u32 request_irq_exits;
253 u32 irq_exits;
254};
255
222struct kvm_vcpu { 256struct kvm_vcpu {
223 struct kvm *kvm; 257 struct kvm *kvm;
224 union { 258 union {
@@ -228,6 +262,8 @@ struct kvm_vcpu {
228 struct mutex mutex; 262 struct mutex mutex;
229 int cpu; 263 int cpu;
230 int launched; 264 int launched;
265 u64 host_tsc;
266 struct kvm_run *run;
231 int interrupt_window_open; 267 int interrupt_window_open;
232 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ 268 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
233#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) 269#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
@@ -266,6 +302,7 @@ struct kvm_vcpu {
266 char fx_buf[FX_BUF_SIZE]; 302 char fx_buf[FX_BUF_SIZE];
267 char *host_fx_image; 303 char *host_fx_image;
268 char *guest_fx_image; 304 char *guest_fx_image;
305 int fpu_active;
269 306
270 int mmio_needed; 307 int mmio_needed;
271 int mmio_read_completed; 308 int mmio_read_completed;
@@ -273,6 +310,14 @@ struct kvm_vcpu {
273 int mmio_size; 310 int mmio_size;
274 unsigned char mmio_data[8]; 311 unsigned char mmio_data[8];
275 gpa_t mmio_phys_addr; 312 gpa_t mmio_phys_addr;
313 gva_t mmio_fault_cr2;
314 struct kvm_pio_request pio;
315 void *pio_data;
316
317 int sigset_active;
318 sigset_t sigset;
319
320 struct kvm_stat stat;
276 321
277 struct { 322 struct {
278 int active; 323 int active;
@@ -284,6 +329,15 @@ struct kvm_vcpu {
284 u32 ar; 329 u32 ar;
285 } tr, es, ds, fs, gs; 330 } tr, es, ds, fs, gs;
286 } rmode; 331 } rmode;
332
333 int cpuid_nent;
334 struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
335};
336
337struct kvm_mem_alias {
338 gfn_t base_gfn;
339 unsigned long npages;
340 gfn_t target_gfn;
287}; 341};
288 342
289struct kvm_memory_slot { 343struct kvm_memory_slot {
@@ -296,6 +350,8 @@ struct kvm_memory_slot {
296 350
297struct kvm { 351struct kvm {
298 spinlock_t lock; /* protects everything except vcpus */ 352 spinlock_t lock; /* protects everything except vcpus */
353 int naliases;
354 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
299 int nmemslots; 355 int nmemslots;
300 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; 356 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
301 /* 357 /*
@@ -312,22 +368,6 @@ struct kvm {
312 struct file *filp; 368 struct file *filp;
313}; 369};
314 370
315struct kvm_stat {
316 u32 pf_fixed;
317 u32 pf_guest;
318 u32 tlb_flush;
319 u32 invlpg;
320
321 u32 exits;
322 u32 io_exits;
323 u32 mmio_exits;
324 u32 signal_exits;
325 u32 irq_window_exits;
326 u32 halt_exits;
327 u32 request_irq_exits;
328 u32 irq_exits;
329};
330
331struct descriptor_table { 371struct descriptor_table {
332 u16 limit; 372 u16 limit;
333 unsigned long base; 373 unsigned long base;
@@ -358,10 +398,8 @@ struct kvm_arch_ops {
358 void (*set_segment)(struct kvm_vcpu *vcpu, 398 void (*set_segment)(struct kvm_vcpu *vcpu,
359 struct kvm_segment *var, int seg); 399 struct kvm_segment *var, int seg);
360 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); 400 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
361 void (*decache_cr0_cr4_guest_bits)(struct kvm_vcpu *vcpu); 401 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
362 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); 402 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
363 void (*set_cr0_no_modeswitch)(struct kvm_vcpu *vcpu,
364 unsigned long cr0);
365 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 403 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
366 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); 404 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
367 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); 405 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
@@ -391,7 +429,6 @@ struct kvm_arch_ops {
391 unsigned char *hypercall_addr); 429 unsigned char *hypercall_addr);
392}; 430};
393 431
394extern struct kvm_stat kvm_stat;
395extern struct kvm_arch_ops *kvm_arch_ops; 432extern struct kvm_arch_ops *kvm_arch_ops;
396 433
397#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) 434#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
@@ -400,28 +437,29 @@ extern struct kvm_arch_ops *kvm_arch_ops;
400int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module); 437int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module);
401void kvm_exit_arch(void); 438void kvm_exit_arch(void);
402 439
440int kvm_mmu_module_init(void);
441void kvm_mmu_module_exit(void);
442
403void kvm_mmu_destroy(struct kvm_vcpu *vcpu); 443void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
404int kvm_mmu_create(struct kvm_vcpu *vcpu); 444int kvm_mmu_create(struct kvm_vcpu *vcpu);
405int kvm_mmu_setup(struct kvm_vcpu *vcpu); 445int kvm_mmu_setup(struct kvm_vcpu *vcpu);
406 446
407int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 447int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
408void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot); 448void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot);
449void kvm_mmu_zap_all(struct kvm_vcpu *vcpu);
409 450
410hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); 451hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
411#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) 452#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
412#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) 453#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
413static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } 454static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
414hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); 455hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
456struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
415 457
416void kvm_emulator_want_group7_invlpg(void); 458void kvm_emulator_want_group7_invlpg(void);
417 459
418extern hpa_t bad_page_address; 460extern hpa_t bad_page_address;
419 461
420static inline struct page *gfn_to_page(struct kvm_memory_slot *slot, gfn_t gfn) 462struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
421{
422 return slot->phys_mem[gfn - slot->base_gfn];
423}
424
425struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); 463struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
426void mark_page_dirty(struct kvm *kvm, gfn_t gfn); 464void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
427 465
@@ -444,6 +482,10 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
444 482
445struct x86_emulate_ctxt; 483struct x86_emulate_ctxt;
446 484
485int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
486 int size, unsigned long count, int string, int down,
487 gva_t address, int rep, unsigned port);
488void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
447int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 489int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
448int emulate_clts(struct kvm_vcpu *vcpu); 490int emulate_clts(struct kvm_vcpu *vcpu);
449int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, 491int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr,
@@ -493,12 +535,6 @@ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
493 return vcpu->mmu.page_fault(vcpu, gva, error_code); 535 return vcpu->mmu.page_fault(vcpu, gva, error_code);
494} 536}
495 537
496static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn)
497{
498 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
499 return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : NULL;
500}
501
502static inline int is_long_mode(struct kvm_vcpu *vcpu) 538static inline int is_long_mode(struct kvm_vcpu *vcpu)
503{ 539{
504#ifdef CONFIG_X86_64 540#ifdef CONFIG_X86_64
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index dc7a8c78cbf9..c8b8cfa332bb 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -51,27 +51,27 @@ static DEFINE_SPINLOCK(kvm_lock);
51static LIST_HEAD(vm_list); 51static LIST_HEAD(vm_list);
52 52
53struct kvm_arch_ops *kvm_arch_ops; 53struct kvm_arch_ops *kvm_arch_ops;
54struct kvm_stat kvm_stat; 54
55EXPORT_SYMBOL_GPL(kvm_stat); 55#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
56 56
57static struct kvm_stats_debugfs_item { 57static struct kvm_stats_debugfs_item {
58 const char *name; 58 const char *name;
59 u32 *data; 59 int offset;
60 struct dentry *dentry; 60 struct dentry *dentry;
61} debugfs_entries[] = { 61} debugfs_entries[] = {
62 { "pf_fixed", &kvm_stat.pf_fixed }, 62 { "pf_fixed", STAT_OFFSET(pf_fixed) },
63 { "pf_guest", &kvm_stat.pf_guest }, 63 { "pf_guest", STAT_OFFSET(pf_guest) },
64 { "tlb_flush", &kvm_stat.tlb_flush }, 64 { "tlb_flush", STAT_OFFSET(tlb_flush) },
65 { "invlpg", &kvm_stat.invlpg }, 65 { "invlpg", STAT_OFFSET(invlpg) },
66 { "exits", &kvm_stat.exits }, 66 { "exits", STAT_OFFSET(exits) },
67 { "io_exits", &kvm_stat.io_exits }, 67 { "io_exits", STAT_OFFSET(io_exits) },
68 { "mmio_exits", &kvm_stat.mmio_exits }, 68 { "mmio_exits", STAT_OFFSET(mmio_exits) },
69 { "signal_exits", &kvm_stat.signal_exits }, 69 { "signal_exits", STAT_OFFSET(signal_exits) },
70 { "irq_window", &kvm_stat.irq_window_exits }, 70 { "irq_window", STAT_OFFSET(irq_window_exits) },
71 { "halt_exits", &kvm_stat.halt_exits }, 71 { "halt_exits", STAT_OFFSET(halt_exits) },
72 { "request_irq", &kvm_stat.request_irq_exits }, 72 { "request_irq", STAT_OFFSET(request_irq_exits) },
73 { "irq_exits", &kvm_stat.irq_exits }, 73 { "irq_exits", STAT_OFFSET(irq_exits) },
74 { NULL, NULL } 74 { NULL }
75}; 75};
76 76
77static struct dentry *debugfs_dir; 77static struct dentry *debugfs_dir;
@@ -346,6 +346,17 @@ static void kvm_free_physmem(struct kvm *kvm)
346 kvm_free_physmem_slot(&kvm->memslots[i], NULL); 346 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
347} 347}
348 348
349static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
350{
351 int i;
352
353 for (i = 0; i < 2; ++i)
354 if (vcpu->pio.guest_pages[i]) {
355 __free_page(vcpu->pio.guest_pages[i]);
356 vcpu->pio.guest_pages[i] = NULL;
357 }
358}
359
349static void kvm_free_vcpu(struct kvm_vcpu *vcpu) 360static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
350{ 361{
351 if (!vcpu->vmcs) 362 if (!vcpu->vmcs)
@@ -355,6 +366,11 @@ static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
355 kvm_mmu_destroy(vcpu); 366 kvm_mmu_destroy(vcpu);
356 vcpu_put(vcpu); 367 vcpu_put(vcpu);
357 kvm_arch_ops->vcpu_free(vcpu); 368 kvm_arch_ops->vcpu_free(vcpu);
369 free_page((unsigned long)vcpu->run);
370 vcpu->run = NULL;
371 free_page((unsigned long)vcpu->pio_data);
372 vcpu->pio_data = NULL;
373 free_pio_guest_pages(vcpu);
358} 374}
359 375
360static void kvm_free_vcpus(struct kvm *kvm) 376static void kvm_free_vcpus(struct kvm *kvm)
@@ -404,12 +420,12 @@ static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
404 u64 pdpte; 420 u64 pdpte;
405 u64 *pdpt; 421 u64 *pdpt;
406 int ret; 422 int ret;
407 struct kvm_memory_slot *memslot; 423 struct page *page;
408 424
409 spin_lock(&vcpu->kvm->lock); 425 spin_lock(&vcpu->kvm->lock);
410 memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn); 426 page = gfn_to_page(vcpu->kvm, pdpt_gfn);
411 /* FIXME: !memslot - emulate? 0xff? */ 427 /* FIXME: !page - emulate? 0xff? */
412 pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0); 428 pdpt = kmap_atomic(page, KM_USER0);
413 429
414 ret = 1; 430 ret = 1;
415 for (i = 0; i < 4; ++i) { 431 for (i = 0; i < 4; ++i) {
@@ -494,7 +510,6 @@ EXPORT_SYMBOL_GPL(set_cr0);
494 510
495void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 511void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
496{ 512{
497 kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
498 set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); 513 set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
499} 514}
500EXPORT_SYMBOL_GPL(lmsw); 515EXPORT_SYMBOL_GPL(lmsw);
@@ -830,7 +845,73 @@ out:
830 return r; 845 return r;
831} 846}
832 847
833struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 848/*
849 * Set a new alias region. Aliases map a portion of physical memory into
850 * another portion. This is useful for memory windows, for example the PC
851 * VGA region.
852 */
853static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
854 struct kvm_memory_alias *alias)
855{
856 int r, n;
857 struct kvm_mem_alias *p;
858
859 r = -EINVAL;
860 /* General sanity checks */
861 if (alias->memory_size & (PAGE_SIZE - 1))
862 goto out;
863 if (alias->guest_phys_addr & (PAGE_SIZE - 1))
864 goto out;
865 if (alias->slot >= KVM_ALIAS_SLOTS)
866 goto out;
867 if (alias->guest_phys_addr + alias->memory_size
868 < alias->guest_phys_addr)
869 goto out;
870 if (alias->target_phys_addr + alias->memory_size
871 < alias->target_phys_addr)
872 goto out;
873
874 spin_lock(&kvm->lock);
875
876 p = &kvm->aliases[alias->slot];
877 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
878 p->npages = alias->memory_size >> PAGE_SHIFT;
879 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
880
881 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
882 if (kvm->aliases[n - 1].npages)
883 break;
884 kvm->naliases = n;
885
886 spin_unlock(&kvm->lock);
887
888 vcpu_load(&kvm->vcpus[0]);
889 spin_lock(&kvm->lock);
890 kvm_mmu_zap_all(&kvm->vcpus[0]);
891 spin_unlock(&kvm->lock);
892 vcpu_put(&kvm->vcpus[0]);
893
894 return 0;
895
896out:
897 return r;
898}
899
900static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
901{
902 int i;
903 struct kvm_mem_alias *alias;
904
905 for (i = 0; i < kvm->naliases; ++i) {
906 alias = &kvm->aliases[i];
907 if (gfn >= alias->base_gfn
908 && gfn < alias->base_gfn + alias->npages)
909 return alias->target_gfn + gfn - alias->base_gfn;
910 }
911 return gfn;
912}
913
914static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
834{ 915{
835 int i; 916 int i;
836 917
@@ -843,7 +924,24 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
843 } 924 }
844 return NULL; 925 return NULL;
845} 926}
846EXPORT_SYMBOL_GPL(gfn_to_memslot); 927
928struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
929{
930 gfn = unalias_gfn(kvm, gfn);
931 return __gfn_to_memslot(kvm, gfn);
932}
933
934struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
935{
936 struct kvm_memory_slot *slot;
937
938 gfn = unalias_gfn(kvm, gfn);
939 slot = __gfn_to_memslot(kvm, gfn);
940 if (!slot)
941 return NULL;
942 return slot->phys_mem[gfn - slot->base_gfn];
943}
944EXPORT_SYMBOL_GPL(gfn_to_page);
847 945
848void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 946void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
849{ 947{
@@ -871,7 +969,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
871} 969}
872 970
873static int emulator_read_std(unsigned long addr, 971static int emulator_read_std(unsigned long addr,
874 unsigned long *val, 972 void *val,
875 unsigned int bytes, 973 unsigned int bytes,
876 struct x86_emulate_ctxt *ctxt) 974 struct x86_emulate_ctxt *ctxt)
877{ 975{
@@ -883,20 +981,20 @@ static int emulator_read_std(unsigned long addr,
883 unsigned offset = addr & (PAGE_SIZE-1); 981 unsigned offset = addr & (PAGE_SIZE-1);
884 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); 982 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
885 unsigned long pfn; 983 unsigned long pfn;
886 struct kvm_memory_slot *memslot; 984 struct page *page;
887 void *page; 985 void *page_virt;
888 986
889 if (gpa == UNMAPPED_GVA) 987 if (gpa == UNMAPPED_GVA)
890 return X86EMUL_PROPAGATE_FAULT; 988 return X86EMUL_PROPAGATE_FAULT;
891 pfn = gpa >> PAGE_SHIFT; 989 pfn = gpa >> PAGE_SHIFT;
892 memslot = gfn_to_memslot(vcpu->kvm, pfn); 990 page = gfn_to_page(vcpu->kvm, pfn);
893 if (!memslot) 991 if (!page)
894 return X86EMUL_UNHANDLEABLE; 992 return X86EMUL_UNHANDLEABLE;
895 page = kmap_atomic(gfn_to_page(memslot, pfn), KM_USER0); 993 page_virt = kmap_atomic(page, KM_USER0);
896 994
897 memcpy(data, page + offset, tocopy); 995 memcpy(data, page_virt + offset, tocopy);
898 996
899 kunmap_atomic(page, KM_USER0); 997 kunmap_atomic(page_virt, KM_USER0);
900 998
901 bytes -= tocopy; 999 bytes -= tocopy;
902 data += tocopy; 1000 data += tocopy;
@@ -907,7 +1005,7 @@ static int emulator_read_std(unsigned long addr,
907} 1005}
908 1006
909static int emulator_write_std(unsigned long addr, 1007static int emulator_write_std(unsigned long addr,
910 unsigned long val, 1008 const void *val,
911 unsigned int bytes, 1009 unsigned int bytes,
912 struct x86_emulate_ctxt *ctxt) 1010 struct x86_emulate_ctxt *ctxt)
913{ 1011{
@@ -917,7 +1015,7 @@ static int emulator_write_std(unsigned long addr,
917} 1015}
918 1016
919static int emulator_read_emulated(unsigned long addr, 1017static int emulator_read_emulated(unsigned long addr,
920 unsigned long *val, 1018 void *val,
921 unsigned int bytes, 1019 unsigned int bytes,
922 struct x86_emulate_ctxt *ctxt) 1020 struct x86_emulate_ctxt *ctxt)
923{ 1021{
@@ -945,37 +1043,37 @@ static int emulator_read_emulated(unsigned long addr,
945} 1043}
946 1044
947static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 1045static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
948 unsigned long val, int bytes) 1046 const void *val, int bytes)
949{ 1047{
950 struct kvm_memory_slot *m;
951 struct page *page; 1048 struct page *page;
952 void *virt; 1049 void *virt;
953 1050
954 if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) 1051 if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
955 return 0; 1052 return 0;
956 m = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT); 1053 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
957 if (!m) 1054 if (!page)
958 return 0; 1055 return 0;
959 page = gfn_to_page(m, gpa >> PAGE_SHIFT);
960 kvm_mmu_pre_write(vcpu, gpa, bytes); 1056 kvm_mmu_pre_write(vcpu, gpa, bytes);
961 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); 1057 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
962 virt = kmap_atomic(page, KM_USER0); 1058 virt = kmap_atomic(page, KM_USER0);
963 memcpy(virt + offset_in_page(gpa), &val, bytes); 1059 memcpy(virt + offset_in_page(gpa), val, bytes);
964 kunmap_atomic(virt, KM_USER0); 1060 kunmap_atomic(virt, KM_USER0);
965 kvm_mmu_post_write(vcpu, gpa, bytes); 1061 kvm_mmu_post_write(vcpu, gpa, bytes);
966 return 1; 1062 return 1;
967} 1063}
968 1064
969static int emulator_write_emulated(unsigned long addr, 1065static int emulator_write_emulated(unsigned long addr,
970 unsigned long val, 1066 const void *val,
971 unsigned int bytes, 1067 unsigned int bytes,
972 struct x86_emulate_ctxt *ctxt) 1068 struct x86_emulate_ctxt *ctxt)
973{ 1069{
974 struct kvm_vcpu *vcpu = ctxt->vcpu; 1070 struct kvm_vcpu *vcpu = ctxt->vcpu;
975 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1071 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
976 1072
977 if (gpa == UNMAPPED_GVA) 1073 if (gpa == UNMAPPED_GVA) {
1074 kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
978 return X86EMUL_PROPAGATE_FAULT; 1075 return X86EMUL_PROPAGATE_FAULT;
1076 }
979 1077
980 if (emulator_write_phys(vcpu, gpa, val, bytes)) 1078 if (emulator_write_phys(vcpu, gpa, val, bytes))
981 return X86EMUL_CONTINUE; 1079 return X86EMUL_CONTINUE;
@@ -984,14 +1082,14 @@ static int emulator_write_emulated(unsigned long addr,
984 vcpu->mmio_phys_addr = gpa; 1082 vcpu->mmio_phys_addr = gpa;
985 vcpu->mmio_size = bytes; 1083 vcpu->mmio_size = bytes;
986 vcpu->mmio_is_write = 1; 1084 vcpu->mmio_is_write = 1;
987 memcpy(vcpu->mmio_data, &val, bytes); 1085 memcpy(vcpu->mmio_data, val, bytes);
988 1086
989 return X86EMUL_CONTINUE; 1087 return X86EMUL_CONTINUE;
990} 1088}
991 1089
992static int emulator_cmpxchg_emulated(unsigned long addr, 1090static int emulator_cmpxchg_emulated(unsigned long addr,
993 unsigned long old, 1091 const void *old,
994 unsigned long new, 1092 const void *new,
995 unsigned int bytes, 1093 unsigned int bytes,
996 struct x86_emulate_ctxt *ctxt) 1094 struct x86_emulate_ctxt *ctxt)
997{ 1095{
@@ -1004,30 +1102,6 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
1004 return emulator_write_emulated(addr, new, bytes, ctxt); 1102 return emulator_write_emulated(addr, new, bytes, ctxt);
1005} 1103}
1006 1104
1007#ifdef CONFIG_X86_32
1008
1009static int emulator_cmpxchg8b_emulated(unsigned long addr,
1010 unsigned long old_lo,
1011 unsigned long old_hi,
1012 unsigned long new_lo,
1013 unsigned long new_hi,
1014 struct x86_emulate_ctxt *ctxt)
1015{
1016 static int reported;
1017 int r;
1018
1019 if (!reported) {
1020 reported = 1;
1021 printk(KERN_WARNING "kvm: emulating exchange8b as write\n");
1022 }
1023 r = emulator_write_emulated(addr, new_lo, 4, ctxt);
1024 if (r != X86EMUL_CONTINUE)
1025 return r;
1026 return emulator_write_emulated(addr+4, new_hi, 4, ctxt);
1027}
1028
1029#endif
1030
1031static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 1105static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1032{ 1106{
1033 return kvm_arch_ops->get_segment_base(vcpu, seg); 1107 return kvm_arch_ops->get_segment_base(vcpu, seg);
@@ -1042,7 +1116,6 @@ int emulate_clts(struct kvm_vcpu *vcpu)
1042{ 1116{
1043 unsigned long cr0; 1117 unsigned long cr0;
1044 1118
1045 kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1046 cr0 = vcpu->cr0 & ~CR0_TS_MASK; 1119 cr0 = vcpu->cr0 & ~CR0_TS_MASK;
1047 kvm_arch_ops->set_cr0(vcpu, cr0); 1120 kvm_arch_ops->set_cr0(vcpu, cr0);
1048 return X86EMUL_CONTINUE; 1121 return X86EMUL_CONTINUE;
@@ -1102,9 +1175,6 @@ struct x86_emulate_ops emulate_ops = {
1102 .read_emulated = emulator_read_emulated, 1175 .read_emulated = emulator_read_emulated,
1103 .write_emulated = emulator_write_emulated, 1176 .write_emulated = emulator_write_emulated,
1104 .cmpxchg_emulated = emulator_cmpxchg_emulated, 1177 .cmpxchg_emulated = emulator_cmpxchg_emulated,
1105#ifdef CONFIG_X86_32
1106 .cmpxchg8b_emulated = emulator_cmpxchg8b_emulated,
1107#endif
1108}; 1178};
1109 1179
1110int emulate_instruction(struct kvm_vcpu *vcpu, 1180int emulate_instruction(struct kvm_vcpu *vcpu,
@@ -1116,6 +1186,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1116 int r; 1186 int r;
1117 int cs_db, cs_l; 1187 int cs_db, cs_l;
1118 1188
1189 vcpu->mmio_fault_cr2 = cr2;
1119 kvm_arch_ops->cache_regs(vcpu); 1190 kvm_arch_ops->cache_regs(vcpu);
1120 1191
1121 kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 1192 kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
@@ -1166,8 +1237,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1166 kvm_arch_ops->decache_regs(vcpu); 1237 kvm_arch_ops->decache_regs(vcpu);
1167 kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags); 1238 kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1168 1239
1169 if (vcpu->mmio_is_write) 1240 if (vcpu->mmio_is_write) {
1241 vcpu->mmio_needed = 0;
1170 return EMULATE_DO_MMIO; 1242 return EMULATE_DO_MMIO;
1243 }
1171 1244
1172 return EMULATE_DONE; 1245 return EMULATE_DONE;
1173} 1246}
@@ -1177,7 +1250,7 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1177{ 1250{
1178 unsigned long nr, a0, a1, a2, a3, a4, a5, ret; 1251 unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1179 1252
1180 kvm_arch_ops->decache_regs(vcpu); 1253 kvm_arch_ops->cache_regs(vcpu);
1181 ret = -KVM_EINVAL; 1254 ret = -KVM_EINVAL;
1182#ifdef CONFIG_X86_64 1255#ifdef CONFIG_X86_64
1183 if (is_long_mode(vcpu)) { 1256 if (is_long_mode(vcpu)) {
@@ -1201,10 +1274,19 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1201 } 1274 }
1202 switch (nr) { 1275 switch (nr) {
1203 default: 1276 default:
1204 ; 1277 run->hypercall.args[0] = a0;
1278 run->hypercall.args[1] = a1;
1279 run->hypercall.args[2] = a2;
1280 run->hypercall.args[3] = a3;
1281 run->hypercall.args[4] = a4;
1282 run->hypercall.args[5] = a5;
1283 run->hypercall.ret = ret;
1284 run->hypercall.longmode = is_long_mode(vcpu);
1285 kvm_arch_ops->decache_regs(vcpu);
1286 return 0;
1205 } 1287 }
1206 vcpu->regs[VCPU_REGS_RAX] = ret; 1288 vcpu->regs[VCPU_REGS_RAX] = ret;
1207 kvm_arch_ops->cache_regs(vcpu); 1289 kvm_arch_ops->decache_regs(vcpu);
1208 return 1; 1290 return 1;
1209} 1291}
1210EXPORT_SYMBOL_GPL(kvm_hypercall); 1292EXPORT_SYMBOL_GPL(kvm_hypercall);
@@ -1237,7 +1319,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1237 1319
1238unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 1320unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1239{ 1321{
1240 kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); 1322 kvm_arch_ops->decache_cr4_guest_bits(vcpu);
1241 switch (cr) { 1323 switch (cr) {
1242 case 0: 1324 case 0:
1243 return vcpu->cr0; 1325 return vcpu->cr0;
@@ -1442,6 +1524,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1442 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", 1524 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1443 __FUNCTION__, data); 1525 __FUNCTION__, data);
1444 break; 1526 break;
1527 case MSR_IA32_MCG_STATUS:
1528 printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1529 __FUNCTION__, data);
1530 break;
1445 case MSR_IA32_UCODE_REV: 1531 case MSR_IA32_UCODE_REV:
1446 case MSR_IA32_UCODE_WRITE: 1532 case MSR_IA32_UCODE_WRITE:
1447 case 0x200 ... 0x2ff: /* MTRRs */ 1533 case 0x200 ... 0x2ff: /* MTRRs */
@@ -1478,6 +1564,8 @@ static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1478 1564
1479void kvm_resched(struct kvm_vcpu *vcpu) 1565void kvm_resched(struct kvm_vcpu *vcpu)
1480{ 1566{
1567 if (!need_resched())
1568 return;
1481 vcpu_put(vcpu); 1569 vcpu_put(vcpu);
1482 cond_resched(); 1570 cond_resched();
1483 vcpu_load(vcpu); 1571 vcpu_load(vcpu);
@@ -1502,29 +1590,250 @@ void save_msrs(struct vmx_msr_entry *e, int n)
1502} 1590}
1503EXPORT_SYMBOL_GPL(save_msrs); 1591EXPORT_SYMBOL_GPL(save_msrs);
1504 1592
1593void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1594{
1595 int i;
1596 u32 function;
1597 struct kvm_cpuid_entry *e, *best;
1598
1599 kvm_arch_ops->cache_regs(vcpu);
1600 function = vcpu->regs[VCPU_REGS_RAX];
1601 vcpu->regs[VCPU_REGS_RAX] = 0;
1602 vcpu->regs[VCPU_REGS_RBX] = 0;
1603 vcpu->regs[VCPU_REGS_RCX] = 0;
1604 vcpu->regs[VCPU_REGS_RDX] = 0;
1605 best = NULL;
1606 for (i = 0; i < vcpu->cpuid_nent; ++i) {
1607 e = &vcpu->cpuid_entries[i];
1608 if (e->function == function) {
1609 best = e;
1610 break;
1611 }
1612 /*
1613 * Both basic or both extended?
1614 */
1615 if (((e->function ^ function) & 0x80000000) == 0)
1616 if (!best || e->function > best->function)
1617 best = e;
1618 }
1619 if (best) {
1620 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1621 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1622 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1623 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1624 }
1625 kvm_arch_ops->decache_regs(vcpu);
1626 kvm_arch_ops->skip_emulated_instruction(vcpu);
1627}
1628EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1629
1630static int pio_copy_data(struct kvm_vcpu *vcpu)
1631{
1632 void *p = vcpu->pio_data;
1633 void *q;
1634 unsigned bytes;
1635 int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1636
1637 kvm_arch_ops->vcpu_put(vcpu);
1638 q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1639 PAGE_KERNEL);
1640 if (!q) {
1641 kvm_arch_ops->vcpu_load(vcpu);
1642 free_pio_guest_pages(vcpu);
1643 return -ENOMEM;
1644 }
1645 q += vcpu->pio.guest_page_offset;
1646 bytes = vcpu->pio.size * vcpu->pio.cur_count;
1647 if (vcpu->pio.in)
1648 memcpy(q, p, bytes);
1649 else
1650 memcpy(p, q, bytes);
1651 q -= vcpu->pio.guest_page_offset;
1652 vunmap(q);
1653 kvm_arch_ops->vcpu_load(vcpu);
1654 free_pio_guest_pages(vcpu);
1655 return 0;
1656}
1657
1658static int complete_pio(struct kvm_vcpu *vcpu)
1659{
1660 struct kvm_pio_request *io = &vcpu->pio;
1661 long delta;
1662 int r;
1663
1664 kvm_arch_ops->cache_regs(vcpu);
1665
1666 if (!io->string) {
1667 if (io->in)
1668 memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1669 io->size);
1670 } else {
1671 if (io->in) {
1672 r = pio_copy_data(vcpu);
1673 if (r) {
1674 kvm_arch_ops->cache_regs(vcpu);
1675 return r;
1676 }
1677 }
1678
1679 delta = 1;
1680 if (io->rep) {
1681 delta *= io->cur_count;
1682 /*
1683 * The size of the register should really depend on
1684 * current address size.
1685 */
1686 vcpu->regs[VCPU_REGS_RCX] -= delta;
1687 }
1688 if (io->down)
1689 delta = -delta;
1690 delta *= io->size;
1691 if (io->in)
1692 vcpu->regs[VCPU_REGS_RDI] += delta;
1693 else
1694 vcpu->regs[VCPU_REGS_RSI] += delta;
1695 }
1696
1697 kvm_arch_ops->decache_regs(vcpu);
1698
1699 io->count -= io->cur_count;
1700 io->cur_count = 0;
1701
1702 if (!io->count)
1703 kvm_arch_ops->skip_emulated_instruction(vcpu);
1704 return 0;
1705}
1706
1707int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1708 int size, unsigned long count, int string, int down,
1709 gva_t address, int rep, unsigned port)
1710{
1711 unsigned now, in_page;
1712 int i;
1713 int nr_pages = 1;
1714 struct page *page;
1715
1716 vcpu->run->exit_reason = KVM_EXIT_IO;
1717 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1718 vcpu->run->io.size = size;
1719 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1720 vcpu->run->io.count = count;
1721 vcpu->run->io.port = port;
1722 vcpu->pio.count = count;
1723 vcpu->pio.cur_count = count;
1724 vcpu->pio.size = size;
1725 vcpu->pio.in = in;
1726 vcpu->pio.string = string;
1727 vcpu->pio.down = down;
1728 vcpu->pio.guest_page_offset = offset_in_page(address);
1729 vcpu->pio.rep = rep;
1730
1731 if (!string) {
1732 kvm_arch_ops->cache_regs(vcpu);
1733 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1734 kvm_arch_ops->decache_regs(vcpu);
1735 return 0;
1736 }
1737
1738 if (!count) {
1739 kvm_arch_ops->skip_emulated_instruction(vcpu);
1740 return 1;
1741 }
1742
1743 now = min(count, PAGE_SIZE / size);
1744
1745 if (!down)
1746 in_page = PAGE_SIZE - offset_in_page(address);
1747 else
1748 in_page = offset_in_page(address) + size;
1749 now = min(count, (unsigned long)in_page / size);
1750 if (!now) {
1751 /*
1752 * String I/O straddles page boundary. Pin two guest pages
1753 * so that we satisfy atomicity constraints. Do just one
1754 * transaction to avoid complexity.
1755 */
1756 nr_pages = 2;
1757 now = 1;
1758 }
1759 if (down) {
1760 /*
1761 * String I/O in reverse. Yuck. Kill the guest, fix later.
1762 */
1763 printk(KERN_ERR "kvm: guest string pio down\n");
1764 inject_gp(vcpu);
1765 return 1;
1766 }
1767 vcpu->run->io.count = now;
1768 vcpu->pio.cur_count = now;
1769
1770 for (i = 0; i < nr_pages; ++i) {
1771 spin_lock(&vcpu->kvm->lock);
1772 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1773 if (page)
1774 get_page(page);
1775 vcpu->pio.guest_pages[i] = page;
1776 spin_unlock(&vcpu->kvm->lock);
1777 if (!page) {
1778 inject_gp(vcpu);
1779 free_pio_guest_pages(vcpu);
1780 return 1;
1781 }
1782 }
1783
1784 if (!vcpu->pio.in)
1785 return pio_copy_data(vcpu);
1786 return 0;
1787}
1788EXPORT_SYMBOL_GPL(kvm_setup_pio);
1789
1505static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1790static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1506{ 1791{
1507 int r; 1792 int r;
1793 sigset_t sigsaved;
1508 1794
1509 vcpu_load(vcpu); 1795 vcpu_load(vcpu);
1510 1796
1797 if (vcpu->sigset_active)
1798 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
1799
1511 /* re-sync apic's tpr */ 1800 /* re-sync apic's tpr */
1512 vcpu->cr8 = kvm_run->cr8; 1801 vcpu->cr8 = kvm_run->cr8;
1513 1802
1514 if (kvm_run->emulated) { 1803 if (vcpu->pio.cur_count) {
1515 kvm_arch_ops->skip_emulated_instruction(vcpu); 1804 r = complete_pio(vcpu);
1516 kvm_run->emulated = 0; 1805 if (r)
1806 goto out;
1517 } 1807 }
1518 1808
1519 if (kvm_run->mmio_completed) { 1809 if (vcpu->mmio_needed) {
1520 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 1810 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1521 vcpu->mmio_read_completed = 1; 1811 vcpu->mmio_read_completed = 1;
1812 vcpu->mmio_needed = 0;
1813 r = emulate_instruction(vcpu, kvm_run,
1814 vcpu->mmio_fault_cr2, 0);
1815 if (r == EMULATE_DO_MMIO) {
1816 /*
1817 * Read-modify-write. Back to userspace.
1818 */
1819 kvm_run->exit_reason = KVM_EXIT_MMIO;
1820 r = 0;
1821 goto out;
1822 }
1522 } 1823 }
1523 1824
1524 vcpu->mmio_needed = 0; 1825 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
1826 kvm_arch_ops->cache_regs(vcpu);
1827 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
1828 kvm_arch_ops->decache_regs(vcpu);
1829 }
1525 1830
1526 r = kvm_arch_ops->run(vcpu, kvm_run); 1831 r = kvm_arch_ops->run(vcpu, kvm_run);
1527 1832
1833out:
1834 if (vcpu->sigset_active)
1835 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1836
1528 vcpu_put(vcpu); 1837 vcpu_put(vcpu);
1529 return r; 1838 return r;
1530} 1839}
@@ -1633,7 +1942,7 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
1633 sregs->gdt.limit = dt.limit; 1942 sregs->gdt.limit = dt.limit;
1634 sregs->gdt.base = dt.base; 1943 sregs->gdt.base = dt.base;
1635 1944
1636 kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); 1945 kvm_arch_ops->decache_cr4_guest_bits(vcpu);
1637 sregs->cr0 = vcpu->cr0; 1946 sregs->cr0 = vcpu->cr0;
1638 sregs->cr2 = vcpu->cr2; 1947 sregs->cr2 = vcpu->cr2;
1639 sregs->cr3 = vcpu->cr3; 1948 sregs->cr3 = vcpu->cr3;
@@ -1665,16 +1974,6 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
1665 1974
1666 vcpu_load(vcpu); 1975 vcpu_load(vcpu);
1667 1976
1668 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1669 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1670 set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1671 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1672 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1673 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1674
1675 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1676 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1677
1678 dt.limit = sregs->idt.limit; 1977 dt.limit = sregs->idt.limit;
1679 dt.base = sregs->idt.base; 1978 dt.base = sregs->idt.base;
1680 kvm_arch_ops->set_idt(vcpu, &dt); 1979 kvm_arch_ops->set_idt(vcpu, &dt);
@@ -1694,10 +1993,10 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
1694#endif 1993#endif
1695 vcpu->apic_base = sregs->apic_base; 1994 vcpu->apic_base = sregs->apic_base;
1696 1995
1697 kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu); 1996 kvm_arch_ops->decache_cr4_guest_bits(vcpu);
1698 1997
1699 mmu_reset_needed |= vcpu->cr0 != sregs->cr0; 1998 mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
1700 kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0); 1999 kvm_arch_ops->set_cr0(vcpu, sregs->cr0);
1701 2000
1702 mmu_reset_needed |= vcpu->cr4 != sregs->cr4; 2001 mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
1703 kvm_arch_ops->set_cr4(vcpu, sregs->cr4); 2002 kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
@@ -1714,6 +2013,16 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
1714 if (vcpu->irq_pending[i]) 2013 if (vcpu->irq_pending[i])
1715 __set_bit(i, &vcpu->irq_summary); 2014 __set_bit(i, &vcpu->irq_summary);
1716 2015
2016 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2017 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2018 set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2019 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2020 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2021 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2022
2023 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2024 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2025
1717 vcpu_put(vcpu); 2026 vcpu_put(vcpu);
1718 2027
1719 return 0; 2028 return 0;
@@ -1887,6 +2196,36 @@ static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
1887 return r; 2196 return r;
1888} 2197}
1889 2198
2199static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2200 unsigned long address,
2201 int *type)
2202{
2203 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2204 unsigned long pgoff;
2205 struct page *page;
2206
2207 *type = VM_FAULT_MINOR;
2208 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2209 if (pgoff == 0)
2210 page = virt_to_page(vcpu->run);
2211 else if (pgoff == KVM_PIO_PAGE_OFFSET)
2212 page = virt_to_page(vcpu->pio_data);
2213 else
2214 return NOPAGE_SIGBUS;
2215 get_page(page);
2216 return page;
2217}
2218
2219static struct vm_operations_struct kvm_vcpu_vm_ops = {
2220 .nopage = kvm_vcpu_nopage,
2221};
2222
2223static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2224{
2225 vma->vm_ops = &kvm_vcpu_vm_ops;
2226 return 0;
2227}
2228
1890static int kvm_vcpu_release(struct inode *inode, struct file *filp) 2229static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1891{ 2230{
1892 struct kvm_vcpu *vcpu = filp->private_data; 2231 struct kvm_vcpu *vcpu = filp->private_data;
@@ -1899,6 +2238,7 @@ static struct file_operations kvm_vcpu_fops = {
1899 .release = kvm_vcpu_release, 2238 .release = kvm_vcpu_release,
1900 .unlocked_ioctl = kvm_vcpu_ioctl, 2239 .unlocked_ioctl = kvm_vcpu_ioctl,
1901 .compat_ioctl = kvm_vcpu_ioctl, 2240 .compat_ioctl = kvm_vcpu_ioctl,
2241 .mmap = kvm_vcpu_mmap,
1902}; 2242};
1903 2243
1904/* 2244/*
@@ -1947,6 +2287,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
1947{ 2287{
1948 int r; 2288 int r;
1949 struct kvm_vcpu *vcpu; 2289 struct kvm_vcpu *vcpu;
2290 struct page *page;
1950 2291
1951 r = -EINVAL; 2292 r = -EINVAL;
1952 if (!valid_vcpu(n)) 2293 if (!valid_vcpu(n))
@@ -1961,9 +2302,22 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
1961 return -EEXIST; 2302 return -EEXIST;
1962 } 2303 }
1963 2304
2305 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2306 r = -ENOMEM;
2307 if (!page)
2308 goto out_unlock;
2309 vcpu->run = page_address(page);
2310
2311 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2312 r = -ENOMEM;
2313 if (!page)
2314 goto out_free_run;
2315 vcpu->pio_data = page_address(page);
2316
1964 vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, 2317 vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
1965 FX_IMAGE_ALIGN); 2318 FX_IMAGE_ALIGN);
1966 vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; 2319 vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
2320 vcpu->cr0 = 0x10;
1967 2321
1968 r = kvm_arch_ops->vcpu_create(vcpu); 2322 r = kvm_arch_ops->vcpu_create(vcpu);
1969 if (r < 0) 2323 if (r < 0)
@@ -1990,11 +2344,107 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
1990 2344
1991out_free_vcpus: 2345out_free_vcpus:
1992 kvm_free_vcpu(vcpu); 2346 kvm_free_vcpu(vcpu);
2347out_free_run:
2348 free_page((unsigned long)vcpu->run);
2349 vcpu->run = NULL;
2350out_unlock:
1993 mutex_unlock(&vcpu->mutex); 2351 mutex_unlock(&vcpu->mutex);
1994out: 2352out:
1995 return r; 2353 return r;
1996} 2354}
1997 2355
2356static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2357 struct kvm_cpuid *cpuid,
2358 struct kvm_cpuid_entry __user *entries)
2359{
2360 int r;
2361
2362 r = -E2BIG;
2363 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2364 goto out;
2365 r = -EFAULT;
2366 if (copy_from_user(&vcpu->cpuid_entries, entries,
2367 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2368 goto out;
2369 vcpu->cpuid_nent = cpuid->nent;
2370 return 0;
2371
2372out:
2373 return r;
2374}
2375
2376static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2377{
2378 if (sigset) {
2379 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2380 vcpu->sigset_active = 1;
2381 vcpu->sigset = *sigset;
2382 } else
2383 vcpu->sigset_active = 0;
2384 return 0;
2385}
2386
2387/*
2388 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
2389 * we have asm/x86/processor.h
2390 */
2391struct fxsave {
2392 u16 cwd;
2393 u16 swd;
2394 u16 twd;
2395 u16 fop;
2396 u64 rip;
2397 u64 rdp;
2398 u32 mxcsr;
2399 u32 mxcsr_mask;
2400 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
2401#ifdef CONFIG_X86_64
2402 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
2403#else
2404 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
2405#endif
2406};
2407
2408static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2409{
2410 struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2411
2412 vcpu_load(vcpu);
2413
2414 memcpy(fpu->fpr, fxsave->st_space, 128);
2415 fpu->fcw = fxsave->cwd;
2416 fpu->fsw = fxsave->swd;
2417 fpu->ftwx = fxsave->twd;
2418 fpu->last_opcode = fxsave->fop;
2419 fpu->last_ip = fxsave->rip;
2420 fpu->last_dp = fxsave->rdp;
2421 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2422
2423 vcpu_put(vcpu);
2424
2425 return 0;
2426}
2427
2428static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2429{
2430 struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2431
2432 vcpu_load(vcpu);
2433
2434 memcpy(fxsave->st_space, fpu->fpr, 128);
2435 fxsave->cwd = fpu->fcw;
2436 fxsave->swd = fpu->fsw;
2437 fxsave->twd = fpu->ftwx;
2438 fxsave->fop = fpu->last_opcode;
2439 fxsave->rip = fpu->last_ip;
2440 fxsave->rdp = fpu->last_dp;
2441 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2442
2443 vcpu_put(vcpu);
2444
2445 return 0;
2446}
2447
1998static long kvm_vcpu_ioctl(struct file *filp, 2448static long kvm_vcpu_ioctl(struct file *filp,
1999 unsigned int ioctl, unsigned long arg) 2449 unsigned int ioctl, unsigned long arg)
2000{ 2450{
@@ -2003,21 +2453,12 @@ static long kvm_vcpu_ioctl(struct file *filp,
2003 int r = -EINVAL; 2453 int r = -EINVAL;
2004 2454
2005 switch (ioctl) { 2455 switch (ioctl) {
2006 case KVM_RUN: { 2456 case KVM_RUN:
2007 struct kvm_run kvm_run; 2457 r = -EINVAL;
2008 2458 if (arg)
2009 r = -EFAULT;
2010 if (copy_from_user(&kvm_run, argp, sizeof kvm_run))
2011 goto out; 2459 goto out;
2012 r = kvm_vcpu_ioctl_run(vcpu, &kvm_run); 2460 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2013 if (r < 0 && r != -EINTR)
2014 goto out;
2015 if (copy_to_user(argp, &kvm_run, sizeof kvm_run)) {
2016 r = -EFAULT;
2017 goto out;
2018 }
2019 break; 2461 break;
2020 }
2021 case KVM_GET_REGS: { 2462 case KVM_GET_REGS: {
2022 struct kvm_regs kvm_regs; 2463 struct kvm_regs kvm_regs;
2023 2464
@@ -2113,6 +2554,66 @@ static long kvm_vcpu_ioctl(struct file *filp,
2113 case KVM_SET_MSRS: 2554 case KVM_SET_MSRS:
2114 r = msr_io(vcpu, argp, do_set_msr, 0); 2555 r = msr_io(vcpu, argp, do_set_msr, 0);
2115 break; 2556 break;
2557 case KVM_SET_CPUID: {
2558 struct kvm_cpuid __user *cpuid_arg = argp;
2559 struct kvm_cpuid cpuid;
2560
2561 r = -EFAULT;
2562 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2563 goto out;
2564 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2565 if (r)
2566 goto out;
2567 break;
2568 }
2569 case KVM_SET_SIGNAL_MASK: {
2570 struct kvm_signal_mask __user *sigmask_arg = argp;
2571 struct kvm_signal_mask kvm_sigmask;
2572 sigset_t sigset, *p;
2573
2574 p = NULL;
2575 if (argp) {
2576 r = -EFAULT;
2577 if (copy_from_user(&kvm_sigmask, argp,
2578 sizeof kvm_sigmask))
2579 goto out;
2580 r = -EINVAL;
2581 if (kvm_sigmask.len != sizeof sigset)
2582 goto out;
2583 r = -EFAULT;
2584 if (copy_from_user(&sigset, sigmask_arg->sigset,
2585 sizeof sigset))
2586 goto out;
2587 p = &sigset;
2588 }
2589 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
2590 break;
2591 }
2592 case KVM_GET_FPU: {
2593 struct kvm_fpu fpu;
2594
2595 memset(&fpu, 0, sizeof fpu);
2596 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2597 if (r)
2598 goto out;
2599 r = -EFAULT;
2600 if (copy_to_user(argp, &fpu, sizeof fpu))
2601 goto out;
2602 r = 0;
2603 break;
2604 }
2605 case KVM_SET_FPU: {
2606 struct kvm_fpu fpu;
2607
2608 r = -EFAULT;
2609 if (copy_from_user(&fpu, argp, sizeof fpu))
2610 goto out;
2611 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2612 if (r)
2613 goto out;
2614 r = 0;
2615 break;
2616 }
2116 default: 2617 default:
2117 ; 2618 ;
2118 } 2619 }
@@ -2155,6 +2656,17 @@ static long kvm_vm_ioctl(struct file *filp,
2155 goto out; 2656 goto out;
2156 break; 2657 break;
2157 } 2658 }
2659 case KVM_SET_MEMORY_ALIAS: {
2660 struct kvm_memory_alias alias;
2661
2662 r = -EFAULT;
2663 if (copy_from_user(&alias, argp, sizeof alias))
2664 goto out;
2665 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
2666 if (r)
2667 goto out;
2668 break;
2669 }
2158 default: 2670 default:
2159 ; 2671 ;
2160 } 2672 }
@@ -2168,15 +2680,11 @@ static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
2168{ 2680{
2169 struct kvm *kvm = vma->vm_file->private_data; 2681 struct kvm *kvm = vma->vm_file->private_data;
2170 unsigned long pgoff; 2682 unsigned long pgoff;
2171 struct kvm_memory_slot *slot;
2172 struct page *page; 2683 struct page *page;
2173 2684
2174 *type = VM_FAULT_MINOR; 2685 *type = VM_FAULT_MINOR;
2175 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2686 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2176 slot = gfn_to_memslot(kvm, pgoff); 2687 page = gfn_to_page(kvm, pgoff);
2177 if (!slot)
2178 return NOPAGE_SIGBUS;
2179 page = gfn_to_page(slot, pgoff);
2180 if (!page) 2688 if (!page)
2181 return NOPAGE_SIGBUS; 2689 return NOPAGE_SIGBUS;
2182 get_page(page); 2690 get_page(page);
@@ -2248,13 +2756,19 @@ static long kvm_dev_ioctl(struct file *filp,
2248 unsigned int ioctl, unsigned long arg) 2756 unsigned int ioctl, unsigned long arg)
2249{ 2757{
2250 void __user *argp = (void __user *)arg; 2758 void __user *argp = (void __user *)arg;
2251 int r = -EINVAL; 2759 long r = -EINVAL;
2252 2760
2253 switch (ioctl) { 2761 switch (ioctl) {
2254 case KVM_GET_API_VERSION: 2762 case KVM_GET_API_VERSION:
2763 r = -EINVAL;
2764 if (arg)
2765 goto out;
2255 r = KVM_API_VERSION; 2766 r = KVM_API_VERSION;
2256 break; 2767 break;
2257 case KVM_CREATE_VM: 2768 case KVM_CREATE_VM:
2769 r = -EINVAL;
2770 if (arg)
2771 goto out;
2258 r = kvm_dev_ioctl_create_vm(); 2772 r = kvm_dev_ioctl_create_vm();
2259 break; 2773 break;
2260 case KVM_GET_MSR_INDEX_LIST: { 2774 case KVM_GET_MSR_INDEX_LIST: {
@@ -2284,6 +2798,18 @@ static long kvm_dev_ioctl(struct file *filp,
2284 r = 0; 2798 r = 0;
2285 break; 2799 break;
2286 } 2800 }
2801 case KVM_CHECK_EXTENSION:
2802 /*
2803 * No extensions defined at present.
2804 */
2805 r = 0;
2806 break;
2807 case KVM_GET_VCPU_MMAP_SIZE:
2808 r = -EINVAL;
2809 if (arg)
2810 goto out;
2811 r = 2 * PAGE_SIZE;
2812 break;
2287 default: 2813 default:
2288 ; 2814 ;
2289 } 2815 }
@@ -2299,7 +2825,7 @@ static struct file_operations kvm_chardev_ops = {
2299}; 2825};
2300 2826
2301static struct miscdevice kvm_dev = { 2827static struct miscdevice kvm_dev = {
2302 MISC_DYNAMIC_MINOR, 2828 KVM_MINOR,
2303 "kvm", 2829 "kvm",
2304 &kvm_chardev_ops, 2830 &kvm_chardev_ops,
2305}; 2831};
@@ -2385,14 +2911,39 @@ static struct notifier_block kvm_cpu_notifier = {
2385 .priority = 20, /* must be > scheduler priority */ 2911 .priority = 20, /* must be > scheduler priority */
2386}; 2912};
2387 2913
2914static u64 stat_get(void *_offset)
2915{
2916 unsigned offset = (long)_offset;
2917 u64 total = 0;
2918 struct kvm *kvm;
2919 struct kvm_vcpu *vcpu;
2920 int i;
2921
2922 spin_lock(&kvm_lock);
2923 list_for_each_entry(kvm, &vm_list, vm_list)
2924 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2925 vcpu = &kvm->vcpus[i];
2926 total += *(u32 *)((void *)vcpu + offset);
2927 }
2928 spin_unlock(&kvm_lock);
2929 return total;
2930}
2931
2932static void stat_set(void *offset, u64 val)
2933{
2934}
2935
2936DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, stat_set, "%llu\n");
2937
2388static __init void kvm_init_debug(void) 2938static __init void kvm_init_debug(void)
2389{ 2939{
2390 struct kvm_stats_debugfs_item *p; 2940 struct kvm_stats_debugfs_item *p;
2391 2941
2392 debugfs_dir = debugfs_create_dir("kvm", NULL); 2942 debugfs_dir = debugfs_create_dir("kvm", NULL);
2393 for (p = debugfs_entries; p->name; ++p) 2943 for (p = debugfs_entries; p->name; ++p)
2394 p->dentry = debugfs_create_u32(p->name, 0444, debugfs_dir, 2944 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
2395 p->data); 2945 (void *)(long)p->offset,
2946 &stat_fops);
2396} 2947}
2397 2948
2398static void kvm_exit_debug(void) 2949static void kvm_exit_debug(void)
@@ -2522,6 +3073,10 @@ static __init int kvm_init(void)
2522 static struct page *bad_page; 3073 static struct page *bad_page;
2523 int r; 3074 int r;
2524 3075
3076 r = kvm_mmu_module_init();
3077 if (r)
3078 goto out4;
3079
2525 r = register_filesystem(&kvm_fs_type); 3080 r = register_filesystem(&kvm_fs_type);
2526 if (r) 3081 if (r)
2527 goto out3; 3082 goto out3;
@@ -2550,6 +3105,8 @@ out:
2550out2: 3105out2:
2551 unregister_filesystem(&kvm_fs_type); 3106 unregister_filesystem(&kvm_fs_type);
2552out3: 3107out3:
3108 kvm_mmu_module_exit();
3109out4:
2553 return r; 3110 return r;
2554} 3111}
2555 3112
@@ -2559,6 +3116,7 @@ static __exit void kvm_exit(void)
2559 __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); 3116 __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
2560 mntput(kvmfs_mnt); 3117 mntput(kvmfs_mnt);
2561 unregister_filesystem(&kvm_fs_type); 3118 unregister_filesystem(&kvm_fs_type);
3119 kvm_mmu_module_exit();
2562} 3120}
2563 3121
2564module_init(kvm_init) 3122module_init(kvm_init)
diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h
index 624f1ca48657..a869983d683d 100644
--- a/drivers/kvm/kvm_svm.h
+++ b/drivers/kvm/kvm_svm.h
@@ -9,17 +9,15 @@
9#include "svm.h" 9#include "svm.h"
10#include "kvm.h" 10#include "kvm.h"
11 11
12static const u32 host_save_msrs[] = { 12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64 13#ifdef CONFIG_X86_64
14 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, 14 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
15 MSR_FS_BASE, MSR_GS_BASE, 15 MSR_FS_BASE,
16#endif 16#endif
17 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 17 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
18 MSR_IA32_DEBUGCTLMSR, /*MSR_IA32_LASTBRANCHFROMIP,
19 MSR_IA32_LASTBRANCHTOIP, MSR_IA32_LASTINTFROMIP,MSR_IA32_LASTINTTOIP,*/
20}; 18};
21 19
22#define NR_HOST_SAVE_MSRS ARRAY_SIZE(host_save_msrs) 20#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
23#define NUM_DB_REGS 4 21#define NUM_DB_REGS 4
24 22
25struct vcpu_svm { 23struct vcpu_svm {
@@ -28,13 +26,12 @@ struct vcpu_svm {
28 struct svm_cpu_data *svm_data; 26 struct svm_cpu_data *svm_data;
29 uint64_t asid_generation; 27 uint64_t asid_generation;
30 28
31 unsigned long cr0;
32 unsigned long cr4;
33 unsigned long db_regs[NUM_DB_REGS]; 29 unsigned long db_regs[NUM_DB_REGS];
34 30
35 u64 next_rip; 31 u64 next_rip;
36 32
37 u64 host_msrs[NR_HOST_SAVE_MSRS]; 33 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
34 u64 host_gs_base;
38 unsigned long host_cr2; 35 unsigned long host_cr2;
39 unsigned long host_db_regs[NUM_DB_REGS]; 36 unsigned long host_db_regs[NUM_DB_REGS];
40 unsigned long host_dr6; 37 unsigned long host_dr6;
diff --git a/drivers/kvm/kvm_vmx.h b/drivers/kvm/kvm_vmx.h
deleted file mode 100644
index d139f73fb6e1..000000000000
--- a/drivers/kvm/kvm_vmx.h
+++ /dev/null
@@ -1,14 +0,0 @@
1#ifndef __KVM_VMX_H
2#define __KVM_VMX_H
3
4#ifdef CONFIG_X86_64
5/*
6 * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt
7 * mechanism (cpu bug AA24)
8 */
9#define NR_BAD_MSRS 2
10#else
11#define NR_BAD_MSRS 0
12#endif
13
14#endif
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index cab26f301eab..e8e228118de9 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -52,11 +52,15 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
52static int dbg = 1; 52static int dbg = 1;
53#endif 53#endif
54 54
55#ifndef MMU_DEBUG
56#define ASSERT(x) do { } while (0)
57#else
55#define ASSERT(x) \ 58#define ASSERT(x) \
56 if (!(x)) { \ 59 if (!(x)) { \
57 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ 60 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
58 __FILE__, __LINE__, #x); \ 61 __FILE__, __LINE__, #x); \
59 } 62 }
63#endif
60 64
61#define PT64_PT_BITS 9 65#define PT64_PT_BITS 9
62#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) 66#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -159,6 +163,9 @@ struct kvm_rmap_desc {
159 struct kvm_rmap_desc *more; 163 struct kvm_rmap_desc *more;
160}; 164};
161 165
166static struct kmem_cache *pte_chain_cache;
167static struct kmem_cache *rmap_desc_cache;
168
162static int is_write_protection(struct kvm_vcpu *vcpu) 169static int is_write_protection(struct kvm_vcpu *vcpu)
163{ 170{
164 return vcpu->cr0 & CR0_WP_MASK; 171 return vcpu->cr0 & CR0_WP_MASK;
@@ -196,14 +203,15 @@ static int is_rmap_pte(u64 pte)
196} 203}
197 204
198static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 205static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
199 size_t objsize, int min) 206 struct kmem_cache *base_cache, int min,
207 gfp_t gfp_flags)
200{ 208{
201 void *obj; 209 void *obj;
202 210
203 if (cache->nobjs >= min) 211 if (cache->nobjs >= min)
204 return 0; 212 return 0;
205 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 213 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
206 obj = kzalloc(objsize, GFP_NOWAIT); 214 obj = kmem_cache_zalloc(base_cache, gfp_flags);
207 if (!obj) 215 if (!obj)
208 return -ENOMEM; 216 return -ENOMEM;
209 cache->objects[cache->nobjs++] = obj; 217 cache->objects[cache->nobjs++] = obj;
@@ -217,20 +225,35 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
217 kfree(mc->objects[--mc->nobjs]); 225 kfree(mc->objects[--mc->nobjs]);
218} 226}
219 227
220static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) 228static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags)
221{ 229{
222 int r; 230 int r;
223 231
224 r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache, 232 r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
225 sizeof(struct kvm_pte_chain), 4); 233 pte_chain_cache, 4, gfp_flags);
226 if (r) 234 if (r)
227 goto out; 235 goto out;
228 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, 236 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
229 sizeof(struct kvm_rmap_desc), 1); 237 rmap_desc_cache, 1, gfp_flags);
230out: 238out:
231 return r; 239 return r;
232} 240}
233 241
242static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
243{
244 int r;
245
246 r = __mmu_topup_memory_caches(vcpu, GFP_NOWAIT);
247 if (r < 0) {
248 spin_unlock(&vcpu->kvm->lock);
249 kvm_arch_ops->vcpu_put(vcpu);
250 r = __mmu_topup_memory_caches(vcpu, GFP_KERNEL);
251 kvm_arch_ops->vcpu_load(vcpu);
252 spin_lock(&vcpu->kvm->lock);
253 }
254 return r;
255}
256
234static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 257static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
235{ 258{
236 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); 259 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
@@ -390,13 +413,11 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
390{ 413{
391 struct kvm *kvm = vcpu->kvm; 414 struct kvm *kvm = vcpu->kvm;
392 struct page *page; 415 struct page *page;
393 struct kvm_memory_slot *slot;
394 struct kvm_rmap_desc *desc; 416 struct kvm_rmap_desc *desc;
395 u64 *spte; 417 u64 *spte;
396 418
397 slot = gfn_to_memslot(kvm, gfn); 419 page = gfn_to_page(kvm, gfn);
398 BUG_ON(!slot); 420 BUG_ON(!page);
399 page = gfn_to_page(slot, gfn);
400 421
401 while (page_private(page)) { 422 while (page_private(page)) {
402 if (!(page_private(page) & 1)) 423 if (!(page_private(page) & 1))
@@ -417,6 +438,7 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
417 } 438 }
418} 439}
419 440
441#ifdef MMU_DEBUG
420static int is_empty_shadow_page(hpa_t page_hpa) 442static int is_empty_shadow_page(hpa_t page_hpa)
421{ 443{
422 u64 *pos; 444 u64 *pos;
@@ -431,15 +453,15 @@ static int is_empty_shadow_page(hpa_t page_hpa)
431 } 453 }
432 return 1; 454 return 1;
433} 455}
456#endif
434 457
435static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) 458static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
436{ 459{
437 struct kvm_mmu_page *page_head = page_header(page_hpa); 460 struct kvm_mmu_page *page_head = page_header(page_hpa);
438 461
439 ASSERT(is_empty_shadow_page(page_hpa)); 462 ASSERT(is_empty_shadow_page(page_hpa));
440 list_del(&page_head->link);
441 page_head->page_hpa = page_hpa; 463 page_head->page_hpa = page_hpa;
442 list_add(&page_head->link, &vcpu->free_pages); 464 list_move(&page_head->link, &vcpu->free_pages);
443 ++vcpu->kvm->n_free_mmu_pages; 465 ++vcpu->kvm->n_free_mmu_pages;
444} 466}
445 467
@@ -457,11 +479,9 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
457 return NULL; 479 return NULL;
458 480
459 page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); 481 page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
460 list_del(&page->link); 482 list_move(&page->link, &vcpu->kvm->active_mmu_pages);
461 list_add(&page->link, &vcpu->kvm->active_mmu_pages);
462 ASSERT(is_empty_shadow_page(page->page_hpa)); 483 ASSERT(is_empty_shadow_page(page->page_hpa));
463 page->slot_bitmap = 0; 484 page->slot_bitmap = 0;
464 page->global = 1;
465 page->multimapped = 0; 485 page->multimapped = 0;
466 page->parent_pte = parent_pte; 486 page->parent_pte = parent_pte;
467 --vcpu->kvm->n_free_mmu_pages; 487 --vcpu->kvm->n_free_mmu_pages;
@@ -569,6 +589,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
569 gva_t gaddr, 589 gva_t gaddr,
570 unsigned level, 590 unsigned level,
571 int metaphysical, 591 int metaphysical,
592 unsigned hugepage_access,
572 u64 *parent_pte) 593 u64 *parent_pte)
573{ 594{
574 union kvm_mmu_page_role role; 595 union kvm_mmu_page_role role;
@@ -582,6 +603,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
582 role.glevels = vcpu->mmu.root_level; 603 role.glevels = vcpu->mmu.root_level;
583 role.level = level; 604 role.level = level;
584 role.metaphysical = metaphysical; 605 role.metaphysical = metaphysical;
606 role.hugepage_access = hugepage_access;
585 if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) { 607 if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
586 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 608 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
587 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 609 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
@@ -669,10 +691,8 @@ static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
669 if (!page->root_count) { 691 if (!page->root_count) {
670 hlist_del(&page->hash_link); 692 hlist_del(&page->hash_link);
671 kvm_mmu_free_page(vcpu, page->page_hpa); 693 kvm_mmu_free_page(vcpu, page->page_hpa);
672 } else { 694 } else
673 list_del(&page->link); 695 list_move(&page->link, &vcpu->kvm->active_mmu_pages);
674 list_add(&page->link, &vcpu->kvm->active_mmu_pages);
675 }
676} 696}
677 697
678static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) 698static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -714,14 +734,12 @@ hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
714 734
715hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) 735hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
716{ 736{
717 struct kvm_memory_slot *slot;
718 struct page *page; 737 struct page *page;
719 738
720 ASSERT((gpa & HPA_ERR_MASK) == 0); 739 ASSERT((gpa & HPA_ERR_MASK) == 0);
721 slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT); 740 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
722 if (!slot) 741 if (!page)
723 return gpa | HPA_ERR_MASK; 742 return gpa | HPA_ERR_MASK;
724 page = gfn_to_page(slot, gpa >> PAGE_SHIFT);
725 return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) 743 return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
726 | (gpa & (PAGE_SIZE-1)); 744 | (gpa & (PAGE_SIZE-1));
727} 745}
@@ -735,6 +753,15 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
735 return gpa_to_hpa(vcpu, gpa); 753 return gpa_to_hpa(vcpu, gpa);
736} 754}
737 755
756struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
757{
758 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
759
760 if (gpa == UNMAPPED_GVA)
761 return NULL;
762 return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
763}
764
738static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 765static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
739{ 766{
740} 767}
@@ -772,7 +799,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
772 >> PAGE_SHIFT; 799 >> PAGE_SHIFT;
773 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, 800 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
774 v, level - 1, 801 v, level - 1,
775 1, &table[index]); 802 1, 0, &table[index]);
776 if (!new_table) { 803 if (!new_table) {
777 pgprintk("nonpaging_map: ENOMEM\n"); 804 pgprintk("nonpaging_map: ENOMEM\n");
778 return -ENOMEM; 805 return -ENOMEM;
@@ -804,10 +831,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
804 for (i = 0; i < 4; ++i) { 831 for (i = 0; i < 4; ++i) {
805 hpa_t root = vcpu->mmu.pae_root[i]; 832 hpa_t root = vcpu->mmu.pae_root[i];
806 833
807 ASSERT(VALID_PAGE(root)); 834 if (root) {
808 root &= PT64_BASE_ADDR_MASK; 835 ASSERT(VALID_PAGE(root));
809 page = page_header(root); 836 root &= PT64_BASE_ADDR_MASK;
810 --page->root_count; 837 page = page_header(root);
838 --page->root_count;
839 }
811 vcpu->mmu.pae_root[i] = INVALID_PAGE; 840 vcpu->mmu.pae_root[i] = INVALID_PAGE;
812 } 841 }
813 vcpu->mmu.root_hpa = INVALID_PAGE; 842 vcpu->mmu.root_hpa = INVALID_PAGE;
@@ -827,7 +856,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
827 856
828 ASSERT(!VALID_PAGE(root)); 857 ASSERT(!VALID_PAGE(root));
829 page = kvm_mmu_get_page(vcpu, root_gfn, 0, 858 page = kvm_mmu_get_page(vcpu, root_gfn, 0,
830 PT64_ROOT_LEVEL, 0, NULL); 859 PT64_ROOT_LEVEL, 0, 0, NULL);
831 root = page->page_hpa; 860 root = page->page_hpa;
832 ++page->root_count; 861 ++page->root_count;
833 vcpu->mmu.root_hpa = root; 862 vcpu->mmu.root_hpa = root;
@@ -838,13 +867,17 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
838 hpa_t root = vcpu->mmu.pae_root[i]; 867 hpa_t root = vcpu->mmu.pae_root[i];
839 868
840 ASSERT(!VALID_PAGE(root)); 869 ASSERT(!VALID_PAGE(root));
841 if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) 870 if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
871 if (!is_present_pte(vcpu->pdptrs[i])) {
872 vcpu->mmu.pae_root[i] = 0;
873 continue;
874 }
842 root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT; 875 root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
843 else if (vcpu->mmu.root_level == 0) 876 } else if (vcpu->mmu.root_level == 0)
844 root_gfn = 0; 877 root_gfn = 0;
845 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 878 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
846 PT32_ROOT_LEVEL, !is_paging(vcpu), 879 PT32_ROOT_LEVEL, !is_paging(vcpu),
847 NULL); 880 0, NULL);
848 root = page->page_hpa; 881 root = page->page_hpa;
849 ++page->root_count; 882 ++page->root_count;
850 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; 883 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
@@ -903,7 +936,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
903 936
904static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) 937static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
905{ 938{
906 ++kvm_stat.tlb_flush; 939 ++vcpu->stat.tlb_flush;
907 kvm_arch_ops->tlb_flush(vcpu); 940 kvm_arch_ops->tlb_flush(vcpu);
908} 941}
909 942
@@ -918,11 +951,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
918 kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); 951 kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
919} 952}
920 953
921static void mark_pagetable_nonglobal(void *shadow_pte)
922{
923 page_header(__pa(shadow_pte))->global = 0;
924}
925
926static inline void set_pte_common(struct kvm_vcpu *vcpu, 954static inline void set_pte_common(struct kvm_vcpu *vcpu,
927 u64 *shadow_pte, 955 u64 *shadow_pte,
928 gpa_t gaddr, 956 gpa_t gaddr,
@@ -940,9 +968,6 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu,
940 968
941 *shadow_pte |= access_bits; 969 *shadow_pte |= access_bits;
942 970
943 if (!(*shadow_pte & PT_GLOBAL_MASK))
944 mark_pagetable_nonglobal(shadow_pte);
945
946 if (is_error_hpa(paddr)) { 971 if (is_error_hpa(paddr)) {
947 *shadow_pte |= gaddr; 972 *shadow_pte |= gaddr;
948 *shadow_pte |= PT_SHADOW_IO_MARK; 973 *shadow_pte |= PT_SHADOW_IO_MARK;
@@ -1316,6 +1341,51 @@ void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot)
1316 } 1341 }
1317} 1342}
1318 1343
1344void kvm_mmu_zap_all(struct kvm_vcpu *vcpu)
1345{
1346 destroy_kvm_mmu(vcpu);
1347
1348 while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1349 struct kvm_mmu_page *page;
1350
1351 page = container_of(vcpu->kvm->active_mmu_pages.next,
1352 struct kvm_mmu_page, link);
1353 kvm_mmu_zap_page(vcpu, page);
1354 }
1355
1356 mmu_free_memory_caches(vcpu);
1357 kvm_arch_ops->tlb_flush(vcpu);
1358 init_kvm_mmu(vcpu);
1359}
1360
1361void kvm_mmu_module_exit(void)
1362{
1363 if (pte_chain_cache)
1364 kmem_cache_destroy(pte_chain_cache);
1365 if (rmap_desc_cache)
1366 kmem_cache_destroy(rmap_desc_cache);
1367}
1368
1369int kvm_mmu_module_init(void)
1370{
1371 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1372 sizeof(struct kvm_pte_chain),
1373 0, 0, NULL, NULL);
1374 if (!pte_chain_cache)
1375 goto nomem;
1376 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1377 sizeof(struct kvm_rmap_desc),
1378 0, 0, NULL, NULL);
1379 if (!rmap_desc_cache)
1380 goto nomem;
1381
1382 return 0;
1383
1384nomem:
1385 kvm_mmu_module_exit();
1386 return -ENOMEM;
1387}
1388
1319#ifdef AUDIT 1389#ifdef AUDIT
1320 1390
1321static const char *audit_msg; 1391static const char *audit_msg;
@@ -1338,7 +1408,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1338 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { 1408 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1339 u64 ent = pt[i]; 1409 u64 ent = pt[i];
1340 1410
1341 if (!ent & PT_PRESENT_MASK) 1411 if (!(ent & PT_PRESENT_MASK))
1342 continue; 1412 continue;
1343 1413
1344 va = canonicalize(va); 1414 va = canonicalize(va);
@@ -1360,7 +1430,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1360 1430
1361static void audit_mappings(struct kvm_vcpu *vcpu) 1431static void audit_mappings(struct kvm_vcpu *vcpu)
1362{ 1432{
1363 int i; 1433 unsigned i;
1364 1434
1365 if (vcpu->mmu.root_level == 4) 1435 if (vcpu->mmu.root_level == 4)
1366 audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); 1436 audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
index f3bcee904651..73ffbffb1097 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/drivers/kvm/paging_tmpl.h
@@ -148,8 +148,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
148 break; 148 break;
149 } 149 }
150 150
151 if (walker->level != 3 || is_long_mode(vcpu)) 151 walker->inherited_ar &= walker->table[index];
152 walker->inherited_ar &= walker->table[index];
153 table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; 152 table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
154 paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK); 153 paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK);
155 kunmap_atomic(walker->table, KM_USER0); 154 kunmap_atomic(walker->table, KM_USER0);
@@ -248,6 +247,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
248 u64 shadow_pte; 247 u64 shadow_pte;
249 int metaphysical; 248 int metaphysical;
250 gfn_t table_gfn; 249 gfn_t table_gfn;
250 unsigned hugepage_access = 0;
251 251
252 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { 252 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
253 if (level == PT_PAGE_TABLE_LEVEL) 253 if (level == PT_PAGE_TABLE_LEVEL)
@@ -277,6 +277,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
277 if (level - 1 == PT_PAGE_TABLE_LEVEL 277 if (level - 1 == PT_PAGE_TABLE_LEVEL
278 && walker->level == PT_DIRECTORY_LEVEL) { 278 && walker->level == PT_DIRECTORY_LEVEL) {
279 metaphysical = 1; 279 metaphysical = 1;
280 hugepage_access = *guest_ent;
281 hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
282 hugepage_access >>= PT_WRITABLE_SHIFT;
280 table_gfn = (*guest_ent & PT_BASE_ADDR_MASK) 283 table_gfn = (*guest_ent & PT_BASE_ADDR_MASK)
281 >> PAGE_SHIFT; 284 >> PAGE_SHIFT;
282 } else { 285 } else {
@@ -284,7 +287,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
284 table_gfn = walker->table_gfn[level - 2]; 287 table_gfn = walker->table_gfn[level - 2];
285 } 288 }
286 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, 289 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
287 metaphysical, shadow_ent); 290 metaphysical, hugepage_access,
291 shadow_ent);
288 shadow_addr = shadow_page->page_hpa; 292 shadow_addr = shadow_page->page_hpa;
289 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK 293 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
290 | PT_WRITABLE_MASK | PT_USER_MASK; 294 | PT_WRITABLE_MASK | PT_USER_MASK;
@@ -444,7 +448,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
444 if (is_io_pte(*shadow_pte)) 448 if (is_io_pte(*shadow_pte))
445 return 1; 449 return 1;
446 450
447 ++kvm_stat.pf_fixed; 451 ++vcpu->stat.pf_fixed;
448 kvm_mmu_audit(vcpu, "post page fault (fixed)"); 452 kvm_mmu_audit(vcpu, "post page fault (fixed)");
449 453
450 return write_pt; 454 return write_pt;
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index 3d8ea7ac2ecc..9c15f32eea18 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -44,6 +44,10 @@ MODULE_LICENSE("GPL");
44#define KVM_EFER_LMA (1 << 10) 44#define KVM_EFER_LMA (1 << 10)
45#define KVM_EFER_LME (1 << 8) 45#define KVM_EFER_LME (1 << 8)
46 46
47#define SVM_FEATURE_NPT (1 << 0)
48#define SVM_FEATURE_LBRV (1 << 1)
49#define SVM_DEATURE_SVML (1 << 2)
50
47unsigned long iopm_base; 51unsigned long iopm_base;
48unsigned long msrpm_base; 52unsigned long msrpm_base;
49 53
@@ -59,15 +63,16 @@ struct kvm_ldttss_desc {
59struct svm_cpu_data { 63struct svm_cpu_data {
60 int cpu; 64 int cpu;
61 65
62 uint64_t asid_generation; 66 u64 asid_generation;
63 uint32_t max_asid; 67 u32 max_asid;
64 uint32_t next_asid; 68 u32 next_asid;
65 struct kvm_ldttss_desc *tss_desc; 69 struct kvm_ldttss_desc *tss_desc;
66 70
67 struct page *save_area; 71 struct page *save_area;
68}; 72};
69 73
70static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); 74static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
75static uint32_t svm_features;
71 76
72struct svm_init_data { 77struct svm_init_data {
73 int cpu; 78 int cpu;
@@ -82,6 +87,11 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
82 87
83#define MAX_INST_SIZE 15 88#define MAX_INST_SIZE 15
84 89
90static inline u32 svm_has(u32 feat)
91{
92 return svm_features & feat;
93}
94
85static unsigned get_addr_size(struct kvm_vcpu *vcpu) 95static unsigned get_addr_size(struct kvm_vcpu *vcpu)
86{ 96{
87 struct vmcb_save_area *sa = &vcpu->svm->vmcb->save; 97 struct vmcb_save_area *sa = &vcpu->svm->vmcb->save;
@@ -203,13 +213,6 @@ static void inject_ud(struct kvm_vcpu *vcpu)
203 UD_VECTOR; 213 UD_VECTOR;
204} 214}
205 215
206static void inject_db(struct kvm_vcpu *vcpu)
207{
208 vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
209 SVM_EVTINJ_TYPE_EXEPT |
210 DB_VECTOR;
211}
212
213static int is_page_fault(uint32_t info) 216static int is_page_fault(uint32_t info)
214{ 217{
215 info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; 218 info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
@@ -309,6 +312,7 @@ static void svm_hardware_enable(void *garbage)
309 svm_data->asid_generation = 1; 312 svm_data->asid_generation = 1;
310 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 313 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
311 svm_data->next_asid = svm_data->max_asid + 1; 314 svm_data->next_asid = svm_data->max_asid + 1;
315 svm_features = cpuid_edx(SVM_CPUID_FUNC);
312 316
313 asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); 317 asm volatile ( "sgdt %0" : "=m"(gdt_descr) );
314 gdt = (struct desc_struct *)gdt_descr.address; 318 gdt = (struct desc_struct *)gdt_descr.address;
@@ -459,7 +463,6 @@ static void init_vmcb(struct vmcb *vmcb)
459{ 463{
460 struct vmcb_control_area *control = &vmcb->control; 464 struct vmcb_control_area *control = &vmcb->control;
461 struct vmcb_save_area *save = &vmcb->save; 465 struct vmcb_save_area *save = &vmcb->save;
462 u64 tsc;
463 466
464 control->intercept_cr_read = INTERCEPT_CR0_MASK | 467 control->intercept_cr_read = INTERCEPT_CR0_MASK |
465 INTERCEPT_CR3_MASK | 468 INTERCEPT_CR3_MASK |
@@ -511,12 +514,13 @@ static void init_vmcb(struct vmcb *vmcb)
511 (1ULL << INTERCEPT_VMSAVE) | 514 (1ULL << INTERCEPT_VMSAVE) |
512 (1ULL << INTERCEPT_STGI) | 515 (1ULL << INTERCEPT_STGI) |
513 (1ULL << INTERCEPT_CLGI) | 516 (1ULL << INTERCEPT_CLGI) |
514 (1ULL << INTERCEPT_SKINIT); 517 (1ULL << INTERCEPT_SKINIT) |
518 (1ULL << INTERCEPT_MONITOR) |
519 (1ULL << INTERCEPT_MWAIT);
515 520
516 control->iopm_base_pa = iopm_base; 521 control->iopm_base_pa = iopm_base;
517 control->msrpm_base_pa = msrpm_base; 522 control->msrpm_base_pa = msrpm_base;
518 rdtscll(tsc); 523 control->tsc_offset = 0;
519 control->tsc_offset = -tsc;
520 control->int_ctl = V_INTR_MASKING_MASK; 524 control->int_ctl = V_INTR_MASKING_MASK;
521 525
522 init_seg(&save->es); 526 init_seg(&save->es);
@@ -576,12 +580,15 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
576 vcpu->svm->vmcb = page_address(page); 580 vcpu->svm->vmcb = page_address(page);
577 memset(vcpu->svm->vmcb, 0, PAGE_SIZE); 581 memset(vcpu->svm->vmcb, 0, PAGE_SIZE);
578 vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 582 vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
579 vcpu->svm->cr0 = 0x00000010;
580 vcpu->svm->asid_generation = 0; 583 vcpu->svm->asid_generation = 0;
581 memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs)); 584 memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs));
582 init_vmcb(vcpu->svm->vmcb); 585 init_vmcb(vcpu->svm->vmcb);
583 586
584 fx_init(vcpu); 587 fx_init(vcpu);
588 vcpu->fpu_active = 1;
589 vcpu->apic_base = 0xfee00000 |
590 /*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
591 MSR_IA32_APICBASE_ENABLE;
585 592
586 return 0; 593 return 0;
587 594
@@ -602,11 +609,34 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
602 609
603static void svm_vcpu_load(struct kvm_vcpu *vcpu) 610static void svm_vcpu_load(struct kvm_vcpu *vcpu)
604{ 611{
605 get_cpu(); 612 int cpu, i;
613
614 cpu = get_cpu();
615 if (unlikely(cpu != vcpu->cpu)) {
616 u64 tsc_this, delta;
617
618 /*
619 * Make sure that the guest sees a monotonically
620 * increasing TSC.
621 */
622 rdtscll(tsc_this);
623 delta = vcpu->host_tsc - tsc_this;
624 vcpu->svm->vmcb->control.tsc_offset += delta;
625 vcpu->cpu = cpu;
626 }
627
628 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
629 rdmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]);
606} 630}
607 631
608static void svm_vcpu_put(struct kvm_vcpu *vcpu) 632static void svm_vcpu_put(struct kvm_vcpu *vcpu)
609{ 633{
634 int i;
635
636 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
637 wrmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]);
638
639 rdtscll(vcpu->host_tsc);
610 put_cpu(); 640 put_cpu();
611} 641}
612 642
@@ -714,7 +744,7 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
714 vcpu->svm->vmcb->save.gdtr.base = dt->base ; 744 vcpu->svm->vmcb->save.gdtr.base = dt->base ;
715} 745}
716 746
717static void svm_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu) 747static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
718{ 748{
719} 749}
720 750
@@ -733,9 +763,15 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
733 } 763 }
734 } 764 }
735#endif 765#endif
736 vcpu->svm->cr0 = cr0; 766 if ((vcpu->cr0 & CR0_TS_MASK) && !(cr0 & CR0_TS_MASK)) {
737 vcpu->svm->vmcb->save.cr0 = cr0 | CR0_PG_MASK | CR0_WP_MASK; 767 vcpu->svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
768 vcpu->fpu_active = 1;
769 }
770
738 vcpu->cr0 = cr0; 771 vcpu->cr0 = cr0;
772 cr0 |= CR0_PG_MASK | CR0_WP_MASK;
773 cr0 &= ~(CR0_CD_MASK | CR0_NW_MASK);
774 vcpu->svm->vmcb->save.cr0 = cr0;
739} 775}
740 776
741static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 777static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -785,18 +821,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
785 821
786static void load_host_msrs(struct kvm_vcpu *vcpu) 822static void load_host_msrs(struct kvm_vcpu *vcpu)
787{ 823{
788 int i; 824#ifdef CONFIG_X86_64
789 825 wrmsrl(MSR_GS_BASE, vcpu->svm->host_gs_base);
790 for ( i = 0; i < NR_HOST_SAVE_MSRS; i++) 826#endif
791 wrmsrl(host_save_msrs[i], vcpu->svm->host_msrs[i]);
792} 827}
793 828
794static void save_host_msrs(struct kvm_vcpu *vcpu) 829static void save_host_msrs(struct kvm_vcpu *vcpu)
795{ 830{
796 int i; 831#ifdef CONFIG_X86_64
797 832 rdmsrl(MSR_GS_BASE, vcpu->svm->host_gs_base);
798 for ( i = 0; i < NR_HOST_SAVE_MSRS; i++) 833#endif
799 rdmsrl(host_save_msrs[i], vcpu->svm->host_msrs[i]);
800} 834}
801 835
802static void new_asid(struct kvm_vcpu *vcpu, struct svm_cpu_data *svm_data) 836static void new_asid(struct kvm_vcpu *vcpu, struct svm_cpu_data *svm_data)
@@ -890,7 +924,7 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
890 case EMULATE_DONE: 924 case EMULATE_DONE:
891 return 1; 925 return 1;
892 case EMULATE_DO_MMIO: 926 case EMULATE_DO_MMIO:
893 ++kvm_stat.mmio_exits; 927 ++vcpu->stat.mmio_exits;
894 kvm_run->exit_reason = KVM_EXIT_MMIO; 928 kvm_run->exit_reason = KVM_EXIT_MMIO;
895 return 0; 929 return 0;
896 case EMULATE_FAIL: 930 case EMULATE_FAIL:
@@ -904,6 +938,16 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
904 return 0; 938 return 0;
905} 939}
906 940
941static int nm_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
942{
943 vcpu->svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
944 if (!(vcpu->cr0 & CR0_TS_MASK))
945 vcpu->svm->vmcb->save.cr0 &= ~CR0_TS_MASK;
946 vcpu->fpu_active = 1;
947
948 return 1;
949}
950
907static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 951static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
908{ 952{
909 /* 953 /*
@@ -981,7 +1025,7 @@ static int io_get_override(struct kvm_vcpu *vcpu,
981 return 0; 1025 return 0;
982} 1026}
983 1027
984static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, u64 *address) 1028static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, gva_t *address)
985{ 1029{
986 unsigned long addr_mask; 1030 unsigned long addr_mask;
987 unsigned long *reg; 1031 unsigned long *reg;
@@ -1025,38 +1069,38 @@ static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, u64 *address)
1025static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1069static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1026{ 1070{
1027 u32 io_info = vcpu->svm->vmcb->control.exit_info_1; //address size bug? 1071 u32 io_info = vcpu->svm->vmcb->control.exit_info_1; //address size bug?
1028 int _in = io_info & SVM_IOIO_TYPE_MASK; 1072 int size, down, in, string, rep;
1073 unsigned port;
1074 unsigned long count;
1075 gva_t address = 0;
1029 1076
1030 ++kvm_stat.io_exits; 1077 ++vcpu->stat.io_exits;
1031 1078
1032 vcpu->svm->next_rip = vcpu->svm->vmcb->control.exit_info_2; 1079 vcpu->svm->next_rip = vcpu->svm->vmcb->control.exit_info_2;
1033 1080
1034 kvm_run->exit_reason = KVM_EXIT_IO; 1081 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1035 kvm_run->io.port = io_info >> 16; 1082 port = io_info >> 16;
1036 kvm_run->io.direction = (_in) ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 1083 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1037 kvm_run->io.size = ((io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT); 1084 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1038 kvm_run->io.string = (io_info & SVM_IOIO_STR_MASK) != 0; 1085 rep = (io_info & SVM_IOIO_REP_MASK) != 0;
1039 kvm_run->io.rep = (io_info & SVM_IOIO_REP_MASK) != 0; 1086 count = 1;
1087 down = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
1040 1088
1041 if (kvm_run->io.string) { 1089 if (string) {
1042 unsigned addr_mask; 1090 unsigned addr_mask;
1043 1091
1044 addr_mask = io_adress(vcpu, _in, &kvm_run->io.address); 1092 addr_mask = io_adress(vcpu, in, &address);
1045 if (!addr_mask) { 1093 if (!addr_mask) {
1046 printk(KERN_DEBUG "%s: get io address failed\n", 1094 printk(KERN_DEBUG "%s: get io address failed\n",
1047 __FUNCTION__); 1095 __FUNCTION__);
1048 return 1; 1096 return 1;
1049 } 1097 }
1050 1098
1051 if (kvm_run->io.rep) { 1099 if (rep)
1052 kvm_run->io.count 1100 count = vcpu->regs[VCPU_REGS_RCX] & addr_mask;
1053 = vcpu->regs[VCPU_REGS_RCX] & addr_mask; 1101 }
1054 kvm_run->io.string_down = (vcpu->svm->vmcb->save.rflags 1102 return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down,
1055 & X86_EFLAGS_DF) != 0; 1103 address, rep, port);
1056 }
1057 } else
1058 kvm_run->io.value = vcpu->svm->vmcb->save.rax;
1059 return 0;
1060} 1104}
1061 1105
1062static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1106static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1072,13 +1116,14 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1072 return 1; 1116 return 1;
1073 1117
1074 kvm_run->exit_reason = KVM_EXIT_HLT; 1118 kvm_run->exit_reason = KVM_EXIT_HLT;
1075 ++kvm_stat.halt_exits; 1119 ++vcpu->stat.halt_exits;
1076 return 0; 1120 return 0;
1077} 1121}
1078 1122
1079static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1123static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1080{ 1124{
1081 vcpu->svm->vmcb->save.rip += 3; 1125 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 3;
1126 skip_emulated_instruction(vcpu);
1082 return kvm_hypercall(vcpu, kvm_run); 1127 return kvm_hypercall(vcpu, kvm_run);
1083} 1128}
1084 1129
@@ -1098,8 +1143,8 @@ static int task_switch_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_r
1098static int cpuid_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1143static int cpuid_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1099{ 1144{
1100 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; 1145 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2;
1101 kvm_run->exit_reason = KVM_EXIT_CPUID; 1146 kvm_emulate_cpuid(vcpu);
1102 return 0; 1147 return 1;
1103} 1148}
1104 1149
1105static int emulate_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1150static int emulate_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1239,7 +1284,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu,
1239 */ 1284 */
1240 if (kvm_run->request_interrupt_window && 1285 if (kvm_run->request_interrupt_window &&
1241 !vcpu->irq_summary) { 1286 !vcpu->irq_summary) {
1242 ++kvm_stat.irq_window_exits; 1287 ++vcpu->stat.irq_window_exits;
1243 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 1288 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1244 return 0; 1289 return 0;
1245 } 1290 }
@@ -1267,6 +1312,7 @@ static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu,
1267 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 1312 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
1268 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 1313 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
1269 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 1314 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1315 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1270 [SVM_EXIT_INTR] = nop_on_interception, 1316 [SVM_EXIT_INTR] = nop_on_interception,
1271 [SVM_EXIT_NMI] = nop_on_interception, 1317 [SVM_EXIT_NMI] = nop_on_interception,
1272 [SVM_EXIT_SMI] = nop_on_interception, 1318 [SVM_EXIT_SMI] = nop_on_interception,
@@ -1288,6 +1334,8 @@ static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu,
1288 [SVM_EXIT_STGI] = invalid_op_interception, 1334 [SVM_EXIT_STGI] = invalid_op_interception,
1289 [SVM_EXIT_CLGI] = invalid_op_interception, 1335 [SVM_EXIT_CLGI] = invalid_op_interception,
1290 [SVM_EXIT_SKINIT] = invalid_op_interception, 1336 [SVM_EXIT_SKINIT] = invalid_op_interception,
1337 [SVM_EXIT_MONITOR] = invalid_op_interception,
1338 [SVM_EXIT_MWAIT] = invalid_op_interception,
1291}; 1339};
1292 1340
1293 1341
@@ -1295,8 +1343,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1295{ 1343{
1296 u32 exit_code = vcpu->svm->vmcb->control.exit_code; 1344 u32 exit_code = vcpu->svm->vmcb->control.exit_code;
1297 1345
1298 kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT;
1299
1300 if (is_external_interrupt(vcpu->svm->vmcb->control.exit_int_info) && 1346 if (is_external_interrupt(vcpu->svm->vmcb->control.exit_int_info) &&
1301 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) 1347 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
1302 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " 1348 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
@@ -1307,12 +1353,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1307 if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 1353 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
1308 || svm_exit_handlers[exit_code] == 0) { 1354 || svm_exit_handlers[exit_code] == 0) {
1309 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 1355 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1310 printk(KERN_ERR "%s: 0x%x @ 0x%llx cr0 0x%lx rflags 0x%llx\n", 1356 kvm_run->hw.hardware_exit_reason = exit_code;
1311 __FUNCTION__,
1312 exit_code,
1313 vcpu->svm->vmcb->save.rip,
1314 vcpu->cr0,
1315 vcpu->svm->vmcb->save.rflags);
1316 return 0; 1357 return 0;
1317 } 1358 }
1318 1359
@@ -1461,8 +1502,10 @@ again:
1461 load_db_regs(vcpu->svm->db_regs); 1502 load_db_regs(vcpu->svm->db_regs);
1462 } 1503 }
1463 1504
1464 fx_save(vcpu->host_fx_image); 1505 if (vcpu->fpu_active) {
1465 fx_restore(vcpu->guest_fx_image); 1506 fx_save(vcpu->host_fx_image);
1507 fx_restore(vcpu->guest_fx_image);
1508 }
1466 1509
1467 asm volatile ( 1510 asm volatile (
1468#ifdef CONFIG_X86_64 1511#ifdef CONFIG_X86_64
@@ -1573,8 +1616,10 @@ again:
1573#endif 1616#endif
1574 : "cc", "memory" ); 1617 : "cc", "memory" );
1575 1618
1576 fx_save(vcpu->guest_fx_image); 1619 if (vcpu->fpu_active) {
1577 fx_restore(vcpu->host_fx_image); 1620 fx_save(vcpu->guest_fx_image);
1621 fx_restore(vcpu->host_fx_image);
1622 }
1578 1623
1579 if ((vcpu->svm->vmcb->save.dr7 & 0xff)) 1624 if ((vcpu->svm->vmcb->save.dr7 & 0xff))
1580 load_db_regs(vcpu->svm->host_db_regs); 1625 load_db_regs(vcpu->svm->host_db_regs);
@@ -1606,8 +1651,9 @@ again:
1606 vcpu->svm->next_rip = 0; 1651 vcpu->svm->next_rip = 0;
1607 1652
1608 if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 1653 if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
1609 kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; 1654 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1610 kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code; 1655 kvm_run->fail_entry.hardware_entry_failure_reason
1656 = vcpu->svm->vmcb->control.exit_code;
1611 post_kvm_run_save(vcpu, kvm_run); 1657 post_kvm_run_save(vcpu, kvm_run);
1612 return 0; 1658 return 0;
1613 } 1659 }
@@ -1615,14 +1661,16 @@ again:
1615 r = handle_exit(vcpu, kvm_run); 1661 r = handle_exit(vcpu, kvm_run);
1616 if (r > 0) { 1662 if (r > 0) {
1617 if (signal_pending(current)) { 1663 if (signal_pending(current)) {
1618 ++kvm_stat.signal_exits; 1664 ++vcpu->stat.signal_exits;
1619 post_kvm_run_save(vcpu, kvm_run); 1665 post_kvm_run_save(vcpu, kvm_run);
1666 kvm_run->exit_reason = KVM_EXIT_INTR;
1620 return -EINTR; 1667 return -EINTR;
1621 } 1668 }
1622 1669
1623 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 1670 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
1624 ++kvm_stat.request_irq_exits; 1671 ++vcpu->stat.request_irq_exits;
1625 post_kvm_run_save(vcpu, kvm_run); 1672 post_kvm_run_save(vcpu, kvm_run);
1673 kvm_run->exit_reason = KVM_EXIT_INTR;
1626 return -EINTR; 1674 return -EINTR;
1627 } 1675 }
1628 kvm_resched(vcpu); 1676 kvm_resched(vcpu);
@@ -1641,6 +1689,12 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1641{ 1689{
1642 vcpu->svm->vmcb->save.cr3 = root; 1690 vcpu->svm->vmcb->save.cr3 = root;
1643 force_new_asid(vcpu); 1691 force_new_asid(vcpu);
1692
1693 if (vcpu->fpu_active) {
1694 vcpu->svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
1695 vcpu->svm->vmcb->save.cr0 |= CR0_TS_MASK;
1696 vcpu->fpu_active = 0;
1697 }
1644} 1698}
1645 1699
1646static void svm_inject_page_fault(struct kvm_vcpu *vcpu, 1700static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
@@ -1649,7 +1703,7 @@ static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
1649{ 1703{
1650 uint32_t exit_int_info = vcpu->svm->vmcb->control.exit_int_info; 1704 uint32_t exit_int_info = vcpu->svm->vmcb->control.exit_int_info;
1651 1705
1652 ++kvm_stat.pf_guest; 1706 ++vcpu->stat.pf_guest;
1653 1707
1654 if (is_page_fault(exit_int_info)) { 1708 if (is_page_fault(exit_int_info)) {
1655 1709
@@ -1709,9 +1763,8 @@ static struct kvm_arch_ops svm_arch_ops = {
1709 .get_segment = svm_get_segment, 1763 .get_segment = svm_get_segment,
1710 .set_segment = svm_set_segment, 1764 .set_segment = svm_set_segment,
1711 .get_cs_db_l_bits = svm_get_cs_db_l_bits, 1765 .get_cs_db_l_bits = svm_get_cs_db_l_bits,
1712 .decache_cr0_cr4_guest_bits = svm_decache_cr0_cr4_guest_bits, 1766 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
1713 .set_cr0 = svm_set_cr0, 1767 .set_cr0 = svm_set_cr0,
1714 .set_cr0_no_modeswitch = svm_set_cr0,
1715 .set_cr3 = svm_set_cr3, 1768 .set_cr3 = svm_set_cr3,
1716 .set_cr4 = svm_set_cr4, 1769 .set_cr4 = svm_set_cr4,
1717 .set_efer = svm_set_efer, 1770 .set_efer = svm_set_efer,
diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h
index df731c3fb588..5e93814400ce 100644
--- a/drivers/kvm/svm.h
+++ b/drivers/kvm/svm.h
@@ -44,6 +44,9 @@ enum {
44 INTERCEPT_RDTSCP, 44 INTERCEPT_RDTSCP,
45 INTERCEPT_ICEBP, 45 INTERCEPT_ICEBP,
46 INTERCEPT_WBINVD, 46 INTERCEPT_WBINVD,
47 INTERCEPT_MONITOR,
48 INTERCEPT_MWAIT,
49 INTERCEPT_MWAIT_COND,
47}; 50};
48 51
49 52
@@ -298,6 +301,9 @@ struct __attribute__ ((__packed__)) vmcb {
298#define SVM_EXIT_RDTSCP 0x087 301#define SVM_EXIT_RDTSCP 0x087
299#define SVM_EXIT_ICEBP 0x088 302#define SVM_EXIT_ICEBP 0x088
300#define SVM_EXIT_WBINVD 0x089 303#define SVM_EXIT_WBINVD 0x089
304#define SVM_EXIT_MONITOR 0x08a
305#define SVM_EXIT_MWAIT 0x08b
306#define SVM_EXIT_MWAIT_COND 0x08c
301#define SVM_EXIT_NPF 0x400 307#define SVM_EXIT_NPF 0x400
302 308
303#define SVM_EXIT_ERR -1 309#define SVM_EXIT_ERR -1
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index fbbf9d6b299f..724db0027f00 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -17,7 +17,6 @@
17 17
18#include "kvm.h" 18#include "kvm.h"
19#include "vmx.h" 19#include "vmx.h"
20#include "kvm_vmx.h"
21#include <linux/module.h> 20#include <linux/module.h>
22#include <linux/kernel.h> 21#include <linux/kernel.h>
23#include <linux/mm.h> 22#include <linux/mm.h>
@@ -70,6 +69,10 @@ static struct kvm_vmx_segment_field {
70 VMX_SEGMENT_FIELD(LDTR), 69 VMX_SEGMENT_FIELD(LDTR),
71}; 70};
72 71
72/*
73 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
74 * away by decrementing the array size.
75 */
73static const u32 vmx_msr_index[] = { 76static const u32 vmx_msr_index[] = {
74#ifdef CONFIG_X86_64 77#ifdef CONFIG_X86_64
75 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, 78 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
@@ -78,6 +81,19 @@ static const u32 vmx_msr_index[] = {
78}; 81};
79#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 82#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
80 83
84#ifdef CONFIG_X86_64
85static unsigned msr_offset_kernel_gs_base;
86#define NR_64BIT_MSRS 4
87/*
88 * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt
89 * mechanism (cpu bug AA24)
90 */
91#define NR_BAD_MSRS 2
92#else
93#define NR_64BIT_MSRS 0
94#define NR_BAD_MSRS 0
95#endif
96
81static inline int is_page_fault(u32 intr_info) 97static inline int is_page_fault(u32 intr_info)
82{ 98{
83 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 99 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -85,6 +101,13 @@ static inline int is_page_fault(u32 intr_info)
85 (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); 101 (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
86} 102}
87 103
104static inline int is_no_device(u32 intr_info)
105{
106 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
107 INTR_INFO_VALID_MASK)) ==
108 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
109}
110
88static inline int is_external_interrupt(u32 intr_info) 111static inline int is_external_interrupt(u32 intr_info)
89{ 112{
90 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 113 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -200,6 +223,16 @@ static void vmcs_write64(unsigned long field, u64 value)
200#endif 223#endif
201} 224}
202 225
226static void vmcs_clear_bits(unsigned long field, u32 mask)
227{
228 vmcs_writel(field, vmcs_readl(field) & ~mask);
229}
230
231static void vmcs_set_bits(unsigned long field, u32 mask)
232{
233 vmcs_writel(field, vmcs_readl(field) | mask);
234}
235
203/* 236/*
204 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 237 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
205 * vcpu mutex is already taken. 238 * vcpu mutex is already taken.
@@ -297,6 +330,44 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
297} 330}
298 331
299/* 332/*
333 * Set up the vmcs to automatically save and restore system
334 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
335 * mode, as fiddling with msrs is very expensive.
336 */
337static void setup_msrs(struct kvm_vcpu *vcpu)
338{
339 int nr_skip, nr_good_msrs;
340
341 if (is_long_mode(vcpu))
342 nr_skip = NR_BAD_MSRS;
343 else
344 nr_skip = NR_64BIT_MSRS;
345 nr_good_msrs = vcpu->nmsrs - nr_skip;
346
347 /*
348 * MSR_K6_STAR is only needed on long mode guests, and only
349 * if efer.sce is enabled.
350 */
351 if (find_msr_entry(vcpu, MSR_K6_STAR)) {
352 --nr_good_msrs;
353#ifdef CONFIG_X86_64
354 if (is_long_mode(vcpu) && (vcpu->shadow_efer & EFER_SCE))
355 ++nr_good_msrs;
356#endif
357 }
358
359 vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
360 virt_to_phys(vcpu->guest_msrs + nr_skip));
361 vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
362 virt_to_phys(vcpu->guest_msrs + nr_skip));
363 vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
364 virt_to_phys(vcpu->host_msrs + nr_skip));
365 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
366 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
367 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
368}
369
370/*
300 * reads and returns guest's timestamp counter "register" 371 * reads and returns guest's timestamp counter "register"
301 * guest_tsc = host_tsc + tsc_offset -- 21.3 372 * guest_tsc = host_tsc + tsc_offset -- 21.3
302 */ 373 */
@@ -712,6 +783,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
712 783
713 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); 784 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
714 vmcs_write32(GUEST_CS_LIMIT, 0xffff); 785 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
786 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
787 vmcs_writel(GUEST_CS_BASE, 0xf0000);
715 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); 788 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
716 789
717 fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); 790 fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
@@ -754,11 +827,8 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
754 827
755#endif 828#endif
756 829
757static void vmx_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu) 830static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
758{ 831{
759 vcpu->cr0 &= KVM_GUEST_CR0_MASK;
760 vcpu->cr0 |= vmcs_readl(GUEST_CR0) & ~KVM_GUEST_CR0_MASK;
761
762 vcpu->cr4 &= KVM_GUEST_CR4_MASK; 832 vcpu->cr4 &= KVM_GUEST_CR4_MASK;
763 vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; 833 vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
764} 834}
@@ -780,22 +850,11 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
780 } 850 }
781#endif 851#endif
782 852
783 vmcs_writel(CR0_READ_SHADOW, cr0); 853 if (!(cr0 & CR0_TS_MASK)) {
784 vmcs_writel(GUEST_CR0, 854 vcpu->fpu_active = 1;
785 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); 855 vmcs_clear_bits(EXCEPTION_BITMAP, CR0_TS_MASK);
786 vcpu->cr0 = cr0; 856 }
787}
788
789/*
790 * Used when restoring the VM to avoid corrupting segment registers
791 */
792static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0)
793{
794 if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
795 enter_rmode(vcpu);
796 857
797 vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0);
798 update_exception_bitmap(vcpu);
799 vmcs_writel(CR0_READ_SHADOW, cr0); 858 vmcs_writel(CR0_READ_SHADOW, cr0);
800 vmcs_writel(GUEST_CR0, 859 vmcs_writel(GUEST_CR0,
801 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); 860 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
@@ -805,6 +864,12 @@ static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0)
805static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 864static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
806{ 865{
807 vmcs_writel(GUEST_CR3, cr3); 866 vmcs_writel(GUEST_CR3, cr3);
867
868 if (!(vcpu->cr0 & CR0_TS_MASK)) {
869 vcpu->fpu_active = 0;
870 vmcs_set_bits(GUEST_CR0, CR0_TS_MASK);
871 vmcs_set_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
872 }
808} 873}
809 874
810static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 875static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -835,6 +900,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
835 900
836 msr->data = efer & ~EFER_LME; 901 msr->data = efer & ~EFER_LME;
837 } 902 }
903 setup_msrs(vcpu);
838} 904}
839 905
840#endif 906#endif
@@ -878,7 +944,14 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
878 vmcs_writel(sf->base, var->base); 944 vmcs_writel(sf->base, var->base);
879 vmcs_write32(sf->limit, var->limit); 945 vmcs_write32(sf->limit, var->limit);
880 vmcs_write16(sf->selector, var->selector); 946 vmcs_write16(sf->selector, var->selector);
881 if (var->unusable) 947 if (vcpu->rmode.active && var->s) {
948 /*
949 * Hack real-mode segments into vm86 compatibility.
950 */
951 if (var->base == 0xffff0000 && var->selector == 0xf000)
952 vmcs_writel(sf->base, 0xf0000);
953 ar = 0xf3;
954 } else if (var->unusable)
882 ar = 1 << 16; 955 ar = 1 << 16;
883 else { 956 else {
884 ar = var->type & 15; 957 ar = var->type & 15;
@@ -933,9 +1006,9 @@ static int init_rmode_tss(struct kvm* kvm)
933 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 1006 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
934 char *page; 1007 char *page;
935 1008
936 p1 = _gfn_to_page(kvm, fn++); 1009 p1 = gfn_to_page(kvm, fn++);
937 p2 = _gfn_to_page(kvm, fn++); 1010 p2 = gfn_to_page(kvm, fn++);
938 p3 = _gfn_to_page(kvm, fn); 1011 p3 = gfn_to_page(kvm, fn);
939 1012
940 if (!p1 || !p2 || !p3) { 1013 if (!p1 || !p2 || !p3) {
941 kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__); 1014 kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
@@ -991,7 +1064,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
991 struct descriptor_table dt; 1064 struct descriptor_table dt;
992 int i; 1065 int i;
993 int ret = 0; 1066 int ret = 0;
994 int nr_good_msrs;
995 extern asmlinkage void kvm_vmx_return(void); 1067 extern asmlinkage void kvm_vmx_return(void);
996 1068
997 if (!init_rmode_tss(vcpu->kvm)) { 1069 if (!init_rmode_tss(vcpu->kvm)) {
@@ -1136,23 +1208,17 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1136 vcpu->host_msrs[j].reserved = 0; 1208 vcpu->host_msrs[j].reserved = 0;
1137 vcpu->host_msrs[j].data = data; 1209 vcpu->host_msrs[j].data = data;
1138 vcpu->guest_msrs[j] = vcpu->host_msrs[j]; 1210 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1211#ifdef CONFIG_X86_64
1212 if (index == MSR_KERNEL_GS_BASE)
1213 msr_offset_kernel_gs_base = j;
1214#endif
1139 ++vcpu->nmsrs; 1215 ++vcpu->nmsrs;
1140 } 1216 }
1141 printk(KERN_DEBUG "kvm: msrs: %d\n", vcpu->nmsrs);
1142 1217
1143 nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS; 1218 setup_msrs(vcpu);
1144 vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, 1219
1145 virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS));
1146 vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
1147 virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS));
1148 vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
1149 virt_to_phys(vcpu->host_msrs + NR_BAD_MSRS));
1150 vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_CONTROLS, 1220 vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_CONTROLS,
1151 (HOST_IS_64 << 9)); /* 22.2,1, 20.7.1 */ 1221 (HOST_IS_64 << 9)); /* 22.2,1, 20.7.1 */
1152 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
1153 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
1154 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
1155
1156 1222
1157 /* 22.2.1, 20.8.1 */ 1223 /* 22.2.1, 20.8.1 */
1158 vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS, 1224 vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS,
@@ -1164,7 +1230,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1164 vmcs_writel(TPR_THRESHOLD, 0); 1230 vmcs_writel(TPR_THRESHOLD, 0);
1165#endif 1231#endif
1166 1232
1167 vmcs_writel(CR0_GUEST_HOST_MASK, KVM_GUEST_CR0_MASK); 1233 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
1168 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); 1234 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1169 1235
1170 vcpu->cr0 = 0x60000010; 1236 vcpu->cr0 = 0x60000010;
@@ -1190,7 +1256,7 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1190 u16 sp = vmcs_readl(GUEST_RSP); 1256 u16 sp = vmcs_readl(GUEST_RSP);
1191 u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT); 1257 u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
1192 1258
1193 if (sp > ss_limit || sp - 6 > sp) { 1259 if (sp > ss_limit || sp < 6 ) {
1194 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", 1260 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
1195 __FUNCTION__, 1261 __FUNCTION__,
1196 vmcs_readl(GUEST_RSP), 1262 vmcs_readl(GUEST_RSP),
@@ -1330,6 +1396,15 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1330 asm ("int $2"); 1396 asm ("int $2");
1331 return 1; 1397 return 1;
1332 } 1398 }
1399
1400 if (is_no_device(intr_info)) {
1401 vcpu->fpu_active = 1;
1402 vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
1403 if (!(vcpu->cr0 & CR0_TS_MASK))
1404 vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK);
1405 return 1;
1406 }
1407
1333 error_code = 0; 1408 error_code = 0;
1334 rip = vmcs_readl(GUEST_RIP); 1409 rip = vmcs_readl(GUEST_RIP);
1335 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) 1410 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
@@ -1355,7 +1430,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1355 case EMULATE_DONE: 1430 case EMULATE_DONE:
1356 return 1; 1431 return 1;
1357 case EMULATE_DO_MMIO: 1432 case EMULATE_DO_MMIO:
1358 ++kvm_stat.mmio_exits; 1433 ++vcpu->stat.mmio_exits;
1359 kvm_run->exit_reason = KVM_EXIT_MMIO; 1434 kvm_run->exit_reason = KVM_EXIT_MMIO;
1360 return 0; 1435 return 0;
1361 case EMULATE_FAIL: 1436 case EMULATE_FAIL:
@@ -1384,7 +1459,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1384static int handle_external_interrupt(struct kvm_vcpu *vcpu, 1459static int handle_external_interrupt(struct kvm_vcpu *vcpu,
1385 struct kvm_run *kvm_run) 1460 struct kvm_run *kvm_run)
1386{ 1461{
1387 ++kvm_stat.irq_exits; 1462 ++vcpu->stat.irq_exits;
1388 return 1; 1463 return 1;
1389} 1464}
1390 1465
@@ -1394,7 +1469,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1394 return 0; 1469 return 0;
1395} 1470}
1396 1471
1397static int get_io_count(struct kvm_vcpu *vcpu, u64 *count) 1472static int get_io_count(struct kvm_vcpu *vcpu, unsigned long *count)
1398{ 1473{
1399 u64 inst; 1474 u64 inst;
1400 gva_t rip; 1475 gva_t rip;
@@ -1439,33 +1514,35 @@ static int get_io_count(struct kvm_vcpu *vcpu, u64 *count)
1439done: 1514done:
1440 countr_size *= 8; 1515 countr_size *= 8;
1441 *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size)); 1516 *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
1517 //printk("cx: %lx\n", vcpu->regs[VCPU_REGS_RCX]);
1442 return 1; 1518 return 1;
1443} 1519}
1444 1520
1445static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1521static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1446{ 1522{
1447 u64 exit_qualification; 1523 u64 exit_qualification;
1524 int size, down, in, string, rep;
1525 unsigned port;
1526 unsigned long count;
1527 gva_t address;
1448 1528
1449 ++kvm_stat.io_exits; 1529 ++vcpu->stat.io_exits;
1450 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 1530 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
1451 kvm_run->exit_reason = KVM_EXIT_IO; 1531 in = (exit_qualification & 8) != 0;
1452 if (exit_qualification & 8) 1532 size = (exit_qualification & 7) + 1;
1453 kvm_run->io.direction = KVM_EXIT_IO_IN; 1533 string = (exit_qualification & 16) != 0;
1454 else 1534 down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
1455 kvm_run->io.direction = KVM_EXIT_IO_OUT; 1535 count = 1;
1456 kvm_run->io.size = (exit_qualification & 7) + 1; 1536 rep = (exit_qualification & 32) != 0;
1457 kvm_run->io.string = (exit_qualification & 16) != 0; 1537 port = exit_qualification >> 16;
1458 kvm_run->io.string_down 1538 address = 0;
1459 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; 1539 if (string) {
1460 kvm_run->io.rep = (exit_qualification & 32) != 0; 1540 if (rep && !get_io_count(vcpu, &count))
1461 kvm_run->io.port = exit_qualification >> 16;
1462 if (kvm_run->io.string) {
1463 if (!get_io_count(vcpu, &kvm_run->io.count))
1464 return 1; 1541 return 1;
1465 kvm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS); 1542 address = vmcs_readl(GUEST_LINEAR_ADDRESS);
1466 } else 1543 }
1467 kvm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */ 1544 return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down,
1468 return 0; 1545 address, rep, port);
1469} 1546}
1470 1547
1471static void 1548static void
@@ -1514,6 +1591,15 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1514 return 1; 1591 return 1;
1515 }; 1592 };
1516 break; 1593 break;
1594 case 2: /* clts */
1595 vcpu_load_rsp_rip(vcpu);
1596 vcpu->fpu_active = 1;
1597 vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
1598 vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK);
1599 vcpu->cr0 &= ~CR0_TS_MASK;
1600 vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
1601 skip_emulated_instruction(vcpu);
1602 return 1;
1517 case 1: /*mov from cr*/ 1603 case 1: /*mov from cr*/
1518 switch (cr) { 1604 switch (cr) {
1519 case 3: 1605 case 3:
@@ -1523,8 +1609,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1523 skip_emulated_instruction(vcpu); 1609 skip_emulated_instruction(vcpu);
1524 return 1; 1610 return 1;
1525 case 8: 1611 case 8:
1526 printk(KERN_DEBUG "handle_cr: read CR8 "
1527 "cpu erratum AA15\n");
1528 vcpu_load_rsp_rip(vcpu); 1612 vcpu_load_rsp_rip(vcpu);
1529 vcpu->regs[reg] = vcpu->cr8; 1613 vcpu->regs[reg] = vcpu->cr8;
1530 vcpu_put_rsp_rip(vcpu); 1614 vcpu_put_rsp_rip(vcpu);
@@ -1583,8 +1667,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1583 1667
1584static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1668static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1585{ 1669{
1586 kvm_run->exit_reason = KVM_EXIT_CPUID; 1670 kvm_emulate_cpuid(vcpu);
1587 return 0; 1671 return 1;
1588} 1672}
1589 1673
1590static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1674static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1639,7 +1723,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
1639 if (kvm_run->request_interrupt_window && 1723 if (kvm_run->request_interrupt_window &&
1640 !vcpu->irq_summary) { 1724 !vcpu->irq_summary) {
1641 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 1725 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1642 ++kvm_stat.irq_window_exits; 1726 ++vcpu->stat.irq_window_exits;
1643 return 0; 1727 return 0;
1644 } 1728 }
1645 return 1; 1729 return 1;
@@ -1652,13 +1736,13 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1652 return 1; 1736 return 1;
1653 1737
1654 kvm_run->exit_reason = KVM_EXIT_HLT; 1738 kvm_run->exit_reason = KVM_EXIT_HLT;
1655 ++kvm_stat.halt_exits; 1739 ++vcpu->stat.halt_exits;
1656 return 0; 1740 return 0;
1657} 1741}
1658 1742
1659static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1743static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1660{ 1744{
1661 vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP)+3); 1745 skip_emulated_instruction(vcpu);
1662 return kvm_hypercall(vcpu, kvm_run); 1746 return kvm_hypercall(vcpu, kvm_run);
1663} 1747}
1664 1748
@@ -1699,7 +1783,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1699 exit_reason != EXIT_REASON_EXCEPTION_NMI ) 1783 exit_reason != EXIT_REASON_EXCEPTION_NMI )
1700 printk(KERN_WARNING "%s: unexpected, valid vectoring info and " 1784 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
1701 "exit reason is 0x%x\n", __FUNCTION__, exit_reason); 1785 "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
1702 kvm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1703 if (exit_reason < kvm_vmx_max_exit_handlers 1786 if (exit_reason < kvm_vmx_max_exit_handlers
1704 && kvm_vmx_exit_handlers[exit_reason]) 1787 && kvm_vmx_exit_handlers[exit_reason])
1705 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); 1788 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -1763,11 +1846,21 @@ again:
1763 if (vcpu->guest_debug.enabled) 1846 if (vcpu->guest_debug.enabled)
1764 kvm_guest_debug_pre(vcpu); 1847 kvm_guest_debug_pre(vcpu);
1765 1848
1766 fx_save(vcpu->host_fx_image); 1849 if (vcpu->fpu_active) {
1767 fx_restore(vcpu->guest_fx_image); 1850 fx_save(vcpu->host_fx_image);
1851 fx_restore(vcpu->guest_fx_image);
1852 }
1853 /*
1854 * Loading guest fpu may have cleared host cr0.ts
1855 */
1856 vmcs_writel(HOST_CR0, read_cr0());
1768 1857
1769 save_msrs(vcpu->host_msrs, vcpu->nmsrs); 1858#ifdef CONFIG_X86_64
1770 load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); 1859 if (is_long_mode(vcpu)) {
1860 save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1);
1861 load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
1862 }
1863#endif
1771 1864
1772 asm ( 1865 asm (
1773 /* Store host registers */ 1866 /* Store host registers */
@@ -1909,21 +2002,28 @@ again:
1909 2002
1910 reload_tss(); 2003 reload_tss();
1911 } 2004 }
1912 ++kvm_stat.exits; 2005 ++vcpu->stat.exits;
1913 2006
1914 save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); 2007#ifdef CONFIG_X86_64
1915 load_msrs(vcpu->host_msrs, NR_BAD_MSRS); 2008 if (is_long_mode(vcpu)) {
2009 save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
2010 load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
2011 }
2012#endif
2013
2014 if (vcpu->fpu_active) {
2015 fx_save(vcpu->guest_fx_image);
2016 fx_restore(vcpu->host_fx_image);
2017 }
1916 2018
1917 fx_save(vcpu->guest_fx_image);
1918 fx_restore(vcpu->host_fx_image);
1919 vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; 2019 vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
1920 2020
1921 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 2021 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
1922 2022
1923 kvm_run->exit_type = 0;
1924 if (fail) { 2023 if (fail) {
1925 kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; 2024 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1926 kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); 2025 kvm_run->fail_entry.hardware_entry_failure_reason
2026 = vmcs_read32(VM_INSTRUCTION_ERROR);
1927 r = 0; 2027 r = 0;
1928 } else { 2028 } else {
1929 /* 2029 /*
@@ -1933,19 +2033,20 @@ again:
1933 profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); 2033 profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
1934 2034
1935 vcpu->launched = 1; 2035 vcpu->launched = 1;
1936 kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT;
1937 r = kvm_handle_exit(kvm_run, vcpu); 2036 r = kvm_handle_exit(kvm_run, vcpu);
1938 if (r > 0) { 2037 if (r > 0) {
1939 /* Give scheduler a change to reschedule. */ 2038 /* Give scheduler a change to reschedule. */
1940 if (signal_pending(current)) { 2039 if (signal_pending(current)) {
1941 ++kvm_stat.signal_exits; 2040 ++vcpu->stat.signal_exits;
1942 post_kvm_run_save(vcpu, kvm_run); 2041 post_kvm_run_save(vcpu, kvm_run);
2042 kvm_run->exit_reason = KVM_EXIT_INTR;
1943 return -EINTR; 2043 return -EINTR;
1944 } 2044 }
1945 2045
1946 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 2046 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
1947 ++kvm_stat.request_irq_exits; 2047 ++vcpu->stat.request_irq_exits;
1948 post_kvm_run_save(vcpu, kvm_run); 2048 post_kvm_run_save(vcpu, kvm_run);
2049 kvm_run->exit_reason = KVM_EXIT_INTR;
1949 return -EINTR; 2050 return -EINTR;
1950 } 2051 }
1951 2052
@@ -1969,7 +2070,7 @@ static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
1969{ 2070{
1970 u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 2071 u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1971 2072
1972 ++kvm_stat.pf_guest; 2073 ++vcpu->stat.pf_guest;
1973 2074
1974 if (is_page_fault(vect_info)) { 2075 if (is_page_fault(vect_info)) {
1975 printk(KERN_DEBUG "inject_page_fault: " 2076 printk(KERN_DEBUG "inject_page_fault: "
@@ -2026,6 +2127,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
2026 vmcs_clear(vmcs); 2127 vmcs_clear(vmcs);
2027 vcpu->vmcs = vmcs; 2128 vcpu->vmcs = vmcs;
2028 vcpu->launched = 0; 2129 vcpu->launched = 0;
2130 vcpu->fpu_active = 1;
2029 2131
2030 return 0; 2132 return 0;
2031 2133
@@ -2062,9 +2164,8 @@ static struct kvm_arch_ops vmx_arch_ops = {
2062 .get_segment = vmx_get_segment, 2164 .get_segment = vmx_get_segment,
2063 .set_segment = vmx_set_segment, 2165 .set_segment = vmx_set_segment,
2064 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 2166 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
2065 .decache_cr0_cr4_guest_bits = vmx_decache_cr0_cr4_guest_bits, 2167 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
2066 .set_cr0 = vmx_set_cr0, 2168 .set_cr0 = vmx_set_cr0,
2067 .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch,
2068 .set_cr3 = vmx_set_cr3, 2169 .set_cr3 = vmx_set_cr3,
2069 .set_cr4 = vmx_set_cr4, 2170 .set_cr4 = vmx_set_cr4,
2070#ifdef CONFIG_X86_64 2171#ifdef CONFIG_X86_64
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
index 7513cddb929f..7ade09086aa5 100644
--- a/drivers/kvm/x86_emulate.c
+++ b/drivers/kvm/x86_emulate.c
@@ -833,8 +833,9 @@ done_prefixes:
833 dst.ptr = (unsigned long *)cr2; 833 dst.ptr = (unsigned long *)cr2;
834 dst.bytes = (d & ByteOp) ? 1 : op_bytes; 834 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
835 if (d & BitOp) { 835 if (d & BitOp) {
836 dst.ptr += src.val / BITS_PER_LONG; 836 unsigned long mask = ~(dst.bytes * 8 - 1);
837 dst.bytes = sizeof(long); 837
838 dst.ptr = (void *)dst.ptr + (src.val & mask) / 8;
838 } 839 }
839 if (!(d & Mov) && /* optimisation - avoid slow emulated read */ 840 if (!(d & Mov) && /* optimisation - avoid slow emulated read */
840 ((rc = ops->read_emulated((unsigned long)dst.ptr, 841 ((rc = ops->read_emulated((unsigned long)dst.ptr,
@@ -1044,7 +1045,7 @@ done_prefixes:
1044 if ((rc = ops->write_std( 1045 if ((rc = ops->write_std(
1045 register_address(ctxt->ss_base, 1046 register_address(ctxt->ss_base,
1046 _regs[VCPU_REGS_RSP]), 1047 _regs[VCPU_REGS_RSP]),
1047 dst.val, dst.bytes, ctxt)) != 0) 1048 &dst.val, dst.bytes, ctxt)) != 0)
1048 goto done; 1049 goto done;
1049 dst.val = dst.orig_val; /* skanky: disable writeback */ 1050 dst.val = dst.orig_val; /* skanky: disable writeback */
1050 break; 1051 break;
@@ -1077,12 +1078,12 @@ writeback:
1077 case OP_MEM: 1078 case OP_MEM:
1078 if (lock_prefix) 1079 if (lock_prefix)
1079 rc = ops->cmpxchg_emulated((unsigned long)dst. 1080 rc = ops->cmpxchg_emulated((unsigned long)dst.
1080 ptr, dst.orig_val, 1081 ptr, &dst.orig_val,
1081 dst.val, dst.bytes, 1082 &dst.val, dst.bytes,
1082 ctxt); 1083 ctxt);
1083 else 1084 else
1084 rc = ops->write_emulated((unsigned long)dst.ptr, 1085 rc = ops->write_emulated((unsigned long)dst.ptr,
1085 dst.val, dst.bytes, 1086 &dst.val, dst.bytes,
1086 ctxt); 1087 ctxt);
1087 if (rc != 0) 1088 if (rc != 0)
1088 goto done; 1089 goto done;
@@ -1320,36 +1321,8 @@ twobyte_special_insn:
1320 realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags); 1321 realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
1321 break; 1322 break;
1322 case 0xc7: /* Grp9 (cmpxchg8b) */ 1323 case 0xc7: /* Grp9 (cmpxchg8b) */
1323#if defined(__i386__)
1324 {
1325 unsigned long old_lo, old_hi;
1326 if (((rc = ops->read_emulated(cr2 + 0, &old_lo, 4,
1327 ctxt)) != 0)
1328 || ((rc = ops->read_emulated(cr2 + 4, &old_hi, 4,
1329 ctxt)) != 0))
1330 goto done;
1331 if ((old_lo != _regs[VCPU_REGS_RAX])
1332 || (old_hi != _regs[VCPU_REGS_RDX])) {
1333 _regs[VCPU_REGS_RAX] = old_lo;
1334 _regs[VCPU_REGS_RDX] = old_hi;
1335 _eflags &= ~EFLG_ZF;
1336 } else if (ops->cmpxchg8b_emulated == NULL) {
1337 rc = X86EMUL_UNHANDLEABLE;
1338 goto done;
1339 } else {
1340 if ((rc = ops->cmpxchg8b_emulated(cr2, old_lo,
1341 old_hi,
1342 _regs[VCPU_REGS_RBX],
1343 _regs[VCPU_REGS_RCX],
1344 ctxt)) != 0)
1345 goto done;
1346 _eflags |= EFLG_ZF;
1347 }
1348 break;
1349 }
1350#elif defined(CONFIG_X86_64)
1351 { 1324 {
1352 unsigned long old, new; 1325 u64 old, new;
1353 if ((rc = ops->read_emulated(cr2, &old, 8, ctxt)) != 0) 1326 if ((rc = ops->read_emulated(cr2, &old, 8, ctxt)) != 0)
1354 goto done; 1327 goto done;
1355 if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || 1328 if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
@@ -1358,15 +1331,15 @@ twobyte_special_insn:
1358 _regs[VCPU_REGS_RDX] = (u32) (old >> 32); 1331 _regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1359 _eflags &= ~EFLG_ZF; 1332 _eflags &= ~EFLG_ZF;
1360 } else { 1333 } else {
1361 new = (_regs[VCPU_REGS_RCX] << 32) | (u32) _regs[VCPU_REGS_RBX]; 1334 new = ((u64)_regs[VCPU_REGS_RCX] << 32)
1362 if ((rc = ops->cmpxchg_emulated(cr2, old, 1335 | (u32) _regs[VCPU_REGS_RBX];
1363 new, 8, ctxt)) != 0) 1336 if ((rc = ops->cmpxchg_emulated(cr2, &old,
1337 &new, 8, ctxt)) != 0)
1364 goto done; 1338 goto done;
1365 _eflags |= EFLG_ZF; 1339 _eflags |= EFLG_ZF;
1366 } 1340 }
1367 break; 1341 break;
1368 } 1342 }
1369#endif
1370 } 1343 }
1371 goto writeback; 1344 goto writeback;
1372 1345
diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h
index 5d41bd55125e..ea3407d7feee 100644
--- a/drivers/kvm/x86_emulate.h
+++ b/drivers/kvm/x86_emulate.h
@@ -59,8 +59,7 @@ struct x86_emulate_ops {
59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
60 * @bytes: [IN ] Number of bytes to read from memory. 60 * @bytes: [IN ] Number of bytes to read from memory.
61 */ 61 */
62 int (*read_std)(unsigned long addr, 62 int (*read_std)(unsigned long addr, void *val,
63 unsigned long *val,
64 unsigned int bytes, struct x86_emulate_ctxt * ctxt); 63 unsigned int bytes, struct x86_emulate_ctxt * ctxt);
65 64
66 /* 65 /*
@@ -71,8 +70,7 @@ struct x86_emulate_ops {
71 * required). 70 * required).
72 * @bytes: [IN ] Number of bytes to write to memory. 71 * @bytes: [IN ] Number of bytes to write to memory.
73 */ 72 */
74 int (*write_std)(unsigned long addr, 73 int (*write_std)(unsigned long addr, const void *val,
75 unsigned long val,
76 unsigned int bytes, struct x86_emulate_ctxt * ctxt); 74 unsigned int bytes, struct x86_emulate_ctxt * ctxt);
77 75
78 /* 76 /*
@@ -82,7 +80,7 @@ struct x86_emulate_ops {
82 * @bytes: [IN ] Number of bytes to read from memory. 80 * @bytes: [IN ] Number of bytes to read from memory.
83 */ 81 */
84 int (*read_emulated) (unsigned long addr, 82 int (*read_emulated) (unsigned long addr,
85 unsigned long *val, 83 void *val,
86 unsigned int bytes, 84 unsigned int bytes,
87 struct x86_emulate_ctxt * ctxt); 85 struct x86_emulate_ctxt * ctxt);
88 86
@@ -94,7 +92,7 @@ struct x86_emulate_ops {
94 * @bytes: [IN ] Number of bytes to write to memory. 92 * @bytes: [IN ] Number of bytes to write to memory.
95 */ 93 */
96 int (*write_emulated) (unsigned long addr, 94 int (*write_emulated) (unsigned long addr,
97 unsigned long val, 95 const void *val,
98 unsigned int bytes, 96 unsigned int bytes,
99 struct x86_emulate_ctxt * ctxt); 97 struct x86_emulate_ctxt * ctxt);
100 98
@@ -107,29 +105,11 @@ struct x86_emulate_ops {
107 * @bytes: [IN ] Number of bytes to access using CMPXCHG. 105 * @bytes: [IN ] Number of bytes to access using CMPXCHG.
108 */ 106 */
109 int (*cmpxchg_emulated) (unsigned long addr, 107 int (*cmpxchg_emulated) (unsigned long addr,
110 unsigned long old, 108 const void *old,
111 unsigned long new, 109 const void *new,
112 unsigned int bytes, 110 unsigned int bytes,
113 struct x86_emulate_ctxt * ctxt); 111 struct x86_emulate_ctxt * ctxt);
114 112
115 /*
116 * cmpxchg8b_emulated: Emulate an atomic (LOCKed) CMPXCHG8B operation on an
117 * emulated/special memory area.
118 * @addr: [IN ] Linear address to access.
119 * @old: [IN ] Value expected to be current at @addr.
120 * @new: [IN ] Value to write to @addr.
121 * NOTES:
122 * 1. This function is only ever called when emulating a real CMPXCHG8B.
123 * 2. This function is *never* called on x86/64 systems.
124 * 2. Not defining this function (i.e., specifying NULL) is equivalent
125 * to defining a function that always returns X86EMUL_UNHANDLEABLE.
126 */
127 int (*cmpxchg8b_emulated) (unsigned long addr,
128 unsigned long old_lo,
129 unsigned long old_hi,
130 unsigned long new_lo,
131 unsigned long new_hi,
132 struct x86_emulate_ctxt * ctxt);
133}; 113};
134 114
135struct cpu_user_regs; 115struct cpu_user_regs;
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 4ff0f57d0add..9f05279e7dd3 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -96,6 +96,7 @@ header-y += iso_fs.h
96header-y += ixjuser.h 96header-y += ixjuser.h
97header-y += jffs2.h 97header-y += jffs2.h
98header-y += keyctl.h 98header-y += keyctl.h
99header-y += kvm.h
99header-y += limits.h 100header-y += limits.h
100header-y += lock_dlm_plock.h 101header-y += lock_dlm_plock.h
101header-y += magic.h 102header-y += magic.h
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 275354ffa1cb..e6edca81ab84 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -11,7 +11,7 @@
11#include <asm/types.h> 11#include <asm/types.h>
12#include <linux/ioctl.h> 12#include <linux/ioctl.h>
13 13
14#define KVM_API_VERSION 4 14#define KVM_API_VERSION 12
15 15
16/* 16/*
17 * Architectural interrupt line count, and the size of the bitmap needed 17 * Architectural interrupt line count, and the size of the bitmap needed
@@ -33,37 +33,39 @@ struct kvm_memory_region {
33/* for kvm_memory_region::flags */ 33/* for kvm_memory_region::flags */
34#define KVM_MEM_LOG_DIRTY_PAGES 1UL 34#define KVM_MEM_LOG_DIRTY_PAGES 1UL
35 35
36 36struct kvm_memory_alias {
37#define KVM_EXIT_TYPE_FAIL_ENTRY 1 37 __u32 slot; /* this has a different namespace than memory slots */
38#define KVM_EXIT_TYPE_VM_EXIT 2 38 __u32 flags;
39 __u64 guest_phys_addr;
40 __u64 memory_size;
41 __u64 target_phys_addr;
42};
39 43
40enum kvm_exit_reason { 44enum kvm_exit_reason {
41 KVM_EXIT_UNKNOWN = 0, 45 KVM_EXIT_UNKNOWN = 0,
42 KVM_EXIT_EXCEPTION = 1, 46 KVM_EXIT_EXCEPTION = 1,
43 KVM_EXIT_IO = 2, 47 KVM_EXIT_IO = 2,
44 KVM_EXIT_CPUID = 3, 48 KVM_EXIT_HYPERCALL = 3,
45 KVM_EXIT_DEBUG = 4, 49 KVM_EXIT_DEBUG = 4,
46 KVM_EXIT_HLT = 5, 50 KVM_EXIT_HLT = 5,
47 KVM_EXIT_MMIO = 6, 51 KVM_EXIT_MMIO = 6,
48 KVM_EXIT_IRQ_WINDOW_OPEN = 7, 52 KVM_EXIT_IRQ_WINDOW_OPEN = 7,
49 KVM_EXIT_SHUTDOWN = 8, 53 KVM_EXIT_SHUTDOWN = 8,
54 KVM_EXIT_FAIL_ENTRY = 9,
55 KVM_EXIT_INTR = 10,
50}; 56};
51 57
52/* for KVM_RUN */ 58/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
53struct kvm_run { 59struct kvm_run {
54 /* in */ 60 /* in */
55 __u32 emulated; /* skip current instruction */
56 __u32 mmio_completed; /* mmio request completed */
57 __u8 request_interrupt_window; 61 __u8 request_interrupt_window;
58 __u8 padding1[7]; 62 __u8 padding1[7];
59 63
60 /* out */ 64 /* out */
61 __u32 exit_type;
62 __u32 exit_reason; 65 __u32 exit_reason;
63 __u32 instruction_length;
64 __u8 ready_for_interrupt_injection; 66 __u8 ready_for_interrupt_injection;
65 __u8 if_flag; 67 __u8 if_flag;
66 __u16 padding2; 68 __u8 padding2[2];
67 69
68 /* in (pre_kvm_run), out (post_kvm_run) */ 70 /* in (pre_kvm_run), out (post_kvm_run) */
69 __u64 cr8; 71 __u64 cr8;
@@ -72,29 +74,26 @@ struct kvm_run {
72 union { 74 union {
73 /* KVM_EXIT_UNKNOWN */ 75 /* KVM_EXIT_UNKNOWN */
74 struct { 76 struct {
75 __u32 hardware_exit_reason; 77 __u64 hardware_exit_reason;
76 } hw; 78 } hw;
79 /* KVM_EXIT_FAIL_ENTRY */
80 struct {
81 __u64 hardware_entry_failure_reason;
82 } fail_entry;
77 /* KVM_EXIT_EXCEPTION */ 83 /* KVM_EXIT_EXCEPTION */
78 struct { 84 struct {
79 __u32 exception; 85 __u32 exception;
80 __u32 error_code; 86 __u32 error_code;
81 } ex; 87 } ex;
82 /* KVM_EXIT_IO */ 88 /* KVM_EXIT_IO */
83 struct { 89 struct kvm_io {
84#define KVM_EXIT_IO_IN 0 90#define KVM_EXIT_IO_IN 0
85#define KVM_EXIT_IO_OUT 1 91#define KVM_EXIT_IO_OUT 1
86 __u8 direction; 92 __u8 direction;
87 __u8 size; /* bytes */ 93 __u8 size; /* bytes */
88 __u8 string;
89 __u8 string_down;
90 __u8 rep;
91 __u8 pad;
92 __u16 port; 94 __u16 port;
93 __u64 count; 95 __u32 count;
94 union { 96 __u64 data_offset; /* relative to kvm_run start */
95 __u64 address;
96 __u32 value;
97 };
98 } io; 97 } io;
99 struct { 98 struct {
100 } debug; 99 } debug;
@@ -105,6 +104,13 @@ struct kvm_run {
105 __u32 len; 104 __u32 len;
106 __u8 is_write; 105 __u8 is_write;
107 } mmio; 106 } mmio;
107 /* KVM_EXIT_HYPERCALL */
108 struct {
109 __u64 args[6];
110 __u64 ret;
111 __u32 longmode;
112 __u32 pad;
113 } hypercall;
108 }; 114 };
109}; 115};
110 116
@@ -118,6 +124,21 @@ struct kvm_regs {
118 __u64 rip, rflags; 124 __u64 rip, rflags;
119}; 125};
120 126
127/* for KVM_GET_FPU and KVM_SET_FPU */
128struct kvm_fpu {
129 __u8 fpr[8][16];
130 __u16 fcw;
131 __u16 fsw;
132 __u8 ftwx; /* in fxsave format */
133 __u8 pad1;
134 __u16 last_opcode;
135 __u64 last_ip;
136 __u64 last_dp;
137 __u8 xmm[16][16];
138 __u32 mxcsr;
139 __u32 pad2;
140};
141
121struct kvm_segment { 142struct kvm_segment {
122 __u64 base; 143 __u64 base;
123 __u32 limit; 144 __u32 limit;
@@ -210,38 +231,74 @@ struct kvm_dirty_log {
210 }; 231 };
211}; 232};
212 233
234struct kvm_cpuid_entry {
235 __u32 function;
236 __u32 eax;
237 __u32 ebx;
238 __u32 ecx;
239 __u32 edx;
240 __u32 padding;
241};
242
243/* for KVM_SET_CPUID */
244struct kvm_cpuid {
245 __u32 nent;
246 __u32 padding;
247 struct kvm_cpuid_entry entries[0];
248};
249
250/* for KVM_SET_SIGNAL_MASK */
251struct kvm_signal_mask {
252 __u32 len;
253 __u8 sigset[0];
254};
255
213#define KVMIO 0xAE 256#define KVMIO 0xAE
214 257
215/* 258/*
216 * ioctls for /dev/kvm fds: 259 * ioctls for /dev/kvm fds:
217 */ 260 */
218#define KVM_GET_API_VERSION _IO(KVMIO, 1) 261#define KVM_GET_API_VERSION _IO(KVMIO, 0x00)
219#define KVM_CREATE_VM _IO(KVMIO, 2) /* returns a VM fd */ 262#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */
220#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 15, struct kvm_msr_list) 263#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 0x02, struct kvm_msr_list)
264/*
265 * Check if a kvm extension is available. Argument is extension number,
266 * return is 1 (yes) or 0 (no, sorry).
267 */
268#define KVM_CHECK_EXTENSION _IO(KVMIO, 0x03)
269/*
270 * Get size for mmap(vcpu_fd)
271 */
272#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */
221 273
222/* 274/*
223 * ioctls for VM fds 275 * ioctls for VM fds
224 */ 276 */
225#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 10, struct kvm_memory_region) 277#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region)
226/* 278/*
227 * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns 279 * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
228 * a vcpu fd. 280 * a vcpu fd.
229 */ 281 */
230#define KVM_CREATE_VCPU _IOW(KVMIO, 11, int) 282#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
231#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 12, struct kvm_dirty_log) 283#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
284#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
232 285
233/* 286/*
234 * ioctls for vcpu fds 287 * ioctls for vcpu fds
235 */ 288 */
236#define KVM_RUN _IOWR(KVMIO, 2, struct kvm_run) 289#define KVM_RUN _IO(KVMIO, 0x80)
237#define KVM_GET_REGS _IOR(KVMIO, 3, struct kvm_regs) 290#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs)
238#define KVM_SET_REGS _IOW(KVMIO, 4, struct kvm_regs) 291#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs)
239#define KVM_GET_SREGS _IOR(KVMIO, 5, struct kvm_sregs) 292#define KVM_GET_SREGS _IOR(KVMIO, 0x83, struct kvm_sregs)
240#define KVM_SET_SREGS _IOW(KVMIO, 6, struct kvm_sregs) 293#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs)
241#define KVM_TRANSLATE _IOWR(KVMIO, 7, struct kvm_translation) 294#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation)
242#define KVM_INTERRUPT _IOW(KVMIO, 8, struct kvm_interrupt) 295#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt)
243#define KVM_DEBUG_GUEST _IOW(KVMIO, 9, struct kvm_debug_guest) 296#define KVM_DEBUG_GUEST _IOW(KVMIO, 0x87, struct kvm_debug_guest)
244#define KVM_GET_MSRS _IOWR(KVMIO, 13, struct kvm_msrs) 297#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs)
245#define KVM_SET_MSRS _IOW(KVMIO, 14, struct kvm_msrs) 298#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs)
299#define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid)
300#define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask)
301#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu)
302#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu)
246 303
247#endif 304#endif
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 326da7d500c7..dff9ea32606a 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -29,6 +29,7 @@
29 29
30#define TUN_MINOR 200 30#define TUN_MINOR 200
31#define HPET_MINOR 228 31#define HPET_MINOR 228
32#define KVM_MINOR 232
32 33
33struct device; 34struct device;
34 35