diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-01-02 14:41:11 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-01-02 14:41:11 -0500 |
commit | 597b0d21626da4e6f09f132442caf0cc2b0eb47c (patch) | |
tree | 13c0074bb20f7b05a471e78d4ff52c665a10266a /arch/x86 | |
parent | 2640c9a90fa596871e142f42052608864335f102 (diff) | |
parent | 87917239204d67a316cb89751750f86c9ed3640b (diff) |
Merge branch 'kvm-updates/2.6.29' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'kvm-updates/2.6.29' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (140 commits)
KVM: MMU: handle large host sptes on invlpg/resync
KVM: Add locking to virtual i8259 interrupt controller
KVM: MMU: Don't treat a global pte as such if cr4.pge is cleared
MAINTAINERS: Maintainership changes for kvm/ia64
KVM: ia64: Fix kvm_arch_vcpu_ioctl_[gs]et_regs()
KVM: x86: Rework user space NMI injection as KVM_CAP_USER_NMI
KVM: VMX: Fix pending NMI-vs.-IRQ race for user space irqchip
KVM: fix handling of ACK from shared guest IRQ
KVM: MMU: check for present pdptr shadow page in walk_shadow
KVM: Consolidate userspace memory capability reporting into common code
KVM: Advertise the bug in memory region destruction as fixed
KVM: use cpumask_var_t for cpus_hardware_enabled
KVM: use modern cpumask primitives, no cpumask_t on stack
KVM: Extract core of kvm_flush_remote_tlbs/kvm_reload_remote_mmus
KVM: set owner of cpu and vm file operations
anon_inodes: use fops->owner for module refcount
x86: KVM guest: kvm_get_tsc_khz: return khz, not lpj
KVM: MMU: prepopulate the shadow on invlpg
KVM: MMU: skip global pgtables on sync due to cr3 switch
KVM: MMU: collapse remote TLB flushes on root sync
...
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 45 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_x86_emulate.h | 11 | ||||
-rw-r--r-- | arch/x86/include/asm/mtrr.h | 25 | ||||
-rw-r--r-- | arch/x86/include/asm/svm.h (renamed from arch/x86/kvm/svm.h) | 0 | ||||
-rw-r--r-- | arch/x86/include/asm/virtext.h | 132 | ||||
-rw-r--r-- | arch/x86/include/asm/vmx.h (renamed from arch/x86/kvm/vmx.h) | 27 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mtrr/generic.c | 12 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mtrr/main.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mtrr/mtrr.h | 18 | ||||
-rw-r--r-- | arch/x86/kernel/crash.c | 18 | ||||
-rw-r--r-- | arch/x86/kernel/kvmclock.c | 10 | ||||
-rw-r--r-- | arch/x86/kernel/reboot.c | 62 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.c | 19 | ||||
-rw-r--r-- | arch/x86/kvm/i8259.c | 52 | ||||
-rw-r--r-- | arch/x86/kvm/irq.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_svm.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 58 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 444 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 44 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 48 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 350 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 117 | ||||
-rw-r--r-- | arch/x86/kvm/x86_emulate.c | 297 |
23 files changed, 1357 insertions, 444 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8346be87cfa1..97215a458e5f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h | |||
@@ -21,6 +21,7 @@ | |||
21 | 21 | ||
22 | #include <asm/pvclock-abi.h> | 22 | #include <asm/pvclock-abi.h> |
23 | #include <asm/desc.h> | 23 | #include <asm/desc.h> |
24 | #include <asm/mtrr.h> | ||
24 | 25 | ||
25 | #define KVM_MAX_VCPUS 16 | 26 | #define KVM_MAX_VCPUS 16 |
26 | #define KVM_MEMORY_SLOTS 32 | 27 | #define KVM_MEMORY_SLOTS 32 |
@@ -86,6 +87,7 @@ | |||
86 | #define KVM_MIN_FREE_MMU_PAGES 5 | 87 | #define KVM_MIN_FREE_MMU_PAGES 5 |
87 | #define KVM_REFILL_PAGES 25 | 88 | #define KVM_REFILL_PAGES 25 |
88 | #define KVM_MAX_CPUID_ENTRIES 40 | 89 | #define KVM_MAX_CPUID_ENTRIES 40 |
90 | #define KVM_NR_FIXED_MTRR_REGION 88 | ||
89 | #define KVM_NR_VAR_MTRR 8 | 91 | #define KVM_NR_VAR_MTRR 8 |
90 | 92 | ||
91 | extern spinlock_t kvm_lock; | 93 | extern spinlock_t kvm_lock; |
@@ -180,6 +182,8 @@ struct kvm_mmu_page { | |||
180 | struct list_head link; | 182 | struct list_head link; |
181 | struct hlist_node hash_link; | 183 | struct hlist_node hash_link; |
182 | 184 | ||
185 | struct list_head oos_link; | ||
186 | |||
183 | /* | 187 | /* |
184 | * The following two entries are used to key the shadow page in the | 188 | * The following two entries are used to key the shadow page in the |
185 | * hash table. | 189 | * hash table. |
@@ -190,13 +194,16 @@ struct kvm_mmu_page { | |||
190 | u64 *spt; | 194 | u64 *spt; |
191 | /* hold the gfn of each spte inside spt */ | 195 | /* hold the gfn of each spte inside spt */ |
192 | gfn_t *gfns; | 196 | gfn_t *gfns; |
193 | unsigned long slot_bitmap; /* One bit set per slot which has memory | 197 | /* |
194 | * in this shadow page. | 198 | * One bit set per slot which has memory |
195 | */ | 199 | * in this shadow page. |
200 | */ | ||
201 | DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); | ||
196 | int multimapped; /* More than one parent_pte? */ | 202 | int multimapped; /* More than one parent_pte? */ |
197 | int root_count; /* Currently serving as active root */ | 203 | int root_count; /* Currently serving as active root */ |
198 | bool unsync; | 204 | bool unsync; |
199 | bool unsync_children; | 205 | bool global; |
206 | unsigned int unsync_children; | ||
200 | union { | 207 | union { |
201 | u64 *parent_pte; /* !multimapped */ | 208 | u64 *parent_pte; /* !multimapped */ |
202 | struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ | 209 | struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ |
@@ -327,8 +334,10 @@ struct kvm_vcpu_arch { | |||
327 | 334 | ||
328 | bool nmi_pending; | 335 | bool nmi_pending; |
329 | bool nmi_injected; | 336 | bool nmi_injected; |
337 | bool nmi_window_open; | ||
330 | 338 | ||
331 | u64 mtrr[0x100]; | 339 | struct mtrr_state_type mtrr_state; |
340 | u32 pat; | ||
332 | }; | 341 | }; |
333 | 342 | ||
334 | struct kvm_mem_alias { | 343 | struct kvm_mem_alias { |
@@ -350,11 +359,13 @@ struct kvm_arch{ | |||
350 | */ | 359 | */ |
351 | struct list_head active_mmu_pages; | 360 | struct list_head active_mmu_pages; |
352 | struct list_head assigned_dev_head; | 361 | struct list_head assigned_dev_head; |
362 | struct list_head oos_global_pages; | ||
353 | struct dmar_domain *intel_iommu_domain; | 363 | struct dmar_domain *intel_iommu_domain; |
354 | struct kvm_pic *vpic; | 364 | struct kvm_pic *vpic; |
355 | struct kvm_ioapic *vioapic; | 365 | struct kvm_ioapic *vioapic; |
356 | struct kvm_pit *vpit; | 366 | struct kvm_pit *vpit; |
357 | struct hlist_head irq_ack_notifier_list; | 367 | struct hlist_head irq_ack_notifier_list; |
368 | int vapics_in_nmi_mode; | ||
358 | 369 | ||
359 | int round_robin_prev_vcpu; | 370 | int round_robin_prev_vcpu; |
360 | unsigned int tss_addr; | 371 | unsigned int tss_addr; |
@@ -378,6 +389,7 @@ struct kvm_vm_stat { | |||
378 | u32 mmu_recycled; | 389 | u32 mmu_recycled; |
379 | u32 mmu_cache_miss; | 390 | u32 mmu_cache_miss; |
380 | u32 mmu_unsync; | 391 | u32 mmu_unsync; |
392 | u32 mmu_unsync_global; | ||
381 | u32 remote_tlb_flush; | 393 | u32 remote_tlb_flush; |
382 | u32 lpages; | 394 | u32 lpages; |
383 | }; | 395 | }; |
@@ -397,6 +409,7 @@ struct kvm_vcpu_stat { | |||
397 | u32 halt_exits; | 409 | u32 halt_exits; |
398 | u32 halt_wakeup; | 410 | u32 halt_wakeup; |
399 | u32 request_irq_exits; | 411 | u32 request_irq_exits; |
412 | u32 request_nmi_exits; | ||
400 | u32 irq_exits; | 413 | u32 irq_exits; |
401 | u32 host_state_reload; | 414 | u32 host_state_reload; |
402 | u32 efer_reload; | 415 | u32 efer_reload; |
@@ -405,6 +418,7 @@ struct kvm_vcpu_stat { | |||
405 | u32 insn_emulation_fail; | 418 | u32 insn_emulation_fail; |
406 | u32 hypercalls; | 419 | u32 hypercalls; |
407 | u32 irq_injections; | 420 | u32 irq_injections; |
421 | u32 nmi_injections; | ||
408 | }; | 422 | }; |
409 | 423 | ||
410 | struct descriptor_table { | 424 | struct descriptor_table { |
@@ -477,6 +491,7 @@ struct kvm_x86_ops { | |||
477 | 491 | ||
478 | int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); | 492 | int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); |
479 | int (*get_tdp_level)(void); | 493 | int (*get_tdp_level)(void); |
494 | int (*get_mt_mask_shift)(void); | ||
480 | }; | 495 | }; |
481 | 496 | ||
482 | extern struct kvm_x86_ops *kvm_x86_ops; | 497 | extern struct kvm_x86_ops *kvm_x86_ops; |
@@ -490,7 +505,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu); | |||
490 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); | 505 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); |
491 | void kvm_mmu_set_base_ptes(u64 base_pte); | 506 | void kvm_mmu_set_base_ptes(u64 base_pte); |
492 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | 507 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, |
493 | u64 dirty_mask, u64 nx_mask, u64 x_mask); | 508 | u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask); |
494 | 509 | ||
495 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); | 510 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); |
496 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); | 511 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); |
@@ -587,12 +602,14 @@ unsigned long segment_base(u16 selector); | |||
587 | 602 | ||
588 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); | 603 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); |
589 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | 604 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, |
590 | const u8 *new, int bytes); | 605 | const u8 *new, int bytes, |
606 | bool guest_initiated); | ||
591 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); | 607 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); |
592 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); | 608 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); |
593 | int kvm_mmu_load(struct kvm_vcpu *vcpu); | 609 | int kvm_mmu_load(struct kvm_vcpu *vcpu); |
594 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); | 610 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); |
595 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); | 611 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); |
612 | void kvm_mmu_sync_global(struct kvm_vcpu *vcpu); | ||
596 | 613 | ||
597 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); | 614 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); |
598 | 615 | ||
@@ -607,6 +624,8 @@ void kvm_disable_tdp(void); | |||
607 | int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); | 624 | int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); |
608 | int complete_pio(struct kvm_vcpu *vcpu); | 625 | int complete_pio(struct kvm_vcpu *vcpu); |
609 | 626 | ||
627 | struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); | ||
628 | |||
610 | static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) | 629 | static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) |
611 | { | 630 | { |
612 | struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); | 631 | struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); |
@@ -702,18 +721,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) | |||
702 | kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); | 721 | kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); |
703 | } | 722 | } |
704 | 723 | ||
705 | #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" | ||
706 | #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" | ||
707 | #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" | ||
708 | #define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" | ||
709 | #define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" | ||
710 | #define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" | ||
711 | #define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" | ||
712 | #define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" | ||
713 | #define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" | ||
714 | #define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" | ||
715 | #define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" | ||
716 | |||
717 | #define MSR_IA32_TIME_STAMP_COUNTER 0x010 | 724 | #define MSR_IA32_TIME_STAMP_COUNTER 0x010 |
718 | 725 | ||
719 | #define TSS_IOPB_BASE_OFFSET 0x66 | 726 | #define TSS_IOPB_BASE_OFFSET 0x66 |
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h index 25179a29f208..6a159732881a 100644 --- a/arch/x86/include/asm/kvm_x86_emulate.h +++ b/arch/x86/include/asm/kvm_x86_emulate.h | |||
@@ -123,6 +123,7 @@ struct decode_cache { | |||
123 | u8 ad_bytes; | 123 | u8 ad_bytes; |
124 | u8 rex_prefix; | 124 | u8 rex_prefix; |
125 | struct operand src; | 125 | struct operand src; |
126 | struct operand src2; | ||
126 | struct operand dst; | 127 | struct operand dst; |
127 | bool has_seg_override; | 128 | bool has_seg_override; |
128 | u8 seg_override; | 129 | u8 seg_override; |
@@ -146,22 +147,18 @@ struct x86_emulate_ctxt { | |||
146 | /* Register state before/after emulation. */ | 147 | /* Register state before/after emulation. */ |
147 | struct kvm_vcpu *vcpu; | 148 | struct kvm_vcpu *vcpu; |
148 | 149 | ||
149 | /* Linear faulting address (if emulating a page-faulting instruction) */ | ||
150 | unsigned long eflags; | 150 | unsigned long eflags; |
151 | |||
152 | /* Emulated execution mode, represented by an X86EMUL_MODE value. */ | 151 | /* Emulated execution mode, represented by an X86EMUL_MODE value. */ |
153 | int mode; | 152 | int mode; |
154 | |||
155 | u32 cs_base; | 153 | u32 cs_base; |
156 | 154 | ||
157 | /* decode cache */ | 155 | /* decode cache */ |
158 | |||
159 | struct decode_cache decode; | 156 | struct decode_cache decode; |
160 | }; | 157 | }; |
161 | 158 | ||
162 | /* Repeat String Operation Prefix */ | 159 | /* Repeat String Operation Prefix */ |
163 | #define REPE_PREFIX 1 | 160 | #define REPE_PREFIX 1 |
164 | #define REPNE_PREFIX 2 | 161 | #define REPNE_PREFIX 2 |
165 | 162 | ||
166 | /* Execution mode, passed to the emulator. */ | 163 | /* Execution mode, passed to the emulator. */ |
167 | #define X86EMUL_MODE_REAL 0 /* Real mode. */ | 164 | #define X86EMUL_MODE_REAL 0 /* Real mode. */ |
@@ -170,7 +167,7 @@ struct x86_emulate_ctxt { | |||
170 | #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ | 167 | #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ |
171 | 168 | ||
172 | /* Host execution mode. */ | 169 | /* Host execution mode. */ |
173 | #if defined(__i386__) | 170 | #if defined(CONFIG_X86_32) |
174 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 | 171 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 |
175 | #elif defined(CONFIG_X86_64) | 172 | #elif defined(CONFIG_X86_64) |
176 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 | 173 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 |
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 7c1e4258b31e..cb988aab716d 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h | |||
@@ -57,6 +57,31 @@ struct mtrr_gentry { | |||
57 | }; | 57 | }; |
58 | #endif /* !__i386__ */ | 58 | #endif /* !__i386__ */ |
59 | 59 | ||
60 | struct mtrr_var_range { | ||
61 | u32 base_lo; | ||
62 | u32 base_hi; | ||
63 | u32 mask_lo; | ||
64 | u32 mask_hi; | ||
65 | }; | ||
66 | |||
67 | /* In the Intel processor's MTRR interface, the MTRR type is always held in | ||
68 | an 8 bit field: */ | ||
69 | typedef u8 mtrr_type; | ||
70 | |||
71 | #define MTRR_NUM_FIXED_RANGES 88 | ||
72 | #define MTRR_MAX_VAR_RANGES 256 | ||
73 | |||
74 | struct mtrr_state_type { | ||
75 | struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; | ||
76 | mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES]; | ||
77 | unsigned char enabled; | ||
78 | unsigned char have_fixed; | ||
79 | mtrr_type def_type; | ||
80 | }; | ||
81 | |||
82 | #define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) | ||
83 | #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) | ||
84 | |||
60 | /* These are the various ioctls */ | 85 | /* These are the various ioctls */ |
61 | #define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry) | 86 | #define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry) |
62 | #define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry) | 87 | #define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry) |
diff --git a/arch/x86/kvm/svm.h b/arch/x86/include/asm/svm.h index 1b8afa78e869..1b8afa78e869 100644 --- a/arch/x86/kvm/svm.h +++ b/arch/x86/include/asm/svm.h | |||
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h new file mode 100644 index 000000000000..593636275238 --- /dev/null +++ b/arch/x86/include/asm/virtext.h | |||
@@ -0,0 +1,132 @@ | |||
1 | /* CPU virtualization extensions handling | ||
2 | * | ||
3 | * This should carry the code for handling CPU virtualization extensions | ||
4 | * that needs to live in the kernel core. | ||
5 | * | ||
6 | * Author: Eduardo Habkost <ehabkost@redhat.com> | ||
7 | * | ||
8 | * Copyright (C) 2008, Red Hat Inc. | ||
9 | * | ||
10 | * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. | ||
11 | * | ||
12 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
13 | * the COPYING file in the top-level directory. | ||
14 | */ | ||
15 | #ifndef _ASM_X86_VIRTEX_H | ||
16 | #define _ASM_X86_VIRTEX_H | ||
17 | |||
18 | #include <asm/processor.h> | ||
19 | #include <asm/system.h> | ||
20 | |||
21 | #include <asm/vmx.h> | ||
22 | #include <asm/svm.h> | ||
23 | |||
24 | /* | ||
25 | * VMX functions: | ||
26 | */ | ||
27 | |||
28 | static inline int cpu_has_vmx(void) | ||
29 | { | ||
30 | unsigned long ecx = cpuid_ecx(1); | ||
31 | return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ | ||
32 | } | ||
33 | |||
34 | |||
35 | /** Disable VMX on the current CPU | ||
36 | * | ||
37 | * vmxoff causes an undefined-opcode exception if vmxon was not run | ||
38 | * on the CPU previously. Only call this function if you know VMX | ||
39 | * is enabled. | ||
40 | */ | ||
41 | static inline void cpu_vmxoff(void) | ||
42 | { | ||
43 | asm volatile (ASM_VMX_VMXOFF : : : "cc"); | ||
44 | write_cr4(read_cr4() & ~X86_CR4_VMXE); | ||
45 | } | ||
46 | |||
47 | static inline int cpu_vmx_enabled(void) | ||
48 | { | ||
49 | return read_cr4() & X86_CR4_VMXE; | ||
50 | } | ||
51 | |||
52 | /** Disable VMX if it is enabled on the current CPU | ||
53 | * | ||
54 | * You shouldn't call this if cpu_has_vmx() returns 0. | ||
55 | */ | ||
56 | static inline void __cpu_emergency_vmxoff(void) | ||
57 | { | ||
58 | if (cpu_vmx_enabled()) | ||
59 | cpu_vmxoff(); | ||
60 | } | ||
61 | |||
62 | /** Disable VMX if it is supported and enabled on the current CPU | ||
63 | */ | ||
64 | static inline void cpu_emergency_vmxoff(void) | ||
65 | { | ||
66 | if (cpu_has_vmx()) | ||
67 | __cpu_emergency_vmxoff(); | ||
68 | } | ||
69 | |||
70 | |||
71 | |||
72 | |||
73 | /* | ||
74 | * SVM functions: | ||
75 | */ | ||
76 | |||
77 | /** Check if the CPU has SVM support | ||
78 | * | ||
79 | * You can use the 'msg' arg to get a message describing the problem, | ||
80 | * if the function returns zero. Simply pass NULL if you are not interested | ||
81 | * in the messages; gcc should take care of not generating code for | ||
82 | * the messages in this case. | ||
83 | */ | ||
84 | static inline int cpu_has_svm(const char **msg) | ||
85 | { | ||
86 | uint32_t eax, ebx, ecx, edx; | ||
87 | |||
88 | if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { | ||
89 | if (msg) | ||
90 | *msg = "not amd"; | ||
91 | return 0; | ||
92 | } | ||
93 | |||
94 | cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | ||
95 | if (eax < SVM_CPUID_FUNC) { | ||
96 | if (msg) | ||
97 | *msg = "can't execute cpuid_8000000a"; | ||
98 | return 0; | ||
99 | } | ||
100 | |||
101 | cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | ||
102 | if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { | ||
103 | if (msg) | ||
104 | *msg = "svm not available"; | ||
105 | return 0; | ||
106 | } | ||
107 | return 1; | ||
108 | } | ||
109 | |||
110 | |||
111 | /** Disable SVM on the current CPU | ||
112 | * | ||
113 | * You should call this only if cpu_has_svm() returned true. | ||
114 | */ | ||
115 | static inline void cpu_svm_disable(void) | ||
116 | { | ||
117 | uint64_t efer; | ||
118 | |||
119 | wrmsrl(MSR_VM_HSAVE_PA, 0); | ||
120 | rdmsrl(MSR_EFER, efer); | ||
121 | wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); | ||
122 | } | ||
123 | |||
124 | /** Makes sure SVM is disabled, if it is supported on the CPU | ||
125 | */ | ||
126 | static inline void cpu_emergency_svm_disable(void) | ||
127 | { | ||
128 | if (cpu_has_svm(NULL)) | ||
129 | cpu_svm_disable(); | ||
130 | } | ||
131 | |||
132 | #endif /* _ASM_X86_VIRTEX_H */ | ||
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/include/asm/vmx.h index ec5edc339da6..d0238e6151d8 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/include/asm/vmx.h | |||
@@ -63,10 +63,13 @@ | |||
63 | 63 | ||
64 | #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 | 64 | #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 |
65 | #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 | 65 | #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 |
66 | #define VM_EXIT_SAVE_IA32_PAT 0x00040000 | ||
67 | #define VM_EXIT_LOAD_IA32_PAT 0x00080000 | ||
66 | 68 | ||
67 | #define VM_ENTRY_IA32E_MODE 0x00000200 | 69 | #define VM_ENTRY_IA32E_MODE 0x00000200 |
68 | #define VM_ENTRY_SMM 0x00000400 | 70 | #define VM_ENTRY_SMM 0x00000400 |
69 | #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 | 71 | #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 |
72 | #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 | ||
70 | 73 | ||
71 | /* VMCS Encodings */ | 74 | /* VMCS Encodings */ |
72 | enum vmcs_field { | 75 | enum vmcs_field { |
@@ -112,6 +115,8 @@ enum vmcs_field { | |||
112 | VMCS_LINK_POINTER_HIGH = 0x00002801, | 115 | VMCS_LINK_POINTER_HIGH = 0x00002801, |
113 | GUEST_IA32_DEBUGCTL = 0x00002802, | 116 | GUEST_IA32_DEBUGCTL = 0x00002802, |
114 | GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, | 117 | GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, |
118 | GUEST_IA32_PAT = 0x00002804, | ||
119 | GUEST_IA32_PAT_HIGH = 0x00002805, | ||
115 | GUEST_PDPTR0 = 0x0000280a, | 120 | GUEST_PDPTR0 = 0x0000280a, |
116 | GUEST_PDPTR0_HIGH = 0x0000280b, | 121 | GUEST_PDPTR0_HIGH = 0x0000280b, |
117 | GUEST_PDPTR1 = 0x0000280c, | 122 | GUEST_PDPTR1 = 0x0000280c, |
@@ -120,6 +125,8 @@ enum vmcs_field { | |||
120 | GUEST_PDPTR2_HIGH = 0x0000280f, | 125 | GUEST_PDPTR2_HIGH = 0x0000280f, |
121 | GUEST_PDPTR3 = 0x00002810, | 126 | GUEST_PDPTR3 = 0x00002810, |
122 | GUEST_PDPTR3_HIGH = 0x00002811, | 127 | GUEST_PDPTR3_HIGH = 0x00002811, |
128 | HOST_IA32_PAT = 0x00002c00, | ||
129 | HOST_IA32_PAT_HIGH = 0x00002c01, | ||
123 | PIN_BASED_VM_EXEC_CONTROL = 0x00004000, | 130 | PIN_BASED_VM_EXEC_CONTROL = 0x00004000, |
124 | CPU_BASED_VM_EXEC_CONTROL = 0x00004002, | 131 | CPU_BASED_VM_EXEC_CONTROL = 0x00004002, |
125 | EXCEPTION_BITMAP = 0x00004004, | 132 | EXCEPTION_BITMAP = 0x00004004, |
@@ -331,8 +338,9 @@ enum vmcs_field { | |||
331 | 338 | ||
332 | #define AR_RESERVD_MASK 0xfffe0f00 | 339 | #define AR_RESERVD_MASK 0xfffe0f00 |
333 | 340 | ||
334 | #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 | 341 | #define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0) |
335 | #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 | 342 | #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1) |
343 | #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2) | ||
336 | 344 | ||
337 | #define VMX_NR_VPIDS (1 << 16) | 345 | #define VMX_NR_VPIDS (1 << 16) |
338 | #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 | 346 | #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 |
@@ -356,4 +364,19 @@ enum vmcs_field { | |||
356 | 364 | ||
357 | #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul | 365 | #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul |
358 | 366 | ||
367 | |||
368 | #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" | ||
369 | #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" | ||
370 | #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" | ||
371 | #define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" | ||
372 | #define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" | ||
373 | #define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" | ||
374 | #define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" | ||
375 | #define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" | ||
376 | #define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" | ||
377 | #define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" | ||
378 | #define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" | ||
379 | |||
380 | |||
381 | |||
359 | #endif | 382 | #endif |
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 4e8d77f01eeb..b59ddcc88cd8 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c | |||
@@ -14,14 +14,6 @@ | |||
14 | #include <asm/pat.h> | 14 | #include <asm/pat.h> |
15 | #include "mtrr.h" | 15 | #include "mtrr.h" |
16 | 16 | ||
17 | struct mtrr_state { | ||
18 | struct mtrr_var_range var_ranges[MAX_VAR_RANGES]; | ||
19 | mtrr_type fixed_ranges[NUM_FIXED_RANGES]; | ||
20 | unsigned char enabled; | ||
21 | unsigned char have_fixed; | ||
22 | mtrr_type def_type; | ||
23 | }; | ||
24 | |||
25 | struct fixed_range_block { | 17 | struct fixed_range_block { |
26 | int base_msr; /* start address of an MTRR block */ | 18 | int base_msr; /* start address of an MTRR block */ |
27 | int ranges; /* number of MTRRs in this block */ | 19 | int ranges; /* number of MTRRs in this block */ |
@@ -35,10 +27,12 @@ static struct fixed_range_block fixed_range_blocks[] = { | |||
35 | }; | 27 | }; |
36 | 28 | ||
37 | static unsigned long smp_changes_mask; | 29 | static unsigned long smp_changes_mask; |
38 | static struct mtrr_state mtrr_state = {}; | ||
39 | static int mtrr_state_set; | 30 | static int mtrr_state_set; |
40 | u64 mtrr_tom2; | 31 | u64 mtrr_tom2; |
41 | 32 | ||
33 | struct mtrr_state_type mtrr_state = {}; | ||
34 | EXPORT_SYMBOL_GPL(mtrr_state); | ||
35 | |||
42 | #undef MODULE_PARAM_PREFIX | 36 | #undef MODULE_PARAM_PREFIX |
43 | #define MODULE_PARAM_PREFIX "mtrr." | 37 | #define MODULE_PARAM_PREFIX "mtrr." |
44 | 38 | ||
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 1159e269e596..d6ec7ec30274 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c | |||
@@ -49,7 +49,7 @@ | |||
49 | 49 | ||
50 | u32 num_var_ranges = 0; | 50 | u32 num_var_ranges = 0; |
51 | 51 | ||
52 | unsigned int mtrr_usage_table[MAX_VAR_RANGES]; | 52 | unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; |
53 | static DEFINE_MUTEX(mtrr_mutex); | 53 | static DEFINE_MUTEX(mtrr_mutex); |
54 | 54 | ||
55 | u64 size_or_mask, size_and_mask; | 55 | u64 size_or_mask, size_and_mask; |
@@ -574,7 +574,7 @@ struct mtrr_value { | |||
574 | unsigned long lsize; | 574 | unsigned long lsize; |
575 | }; | 575 | }; |
576 | 576 | ||
577 | static struct mtrr_value mtrr_state[MAX_VAR_RANGES]; | 577 | static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES]; |
578 | 578 | ||
579 | static int mtrr_save(struct sys_device * sysdev, pm_message_t state) | 579 | static int mtrr_save(struct sys_device * sysdev, pm_message_t state) |
580 | { | 580 | { |
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 2dc4ec656b23..ffd60409cc6d 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h | |||
@@ -8,11 +8,6 @@ | |||
8 | #define MTRRcap_MSR 0x0fe | 8 | #define MTRRcap_MSR 0x0fe |
9 | #define MTRRdefType_MSR 0x2ff | 9 | #define MTRRdefType_MSR 0x2ff |
10 | 10 | ||
11 | #define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) | ||
12 | #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) | ||
13 | |||
14 | #define NUM_FIXED_RANGES 88 | ||
15 | #define MAX_VAR_RANGES 256 | ||
16 | #define MTRRfix64K_00000_MSR 0x250 | 11 | #define MTRRfix64K_00000_MSR 0x250 |
17 | #define MTRRfix16K_80000_MSR 0x258 | 12 | #define MTRRfix16K_80000_MSR 0x258 |
18 | #define MTRRfix16K_A0000_MSR 0x259 | 13 | #define MTRRfix16K_A0000_MSR 0x259 |
@@ -29,11 +24,7 @@ | |||
29 | #define MTRR_CHANGE_MASK_VARIABLE 0x02 | 24 | #define MTRR_CHANGE_MASK_VARIABLE 0x02 |
30 | #define MTRR_CHANGE_MASK_DEFTYPE 0x04 | 25 | #define MTRR_CHANGE_MASK_DEFTYPE 0x04 |
31 | 26 | ||
32 | /* In the Intel processor's MTRR interface, the MTRR type is always held in | 27 | extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; |
33 | an 8 bit field: */ | ||
34 | typedef u8 mtrr_type; | ||
35 | |||
36 | extern unsigned int mtrr_usage_table[MAX_VAR_RANGES]; | ||
37 | 28 | ||
38 | struct mtrr_ops { | 29 | struct mtrr_ops { |
39 | u32 vendor; | 30 | u32 vendor; |
@@ -70,13 +61,6 @@ struct set_mtrr_context { | |||
70 | u32 ccr3; | 61 | u32 ccr3; |
71 | }; | 62 | }; |
72 | 63 | ||
73 | struct mtrr_var_range { | ||
74 | u32 base_lo; | ||
75 | u32 base_hi; | ||
76 | u32 mask_lo; | ||
77 | u32 mask_hi; | ||
78 | }; | ||
79 | |||
80 | void set_mtrr_done(struct set_mtrr_context *ctxt); | 64 | void set_mtrr_done(struct set_mtrr_context *ctxt); |
81 | void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); | 65 | void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); |
82 | void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); | 66 | void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); |
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index d84a852e4cd7..c689d19e35ab 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/kdebug.h> | 26 | #include <linux/kdebug.h> |
27 | #include <asm/smp.h> | 27 | #include <asm/smp.h> |
28 | #include <asm/reboot.h> | 28 | #include <asm/reboot.h> |
29 | #include <asm/virtext.h> | ||
29 | 30 | ||
30 | #include <mach_ipi.h> | 31 | #include <mach_ipi.h> |
31 | 32 | ||
@@ -49,6 +50,15 @@ static void kdump_nmi_callback(int cpu, struct die_args *args) | |||
49 | #endif | 50 | #endif |
50 | crash_save_cpu(regs, cpu); | 51 | crash_save_cpu(regs, cpu); |
51 | 52 | ||
53 | /* Disable VMX or SVM if needed. | ||
54 | * | ||
55 | * We need to disable virtualization on all CPUs. | ||
56 | * Having VMX or SVM enabled on any CPU may break rebooting | ||
57 | * after the kdump kernel has finished its task. | ||
58 | */ | ||
59 | cpu_emergency_vmxoff(); | ||
60 | cpu_emergency_svm_disable(); | ||
61 | |||
52 | disable_local_APIC(); | 62 | disable_local_APIC(); |
53 | } | 63 | } |
54 | 64 | ||
@@ -80,6 +90,14 @@ void native_machine_crash_shutdown(struct pt_regs *regs) | |||
80 | local_irq_disable(); | 90 | local_irq_disable(); |
81 | 91 | ||
82 | kdump_nmi_shootdown_cpus(); | 92 | kdump_nmi_shootdown_cpus(); |
93 | |||
94 | /* Booting kdump kernel with VMX or SVM enabled won't work, | ||
95 | * because (among other limitations) we can't disable paging | ||
96 | * with the virt flags. | ||
97 | */ | ||
98 | cpu_emergency_vmxoff(); | ||
99 | cpu_emergency_svm_disable(); | ||
100 | |||
83 | lapic_shutdown(); | 101 | lapic_shutdown(); |
84 | #if defined(CONFIG_X86_IO_APIC) | 102 | #if defined(CONFIG_X86_IO_APIC) |
85 | disable_IO_APIC(); | 103 | disable_IO_APIC(); |
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e169ae9b6a62..652fce6d2cce 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void) | |||
89 | */ | 89 | */ |
90 | static unsigned long kvm_get_tsc_khz(void) | 90 | static unsigned long kvm_get_tsc_khz(void) |
91 | { | 91 | { |
92 | return preset_lpj; | 92 | struct pvclock_vcpu_time_info *src; |
93 | src = &per_cpu(hv_clock, 0); | ||
94 | return pvclock_tsc_khz(src); | ||
93 | } | 95 | } |
94 | 96 | ||
95 | static void kvm_get_preset_lpj(void) | 97 | static void kvm_get_preset_lpj(void) |
96 | { | 98 | { |
97 | struct pvclock_vcpu_time_info *src; | ||
98 | unsigned long khz; | 99 | unsigned long khz; |
99 | u64 lpj; | 100 | u64 lpj; |
100 | 101 | ||
101 | src = &per_cpu(hv_clock, 0); | 102 | khz = kvm_get_tsc_khz(); |
102 | khz = pvclock_tsc_khz(src); | ||
103 | 103 | ||
104 | lpj = ((u64)khz * 1000); | 104 | lpj = ((u64)khz * 1000); |
105 | do_div(lpj, HZ); | 105 | do_div(lpj, HZ); |
@@ -194,5 +194,7 @@ void __init kvmclock_init(void) | |||
194 | #endif | 194 | #endif |
195 | kvm_get_preset_lpj(); | 195 | kvm_get_preset_lpj(); |
196 | clocksource_register(&kvm_clock); | 196 | clocksource_register(&kvm_clock); |
197 | pv_info.paravirt_enabled = 1; | ||
198 | pv_info.name = "KVM"; | ||
197 | } | 199 | } |
198 | } | 200 | } |
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 61f718df6eec..72e0e4e712d6 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <asm/proto.h> | 12 | #include <asm/proto.h> |
13 | #include <asm/reboot_fixups.h> | 13 | #include <asm/reboot_fixups.h> |
14 | #include <asm/reboot.h> | 14 | #include <asm/reboot.h> |
15 | #include <asm/virtext.h> | ||
15 | 16 | ||
16 | #ifdef CONFIG_X86_32 | 17 | #ifdef CONFIG_X86_32 |
17 | # include <linux/dmi.h> | 18 | # include <linux/dmi.h> |
@@ -39,6 +40,12 @@ int reboot_force; | |||
39 | static int reboot_cpu = -1; | 40 | static int reboot_cpu = -1; |
40 | #endif | 41 | #endif |
41 | 42 | ||
43 | /* This is set if we need to go through the 'emergency' path. | ||
44 | * When machine_emergency_restart() is called, we may be in | ||
45 | * an inconsistent state and won't be able to do a clean cleanup | ||
46 | */ | ||
47 | static int reboot_emergency; | ||
48 | |||
42 | /* This is set by the PCI code if either type 1 or type 2 PCI is detected */ | 49 | /* This is set by the PCI code if either type 1 or type 2 PCI is detected */ |
43 | bool port_cf9_safe = false; | 50 | bool port_cf9_safe = false; |
44 | 51 | ||
@@ -368,6 +375,48 @@ static inline void kb_wait(void) | |||
368 | } | 375 | } |
369 | } | 376 | } |
370 | 377 | ||
378 | static void vmxoff_nmi(int cpu, struct die_args *args) | ||
379 | { | ||
380 | cpu_emergency_vmxoff(); | ||
381 | } | ||
382 | |||
383 | /* Use NMIs as IPIs to tell all CPUs to disable virtualization | ||
384 | */ | ||
385 | static void emergency_vmx_disable_all(void) | ||
386 | { | ||
387 | /* Just make sure we won't change CPUs while doing this */ | ||
388 | local_irq_disable(); | ||
389 | |||
390 | /* We need to disable VMX on all CPUs before rebooting, otherwise | ||
391 | * we risk hanging up the machine, because the CPU ignores INIT | ||
392 | * signals when VMX is enabled. | ||
393 | * | ||
394 | * We can't take any locks and we may be in an inconsistent | ||
395 | * state, so we use NMIs as IPIs to tell the other CPUs to disable | ||
396 | * VMX and halt. | ||
397 | * | ||
398 | * For safety, we will avoid running the nmi_shootdown_cpus() | ||
399 | * stuff unnecessarily, but we don't have a way to check | ||
400 | * if other CPUs have VMX enabled. So we will call it only if the | ||
401 | * CPU we are running on has VMX enabled. | ||
402 | * | ||
403 | * We will miss cases where VMX is not enabled on all CPUs. This | ||
404 | * shouldn't do much harm because KVM always enables VMX on all | ||
405 | * CPUs anyway. But we can miss it in the small window where KVM | ||
406 | * is still enabling VMX. | ||
407 | */ | ||
408 | if (cpu_has_vmx() && cpu_vmx_enabled()) { | ||
409 | /* Disable VMX on this CPU. | ||
410 | */ | ||
411 | cpu_vmxoff(); | ||
412 | |||
413 | /* Halt and disable VMX on the other CPUs */ | ||
414 | nmi_shootdown_cpus(vmxoff_nmi); | ||
415 | |||
416 | } | ||
417 | } | ||
418 | |||
419 | |||
371 | void __attribute__((weak)) mach_reboot_fixups(void) | 420 | void __attribute__((weak)) mach_reboot_fixups(void) |
372 | { | 421 | { |
373 | } | 422 | } |
@@ -376,6 +425,9 @@ static void native_machine_emergency_restart(void) | |||
376 | { | 425 | { |
377 | int i; | 426 | int i; |
378 | 427 | ||
428 | if (reboot_emergency) | ||
429 | emergency_vmx_disable_all(); | ||
430 | |||
379 | /* Tell the BIOS if we want cold or warm reboot */ | 431 | /* Tell the BIOS if we want cold or warm reboot */ |
380 | *((unsigned short *)__va(0x472)) = reboot_mode; | 432 | *((unsigned short *)__va(0x472)) = reboot_mode; |
381 | 433 | ||
@@ -482,13 +534,19 @@ void native_machine_shutdown(void) | |||
482 | #endif | 534 | #endif |
483 | } | 535 | } |
484 | 536 | ||
537 | static void __machine_emergency_restart(int emergency) | ||
538 | { | ||
539 | reboot_emergency = emergency; | ||
540 | machine_ops.emergency_restart(); | ||
541 | } | ||
542 | |||
485 | static void native_machine_restart(char *__unused) | 543 | static void native_machine_restart(char *__unused) |
486 | { | 544 | { |
487 | printk("machine restart\n"); | 545 | printk("machine restart\n"); |
488 | 546 | ||
489 | if (!reboot_force) | 547 | if (!reboot_force) |
490 | machine_shutdown(); | 548 | machine_shutdown(); |
491 | machine_emergency_restart(); | 549 | __machine_emergency_restart(0); |
492 | } | 550 | } |
493 | 551 | ||
494 | static void native_machine_halt(void) | 552 | static void native_machine_halt(void) |
@@ -532,7 +590,7 @@ void machine_shutdown(void) | |||
532 | 590 | ||
533 | void machine_emergency_restart(void) | 591 | void machine_emergency_restart(void) |
534 | { | 592 | { |
535 | machine_ops.emergency_restart(); | 593 | __machine_emergency_restart(1); |
536 | } | 594 | } |
537 | 595 | ||
538 | void machine_restart(char *cmd) | 596 | void machine_restart(char *cmd) |
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 59ebd37ad79e..e665d1c623ca 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -603,10 +603,29 @@ void kvm_free_pit(struct kvm *kvm) | |||
603 | 603 | ||
604 | static void __inject_pit_timer_intr(struct kvm *kvm) | 604 | static void __inject_pit_timer_intr(struct kvm *kvm) |
605 | { | 605 | { |
606 | struct kvm_vcpu *vcpu; | ||
607 | int i; | ||
608 | |||
606 | mutex_lock(&kvm->lock); | 609 | mutex_lock(&kvm->lock); |
607 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); | 610 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); |
608 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); | 611 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); |
609 | mutex_unlock(&kvm->lock); | 612 | mutex_unlock(&kvm->lock); |
613 | |||
614 | /* | ||
615 | * Provides NMI watchdog support via Virtual Wire mode. | ||
616 | * The route is: PIT -> PIC -> LVT0 in NMI mode. | ||
617 | * | ||
618 | * Note: Our Virtual Wire implementation is simplified, only | ||
619 | * propagating PIT interrupts to all VCPUs when they have set | ||
620 | * LVT0 to NMI delivery. Other PIC interrupts are just sent to | ||
621 | * VCPU0, and only if its LVT0 is in EXTINT mode. | ||
622 | */ | ||
623 | if (kvm->arch.vapics_in_nmi_mode > 0) | ||
624 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
625 | vcpu = kvm->vcpus[i]; | ||
626 | if (vcpu) | ||
627 | kvm_apic_nmi_wd_deliver(vcpu); | ||
628 | } | ||
610 | } | 629 | } |
611 | 630 | ||
612 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) | 631 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) |
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 17e41e165f1a..179dcb0103fd 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
@@ -26,10 +26,40 @@ | |||
26 | * Port from Qemu. | 26 | * Port from Qemu. |
27 | */ | 27 | */ |
28 | #include <linux/mm.h> | 28 | #include <linux/mm.h> |
29 | #include <linux/bitops.h> | ||
29 | #include "irq.h" | 30 | #include "irq.h" |
30 | 31 | ||
31 | #include <linux/kvm_host.h> | 32 | #include <linux/kvm_host.h> |
32 | 33 | ||
34 | static void pic_lock(struct kvm_pic *s) | ||
35 | { | ||
36 | spin_lock(&s->lock); | ||
37 | } | ||
38 | |||
39 | static void pic_unlock(struct kvm_pic *s) | ||
40 | { | ||
41 | struct kvm *kvm = s->kvm; | ||
42 | unsigned acks = s->pending_acks; | ||
43 | bool wakeup = s->wakeup_needed; | ||
44 | struct kvm_vcpu *vcpu; | ||
45 | |||
46 | s->pending_acks = 0; | ||
47 | s->wakeup_needed = false; | ||
48 | |||
49 | spin_unlock(&s->lock); | ||
50 | |||
51 | while (acks) { | ||
52 | kvm_notify_acked_irq(kvm, __ffs(acks)); | ||
53 | acks &= acks - 1; | ||
54 | } | ||
55 | |||
56 | if (wakeup) { | ||
57 | vcpu = s->kvm->vcpus[0]; | ||
58 | if (vcpu) | ||
59 | kvm_vcpu_kick(vcpu); | ||
60 | } | ||
61 | } | ||
62 | |||
33 | static void pic_clear_isr(struct kvm_kpic_state *s, int irq) | 63 | static void pic_clear_isr(struct kvm_kpic_state *s, int irq) |
34 | { | 64 | { |
35 | s->isr &= ~(1 << irq); | 65 | s->isr &= ~(1 << irq); |
@@ -136,17 +166,21 @@ static void pic_update_irq(struct kvm_pic *s) | |||
136 | 166 | ||
137 | void kvm_pic_update_irq(struct kvm_pic *s) | 167 | void kvm_pic_update_irq(struct kvm_pic *s) |
138 | { | 168 | { |
169 | pic_lock(s); | ||
139 | pic_update_irq(s); | 170 | pic_update_irq(s); |
171 | pic_unlock(s); | ||
140 | } | 172 | } |
141 | 173 | ||
142 | void kvm_pic_set_irq(void *opaque, int irq, int level) | 174 | void kvm_pic_set_irq(void *opaque, int irq, int level) |
143 | { | 175 | { |
144 | struct kvm_pic *s = opaque; | 176 | struct kvm_pic *s = opaque; |
145 | 177 | ||
178 | pic_lock(s); | ||
146 | if (irq >= 0 && irq < PIC_NUM_PINS) { | 179 | if (irq >= 0 && irq < PIC_NUM_PINS) { |
147 | pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); | 180 | pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); |
148 | pic_update_irq(s); | 181 | pic_update_irq(s); |
149 | } | 182 | } |
183 | pic_unlock(s); | ||
150 | } | 184 | } |
151 | 185 | ||
152 | /* | 186 | /* |
@@ -172,6 +206,7 @@ int kvm_pic_read_irq(struct kvm *kvm) | |||
172 | int irq, irq2, intno; | 206 | int irq, irq2, intno; |
173 | struct kvm_pic *s = pic_irqchip(kvm); | 207 | struct kvm_pic *s = pic_irqchip(kvm); |
174 | 208 | ||
209 | pic_lock(s); | ||
175 | irq = pic_get_irq(&s->pics[0]); | 210 | irq = pic_get_irq(&s->pics[0]); |
176 | if (irq >= 0) { | 211 | if (irq >= 0) { |
177 | pic_intack(&s->pics[0], irq); | 212 | pic_intack(&s->pics[0], irq); |
@@ -196,6 +231,7 @@ int kvm_pic_read_irq(struct kvm *kvm) | |||
196 | intno = s->pics[0].irq_base + irq; | 231 | intno = s->pics[0].irq_base + irq; |
197 | } | 232 | } |
198 | pic_update_irq(s); | 233 | pic_update_irq(s); |
234 | pic_unlock(s); | ||
199 | kvm_notify_acked_irq(kvm, irq); | 235 | kvm_notify_acked_irq(kvm, irq); |
200 | 236 | ||
201 | return intno; | 237 | return intno; |
@@ -203,7 +239,7 @@ int kvm_pic_read_irq(struct kvm *kvm) | |||
203 | 239 | ||
204 | void kvm_pic_reset(struct kvm_kpic_state *s) | 240 | void kvm_pic_reset(struct kvm_kpic_state *s) |
205 | { | 241 | { |
206 | int irq, irqbase; | 242 | int irq, irqbase, n; |
207 | struct kvm *kvm = s->pics_state->irq_request_opaque; | 243 | struct kvm *kvm = s->pics_state->irq_request_opaque; |
208 | struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; | 244 | struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; |
209 | 245 | ||
@@ -214,8 +250,10 @@ void kvm_pic_reset(struct kvm_kpic_state *s) | |||
214 | 250 | ||
215 | for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { | 251 | for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { |
216 | if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) | 252 | if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) |
217 | if (s->irr & (1 << irq) || s->isr & (1 << irq)) | 253 | if (s->irr & (1 << irq) || s->isr & (1 << irq)) { |
218 | kvm_notify_acked_irq(kvm, irq+irqbase); | 254 | n = irq + irqbase; |
255 | s->pics_state->pending_acks |= 1 << n; | ||
256 | } | ||
219 | } | 257 | } |
220 | s->last_irr = 0; | 258 | s->last_irr = 0; |
221 | s->irr = 0; | 259 | s->irr = 0; |
@@ -406,6 +444,7 @@ static void picdev_write(struct kvm_io_device *this, | |||
406 | printk(KERN_ERR "PIC: non byte write\n"); | 444 | printk(KERN_ERR "PIC: non byte write\n"); |
407 | return; | 445 | return; |
408 | } | 446 | } |
447 | pic_lock(s); | ||
409 | switch (addr) { | 448 | switch (addr) { |
410 | case 0x20: | 449 | case 0x20: |
411 | case 0x21: | 450 | case 0x21: |
@@ -418,6 +457,7 @@ static void picdev_write(struct kvm_io_device *this, | |||
418 | elcr_ioport_write(&s->pics[addr & 1], addr, data); | 457 | elcr_ioport_write(&s->pics[addr & 1], addr, data); |
419 | break; | 458 | break; |
420 | } | 459 | } |
460 | pic_unlock(s); | ||
421 | } | 461 | } |
422 | 462 | ||
423 | static void picdev_read(struct kvm_io_device *this, | 463 | static void picdev_read(struct kvm_io_device *this, |
@@ -431,6 +471,7 @@ static void picdev_read(struct kvm_io_device *this, | |||
431 | printk(KERN_ERR "PIC: non byte read\n"); | 471 | printk(KERN_ERR "PIC: non byte read\n"); |
432 | return; | 472 | return; |
433 | } | 473 | } |
474 | pic_lock(s); | ||
434 | switch (addr) { | 475 | switch (addr) { |
435 | case 0x20: | 476 | case 0x20: |
436 | case 0x21: | 477 | case 0x21: |
@@ -444,6 +485,7 @@ static void picdev_read(struct kvm_io_device *this, | |||
444 | break; | 485 | break; |
445 | } | 486 | } |
446 | *(unsigned char *)val = data; | 487 | *(unsigned char *)val = data; |
488 | pic_unlock(s); | ||
447 | } | 489 | } |
448 | 490 | ||
449 | /* | 491 | /* |
@@ -459,7 +501,7 @@ static void pic_irq_request(void *opaque, int level) | |||
459 | s->output = level; | 501 | s->output = level; |
460 | if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { | 502 | if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { |
461 | s->pics[0].isr_ack &= ~(1 << irq); | 503 | s->pics[0].isr_ack &= ~(1 << irq); |
462 | kvm_vcpu_kick(vcpu); | 504 | s->wakeup_needed = true; |
463 | } | 505 | } |
464 | } | 506 | } |
465 | 507 | ||
@@ -469,6 +511,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) | |||
469 | s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); | 511 | s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); |
470 | if (!s) | 512 | if (!s) |
471 | return NULL; | 513 | return NULL; |
514 | spin_lock_init(&s->lock); | ||
515 | s->kvm = kvm; | ||
472 | s->pics[0].elcr_mask = 0xf8; | 516 | s->pics[0].elcr_mask = 0xf8; |
473 | s->pics[1].elcr_mask = 0xde; | 517 | s->pics[1].elcr_mask = 0xde; |
474 | s->irq_request = pic_irq_request; | 518 | s->irq_request = pic_irq_request; |
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index f17c8f5bbf31..2bf32a03ceec 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/mm_types.h> | 25 | #include <linux/mm_types.h> |
26 | #include <linux/hrtimer.h> | 26 | #include <linux/hrtimer.h> |
27 | #include <linux/kvm_host.h> | 27 | #include <linux/kvm_host.h> |
28 | #include <linux/spinlock.h> | ||
28 | 29 | ||
29 | #include "iodev.h" | 30 | #include "iodev.h" |
30 | #include "ioapic.h" | 31 | #include "ioapic.h" |
@@ -59,6 +60,10 @@ struct kvm_kpic_state { | |||
59 | }; | 60 | }; |
60 | 61 | ||
61 | struct kvm_pic { | 62 | struct kvm_pic { |
63 | spinlock_t lock; | ||
64 | bool wakeup_needed; | ||
65 | unsigned pending_acks; | ||
66 | struct kvm *kvm; | ||
62 | struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ | 67 | struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ |
63 | irq_request_func *irq_request; | 68 | irq_request_func *irq_request; |
64 | void *irq_request_opaque; | 69 | void *irq_request_opaque; |
@@ -87,6 +92,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s); | |||
87 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | 92 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); |
88 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); | 93 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); |
89 | void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); | 94 | void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); |
95 | void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); | ||
90 | void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); | 96 | void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); |
91 | void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); | 97 | void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); |
92 | void __kvm_migrate_timers(struct kvm_vcpu *vcpu); | 98 | void __kvm_migrate_timers(struct kvm_vcpu *vcpu); |
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index 65ef0fc2c036..8e5ee99551f6 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h | |||
@@ -7,7 +7,7 @@ | |||
7 | #include <linux/kvm_host.h> | 7 | #include <linux/kvm_host.h> |
8 | #include <asm/msr.h> | 8 | #include <asm/msr.h> |
9 | 9 | ||
10 | #include "svm.h" | 10 | #include <asm/svm.h> |
11 | 11 | ||
12 | static const u32 host_save_user_msrs[] = { | 12 | static const u32 host_save_user_msrs[] = { |
13 | #ifdef CONFIG_X86_64 | 13 | #ifdef CONFIG_X86_64 |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0fc3cab48943..afac68c0815c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -130,6 +130,11 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic) | |||
130 | return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; | 130 | return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; |
131 | } | 131 | } |
132 | 132 | ||
133 | static inline int apic_lvt_nmi_mode(u32 lvt_val) | ||
134 | { | ||
135 | return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; | ||
136 | } | ||
137 | |||
133 | static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { | 138 | static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { |
134 | LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ | 139 | LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ |
135 | LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ | 140 | LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ |
@@ -354,6 +359,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
354 | 359 | ||
355 | case APIC_DM_NMI: | 360 | case APIC_DM_NMI: |
356 | kvm_inject_nmi(vcpu); | 361 | kvm_inject_nmi(vcpu); |
362 | kvm_vcpu_kick(vcpu); | ||
357 | break; | 363 | break; |
358 | 364 | ||
359 | case APIC_DM_INIT: | 365 | case APIC_DM_INIT: |
@@ -380,6 +386,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
380 | } | 386 | } |
381 | break; | 387 | break; |
382 | 388 | ||
389 | case APIC_DM_EXTINT: | ||
390 | /* | ||
391 | * Should only be called by kvm_apic_local_deliver() with LVT0, | ||
392 | * before NMI watchdog was enabled. Already handled by | ||
393 | * kvm_apic_accept_pic_intr(). | ||
394 | */ | ||
395 | break; | ||
396 | |||
383 | default: | 397 | default: |
384 | printk(KERN_ERR "TODO: unsupported delivery mode %x\n", | 398 | printk(KERN_ERR "TODO: unsupported delivery mode %x\n", |
385 | delivery_mode); | 399 | delivery_mode); |
@@ -663,6 +677,20 @@ static void start_apic_timer(struct kvm_lapic *apic) | |||
663 | apic->timer.period))); | 677 | apic->timer.period))); |
664 | } | 678 | } |
665 | 679 | ||
680 | static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) | ||
681 | { | ||
682 | int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0)); | ||
683 | |||
684 | if (apic_lvt_nmi_mode(lvt0_val)) { | ||
685 | if (!nmi_wd_enabled) { | ||
686 | apic_debug("Receive NMI setting on APIC_LVT0 " | ||
687 | "for cpu %d\n", apic->vcpu->vcpu_id); | ||
688 | apic->vcpu->kvm->arch.vapics_in_nmi_mode++; | ||
689 | } | ||
690 | } else if (nmi_wd_enabled) | ||
691 | apic->vcpu->kvm->arch.vapics_in_nmi_mode--; | ||
692 | } | ||
693 | |||
666 | static void apic_mmio_write(struct kvm_io_device *this, | 694 | static void apic_mmio_write(struct kvm_io_device *this, |
667 | gpa_t address, int len, const void *data) | 695 | gpa_t address, int len, const void *data) |
668 | { | 696 | { |
@@ -743,10 +771,11 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
743 | apic_set_reg(apic, APIC_ICR2, val & 0xff000000); | 771 | apic_set_reg(apic, APIC_ICR2, val & 0xff000000); |
744 | break; | 772 | break; |
745 | 773 | ||
774 | case APIC_LVT0: | ||
775 | apic_manage_nmi_watchdog(apic, val); | ||
746 | case APIC_LVTT: | 776 | case APIC_LVTT: |
747 | case APIC_LVTTHMR: | 777 | case APIC_LVTTHMR: |
748 | case APIC_LVTPC: | 778 | case APIC_LVTPC: |
749 | case APIC_LVT0: | ||
750 | case APIC_LVT1: | 779 | case APIC_LVT1: |
751 | case APIC_LVTERR: | 780 | case APIC_LVTERR: |
752 | /* TODO: Check vector */ | 781 | /* TODO: Check vector */ |
@@ -961,12 +990,26 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) | |||
961 | return 0; | 990 | return 0; |
962 | } | 991 | } |
963 | 992 | ||
964 | static int __inject_apic_timer_irq(struct kvm_lapic *apic) | 993 | static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) |
994 | { | ||
995 | u32 reg = apic_get_reg(apic, lvt_type); | ||
996 | int vector, mode, trig_mode; | ||
997 | |||
998 | if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { | ||
999 | vector = reg & APIC_VECTOR_MASK; | ||
1000 | mode = reg & APIC_MODE_MASK; | ||
1001 | trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; | ||
1002 | return __apic_accept_irq(apic, mode, vector, 1, trig_mode); | ||
1003 | } | ||
1004 | return 0; | ||
1005 | } | ||
1006 | |||
1007 | void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) | ||
965 | { | 1008 | { |
966 | int vector; | 1009 | struct kvm_lapic *apic = vcpu->arch.apic; |
967 | 1010 | ||
968 | vector = apic_lvt_vector(apic, APIC_LVTT); | 1011 | if (apic) |
969 | return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); | 1012 | kvm_apic_local_deliver(apic, APIC_LVT0); |
970 | } | 1013 | } |
971 | 1014 | ||
972 | static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) | 1015 | static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) |
@@ -1061,9 +1104,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) | |||
1061 | { | 1104 | { |
1062 | struct kvm_lapic *apic = vcpu->arch.apic; | 1105 | struct kvm_lapic *apic = vcpu->arch.apic; |
1063 | 1106 | ||
1064 | if (apic && apic_lvt_enabled(apic, APIC_LVTT) && | 1107 | if (apic && atomic_read(&apic->timer.pending) > 0) { |
1065 | atomic_read(&apic->timer.pending) > 0) { | 1108 | if (kvm_apic_local_deliver(apic, APIC_LVTT)) |
1066 | if (__inject_apic_timer_irq(apic)) | ||
1067 | atomic_dec(&apic->timer.pending); | 1109 | atomic_dec(&apic->timer.pending); |
1068 | } | 1110 | } |
1069 | } | 1111 | } |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 410ddbc1aa2e..83f11c7474a1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -17,7 +17,6 @@ | |||
17 | * | 17 | * |
18 | */ | 18 | */ |
19 | 19 | ||
20 | #include "vmx.h" | ||
21 | #include "mmu.h" | 20 | #include "mmu.h" |
22 | 21 | ||
23 | #include <linux/kvm_host.h> | 22 | #include <linux/kvm_host.h> |
@@ -33,6 +32,7 @@ | |||
33 | #include <asm/page.h> | 32 | #include <asm/page.h> |
34 | #include <asm/cmpxchg.h> | 33 | #include <asm/cmpxchg.h> |
35 | #include <asm/io.h> | 34 | #include <asm/io.h> |
35 | #include <asm/vmx.h> | ||
36 | 36 | ||
37 | /* | 37 | /* |
38 | * When setting this variable to true it enables Two-Dimensional-Paging | 38 | * When setting this variable to true it enables Two-Dimensional-Paging |
@@ -168,6 +168,7 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ | |||
168 | static u64 __read_mostly shadow_user_mask; | 168 | static u64 __read_mostly shadow_user_mask; |
169 | static u64 __read_mostly shadow_accessed_mask; | 169 | static u64 __read_mostly shadow_accessed_mask; |
170 | static u64 __read_mostly shadow_dirty_mask; | 170 | static u64 __read_mostly shadow_dirty_mask; |
171 | static u64 __read_mostly shadow_mt_mask; | ||
171 | 172 | ||
172 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) | 173 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) |
173 | { | 174 | { |
@@ -183,13 +184,14 @@ void kvm_mmu_set_base_ptes(u64 base_pte) | |||
183 | EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); | 184 | EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); |
184 | 185 | ||
185 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | 186 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, |
186 | u64 dirty_mask, u64 nx_mask, u64 x_mask) | 187 | u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask) |
187 | { | 188 | { |
188 | shadow_user_mask = user_mask; | 189 | shadow_user_mask = user_mask; |
189 | shadow_accessed_mask = accessed_mask; | 190 | shadow_accessed_mask = accessed_mask; |
190 | shadow_dirty_mask = dirty_mask; | 191 | shadow_dirty_mask = dirty_mask; |
191 | shadow_nx_mask = nx_mask; | 192 | shadow_nx_mask = nx_mask; |
192 | shadow_x_mask = x_mask; | 193 | shadow_x_mask = x_mask; |
194 | shadow_mt_mask = mt_mask; | ||
193 | } | 195 | } |
194 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); | 196 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); |
195 | 197 | ||
@@ -384,7 +386,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) | |||
384 | { | 386 | { |
385 | int *write_count; | 387 | int *write_count; |
386 | 388 | ||
387 | write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); | 389 | gfn = unalias_gfn(kvm, gfn); |
390 | write_count = slot_largepage_idx(gfn, | ||
391 | gfn_to_memslot_unaliased(kvm, gfn)); | ||
388 | *write_count += 1; | 392 | *write_count += 1; |
389 | } | 393 | } |
390 | 394 | ||
@@ -392,16 +396,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) | |||
392 | { | 396 | { |
393 | int *write_count; | 397 | int *write_count; |
394 | 398 | ||
395 | write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); | 399 | gfn = unalias_gfn(kvm, gfn); |
400 | write_count = slot_largepage_idx(gfn, | ||
401 | gfn_to_memslot_unaliased(kvm, gfn)); | ||
396 | *write_count -= 1; | 402 | *write_count -= 1; |
397 | WARN_ON(*write_count < 0); | 403 | WARN_ON(*write_count < 0); |
398 | } | 404 | } |
399 | 405 | ||
400 | static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) | 406 | static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) |
401 | { | 407 | { |
402 | struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); | 408 | struct kvm_memory_slot *slot; |
403 | int *largepage_idx; | 409 | int *largepage_idx; |
404 | 410 | ||
411 | gfn = unalias_gfn(kvm, gfn); | ||
412 | slot = gfn_to_memslot_unaliased(kvm, gfn); | ||
405 | if (slot) { | 413 | if (slot) { |
406 | largepage_idx = slot_largepage_idx(gfn, slot); | 414 | largepage_idx = slot_largepage_idx(gfn, slot); |
407 | return *largepage_idx; | 415 | return *largepage_idx; |
@@ -613,7 +621,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) | |||
613 | return NULL; | 621 | return NULL; |
614 | } | 622 | } |
615 | 623 | ||
616 | static void rmap_write_protect(struct kvm *kvm, u64 gfn) | 624 | static int rmap_write_protect(struct kvm *kvm, u64 gfn) |
617 | { | 625 | { |
618 | unsigned long *rmapp; | 626 | unsigned long *rmapp; |
619 | u64 *spte; | 627 | u64 *spte; |
@@ -659,8 +667,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
659 | spte = rmap_next(kvm, rmapp, spte); | 667 | spte = rmap_next(kvm, rmapp, spte); |
660 | } | 668 | } |
661 | 669 | ||
662 | if (write_protected) | 670 | return write_protected; |
663 | kvm_flush_remote_tlbs(kvm); | ||
664 | } | 671 | } |
665 | 672 | ||
666 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) | 673 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) |
@@ -786,9 +793,11 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | |||
786 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | 793 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); |
787 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | 794 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); |
788 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | 795 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); |
796 | INIT_LIST_HEAD(&sp->oos_link); | ||
789 | ASSERT(is_empty_shadow_page(sp->spt)); | 797 | ASSERT(is_empty_shadow_page(sp->spt)); |
790 | sp->slot_bitmap = 0; | 798 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); |
791 | sp->multimapped = 0; | 799 | sp->multimapped = 0; |
800 | sp->global = 1; | ||
792 | sp->parent_pte = parent_pte; | 801 | sp->parent_pte = parent_pte; |
793 | --vcpu->kvm->arch.n_free_mmu_pages; | 802 | --vcpu->kvm->arch.n_free_mmu_pages; |
794 | return sp; | 803 | return sp; |
@@ -900,8 +909,9 @@ static void kvm_mmu_update_unsync_bitmap(u64 *spte) | |||
900 | struct kvm_mmu_page *sp = page_header(__pa(spte)); | 909 | struct kvm_mmu_page *sp = page_header(__pa(spte)); |
901 | 910 | ||
902 | index = spte - sp->spt; | 911 | index = spte - sp->spt; |
903 | __set_bit(index, sp->unsync_child_bitmap); | 912 | if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) |
904 | sp->unsync_children = 1; | 913 | sp->unsync_children++; |
914 | WARN_ON(!sp->unsync_children); | ||
905 | } | 915 | } |
906 | 916 | ||
907 | static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) | 917 | static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) |
@@ -928,7 +938,6 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) | |||
928 | 938 | ||
929 | static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | 939 | static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) |
930 | { | 940 | { |
931 | sp->unsync_children = 1; | ||
932 | kvm_mmu_update_parents_unsync(sp); | 941 | kvm_mmu_update_parents_unsync(sp); |
933 | return 1; | 942 | return 1; |
934 | } | 943 | } |
@@ -959,38 +968,66 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) | |||
959 | { | 968 | { |
960 | } | 969 | } |
961 | 970 | ||
971 | #define KVM_PAGE_ARRAY_NR 16 | ||
972 | |||
973 | struct kvm_mmu_pages { | ||
974 | struct mmu_page_and_offset { | ||
975 | struct kvm_mmu_page *sp; | ||
976 | unsigned int idx; | ||
977 | } page[KVM_PAGE_ARRAY_NR]; | ||
978 | unsigned int nr; | ||
979 | }; | ||
980 | |||
962 | #define for_each_unsync_children(bitmap, idx) \ | 981 | #define for_each_unsync_children(bitmap, idx) \ |
963 | for (idx = find_first_bit(bitmap, 512); \ | 982 | for (idx = find_first_bit(bitmap, 512); \ |
964 | idx < 512; \ | 983 | idx < 512; \ |
965 | idx = find_next_bit(bitmap, 512, idx+1)) | 984 | idx = find_next_bit(bitmap, 512, idx+1)) |
966 | 985 | ||
967 | static int mmu_unsync_walk(struct kvm_mmu_page *sp, | 986 | int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, |
968 | struct kvm_unsync_walk *walker) | 987 | int idx) |
969 | { | 988 | { |
970 | int i, ret; | 989 | int i; |
971 | 990 | ||
972 | if (!sp->unsync_children) | 991 | if (sp->unsync) |
973 | return 0; | 992 | for (i=0; i < pvec->nr; i++) |
993 | if (pvec->page[i].sp == sp) | ||
994 | return 0; | ||
995 | |||
996 | pvec->page[pvec->nr].sp = sp; | ||
997 | pvec->page[pvec->nr].idx = idx; | ||
998 | pvec->nr++; | ||
999 | return (pvec->nr == KVM_PAGE_ARRAY_NR); | ||
1000 | } | ||
1001 | |||
1002 | static int __mmu_unsync_walk(struct kvm_mmu_page *sp, | ||
1003 | struct kvm_mmu_pages *pvec) | ||
1004 | { | ||
1005 | int i, ret, nr_unsync_leaf = 0; | ||
974 | 1006 | ||
975 | for_each_unsync_children(sp->unsync_child_bitmap, i) { | 1007 | for_each_unsync_children(sp->unsync_child_bitmap, i) { |
976 | u64 ent = sp->spt[i]; | 1008 | u64 ent = sp->spt[i]; |
977 | 1009 | ||
978 | if (is_shadow_present_pte(ent)) { | 1010 | if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { |
979 | struct kvm_mmu_page *child; | 1011 | struct kvm_mmu_page *child; |
980 | child = page_header(ent & PT64_BASE_ADDR_MASK); | 1012 | child = page_header(ent & PT64_BASE_ADDR_MASK); |
981 | 1013 | ||
982 | if (child->unsync_children) { | 1014 | if (child->unsync_children) { |
983 | ret = mmu_unsync_walk(child, walker); | 1015 | if (mmu_pages_add(pvec, child, i)) |
984 | if (ret) | 1016 | return -ENOSPC; |
1017 | |||
1018 | ret = __mmu_unsync_walk(child, pvec); | ||
1019 | if (!ret) | ||
1020 | __clear_bit(i, sp->unsync_child_bitmap); | ||
1021 | else if (ret > 0) | ||
1022 | nr_unsync_leaf += ret; | ||
1023 | else | ||
985 | return ret; | 1024 | return ret; |
986 | __clear_bit(i, sp->unsync_child_bitmap); | ||
987 | } | 1025 | } |
988 | 1026 | ||
989 | if (child->unsync) { | 1027 | if (child->unsync) { |
990 | ret = walker->entry(child, walker); | 1028 | nr_unsync_leaf++; |
991 | __clear_bit(i, sp->unsync_child_bitmap); | 1029 | if (mmu_pages_add(pvec, child, i)) |
992 | if (ret) | 1030 | return -ENOSPC; |
993 | return ret; | ||
994 | } | 1031 | } |
995 | } | 1032 | } |
996 | } | 1033 | } |
@@ -998,7 +1035,17 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, | |||
998 | if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) | 1035 | if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) |
999 | sp->unsync_children = 0; | 1036 | sp->unsync_children = 0; |
1000 | 1037 | ||
1001 | return 0; | 1038 | return nr_unsync_leaf; |
1039 | } | ||
1040 | |||
1041 | static int mmu_unsync_walk(struct kvm_mmu_page *sp, | ||
1042 | struct kvm_mmu_pages *pvec) | ||
1043 | { | ||
1044 | if (!sp->unsync_children) | ||
1045 | return 0; | ||
1046 | |||
1047 | mmu_pages_add(pvec, sp, 0); | ||
1048 | return __mmu_unsync_walk(sp, pvec); | ||
1002 | } | 1049 | } |
1003 | 1050 | ||
1004 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | 1051 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) |
@@ -1021,10 +1068,18 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | |||
1021 | return NULL; | 1068 | return NULL; |
1022 | } | 1069 | } |
1023 | 1070 | ||
1071 | static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1072 | { | ||
1073 | list_del(&sp->oos_link); | ||
1074 | --kvm->stat.mmu_unsync_global; | ||
1075 | } | ||
1076 | |||
1024 | static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1077 | static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
1025 | { | 1078 | { |
1026 | WARN_ON(!sp->unsync); | 1079 | WARN_ON(!sp->unsync); |
1027 | sp->unsync = 0; | 1080 | sp->unsync = 0; |
1081 | if (sp->global) | ||
1082 | kvm_unlink_unsync_global(kvm, sp); | ||
1028 | --kvm->stat.mmu_unsync; | 1083 | --kvm->stat.mmu_unsync; |
1029 | } | 1084 | } |
1030 | 1085 | ||
@@ -1037,7 +1092,8 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
1037 | return 1; | 1092 | return 1; |
1038 | } | 1093 | } |
1039 | 1094 | ||
1040 | rmap_write_protect(vcpu->kvm, sp->gfn); | 1095 | if (rmap_write_protect(vcpu->kvm, sp->gfn)) |
1096 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1041 | kvm_unlink_unsync_page(vcpu->kvm, sp); | 1097 | kvm_unlink_unsync_page(vcpu->kvm, sp); |
1042 | if (vcpu->arch.mmu.sync_page(vcpu, sp)) { | 1098 | if (vcpu->arch.mmu.sync_page(vcpu, sp)) { |
1043 | kvm_mmu_zap_page(vcpu->kvm, sp); | 1099 | kvm_mmu_zap_page(vcpu->kvm, sp); |
@@ -1048,30 +1104,89 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
1048 | return 0; | 1104 | return 0; |
1049 | } | 1105 | } |
1050 | 1106 | ||
1051 | struct sync_walker { | 1107 | struct mmu_page_path { |
1052 | struct kvm_vcpu *vcpu; | 1108 | struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; |
1053 | struct kvm_unsync_walk walker; | 1109 | unsigned int idx[PT64_ROOT_LEVEL-1]; |
1054 | }; | 1110 | }; |
1055 | 1111 | ||
1056 | static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) | 1112 | #define for_each_sp(pvec, sp, parents, i) \ |
1113 | for (i = mmu_pages_next(&pvec, &parents, -1), \ | ||
1114 | sp = pvec.page[i].sp; \ | ||
1115 | i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ | ||
1116 | i = mmu_pages_next(&pvec, &parents, i)) | ||
1117 | |||
1118 | int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, | ||
1119 | int i) | ||
1057 | { | 1120 | { |
1058 | struct sync_walker *sync_walk = container_of(walk, struct sync_walker, | 1121 | int n; |
1059 | walker); | ||
1060 | struct kvm_vcpu *vcpu = sync_walk->vcpu; | ||
1061 | 1122 | ||
1062 | kvm_sync_page(vcpu, sp); | 1123 | for (n = i+1; n < pvec->nr; n++) { |
1063 | return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); | 1124 | struct kvm_mmu_page *sp = pvec->page[n].sp; |
1125 | |||
1126 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) { | ||
1127 | parents->idx[0] = pvec->page[n].idx; | ||
1128 | return n; | ||
1129 | } | ||
1130 | |||
1131 | parents->parent[sp->role.level-2] = sp; | ||
1132 | parents->idx[sp->role.level-1] = pvec->page[n].idx; | ||
1133 | } | ||
1134 | |||
1135 | return n; | ||
1064 | } | 1136 | } |
1065 | 1137 | ||
1066 | static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | 1138 | void mmu_pages_clear_parents(struct mmu_page_path *parents) |
1067 | { | 1139 | { |
1068 | struct sync_walker walker = { | 1140 | struct kvm_mmu_page *sp; |
1069 | .walker = { .entry = mmu_sync_fn, }, | 1141 | unsigned int level = 0; |
1070 | .vcpu = vcpu, | 1142 | |
1071 | }; | 1143 | do { |
1144 | unsigned int idx = parents->idx[level]; | ||
1145 | |||
1146 | sp = parents->parent[level]; | ||
1147 | if (!sp) | ||
1148 | return; | ||
1149 | |||
1150 | --sp->unsync_children; | ||
1151 | WARN_ON((int)sp->unsync_children < 0); | ||
1152 | __clear_bit(idx, sp->unsync_child_bitmap); | ||
1153 | level++; | ||
1154 | } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); | ||
1155 | } | ||
1156 | |||
1157 | static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, | ||
1158 | struct mmu_page_path *parents, | ||
1159 | struct kvm_mmu_pages *pvec) | ||
1160 | { | ||
1161 | parents->parent[parent->role.level-1] = NULL; | ||
1162 | pvec->nr = 0; | ||
1163 | } | ||
1164 | |||
1165 | static void mmu_sync_children(struct kvm_vcpu *vcpu, | ||
1166 | struct kvm_mmu_page *parent) | ||
1167 | { | ||
1168 | int i; | ||
1169 | struct kvm_mmu_page *sp; | ||
1170 | struct mmu_page_path parents; | ||
1171 | struct kvm_mmu_pages pages; | ||
1172 | |||
1173 | kvm_mmu_pages_init(parent, &parents, &pages); | ||
1174 | while (mmu_unsync_walk(parent, &pages)) { | ||
1175 | int protected = 0; | ||
1072 | 1176 | ||
1073 | while (mmu_unsync_walk(sp, &walker.walker)) | 1177 | for_each_sp(pages, sp, parents, i) |
1178 | protected |= rmap_write_protect(vcpu->kvm, sp->gfn); | ||
1179 | |||
1180 | if (protected) | ||
1181 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1182 | |||
1183 | for_each_sp(pages, sp, parents, i) { | ||
1184 | kvm_sync_page(vcpu, sp); | ||
1185 | mmu_pages_clear_parents(&parents); | ||
1186 | } | ||
1074 | cond_resched_lock(&vcpu->kvm->mmu_lock); | 1187 | cond_resched_lock(&vcpu->kvm->mmu_lock); |
1188 | kvm_mmu_pages_init(parent, &parents, &pages); | ||
1189 | } | ||
1075 | } | 1190 | } |
1076 | 1191 | ||
1077 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | 1192 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, |
@@ -1129,7 +1244,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1129 | sp->role = role; | 1244 | sp->role = role; |
1130 | hlist_add_head(&sp->hash_link, bucket); | 1245 | hlist_add_head(&sp->hash_link, bucket); |
1131 | if (!metaphysical) { | 1246 | if (!metaphysical) { |
1132 | rmap_write_protect(vcpu->kvm, gfn); | 1247 | if (rmap_write_protect(vcpu->kvm, gfn)) |
1248 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1133 | account_shadowed(vcpu->kvm, gfn); | 1249 | account_shadowed(vcpu->kvm, gfn); |
1134 | } | 1250 | } |
1135 | if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) | 1251 | if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) |
@@ -1153,6 +1269,8 @@ static int walk_shadow(struct kvm_shadow_walk *walker, | |||
1153 | if (level == PT32E_ROOT_LEVEL) { | 1269 | if (level == PT32E_ROOT_LEVEL) { |
1154 | shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | 1270 | shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; |
1155 | shadow_addr &= PT64_BASE_ADDR_MASK; | 1271 | shadow_addr &= PT64_BASE_ADDR_MASK; |
1272 | if (!shadow_addr) | ||
1273 | return 1; | ||
1156 | --level; | 1274 | --level; |
1157 | } | 1275 | } |
1158 | 1276 | ||
@@ -1237,33 +1355,29 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
1237 | } | 1355 | } |
1238 | } | 1356 | } |
1239 | 1357 | ||
1240 | struct zap_walker { | 1358 | static int mmu_zap_unsync_children(struct kvm *kvm, |
1241 | struct kvm_unsync_walk walker; | 1359 | struct kvm_mmu_page *parent) |
1242 | struct kvm *kvm; | ||
1243 | int zapped; | ||
1244 | }; | ||
1245 | |||
1246 | static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) | ||
1247 | { | 1360 | { |
1248 | struct zap_walker *zap_walk = container_of(walk, struct zap_walker, | 1361 | int i, zapped = 0; |
1249 | walker); | 1362 | struct mmu_page_path parents; |
1250 | kvm_mmu_zap_page(zap_walk->kvm, sp); | 1363 | struct kvm_mmu_pages pages; |
1251 | zap_walk->zapped = 1; | ||
1252 | return 0; | ||
1253 | } | ||
1254 | 1364 | ||
1255 | static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) | 1365 | if (parent->role.level == PT_PAGE_TABLE_LEVEL) |
1256 | { | ||
1257 | struct zap_walker walker = { | ||
1258 | .walker = { .entry = mmu_zap_fn, }, | ||
1259 | .kvm = kvm, | ||
1260 | .zapped = 0, | ||
1261 | }; | ||
1262 | |||
1263 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) | ||
1264 | return 0; | 1366 | return 0; |
1265 | mmu_unsync_walk(sp, &walker.walker); | 1367 | |
1266 | return walker.zapped; | 1368 | kvm_mmu_pages_init(parent, &parents, &pages); |
1369 | while (mmu_unsync_walk(parent, &pages)) { | ||
1370 | struct kvm_mmu_page *sp; | ||
1371 | |||
1372 | for_each_sp(pages, sp, parents, i) { | ||
1373 | kvm_mmu_zap_page(kvm, sp); | ||
1374 | mmu_pages_clear_parents(&parents); | ||
1375 | } | ||
1376 | zapped += pages.nr; | ||
1377 | kvm_mmu_pages_init(parent, &parents, &pages); | ||
1378 | } | ||
1379 | |||
1380 | return zapped; | ||
1267 | } | 1381 | } |
1268 | 1382 | ||
1269 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1383 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
@@ -1362,7 +1476,7 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | |||
1362 | int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); | 1476 | int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); |
1363 | struct kvm_mmu_page *sp = page_header(__pa(pte)); | 1477 | struct kvm_mmu_page *sp = page_header(__pa(pte)); |
1364 | 1478 | ||
1365 | __set_bit(slot, &sp->slot_bitmap); | 1479 | __set_bit(slot, sp->slot_bitmap); |
1366 | } | 1480 | } |
1367 | 1481 | ||
1368 | static void mmu_convert_notrap(struct kvm_mmu_page *sp) | 1482 | static void mmu_convert_notrap(struct kvm_mmu_page *sp) |
@@ -1393,6 +1507,110 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | |||
1393 | return page; | 1507 | return page; |
1394 | } | 1508 | } |
1395 | 1509 | ||
1510 | /* | ||
1511 | * The function is based on mtrr_type_lookup() in | ||
1512 | * arch/x86/kernel/cpu/mtrr/generic.c | ||
1513 | */ | ||
1514 | static int get_mtrr_type(struct mtrr_state_type *mtrr_state, | ||
1515 | u64 start, u64 end) | ||
1516 | { | ||
1517 | int i; | ||
1518 | u64 base, mask; | ||
1519 | u8 prev_match, curr_match; | ||
1520 | int num_var_ranges = KVM_NR_VAR_MTRR; | ||
1521 | |||
1522 | if (!mtrr_state->enabled) | ||
1523 | return 0xFF; | ||
1524 | |||
1525 | /* Make end inclusive end, instead of exclusive */ | ||
1526 | end--; | ||
1527 | |||
1528 | /* Look in fixed ranges. Just return the type as per start */ | ||
1529 | if (mtrr_state->have_fixed && (start < 0x100000)) { | ||
1530 | int idx; | ||
1531 | |||
1532 | if (start < 0x80000) { | ||
1533 | idx = 0; | ||
1534 | idx += (start >> 16); | ||
1535 | return mtrr_state->fixed_ranges[idx]; | ||
1536 | } else if (start < 0xC0000) { | ||
1537 | idx = 1 * 8; | ||
1538 | idx += ((start - 0x80000) >> 14); | ||
1539 | return mtrr_state->fixed_ranges[idx]; | ||
1540 | } else if (start < 0x1000000) { | ||
1541 | idx = 3 * 8; | ||
1542 | idx += ((start - 0xC0000) >> 12); | ||
1543 | return mtrr_state->fixed_ranges[idx]; | ||
1544 | } | ||
1545 | } | ||
1546 | |||
1547 | /* | ||
1548 | * Look in variable ranges | ||
1549 | * Look of multiple ranges matching this address and pick type | ||
1550 | * as per MTRR precedence | ||
1551 | */ | ||
1552 | if (!(mtrr_state->enabled & 2)) | ||
1553 | return mtrr_state->def_type; | ||
1554 | |||
1555 | prev_match = 0xFF; | ||
1556 | for (i = 0; i < num_var_ranges; ++i) { | ||
1557 | unsigned short start_state, end_state; | ||
1558 | |||
1559 | if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) | ||
1560 | continue; | ||
1561 | |||
1562 | base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + | ||
1563 | (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); | ||
1564 | mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + | ||
1565 | (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); | ||
1566 | |||
1567 | start_state = ((start & mask) == (base & mask)); | ||
1568 | end_state = ((end & mask) == (base & mask)); | ||
1569 | if (start_state != end_state) | ||
1570 | return 0xFE; | ||
1571 | |||
1572 | if ((start & mask) != (base & mask)) | ||
1573 | continue; | ||
1574 | |||
1575 | curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; | ||
1576 | if (prev_match == 0xFF) { | ||
1577 | prev_match = curr_match; | ||
1578 | continue; | ||
1579 | } | ||
1580 | |||
1581 | if (prev_match == MTRR_TYPE_UNCACHABLE || | ||
1582 | curr_match == MTRR_TYPE_UNCACHABLE) | ||
1583 | return MTRR_TYPE_UNCACHABLE; | ||
1584 | |||
1585 | if ((prev_match == MTRR_TYPE_WRBACK && | ||
1586 | curr_match == MTRR_TYPE_WRTHROUGH) || | ||
1587 | (prev_match == MTRR_TYPE_WRTHROUGH && | ||
1588 | curr_match == MTRR_TYPE_WRBACK)) { | ||
1589 | prev_match = MTRR_TYPE_WRTHROUGH; | ||
1590 | curr_match = MTRR_TYPE_WRTHROUGH; | ||
1591 | } | ||
1592 | |||
1593 | if (prev_match != curr_match) | ||
1594 | return MTRR_TYPE_UNCACHABLE; | ||
1595 | } | ||
1596 | |||
1597 | if (prev_match != 0xFF) | ||
1598 | return prev_match; | ||
1599 | |||
1600 | return mtrr_state->def_type; | ||
1601 | } | ||
1602 | |||
1603 | static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
1604 | { | ||
1605 | u8 mtrr; | ||
1606 | |||
1607 | mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, | ||
1608 | (gfn << PAGE_SHIFT) + PAGE_SIZE); | ||
1609 | if (mtrr == 0xfe || mtrr == 0xff) | ||
1610 | mtrr = MTRR_TYPE_WRBACK; | ||
1611 | return mtrr; | ||
1612 | } | ||
1613 | |||
1396 | static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | 1614 | static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) |
1397 | { | 1615 | { |
1398 | unsigned index; | 1616 | unsigned index; |
@@ -1409,9 +1627,15 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
1409 | if (s->role.word != sp->role.word) | 1627 | if (s->role.word != sp->role.word) |
1410 | return 1; | 1628 | return 1; |
1411 | } | 1629 | } |
1412 | kvm_mmu_mark_parents_unsync(vcpu, sp); | ||
1413 | ++vcpu->kvm->stat.mmu_unsync; | 1630 | ++vcpu->kvm->stat.mmu_unsync; |
1414 | sp->unsync = 1; | 1631 | sp->unsync = 1; |
1632 | |||
1633 | if (sp->global) { | ||
1634 | list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages); | ||
1635 | ++vcpu->kvm->stat.mmu_unsync_global; | ||
1636 | } else | ||
1637 | kvm_mmu_mark_parents_unsync(vcpu, sp); | ||
1638 | |||
1415 | mmu_convert_notrap(sp); | 1639 | mmu_convert_notrap(sp); |
1416 | return 0; | 1640 | return 0; |
1417 | } | 1641 | } |
@@ -1437,11 +1661,24 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
1437 | static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | 1661 | static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, |
1438 | unsigned pte_access, int user_fault, | 1662 | unsigned pte_access, int user_fault, |
1439 | int write_fault, int dirty, int largepage, | 1663 | int write_fault, int dirty, int largepage, |
1440 | gfn_t gfn, pfn_t pfn, bool speculative, | 1664 | int global, gfn_t gfn, pfn_t pfn, bool speculative, |
1441 | bool can_unsync) | 1665 | bool can_unsync) |
1442 | { | 1666 | { |
1443 | u64 spte; | 1667 | u64 spte; |
1444 | int ret = 0; | 1668 | int ret = 0; |
1669 | u64 mt_mask = shadow_mt_mask; | ||
1670 | struct kvm_mmu_page *sp = page_header(__pa(shadow_pte)); | ||
1671 | |||
1672 | if (!(vcpu->arch.cr4 & X86_CR4_PGE)) | ||
1673 | global = 0; | ||
1674 | if (!global && sp->global) { | ||
1675 | sp->global = 0; | ||
1676 | if (sp->unsync) { | ||
1677 | kvm_unlink_unsync_global(vcpu->kvm, sp); | ||
1678 | kvm_mmu_mark_parents_unsync(vcpu, sp); | ||
1679 | } | ||
1680 | } | ||
1681 | |||
1445 | /* | 1682 | /* |
1446 | * We don't set the accessed bit, since we sometimes want to see | 1683 | * We don't set the accessed bit, since we sometimes want to see |
1447 | * whether the guest actually used the pte (in order to detect | 1684 | * whether the guest actually used the pte (in order to detect |
@@ -1460,6 +1697,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1460 | spte |= shadow_user_mask; | 1697 | spte |= shadow_user_mask; |
1461 | if (largepage) | 1698 | if (largepage) |
1462 | spte |= PT_PAGE_SIZE_MASK; | 1699 | spte |= PT_PAGE_SIZE_MASK; |
1700 | if (mt_mask) { | ||
1701 | mt_mask = get_memory_type(vcpu, gfn) << | ||
1702 | kvm_x86_ops->get_mt_mask_shift(); | ||
1703 | spte |= mt_mask; | ||
1704 | } | ||
1463 | 1705 | ||
1464 | spte |= (u64)pfn << PAGE_SHIFT; | 1706 | spte |= (u64)pfn << PAGE_SHIFT; |
1465 | 1707 | ||
@@ -1474,6 +1716,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1474 | 1716 | ||
1475 | spte |= PT_WRITABLE_MASK; | 1717 | spte |= PT_WRITABLE_MASK; |
1476 | 1718 | ||
1719 | /* | ||
1720 | * Optimization: for pte sync, if spte was writable the hash | ||
1721 | * lookup is unnecessary (and expensive). Write protection | ||
1722 | * is responsibility of mmu_get_page / kvm_sync_page. | ||
1723 | * Same reasoning can be applied to dirty page accounting. | ||
1724 | */ | ||
1725 | if (!can_unsync && is_writeble_pte(*shadow_pte)) | ||
1726 | goto set_pte; | ||
1727 | |||
1477 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { | 1728 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { |
1478 | pgprintk("%s: found shadow page for %lx, marking ro\n", | 1729 | pgprintk("%s: found shadow page for %lx, marking ro\n", |
1479 | __func__, gfn); | 1730 | __func__, gfn); |
@@ -1495,8 +1746,8 @@ set_pte: | |||
1495 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | 1746 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, |
1496 | unsigned pt_access, unsigned pte_access, | 1747 | unsigned pt_access, unsigned pte_access, |
1497 | int user_fault, int write_fault, int dirty, | 1748 | int user_fault, int write_fault, int dirty, |
1498 | int *ptwrite, int largepage, gfn_t gfn, | 1749 | int *ptwrite, int largepage, int global, |
1499 | pfn_t pfn, bool speculative) | 1750 | gfn_t gfn, pfn_t pfn, bool speculative) |
1500 | { | 1751 | { |
1501 | int was_rmapped = 0; | 1752 | int was_rmapped = 0; |
1502 | int was_writeble = is_writeble_pte(*shadow_pte); | 1753 | int was_writeble = is_writeble_pte(*shadow_pte); |
@@ -1529,7 +1780,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1529 | } | 1780 | } |
1530 | } | 1781 | } |
1531 | if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, | 1782 | if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, |
1532 | dirty, largepage, gfn, pfn, speculative, true)) { | 1783 | dirty, largepage, global, gfn, pfn, speculative, true)) { |
1533 | if (write_fault) | 1784 | if (write_fault) |
1534 | *ptwrite = 1; | 1785 | *ptwrite = 1; |
1535 | kvm_x86_ops->tlb_flush(vcpu); | 1786 | kvm_x86_ops->tlb_flush(vcpu); |
@@ -1586,7 +1837,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk, | |||
1586 | || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { | 1837 | || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { |
1587 | mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, | 1838 | mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, |
1588 | 0, walk->write, 1, &walk->pt_write, | 1839 | 0, walk->write, 1, &walk->pt_write, |
1589 | walk->largepage, gfn, walk->pfn, false); | 1840 | walk->largepage, 0, gfn, walk->pfn, false); |
1590 | ++vcpu->stat.pf_fixed; | 1841 | ++vcpu->stat.pf_fixed; |
1591 | return 1; | 1842 | return 1; |
1592 | } | 1843 | } |
@@ -1773,6 +2024,15 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
1773 | } | 2024 | } |
1774 | } | 2025 | } |
1775 | 2026 | ||
2027 | static void mmu_sync_global(struct kvm_vcpu *vcpu) | ||
2028 | { | ||
2029 | struct kvm *kvm = vcpu->kvm; | ||
2030 | struct kvm_mmu_page *sp, *n; | ||
2031 | |||
2032 | list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link) | ||
2033 | kvm_sync_page(vcpu, sp); | ||
2034 | } | ||
2035 | |||
1776 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | 2036 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) |
1777 | { | 2037 | { |
1778 | spin_lock(&vcpu->kvm->mmu_lock); | 2038 | spin_lock(&vcpu->kvm->mmu_lock); |
@@ -1780,6 +2040,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
1780 | spin_unlock(&vcpu->kvm->mmu_lock); | 2040 | spin_unlock(&vcpu->kvm->mmu_lock); |
1781 | } | 2041 | } |
1782 | 2042 | ||
2043 | void kvm_mmu_sync_global(struct kvm_vcpu *vcpu) | ||
2044 | { | ||
2045 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2046 | mmu_sync_global(vcpu); | ||
2047 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2048 | } | ||
2049 | |||
1783 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | 2050 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) |
1784 | { | 2051 | { |
1785 | return vaddr; | 2052 | return vaddr; |
@@ -2178,7 +2445,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) | |||
2178 | } | 2445 | } |
2179 | 2446 | ||
2180 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | 2447 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, |
2181 | const u8 *new, int bytes) | 2448 | const u8 *new, int bytes, |
2449 | bool guest_initiated) | ||
2182 | { | 2450 | { |
2183 | gfn_t gfn = gpa >> PAGE_SHIFT; | 2451 | gfn_t gfn = gpa >> PAGE_SHIFT; |
2184 | struct kvm_mmu_page *sp; | 2452 | struct kvm_mmu_page *sp; |
@@ -2204,15 +2472,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2204 | kvm_mmu_free_some_pages(vcpu); | 2472 | kvm_mmu_free_some_pages(vcpu); |
2205 | ++vcpu->kvm->stat.mmu_pte_write; | 2473 | ++vcpu->kvm->stat.mmu_pte_write; |
2206 | kvm_mmu_audit(vcpu, "pre pte write"); | 2474 | kvm_mmu_audit(vcpu, "pre pte write"); |
2207 | if (gfn == vcpu->arch.last_pt_write_gfn | 2475 | if (guest_initiated) { |
2208 | && !last_updated_pte_accessed(vcpu)) { | 2476 | if (gfn == vcpu->arch.last_pt_write_gfn |
2209 | ++vcpu->arch.last_pt_write_count; | 2477 | && !last_updated_pte_accessed(vcpu)) { |
2210 | if (vcpu->arch.last_pt_write_count >= 3) | 2478 | ++vcpu->arch.last_pt_write_count; |
2211 | flooded = 1; | 2479 | if (vcpu->arch.last_pt_write_count >= 3) |
2212 | } else { | 2480 | flooded = 1; |
2213 | vcpu->arch.last_pt_write_gfn = gfn; | 2481 | } else { |
2214 | vcpu->arch.last_pt_write_count = 1; | 2482 | vcpu->arch.last_pt_write_gfn = gfn; |
2215 | vcpu->arch.last_pte_updated = NULL; | 2483 | vcpu->arch.last_pt_write_count = 1; |
2484 | vcpu->arch.last_pte_updated = NULL; | ||
2485 | } | ||
2216 | } | 2486 | } |
2217 | index = kvm_page_table_hashfn(gfn); | 2487 | index = kvm_page_table_hashfn(gfn); |
2218 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 2488 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
@@ -2352,9 +2622,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); | |||
2352 | 2622 | ||
2353 | void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) | 2623 | void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) |
2354 | { | 2624 | { |
2355 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2356 | vcpu->arch.mmu.invlpg(vcpu, gva); | 2625 | vcpu->arch.mmu.invlpg(vcpu, gva); |
2357 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2358 | kvm_mmu_flush_tlb(vcpu); | 2626 | kvm_mmu_flush_tlb(vcpu); |
2359 | ++vcpu->stat.invlpg; | 2627 | ++vcpu->stat.invlpg; |
2360 | } | 2628 | } |
@@ -2451,7 +2719,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
2451 | int i; | 2719 | int i; |
2452 | u64 *pt; | 2720 | u64 *pt; |
2453 | 2721 | ||
2454 | if (!test_bit(slot, &sp->slot_bitmap)) | 2722 | if (!test_bit(slot, sp->slot_bitmap)) |
2455 | continue; | 2723 | continue; |
2456 | 2724 | ||
2457 | pt = sp->spt; | 2725 | pt = sp->spt; |
@@ -2860,8 +3128,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) | |||
2860 | if (sp->role.metaphysical) | 3128 | if (sp->role.metaphysical) |
2861 | continue; | 3129 | continue; |
2862 | 3130 | ||
2863 | slot = gfn_to_memslot(vcpu->kvm, sp->gfn); | ||
2864 | gfn = unalias_gfn(vcpu->kvm, sp->gfn); | 3131 | gfn = unalias_gfn(vcpu->kvm, sp->gfn); |
3132 | slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); | ||
2865 | rmapp = &slot->rmap[gfn - slot->base_gfn]; | 3133 | rmapp = &slot->rmap[gfn - slot->base_gfn]; |
2866 | if (*rmapp) | 3134 | if (*rmapp) |
2867 | printk(KERN_ERR "%s: (%s) shadow page has writable" | 3135 | printk(KERN_ERR "%s: (%s) shadow page has writable" |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 84eee43bbe74..9fd78b6e17ad 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -82,6 +82,7 @@ struct shadow_walker { | |||
82 | int *ptwrite; | 82 | int *ptwrite; |
83 | pfn_t pfn; | 83 | pfn_t pfn; |
84 | u64 *sptep; | 84 | u64 *sptep; |
85 | gpa_t pte_gpa; | ||
85 | }; | 86 | }; |
86 | 87 | ||
87 | static gfn_t gpte_to_gfn(pt_element_t gpte) | 88 | static gfn_t gpte_to_gfn(pt_element_t gpte) |
@@ -222,7 +223,7 @@ walk: | |||
222 | if (ret) | 223 | if (ret) |
223 | goto walk; | 224 | goto walk; |
224 | pte |= PT_DIRTY_MASK; | 225 | pte |= PT_DIRTY_MASK; |
225 | kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); | 226 | kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0); |
226 | walker->ptes[walker->level - 1] = pte; | 227 | walker->ptes[walker->level - 1] = pte; |
227 | } | 228 | } |
228 | 229 | ||
@@ -274,7 +275,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
274 | return; | 275 | return; |
275 | kvm_get_pfn(pfn); | 276 | kvm_get_pfn(pfn); |
276 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, | 277 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, |
277 | gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), | 278 | gpte & PT_DIRTY_MASK, NULL, largepage, |
279 | gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte), | ||
278 | pfn, true); | 280 | pfn, true); |
279 | } | 281 | } |
280 | 282 | ||
@@ -301,8 +303,9 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, | |||
301 | mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, | 303 | mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, |
302 | sw->user_fault, sw->write_fault, | 304 | sw->user_fault, sw->write_fault, |
303 | gw->ptes[gw->level-1] & PT_DIRTY_MASK, | 305 | gw->ptes[gw->level-1] & PT_DIRTY_MASK, |
304 | sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, | 306 | sw->ptwrite, sw->largepage, |
305 | false); | 307 | gw->ptes[gw->level-1] & PT_GLOBAL_MASK, |
308 | gw->gfn, sw->pfn, false); | ||
306 | sw->sptep = sptep; | 309 | sw->sptep = sptep; |
307 | return 1; | 310 | return 1; |
308 | } | 311 | } |
@@ -466,10 +469,22 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, | |||
466 | struct kvm_vcpu *vcpu, u64 addr, | 469 | struct kvm_vcpu *vcpu, u64 addr, |
467 | u64 *sptep, int level) | 470 | u64 *sptep, int level) |
468 | { | 471 | { |
472 | struct shadow_walker *sw = | ||
473 | container_of(_sw, struct shadow_walker, walker); | ||
469 | 474 | ||
470 | if (level == PT_PAGE_TABLE_LEVEL) { | 475 | /* FIXME: properly handle invlpg on large guest pages */ |
471 | if (is_shadow_present_pte(*sptep)) | 476 | if (level == PT_PAGE_TABLE_LEVEL || |
477 | ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { | ||
478 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | ||
479 | |||
480 | sw->pte_gpa = (sp->gfn << PAGE_SHIFT); | ||
481 | sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); | ||
482 | |||
483 | if (is_shadow_present_pte(*sptep)) { | ||
472 | rmap_remove(vcpu->kvm, sptep); | 484 | rmap_remove(vcpu->kvm, sptep); |
485 | if (is_large_pte(*sptep)) | ||
486 | --vcpu->kvm->stat.lpages; | ||
487 | } | ||
473 | set_shadow_pte(sptep, shadow_trap_nonpresent_pte); | 488 | set_shadow_pte(sptep, shadow_trap_nonpresent_pte); |
474 | return 1; | 489 | return 1; |
475 | } | 490 | } |
@@ -480,11 +495,26 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, | |||
480 | 495 | ||
481 | static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | 496 | static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) |
482 | { | 497 | { |
498 | pt_element_t gpte; | ||
483 | struct shadow_walker walker = { | 499 | struct shadow_walker walker = { |
484 | .walker = { .entry = FNAME(shadow_invlpg_entry), }, | 500 | .walker = { .entry = FNAME(shadow_invlpg_entry), }, |
501 | .pte_gpa = -1, | ||
485 | }; | 502 | }; |
486 | 503 | ||
504 | spin_lock(&vcpu->kvm->mmu_lock); | ||
487 | walk_shadow(&walker.walker, vcpu, gva); | 505 | walk_shadow(&walker.walker, vcpu, gva); |
506 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
507 | if (walker.pte_gpa == -1) | ||
508 | return; | ||
509 | if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte, | ||
510 | sizeof(pt_element_t))) | ||
511 | return; | ||
512 | if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { | ||
513 | if (mmu_topup_memory_caches(vcpu)) | ||
514 | return; | ||
515 | kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte, | ||
516 | sizeof(pt_element_t), 0); | ||
517 | } | ||
488 | } | 518 | } |
489 | 519 | ||
490 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | 520 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) |
@@ -580,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
580 | nr_present++; | 610 | nr_present++; |
581 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 611 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
582 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, | 612 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, |
583 | is_dirty_pte(gpte), 0, gfn, | 613 | is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn, |
584 | spte_to_pfn(sp->spt[i]), true, false); | 614 | spte_to_pfn(sp->spt[i]), true, false); |
585 | } | 615 | } |
586 | 616 | ||
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9c4ce657d963..1452851ae258 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -28,6 +28,8 @@ | |||
28 | 28 | ||
29 | #include <asm/desc.h> | 29 | #include <asm/desc.h> |
30 | 30 | ||
31 | #include <asm/virtext.h> | ||
32 | |||
31 | #define __ex(x) __kvm_handle_fault_on_reboot(x) | 33 | #define __ex(x) __kvm_handle_fault_on_reboot(x) |
32 | 34 | ||
33 | MODULE_AUTHOR("Qumranet"); | 35 | MODULE_AUTHOR("Qumranet"); |
@@ -245,34 +247,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
245 | 247 | ||
246 | static int has_svm(void) | 248 | static int has_svm(void) |
247 | { | 249 | { |
248 | uint32_t eax, ebx, ecx, edx; | 250 | const char *msg; |
249 | |||
250 | if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { | ||
251 | printk(KERN_INFO "has_svm: not amd\n"); | ||
252 | return 0; | ||
253 | } | ||
254 | 251 | ||
255 | cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | 252 | if (!cpu_has_svm(&msg)) { |
256 | if (eax < SVM_CPUID_FUNC) { | 253 | printk(KERN_INFO "has_svn: %s\n", msg); |
257 | printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n"); | ||
258 | return 0; | 254 | return 0; |
259 | } | 255 | } |
260 | 256 | ||
261 | cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | ||
262 | if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { | ||
263 | printk(KERN_DEBUG "has_svm: svm not available\n"); | ||
264 | return 0; | ||
265 | } | ||
266 | return 1; | 257 | return 1; |
267 | } | 258 | } |
268 | 259 | ||
269 | static void svm_hardware_disable(void *garbage) | 260 | static void svm_hardware_disable(void *garbage) |
270 | { | 261 | { |
271 | uint64_t efer; | 262 | cpu_svm_disable(); |
272 | |||
273 | wrmsrl(MSR_VM_HSAVE_PA, 0); | ||
274 | rdmsrl(MSR_EFER, efer); | ||
275 | wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); | ||
276 | } | 263 | } |
277 | 264 | ||
278 | static void svm_hardware_enable(void *garbage) | 265 | static void svm_hardware_enable(void *garbage) |
@@ -772,6 +759,22 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, | |||
772 | var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; | 759 | var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; |
773 | var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; | 760 | var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; |
774 | var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; | 761 | var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; |
762 | |||
763 | /* | ||
764 | * SVM always stores 0 for the 'G' bit in the CS selector in | ||
765 | * the VMCB on a VMEXIT. This hurts cross-vendor migration: | ||
766 | * Intel's VMENTRY has a check on the 'G' bit. | ||
767 | */ | ||
768 | if (seg == VCPU_SREG_CS) | ||
769 | var->g = s->limit > 0xfffff; | ||
770 | |||
771 | /* | ||
772 | * Work around a bug where the busy flag in the tr selector | ||
773 | * isn't exposed | ||
774 | */ | ||
775 | if (seg == VCPU_SREG_TR) | ||
776 | var->type |= 0x2; | ||
777 | |||
775 | var->unusable = !var->present; | 778 | var->unusable = !var->present; |
776 | } | 779 | } |
777 | 780 | ||
@@ -1099,6 +1102,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1099 | rep = (io_info & SVM_IOIO_REP_MASK) != 0; | 1102 | rep = (io_info & SVM_IOIO_REP_MASK) != 0; |
1100 | down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; | 1103 | down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; |
1101 | 1104 | ||
1105 | skip_emulated_instruction(&svm->vcpu); | ||
1102 | return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); | 1106 | return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); |
1103 | } | 1107 | } |
1104 | 1108 | ||
@@ -1912,6 +1916,11 @@ static int get_npt_level(void) | |||
1912 | #endif | 1916 | #endif |
1913 | } | 1917 | } |
1914 | 1918 | ||
1919 | static int svm_get_mt_mask_shift(void) | ||
1920 | { | ||
1921 | return 0; | ||
1922 | } | ||
1923 | |||
1915 | static struct kvm_x86_ops svm_x86_ops = { | 1924 | static struct kvm_x86_ops svm_x86_ops = { |
1916 | .cpu_has_kvm_support = has_svm, | 1925 | .cpu_has_kvm_support = has_svm, |
1917 | .disabled_by_bios = is_disabled, | 1926 | .disabled_by_bios = is_disabled, |
@@ -1967,6 +1976,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
1967 | 1976 | ||
1968 | .set_tss_addr = svm_set_tss_addr, | 1977 | .set_tss_addr = svm_set_tss_addr, |
1969 | .get_tdp_level = get_npt_level, | 1978 | .get_tdp_level = get_npt_level, |
1979 | .get_mt_mask_shift = svm_get_mt_mask_shift, | ||
1970 | }; | 1980 | }; |
1971 | 1981 | ||
1972 | static int __init svm_init(void) | 1982 | static int __init svm_init(void) |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a4018b01e1f9..6259d7467648 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -16,7 +16,6 @@ | |||
16 | */ | 16 | */ |
17 | 17 | ||
18 | #include "irq.h" | 18 | #include "irq.h" |
19 | #include "vmx.h" | ||
20 | #include "mmu.h" | 19 | #include "mmu.h" |
21 | 20 | ||
22 | #include <linux/kvm_host.h> | 21 | #include <linux/kvm_host.h> |
@@ -31,6 +30,8 @@ | |||
31 | 30 | ||
32 | #include <asm/io.h> | 31 | #include <asm/io.h> |
33 | #include <asm/desc.h> | 32 | #include <asm/desc.h> |
33 | #include <asm/vmx.h> | ||
34 | #include <asm/virtext.h> | ||
34 | 35 | ||
35 | #define __ex(x) __kvm_handle_fault_on_reboot(x) | 36 | #define __ex(x) __kvm_handle_fault_on_reboot(x) |
36 | 37 | ||
@@ -90,6 +91,11 @@ struct vcpu_vmx { | |||
90 | } rmode; | 91 | } rmode; |
91 | int vpid; | 92 | int vpid; |
92 | bool emulation_required; | 93 | bool emulation_required; |
94 | |||
95 | /* Support for vnmi-less CPUs */ | ||
96 | int soft_vnmi_blocked; | ||
97 | ktime_t entry_time; | ||
98 | s64 vnmi_blocked_time; | ||
93 | }; | 99 | }; |
94 | 100 | ||
95 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | 101 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) |
@@ -122,7 +128,7 @@ static struct vmcs_config { | |||
122 | u32 vmentry_ctrl; | 128 | u32 vmentry_ctrl; |
123 | } vmcs_config; | 129 | } vmcs_config; |
124 | 130 | ||
125 | struct vmx_capability { | 131 | static struct vmx_capability { |
126 | u32 ept; | 132 | u32 ept; |
127 | u32 vpid; | 133 | u32 vpid; |
128 | } vmx_capability; | 134 | } vmx_capability; |
@@ -957,6 +963,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
957 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); | 963 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); |
958 | 964 | ||
959 | break; | 965 | break; |
966 | case MSR_IA32_CR_PAT: | ||
967 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | ||
968 | vmcs_write64(GUEST_IA32_PAT, data); | ||
969 | vcpu->arch.pat = data; | ||
970 | break; | ||
971 | } | ||
972 | /* Otherwise falls through to kvm_set_msr_common */ | ||
960 | default: | 973 | default: |
961 | vmx_load_host_state(vmx); | 974 | vmx_load_host_state(vmx); |
962 | msr = find_msr_entry(vmx, msr_index); | 975 | msr = find_msr_entry(vmx, msr_index); |
@@ -1032,8 +1045,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu) | |||
1032 | 1045 | ||
1033 | static __init int cpu_has_kvm_support(void) | 1046 | static __init int cpu_has_kvm_support(void) |
1034 | { | 1047 | { |
1035 | unsigned long ecx = cpuid_ecx(1); | 1048 | return cpu_has_vmx(); |
1036 | return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ | ||
1037 | } | 1049 | } |
1038 | 1050 | ||
1039 | static __init int vmx_disabled_by_bios(void) | 1051 | static __init int vmx_disabled_by_bios(void) |
@@ -1079,13 +1091,22 @@ static void vmclear_local_vcpus(void) | |||
1079 | __vcpu_clear(vmx); | 1091 | __vcpu_clear(vmx); |
1080 | } | 1092 | } |
1081 | 1093 | ||
1082 | static void hardware_disable(void *garbage) | 1094 | |
1095 | /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() | ||
1096 | * tricks. | ||
1097 | */ | ||
1098 | static void kvm_cpu_vmxoff(void) | ||
1083 | { | 1099 | { |
1084 | vmclear_local_vcpus(); | ||
1085 | asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); | 1100 | asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); |
1086 | write_cr4(read_cr4() & ~X86_CR4_VMXE); | 1101 | write_cr4(read_cr4() & ~X86_CR4_VMXE); |
1087 | } | 1102 | } |
1088 | 1103 | ||
1104 | static void hardware_disable(void *garbage) | ||
1105 | { | ||
1106 | vmclear_local_vcpus(); | ||
1107 | kvm_cpu_vmxoff(); | ||
1108 | } | ||
1109 | |||
1089 | static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, | 1110 | static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, |
1090 | u32 msr, u32 *result) | 1111 | u32 msr, u32 *result) |
1091 | { | 1112 | { |
@@ -1176,12 +1197,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1176 | #ifdef CONFIG_X86_64 | 1197 | #ifdef CONFIG_X86_64 |
1177 | min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; | 1198 | min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; |
1178 | #endif | 1199 | #endif |
1179 | opt = 0; | 1200 | opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; |
1180 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, | 1201 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, |
1181 | &_vmexit_control) < 0) | 1202 | &_vmexit_control) < 0) |
1182 | return -EIO; | 1203 | return -EIO; |
1183 | 1204 | ||
1184 | min = opt = 0; | 1205 | min = 0; |
1206 | opt = VM_ENTRY_LOAD_IA32_PAT; | ||
1185 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, | 1207 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, |
1186 | &_vmentry_control) < 0) | 1208 | &_vmentry_control) < 0) |
1187 | return -EIO; | 1209 | return -EIO; |
@@ -2087,8 +2109,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) | |||
2087 | */ | 2109 | */ |
2088 | static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | 2110 | static int vmx_vcpu_setup(struct vcpu_vmx *vmx) |
2089 | { | 2111 | { |
2090 | u32 host_sysenter_cs; | 2112 | u32 host_sysenter_cs, msr_low, msr_high; |
2091 | u32 junk; | 2113 | u32 junk; |
2114 | u64 host_pat; | ||
2092 | unsigned long a; | 2115 | unsigned long a; |
2093 | struct descriptor_table dt; | 2116 | struct descriptor_table dt; |
2094 | int i; | 2117 | int i; |
@@ -2176,6 +2199,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2176 | rdmsrl(MSR_IA32_SYSENTER_EIP, a); | 2199 | rdmsrl(MSR_IA32_SYSENTER_EIP, a); |
2177 | vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ | 2200 | vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ |
2178 | 2201 | ||
2202 | if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { | ||
2203 | rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); | ||
2204 | host_pat = msr_low | ((u64) msr_high << 32); | ||
2205 | vmcs_write64(HOST_IA32_PAT, host_pat); | ||
2206 | } | ||
2207 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | ||
2208 | rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); | ||
2209 | host_pat = msr_low | ((u64) msr_high << 32); | ||
2210 | /* Write the default value follow host pat */ | ||
2211 | vmcs_write64(GUEST_IA32_PAT, host_pat); | ||
2212 | /* Keep arch.pat sync with GUEST_IA32_PAT */ | ||
2213 | vmx->vcpu.arch.pat = host_pat; | ||
2214 | } | ||
2215 | |||
2179 | for (i = 0; i < NR_VMX_MSR; ++i) { | 2216 | for (i = 0; i < NR_VMX_MSR; ++i) { |
2180 | u32 index = vmx_msr_index[i]; | 2217 | u32 index = vmx_msr_index[i]; |
2181 | u32 data_low, data_high; | 2218 | u32 data_low, data_high; |
@@ -2230,6 +2267,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2230 | 2267 | ||
2231 | vmx->vcpu.arch.rmode.active = 0; | 2268 | vmx->vcpu.arch.rmode.active = 0; |
2232 | 2269 | ||
2270 | vmx->soft_vnmi_blocked = 0; | ||
2271 | |||
2233 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); | 2272 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); |
2234 | kvm_set_cr8(&vmx->vcpu, 0); | 2273 | kvm_set_cr8(&vmx->vcpu, 0); |
2235 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | 2274 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; |
@@ -2335,6 +2374,29 @@ out: | |||
2335 | return ret; | 2374 | return ret; |
2336 | } | 2375 | } |
2337 | 2376 | ||
2377 | static void enable_irq_window(struct kvm_vcpu *vcpu) | ||
2378 | { | ||
2379 | u32 cpu_based_vm_exec_control; | ||
2380 | |||
2381 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
2382 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; | ||
2383 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
2384 | } | ||
2385 | |||
2386 | static void enable_nmi_window(struct kvm_vcpu *vcpu) | ||
2387 | { | ||
2388 | u32 cpu_based_vm_exec_control; | ||
2389 | |||
2390 | if (!cpu_has_virtual_nmis()) { | ||
2391 | enable_irq_window(vcpu); | ||
2392 | return; | ||
2393 | } | ||
2394 | |||
2395 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
2396 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; | ||
2397 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
2398 | } | ||
2399 | |||
2338 | static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | 2400 | static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) |
2339 | { | 2401 | { |
2340 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2402 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
@@ -2358,10 +2420,54 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | |||
2358 | 2420 | ||
2359 | static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | 2421 | static void vmx_inject_nmi(struct kvm_vcpu *vcpu) |
2360 | { | 2422 | { |
2423 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2424 | |||
2425 | if (!cpu_has_virtual_nmis()) { | ||
2426 | /* | ||
2427 | * Tracking the NMI-blocked state in software is built upon | ||
2428 | * finding the next open IRQ window. This, in turn, depends on | ||
2429 | * well-behaving guests: They have to keep IRQs disabled at | ||
2430 | * least as long as the NMI handler runs. Otherwise we may | ||
2431 | * cause NMI nesting, maybe breaking the guest. But as this is | ||
2432 | * highly unlikely, we can live with the residual risk. | ||
2433 | */ | ||
2434 | vmx->soft_vnmi_blocked = 1; | ||
2435 | vmx->vnmi_blocked_time = 0; | ||
2436 | } | ||
2437 | |||
2438 | ++vcpu->stat.nmi_injections; | ||
2439 | if (vcpu->arch.rmode.active) { | ||
2440 | vmx->rmode.irq.pending = true; | ||
2441 | vmx->rmode.irq.vector = NMI_VECTOR; | ||
2442 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | ||
2443 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
2444 | NMI_VECTOR | INTR_TYPE_SOFT_INTR | | ||
2445 | INTR_INFO_VALID_MASK); | ||
2446 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | ||
2447 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); | ||
2448 | return; | ||
2449 | } | ||
2361 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2450 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
2362 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); | 2451 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); |
2363 | } | 2452 | } |
2364 | 2453 | ||
2454 | static void vmx_update_window_states(struct kvm_vcpu *vcpu) | ||
2455 | { | ||
2456 | u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
2457 | |||
2458 | vcpu->arch.nmi_window_open = | ||
2459 | !(guest_intr & (GUEST_INTR_STATE_STI | | ||
2460 | GUEST_INTR_STATE_MOV_SS | | ||
2461 | GUEST_INTR_STATE_NMI)); | ||
2462 | if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) | ||
2463 | vcpu->arch.nmi_window_open = 0; | ||
2464 | |||
2465 | vcpu->arch.interrupt_window_open = | ||
2466 | ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | ||
2467 | !(guest_intr & (GUEST_INTR_STATE_STI | | ||
2468 | GUEST_INTR_STATE_MOV_SS))); | ||
2469 | } | ||
2470 | |||
2365 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | 2471 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) |
2366 | { | 2472 | { |
2367 | int word_index = __ffs(vcpu->arch.irq_summary); | 2473 | int word_index = __ffs(vcpu->arch.irq_summary); |
@@ -2374,40 +2480,49 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | |||
2374 | kvm_queue_interrupt(vcpu, irq); | 2480 | kvm_queue_interrupt(vcpu, irq); |
2375 | } | 2481 | } |
2376 | 2482 | ||
2377 | |||
2378 | static void do_interrupt_requests(struct kvm_vcpu *vcpu, | 2483 | static void do_interrupt_requests(struct kvm_vcpu *vcpu, |
2379 | struct kvm_run *kvm_run) | 2484 | struct kvm_run *kvm_run) |
2380 | { | 2485 | { |
2381 | u32 cpu_based_vm_exec_control; | 2486 | vmx_update_window_states(vcpu); |
2382 | |||
2383 | vcpu->arch.interrupt_window_open = | ||
2384 | ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | ||
2385 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); | ||
2386 | 2487 | ||
2387 | if (vcpu->arch.interrupt_window_open && | 2488 | if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { |
2388 | vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) | 2489 | if (vcpu->arch.interrupt.pending) { |
2389 | kvm_do_inject_irq(vcpu); | 2490 | enable_nmi_window(vcpu); |
2491 | } else if (vcpu->arch.nmi_window_open) { | ||
2492 | vcpu->arch.nmi_pending = false; | ||
2493 | vcpu->arch.nmi_injected = true; | ||
2494 | } else { | ||
2495 | enable_nmi_window(vcpu); | ||
2496 | return; | ||
2497 | } | ||
2498 | } | ||
2499 | if (vcpu->arch.nmi_injected) { | ||
2500 | vmx_inject_nmi(vcpu); | ||
2501 | if (vcpu->arch.nmi_pending) | ||
2502 | enable_nmi_window(vcpu); | ||
2503 | else if (vcpu->arch.irq_summary | ||
2504 | || kvm_run->request_interrupt_window) | ||
2505 | enable_irq_window(vcpu); | ||
2506 | return; | ||
2507 | } | ||
2390 | 2508 | ||
2391 | if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) | 2509 | if (vcpu->arch.interrupt_window_open) { |
2392 | vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); | 2510 | if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) |
2511 | kvm_do_inject_irq(vcpu); | ||
2393 | 2512 | ||
2394 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 2513 | if (vcpu->arch.interrupt.pending) |
2514 | vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); | ||
2515 | } | ||
2395 | if (!vcpu->arch.interrupt_window_open && | 2516 | if (!vcpu->arch.interrupt_window_open && |
2396 | (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) | 2517 | (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) |
2397 | /* | 2518 | enable_irq_window(vcpu); |
2398 | * Interrupts blocked. Wait for unblock. | ||
2399 | */ | ||
2400 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; | ||
2401 | else | ||
2402 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | ||
2403 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
2404 | } | 2519 | } |
2405 | 2520 | ||
2406 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) | 2521 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) |
2407 | { | 2522 | { |
2408 | int ret; | 2523 | int ret; |
2409 | struct kvm_userspace_memory_region tss_mem = { | 2524 | struct kvm_userspace_memory_region tss_mem = { |
2410 | .slot = 8, | 2525 | .slot = TSS_PRIVATE_MEMSLOT, |
2411 | .guest_phys_addr = addr, | 2526 | .guest_phys_addr = addr, |
2412 | .memory_size = PAGE_SIZE * 3, | 2527 | .memory_size = PAGE_SIZE * 3, |
2413 | .flags = 0, | 2528 | .flags = 0, |
@@ -2492,7 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2492 | set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); | 2607 | set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); |
2493 | } | 2608 | } |
2494 | 2609 | ||
2495 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ | 2610 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) |
2496 | return 1; /* already handled by vmx_vcpu_run() */ | 2611 | return 1; /* already handled by vmx_vcpu_run() */ |
2497 | 2612 | ||
2498 | if (is_no_device(intr_info)) { | 2613 | if (is_no_device(intr_info)) { |
@@ -2581,6 +2696,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2581 | rep = (exit_qualification & 32) != 0; | 2696 | rep = (exit_qualification & 32) != 0; |
2582 | port = exit_qualification >> 16; | 2697 | port = exit_qualification >> 16; |
2583 | 2698 | ||
2699 | skip_emulated_instruction(vcpu); | ||
2584 | return kvm_emulate_pio(vcpu, kvm_run, in, size, port); | 2700 | return kvm_emulate_pio(vcpu, kvm_run, in, size, port); |
2585 | } | 2701 | } |
2586 | 2702 | ||
@@ -2767,6 +2883,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, | |||
2767 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 2883 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
2768 | 2884 | ||
2769 | KVMTRACE_0D(PEND_INTR, vcpu, handler); | 2885 | KVMTRACE_0D(PEND_INTR, vcpu, handler); |
2886 | ++vcpu->stat.irq_window_exits; | ||
2770 | 2887 | ||
2771 | /* | 2888 | /* |
2772 | * If the user space waits to inject interrupts, exit as soon as | 2889 | * If the user space waits to inject interrupts, exit as soon as |
@@ -2775,7 +2892,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, | |||
2775 | if (kvm_run->request_interrupt_window && | 2892 | if (kvm_run->request_interrupt_window && |
2776 | !vcpu->arch.irq_summary) { | 2893 | !vcpu->arch.irq_summary) { |
2777 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | 2894 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; |
2778 | ++vcpu->stat.irq_window_exits; | ||
2779 | return 0; | 2895 | return 0; |
2780 | } | 2896 | } |
2781 | return 1; | 2897 | return 1; |
@@ -2832,6 +2948,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2832 | 2948 | ||
2833 | static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2949 | static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2834 | { | 2950 | { |
2951 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2835 | unsigned long exit_qualification; | 2952 | unsigned long exit_qualification; |
2836 | u16 tss_selector; | 2953 | u16 tss_selector; |
2837 | int reason; | 2954 | int reason; |
@@ -2839,6 +2956,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2839 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 2956 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
2840 | 2957 | ||
2841 | reason = (u32)exit_qualification >> 30; | 2958 | reason = (u32)exit_qualification >> 30; |
2959 | if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && | ||
2960 | (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && | ||
2961 | (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) | ||
2962 | == INTR_TYPE_NMI_INTR) { | ||
2963 | vcpu->arch.nmi_injected = false; | ||
2964 | if (cpu_has_virtual_nmis()) | ||
2965 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | ||
2966 | GUEST_INTR_STATE_NMI); | ||
2967 | } | ||
2842 | tss_selector = exit_qualification; | 2968 | tss_selector = exit_qualification; |
2843 | 2969 | ||
2844 | return kvm_task_switch(vcpu, tss_selector, reason); | 2970 | return kvm_task_switch(vcpu, tss_selector, reason); |
@@ -2927,16 +3053,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, | |||
2927 | while (!guest_state_valid(vcpu)) { | 3053 | while (!guest_state_valid(vcpu)) { |
2928 | err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); | 3054 | err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); |
2929 | 3055 | ||
2930 | switch (err) { | 3056 | if (err == EMULATE_DO_MMIO) |
2931 | case EMULATE_DONE: | 3057 | break; |
2932 | break; | 3058 | |
2933 | case EMULATE_DO_MMIO: | 3059 | if (err != EMULATE_DONE) { |
2934 | kvm_report_emulation_failure(vcpu, "mmio"); | 3060 | kvm_report_emulation_failure(vcpu, "emulation failure"); |
2935 | /* TODO: Handle MMIO */ | 3061 | return; |
2936 | return; | ||
2937 | default: | ||
2938 | kvm_report_emulation_failure(vcpu, "emulation failure"); | ||
2939 | return; | ||
2940 | } | 3062 | } |
2941 | 3063 | ||
2942 | if (signal_pending(current)) | 3064 | if (signal_pending(current)) |
@@ -2948,8 +3070,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, | |||
2948 | local_irq_disable(); | 3070 | local_irq_disable(); |
2949 | preempt_disable(); | 3071 | preempt_disable(); |
2950 | 3072 | ||
2951 | /* Guest state should be valid now, no more emulation should be needed */ | 3073 | /* Guest state should be valid now except if we need to |
2952 | vmx->emulation_required = 0; | 3074 | * emulate an MMIO */ |
3075 | if (guest_state_valid(vcpu)) | ||
3076 | vmx->emulation_required = 0; | ||
2953 | } | 3077 | } |
2954 | 3078 | ||
2955 | /* | 3079 | /* |
@@ -2996,6 +3120,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
2996 | KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), | 3120 | KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), |
2997 | (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); | 3121 | (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); |
2998 | 3122 | ||
3123 | /* If we need to emulate an MMIO from handle_invalid_guest_state | ||
3124 | * we just return 0 */ | ||
3125 | if (vmx->emulation_required && emulate_invalid_guest_state) | ||
3126 | return 0; | ||
3127 | |||
2999 | /* Access CR3 don't cause VMExit in paging mode, so we need | 3128 | /* Access CR3 don't cause VMExit in paging mode, so we need |
3000 | * to sync with guest real CR3. */ | 3129 | * to sync with guest real CR3. */ |
3001 | if (vm_need_ept() && is_paging(vcpu)) { | 3130 | if (vm_need_ept() && is_paging(vcpu)) { |
@@ -3012,9 +3141,32 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
3012 | 3141 | ||
3013 | if ((vectoring_info & VECTORING_INFO_VALID_MASK) && | 3142 | if ((vectoring_info & VECTORING_INFO_VALID_MASK) && |
3014 | (exit_reason != EXIT_REASON_EXCEPTION_NMI && | 3143 | (exit_reason != EXIT_REASON_EXCEPTION_NMI && |
3015 | exit_reason != EXIT_REASON_EPT_VIOLATION)) | 3144 | exit_reason != EXIT_REASON_EPT_VIOLATION && |
3016 | printk(KERN_WARNING "%s: unexpected, valid vectoring info and " | 3145 | exit_reason != EXIT_REASON_TASK_SWITCH)) |
3017 | "exit reason is 0x%x\n", __func__, exit_reason); | 3146 | printk(KERN_WARNING "%s: unexpected, valid vectoring info " |
3147 | "(0x%x) and exit reason is 0x%x\n", | ||
3148 | __func__, vectoring_info, exit_reason); | ||
3149 | |||
3150 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { | ||
3151 | if (vcpu->arch.interrupt_window_open) { | ||
3152 | vmx->soft_vnmi_blocked = 0; | ||
3153 | vcpu->arch.nmi_window_open = 1; | ||
3154 | } else if (vmx->vnmi_blocked_time > 1000000000LL && | ||
3155 | vcpu->arch.nmi_pending) { | ||
3156 | /* | ||
3157 | * This CPU don't support us in finding the end of an | ||
3158 | * NMI-blocked window if the guest runs with IRQs | ||
3159 | * disabled. So we pull the trigger after 1 s of | ||
3160 | * futile waiting, but inform the user about this. | ||
3161 | */ | ||
3162 | printk(KERN_WARNING "%s: Breaking out of NMI-blocked " | ||
3163 | "state on VCPU %d after 1 s timeout\n", | ||
3164 | __func__, vcpu->vcpu_id); | ||
3165 | vmx->soft_vnmi_blocked = 0; | ||
3166 | vmx->vcpu.arch.nmi_window_open = 1; | ||
3167 | } | ||
3168 | } | ||
3169 | |||
3018 | if (exit_reason < kvm_vmx_max_exit_handlers | 3170 | if (exit_reason < kvm_vmx_max_exit_handlers |
3019 | && kvm_vmx_exit_handlers[exit_reason]) | 3171 | && kvm_vmx_exit_handlers[exit_reason]) |
3020 | return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); | 3172 | return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); |
@@ -3042,51 +3194,6 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu) | |||
3042 | vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); | 3194 | vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); |
3043 | } | 3195 | } |
3044 | 3196 | ||
3045 | static void enable_irq_window(struct kvm_vcpu *vcpu) | ||
3046 | { | ||
3047 | u32 cpu_based_vm_exec_control; | ||
3048 | |||
3049 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
3050 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; | ||
3051 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
3052 | } | ||
3053 | |||
3054 | static void enable_nmi_window(struct kvm_vcpu *vcpu) | ||
3055 | { | ||
3056 | u32 cpu_based_vm_exec_control; | ||
3057 | |||
3058 | if (!cpu_has_virtual_nmis()) | ||
3059 | return; | ||
3060 | |||
3061 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
3062 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; | ||
3063 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
3064 | } | ||
3065 | |||
3066 | static int vmx_nmi_enabled(struct kvm_vcpu *vcpu) | ||
3067 | { | ||
3068 | u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
3069 | return !(guest_intr & (GUEST_INTR_STATE_NMI | | ||
3070 | GUEST_INTR_STATE_MOV_SS | | ||
3071 | GUEST_INTR_STATE_STI)); | ||
3072 | } | ||
3073 | |||
3074 | static int vmx_irq_enabled(struct kvm_vcpu *vcpu) | ||
3075 | { | ||
3076 | u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
3077 | return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS | | ||
3078 | GUEST_INTR_STATE_STI)) && | ||
3079 | (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); | ||
3080 | } | ||
3081 | |||
3082 | static void enable_intr_window(struct kvm_vcpu *vcpu) | ||
3083 | { | ||
3084 | if (vcpu->arch.nmi_pending) | ||
3085 | enable_nmi_window(vcpu); | ||
3086 | else if (kvm_cpu_has_interrupt(vcpu)) | ||
3087 | enable_irq_window(vcpu); | ||
3088 | } | ||
3089 | |||
3090 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | 3197 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) |
3091 | { | 3198 | { |
3092 | u32 exit_intr_info; | 3199 | u32 exit_intr_info; |
@@ -3109,7 +3216,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
3109 | if (unblock_nmi && vector != DF_VECTOR) | 3216 | if (unblock_nmi && vector != DF_VECTOR) |
3110 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, | 3217 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, |
3111 | GUEST_INTR_STATE_NMI); | 3218 | GUEST_INTR_STATE_NMI); |
3112 | } | 3219 | } else if (unlikely(vmx->soft_vnmi_blocked)) |
3220 | vmx->vnmi_blocked_time += | ||
3221 | ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); | ||
3113 | 3222 | ||
3114 | idt_vectoring_info = vmx->idt_vectoring_info; | 3223 | idt_vectoring_info = vmx->idt_vectoring_info; |
3115 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | 3224 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; |
@@ -3147,26 +3256,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) | |||
3147 | { | 3256 | { |
3148 | update_tpr_threshold(vcpu); | 3257 | update_tpr_threshold(vcpu); |
3149 | 3258 | ||
3150 | if (cpu_has_virtual_nmis()) { | 3259 | vmx_update_window_states(vcpu); |
3151 | if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { | 3260 | |
3152 | if (vcpu->arch.interrupt.pending) { | 3261 | if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { |
3153 | enable_nmi_window(vcpu); | 3262 | if (vcpu->arch.interrupt.pending) { |
3154 | } else if (vmx_nmi_enabled(vcpu)) { | 3263 | enable_nmi_window(vcpu); |
3155 | vcpu->arch.nmi_pending = false; | 3264 | } else if (vcpu->arch.nmi_window_open) { |
3156 | vcpu->arch.nmi_injected = true; | 3265 | vcpu->arch.nmi_pending = false; |
3157 | } else { | 3266 | vcpu->arch.nmi_injected = true; |
3158 | enable_intr_window(vcpu); | 3267 | } else { |
3159 | return; | 3268 | enable_nmi_window(vcpu); |
3160 | } | ||
3161 | } | ||
3162 | if (vcpu->arch.nmi_injected) { | ||
3163 | vmx_inject_nmi(vcpu); | ||
3164 | enable_intr_window(vcpu); | ||
3165 | return; | 3269 | return; |
3166 | } | 3270 | } |
3167 | } | 3271 | } |
3272 | if (vcpu->arch.nmi_injected) { | ||
3273 | vmx_inject_nmi(vcpu); | ||
3274 | if (vcpu->arch.nmi_pending) | ||
3275 | enable_nmi_window(vcpu); | ||
3276 | else if (kvm_cpu_has_interrupt(vcpu)) | ||
3277 | enable_irq_window(vcpu); | ||
3278 | return; | ||
3279 | } | ||
3168 | if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { | 3280 | if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { |
3169 | if (vmx_irq_enabled(vcpu)) | 3281 | if (vcpu->arch.interrupt_window_open) |
3170 | kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); | 3282 | kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); |
3171 | else | 3283 | else |
3172 | enable_irq_window(vcpu); | 3284 | enable_irq_window(vcpu); |
@@ -3174,6 +3286,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) | |||
3174 | if (vcpu->arch.interrupt.pending) { | 3286 | if (vcpu->arch.interrupt.pending) { |
3175 | vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); | 3287 | vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); |
3176 | kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); | 3288 | kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); |
3289 | if (kvm_cpu_has_interrupt(vcpu)) | ||
3290 | enable_irq_window(vcpu); | ||
3177 | } | 3291 | } |
3178 | } | 3292 | } |
3179 | 3293 | ||
@@ -3213,6 +3327,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3213 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3327 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3214 | u32 intr_info; | 3328 | u32 intr_info; |
3215 | 3329 | ||
3330 | /* Record the guest's net vcpu time for enforced NMI injections. */ | ||
3331 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) | ||
3332 | vmx->entry_time = ktime_get(); | ||
3333 | |||
3216 | /* Handle invalid guest state instead of entering VMX */ | 3334 | /* Handle invalid guest state instead of entering VMX */ |
3217 | if (vmx->emulation_required && emulate_invalid_guest_state) { | 3335 | if (vmx->emulation_required && emulate_invalid_guest_state) { |
3218 | handle_invalid_guest_state(vcpu, kvm_run); | 3336 | handle_invalid_guest_state(vcpu, kvm_run); |
@@ -3327,9 +3445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3327 | if (vmx->rmode.irq.pending) | 3445 | if (vmx->rmode.irq.pending) |
3328 | fixup_rmode_irq(vmx); | 3446 | fixup_rmode_irq(vmx); |
3329 | 3447 | ||
3330 | vcpu->arch.interrupt_window_open = | 3448 | vmx_update_window_states(vcpu); |
3331 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | ||
3332 | (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0; | ||
3333 | 3449 | ||
3334 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | 3450 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); |
3335 | vmx->launched = 1; | 3451 | vmx->launched = 1; |
@@ -3337,7 +3453,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3337 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | 3453 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
3338 | 3454 | ||
3339 | /* We need to handle NMIs before interrupts are enabled */ | 3455 | /* We need to handle NMIs before interrupts are enabled */ |
3340 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 && | 3456 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && |
3341 | (intr_info & INTR_INFO_VALID_MASK)) { | 3457 | (intr_info & INTR_INFO_VALID_MASK)) { |
3342 | KVMTRACE_0D(NMI, vcpu, handler); | 3458 | KVMTRACE_0D(NMI, vcpu, handler); |
3343 | asm("int $2"); | 3459 | asm("int $2"); |
@@ -3455,6 +3571,11 @@ static int get_ept_level(void) | |||
3455 | return VMX_EPT_DEFAULT_GAW + 1; | 3571 | return VMX_EPT_DEFAULT_GAW + 1; |
3456 | } | 3572 | } |
3457 | 3573 | ||
3574 | static int vmx_get_mt_mask_shift(void) | ||
3575 | { | ||
3576 | return VMX_EPT_MT_EPTE_SHIFT; | ||
3577 | } | ||
3578 | |||
3458 | static struct kvm_x86_ops vmx_x86_ops = { | 3579 | static struct kvm_x86_ops vmx_x86_ops = { |
3459 | .cpu_has_kvm_support = cpu_has_kvm_support, | 3580 | .cpu_has_kvm_support = cpu_has_kvm_support, |
3460 | .disabled_by_bios = vmx_disabled_by_bios, | 3581 | .disabled_by_bios = vmx_disabled_by_bios, |
@@ -3510,6 +3631,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
3510 | 3631 | ||
3511 | .set_tss_addr = vmx_set_tss_addr, | 3632 | .set_tss_addr = vmx_set_tss_addr, |
3512 | .get_tdp_level = get_ept_level, | 3633 | .get_tdp_level = get_ept_level, |
3634 | .get_mt_mask_shift = vmx_get_mt_mask_shift, | ||
3513 | }; | 3635 | }; |
3514 | 3636 | ||
3515 | static int __init vmx_init(void) | 3637 | static int __init vmx_init(void) |
@@ -3566,10 +3688,10 @@ static int __init vmx_init(void) | |||
3566 | bypass_guest_pf = 0; | 3688 | bypass_guest_pf = 0; |
3567 | kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | | 3689 | kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | |
3568 | VMX_EPT_WRITABLE_MASK | | 3690 | VMX_EPT_WRITABLE_MASK | |
3569 | VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT | | ||
3570 | VMX_EPT_IGMT_BIT); | 3691 | VMX_EPT_IGMT_BIT); |
3571 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, | 3692 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, |
3572 | VMX_EPT_EXECUTABLE_MASK); | 3693 | VMX_EPT_EXECUTABLE_MASK, |
3694 | VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); | ||
3573 | kvm_enable_tdp(); | 3695 | kvm_enable_tdp(); |
3574 | } else | 3696 | } else |
3575 | kvm_disable_tdp(); | 3697 | kvm_disable_tdp(); |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f1f8ff2f1fa2..0e6aa8141dcd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <asm/uaccess.h> | 39 | #include <asm/uaccess.h> |
40 | #include <asm/msr.h> | 40 | #include <asm/msr.h> |
41 | #include <asm/desc.h> | 41 | #include <asm/desc.h> |
42 | #include <asm/mtrr.h> | ||
42 | 43 | ||
43 | #define MAX_IO_MSRS 256 | 44 | #define MAX_IO_MSRS 256 |
44 | #define CR0_RESERVED_BITS \ | 45 | #define CR0_RESERVED_BITS \ |
@@ -86,6 +87,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
86 | { "halt_wakeup", VCPU_STAT(halt_wakeup) }, | 87 | { "halt_wakeup", VCPU_STAT(halt_wakeup) }, |
87 | { "hypercalls", VCPU_STAT(hypercalls) }, | 88 | { "hypercalls", VCPU_STAT(hypercalls) }, |
88 | { "request_irq", VCPU_STAT(request_irq_exits) }, | 89 | { "request_irq", VCPU_STAT(request_irq_exits) }, |
90 | { "request_nmi", VCPU_STAT(request_nmi_exits) }, | ||
89 | { "irq_exits", VCPU_STAT(irq_exits) }, | 91 | { "irq_exits", VCPU_STAT(irq_exits) }, |
90 | { "host_state_reload", VCPU_STAT(host_state_reload) }, | 92 | { "host_state_reload", VCPU_STAT(host_state_reload) }, |
91 | { "efer_reload", VCPU_STAT(efer_reload) }, | 93 | { "efer_reload", VCPU_STAT(efer_reload) }, |
@@ -93,6 +95,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
93 | { "insn_emulation", VCPU_STAT(insn_emulation) }, | 95 | { "insn_emulation", VCPU_STAT(insn_emulation) }, |
94 | { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, | 96 | { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, |
95 | { "irq_injections", VCPU_STAT(irq_injections) }, | 97 | { "irq_injections", VCPU_STAT(irq_injections) }, |
98 | { "nmi_injections", VCPU_STAT(nmi_injections) }, | ||
96 | { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, | 99 | { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, |
97 | { "mmu_pte_write", VM_STAT(mmu_pte_write) }, | 100 | { "mmu_pte_write", VM_STAT(mmu_pte_write) }, |
98 | { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, | 101 | { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, |
@@ -101,6 +104,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
101 | { "mmu_recycled", VM_STAT(mmu_recycled) }, | 104 | { "mmu_recycled", VM_STAT(mmu_recycled) }, |
102 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, | 105 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, |
103 | { "mmu_unsync", VM_STAT(mmu_unsync) }, | 106 | { "mmu_unsync", VM_STAT(mmu_unsync) }, |
107 | { "mmu_unsync_global", VM_STAT(mmu_unsync_global) }, | ||
104 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, | 108 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, |
105 | { "largepages", VM_STAT(lpages) }, | 109 | { "largepages", VM_STAT(lpages) }, |
106 | { NULL } | 110 | { NULL } |
@@ -312,6 +316,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
312 | kvm_x86_ops->set_cr0(vcpu, cr0); | 316 | kvm_x86_ops->set_cr0(vcpu, cr0); |
313 | vcpu->arch.cr0 = cr0; | 317 | vcpu->arch.cr0 = cr0; |
314 | 318 | ||
319 | kvm_mmu_sync_global(vcpu); | ||
315 | kvm_mmu_reset_context(vcpu); | 320 | kvm_mmu_reset_context(vcpu); |
316 | return; | 321 | return; |
317 | } | 322 | } |
@@ -355,6 +360,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
355 | } | 360 | } |
356 | kvm_x86_ops->set_cr4(vcpu, cr4); | 361 | kvm_x86_ops->set_cr4(vcpu, cr4); |
357 | vcpu->arch.cr4 = cr4; | 362 | vcpu->arch.cr4 = cr4; |
363 | kvm_mmu_sync_global(vcpu); | ||
358 | kvm_mmu_reset_context(vcpu); | 364 | kvm_mmu_reset_context(vcpu); |
359 | } | 365 | } |
360 | EXPORT_SYMBOL_GPL(kvm_set_cr4); | 366 | EXPORT_SYMBOL_GPL(kvm_set_cr4); |
@@ -449,7 +455,7 @@ static u32 msrs_to_save[] = { | |||
449 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | 455 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, |
450 | #endif | 456 | #endif |
451 | MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, | 457 | MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, |
452 | MSR_IA32_PERF_STATUS, | 458 | MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT |
453 | }; | 459 | }; |
454 | 460 | ||
455 | static unsigned num_msrs_to_save; | 461 | static unsigned num_msrs_to_save; |
@@ -648,10 +654,38 @@ static bool msr_mtrr_valid(unsigned msr) | |||
648 | 654 | ||
649 | static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) | 655 | static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) |
650 | { | 656 | { |
657 | u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; | ||
658 | |||
651 | if (!msr_mtrr_valid(msr)) | 659 | if (!msr_mtrr_valid(msr)) |
652 | return 1; | 660 | return 1; |
653 | 661 | ||
654 | vcpu->arch.mtrr[msr - 0x200] = data; | 662 | if (msr == MSR_MTRRdefType) { |
663 | vcpu->arch.mtrr_state.def_type = data; | ||
664 | vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; | ||
665 | } else if (msr == MSR_MTRRfix64K_00000) | ||
666 | p[0] = data; | ||
667 | else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) | ||
668 | p[1 + msr - MSR_MTRRfix16K_80000] = data; | ||
669 | else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) | ||
670 | p[3 + msr - MSR_MTRRfix4K_C0000] = data; | ||
671 | else if (msr == MSR_IA32_CR_PAT) | ||
672 | vcpu->arch.pat = data; | ||
673 | else { /* Variable MTRRs */ | ||
674 | int idx, is_mtrr_mask; | ||
675 | u64 *pt; | ||
676 | |||
677 | idx = (msr - 0x200) / 2; | ||
678 | is_mtrr_mask = msr - 0x200 - 2 * idx; | ||
679 | if (!is_mtrr_mask) | ||
680 | pt = | ||
681 | (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; | ||
682 | else | ||
683 | pt = | ||
684 | (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; | ||
685 | *pt = data; | ||
686 | } | ||
687 | |||
688 | kvm_mmu_reset_context(vcpu); | ||
655 | return 0; | 689 | return 0; |
656 | } | 690 | } |
657 | 691 | ||
@@ -747,10 +781,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
747 | 781 | ||
748 | static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | 782 | static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) |
749 | { | 783 | { |
784 | u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; | ||
785 | |||
750 | if (!msr_mtrr_valid(msr)) | 786 | if (!msr_mtrr_valid(msr)) |
751 | return 1; | 787 | return 1; |
752 | 788 | ||
753 | *pdata = vcpu->arch.mtrr[msr - 0x200]; | 789 | if (msr == MSR_MTRRdefType) |
790 | *pdata = vcpu->arch.mtrr_state.def_type + | ||
791 | (vcpu->arch.mtrr_state.enabled << 10); | ||
792 | else if (msr == MSR_MTRRfix64K_00000) | ||
793 | *pdata = p[0]; | ||
794 | else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) | ||
795 | *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; | ||
796 | else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) | ||
797 | *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; | ||
798 | else if (msr == MSR_IA32_CR_PAT) | ||
799 | *pdata = vcpu->arch.pat; | ||
800 | else { /* Variable MTRRs */ | ||
801 | int idx, is_mtrr_mask; | ||
802 | u64 *pt; | ||
803 | |||
804 | idx = (msr - 0x200) / 2; | ||
805 | is_mtrr_mask = msr - 0x200 - 2 * idx; | ||
806 | if (!is_mtrr_mask) | ||
807 | pt = | ||
808 | (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; | ||
809 | else | ||
810 | pt = | ||
811 | (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; | ||
812 | *pdata = *pt; | ||
813 | } | ||
814 | |||
754 | return 0; | 815 | return 0; |
755 | } | 816 | } |
756 | 817 | ||
@@ -903,7 +964,6 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
903 | case KVM_CAP_IRQCHIP: | 964 | case KVM_CAP_IRQCHIP: |
904 | case KVM_CAP_HLT: | 965 | case KVM_CAP_HLT: |
905 | case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: | 966 | case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: |
906 | case KVM_CAP_USER_MEMORY: | ||
907 | case KVM_CAP_SET_TSS_ADDR: | 967 | case KVM_CAP_SET_TSS_ADDR: |
908 | case KVM_CAP_EXT_CPUID: | 968 | case KVM_CAP_EXT_CPUID: |
909 | case KVM_CAP_CLOCKSOURCE: | 969 | case KVM_CAP_CLOCKSOURCE: |
@@ -1188,6 +1248,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1188 | int t, times = entry->eax & 0xff; | 1248 | int t, times = entry->eax & 0xff; |
1189 | 1249 | ||
1190 | entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | 1250 | entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; |
1251 | entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
1191 | for (t = 1; t < times && *nent < maxnent; ++t) { | 1252 | for (t = 1; t < times && *nent < maxnent; ++t) { |
1192 | do_cpuid_1_ent(&entry[t], function, 0); | 1253 | do_cpuid_1_ent(&entry[t], function, 0); |
1193 | entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | 1254 | entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; |
@@ -1218,7 +1279,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1218 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | 1279 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; |
1219 | /* read more entries until level_type is zero */ | 1280 | /* read more entries until level_type is zero */ |
1220 | for (i = 1; *nent < maxnent; ++i) { | 1281 | for (i = 1; *nent < maxnent; ++i) { |
1221 | level_type = entry[i - 1].ecx & 0xff; | 1282 | level_type = entry[i - 1].ecx & 0xff00; |
1222 | if (!level_type) | 1283 | if (!level_type) |
1223 | break; | 1284 | break; |
1224 | do_cpuid_1_ent(&entry[i], function, i); | 1285 | do_cpuid_1_ent(&entry[i], function, i); |
@@ -1318,6 +1379,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | |||
1318 | return 0; | 1379 | return 0; |
1319 | } | 1380 | } |
1320 | 1381 | ||
1382 | static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) | ||
1383 | { | ||
1384 | vcpu_load(vcpu); | ||
1385 | kvm_inject_nmi(vcpu); | ||
1386 | vcpu_put(vcpu); | ||
1387 | |||
1388 | return 0; | ||
1389 | } | ||
1390 | |||
1321 | static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, | 1391 | static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, |
1322 | struct kvm_tpr_access_ctl *tac) | 1392 | struct kvm_tpr_access_ctl *tac) |
1323 | { | 1393 | { |
@@ -1377,6 +1447,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
1377 | r = 0; | 1447 | r = 0; |
1378 | break; | 1448 | break; |
1379 | } | 1449 | } |
1450 | case KVM_NMI: { | ||
1451 | r = kvm_vcpu_ioctl_nmi(vcpu); | ||
1452 | if (r) | ||
1453 | goto out; | ||
1454 | r = 0; | ||
1455 | break; | ||
1456 | } | ||
1380 | case KVM_SET_CPUID: { | 1457 | case KVM_SET_CPUID: { |
1381 | struct kvm_cpuid __user *cpuid_arg = argp; | 1458 | struct kvm_cpuid __user *cpuid_arg = argp; |
1382 | struct kvm_cpuid cpuid; | 1459 | struct kvm_cpuid cpuid; |
@@ -1968,7 +2045,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1968 | ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); | 2045 | ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); |
1969 | if (ret < 0) | 2046 | if (ret < 0) |
1970 | return 0; | 2047 | return 0; |
1971 | kvm_mmu_pte_write(vcpu, gpa, val, bytes); | 2048 | kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); |
1972 | return 1; | 2049 | return 1; |
1973 | } | 2050 | } |
1974 | 2051 | ||
@@ -2404,8 +2481,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2404 | val = kvm_register_read(vcpu, VCPU_REGS_RAX); | 2481 | val = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2405 | memcpy(vcpu->arch.pio_data, &val, 4); | 2482 | memcpy(vcpu->arch.pio_data, &val, 4); |
2406 | 2483 | ||
2407 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
2408 | |||
2409 | pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); | 2484 | pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); |
2410 | if (pio_dev) { | 2485 | if (pio_dev) { |
2411 | kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); | 2486 | kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); |
@@ -2541,7 +2616,7 @@ int kvm_arch_init(void *opaque) | |||
2541 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); | 2616 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); |
2542 | kvm_mmu_set_base_ptes(PT_PRESENT_MASK); | 2617 | kvm_mmu_set_base_ptes(PT_PRESENT_MASK); |
2543 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, | 2618 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, |
2544 | PT_DIRTY_MASK, PT64_NX_MASK, 0); | 2619 | PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); |
2545 | return 0; | 2620 | return 0; |
2546 | 2621 | ||
2547 | out: | 2622 | out: |
@@ -2729,7 +2804,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) | |||
2729 | 2804 | ||
2730 | e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; | 2805 | e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; |
2731 | /* when no next entry is found, the current entry[i] is reselected */ | 2806 | /* when no next entry is found, the current entry[i] is reselected */ |
2732 | for (j = i + 1; j == i; j = (j + 1) % nent) { | 2807 | for (j = i + 1; ; j = (j + 1) % nent) { |
2733 | struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; | 2808 | struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; |
2734 | if (ej->function == e->function) { | 2809 | if (ej->function == e->function) { |
2735 | ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; | 2810 | ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; |
@@ -2973,7 +3048,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2973 | pr_debug("vcpu %d received sipi with vector # %x\n", | 3048 | pr_debug("vcpu %d received sipi with vector # %x\n", |
2974 | vcpu->vcpu_id, vcpu->arch.sipi_vector); | 3049 | vcpu->vcpu_id, vcpu->arch.sipi_vector); |
2975 | kvm_lapic_reset(vcpu); | 3050 | kvm_lapic_reset(vcpu); |
2976 | r = kvm_x86_ops->vcpu_reset(vcpu); | 3051 | r = kvm_arch_vcpu_reset(vcpu); |
2977 | if (r) | 3052 | if (r) |
2978 | return r; | 3053 | return r; |
2979 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 3054 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
@@ -3275,9 +3350,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, | |||
3275 | kvm_desct->padding = 0; | 3350 | kvm_desct->padding = 0; |
3276 | } | 3351 | } |
3277 | 3352 | ||
3278 | static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, | 3353 | static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, |
3279 | u16 selector, | 3354 | u16 selector, |
3280 | struct descriptor_table *dtable) | 3355 | struct descriptor_table *dtable) |
3281 | { | 3356 | { |
3282 | if (selector & 1 << 2) { | 3357 | if (selector & 1 << 2) { |
3283 | struct kvm_segment kvm_seg; | 3358 | struct kvm_segment kvm_seg; |
@@ -3302,7 +3377,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | |||
3302 | struct descriptor_table dtable; | 3377 | struct descriptor_table dtable; |
3303 | u16 index = selector >> 3; | 3378 | u16 index = selector >> 3; |
3304 | 3379 | ||
3305 | get_segment_descritptor_dtable(vcpu, selector, &dtable); | 3380 | get_segment_descriptor_dtable(vcpu, selector, &dtable); |
3306 | 3381 | ||
3307 | if (dtable.limit < index * 8 + 7) { | 3382 | if (dtable.limit < index * 8 + 7) { |
3308 | kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); | 3383 | kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); |
@@ -3321,7 +3396,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | |||
3321 | struct descriptor_table dtable; | 3396 | struct descriptor_table dtable; |
3322 | u16 index = selector >> 3; | 3397 | u16 index = selector >> 3; |
3323 | 3398 | ||
3324 | get_segment_descritptor_dtable(vcpu, selector, &dtable); | 3399 | get_segment_descriptor_dtable(vcpu, selector, &dtable); |
3325 | 3400 | ||
3326 | if (dtable.limit < index * 8 + 7) | 3401 | if (dtable.limit < index * 8 + 7) |
3327 | return 1; | 3402 | return 1; |
@@ -3900,6 +3975,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | |||
3900 | /* We do fxsave: this must be aligned. */ | 3975 | /* We do fxsave: this must be aligned. */ |
3901 | BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); | 3976 | BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); |
3902 | 3977 | ||
3978 | vcpu->arch.mtrr_state.have_fixed = 1; | ||
3903 | vcpu_load(vcpu); | 3979 | vcpu_load(vcpu); |
3904 | r = kvm_arch_vcpu_reset(vcpu); | 3980 | r = kvm_arch_vcpu_reset(vcpu); |
3905 | if (r == 0) | 3981 | if (r == 0) |
@@ -3925,6 +4001,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | |||
3925 | 4001 | ||
3926 | int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) | 4002 | int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) |
3927 | { | 4003 | { |
4004 | vcpu->arch.nmi_pending = false; | ||
4005 | vcpu->arch.nmi_injected = false; | ||
4006 | |||
3928 | return kvm_x86_ops->vcpu_reset(vcpu); | 4007 | return kvm_x86_ops->vcpu_reset(vcpu); |
3929 | } | 4008 | } |
3930 | 4009 | ||
@@ -4012,6 +4091,7 @@ struct kvm *kvm_arch_create_vm(void) | |||
4012 | return ERR_PTR(-ENOMEM); | 4091 | return ERR_PTR(-ENOMEM); |
4013 | 4092 | ||
4014 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | 4093 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
4094 | INIT_LIST_HEAD(&kvm->arch.oos_global_pages); | ||
4015 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | 4095 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); |
4016 | 4096 | ||
4017 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ | 4097 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ |
@@ -4048,8 +4128,8 @@ static void kvm_free_vcpus(struct kvm *kvm) | |||
4048 | 4128 | ||
4049 | void kvm_arch_destroy_vm(struct kvm *kvm) | 4129 | void kvm_arch_destroy_vm(struct kvm *kvm) |
4050 | { | 4130 | { |
4051 | kvm_iommu_unmap_guest(kvm); | ||
4052 | kvm_free_all_assigned_devices(kvm); | 4131 | kvm_free_all_assigned_devices(kvm); |
4132 | kvm_iommu_unmap_guest(kvm); | ||
4053 | kvm_free_pit(kvm); | 4133 | kvm_free_pit(kvm); |
4054 | kfree(kvm->arch.vpic); | 4134 | kfree(kvm->arch.vpic); |
4055 | kfree(kvm->arch.vioapic); | 4135 | kfree(kvm->arch.vioapic); |
@@ -4127,7 +4207,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm) | |||
4127 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | 4207 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) |
4128 | { | 4208 | { |
4129 | return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE | 4209 | return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE |
4130 | || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED; | 4210 | || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED |
4211 | || vcpu->arch.nmi_pending; | ||
4131 | } | 4212 | } |
4132 | 4213 | ||
4133 | static void vcpu_kick_intr(void *info) | 4214 | static void vcpu_kick_intr(void *info) |
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index ea051173b0da..d174db7a3370 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c | |||
@@ -58,6 +58,7 @@ | |||
58 | #define SrcMem32 (4<<4) /* Memory operand (32-bit). */ | 58 | #define SrcMem32 (4<<4) /* Memory operand (32-bit). */ |
59 | #define SrcImm (5<<4) /* Immediate operand. */ | 59 | #define SrcImm (5<<4) /* Immediate operand. */ |
60 | #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ | 60 | #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ |
61 | #define SrcOne (7<<4) /* Implied '1' */ | ||
61 | #define SrcMask (7<<4) | 62 | #define SrcMask (7<<4) |
62 | /* Generic ModRM decode. */ | 63 | /* Generic ModRM decode. */ |
63 | #define ModRM (1<<7) | 64 | #define ModRM (1<<7) |
@@ -70,17 +71,23 @@ | |||
70 | #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ | 71 | #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ |
71 | #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ | 72 | #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ |
72 | #define GroupMask 0xff /* Group number stored in bits 0:7 */ | 73 | #define GroupMask 0xff /* Group number stored in bits 0:7 */ |
74 | /* Source 2 operand type */ | ||
75 | #define Src2None (0<<29) | ||
76 | #define Src2CL (1<<29) | ||
77 | #define Src2ImmByte (2<<29) | ||
78 | #define Src2One (3<<29) | ||
79 | #define Src2Mask (7<<29) | ||
73 | 80 | ||
74 | enum { | 81 | enum { |
75 | Group1_80, Group1_81, Group1_82, Group1_83, | 82 | Group1_80, Group1_81, Group1_82, Group1_83, |
76 | Group1A, Group3_Byte, Group3, Group4, Group5, Group7, | 83 | Group1A, Group3_Byte, Group3, Group4, Group5, Group7, |
77 | }; | 84 | }; |
78 | 85 | ||
79 | static u16 opcode_table[256] = { | 86 | static u32 opcode_table[256] = { |
80 | /* 0x00 - 0x07 */ | 87 | /* 0x00 - 0x07 */ |
81 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 88 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
82 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 89 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
83 | 0, 0, 0, 0, | 90 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, |
84 | /* 0x08 - 0x0F */ | 91 | /* 0x08 - 0x0F */ |
85 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 92 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
86 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 93 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
@@ -195,7 +202,7 @@ static u16 opcode_table[256] = { | |||
195 | ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, | 202 | ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, |
196 | }; | 203 | }; |
197 | 204 | ||
198 | static u16 twobyte_table[256] = { | 205 | static u32 twobyte_table[256] = { |
199 | /* 0x00 - 0x0F */ | 206 | /* 0x00 - 0x0F */ |
200 | 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, | 207 | 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, |
201 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, | 208 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, |
@@ -230,9 +237,14 @@ static u16 twobyte_table[256] = { | |||
230 | /* 0x90 - 0x9F */ | 237 | /* 0x90 - 0x9F */ |
231 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 238 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
232 | /* 0xA0 - 0xA7 */ | 239 | /* 0xA0 - 0xA7 */ |
233 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | 240 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, |
241 | DstMem | SrcReg | Src2ImmByte | ModRM, | ||
242 | DstMem | SrcReg | Src2CL | ModRM, 0, 0, | ||
234 | /* 0xA8 - 0xAF */ | 243 | /* 0xA8 - 0xAF */ |
235 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0, | 244 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, |
245 | DstMem | SrcReg | Src2ImmByte | ModRM, | ||
246 | DstMem | SrcReg | Src2CL | ModRM, | ||
247 | ModRM, 0, | ||
236 | /* 0xB0 - 0xB7 */ | 248 | /* 0xB0 - 0xB7 */ |
237 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, | 249 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, |
238 | DstMem | SrcReg | ModRM | BitOp, | 250 | DstMem | SrcReg | ModRM | BitOp, |
@@ -253,7 +265,7 @@ static u16 twobyte_table[256] = { | |||
253 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | 265 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
254 | }; | 266 | }; |
255 | 267 | ||
256 | static u16 group_table[] = { | 268 | static u32 group_table[] = { |
257 | [Group1_80*8] = | 269 | [Group1_80*8] = |
258 | ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, | 270 | ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, |
259 | ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, | 271 | ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, |
@@ -297,9 +309,9 @@ static u16 group_table[] = { | |||
297 | SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, | 309 | SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, |
298 | }; | 310 | }; |
299 | 311 | ||
300 | static u16 group2_table[] = { | 312 | static u32 group2_table[] = { |
301 | [Group7*8] = | 313 | [Group7*8] = |
302 | SrcNone | ModRM, 0, 0, 0, | 314 | SrcNone | ModRM, 0, 0, SrcNone | ModRM, |
303 | SrcNone | ModRM | DstMem | Mov, 0, | 315 | SrcNone | ModRM | DstMem | Mov, 0, |
304 | SrcMem16 | ModRM | Mov, 0, | 316 | SrcMem16 | ModRM | Mov, 0, |
305 | }; | 317 | }; |
@@ -359,49 +371,48 @@ static u16 group2_table[] = { | |||
359 | "andl %"_msk",%"_LO32 _tmp"; " \ | 371 | "andl %"_msk",%"_LO32 _tmp"; " \ |
360 | "orl %"_LO32 _tmp",%"_sav"; " | 372 | "orl %"_LO32 _tmp",%"_sav"; " |
361 | 373 | ||
374 | #ifdef CONFIG_X86_64 | ||
375 | #define ON64(x) x | ||
376 | #else | ||
377 | #define ON64(x) | ||
378 | #endif | ||
379 | |||
380 | #define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ | ||
381 | do { \ | ||
382 | __asm__ __volatile__ ( \ | ||
383 | _PRE_EFLAGS("0", "4", "2") \ | ||
384 | _op _suffix " %"_x"3,%1; " \ | ||
385 | _POST_EFLAGS("0", "4", "2") \ | ||
386 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
387 | "=&r" (_tmp) \ | ||
388 | : _y ((_src).val), "i" (EFLAGS_MASK)); \ | ||
389 | } while (0) | ||
390 | |||
391 | |||
362 | /* Raw emulation: instruction has two explicit operands. */ | 392 | /* Raw emulation: instruction has two explicit operands. */ |
363 | #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ | 393 | #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ |
364 | do { \ | 394 | do { \ |
365 | unsigned long _tmp; \ | 395 | unsigned long _tmp; \ |
366 | \ | 396 | \ |
367 | switch ((_dst).bytes) { \ | 397 | switch ((_dst).bytes) { \ |
368 | case 2: \ | 398 | case 2: \ |
369 | __asm__ __volatile__ ( \ | 399 | ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ |
370 | _PRE_EFLAGS("0", "4", "2") \ | 400 | break; \ |
371 | _op"w %"_wx"3,%1; " \ | 401 | case 4: \ |
372 | _POST_EFLAGS("0", "4", "2") \ | 402 | ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ |
373 | : "=m" (_eflags), "=m" ((_dst).val), \ | 403 | break; \ |
374 | "=&r" (_tmp) \ | 404 | case 8: \ |
375 | : _wy ((_src).val), "i" (EFLAGS_MASK)); \ | 405 | ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ |
376 | break; \ | 406 | break; \ |
377 | case 4: \ | 407 | } \ |
378 | __asm__ __volatile__ ( \ | ||
379 | _PRE_EFLAGS("0", "4", "2") \ | ||
380 | _op"l %"_lx"3,%1; " \ | ||
381 | _POST_EFLAGS("0", "4", "2") \ | ||
382 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
383 | "=&r" (_tmp) \ | ||
384 | : _ly ((_src).val), "i" (EFLAGS_MASK)); \ | ||
385 | break; \ | ||
386 | case 8: \ | ||
387 | __emulate_2op_8byte(_op, _src, _dst, \ | ||
388 | _eflags, _qx, _qy); \ | ||
389 | break; \ | ||
390 | } \ | ||
391 | } while (0) | 408 | } while (0) |
392 | 409 | ||
393 | #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ | 410 | #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ |
394 | do { \ | 411 | do { \ |
395 | unsigned long __tmp; \ | 412 | unsigned long _tmp; \ |
396 | switch ((_dst).bytes) { \ | 413 | switch ((_dst).bytes) { \ |
397 | case 1: \ | 414 | case 1: \ |
398 | __asm__ __volatile__ ( \ | 415 | ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ |
399 | _PRE_EFLAGS("0", "4", "2") \ | ||
400 | _op"b %"_bx"3,%1; " \ | ||
401 | _POST_EFLAGS("0", "4", "2") \ | ||
402 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
403 | "=&r" (__tmp) \ | ||
404 | : _by ((_src).val), "i" (EFLAGS_MASK)); \ | ||
405 | break; \ | 416 | break; \ |
406 | default: \ | 417 | default: \ |
407 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | 418 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ |
@@ -425,71 +436,68 @@ static u16 group2_table[] = { | |||
425 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | 436 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ |
426 | "w", "r", _LO32, "r", "", "r") | 437 | "w", "r", _LO32, "r", "", "r") |
427 | 438 | ||
428 | /* Instruction has only one explicit operand (no source operand). */ | 439 | /* Instruction has three operands and one operand is stored in ECX register */ |
429 | #define emulate_1op(_op, _dst, _eflags) \ | 440 | #define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ |
430 | do { \ | 441 | do { \ |
431 | unsigned long _tmp; \ | 442 | unsigned long _tmp; \ |
432 | \ | 443 | _type _clv = (_cl).val; \ |
433 | switch ((_dst).bytes) { \ | 444 | _type _srcv = (_src).val; \ |
434 | case 1: \ | 445 | _type _dstv = (_dst).val; \ |
435 | __asm__ __volatile__ ( \ | 446 | \ |
436 | _PRE_EFLAGS("0", "3", "2") \ | 447 | __asm__ __volatile__ ( \ |
437 | _op"b %1; " \ | 448 | _PRE_EFLAGS("0", "5", "2") \ |
438 | _POST_EFLAGS("0", "3", "2") \ | 449 | _op _suffix " %4,%1 \n" \ |
439 | : "=m" (_eflags), "=m" ((_dst).val), \ | 450 | _POST_EFLAGS("0", "5", "2") \ |
440 | "=&r" (_tmp) \ | 451 | : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ |
441 | : "i" (EFLAGS_MASK)); \ | 452 | : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ |
442 | break; \ | 453 | ); \ |
443 | case 2: \ | 454 | \ |
444 | __asm__ __volatile__ ( \ | 455 | (_cl).val = (unsigned long) _clv; \ |
445 | _PRE_EFLAGS("0", "3", "2") \ | 456 | (_src).val = (unsigned long) _srcv; \ |
446 | _op"w %1; " \ | 457 | (_dst).val = (unsigned long) _dstv; \ |
447 | _POST_EFLAGS("0", "3", "2") \ | ||
448 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
449 | "=&r" (_tmp) \ | ||
450 | : "i" (EFLAGS_MASK)); \ | ||
451 | break; \ | ||
452 | case 4: \ | ||
453 | __asm__ __volatile__ ( \ | ||
454 | _PRE_EFLAGS("0", "3", "2") \ | ||
455 | _op"l %1; " \ | ||
456 | _POST_EFLAGS("0", "3", "2") \ | ||
457 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
458 | "=&r" (_tmp) \ | ||
459 | : "i" (EFLAGS_MASK)); \ | ||
460 | break; \ | ||
461 | case 8: \ | ||
462 | __emulate_1op_8byte(_op, _dst, _eflags); \ | ||
463 | break; \ | ||
464 | } \ | ||
465 | } while (0) | 458 | } while (0) |
466 | 459 | ||
467 | /* Emulate an instruction with quadword operands (x86/64 only). */ | 460 | #define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ |
468 | #if defined(CONFIG_X86_64) | 461 | do { \ |
469 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ | 462 | switch ((_dst).bytes) { \ |
470 | do { \ | 463 | case 2: \ |
471 | __asm__ __volatile__ ( \ | 464 | __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ |
472 | _PRE_EFLAGS("0", "4", "2") \ | 465 | "w", unsigned short); \ |
473 | _op"q %"_qx"3,%1; " \ | 466 | break; \ |
474 | _POST_EFLAGS("0", "4", "2") \ | 467 | case 4: \ |
475 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | 468 | __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ |
476 | : _qy ((_src).val), "i" (EFLAGS_MASK)); \ | 469 | "l", unsigned int); \ |
470 | break; \ | ||
471 | case 8: \ | ||
472 | ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ | ||
473 | "q", unsigned long)); \ | ||
474 | break; \ | ||
475 | } \ | ||
477 | } while (0) | 476 | } while (0) |
478 | 477 | ||
479 | #define __emulate_1op_8byte(_op, _dst, _eflags) \ | 478 | #define __emulate_1op(_op, _dst, _eflags, _suffix) \ |
480 | do { \ | 479 | do { \ |
481 | __asm__ __volatile__ ( \ | 480 | unsigned long _tmp; \ |
482 | _PRE_EFLAGS("0", "3", "2") \ | 481 | \ |
483 | _op"q %1; " \ | 482 | __asm__ __volatile__ ( \ |
484 | _POST_EFLAGS("0", "3", "2") \ | 483 | _PRE_EFLAGS("0", "3", "2") \ |
485 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | 484 | _op _suffix " %1; " \ |
486 | : "i" (EFLAGS_MASK)); \ | 485 | _POST_EFLAGS("0", "3", "2") \ |
486 | : "=m" (_eflags), "+m" ((_dst).val), \ | ||
487 | "=&r" (_tmp) \ | ||
488 | : "i" (EFLAGS_MASK)); \ | ||
487 | } while (0) | 489 | } while (0) |
488 | 490 | ||
489 | #elif defined(__i386__) | 491 | /* Instruction has only one explicit operand (no source operand). */ |
490 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) | 492 | #define emulate_1op(_op, _dst, _eflags) \ |
491 | #define __emulate_1op_8byte(_op, _dst, _eflags) | 493 | do { \ |
492 | #endif /* __i386__ */ | 494 | switch ((_dst).bytes) { \ |
495 | case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ | ||
496 | case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ | ||
497 | case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ | ||
498 | case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ | ||
499 | } \ | ||
500 | } while (0) | ||
493 | 501 | ||
494 | /* Fetch next part of the instruction being emulated. */ | 502 | /* Fetch next part of the instruction being emulated. */ |
495 | #define insn_fetch(_type, _size, _eip) \ | 503 | #define insn_fetch(_type, _size, _eip) \ |
@@ -1041,6 +1049,33 @@ done_prefixes: | |||
1041 | c->src.bytes = 1; | 1049 | c->src.bytes = 1; |
1042 | c->src.val = insn_fetch(s8, 1, c->eip); | 1050 | c->src.val = insn_fetch(s8, 1, c->eip); |
1043 | break; | 1051 | break; |
1052 | case SrcOne: | ||
1053 | c->src.bytes = 1; | ||
1054 | c->src.val = 1; | ||
1055 | break; | ||
1056 | } | ||
1057 | |||
1058 | /* | ||
1059 | * Decode and fetch the second source operand: register, memory | ||
1060 | * or immediate. | ||
1061 | */ | ||
1062 | switch (c->d & Src2Mask) { | ||
1063 | case Src2None: | ||
1064 | break; | ||
1065 | case Src2CL: | ||
1066 | c->src2.bytes = 1; | ||
1067 | c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; | ||
1068 | break; | ||
1069 | case Src2ImmByte: | ||
1070 | c->src2.type = OP_IMM; | ||
1071 | c->src2.ptr = (unsigned long *)c->eip; | ||
1072 | c->src2.bytes = 1; | ||
1073 | c->src2.val = insn_fetch(u8, 1, c->eip); | ||
1074 | break; | ||
1075 | case Src2One: | ||
1076 | c->src2.bytes = 1; | ||
1077 | c->src2.val = 1; | ||
1078 | break; | ||
1044 | } | 1079 | } |
1045 | 1080 | ||
1046 | /* Decode and fetch the destination operand: register or memory. */ | 1081 | /* Decode and fetch the destination operand: register or memory. */ |
@@ -1100,20 +1135,33 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) | |||
1100 | c->regs[VCPU_REGS_RSP]); | 1135 | c->regs[VCPU_REGS_RSP]); |
1101 | } | 1136 | } |
1102 | 1137 | ||
1103 | static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, | 1138 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, |
1104 | struct x86_emulate_ops *ops) | 1139 | struct x86_emulate_ops *ops) |
1105 | { | 1140 | { |
1106 | struct decode_cache *c = &ctxt->decode; | 1141 | struct decode_cache *c = &ctxt->decode; |
1107 | int rc; | 1142 | int rc; |
1108 | 1143 | ||
1109 | rc = ops->read_std(register_address(c, ss_base(ctxt), | 1144 | rc = ops->read_emulated(register_address(c, ss_base(ctxt), |
1110 | c->regs[VCPU_REGS_RSP]), | 1145 | c->regs[VCPU_REGS_RSP]), |
1111 | &c->dst.val, c->dst.bytes, ctxt->vcpu); | 1146 | &c->src.val, c->src.bytes, ctxt->vcpu); |
1112 | if (rc != 0) | 1147 | if (rc != 0) |
1113 | return rc; | 1148 | return rc; |
1114 | 1149 | ||
1115 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes); | 1150 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes); |
1151 | return rc; | ||
1152 | } | ||
1153 | |||
1154 | static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, | ||
1155 | struct x86_emulate_ops *ops) | ||
1156 | { | ||
1157 | struct decode_cache *c = &ctxt->decode; | ||
1158 | int rc; | ||
1116 | 1159 | ||
1160 | c->src.bytes = c->dst.bytes; | ||
1161 | rc = emulate_pop(ctxt, ops); | ||
1162 | if (rc != 0) | ||
1163 | return rc; | ||
1164 | c->dst.val = c->src.val; | ||
1117 | return 0; | 1165 | return 0; |
1118 | } | 1166 | } |
1119 | 1167 | ||
@@ -1415,24 +1463,15 @@ special_insn: | |||
1415 | emulate_1op("dec", c->dst, ctxt->eflags); | 1463 | emulate_1op("dec", c->dst, ctxt->eflags); |
1416 | break; | 1464 | break; |
1417 | case 0x50 ... 0x57: /* push reg */ | 1465 | case 0x50 ... 0x57: /* push reg */ |
1418 | c->dst.type = OP_MEM; | 1466 | emulate_push(ctxt); |
1419 | c->dst.bytes = c->op_bytes; | ||
1420 | c->dst.val = c->src.val; | ||
1421 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], | ||
1422 | -c->op_bytes); | ||
1423 | c->dst.ptr = (void *) register_address( | ||
1424 | c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]); | ||
1425 | break; | 1467 | break; |
1426 | case 0x58 ... 0x5f: /* pop reg */ | 1468 | case 0x58 ... 0x5f: /* pop reg */ |
1427 | pop_instruction: | 1469 | pop_instruction: |
1428 | if ((rc = ops->read_std(register_address(c, ss_base(ctxt), | 1470 | c->src.bytes = c->op_bytes; |
1429 | c->regs[VCPU_REGS_RSP]), c->dst.ptr, | 1471 | rc = emulate_pop(ctxt, ops); |
1430 | c->op_bytes, ctxt->vcpu)) != 0) | 1472 | if (rc != 0) |
1431 | goto done; | 1473 | goto done; |
1432 | 1474 | c->dst.val = c->src.val; | |
1433 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], | ||
1434 | c->op_bytes); | ||
1435 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1436 | break; | 1475 | break; |
1437 | case 0x63: /* movsxd */ | 1476 | case 0x63: /* movsxd */ |
1438 | if (ctxt->mode != X86EMUL_MODE_PROT64) | 1477 | if (ctxt->mode != X86EMUL_MODE_PROT64) |
@@ -1591,7 +1630,9 @@ special_insn: | |||
1591 | emulate_push(ctxt); | 1630 | emulate_push(ctxt); |
1592 | break; | 1631 | break; |
1593 | case 0x9d: /* popf */ | 1632 | case 0x9d: /* popf */ |
1633 | c->dst.type = OP_REG; | ||
1594 | c->dst.ptr = (unsigned long *) &ctxt->eflags; | 1634 | c->dst.ptr = (unsigned long *) &ctxt->eflags; |
1635 | c->dst.bytes = c->op_bytes; | ||
1595 | goto pop_instruction; | 1636 | goto pop_instruction; |
1596 | case 0xa0 ... 0xa1: /* mov */ | 1637 | case 0xa0 ... 0xa1: /* mov */ |
1597 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | 1638 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; |
@@ -1689,7 +1730,9 @@ special_insn: | |||
1689 | emulate_grp2(ctxt); | 1730 | emulate_grp2(ctxt); |
1690 | break; | 1731 | break; |
1691 | case 0xc3: /* ret */ | 1732 | case 0xc3: /* ret */ |
1733 | c->dst.type = OP_REG; | ||
1692 | c->dst.ptr = &c->eip; | 1734 | c->dst.ptr = &c->eip; |
1735 | c->dst.bytes = c->op_bytes; | ||
1693 | goto pop_instruction; | 1736 | goto pop_instruction; |
1694 | case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ | 1737 | case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ |
1695 | mov: | 1738 | mov: |
@@ -1778,7 +1821,7 @@ special_insn: | |||
1778 | c->eip = saved_eip; | 1821 | c->eip = saved_eip; |
1779 | goto cannot_emulate; | 1822 | goto cannot_emulate; |
1780 | } | 1823 | } |
1781 | return 0; | 1824 | break; |
1782 | case 0xf4: /* hlt */ | 1825 | case 0xf4: /* hlt */ |
1783 | ctxt->vcpu->arch.halt_request = 1; | 1826 | ctxt->vcpu->arch.halt_request = 1; |
1784 | break; | 1827 | break; |
@@ -1999,12 +2042,20 @@ twobyte_insn: | |||
1999 | c->src.val &= (c->dst.bytes << 3) - 1; | 2042 | c->src.val &= (c->dst.bytes << 3) - 1; |
2000 | emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); | 2043 | emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); |
2001 | break; | 2044 | break; |
2045 | case 0xa4: /* shld imm8, r, r/m */ | ||
2046 | case 0xa5: /* shld cl, r, r/m */ | ||
2047 | emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); | ||
2048 | break; | ||
2002 | case 0xab: | 2049 | case 0xab: |
2003 | bts: /* bts */ | 2050 | bts: /* bts */ |
2004 | /* only subword offset */ | 2051 | /* only subword offset */ |
2005 | c->src.val &= (c->dst.bytes << 3) - 1; | 2052 | c->src.val &= (c->dst.bytes << 3) - 1; |
2006 | emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); | 2053 | emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); |
2007 | break; | 2054 | break; |
2055 | case 0xac: /* shrd imm8, r, r/m */ | ||
2056 | case 0xad: /* shrd cl, r, r/m */ | ||
2057 | emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); | ||
2058 | break; | ||
2008 | case 0xae: /* clflush */ | 2059 | case 0xae: /* clflush */ |
2009 | break; | 2060 | break; |
2010 | case 0xb0 ... 0xb1: /* cmpxchg */ | 2061 | case 0xb0 ... 0xb1: /* cmpxchg */ |