Diffstat (limited to 'drivers/kvm')
-rw-r--r--   drivers/kvm/kvm.h           106
-rw-r--r--   drivers/kvm/kvm_main.c      155
-rw-r--r--   drivers/kvm/mmu.c          1114
-rw-r--r--   drivers/kvm/paging_tmpl.h   260
-rw-r--r--   drivers/kvm/svm.c           113
-rw-r--r--   drivers/kvm/vmx.c           175
-rw-r--r--   drivers/kvm/x86_emulate.c     2
7 files changed, 1551 insertions(+), 374 deletions(-)
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 100df6f38d92..91e0c75aca8f 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -52,6 +52,8 @@
 #define KVM_MAX_VCPUS 1
 #define KVM_MEMORY_SLOTS 4
 #define KVM_NUM_MMU_PAGES 256
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25

 #define FX_IMAGE_SIZE 512
 #define FX_IMAGE_ALIGN 16
@@ -89,14 +91,54 @@ typedef unsigned long hva_t;
 typedef u64 hpa_t;
 typedef unsigned long hfn_t;

+#define NR_PTE_CHAIN_ENTRIES 5
+
+struct kvm_pte_chain {
+        u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
+        struct hlist_node link;
+};
+
+/*
+ * kvm_mmu_page_role, below, is defined as:
+ *
+ * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
+ * bits 4:7 - page table level for this shadow (1-4)
+ * bits 8:9 - page table quadrant for 2-level guests
+ * bit   16 - "metaphysical" - gfn is not a real page (huge page/real mode)
+ */
+union kvm_mmu_page_role {
+        unsigned word;
+        struct {
+                unsigned glevels : 4;
+                unsigned level : 4;
+                unsigned quadrant : 2;
+                unsigned pad_for_nice_hex_output : 6;
+                unsigned metaphysical : 1;
+        };
+};
+
 struct kvm_mmu_page {
         struct list_head link;
+        struct hlist_node hash_link;
+
+        /*
+         * The following two entries are used to key the shadow page in the
+         * hash table.
+         */
+        gfn_t gfn;
+        union kvm_mmu_page_role role;
+
         hpa_t page_hpa;
         unsigned long slot_bitmap; /* One bit set per slot which has memory
                                     * in this shadow page.
                                     */
         int global;              /* Set if all ptes in this page are global */
-        u64 *parent_pte;
+        int multimapped;         /* More than one parent_pte? */
+        int root_count;          /* Currently serving as active root */
+        union {
+                u64 *parent_pte;               /* !multimapped */
+                struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
+        };
 };

 struct vmcs {
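The role bits above are what let several shadow pages exist for one guest frame. A minimal standalone sketch (not part of the patch; assumes the anonymous-struct bitfield layout shown above) of how two such pages differ only in their packed role word:

#include <stdio.h>

union page_role_demo {            /* mirrors kvm_mmu_page_role */
        unsigned word;
        struct {
                unsigned glevels : 4;
                unsigned level : 4;
                unsigned quadrant : 2;
                unsigned pad : 6;
                unsigned metaphysical : 1;
        };
};

int main(void)
{
        union page_role_demo a = { .word = 0 }, b = { .word = 0 };

        a.glevels = 2; a.level = 1; a.quadrant = 0; /* low half of a 32-bit table */
        b.glevels = 2; b.level = 1; b.quadrant = 1; /* high half */
        /* comparing the whole role is a single integer compare */
        printf("roles equal? %d\n", a.word == b.word); /* prints 0 */
        return 0;
}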
@@ -117,14 +159,26 @@ struct kvm_vcpu;
 struct kvm_mmu {
         void (*new_cr3)(struct kvm_vcpu *vcpu);
         int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
-        void (*inval_page)(struct kvm_vcpu *vcpu, gva_t gva);
         void (*free)(struct kvm_vcpu *vcpu);
         gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
         hpa_t root_hpa;
         int root_level;
         int shadow_root_level;
+
+        u64 *pae_root;
+};
+
+#define KVM_NR_MEM_OBJS 20
+
+struct kvm_mmu_memory_cache {
+        int nobjs;
+        void *objects[KVM_NR_MEM_OBJS];
 };

+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
 struct kvm_guest_debug {
         int enabled;
         unsigned long bp[4];
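The comment above states the cache's purpose; a hedged user-space sketch of the same preallocate-then-consume pattern (illustrative names, not kernel API):

#include <stdlib.h>

#define NR_OBJS_DEMO 20

struct mem_cache_demo {
        int nobjs;
        void *objects[NR_OBJS_DEMO];
};

/* Fill the cache while allocation failure is still tolerable... */
static int topup_demo(struct mem_cache_demo *mc, size_t size, int min)
{
        while (mc->nobjs < min) {
                void *obj = calloc(1, size);
                if (!obj)
                        return -1;
                mc->objects[mc->nobjs++] = obj;
        }
        return 0;
}

/* ...then consume objects on a path that must not fail. */
static void *alloc_demo(struct mem_cache_demo *mc)
{
        return mc->nobjs ? mc->objects[--mc->nobjs] : NULL;
}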
@@ -173,6 +227,7 @@ struct kvm_vcpu {
         struct mutex mutex;
         int cpu;
         int launched;
+        int interrupt_window_open;
         unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
 #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
         unsigned long irq_pending[NR_IRQ_WORDS];
@@ -184,6 +239,7 @@ struct kvm_vcpu {
         unsigned long cr3;
         unsigned long cr4;
         unsigned long cr8;
+        u64 pdptrs[4]; /* pae */
         u64 shadow_efer;
         u64 apic_base;
         int nmsrs;
@@ -194,6 +250,12 @@ struct kvm_vcpu {
         struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES];
         struct kvm_mmu mmu;

+        struct kvm_mmu_memory_cache mmu_pte_chain_cache;
+        struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+
+        gfn_t last_pt_write_gfn;
+        int last_pt_write_count;
+
         struct kvm_guest_debug guest_debug;

         char fx_buf[FX_BUF_SIZE];
@@ -231,10 +293,16 @@ struct kvm {
         spinlock_t lock; /* protects everything except vcpus */
         int nmemslots;
         struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
+        /*
+         * Hash table of struct kvm_mmu_page.
+         */
         struct list_head active_mmu_pages;
+        int n_free_mmu_pages;
+        struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
         struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
         int memory_config_version;
         int busy;
+        unsigned long rmap_overflow;
 };

 struct kvm_stat {
@@ -247,6 +315,9 @@ struct kvm_stat {
         u32 io_exits;
         u32 mmio_exits;
         u32 signal_exits;
+        u32 irq_window_exits;
+        u32 halt_exits;
+        u32 request_irq_exits;
         u32 irq_exits;
 };

@@ -279,6 +350,7 @@ struct kvm_arch_ops {
         void (*set_segment)(struct kvm_vcpu *vcpu,
                             struct kvm_segment *var, int seg);
         void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+        void (*decache_cr0_cr4_guest_bits)(struct kvm_vcpu *vcpu);
         void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
         void (*set_cr0_no_modeswitch)(struct kvm_vcpu *vcpu,
                                       unsigned long cr0);
@@ -323,7 +395,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);

 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot);

 hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
 #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
@@ -396,6 +468,19 @@ int kvm_write_guest(struct kvm_vcpu *vcpu,

 unsigned long segment_base(u16 selector);

+void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
+void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
+
+static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+                                     u32 error_code)
+{
+        if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+                kvm_mmu_free_some_pages(vcpu);
+        return vcpu->mmu.page_fault(vcpu, gva, error_code);
+}
+
 static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
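kvm_mmu_page_fault above checks a low watermark before dispatching the fault. A sketch of the reclaim-before-empty idea it implements (constants and names are illustrative stand-ins for KVM_MIN_FREE_MMU_PAGES and KVM_REFILL_PAGES):

#define MIN_FREE_DEMO  5
#define REFILL_DEMO   25

static int n_free_demo = 256;          /* free shadow pages in the pool */

static void free_some_pages_demo(void)
{
        while (n_free_demo < REFILL_DEMO)
                ++n_free_demo;         /* stands in for zapping one page */
}

static void page_fault_demo(void)
{
        if (n_free_demo < MIN_FREE_DEMO)
                free_some_pages_demo();
        /* handle the fault; it may now consume a few pages safely */
}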
@@ -541,19 +626,4 @@ static inline u32 get_rdx_init_val(void)
 #define TSS_REDIRECTION_SIZE (256 / 8)
 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)

-#ifdef CONFIG_X86_64
-
-/*
- * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. Therefore
- * we need to allocate shadow page tables in the first 4GB of memory, which
- * happens to fit the DMA32 zone.
- */
-#define GFP_KVM_MMU (GFP_KERNEL | __GFP_DMA32)
-
-#else
-
-#define GFP_KVM_MMU GFP_KERNEL
-
-#endif
-
 #endif
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index ce7fe640f18d..67c1154960f0 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -58,6 +58,9 @@ static struct kvm_stats_debugfs_item {
         { "io_exits", &kvm_stat.io_exits },
         { "mmio_exits", &kvm_stat.mmio_exits },
         { "signal_exits", &kvm_stat.signal_exits },
+        { "irq_window", &kvm_stat.irq_window_exits },
+        { "halt_exits", &kvm_stat.halt_exits },
+        { "request_irq", &kvm_stat.request_irq_exits },
         { "irq_exits", &kvm_stat.irq_exits },
         { 0, 0 }
 };
@@ -227,6 +230,7 @@ static int kvm_dev_open(struct inode *inode, struct file *filp)
                 struct kvm_vcpu *vcpu = &kvm->vcpus[i];

                 mutex_init(&vcpu->mutex);
+                vcpu->kvm = kvm;
                 vcpu->mmu.root_hpa = INVALID_PAGE;
                 INIT_LIST_HEAD(&vcpu->free_pages);
         }
@@ -268,8 +272,8 @@ static void kvm_free_physmem(struct kvm *kvm)

 static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
 {
-        kvm_arch_ops->vcpu_free(vcpu);
         kvm_mmu_destroy(vcpu);
+        kvm_arch_ops->vcpu_free(vcpu);
 }

 static void kvm_free_vcpus(struct kvm *kvm)
@@ -295,14 +299,17 @@ static void inject_gp(struct kvm_vcpu *vcpu)
         kvm_arch_ops->inject_gp(vcpu, 0);
 }

-static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu,
-                                         unsigned long cr3)
+/*
+ * Load the pae pdptrs.  Return true if they are all valid.
+ */
+static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
-        unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
+        unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
         int i;
         u64 pdpte;
         u64 *pdpt;
+        int ret;
         struct kvm_memory_slot *memslot;

         spin_lock(&vcpu->kvm->lock);
@@ -310,16 +317,23 @@ static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu,
         /* FIXME: !memslot - emulate? 0xff? */
         pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0);

+        ret = 1;
         for (i = 0; i < 4; ++i) {
                 pdpte = pdpt[offset + i];
-                if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
-                        break;
+                if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
+                        ret = 0;
+                        goto out;
+                }
         }

+        for (i = 0; i < 4; ++i)
+                vcpu->pdptrs[i] = pdpt[offset + i];
+
+out:
         kunmap_atomic(pdpt, KM_USER0);
         spin_unlock(&vcpu->kvm->lock);

-        return i != 4;
+        return ret;
 }

 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
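A worked example (illustrative, not from the patch) of the "<< 2" offset fix in load_pdptrs: cr3 bits 5..11 select one 32-byte group of four 8-byte pdptes, so the starting index into an array of u64 entries is the group number times four:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t cr3 = 0x1040;                /* PAE cr3, 32-byte aligned  */
        unsigned group = (cr3 & 0xfff) >> 5;  /* 0x040 >> 5 = group 2      */
        unsigned index = group << 2;          /* first pdpte: u64 index 8  */

        printf("group %u starts at u64 index %u\n", group, index);
        return 0;
}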
@@ -365,8 +379,7 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
                 }
         } else
 #endif
-                if (is_pae(vcpu) &&
-                    pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
+                if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
                                "reserved bits\n");
                         inject_gp(vcpu);
@@ -387,6 +400,7 @@ EXPORT_SYMBOL_GPL(set_cr0);

 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
+        kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
 }
 EXPORT_SYMBOL_GPL(lmsw);
@@ -407,7 +421,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                         return;
                 }
         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
-                   && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
+                   && !load_pdptrs(vcpu, vcpu->cr3)) {
                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
                 inject_gp(vcpu);
         }
@@ -439,7 +453,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                         return;
                 }
                 if (is_paging(vcpu) && is_pae(vcpu) &&
-                    pdptrs_have_reserved_bits_set(vcpu, cr3)) {
+                    !load_pdptrs(vcpu, cr3)) {
                         printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
                                "reserved bits\n");
                         inject_gp(vcpu);
@@ -449,7 +463,19 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)

         vcpu->cr3 = cr3;
         spin_lock(&vcpu->kvm->lock);
-        vcpu->mmu.new_cr3(vcpu);
+        /*
+         * Does the new cr3 value map to physical memory? (Note, we
+         * catch an invalid cr3 even in real-mode, because it would
+         * cause trouble later on when we turn on paging anyway.)
+         *
+         * A real CPU would silently accept an invalid cr3 and would
+         * attempt to use it - with largely undefined (and often hard
+         * to debug) behavior on the guest side.
+         */
+        if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
+                inject_gp(vcpu);
+        else
+                vcpu->mmu.new_cr3(vcpu);
         spin_unlock(&vcpu->kvm->lock);
 }
 EXPORT_SYMBOL_GPL(set_cr3);
@@ -517,7 +543,6 @@ static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n)
         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;

         vcpu->cpu = -1;  /* First load will set up TR */
-        vcpu->kvm = kvm;
         r = kvm_arch_ops->vcpu_create(vcpu);
         if (r < 0)
                 goto out_free_vcpus;
@@ -634,6 +659,7 @@ raced:
                                            | __GFP_ZERO);
                         if (!new.phys_mem[i])
                                 goto out_free;
+                        new.phys_mem[i]->private = 0;
                 }
         }

@@ -688,6 +714,13 @@ out:
         return r;
 }

+static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
+{
+        spin_lock(&vcpu->kvm->lock);
+        kvm_mmu_slot_remove_write_access(vcpu, slot);
+        spin_unlock(&vcpu->kvm->lock);
+}
+
 /*
  * Get (and clear) the dirty memory log for a memory slot.
  */
@@ -697,6 +730,7 @@ static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm,
         struct kvm_memory_slot *memslot;
         int r, i;
         int n;
+        int cleared;
         unsigned long any = 0;

         spin_lock(&kvm->lock);
@@ -727,15 +761,17 @@ static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm,


         if (any) {
-                spin_lock(&kvm->lock);
-                kvm_mmu_slot_remove_write_access(kvm, log->slot);
-                spin_unlock(&kvm->lock);
-                memset(memslot->dirty_bitmap, 0, n);
+                cleared = 0;
                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
                         struct kvm_vcpu *vcpu = vcpu_load(kvm, i);

                         if (!vcpu)
                                 continue;
+                        if (!cleared) {
+                                do_remove_write_access(vcpu, log->slot);
+                                memset(memslot->dirty_bitmap, 0, n);
+                                cleared = 1;
+                        }
                         kvm_arch_ops->tlb_flush(vcpu);
                         vcpu_put(vcpu);
                 }
@@ -863,6 +899,27 @@ static int emulator_read_emulated(unsigned long addr,
         }
 }

+static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+                               unsigned long val, int bytes)
+{
+        struct kvm_memory_slot *m;
+        struct page *page;
+        void *virt;
+
+        if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
+                return 0;
+        m = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
+        if (!m)
+                return 0;
+        page = gfn_to_page(m, gpa >> PAGE_SHIFT);
+        kvm_mmu_pre_write(vcpu, gpa, bytes);
+        virt = kmap_atomic(page, KM_USER0);
+        memcpy(virt + offset_in_page(gpa), &val, bytes);
+        kunmap_atomic(virt, KM_USER0);
+        kvm_mmu_post_write(vcpu, gpa, bytes);
+        return 1;
+}
+
 static int emulator_write_emulated(unsigned long addr,
                                    unsigned long val,
                                    unsigned int bytes,
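emulator_write_phys brackets the guest memory update with mmu hooks so shadow ptes covering the written page can be adjusted; a minimal sketch of that bracketing pattern (all names illustrative):

#include <string.h>

static void pre_write_demo(void)  { /* e.g. unshadow the affected page */ }
static void post_write_demo(void) { /* e.g. track write-flood counters */ }

static int write_phys_demo(char *guest_page, unsigned off,
                           const void *val, int bytes)
{
        pre_write_demo();
        memcpy(guest_page + off, val, bytes);
        post_write_demo();
        return 1;
}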
@@ -874,6 +931,9 @@ static int emulator_write_emulated(unsigned long addr,
         if (gpa == UNMAPPED_GVA)
                 return X86EMUL_PROPAGATE_FAULT;

+        if (emulator_write_phys(vcpu, gpa, val, bytes))
+                return X86EMUL_CONTINUE;
+
         vcpu->mmio_needed = 1;
         vcpu->mmio_phys_addr = gpa;
         vcpu->mmio_size = bytes;
@@ -898,6 +958,30 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
         return emulator_write_emulated(addr, new, bytes, ctxt);
 }

+#ifdef CONFIG_X86_32
+
+static int emulator_cmpxchg8b_emulated(unsigned long addr,
+                                       unsigned long old_lo,
+                                       unsigned long old_hi,
+                                       unsigned long new_lo,
+                                       unsigned long new_hi,
+                                       struct x86_emulate_ctxt *ctxt)
+{
+        static int reported;
+        int r;
+
+        if (!reported) {
+                reported = 1;
+                printk(KERN_WARNING "kvm: emulating exchange8b as write\n");
+        }
+        r = emulator_write_emulated(addr, new_lo, 4, ctxt);
+        if (r != X86EMUL_CONTINUE)
+                return r;
+        return emulator_write_emulated(addr+4, new_hi, 4, ctxt);
+}
+
+#endif
+
 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
         return kvm_arch_ops->get_segment_base(vcpu, seg);
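A sketch of what the fallback above does with the 64-bit operand: the two 32-bit halves become two ordinary 4-byte writes, which is why the operation is not atomic and logs a one-time warning (illustrative code):

#include <stdint.h>
#include <string.h>

static void write8b_demo(uint8_t *dst, uint32_t new_lo, uint32_t new_hi)
{
        memcpy(dst, &new_lo, 4);      /* addr     */
        memcpy(dst + 4, &new_hi, 4);  /* addr + 4 */
}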
@@ -905,18 +989,15 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)

 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 {
-        spin_lock(&vcpu->kvm->lock);
-        vcpu->mmu.inval_page(vcpu, address);
-        spin_unlock(&vcpu->kvm->lock);
-        kvm_arch_ops->invlpg(vcpu, address);
         return X86EMUL_CONTINUE;
 }

 int emulate_clts(struct kvm_vcpu *vcpu)
 {
-        unsigned long cr0 = vcpu->cr0;
+        unsigned long cr0;

-        cr0 &= ~CR0_TS_MASK;
+        kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
+        cr0 = vcpu->cr0 & ~CR0_TS_MASK;
         kvm_arch_ops->set_cr0(vcpu, cr0);
         return X86EMUL_CONTINUE;
 }
@@ -975,6 +1056,9 @@ struct x86_emulate_ops emulate_ops = {
         .read_emulated = emulator_read_emulated,
         .write_emulated = emulator_write_emulated,
         .cmpxchg_emulated = emulator_cmpxchg_emulated,
+#ifdef CONFIG_X86_32
+        .cmpxchg8b_emulated = emulator_cmpxchg8b_emulated,
+#endif
 };

 int emulate_instruction(struct kvm_vcpu *vcpu,
@@ -1024,6 +1108,8 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
         }

         if (r) {
+                if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+                        return EMULATE_DONE;
                 if (!vcpu->mmio_needed) {
                         report_emulation_failure(&emulate_ctxt);
                         return EMULATE_FAIL;
@@ -1069,6 +1155,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,

 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
 {
+        kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
         switch (cr) {
         case 0:
                 return vcpu->cr0;
@@ -1403,6 +1490,7 @@ static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
         sregs->gdt.limit = dt.limit;
         sregs->gdt.base = dt.base;

+        kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
         sregs->cr0 = vcpu->cr0;
         sregs->cr2 = vcpu->cr2;
         sregs->cr3 = vcpu->cr3;
@@ -1467,11 +1555,15 @@ static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
 #endif
         vcpu->apic_base = sregs->apic_base;

+        kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
+
         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
         kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0);

         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
         kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
+        if (!is_long_mode(vcpu) && is_pae(vcpu))
+                load_pdptrs(vcpu, vcpu->cr3);

         if (mmu_reset_needed)
                 kvm_mmu_reset_context(vcpu);
@@ -1693,12 +1785,12 @@ static long kvm_dev_ioctl(struct file *filp,
                 if (copy_from_user(&kvm_run, (void *)arg, sizeof kvm_run))
                         goto out;
                 r = kvm_dev_ioctl_run(kvm, &kvm_run);
-                if (r < 0)
+                if (r < 0 && r != -EINTR)
                         goto out;
-                r = -EFAULT;
-                if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run))
+                if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run)) {
+                        r = -EFAULT;
                         goto out;
-                r = 0;
+                }
                 break;
         }
         case KVM_GET_REGS: {
@@ -1842,6 +1934,7 @@ static long kvm_dev_ioctl(struct file *filp,
                                  num_msrs_to_save * sizeof(u32)))
                         goto out;
                 r = 0;
+                break;
         }
         default:
                 ;
@@ -1944,17 +2037,17 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
                 return -EEXIST;
         }

-        kvm_arch_ops = ops;
-
-        if (!kvm_arch_ops->cpu_has_kvm_support()) {
+        if (!ops->cpu_has_kvm_support()) {
                 printk(KERN_ERR "kvm: no hardware support\n");
                 return -EOPNOTSUPP;
         }
-        if (kvm_arch_ops->disabled_by_bios()) {
+        if (ops->disabled_by_bios()) {
                 printk(KERN_ERR "kvm: disabled by bios\n");
                 return -EOPNOTSUPP;
         }

+        kvm_arch_ops = ops;
+
         r = kvm_arch_ops->hardware_setup();
         if (r < 0)
                 return r;
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index 790423c5f23d..c6f972914f08 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -26,7 +26,31 @@
 #include "vmx.h"
 #include "kvm.h"

+#undef MMU_DEBUG
+
+#undef AUDIT
+
+#ifdef AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
+#endif
+
+#ifdef MMU_DEBUG
+
+#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+
+#else
+
 #define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
+
+#endif
+
+#if defined(MMU_DEBUG) || defined(AUDIT)
+static int dbg = 1;
+#endif

 #define ASSERT(x)                                                       \
         if (!(x)) {                                                     \
@@ -34,8 +58,10 @@
                __FILE__, __LINE__, #x);                                 \
         }

-#define PT64_ENT_PER_PAGE 512
-#define PT32_ENT_PER_PAGE 1024
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)

 #define PT_WRITABLE_SHIFT 1

@@ -125,6 +151,13 @@
 #define PT_DIRECTORY_LEVEL 2
 #define PT_PAGE_TABLE_LEVEL 1

+#define RMAP_EXT 4
+
+struct kvm_rmap_desc {
+        u64 *shadow_ptes[RMAP_EXT];
+        struct kvm_rmap_desc *more;
+};
+
 static int is_write_protection(struct kvm_vcpu *vcpu)
 {
         return vcpu->cr0 & CR0_WP_MASK;
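struct kvm_rmap_desc above is the overflow node of a tagged-pointer scheme (spelled out in the "Reverse mapping data structures" comment added further down in this file); a hedged sketch of the tag decoding, with illustrative names:

#include <stdint.h>

struct rmap_desc_demo;                 /* stands in for kvm_rmap_desc */

static int is_chain_demo(unsigned long private)
{
        return private & 1;            /* bit 0 set: points to a desc chain */
}

static uint64_t *as_spte_demo(unsigned long private)
{
        return (uint64_t *)private;    /* bit 0 clear: a single spte */
}

static struct rmap_desc_demo *as_desc_demo(unsigned long private)
{
        return (struct rmap_desc_demo *)(private & ~1ul); /* strip the tag */
}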
@@ -150,32 +183,272 @@ static int is_io_pte(unsigned long pte)
         return pte & PT_SHADOW_IO_MARK;
 }

+static int is_rmap_pte(u64 pte)
+{
+        return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
+                == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+                                  size_t objsize, int min)
+{
+        void *obj;
+
+        if (cache->nobjs >= min)
+                return 0;
+        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+                obj = kzalloc(objsize, GFP_NOWAIT);
+                if (!obj)
+                        return -ENOMEM;
+                cache->objects[cache->nobjs++] = obj;
+        }
+        return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+        while (mc->nobjs)
+                kfree(mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+        int r;
+
+        r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
+                                   sizeof(struct kvm_pte_chain), 4);
+        if (r)
+                goto out;
+        r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
+                                   sizeof(struct kvm_rmap_desc), 1);
+out:
+        return r;
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+        mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
+        mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
+                                    size_t size)
+{
+        void *p;
+
+        BUG_ON(!mc->nobjs);
+        p = mc->objects[--mc->nobjs];
+        memset(p, 0, size);
+        return p;
+}
+
+static void mmu_memory_cache_free(struct kvm_mmu_memory_cache *mc, void *obj)
+{
+        if (mc->nobjs < KVM_NR_MEM_OBJS)
+                mc->objects[mc->nobjs++] = obj;
+        else
+                kfree(obj);
+}
+
+static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
+{
+        return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
+                                      sizeof(struct kvm_pte_chain));
+}
+
+static void mmu_free_pte_chain(struct kvm_vcpu *vcpu,
+                               struct kvm_pte_chain *pc)
+{
+        mmu_memory_cache_free(&vcpu->mmu_pte_chain_cache, pc);
+}
+
+static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+{
+        return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
+                                      sizeof(struct kvm_rmap_desc));
+}
+
+static void mmu_free_rmap_desc(struct kvm_vcpu *vcpu,
+                               struct kvm_rmap_desc *rd)
+{
+        mmu_memory_cache_free(&vcpu->mmu_rmap_desc_cache, rd);
+}
+
+/*
+ * Reverse mapping data structures:
+ *
+ * If page->private bit zero is zero, then page->private points to the
+ * shadow page table entry that points to page_address(page).
+ *
+ * If page->private bit zero is one, (then page->private & ~1) points
+ * to a struct kvm_rmap_desc containing more mappings.
+ */
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
+{
+        struct page *page;
+        struct kvm_rmap_desc *desc;
+        int i;
+
+        if (!is_rmap_pte(*spte))
+                return;
+        page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+        if (!page->private) {
+                rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
+                page->private = (unsigned long)spte;
+        } else if (!(page->private & 1)) {
+                rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
+                desc = mmu_alloc_rmap_desc(vcpu);
+                desc->shadow_ptes[0] = (u64 *)page->private;
+                desc->shadow_ptes[1] = spte;
+                page->private = (unsigned long)desc | 1;
+        } else {
+                rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
+                desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+                while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
+                        desc = desc->more;
+                if (desc->shadow_ptes[RMAP_EXT-1]) {
+                        desc->more = mmu_alloc_rmap_desc(vcpu);
+                        desc = desc->more;
+                }
+                for (i = 0; desc->shadow_ptes[i]; ++i)
+                        ;
+                desc->shadow_ptes[i] = spte;
+        }
+}
+
+static void rmap_desc_remove_entry(struct kvm_vcpu *vcpu,
+                                   struct page *page,
+                                   struct kvm_rmap_desc *desc,
+                                   int i,
+                                   struct kvm_rmap_desc *prev_desc)
+{
+        int j;
+
+        for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
+                ;
+        desc->shadow_ptes[i] = desc->shadow_ptes[j];
+        desc->shadow_ptes[j] = 0;
+        if (j != 0)
+                return;
+        if (!prev_desc && !desc->more)
+                page->private = (unsigned long)desc->shadow_ptes[0];
+        else
+                if (prev_desc)
+                        prev_desc->more = desc->more;
+                else
+                        page->private = (unsigned long)desc->more | 1;
+        mmu_free_rmap_desc(vcpu, desc);
+}
+
+static void rmap_remove(struct kvm_vcpu *vcpu, u64 *spte)
+{
+        struct page *page;
+        struct kvm_rmap_desc *desc;
+        struct kvm_rmap_desc *prev_desc;
+        int i;
+
+        if (!is_rmap_pte(*spte))
+                return;
+        page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+        if (!page->private) {
+                printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+                BUG();
+        } else if (!(page->private & 1)) {
+                rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
+                if ((u64 *)page->private != spte) {
+                        printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
+                               spte, *spte);
+                        BUG();
+                }
+                page->private = 0;
+        } else {
+                rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
+                desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+                prev_desc = NULL;
+                while (desc) {
+                        for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
+                                if (desc->shadow_ptes[i] == spte) {
+                                        rmap_desc_remove_entry(vcpu, page,
+                                                               desc, i,
+                                                               prev_desc);
+                                        return;
+                                }
+                        prev_desc = desc;
+                        desc = desc->more;
+                }
+                BUG();
+        }
+}
+
+static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+{
+        struct kvm *kvm = vcpu->kvm;
+        struct page *page;
+        struct kvm_memory_slot *slot;
+        struct kvm_rmap_desc *desc;
+        u64 *spte;
+
+        slot = gfn_to_memslot(kvm, gfn);
+        BUG_ON(!slot);
+        page = gfn_to_page(slot, gfn);
+
+        while (page->private) {
+                if (!(page->private & 1))
+                        spte = (u64 *)page->private;
+                else {
+                        desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+                        spte = desc->shadow_ptes[0];
+                }
+                BUG_ON(!spte);
+                BUG_ON((*spte & PT64_BASE_ADDR_MASK) !=
+                       page_to_pfn(page) << PAGE_SHIFT);
+                BUG_ON(!(*spte & PT_PRESENT_MASK));
+                BUG_ON(!(*spte & PT_WRITABLE_MASK));
+                rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+                rmap_remove(vcpu, spte);
+                kvm_arch_ops->tlb_flush(vcpu);
+                *spte &= ~(u64)PT_WRITABLE_MASK;
+        }
+}
+
+static int is_empty_shadow_page(hpa_t page_hpa)
+{
+        u64 *pos;
+        u64 *end;
+
+        for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64);
+             pos != end; pos++)
+                if (*pos != 0) {
+                        printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+                               pos, *pos);
+                        return 0;
+                }
+        return 1;
+}
+
 static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
 {
         struct kvm_mmu_page *page_head = page_header(page_hpa);

+        ASSERT(is_empty_shadow_page(page_hpa));
         list_del(&page_head->link);
         page_head->page_hpa = page_hpa;
         list_add(&page_head->link, &vcpu->free_pages);
+        ++vcpu->kvm->n_free_mmu_pages;
 }

-static int is_empty_shadow_page(hpa_t page_hpa)
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
 {
-        u32 *pos;
-        u32 *end;
-        for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32);
-             pos != end; pos++)
-                if (*pos != 0)
-                        return 0;
-        return 1;
+        return gfn;
 }

-static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte)
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+                                               u64 *parent_pte)
 {
         struct kvm_mmu_page *page;

         if (list_empty(&vcpu->free_pages))
-                return INVALID_PAGE;
+                return NULL;

         page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
         list_del(&page->link);
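rmap_write_protect above always re-reads page->private rather than iterating a saved list, because each rmap_remove() it performs rewrites the head; a sketch of that drain-the-head loop shape (illustrative):

struct node_demo { struct node_demo *next; };

static void drain_demo(struct node_demo **head)
{
        while (*head) {
                struct node_demo *first = *head;

                *head = first->next;   /* stands in for rmap_remove() */
                /* ...write-protect the spte that 'first' tracked... */
        }
}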
@@ -183,8 +456,239 @@ static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte)
         ASSERT(is_empty_shadow_page(page->page_hpa));
         page->slot_bitmap = 0;
         page->global = 1;
+        page->multimapped = 0;
         page->parent_pte = parent_pte;
-        return page->page_hpa;
+        --vcpu->kvm->n_free_mmu_pages;
+        return page;
+}
+
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+                                    struct kvm_mmu_page *page, u64 *parent_pte)
+{
+        struct kvm_pte_chain *pte_chain;
+        struct hlist_node *node;
+        int i;
+
+        if (!parent_pte)
+                return;
+        if (!page->multimapped) {
+                u64 *old = page->parent_pte;
+
+                if (!old) {
+                        page->parent_pte = parent_pte;
+                        return;
+                }
+                page->multimapped = 1;
+                pte_chain = mmu_alloc_pte_chain(vcpu);
+                INIT_HLIST_HEAD(&page->parent_ptes);
+                hlist_add_head(&pte_chain->link, &page->parent_ptes);
+                pte_chain->parent_ptes[0] = old;
+        }
+        hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
+                if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
+                        continue;
+                for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
+                        if (!pte_chain->parent_ptes[i]) {
+                                pte_chain->parent_ptes[i] = parent_pte;
+                                return;
+                        }
+        }
+        pte_chain = mmu_alloc_pte_chain(vcpu);
+        BUG_ON(!pte_chain);
+        hlist_add_head(&pte_chain->link, &page->parent_ptes);
+        pte_chain->parent_ptes[0] = parent_pte;
+}
+
+static void mmu_page_remove_parent_pte(struct kvm_vcpu *vcpu,
+                                       struct kvm_mmu_page *page,
+                                       u64 *parent_pte)
+{
+        struct kvm_pte_chain *pte_chain;
+        struct hlist_node *node;
+        int i;
+
+        if (!page->multimapped) {
+                BUG_ON(page->parent_pte != parent_pte);
+                page->parent_pte = NULL;
+                return;
+        }
+        hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
+                for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+                        if (!pte_chain->parent_ptes[i])
+                                break;
+                        if (pte_chain->parent_ptes[i] != parent_pte)
+                                continue;
+                        while (i + 1 < NR_PTE_CHAIN_ENTRIES
+                               && pte_chain->parent_ptes[i + 1]) {
+                                pte_chain->parent_ptes[i]
+                                        = pte_chain->parent_ptes[i + 1];
+                                ++i;
+                        }
+                        pte_chain->parent_ptes[i] = NULL;
+                        if (i == 0) {
+                                hlist_del(&pte_chain->link);
+                                mmu_free_pte_chain(vcpu, pte_chain);
+                                if (hlist_empty(&page->parent_ptes)) {
+                                        page->multimapped = 0;
+                                        page->parent_pte = NULL;
+                                }
+                        }
+                        return;
+                }
+        BUG();
+}
+
+static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
+                                                gfn_t gfn)
+{
+        unsigned index;
+        struct hlist_head *bucket;
+        struct kvm_mmu_page *page;
+        struct hlist_node *node;
+
+        pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+        index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+        bucket = &vcpu->kvm->mmu_page_hash[index];
+        hlist_for_each_entry(page, node, bucket, hash_link)
+                if (page->gfn == gfn && !page->role.metaphysical) {
+                        pgprintk("%s: found role %x\n",
+                                 __FUNCTION__, page->role.word);
+                        return page;
+                }
+        return NULL;
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
+                                             gfn_t gfn,
+                                             gva_t gaddr,
+                                             unsigned level,
+                                             int metaphysical,
+                                             u64 *parent_pte)
+{
+        union kvm_mmu_page_role role;
+        unsigned index;
+        unsigned quadrant;
+        struct hlist_head *bucket;
+        struct kvm_mmu_page *page;
+        struct hlist_node *node;
+
+        role.word = 0;
+        role.glevels = vcpu->mmu.root_level;
+        role.level = level;
+        role.metaphysical = metaphysical;
+        if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
+                quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
+                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
+                role.quadrant = quadrant;
+        }
+        pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+                 gfn, role.word);
+        index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+        bucket = &vcpu->kvm->mmu_page_hash[index];
+        hlist_for_each_entry(page, node, bucket, hash_link)
+                if (page->gfn == gfn && page->role.word == role.word) {
+                        mmu_page_add_parent_pte(vcpu, page, parent_pte);
+                        pgprintk("%s: found\n", __FUNCTION__);
+                        return page;
+                }
+        page = kvm_mmu_alloc_page(vcpu, parent_pte);
+        if (!page)
+                return page;
+        pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+        page->gfn = gfn;
+        page->role = role;
+        hlist_add_head(&page->hash_link, bucket);
+        if (!metaphysical)
+                rmap_write_protect(vcpu, gfn);
+        return page;
+}
+
+static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
+                                         struct kvm_mmu_page *page)
+{
+        unsigned i;
+        u64 *pt;
+        u64 ent;
+
+        pt = __va(page->page_hpa);
+
+        if (page->role.level == PT_PAGE_TABLE_LEVEL) {
+                for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+                        if (pt[i] & PT_PRESENT_MASK)
+                                rmap_remove(vcpu, &pt[i]);
+                        pt[i] = 0;
+                }
+                kvm_arch_ops->tlb_flush(vcpu);
+                return;
+        }
+
+        for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+                ent = pt[i];
+
+                pt[i] = 0;
+                if (!(ent & PT_PRESENT_MASK))
+                        continue;
+                ent &= PT64_BASE_ADDR_MASK;
+                mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]);
+        }
+}
+
+static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
+                             struct kvm_mmu_page *page,
+                             u64 *parent_pte)
+{
+        mmu_page_remove_parent_pte(vcpu, page, parent_pte);
+}
+
+static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
+                             struct kvm_mmu_page *page)
+{
+        u64 *parent_pte;
+
+        while (page->multimapped || page->parent_pte) {
+                if (!page->multimapped)
+                        parent_pte = page->parent_pte;
+                else {
+                        struct kvm_pte_chain *chain;
+
+                        chain = container_of(page->parent_ptes.first,
+                                             struct kvm_pte_chain, link);
+                        parent_pte = chain->parent_ptes[0];
+                }
+                BUG_ON(!parent_pte);
+                kvm_mmu_put_page(vcpu, page, parent_pte);
+                *parent_pte = 0;
+        }
+        kvm_mmu_page_unlink_children(vcpu, page);
+        if (!page->root_count) {
+                hlist_del(&page->hash_link);
+                kvm_mmu_free_page(vcpu, page->page_hpa);
+        } else {
+                list_del(&page->link);
+                list_add(&page->link, &vcpu->kvm->active_mmu_pages);
+        }
+}
+
+static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+        unsigned index;
+        struct hlist_head *bucket;
+        struct kvm_mmu_page *page;
+        struct hlist_node *node, *n;
+        int r;
+
+        pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+        r = 0;
+        index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+        bucket = &vcpu->kvm->mmu_page_hash[index];
+        hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
+                if (page->gfn == gfn && !page->role.metaphysical) {
+                        pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+                                 page->role.word);
+                        kvm_mmu_zap_page(vcpu, page);
+                        r = 1;
+                }
+        return r;
 }

 static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
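kvm_mmu_get_page above keys shadow pages by (gfn, role): the hash bucket is chosen by gfn alone, and the full match adds one integer compare of role.word. A compact restatement (illustrative constants and names):

#define NUM_MMU_PAGES_DEMO 256

static unsigned bucket_demo(unsigned long gfn)
{
        return gfn % NUM_MMU_PAGES_DEMO;   /* mirrors kvm_page_table_hashfn */
}

static int same_shadow_page_demo(unsigned long gfn_a, unsigned role_a,
                                 unsigned long gfn_b, unsigned role_b)
{
        return gfn_a == gfn_b && role_a == role_b;
}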
@@ -225,35 +729,6 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
         return gpa_to_hpa(vcpu, gpa);
 }

-
-static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
-                               int level)
-{
-        ASSERT(vcpu);
-        ASSERT(VALID_PAGE(page_hpa));
-        ASSERT(level <= PT64_ROOT_LEVEL && level > 0);
-
-        if (level == 1)
-                memset(__va(page_hpa), 0, PAGE_SIZE);
-        else {
-                u64 *pos;
-                u64 *end;
-
-                for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
-                     pos != end; pos++) {
-                        u64 current_ent = *pos;
-
-                        *pos = 0;
-                        if (is_present_pte(current_ent))
-                                release_pt_page_64(vcpu,
-                                                   current_ent &
-                                                   PT64_BASE_ADDR_MASK,
-                                                   level - 1);
-                }
-        }
-        kvm_mmu_free_page(vcpu, page_hpa);
-}
-
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
| @@ -266,52 +741,109 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) | |||
| 266 | for (; ; level--) { | 741 | for (; ; level--) { |
| 267 | u32 index = PT64_INDEX(v, level); | 742 | u32 index = PT64_INDEX(v, level); |
| 268 | u64 *table; | 743 | u64 *table; |
| 744 | u64 pte; | ||
| 269 | 745 | ||
| 270 | ASSERT(VALID_PAGE(table_addr)); | 746 | ASSERT(VALID_PAGE(table_addr)); |
| 271 | table = __va(table_addr); | 747 | table = __va(table_addr); |
| 272 | 748 | ||
| 273 | if (level == 1) { | 749 | if (level == 1) { |
| 750 | pte = table[index]; | ||
| 751 | if (is_present_pte(pte) && is_writeble_pte(pte)) | ||
| 752 | return 0; | ||
| 274 | mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); | 753 | mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); |
| 275 | page_header_update_slot(vcpu->kvm, table, v); | 754 | page_header_update_slot(vcpu->kvm, table, v); |
| 276 | table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | | 755 | table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | |
| 277 | PT_USER_MASK; | 756 | PT_USER_MASK; |
| 757 | rmap_add(vcpu, &table[index]); | ||
| 278 | return 0; | 758 | return 0; |
| 279 | } | 759 | } |
| 280 | 760 | ||
| 281 | if (table[index] == 0) { | 761 | if (table[index] == 0) { |
| 282 | hpa_t new_table = kvm_mmu_alloc_page(vcpu, | 762 | struct kvm_mmu_page *new_table; |
| 283 | &table[index]); | 763 | gfn_t pseudo_gfn; |
| 284 | 764 | ||
| 285 | if (!VALID_PAGE(new_table)) { | 765 | pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) |
| 766 | >> PAGE_SHIFT; | ||
| 767 | new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, | ||
| 768 | v, level - 1, | ||
| 769 | 1, &table[index]); | ||
| 770 | if (!new_table) { | ||
| 286 | pgprintk("nonpaging_map: ENOMEM\n"); | 771 | pgprintk("nonpaging_map: ENOMEM\n"); |
| 287 | return -ENOMEM; | 772 | return -ENOMEM; |
| 288 | } | 773 | } |
| 289 | 774 | ||
| 290 | if (level == PT32E_ROOT_LEVEL) | 775 | table[index] = new_table->page_hpa | PT_PRESENT_MASK |
| 291 | table[index] = new_table | PT_PRESENT_MASK; | 776 | | PT_WRITABLE_MASK | PT_USER_MASK; |
| 292 | else | ||
| 293 | table[index] = new_table | PT_PRESENT_MASK | | ||
| 294 | PT_WRITABLE_MASK | PT_USER_MASK; | ||
| 295 | } | 777 | } |
| 296 | table_addr = table[index] & PT64_BASE_ADDR_MASK; | 778 | table_addr = table[index] & PT64_BASE_ADDR_MASK; |
| 297 | } | 779 | } |
| 298 | } | 780 | } |
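The pseudo_gfn above keys the real-mode shadow table by the base frame of the slice it covers, so repeated faults in one slice reuse a single shadow table. A worked value, assuming PAGE_SHIFT is 12 and that PT64_DIR_BASE_ADDR_MASK clears the low 21 bits (both assumptions, taken from the usual x86 long-mode layout):

    /* Hypothetical fault address for illustration: */
    gva_t v = 0x00605000;
    gfn_t pseudo_gfn = (v & ~0x1fffffULL) >> 12;  /* 0x00600000 >> 12 = 0x600 */
    /* Any v in [0x00600000, 0x00800000) yields pseudo_gfn 0x600, so
     * kvm_mmu_get_page() finds the same shadow table on the next fault. */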
| 299 | 781 | ||
| 300 | static void nonpaging_flush(struct kvm_vcpu *vcpu) | 782 | static void mmu_free_roots(struct kvm_vcpu *vcpu) |
| 301 | { | 783 | { |
| 302 | hpa_t root = vcpu->mmu.root_hpa; | 784 | int i; |
| 785 | struct kvm_mmu_page *page; | ||
| 303 | 786 | ||
| 304 | ++kvm_stat.tlb_flush; | 787 | #ifdef CONFIG_X86_64 |
| 305 | pgprintk("nonpaging_flush\n"); | 788 | if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { |
| 306 | ASSERT(VALID_PAGE(root)); | 789 | hpa_t root = vcpu->mmu.root_hpa; |
| 307 | release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level); | 790 | |
| 308 | root = kvm_mmu_alloc_page(vcpu, NULL); | 791 | ASSERT(VALID_PAGE(root)); |
| 309 | ASSERT(VALID_PAGE(root)); | 792 | page = page_header(root); |
| 310 | vcpu->mmu.root_hpa = root; | 793 | --page->root_count; |
| 311 | if (is_paging(vcpu)) | 794 | vcpu->mmu.root_hpa = INVALID_PAGE; |
| 312 | root |= (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)); | 795 | return; |
| 313 | kvm_arch_ops->set_cr3(vcpu, root); | 796 | } |
| 314 | kvm_arch_ops->tlb_flush(vcpu); | 797 | #endif |
| 798 | for (i = 0; i < 4; ++i) { | ||
| 799 | hpa_t root = vcpu->mmu.pae_root[i]; | ||
| 800 | |||
| 801 | ASSERT(VALID_PAGE(root)); | ||
| 802 | root &= PT64_BASE_ADDR_MASK; | ||
| 803 | page = page_header(root); | ||
| 804 | --page->root_count; | ||
| 805 | vcpu->mmu.pae_root[i] = INVALID_PAGE; | ||
| 806 | } | ||
| 807 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
| 808 | } | ||
| 809 | |||
| 810 | static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | ||
| 811 | { | ||
| 812 | int i; | ||
| 813 | gfn_t root_gfn; | ||
| 814 | struct kvm_mmu_page *page; | ||
| 815 | |||
| 816 | root_gfn = vcpu->cr3 >> PAGE_SHIFT; | ||
| 817 | |||
| 818 | #ifdef CONFIG_X86_64 | ||
| 819 | if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
| 820 | hpa_t root = vcpu->mmu.root_hpa; | ||
| 821 | |||
| 822 | ASSERT(!VALID_PAGE(root)); | ||
| 823 | page = kvm_mmu_get_page(vcpu, root_gfn, 0, | ||
| 824 | PT64_ROOT_LEVEL, 0, NULL); | ||
| 825 | root = page->page_hpa; | ||
| 826 | ++page->root_count; | ||
| 827 | vcpu->mmu.root_hpa = root; | ||
| 828 | return; | ||
| 829 | } | ||
| 830 | #endif | ||
| 831 | for (i = 0; i < 4; ++i) { | ||
| 832 | hpa_t root = vcpu->mmu.pae_root[i]; | ||
| 833 | |||
| 834 | ASSERT(!VALID_PAGE(root)); | ||
| 835 | if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) | ||
| 836 | root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT; | ||
| 837 | else if (vcpu->mmu.root_level == 0) | ||
| 838 | root_gfn = 0; | ||
| 839 | page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | ||
| 840 | PT32_ROOT_LEVEL, !is_paging(vcpu), | ||
| 841 | NULL); | ||
| 842 | root = page->page_hpa; | ||
| 843 | ++page->root_count; | ||
| 844 | vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; | ||
| 845 | } | ||
| 846 | vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); | ||
| 315 | } | 847 | } |
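In the PAE branch above, each of the four roots shadows one 1GB quarter of the address space (the i << 30 offsets). The shadow fetch path in paging_tmpl.h selects among them with the matching index computation; isolated here as a helper (the helper name is ours, not the patch's):

    static inline int pae_root_index(gva_t addr)
    {
        return (addr >> 30) & 3;    /* bits 31:30 pick one 1GB slice */
    }

root_count, incremented here and decremented in mmu_free_roots(), is what keeps an in-use root from being zapped out from under the vcpu.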
| 316 | 848 | ||
| 317 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | 849 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) |
| @@ -322,43 +854,29 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | |||
| 322 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | 854 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, |
| 323 | u32 error_code) | 855 | u32 error_code) |
| 324 | { | 856 | { |
| 325 | int ret; | ||
| 326 | gpa_t addr = gva; | 857 | gpa_t addr = gva; |
| 858 | hpa_t paddr; | ||
| 859 | int r; | ||
| 860 | |||
| 861 | r = mmu_topup_memory_caches(vcpu); | ||
| 862 | if (r) | ||
| 863 | return r; | ||
| 327 | 864 | ||
| 328 | ASSERT(vcpu); | 865 | ASSERT(vcpu); |
| 329 | ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); | 866 | ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); |
| 330 | 867 | ||
| 331 | for (;;) { | ||
| 332 | hpa_t paddr; | ||
| 333 | |||
| 334 | paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK); | ||
| 335 | 868 | ||
| 336 | if (is_error_hpa(paddr)) | 869 | paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK); |
| 337 | return 1; | ||
| 338 | 870 | ||
| 339 | ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr); | 871 | if (is_error_hpa(paddr)) |
| 340 | if (ret) { | 872 | return 1; |
| 341 | nonpaging_flush(vcpu); | ||
| 342 | continue; | ||
| 343 | } | ||
| 344 | break; | ||
| 345 | } | ||
| 346 | return ret; | ||
| 347 | } | ||
| 348 | 873 | ||
| 349 | static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr) | 874 | return nonpaging_map(vcpu, addr & PAGE_MASK, paddr); |
| 350 | { | ||
| 351 | } | 875 | } |
| 352 | 876 | ||
| 353 | static void nonpaging_free(struct kvm_vcpu *vcpu) | 877 | static void nonpaging_free(struct kvm_vcpu *vcpu) |
| 354 | { | 878 | { |
| 355 | hpa_t root; | 879 | mmu_free_roots(vcpu); |
| 356 | |||
| 357 | ASSERT(vcpu); | ||
| 358 | root = vcpu->mmu.root_hpa; | ||
| 359 | if (VALID_PAGE(root)) | ||
| 360 | release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level); | ||
| 361 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
| 362 | } | 880 | } |
| 363 | 881 | ||
| 364 | static int nonpaging_init_context(struct kvm_vcpu *vcpu) | 882 | static int nonpaging_init_context(struct kvm_vcpu *vcpu) |
| @@ -367,40 +885,31 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) | |||
| 367 | 885 | ||
| 368 | context->new_cr3 = nonpaging_new_cr3; | 886 | context->new_cr3 = nonpaging_new_cr3; |
| 369 | context->page_fault = nonpaging_page_fault; | 887 | context->page_fault = nonpaging_page_fault; |
| 370 | context->inval_page = nonpaging_inval_page; | ||
| 371 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 888 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
| 372 | context->free = nonpaging_free; | 889 | context->free = nonpaging_free; |
| 373 | context->root_level = PT32E_ROOT_LEVEL; | 890 | context->root_level = 0; |
| 374 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 891 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
| 375 | context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); | 892 | mmu_alloc_roots(vcpu); |
| 376 | ASSERT(VALID_PAGE(context->root_hpa)); | 893 | ASSERT(VALID_PAGE(context->root_hpa)); |
| 377 | kvm_arch_ops->set_cr3(vcpu, context->root_hpa); | 894 | kvm_arch_ops->set_cr3(vcpu, context->root_hpa); |
| 378 | return 0; | 895 | return 0; |
| 379 | } | 896 | } |
| 380 | 897 | ||
| 381 | |||
| 382 | static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | 898 | static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) |
| 383 | { | 899 | { |
| 384 | struct kvm_mmu_page *page, *npage; | ||
| 385 | |||
| 386 | list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages, | ||
| 387 | link) { | ||
| 388 | if (page->global) | ||
| 389 | continue; | ||
| 390 | |||
| 391 | if (!page->parent_pte) | ||
| 392 | continue; | ||
| 393 | |||
| 394 | *page->parent_pte = 0; | ||
| 395 | release_pt_page_64(vcpu, page->page_hpa, 1); | ||
| 396 | } | ||
| 397 | ++kvm_stat.tlb_flush; | 900 | ++kvm_stat.tlb_flush; |
| 398 | kvm_arch_ops->tlb_flush(vcpu); | 901 | kvm_arch_ops->tlb_flush(vcpu); |
| 399 | } | 902 | } |
| 400 | 903 | ||
| 401 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | 904 | static void paging_new_cr3(struct kvm_vcpu *vcpu) |
| 402 | { | 905 | { |
| 906 | pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); | ||
| 907 | mmu_free_roots(vcpu); | ||
| 908 | if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) | ||
| 909 | kvm_mmu_free_some_pages(vcpu); | ||
| 910 | mmu_alloc_roots(vcpu); | ||
| 403 | kvm_mmu_flush_tlb(vcpu); | 911 | kvm_mmu_flush_tlb(vcpu); |
| 912 | kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); | ||
| 404 | } | 913 | } |
| 405 | 914 | ||
| 406 | static void mark_pagetable_nonglobal(void *shadow_pte) | 915 | static void mark_pagetable_nonglobal(void *shadow_pte) |
| @@ -412,7 +921,8 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu, | |||
| 412 | u64 *shadow_pte, | 921 | u64 *shadow_pte, |
| 413 | gpa_t gaddr, | 922 | gpa_t gaddr, |
| 414 | int dirty, | 923 | int dirty, |
| 415 | u64 access_bits) | 924 | u64 access_bits, |
| 925 | gfn_t gfn) | ||
| 416 | { | 926 | { |
| 417 | hpa_t paddr; | 927 | hpa_t paddr; |
| 418 | 928 | ||
| @@ -420,13 +930,10 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu, | |||
| 420 | if (!dirty) | 930 | if (!dirty) |
| 421 | access_bits &= ~PT_WRITABLE_MASK; | 931 | access_bits &= ~PT_WRITABLE_MASK; |
| 422 | 932 | ||
| 423 | if (access_bits & PT_WRITABLE_MASK) | 933 | paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); |
| 424 | mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); | ||
| 425 | 934 | ||
| 426 | *shadow_pte |= access_bits; | 935 | *shadow_pte |= access_bits; |
| 427 | 936 | ||
| 428 | paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); | ||
| 429 | |||
| 430 | if (!(*shadow_pte & PT_GLOBAL_MASK)) | 937 | if (!(*shadow_pte & PT_GLOBAL_MASK)) |
| 431 | mark_pagetable_nonglobal(shadow_pte); | 938 | mark_pagetable_nonglobal(shadow_pte); |
| 432 | 939 | ||
| @@ -434,10 +941,31 @@ static inline void set_pte_common(struct kvm_vcpu *vcpu, | |||
| 434 | *shadow_pte |= gaddr; | 941 | *shadow_pte |= gaddr; |
| 435 | *shadow_pte |= PT_SHADOW_IO_MARK; | 942 | *shadow_pte |= PT_SHADOW_IO_MARK; |
| 436 | *shadow_pte &= ~PT_PRESENT_MASK; | 943 | *shadow_pte &= ~PT_PRESENT_MASK; |
| 437 | } else { | 944 | return; |
| 438 | *shadow_pte |= paddr; | 945 | } |
| 439 | page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); | 946 | |
| 947 | *shadow_pte |= paddr; | ||
| 948 | |||
| 949 | if (access_bits & PT_WRITABLE_MASK) { | ||
| 950 | struct kvm_mmu_page *shadow; | ||
| 951 | |||
| 952 | shadow = kvm_mmu_lookup_page(vcpu, gfn); | ||
| 953 | if (shadow) { | ||
| 954 | pgprintk("%s: found shadow page for %lx, marking ro\n", | ||
| 955 | __FUNCTION__, gfn); | ||
| 956 | access_bits &= ~PT_WRITABLE_MASK; | ||
| 957 | if (is_writeble_pte(*shadow_pte)) { | ||
| 958 | *shadow_pte &= ~PT_WRITABLE_MASK; | ||
| 959 | kvm_arch_ops->tlb_flush(vcpu); | ||
| 960 | } | ||
| 961 | } | ||
| 440 | } | 962 | } |
| 963 | |||
| 964 | if (access_bits & PT_WRITABLE_MASK) | ||
| 965 | mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); | ||
| 966 | |||
| 967 | page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); | ||
| 968 | rmap_add(vcpu, shadow_pte); | ||
| 441 | } | 969 | } |
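The kvm_mmu_lookup_page() check above is one half of a write-protection invariant; fix_write_pf() in paging_tmpl.h is the other. Stated compactly, as our reading of the code rather than text from the patch:

    /* For every spte s mapping gfn g:
     *     kvm_mmu_lookup_page(vcpu, g) != NULL  ==>  !is_writeble_pte(*s)
     * i.e. a frame currently shadowed as a page table is never writable,
     * so guest page-table writes always trap and reach kvm_mmu_pre_write(). */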
| 442 | 970 | ||
| 443 | static void inject_page_fault(struct kvm_vcpu *vcpu, | 971 | static void inject_page_fault(struct kvm_vcpu *vcpu, |
| @@ -474,41 +1002,6 @@ static int may_access(u64 pte, int write, int user) | |||
| 474 | return 1; | 1002 | return 1; |
| 475 | } | 1003 | } |
| 476 | 1004 | ||
| 477 | /* | ||
| 478 | * Remove a shadow pte. | ||
| 479 | */ | ||
| 480 | static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr) | ||
| 481 | { | ||
| 482 | hpa_t page_addr = vcpu->mmu.root_hpa; | ||
| 483 | int level = vcpu->mmu.shadow_root_level; | ||
| 484 | |||
| 485 | ++kvm_stat.invlpg; | ||
| 486 | |||
| 487 | for (; ; level--) { | ||
| 488 | u32 index = PT64_INDEX(addr, level); | ||
| 489 | u64 *table = __va(page_addr); | ||
| 490 | |||
| 491 | if (level == PT_PAGE_TABLE_LEVEL ) { | ||
| 492 | table[index] = 0; | ||
| 493 | return; | ||
| 494 | } | ||
| 495 | |||
| 496 | if (!is_present_pte(table[index])) | ||
| 497 | return; | ||
| 498 | |||
| 499 | page_addr = table[index] & PT64_BASE_ADDR_MASK; | ||
| 500 | |||
| 501 | if (level == PT_DIRECTORY_LEVEL && | ||
| 502 | (table[index] & PT_SHADOW_PS_MARK)) { | ||
| 503 | table[index] = 0; | ||
| 504 | release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL); | ||
| 505 | |||
| 506 | kvm_arch_ops->tlb_flush(vcpu); | ||
| 507 | return; | ||
| 508 | } | ||
| 509 | } | ||
| 510 | } | ||
| 511 | |||
| 512 | static void paging_free(struct kvm_vcpu *vcpu) | 1005 | static void paging_free(struct kvm_vcpu *vcpu) |
| 513 | { | 1006 | { |
| 514 | nonpaging_free(vcpu); | 1007 | nonpaging_free(vcpu); |
| @@ -522,37 +1015,40 @@ static void paging_free(struct kvm_vcpu *vcpu) | |||
| 522 | #include "paging_tmpl.h" | 1015 | #include "paging_tmpl.h" |
| 523 | #undef PTTYPE | 1016 | #undef PTTYPE |
| 524 | 1017 | ||
| 525 | static int paging64_init_context(struct kvm_vcpu *vcpu) | 1018 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) |
| 526 | { | 1019 | { |
| 527 | struct kvm_mmu *context = &vcpu->mmu; | 1020 | struct kvm_mmu *context = &vcpu->mmu; |
| 528 | 1021 | ||
| 529 | ASSERT(is_pae(vcpu)); | 1022 | ASSERT(is_pae(vcpu)); |
| 530 | context->new_cr3 = paging_new_cr3; | 1023 | context->new_cr3 = paging_new_cr3; |
| 531 | context->page_fault = paging64_page_fault; | 1024 | context->page_fault = paging64_page_fault; |
| 532 | context->inval_page = paging_inval_page; | ||
| 533 | context->gva_to_gpa = paging64_gva_to_gpa; | 1025 | context->gva_to_gpa = paging64_gva_to_gpa; |
| 534 | context->free = paging_free; | 1026 | context->free = paging_free; |
| 535 | context->root_level = PT64_ROOT_LEVEL; | 1027 | context->root_level = level; |
| 536 | context->shadow_root_level = PT64_ROOT_LEVEL; | 1028 | context->shadow_root_level = level; |
| 537 | context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); | 1029 | mmu_alloc_roots(vcpu); |
| 538 | ASSERT(VALID_PAGE(context->root_hpa)); | 1030 | ASSERT(VALID_PAGE(context->root_hpa)); |
| 539 | kvm_arch_ops->set_cr3(vcpu, context->root_hpa | | 1031 | kvm_arch_ops->set_cr3(vcpu, context->root_hpa | |
| 540 | (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); | 1032 | (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); |
| 541 | return 0; | 1033 | return 0; |
| 542 | } | 1034 | } |
| 543 | 1035 | ||
| 1036 | static int paging64_init_context(struct kvm_vcpu *vcpu) | ||
| 1037 | { | ||
| 1038 | return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); | ||
| 1039 | } | ||
| 1040 | |||
| 544 | static int paging32_init_context(struct kvm_vcpu *vcpu) | 1041 | static int paging32_init_context(struct kvm_vcpu *vcpu) |
| 545 | { | 1042 | { |
| 546 | struct kvm_mmu *context = &vcpu->mmu; | 1043 | struct kvm_mmu *context = &vcpu->mmu; |
| 547 | 1044 | ||
| 548 | context->new_cr3 = paging_new_cr3; | 1045 | context->new_cr3 = paging_new_cr3; |
| 549 | context->page_fault = paging32_page_fault; | 1046 | context->page_fault = paging32_page_fault; |
| 550 | context->inval_page = paging_inval_page; | ||
| 551 | context->gva_to_gpa = paging32_gva_to_gpa; | 1047 | context->gva_to_gpa = paging32_gva_to_gpa; |
| 552 | context->free = paging_free; | 1048 | context->free = paging_free; |
| 553 | context->root_level = PT32_ROOT_LEVEL; | 1049 | context->root_level = PT32_ROOT_LEVEL; |
| 554 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 1050 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
| 555 | context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); | 1051 | mmu_alloc_roots(vcpu); |
| 556 | ASSERT(VALID_PAGE(context->root_hpa)); | 1052 | ASSERT(VALID_PAGE(context->root_hpa)); |
| 557 | kvm_arch_ops->set_cr3(vcpu, context->root_hpa | | 1053 | kvm_arch_ops->set_cr3(vcpu, context->root_hpa | |
| 558 | (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); | 1054 | (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); |
| @@ -561,14 +1057,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) | |||
| 561 | 1057 | ||
| 562 | static int paging32E_init_context(struct kvm_vcpu *vcpu) | 1058 | static int paging32E_init_context(struct kvm_vcpu *vcpu) |
| 563 | { | 1059 | { |
| 564 | int ret; | 1060 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); |
| 565 | |||
| 566 | if ((ret = paging64_init_context(vcpu))) | ||
| 567 | return ret; | ||
| 568 | |||
| 569 | vcpu->mmu.root_level = PT32E_ROOT_LEVEL; | ||
| 570 | vcpu->mmu.shadow_root_level = PT32E_ROOT_LEVEL; | ||
| 571 | return 0; | ||
| 572 | } | 1061 | } |
| 573 | 1062 | ||
| 574 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | 1063 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) |
| @@ -597,41 +1086,161 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) | |||
| 597 | 1086 | ||
| 598 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) | 1087 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) |
| 599 | { | 1088 | { |
| 1089 | int r; | ||
| 1090 | |||
| 600 | destroy_kvm_mmu(vcpu); | 1091 | destroy_kvm_mmu(vcpu); |
| 601 | return init_kvm_mmu(vcpu); | 1092 | r = init_kvm_mmu(vcpu); |
| 1093 | if (r < 0) | ||
| 1094 | goto out; | ||
| 1095 | r = mmu_topup_memory_caches(vcpu); | ||
| 1096 | out: | ||
| 1097 | return r; | ||
| 602 | } | 1098 | } |
| 603 | 1099 | ||
| 604 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | 1100 | void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) |
| 605 | { | 1101 | { |
| 606 | while (!list_empty(&vcpu->free_pages)) { | 1102 | gfn_t gfn = gpa >> PAGE_SHIFT; |
| 1103 | struct kvm_mmu_page *page; | ||
| 1104 | struct kvm_mmu_page *child; | ||
| 1105 | struct hlist_node *node, *n; | ||
| 1106 | struct hlist_head *bucket; | ||
| 1107 | unsigned index; | ||
| 1108 | u64 *spte; | ||
| 1109 | u64 pte; | ||
| 1110 | unsigned offset = offset_in_page(gpa); | ||
| 1111 | unsigned pte_size; | ||
| 1112 | unsigned page_offset; | ||
| 1113 | unsigned misaligned; | ||
| 1114 | int level; | ||
| 1115 | int flooded = 0; | ||
| 1116 | |||
| 1117 | pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); | ||
| 1118 | if (gfn == vcpu->last_pt_write_gfn) { | ||
| 1119 | ++vcpu->last_pt_write_count; | ||
| 1120 | if (vcpu->last_pt_write_count >= 3) | ||
| 1121 | flooded = 1; | ||
| 1122 | } else { | ||
| 1123 | vcpu->last_pt_write_gfn = gfn; | ||
| 1124 | vcpu->last_pt_write_count = 1; | ||
| 1125 | } | ||
| 1126 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
| 1127 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
| 1128 | hlist_for_each_entry_safe(page, node, n, bucket, hash_link) { | ||
| 1129 | if (page->gfn != gfn || page->role.metaphysical) | ||
| 1130 | continue; | ||
| 1131 | pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; | ||
| 1132 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | ||
| 1133 | if (misaligned || flooded) { | ||
| 1134 | /* | ||
| 1135 | * Misaligned accesses are too much trouble to fix | ||
| 1136 | * up; also, they usually indicate a page is not used | ||
| 1137 | * as a page table. | ||
| 1138 | * | ||
| 1139 | * If we're seeing too many writes to a page, | ||
| 1140 | * it may no longer be a page table, or we may be | ||
| 1141 | * forking, in which case it is better to unmap the | ||
| 1142 | * page. | ||
| 1143 | */ | ||
| 1144 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | ||
| 1145 | gpa, bytes, page->role.word); | ||
| 1146 | kvm_mmu_zap_page(vcpu, page); | ||
| 1147 | continue; | ||
| 1148 | } | ||
| 1149 | page_offset = offset; | ||
| 1150 | level = page->role.level; | ||
| 1151 | if (page->role.glevels == PT32_ROOT_LEVEL) { | ||
| 1152 | page_offset <<= 1; /* 32->64 */ | ||
| 1153 | page_offset &= ~PAGE_MASK; | ||
| 1154 | } | ||
| 1155 | spte = __va(page->page_hpa); | ||
| 1156 | spte += page_offset / sizeof(*spte); | ||
| 1157 | pte = *spte; | ||
| 1158 | if (is_present_pte(pte)) { | ||
| 1159 | if (level == PT_PAGE_TABLE_LEVEL) | ||
| 1160 | rmap_remove(vcpu, spte); | ||
| 1161 | else { | ||
| 1162 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
| 1163 | mmu_page_remove_parent_pte(vcpu, child, spte); | ||
| 1164 | } | ||
| 1165 | } | ||
| 1166 | *spte = 0; | ||
| 1167 | } | ||
| 1168 | } | ||
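Two heuristics drive the zap decision above: the flood counter (three or more consecutive writes to the same gfn) and the misalignment test. The latter is compact enough to merit a restatement with a worked value (helper name ours):

    static int pte_write_misaligned(unsigned offset, int bytes,
                                    unsigned pte_size)
    {
        /* Nonzero iff the write is not contained in one naturally
         * aligned pte_size unit. */
        return ((offset ^ (offset + bytes - 1)) & ~(pte_size - 1)) != 0;
    }
    /* Example: offset 6, bytes 4, pte_size 8 -> (6 ^ 9) & ~7 = 8, so a
     * 4-byte write straddling two 8-byte sptes zaps the page instead of
     * patching it in place. */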
| 1169 | |||
| 1170 | void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) | ||
| 1171 | { | ||
| 1172 | } | ||
| 1173 | |||
| 1174 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | ||
| 1175 | { | ||
| 1176 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); | ||
| 1177 | |||
| 1178 | return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT); | ||
| 1179 | } | ||
| 1180 | |||
| 1181 | void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
| 1182 | { | ||
| 1183 | while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) { | ||
| 607 | struct kvm_mmu_page *page; | 1184 | struct kvm_mmu_page *page; |
| 608 | 1185 | ||
| 1186 | page = container_of(vcpu->kvm->active_mmu_pages.prev, | ||
| 1187 | struct kvm_mmu_page, link); | ||
| 1188 | kvm_mmu_zap_page(vcpu, page); | ||
| 1189 | } | ||
| 1190 | } | ||
| 1191 | EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages); | ||
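paging_new_cr3() calls this only once n_free_mmu_pages falls below KVM_MIN_FREE_MMU_PAGES, and the loop then frees up to the higher KVM_REFILL_PAGES mark. Zapping from active_mmu_pages.prev presumes the list is kept in rough recency order (we assume newly allocated pages go on at the head elsewhere in mmu.c), so the stalest shadow pages are evicted first. The two-watermark shape in isolation, with hypothetical names:

    if (free_pages < low_mark)           /* KVM_MIN_FREE_MMU_PAGES */
        while (free_pages < high_mark)   /* KVM_REFILL_PAGES */
            free_pages += zap_oldest_shadow_page();
    /* Refilling well past the low mark amortizes the reclaim cost across
     * many cr3 loads instead of paying it on every context switch. */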
| 1192 | |||
| 1193 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | ||
| 1194 | { | ||
| 1195 | struct kvm_mmu_page *page; | ||
| 1196 | |||
| 1197 | while (!list_empty(&vcpu->kvm->active_mmu_pages)) { | ||
| 1198 | page = container_of(vcpu->kvm->active_mmu_pages.next, | ||
| 1199 | struct kvm_mmu_page, link); | ||
| 1200 | kvm_mmu_zap_page(vcpu, page); | ||
| 1201 | } | ||
| 1202 | while (!list_empty(&vcpu->free_pages)) { | ||
| 609 | page = list_entry(vcpu->free_pages.next, | 1203 | page = list_entry(vcpu->free_pages.next, |
| 610 | struct kvm_mmu_page, link); | 1204 | struct kvm_mmu_page, link); |
| 611 | list_del(&page->link); | 1205 | list_del(&page->link); |
| 612 | __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT)); | 1206 | __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT)); |
| 613 | page->page_hpa = INVALID_PAGE; | 1207 | page->page_hpa = INVALID_PAGE; |
| 614 | } | 1208 | } |
| 1209 | free_page((unsigned long)vcpu->mmu.pae_root); | ||
| 615 | } | 1210 | } |
| 616 | 1211 | ||
| 617 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | 1212 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) |
| 618 | { | 1213 | { |
| 1214 | struct page *page; | ||
| 619 | int i; | 1215 | int i; |
| 620 | 1216 | ||
| 621 | ASSERT(vcpu); | 1217 | ASSERT(vcpu); |
| 622 | 1218 | ||
| 623 | for (i = 0; i < KVM_NUM_MMU_PAGES; i++) { | 1219 | for (i = 0; i < KVM_NUM_MMU_PAGES; i++) { |
| 624 | struct page *page; | ||
| 625 | struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i]; | 1220 | struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i]; |
| 626 | 1221 | ||
| 627 | INIT_LIST_HEAD(&page_header->link); | 1222 | INIT_LIST_HEAD(&page_header->link); |
| 628 | if ((page = alloc_page(GFP_KVM_MMU)) == NULL) | 1223 | if ((page = alloc_page(GFP_KERNEL)) == NULL) |
| 629 | goto error_1; | 1224 | goto error_1; |
| 630 | page->private = (unsigned long)page_header; | 1225 | page->private = (unsigned long)page_header; |
| 631 | page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; | 1226 | page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; |
| 632 | memset(__va(page_header->page_hpa), 0, PAGE_SIZE); | 1227 | memset(__va(page_header->page_hpa), 0, PAGE_SIZE); |
| 633 | list_add(&page_header->link, &vcpu->free_pages); | 1228 | list_add(&page_header->link, &vcpu->free_pages); |
| 1229 | ++vcpu->kvm->n_free_mmu_pages; | ||
| 634 | } | 1230 | } |
| 1231 | |||
| 1232 | /* | ||
| 1233 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. | ||
| 1234 | * Therefore we need to allocate shadow page tables in the first | ||
| 1235 | * 4GB of memory, which happens to fit the DMA32 zone. | ||
| 1236 | */ | ||
| 1237 | page = alloc_page(GFP_KERNEL | __GFP_DMA32); | ||
| 1238 | if (!page) | ||
| 1239 | goto error_1; | ||
| 1240 | vcpu->mmu.pae_root = page_address(page); | ||
| 1241 | for (i = 0; i < 4; ++i) | ||
| 1242 | vcpu->mmu.pae_root[i] = INVALID_PAGE; | ||
| 1243 | |||
| 635 | return 0; | 1244 | return 0; |
| 636 | 1245 | ||
| 637 | error_1: | 1246 | error_1: |
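The page->private assignment in the allocation loop above is what makes page_header() work: mmu_free_roots() and kvm_mmu_pre_write() both turn a shadow hpa back into its kvm_mmu_page. A sketch of that helper, assuming it matches the one defined earlier in mmu.c (outside this hunk):

    static struct kvm_mmu_page *page_header(hpa_t shadow_page)
    {
        struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);

        return (struct kvm_mmu_page *)page->private;
    }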
| @@ -663,10 +1272,12 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | |||
| 663 | 1272 | ||
| 664 | destroy_kvm_mmu(vcpu); | 1273 | destroy_kvm_mmu(vcpu); |
| 665 | free_mmu_pages(vcpu); | 1274 | free_mmu_pages(vcpu); |
| 1275 | mmu_free_memory_caches(vcpu); | ||
| 666 | } | 1276 | } |
| 667 | 1277 | ||
| 668 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | 1278 | void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot) |
| 669 | { | 1279 | { |
| 1280 | struct kvm *kvm = vcpu->kvm; | ||
| 670 | struct kvm_mmu_page *page; | 1281 | struct kvm_mmu_page *page; |
| 671 | 1282 | ||
| 672 | list_for_each_entry(page, &kvm->active_mmu_pages, link) { | 1283 | list_for_each_entry(page, &kvm->active_mmu_pages, link) { |
| @@ -679,8 +1290,169 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
| 679 | pt = __va(page->page_hpa); | 1290 | pt = __va(page->page_hpa); |
| 680 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | 1291 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) |
| 681 | /* avoid RMW */ | 1292 | /* avoid RMW */ |
| 682 | if (pt[i] & PT_WRITABLE_MASK) | 1293 | if (pt[i] & PT_WRITABLE_MASK) { |
| 1294 | rmap_remove(vcpu, &pt[i]); | ||
| 683 | pt[i] &= ~PT_WRITABLE_MASK; | 1295 | pt[i] &= ~PT_WRITABLE_MASK; |
| 1296 | } | ||
| 1297 | } | ||
| 1298 | } | ||
| 1299 | |||
| 1300 | #ifdef AUDIT | ||
| 1301 | |||
| 1302 | static const char *audit_msg; | ||
| 1303 | |||
| 1304 | static gva_t canonicalize(gva_t gva) | ||
| 1305 | { | ||
| 1306 | #ifdef CONFIG_X86_64 | ||
| 1307 | gva = (long long)(gva << 16) >> 16; | ||
| 1308 | #endif | ||
| 1309 | return gva; | ||
| 1310 | } | ||
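A worked value for the sign extension above:

    /* gva              = 0x0000800000000000   (bit 47 set)
     * gva << 16        = 0x8000000000000000
     * (signed) >> 16   = 0xffff800000000000   -- the canonical form, with
     * bit 47 replicated through bits 63:48. */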
| 684 | 1311 | ||
| 1312 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | ||
| 1313 | gva_t va, int level) | ||
| 1314 | { | ||
| 1315 | u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); | ||
| 1316 | int i; | ||
| 1317 | gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); | ||
| 1318 | |||
| 1319 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { | ||
| 1320 | u64 ent = pt[i]; | ||
| 1321 | |||
| 1322 | if (!(ent & PT_PRESENT_MASK)) | ||
| 1323 | continue; | ||
| 1324 | |||
| 1325 | va = canonicalize(va); | ||
| 1326 | if (level > 1) | ||
| 1327 | audit_mappings_page(vcpu, ent, va, level - 1); | ||
| 1328 | else { | ||
| 1329 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va); | ||
| 1330 | hpa_t hpa = gpa_to_hpa(vcpu, gpa); | ||
| 1331 | |||
| 1332 | if ((ent & PT_PRESENT_MASK) | ||
| 1333 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | ||
| 1334 | printk(KERN_ERR "audit error: (%s) levels %d" | ||
| 1335 | " gva %lx gpa %llx hpa %llx ent %llx\n", | ||
| 1336 | audit_msg, vcpu->mmu.root_level, | ||
| 1337 | va, gpa, hpa, ent); | ||
| 1338 | } | ||
| 685 | } | 1339 | } |
| 686 | } | 1340 | } |
| 1341 | |||
| 1342 | static void audit_mappings(struct kvm_vcpu *vcpu) | ||
| 1343 | { | ||
| 1344 | int i; | ||
| 1345 | |||
| 1346 | if (vcpu->mmu.root_level == 4) | ||
| 1347 | audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); | ||
| 1348 | else | ||
| 1349 | for (i = 0; i < 4; ++i) | ||
| 1350 | if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK) | ||
| 1351 | audit_mappings_page(vcpu, | ||
| 1352 | vcpu->mmu.pae_root[i], | ||
| 1353 | i << 30, | ||
| 1354 | 2); | ||
| 1355 | } | ||
| 1356 | |||
| 1357 | static int count_rmaps(struct kvm_vcpu *vcpu) | ||
| 1358 | { | ||
| 1359 | int nmaps = 0; | ||
| 1360 | int i, j, k; | ||
| 1361 | |||
| 1362 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
| 1363 | struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; | ||
| 1364 | struct kvm_rmap_desc *d; | ||
| 1365 | |||
| 1366 | for (j = 0; j < m->npages; ++j) { | ||
| 1367 | struct page *page = m->phys_mem[j]; | ||
| 1368 | |||
| 1369 | if (!page->private) | ||
| 1370 | continue; | ||
| 1371 | if (!(page->private & 1)) { | ||
| 1372 | ++nmaps; | ||
| 1373 | continue; | ||
| 1374 | } | ||
| 1375 | d = (struct kvm_rmap_desc *)(page->private & ~1ul); | ||
| 1376 | while (d) { | ||
| 1377 | for (k = 0; k < RMAP_EXT; ++k) | ||
| 1378 | if (d->shadow_ptes[k]) | ||
| 1379 | ++nmaps; | ||
| 1380 | else | ||
| 1381 | break; | ||
| 1382 | d = d->more; | ||
| 1383 | } | ||
| 1384 | } | ||
| 1385 | } | ||
| 1386 | return nmaps; | ||
| 1387 | } | ||
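count_rmaps() decodes the reverse-map word inline; the encoding it relies on (inferred from this walk, since the rmap helpers sit outside these hunks) is a tagged pointer in page->private. A sketch, with the helper name ours:

    /* 0             : gfn has no shadow ptes
     * low bit clear : a single u64 * pointing at the one spte
     * low bit set   : a kvm_rmap_desc chain, pointer in the upper bits */
    static u64 *rmap_first_spte(struct page *page)
    {
        unsigned long rmap = page->private;

        if (!rmap)
            return NULL;
        if (!(rmap & 1))
            return (u64 *)rmap;
        return ((struct kvm_rmap_desc *)(rmap & ~1ul))->shadow_ptes[0];
    }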
| 1388 | |||
| 1389 | static int count_writable_mappings(struct kvm_vcpu *vcpu) | ||
| 1390 | { | ||
| 1391 | int nmaps = 0; | ||
| 1392 | struct kvm_mmu_page *page; | ||
| 1393 | int i; | ||
| 1394 | |||
| 1395 | list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { | ||
| 1396 | u64 *pt = __va(page->page_hpa); | ||
| 1397 | |||
| 1398 | if (page->role.level != PT_PAGE_TABLE_LEVEL) | ||
| 1399 | continue; | ||
| 1400 | |||
| 1401 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
| 1402 | u64 ent = pt[i]; | ||
| 1403 | |||
| 1404 | if (!(ent & PT_PRESENT_MASK)) | ||
| 1405 | continue; | ||
| 1406 | if (!(ent & PT_WRITABLE_MASK)) | ||
| 1407 | continue; | ||
| 1408 | ++nmaps; | ||
| 1409 | } | ||
| 1410 | } | ||
| 1411 | return nmaps; | ||
| 1412 | } | ||
| 1413 | |||
| 1414 | static void audit_rmap(struct kvm_vcpu *vcpu) | ||
| 1415 | { | ||
| 1416 | int n_rmap = count_rmaps(vcpu); | ||
| 1417 | int n_actual = count_writable_mappings(vcpu); | ||
| 1418 | |||
| 1419 | if (n_rmap != n_actual) | ||
| 1420 | printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", | ||
| 1421 | __FUNCTION__, audit_msg, n_rmap, n_actual); | ||
| 1422 | } | ||
| 1423 | |||
| 1424 | static void audit_write_protection(struct kvm_vcpu *vcpu) | ||
| 1425 | { | ||
| 1426 | struct kvm_mmu_page *page; | ||
| 1427 | |||
| 1428 | list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { | ||
| 1429 | hfn_t hfn; | ||
| 1430 | struct page *pg; | ||
| 1431 | |||
| 1432 | if (page->role.metaphysical) | ||
| 1433 | continue; | ||
| 1434 | |||
| 1435 | hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT) | ||
| 1436 | >> PAGE_SHIFT; | ||
| 1437 | pg = pfn_to_page(hfn); | ||
| 1438 | if (pg->private) | ||
| 1439 | printk(KERN_ERR "%s: (%s) shadow page has writable" | ||
| 1440 | " mappings: gfn %lx role %x\n", | ||
| 1441 | __FUNCTION__, audit_msg, page->gfn, | ||
| 1442 | page->role.word); | ||
| 1443 | } | ||
| 1444 | } | ||
| 1445 | |||
| 1446 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) | ||
| 1447 | { | ||
| 1448 | int olddbg = dbg; | ||
| 1449 | |||
| 1450 | dbg = 0; | ||
| 1451 | audit_msg = msg; | ||
| 1452 | audit_rmap(vcpu); | ||
| 1453 | audit_write_protection(vcpu); | ||
| 1454 | audit_mappings(vcpu); | ||
| 1455 | dbg = olddbg; | ||
| 1456 | } | ||
| 1457 | |||
| 1458 | #endif | ||
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 09bb9b4ed12d..2dbf4307ed9e 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h | |||
| @@ -32,6 +32,11 @@ | |||
| 32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | 32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
| 33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | 33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) |
| 34 | #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK | 34 | #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK |
| 35 | #ifdef CONFIG_X86_64 | ||
| 36 | #define PT_MAX_FULL_LEVELS 4 | ||
| 37 | #else | ||
| 38 | #define PT_MAX_FULL_LEVELS 2 | ||
| 39 | #endif | ||
| 35 | #elif PTTYPE == 32 | 40 | #elif PTTYPE == 32 |
| 36 | #define pt_element_t u32 | 41 | #define pt_element_t u32 |
| 37 | #define guest_walker guest_walker32 | 42 | #define guest_walker guest_walker32 |
| @@ -42,6 +47,7 @@ | |||
| 42 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | 47 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
| 43 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | 48 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) |
| 44 | #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK | 49 | #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK |
| 50 | #define PT_MAX_FULL_LEVELS 2 | ||
| 45 | #else | 51 | #else |
| 46 | #error Invalid PTTYPE value | 52 | #error Invalid PTTYPE value |
| 47 | #endif | 53 | #endif |
| @@ -52,93 +58,126 @@ | |||
| 52 | */ | 58 | */ |
| 53 | struct guest_walker { | 59 | struct guest_walker { |
| 54 | int level; | 60 | int level; |
| 61 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; | ||
| 55 | pt_element_t *table; | 62 | pt_element_t *table; |
| 63 | pt_element_t *ptep; | ||
| 56 | pt_element_t inherited_ar; | 64 | pt_element_t inherited_ar; |
| 65 | gfn_t gfn; | ||
| 57 | }; | 66 | }; |
| 58 | 67 | ||
| 59 | static void FNAME(init_walker)(struct guest_walker *walker, | 68 | /* |
| 60 | struct kvm_vcpu *vcpu) | 69 | * Fetch a guest pte for a guest virtual address |
| 70 | */ | ||
| 71 | static void FNAME(walk_addr)(struct guest_walker *walker, | ||
| 72 | struct kvm_vcpu *vcpu, gva_t addr) | ||
| 61 | { | 73 | { |
| 62 | hpa_t hpa; | 74 | hpa_t hpa; |
| 63 | struct kvm_memory_slot *slot; | 75 | struct kvm_memory_slot *slot; |
| 76 | pt_element_t *ptep; | ||
| 77 | pt_element_t root; | ||
| 78 | gfn_t table_gfn; | ||
| 64 | 79 | ||
| 80 | pgprintk("%s: addr %lx\n", __FUNCTION__, addr); | ||
| 65 | walker->level = vcpu->mmu.root_level; | 81 | walker->level = vcpu->mmu.root_level; |
| 66 | slot = gfn_to_memslot(vcpu->kvm, | 82 | walker->table = NULL; |
| 67 | (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); | 83 | root = vcpu->cr3; |
| 68 | hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK); | 84 | #if PTTYPE == 64 |
| 85 | if (!is_long_mode(vcpu)) { | ||
| 86 | walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3]; | ||
| 87 | root = *walker->ptep; | ||
| 88 | if (!(root & PT_PRESENT_MASK)) | ||
| 89 | return; | ||
| 90 | --walker->level; | ||
| 91 | } | ||
| 92 | #endif | ||
| 93 | table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
| 94 | walker->table_gfn[walker->level - 1] = table_gfn; | ||
| 95 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
| 96 | walker->level - 1, table_gfn); | ||
| 97 | slot = gfn_to_memslot(vcpu->kvm, table_gfn); | ||
| 98 | hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); | ||
| 69 | walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); | 99 | walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); |
| 70 | 100 | ||
| 71 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || | 101 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || |
| 72 | (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0); | 102 | (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0); |
| 73 | 103 | ||
| 74 | walker->table = (pt_element_t *)( (unsigned long)walker->table | | ||
| 75 | (unsigned long)(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) ); | ||
| 76 | walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; | 104 | walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; |
| 105 | |||
| 106 | for (;;) { | ||
| 107 | int index = PT_INDEX(addr, walker->level); | ||
| 108 | hpa_t paddr; | ||
| 109 | |||
| 110 | ptep = &walker->table[index]; | ||
| 111 | ASSERT(((unsigned long)walker->table & PAGE_MASK) == | ||
| 112 | ((unsigned long)ptep & PAGE_MASK)); | ||
| 113 | |||
| 114 | if (is_present_pte(*ptep) && !(*ptep & PT_ACCESSED_MASK)) | ||
| 115 | *ptep |= PT_ACCESSED_MASK; | ||
| 116 | |||
| 117 | if (!is_present_pte(*ptep)) | ||
| 118 | break; | ||
| 119 | |||
| 120 | if (walker->level == PT_PAGE_TABLE_LEVEL) { | ||
| 121 | walker->gfn = (*ptep & PT_BASE_ADDR_MASK) | ||
| 122 | >> PAGE_SHIFT; | ||
| 123 | break; | ||
| 124 | } | ||
| 125 | |||
| 126 | if (walker->level == PT_DIRECTORY_LEVEL | ||
| 127 | && (*ptep & PT_PAGE_SIZE_MASK) | ||
| 128 | && (PTTYPE == 64 || is_pse(vcpu))) { | ||
| 129 | walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK) | ||
| 130 | >> PAGE_SHIFT; | ||
| 131 | walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); | ||
| 132 | break; | ||
| 133 | } | ||
| 134 | |||
| 135 | if (walker->level != 3 || is_long_mode(vcpu)) | ||
| 136 | walker->inherited_ar &= walker->table[index]; | ||
| 137 | table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
| 138 | paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK); | ||
| 139 | kunmap_atomic(walker->table, KM_USER0); | ||
| 140 | walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), | ||
| 141 | KM_USER0); | ||
| 142 | --walker->level; | ||
| 143 | walker->table_gfn[walker->level - 1] = table_gfn; | ||
| 144 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
| 145 | walker->level - 1, table_gfn); | ||
| 146 | } | ||
| 147 | walker->ptep = ptep; | ||
| 148 | pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep); | ||
| 77 | } | 149 | } |
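In the 4MB-page branch above, the page-table-level index bits of the address are folded into the frame number, so walker->gfn names the exact 4KB frame inside the large page. A worked example for PTTYPE == 32 with PSE enabled (values hypothetical; assumes the usual 10-bit PT_INDEX mask):

    /* pde maps a 4MB frame based at 0x00c00000; addr = 0x00c05123 */
    gfn_t base = 0x00c00000 >> 12;             /* 0xc00 */
    gfn_t idx  = (0x00c05123 >> 12) & 0x3ff;   /* PT_INDEX(...) = 0x005 */
    gfn_t gfn  = base + idx;                   /* 0xc05 */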
| 78 | 150 | ||
| 79 | static void FNAME(release_walker)(struct guest_walker *walker) | 151 | static void FNAME(release_walker)(struct guest_walker *walker) |
| 80 | { | 152 | { |
| 81 | kunmap_atomic(walker->table, KM_USER0); | 153 | if (walker->table) |
| 154 | kunmap_atomic(walker->table, KM_USER0); | ||
| 82 | } | 155 | } |
| 83 | 156 | ||
| 84 | static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, | 157 | static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, |
| 85 | u64 *shadow_pte, u64 access_bits) | 158 | u64 *shadow_pte, u64 access_bits, gfn_t gfn) |
| 86 | { | 159 | { |
| 87 | ASSERT(*shadow_pte == 0); | 160 | ASSERT(*shadow_pte == 0); |
| 88 | access_bits &= guest_pte; | 161 | access_bits &= guest_pte; |
| 89 | *shadow_pte = (guest_pte & PT_PTE_COPY_MASK); | 162 | *shadow_pte = (guest_pte & PT_PTE_COPY_MASK); |
| 90 | set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, | 163 | set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, |
| 91 | guest_pte & PT_DIRTY_MASK, access_bits); | 164 | guest_pte & PT_DIRTY_MASK, access_bits, gfn); |
| 92 | } | 165 | } |
| 93 | 166 | ||
| 94 | static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, | 167 | static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, |
| 95 | u64 *shadow_pte, u64 access_bits, | 168 | u64 *shadow_pte, u64 access_bits, gfn_t gfn) |
| 96 | int index) | ||
| 97 | { | 169 | { |
| 98 | gpa_t gaddr; | 170 | gpa_t gaddr; |
| 99 | 171 | ||
| 100 | ASSERT(*shadow_pte == 0); | 172 | ASSERT(*shadow_pte == 0); |
| 101 | access_bits &= guest_pde; | 173 | access_bits &= guest_pde; |
| 102 | gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index; | 174 | gaddr = (gpa_t)gfn << PAGE_SHIFT; |
| 103 | if (PTTYPE == 32 && is_cpuid_PSE36()) | 175 | if (PTTYPE == 32 && is_cpuid_PSE36()) |
| 104 | gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) << | 176 | gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) << |
| 105 | (32 - PT32_DIR_PSE36_SHIFT); | 177 | (32 - PT32_DIR_PSE36_SHIFT); |
| 106 | *shadow_pte = guest_pde & PT_PTE_COPY_MASK; | 178 | *shadow_pte = guest_pde & PT_PTE_COPY_MASK; |
| 107 | set_pte_common(vcpu, shadow_pte, gaddr, | 179 | set_pte_common(vcpu, shadow_pte, gaddr, |
| 108 | guest_pde & PT_DIRTY_MASK, access_bits); | 180 | guest_pde & PT_DIRTY_MASK, access_bits, gfn); |
| 109 | } | ||
| 110 | |||
| 111 | /* | ||
| 112 | * Fetch a guest pte from a specific level in the paging hierarchy. | ||
| 113 | */ | ||
| 114 | static pt_element_t *FNAME(fetch_guest)(struct kvm_vcpu *vcpu, | ||
| 115 | struct guest_walker *walker, | ||
| 116 | int level, | ||
| 117 | gva_t addr) | ||
| 118 | { | ||
| 119 | |||
| 120 | ASSERT(level > 0 && level <= walker->level); | ||
| 121 | |||
| 122 | for (;;) { | ||
| 123 | int index = PT_INDEX(addr, walker->level); | ||
| 124 | hpa_t paddr; | ||
| 125 | |||
| 126 | ASSERT(((unsigned long)walker->table & PAGE_MASK) == | ||
| 127 | ((unsigned long)&walker->table[index] & PAGE_MASK)); | ||
| 128 | if (level == walker->level || | ||
| 129 | !is_present_pte(walker->table[index]) || | ||
| 130 | (walker->level == PT_DIRECTORY_LEVEL && | ||
| 131 | (walker->table[index] & PT_PAGE_SIZE_MASK) && | ||
| 132 | (PTTYPE == 64 || is_pse(vcpu)))) | ||
| 133 | return &walker->table[index]; | ||
| 134 | if (walker->level != 3 || is_long_mode(vcpu)) | ||
| 135 | walker->inherited_ar &= walker->table[index]; | ||
| 136 | paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK); | ||
| 137 | kunmap_atomic(walker->table, KM_USER0); | ||
| 138 | walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), | ||
| 139 | KM_USER0); | ||
| 140 | --walker->level; | ||
| 141 | } | ||
| 142 | } | 181 | } |
| 143 | 182 | ||
| 144 | /* | 183 | /* |
| @@ -150,15 +189,26 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 150 | hpa_t shadow_addr; | 189 | hpa_t shadow_addr; |
| 151 | int level; | 190 | int level; |
| 152 | u64 *prev_shadow_ent = NULL; | 191 | u64 *prev_shadow_ent = NULL; |
| 192 | pt_element_t *guest_ent = walker->ptep; | ||
| 193 | |||
| 194 | if (!is_present_pte(*guest_ent)) | ||
| 195 | return NULL; | ||
| 153 | 196 | ||
| 154 | shadow_addr = vcpu->mmu.root_hpa; | 197 | shadow_addr = vcpu->mmu.root_hpa; |
| 155 | level = vcpu->mmu.shadow_root_level; | 198 | level = vcpu->mmu.shadow_root_level; |
| 199 | if (level == PT32E_ROOT_LEVEL) { | ||
| 200 | shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3]; | ||
| 201 | shadow_addr &= PT64_BASE_ADDR_MASK; | ||
| 202 | --level; | ||
| 203 | } | ||
| 156 | 204 | ||
| 157 | for (; ; level--) { | 205 | for (; ; level--) { |
| 158 | u32 index = SHADOW_PT_INDEX(addr, level); | 206 | u32 index = SHADOW_PT_INDEX(addr, level); |
| 159 | u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; | 207 | u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; |
| 160 | pt_element_t *guest_ent; | 208 | struct kvm_mmu_page *shadow_page; |
| 161 | u64 shadow_pte; | 209 | u64 shadow_pte; |
| 210 | int metaphysical; | ||
| 211 | gfn_t table_gfn; | ||
| 162 | 212 | ||
| 163 | if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { | 213 | if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { |
| 164 | if (level == PT_PAGE_TABLE_LEVEL) | 214 | if (level == PT_PAGE_TABLE_LEVEL) |
| @@ -168,21 +218,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 168 | continue; | 218 | continue; |
| 169 | } | 219 | } |
| 170 | 220 | ||
| 171 | if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) { | ||
| 172 | ASSERT(level == PT32E_ROOT_LEVEL); | ||
| 173 | guest_ent = FNAME(fetch_guest)(vcpu, walker, | ||
| 174 | PT32_ROOT_LEVEL, addr); | ||
| 175 | } else | ||
| 176 | guest_ent = FNAME(fetch_guest)(vcpu, walker, | ||
| 177 | level, addr); | ||
| 178 | |||
| 179 | if (!is_present_pte(*guest_ent)) | ||
| 180 | return NULL; | ||
| 181 | |||
| 182 | /* Don't set accessed bit on PAE PDPTRs */ | ||
| 183 | if (vcpu->mmu.root_level != 3 || walker->level != 3) | ||
| 184 | *guest_ent |= PT_ACCESSED_MASK; | ||
| 185 | |||
| 186 | if (level == PT_PAGE_TABLE_LEVEL) { | 221 | if (level == PT_PAGE_TABLE_LEVEL) { |
| 187 | 222 | ||
| 188 | if (walker->level == PT_DIRECTORY_LEVEL) { | 223 | if (walker->level == PT_DIRECTORY_LEVEL) { |
| @@ -190,21 +225,30 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 190 | *prev_shadow_ent |= PT_SHADOW_PS_MARK; | 225 | *prev_shadow_ent |= PT_SHADOW_PS_MARK; |
| 191 | FNAME(set_pde)(vcpu, *guest_ent, shadow_ent, | 226 | FNAME(set_pde)(vcpu, *guest_ent, shadow_ent, |
| 192 | walker->inherited_ar, | 227 | walker->inherited_ar, |
| 193 | PT_INDEX(addr, PT_PAGE_TABLE_LEVEL)); | 228 | walker->gfn); |
| 194 | } else { | 229 | } else { |
| 195 | ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); | 230 | ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); |
| 196 | FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, walker->inherited_ar); | 231 | FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, |
| 232 | walker->inherited_ar, | ||
| 233 | walker->gfn); | ||
| 197 | } | 234 | } |
| 198 | return shadow_ent; | 235 | return shadow_ent; |
| 199 | } | 236 | } |
| 200 | 237 | ||
| 201 | shadow_addr = kvm_mmu_alloc_page(vcpu, shadow_ent); | 238 | if (level - 1 == PT_PAGE_TABLE_LEVEL |
| 202 | if (!VALID_PAGE(shadow_addr)) | 239 | && walker->level == PT_DIRECTORY_LEVEL) { |
| 203 | return ERR_PTR(-ENOMEM); | 240 | metaphysical = 1; |
| 204 | shadow_pte = shadow_addr | PT_PRESENT_MASK; | 241 | table_gfn = (*guest_ent & PT_BASE_ADDR_MASK) |
| 205 | if (vcpu->mmu.root_level > 3 || level != 3) | 242 | >> PAGE_SHIFT; |
| 206 | shadow_pte |= PT_ACCESSED_MASK | 243 | } else { |
| 207 | | PT_WRITABLE_MASK | PT_USER_MASK; | 244 | metaphysical = 0; |
| 245 | table_gfn = walker->table_gfn[level - 2]; | ||
| 246 | } | ||
| 247 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | ||
| 248 | metaphysical, shadow_ent); | ||
| 249 | shadow_addr = shadow_page->page_hpa; | ||
| 250 | shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | ||
| 251 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
| 208 | *shadow_ent = shadow_pte; | 252 | *shadow_ent = shadow_pte; |
| 209 | prev_shadow_ent = shadow_ent; | 253 | prev_shadow_ent = shadow_ent; |
| 210 | } | 254 | } |
| @@ -221,11 +265,13 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, | |||
| 221 | u64 *shadow_ent, | 265 | u64 *shadow_ent, |
| 222 | struct guest_walker *walker, | 266 | struct guest_walker *walker, |
| 223 | gva_t addr, | 267 | gva_t addr, |
| 224 | int user) | 268 | int user, |
| 269 | int *write_pt) | ||
| 225 | { | 270 | { |
| 226 | pt_element_t *guest_ent; | 271 | pt_element_t *guest_ent; |
| 227 | int writable_shadow; | 272 | int writable_shadow; |
| 228 | gfn_t gfn; | 273 | gfn_t gfn; |
| 274 | struct kvm_mmu_page *page; | ||
| 229 | 275 | ||
| 230 | if (is_writeble_pte(*shadow_ent)) | 276 | if (is_writeble_pte(*shadow_ent)) |
| 231 | return 0; | 277 | return 0; |
| @@ -250,17 +296,35 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, | |||
| 250 | *shadow_ent &= ~PT_USER_MASK; | 296 | *shadow_ent &= ~PT_USER_MASK; |
| 251 | } | 297 | } |
| 252 | 298 | ||
| 253 | guest_ent = FNAME(fetch_guest)(vcpu, walker, PT_PAGE_TABLE_LEVEL, addr); | 299 | guest_ent = walker->ptep; |
| 254 | 300 | ||
| 255 | if (!is_present_pte(*guest_ent)) { | 301 | if (!is_present_pte(*guest_ent)) { |
| 256 | *shadow_ent = 0; | 302 | *shadow_ent = 0; |
| 257 | return 0; | 303 | return 0; |
| 258 | } | 304 | } |
| 259 | 305 | ||
| 260 | gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 306 | gfn = walker->gfn; |
| 307 | |||
| 308 | if (user) { | ||
| 309 | /* | ||
| 310 | * Usermode page faults won't be for page table updates. | ||
| 311 | */ | ||
| 312 | while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { | ||
| 313 | pgprintk("%s: zap %lx %x\n", | ||
| 314 | __FUNCTION__, gfn, page->role.word); | ||
| 315 | kvm_mmu_zap_page(vcpu, page); | ||
| 316 | } | ||
| 317 | } else if (kvm_mmu_lookup_page(vcpu, gfn)) { | ||
| 318 | pgprintk("%s: found shadow page for %lx, marking ro\n", | ||
| 319 | __FUNCTION__, gfn); | ||
| 320 | *guest_ent |= PT_DIRTY_MASK; | ||
| 321 | *write_pt = 1; | ||
| 322 | return 0; | ||
| 323 | } | ||
| 261 | mark_page_dirty(vcpu->kvm, gfn); | 324 | mark_page_dirty(vcpu->kvm, gfn); |
| 262 | *shadow_ent |= PT_WRITABLE_MASK; | 325 | *shadow_ent |= PT_WRITABLE_MASK; |
| 263 | *guest_ent |= PT_DIRTY_MASK; | 326 | *guest_ent |= PT_DIRTY_MASK; |
| 327 | rmap_add(vcpu, shadow_ent); | ||
| 264 | 328 | ||
| 265 | return 1; | 329 | return 1; |
| 266 | } | 330 | } |
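The two kvm_mmu_lookup_page() branches above implement one policy; restated as a hypothetical classifier (the enum and names are ours):

    enum pt_write_action { PTW_MAKE_WRITABLE, PTW_ZAP_THEN_WRITE, PTW_EMULATE };

    static enum pt_write_action classify(int user_fault, int gfn_shadowed)
    {
        if (!gfn_shadowed)
            return PTW_MAKE_WRITABLE;   /* ordinary write fault          */
        if (user_fault)
            return PTW_ZAP_THEN_WRITE;  /* unshadow, then make writable  */
        return PTW_EMULATE;             /* keep ro; caller sets write_pt */
    }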
| @@ -276,7 +340,8 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, | |||
| 276 | * - normal guest page fault due to the guest pte marked not present, not | 340 | * - normal guest page fault due to the guest pte marked not present, not |
| 277 | * writable, or not executable | 341 | * writable, or not executable |
| 278 | * | 342 | * |
| 279 | * Returns: 1 if we need to emulate the instruction, 0 otherwise | 343 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or |
| 344 | * a negative value on error. | ||
| 280 | */ | 345 | */ |
| 281 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | 346 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, |
| 282 | u32 error_code) | 347 | u32 error_code) |
| @@ -287,39 +352,47 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 287 | struct guest_walker walker; | 352 | struct guest_walker walker; |
| 288 | u64 *shadow_pte; | 353 | u64 *shadow_pte; |
| 289 | int fixed; | 354 | int fixed; |
| 355 | int write_pt = 0; | ||
| 356 | int r; | ||
| 357 | |||
| 358 | pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); | ||
| 359 | kvm_mmu_audit(vcpu, "pre page fault"); | ||
| 360 | |||
| 361 | r = mmu_topup_memory_caches(vcpu); | ||
| 362 | if (r) | ||
| 363 | return r; | ||
| 290 | 364 | ||
| 291 | /* | 365 | /* |
| 292 | * Look up the shadow pte for the faulting address. | 366 | * Look up the shadow pte for the faulting address. |
| 293 | */ | 367 | */ |
| 294 | for (;;) { | 368 | FNAME(walk_addr)(&walker, vcpu, addr); |
| 295 | FNAME(init_walker)(&walker, vcpu); | 369 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker); |
| 296 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker); | ||
| 297 | if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */ | ||
| 298 | nonpaging_flush(vcpu); | ||
| 299 | FNAME(release_walker)(&walker); | ||
| 300 | continue; | ||
| 301 | } | ||
| 302 | break; | ||
| 303 | } | ||
| 304 | 370 | ||
| 305 | /* | 371 | /* |
| 306 | * The page is not mapped by the guest. Let the guest handle it. | 372 | * The page is not mapped by the guest. Let the guest handle it. |
| 307 | */ | 373 | */ |
| 308 | if (!shadow_pte) { | 374 | if (!shadow_pte) { |
| 375 | pgprintk("%s: not mapped\n", __FUNCTION__); | ||
| 309 | inject_page_fault(vcpu, addr, error_code); | 376 | inject_page_fault(vcpu, addr, error_code); |
| 310 | FNAME(release_walker)(&walker); | 377 | FNAME(release_walker)(&walker); |
| 311 | return 0; | 378 | return 0; |
| 312 | } | 379 | } |
| 313 | 380 | ||
| 381 | pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__, | ||
| 382 | shadow_pte, *shadow_pte); | ||
| 383 | |||
| 314 | /* | 384 | /* |
| 315 | * Update the shadow pte. | 385 | * Update the shadow pte. |
| 316 | */ | 386 | */ |
| 317 | if (write_fault) | 387 | if (write_fault) |
| 318 | fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, | 388 | fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, |
| 319 | user_fault); | 389 | user_fault, &write_pt); |
| 320 | else | 390 | else |
| 321 | fixed = fix_read_pf(shadow_pte); | 391 | fixed = fix_read_pf(shadow_pte); |
| 322 | 392 | ||
| 393 | pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__, | ||
| 394 | shadow_pte, *shadow_pte); | ||
| 395 | |||
| 323 | FNAME(release_walker)(&walker); | 396 | FNAME(release_walker)(&walker); |
| 324 | 397 | ||
| 325 | /* | 398 | /* |
| @@ -331,20 +404,23 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
| 331 | pgprintk("%s: io work, no access\n", __FUNCTION__); | 404 | pgprintk("%s: io work, no access\n", __FUNCTION__); |
| 332 | inject_page_fault(vcpu, addr, | 405 | inject_page_fault(vcpu, addr, |
| 333 | error_code | PFERR_PRESENT_MASK); | 406 | error_code | PFERR_PRESENT_MASK); |
| 407 | kvm_mmu_audit(vcpu, "post page fault (io)"); | ||
| 334 | return 0; | 408 | return 0; |
| 335 | } | 409 | } |
| 336 | 410 | ||
| 337 | /* | 411 | /* |
| 338 | * pte not present, guest page fault. | 412 | * pte not present, guest page fault. |
| 339 | */ | 413 | */ |
| 340 | if (pte_present && !fixed) { | 414 | if (pte_present && !fixed && !write_pt) { |
| 341 | inject_page_fault(vcpu, addr, error_code); | 415 | inject_page_fault(vcpu, addr, error_code); |
| 416 | kvm_mmu_audit(vcpu, "post page fault (guest)"); | ||
| 342 | return 0; | 417 | return 0; |
| 343 | } | 418 | } |
| 344 | 419 | ||
| 345 | ++kvm_stat.pf_fixed; | 420 | ++kvm_stat.pf_fixed; |
| 421 | kvm_mmu_audit(vcpu, "post page fault (fixed)"); | ||
| 346 | 422 | ||
| 347 | return 0; | 423 | return write_pt; |
| 348 | } | 424 | } |
| 349 | 425 | ||
| 350 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | 426 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) |
| @@ -353,9 +429,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | |||
| 353 | pt_element_t guest_pte; | 429 | pt_element_t guest_pte; |
| 354 | gpa_t gpa; | 430 | gpa_t gpa; |
| 355 | 431 | ||
| 356 | FNAME(init_walker)(&walker, vcpu); | 432 | FNAME(walk_addr)(&walker, vcpu, vaddr); |
| 357 | guest_pte = *FNAME(fetch_guest)(vcpu, &walker, PT_PAGE_TABLE_LEVEL, | 433 | guest_pte = *walker.ptep; |
| 358 | vaddr); | ||
| 359 | FNAME(release_walker)(&walker); | 434 | FNAME(release_walker)(&walker); |
| 360 | 435 | ||
| 361 | if (!is_present_pte(guest_pte)) | 436 | if (!is_present_pte(guest_pte)) |
| @@ -389,3 +464,4 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | |||
| 389 | #undef PT_PTE_COPY_MASK | 464 | #undef PT_PTE_COPY_MASK |
| 390 | #undef PT_NON_PTE_COPY_MASK | 465 | #undef PT_NON_PTE_COPY_MASK |
| 391 | #undef PT_DIR_BASE_ADDR_MASK | 466 | #undef PT_DIR_BASE_ADDR_MASK |
| 467 | #undef PT_MAX_FULL_LEVELS | ||
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index fa0428735717..ccc06b1b91b5 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c | |||
| @@ -235,6 +235,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
| 235 | 235 | ||
| 236 | vcpu->rip = vcpu->svm->vmcb->save.rip = vcpu->svm->next_rip; | 236 | vcpu->rip = vcpu->svm->vmcb->save.rip = vcpu->svm->next_rip; |
| 237 | vcpu->svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; | 237 | vcpu->svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; |
| 238 | |||
| 239 | vcpu->interrupt_window_open = 1; | ||
| 238 | } | 240 | } |
| 239 | 241 | ||
| 240 | static int has_svm(void) | 242 | static int has_svm(void) |
| @@ -495,7 +497,6 @@ static void init_vmcb(struct vmcb *vmcb) | |||
| 495 | /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */ | 497 | /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */ |
| 496 | (1ULL << INTERCEPT_CPUID) | | 498 | (1ULL << INTERCEPT_CPUID) | |
| 497 | (1ULL << INTERCEPT_HLT) | | 499 | (1ULL << INTERCEPT_HLT) | |
| 498 | (1ULL << INTERCEPT_INVLPG) | | ||
| 499 | (1ULL << INTERCEPT_INVLPGA) | | 500 | (1ULL << INTERCEPT_INVLPGA) | |
| 500 | (1ULL << INTERCEPT_IOIO_PROT) | | 501 | (1ULL << INTERCEPT_IOIO_PROT) | |
| 501 | (1ULL << INTERCEPT_MSR_PROT) | | 502 | (1ULL << INTERCEPT_MSR_PROT) | |
| @@ -700,6 +701,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | |||
| 700 | vcpu->svm->vmcb->save.gdtr.base = dt->base ; | 701 | vcpu->svm->vmcb->save.gdtr.base = dt->base ; |
| 701 | } | 702 | } |
| 702 | 703 | ||
| 704 | static void svm_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu) | ||
| 705 | { | ||
| 706 | } | ||
| 707 | |||
| 703 | static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | 708 | static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
| 704 | { | 709 | { |
| 705 | #ifdef CONFIG_X86_64 | 710 | #ifdef CONFIG_X86_64 |
| @@ -847,6 +852,7 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 847 | u64 fault_address; | 852 | u64 fault_address; |
| 848 | u32 error_code; | 853 | u32 error_code; |
| 849 | enum emulation_result er; | 854 | enum emulation_result er; |
| 855 | int r; | ||
| 850 | 856 | ||
| 851 | if (is_external_interrupt(exit_int_info)) | 857 | if (is_external_interrupt(exit_int_info)) |
| 852 | push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); | 858 | push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); |
| @@ -855,7 +861,12 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 855 | 861 | ||
| 856 | fault_address = vcpu->svm->vmcb->control.exit_info_2; | 862 | fault_address = vcpu->svm->vmcb->control.exit_info_2; |
| 857 | error_code = vcpu->svm->vmcb->control.exit_info_1; | 863 | error_code = vcpu->svm->vmcb->control.exit_info_1; |
| 858 | if (!vcpu->mmu.page_fault(vcpu, fault_address, error_code)) { | 864 | r = kvm_mmu_page_fault(vcpu, fault_address, error_code); |
| 865 | if (r < 0) { | ||
| 866 | spin_unlock(&vcpu->kvm->lock); | ||
| 867 | return r; | ||
| 868 | } | ||
| 869 | if (!r) { | ||
| 859 | spin_unlock(&vcpu->kvm->lock); | 870 | spin_unlock(&vcpu->kvm->lock); |
| 860 | return 1; | 871 | return 1; |
| 861 | } | 872 | } |
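The r handling above tracks the return convention spelled out in paging_tmpl.h; summarized here (our paraphrase of the two visible callers):

    /* r < 0  : resource error (e.g. -ENOMEM) -- unlock and propagate
     * r == 0 : shadow fault fixed -- resume the guest immediately
     * r == 1 : write hit a shadowed page table -- fall through to the
     *          instruction emulator */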
| @@ -1031,10 +1042,11 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1031 | { | 1042 | { |
| 1032 | vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; | 1043 | vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; |
| 1033 | skip_emulated_instruction(vcpu); | 1044 | skip_emulated_instruction(vcpu); |
| 1034 | if (vcpu->irq_summary && (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)) | 1045 | if (vcpu->irq_summary) |
| 1035 | return 1; | 1046 | return 1; |
| 1036 | 1047 | ||
| 1037 | kvm_run->exit_reason = KVM_EXIT_HLT; | 1048 | kvm_run->exit_reason = KVM_EXIT_HLT; |
| 1049 | ++kvm_stat.halt_exits; | ||
| 1038 | return 0; | 1050 | return 0; |
| 1039 | } | 1051 | } |
| 1040 | 1052 | ||
| @@ -1186,6 +1198,23 @@ static int msr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1186 | return rdmsr_interception(vcpu, kvm_run); | 1198 | return rdmsr_interception(vcpu, kvm_run); |
| 1187 | } | 1199 | } |
| 1188 | 1200 | ||
| 1201 | static int interrupt_window_interception(struct kvm_vcpu *vcpu, | ||
| 1202 | struct kvm_run *kvm_run) | ||
| 1203 | { | ||
| 1204 | /* | ||
| 1205 | * If user space is waiting to inject an interrupt, exit as soon | ||
| 1206 | * as possible. | ||
| 1207 | */ | ||
| 1208 | if (kvm_run->request_interrupt_window && | ||
| 1209 | !vcpu->irq_summary) { | ||
| 1210 | ++kvm_stat.irq_window_exits; | ||
| 1211 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | ||
| 1212 | return 0; | ||
| 1213 | } | ||
| 1214 | |||
| 1215 | return 1; | ||
| 1216 | } | ||
| 1217 | |||
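interrupt_window_interception() is what makes the new request_interrupt_window protocol work end to end: userspace sets the flag before KVM_RUN, and the kernel exits with KVM_EXIT_IRQ_WINDOW_OPEN the moment the guest can accept an interrupt. A hedged userspace sketch of that handshake; the kvm_run fields are the ones introduced here, but vcpu_fd and the helpers are hypothetical VMM scaffolding, and the exact KVM_RUN calling convention varied in early KVM, so treat the ioctl line as schematic:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Hypothetical VMM helpers, for illustration only. */
    extern int have_pending_irq(void);
    extern void inject_irq(int vcpu_fd);            /* e.g. via KVM_INTERRUPT */
    extern void handle_other_exit(struct kvm_run *run);

    void vcpu_loop(int vcpu_fd, struct kvm_run *run)
    {
            for (;;) {
                    /* Ask for an exit as soon as injection becomes possible. */
                    run->request_interrupt_window = have_pending_irq();

                    if (ioctl(vcpu_fd, KVM_RUN, run) < 0)
                            break;

                    if (run->exit_reason == KVM_EXIT_IRQ_WINDOW_OPEN ||
                        run->ready_for_interrupt_injection)
                            inject_irq(vcpu_fd);    /* window is open now */
                    else
                            handle_other_exit(run);
            }
    }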
| 1189 | static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, | 1218 | static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, |
| 1190 | struct kvm_run *kvm_run) = { | 1219 | struct kvm_run *kvm_run) = { |
| 1191 | [SVM_EXIT_READ_CR0] = emulate_on_interception, | 1220 | [SVM_EXIT_READ_CR0] = emulate_on_interception, |
| @@ -1210,6 +1239,7 @@ static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, | |||
| 1210 | [SVM_EXIT_NMI] = nop_on_interception, | 1239 | [SVM_EXIT_NMI] = nop_on_interception, |
| 1211 | [SVM_EXIT_SMI] = nop_on_interception, | 1240 | [SVM_EXIT_SMI] = nop_on_interception, |
| 1212 | [SVM_EXIT_INIT] = nop_on_interception, | 1241 | [SVM_EXIT_INIT] = nop_on_interception, |
| 1242 | [SVM_EXIT_VINTR] = interrupt_window_interception, | ||
| 1213 | /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ | 1243 | /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ |
| 1214 | [SVM_EXIT_CPUID] = cpuid_interception, | 1244 | [SVM_EXIT_CPUID] = cpuid_interception, |
| 1215 | [SVM_EXIT_HLT] = halt_interception, | 1245 | [SVM_EXIT_HLT] = halt_interception, |
| @@ -1278,15 +1308,11 @@ static void pre_svm_run(struct kvm_vcpu *vcpu) | |||
| 1278 | } | 1308 | } |
| 1279 | 1309 | ||
| 1280 | 1310 | ||
| 1281 | static inline void kvm_try_inject_irq(struct kvm_vcpu *vcpu) | 1311 | static inline void kvm_do_inject_irq(struct kvm_vcpu *vcpu) |
| 1282 | { | 1312 | { |
| 1283 | struct vmcb_control_area *control; | 1313 | struct vmcb_control_area *control; |
| 1284 | 1314 | ||
| 1285 | if (!vcpu->irq_summary) | ||
| 1286 | return; | ||
| 1287 | |||
| 1288 | control = &vcpu->svm->vmcb->control; | 1315 | control = &vcpu->svm->vmcb->control; |
| 1289 | |||
| 1290 | control->int_vector = pop_irq(vcpu); | 1316 | control->int_vector = pop_irq(vcpu); |
| 1291 | control->int_ctl &= ~V_INTR_PRIO_MASK; | 1317 | control->int_ctl &= ~V_INTR_PRIO_MASK; |
| 1292 | control->int_ctl |= V_IRQ_MASK | | 1318 | control->int_ctl |= V_IRQ_MASK | |
| @@ -1301,6 +1327,59 @@ static void kvm_reput_irq(struct kvm_vcpu *vcpu) | |||
| 1301 | control->int_ctl &= ~V_IRQ_MASK; | 1327 | control->int_ctl &= ~V_IRQ_MASK; |
| 1302 | push_irq(vcpu, control->int_vector); | 1328 | push_irq(vcpu, control->int_vector); |
| 1303 | } | 1329 | } |
| 1330 | |||
| 1331 | vcpu->interrupt_window_open = | ||
| 1332 | !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); | ||
| 1333 | } | ||
| 1334 | |||
| 1335 | static void do_interrupt_requests(struct kvm_vcpu *vcpu, | ||
| 1336 | struct kvm_run *kvm_run) | ||
| 1337 | { | ||
| 1338 | struct vmcb_control_area *control = &vcpu->svm->vmcb->control; | ||
| 1339 | |||
| 1340 | vcpu->interrupt_window_open = | ||
| 1341 | (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && | ||
| 1342 | (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)); | ||
| 1343 | |||
| 1344 | if (vcpu->interrupt_window_open && vcpu->irq_summary) | ||
| 1345 | /* | ||
| 1346 | * Interrupts enabled, and not blocked by sti or mov ss. Good. | ||
| 1347 | */ | ||
| 1348 | kvm_do_inject_irq(vcpu); | ||
| 1349 | |||
| 1350 | /* | ||
| 1351 | * Interrupts blocked. Wait for unblock. | ||
| 1352 | */ | ||
| 1353 | if (!vcpu->interrupt_window_open && | ||
| 1354 | (vcpu->irq_summary || kvm_run->request_interrupt_window)) { | ||
| 1355 | control->intercept |= 1ULL << INTERCEPT_VINTR; | ||
| 1356 | } else | ||
| 1357 | control->intercept &= ~(1ULL << INTERCEPT_VINTR); | ||
| 1358 | } | ||
| 1359 | |||
| 1360 | static void post_kvm_run_save(struct kvm_vcpu *vcpu, | ||
| 1361 | struct kvm_run *kvm_run) | ||
| 1362 | { | ||
| 1363 | kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open && | ||
| 1364 | vcpu->irq_summary == 0); | ||
| 1365 | kvm_run->if_flag = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0; | ||
| 1366 | kvm_run->cr8 = vcpu->cr8; | ||
| 1367 | kvm_run->apic_base = vcpu->apic_base; | ||
| 1368 | } | ||
| 1369 | |||
| 1370 | /* | ||
| 1371 | * Check that userspace requested an interrupt window and that the | ||
| 1372 | * interrupt window is open. | ||
| 1373 | * | ||
| 1374 | * No need to exit to userspace if we already have an interrupt queued. | ||
| 1375 | */ | ||
| 1376 | static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, | ||
| 1377 | struct kvm_run *kvm_run) | ||
| 1378 | { | ||
| 1379 | return (!vcpu->irq_summary && | ||
| 1380 | kvm_run->request_interrupt_window && | ||
| 1381 | vcpu->interrupt_window_open && | ||
| 1382 | (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)); | ||
| 1304 | } | 1383 | } |
| 1305 | 1384 | ||
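do_interrupt_requests() boils down to a predicate over three conditions: is the window open (IF set and no sti/mov ss interrupt shadow), is an interrupt queued in irq_summary, and did userspace request a window exit. If the window is open and something is queued it injects immediately; the VINTR intercept is armed exactly when the window is closed but someone is waiting. Distilled into plain C:

    /* Distilled from do_interrupt_requests() above. Returns nonzero when
     * the VINTR intercept should be armed so we trap as soon as the guest
     * becomes interruptible again. */
    static int need_vintr_intercept(int window_open, int irq_queued,
                                    int userspace_wants_window)
    {
            /* An open window with a queued irq is handled by injecting now,
             * so only a closed window with a waiter needs the intercept. */
            return !window_open && (irq_queued || userspace_wants_window);
    }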
| 1306 | static void save_db_regs(unsigned long *db_regs) | 1385 | static void save_db_regs(unsigned long *db_regs) |
| @@ -1324,9 +1403,10 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1324 | u16 fs_selector; | 1403 | u16 fs_selector; |
| 1325 | u16 gs_selector; | 1404 | u16 gs_selector; |
| 1326 | u16 ldt_selector; | 1405 | u16 ldt_selector; |
| 1406 | int r; | ||
| 1327 | 1407 | ||
| 1328 | again: | 1408 | again: |
| 1329 | kvm_try_inject_irq(vcpu); | 1409 | do_interrupt_requests(vcpu, kvm_run); |
| 1330 | 1410 | ||
| 1331 | clgi(); | 1411 | clgi(); |
| 1332 | 1412 | ||
| @@ -1487,18 +1567,28 @@ again: | |||
| 1487 | if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) { | 1567 | if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) { |
| 1488 | kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; | 1568 | kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; |
| 1489 | kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code; | 1569 | kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code; |
| 1570 | post_kvm_run_save(vcpu, kvm_run); | ||
| 1490 | return 0; | 1571 | return 0; |
| 1491 | } | 1572 | } |
| 1492 | 1573 | ||
| 1493 | if (handle_exit(vcpu, kvm_run)) { | 1574 | r = handle_exit(vcpu, kvm_run); |
| 1575 | if (r > 0) { | ||
| 1494 | if (signal_pending(current)) { | 1576 | if (signal_pending(current)) { |
| 1495 | ++kvm_stat.signal_exits; | 1577 | ++kvm_stat.signal_exits; |
| 1578 | post_kvm_run_save(vcpu, kvm_run); | ||
| 1579 | return -EINTR; | ||
| 1580 | } | ||
| 1581 | |||
| 1582 | if (dm_request_for_irq_injection(vcpu, kvm_run)) { | ||
| 1583 | ++kvm_stat.request_irq_exits; | ||
| 1584 | post_kvm_run_save(vcpu, kvm_run); | ||
| 1496 | return -EINTR; | 1585 | return -EINTR; |
| 1497 | } | 1586 | } |
| 1498 | kvm_resched(vcpu); | 1587 | kvm_resched(vcpu); |
| 1499 | goto again; | 1588 | goto again; |
| 1500 | } | 1589 | } |
| 1501 | return 0; | 1590 | post_kvm_run_save(vcpu, kvm_run); |
| 1591 | return r; | ||
| 1502 | } | 1592 | } |
| 1503 | 1593 | ||
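The reworked exit path gives svm_vcpu_run() (and vmx_vcpu_run() below) one invariant: every return to userspace, whether for an error, a signal, an explicit request-IRQ exit, or a normal exit reason, passes through post_kvm_run_save() so kvm_run always carries fresh if_flag/cr8/ready_for_interrupt_injection state. The control flow reduced to a skeleton, with every callee a stand-in prototype:

    #include <errno.h>

    struct kvm_vcpu;
    struct kvm_run;

    /* Stand-ins named after the functions in the diff above. */
    int  run_guest_and_handle_exit(struct kvm_vcpu *v, struct kvm_run *r);
    int  signal_or_irq_request_pending(struct kvm_vcpu *v, struct kvm_run *r);
    void resched(struct kvm_vcpu *v);
    void save_run_state(struct kvm_vcpu *v, struct kvm_run *r);

    static int vcpu_run_skeleton(struct kvm_vcpu *vcpu, struct kvm_run *run)
    {
            int r;

            for (;;) {
                    r = run_guest_and_handle_exit(vcpu, run);
                    if (r <= 0)             /* <0 error, 0 userspace exit */
                            break;
                    if (signal_or_irq_request_pending(vcpu, run)) {
                            r = -EINTR;     /* let userspace service it */
                            break;
                    }
                    resched(vcpu);          /* give the host a chance */
            }
            save_run_state(vcpu, run);      /* the single-exit invariant */
            return r;
    }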
| 1504 | static void svm_flush_tlb(struct kvm_vcpu *vcpu) | 1594 | static void svm_flush_tlb(struct kvm_vcpu *vcpu) |
| @@ -1565,6 +1655,7 @@ static struct kvm_arch_ops svm_arch_ops = { | |||
| 1565 | .get_segment = svm_get_segment, | 1655 | .get_segment = svm_get_segment, |
| 1566 | .set_segment = svm_set_segment, | 1656 | .set_segment = svm_set_segment, |
| 1567 | .get_cs_db_l_bits = svm_get_cs_db_l_bits, | 1657 | .get_cs_db_l_bits = svm_get_cs_db_l_bits, |
| 1658 | .decache_cr0_cr4_guest_bits = svm_decache_cr0_cr4_guest_bits, | ||
| 1568 | .set_cr0 = svm_set_cr0, | 1659 | .set_cr0 = svm_set_cr0, |
| 1569 | .set_cr0_no_modeswitch = svm_set_cr0, | 1660 | .set_cr0_no_modeswitch = svm_set_cr0, |
| 1570 | .set_cr3 = svm_set_cr3, | 1661 | .set_cr3 = svm_set_cr3, |
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index d0a2c2d5342a..d4701cb4c654 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c | |||
| @@ -116,7 +116,7 @@ static void vmcs_clear(struct vmcs *vmcs) | |||
| 116 | static void __vcpu_clear(void *arg) | 116 | static void __vcpu_clear(void *arg) |
| 117 | { | 117 | { |
| 118 | struct kvm_vcpu *vcpu = arg; | 118 | struct kvm_vcpu *vcpu = arg; |
| 119 | int cpu = smp_processor_id(); | 119 | int cpu = raw_smp_processor_id(); |
| 120 | 120 | ||
| 121 | if (vcpu->cpu == cpu) | 121 | if (vcpu->cpu == cpu) |
| 122 | vmcs_clear(vcpu->vmcs); | 122 | vmcs_clear(vcpu->vmcs); |
| @@ -152,15 +152,21 @@ static u64 vmcs_read64(unsigned long field) | |||
| 152 | #endif | 152 | #endif |
| 153 | } | 153 | } |
| 154 | 154 | ||
| 155 | static noinline void vmwrite_error(unsigned long field, unsigned long value) | ||
| 156 | { | ||
| 157 | printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", | ||
| 158 | field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
| 159 | dump_stack(); | ||
| 160 | } | ||
| 161 | |||
| 155 | static void vmcs_writel(unsigned long field, unsigned long value) | 162 | static void vmcs_writel(unsigned long field, unsigned long value) |
| 156 | { | 163 | { |
| 157 | u8 error; | 164 | u8 error; |
| 158 | 165 | ||
| 159 | asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" | 166 | asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" |
| 160 | : "=q"(error) : "a"(value), "d"(field) : "cc" ); | 167 | : "=q"(error) : "a"(value), "d"(field) : "cc" ); |
| 161 | if (error) | 168 | if (unlikely(error)) |
| 162 | printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", | 169 | vmwrite_error(field, value); |
| 163 | field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
| 164 | } | 170 | } |
| 165 | 171 | ||
| 166 | static void vmcs_write16(unsigned long field, u16 value) | 172 | static void vmcs_write16(unsigned long field, u16 value) |
| @@ -263,6 +269,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
| 263 | if (interruptibility & 3) | 269 | if (interruptibility & 3) |
| 264 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | 270 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, |
| 265 | interruptibility & ~3); | 271 | interruptibility & ~3); |
| 272 | vcpu->interrupt_window_open = 1; | ||
| 266 | } | 273 | } |
| 267 | 274 | ||
| 268 | static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) | 275 | static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) |
| @@ -541,7 +548,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) | |||
| 541 | 548 | ||
| 542 | static struct vmcs *alloc_vmcs(void) | 549 | static struct vmcs *alloc_vmcs(void) |
| 543 | { | 550 | { |
| 544 | return alloc_vmcs_cpu(smp_processor_id()); | 551 | return alloc_vmcs_cpu(raw_smp_processor_id()); |
| 545 | } | 552 | } |
| 546 | 553 | ||
| 547 | static void free_vmcs(struct vmcs *vmcs) | 554 | static void free_vmcs(struct vmcs *vmcs) |
| @@ -736,6 +743,15 @@ static void exit_lmode(struct kvm_vcpu *vcpu) | |||
| 736 | 743 | ||
| 737 | #endif | 744 | #endif |
| 738 | 745 | ||
| 746 | static void vmx_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu) | ||
| 747 | { | ||
| 748 | vcpu->cr0 &= KVM_GUEST_CR0_MASK; | ||
| 749 | vcpu->cr0 |= vmcs_readl(GUEST_CR0) & ~KVM_GUEST_CR0_MASK; | ||
| 750 | |||
| 751 | vcpu->cr4 &= KVM_GUEST_CR4_MASK; | ||
| 752 | vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; | ||
| 753 | } | ||
| 754 | |||
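vmx_decache_cr0_cr4_guest_bits() refreshes KVM's cached cr0/cr4 from the VMCS using the classic merge-under-mask idiom: bits inside KVM_GUEST_CR0_MASK keep their cached value, everything else is re-read from hardware. In isolation, with toy values rather than real CR0 masks:

    #include <assert.h>

    /* Bits set in mask come from 'cached'; the rest come from 'hw'. */
    static unsigned long merge_under_mask(unsigned long cached,
                                          unsigned long hw,
                                          unsigned long mask)
    {
            return (cached & mask) | (hw & ~mask);
    }

    int main(void)
    {
            assert(merge_under_mask(0xF0, 0x0F, 0xF0) == 0xFF);
            assert(merge_under_mask(0xAA, 0x55, 0x0F) == 0x5A);
            return 0;
    }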
| 739 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | 755 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
| 740 | { | 756 | { |
| 741 | if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) | 757 | if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) |
| @@ -1011,8 +1027,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) | |||
| 1011 | vmcs_writel(GUEST_RIP, 0xfff0); | 1027 | vmcs_writel(GUEST_RIP, 0xfff0); |
| 1012 | vmcs_writel(GUEST_RSP, 0); | 1028 | vmcs_writel(GUEST_RSP, 0); |
| 1013 | 1029 | ||
| 1014 | vmcs_writel(GUEST_CR3, 0); | ||
| 1015 | |||
| 1016 | //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 | 1030 | //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 |
| 1017 | vmcs_writel(GUEST_DR7, 0x400); | 1031 | vmcs_writel(GUEST_DR7, 0x400); |
| 1018 | 1032 | ||
| @@ -1049,7 +1063,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) | |||
| 1049 | | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ | 1063 | | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ |
| 1050 | | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ | 1064 | | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ |
| 1051 | | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ | 1065 | | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ |
| 1052 | | CPU_BASED_INVDPG_EXITING | ||
| 1053 | | CPU_BASED_MOV_DR_EXITING | 1066 | | CPU_BASED_MOV_DR_EXITING |
| 1054 | | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ | 1067 | | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ |
| 1055 | ); | 1068 | ); |
| @@ -1094,14 +1107,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) | |||
| 1094 | rdmsrl(MSR_IA32_SYSENTER_EIP, a); | 1107 | rdmsrl(MSR_IA32_SYSENTER_EIP, a); |
| 1095 | vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ | 1108 | vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ |
| 1096 | 1109 | ||
| 1097 | ret = -ENOMEM; | ||
| 1098 | vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
| 1099 | if (!vcpu->guest_msrs) | ||
| 1100 | goto out; | ||
| 1101 | vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
| 1102 | if (!vcpu->host_msrs) | ||
| 1103 | goto out_free_guest_msrs; | ||
| 1104 | |||
| 1105 | for (i = 0; i < NR_VMX_MSR; ++i) { | 1110 | for (i = 0; i < NR_VMX_MSR; ++i) { |
| 1106 | u32 index = vmx_msr_index[i]; | 1111 | u32 index = vmx_msr_index[i]; |
| 1107 | u32 data_low, data_high; | 1112 | u32 data_low, data_high; |
| @@ -1155,8 +1160,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) | |||
| 1155 | 1160 | ||
| 1156 | return 0; | 1161 | return 0; |
| 1157 | 1162 | ||
| 1158 | out_free_guest_msrs: | ||
| 1159 | kfree(vcpu->guest_msrs); | ||
| 1160 | out: | 1163 | out: |
| 1161 | return ret; | 1164 | return ret; |
| 1162 | } | 1165 | } |
| @@ -1224,21 +1227,34 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | |||
| 1224 | irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); | 1227 | irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); |
| 1225 | } | 1228 | } |
| 1226 | 1229 | ||
| 1227 | static void kvm_try_inject_irq(struct kvm_vcpu *vcpu) | 1230 | |
| 1231 | static void do_interrupt_requests(struct kvm_vcpu *vcpu, | ||
| 1232 | struct kvm_run *kvm_run) | ||
| 1228 | { | 1233 | { |
| 1229 | if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) | 1234 | u32 cpu_based_vm_exec_control; |
| 1230 | && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0) | 1235 | |
| 1236 | vcpu->interrupt_window_open = | ||
| 1237 | ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | ||
| 1238 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); | ||
| 1239 | |||
| 1240 | if (vcpu->interrupt_window_open && | ||
| 1241 | vcpu->irq_summary && | ||
| 1242 | !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) | ||
| 1231 | /* | 1243 | /* |
| 1232 | * Interrupts enabled, and not blocked by sti or mov ss. Good. | 1244 | * Interrupts enabled, and not blocked by sti or mov ss. Good. |
| 1233 | */ | 1245 | */ |
| 1234 | kvm_do_inject_irq(vcpu); | 1246 | kvm_do_inject_irq(vcpu); |
| 1235 | else | 1247 | |
| 1248 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
| 1249 | if (!vcpu->interrupt_window_open && | ||
| 1250 | (vcpu->irq_summary || kvm_run->request_interrupt_window)) | ||
| 1236 | /* | 1251 | /* |
| 1237 | * Interrupts blocked. Wait for unblock. | 1252 | * Interrupts blocked. Wait for unblock. |
| 1238 | */ | 1253 | */ |
| 1239 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, | 1254 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; |
| 1240 | vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | 1255 | else |
| 1241 | | CPU_BASED_VIRTUAL_INTR_PENDING); | 1256 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; |
| 1257 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
| 1242 | } | 1258 | } |
| 1243 | 1259 | ||
| 1244 | static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) | 1260 | static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) |
| @@ -1277,6 +1293,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1277 | unsigned long cr2, rip; | 1293 | unsigned long cr2, rip; |
| 1278 | u32 vect_info; | 1294 | u32 vect_info; |
| 1279 | enum emulation_result er; | 1295 | enum emulation_result er; |
| 1296 | int r; | ||
| 1280 | 1297 | ||
| 1281 | vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 1298 | vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
| 1282 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | 1299 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
| @@ -1305,7 +1322,12 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1305 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | 1322 | cr2 = vmcs_readl(EXIT_QUALIFICATION); |
| 1306 | 1323 | ||
| 1307 | spin_lock(&vcpu->kvm->lock); | 1324 | spin_lock(&vcpu->kvm->lock); |
| 1308 | if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) { | 1325 | r = kvm_mmu_page_fault(vcpu, cr2, error_code); |
| 1326 | if (r < 0) { | ||
| 1327 | spin_unlock(&vcpu->kvm->lock); | ||
| 1328 | return r; | ||
| 1329 | } | ||
| 1330 | if (!r) { | ||
| 1309 | spin_unlock(&vcpu->kvm->lock); | 1331 | spin_unlock(&vcpu->kvm->lock); |
| 1310 | return 1; | 1332 | return 1; |
| 1311 | } | 1333 | } |
| @@ -1425,17 +1447,6 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1425 | return 0; | 1447 | return 0; |
| 1426 | } | 1448 | } |
| 1427 | 1449 | ||
| 1428 | static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
| 1429 | { | ||
| 1430 | u64 address = vmcs_read64(EXIT_QUALIFICATION); | ||
| 1431 | int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
| 1432 | spin_lock(&vcpu->kvm->lock); | ||
| 1433 | vcpu->mmu.inval_page(vcpu, address); | ||
| 1434 | spin_unlock(&vcpu->kvm->lock); | ||
| 1435 | vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length); | ||
| 1436 | return 1; | ||
| 1437 | } | ||
| 1438 | |||
| 1439 | static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 1450 | static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| 1440 | { | 1451 | { |
| 1441 | u64 exit_qualification; | 1452 | u64 exit_qualification; |
| @@ -1575,23 +1586,40 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
| 1575 | return 1; | 1586 | return 1; |
| 1576 | } | 1587 | } |
| 1577 | 1588 | ||
| 1589 | static void post_kvm_run_save(struct kvm_vcpu *vcpu, | ||
| 1590 | struct kvm_run *kvm_run) | ||
| 1591 | { | ||
| 1592 | kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0; | ||
| 1593 | kvm_run->cr8 = vcpu->cr8; | ||
| 1594 | kvm_run->apic_base = vcpu->apic_base; | ||
| 1595 | kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open && | ||
| 1596 | vcpu->irq_summary == 0); | ||
| 1597 | } | ||
| 1598 | |||
| 1578 | static int handle_interrupt_window(struct kvm_vcpu *vcpu, | 1599 | static int handle_interrupt_window(struct kvm_vcpu *vcpu, |
| 1579 | struct kvm_run *kvm_run) | 1600 | struct kvm_run *kvm_run) |
| 1580 | { | 1601 | { |
| 1581 | /* Turn off interrupt window reporting. */ | 1602 | /* |
| 1582 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, | 1603 | * If user space is waiting to inject interrupts, exit as soon as |
| 1583 | vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | 1604 | * possible. |
| 1584 | & ~CPU_BASED_VIRTUAL_INTR_PENDING); | 1605 | */ |
| 1606 | if (kvm_run->request_interrupt_window && | ||
| 1607 | !vcpu->irq_summary) { | ||
| 1608 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | ||
| 1609 | ++kvm_stat.irq_window_exits; | ||
| 1610 | return 0; | ||
| 1611 | } | ||
| 1585 | return 1; | 1612 | return 1; |
| 1586 | } | 1613 | } |
| 1587 | 1614 | ||
| 1588 | static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 1615 | static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| 1589 | { | 1616 | { |
| 1590 | skip_emulated_instruction(vcpu); | 1617 | skip_emulated_instruction(vcpu); |
| 1591 | if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) | 1618 | if (vcpu->irq_summary) |
| 1592 | return 1; | 1619 | return 1; |
| 1593 | 1620 | ||
| 1594 | kvm_run->exit_reason = KVM_EXIT_HLT; | 1621 | kvm_run->exit_reason = KVM_EXIT_HLT; |
| 1622 | ++kvm_stat.halt_exits; | ||
| 1595 | return 0; | 1623 | return 0; |
| 1596 | } | 1624 | } |
| 1597 | 1625 | ||
| @@ -1605,7 +1633,6 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | |||
| 1605 | [EXIT_REASON_EXCEPTION_NMI] = handle_exception, | 1633 | [EXIT_REASON_EXCEPTION_NMI] = handle_exception, |
| 1606 | [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, | 1634 | [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, |
| 1607 | [EXIT_REASON_IO_INSTRUCTION] = handle_io, | 1635 | [EXIT_REASON_IO_INSTRUCTION] = handle_io, |
| 1608 | [EXIT_REASON_INVLPG] = handle_invlpg, | ||
| 1609 | [EXIT_REASON_CR_ACCESS] = handle_cr, | 1636 | [EXIT_REASON_CR_ACCESS] = handle_cr, |
| 1610 | [EXIT_REASON_DR_ACCESS] = handle_dr, | 1637 | [EXIT_REASON_DR_ACCESS] = handle_dr, |
| 1611 | [EXIT_REASON_CPUID] = handle_cpuid, | 1638 | [EXIT_REASON_CPUID] = handle_cpuid, |
| @@ -1642,11 +1669,27 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
| 1642 | return 0; | 1669 | return 0; |
| 1643 | } | 1670 | } |
| 1644 | 1671 | ||
| 1672 | /* | ||
| 1673 | * Check that userspace requested an interrupt window and that the | ||
| 1674 | * interrupt window is open. | ||
| 1675 | * | ||
| 1676 | * No need to exit to userspace if we already have an interrupt queued. | ||
| 1677 | */ | ||
| 1678 | static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, | ||
| 1679 | struct kvm_run *kvm_run) | ||
| 1680 | { | ||
| 1681 | return (!vcpu->irq_summary && | ||
| 1682 | kvm_run->request_interrupt_window && | ||
| 1683 | vcpu->interrupt_window_open && | ||
| 1684 | (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); | ||
| 1685 | } | ||
| 1686 | |||
| 1645 | static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 1687 | static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
| 1646 | { | 1688 | { |
| 1647 | u8 fail; | 1689 | u8 fail; |
| 1648 | u16 fs_sel, gs_sel, ldt_sel; | 1690 | u16 fs_sel, gs_sel, ldt_sel; |
| 1649 | int fs_gs_ldt_reload_needed; | 1691 | int fs_gs_ldt_reload_needed; |
| 1692 | int r; | ||
| 1650 | 1693 | ||
| 1651 | again: | 1694 | again: |
| 1652 | /* | 1695 | /* |
| @@ -1673,9 +1716,7 @@ again: | |||
| 1673 | vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); | 1716 | vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); |
| 1674 | #endif | 1717 | #endif |
| 1675 | 1718 | ||
| 1676 | if (vcpu->irq_summary && | 1719 | do_interrupt_requests(vcpu, kvm_run); |
| 1677 | !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) | ||
| 1678 | kvm_try_inject_irq(vcpu); | ||
| 1679 | 1720 | ||
| 1680 | if (vcpu->guest_debug.enabled) | 1721 | if (vcpu->guest_debug.enabled) |
| 1681 | kvm_guest_debug_pre(vcpu); | 1722 | kvm_guest_debug_pre(vcpu); |
| @@ -1812,6 +1853,7 @@ again: | |||
| 1812 | 1853 | ||
| 1813 | fx_save(vcpu->guest_fx_image); | 1854 | fx_save(vcpu->guest_fx_image); |
| 1814 | fx_restore(vcpu->host_fx_image); | 1855 | fx_restore(vcpu->host_fx_image); |
| 1856 | vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; | ||
| 1815 | 1857 | ||
| 1816 | #ifndef CONFIG_X86_64 | 1858 | #ifndef CONFIG_X86_64 |
| 1817 | asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | 1859 | asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); |
| @@ -1821,6 +1863,7 @@ again: | |||
| 1821 | if (fail) { | 1863 | if (fail) { |
| 1822 | kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; | 1864 | kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; |
| 1823 | kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); | 1865 | kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); |
| 1866 | r = 0; | ||
| 1824 | } else { | 1867 | } else { |
| 1825 | if (fs_gs_ldt_reload_needed) { | 1868 | if (fs_gs_ldt_reload_needed) { |
| 1826 | load_ldt(ldt_sel); | 1869 | load_ldt(ldt_sel); |
| @@ -1840,17 +1883,28 @@ again: | |||
| 1840 | } | 1883 | } |
| 1841 | vcpu->launched = 1; | 1884 | vcpu->launched = 1; |
| 1842 | kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; | 1885 | kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; |
| 1843 | if (kvm_handle_exit(kvm_run, vcpu)) { | 1886 | r = kvm_handle_exit(kvm_run, vcpu); |
| 1887 | if (r > 0) { | ||
| 1844 | /* Give scheduler a chance to reschedule. */ | 1888 | /* Give scheduler a chance to reschedule. */ |
| 1845 | if (signal_pending(current)) { | 1889 | if (signal_pending(current)) { |
| 1846 | ++kvm_stat.signal_exits; | 1890 | ++kvm_stat.signal_exits; |
| 1891 | post_kvm_run_save(vcpu, kvm_run); | ||
| 1892 | return -EINTR; | ||
| 1893 | } | ||
| 1894 | |||
| 1895 | if (dm_request_for_irq_injection(vcpu, kvm_run)) { | ||
| 1896 | ++kvm_stat.request_irq_exits; | ||
| 1897 | post_kvm_run_save(vcpu, kvm_run); | ||
| 1847 | return -EINTR; | 1898 | return -EINTR; |
| 1848 | } | 1899 | } |
| 1900 | |||
| 1849 | kvm_resched(vcpu); | 1901 | kvm_resched(vcpu); |
| 1850 | goto again; | 1902 | goto again; |
| 1851 | } | 1903 | } |
| 1852 | } | 1904 | } |
| 1853 | return 0; | 1905 | |
| 1906 | post_kvm_run_save(vcpu, kvm_run); | ||
| 1907 | return r; | ||
| 1854 | } | 1908 | } |
| 1855 | 1909 | ||
| 1856 | static void vmx_flush_tlb(struct kvm_vcpu *vcpu) | 1910 | static void vmx_flush_tlb(struct kvm_vcpu *vcpu) |
| @@ -1906,13 +1960,33 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) | |||
| 1906 | { | 1960 | { |
| 1907 | struct vmcs *vmcs; | 1961 | struct vmcs *vmcs; |
| 1908 | 1962 | ||
| 1963 | vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
| 1964 | if (!vcpu->guest_msrs) | ||
| 1965 | return -ENOMEM; | ||
| 1966 | |||
| 1967 | vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
| 1968 | if (!vcpu->host_msrs) | ||
| 1969 | goto out_free_guest_msrs; | ||
| 1970 | |||
| 1909 | vmcs = alloc_vmcs(); | 1971 | vmcs = alloc_vmcs(); |
| 1910 | if (!vmcs) | 1972 | if (!vmcs) |
| 1911 | return -ENOMEM; | 1973 | goto out_free_msrs; |
| 1974 | |||
| 1912 | vmcs_clear(vmcs); | 1975 | vmcs_clear(vmcs); |
| 1913 | vcpu->vmcs = vmcs; | 1976 | vcpu->vmcs = vmcs; |
| 1914 | vcpu->launched = 0; | 1977 | vcpu->launched = 0; |
| 1978 | |||
| 1915 | return 0; | 1979 | return 0; |
| 1980 | |||
| 1981 | out_free_msrs: | ||
| 1982 | kfree(vcpu->host_msrs); | ||
| 1983 | vcpu->host_msrs = NULL; | ||
| 1984 | |||
| 1985 | out_free_guest_msrs: | ||
| 1986 | kfree(vcpu->guest_msrs); | ||
| 1987 | vcpu->guest_msrs = NULL; | ||
| 1988 | |||
| 1989 | return -ENOMEM; | ||
| 1916 | } | 1990 | } |
| 1917 | 1991 | ||
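Moving the MSR-array allocations from vmx_vcpu_setup() into vmx_create_vcpu() also straightens the error handling into the usual kernel goto ladder: allocate in order, and on failure jump to a label that unwinds everything allocated so far in reverse, NULLing each pointer so a later teardown cannot double-free. The shape of it with stand-in allocations:

    #include <stdlib.h>

    struct ctx { void *guest, *host, *extra; };

    static int ctx_init(struct ctx *c)
    {
            c->guest = malloc(4096);
            if (!c->guest)
                    return -1;
            c->host = malloc(4096);
            if (!c->host)
                    goto out_free_guest;
            c->extra = malloc(4096);
            if (!c->extra)
                    goto out_free_host;
            return 0;

    out_free_host:                  /* unwind in reverse order */
            free(c->host);
            c->host = NULL;
    out_free_guest:
            free(c->guest);
            c->guest = NULL;
            return -1;
    }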
| 1918 | static struct kvm_arch_ops vmx_arch_ops = { | 1992 | static struct kvm_arch_ops vmx_arch_ops = { |
| @@ -1936,6 +2010,7 @@ static struct kvm_arch_ops vmx_arch_ops = { | |||
| 1936 | .get_segment = vmx_get_segment, | 2010 | .get_segment = vmx_get_segment, |
| 1937 | .set_segment = vmx_set_segment, | 2011 | .set_segment = vmx_set_segment, |
| 1938 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, | 2012 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, |
| 2013 | .decache_cr0_cr4_guest_bits = vmx_decache_cr0_cr4_guest_bits, | ||
| 1939 | .set_cr0 = vmx_set_cr0, | 2014 | .set_cr0 = vmx_set_cr0, |
| 1940 | .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch, | 2015 | .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch, |
| 1941 | .set_cr3 = vmx_set_cr3, | 2016 | .set_cr3 = vmx_set_cr3, |
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 1bff3e925fda..be70795b4822 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c | |||
| @@ -1323,7 +1323,7 @@ twobyte_special_insn: | |||
| 1323 | ctxt)) != 0)) | 1323 | ctxt)) != 0)) |
| 1324 | goto done; | 1324 | goto done; |
| 1325 | if ((old_lo != _regs[VCPU_REGS_RAX]) | 1325 | if ((old_lo != _regs[VCPU_REGS_RAX]) |
| 1326 | || (old_hi != _regs[VCPU_REGS_RDI])) { | 1326 | || (old_hi != _regs[VCPU_REGS_RDX])) { |
| 1327 | _regs[VCPU_REGS_RAX] = old_lo; | 1327 | _regs[VCPU_REGS_RAX] = old_lo; |
| 1328 | _regs[VCPU_REGS_RDX] = old_hi; | 1328 | _regs[VCPU_REGS_RDX] = old_hi; |
| 1329 | _eflags &= ~EFLG_ZF; | 1329 | _eflags &= ~EFLG_ZF; |
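The one-line x86_emulate.c change is a real bug fix: cmpxchg8b compares EDX:EAX against the 64-bit memory operand, but the emulator's failure test checked the saved high half against RDI instead of RDX, so the ZF decision depended on an unrelated register. The architectural semantics being checked, as a plain C model:

    #include <stdint.h>
    #include <assert.h>

    /* Model of cmpxchg8b: compare edx:eax with *m64; on match store
     * ecx:ebx and set ZF; on mismatch load *m64 into edx:eax, clear ZF. */
    static int cmpxchg8b(uint64_t *m64, uint32_t *eax, uint32_t *edx,
                         uint32_t ebx, uint32_t ecx)
    {
            uint64_t expect = ((uint64_t)*edx << 32) | *eax;

            if (*m64 == expect) {
                    *m64 = ((uint64_t)ecx << 32) | ebx;
                    return 1;                   /* ZF set */
            }
            *eax = (uint32_t)*m64;
            *edx = (uint32_t)(*m64 >> 32);      /* high half: edx, not edi */
            return 0;                           /* ZF clear */
    }

    int main(void)
    {
            uint64_t mem = 0x1122334455667788ull;
            uint32_t eax = 0x55667788, edx = 0x11223344;

            assert(cmpxchg8b(&mem, &eax, &edx, 0xdead, 0xbeef) == 1);
            assert(mem == 0x0000beef0000deadull);
            return 0;
    }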
