 37 files changed, 2372 insertions(+), 1752 deletions(-)
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 284b44259750..5575759b84ee 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -350,15 +350,6 @@ Who: anybody or Florian Mickler <florian@mickler.org>
 
 ----------------------------
 
-What: KVM paravirt mmu host support
-When: January 2011
-Why:  The paravirt mmu host support is slower than non-paravirt mmu, both
-      on newer and older hardware.  It is already not exposed to the guest,
-      and kept only for live migration purposes.
-Who:  Avi Kivity <avi@redhat.com>
-
-----------------------------
-
 What: iwlwifi 50XX module parameters
 When: 3.0
 Why:  The "..50" modules parameters were used to configure 5000 series and
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 7b2e5c5eefa6..e69a461a06c2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1178,9 +1178,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
 			Default is 0 (don't ignore, but inject #GP)
 
-	kvm.oos_shadow=	[KVM] Disable out-of-sync shadow paging.
-			Default is 1 (enabled)
-
 	kvm.mmu_audit=	[KVM] This is a R/W parameter which allows audit
 			KVM MMU at runtime.
 			Default is 0 (off)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e2a4b5287361..e1d94bf4056e 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1466,6 +1466,31 @@ is supported; 2 if the processor requires all virtual machines to have
 an RMA, or 1 if the processor can use an RMA but doesn't require it,
 because it supports the Virtual RMA (VRMA) facility.
 
+4.64 KVM_NMI
+
+Capability: KVM_CAP_USER_NMI
+Architectures: x86
+Type: vcpu ioctl
+Parameters: none
+Returns: 0 on success, -1 on error
+
+Queues an NMI on the thread's vcpu.  Note this is well defined only
+when KVM_CREATE_IRQCHIP has not been called, since this is an interface
+between the virtual cpu core and virtual local APIC.  After KVM_CREATE_IRQCHIP
+has been called, this interface is completely emulated within the kernel.
+
+To use this to emulate the LINT1 input with KVM_CREATE_IRQCHIP, use the
+following algorithm:
+
+  - pause the vcpu
+  - read the local APIC's state (KVM_GET_LAPIC)
+  - check whether changing LINT1 will queue an NMI (see the LVT entry for LINT1)
+  - if so, issue KVM_NMI
+  - resume the vcpu
+
+Some guests configure the LINT1 NMI input to cause a panic, aiding in
+debugging.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
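As a rough illustration of the LINT1 algorithm described in the api.txt text above (not part of the patch), a userspace VMM could do something like the following once it has paused the vcpu. The APIC register offset and field layout are the standard local-APIC definitions; the helper name is illustrative and error handling is omitted.

/*
 * Rough userspace sketch of the LINT1/KVM_NMI algorithm above.
 * Assumes the caller has already paused the vcpu; error handling omitted.
 */
#include <linux/kvm.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>

#define APIC_LVT1	0x360		/* LVT LINT1 register offset */
#define APIC_LVT_MASKED	(1u << 16)
#define APIC_MODE_NMI	0x4		/* delivery mode field, bits 8-10 */

static void inject_lint1_nmi(int vcpu_fd)
{
	struct kvm_lapic_state lapic;
	uint32_t lvt1;

	ioctl(vcpu_fd, KVM_GET_LAPIC, &lapic);		/* read the local APIC's state */
	memcpy(&lvt1, lapic.regs + APIC_LVT1, sizeof(lvt1));

	/* Changing LINT1 queues an NMI only if it is unmasked and in NMI delivery mode. */
	if (!(lvt1 & APIC_LVT_MASKED) && ((lvt1 >> 8) & 0x7) == APIC_MODE_NMI)
		ioctl(vcpu_fd, KVM_NMI);
}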
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 43f4c92816ef..405052002493 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -774,13 +774,13 @@ struct kvm *kvm_arch_alloc_vm(void) | |||
774 | return kvm; | 774 | return kvm; |
775 | } | 775 | } |
776 | 776 | ||
777 | struct kvm_io_range { | 777 | struct kvm_ia64_io_range { |
778 | unsigned long start; | 778 | unsigned long start; |
779 | unsigned long size; | 779 | unsigned long size; |
780 | unsigned long type; | 780 | unsigned long type; |
781 | }; | 781 | }; |
782 | 782 | ||
783 | static const struct kvm_io_range io_ranges[] = { | 783 | static const struct kvm_ia64_io_range io_ranges[] = { |
784 | {VGA_IO_START, VGA_IO_SIZE, GPFN_FRAME_BUFFER}, | 784 | {VGA_IO_START, VGA_IO_SIZE, GPFN_FRAME_BUFFER}, |
785 | {MMIO_START, MMIO_SIZE, GPFN_LOW_MMIO}, | 785 | {MMIO_START, MMIO_SIZE, GPFN_LOW_MMIO}, |
786 | {LEGACY_IO_START, LEGACY_IO_SIZE, GPFN_LEGACY_IO}, | 786 | {LEGACY_IO_START, LEGACY_IO_SIZE, GPFN_LEGACY_IO}, |
@@ -1366,14 +1366,12 @@ static void kvm_release_vm_pages(struct kvm *kvm) | |||
1366 | { | 1366 | { |
1367 | struct kvm_memslots *slots; | 1367 | struct kvm_memslots *slots; |
1368 | struct kvm_memory_slot *memslot; | 1368 | struct kvm_memory_slot *memslot; |
1369 | int i, j; | 1369 | int j; |
1370 | unsigned long base_gfn; | 1370 | unsigned long base_gfn; |
1371 | 1371 | ||
1372 | slots = kvm_memslots(kvm); | 1372 | slots = kvm_memslots(kvm); |
1373 | for (i = 0; i < slots->nmemslots; i++) { | 1373 | kvm_for_each_memslot(memslot, slots) { |
1374 | memslot = &slots->memslots[i]; | ||
1375 | base_gfn = memslot->base_gfn; | 1374 | base_gfn = memslot->base_gfn; |
1376 | |||
1377 | for (j = 0; j < memslot->npages; j++) { | 1375 | for (j = 0; j < memslot->npages; j++) { |
1378 | if (memslot->rmap[j]) | 1376 | if (memslot->rmap[j]) |
1379 | put_page((struct page *)memslot->rmap[j]); | 1377 | put_page((struct page *)memslot->rmap[j]); |
@@ -1820,7 +1818,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
1820 | if (log->slot >= KVM_MEMORY_SLOTS) | 1818 | if (log->slot >= KVM_MEMORY_SLOTS) |
1821 | goto out; | 1819 | goto out; |
1822 | 1820 | ||
1823 | memslot = &kvm->memslots->memslots[log->slot]; | 1821 | memslot = id_to_memslot(kvm->memslots, log->slot); |
1824 | r = -ENOENT; | 1822 | r = -ENOENT; |
1825 | if (!memslot->dirty_bitmap) | 1823 | if (!memslot->dirty_bitmap) |
1826 | goto out; | 1824 | goto out; |
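The ia64 cleanup above switches from open-coded slot indexing to the new generic memslot helpers. Their definitions live in include/linux/kvm_host.h and are not part of this hunk; roughly, at this point in the series, they look like the following approximation (treat it as orientation, not the authoritative code):

/*
 * Approximate definitions of the generic helpers used above (the real
 * ones live in include/linux/kvm_host.h and are not shown in this diff).
 */
#define kvm_for_each_memslot(memslot, slots)				\
	for (memslot = &slots->memslots[0];				\
	     memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages; \
	     memslot++)

static inline struct kvm_memory_slot *
id_to_memslot(struct kvm_memslots *slots, int id)
{
	return &slots->memslots[id];
}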
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index 0ad432bc81d6..f7727d91ac6b 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -170,8 +170,8 @@ struct kvm_sregs { | |||
170 | } ppc64; | 170 | } ppc64; |
171 | struct { | 171 | struct { |
172 | __u32 sr[16]; | 172 | __u32 sr[16]; |
173 | __u64 ibat[8]; | 173 | __u64 ibat[8]; |
174 | __u64 dbat[8]; | 174 | __u64 dbat[8]; |
175 | } ppc32; | 175 | } ppc32; |
176 | } s; | 176 | } s; |
177 | struct { | 177 | struct { |
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index a459479995c6..e41ac6f7dcf1 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -498,7 +498,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (is_dirty) {
-		memslot = &kvm->memslots->memslots[log->slot];
+		memslot = id_to_memslot(kvm->memslots, log->slot);
 
 		ga = memslot->base_gfn << PAGE_SHIFT;
 		ga_end = ga + (memslot->npages << PAGE_SHIFT);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 286f13d601cf..a795a13f4a70 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -86,7 +86,7 @@ static inline int lpcr_rmls(unsigned long rma_size)
  * to allocate contiguous physical memory for the real memory
  * areas for guests.
  */
-void kvm_rma_init(void)
+void __init kvm_rma_init(void)
 {
 	unsigned long i;
 	unsigned long j, npages;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index f3444f700f36..17c5d4bdee5e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -197,7 +197,10 @@
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE	(9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_BMI1	(9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_AVX2	(9*32+ 5) /* AVX2 instructions */
 #define X86_FEATURE_SMEP	(9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2	(9*32+ 8) /* 2nd group bit manipulation extensions */
 #define X86_FEATURE_ERMS	(9*32+ 9) /* Enhanced REP MOVSB/STOSB */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index a026507893e9..ab4092e3214e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -181,6 +181,7 @@ struct x86_emulate_ops {
 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
 	int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
 	int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+	int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
 	void (*halt)(struct x86_emulate_ctxt *ctxt);
 	void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
 	int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
@@ -364,6 +365,7 @@ enum x86_intercept {
 #endif
 
 int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
 #define EMULATION_FAILED -1
 #define EMULATION_OK 0
 #define EMULATION_RESTART 1
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b4973f4dab98..52d6640a5ca1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -16,10 +16,12 @@ | |||
16 | #include <linux/mmu_notifier.h> | 16 | #include <linux/mmu_notifier.h> |
17 | #include <linux/tracepoint.h> | 17 | #include <linux/tracepoint.h> |
18 | #include <linux/cpumask.h> | 18 | #include <linux/cpumask.h> |
19 | #include <linux/irq_work.h> | ||
19 | 20 | ||
20 | #include <linux/kvm.h> | 21 | #include <linux/kvm.h> |
21 | #include <linux/kvm_para.h> | 22 | #include <linux/kvm_para.h> |
22 | #include <linux/kvm_types.h> | 23 | #include <linux/kvm_types.h> |
24 | #include <linux/perf_event.h> | ||
23 | 25 | ||
24 | #include <asm/pvclock-abi.h> | 26 | #include <asm/pvclock-abi.h> |
25 | #include <asm/desc.h> | 27 | #include <asm/desc.h> |
@@ -31,6 +33,8 @@ | |||
31 | #define KVM_MEMORY_SLOTS 32 | 33 | #define KVM_MEMORY_SLOTS 32 |
32 | /* memory slots that does not exposed to userspace */ | 34 | /* memory slots that does not exposed to userspace */ |
33 | #define KVM_PRIVATE_MEM_SLOTS 4 | 35 | #define KVM_PRIVATE_MEM_SLOTS 4 |
36 | #define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) | ||
37 | |||
34 | #define KVM_MMIO_SIZE 16 | 38 | #define KVM_MMIO_SIZE 16 |
35 | 39 | ||
36 | #define KVM_PIO_PAGE_OFFSET 1 | 40 | #define KVM_PIO_PAGE_OFFSET 1 |
@@ -228,7 +232,7 @@ struct kvm_mmu_page { | |||
228 | * One bit set per slot which has memory | 232 | * One bit set per slot which has memory |
229 | * in this shadow page. | 233 | * in this shadow page. |
230 | */ | 234 | */ |
231 | DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); | 235 | DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM); |
232 | bool unsync; | 236 | bool unsync; |
233 | int root_count; /* Currently serving as active root */ | 237 | int root_count; /* Currently serving as active root */ |
234 | unsigned int unsync_children; | 238 | unsigned int unsync_children; |
@@ -239,14 +243,9 @@ struct kvm_mmu_page { | |||
239 | int clear_spte_count; | 243 | int clear_spte_count; |
240 | #endif | 244 | #endif |
241 | 245 | ||
242 | struct rcu_head rcu; | 246 | int write_flooding_count; |
243 | }; | ||
244 | 247 | ||
245 | struct kvm_pv_mmu_op_buffer { | 248 | struct rcu_head rcu; |
246 | void *ptr; | ||
247 | unsigned len; | ||
248 | unsigned processed; | ||
249 | char buf[512] __aligned(sizeof(long)); | ||
250 | }; | 249 | }; |
251 | 250 | ||
252 | struct kvm_pio_request { | 251 | struct kvm_pio_request { |
@@ -294,6 +293,37 @@ struct kvm_mmu { | |||
294 | u64 pdptrs[4]; /* pae */ | 293 | u64 pdptrs[4]; /* pae */ |
295 | }; | 294 | }; |
296 | 295 | ||
296 | enum pmc_type { | ||
297 | KVM_PMC_GP = 0, | ||
298 | KVM_PMC_FIXED, | ||
299 | }; | ||
300 | |||
301 | struct kvm_pmc { | ||
302 | enum pmc_type type; | ||
303 | u8 idx; | ||
304 | u64 counter; | ||
305 | u64 eventsel; | ||
306 | struct perf_event *perf_event; | ||
307 | struct kvm_vcpu *vcpu; | ||
308 | }; | ||
309 | |||
310 | struct kvm_pmu { | ||
311 | unsigned nr_arch_gp_counters; | ||
312 | unsigned nr_arch_fixed_counters; | ||
313 | unsigned available_event_types; | ||
314 | u64 fixed_ctr_ctrl; | ||
315 | u64 global_ctrl; | ||
316 | u64 global_status; | ||
317 | u64 global_ovf_ctrl; | ||
318 | u64 counter_bitmask[2]; | ||
319 | u64 global_ctrl_mask; | ||
320 | u8 version; | ||
321 | struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC]; | ||
322 | struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED]; | ||
323 | struct irq_work irq_work; | ||
324 | u64 reprogram_pmi; | ||
325 | }; | ||
326 | |||
297 | struct kvm_vcpu_arch { | 327 | struct kvm_vcpu_arch { |
298 | /* | 328 | /* |
299 | * rip and regs accesses must go through | 329 | * rip and regs accesses must go through |
@@ -345,19 +375,10 @@ struct kvm_vcpu_arch { | |||
345 | */ | 375 | */ |
346 | struct kvm_mmu *walk_mmu; | 376 | struct kvm_mmu *walk_mmu; |
347 | 377 | ||
348 | /* only needed in kvm_pv_mmu_op() path, but it's hot so | ||
349 | * put it here to avoid allocation */ | ||
350 | struct kvm_pv_mmu_op_buffer mmu_op_buffer; | ||
351 | |||
352 | struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; | 378 | struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; |
353 | struct kvm_mmu_memory_cache mmu_page_cache; | 379 | struct kvm_mmu_memory_cache mmu_page_cache; |
354 | struct kvm_mmu_memory_cache mmu_page_header_cache; | 380 | struct kvm_mmu_memory_cache mmu_page_header_cache; |
355 | 381 | ||
356 | gfn_t last_pt_write_gfn; | ||
357 | int last_pt_write_count; | ||
358 | u64 *last_pte_updated; | ||
359 | gfn_t last_pte_gfn; | ||
360 | |||
361 | struct fpu guest_fpu; | 382 | struct fpu guest_fpu; |
362 | u64 xcr0; | 383 | u64 xcr0; |
363 | 384 | ||
@@ -436,6 +457,8 @@ struct kvm_vcpu_arch { | |||
436 | unsigned access; | 457 | unsigned access; |
437 | gfn_t mmio_gfn; | 458 | gfn_t mmio_gfn; |
438 | 459 | ||
460 | struct kvm_pmu pmu; | ||
461 | |||
439 | /* used for guest single stepping over the given code position */ | 462 | /* used for guest single stepping over the given code position */ |
440 | unsigned long singlestep_rip; | 463 | unsigned long singlestep_rip; |
441 | 464 | ||
@@ -444,6 +467,9 @@ struct kvm_vcpu_arch { | |||
444 | 467 | ||
445 | cpumask_var_t wbinvd_dirty_mask; | 468 | cpumask_var_t wbinvd_dirty_mask; |
446 | 469 | ||
470 | unsigned long last_retry_eip; | ||
471 | unsigned long last_retry_addr; | ||
472 | |||
447 | struct { | 473 | struct { |
448 | bool halted; | 474 | bool halted; |
449 | gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; | 475 | gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; |
@@ -459,7 +485,6 @@ struct kvm_arch { | |||
459 | unsigned int n_requested_mmu_pages; | 485 | unsigned int n_requested_mmu_pages; |
460 | unsigned int n_max_mmu_pages; | 486 | unsigned int n_max_mmu_pages; |
461 | unsigned int indirect_shadow_pages; | 487 | unsigned int indirect_shadow_pages; |
462 | atomic_t invlpg_counter; | ||
463 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; | 488 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; |
464 | /* | 489 | /* |
465 | * Hash table of struct kvm_mmu_page. | 490 | * Hash table of struct kvm_mmu_page. |
@@ -660,6 +685,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | |||
660 | 685 | ||
661 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); | 686 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); |
662 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); | 687 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); |
688 | int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, | ||
689 | struct kvm_memory_slot *slot); | ||
663 | void kvm_mmu_zap_all(struct kvm *kvm); | 690 | void kvm_mmu_zap_all(struct kvm *kvm); |
664 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); | 691 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); |
665 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); | 692 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); |
@@ -668,8 +695,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); | |||
668 | 695 | ||
669 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | 696 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, |
670 | const void *val, int bytes); | 697 | const void *val, int bytes); |
671 | int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, | ||
672 | gpa_t addr, unsigned long *ret); | ||
673 | u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); | 698 | u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); |
674 | 699 | ||
675 | extern bool tdp_enabled; | 700 | extern bool tdp_enabled; |
@@ -692,6 +717,7 @@ enum emulation_result { | |||
692 | #define EMULTYPE_NO_DECODE (1 << 0) | 717 | #define EMULTYPE_NO_DECODE (1 << 0) |
693 | #define EMULTYPE_TRAP_UD (1 << 1) | 718 | #define EMULTYPE_TRAP_UD (1 << 1) |
694 | #define EMULTYPE_SKIP (1 << 2) | 719 | #define EMULTYPE_SKIP (1 << 2) |
720 | #define EMULTYPE_RETRY (1 << 3) | ||
695 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, | 721 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, |
696 | int emulation_type, void *insn, int insn_len); | 722 | int emulation_type, void *insn, int insn_len); |
697 | 723 | ||
@@ -734,6 +760,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); | |||
734 | 760 | ||
735 | unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); | 761 | unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); |
736 | void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); | 762 | void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); |
763 | bool kvm_rdpmc(struct kvm_vcpu *vcpu); | ||
737 | 764 | ||
738 | void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); | 765 | void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); |
739 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); | 766 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); |
@@ -754,13 +781,14 @@ int fx_init(struct kvm_vcpu *vcpu); | |||
754 | 781 | ||
755 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); | 782 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); |
756 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | 783 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, |
757 | const u8 *new, int bytes, | 784 | const u8 *new, int bytes); |
758 | bool guest_initiated); | 785 | int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); |
759 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); | 786 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); |
760 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); | 787 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); |
761 | int kvm_mmu_load(struct kvm_vcpu *vcpu); | 788 | int kvm_mmu_load(struct kvm_vcpu *vcpu); |
762 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); | 789 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); |
763 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); | 790 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); |
791 | gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); | ||
764 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, | 792 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, |
765 | struct x86_exception *exception); | 793 | struct x86_exception *exception); |
766 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, | 794 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, |
@@ -782,6 +810,11 @@ void kvm_disable_tdp(void); | |||
782 | int complete_pio(struct kvm_vcpu *vcpu); | 810 | int complete_pio(struct kvm_vcpu *vcpu); |
783 | bool kvm_check_iopl(struct kvm_vcpu *vcpu); | 811 | bool kvm_check_iopl(struct kvm_vcpu *vcpu); |
784 | 812 | ||
813 | static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) | ||
814 | { | ||
815 | return gpa; | ||
816 | } | ||
817 | |||
785 | static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) | 818 | static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) |
786 | { | 819 | { |
787 | struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); | 820 | struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); |
@@ -894,4 +927,17 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); | |||
894 | 927 | ||
895 | void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); | 928 | void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); |
896 | 929 | ||
930 | int kvm_is_in_guest(void); | ||
931 | |||
932 | void kvm_pmu_init(struct kvm_vcpu *vcpu); | ||
933 | void kvm_pmu_destroy(struct kvm_vcpu *vcpu); | ||
934 | void kvm_pmu_reset(struct kvm_vcpu *vcpu); | ||
935 | void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); | ||
936 | bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); | ||
937 | int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); | ||
938 | int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); | ||
939 | int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); | ||
940 | void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); | ||
941 | void kvm_deliver_pmi(struct kvm_vcpu *vcpu); | ||
942 | |||
897 | #endif /* _ASM_X86_KVM_HOST_H */ | 943 | #endif /* _ASM_X86_KVM_HOST_H */ |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a9c2116001d6..f0c6fd6f176b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,8 +39,6 @@ | |||
39 | #include <asm/desc.h> | 39 | #include <asm/desc.h> |
40 | #include <asm/tlbflush.h> | 40 | #include <asm/tlbflush.h> |
41 | 41 | ||
42 | #define MMU_QUEUE_SIZE 1024 | ||
43 | |||
44 | static int kvmapf = 1; | 42 | static int kvmapf = 1; |
45 | 43 | ||
46 | static int parse_no_kvmapf(char *arg) | 44 | static int parse_no_kvmapf(char *arg) |
@@ -60,21 +58,10 @@ static int parse_no_stealacc(char *arg) | |||
60 | 58 | ||
61 | early_param("no-steal-acc", parse_no_stealacc); | 59 | early_param("no-steal-acc", parse_no_stealacc); |
62 | 60 | ||
63 | struct kvm_para_state { | ||
64 | u8 mmu_queue[MMU_QUEUE_SIZE]; | ||
65 | int mmu_queue_len; | ||
66 | }; | ||
67 | |||
68 | static DEFINE_PER_CPU(struct kvm_para_state, para_state); | ||
69 | static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); | 61 | static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); |
70 | static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); | 62 | static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); |
71 | static int has_steal_clock = 0; | 63 | static int has_steal_clock = 0; |
72 | 64 | ||
73 | static struct kvm_para_state *kvm_para_state(void) | ||
74 | { | ||
75 | return &per_cpu(para_state, raw_smp_processor_id()); | ||
76 | } | ||
77 | |||
78 | /* | 65 | /* |
79 | * No need for any "IO delay" on KVM | 66 | * No need for any "IO delay" on KVM |
80 | */ | 67 | */ |
@@ -271,151 +258,6 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
271 | } | 258 | } |
272 | } | 259 | } |
273 | 260 | ||
274 | static void kvm_mmu_op(void *buffer, unsigned len) | ||
275 | { | ||
276 | int r; | ||
277 | unsigned long a1, a2; | ||
278 | |||
279 | do { | ||
280 | a1 = __pa(buffer); | ||
281 | a2 = 0; /* on i386 __pa() always returns <4G */ | ||
282 | r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2); | ||
283 | buffer += r; | ||
284 | len -= r; | ||
285 | } while (len); | ||
286 | } | ||
287 | |||
288 | static void mmu_queue_flush(struct kvm_para_state *state) | ||
289 | { | ||
290 | if (state->mmu_queue_len) { | ||
291 | kvm_mmu_op(state->mmu_queue, state->mmu_queue_len); | ||
292 | state->mmu_queue_len = 0; | ||
293 | } | ||
294 | } | ||
295 | |||
296 | static void kvm_deferred_mmu_op(void *buffer, int len) | ||
297 | { | ||
298 | struct kvm_para_state *state = kvm_para_state(); | ||
299 | |||
300 | if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { | ||
301 | kvm_mmu_op(buffer, len); | ||
302 | return; | ||
303 | } | ||
304 | if (state->mmu_queue_len + len > sizeof state->mmu_queue) | ||
305 | mmu_queue_flush(state); | ||
306 | memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len); | ||
307 | state->mmu_queue_len += len; | ||
308 | } | ||
309 | |||
310 | static void kvm_mmu_write(void *dest, u64 val) | ||
311 | { | ||
312 | __u64 pte_phys; | ||
313 | struct kvm_mmu_op_write_pte wpte; | ||
314 | |||
315 | #ifdef CONFIG_HIGHPTE | ||
316 | struct page *page; | ||
317 | unsigned long dst = (unsigned long) dest; | ||
318 | |||
319 | page = kmap_atomic_to_page(dest); | ||
320 | pte_phys = page_to_pfn(page); | ||
321 | pte_phys <<= PAGE_SHIFT; | ||
322 | pte_phys += (dst & ~(PAGE_MASK)); | ||
323 | #else | ||
324 | pte_phys = (unsigned long)__pa(dest); | ||
325 | #endif | ||
326 | wpte.header.op = KVM_MMU_OP_WRITE_PTE; | ||
327 | wpte.pte_val = val; | ||
328 | wpte.pte_phys = pte_phys; | ||
329 | |||
330 | kvm_deferred_mmu_op(&wpte, sizeof wpte); | ||
331 | } | ||
332 | |||
333 | /* | ||
334 | * We only need to hook operations that are MMU writes. We hook these so that | ||
335 | * we can use lazy MMU mode to batch these operations. We could probably | ||
336 | * improve the performance of the host code if we used some of the information | ||
337 | * here to simplify processing of batched writes. | ||
338 | */ | ||
339 | static void kvm_set_pte(pte_t *ptep, pte_t pte) | ||
340 | { | ||
341 | kvm_mmu_write(ptep, pte_val(pte)); | ||
342 | } | ||
343 | |||
344 | static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr, | ||
345 | pte_t *ptep, pte_t pte) | ||
346 | { | ||
347 | kvm_mmu_write(ptep, pte_val(pte)); | ||
348 | } | ||
349 | |||
350 | static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd) | ||
351 | { | ||
352 | kvm_mmu_write(pmdp, pmd_val(pmd)); | ||
353 | } | ||
354 | |||
355 | #if PAGETABLE_LEVELS >= 3 | ||
356 | #ifdef CONFIG_X86_PAE | ||
357 | static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte) | ||
358 | { | ||
359 | kvm_mmu_write(ptep, pte_val(pte)); | ||
360 | } | ||
361 | |||
362 | static void kvm_pte_clear(struct mm_struct *mm, | ||
363 | unsigned long addr, pte_t *ptep) | ||
364 | { | ||
365 | kvm_mmu_write(ptep, 0); | ||
366 | } | ||
367 | |||
368 | static void kvm_pmd_clear(pmd_t *pmdp) | ||
369 | { | ||
370 | kvm_mmu_write(pmdp, 0); | ||
371 | } | ||
372 | #endif | ||
373 | |||
374 | static void kvm_set_pud(pud_t *pudp, pud_t pud) | ||
375 | { | ||
376 | kvm_mmu_write(pudp, pud_val(pud)); | ||
377 | } | ||
378 | |||
379 | #if PAGETABLE_LEVELS == 4 | ||
380 | static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd) | ||
381 | { | ||
382 | kvm_mmu_write(pgdp, pgd_val(pgd)); | ||
383 | } | ||
384 | #endif | ||
385 | #endif /* PAGETABLE_LEVELS >= 3 */ | ||
386 | |||
387 | static void kvm_flush_tlb(void) | ||
388 | { | ||
389 | struct kvm_mmu_op_flush_tlb ftlb = { | ||
390 | .header.op = KVM_MMU_OP_FLUSH_TLB, | ||
391 | }; | ||
392 | |||
393 | kvm_deferred_mmu_op(&ftlb, sizeof ftlb); | ||
394 | } | ||
395 | |||
396 | static void kvm_release_pt(unsigned long pfn) | ||
397 | { | ||
398 | struct kvm_mmu_op_release_pt rpt = { | ||
399 | .header.op = KVM_MMU_OP_RELEASE_PT, | ||
400 | .pt_phys = (u64)pfn << PAGE_SHIFT, | ||
401 | }; | ||
402 | |||
403 | kvm_mmu_op(&rpt, sizeof rpt); | ||
404 | } | ||
405 | |||
406 | static void kvm_enter_lazy_mmu(void) | ||
407 | { | ||
408 | paravirt_enter_lazy_mmu(); | ||
409 | } | ||
410 | |||
411 | static void kvm_leave_lazy_mmu(void) | ||
412 | { | ||
413 | struct kvm_para_state *state = kvm_para_state(); | ||
414 | |||
415 | mmu_queue_flush(state); | ||
416 | paravirt_leave_lazy_mmu(); | ||
417 | } | ||
418 | |||
419 | static void __init paravirt_ops_setup(void) | 261 | static void __init paravirt_ops_setup(void) |
420 | { | 262 | { |
421 | pv_info.name = "KVM"; | 263 | pv_info.name = "KVM"; |
@@ -424,29 +266,6 @@ static void __init paravirt_ops_setup(void) | |||
424 | if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) | 266 | if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) |
425 | pv_cpu_ops.io_delay = kvm_io_delay; | 267 | pv_cpu_ops.io_delay = kvm_io_delay; |
426 | 268 | ||
427 | if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) { | ||
428 | pv_mmu_ops.set_pte = kvm_set_pte; | ||
429 | pv_mmu_ops.set_pte_at = kvm_set_pte_at; | ||
430 | pv_mmu_ops.set_pmd = kvm_set_pmd; | ||
431 | #if PAGETABLE_LEVELS >= 3 | ||
432 | #ifdef CONFIG_X86_PAE | ||
433 | pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; | ||
434 | pv_mmu_ops.pte_clear = kvm_pte_clear; | ||
435 | pv_mmu_ops.pmd_clear = kvm_pmd_clear; | ||
436 | #endif | ||
437 | pv_mmu_ops.set_pud = kvm_set_pud; | ||
438 | #if PAGETABLE_LEVELS == 4 | ||
439 | pv_mmu_ops.set_pgd = kvm_set_pgd; | ||
440 | #endif | ||
441 | #endif | ||
442 | pv_mmu_ops.flush_tlb_user = kvm_flush_tlb; | ||
443 | pv_mmu_ops.release_pte = kvm_release_pt; | ||
444 | pv_mmu_ops.release_pmd = kvm_release_pt; | ||
445 | pv_mmu_ops.release_pud = kvm_release_pt; | ||
446 | |||
447 | pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; | ||
448 | pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; | ||
449 | } | ||
450 | #ifdef CONFIG_X86_IO_APIC | 269 | #ifdef CONFIG_X86_IO_APIC |
451 | no_timer_check = 1; | 270 | no_timer_check = 1; |
452 | #endif | 271 | #endif |
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ff5790d8e990..1a7fe868f375 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -35,6 +35,7 @@ config KVM
 	select KVM_MMIO
 	select TASKSTATS
 	select TASK_DELAY_ACCT
+	select PERF_EVENTS
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
@@ -52,6 +53,8 @@ config KVM
 config KVM_INTEL
 	tristate "KVM for Intel processors support"
 	depends on KVM
+	# for perf_guest_get_msrs():
+	depends on CPU_SUP_INTEL
 	---help---
 	  Provides support for KVM on Intel processors equipped with the VT
 	  extensions.
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f431c8..4f579e8dcacf 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
-			   i8254.o timer.o
+			   i8254.o timer.o cpuid.o pmu.o
 kvm-intel-y		+= vmx.o
 kvm-amd-y		+= svm.o
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
new file mode 100644
index 000000000000..89b02bfaaca5
--- /dev/null
+++ b/arch/x86/kvm/cpuid.c
@@ -0,0 +1,670 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * cpuid support routines | ||
4 | * | ||
5 | * derived from arch/x86/kvm/x86.c | ||
6 | * | ||
7 | * Copyright 2011 Red Hat, Inc. and/or its affiliates. | ||
8 | * Copyright IBM Corporation, 2008 | ||
9 | * | ||
10 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
11 | * the COPYING file in the top-level directory. | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | #include <linux/kvm_host.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/vmalloc.h> | ||
18 | #include <linux/uaccess.h> | ||
19 | #include <asm/user.h> | ||
20 | #include <asm/xsave.h> | ||
21 | #include "cpuid.h" | ||
22 | #include "lapic.h" | ||
23 | #include "mmu.h" | ||
24 | #include "trace.h" | ||
25 | |||
26 | void kvm_update_cpuid(struct kvm_vcpu *vcpu) | ||
27 | { | ||
28 | struct kvm_cpuid_entry2 *best; | ||
29 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
30 | |||
31 | best = kvm_find_cpuid_entry(vcpu, 1, 0); | ||
32 | if (!best) | ||
33 | return; | ||
34 | |||
35 | /* Update OSXSAVE bit */ | ||
36 | if (cpu_has_xsave && best->function == 0x1) { | ||
37 | best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); | ||
38 | if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) | ||
39 | best->ecx |= bit(X86_FEATURE_OSXSAVE); | ||
40 | } | ||
41 | |||
42 | if (apic) { | ||
43 | if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) | ||
44 | apic->lapic_timer.timer_mode_mask = 3 << 17; | ||
45 | else | ||
46 | apic->lapic_timer.timer_mode_mask = 1 << 17; | ||
47 | } | ||
48 | |||
49 | kvm_pmu_cpuid_update(vcpu); | ||
50 | } | ||
51 | |||
52 | static int is_efer_nx(void) | ||
53 | { | ||
54 | unsigned long long efer = 0; | ||
55 | |||
56 | rdmsrl_safe(MSR_EFER, &efer); | ||
57 | return efer & EFER_NX; | ||
58 | } | ||
59 | |||
60 | static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) | ||
61 | { | ||
62 | int i; | ||
63 | struct kvm_cpuid_entry2 *e, *entry; | ||
64 | |||
65 | entry = NULL; | ||
66 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { | ||
67 | e = &vcpu->arch.cpuid_entries[i]; | ||
68 | if (e->function == 0x80000001) { | ||
69 | entry = e; | ||
70 | break; | ||
71 | } | ||
72 | } | ||
73 | if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { | ||
74 | entry->edx &= ~(1 << 20); | ||
75 | printk(KERN_INFO "kvm: guest NX capability removed\n"); | ||
76 | } | ||
77 | } | ||
78 | |||
79 | /* when an old userspace process fills a new kernel module */ | ||
80 | int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, | ||
81 | struct kvm_cpuid *cpuid, | ||
82 | struct kvm_cpuid_entry __user *entries) | ||
83 | { | ||
84 | int r, i; | ||
85 | struct kvm_cpuid_entry *cpuid_entries; | ||
86 | |||
87 | r = -E2BIG; | ||
88 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
89 | goto out; | ||
90 | r = -ENOMEM; | ||
91 | cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); | ||
92 | if (!cpuid_entries) | ||
93 | goto out; | ||
94 | r = -EFAULT; | ||
95 | if (copy_from_user(cpuid_entries, entries, | ||
96 | cpuid->nent * sizeof(struct kvm_cpuid_entry))) | ||
97 | goto out_free; | ||
98 | for (i = 0; i < cpuid->nent; i++) { | ||
99 | vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; | ||
100 | vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; | ||
101 | vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; | ||
102 | vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; | ||
103 | vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; | ||
104 | vcpu->arch.cpuid_entries[i].index = 0; | ||
105 | vcpu->arch.cpuid_entries[i].flags = 0; | ||
106 | vcpu->arch.cpuid_entries[i].padding[0] = 0; | ||
107 | vcpu->arch.cpuid_entries[i].padding[1] = 0; | ||
108 | vcpu->arch.cpuid_entries[i].padding[2] = 0; | ||
109 | } | ||
110 | vcpu->arch.cpuid_nent = cpuid->nent; | ||
111 | cpuid_fix_nx_cap(vcpu); | ||
112 | r = 0; | ||
113 | kvm_apic_set_version(vcpu); | ||
114 | kvm_x86_ops->cpuid_update(vcpu); | ||
115 | kvm_update_cpuid(vcpu); | ||
116 | |||
117 | out_free: | ||
118 | vfree(cpuid_entries); | ||
119 | out: | ||
120 | return r; | ||
121 | } | ||
122 | |||
123 | int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, | ||
124 | struct kvm_cpuid2 *cpuid, | ||
125 | struct kvm_cpuid_entry2 __user *entries) | ||
126 | { | ||
127 | int r; | ||
128 | |||
129 | r = -E2BIG; | ||
130 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
131 | goto out; | ||
132 | r = -EFAULT; | ||
133 | if (copy_from_user(&vcpu->arch.cpuid_entries, entries, | ||
134 | cpuid->nent * sizeof(struct kvm_cpuid_entry2))) | ||
135 | goto out; | ||
136 | vcpu->arch.cpuid_nent = cpuid->nent; | ||
137 | kvm_apic_set_version(vcpu); | ||
138 | kvm_x86_ops->cpuid_update(vcpu); | ||
139 | kvm_update_cpuid(vcpu); | ||
140 | return 0; | ||
141 | |||
142 | out: | ||
143 | return r; | ||
144 | } | ||
145 | |||
146 | int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, | ||
147 | struct kvm_cpuid2 *cpuid, | ||
148 | struct kvm_cpuid_entry2 __user *entries) | ||
149 | { | ||
150 | int r; | ||
151 | |||
152 | r = -E2BIG; | ||
153 | if (cpuid->nent < vcpu->arch.cpuid_nent) | ||
154 | goto out; | ||
155 | r = -EFAULT; | ||
156 | if (copy_to_user(entries, &vcpu->arch.cpuid_entries, | ||
157 | vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) | ||
158 | goto out; | ||
159 | return 0; | ||
160 | |||
161 | out: | ||
162 | cpuid->nent = vcpu->arch.cpuid_nent; | ||
163 | return r; | ||
164 | } | ||
165 | |||
166 | static void cpuid_mask(u32 *word, int wordnum) | ||
167 | { | ||
168 | *word &= boot_cpu_data.x86_capability[wordnum]; | ||
169 | } | ||
170 | |||
171 | static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
172 | u32 index) | ||
173 | { | ||
174 | entry->function = function; | ||
175 | entry->index = index; | ||
176 | cpuid_count(entry->function, entry->index, | ||
177 | &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); | ||
178 | entry->flags = 0; | ||
179 | } | ||
180 | |||
181 | static bool supported_xcr0_bit(unsigned bit) | ||
182 | { | ||
183 | u64 mask = ((u64)1 << bit); | ||
184 | |||
185 | return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; | ||
186 | } | ||
187 | |||
188 | #define F(x) bit(X86_FEATURE_##x) | ||
189 | |||
190 | static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
191 | u32 index, int *nent, int maxnent) | ||
192 | { | ||
193 | int r; | ||
194 | unsigned f_nx = is_efer_nx() ? F(NX) : 0; | ||
195 | #ifdef CONFIG_X86_64 | ||
196 | unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) | ||
197 | ? F(GBPAGES) : 0; | ||
198 | unsigned f_lm = F(LM); | ||
199 | #else | ||
200 | unsigned f_gbpages = 0; | ||
201 | unsigned f_lm = 0; | ||
202 | #endif | ||
203 | unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; | ||
204 | |||
205 | /* cpuid 1.edx */ | ||
206 | const u32 kvm_supported_word0_x86_features = | ||
207 | F(FPU) | F(VME) | F(DE) | F(PSE) | | ||
208 | F(TSC) | F(MSR) | F(PAE) | F(MCE) | | ||
209 | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | | ||
210 | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | | ||
211 | F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | | ||
212 | 0 /* Reserved, DS, ACPI */ | F(MMX) | | ||
213 | F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | | ||
214 | 0 /* HTT, TM, Reserved, PBE */; | ||
215 | /* cpuid 0x80000001.edx */ | ||
216 | const u32 kvm_supported_word1_x86_features = | ||
217 | F(FPU) | F(VME) | F(DE) | F(PSE) | | ||
218 | F(TSC) | F(MSR) | F(PAE) | F(MCE) | | ||
219 | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | | ||
220 | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | | ||
221 | F(PAT) | F(PSE36) | 0 /* Reserved */ | | ||
222 | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | | ||
223 | F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | | ||
224 | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); | ||
225 | /* cpuid 1.ecx */ | ||
226 | const u32 kvm_supported_word4_x86_features = | ||
227 | F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | | ||
228 | 0 /* DS-CPL, VMX, SMX, EST */ | | ||
229 | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | | ||
230 | F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | | ||
231 | 0 /* Reserved, DCA */ | F(XMM4_1) | | ||
232 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | | ||
233 | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | | ||
234 | F(F16C) | F(RDRAND); | ||
235 | /* cpuid 0x80000001.ecx */ | ||
236 | const u32 kvm_supported_word6_x86_features = | ||
237 | F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | | ||
238 | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | | ||
239 | F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | | ||
240 | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); | ||
241 | |||
242 | /* cpuid 0xC0000001.edx */ | ||
243 | const u32 kvm_supported_word5_x86_features = | ||
244 | F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | | ||
245 | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | | ||
246 | F(PMM) | F(PMM_EN); | ||
247 | |||
248 | /* cpuid 7.0.ebx */ | ||
249 | const u32 kvm_supported_word9_x86_features = | ||
250 | F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS); | ||
251 | |||
252 | /* all calls to cpuid_count() should be made on the same cpu */ | ||
253 | get_cpu(); | ||
254 | |||
255 | r = -E2BIG; | ||
256 | |||
257 | if (*nent >= maxnent) | ||
258 | goto out; | ||
259 | |||
260 | do_cpuid_1_ent(entry, function, index); | ||
261 | ++*nent; | ||
262 | |||
263 | switch (function) { | ||
264 | case 0: | ||
265 | entry->eax = min(entry->eax, (u32)0xd); | ||
266 | break; | ||
267 | case 1: | ||
268 | entry->edx &= kvm_supported_word0_x86_features; | ||
269 | cpuid_mask(&entry->edx, 0); | ||
270 | entry->ecx &= kvm_supported_word4_x86_features; | ||
271 | cpuid_mask(&entry->ecx, 4); | ||
272 | /* we support x2apic emulation even if host does not support | ||
273 | * it since we emulate x2apic in software */ | ||
274 | entry->ecx |= F(X2APIC); | ||
275 | break; | ||
276 | /* function 2 entries are STATEFUL. That is, repeated cpuid commands | ||
277 | * may return different values. This forces us to get_cpu() before | ||
278 | * issuing the first command, and also to emulate this annoying behavior | ||
279 | * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ | ||
280 | case 2: { | ||
281 | int t, times = entry->eax & 0xff; | ||
282 | |||
283 | entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | ||
284 | entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
285 | for (t = 1; t < times; ++t) { | ||
286 | if (*nent >= maxnent) | ||
287 | goto out; | ||
288 | |||
289 | do_cpuid_1_ent(&entry[t], function, 0); | ||
290 | entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | ||
291 | ++*nent; | ||
292 | } | ||
293 | break; | ||
294 | } | ||
295 | /* function 4 has additional index. */ | ||
296 | case 4: { | ||
297 | int i, cache_type; | ||
298 | |||
299 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
300 | /* read more entries until cache_type is zero */ | ||
301 | for (i = 1; ; ++i) { | ||
302 | if (*nent >= maxnent) | ||
303 | goto out; | ||
304 | |||
305 | cache_type = entry[i - 1].eax & 0x1f; | ||
306 | if (!cache_type) | ||
307 | break; | ||
308 | do_cpuid_1_ent(&entry[i], function, i); | ||
309 | entry[i].flags |= | ||
310 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
311 | ++*nent; | ||
312 | } | ||
313 | break; | ||
314 | } | ||
315 | case 7: { | ||
316 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
317 | /* Mask ebx against host capbability word 9 */ | ||
318 | if (index == 0) { | ||
319 | entry->ebx &= kvm_supported_word9_x86_features; | ||
320 | cpuid_mask(&entry->ebx, 9); | ||
321 | } else | ||
322 | entry->ebx = 0; | ||
323 | entry->eax = 0; | ||
324 | entry->ecx = 0; | ||
325 | entry->edx = 0; | ||
326 | break; | ||
327 | } | ||
328 | case 9: | ||
329 | break; | ||
330 | case 0xa: { /* Architectural Performance Monitoring */ | ||
331 | struct x86_pmu_capability cap; | ||
332 | union cpuid10_eax eax; | ||
333 | union cpuid10_edx edx; | ||
334 | |||
335 | perf_get_x86_pmu_capability(&cap); | ||
336 | |||
337 | /* | ||
338 | * Only support guest architectural pmu on a host | ||
339 | * with architectural pmu. | ||
340 | */ | ||
341 | if (!cap.version) | ||
342 | memset(&cap, 0, sizeof(cap)); | ||
343 | |||
344 | eax.split.version_id = min(cap.version, 2); | ||
345 | eax.split.num_counters = cap.num_counters_gp; | ||
346 | eax.split.bit_width = cap.bit_width_gp; | ||
347 | eax.split.mask_length = cap.events_mask_len; | ||
348 | |||
349 | edx.split.num_counters_fixed = cap.num_counters_fixed; | ||
350 | edx.split.bit_width_fixed = cap.bit_width_fixed; | ||
351 | edx.split.reserved = 0; | ||
352 | |||
353 | entry->eax = eax.full; | ||
354 | entry->ebx = cap.events_mask; | ||
355 | entry->ecx = 0; | ||
356 | entry->edx = edx.full; | ||
357 | break; | ||
358 | } | ||
359 | /* function 0xb has additional index. */ | ||
360 | case 0xb: { | ||
361 | int i, level_type; | ||
362 | |||
363 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
364 | /* read more entries until level_type is zero */ | ||
365 | for (i = 1; ; ++i) { | ||
366 | if (*nent >= maxnent) | ||
367 | goto out; | ||
368 | |||
369 | level_type = entry[i - 1].ecx & 0xff00; | ||
370 | if (!level_type) | ||
371 | break; | ||
372 | do_cpuid_1_ent(&entry[i], function, i); | ||
373 | entry[i].flags |= | ||
374 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
375 | ++*nent; | ||
376 | } | ||
377 | break; | ||
378 | } | ||
379 | case 0xd: { | ||
380 | int idx, i; | ||
381 | |||
382 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
383 | for (idx = 1, i = 1; idx < 64; ++idx) { | ||
384 | if (*nent >= maxnent) | ||
385 | goto out; | ||
386 | |||
387 | do_cpuid_1_ent(&entry[i], function, idx); | ||
388 | if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) | ||
389 | continue; | ||
390 | entry[i].flags |= | ||
391 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
392 | ++*nent; | ||
393 | ++i; | ||
394 | } | ||
395 | break; | ||
396 | } | ||
397 | case KVM_CPUID_SIGNATURE: { | ||
398 | char signature[12] = "KVMKVMKVM\0\0"; | ||
399 | u32 *sigptr = (u32 *)signature; | ||
400 | entry->eax = 0; | ||
401 | entry->ebx = sigptr[0]; | ||
402 | entry->ecx = sigptr[1]; | ||
403 | entry->edx = sigptr[2]; | ||
404 | break; | ||
405 | } | ||
406 | case KVM_CPUID_FEATURES: | ||
407 | entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | | ||
408 | (1 << KVM_FEATURE_NOP_IO_DELAY) | | ||
409 | (1 << KVM_FEATURE_CLOCKSOURCE2) | | ||
410 | (1 << KVM_FEATURE_ASYNC_PF) | | ||
411 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); | ||
412 | |||
413 | if (sched_info_on()) | ||
414 | entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); | ||
415 | |||
416 | entry->ebx = 0; | ||
417 | entry->ecx = 0; | ||
418 | entry->edx = 0; | ||
419 | break; | ||
420 | case 0x80000000: | ||
421 | entry->eax = min(entry->eax, 0x8000001a); | ||
422 | break; | ||
423 | case 0x80000001: | ||
424 | entry->edx &= kvm_supported_word1_x86_features; | ||
425 | cpuid_mask(&entry->edx, 1); | ||
426 | entry->ecx &= kvm_supported_word6_x86_features; | ||
427 | cpuid_mask(&entry->ecx, 6); | ||
428 | break; | ||
429 | case 0x80000008: { | ||
430 | unsigned g_phys_as = (entry->eax >> 16) & 0xff; | ||
431 | unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); | ||
432 | unsigned phys_as = entry->eax & 0xff; | ||
433 | |||
434 | if (!g_phys_as) | ||
435 | g_phys_as = phys_as; | ||
436 | entry->eax = g_phys_as | (virt_as << 8); | ||
437 | entry->ebx = entry->edx = 0; | ||
438 | break; | ||
439 | } | ||
440 | case 0x80000019: | ||
441 | entry->ecx = entry->edx = 0; | ||
442 | break; | ||
443 | case 0x8000001a: | ||
444 | break; | ||
445 | case 0x8000001d: | ||
446 | break; | ||
447 | /*Add support for Centaur's CPUID instruction*/ | ||
448 | case 0xC0000000: | ||
449 | /*Just support up to 0xC0000004 now*/ | ||
450 | entry->eax = min(entry->eax, 0xC0000004); | ||
451 | break; | ||
452 | case 0xC0000001: | ||
453 | entry->edx &= kvm_supported_word5_x86_features; | ||
454 | cpuid_mask(&entry->edx, 5); | ||
455 | break; | ||
456 | case 3: /* Processor serial number */ | ||
457 | case 5: /* MONITOR/MWAIT */ | ||
458 | case 6: /* Thermal management */ | ||
459 | case 0x80000007: /* Advanced power management */ | ||
460 | case 0xC0000002: | ||
461 | case 0xC0000003: | ||
462 | case 0xC0000004: | ||
463 | default: | ||
464 | entry->eax = entry->ebx = entry->ecx = entry->edx = 0; | ||
465 | break; | ||
466 | } | ||
467 | |||
468 | kvm_x86_ops->set_supported_cpuid(function, entry); | ||
469 | |||
470 | r = 0; | ||
471 | |||
472 | out: | ||
473 | put_cpu(); | ||
474 | |||
475 | return r; | ||
476 | } | ||
477 | |||
478 | #undef F | ||
479 | |||
480 | struct kvm_cpuid_param { | ||
481 | u32 func; | ||
482 | u32 idx; | ||
483 | bool has_leaf_count; | ||
484 | bool (*qualifier)(struct kvm_cpuid_param *param); | ||
485 | }; | ||
486 | |||
487 | static bool is_centaur_cpu(struct kvm_cpuid_param *param) | ||
488 | { | ||
489 | return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; | ||
490 | } | ||
491 | |||
492 | int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | ||
493 | struct kvm_cpuid_entry2 __user *entries) | ||
494 | { | ||
495 | struct kvm_cpuid_entry2 *cpuid_entries; | ||
496 | int limit, nent = 0, r = -E2BIG, i; | ||
497 | u32 func; | ||
498 | static struct kvm_cpuid_param param[] = { | ||
499 | { .func = 0, .has_leaf_count = true }, | ||
500 | { .func = 0x80000000, .has_leaf_count = true }, | ||
501 | { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, | ||
502 | { .func = KVM_CPUID_SIGNATURE }, | ||
503 | { .func = KVM_CPUID_FEATURES }, | ||
504 | }; | ||
505 | |||
506 | if (cpuid->nent < 1) | ||
507 | goto out; | ||
508 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
509 | cpuid->nent = KVM_MAX_CPUID_ENTRIES; | ||
510 | r = -ENOMEM; | ||
511 | cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); | ||
512 | if (!cpuid_entries) | ||
513 | goto out; | ||
514 | |||
515 | r = 0; | ||
516 | for (i = 0; i < ARRAY_SIZE(param); i++) { | ||
517 | struct kvm_cpuid_param *ent = ¶m[i]; | ||
518 | |||
519 | if (ent->qualifier && !ent->qualifier(ent)) | ||
520 | continue; | ||
521 | |||
522 | r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx, | ||
523 | &nent, cpuid->nent); | ||
524 | |||
525 | if (r) | ||
526 | goto out_free; | ||
527 | |||
528 | if (!ent->has_leaf_count) | ||
529 | continue; | ||
530 | |||
531 | limit = cpuid_entries[nent - 1].eax; | ||
532 | for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) | ||
533 | r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx, | ||
534 | &nent, cpuid->nent); | ||
535 | |||
536 | if (r) | ||
537 | goto out_free; | ||
538 | } | ||
539 | |||
540 | r = -EFAULT; | ||
541 | if (copy_to_user(entries, cpuid_entries, | ||
542 | nent * sizeof(struct kvm_cpuid_entry2))) | ||
543 | goto out_free; | ||
544 | cpuid->nent = nent; | ||
545 | r = 0; | ||
546 | |||
547 | out_free: | ||
548 | vfree(cpuid_entries); | ||
549 | out: | ||
550 | return r; | ||
551 | } | ||
552 | |||
553 | static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) | ||
554 | { | ||
555 | struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; | ||
556 | int j, nent = vcpu->arch.cpuid_nent; | ||
557 | |||
558 | e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
559 | /* when no next entry is found, the current entry[i] is reselected */ | ||
560 | for (j = i + 1; ; j = (j + 1) % nent) { | ||
561 | struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; | ||
562 | if (ej->function == e->function) { | ||
563 | ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
564 | return j; | ||
565 | } | ||
566 | } | ||
567 | return 0; /* silence gcc, even though control never reaches here */ | ||
568 | } | ||
569 | |||
570 | /* find an entry with matching function, matching index (if needed), and that | ||
571 | * should be read next (if it's stateful) */ | ||
572 | static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, | ||
573 | u32 function, u32 index) | ||
574 | { | ||
575 | if (e->function != function) | ||
576 | return 0; | ||
577 | if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) | ||
578 | return 0; | ||
579 | if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && | ||
580 | !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) | ||
581 | return 0; | ||
582 | return 1; | ||
583 | } | ||
584 | |||
585 | struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, | ||
586 | u32 function, u32 index) | ||
587 | { | ||
588 | int i; | ||
589 | struct kvm_cpuid_entry2 *best = NULL; | ||
590 | |||
591 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { | ||
592 | struct kvm_cpuid_entry2 *e; | ||
593 | |||
594 | e = &vcpu->arch.cpuid_entries[i]; | ||
595 | if (is_matching_cpuid_entry(e, function, index)) { | ||
596 | if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) | ||
597 | move_to_next_stateful_cpuid_entry(vcpu, i); | ||
598 | best = e; | ||
599 | break; | ||
600 | } | ||
601 | } | ||
602 | return best; | ||
603 | } | ||
604 | EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); | ||
605 | |||
606 | int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) | ||
607 | { | ||
608 | struct kvm_cpuid_entry2 *best; | ||
609 | |||
610 | best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); | ||
611 | if (!best || best->eax < 0x80000008) | ||
612 | goto not_found; | ||
613 | best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); | ||
614 | if (best) | ||
615 | return best->eax & 0xff; | ||
616 | not_found: | ||
617 | return 36; | ||
618 | } | ||
619 | |||
620 | /* | ||
621 | * If no match is found, check whether we exceed the vCPU's limit | ||
622 | * and return the content of the highest valid _standard_ leaf instead. | ||
623 | * This is to satisfy the CPUID specification. | ||
624 | */ | ||
625 | static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, | ||
626 | u32 function, u32 index) | ||
627 | { | ||
628 | struct kvm_cpuid_entry2 *maxlevel; | ||
629 | |||
630 | maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); | ||
631 | if (!maxlevel || maxlevel->eax >= function) | ||
632 | return NULL; | ||
633 | if (function & 0x80000000) { | ||
634 | maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); | ||
635 | if (!maxlevel) | ||
636 | return NULL; | ||
637 | } | ||
638 | return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); | ||
639 | } | ||
640 | |||
641 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | ||
642 | { | ||
643 | u32 function, index; | ||
644 | struct kvm_cpuid_entry2 *best; | ||
645 | |||
646 | function = kvm_register_read(vcpu, VCPU_REGS_RAX); | ||
647 | index = kvm_register_read(vcpu, VCPU_REGS_RCX); | ||
648 | kvm_register_write(vcpu, VCPU_REGS_RAX, 0); | ||
649 | kvm_register_write(vcpu, VCPU_REGS_RBX, 0); | ||
650 | kvm_register_write(vcpu, VCPU_REGS_RCX, 0); | ||
651 | kvm_register_write(vcpu, VCPU_REGS_RDX, 0); | ||
652 | best = kvm_find_cpuid_entry(vcpu, function, index); | ||
653 | |||
654 | if (!best) | ||
655 | best = check_cpuid_limit(vcpu, function, index); | ||
656 | |||
657 | if (best) { | ||
658 | kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); | ||
659 | kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); | ||
660 | kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); | ||
661 | kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); | ||
662 | } | ||
663 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
664 | trace_kvm_cpuid(function, | ||
665 | kvm_register_read(vcpu, VCPU_REGS_RAX), | ||
666 | kvm_register_read(vcpu, VCPU_REGS_RBX), | ||
667 | kvm_register_read(vcpu, VCPU_REGS_RCX), | ||
668 | kvm_register_read(vcpu, VCPU_REGS_RDX)); | ||
669 | } | ||
670 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | ||
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
new file mode 100644
index 000000000000..5b97e1797a6d
--- /dev/null
+++ b/arch/x86/kvm/cpuid.h
@@ -0,0 +1,46 @@ | |||
1 | #ifndef ARCH_X86_KVM_CPUID_H | ||
2 | #define ARCH_X86_KVM_CPUID_H | ||
3 | |||
4 | #include "x86.h" | ||
5 | |||
6 | void kvm_update_cpuid(struct kvm_vcpu *vcpu); | ||
7 | struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, | ||
8 | u32 function, u32 index); | ||
9 | int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | ||
10 | struct kvm_cpuid_entry2 __user *entries); | ||
11 | int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, | ||
12 | struct kvm_cpuid *cpuid, | ||
13 | struct kvm_cpuid_entry __user *entries); | ||
14 | int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, | ||
15 | struct kvm_cpuid2 *cpuid, | ||
16 | struct kvm_cpuid_entry2 __user *entries); | ||
17 | int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, | ||
18 | struct kvm_cpuid2 *cpuid, | ||
19 | struct kvm_cpuid_entry2 __user *entries); | ||
20 | |||
21 | |||
22 | static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) | ||
23 | { | ||
24 | struct kvm_cpuid_entry2 *best; | ||
25 | |||
26 | best = kvm_find_cpuid_entry(vcpu, 1, 0); | ||
27 | return best && (best->ecx & bit(X86_FEATURE_XSAVE)); | ||
28 | } | ||
29 | |||
30 | static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) | ||
31 | { | ||
32 | struct kvm_cpuid_entry2 *best; | ||
33 | |||
34 | best = kvm_find_cpuid_entry(vcpu, 7, 0); | ||
35 | return best && (best->ebx & bit(X86_FEATURE_SMEP)); | ||
36 | } | ||
37 | |||
38 | static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) | ||
39 | { | ||
40 | struct kvm_cpuid_entry2 *best; | ||
41 | |||
42 | best = kvm_find_cpuid_entry(vcpu, 7, 0); | ||
43 | return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); | ||
44 | } | ||
45 | |||
46 | #endif | ||
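
The guest_cpuid_has_*() helpers answer from the vcpu's cpuid_entries table, i.e. from whatever feature set userspace chose to expose to the guest, not from the host CPU. The bits they test are the ordinary architectural ones; the stand-alone program below (not KVM code) reads those same bits directly from the host via the CPUID instruction: leaf 7/EBX bit 7 (SMEP), leaf 7/EBX bit 0 (FSGSBASE) and leaf 1/ECX bit 26 (XSAVE).

#include <stdint.h>
#include <stdio.h>

static void cpuid(uint32_t leaf, uint32_t subleaf,
                  uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
        __asm__ volatile("cpuid"
                         : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
                         : "a"(leaf), "c"(subleaf));
}

int main(void)
{
        uint32_t a, b, c, d;

        cpuid(0, 0, &a, &b, &c, &d);            /* max basic leaf in eax */
        if (a >= 7) {
                cpuid(7, 0, &a, &b, &c, &d);
                printf("SMEP:     %s\n", (b & (1u << 7)) ? "yes" : "no");
                printf("FSGSBASE: %s\n", (b & (1u << 0)) ? "yes" : "no");
        }

        cpuid(1, 0, &a, &b, &c, &d);
        printf("XSAVE:    %s\n", (c & (1u << 26)) ? "yes" : "no");
        return 0;
}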
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f1e3be18a08f..05a562b85025 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -125,8 +125,9 @@ | |||
125 | #define Lock (1<<26) /* lock prefix is allowed for the instruction */ | 125 | #define Lock (1<<26) /* lock prefix is allowed for the instruction */ |
126 | #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ | 126 | #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ |
127 | #define No64 (1<<28) | 127 | #define No64 (1<<28) |
128 | #define PageTable (1 << 29) /* instruction used to write page table */ | ||
128 | /* Source 2 operand type */ | 129 | /* Source 2 operand type */ |
129 | #define Src2Shift (29) | 130 | #define Src2Shift (30) |
130 | #define Src2None (OpNone << Src2Shift) | 131 | #define Src2None (OpNone << Src2Shift) |
131 | #define Src2CL (OpCL << Src2Shift) | 132 | #define Src2CL (OpCL << Src2Shift) |
132 | #define Src2ImmByte (OpImmByte << Src2Shift) | 133 | #define Src2ImmByte (OpImmByte << Src2Shift) |
@@ -1674,11 +1675,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) | |||
1674 | return X86EMUL_CONTINUE; | 1675 | return X86EMUL_CONTINUE; |
1675 | } | 1676 | } |
1676 | 1677 | ||
1677 | static int em_grp1a(struct x86_emulate_ctxt *ctxt) | ||
1678 | { | ||
1679 | return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes); | ||
1680 | } | ||
1681 | |||
1682 | static int em_grp2(struct x86_emulate_ctxt *ctxt) | 1678 | static int em_grp2(struct x86_emulate_ctxt *ctxt) |
1683 | { | 1679 | { |
1684 | switch (ctxt->modrm_reg) { | 1680 | switch (ctxt->modrm_reg) { |
@@ -1788,7 +1784,7 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) | |||
1788 | return rc; | 1784 | return rc; |
1789 | } | 1785 | } |
1790 | 1786 | ||
1791 | static int em_grp9(struct x86_emulate_ctxt *ctxt) | 1787 | static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) |
1792 | { | 1788 | { |
1793 | u64 old = ctxt->dst.orig_val64; | 1789 | u64 old = ctxt->dst.orig_val64; |
1794 | 1790 | ||
@@ -1831,6 +1827,24 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) | |||
1831 | return rc; | 1827 | return rc; |
1832 | } | 1828 | } |
1833 | 1829 | ||
1830 | static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) | ||
1831 | { | ||
1832 | /* Save real source value, then compare EAX against destination. */ | ||
1833 | ctxt->src.orig_val = ctxt->src.val; | ||
1834 | ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; | ||
1835 | emulate_2op_SrcV(ctxt, "cmp"); | ||
1836 | |||
1837 | if (ctxt->eflags & EFLG_ZF) { | ||
1838 | /* Success: write back to memory. */ | ||
1839 | ctxt->dst.val = ctxt->src.orig_val; | ||
1840 | } else { | ||
1841 | /* Failure: write the value we saw to EAX. */ | ||
1842 | ctxt->dst.type = OP_REG; | ||
1843 | ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; | ||
1844 | } | ||
1845 | return X86EMUL_CONTINUE; | ||
1846 | } | ||
1847 | |||
1834 | static int em_lseg(struct x86_emulate_ctxt *ctxt) | 1848 | static int em_lseg(struct x86_emulate_ctxt *ctxt) |
1835 | { | 1849 | { |
1836 | int seg = ctxt->src2.val; | 1850 | int seg = ctxt->src2.val; |
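
em_cmpxchg() above follows the architectural CMPXCHG semantics: compare the accumulator with the destination, and either commit the source to memory (ZF set) or load the observed destination value into the accumulator (ZF clear). A stand-alone sketch of that behaviour (not emulator code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool cmpxchg_like(uint64_t *dest, uint64_t *rax, uint64_t src)
{
        if (*dest == *rax) {
                *dest = src;            /* success: write back to memory     */
                return true;            /* ZF set                            */
        }
        *rax = *dest;                   /* failure: value we saw goes to EAX */
        return false;                   /* ZF clear                          */
}

int main(void)
{
        uint64_t mem = 5, rax = 5;
        bool zf;

        zf = cmpxchg_like(&mem, &rax, 9);
        printf("match:    zf=%d mem=%llu rax=%llu\n", zf,
               (unsigned long long)mem, (unsigned long long)rax);

        rax = 1;
        zf = cmpxchg_like(&mem, &rax, 7);
        printf("mismatch: zf=%d mem=%llu rax=%llu\n", zf,
               (unsigned long long)mem, (unsigned long long)rax);
        return 0;
}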
@@ -2481,6 +2495,15 @@ static int em_das(struct x86_emulate_ctxt *ctxt) | |||
2481 | return X86EMUL_CONTINUE; | 2495 | return X86EMUL_CONTINUE; |
2482 | } | 2496 | } |
2483 | 2497 | ||
2498 | static int em_call(struct x86_emulate_ctxt *ctxt) | ||
2499 | { | ||
2500 | long rel = ctxt->src.val; | ||
2501 | |||
2502 | ctxt->src.val = (unsigned long)ctxt->_eip; | ||
2503 | jmp_rel(ctxt, rel); | ||
2504 | return em_push(ctxt); | ||
2505 | } | ||
2506 | |||
2484 | static int em_call_far(struct x86_emulate_ctxt *ctxt) | 2507 | static int em_call_far(struct x86_emulate_ctxt *ctxt) |
2485 | { | 2508 | { |
2486 | u16 sel, old_cs; | 2509 | u16 sel, old_cs; |
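
em_call() above is the near relative CALL: push the address of the instruction following the call, then apply the signed displacement to the instruction pointer via jmp_rel(). A stand-alone sketch with a toy register file and stack (not emulator code; names are illustrative):

#include <stdint.h>
#include <stdio.h>

struct tiny_cpu {
        uint64_t rip;
        uint64_t rsp;
        uint64_t stack[64];
};

static void push(struct tiny_cpu *c, uint64_t val)
{
        c->rsp -= 8;
        c->stack[c->rsp / 8] = val;
}

static void call_rel(struct tiny_cpu *c, int32_t rel)
{
        push(c, c->rip);        /* rip already points past the CALL insn */
        c->rip += rel;          /* jmp_rel() equivalent                  */
}

int main(void)
{
        struct tiny_cpu c = { .rip = 0x1000, .rsp = 64 * 8 };

        call_rel(&c, 0x20);
        printf("rip=%#llx return addr on stack=%#llx\n",
               (unsigned long long)c.rip,
               (unsigned long long)c.stack[c.rsp / 8]);
        return 0;
}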
@@ -2622,12 +2645,75 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) | |||
2622 | return X86EMUL_CONTINUE; | 2645 | return X86EMUL_CONTINUE; |
2623 | } | 2646 | } |
2624 | 2647 | ||
2648 | static int em_rdpmc(struct x86_emulate_ctxt *ctxt) | ||
2649 | { | ||
2650 | u64 pmc; | ||
2651 | |||
2652 | if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc)) | ||
2653 | return emulate_gp(ctxt, 0); | ||
2654 | ctxt->regs[VCPU_REGS_RAX] = (u32)pmc; | ||
2655 | ctxt->regs[VCPU_REGS_RDX] = pmc >> 32; | ||
2656 | return X86EMUL_CONTINUE; | ||
2657 | } | ||
2658 | |||
2625 | static int em_mov(struct x86_emulate_ctxt *ctxt) | 2659 | static int em_mov(struct x86_emulate_ctxt *ctxt) |
2626 | { | 2660 | { |
2627 | ctxt->dst.val = ctxt->src.val; | 2661 | ctxt->dst.val = ctxt->src.val; |
2628 | return X86EMUL_CONTINUE; | 2662 | return X86EMUL_CONTINUE; |
2629 | } | 2663 | } |
2630 | 2664 | ||
2665 | static int em_cr_write(struct x86_emulate_ctxt *ctxt) | ||
2666 | { | ||
2667 | if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) | ||
2668 | return emulate_gp(ctxt, 0); | ||
2669 | |||
2670 | /* Disable writeback. */ | ||
2671 | ctxt->dst.type = OP_NONE; | ||
2672 | return X86EMUL_CONTINUE; | ||
2673 | } | ||
2674 | |||
2675 | static int em_dr_write(struct x86_emulate_ctxt *ctxt) | ||
2676 | { | ||
2677 | unsigned long val; | ||
2678 | |||
2679 | if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
2680 | val = ctxt->src.val & ~0ULL; | ||
2681 | else | ||
2682 | val = ctxt->src.val & ~0U; | ||
2683 | |||
2684 | /* #UD condition is already handled. */ | ||
2685 | if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0) | ||
2686 | return emulate_gp(ctxt, 0); | ||
2687 | |||
2688 | /* Disable writeback. */ | ||
2689 | ctxt->dst.type = OP_NONE; | ||
2690 | return X86EMUL_CONTINUE; | ||
2691 | } | ||
2692 | |||
2693 | static int em_wrmsr(struct x86_emulate_ctxt *ctxt) | ||
2694 | { | ||
2695 | u64 msr_data; | ||
2696 | |||
2697 | msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] | ||
2698 | | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); | ||
2699 | if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) | ||
2700 | return emulate_gp(ctxt, 0); | ||
2701 | |||
2702 | return X86EMUL_CONTINUE; | ||
2703 | } | ||
2704 | |||
2705 | static int em_rdmsr(struct x86_emulate_ctxt *ctxt) | ||
2706 | { | ||
2707 | u64 msr_data; | ||
2708 | |||
2709 | if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) | ||
2710 | return emulate_gp(ctxt, 0); | ||
2711 | |||
2712 | ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; | ||
2713 | ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; | ||
2714 | return X86EMUL_CONTINUE; | ||
2715 | } | ||
2716 | |||
2631 | static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) | 2717 | static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) |
2632 | { | 2718 | { |
2633 | if (ctxt->modrm_reg > VCPU_SREG_GS) | 2719 | if (ctxt->modrm_reg > VCPU_SREG_GS) |
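
em_wrmsr()/em_rdmsr() above move the 64-bit MSR payload through EDX:EAX, which is how the WRMSR and RDMSR instructions pass it. A stand-alone sketch of the packing and unpacking (not emulator code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t eax = 0xdeadbeef, edx = 0x00c0ffee;

        /* WRMSR direction: combine EDX:EAX into the 64-bit MSR value. */
        uint64_t msr = (uint64_t)eax | ((uint64_t)edx << 32);

        /* RDMSR direction: split the MSR value back into EDX:EAX. */
        uint32_t lo = (uint32_t)msr;
        uint32_t hi = (uint32_t)(msr >> 32);

        printf("msr=%#llx lo=%#x hi=%#x\n", (unsigned long long)msr, lo, hi);
        return 0;
}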
@@ -2775,6 +2861,24 @@ static int em_jcxz(struct x86_emulate_ctxt *ctxt) | |||
2775 | return X86EMUL_CONTINUE; | 2861 | return X86EMUL_CONTINUE; |
2776 | } | 2862 | } |
2777 | 2863 | ||
2864 | static int em_in(struct x86_emulate_ctxt *ctxt) | ||
2865 | { | ||
2866 | if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val, | ||
2867 | &ctxt->dst.val)) | ||
2868 | return X86EMUL_IO_NEEDED; | ||
2869 | |||
2870 | return X86EMUL_CONTINUE; | ||
2871 | } | ||
2872 | |||
2873 | static int em_out(struct x86_emulate_ctxt *ctxt) | ||
2874 | { | ||
2875 | ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val, | ||
2876 | &ctxt->src.val, 1); | ||
2877 | /* Disable writeback. */ | ||
2878 | ctxt->dst.type = OP_NONE; | ||
2879 | return X86EMUL_CONTINUE; | ||
2880 | } | ||
2881 | |||
2778 | static int em_cli(struct x86_emulate_ctxt *ctxt) | 2882 | static int em_cli(struct x86_emulate_ctxt *ctxt) |
2779 | { | 2883 | { |
2780 | if (emulator_bad_iopl(ctxt)) | 2884 | if (emulator_bad_iopl(ctxt)) |
@@ -2794,6 +2898,69 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) | |||
2794 | return X86EMUL_CONTINUE; | 2898 | return X86EMUL_CONTINUE; |
2795 | } | 2899 | } |
2796 | 2900 | ||
2901 | static int em_bt(struct x86_emulate_ctxt *ctxt) | ||
2902 | { | ||
2903 | /* Disable writeback. */ | ||
2904 | ctxt->dst.type = OP_NONE; | ||
2905 | /* only subword offset */ | ||
2906 | ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; | ||
2907 | |||
2908 | emulate_2op_SrcV_nobyte(ctxt, "bt"); | ||
2909 | return X86EMUL_CONTINUE; | ||
2910 | } | ||
2911 | |||
2912 | static int em_bts(struct x86_emulate_ctxt *ctxt) | ||
2913 | { | ||
2914 | emulate_2op_SrcV_nobyte(ctxt, "bts"); | ||
2915 | return X86EMUL_CONTINUE; | ||
2916 | } | ||
2917 | |||
2918 | static int em_btr(struct x86_emulate_ctxt *ctxt) | ||
2919 | { | ||
2920 | emulate_2op_SrcV_nobyte(ctxt, "btr"); | ||
2921 | return X86EMUL_CONTINUE; | ||
2922 | } | ||
2923 | |||
2924 | static int em_btc(struct x86_emulate_ctxt *ctxt) | ||
2925 | { | ||
2926 | emulate_2op_SrcV_nobyte(ctxt, "btc"); | ||
2927 | return X86EMUL_CONTINUE; | ||
2928 | } | ||
2929 | |||
2930 | static int em_bsf(struct x86_emulate_ctxt *ctxt) | ||
2931 | { | ||
2932 | u8 zf; | ||
2933 | |||
2934 | __asm__ ("bsf %2, %0; setz %1" | ||
2935 | : "=r"(ctxt->dst.val), "=q"(zf) | ||
2936 | : "r"(ctxt->src.val)); | ||
2937 | |||
2938 | ctxt->eflags &= ~X86_EFLAGS_ZF; | ||
2939 | if (zf) { | ||
2940 | ctxt->eflags |= X86_EFLAGS_ZF; | ||
2941 | /* Disable writeback. */ | ||
2942 | ctxt->dst.type = OP_NONE; | ||
2943 | } | ||
2944 | return X86EMUL_CONTINUE; | ||
2945 | } | ||
2946 | |||
2947 | static int em_bsr(struct x86_emulate_ctxt *ctxt) | ||
2948 | { | ||
2949 | u8 zf; | ||
2950 | |||
2951 | __asm__ ("bsr %2, %0; setz %1" | ||
2952 | : "=r"(ctxt->dst.val), "=q"(zf) | ||
2953 | : "r"(ctxt->src.val)); | ||
2954 | |||
2955 | ctxt->eflags &= ~X86_EFLAGS_ZF; | ||
2956 | if (zf) { | ||
2957 | ctxt->eflags |= X86_EFLAGS_ZF; | ||
2958 | /* Disable writeback. */ | ||
2959 | ctxt->dst.type = OP_NONE; | ||
2960 | } | ||
2961 | return X86EMUL_CONTINUE; | ||
2962 | } | ||
2963 | |||
2797 | static bool valid_cr(int nr) | 2964 | static bool valid_cr(int nr) |
2798 | { | 2965 | { |
2799 | switch (nr) { | 2966 | switch (nr) { |
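
em_bsf()/em_bsr() above mirror hardware: a zero source sets ZF and yields no usable result, so the emulator disables writeback and leaves the destination register alone. A stand-alone sketch of the BSF case (not emulator code):

#include <stdint.h>
#include <stdio.h>

static int bsf_like(unsigned long src, unsigned long *dst)
{
        unsigned long res;
        uint8_t zf;

        __asm__ ("bsf %2, %0; setz %1"
                 : "=r"(res), "=q"(zf)
                 : "r"(src));
        if (!zf)
                *dst = res;     /* only write back when a bit was found */
        return zf;              /* 1 -> ZF set, source was zero         */
}

int main(void)
{
        unsigned long dst = 0xff;

        printf("bsf(0x58): zf=%d dst=%lu\n", bsf_like(0x58, &dst), dst);
        printf("bsf(0x00): zf=%d dst=%lu\n", bsf_like(0x00, &dst), dst);
        return 0;
}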
@@ -2867,9 +3034,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) | |||
2867 | break; | 3034 | break; |
2868 | } | 3035 | } |
2869 | case 4: { | 3036 | case 4: { |
2870 | u64 cr4; | ||
2871 | |||
2872 | cr4 = ctxt->ops->get_cr(ctxt, 4); | ||
2873 | ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); | 3037 | ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); |
2874 | 3038 | ||
2875 | if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) | 3039 | if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) |
@@ -3003,6 +3167,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) | |||
3003 | #define D2bv(_f) D((_f) | ByteOp), D(_f) | 3167 | #define D2bv(_f) D((_f) | ByteOp), D(_f) |
3004 | #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) | 3168 | #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) |
3005 | #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) | 3169 | #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) |
3170 | #define I2bvIP(_f, _e, _i, _p) \ | ||
3171 | IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p) | ||
3006 | 3172 | ||
3007 | #define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ | 3173 | #define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ |
3008 | I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ | 3174 | I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ |
@@ -3033,17 +3199,17 @@ static struct opcode group7_rm7[] = { | |||
3033 | 3199 | ||
3034 | static struct opcode group1[] = { | 3200 | static struct opcode group1[] = { |
3035 | I(Lock, em_add), | 3201 | I(Lock, em_add), |
3036 | I(Lock, em_or), | 3202 | I(Lock | PageTable, em_or), |
3037 | I(Lock, em_adc), | 3203 | I(Lock, em_adc), |
3038 | I(Lock, em_sbb), | 3204 | I(Lock, em_sbb), |
3039 | I(Lock, em_and), | 3205 | I(Lock | PageTable, em_and), |
3040 | I(Lock, em_sub), | 3206 | I(Lock, em_sub), |
3041 | I(Lock, em_xor), | 3207 | I(Lock, em_xor), |
3042 | I(0, em_cmp), | 3208 | I(0, em_cmp), |
3043 | }; | 3209 | }; |
3044 | 3210 | ||
3045 | static struct opcode group1A[] = { | 3211 | static struct opcode group1A[] = { |
3046 | D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, | 3212 | I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N, |
3047 | }; | 3213 | }; |
3048 | 3214 | ||
3049 | static struct opcode group3[] = { | 3215 | static struct opcode group3[] = { |
@@ -3058,16 +3224,19 @@ static struct opcode group3[] = { | |||
3058 | }; | 3224 | }; |
3059 | 3225 | ||
3060 | static struct opcode group4[] = { | 3226 | static struct opcode group4[] = { |
3061 | D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), | 3227 | I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), |
3228 | I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), | ||
3062 | N, N, N, N, N, N, | 3229 | N, N, N, N, N, N, |
3063 | }; | 3230 | }; |
3064 | 3231 | ||
3065 | static struct opcode group5[] = { | 3232 | static struct opcode group5[] = { |
3066 | D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), | 3233 | I(DstMem | SrcNone | ModRM | Lock, em_grp45), |
3067 | D(SrcMem | ModRM | Stack), | 3234 | I(DstMem | SrcNone | ModRM | Lock, em_grp45), |
3235 | I(SrcMem | ModRM | Stack, em_grp45), | ||
3068 | I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), | 3236 | I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), |
3069 | D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), | 3237 | I(SrcMem | ModRM | Stack, em_grp45), |
3070 | D(SrcMem | ModRM | Stack), N, | 3238 | I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45), |
3239 | I(SrcMem | ModRM | Stack, em_grp45), N, | ||
3071 | }; | 3240 | }; |
3072 | 3241 | ||
3073 | static struct opcode group6[] = { | 3242 | static struct opcode group6[] = { |
@@ -3096,18 +3265,21 @@ static struct group_dual group7 = { { | |||
3096 | 3265 | ||
3097 | static struct opcode group8[] = { | 3266 | static struct opcode group8[] = { |
3098 | N, N, N, N, | 3267 | N, N, N, N, |
3099 | D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), | 3268 | I(DstMem | SrcImmByte | ModRM, em_bt), |
3100 | D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), | 3269 | I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts), |
3270 | I(DstMem | SrcImmByte | ModRM | Lock, em_btr), | ||
3271 | I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc), | ||
3101 | }; | 3272 | }; |
3102 | 3273 | ||
3103 | static struct group_dual group9 = { { | 3274 | static struct group_dual group9 = { { |
3104 | N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, | 3275 | N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, |
3105 | }, { | 3276 | }, { |
3106 | N, N, N, N, N, N, N, N, | 3277 | N, N, N, N, N, N, N, N, |
3107 | } }; | 3278 | } }; |
3108 | 3279 | ||
3109 | static struct opcode group11[] = { | 3280 | static struct opcode group11[] = { |
3110 | I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), | 3281 | I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov), |
3282 | X7(D(Undefined)), | ||
3111 | }; | 3283 | }; |
3112 | 3284 | ||
3113 | static struct gprefix pfx_0f_6f_0f_7f = { | 3285 | static struct gprefix pfx_0f_6f_0f_7f = { |
@@ -3120,7 +3292,7 @@ static struct opcode opcode_table[256] = { | |||
3120 | I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), | 3292 | I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), |
3121 | I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), | 3293 | I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), |
3122 | /* 0x08 - 0x0F */ | 3294 | /* 0x08 - 0x0F */ |
3123 | I6ALU(Lock, em_or), | 3295 | I6ALU(Lock | PageTable, em_or), |
3124 | I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), | 3296 | I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), |
3125 | N, | 3297 | N, |
3126 | /* 0x10 - 0x17 */ | 3298 | /* 0x10 - 0x17 */ |
@@ -3132,7 +3304,7 @@ static struct opcode opcode_table[256] = { | |||
3132 | I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), | 3304 | I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), |
3133 | I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), | 3305 | I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), |
3134 | /* 0x20 - 0x27 */ | 3306 | /* 0x20 - 0x27 */ |
3135 | I6ALU(Lock, em_and), N, N, | 3307 | I6ALU(Lock | PageTable, em_and), N, N, |
3136 | /* 0x28 - 0x2F */ | 3308 | /* 0x28 - 0x2F */ |
3137 | I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), | 3309 | I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), |
3138 | /* 0x30 - 0x37 */ | 3310 | /* 0x30 - 0x37 */ |
@@ -3155,8 +3327,8 @@ static struct opcode opcode_table[256] = { | |||
3155 | I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), | 3327 | I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), |
3156 | I(SrcImmByte | Mov | Stack, em_push), | 3328 | I(SrcImmByte | Mov | Stack, em_push), |
3157 | I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), | 3329 | I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), |
3158 | D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */ | 3330 | I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */ |
3159 | D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */ | 3331 | I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ |
3160 | /* 0x70 - 0x7F */ | 3332 | /* 0x70 - 0x7F */ |
3161 | X16(D(SrcImmByte)), | 3333 | X16(D(SrcImmByte)), |
3162 | /* 0x80 - 0x87 */ | 3334 | /* 0x80 - 0x87 */ |
@@ -3165,11 +3337,11 @@ static struct opcode opcode_table[256] = { | |||
3165 | G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), | 3337 | G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), |
3166 | G(DstMem | SrcImmByte | ModRM | Group, group1), | 3338 | G(DstMem | SrcImmByte | ModRM | Group, group1), |
3167 | I2bv(DstMem | SrcReg | ModRM, em_test), | 3339 | I2bv(DstMem | SrcReg | ModRM, em_test), |
3168 | I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg), | 3340 | I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), |
3169 | /* 0x88 - 0x8F */ | 3341 | /* 0x88 - 0x8F */ |
3170 | I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), | 3342 | I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov), |
3171 | I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), | 3343 | I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), |
3172 | I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg), | 3344 | I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg), |
3173 | D(ModRM | SrcMem | NoAccess | DstReg), | 3345 | D(ModRM | SrcMem | NoAccess | DstReg), |
3174 | I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm), | 3346 | I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm), |
3175 | G(0, group1A), | 3347 | G(0, group1A), |
@@ -3182,7 +3354,7 @@ static struct opcode opcode_table[256] = { | |||
3182 | II(ImplicitOps | Stack, em_popf, popf), N, N, | 3354 | II(ImplicitOps | Stack, em_popf, popf), N, N, |
3183 | /* 0xA0 - 0xA7 */ | 3355 | /* 0xA0 - 0xA7 */ |
3184 | I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), | 3356 | I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), |
3185 | I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), | 3357 | I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), |
3186 | I2bv(SrcSI | DstDI | Mov | String, em_mov), | 3358 | I2bv(SrcSI | DstDI | Mov | String, em_mov), |
3187 | I2bv(SrcSI | DstDI | String, em_cmp), | 3359 | I2bv(SrcSI | DstDI | String, em_cmp), |
3188 | /* 0xA8 - 0xAF */ | 3360 | /* 0xA8 - 0xAF */ |
@@ -3213,13 +3385,13 @@ static struct opcode opcode_table[256] = { | |||
3213 | /* 0xE0 - 0xE7 */ | 3385 | /* 0xE0 - 0xE7 */ |
3214 | X3(I(SrcImmByte, em_loop)), | 3386 | X3(I(SrcImmByte, em_loop)), |
3215 | I(SrcImmByte, em_jcxz), | 3387 | I(SrcImmByte, em_jcxz), |
3216 | D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), | 3388 | I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), |
3217 | D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), | 3389 | I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), |
3218 | /* 0xE8 - 0xEF */ | 3390 | /* 0xE8 - 0xEF */ |
3219 | D(SrcImm | Stack), D(SrcImm | ImplicitOps), | 3391 | I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps), |
3220 | I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps), | 3392 | I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps), |
3221 | D2bvIP(SrcDX | DstAcc, in, check_perm_in), | 3393 | I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in), |
3222 | D2bvIP(SrcAcc | DstDX, out, check_perm_out), | 3394 | I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out), |
3223 | /* 0xF0 - 0xF7 */ | 3395 | /* 0xF0 - 0xF7 */ |
3224 | N, DI(ImplicitOps, icebp), N, N, | 3396 | N, DI(ImplicitOps, icebp), N, N, |
3225 | DI(ImplicitOps | Priv, hlt), D(ImplicitOps), | 3397 | DI(ImplicitOps | Priv, hlt), D(ImplicitOps), |
@@ -3242,15 +3414,15 @@ static struct opcode twobyte_table[256] = { | |||
3242 | /* 0x20 - 0x2F */ | 3414 | /* 0x20 - 0x2F */ |
3243 | DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), | 3415 | DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), |
3244 | DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), | 3416 | DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), |
3245 | DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write), | 3417 | IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), |
3246 | DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write), | 3418 | IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), |
3247 | N, N, N, N, | 3419 | N, N, N, N, |
3248 | N, N, N, N, N, N, N, N, | 3420 | N, N, N, N, N, N, N, N, |
3249 | /* 0x30 - 0x3F */ | 3421 | /* 0x30 - 0x3F */ |
3250 | DI(ImplicitOps | Priv, wrmsr), | 3422 | II(ImplicitOps | Priv, em_wrmsr, wrmsr), |
3251 | IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), | 3423 | IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), |
3252 | DI(ImplicitOps | Priv, rdmsr), | 3424 | II(ImplicitOps | Priv, em_rdmsr, rdmsr), |
3253 | DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), | 3425 | IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), |
3254 | I(ImplicitOps | VendorSpecific, em_sysenter), | 3426 | I(ImplicitOps | VendorSpecific, em_sysenter), |
3255 | I(ImplicitOps | Priv | VendorSpecific, em_sysexit), | 3427 | I(ImplicitOps | Priv | VendorSpecific, em_sysexit), |
3256 | N, N, | 3428 | N, N, |
@@ -3275,26 +3447,28 @@ static struct opcode twobyte_table[256] = { | |||
3275 | X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), | 3447 | X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), |
3276 | /* 0xA0 - 0xA7 */ | 3448 | /* 0xA0 - 0xA7 */ |
3277 | I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), | 3449 | I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), |
3278 | DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), | 3450 | DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), |
3279 | D(DstMem | SrcReg | Src2ImmByte | ModRM), | 3451 | D(DstMem | SrcReg | Src2ImmByte | ModRM), |
3280 | D(DstMem | SrcReg | Src2CL | ModRM), N, N, | 3452 | D(DstMem | SrcReg | Src2CL | ModRM), N, N, |
3281 | /* 0xA8 - 0xAF */ | 3453 | /* 0xA8 - 0xAF */ |
3282 | I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), | 3454 | I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), |
3283 | DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), | 3455 | DI(ImplicitOps, rsm), |
3456 | I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), | ||
3284 | D(DstMem | SrcReg | Src2ImmByte | ModRM), | 3457 | D(DstMem | SrcReg | Src2ImmByte | ModRM), |
3285 | D(DstMem | SrcReg | Src2CL | ModRM), | 3458 | D(DstMem | SrcReg | Src2CL | ModRM), |
3286 | D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), | 3459 | D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), |
3287 | /* 0xB0 - 0xB7 */ | 3460 | /* 0xB0 - 0xB7 */ |
3288 | D2bv(DstMem | SrcReg | ModRM | Lock), | 3461 | I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), |
3289 | I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), | 3462 | I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), |
3290 | D(DstMem | SrcReg | ModRM | BitOp | Lock), | 3463 | I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), |
3291 | I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), | 3464 | I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), |
3292 | I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), | 3465 | I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), |
3293 | D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), | 3466 | D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), |
3294 | /* 0xB8 - 0xBF */ | 3467 | /* 0xB8 - 0xBF */ |
3295 | N, N, | 3468 | N, N, |
3296 | G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), | 3469 | G(BitOp, group8), |
3297 | D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), | 3470 | I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), |
3471 | I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), | ||
3298 | D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), | 3472 | D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), |
3299 | /* 0xC0 - 0xCF */ | 3473 | /* 0xC0 - 0xCF */ |
3300 | D2bv(DstMem | SrcReg | ModRM | Lock), | 3474 | D2bv(DstMem | SrcReg | ModRM | Lock), |
@@ -3320,6 +3494,7 @@ static struct opcode twobyte_table[256] = { | |||
3320 | #undef D2bv | 3494 | #undef D2bv |
3321 | #undef D2bvIP | 3495 | #undef D2bvIP |
3322 | #undef I2bv | 3496 | #undef I2bv |
3497 | #undef I2bvIP | ||
3323 | #undef I6ALU | 3498 | #undef I6ALU |
3324 | 3499 | ||
3325 | static unsigned imm_size(struct x86_emulate_ctxt *ctxt) | 3500 | static unsigned imm_size(struct x86_emulate_ctxt *ctxt) |
@@ -3697,6 +3872,11 @@ done: | |||
3697 | return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; | 3872 | return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; |
3698 | } | 3873 | } |
3699 | 3874 | ||
3875 | bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt) | ||
3876 | { | ||
3877 | return ctxt->d & PageTable; | ||
3878 | } | ||
3879 | |||
3700 | static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) | 3880 | static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) |
3701 | { | 3881 | { |
3702 | /* The second termination condition only applies for REPE | 3882 | /* The second termination condition only applies for REPE |
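
The new PageTable decode flag marks instructions that guests commonly use to write page-table entries, and x86_page_table_writing_insn() exports that bit to the rest of KVM. A plausible use is on the MMU side: when emulation was triggered by a write fault on a write-protected (shadowed) guest page table and the faulting instruction carries this flag, it can be cheaper to unprotect the gfn and retry the instruction than to emulate it. A hypothetical, stand-alone sketch of that decision (not KVM code; the struct and names are illustrative only):

#include <stdbool.h>
#include <stdio.h>

struct fault {
        bool write_to_shadowed_pt;      /* fault hit a write-protected guest PT        */
        bool insn_is_page_table;        /* what x86_page_table_writing_insn() reports  */
};

enum action { EMULATE, UNPROTECT_AND_RETRY };

static enum action decide(const struct fault *f)
{
        if (f->write_to_shadowed_pt && f->insn_is_page_table)
                return UNPROTECT_AND_RETRY;
        return EMULATE;
}

int main(void)
{
        struct fault f = { .write_to_shadowed_pt = true,
                           .insn_is_page_table = true };

        printf("%s\n", decide(&f) == UNPROTECT_AND_RETRY ?
               "unprotect and retry" : "emulate");
        return 0;
}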
@@ -3720,7 +3900,6 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) | |||
3720 | int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | 3900 | int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) |
3721 | { | 3901 | { |
3722 | struct x86_emulate_ops *ops = ctxt->ops; | 3902 | struct x86_emulate_ops *ops = ctxt->ops; |
3723 | u64 msr_data; | ||
3724 | int rc = X86EMUL_CONTINUE; | 3903 | int rc = X86EMUL_CONTINUE; |
3725 | int saved_dst_type = ctxt->dst.type; | 3904 | int saved_dst_type = ctxt->dst.type; |
3726 | 3905 | ||
@@ -3854,15 +4033,6 @@ special_insn: | |||
3854 | goto cannot_emulate; | 4033 | goto cannot_emulate; |
3855 | ctxt->dst.val = (s32) ctxt->src.val; | 4034 | ctxt->dst.val = (s32) ctxt->src.val; |
3856 | break; | 4035 | break; |
3857 | case 0x6c: /* insb */ | ||
3858 | case 0x6d: /* insw/insd */ | ||
3859 | ctxt->src.val = ctxt->regs[VCPU_REGS_RDX]; | ||
3860 | goto do_io_in; | ||
3861 | case 0x6e: /* outsb */ | ||
3862 | case 0x6f: /* outsw/outsd */ | ||
3863 | ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX]; | ||
3864 | goto do_io_out; | ||
3865 | break; | ||
3866 | case 0x70 ... 0x7f: /* jcc (short) */ | 4036 | case 0x70 ... 0x7f: /* jcc (short) */ |
3867 | if (test_cc(ctxt->b, ctxt->eflags)) | 4037 | if (test_cc(ctxt->b, ctxt->eflags)) |
3868 | jmp_rel(ctxt, ctxt->src.val); | 4038 | jmp_rel(ctxt, ctxt->src.val); |
@@ -3870,9 +4040,6 @@ special_insn: | |||
3870 | case 0x8d: /* lea r16/r32, m */ | 4040 | case 0x8d: /* lea r16/r32, m */ |
3871 | ctxt->dst.val = ctxt->src.addr.mem.ea; | 4041 | ctxt->dst.val = ctxt->src.addr.mem.ea; |
3872 | break; | 4042 | break; |
3873 | case 0x8f: /* pop (sole member of Grp1a) */ | ||
3874 | rc = em_grp1a(ctxt); | ||
3875 | break; | ||
3876 | case 0x90 ... 0x97: /* nop / xchg reg, rax */ | 4043 | case 0x90 ... 0x97: /* nop / xchg reg, rax */ |
3877 | if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) | 4044 | if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) |
3878 | break; | 4045 | break; |
@@ -3905,38 +4072,11 @@ special_insn: | |||
3905 | ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; | 4072 | ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; |
3906 | rc = em_grp2(ctxt); | 4073 | rc = em_grp2(ctxt); |
3907 | break; | 4074 | break; |
3908 | case 0xe4: /* inb */ | ||
3909 | case 0xe5: /* in */ | ||
3910 | goto do_io_in; | ||
3911 | case 0xe6: /* outb */ | ||
3912 | case 0xe7: /* out */ | ||
3913 | goto do_io_out; | ||
3914 | case 0xe8: /* call (near) */ { | ||
3915 | long int rel = ctxt->src.val; | ||
3916 | ctxt->src.val = (unsigned long) ctxt->_eip; | ||
3917 | jmp_rel(ctxt, rel); | ||
3918 | rc = em_push(ctxt); | ||
3919 | break; | ||
3920 | } | ||
3921 | case 0xe9: /* jmp rel */ | 4075 | case 0xe9: /* jmp rel */ |
3922 | case 0xeb: /* jmp rel short */ | 4076 | case 0xeb: /* jmp rel short */ |
3923 | jmp_rel(ctxt, ctxt->src.val); | 4077 | jmp_rel(ctxt, ctxt->src.val); |
3924 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ | 4078 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ |
3925 | break; | 4079 | break; |
3926 | case 0xec: /* in al,dx */ | ||
3927 | case 0xed: /* in (e/r)ax,dx */ | ||
3928 | do_io_in: | ||
3929 | if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val, | ||
3930 | &ctxt->dst.val)) | ||
3931 | goto done; /* IO is needed */ | ||
3932 | break; | ||
3933 | case 0xee: /* out dx,al */ | ||
3934 | case 0xef: /* out dx,(e/r)ax */ | ||
3935 | do_io_out: | ||
3936 | ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val, | ||
3937 | &ctxt->src.val, 1); | ||
3938 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ | ||
3939 | break; | ||
3940 | case 0xf4: /* hlt */ | 4080 | case 0xf4: /* hlt */ |
3941 | ctxt->ops->halt(ctxt); | 4081 | ctxt->ops->halt(ctxt); |
3942 | break; | 4082 | break; |
@@ -3956,12 +4096,6 @@ special_insn: | |||
3956 | case 0xfd: /* std */ | 4096 | case 0xfd: /* std */ |
3957 | ctxt->eflags |= EFLG_DF; | 4097 | ctxt->eflags |= EFLG_DF; |
3958 | break; | 4098 | break; |
3959 | case 0xfe: /* Grp4 */ | ||
3960 | rc = em_grp45(ctxt); | ||
3961 | break; | ||
3962 | case 0xff: /* Grp5 */ | ||
3963 | rc = em_grp45(ctxt); | ||
3964 | break; | ||
3965 | default: | 4099 | default: |
3966 | goto cannot_emulate; | 4100 | goto cannot_emulate; |
3967 | } | 4101 | } |
@@ -4036,49 +4170,6 @@ twobyte_insn: | |||
4036 | case 0x21: /* mov from dr to reg */ | 4170 | case 0x21: /* mov from dr to reg */ |
4037 | ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); | 4171 | ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); |
4038 | break; | 4172 | break; |
4039 | case 0x22: /* mov reg, cr */ | ||
4040 | if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) { | ||
4041 | emulate_gp(ctxt, 0); | ||
4042 | rc = X86EMUL_PROPAGATE_FAULT; | ||
4043 | goto done; | ||
4044 | } | ||
4045 | ctxt->dst.type = OP_NONE; | ||
4046 | break; | ||
4047 | case 0x23: /* mov from reg to dr */ | ||
4048 | if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val & | ||
4049 | ((ctxt->mode == X86EMUL_MODE_PROT64) ? | ||
4050 | ~0ULL : ~0U)) < 0) { | ||
4051 | /* #UD condition is already handled by the code above */ | ||
4052 | emulate_gp(ctxt, 0); | ||
4053 | rc = X86EMUL_PROPAGATE_FAULT; | ||
4054 | goto done; | ||
4055 | } | ||
4056 | |||
4057 | ctxt->dst.type = OP_NONE; /* no writeback */ | ||
4058 | break; | ||
4059 | case 0x30: | ||
4060 | /* wrmsr */ | ||
4061 | msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] | ||
4062 | | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); | ||
4063 | if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) { | ||
4064 | emulate_gp(ctxt, 0); | ||
4065 | rc = X86EMUL_PROPAGATE_FAULT; | ||
4066 | goto done; | ||
4067 | } | ||
4068 | rc = X86EMUL_CONTINUE; | ||
4069 | break; | ||
4070 | case 0x32: | ||
4071 | /* rdmsr */ | ||
4072 | if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) { | ||
4073 | emulate_gp(ctxt, 0); | ||
4074 | rc = X86EMUL_PROPAGATE_FAULT; | ||
4075 | goto done; | ||
4076 | } else { | ||
4077 | ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; | ||
4078 | ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; | ||
4079 | } | ||
4080 | rc = X86EMUL_CONTINUE; | ||
4081 | break; | ||
4082 | case 0x40 ... 0x4f: /* cmov */ | 4173 | case 0x40 ... 0x4f: /* cmov */ |
4083 | ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val; | 4174 | ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val; |
4084 | if (!test_cc(ctxt->b, ctxt->eflags)) | 4175 | if (!test_cc(ctxt->b, ctxt->eflags)) |
@@ -4091,93 +4182,21 @@ twobyte_insn: | |||
4091 | case 0x90 ... 0x9f: /* setcc r/m8 */ | 4182 | case 0x90 ... 0x9f: /* setcc r/m8 */ |
4092 | ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); | 4183 | ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); |
4093 | break; | 4184 | break; |
4094 | case 0xa3: | ||
4095 | bt: /* bt */ | ||
4096 | ctxt->dst.type = OP_NONE; | ||
4097 | /* only subword offset */ | ||
4098 | ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; | ||
4099 | emulate_2op_SrcV_nobyte(ctxt, "bt"); | ||
4100 | break; | ||
4101 | case 0xa4: /* shld imm8, r, r/m */ | 4185 | case 0xa4: /* shld imm8, r, r/m */ |
4102 | case 0xa5: /* shld cl, r, r/m */ | 4186 | case 0xa5: /* shld cl, r, r/m */ |
4103 | emulate_2op_cl(ctxt, "shld"); | 4187 | emulate_2op_cl(ctxt, "shld"); |
4104 | break; | 4188 | break; |
4105 | case 0xab: | ||
4106 | bts: /* bts */ | ||
4107 | emulate_2op_SrcV_nobyte(ctxt, "bts"); | ||
4108 | break; | ||
4109 | case 0xac: /* shrd imm8, r, r/m */ | 4189 | case 0xac: /* shrd imm8, r, r/m */ |
4110 | case 0xad: /* shrd cl, r, r/m */ | 4190 | case 0xad: /* shrd cl, r, r/m */ |
4111 | emulate_2op_cl(ctxt, "shrd"); | 4191 | emulate_2op_cl(ctxt, "shrd"); |
4112 | break; | 4192 | break; |
4113 | case 0xae: /* clflush */ | 4193 | case 0xae: /* clflush */ |
4114 | break; | 4194 | break; |
4115 | case 0xb0 ... 0xb1: /* cmpxchg */ | ||
4116 | /* | ||
4117 | * Save real source value, then compare EAX against | ||
4118 | * destination. | ||
4119 | */ | ||
4120 | ctxt->src.orig_val = ctxt->src.val; | ||
4121 | ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; | ||
4122 | emulate_2op_SrcV(ctxt, "cmp"); | ||
4123 | if (ctxt->eflags & EFLG_ZF) { | ||
4124 | /* Success: write back to memory. */ | ||
4125 | ctxt->dst.val = ctxt->src.orig_val; | ||
4126 | } else { | ||
4127 | /* Failure: write the value we saw to EAX. */ | ||
4128 | ctxt->dst.type = OP_REG; | ||
4129 | ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; | ||
4130 | } | ||
4131 | break; | ||
4132 | case 0xb3: | ||
4133 | btr: /* btr */ | ||
4134 | emulate_2op_SrcV_nobyte(ctxt, "btr"); | ||
4135 | break; | ||
4136 | case 0xb6 ... 0xb7: /* movzx */ | 4195 | case 0xb6 ... 0xb7: /* movzx */ |
4137 | ctxt->dst.bytes = ctxt->op_bytes; | 4196 | ctxt->dst.bytes = ctxt->op_bytes; |
4138 | ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val | 4197 | ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val |
4139 | : (u16) ctxt->src.val; | 4198 | : (u16) ctxt->src.val; |
4140 | break; | 4199 | break; |
4141 | case 0xba: /* Grp8 */ | ||
4142 | switch (ctxt->modrm_reg & 3) { | ||
4143 | case 0: | ||
4144 | goto bt; | ||
4145 | case 1: | ||
4146 | goto bts; | ||
4147 | case 2: | ||
4148 | goto btr; | ||
4149 | case 3: | ||
4150 | goto btc; | ||
4151 | } | ||
4152 | break; | ||
4153 | case 0xbb: | ||
4154 | btc: /* btc */ | ||
4155 | emulate_2op_SrcV_nobyte(ctxt, "btc"); | ||
4156 | break; | ||
4157 | case 0xbc: { /* bsf */ | ||
4158 | u8 zf; | ||
4159 | __asm__ ("bsf %2, %0; setz %1" | ||
4160 | : "=r"(ctxt->dst.val), "=q"(zf) | ||
4161 | : "r"(ctxt->src.val)); | ||
4162 | ctxt->eflags &= ~X86_EFLAGS_ZF; | ||
4163 | if (zf) { | ||
4164 | ctxt->eflags |= X86_EFLAGS_ZF; | ||
4165 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ | ||
4166 | } | ||
4167 | break; | ||
4168 | } | ||
4169 | case 0xbd: { /* bsr */ | ||
4170 | u8 zf; | ||
4171 | __asm__ ("bsr %2, %0; setz %1" | ||
4172 | : "=r"(ctxt->dst.val), "=q"(zf) | ||
4173 | : "r"(ctxt->src.val)); | ||
4174 | ctxt->eflags &= ~X86_EFLAGS_ZF; | ||
4175 | if (zf) { | ||
4176 | ctxt->eflags |= X86_EFLAGS_ZF; | ||
4177 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ | ||
4178 | } | ||
4179 | break; | ||
4180 | } | ||
4181 | case 0xbe ... 0xbf: /* movsx */ | 4200 | case 0xbe ... 0xbf: /* movsx */ |
4182 | ctxt->dst.bytes = ctxt->op_bytes; | 4201 | ctxt->dst.bytes = ctxt->op_bytes; |
4183 | ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : | 4202 | ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : |
@@ -4194,9 +4213,6 @@ twobyte_insn: | |||
4194 | ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : | 4213 | ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : |
4195 | (u64) ctxt->src.val; | 4214 | (u64) ctxt->src.val; |
4196 | break; | 4215 | break; |
4197 | case 0xc7: /* Grp9 (cmpxchg8b) */ | ||
4198 | rc = em_grp9(ctxt); | ||
4199 | break; | ||
4200 | default: | 4216 | default: |
4201 | goto cannot_emulate; | 4217 | goto cannot_emulate; |
4202 | } | 4218 | } |
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 405f2620392f..d68f99df690c 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -344,7 +344,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) | |||
344 | struct kvm_timer *pt = &ps->pit_timer; | 344 | struct kvm_timer *pt = &ps->pit_timer; |
345 | s64 interval; | 345 | s64 interval; |
346 | 346 | ||
347 | if (!irqchip_in_kernel(kvm)) | 347 | if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) |
348 | return; | 348 | return; |
349 | 349 | ||
350 | interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); | 350 | interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); |
@@ -397,15 +397,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) | |||
397 | case 1: | 397 | case 1: |
398 | /* FIXME: enhance mode 4 precision */ | 398 | /* FIXME: enhance mode 4 precision */ |
399 | case 4: | 399 | case 4: |
400 | if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { | 400 | create_pit_timer(kvm, val, 0); |
401 | create_pit_timer(kvm, val, 0); | ||
402 | } | ||
403 | break; | 401 | break; |
404 | case 2: | 402 | case 2: |
405 | case 3: | 403 | case 3: |
406 | if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ | 404 | create_pit_timer(kvm, val, 1); |
407 | create_pit_timer(kvm, val, 1); | ||
408 | } | ||
409 | break; | 405 | break; |
410 | default: | 406 | default: |
411 | destroy_pit_timer(kvm->arch.vpit); | 407 | destroy_pit_timer(kvm->arch.vpit); |
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index cac4746d7ffb..b6a73537e1ef 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -262,9 +262,10 @@ int kvm_pic_read_irq(struct kvm *kvm) | |||
262 | 262 | ||
263 | void kvm_pic_reset(struct kvm_kpic_state *s) | 263 | void kvm_pic_reset(struct kvm_kpic_state *s) |
264 | { | 264 | { |
265 | int irq; | 265 | int irq, i; |
266 | struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu; | 266 | struct kvm_vcpu *vcpu; |
267 | u8 irr = s->irr, isr = s->imr; | 267 | u8 irr = s->irr, isr = s->imr; |
268 | bool found = false; | ||
268 | 269 | ||
269 | s->last_irr = 0; | 270 | s->last_irr = 0; |
270 | s->irr = 0; | 271 | s->irr = 0; |
@@ -281,12 +282,19 @@ void kvm_pic_reset(struct kvm_kpic_state *s) | |||
281 | s->special_fully_nested_mode = 0; | 282 | s->special_fully_nested_mode = 0; |
282 | s->init4 = 0; | 283 | s->init4 = 0; |
283 | 284 | ||
284 | for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { | 285 | kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) |
285 | if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) | 286 | if (kvm_apic_accept_pic_intr(vcpu)) { |
286 | if (irr & (1 << irq) || isr & (1 << irq)) { | 287 | found = true; |
287 | pic_clear_isr(s, irq); | 288 | break; |
288 | } | 289 | } |
289 | } | 290 | |
291 | |||
292 | if (!found) | ||
293 | return; | ||
294 | |||
295 | for (irq = 0; irq < PIC_NUM_PINS/2; irq++) | ||
296 | if (irr & (1 << irq) || isr & (1 << irq)) | ||
297 | pic_clear_isr(s, irq); | ||
290 | } | 298 | } |
291 | 299 | ||
292 | static void pic_ioport_write(void *opaque, u32 addr, u32 val) | 300 | static void pic_ioport_write(void *opaque, u32 addr, u32 val) |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 54abb40199d6..cfdc6e0ef002 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -38,6 +38,7 @@ | |||
38 | #include "irq.h" | 38 | #include "irq.h" |
39 | #include "trace.h" | 39 | #include "trace.h" |
40 | #include "x86.h" | 40 | #include "x86.h" |
41 | #include "cpuid.h" | ||
41 | 42 | ||
42 | #ifndef CONFIG_X86_64 | 43 | #ifndef CONFIG_X86_64 |
43 | #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) | 44 | #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) |
@@ -1120,7 +1121,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) | |||
1120 | return 0; | 1121 | return 0; |
1121 | } | 1122 | } |
1122 | 1123 | ||
1123 | static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) | 1124 | int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) |
1124 | { | 1125 | { |
1125 | u32 reg = apic_get_reg(apic, lvt_type); | 1126 | u32 reg = apic_get_reg(apic, lvt_type); |
1126 | int vector, mode, trig_mode; | 1127 | int vector, mode, trig_mode; |
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 138e8cc6fea6..6f4ce2575d09 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -34,6 +34,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); | |||
34 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); | 34 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); |
35 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); | 35 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); |
36 | int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); | 36 | int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); |
37 | int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); | ||
37 | 38 | ||
38 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); | 39 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); |
39 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); | 40 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f1b36cf3e3d0..2a2a9b40db19 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -59,15 +59,6 @@ enum { | |||
59 | AUDIT_POST_SYNC | 59 | AUDIT_POST_SYNC |
60 | }; | 60 | }; |
61 | 61 | ||
62 | char *audit_point_name[] = { | ||
63 | "pre page fault", | ||
64 | "post page fault", | ||
65 | "pre pte write", | ||
66 | "post pte write", | ||
67 | "pre sync", | ||
68 | "post sync" | ||
69 | }; | ||
70 | |||
71 | #undef MMU_DEBUG | 62 | #undef MMU_DEBUG |
72 | 63 | ||
73 | #ifdef MMU_DEBUG | 64 | #ifdef MMU_DEBUG |
@@ -87,9 +78,6 @@ static int dbg = 0; | |||
87 | module_param(dbg, bool, 0644); | 78 | module_param(dbg, bool, 0644); |
88 | #endif | 79 | #endif |
89 | 80 | ||
90 | static int oos_shadow = 1; | ||
91 | module_param(oos_shadow, bool, 0644); | ||
92 | |||
93 | #ifndef MMU_DEBUG | 81 | #ifndef MMU_DEBUG |
94 | #define ASSERT(x) do { } while (0) | 82 | #define ASSERT(x) do { } while (0) |
95 | #else | 83 | #else |
@@ -593,6 +581,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | |||
593 | return 0; | 581 | return 0; |
594 | } | 582 | } |
595 | 583 | ||
584 | static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) | ||
585 | { | ||
586 | return cache->nobjs; | ||
587 | } | ||
588 | |||
596 | static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, | 589 | static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, |
597 | struct kmem_cache *cache) | 590 | struct kmem_cache *cache) |
598 | { | 591 | { |
@@ -953,21 +946,35 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) | |||
953 | } | 946 | } |
954 | } | 947 | } |
955 | 948 | ||
949 | static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level, | ||
950 | struct kvm_memory_slot *slot) | ||
951 | { | ||
952 | struct kvm_lpage_info *linfo; | ||
953 | |||
954 | if (likely(level == PT_PAGE_TABLE_LEVEL)) | ||
955 | return &slot->rmap[gfn - slot->base_gfn]; | ||
956 | |||
957 | linfo = lpage_info_slot(gfn, slot, level); | ||
958 | return &linfo->rmap_pde; | ||
959 | } | ||
960 | |||
956 | /* | 961 | /* |
957 | * Take gfn and return the reverse mapping to it. | 962 | * Take gfn and return the reverse mapping to it. |
958 | */ | 963 | */ |
959 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) | 964 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) |
960 | { | 965 | { |
961 | struct kvm_memory_slot *slot; | 966 | struct kvm_memory_slot *slot; |
962 | struct kvm_lpage_info *linfo; | ||
963 | 967 | ||
964 | slot = gfn_to_memslot(kvm, gfn); | 968 | slot = gfn_to_memslot(kvm, gfn); |
965 | if (likely(level == PT_PAGE_TABLE_LEVEL)) | 969 | return __gfn_to_rmap(kvm, gfn, level, slot); |
966 | return &slot->rmap[gfn - slot->base_gfn]; | 970 | } |
967 | 971 | ||
968 | linfo = lpage_info_slot(gfn, slot, level); | 972 | static bool rmap_can_add(struct kvm_vcpu *vcpu) |
973 | { | ||
974 | struct kvm_mmu_memory_cache *cache; | ||
969 | 975 | ||
970 | return &linfo->rmap_pde; | 976 | cache = &vcpu->arch.mmu_pte_list_desc_cache; |
977 | return mmu_memory_cache_free_objects(cache); | ||
971 | } | 978 | } |
972 | 979 | ||
973 | static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | 980 | static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) |
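
__gfn_to_rmap() above takes the memory slot explicitly so callers that already hold the slot avoid a second gfn_to_memslot() lookup; for 4K pages the rmap head is simply an array index off the slot's base gfn. A stand-alone sketch of that indexing (not KVM code; the struct is a hypothetical stand-in for kvm_memory_slot):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t gfn_t;

struct memslot {
        gfn_t base_gfn;
        unsigned long npages;
        unsigned long *rmap;            /* one rmap head per page in the slot */
};

static unsigned long *gfn_to_rmap_4k(struct memslot *slot, gfn_t gfn)
{
        return &slot->rmap[gfn - slot->base_gfn];
}

int main(void)
{
        unsigned long rmap[16] = { 0 };
        struct memslot slot = { .base_gfn = 0x100, .npages = 16, .rmap = rmap };

        unsigned long *head = gfn_to_rmap_4k(&slot, 0x105);
        printf("rmap slot index: %td\n", head - rmap);   /* prints 5 */
        return 0;
}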
@@ -1004,17 +1011,16 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) | |||
1004 | rmap_remove(kvm, sptep); | 1011 | rmap_remove(kvm, sptep); |
1005 | } | 1012 | } |
1006 | 1013 | ||
1007 | static int rmap_write_protect(struct kvm *kvm, u64 gfn) | 1014 | int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, |
1015 | struct kvm_memory_slot *slot) | ||
1008 | { | 1016 | { |
1009 | unsigned long *rmapp; | 1017 | unsigned long *rmapp; |
1010 | u64 *spte; | 1018 | u64 *spte; |
1011 | int i, write_protected = 0; | 1019 | int i, write_protected = 0; |
1012 | 1020 | ||
1013 | rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); | 1021 | rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot); |
1014 | |||
1015 | spte = rmap_next(kvm, rmapp, NULL); | 1022 | spte = rmap_next(kvm, rmapp, NULL); |
1016 | while (spte) { | 1023 | while (spte) { |
1017 | BUG_ON(!spte); | ||
1018 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 1024 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
1019 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); | 1025 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); |
1020 | if (is_writable_pte(*spte)) { | 1026 | if (is_writable_pte(*spte)) { |
@@ -1027,12 +1033,11 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
1027 | /* check for huge page mappings */ | 1033 | /* check for huge page mappings */ |
1028 | for (i = PT_DIRECTORY_LEVEL; | 1034 | for (i = PT_DIRECTORY_LEVEL; |
1029 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 1035 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
1030 | rmapp = gfn_to_rmap(kvm, gfn, i); | 1036 | rmapp = __gfn_to_rmap(kvm, gfn, i, slot); |
1031 | spte = rmap_next(kvm, rmapp, NULL); | 1037 | spte = rmap_next(kvm, rmapp, NULL); |
1032 | while (spte) { | 1038 | while (spte) { |
1033 | BUG_ON(!spte); | ||
1034 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 1039 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
1035 | BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); | 1040 | BUG_ON(!is_large_pte(*spte)); |
1036 | pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); | 1041 | pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); |
1037 | if (is_writable_pte(*spte)) { | 1042 | if (is_writable_pte(*spte)) { |
1038 | drop_spte(kvm, spte); | 1043 | drop_spte(kvm, spte); |
@@ -1047,6 +1052,14 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
1047 | return write_protected; | 1052 | return write_protected; |
1048 | } | 1053 | } |
1049 | 1054 | ||
1055 | static int rmap_write_protect(struct kvm *kvm, u64 gfn) | ||
1056 | { | ||
1057 | struct kvm_memory_slot *slot; | ||
1058 | |||
1059 | slot = gfn_to_memslot(kvm, gfn); | ||
1060 | return kvm_mmu_rmap_write_protect(kvm, gfn, slot); | ||
1061 | } | ||
1062 | |||
1050 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, | 1063 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, |
1051 | unsigned long data) | 1064 | unsigned long data) |
1052 | { | 1065 | { |
@@ -1103,15 +1116,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | |||
1103 | int (*handler)(struct kvm *kvm, unsigned long *rmapp, | 1116 | int (*handler)(struct kvm *kvm, unsigned long *rmapp, |
1104 | unsigned long data)) | 1117 | unsigned long data)) |
1105 | { | 1118 | { |
1106 | int i, j; | 1119 | int j; |
1107 | int ret; | 1120 | int ret; |
1108 | int retval = 0; | 1121 | int retval = 0; |
1109 | struct kvm_memslots *slots; | 1122 | struct kvm_memslots *slots; |
1123 | struct kvm_memory_slot *memslot; | ||
1110 | 1124 | ||
1111 | slots = kvm_memslots(kvm); | 1125 | slots = kvm_memslots(kvm); |
1112 | 1126 | ||
1113 | for (i = 0; i < slots->nmemslots; i++) { | 1127 | kvm_for_each_memslot(memslot, slots) { |
1114 | struct kvm_memory_slot *memslot = &slots->memslots[i]; | ||
1115 | unsigned long start = memslot->userspace_addr; | 1128 | unsigned long start = memslot->userspace_addr; |
1116 | unsigned long end; | 1129 | unsigned long end; |
1117 | 1130 | ||
@@ -1324,7 +1337,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | |||
1324 | PAGE_SIZE); | 1337 | PAGE_SIZE); |
1325 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | 1338 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); |
1326 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | 1339 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); |
1327 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); | 1340 | bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); |
1328 | sp->parent_ptes = 0; | 1341 | sp->parent_ptes = 0; |
1329 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | 1342 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); |
1330 | kvm_mod_used_mmu_pages(vcpu->kvm, +1); | 1343 | kvm_mod_used_mmu_pages(vcpu->kvm, +1); |
@@ -1511,6 +1524,13 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, | |||
1511 | return ret; | 1524 | return ret; |
1512 | } | 1525 | } |
1513 | 1526 | ||
1527 | #ifdef CONFIG_KVM_MMU_AUDIT | ||
1528 | #include "mmu_audit.c" | ||
1529 | #else | ||
1530 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } | ||
1531 | static void mmu_audit_disable(void) { } | ||
1532 | #endif | ||
1533 | |||
1514 | static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | 1534 | static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, |
1515 | struct list_head *invalid_list) | 1535 | struct list_head *invalid_list) |
1516 | { | 1536 | { |
@@ -1640,6 +1660,18 @@ static void init_shadow_page_table(struct kvm_mmu_page *sp) | |||
1640 | sp->spt[i] = 0ull; | 1660 | sp->spt[i] = 0ull; |
1641 | } | 1661 | } |
1642 | 1662 | ||
1663 | static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) | ||
1664 | { | ||
1665 | sp->write_flooding_count = 0; | ||
1666 | } | ||
1667 | |||
1668 | static void clear_sp_write_flooding_count(u64 *spte) | ||
1669 | { | ||
1670 | struct kvm_mmu_page *sp = page_header(__pa(spte)); | ||
1671 | |||
1672 | __clear_sp_write_flooding_count(sp); | ||
1673 | } | ||
1674 | |||
1643 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | 1675 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, |
1644 | gfn_t gfn, | 1676 | gfn_t gfn, |
1645 | gva_t gaddr, | 1677 | gva_t gaddr, |
@@ -1683,6 +1715,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1683 | } else if (sp->unsync) | 1715 | } else if (sp->unsync) |
1684 | kvm_mmu_mark_parents_unsync(sp); | 1716 | kvm_mmu_mark_parents_unsync(sp); |
1685 | 1717 | ||
1718 | __clear_sp_write_flooding_count(sp); | ||
1686 | trace_kvm_mmu_get_page(sp, false); | 1719 | trace_kvm_mmu_get_page(sp, false); |
1687 | return sp; | 1720 | return sp; |
1688 | } | 1721 | } |
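
kvm_mmu_get_page() now clears a per-shadow-page write_flooding_count whenever the page is fetched as a page table again, alongside the removal of the per-vcpu last_pte_updated tracking further down. A hypothetical, stand-alone sketch of a write-flooding heuristic of this shape (not KVM code; the threshold and names are made up):

#include <stdbool.h>
#include <stdio.h>

struct shadow_page {
        unsigned int write_flooding_count;
};

static void page_fetched(struct shadow_page *sp)
{
        sp->write_flooding_count = 0;   /* __clear_sp_write_flooding_count() */
}

static bool note_emulated_write(struct shadow_page *sp)
{
        /* true once the page looks like plain data rather than a page table */
        return ++sp->write_flooding_count >= 3;
}

int main(void)
{
        struct shadow_page sp = { 0 };
        int i;

        for (i = 0; i < 4; i++)
                if (note_emulated_write(&sp))
                        printf("write %d: zap this shadow page\n", i + 1);

        page_fetched(&sp);              /* used as a page table again */
        printf("count after fetch: %u\n", sp.write_flooding_count);
        return 0;
}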
@@ -1796,7 +1829,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1796 | } | 1829 | } |
1797 | } | 1830 | } |
1798 | 1831 | ||
1799 | static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, | 1832 | static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, |
1800 | u64 *spte) | 1833 | u64 *spte) |
1801 | { | 1834 | { |
1802 | u64 pte; | 1835 | u64 pte; |
@@ -1804,17 +1837,21 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, | |||
1804 | 1837 | ||
1805 | pte = *spte; | 1838 | pte = *spte; |
1806 | if (is_shadow_present_pte(pte)) { | 1839 | if (is_shadow_present_pte(pte)) { |
1807 | if (is_last_spte(pte, sp->role.level)) | 1840 | if (is_last_spte(pte, sp->role.level)) { |
1808 | drop_spte(kvm, spte); | 1841 | drop_spte(kvm, spte); |
1809 | else { | 1842 | if (is_large_pte(pte)) |
1843 | --kvm->stat.lpages; | ||
1844 | } else { | ||
1810 | child = page_header(pte & PT64_BASE_ADDR_MASK); | 1845 | child = page_header(pte & PT64_BASE_ADDR_MASK); |
1811 | drop_parent_pte(child, spte); | 1846 | drop_parent_pte(child, spte); |
1812 | } | 1847 | } |
1813 | } else if (is_mmio_spte(pte)) | 1848 | return true; |
1849 | } | ||
1850 | |||
1851 | if (is_mmio_spte(pte)) | ||
1814 | mmu_spte_clear_no_track(spte); | 1852 | mmu_spte_clear_no_track(spte); |
1815 | 1853 | ||
1816 | if (is_large_pte(pte)) | 1854 | return false; |
1817 | --kvm->stat.lpages; | ||
1818 | } | 1855 | } |
1819 | 1856 | ||
1820 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | 1857 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, |
@@ -1831,15 +1868,6 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) | |||
1831 | mmu_page_remove_parent_pte(sp, parent_pte); | 1868 | mmu_page_remove_parent_pte(sp, parent_pte); |
1832 | } | 1869 | } |
1833 | 1870 | ||
1834 | static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) | ||
1835 | { | ||
1836 | int i; | ||
1837 | struct kvm_vcpu *vcpu; | ||
1838 | |||
1839 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
1840 | vcpu->arch.last_pte_updated = NULL; | ||
1841 | } | ||
1842 | |||
1843 | static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) | 1871 | static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) |
1844 | { | 1872 | { |
1845 | u64 *parent_pte; | 1873 | u64 *parent_pte; |
@@ -1899,7 +1927,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | |||
1899 | } | 1927 | } |
1900 | 1928 | ||
1901 | sp->role.invalid = 1; | 1929 | sp->role.invalid = 1; |
1902 | kvm_mmu_reset_last_pte_updated(kvm); | ||
1903 | return ret; | 1930 | return ret; |
1904 | } | 1931 | } |
1905 | 1932 | ||
@@ -1985,7 +2012,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) | |||
1985 | kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; | 2012 | kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; |
1986 | } | 2013 | } |
1987 | 2014 | ||
1988 | static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | 2015 | int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) |
1989 | { | 2016 | { |
1990 | struct kvm_mmu_page *sp; | 2017 | struct kvm_mmu_page *sp; |
1991 | struct hlist_node *node; | 2018 | struct hlist_node *node; |
@@ -1994,7 +2021,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | |||
1994 | 2021 | ||
1995 | pgprintk("%s: looking for gfn %llx\n", __func__, gfn); | 2022 | pgprintk("%s: looking for gfn %llx\n", __func__, gfn); |
1996 | r = 0; | 2023 | r = 0; |
1997 | 2024 | spin_lock(&kvm->mmu_lock); | |
1998 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { | 2025 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { |
1999 | pgprintk("%s: gfn %llx role %x\n", __func__, gfn, | 2026 | pgprintk("%s: gfn %llx role %x\n", __func__, gfn, |
2000 | sp->role.word); | 2027 | sp->role.word); |
@@ -2002,22 +2029,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | |||
2002 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); | 2029 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); |
2003 | } | 2030 | } |
2004 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | 2031 | kvm_mmu_commit_zap_page(kvm, &invalid_list); |
2005 | return r; | 2032 | spin_unlock(&kvm->mmu_lock); |
2006 | } | ||
2007 | |||
2008 | static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) | ||
2009 | { | ||
2010 | struct kvm_mmu_page *sp; | ||
2011 | struct hlist_node *node; | ||
2012 | LIST_HEAD(invalid_list); | ||
2013 | 2033 | ||
2014 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { | 2034 | return r; |
2015 | pgprintk("%s: zap %llx %x\n", | ||
2016 | __func__, gfn, sp->role.word); | ||
2017 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); | ||
2018 | } | ||
2019 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | ||
2020 | } | 2035 | } |
2036 | EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); | ||
2021 | 2037 | ||
2022 | static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | 2038 | static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) |
2023 | { | 2039 | { |
@@ -2169,8 +2185,6 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
2169 | return 1; | 2185 | return 1; |
2170 | 2186 | ||
2171 | if (!need_unsync && !s->unsync) { | 2187 | if (!need_unsync && !s->unsync) { |
2172 | if (!oos_shadow) | ||
2173 | return 1; | ||
2174 | need_unsync = true; | 2188 | need_unsync = true; |
2175 | } | 2189 | } |
2176 | } | 2190 | } |
@@ -2191,11 +2205,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2191 | if (set_mmio_spte(sptep, gfn, pfn, pte_access)) | 2205 | if (set_mmio_spte(sptep, gfn, pfn, pte_access)) |
2192 | return 0; | 2206 | return 0; |
2193 | 2207 | ||
2194 | /* | ||
2195 | * We don't set the accessed bit, since we sometimes want to see | ||
2196 | * whether the guest actually used the pte (in order to detect | ||
2197 | * demand paging). | ||
2198 | */ | ||
2199 | spte = PT_PRESENT_MASK; | 2208 | spte = PT_PRESENT_MASK; |
2200 | if (!speculative) | 2209 | if (!speculative) |
2201 | spte |= shadow_accessed_mask; | 2210 | spte |= shadow_accessed_mask; |
@@ -2346,10 +2355,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2346 | } | 2355 | } |
2347 | } | 2356 | } |
2348 | kvm_release_pfn_clean(pfn); | 2357 | kvm_release_pfn_clean(pfn); |
2349 | if (speculative) { | ||
2350 | vcpu->arch.last_pte_updated = sptep; | ||
2351 | vcpu->arch.last_pte_gfn = gfn; | ||
2352 | } | ||
2353 | } | 2358 | } |
2354 | 2359 | ||
2355 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | 2360 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) |
@@ -2840,12 +2845,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2840 | return; | 2845 | return; |
2841 | 2846 | ||
2842 | vcpu_clear_mmio_info(vcpu, ~0ul); | 2847 | vcpu_clear_mmio_info(vcpu, ~0ul); |
2843 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); | 2848 | kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); |
2844 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | 2849 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { |
2845 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2850 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2846 | sp = page_header(root); | 2851 | sp = page_header(root); |
2847 | mmu_sync_children(vcpu, sp); | 2852 | mmu_sync_children(vcpu, sp); |
2848 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); | 2853 | kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); |
2849 | return; | 2854 | return; |
2850 | } | 2855 | } |
2851 | for (i = 0; i < 4; ++i) { | 2856 | for (i = 0; i < 4; ++i) { |
@@ -2857,7 +2862,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2857 | mmu_sync_children(vcpu, sp); | 2862 | mmu_sync_children(vcpu, sp); |
2858 | } | 2863 | } |
2859 | } | 2864 | } |
2860 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); | 2865 | kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); |
2861 | } | 2866 | } |
2862 | 2867 | ||
2863 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | 2868 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) |
@@ -3510,28 +3515,119 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, | |||
3510 | kvm_mmu_flush_tlb(vcpu); | 3515 | kvm_mmu_flush_tlb(vcpu); |
3511 | } | 3516 | } |
3512 | 3517 | ||
3513 | static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) | 3518 | static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, |
3519 | const u8 *new, int *bytes) | ||
3514 | { | 3520 | { |
3515 | u64 *spte = vcpu->arch.last_pte_updated; | 3521 | u64 gentry; |
3522 | int r; | ||
3523 | |||
3524 | /* | ||
3525 | * Assume that the pte write is on a page table of the same type | ||
3526 | * as the current vcpu paging mode, since we update the sptes only | ||
3527 | * when they have the same mode. | ||
3528 | */ | ||
3529 | if (is_pae(vcpu) && *bytes == 4) { | ||
3530 | /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ | ||
3531 | *gpa &= ~(gpa_t)7; | ||
3532 | *bytes = 8; | ||
3533 | r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8)); | ||
3534 | if (r) | ||
3535 | gentry = 0; | ||
3536 | new = (const u8 *)&gentry; | ||
3537 | } | ||
3516 | 3538 | ||
3517 | return !!(spte && (*spte & shadow_accessed_mask)); | 3539 | switch (*bytes) { |
3540 | case 4: | ||
3541 | gentry = *(const u32 *)new; | ||
3542 | break; | ||
3543 | case 8: | ||
3544 | gentry = *(const u64 *)new; | ||
3545 | break; | ||
3546 | default: | ||
3547 | gentry = 0; | ||
3548 | break; | ||
3549 | } | ||
3550 | |||
3551 | return gentry; | ||
3518 | } | 3552 | } |
3519 | 3553 | ||
3520 | static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) | 3554 | /* |
3555 | * If we're seeing too many writes to a page, it may no longer be a page table, | ||
3556 | * or we may be forking, in which case it is better to unmap the page. | ||
3557 | */ | ||
3558 | static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte) | ||
3521 | { | 3559 | { |
3522 | u64 *spte = vcpu->arch.last_pte_updated; | 3560 | /* |
3561 | * Skip write-flooding detection for the sp whose level is 1, because | ||
3562 | * it can become unsync, and then the guest page is not write-protected. | ||
3563 | */ | ||
3564 | if (sp->role.level == 1) | ||
3565 | return false; | ||
3523 | 3566 | ||
3524 | if (spte | 3567 | return ++sp->write_flooding_count >= 3; |
3525 | && vcpu->arch.last_pte_gfn == gfn | 3568 | } |
3526 | && shadow_accessed_mask | 3569 | |
3527 | && !(*spte & shadow_accessed_mask) | 3570 | /* |
3528 | && is_shadow_present_pte(*spte)) | 3571 | * Misaligned accesses are too much trouble to fix up; also, they usually |
3529 | set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); | 3572 | * indicate a page is not used as a page table. |
3573 | */ | ||
3574 | static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, | ||
3575 | int bytes) | ||
3576 | { | ||
3577 | unsigned offset, pte_size, misaligned; | ||
3578 | |||
3579 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | ||
3580 | gpa, bytes, sp->role.word); | ||
3581 | |||
3582 | offset = offset_in_page(gpa); | ||
3583 | pte_size = sp->role.cr4_pae ? 8 : 4; | ||
3584 | |||
3585 | /* | ||
3586 | * Sometimes, the OS only writes the last byte to update status | ||
3587 | * bits; for example, in Linux the andb instruction is used in clear_bit(). | ||
3588 | */ | ||
3589 | if (!(offset & (pte_size - 1)) && bytes == 1) | ||
3590 | return false; | ||
3591 | |||
3592 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | ||
3593 | misaligned |= bytes < 4; | ||
3594 | |||
3595 | return misaligned; | ||
3596 | } | ||
3597 | |||
3598 | static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) | ||
3599 | { | ||
3600 | unsigned page_offset, quadrant; | ||
3601 | u64 *spte; | ||
3602 | int level; | ||
3603 | |||
3604 | page_offset = offset_in_page(gpa); | ||
3605 | level = sp->role.level; | ||
3606 | *nspte = 1; | ||
3607 | if (!sp->role.cr4_pae) { | ||
3608 | page_offset <<= 1; /* 32->64 */ | ||
3609 | /* | ||
3610 | * A 32-bit pde maps 4MB while the shadow pdes map | ||
3611 | * only 2MB. So we need to double the offset again | ||
3612 | * and zap two pdes instead of one. | ||
3613 | */ | ||
3614 | if (level == PT32_ROOT_LEVEL) { | ||
3615 | page_offset &= ~7; /* kill rounding error */ | ||
3616 | page_offset <<= 1; | ||
3617 | *nspte = 2; | ||
3618 | } | ||
3619 | quadrant = page_offset >> PAGE_SHIFT; | ||
3620 | page_offset &= ~PAGE_MASK; | ||
3621 | if (quadrant != sp->role.quadrant) | ||
3622 | return NULL; | ||
3623 | } | ||
3624 | |||
3625 | spte = &sp->spt[page_offset / sizeof(*spte)]; | ||
3626 | return spte; | ||
3530 | } | 3627 | } |
3531 | 3628 | ||
3532 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | 3629 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, |
3533 | const u8 *new, int bytes, | 3630 | const u8 *new, int bytes) |
3534 | bool guest_initiated) | ||
3535 | { | 3631 | { |
3536 | gfn_t gfn = gpa >> PAGE_SHIFT; | 3632 | gfn_t gfn = gpa >> PAGE_SHIFT; |
3537 | union kvm_mmu_page_role mask = { .word = 0 }; | 3633 | union kvm_mmu_page_role mask = { .word = 0 }; |
@@ -3539,8 +3635,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
3539 | struct hlist_node *node; | 3635 | struct hlist_node *node; |
3540 | LIST_HEAD(invalid_list); | 3636 | LIST_HEAD(invalid_list); |
3541 | u64 entry, gentry, *spte; | 3637 | u64 entry, gentry, *spte; |
3542 | unsigned pte_size, page_offset, misaligned, quadrant, offset; | 3638 | int npte; |
3543 | int level, npte, invlpg_counter, r, flooded = 0; | ||
3544 | bool remote_flush, local_flush, zap_page; | 3639 | bool remote_flush, local_flush, zap_page; |
3545 | 3640 | ||
3546 | /* | 3641 | /* |
@@ -3551,112 +3646,45 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
3551 | return; | 3646 | return; |
3552 | 3647 | ||
3553 | zap_page = remote_flush = local_flush = false; | 3648 | zap_page = remote_flush = local_flush = false; |
3554 | offset = offset_in_page(gpa); | ||
3555 | 3649 | ||
3556 | pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); | 3650 | pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); |
3557 | 3651 | ||
3558 | invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter); | 3652 | gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes); |
3559 | 3653 | ||
3560 | /* | 3654 | /* |
3561 | * Assume that the pte write on a page table of the same type | 3655 | * No need to care whether memory allocation is successful |
3562 | * as the current vcpu paging mode since we update the sptes only | 3656 | * or not, since pte prefetch is skipped if it does not have |
3563 | * when they have the same mode. | 3657 | * enough objects in the cache. |
3564 | */ | 3658 | */ |
3565 | if ((is_pae(vcpu) && bytes == 4) || !new) { | 3659 | mmu_topup_memory_caches(vcpu); |
3566 | /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ | ||
3567 | if (is_pae(vcpu)) { | ||
3568 | gpa &= ~(gpa_t)7; | ||
3569 | bytes = 8; | ||
3570 | } | ||
3571 | r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8)); | ||
3572 | if (r) | ||
3573 | gentry = 0; | ||
3574 | new = (const u8 *)&gentry; | ||
3575 | } | ||
3576 | |||
3577 | switch (bytes) { | ||
3578 | case 4: | ||
3579 | gentry = *(const u32 *)new; | ||
3580 | break; | ||
3581 | case 8: | ||
3582 | gentry = *(const u64 *)new; | ||
3583 | break; | ||
3584 | default: | ||
3585 | gentry = 0; | ||
3586 | break; | ||
3587 | } | ||
3588 | 3660 | ||
3589 | spin_lock(&vcpu->kvm->mmu_lock); | 3661 | spin_lock(&vcpu->kvm->mmu_lock); |
3590 | if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) | ||
3591 | gentry = 0; | ||
3592 | kvm_mmu_free_some_pages(vcpu); | ||
3593 | ++vcpu->kvm->stat.mmu_pte_write; | 3662 | ++vcpu->kvm->stat.mmu_pte_write; |
3594 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); | 3663 | kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); |
3595 | if (guest_initiated) { | ||
3596 | kvm_mmu_access_page(vcpu, gfn); | ||
3597 | if (gfn == vcpu->arch.last_pt_write_gfn | ||
3598 | && !last_updated_pte_accessed(vcpu)) { | ||
3599 | ++vcpu->arch.last_pt_write_count; | ||
3600 | if (vcpu->arch.last_pt_write_count >= 3) | ||
3601 | flooded = 1; | ||
3602 | } else { | ||
3603 | vcpu->arch.last_pt_write_gfn = gfn; | ||
3604 | vcpu->arch.last_pt_write_count = 1; | ||
3605 | vcpu->arch.last_pte_updated = NULL; | ||
3606 | } | ||
3607 | } | ||
3608 | 3664 | ||
3609 | mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; | 3665 | mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; |
3610 | for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { | 3666 | for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { |
3611 | pte_size = sp->role.cr4_pae ? 8 : 4; | 3667 | spte = get_written_sptes(sp, gpa, &npte); |
3612 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | 3668 | |
3613 | misaligned |= bytes < 4; | 3669 | if (detect_write_misaligned(sp, gpa, bytes) || |
3614 | if (misaligned || flooded) { | 3670 | detect_write_flooding(sp, spte)) { |
3615 | /* | ||
3616 | * Misaligned accesses are too much trouble to fix | ||
3617 | * up; also, they usually indicate a page is not used | ||
3618 | * as a page table. | ||
3619 | * | ||
3620 | * If we're seeing too many writes to a page, | ||
3621 | * it may no longer be a page table, or we may be | ||
3622 | * forking, in which case it is better to unmap the | ||
3623 | * page. | ||
3624 | */ | ||
3625 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | ||
3626 | gpa, bytes, sp->role.word); | ||
3627 | zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, | 3671 | zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, |
3628 | &invalid_list); | 3672 | &invalid_list); |
3629 | ++vcpu->kvm->stat.mmu_flooded; | 3673 | ++vcpu->kvm->stat.mmu_flooded; |
3630 | continue; | 3674 | continue; |
3631 | } | 3675 | } |
3632 | page_offset = offset; | 3676 | |
3633 | level = sp->role.level; | 3677 | spte = get_written_sptes(sp, gpa, &npte); |
3634 | npte = 1; | 3678 | if (!spte) |
3635 | if (!sp->role.cr4_pae) { | 3679 | continue; |
3636 | page_offset <<= 1; /* 32->64 */ | 3680 | |
3637 | /* | ||
3638 | * A 32-bit pde maps 4MB while the shadow pdes map | ||
3639 | * only 2MB. So we need to double the offset again | ||
3640 | * and zap two pdes instead of one. | ||
3641 | */ | ||
3642 | if (level == PT32_ROOT_LEVEL) { | ||
3643 | page_offset &= ~7; /* kill rounding error */ | ||
3644 | page_offset <<= 1; | ||
3645 | npte = 2; | ||
3646 | } | ||
3647 | quadrant = page_offset >> PAGE_SHIFT; | ||
3648 | page_offset &= ~PAGE_MASK; | ||
3649 | if (quadrant != sp->role.quadrant) | ||
3650 | continue; | ||
3651 | } | ||
3652 | local_flush = true; | 3681 | local_flush = true; |
3653 | spte = &sp->spt[page_offset / sizeof(*spte)]; | ||
3654 | while (npte--) { | 3682 | while (npte--) { |
3655 | entry = *spte; | 3683 | entry = *spte; |
3656 | mmu_page_zap_pte(vcpu->kvm, sp, spte); | 3684 | mmu_page_zap_pte(vcpu->kvm, sp, spte); |
3657 | if (gentry && | 3685 | if (gentry && |
3658 | !((sp->role.word ^ vcpu->arch.mmu.base_role.word) | 3686 | !((sp->role.word ^ vcpu->arch.mmu.base_role.word) |
3659 | & mask.word)) | 3687 | & mask.word) && rmap_can_add(vcpu)) |
3660 | mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); | 3688 | mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); |
3661 | if (!remote_flush && need_remote_flush(entry, *spte)) | 3689 | if (!remote_flush && need_remote_flush(entry, *spte)) |
3662 | remote_flush = true; | 3690 | remote_flush = true; |
@@ -3665,7 +3693,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
3665 | } | 3693 | } |
3666 | mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); | 3694 | mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); |
3667 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | 3695 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); |
3668 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); | 3696 | kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); |
3669 | spin_unlock(&vcpu->kvm->mmu_lock); | 3697 | spin_unlock(&vcpu->kvm->mmu_lock); |
3670 | } | 3698 | } |
3671 | 3699 | ||
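A note on the quadrant arithmetic in get_written_sptes() from the hunk above: for a 32-bit (!cr4_pae) guest each 4-byte gpte is shadowed by an 8-byte spte, so one 4 KiB guest page needs two (or, for root pde pages, four) shadow pages selected by role.quadrant. The following standalone sketch restates that index calculation under those assumptions; the function and parameter names are placeholders, only the shifts and masks are taken from the hunk.

#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((1u << PAGE_SHIFT) - 1))

/*
 * Map the page offset of a guest pte write to the index of the first
 * affected spte in one shadow page, mirroring the !cr4_pae branch of
 * get_written_sptes().  Returns -1 when the write falls in a quadrant
 * that this shadow page does not cover.
 */
static int written_spte_index(unsigned int gpa_offset, int is_root_pde_page,
                              unsigned int sp_quadrant, int *nspte)
{
        unsigned int off = gpa_offset << 1;      /* 4-byte gpte -> 8-byte spte */

        *nspte = 1;
        if (is_root_pde_page) {
                off &= ~7u;                      /* kill rounding error */
                off <<= 1;                       /* one 4MB pde -> two 2MB pdes */
                *nspte = 2;
        }
        if ((off >> PAGE_SHIFT) != sp_quadrant)  /* write is in another quadrant */
                return -1;
        return (off & ~PAGE_MASK) / sizeof(uint64_t);
}

/*
 * Worked example: a write at offset 0x804 (gpte index 513) of a non-root
 * page-table page doubles to 0x1008, i.e. quadrant 1, local spte index 1.
 */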
@@ -3679,9 +3707,8 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | |||
3679 | 3707 | ||
3680 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); | 3708 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); |
3681 | 3709 | ||
3682 | spin_lock(&vcpu->kvm->mmu_lock); | ||
3683 | r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); | 3710 | r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); |
3684 | spin_unlock(&vcpu->kvm->mmu_lock); | 3711 | |
3685 | return r; | 3712 | return r; |
3686 | } | 3713 | } |
3687 | EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); | 3714 | EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); |
@@ -3702,10 +3729,18 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | |||
3702 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | 3729 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); |
3703 | } | 3730 | } |
3704 | 3731 | ||
3732 | static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr) | ||
3733 | { | ||
3734 | if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu)) | ||
3735 | return vcpu_match_mmio_gpa(vcpu, addr); | ||
3736 | |||
3737 | return vcpu_match_mmio_gva(vcpu, addr); | ||
3738 | } | ||
3739 | |||
3705 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, | 3740 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, |
3706 | void *insn, int insn_len) | 3741 | void *insn, int insn_len) |
3707 | { | 3742 | { |
3708 | int r; | 3743 | int r, emulation_type = EMULTYPE_RETRY; |
3709 | enum emulation_result er; | 3744 | enum emulation_result er; |
3710 | 3745 | ||
3711 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); | 3746 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); |
@@ -3717,11 +3752,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, | |||
3717 | goto out; | 3752 | goto out; |
3718 | } | 3753 | } |
3719 | 3754 | ||
3720 | r = mmu_topup_memory_caches(vcpu); | 3755 | if (is_mmio_page_fault(vcpu, cr2)) |
3721 | if (r) | 3756 | emulation_type = 0; |
3722 | goto out; | ||
3723 | 3757 | ||
3724 | er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); | 3758 | er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); |
3725 | 3759 | ||
3726 | switch (er) { | 3760 | switch (er) { |
3727 | case EMULATE_DONE: | 3761 | case EMULATE_DONE: |
@@ -3792,7 +3826,11 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | |||
3792 | int kvm_mmu_create(struct kvm_vcpu *vcpu) | 3826 | int kvm_mmu_create(struct kvm_vcpu *vcpu) |
3793 | { | 3827 | { |
3794 | ASSERT(vcpu); | 3828 | ASSERT(vcpu); |
3795 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 3829 | |
3830 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; | ||
3831 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
3832 | vcpu->arch.mmu.translate_gpa = translate_gpa; | ||
3833 | vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; | ||
3796 | 3834 | ||
3797 | return alloc_mmu_pages(vcpu); | 3835 | return alloc_mmu_pages(vcpu); |
3798 | } | 3836 | } |
@@ -3852,14 +3890,14 @@ restart: | |||
3852 | spin_unlock(&kvm->mmu_lock); | 3890 | spin_unlock(&kvm->mmu_lock); |
3853 | } | 3891 | } |
3854 | 3892 | ||
3855 | static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, | 3893 | static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, |
3856 | struct list_head *invalid_list) | 3894 | struct list_head *invalid_list) |
3857 | { | 3895 | { |
3858 | struct kvm_mmu_page *page; | 3896 | struct kvm_mmu_page *page; |
3859 | 3897 | ||
3860 | page = container_of(kvm->arch.active_mmu_pages.prev, | 3898 | page = container_of(kvm->arch.active_mmu_pages.prev, |
3861 | struct kvm_mmu_page, link); | 3899 | struct kvm_mmu_page, link); |
3862 | return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); | 3900 | kvm_mmu_prepare_zap_page(kvm, page, invalid_list); |
3863 | } | 3901 | } |
3864 | 3902 | ||
3865 | static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) | 3903 | static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) |
@@ -3874,15 +3912,15 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) | |||
3874 | raw_spin_lock(&kvm_lock); | 3912 | raw_spin_lock(&kvm_lock); |
3875 | 3913 | ||
3876 | list_for_each_entry(kvm, &vm_list, vm_list) { | 3914 | list_for_each_entry(kvm, &vm_list, vm_list) { |
3877 | int idx, freed_pages; | 3915 | int idx; |
3878 | LIST_HEAD(invalid_list); | 3916 | LIST_HEAD(invalid_list); |
3879 | 3917 | ||
3880 | idx = srcu_read_lock(&kvm->srcu); | 3918 | idx = srcu_read_lock(&kvm->srcu); |
3881 | spin_lock(&kvm->mmu_lock); | 3919 | spin_lock(&kvm->mmu_lock); |
3882 | if (!kvm_freed && nr_to_scan > 0 && | 3920 | if (!kvm_freed && nr_to_scan > 0 && |
3883 | kvm->arch.n_used_mmu_pages > 0) { | 3921 | kvm->arch.n_used_mmu_pages > 0) { |
3884 | freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, | 3922 | kvm_mmu_remove_some_alloc_mmu_pages(kvm, |
3885 | &invalid_list); | 3923 | &invalid_list); |
3886 | kvm_freed = kvm; | 3924 | kvm_freed = kvm; |
3887 | } | 3925 | } |
3888 | nr_to_scan--; | 3926 | nr_to_scan--; |
@@ -3944,15 +3982,15 @@ nomem: | |||
3944 | */ | 3982 | */ |
3945 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) | 3983 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) |
3946 | { | 3984 | { |
3947 | int i; | ||
3948 | unsigned int nr_mmu_pages; | 3985 | unsigned int nr_mmu_pages; |
3949 | unsigned int nr_pages = 0; | 3986 | unsigned int nr_pages = 0; |
3950 | struct kvm_memslots *slots; | 3987 | struct kvm_memslots *slots; |
3988 | struct kvm_memory_slot *memslot; | ||
3951 | 3989 | ||
3952 | slots = kvm_memslots(kvm); | 3990 | slots = kvm_memslots(kvm); |
3953 | 3991 | ||
3954 | for (i = 0; i < slots->nmemslots; i++) | 3992 | kvm_for_each_memslot(memslot, slots) |
3955 | nr_pages += slots->memslots[i].npages; | 3993 | nr_pages += memslot->npages; |
3956 | 3994 | ||
3957 | nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; | 3995 | nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; |
3958 | nr_mmu_pages = max(nr_mmu_pages, | 3996 | nr_mmu_pages = max(nr_mmu_pages, |
@@ -3961,127 +3999,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) | |||
3961 | return nr_mmu_pages; | 3999 | return nr_mmu_pages; |
3962 | } | 4000 | } |
3963 | 4001 | ||
3964 | static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer, | ||
3965 | unsigned len) | ||
3966 | { | ||
3967 | if (len > buffer->len) | ||
3968 | return NULL; | ||
3969 | return buffer->ptr; | ||
3970 | } | ||
3971 | |||
3972 | static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer, | ||
3973 | unsigned len) | ||
3974 | { | ||
3975 | void *ret; | ||
3976 | |||
3977 | ret = pv_mmu_peek_buffer(buffer, len); | ||
3978 | if (!ret) | ||
3979 | return ret; | ||
3980 | buffer->ptr += len; | ||
3981 | buffer->len -= len; | ||
3982 | buffer->processed += len; | ||
3983 | return ret; | ||
3984 | } | ||
3985 | |||
3986 | static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, | ||
3987 | gpa_t addr, gpa_t value) | ||
3988 | { | ||
3989 | int bytes = 8; | ||
3990 | int r; | ||
3991 | |||
3992 | if (!is_long_mode(vcpu) && !is_pae(vcpu)) | ||
3993 | bytes = 4; | ||
3994 | |||
3995 | r = mmu_topup_memory_caches(vcpu); | ||
3996 | if (r) | ||
3997 | return r; | ||
3998 | |||
3999 | if (!emulator_write_phys(vcpu, addr, &value, bytes)) | ||
4000 | return -EFAULT; | ||
4001 | |||
4002 | return 1; | ||
4003 | } | ||
4004 | |||
4005 | static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) | ||
4006 | { | ||
4007 | (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu)); | ||
4008 | return 1; | ||
4009 | } | ||
4010 | |||
4011 | static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr) | ||
4012 | { | ||
4013 | spin_lock(&vcpu->kvm->mmu_lock); | ||
4014 | mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT); | ||
4015 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
4016 | return 1; | ||
4017 | } | ||
4018 | |||
4019 | static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu, | ||
4020 | struct kvm_pv_mmu_op_buffer *buffer) | ||
4021 | { | ||
4022 | struct kvm_mmu_op_header *header; | ||
4023 | |||
4024 | header = pv_mmu_peek_buffer(buffer, sizeof *header); | ||
4025 | if (!header) | ||
4026 | return 0; | ||
4027 | switch (header->op) { | ||
4028 | case KVM_MMU_OP_WRITE_PTE: { | ||
4029 | struct kvm_mmu_op_write_pte *wpte; | ||
4030 | |||
4031 | wpte = pv_mmu_read_buffer(buffer, sizeof *wpte); | ||
4032 | if (!wpte) | ||
4033 | return 0; | ||
4034 | return kvm_pv_mmu_write(vcpu, wpte->pte_phys, | ||
4035 | wpte->pte_val); | ||
4036 | } | ||
4037 | case KVM_MMU_OP_FLUSH_TLB: { | ||
4038 | struct kvm_mmu_op_flush_tlb *ftlb; | ||
4039 | |||
4040 | ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb); | ||
4041 | if (!ftlb) | ||
4042 | return 0; | ||
4043 | return kvm_pv_mmu_flush_tlb(vcpu); | ||
4044 | } | ||
4045 | case KVM_MMU_OP_RELEASE_PT: { | ||
4046 | struct kvm_mmu_op_release_pt *rpt; | ||
4047 | |||
4048 | rpt = pv_mmu_read_buffer(buffer, sizeof *rpt); | ||
4049 | if (!rpt) | ||
4050 | return 0; | ||
4051 | return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys); | ||
4052 | } | ||
4053 | default: return 0; | ||
4054 | } | ||
4055 | } | ||
4056 | |||
4057 | int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, | ||
4058 | gpa_t addr, unsigned long *ret) | ||
4059 | { | ||
4060 | int r; | ||
4061 | struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; | ||
4062 | |||
4063 | buffer->ptr = buffer->buf; | ||
4064 | buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); | ||
4065 | buffer->processed = 0; | ||
4066 | |||
4067 | r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len); | ||
4068 | if (r) | ||
4069 | goto out; | ||
4070 | |||
4071 | while (buffer->len) { | ||
4072 | r = kvm_pv_mmu_op_one(vcpu, buffer); | ||
4073 | if (r < 0) | ||
4074 | goto out; | ||
4075 | if (r == 0) | ||
4076 | break; | ||
4077 | } | ||
4078 | |||
4079 | r = 1; | ||
4080 | out: | ||
4081 | *ret = buffer->processed; | ||
4082 | return r; | ||
4083 | } | ||
4084 | |||
4085 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) | 4002 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) |
4086 | { | 4003 | { |
4087 | struct kvm_shadow_walk_iterator iterator; | 4004 | struct kvm_shadow_walk_iterator iterator; |
@@ -4110,12 +4027,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | |||
4110 | mmu_free_memory_caches(vcpu); | 4027 | mmu_free_memory_caches(vcpu); |
4111 | } | 4028 | } |
4112 | 4029 | ||
4113 | #ifdef CONFIG_KVM_MMU_AUDIT | ||
4114 | #include "mmu_audit.c" | ||
4115 | #else | ||
4116 | static void mmu_audit_disable(void) { } | ||
4117 | #endif | ||
4118 | |||
4119 | void kvm_mmu_module_exit(void) | 4030 | void kvm_mmu_module_exit(void) |
4120 | { | 4031 | { |
4121 | mmu_destroy_caches(); | 4032 | mmu_destroy_caches(); |
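The mmu.c hunks above replace the per-vcpu last_pt_write_gfn/last_pt_write_count fork detector with a per-shadow-page write_flooding_count that is cleared whenever the page is used in a shadow walk. A minimal, self-contained restatement of that heuristic is sketched below; the struct and function names are simplified placeholders, while the threshold of three writes and the level-1 exemption come from the hunks.

struct shadow_page {
        int level;                  /* sp->role.level in the real code */
        int write_flooding_count;   /* sp->write_flooding_count */
};

/* Called from the shadow-walk paths: a page that is still being used
 * as a page table keeps having its counter cleared. */
static void page_used_for_translation(struct shadow_page *sp)
{
        sp->write_flooding_count = 0;
}

/* Called from the emulated-write path: three writes without an
 * intervening use suggest the page is no longer a page table and
 * should simply be zapped. */
static int should_zap_on_write(struct shadow_page *sp)
{
        if (sp->level == 1)         /* last-level pages can go unsync instead */
                return 0;
        return ++sp->write_flooding_count >= 3;
}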
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 746ec259d024..fe15dcc07a6b 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c | |||
@@ -19,6 +19,15 @@ | |||
19 | 19 | ||
20 | #include <linux/ratelimit.h> | 20 | #include <linux/ratelimit.h> |
21 | 21 | ||
22 | char const *audit_point_name[] = { | ||
23 | "pre page fault", | ||
24 | "post page fault", | ||
25 | "pre pte write", | ||
26 | "post pte write", | ||
27 | "pre sync", | ||
28 | "post sync" | ||
29 | }; | ||
30 | |||
22 | #define audit_printk(kvm, fmt, args...) \ | 31 | #define audit_printk(kvm, fmt, args...) \ |
23 | printk(KERN_ERR "audit: (%s) error: " \ | 32 | printk(KERN_ERR "audit: (%s) error: " \ |
24 | fmt, audit_point_name[kvm->arch.audit_point], ##args) | 33 | fmt, audit_point_name[kvm->arch.audit_point], ##args) |
@@ -224,7 +233,10 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) | |||
224 | mmu_spte_walk(vcpu, audit_spte); | 233 | mmu_spte_walk(vcpu, audit_spte); |
225 | } | 234 | } |
226 | 235 | ||
227 | static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) | 236 | static bool mmu_audit; |
237 | static struct jump_label_key mmu_audit_key; | ||
238 | |||
239 | static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) | ||
228 | { | 240 | { |
229 | static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); | 241 | static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); |
230 | 242 | ||
@@ -236,18 +248,18 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) | |||
236 | audit_vcpu_spte(vcpu); | 248 | audit_vcpu_spte(vcpu); |
237 | } | 249 | } |
238 | 250 | ||
239 | static bool mmu_audit; | 251 | static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) |
252 | { | ||
253 | if (static_branch((&mmu_audit_key))) | ||
254 | __kvm_mmu_audit(vcpu, point); | ||
255 | } | ||
240 | 256 | ||
241 | static void mmu_audit_enable(void) | 257 | static void mmu_audit_enable(void) |
242 | { | 258 | { |
243 | int ret; | ||
244 | |||
245 | if (mmu_audit) | 259 | if (mmu_audit) |
246 | return; | 260 | return; |
247 | 261 | ||
248 | ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); | 262 | jump_label_inc(&mmu_audit_key); |
249 | WARN_ON(ret); | ||
250 | |||
251 | mmu_audit = true; | 263 | mmu_audit = true; |
252 | } | 264 | } |
253 | 265 | ||
@@ -256,8 +268,7 @@ static void mmu_audit_disable(void) | |||
256 | if (!mmu_audit) | 268 | if (!mmu_audit) |
257 | return; | 269 | return; |
258 | 270 | ||
259 | unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); | 271 | jump_label_dec(&mmu_audit_key); |
260 | tracepoint_synchronize_unregister(); | ||
261 | mmu_audit = false; | 272 | mmu_audit = false; |
262 | } | 273 | } |
263 | 274 | ||
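The mmu_audit.c hunks above drop the tracepoint registration in favour of a jump label, so each kvm_mmu_audit() call site costs only a patched-out branch while auditing is disabled. Below is a minimal sketch of the same pattern using the jump-label API names that appear in the hunks (later kernels renamed this interface to static_key_slow_inc/dec and static_branch_unlikely); the feature_* names are placeholders.

#include <linux/types.h>
#include <linux/jump_label.h>

static bool feature_on;
static struct jump_label_key feature_key;       /* branch stays a nop until enabled */

static void do_expensive_check(void)
{
        /* ... the rarely-enabled slow path ... */
}

/* Fast path: compiles to a patched-out branch while the key is zero. */
static inline void maybe_check(void)
{
        if (static_branch(&feature_key))
                do_expensive_check();
}

static void feature_enable(void)
{
        if (feature_on)
                return;
        jump_label_inc(&feature_key);   /* patch the call sites in */
        feature_on = true;
}

static void feature_disable(void)
{
        if (!feature_on)
                return;
        jump_label_dec(&feature_key);
        feature_on = false;
}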
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index eed67f34146d..89fb0e81322a 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h | |||
@@ -243,25 +243,6 @@ TRACE_EVENT( | |||
243 | TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, | 243 | TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, |
244 | __entry->access) | 244 | __entry->access) |
245 | ); | 245 | ); |
246 | |||
247 | TRACE_EVENT( | ||
248 | kvm_mmu_audit, | ||
249 | TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), | ||
250 | TP_ARGS(vcpu, audit_point), | ||
251 | |||
252 | TP_STRUCT__entry( | ||
253 | __field(struct kvm_vcpu *, vcpu) | ||
254 | __field(int, audit_point) | ||
255 | ), | ||
256 | |||
257 | TP_fast_assign( | ||
258 | __entry->vcpu = vcpu; | ||
259 | __entry->audit_point = audit_point; | ||
260 | ), | ||
261 | |||
262 | TP_printk("vcpu:%d %s", __entry->vcpu->cpu, | ||
263 | audit_point_name[__entry->audit_point]) | ||
264 | ); | ||
265 | #endif /* _TRACE_KVMMMU_H */ | 246 | #endif /* _TRACE_KVMMMU_H */ |
266 | 247 | ||
267 | #undef TRACE_INCLUDE_PATH | 248 | #undef TRACE_INCLUDE_PATH |

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 92994100638b..15610285ebb6 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -497,6 +497,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
497 | shadow_walk_next(&it)) { | 497 | shadow_walk_next(&it)) { |
498 | gfn_t table_gfn; | 498 | gfn_t table_gfn; |
499 | 499 | ||
500 | clear_sp_write_flooding_count(it.sptep); | ||
500 | drop_large_spte(vcpu, it.sptep); | 501 | drop_large_spte(vcpu, it.sptep); |
501 | 502 | ||
502 | sp = NULL; | 503 | sp = NULL; |
@@ -522,6 +523,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
522 | shadow_walk_next(&it)) { | 523 | shadow_walk_next(&it)) { |
523 | gfn_t direct_gfn; | 524 | gfn_t direct_gfn; |
524 | 525 | ||
526 | clear_sp_write_flooding_count(it.sptep); | ||
525 | validate_direct_spte(vcpu, it.sptep, direct_access); | 527 | validate_direct_spte(vcpu, it.sptep, direct_access); |
526 | 528 | ||
527 | drop_large_spte(vcpu, it.sptep); | 529 | drop_large_spte(vcpu, it.sptep); |
@@ -536,6 +538,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
536 | link_shadow_page(it.sptep, sp); | 538 | link_shadow_page(it.sptep, sp); |
537 | } | 539 | } |
538 | 540 | ||
541 | clear_sp_write_flooding_count(it.sptep); | ||
539 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, | 542 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, |
540 | user_fault, write_fault, emulate, it.level, | 543 | user_fault, write_fault, emulate, it.level, |
541 | gw->gfn, pfn, prefault, map_writable); | 544 | gw->gfn, pfn, prefault, map_writable); |
@@ -599,11 +602,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
599 | */ | 602 | */ |
600 | if (!r) { | 603 | if (!r) { |
601 | pgprintk("%s: guest page fault\n", __func__); | 604 | pgprintk("%s: guest page fault\n", __func__); |
602 | if (!prefault) { | 605 | if (!prefault) |
603 | inject_page_fault(vcpu, &walker.fault); | 606 | inject_page_fault(vcpu, &walker.fault); |
604 | /* reset fork detector */ | 607 | |
605 | vcpu->arch.last_pt_write_count = 0; | ||
606 | } | ||
607 | return 0; | 608 | return 0; |
608 | } | 609 | } |
609 | 610 | ||
@@ -631,7 +632,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
631 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 632 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
632 | goto out_unlock; | 633 | goto out_unlock; |
633 | 634 | ||
634 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); | 635 | kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); |
635 | kvm_mmu_free_some_pages(vcpu); | 636 | kvm_mmu_free_some_pages(vcpu); |
636 | if (!force_pt_level) | 637 | if (!force_pt_level) |
637 | transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); | 638 | transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); |
@@ -641,11 +642,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
641 | pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, | 642 | pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, |
642 | sptep, *sptep, emulate); | 643 | sptep, *sptep, emulate); |
643 | 644 | ||
644 | if (!emulate) | ||
645 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | ||
646 | |||
647 | ++vcpu->stat.pf_fixed; | 645 | ++vcpu->stat.pf_fixed; |
648 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); | 646 | kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); |
649 | spin_unlock(&vcpu->kvm->mmu_lock); | 647 | spin_unlock(&vcpu->kvm->mmu_lock); |
650 | 648 | ||
651 | return emulate; | 649 | return emulate; |
@@ -656,65 +654,66 @@ out_unlock: | |||
656 | return 0; | 654 | return 0; |
657 | } | 655 | } |
658 | 656 | ||
657 | static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) | ||
658 | { | ||
659 | int offset = 0; | ||
660 | |||
661 | WARN_ON(sp->role.level != 1); | ||
662 | |||
663 | if (PTTYPE == 32) | ||
664 | offset = sp->role.quadrant << PT64_LEVEL_BITS; | ||
665 | |||
666 | return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); | ||
667 | } | ||
668 | |||
659 | static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | 669 | static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) |
660 | { | 670 | { |
661 | struct kvm_shadow_walk_iterator iterator; | 671 | struct kvm_shadow_walk_iterator iterator; |
662 | struct kvm_mmu_page *sp; | 672 | struct kvm_mmu_page *sp; |
663 | gpa_t pte_gpa = -1; | ||
664 | int level; | 673 | int level; |
665 | u64 *sptep; | 674 | u64 *sptep; |
666 | int need_flush = 0; | ||
667 | 675 | ||
668 | vcpu_clear_mmio_info(vcpu, gva); | 676 | vcpu_clear_mmio_info(vcpu, gva); |
669 | 677 | ||
670 | spin_lock(&vcpu->kvm->mmu_lock); | 678 | /* |
679 | * No need to check the return value here; rmap_can_add() lets | ||
680 | * us skip the pte prefetch later. | ||
681 | */ | ||
682 | mmu_topup_memory_caches(vcpu); | ||
671 | 683 | ||
684 | spin_lock(&vcpu->kvm->mmu_lock); | ||
672 | for_each_shadow_entry(vcpu, gva, iterator) { | 685 | for_each_shadow_entry(vcpu, gva, iterator) { |
673 | level = iterator.level; | 686 | level = iterator.level; |
674 | sptep = iterator.sptep; | 687 | sptep = iterator.sptep; |
675 | 688 | ||
676 | sp = page_header(__pa(sptep)); | 689 | sp = page_header(__pa(sptep)); |
677 | if (is_last_spte(*sptep, level)) { | 690 | if (is_last_spte(*sptep, level)) { |
678 | int offset, shift; | 691 | pt_element_t gpte; |
692 | gpa_t pte_gpa; | ||
679 | 693 | ||
680 | if (!sp->unsync) | 694 | if (!sp->unsync) |
681 | break; | 695 | break; |
682 | 696 | ||
683 | shift = PAGE_SHIFT - | 697 | pte_gpa = FNAME(get_level1_sp_gpa)(sp); |
684 | (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; | ||
685 | offset = sp->role.quadrant << shift; | ||
686 | |||
687 | pte_gpa = (sp->gfn << PAGE_SHIFT) + offset; | ||
688 | pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); | 698 | pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); |
689 | 699 | ||
690 | if (is_shadow_present_pte(*sptep)) { | 700 | if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) |
691 | if (is_large_pte(*sptep)) | 701 | kvm_flush_remote_tlbs(vcpu->kvm); |
692 | --vcpu->kvm->stat.lpages; | ||
693 | drop_spte(vcpu->kvm, sptep); | ||
694 | need_flush = 1; | ||
695 | } else if (is_mmio_spte(*sptep)) | ||
696 | mmu_spte_clear_no_track(sptep); | ||
697 | 702 | ||
698 | break; | 703 | if (!rmap_can_add(vcpu)) |
704 | break; | ||
705 | |||
706 | if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, | ||
707 | sizeof(pt_element_t))) | ||
708 | break; | ||
709 | |||
710 | FNAME(update_pte)(vcpu, sp, sptep, &gpte); | ||
699 | } | 711 | } |
700 | 712 | ||
701 | if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) | 713 | if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) |
702 | break; | 714 | break; |
703 | } | 715 | } |
704 | |||
705 | if (need_flush) | ||
706 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
707 | |||
708 | atomic_inc(&vcpu->kvm->arch.invlpg_counter); | ||
709 | |||
710 | spin_unlock(&vcpu->kvm->mmu_lock); | 716 | spin_unlock(&vcpu->kvm->mmu_lock); |
711 | |||
712 | if (pte_gpa == -1) | ||
713 | return; | ||
714 | |||
715 | if (mmu_topup_memory_caches(vcpu)) | ||
716 | return; | ||
717 | kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0); | ||
718 | } | 717 | } |
719 | 718 | ||
720 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, | 719 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, |
@@ -769,19 +768,14 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, | |||
769 | */ | 768 | */ |
770 | static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | 769 | static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) |
771 | { | 770 | { |
772 | int i, offset, nr_present; | 771 | int i, nr_present = 0; |
773 | bool host_writable; | 772 | bool host_writable; |
774 | gpa_t first_pte_gpa; | 773 | gpa_t first_pte_gpa; |
775 | 774 | ||
776 | offset = nr_present = 0; | ||
777 | |||
778 | /* direct kvm_mmu_page can not be unsync. */ | 775 | /* direct kvm_mmu_page can not be unsync. */ |
779 | BUG_ON(sp->role.direct); | 776 | BUG_ON(sp->role.direct); |
780 | 777 | ||
781 | if (PTTYPE == 32) | 778 | first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); |
782 | offset = sp->role.quadrant << PT64_LEVEL_BITS; | ||
783 | |||
784 | first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); | ||
785 | 779 | ||
786 | for (i = 0; i < PT64_ENT_PER_PAGE; i++) { | 780 | for (i = 0; i < PT64_ENT_PER_PAGE; i++) { |
787 | unsigned pte_access; | 781 | unsigned pte_access; |
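The new FNAME(get_level1_sp_gpa)() helper above encodes the fact that, for PTTYPE == 32, a 4 KiB guest page holds 1024 four-byte gptes while a level-1 shadow page holds only 512 sptes, so each guest page is shadowed by two pages distinguished by role.quadrant. A standalone sketch of the arithmetic with the constants written out is below (gfn_to_gpa() is just a page-size shift; the function name here is a placeholder).

#include <stdint.h>

#define PT64_LEVEL_BITS 9               /* 512 sptes per shadow page */
#define PAGE_SHIFT      12

typedef uint32_t pt_element_t;          /* 32-bit (PTTYPE == 32) guest pte */
typedef uint64_t gpa_t;

/* Mirrors the PTTYPE == 32 case of get_level1_sp_gpa(). */
static gpa_t level1_sp_gpa(uint64_t gfn, unsigned int quadrant)
{
        unsigned int offset = quadrant << PT64_LEVEL_BITS;   /* 0 or 512 entries */

        return ((gpa_t)gfn << PAGE_SHIFT) + offset * sizeof(pt_element_t);
}

/*
 * quadrant 0 -> the first 2 KiB of the guest page (gptes 0..511),
 * quadrant 1 -> gpa + 2048, the second half (gptes 512..1023).
 */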
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c new file mode 100644 index 000000000000..7aad5446f393 --- /dev/null +++ b/arch/x86/kvm/pmu.c | |||
@@ -0,0 +1,533 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine -- Performance Monitoring Unit support | ||
3 | * | ||
4 | * Copyright 2011 Red Hat, Inc. and/or its affiliates. | ||
5 | * | ||
6 | * Authors: | ||
7 | * Avi Kivity <avi@redhat.com> | ||
8 | * Gleb Natapov <gleb@redhat.com> | ||
9 | * | ||
10 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
11 | * the COPYING file in the top-level directory. | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | #include <linux/types.h> | ||
16 | #include <linux/kvm_host.h> | ||
17 | #include <linux/perf_event.h> | ||
18 | #include "x86.h" | ||
19 | #include "cpuid.h" | ||
20 | #include "lapic.h" | ||
21 | |||
22 | static struct kvm_arch_event_perf_mapping { | ||
23 | u8 eventsel; | ||
24 | u8 unit_mask; | ||
25 | unsigned event_type; | ||
26 | bool inexact; | ||
27 | } arch_events[] = { | ||
28 | /* Index must match CPUID 0x0A.EBX bit vector */ | ||
29 | [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, | ||
30 | [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, | ||
31 | [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, | ||
32 | [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, | ||
33 | [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, | ||
34 | [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, | ||
35 | [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, | ||
36 | }; | ||
37 | |||
38 | /* mapping between fixed pmc index and arch_events array */ | ||
39 | int fixed_pmc_events[] = {1, 0, 2}; | ||
40 | |||
41 | static bool pmc_is_gp(struct kvm_pmc *pmc) | ||
42 | { | ||
43 | return pmc->type == KVM_PMC_GP; | ||
44 | } | ||
45 | |||
46 | static inline u64 pmc_bitmask(struct kvm_pmc *pmc) | ||
47 | { | ||
48 | struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; | ||
49 | |||
50 | return pmu->counter_bitmask[pmc->type]; | ||
51 | } | ||
52 | |||
53 | static inline bool pmc_enabled(struct kvm_pmc *pmc) | ||
54 | { | ||
55 | struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; | ||
56 | return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); | ||
57 | } | ||
58 | |||
59 | static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, | ||
60 | u32 base) | ||
61 | { | ||
62 | if (msr >= base && msr < base + pmu->nr_arch_gp_counters) | ||
63 | return &pmu->gp_counters[msr - base]; | ||
64 | return NULL; | ||
65 | } | ||
66 | |||
67 | static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) | ||
68 | { | ||
69 | int base = MSR_CORE_PERF_FIXED_CTR0; | ||
70 | if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) | ||
71 | return &pmu->fixed_counters[msr - base]; | ||
72 | return NULL; | ||
73 | } | ||
74 | |||
75 | static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx) | ||
76 | { | ||
77 | return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx); | ||
78 | } | ||
79 | |||
80 | static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) | ||
81 | { | ||
82 | if (idx < X86_PMC_IDX_FIXED) | ||
83 | return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); | ||
84 | else | ||
85 | return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED); | ||
86 | } | ||
87 | |||
88 | void kvm_deliver_pmi(struct kvm_vcpu *vcpu) | ||
89 | { | ||
90 | if (vcpu->arch.apic) | ||
91 | kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); | ||
92 | } | ||
93 | |||
94 | static void trigger_pmi(struct irq_work *irq_work) | ||
95 | { | ||
96 | struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, | ||
97 | irq_work); | ||
98 | struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu, | ||
99 | arch.pmu); | ||
100 | |||
101 | kvm_deliver_pmi(vcpu); | ||
102 | } | ||
103 | |||
104 | static void kvm_perf_overflow(struct perf_event *perf_event, | ||
105 | struct perf_sample_data *data, | ||
106 | struct pt_regs *regs) | ||
107 | { | ||
108 | struct kvm_pmc *pmc = perf_event->overflow_handler_context; | ||
109 | struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; | ||
110 | __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); | ||
111 | } | ||
112 | |||
113 | static void kvm_perf_overflow_intr(struct perf_event *perf_event, | ||
114 | struct perf_sample_data *data, struct pt_regs *regs) | ||
115 | { | ||
116 | struct kvm_pmc *pmc = perf_event->overflow_handler_context; | ||
117 | struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; | ||
118 | if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { | ||
119 | kvm_perf_overflow(perf_event, data, regs); | ||
120 | kvm_make_request(KVM_REQ_PMU, pmc->vcpu); | ||
121 | /* | ||
122 | * Inject PMI. If the vcpu was in guest mode during the NMI, the PMI | ||
123 | * can be injected on guest mode re-entry. Otherwise we can't | ||
124 | * be sure that the vcpu wasn't executing a hlt instruction at the | ||
125 | * time of vmexit and is not going to re-enter guest mode until | ||
126 | * woken up. So we should wake it, but this is impossible from | ||
127 | * NMI context. Do it from irq work instead. | ||
128 | */ | ||
129 | if (!kvm_is_in_guest()) | ||
130 | irq_work_queue(&pmc->vcpu->arch.pmu.irq_work); | ||
131 | else | ||
132 | kvm_make_request(KVM_REQ_PMI, pmc->vcpu); | ||
133 | } | ||
134 | } | ||
135 | |||
136 | static u64 read_pmc(struct kvm_pmc *pmc) | ||
137 | { | ||
138 | u64 counter, enabled, running; | ||
139 | |||
140 | counter = pmc->counter; | ||
141 | |||
142 | if (pmc->perf_event) | ||
143 | counter += perf_event_read_value(pmc->perf_event, | ||
144 | &enabled, &running); | ||
145 | |||
146 | /* FIXME: Scaling needed? */ | ||
147 | |||
148 | return counter & pmc_bitmask(pmc); | ||
149 | } | ||
150 | |||
151 | static void stop_counter(struct kvm_pmc *pmc) | ||
152 | { | ||
153 | if (pmc->perf_event) { | ||
154 | pmc->counter = read_pmc(pmc); | ||
155 | perf_event_release_kernel(pmc->perf_event); | ||
156 | pmc->perf_event = NULL; | ||
157 | } | ||
158 | } | ||
159 | |||
160 | static void reprogram_counter(struct kvm_pmc *pmc, u32 type, | ||
161 | unsigned config, bool exclude_user, bool exclude_kernel, | ||
162 | bool intr) | ||
163 | { | ||
164 | struct perf_event *event; | ||
165 | struct perf_event_attr attr = { | ||
166 | .type = type, | ||
167 | .size = sizeof(attr), | ||
168 | .pinned = true, | ||
169 | .exclude_idle = true, | ||
170 | .exclude_host = 1, | ||
171 | .exclude_user = exclude_user, | ||
172 | .exclude_kernel = exclude_kernel, | ||
173 | .config = config, | ||
174 | }; | ||
175 | |||
176 | attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); | ||
177 | |||
178 | event = perf_event_create_kernel_counter(&attr, -1, current, | ||
179 | intr ? kvm_perf_overflow_intr : | ||
180 | kvm_perf_overflow, pmc); | ||
181 | if (IS_ERR(event)) { | ||
182 | printk_once("kvm: pmu event creation failed %ld\n", | ||
183 | PTR_ERR(event)); | ||
184 | return; | ||
185 | } | ||
186 | |||
187 | pmc->perf_event = event; | ||
188 | clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi); | ||
189 | } | ||
190 | |||
191 | static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select, | ||
192 | u8 unit_mask) | ||
193 | { | ||
194 | int i; | ||
195 | |||
196 | for (i = 0; i < ARRAY_SIZE(arch_events); i++) | ||
197 | if (arch_events[i].eventsel == event_select | ||
198 | && arch_events[i].unit_mask == unit_mask | ||
199 | && (pmu->available_event_types & (1 << i))) | ||
200 | break; | ||
201 | |||
202 | if (i == ARRAY_SIZE(arch_events)) | ||
203 | return PERF_COUNT_HW_MAX; | ||
204 | |||
205 | return arch_events[i].event_type; | ||
206 | } | ||
207 | |||
208 | static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) | ||
209 | { | ||
210 | unsigned config, type = PERF_TYPE_RAW; | ||
211 | u8 event_select, unit_mask; | ||
212 | |||
213 | pmc->eventsel = eventsel; | ||
214 | |||
215 | stop_counter(pmc); | ||
216 | |||
217 | if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc)) | ||
218 | return; | ||
219 | |||
220 | event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; | ||
221 | unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; | ||
222 | |||
223 | if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE | | ||
224 | ARCH_PERFMON_EVENTSEL_INV | | ||
225 | ARCH_PERFMON_EVENTSEL_CMASK))) { | ||
226 | config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, | ||
227 | unit_mask); | ||
228 | if (config != PERF_COUNT_HW_MAX) | ||
229 | type = PERF_TYPE_HARDWARE; | ||
230 | } | ||
231 | |||
232 | if (type == PERF_TYPE_RAW) | ||
233 | config = eventsel & X86_RAW_EVENT_MASK; | ||
234 | |||
235 | reprogram_counter(pmc, type, config, | ||
236 | !(eventsel & ARCH_PERFMON_EVENTSEL_USR), | ||
237 | !(eventsel & ARCH_PERFMON_EVENTSEL_OS), | ||
238 | eventsel & ARCH_PERFMON_EVENTSEL_INT); | ||
239 | } | ||
240 | |||
241 | static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) | ||
242 | { | ||
243 | unsigned en = en_pmi & 0x3; | ||
244 | bool pmi = en_pmi & 0x8; | ||
245 | |||
246 | stop_counter(pmc); | ||
247 | |||
248 | if (!en || !pmc_enabled(pmc)) | ||
249 | return; | ||
250 | |||
251 | reprogram_counter(pmc, PERF_TYPE_HARDWARE, | ||
252 | arch_events[fixed_pmc_events[idx]].event_type, | ||
253 | !(en & 0x2), /* exclude user */ | ||
254 | !(en & 0x1), /* exclude kernel */ | ||
255 | pmi); | ||
256 | } | ||
257 | |||
258 | static inline u8 fixed_en_pmi(u64 ctrl, int idx) | ||
259 | { | ||
260 | return (ctrl >> (idx * 4)) & 0xf; | ||
261 | } | ||
262 | |||
263 | static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) | ||
264 | { | ||
265 | int i; | ||
266 | |||
267 | for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { | ||
268 | u8 en_pmi = fixed_en_pmi(data, i); | ||
269 | struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i); | ||
270 | |||
271 | if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi) | ||
272 | continue; | ||
273 | |||
274 | reprogram_fixed_counter(pmc, en_pmi, i); | ||
275 | } | ||
276 | |||
277 | pmu->fixed_ctr_ctrl = data; | ||
278 | } | ||
279 | |||
280 | static void reprogram_idx(struct kvm_pmu *pmu, int idx) | ||
281 | { | ||
282 | struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx); | ||
283 | |||
284 | if (!pmc) | ||
285 | return; | ||
286 | |||
287 | if (pmc_is_gp(pmc)) | ||
288 | reprogram_gp_counter(pmc, pmc->eventsel); | ||
289 | else { | ||
290 | int fidx = idx - X86_PMC_IDX_FIXED; | ||
291 | reprogram_fixed_counter(pmc, | ||
292 | fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); | ||
293 | } | ||
294 | } | ||
295 | |||
296 | static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) | ||
297 | { | ||
298 | int bit; | ||
299 | u64 diff = pmu->global_ctrl ^ data; | ||
300 | |||
301 | pmu->global_ctrl = data; | ||
302 | |||
303 | for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) | ||
304 | reprogram_idx(pmu, bit); | ||
305 | } | ||
306 | |||
307 | bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr) | ||
308 | { | ||
309 | struct kvm_pmu *pmu = &vcpu->arch.pmu; | ||
310 | int ret; | ||
311 | |||
312 | switch (msr) { | ||
313 | case MSR_CORE_PERF_FIXED_CTR_CTRL: | ||
314 | case MSR_CORE_PERF_GLOBAL_STATUS: | ||
315 | case MSR_CORE_PERF_GLOBAL_CTRL: | ||
316 | case MSR_CORE_PERF_GLOBAL_OVF_CTRL: | ||
317 | ret = pmu->version > 1; | ||
318 | break; | ||
319 | default: | ||
320 | ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) | ||
321 | || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) | ||
322 | || get_fixed_pmc(pmu, msr); | ||
323 | break; | ||
324 | } | ||
325 | return ret; | ||
326 | } | ||
327 | |||
328 | int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) | ||
329 | { | ||
330 | struct kvm_pmu *pmu = &vcpu->arch.pmu; | ||
331 | struct kvm_pmc *pmc; | ||
332 | |||
333 | switch (index) { | ||
334 | case MSR_CORE_PERF_FIXED_CTR_CTRL: | ||
335 | *data = pmu->fixed_ctr_ctrl; | ||
336 | return 0; | ||
337 | case MSR_CORE_PERF_GLOBAL_STATUS: | ||
338 | *data = pmu->global_status; | ||
339 | return 0; | ||
340 | case MSR_CORE_PERF_GLOBAL_CTRL: | ||
341 | *data = pmu->global_ctrl; | ||
342 | return 0; | ||
343 | case MSR_CORE_PERF_GLOBAL_OVF_CTRL: | ||
344 | *data = pmu->global_ovf_ctrl; | ||
345 | return 0; | ||
346 | default: | ||
347 | if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || | ||
348 | (pmc = get_fixed_pmc(pmu, index))) { | ||
349 | *data = read_pmc(pmc); | ||
350 | return 0; | ||
351 | } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { | ||
352 | *data = pmc->eventsel; | ||
353 | return 0; | ||
354 | } | ||
355 | } | ||
356 | return 1; | ||
357 | } | ||
358 | |||
359 | int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) | ||
360 | { | ||
361 | struct kvm_pmu *pmu = &vcpu->arch.pmu; | ||
362 | struct kvm_pmc *pmc; | ||
363 | |||
364 | switch (index) { | ||
365 | case MSR_CORE_PERF_FIXED_CTR_CTRL: | ||
366 | if (pmu->fixed_ctr_ctrl == data) | ||
367 | return 0; | ||
368 | if (!(data & 0xfffffffffffff444)) { | ||
369 | reprogram_fixed_counters(pmu, data); | ||
370 | return 0; | ||
371 | } | ||
372 | break; | ||
373 | case MSR_CORE_PERF_GLOBAL_STATUS: | ||
374 | break; /* RO MSR */ | ||
375 | case MSR_CORE_PERF_GLOBAL_CTRL: | ||
376 | if (pmu->global_ctrl == data) | ||
377 | return 0; | ||
378 | if (!(data & pmu->global_ctrl_mask)) { | ||
379 | global_ctrl_changed(pmu, data); | ||
380 | return 0; | ||
381 | } | ||
382 | break; | ||
383 | case MSR_CORE_PERF_GLOBAL_OVF_CTRL: | ||
384 | if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { | ||
385 | pmu->global_status &= ~data; | ||
386 | pmu->global_ovf_ctrl = data; | ||
387 | return 0; | ||
388 | } | ||
389 | break; | ||
390 | default: | ||
391 | if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || | ||
392 | (pmc = get_fixed_pmc(pmu, index))) { | ||
393 | data = (s64)(s32)data; | ||
394 | pmc->counter += data - read_pmc(pmc); | ||
395 | return 0; | ||
396 | } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { | ||
397 | if (data == pmc->eventsel) | ||
398 | return 0; | ||
399 | if (!(data & 0xffffffff00200000ull)) { | ||
400 | reprogram_gp_counter(pmc, data); | ||
401 | return 0; | ||
402 | } | ||
403 | } | ||
404 | } | ||
405 | return 1; | ||
406 | } | ||
407 | |||
408 | int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) | ||
409 | { | ||
410 | struct kvm_pmu *pmu = &vcpu->arch.pmu; | ||
411 | bool fast_mode = pmc & (1u << 31); | ||
412 | bool fixed = pmc & (1u << 30); | ||
413 | struct kvm_pmc *counters; | ||
414 | u64 ctr; | ||
415 | |||
416 | pmc &= (3u << 30) - 1; | ||
417 | if (!fixed && pmc >= pmu->nr_arch_gp_counters) | ||
418 | return 1; | ||
419 | if (fixed && pmc >= pmu->nr_arch_fixed_counters) | ||
420 | return 1; | ||
421 | counters = fixed ? pmu->fixed_counters : pmu->gp_counters; | ||
422 | ctr = read_pmc(&counters[pmc]); | ||
423 | if (fast_mode) | ||
424 | ctr = (u32)ctr; | ||
425 | *data = ctr; | ||
426 | |||
427 | return 0; | ||
428 | } | ||
429 | |||
430 | void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) | ||
431 | { | ||
432 | struct kvm_pmu *pmu = &vcpu->arch.pmu; | ||
433 | struct kvm_cpuid_entry2 *entry; | ||
434 | unsigned bitmap_len; | ||
435 | |||
436 | pmu->nr_arch_gp_counters = 0; | ||
437 | pmu->nr_arch_fixed_counters = 0; | ||
438 | pmu->counter_bitmask[KVM_PMC_GP] = 0; | ||
439 | pmu->counter_bitmask[KVM_PMC_FIXED] = 0; | ||
440 | pmu->version = 0; | ||
441 | |||
442 | entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); | ||
443 | if (!entry) | ||
444 | return; | ||
445 | |||
446 | pmu->version = entry->eax & 0xff; | ||
447 | if (!pmu->version) | ||
448 | return; | ||
449 | |||
450 | pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, | ||
451 | X86_PMC_MAX_GENERIC); | ||
452 | pmu->counter_bitmask[KVM_PMC_GP] = | ||
453 | ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; | ||
454 | bitmap_len = (entry->eax >> 24) & 0xff; | ||
455 | pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1); | ||
456 | |||
457 | if (pmu->version == 1) { | ||
458 | pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1; | ||
459 | return; | ||
460 | } | ||
461 | |||
462 | pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), | ||
463 | X86_PMC_MAX_FIXED); | ||
464 | pmu->counter_bitmask[KVM_PMC_FIXED] = | ||
465 | ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; | ||
466 | pmu->global_ctrl_mask = ~(((1 << pmu->nr_arch_gp_counters) - 1) | ||
467 | | (((1ull << pmu->nr_arch_fixed_counters) - 1) | ||
468 | << X86_PMC_IDX_FIXED)); | ||
469 | } | ||
470 | |||
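Editor's note on kvm_pmu_cpuid_update() above: the guest PMU shape is derived from CPUID leaf 0xA using the architectural perfmon field layout in EAX/EDX. A stand-alone sketch of the same field extraction, fed with a hypothetical leaf value (version 2, 4 GP counters, 48-bit counters, 7 event bits, 3 fixed counters — illustrative numbers, not read from real hardware):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical CPUID.0xA output. */
	uint32_t eax = 0x07300402;   /* bitmap len | ctr width | #GP counters | version */
	uint32_t edx = 0x00000603;   /* fixed ctr width | #fixed counters */

	printf("version           %u\n", eax & 0xff);
	printf("gp counters       %u\n", (eax >> 8) & 0xff);
	printf("gp counter width  %u\n", (eax >> 16) & 0xff);
	printf("event bitmap len  %u\n", (eax >> 24) & 0xff);
	printf("fixed counters    %u\n", edx & 0x1f);
	printf("fixed ctr width   %u\n", (edx >> 5) & 0xff);
	return 0;
}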
471 | void kvm_pmu_init(struct kvm_vcpu *vcpu) | ||
472 | { | ||
473 | int i; | ||
474 | struct kvm_pmu *pmu = &vcpu->arch.pmu; | ||
475 | |||
476 | memset(pmu, 0, sizeof(*pmu)); | ||
477 | for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { | ||
478 | pmu->gp_counters[i].type = KVM_PMC_GP; | ||
479 | pmu->gp_counters[i].vcpu = vcpu; | ||
480 | pmu->gp_counters[i].idx = i; | ||
481 | } | ||
482 | for (i = 0; i < X86_PMC_MAX_FIXED; i++) { | ||
483 | pmu->fixed_counters[i].type = KVM_PMC_FIXED; | ||
484 | pmu->fixed_counters[i].vcpu = vcpu; | ||
485 | pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED; | ||
486 | } | ||
487 | init_irq_work(&pmu->irq_work, trigger_pmi); | ||
488 | kvm_pmu_cpuid_update(vcpu); | ||
489 | } | ||
490 | |||
491 | void kvm_pmu_reset(struct kvm_vcpu *vcpu) | ||
492 | { | ||
493 | struct kvm_pmu *pmu = &vcpu->arch.pmu; | ||
494 | int i; | ||
495 | |||
496 | irq_work_sync(&pmu->irq_work); | ||
497 | for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { | ||
498 | struct kvm_pmc *pmc = &pmu->gp_counters[i]; | ||
499 | stop_counter(pmc); | ||
500 | pmc->counter = pmc->eventsel = 0; | ||
501 | } | ||
502 | |||
503 | for (i = 0; i < X86_PMC_MAX_FIXED; i++) | ||
504 | stop_counter(&pmu->fixed_counters[i]); | ||
505 | |||
506 | pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = | ||
507 | pmu->global_ovf_ctrl = 0; | ||
508 | } | ||
509 | |||
510 | void kvm_pmu_destroy(struct kvm_vcpu *vcpu) | ||
511 | { | ||
512 | kvm_pmu_reset(vcpu); | ||
513 | } | ||
514 | |||
515 | void kvm_handle_pmu_event(struct kvm_vcpu *vcpu) | ||
516 | { | ||
517 | struct kvm_pmu *pmu = &vcpu->arch.pmu; | ||
518 | u64 bitmask; | ||
519 | int bit; | ||
520 | |||
521 | bitmask = pmu->reprogram_pmi; | ||
522 | |||
523 | for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { | ||
524 | struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit); | ||
525 | |||
526 | if (unlikely(!pmc || !pmc->perf_event)) { | ||
527 | clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); | ||
528 | continue; | ||
529 | } | ||
530 | |||
531 | reprogram_idx(pmu, bit); | ||
532 | } | ||
533 | } | ||
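Editor's note: kvm_handle_pmu_event() walks reprogram_pmi with for_each_set_bit(), visiting only the counters whose bits are set. A user-space equivalent of that walk, shown purely to illustrate the iteration (the bitmask value is made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t reprogram = (1ull << 0) | (1ull << 32);   /* hypothetical pending counters */

	while (reprogram) {
		int bit = __builtin_ctzll(reprogram);   /* index of lowest set bit */
		reprogram &= reprogram - 1;             /* clear it */
		printf("reprogram counter %d\n", bit);
	}
	return 0;
}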
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e32243eac2f4..5fa553babe56 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -1014,6 +1014,7 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
1014 | set_intercept(svm, INTERCEPT_NMI); | 1014 | set_intercept(svm, INTERCEPT_NMI); |
1015 | set_intercept(svm, INTERCEPT_SMI); | 1015 | set_intercept(svm, INTERCEPT_SMI); |
1016 | set_intercept(svm, INTERCEPT_SELECTIVE_CR0); | 1016 | set_intercept(svm, INTERCEPT_SELECTIVE_CR0); |
1017 | set_intercept(svm, INTERCEPT_RDPMC); | ||
1017 | set_intercept(svm, INTERCEPT_CPUID); | 1018 | set_intercept(svm, INTERCEPT_CPUID); |
1018 | set_intercept(svm, INTERCEPT_INVD); | 1019 | set_intercept(svm, INTERCEPT_INVD); |
1019 | set_intercept(svm, INTERCEPT_HLT); | 1020 | set_intercept(svm, INTERCEPT_HLT); |
@@ -2770,6 +2771,19 @@ static int emulate_on_interception(struct vcpu_svm *svm) | |||
2770 | return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; | 2771 | return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; |
2771 | } | 2772 | } |
2772 | 2773 | ||
2774 | static int rdpmc_interception(struct vcpu_svm *svm) | ||
2775 | { | ||
2776 | int err; | ||
2777 | |||
2778 | if (!static_cpu_has(X86_FEATURE_NRIPS)) | ||
2779 | return emulate_on_interception(svm); | ||
2780 | |||
2781 | err = kvm_rdpmc(&svm->vcpu); | ||
2782 | kvm_complete_insn_gp(&svm->vcpu, err); | ||
2783 | |||
2784 | return 1; | ||
2785 | } | ||
2786 | |||
2773 | bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) | 2787 | bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) |
2774 | { | 2788 | { |
2775 | unsigned long cr0 = svm->vcpu.arch.cr0; | 2789 | unsigned long cr0 = svm->vcpu.arch.cr0; |
@@ -3190,6 +3204,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { | |||
3190 | [SVM_EXIT_SMI] = nop_on_interception, | 3204 | [SVM_EXIT_SMI] = nop_on_interception, |
3191 | [SVM_EXIT_INIT] = nop_on_interception, | 3205 | [SVM_EXIT_INIT] = nop_on_interception, |
3192 | [SVM_EXIT_VINTR] = interrupt_window_interception, | 3206 | [SVM_EXIT_VINTR] = interrupt_window_interception, |
3207 | [SVM_EXIT_RDPMC] = rdpmc_interception, | ||
3193 | [SVM_EXIT_CPUID] = cpuid_interception, | 3208 | [SVM_EXIT_CPUID] = cpuid_interception, |
3194 | [SVM_EXIT_IRET] = iret_interception, | 3209 | [SVM_EXIT_IRET] = iret_interception, |
3195 | [SVM_EXIT_INVD] = emulate_on_interception, | 3210 | [SVM_EXIT_INVD] = emulate_on_interception, |
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index ae432ea1cd83..6b85cc647f34 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c | |||
@@ -18,9 +18,10 @@ | |||
18 | #include <linux/atomic.h> | 18 | #include <linux/atomic.h> |
19 | #include "kvm_timer.h" | 19 | #include "kvm_timer.h" |
20 | 20 | ||
21 | static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) | 21 | enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) |
22 | { | 22 | { |
23 | int restart_timer = 0; | 23 | struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); |
24 | struct kvm_vcpu *vcpu = ktimer->vcpu; | ||
24 | wait_queue_head_t *q = &vcpu->wq; | 25 | wait_queue_head_t *q = &vcpu->wq; |
25 | 26 | ||
26 | /* | 27 | /* |
@@ -40,26 +41,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) | |||
40 | 41 | ||
41 | if (ktimer->t_ops->is_periodic(ktimer)) { | 42 | if (ktimer->t_ops->is_periodic(ktimer)) { |
42 | hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); | 43 | hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); |
43 | restart_timer = 1; | ||
44 | } | ||
45 | |||
46 | return restart_timer; | ||
47 | } | ||
48 | |||
49 | enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) | ||
50 | { | ||
51 | int restart_timer; | ||
52 | struct kvm_vcpu *vcpu; | ||
53 | struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); | ||
54 | |||
55 | vcpu = ktimer->vcpu; | ||
56 | if (!vcpu) | ||
57 | return HRTIMER_NORESTART; | ||
58 | |||
59 | restart_timer = __kvm_timer_fn(vcpu, ktimer); | ||
60 | if (restart_timer) | ||
61 | return HRTIMER_RESTART; | 44 | return HRTIMER_RESTART; |
62 | else | 45 | } else |
63 | return HRTIMER_NORESTART; | 46 | return HRTIMER_NORESTART; |
64 | } | 47 | } |
65 | |||
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 579a0b51696a..906a7e84200f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -18,6 +18,7 @@ | |||
18 | 18 | ||
19 | #include "irq.h" | 19 | #include "irq.h" |
20 | #include "mmu.h" | 20 | #include "mmu.h" |
21 | #include "cpuid.h" | ||
21 | 22 | ||
22 | #include <linux/kvm_host.h> | 23 | #include <linux/kvm_host.h> |
23 | #include <linux/module.h> | 24 | #include <linux/module.h> |
@@ -1747,7 +1748,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) | |||
1747 | int save_nmsrs, index; | 1748 | int save_nmsrs, index; |
1748 | unsigned long *msr_bitmap; | 1749 | unsigned long *msr_bitmap; |
1749 | 1750 | ||
1750 | vmx_load_host_state(vmx); | ||
1751 | save_nmsrs = 0; | 1751 | save_nmsrs = 0; |
1752 | #ifdef CONFIG_X86_64 | 1752 | #ifdef CONFIG_X86_64 |
1753 | if (is_long_mode(&vmx->vcpu)) { | 1753 | if (is_long_mode(&vmx->vcpu)) { |
@@ -1956,6 +1956,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) | |||
1956 | #endif | 1956 | #endif |
1957 | CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | | 1957 | CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | |
1958 | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | | 1958 | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | |
1959 | CPU_BASED_RDPMC_EXITING | | ||
1959 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | 1960 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; |
1960 | /* | 1961 | /* |
1961 | * We can allow some features even when not supported by the | 1962 | * We can allow some features even when not supported by the |
@@ -2142,12 +2143,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
2142 | return 1; | 2143 | return 1; |
2143 | /* Otherwise falls through */ | 2144 | /* Otherwise falls through */ |
2144 | default: | 2145 | default: |
2145 | vmx_load_host_state(to_vmx(vcpu)); | ||
2146 | if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) | 2146 | if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) |
2147 | return 0; | 2147 | return 0; |
2148 | msr = find_msr_entry(to_vmx(vcpu), msr_index); | 2148 | msr = find_msr_entry(to_vmx(vcpu), msr_index); |
2149 | if (msr) { | 2149 | if (msr) { |
2150 | vmx_load_host_state(to_vmx(vcpu)); | ||
2151 | data = msr->data; | 2150 | data = msr->data; |
2152 | break; | 2151 | break; |
2153 | } | 2152 | } |
@@ -2171,7 +2170,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
2171 | 2170 | ||
2172 | switch (msr_index) { | 2171 | switch (msr_index) { |
2173 | case MSR_EFER: | 2172 | case MSR_EFER: |
2174 | vmx_load_host_state(vmx); | ||
2175 | ret = kvm_set_msr_common(vcpu, msr_index, data); | 2173 | ret = kvm_set_msr_common(vcpu, msr_index, data); |
2176 | break; | 2174 | break; |
2177 | #ifdef CONFIG_X86_64 | 2175 | #ifdef CONFIG_X86_64 |
@@ -2220,7 +2218,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
2220 | break; | 2218 | break; |
2221 | msr = find_msr_entry(vmx, msr_index); | 2219 | msr = find_msr_entry(vmx, msr_index); |
2222 | if (msr) { | 2220 | if (msr) { |
2223 | vmx_load_host_state(vmx); | ||
2224 | msr->data = data; | 2221 | msr->data = data; |
2225 | break; | 2222 | break; |
2226 | } | 2223 | } |
@@ -2414,7 +2411,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
2414 | CPU_BASED_USE_TSC_OFFSETING | | 2411 | CPU_BASED_USE_TSC_OFFSETING | |
2415 | CPU_BASED_MWAIT_EXITING | | 2412 | CPU_BASED_MWAIT_EXITING | |
2416 | CPU_BASED_MONITOR_EXITING | | 2413 | CPU_BASED_MONITOR_EXITING | |
2417 | CPU_BASED_INVLPG_EXITING; | 2414 | CPU_BASED_INVLPG_EXITING | |
2415 | CPU_BASED_RDPMC_EXITING; | ||
2418 | 2416 | ||
2419 | if (yield_on_hlt) | 2417 | if (yield_on_hlt) |
2420 | min |= CPU_BASED_HLT_EXITING; | 2418 | min |= CPU_BASED_HLT_EXITING; |
@@ -2716,11 +2714,13 @@ static gva_t rmode_tss_base(struct kvm *kvm) | |||
2716 | { | 2714 | { |
2717 | if (!kvm->arch.tss_addr) { | 2715 | if (!kvm->arch.tss_addr) { |
2718 | struct kvm_memslots *slots; | 2716 | struct kvm_memslots *slots; |
2717 | struct kvm_memory_slot *slot; | ||
2719 | gfn_t base_gfn; | 2718 | gfn_t base_gfn; |
2720 | 2719 | ||
2721 | slots = kvm_memslots(kvm); | 2720 | slots = kvm_memslots(kvm); |
2722 | base_gfn = slots->memslots[0].base_gfn + | 2721 | slot = id_to_memslot(slots, 0); |
2723 | kvm->memslots->memslots[0].npages - 3; | 2722 | base_gfn = slot->base_gfn + slot->npages - 3; |
2723 | |||
2724 | return base_gfn << PAGE_SHIFT; | 2724 | return base_gfn << PAGE_SHIFT; |
2725 | } | 2725 | } |
2726 | return kvm->arch.tss_addr; | 2726 | return kvm->arch.tss_addr; |
@@ -3945,12 +3945,15 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) | |||
3945 | static void enable_irq_window(struct kvm_vcpu *vcpu) | 3945 | static void enable_irq_window(struct kvm_vcpu *vcpu) |
3946 | { | 3946 | { |
3947 | u32 cpu_based_vm_exec_control; | 3947 | u32 cpu_based_vm_exec_control; |
3948 | if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) | 3948 | if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { |
3949 | /* We can get here when nested_run_pending caused | 3949 | /* |
3950 | * vmx_interrupt_allowed() to return false. In this case, do | 3950 | * We get here if vmx_interrupt_allowed() said we can't |
3951 | * nothing - the interrupt will be injected later. | 3951 | * inject to L1 now because L2 must run. Ask L2 to exit |
3952 | * right after entry, so we can inject to L1 more promptly. | ||
3952 | */ | 3953 | */ |
3954 | kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); | ||
3953 | return; | 3955 | return; |
3956 | } | ||
3954 | 3957 | ||
3955 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 3958 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
3956 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; | 3959 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; |
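Editor's note: the KVM_REQ_IMMEDIATE_EXIT request raised in the comment-described path above is consumed in the generic entry code, which is not part of this excerpt. The usual shape of that consumer — a sketch under that assumption, not code from this hunk — is to latch the request and fire a self-IPI so the guest exits right after VM entry:

	/* sketch of the entry-path consumer; assumes the standard request helpers */
	bool req_immediate_exit = kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);

	/* ... interrupts disabled, about to enter the guest ... */
	if (req_immediate_exit)
		smp_send_reschedule(vcpu->cpu);  /* pending IPI forces an exit just after entry */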
@@ -4077,11 +4080,12 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) | |||
4077 | static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) | 4080 | static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) |
4078 | { | 4081 | { |
4079 | if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { | 4082 | if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { |
4080 | struct vmcs12 *vmcs12; | 4083 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
4081 | if (to_vmx(vcpu)->nested.nested_run_pending) | 4084 | if (to_vmx(vcpu)->nested.nested_run_pending || |
4085 | (vmcs12->idt_vectoring_info_field & | ||
4086 | VECTORING_INFO_VALID_MASK)) | ||
4082 | return 0; | 4087 | return 0; |
4083 | nested_vmx_vmexit(vcpu); | 4088 | nested_vmx_vmexit(vcpu); |
4084 | vmcs12 = get_vmcs12(vcpu); | ||
4085 | vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; | 4089 | vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; |
4086 | vmcs12->vm_exit_intr_info = 0; | 4090 | vmcs12->vm_exit_intr_info = 0; |
4087 | /* fall through to normal code, but now in L1, not L2 */ | 4091 | /* fall through to normal code, but now in L1, not L2 */ |
@@ -4611,6 +4615,16 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) | |||
4611 | return 1; | 4615 | return 1; |
4612 | } | 4616 | } |
4613 | 4617 | ||
4618 | static int handle_rdpmc(struct kvm_vcpu *vcpu) | ||
4619 | { | ||
4620 | int err; | ||
4621 | |||
4622 | err = kvm_rdpmc(vcpu); | ||
4623 | kvm_complete_insn_gp(vcpu, err); | ||
4624 | |||
4625 | return 1; | ||
4626 | } | ||
4627 | |||
4614 | static int handle_wbinvd(struct kvm_vcpu *vcpu) | 4628 | static int handle_wbinvd(struct kvm_vcpu *vcpu) |
4615 | { | 4629 | { |
4616 | skip_emulated_instruction(vcpu); | 4630 | skip_emulated_instruction(vcpu); |
@@ -5561,6 +5575,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
5561 | [EXIT_REASON_HLT] = handle_halt, | 5575 | [EXIT_REASON_HLT] = handle_halt, |
5562 | [EXIT_REASON_INVD] = handle_invd, | 5576 | [EXIT_REASON_INVD] = handle_invd, |
5563 | [EXIT_REASON_INVLPG] = handle_invlpg, | 5577 | [EXIT_REASON_INVLPG] = handle_invlpg, |
5578 | [EXIT_REASON_RDPMC] = handle_rdpmc, | ||
5564 | [EXIT_REASON_VMCALL] = handle_vmcall, | 5579 | [EXIT_REASON_VMCALL] = handle_vmcall, |
5565 | [EXIT_REASON_VMCLEAR] = handle_vmclear, | 5580 | [EXIT_REASON_VMCLEAR] = handle_vmclear, |
5566 | [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, | 5581 | [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4c938da2ba00..1171def5f96b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include "tss.h" | 26 | #include "tss.h" |
27 | #include "kvm_cache_regs.h" | 27 | #include "kvm_cache_regs.h" |
28 | #include "x86.h" | 28 | #include "x86.h" |
29 | #include "cpuid.h" | ||
29 | 30 | ||
30 | #include <linux/clocksource.h> | 31 | #include <linux/clocksource.h> |
31 | #include <linux/interrupt.h> | 32 | #include <linux/interrupt.h> |
@@ -82,8 +83,6 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); | |||
82 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU | 83 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU |
83 | 84 | ||
84 | static void update_cr8_intercept(struct kvm_vcpu *vcpu); | 85 | static void update_cr8_intercept(struct kvm_vcpu *vcpu); |
85 | static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | ||
86 | struct kvm_cpuid_entry2 __user *entries); | ||
87 | static void process_nmi(struct kvm_vcpu *vcpu); | 86 | static void process_nmi(struct kvm_vcpu *vcpu); |
88 | 87 | ||
89 | struct kvm_x86_ops *kvm_x86_ops; | 88 | struct kvm_x86_ops *kvm_x86_ops; |
@@ -574,54 +573,6 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) | |||
574 | } | 573 | } |
575 | EXPORT_SYMBOL_GPL(kvm_set_xcr); | 574 | EXPORT_SYMBOL_GPL(kvm_set_xcr); |
576 | 575 | ||
577 | static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) | ||
578 | { | ||
579 | struct kvm_cpuid_entry2 *best; | ||
580 | |||
581 | best = kvm_find_cpuid_entry(vcpu, 1, 0); | ||
582 | return best && (best->ecx & bit(X86_FEATURE_XSAVE)); | ||
583 | } | ||
584 | |||
585 | static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) | ||
586 | { | ||
587 | struct kvm_cpuid_entry2 *best; | ||
588 | |||
589 | best = kvm_find_cpuid_entry(vcpu, 7, 0); | ||
590 | return best && (best->ebx & bit(X86_FEATURE_SMEP)); | ||
591 | } | ||
592 | |||
593 | static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) | ||
594 | { | ||
595 | struct kvm_cpuid_entry2 *best; | ||
596 | |||
597 | best = kvm_find_cpuid_entry(vcpu, 7, 0); | ||
598 | return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); | ||
599 | } | ||
600 | |||
601 | static void update_cpuid(struct kvm_vcpu *vcpu) | ||
602 | { | ||
603 | struct kvm_cpuid_entry2 *best; | ||
604 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
605 | |||
606 | best = kvm_find_cpuid_entry(vcpu, 1, 0); | ||
607 | if (!best) | ||
608 | return; | ||
609 | |||
610 | /* Update OSXSAVE bit */ | ||
611 | if (cpu_has_xsave && best->function == 0x1) { | ||
612 | best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); | ||
613 | if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) | ||
614 | best->ecx |= bit(X86_FEATURE_OSXSAVE); | ||
615 | } | ||
616 | |||
617 | if (apic) { | ||
618 | if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) | ||
619 | apic->lapic_timer.timer_mode_mask = 3 << 17; | ||
620 | else | ||
621 | apic->lapic_timer.timer_mode_mask = 1 << 17; | ||
622 | } | ||
623 | } | ||
624 | |||
625 | int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 576 | int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
626 | { | 577 | { |
627 | unsigned long old_cr4 = kvm_read_cr4(vcpu); | 578 | unsigned long old_cr4 = kvm_read_cr4(vcpu); |
@@ -655,7 +606,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
655 | kvm_mmu_reset_context(vcpu); | 606 | kvm_mmu_reset_context(vcpu); |
656 | 607 | ||
657 | if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) | 608 | if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) |
658 | update_cpuid(vcpu); | 609 | kvm_update_cpuid(vcpu); |
659 | 610 | ||
660 | return 0; | 611 | return 0; |
661 | } | 612 | } |
@@ -809,6 +760,21 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) | |||
809 | } | 760 | } |
810 | EXPORT_SYMBOL_GPL(kvm_get_dr); | 761 | EXPORT_SYMBOL_GPL(kvm_get_dr); |
811 | 762 | ||
763 | bool kvm_rdpmc(struct kvm_vcpu *vcpu) | ||
764 | { | ||
765 | u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); | ||
766 | u64 data; | ||
767 | int err; | ||
768 | |||
769 | err = kvm_pmu_read_pmc(vcpu, ecx, &data); | ||
770 | if (err) | ||
771 | return err; | ||
772 | kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); | ||
773 | kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32); | ||
774 | return err; | ||
775 | } | ||
776 | EXPORT_SYMBOL_GPL(kvm_rdpmc); | ||
777 | |||
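Editor's note: kvm_rdpmc() above returns the 64-bit counter split across EDX:EAX, just as the real RDPMC instruction does. A trivial stand-alone illustration of that split (the counter value is made up):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t data = 0x0000123456789abcULL;      /* hypothetical counter value */
	uint32_t eax = (uint32_t)data;              /* low 32 bits -> RAX */
	uint32_t edx = (uint32_t)(data >> 32);      /* high 32 bits -> RDX */

	printf("eax=%#" PRIx32 " edx=%#" PRIx32 "\n", eax, edx);
	return 0;
}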
812 | /* | 778 | /* |
813 | * List of msr numbers which we expose to userspace through KVM_GET_MSRS | 779 | * List of msr numbers which we expose to userspace through KVM_GET_MSRS |
814 | * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. | 780 | * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. |
@@ -1358,12 +1324,11 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) | |||
1358 | if (page_num >= blob_size) | 1324 | if (page_num >= blob_size) |
1359 | goto out; | 1325 | goto out; |
1360 | r = -ENOMEM; | 1326 | r = -ENOMEM; |
1361 | page = kzalloc(PAGE_SIZE, GFP_KERNEL); | 1327 | page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); |
1362 | if (!page) | 1328 | if (IS_ERR(page)) { |
1329 | r = PTR_ERR(page); | ||
1363 | goto out; | 1330 | goto out; |
1364 | r = -EFAULT; | 1331 | } |
1365 | if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE)) | ||
1366 | goto out_free; | ||
1367 | if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) | 1332 | if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) |
1368 | goto out_free; | 1333 | goto out_free; |
1369 | r = 0; | 1334 | r = 0; |
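Editor's note: this is the first of several hunks in the patch that replace an open-coded kmalloc()/kzalloc() plus copy_from_user() pair with memdup_user(), which allocates and copies in one step and returns an ERR_PTR on failure. The resulting caller pattern looks roughly like this (a generic sketch, not code from this patch; memdup_user() and the ERR_PTR helpers come from <linux/string.h> and <linux/err.h>):

static long example_copy_in(void __user *argp, size_t len)
{
	void *buf = memdup_user(argp, len);

	if (IS_ERR(buf))
		return PTR_ERR(buf);    /* -EFAULT or -ENOMEM, no separate checks needed */

	/* ... operate on buf ... */
	kfree(buf);
	return 0;
}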
@@ -1652,8 +1617,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1652 | * which we perfectly emulate ;-). Any other value should be at least | 1617 | * which we perfectly emulate ;-). Any other value should be at least |
1653 | * reported, some guests depend on them. | 1618 | * reported, some guests depend on them. |
1654 | */ | 1619 | */ |
1655 | case MSR_P6_EVNTSEL0: | ||
1656 | case MSR_P6_EVNTSEL1: | ||
1657 | case MSR_K7_EVNTSEL0: | 1620 | case MSR_K7_EVNTSEL0: |
1658 | case MSR_K7_EVNTSEL1: | 1621 | case MSR_K7_EVNTSEL1: |
1659 | case MSR_K7_EVNTSEL2: | 1622 | case MSR_K7_EVNTSEL2: |
@@ -1665,8 +1628,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1665 | /* at least RHEL 4 unconditionally writes to the perfctr registers, | 1628 | /* at least RHEL 4 unconditionally writes to the perfctr registers, |
1666 | * so we ignore writes to make it happy. | 1629 | * so we ignore writes to make it happy. |
1667 | */ | 1630 | */ |
1668 | case MSR_P6_PERFCTR0: | ||
1669 | case MSR_P6_PERFCTR1: | ||
1670 | case MSR_K7_PERFCTR0: | 1631 | case MSR_K7_PERFCTR0: |
1671 | case MSR_K7_PERFCTR1: | 1632 | case MSR_K7_PERFCTR1: |
1672 | case MSR_K7_PERFCTR2: | 1633 | case MSR_K7_PERFCTR2: |
@@ -1703,6 +1664,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1703 | default: | 1664 | default: |
1704 | if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) | 1665 | if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) |
1705 | return xen_hvm_config(vcpu, data); | 1666 | return xen_hvm_config(vcpu, data); |
1667 | if (kvm_pmu_msr(vcpu, msr)) | ||
1668 | return kvm_pmu_set_msr(vcpu, msr, data); | ||
1706 | if (!ignore_msrs) { | 1669 | if (!ignore_msrs) { |
1707 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", | 1670 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", |
1708 | msr, data); | 1671 | msr, data); |
@@ -1865,10 +1828,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1865 | case MSR_K8_SYSCFG: | 1828 | case MSR_K8_SYSCFG: |
1866 | case MSR_K7_HWCR: | 1829 | case MSR_K7_HWCR: |
1867 | case MSR_VM_HSAVE_PA: | 1830 | case MSR_VM_HSAVE_PA: |
1868 | case MSR_P6_PERFCTR0: | ||
1869 | case MSR_P6_PERFCTR1: | ||
1870 | case MSR_P6_EVNTSEL0: | ||
1871 | case MSR_P6_EVNTSEL1: | ||
1872 | case MSR_K7_EVNTSEL0: | 1831 | case MSR_K7_EVNTSEL0: |
1873 | case MSR_K7_PERFCTR0: | 1832 | case MSR_K7_PERFCTR0: |
1874 | case MSR_K8_INT_PENDING_MSG: | 1833 | case MSR_K8_INT_PENDING_MSG: |
@@ -1979,6 +1938,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1979 | data = 0xbe702111; | 1938 | data = 0xbe702111; |
1980 | break; | 1939 | break; |
1981 | default: | 1940 | default: |
1941 | if (kvm_pmu_msr(vcpu, msr)) | ||
1942 | return kvm_pmu_get_msr(vcpu, msr, pdata); | ||
1982 | if (!ignore_msrs) { | 1943 | if (!ignore_msrs) { |
1983 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); | 1944 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); |
1984 | return 1; | 1945 | return 1; |
@@ -2037,15 +1998,12 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, | |||
2037 | if (msrs.nmsrs >= MAX_IO_MSRS) | 1998 | if (msrs.nmsrs >= MAX_IO_MSRS) |
2038 | goto out; | 1999 | goto out; |
2039 | 2000 | ||
2040 | r = -ENOMEM; | ||
2041 | size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; | 2001 | size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; |
2042 | entries = kmalloc(size, GFP_KERNEL); | 2002 | entries = memdup_user(user_msrs->entries, size); |
2043 | if (!entries) | 2003 | if (IS_ERR(entries)) { |
2004 | r = PTR_ERR(entries); | ||
2044 | goto out; | 2005 | goto out; |
2045 | 2006 | } | |
2046 | r = -EFAULT; | ||
2047 | if (copy_from_user(entries, user_msrs->entries, size)) | ||
2048 | goto out_free; | ||
2049 | 2007 | ||
2050 | r = n = __msr_io(vcpu, &msrs, entries, do_msr); | 2008 | r = n = __msr_io(vcpu, &msrs, entries, do_msr); |
2051 | if (r < 0) | 2009 | if (r < 0) |
@@ -2265,466 +2223,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) | |||
2265 | vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); | 2223 | vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); |
2266 | } | 2224 | } |
2267 | 2225 | ||
2268 | static int is_efer_nx(void) | ||
2269 | { | ||
2270 | unsigned long long efer = 0; | ||
2271 | |||
2272 | rdmsrl_safe(MSR_EFER, &efer); | ||
2273 | return efer & EFER_NX; | ||
2274 | } | ||
2275 | |||
2276 | static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) | ||
2277 | { | ||
2278 | int i; | ||
2279 | struct kvm_cpuid_entry2 *e, *entry; | ||
2280 | |||
2281 | entry = NULL; | ||
2282 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { | ||
2283 | e = &vcpu->arch.cpuid_entries[i]; | ||
2284 | if (e->function == 0x80000001) { | ||
2285 | entry = e; | ||
2286 | break; | ||
2287 | } | ||
2288 | } | ||
2289 | if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { | ||
2290 | entry->edx &= ~(1 << 20); | ||
2291 | printk(KERN_INFO "kvm: guest NX capability removed\n"); | ||
2292 | } | ||
2293 | } | ||
2294 | |||
2295 | /* when an old userspace process fills a new kernel module */ | ||
2296 | static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, | ||
2297 | struct kvm_cpuid *cpuid, | ||
2298 | struct kvm_cpuid_entry __user *entries) | ||
2299 | { | ||
2300 | int r, i; | ||
2301 | struct kvm_cpuid_entry *cpuid_entries; | ||
2302 | |||
2303 | r = -E2BIG; | ||
2304 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
2305 | goto out; | ||
2306 | r = -ENOMEM; | ||
2307 | cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); | ||
2308 | if (!cpuid_entries) | ||
2309 | goto out; | ||
2310 | r = -EFAULT; | ||
2311 | if (copy_from_user(cpuid_entries, entries, | ||
2312 | cpuid->nent * sizeof(struct kvm_cpuid_entry))) | ||
2313 | goto out_free; | ||
2314 | for (i = 0; i < cpuid->nent; i++) { | ||
2315 | vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; | ||
2316 | vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; | ||
2317 | vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; | ||
2318 | vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; | ||
2319 | vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; | ||
2320 | vcpu->arch.cpuid_entries[i].index = 0; | ||
2321 | vcpu->arch.cpuid_entries[i].flags = 0; | ||
2322 | vcpu->arch.cpuid_entries[i].padding[0] = 0; | ||
2323 | vcpu->arch.cpuid_entries[i].padding[1] = 0; | ||
2324 | vcpu->arch.cpuid_entries[i].padding[2] = 0; | ||
2325 | } | ||
2326 | vcpu->arch.cpuid_nent = cpuid->nent; | ||
2327 | cpuid_fix_nx_cap(vcpu); | ||
2328 | r = 0; | ||
2329 | kvm_apic_set_version(vcpu); | ||
2330 | kvm_x86_ops->cpuid_update(vcpu); | ||
2331 | update_cpuid(vcpu); | ||
2332 | |||
2333 | out_free: | ||
2334 | vfree(cpuid_entries); | ||
2335 | out: | ||
2336 | return r; | ||
2337 | } | ||
2338 | |||
2339 | static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, | ||
2340 | struct kvm_cpuid2 *cpuid, | ||
2341 | struct kvm_cpuid_entry2 __user *entries) | ||
2342 | { | ||
2343 | int r; | ||
2344 | |||
2345 | r = -E2BIG; | ||
2346 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
2347 | goto out; | ||
2348 | r = -EFAULT; | ||
2349 | if (copy_from_user(&vcpu->arch.cpuid_entries, entries, | ||
2350 | cpuid->nent * sizeof(struct kvm_cpuid_entry2))) | ||
2351 | goto out; | ||
2352 | vcpu->arch.cpuid_nent = cpuid->nent; | ||
2353 | kvm_apic_set_version(vcpu); | ||
2354 | kvm_x86_ops->cpuid_update(vcpu); | ||
2355 | update_cpuid(vcpu); | ||
2356 | return 0; | ||
2357 | |||
2358 | out: | ||
2359 | return r; | ||
2360 | } | ||
2361 | |||
2362 | static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, | ||
2363 | struct kvm_cpuid2 *cpuid, | ||
2364 | struct kvm_cpuid_entry2 __user *entries) | ||
2365 | { | ||
2366 | int r; | ||
2367 | |||
2368 | r = -E2BIG; | ||
2369 | if (cpuid->nent < vcpu->arch.cpuid_nent) | ||
2370 | goto out; | ||
2371 | r = -EFAULT; | ||
2372 | if (copy_to_user(entries, &vcpu->arch.cpuid_entries, | ||
2373 | vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) | ||
2374 | goto out; | ||
2375 | return 0; | ||
2376 | |||
2377 | out: | ||
2378 | cpuid->nent = vcpu->arch.cpuid_nent; | ||
2379 | return r; | ||
2380 | } | ||
2381 | |||
2382 | static void cpuid_mask(u32 *word, int wordnum) | ||
2383 | { | ||
2384 | *word &= boot_cpu_data.x86_capability[wordnum]; | ||
2385 | } | ||
2386 | |||
2387 | static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
2388 | u32 index) | ||
2389 | { | ||
2390 | entry->function = function; | ||
2391 | entry->index = index; | ||
2392 | cpuid_count(entry->function, entry->index, | ||
2393 | &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); | ||
2394 | entry->flags = 0; | ||
2395 | } | ||
2396 | |||
2397 | static bool supported_xcr0_bit(unsigned bit) | ||
2398 | { | ||
2399 | u64 mask = ((u64)1 << bit); | ||
2400 | |||
2401 | return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; | ||
2402 | } | ||
2403 | |||
2404 | #define F(x) bit(X86_FEATURE_##x) | ||
2405 | |||
2406 | static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
2407 | u32 index, int *nent, int maxnent) | ||
2408 | { | ||
2409 | unsigned f_nx = is_efer_nx() ? F(NX) : 0; | ||
2410 | #ifdef CONFIG_X86_64 | ||
2411 | unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) | ||
2412 | ? F(GBPAGES) : 0; | ||
2413 | unsigned f_lm = F(LM); | ||
2414 | #else | ||
2415 | unsigned f_gbpages = 0; | ||
2416 | unsigned f_lm = 0; | ||
2417 | #endif | ||
2418 | unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; | ||
2419 | |||
2420 | /* cpuid 1.edx */ | ||
2421 | const u32 kvm_supported_word0_x86_features = | ||
2422 | F(FPU) | F(VME) | F(DE) | F(PSE) | | ||
2423 | F(TSC) | F(MSR) | F(PAE) | F(MCE) | | ||
2424 | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | | ||
2425 | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | | ||
2426 | F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | | ||
2427 | 0 /* Reserved, DS, ACPI */ | F(MMX) | | ||
2428 | F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | | ||
2429 | 0 /* HTT, TM, Reserved, PBE */; | ||
2430 | /* cpuid 0x80000001.edx */ | ||
2431 | const u32 kvm_supported_word1_x86_features = | ||
2432 | F(FPU) | F(VME) | F(DE) | F(PSE) | | ||
2433 | F(TSC) | F(MSR) | F(PAE) | F(MCE) | | ||
2434 | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | | ||
2435 | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | | ||
2436 | F(PAT) | F(PSE36) | 0 /* Reserved */ | | ||
2437 | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | | ||
2438 | F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | | ||
2439 | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); | ||
2440 | /* cpuid 1.ecx */ | ||
2441 | const u32 kvm_supported_word4_x86_features = | ||
2442 | F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | | ||
2443 | 0 /* DS-CPL, VMX, SMX, EST */ | | ||
2444 | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | | ||
2445 | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | | ||
2446 | 0 /* Reserved, DCA */ | F(XMM4_1) | | ||
2447 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | | ||
2448 | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | | ||
2449 | F(F16C) | F(RDRAND); | ||
2450 | /* cpuid 0x80000001.ecx */ | ||
2451 | const u32 kvm_supported_word6_x86_features = | ||
2452 | F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | | ||
2453 | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | | ||
2454 | F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | | ||
2455 | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); | ||
2456 | |||
2457 | /* cpuid 0xC0000001.edx */ | ||
2458 | const u32 kvm_supported_word5_x86_features = | ||
2459 | F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | | ||
2460 | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | | ||
2461 | F(PMM) | F(PMM_EN); | ||
2462 | |||
2463 | /* cpuid 7.0.ebx */ | ||
2464 | const u32 kvm_supported_word9_x86_features = | ||
2465 | F(SMEP) | F(FSGSBASE) | F(ERMS); | ||
2466 | |||
2467 | /* all calls to cpuid_count() should be made on the same cpu */ | ||
2468 | get_cpu(); | ||
2469 | do_cpuid_1_ent(entry, function, index); | ||
2470 | ++*nent; | ||
2471 | |||
2472 | switch (function) { | ||
2473 | case 0: | ||
2474 | entry->eax = min(entry->eax, (u32)0xd); | ||
2475 | break; | ||
2476 | case 1: | ||
2477 | entry->edx &= kvm_supported_word0_x86_features; | ||
2478 | cpuid_mask(&entry->edx, 0); | ||
2479 | entry->ecx &= kvm_supported_word4_x86_features; | ||
2480 | cpuid_mask(&entry->ecx, 4); | ||
2481 | /* we support x2apic emulation even if host does not support | ||
2482 | * it since we emulate x2apic in software */ | ||
2483 | entry->ecx |= F(X2APIC); | ||
2484 | break; | ||
2485 | /* function 2 entries are STATEFUL. That is, repeated cpuid commands | ||
2486 | * may return different values. This forces us to get_cpu() before | ||
2487 | * issuing the first command, and also to emulate this annoying behavior | ||
2488 | * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ | ||
2489 | case 2: { | ||
2490 | int t, times = entry->eax & 0xff; | ||
2491 | |||
2492 | entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | ||
2493 | entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
2494 | for (t = 1; t < times && *nent < maxnent; ++t) { | ||
2495 | do_cpuid_1_ent(&entry[t], function, 0); | ||
2496 | entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | ||
2497 | ++*nent; | ||
2498 | } | ||
2499 | break; | ||
2500 | } | ||
2501 | /* function 4 has additional index. */ | ||
2502 | case 4: { | ||
2503 | int i, cache_type; | ||
2504 | |||
2505 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
2506 | /* read more entries until cache_type is zero */ | ||
2507 | for (i = 1; *nent < maxnent; ++i) { | ||
2508 | cache_type = entry[i - 1].eax & 0x1f; | ||
2509 | if (!cache_type) | ||
2510 | break; | ||
2511 | do_cpuid_1_ent(&entry[i], function, i); | ||
2512 | entry[i].flags |= | ||
2513 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
2514 | ++*nent; | ||
2515 | } | ||
2516 | break; | ||
2517 | } | ||
2518 | case 7: { | ||
2519 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
2520 | /* Mask ebx against host capability word 9 */ | ||
2521 | if (index == 0) { | ||
2522 | entry->ebx &= kvm_supported_word9_x86_features; | ||
2523 | cpuid_mask(&entry->ebx, 9); | ||
2524 | } else | ||
2525 | entry->ebx = 0; | ||
2526 | entry->eax = 0; | ||
2527 | entry->ecx = 0; | ||
2528 | entry->edx = 0; | ||
2529 | break; | ||
2530 | } | ||
2531 | case 9: | ||
2532 | break; | ||
2533 | /* function 0xb has additional index. */ | ||
2534 | case 0xb: { | ||
2535 | int i, level_type; | ||
2536 | |||
2537 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
2538 | /* read more entries until level_type is zero */ | ||
2539 | for (i = 1; *nent < maxnent; ++i) { | ||
2540 | level_type = entry[i - 1].ecx & 0xff00; | ||
2541 | if (!level_type) | ||
2542 | break; | ||
2543 | do_cpuid_1_ent(&entry[i], function, i); | ||
2544 | entry[i].flags |= | ||
2545 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
2546 | ++*nent; | ||
2547 | } | ||
2548 | break; | ||
2549 | } | ||
2550 | case 0xd: { | ||
2551 | int idx, i; | ||
2552 | |||
2553 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
2554 | for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) { | ||
2555 | do_cpuid_1_ent(&entry[i], function, idx); | ||
2556 | if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) | ||
2557 | continue; | ||
2558 | entry[i].flags |= | ||
2559 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
2560 | ++*nent; | ||
2561 | ++i; | ||
2562 | } | ||
2563 | break; | ||
2564 | } | ||
2565 | case KVM_CPUID_SIGNATURE: { | ||
2566 | char signature[12] = "KVMKVMKVM\0\0"; | ||
2567 | u32 *sigptr = (u32 *)signature; | ||
2568 | entry->eax = 0; | ||
2569 | entry->ebx = sigptr[0]; | ||
2570 | entry->ecx = sigptr[1]; | ||
2571 | entry->edx = sigptr[2]; | ||
2572 | break; | ||
2573 | } | ||
2574 | case KVM_CPUID_FEATURES: | ||
2575 | entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | | ||
2576 | (1 << KVM_FEATURE_NOP_IO_DELAY) | | ||
2577 | (1 << KVM_FEATURE_CLOCKSOURCE2) | | ||
2578 | (1 << KVM_FEATURE_ASYNC_PF) | | ||
2579 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); | ||
2580 | |||
2581 | if (sched_info_on()) | ||
2582 | entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); | ||
2583 | |||
2584 | entry->ebx = 0; | ||
2585 | entry->ecx = 0; | ||
2586 | entry->edx = 0; | ||
2587 | break; | ||
2588 | case 0x80000000: | ||
2589 | entry->eax = min(entry->eax, 0x8000001a); | ||
2590 | break; | ||
2591 | case 0x80000001: | ||
2592 | entry->edx &= kvm_supported_word1_x86_features; | ||
2593 | cpuid_mask(&entry->edx, 1); | ||
2594 | entry->ecx &= kvm_supported_word6_x86_features; | ||
2595 | cpuid_mask(&entry->ecx, 6); | ||
2596 | break; | ||
2597 | case 0x80000008: { | ||
2598 | unsigned g_phys_as = (entry->eax >> 16) & 0xff; | ||
2599 | unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); | ||
2600 | unsigned phys_as = entry->eax & 0xff; | ||
2601 | |||
2602 | if (!g_phys_as) | ||
2603 | g_phys_as = phys_as; | ||
2604 | entry->eax = g_phys_as | (virt_as << 8); | ||
2605 | entry->ebx = entry->edx = 0; | ||
2606 | break; | ||
2607 | } | ||
2608 | case 0x80000019: | ||
2609 | entry->ecx = entry->edx = 0; | ||
2610 | break; | ||
2611 | case 0x8000001a: | ||
2612 | break; | ||
2613 | case 0x8000001d: | ||
2614 | break; | ||
2615 | /*Add support for Centaur's CPUID instruction*/ | ||
2616 | case 0xC0000000: | ||
2617 | /*Just support up to 0xC0000004 now*/ | ||
2618 | entry->eax = min(entry->eax, 0xC0000004); | ||
2619 | break; | ||
2620 | case 0xC0000001: | ||
2621 | entry->edx &= kvm_supported_word5_x86_features; | ||
2622 | cpuid_mask(&entry->edx, 5); | ||
2623 | break; | ||
2624 | case 3: /* Processor serial number */ | ||
2625 | case 5: /* MONITOR/MWAIT */ | ||
2626 | case 6: /* Thermal management */ | ||
2627 | case 0xA: /* Architectural Performance Monitoring */ | ||
2628 | case 0x80000007: /* Advanced power management */ | ||
2629 | case 0xC0000002: | ||
2630 | case 0xC0000003: | ||
2631 | case 0xC0000004: | ||
2632 | default: | ||
2633 | entry->eax = entry->ebx = entry->ecx = entry->edx = 0; | ||
2634 | break; | ||
2635 | } | ||
2636 | |||
2637 | kvm_x86_ops->set_supported_cpuid(function, entry); | ||
2638 | |||
2639 | put_cpu(); | ||
2640 | } | ||
2641 | |||
2642 | #undef F | ||
2643 | |||
2644 | static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | ||
2645 | struct kvm_cpuid_entry2 __user *entries) | ||
2646 | { | ||
2647 | struct kvm_cpuid_entry2 *cpuid_entries; | ||
2648 | int limit, nent = 0, r = -E2BIG; | ||
2649 | u32 func; | ||
2650 | |||
2651 | if (cpuid->nent < 1) | ||
2652 | goto out; | ||
2653 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
2654 | cpuid->nent = KVM_MAX_CPUID_ENTRIES; | ||
2655 | r = -ENOMEM; | ||
2656 | cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); | ||
2657 | if (!cpuid_entries) | ||
2658 | goto out; | ||
2659 | |||
2660 | do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); | ||
2661 | limit = cpuid_entries[0].eax; | ||
2662 | for (func = 1; func <= limit && nent < cpuid->nent; ++func) | ||
2663 | do_cpuid_ent(&cpuid_entries[nent], func, 0, | ||
2664 | &nent, cpuid->nent); | ||
2665 | r = -E2BIG; | ||
2666 | if (nent >= cpuid->nent) | ||
2667 | goto out_free; | ||
2668 | |||
2669 | do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); | ||
2670 | limit = cpuid_entries[nent - 1].eax; | ||
2671 | for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) | ||
2672 | do_cpuid_ent(&cpuid_entries[nent], func, 0, | ||
2673 | &nent, cpuid->nent); | ||
2674 | |||
2675 | |||
2676 | |||
2677 | r = -E2BIG; | ||
2678 | if (nent >= cpuid->nent) | ||
2679 | goto out_free; | ||
2680 | |||
2681 | /* Add support for Centaur's CPUID instruction. */ | ||
2682 | if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) { | ||
2683 | do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0, | ||
2684 | &nent, cpuid->nent); | ||
2685 | |||
2686 | r = -E2BIG; | ||
2687 | if (nent >= cpuid->nent) | ||
2688 | goto out_free; | ||
2689 | |||
2690 | limit = cpuid_entries[nent - 1].eax; | ||
2691 | for (func = 0xC0000001; | ||
2692 | func <= limit && nent < cpuid->nent; ++func) | ||
2693 | do_cpuid_ent(&cpuid_entries[nent], func, 0, | ||
2694 | &nent, cpuid->nent); | ||
2695 | |||
2696 | r = -E2BIG; | ||
2697 | if (nent >= cpuid->nent) | ||
2698 | goto out_free; | ||
2699 | } | ||
2700 | |||
2701 | do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, | ||
2702 | cpuid->nent); | ||
2703 | |||
2704 | r = -E2BIG; | ||
2705 | if (nent >= cpuid->nent) | ||
2706 | goto out_free; | ||
2707 | |||
2708 | do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent, | ||
2709 | cpuid->nent); | ||
2710 | |||
2711 | r = -E2BIG; | ||
2712 | if (nent >= cpuid->nent) | ||
2713 | goto out_free; | ||
2714 | |||
2715 | r = -EFAULT; | ||
2716 | if (copy_to_user(entries, cpuid_entries, | ||
2717 | nent * sizeof(struct kvm_cpuid_entry2))) | ||
2718 | goto out_free; | ||
2719 | cpuid->nent = nent; | ||
2720 | r = 0; | ||
2721 | |||
2722 | out_free: | ||
2723 | vfree(cpuid_entries); | ||
2724 | out: | ||
2725 | return r; | ||
2726 | } | ||
2727 | |||
2728 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, | 2226 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, |
2729 | struct kvm_lapic_state *s) | 2227 | struct kvm_lapic_state *s) |
2730 | { | 2228 | { |
@@ -3042,13 +2540,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
3042 | r = -EINVAL; | 2540 | r = -EINVAL; |
3043 | if (!vcpu->arch.apic) | 2541 | if (!vcpu->arch.apic) |
3044 | goto out; | 2542 | goto out; |
3045 | u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); | 2543 | u.lapic = memdup_user(argp, sizeof(*u.lapic)); |
3046 | r = -ENOMEM; | 2544 | if (IS_ERR(u.lapic)) { |
3047 | if (!u.lapic) | 2545 | r = PTR_ERR(u.lapic); |
3048 | goto out; | ||
3049 | r = -EFAULT; | ||
3050 | if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) | ||
3051 | goto out; | 2546 | goto out; |
2547 | } | ||
2548 | |||
3052 | r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); | 2549 | r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); |
3053 | if (r) | 2550 | if (r) |
3054 | goto out; | 2551 | goto out; |
@@ -3227,14 +2724,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
3227 | break; | 2724 | break; |
3228 | } | 2725 | } |
3229 | case KVM_SET_XSAVE: { | 2726 | case KVM_SET_XSAVE: { |
3230 | u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); | 2727 | u.xsave = memdup_user(argp, sizeof(*u.xsave)); |
3231 | r = -ENOMEM; | 2728 | if (IS_ERR(u.xsave)) { |
3232 | if (!u.xsave) | 2729 | r = PTR_ERR(u.xsave); |
3233 | break; | 2730 | goto out; |
3234 | 2731 | } | |
3235 | r = -EFAULT; | ||
3236 | if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) | ||
3237 | break; | ||
3238 | 2732 | ||
3239 | r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); | 2733 | r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); |
3240 | break; | 2734 | break; |
@@ -3255,15 +2749,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
3255 | break; | 2749 | break; |
3256 | } | 2750 | } |
3257 | case KVM_SET_XCRS: { | 2751 | case KVM_SET_XCRS: { |
3258 | u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); | 2752 | u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); |
3259 | r = -ENOMEM; | 2753 | if (IS_ERR(u.xcrs)) { |
3260 | if (!u.xcrs) | 2754 | r = PTR_ERR(u.xcrs); |
3261 | break; | 2755 | goto out; |
3262 | 2756 | } | |
3263 | r = -EFAULT; | ||
3264 | if (copy_from_user(u.xcrs, argp, | ||
3265 | sizeof(struct kvm_xcrs))) | ||
3266 | break; | ||
3267 | 2757 | ||
3268 | r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); | 2758 | r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); |
3269 | break; | 2759 | break; |
@@ -3460,16 +2950,59 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, | |||
3460 | return 0; | 2950 | return 0; |
3461 | } | 2951 | } |
3462 | 2952 | ||
2953 | /** | ||
2954 | * write_protect_slot - write protect a slot for dirty logging | ||
2955 | * @kvm: the kvm instance | ||
2956 | * @memslot: the slot we protect | ||
2957 | * @dirty_bitmap: the bitmap indicating which pages are dirty | ||
2958 | * @nr_dirty_pages: the number of dirty pages | ||
2959 | * | ||
2960 | * We have two ways to find all sptes to protect: | ||
2961 | * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and | ||
2962 | * checks ones that have a spte mapping a page in the slot. | ||
2963 | * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap. | ||
2964 | * | ||
2965 | * Generally speaking, if the dirty pages are few compared to the number of | ||
2966 | * shadow pages, we should use the latter. | ||
2967 | * | ||
2968 | * Note that letting others write into a page marked dirty in the old bitmap | ||
2969 | * through a stale TLB entry is not a problem. That page will be write | ||
2970 | * protected again when we flush the TLB, and will then be reported dirty to | ||
2971 | * user space via the copy of the old bitmap. | ||
2972 | */ | ||
2973 | static void write_protect_slot(struct kvm *kvm, | ||
2974 | struct kvm_memory_slot *memslot, | ||
2975 | unsigned long *dirty_bitmap, | ||
2976 | unsigned long nr_dirty_pages) | ||
2977 | { | ||
2978 | /* Not many dirty pages compared to # of shadow pages. */ | ||
2979 | if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { | ||
2980 | unsigned long gfn_offset; | ||
2981 | |||
2982 | for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { | ||
2983 | unsigned long gfn = memslot->base_gfn + gfn_offset; | ||
2984 | |||
2985 | spin_lock(&kvm->mmu_lock); | ||
2986 | kvm_mmu_rmap_write_protect(kvm, gfn, memslot); | ||
2987 | spin_unlock(&kvm->mmu_lock); | ||
2988 | } | ||
2989 | kvm_flush_remote_tlbs(kvm); | ||
2990 | } else { | ||
2991 | spin_lock(&kvm->mmu_lock); | ||
2992 | kvm_mmu_slot_remove_write_access(kvm, memslot->id); | ||
2993 | spin_unlock(&kvm->mmu_lock); | ||
2994 | } | ||
2995 | } | ||
2996 | |||
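Editor's note: a quick worked example of the threshold in write_protect_slot() above — with, say, 32 dirty pages in the slot and 4096 shadow pages in use, the per-gfn path touches only ~32 rmap chains, whereas kvm_mmu_slot_remove_write_access() would visit all 4096 shadow pages; with the numbers reversed, the full-slot walk wins. In sketch form (illustrative numbers only):

#include <stdio.h>

int main(void)
{
	/* Illustrative numbers, not measured on a real guest. */
	unsigned long nr_dirty_pages = 32, n_used_mmu_pages = 4096;

	if (nr_dirty_pages < n_used_mmu_pages)
		printf("per-gfn rmap protection: ~%lu rmap walks\n", nr_dirty_pages);
	else
		printf("full-slot pass: ~%lu shadow pages visited\n", n_used_mmu_pages);
	return 0;
}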
3463 | /* | 2997 | /* |
3464 | * Get (and clear) the dirty memory log for a memory slot. | 2998 | * Get (and clear) the dirty memory log for a memory slot. |
3465 | */ | 2999 | */ |
3466 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | 3000 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, |
3467 | struct kvm_dirty_log *log) | 3001 | struct kvm_dirty_log *log) |
3468 | { | 3002 | { |
3469 | int r, i; | 3003 | int r; |
3470 | struct kvm_memory_slot *memslot; | 3004 | struct kvm_memory_slot *memslot; |
3471 | unsigned long n; | 3005 | unsigned long n, nr_dirty_pages; |
3472 | unsigned long is_dirty = 0; | ||
3473 | 3006 | ||
3474 | mutex_lock(&kvm->slots_lock); | 3007 | mutex_lock(&kvm->slots_lock); |
3475 | 3008 | ||
@@ -3477,43 +3010,41 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
3477 | if (log->slot >= KVM_MEMORY_SLOTS) | 3010 | if (log->slot >= KVM_MEMORY_SLOTS) |
3478 | goto out; | 3011 | goto out; |
3479 | 3012 | ||
3480 | memslot = &kvm->memslots->memslots[log->slot]; | 3013 | memslot = id_to_memslot(kvm->memslots, log->slot); |
3481 | r = -ENOENT; | 3014 | r = -ENOENT; |
3482 | if (!memslot->dirty_bitmap) | 3015 | if (!memslot->dirty_bitmap) |
3483 | goto out; | 3016 | goto out; |
3484 | 3017 | ||
3485 | n = kvm_dirty_bitmap_bytes(memslot); | 3018 | n = kvm_dirty_bitmap_bytes(memslot); |
3486 | 3019 | nr_dirty_pages = memslot->nr_dirty_pages; | |
3487 | for (i = 0; !is_dirty && i < n/sizeof(long); i++) | ||
3488 | is_dirty = memslot->dirty_bitmap[i]; | ||
3489 | 3020 | ||
3490 | /* If nothing is dirty, don't bother messing with page tables. */ | 3021 | /* If nothing is dirty, don't bother messing with page tables. */ |
3491 | if (is_dirty) { | 3022 | if (nr_dirty_pages) { |
3492 | struct kvm_memslots *slots, *old_slots; | 3023 | struct kvm_memslots *slots, *old_slots; |
3493 | unsigned long *dirty_bitmap; | 3024 | unsigned long *dirty_bitmap, *dirty_bitmap_head; |
3494 | 3025 | ||
3495 | dirty_bitmap = memslot->dirty_bitmap_head; | 3026 | dirty_bitmap = memslot->dirty_bitmap; |
3496 | if (memslot->dirty_bitmap == dirty_bitmap) | 3027 | dirty_bitmap_head = memslot->dirty_bitmap_head; |
3497 | dirty_bitmap += n / sizeof(long); | 3028 | if (dirty_bitmap == dirty_bitmap_head) |
3498 | memset(dirty_bitmap, 0, n); | 3029 | dirty_bitmap_head += n / sizeof(long); |
3030 | memset(dirty_bitmap_head, 0, n); | ||
3499 | 3031 | ||
3500 | r = -ENOMEM; | 3032 | r = -ENOMEM; |
3501 | slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); | 3033 | slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL); |
3502 | if (!slots) | 3034 | if (!slots) |
3503 | goto out; | 3035 | goto out; |
3504 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); | 3036 | |
3505 | slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; | 3037 | memslot = id_to_memslot(slots, log->slot); |
3506 | slots->generation++; | 3038 | memslot->nr_dirty_pages = 0; |
3039 | memslot->dirty_bitmap = dirty_bitmap_head; | ||
3040 | update_memslots(slots, NULL); | ||
3507 | 3041 | ||
3508 | old_slots = kvm->memslots; | 3042 | old_slots = kvm->memslots; |
3509 | rcu_assign_pointer(kvm->memslots, slots); | 3043 | rcu_assign_pointer(kvm->memslots, slots); |
3510 | synchronize_srcu_expedited(&kvm->srcu); | 3044 | synchronize_srcu_expedited(&kvm->srcu); |
3511 | dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; | ||
3512 | kfree(old_slots); | 3045 | kfree(old_slots); |
3513 | 3046 | ||
3514 | spin_lock(&kvm->mmu_lock); | 3047 | write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages); |
3515 | kvm_mmu_slot_remove_write_access(kvm, log->slot); | ||
3516 | spin_unlock(&kvm->mmu_lock); | ||
3517 | 3048 | ||
3518 | r = -EFAULT; | 3049 | r = -EFAULT; |
3519 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) | 3050 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) |
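Editor's note: the rewritten dirty-log path above keeps the double-buffering scheme — dirty_bitmap_head holds two halves, the slot's dirty_bitmap pointer flips between them, and the half being retired is handed to user space while the freshly cleared half starts collecting new dirty bits. A small user-space sketch of that flip (names and sizes are illustrative, not the kernel's; the SRCU-protected memslots swap is omitted):

#include <stdio.h>
#include <string.h>

#define WORDS 4                               /* illustrative bitmap size */

static unsigned long head[2 * WORDS];         /* both halves, like dirty_bitmap_head */
static unsigned long *active = head;          /* half currently collecting dirty bits */

/* Return the half to report to user space and switch collection to the other. */
static unsigned long *flip_dirty_bitmap(void)
{
	unsigned long *old = active;
	unsigned long *next = (active == head) ? head + WORDS : head;

	memset(next, 0, WORDS * sizeof(next[0])); /* the half we switch to starts clean */
	active = next;
	return old;
}

int main(void)
{
	active[0] = 0x5;                          /* pretend some pages were dirtied */
	printf("reported word 0:   %#lx\n", flip_dirty_bitmap()[0]);
	printf("new active word 0: %#lx\n", active[0]);
	return 0;
}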
@@ -3658,14 +3189,14 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3658 | } | 3189 | } |
3659 | case KVM_GET_IRQCHIP: { | 3190 | case KVM_GET_IRQCHIP: { |
3660 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | 3191 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ |
3661 | struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); | 3192 | struct kvm_irqchip *chip; |
3662 | 3193 | ||
3663 | r = -ENOMEM; | 3194 | chip = memdup_user(argp, sizeof(*chip)); |
3664 | if (!chip) | 3195 | if (IS_ERR(chip)) { |
3196 | r = PTR_ERR(chip); | ||
3665 | goto out; | 3197 | goto out; |
3666 | r = -EFAULT; | 3198 | } |
3667 | if (copy_from_user(chip, argp, sizeof *chip)) | 3199 | |
3668 | goto get_irqchip_out; | ||
3669 | r = -ENXIO; | 3200 | r = -ENXIO; |
3670 | if (!irqchip_in_kernel(kvm)) | 3201 | if (!irqchip_in_kernel(kvm)) |
3671 | goto get_irqchip_out; | 3202 | goto get_irqchip_out; |
@@ -3684,14 +3215,14 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3684 | } | 3215 | } |
3685 | case KVM_SET_IRQCHIP: { | 3216 | case KVM_SET_IRQCHIP: { |
3686 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | 3217 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ |
3687 | struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); | 3218 | struct kvm_irqchip *chip; |
3688 | 3219 | ||
3689 | r = -ENOMEM; | 3220 | chip = memdup_user(argp, sizeof(*chip)); |
3690 | if (!chip) | 3221 | if (IS_ERR(chip)) { |
3222 | r = PTR_ERR(chip); | ||
3691 | goto out; | 3223 | goto out; |
3692 | r = -EFAULT; | 3224 | } |
3693 | if (copy_from_user(chip, argp, sizeof *chip)) | 3225 | |
3694 | goto set_irqchip_out; | ||
3695 | r = -ENXIO; | 3226 | r = -ENXIO; |
3696 | if (!irqchip_in_kernel(kvm)) | 3227 | if (!irqchip_in_kernel(kvm)) |
3697 | goto set_irqchip_out; | 3228 | goto set_irqchip_out; |
@@ -3898,12 +3429,7 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, | |||
3898 | kvm_x86_ops->get_segment(vcpu, var, seg); | 3429 | kvm_x86_ops->get_segment(vcpu, var, seg); |
3899 | } | 3430 | } |
3900 | 3431 | ||
3901 | static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) | 3432 | gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) |
3902 | { | ||
3903 | return gpa; | ||
3904 | } | ||
3905 | |||
3906 | static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) | ||
3907 | { | 3433 | { |
3908 | gpa_t t_gpa; | 3434 | gpa_t t_gpa; |
3909 | struct x86_exception exception; | 3435 | struct x86_exception exception; |
@@ -4087,7 +3613,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
4087 | ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); | 3613 | ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); |
4088 | if (ret < 0) | 3614 | if (ret < 0) |
4089 | return 0; | 3615 | return 0; |
4090 | kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); | 3616 | kvm_mmu_pte_write(vcpu, gpa, val, bytes); |
4091 | return 1; | 3617 | return 1; |
4092 | } | 3618 | } |
4093 | 3619 | ||
@@ -4324,7 +3850,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, | |||
4324 | if (!exchanged) | 3850 | if (!exchanged) |
4325 | return X86EMUL_CMPXCHG_FAILED; | 3851 | return X86EMUL_CMPXCHG_FAILED; |
4326 | 3852 | ||
4327 | kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1); | 3853 | kvm_mmu_pte_write(vcpu, gpa, new, bytes); |
4328 | 3854 | ||
4329 | return X86EMUL_CONTINUE; | 3855 | return X86EMUL_CONTINUE; |
4330 | 3856 | ||
@@ -4349,32 +3875,24 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) | |||
4349 | return r; | 3875 | return r; |
4350 | } | 3876 | } |
4351 | 3877 | ||
4352 | 3878 | static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, | |
4353 | static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, | 3879 | unsigned short port, void *val, |
4354 | int size, unsigned short port, void *val, | 3880 | unsigned int count, bool in) |
4355 | unsigned int count) | ||
4356 | { | 3881 | { |
4357 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | 3882 | trace_kvm_pio(!in, port, size, count); |
4358 | |||
4359 | if (vcpu->arch.pio.count) | ||
4360 | goto data_avail; | ||
4361 | |||
4362 | trace_kvm_pio(0, port, size, count); | ||
4363 | 3883 | ||
4364 | vcpu->arch.pio.port = port; | 3884 | vcpu->arch.pio.port = port; |
4365 | vcpu->arch.pio.in = 1; | 3885 | vcpu->arch.pio.in = in; |
4366 | vcpu->arch.pio.count = count; | 3886 | vcpu->arch.pio.count = count; |
4367 | vcpu->arch.pio.size = size; | 3887 | vcpu->arch.pio.size = size; |
4368 | 3888 | ||
4369 | if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { | 3889 | if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { |
4370 | data_avail: | ||
4371 | memcpy(val, vcpu->arch.pio_data, size * count); | ||
4372 | vcpu->arch.pio.count = 0; | 3890 | vcpu->arch.pio.count = 0; |
4373 | return 1; | 3891 | return 1; |
4374 | } | 3892 | } |
4375 | 3893 | ||
4376 | vcpu->run->exit_reason = KVM_EXIT_IO; | 3894 | vcpu->run->exit_reason = KVM_EXIT_IO; |
4377 | vcpu->run->io.direction = KVM_EXIT_IO_IN; | 3895 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; |
4378 | vcpu->run->io.size = size; | 3896 | vcpu->run->io.size = size; |
4379 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; | 3897 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; |
4380 | vcpu->run->io.count = count; | 3898 | vcpu->run->io.count = count; |
@@ -4383,36 +3901,37 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, | |||
4383 | return 0; | 3901 | return 0; |
4384 | } | 3902 | } |
4385 | 3903 | ||
4386 | static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, | 3904 | static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, |
4387 | int size, unsigned short port, | 3905 | int size, unsigned short port, void *val, |
4388 | const void *val, unsigned int count) | 3906 | unsigned int count) |
4389 | { | 3907 | { |
4390 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | 3908 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
3909 | int ret; | ||
4391 | 3910 | ||
4392 | trace_kvm_pio(1, port, size, count); | 3911 | if (vcpu->arch.pio.count) |
4393 | 3912 | goto data_avail; | |
4394 | vcpu->arch.pio.port = port; | ||
4395 | vcpu->arch.pio.in = 0; | ||
4396 | vcpu->arch.pio.count = count; | ||
4397 | vcpu->arch.pio.size = size; | ||
4398 | |||
4399 | memcpy(vcpu->arch.pio_data, val, size * count); | ||
4400 | 3913 | ||
4401 | if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { | 3914 | ret = emulator_pio_in_out(vcpu, size, port, val, count, true); |
3915 | if (ret) { | ||
3916 | data_avail: | ||
3917 | memcpy(val, vcpu->arch.pio_data, size * count); | ||
4402 | vcpu->arch.pio.count = 0; | 3918 | vcpu->arch.pio.count = 0; |
4403 | return 1; | 3919 | return 1; |
4404 | } | 3920 | } |
4405 | 3921 | ||
4406 | vcpu->run->exit_reason = KVM_EXIT_IO; | ||
4407 | vcpu->run->io.direction = KVM_EXIT_IO_OUT; | ||
4408 | vcpu->run->io.size = size; | ||
4409 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; | ||
4410 | vcpu->run->io.count = count; | ||
4411 | vcpu->run->io.port = port; | ||
4412 | |||
4413 | return 0; | 3922 | return 0; |
4414 | } | 3923 | } |
4415 | 3924 | ||
3925 | static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, | ||
3926 | int size, unsigned short port, | ||
3927 | const void *val, unsigned int count) | ||
3928 | { | ||
3929 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | ||
3930 | |||
3931 | memcpy(vcpu->arch.pio_data, val, size * count); | ||
3932 | return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); | ||
3933 | } | ||
3934 | |||
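The rework above folds the two PIO paths into emulator_pio_in_out(); when kernel_pio() cannot service the port, the vcpu returns to userspace with KVM_EXIT_IO and the transfer bytes live in the shared kvm_run page at io.data_offset. A minimal sketch of the userspace side of that contract, assuming an mmap()ed kvm_run page; the two device-model helpers are invented purely for illustration:

#include <linux/kvm.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Invented stand-ins for a real device model. */
static void port_read(uint16_t port, void *data, int size)
{
	memset(data, 0xff, size);              /* no device: float the bus */
}

static void port_write(uint16_t port, const void *data, int size)
{
	if (port == 0x3f8 && size == 1)        /* toy serial console */
		putchar(*(const uint8_t *)data);
}

/* Service one KVM_EXIT_IO exit; 'run' is the mmap()ed kvm_run page. */
static void handle_pio_exit(struct kvm_run *run)
{
	uint8_t *data = (uint8_t *)run + run->io.data_offset;
	uint32_t i;

	for (i = 0; i < run->io.count; i++, data += run->io.size) {
		if (run->io.direction == KVM_EXIT_IO_IN)
			port_read(run->io.port, data, run->io.size);
		else
			port_write(run->io.port, data, run->io.size);
	}
}

After filling the buffer for an IN, the VMM calls KVM_RUN again and the pending vcpu->arch.pio.count path above (the data_avail label) copies the result back into the emulator.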
4416 | static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) | 3935 | static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) |
4417 | { | 3936 | { |
4418 | return kvm_x86_ops->get_segment_base(vcpu, seg); | 3937 | return kvm_x86_ops->get_segment_base(vcpu, seg); |
@@ -4627,6 +4146,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, | |||
4627 | return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); | 4146 | return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); |
4628 | } | 4147 | } |
4629 | 4148 | ||
4149 | static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, | ||
4150 | u32 pmc, u64 *pdata) | ||
4151 | { | ||
4152 | return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata); | ||
4153 | } | ||
4154 | |||
4630 | static void emulator_halt(struct x86_emulate_ctxt *ctxt) | 4155 | static void emulator_halt(struct x86_emulate_ctxt *ctxt) |
4631 | { | 4156 | { |
4632 | emul_to_vcpu(ctxt)->arch.halt_request = 1; | 4157 | emul_to_vcpu(ctxt)->arch.halt_request = 1; |
@@ -4679,6 +4204,7 @@ static struct x86_emulate_ops emulate_ops = { | |||
4679 | .set_dr = emulator_set_dr, | 4204 | .set_dr = emulator_set_dr, |
4680 | .set_msr = emulator_set_msr, | 4205 | .set_msr = emulator_set_msr, |
4681 | .get_msr = emulator_get_msr, | 4206 | .get_msr = emulator_get_msr, |
4207 | .read_pmc = emulator_read_pmc, | ||
4682 | .halt = emulator_halt, | 4208 | .halt = emulator_halt, |
4683 | .wbinvd = emulator_wbinvd, | 4209 | .wbinvd = emulator_wbinvd, |
4684 | .fix_hypercall = emulator_fix_hypercall, | 4210 | .fix_hypercall = emulator_fix_hypercall, |
@@ -4836,6 +4362,50 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) | |||
4836 | return false; | 4362 | return false; |
4837 | } | 4363 | } |
4838 | 4364 | ||
4365 | static bool retry_instruction(struct x86_emulate_ctxt *ctxt, | ||
4366 | unsigned long cr2, int emulation_type) | ||
4367 | { | ||
4368 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | ||
4369 | unsigned long last_retry_eip, last_retry_addr, gpa = cr2; | ||
4370 | |||
4371 | last_retry_eip = vcpu->arch.last_retry_eip; | ||
4372 | last_retry_addr = vcpu->arch.last_retry_addr; | ||
4373 | |||
4374 | /* | ||
4375 | * If the emulation is caused by #PF and it is non-page_table | ||
4376 | * writing instruction, it means the VM-EXIT is caused by shadow | ||
4377 | * page protected, we can zap the shadow page and retry this | ||
4378 | * instruction directly. | ||
4379 | * | ||
4380 | * Note: if the guest uses a non-page-table modifying instruction | ||
4381 | * on the PDE that points to the instruction, then we will unmap | ||
4382 | * the instruction and go to an infinite loop. So, we cache the | ||
4383 | * last retried eip and the last fault address, if we meet the eip | ||
4384 | * and the address again, we can break out of the potential infinite | ||
4385 | * loop. | ||
4386 | */ | ||
4387 | vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; | ||
4388 | |||
4389 | if (!(emulation_type & EMULTYPE_RETRY)) | ||
4390 | return false; | ||
4391 | |||
4392 | if (x86_page_table_writing_insn(ctxt)) | ||
4393 | return false; | ||
4394 | |||
4395 | if (ctxt->eip == last_retry_eip && last_retry_addr == cr2) | ||
4396 | return false; | ||
4397 | |||
4398 | vcpu->arch.last_retry_eip = ctxt->eip; | ||
4399 | vcpu->arch.last_retry_addr = cr2; | ||
4400 | |||
4401 | if (!vcpu->arch.mmu.direct_map) | ||
4402 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); | ||
4403 | |||
4404 | kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
4405 | |||
4406 | return true; | ||
4407 | } | ||
4408 | |||
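retry_instruction() is new here: when emulation was entered because a write faulted on a shadow-protected guest page table and the faulting instruction is not itself a page-table write, KVM simply unprotects the shadow page and lets the guest re-execute, which is far cheaper than emulating. The cached (eip, cr2) pair is what prevents the pathological case described in the comment from looping forever. A distilled, standalone version of that loop-breaker, with names chosen for the sketch rather than taken from the kernel:

#include <stdbool.h>
#include <stdint.h>

struct retry_state {
	uint64_t last_eip;
	uint64_t last_addr;
};

/* Allow at most one retry per (eip, fault address) pair. */
static bool should_retry(struct retry_state *s, uint64_t eip, uint64_t addr)
{
	uint64_t prev_eip = s->last_eip;
	uint64_t prev_addr = s->last_addr;

	/* Forget the cached pair up front: exactly one retry per pair. */
	s->last_eip = s->last_addr = 0;

	if (eip == prev_eip && addr == prev_addr)
		return false;          /* second hit: would loop, emulate instead */

	s->last_eip = eip;
	s->last_addr = addr;
	return true;                   /* unprotect the page, re-enter the guest */
}

Callers opt in by passing EMULTYPE_RETRY; paths that really do need emulation, such as page-table-writing instructions, keep the old behaviour.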
4839 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, | 4409 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, |
4840 | unsigned long cr2, | 4410 | unsigned long cr2, |
4841 | int emulation_type, | 4411 | int emulation_type, |
@@ -4877,6 +4447,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, | |||
4877 | return EMULATE_DONE; | 4447 | return EMULATE_DONE; |
4878 | } | 4448 | } |
4879 | 4449 | ||
4450 | if (retry_instruction(ctxt, cr2, emulation_type)) | ||
4451 | return EMULATE_DONE; | ||
4452 | |||
4880 | /* this is needed for vmware backdoor interface to work since it | 4453 | /* this is needed for vmware backdoor interface to work since it |
4881 | changes registers values during IO operation */ | 4454 | changes registers values during IO operation */ |
4882 | if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { | 4455 | if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { |
@@ -5095,17 +4668,17 @@ static void kvm_timer_init(void) | |||
5095 | 4668 | ||
5096 | static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); | 4669 | static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); |
5097 | 4670 | ||
5098 | static int kvm_is_in_guest(void) | 4671 | int kvm_is_in_guest(void) |
5099 | { | 4672 | { |
5100 | return percpu_read(current_vcpu) != NULL; | 4673 | return __this_cpu_read(current_vcpu) != NULL; |
5101 | } | 4674 | } |
5102 | 4675 | ||
5103 | static int kvm_is_user_mode(void) | 4676 | static int kvm_is_user_mode(void) |
5104 | { | 4677 | { |
5105 | int user_mode = 3; | 4678 | int user_mode = 3; |
5106 | 4679 | ||
5107 | if (percpu_read(current_vcpu)) | 4680 | if (__this_cpu_read(current_vcpu)) |
5108 | user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu)); | 4681 | user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu)); |
5109 | 4682 | ||
5110 | return user_mode != 0; | 4683 | return user_mode != 0; |
5111 | } | 4684 | } |
@@ -5114,8 +4687,8 @@ static unsigned long kvm_get_guest_ip(void) | |||
5114 | { | 4687 | { |
5115 | unsigned long ip = 0; | 4688 | unsigned long ip = 0; |
5116 | 4689 | ||
5117 | if (percpu_read(current_vcpu)) | 4690 | if (__this_cpu_read(current_vcpu)) |
5118 | ip = kvm_rip_read(percpu_read(current_vcpu)); | 4691 | ip = kvm_rip_read(__this_cpu_read(current_vcpu)); |
5119 | 4692 | ||
5120 | return ip; | 4693 | return ip; |
5121 | } | 4694 | } |
@@ -5128,13 +4701,13 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { | |||
5128 | 4701 | ||
5129 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) | 4702 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) |
5130 | { | 4703 | { |
5131 | percpu_write(current_vcpu, vcpu); | 4704 | __this_cpu_write(current_vcpu, vcpu); |
5132 | } | 4705 | } |
5133 | EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); | 4706 | EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); |
5134 | 4707 | ||
5135 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) | 4708 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) |
5136 | { | 4709 | { |
5137 | percpu_write(current_vcpu, NULL); | 4710 | __this_cpu_write(current_vcpu, NULL); |
5138 | } | 4711 | } |
5139 | EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); | 4712 | EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); |
5140 | 4713 | ||
@@ -5233,15 +4806,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) | |||
5233 | } | 4806 | } |
5234 | EXPORT_SYMBOL_GPL(kvm_emulate_halt); | 4807 | EXPORT_SYMBOL_GPL(kvm_emulate_halt); |
5235 | 4808 | ||
5236 | static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, | ||
5237 | unsigned long a1) | ||
5238 | { | ||
5239 | if (is_long_mode(vcpu)) | ||
5240 | return a0; | ||
5241 | else | ||
5242 | return a0 | ((gpa_t)a1 << 32); | ||
5243 | } | ||
5244 | |||
5245 | int kvm_hv_hypercall(struct kvm_vcpu *vcpu) | 4809 | int kvm_hv_hypercall(struct kvm_vcpu *vcpu) |
5246 | { | 4810 | { |
5247 | u64 param, ingpa, outgpa, ret; | 4811 | u64 param, ingpa, outgpa, ret; |
@@ -5337,9 +4901,6 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
5337 | case KVM_HC_VAPIC_POLL_IRQ: | 4901 | case KVM_HC_VAPIC_POLL_IRQ: |
5338 | ret = 0; | 4902 | ret = 0; |
5339 | break; | 4903 | break; |
5340 | case KVM_HC_MMU_OP: | ||
5341 | r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); | ||
5342 | break; | ||
5343 | default: | 4904 | default: |
5344 | ret = -KVM_ENOSYS; | 4905 | ret = -KVM_ENOSYS; |
5345 | break; | 4906 | break; |
@@ -5369,125 +4930,6 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) | |||
5369 | return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); | 4930 | return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); |
5370 | } | 4931 | } |
5371 | 4932 | ||
5372 | static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) | ||
5373 | { | ||
5374 | struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; | ||
5375 | int j, nent = vcpu->arch.cpuid_nent; | ||
5376 | |||
5377 | e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
5378 | /* when no next entry is found, the current entry[i] is reselected */ | ||
5379 | for (j = i + 1; ; j = (j + 1) % nent) { | ||
5380 | struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; | ||
5381 | if (ej->function == e->function) { | ||
5382 | ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
5383 | return j; | ||
5384 | } | ||
5385 | } | ||
5386 | return 0; /* silence gcc, even though control never reaches here */ | ||
5387 | } | ||
5388 | |||
5389 | /* find an entry with matching function, matching index (if needed), and that | ||
5390 | * should be read next (if it's stateful) */ | ||
5391 | static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, | ||
5392 | u32 function, u32 index) | ||
5393 | { | ||
5394 | if (e->function != function) | ||
5395 | return 0; | ||
5396 | if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) | ||
5397 | return 0; | ||
5398 | if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && | ||
5399 | !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) | ||
5400 | return 0; | ||
5401 | return 1; | ||
5402 | } | ||
5403 | |||
5404 | struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, | ||
5405 | u32 function, u32 index) | ||
5406 | { | ||
5407 | int i; | ||
5408 | struct kvm_cpuid_entry2 *best = NULL; | ||
5409 | |||
5410 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { | ||
5411 | struct kvm_cpuid_entry2 *e; | ||
5412 | |||
5413 | e = &vcpu->arch.cpuid_entries[i]; | ||
5414 | if (is_matching_cpuid_entry(e, function, index)) { | ||
5415 | if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) | ||
5416 | move_to_next_stateful_cpuid_entry(vcpu, i); | ||
5417 | best = e; | ||
5418 | break; | ||
5419 | } | ||
5420 | } | ||
5421 | return best; | ||
5422 | } | ||
5423 | EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); | ||
5424 | |||
5425 | int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) | ||
5426 | { | ||
5427 | struct kvm_cpuid_entry2 *best; | ||
5428 | |||
5429 | best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); | ||
5430 | if (!best || best->eax < 0x80000008) | ||
5431 | goto not_found; | ||
5432 | best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); | ||
5433 | if (best) | ||
5434 | return best->eax & 0xff; | ||
5435 | not_found: | ||
5436 | return 36; | ||
5437 | } | ||
5438 | |||
5439 | /* | ||
5440 | * If no match is found, check whether we exceed the vCPU's limit | ||
5441 | * and return the content of the highest valid _standard_ leaf instead. | ||
5442 | * This is to satisfy the CPUID specification. | ||
5443 | */ | ||
5444 | static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, | ||
5445 | u32 function, u32 index) | ||
5446 | { | ||
5447 | struct kvm_cpuid_entry2 *maxlevel; | ||
5448 | |||
5449 | maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); | ||
5450 | if (!maxlevel || maxlevel->eax >= function) | ||
5451 | return NULL; | ||
5452 | if (function & 0x80000000) { | ||
5453 | maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); | ||
5454 | if (!maxlevel) | ||
5455 | return NULL; | ||
5456 | } | ||
5457 | return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); | ||
5458 | } | ||
5459 | |||
5460 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | ||
5461 | { | ||
5462 | u32 function, index; | ||
5463 | struct kvm_cpuid_entry2 *best; | ||
5464 | |||
5465 | function = kvm_register_read(vcpu, VCPU_REGS_RAX); | ||
5466 | index = kvm_register_read(vcpu, VCPU_REGS_RCX); | ||
5467 | kvm_register_write(vcpu, VCPU_REGS_RAX, 0); | ||
5468 | kvm_register_write(vcpu, VCPU_REGS_RBX, 0); | ||
5469 | kvm_register_write(vcpu, VCPU_REGS_RCX, 0); | ||
5470 | kvm_register_write(vcpu, VCPU_REGS_RDX, 0); | ||
5471 | best = kvm_find_cpuid_entry(vcpu, function, index); | ||
5472 | |||
5473 | if (!best) | ||
5474 | best = check_cpuid_limit(vcpu, function, index); | ||
5475 | |||
5476 | if (best) { | ||
5477 | kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); | ||
5478 | kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); | ||
5479 | kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); | ||
5480 | kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); | ||
5481 | } | ||
5482 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
5483 | trace_kvm_cpuid(function, | ||
5484 | kvm_register_read(vcpu, VCPU_REGS_RAX), | ||
5485 | kvm_register_read(vcpu, VCPU_REGS_RBX), | ||
5486 | kvm_register_read(vcpu, VCPU_REGS_RCX), | ||
5487 | kvm_register_read(vcpu, VCPU_REGS_RDX)); | ||
5488 | } | ||
5489 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | ||
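The block above (kvm_find_cpuid_entry(), cpuid_maxphyaddr(), check_cpuid_limit() and kvm_emulate_cpuid()) is not deleted outright; this series moves CPUID handling out of x86.c into its own file. The limit-check behaviour it carries is worth spelling out: a query above the advertised maximum of its range is answered with the highest valid standard leaf, matching real hardware. A simplified standalone model of that fallback, with the table and lookup invented for the sketch:

#include <stddef.h>
#include <stdint.h>

struct leaf { uint32_t function, eax, ebx, ecx, edx; };

static const struct leaf *find_leaf(const struct leaf *t, size_t n, uint32_t fn)
{
	for (size_t i = 0; i < n; i++)
		if (t[i].function == fn)
			return &t[i];
	return NULL;
}

/* Out-of-range query: clamp to the same range's maximum, and for the
 * extended range fall back to the highest *standard* leaf. */
static const struct leaf *cpuid_with_limit(const struct leaf *t, size_t n,
					   uint32_t fn)
{
	const struct leaf *hit = find_leaf(t, n, fn);
	const struct leaf *max;

	if (hit)
		return hit;
	max = find_leaf(t, n, fn & 0x80000000);
	if (!max || max->eax >= fn)
		return NULL;
	if (fn & 0x80000000)
		max = find_leaf(t, n, 0);
	return max ? find_leaf(t, n, max->eax) : NULL;
}

With a table whose leaf 0 advertises a maximum of 0xd, a guest CPUID with EAX = 0x1f comes back with the contents of leaf 0xd, and a query past the extended maximum falls back to the same standard leaf instead of returning stale or undefined values.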
5490 | |||
5491 | /* | 4933 | /* |
5492 | * Check if userspace requested an interrupt window, and that the | 4934 | * Check if userspace requested an interrupt window, and that the |
5493 | * interrupt window is open. | 4935 | * interrupt window is open. |
@@ -5648,6 +5090,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5648 | int r; | 5090 | int r; |
5649 | bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && | 5091 | bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && |
5650 | vcpu->run->request_interrupt_window; | 5092 | vcpu->run->request_interrupt_window; |
5093 | bool req_immediate_exit = 0; | ||
5651 | 5094 | ||
5652 | if (vcpu->requests) { | 5095 | if (vcpu->requests) { |
5653 | if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) | 5096 | if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) |
@@ -5687,7 +5130,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5687 | record_steal_time(vcpu); | 5130 | record_steal_time(vcpu); |
5688 | if (kvm_check_request(KVM_REQ_NMI, vcpu)) | 5131 | if (kvm_check_request(KVM_REQ_NMI, vcpu)) |
5689 | process_nmi(vcpu); | 5132 | process_nmi(vcpu); |
5690 | 5133 | req_immediate_exit = | |
5134 | kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); | ||
5135 | if (kvm_check_request(KVM_REQ_PMU, vcpu)) | ||
5136 | kvm_handle_pmu_event(vcpu); | ||
5137 | if (kvm_check_request(KVM_REQ_PMI, vcpu)) | ||
5138 | kvm_deliver_pmi(vcpu); | ||
5691 | } | 5139 | } |
5692 | 5140 | ||
5693 | r = kvm_mmu_reload(vcpu); | 5141 | r = kvm_mmu_reload(vcpu); |
@@ -5738,6 +5186,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5738 | 5186 | ||
5739 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | 5187 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); |
5740 | 5188 | ||
5189 | if (req_immediate_exit) | ||
5190 | smp_send_reschedule(vcpu->cpu); | ||
5191 | |||
5741 | kvm_guest_enter(); | 5192 | kvm_guest_enter(); |
5742 | 5193 | ||
5743 | if (unlikely(vcpu->arch.switch_db_regs)) { | 5194 | if (unlikely(vcpu->arch.switch_db_regs)) { |
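Three new vcpu requests appear in this hunk. KVM_REQ_PMU and KVM_REQ_PMI are serviced at the top of vcpu_enter_guest() like the existing requests; KVM_REQ_IMMEDIATE_EXIT is special in that it makes the vcpu send a reschedule IPI to its own CPU just before VM entry, so the guest is bounced back out almost immediately. Raising such a request follows the usual pattern, sketched here under the assumption of ordinary kernel context holding a valid vcpu reference:

/* Queue deferred PMU work on a vcpu; it runs at the next guest entry. */
static void request_pmu_work(struct kvm_vcpu *vcpu)
{
	kvm_make_request(KVM_REQ_PMU, vcpu);
	kvm_vcpu_kick(vcpu);    /* force an exit if the vcpu is in guest mode */
}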
@@ -5943,10 +5394,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
5943 | if (r <= 0) | 5394 | if (r <= 0) |
5944 | goto out; | 5395 | goto out; |
5945 | 5396 | ||
5946 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) | ||
5947 | kvm_register_write(vcpu, VCPU_REGS_RAX, | ||
5948 | kvm_run->hypercall.ret); | ||
5949 | |||
5950 | r = __vcpu_run(vcpu); | 5397 | r = __vcpu_run(vcpu); |
5951 | 5398 | ||
5952 | out: | 5399 | out: |
@@ -6148,7 +5595,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
6148 | mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; | 5595 | mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; |
6149 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); | 5596 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); |
6150 | if (sregs->cr4 & X86_CR4_OSXSAVE) | 5597 | if (sregs->cr4 & X86_CR4_OSXSAVE) |
6151 | update_cpuid(vcpu); | 5598 | kvm_update_cpuid(vcpu); |
6152 | 5599 | ||
6153 | idx = srcu_read_lock(&vcpu->kvm->srcu); | 5600 | idx = srcu_read_lock(&vcpu->kvm->srcu); |
6154 | if (!is_long_mode(vcpu) && is_pae(vcpu)) { | 5601 | if (!is_long_mode(vcpu) && is_pae(vcpu)) { |
@@ -6425,6 +5872,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) | |||
6425 | kvm_async_pf_hash_reset(vcpu); | 5872 | kvm_async_pf_hash_reset(vcpu); |
6426 | vcpu->arch.apf.halted = false; | 5873 | vcpu->arch.apf.halted = false; |
6427 | 5874 | ||
5875 | kvm_pmu_reset(vcpu); | ||
5876 | |||
6428 | return kvm_x86_ops->vcpu_reset(vcpu); | 5877 | return kvm_x86_ops->vcpu_reset(vcpu); |
6429 | } | 5878 | } |
6430 | 5879 | ||
@@ -6473,10 +5922,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
6473 | kvm = vcpu->kvm; | 5922 | kvm = vcpu->kvm; |
6474 | 5923 | ||
6475 | vcpu->arch.emulate_ctxt.ops = &emulate_ops; | 5924 | vcpu->arch.emulate_ctxt.ops = &emulate_ops; |
6476 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; | ||
6477 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
6478 | vcpu->arch.mmu.translate_gpa = translate_gpa; | ||
6479 | vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; | ||
6480 | if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) | 5925 | if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) |
6481 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 5926 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
6482 | else | 5927 | else |
@@ -6513,6 +5958,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
6513 | goto fail_free_mce_banks; | 5958 | goto fail_free_mce_banks; |
6514 | 5959 | ||
6515 | kvm_async_pf_hash_reset(vcpu); | 5960 | kvm_async_pf_hash_reset(vcpu); |
5961 | kvm_pmu_init(vcpu); | ||
6516 | 5962 | ||
6517 | return 0; | 5963 | return 0; |
6518 | fail_free_mce_banks: | 5964 | fail_free_mce_banks: |
@@ -6531,6 +5977,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) | |||
6531 | { | 5977 | { |
6532 | int idx; | 5978 | int idx; |
6533 | 5979 | ||
5980 | kvm_pmu_destroy(vcpu); | ||
6534 | kfree(vcpu->arch.mce_banks); | 5981 | kfree(vcpu->arch.mce_banks); |
6535 | kvm_free_lapic(vcpu); | 5982 | kvm_free_lapic(vcpu); |
6536 | idx = srcu_read_lock(&vcpu->kvm->srcu); | 5983 | idx = srcu_read_lock(&vcpu->kvm->srcu); |
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index d36fe237c665..cb80c293cdd8 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -33,9 +33,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr) | |||
33 | return (nr == BP_VECTOR) || (nr == OF_VECTOR); | 33 | return (nr == BP_VECTOR) || (nr == OF_VECTOR); |
34 | } | 34 | } |
35 | 35 | ||
36 | struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, | ||
37 | u32 function, u32 index); | ||
38 | |||
39 | static inline bool is_protmode(struct kvm_vcpu *vcpu) | 36 | static inline bool is_protmode(struct kvm_vcpu *vcpu) |
40 | { | 37 | { |
41 | return kvm_read_cr0_bits(vcpu, X86_CR0_PE); | 38 | return kvm_read_cr0_bits(vcpu, X86_CR0_PE); |
@@ -125,4 +122,6 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, | |||
125 | gva_t addr, void *val, unsigned int bytes, | 122 | gva_t addr, void *val, unsigned int bytes, |
126 | struct x86_exception *exception); | 123 | struct x86_exception *exception); |
127 | 124 | ||
125 | extern u64 host_xcr0; | ||
126 | |||
128 | #endif | 127 | #endif |
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d52623199978..900c76337e8f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/signal.h> | 14 | #include <linux/signal.h> |
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/mm.h> | 16 | #include <linux/mm.h> |
17 | #include <linux/mmu_notifier.h> | ||
17 | #include <linux/preempt.h> | 18 | #include <linux/preempt.h> |
18 | #include <linux/msi.h> | 19 | #include <linux/msi.h> |
19 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
@@ -50,6 +51,9 @@ | |||
50 | #define KVM_REQ_APF_HALT 12 | 51 | #define KVM_REQ_APF_HALT 12 |
51 | #define KVM_REQ_STEAL_UPDATE 13 | 52 | #define KVM_REQ_STEAL_UPDATE 13 |
52 | #define KVM_REQ_NMI 14 | 53 | #define KVM_REQ_NMI 14 |
54 | #define KVM_REQ_IMMEDIATE_EXIT 15 | ||
55 | #define KVM_REQ_PMU 16 | ||
56 | #define KVM_REQ_PMI 17 | ||
53 | 57 | ||
54 | #define KVM_USERSPACE_IRQ_SOURCE_ID 0 | 58 | #define KVM_USERSPACE_IRQ_SOURCE_ID 0 |
55 | 59 | ||
@@ -179,6 +183,7 @@ struct kvm_memory_slot { | |||
179 | unsigned long *rmap; | 183 | unsigned long *rmap; |
180 | unsigned long *dirty_bitmap; | 184 | unsigned long *dirty_bitmap; |
181 | unsigned long *dirty_bitmap_head; | 185 | unsigned long *dirty_bitmap_head; |
186 | unsigned long nr_dirty_pages; | ||
182 | struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; | 187 | struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; |
183 | unsigned long userspace_addr; | 188 | unsigned long userspace_addr; |
184 | int user_alloc; | 189 | int user_alloc; |
@@ -224,11 +229,20 @@ struct kvm_irq_routing_table {}; | |||
224 | 229 | ||
225 | #endif | 230 | #endif |
226 | 231 | ||
232 | #ifndef KVM_MEM_SLOTS_NUM | ||
233 | #define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) | ||
234 | #endif | ||
235 | |||
236 | /* | ||
237 | * Note: | ||
238 | * memslots are not sorted by id anymore, please use id_to_memslot() | ||
239 | * to get the memslot by its id. | ||
240 | */ | ||
227 | struct kvm_memslots { | 241 | struct kvm_memslots { |
228 | int nmemslots; | ||
229 | u64 generation; | 242 | u64 generation; |
230 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + | 243 | struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM]; |
231 | KVM_PRIVATE_MEM_SLOTS]; | 244 | /* The mapping table from slot id to the index in memslots[]. */ |
245 | int id_to_index[KVM_MEM_SLOTS_NUM]; | ||
232 | }; | 246 | }; |
233 | 247 | ||
234 | struct kvm { | 248 | struct kvm { |
@@ -239,7 +253,6 @@ struct kvm { | |||
239 | struct srcu_struct srcu; | 253 | struct srcu_struct srcu; |
240 | #ifdef CONFIG_KVM_APIC_ARCHITECTURE | 254 | #ifdef CONFIG_KVM_APIC_ARCHITECTURE |
241 | u32 bsp_vcpu_id; | 255 | u32 bsp_vcpu_id; |
242 | struct kvm_vcpu *bsp_vcpu; | ||
243 | #endif | 256 | #endif |
244 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; | 257 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; |
245 | atomic_t online_vcpus; | 258 | atomic_t online_vcpus; |
@@ -302,6 +315,11 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) | |||
302 | (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \ | 315 | (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \ |
303 | idx++) | 316 | idx++) |
304 | 317 | ||
318 | #define kvm_for_each_memslot(memslot, slots) \ | ||
319 | for (memslot = &slots->memslots[0]; \ | ||
320 | memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\ | ||
321 | memslot++) | ||
322 | |||
305 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); | 323 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); |
306 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); | 324 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); |
307 | 325 | ||
@@ -314,6 +332,7 @@ void kvm_exit(void); | |||
314 | 332 | ||
315 | void kvm_get_kvm(struct kvm *kvm); | 333 | void kvm_get_kvm(struct kvm *kvm); |
316 | void kvm_put_kvm(struct kvm *kvm); | 334 | void kvm_put_kvm(struct kvm *kvm); |
335 | void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new); | ||
317 | 336 | ||
318 | static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) | 337 | static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) |
319 | { | 338 | { |
@@ -322,6 +341,18 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) | |||
322 | || lockdep_is_held(&kvm->slots_lock)); | 341 | || lockdep_is_held(&kvm->slots_lock)); |
323 | } | 342 | } |
324 | 343 | ||
344 | static inline struct kvm_memory_slot * | ||
345 | id_to_memslot(struct kvm_memslots *slots, int id) | ||
346 | { | ||
347 | int index = slots->id_to_index[id]; | ||
348 | struct kvm_memory_slot *slot; | ||
349 | |||
350 | slot = &slots->memslots[index]; | ||
351 | |||
352 | WARN_ON(slot->id != id); | ||
353 | return slot; | ||
354 | } | ||
355 | |||
325 | #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) | 356 | #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) |
326 | #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) | 357 | #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) |
327 | static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } | 358 | static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } |
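With memslots[] no longer indexed by slot id (the array is kept sorted by size, see the update_memslots()/sort_memslots() changes in kvm_main.c below), every former &slots->memslots[id] access has to go through id_to_memslot(), and iteration goes through kvm_for_each_memslot(), which stops at the first unused (npages == 0) entry. A rough usage sketch, assuming kernel context in which the caller already holds the SRCU read lock or slots_lock as appropriate:

/* Walk every populated slot: there is no nmemslots counter any more. */
static unsigned long count_guest_pages(struct kvm *kvm)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot;
	unsigned long pages = 0;

	kvm_for_each_memslot(memslot, slots)
		pages += memslot->npages;
	return pages;
}

/* Look a slot up by the id userspace passed in an ioctl. */
static struct kvm_memory_slot *slot_by_id(struct kvm *kvm, int id)
{
	return id_to_memslot(kvm_memslots(kvm), id);
}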
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h index 47a070b0520e..ff476ddaf310 100644 --- a/include/linux/kvm_para.h +++ b/include/linux/kvm_para.h | |||
@@ -35,4 +35,3 @@ static inline int kvm_para_has_feature(unsigned int feature) | |||
35 | } | 35 | } |
36 | #endif /* __KERNEL__ */ | 36 | #endif /* __KERNEL__ */ |
37 | #endif /* __LINUX_KVM_PARA_H */ | 37 | #endif /* __LINUX_KVM_PARA_H */ |
38 | |||
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 30c3c7708132..01d3b70fc98a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -71,6 +71,7 @@ void jump_label_inc(struct jump_label_key *key) | |||
71 | atomic_inc(&key->enabled); | 71 | atomic_inc(&key->enabled); |
72 | jump_label_unlock(); | 72 | jump_label_unlock(); |
73 | } | 73 | } |
74 | EXPORT_SYMBOL_GPL(jump_label_inc); | ||
74 | 75 | ||
75 | static void __jump_label_dec(struct jump_label_key *key, | 76 | static void __jump_label_dec(struct jump_label_key *key, |
76 | unsigned long rate_limit, struct delayed_work *work) | 77 | unsigned long rate_limit, struct delayed_work *work) |
@@ -86,6 +87,7 @@ static void __jump_label_dec(struct jump_label_key *key, | |||
86 | 87 | ||
87 | jump_label_unlock(); | 88 | jump_label_unlock(); |
88 | } | 89 | } |
90 | EXPORT_SYMBOL_GPL(jump_label_dec); | ||
89 | 91 | ||
90 | static void jump_label_update_timeout(struct work_struct *work) | 92 | static void jump_label_update_timeout(struct work_struct *work) |
91 | { | 93 | { |
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index a6ec206f36ba..88b2fe3ddf42 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c | |||
@@ -28,9 +28,15 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, | |||
28 | * (addr,len) is fully included in | 28 | * (addr,len) is fully included in |
29 | * (zone->addr, zone->size) | 29 | * (zone->addr, zone->size) |
30 | */ | 30 | */ |
31 | 31 | if (len < 0) | |
32 | return (dev->zone.addr <= addr && | 32 | return 0; |
33 | addr + len <= dev->zone.addr + dev->zone.size); | 33 | if (addr + len < addr) |
34 | return 0; | ||
35 | if (addr < dev->zone.addr) | ||
36 | return 0; | ||
37 | if (addr + len > dev->zone.addr + dev->zone.size) | ||
38 | return 0; | ||
39 | return 1; | ||
34 | } | 40 | } |
35 | 41 | ||
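The rewritten range check is about integer safety as much as readability: a negative length or a wrapping addr + len could previously satisfy the single combined comparison. A standalone illustration of the wrap-around case the new code rejects; the function mirrors the shape of the fixed check but is not the kernel's:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t gpa_t;

static int in_range(gpa_t zone_addr, uint64_t zone_size, gpa_t addr, int len)
{
	if (len < 0)
		return 0;
	if (addr + (uint64_t)len < addr)        /* addr + len wrapped */
		return 0;
	if (addr < zone_addr)
		return 0;
	if (addr + (uint64_t)len > zone_addr + zone_size)
		return 0;
	return 1;
}

int main(void)
{
	/* addr sits near the top of the address space, so addr + 0x10 wraps
	 * to a small value; the old "addr + len <= zone end" test passed
	 * spuriously, the new checks reject it. */
	printf("%d\n", in_range(0xfee00000, 0x1000, 0xfffffffffffffff8ULL, 0x10));
	return 0;
}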
36 | static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev) | 42 | static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev) |
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 3eed61eb4867..dcaf272c26c0 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c | |||
@@ -185,7 +185,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) | |||
185 | irqe.dest_mode = 0; /* Physical mode. */ | 185 | irqe.dest_mode = 0; /* Physical mode. */ |
186 | /* need to read apic_id from apic regiest since | 186 | /* need to read apic_id from apic regiest since |
187 | * it can be rewritten */ | 187 | * it can be rewritten */ |
188 | irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id; | 188 | irqe.dest_id = ioapic->kvm->bsp_vcpu_id; |
189 | } | 189 | } |
190 | #endif | 190 | #endif |
191 | return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); | 191 | return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); |
@@ -332,9 +332,18 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, | |||
332 | (void*)addr, len, val); | 332 | (void*)addr, len, val); |
333 | ASSERT(!(addr & 0xf)); /* check alignment */ | 333 | ASSERT(!(addr & 0xf)); /* check alignment */ |
334 | 334 | ||
335 | if (len == 4 || len == 8) | 335 | switch (len) { |
336 | case 8: | ||
337 | case 4: | ||
336 | data = *(u32 *) val; | 338 | data = *(u32 *) val; |
337 | else { | 339 | break; |
340 | case 2: | ||
341 | data = *(u16 *) val; | ||
342 | break; | ||
343 | case 1: | ||
344 | data = *(u8 *) val; | ||
345 | break; | ||
346 | default: | ||
338 | printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); | 347 | printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); |
339 | return 0; | 348 | return 0; |
340 | } | 349 | } |
@@ -343,7 +352,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, | |||
343 | spin_lock(&ioapic->lock); | 352 | spin_lock(&ioapic->lock); |
344 | switch (addr) { | 353 | switch (addr) { |
345 | case IOAPIC_REG_SELECT: | 354 | case IOAPIC_REG_SELECT: |
346 | ioapic->ioregsel = data; | 355 | ioapic->ioregsel = data & 0xFF; /* 8-bit register */ |
347 | break; | 356 | break; |
348 | 357 | ||
349 | case IOAPIC_REG_WINDOW: | 358 | case IOAPIC_REG_WINDOW: |
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index a195c07fa829..4e5f7b7f1d2b 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c | |||
@@ -134,14 +134,15 @@ unmap_pages: | |||
134 | 134 | ||
135 | static int kvm_iommu_map_memslots(struct kvm *kvm) | 135 | static int kvm_iommu_map_memslots(struct kvm *kvm) |
136 | { | 136 | { |
137 | int i, idx, r = 0; | 137 | int idx, r = 0; |
138 | struct kvm_memslots *slots; | 138 | struct kvm_memslots *slots; |
139 | struct kvm_memory_slot *memslot; | ||
139 | 140 | ||
140 | idx = srcu_read_lock(&kvm->srcu); | 141 | idx = srcu_read_lock(&kvm->srcu); |
141 | slots = kvm_memslots(kvm); | 142 | slots = kvm_memslots(kvm); |
142 | 143 | ||
143 | for (i = 0; i < slots->nmemslots; i++) { | 144 | kvm_for_each_memslot(memslot, slots) { |
144 | r = kvm_iommu_map_pages(kvm, &slots->memslots[i]); | 145 | r = kvm_iommu_map_pages(kvm, memslot); |
145 | if (r) | 146 | if (r) |
146 | break; | 147 | break; |
147 | } | 148 | } |
@@ -311,16 +312,16 @@ static void kvm_iommu_put_pages(struct kvm *kvm, | |||
311 | 312 | ||
312 | static int kvm_iommu_unmap_memslots(struct kvm *kvm) | 313 | static int kvm_iommu_unmap_memslots(struct kvm *kvm) |
313 | { | 314 | { |
314 | int i, idx; | 315 | int idx; |
315 | struct kvm_memslots *slots; | 316 | struct kvm_memslots *slots; |
317 | struct kvm_memory_slot *memslot; | ||
316 | 318 | ||
317 | idx = srcu_read_lock(&kvm->srcu); | 319 | idx = srcu_read_lock(&kvm->srcu); |
318 | slots = kvm_memslots(kvm); | 320 | slots = kvm_memslots(kvm); |
319 | 321 | ||
320 | for (i = 0; i < slots->nmemslots; i++) { | 322 | kvm_for_each_memslot(memslot, slots) |
321 | kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn, | 323 | kvm_iommu_put_pages(kvm, memslot->base_gfn, memslot->npages); |
322 | slots->memslots[i].npages); | 324 | |
323 | } | ||
324 | srcu_read_unlock(&kvm->srcu, idx); | 325 | srcu_read_unlock(&kvm->srcu, idx); |
325 | 326 | ||
326 | return 0; | 327 | return 0; |
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d9cfb782cb81..7287bf5d1c9e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
@@ -440,6 +440,15 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) | |||
440 | 440 | ||
441 | #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ | 441 | #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ |
442 | 442 | ||
443 | static void kvm_init_memslots_id(struct kvm *kvm) | ||
444 | { | ||
445 | int i; | ||
446 | struct kvm_memslots *slots = kvm->memslots; | ||
447 | |||
448 | for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) | ||
449 | slots->id_to_index[i] = slots->memslots[i].id = i; | ||
450 | } | ||
451 | |||
443 | static struct kvm *kvm_create_vm(void) | 452 | static struct kvm *kvm_create_vm(void) |
444 | { | 453 | { |
445 | int r, i; | 454 | int r, i; |
@@ -465,6 +474,7 @@ static struct kvm *kvm_create_vm(void) | |||
465 | kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); | 474 | kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); |
466 | if (!kvm->memslots) | 475 | if (!kvm->memslots) |
467 | goto out_err_nosrcu; | 476 | goto out_err_nosrcu; |
477 | kvm_init_memslots_id(kvm); | ||
468 | if (init_srcu_struct(&kvm->srcu)) | 478 | if (init_srcu_struct(&kvm->srcu)) |
469 | goto out_err_nosrcu; | 479 | goto out_err_nosrcu; |
470 | for (i = 0; i < KVM_NR_BUSES; i++) { | 480 | for (i = 0; i < KVM_NR_BUSES; i++) { |
@@ -547,11 +557,11 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | |||
547 | 557 | ||
548 | void kvm_free_physmem(struct kvm *kvm) | 558 | void kvm_free_physmem(struct kvm *kvm) |
549 | { | 559 | { |
550 | int i; | ||
551 | struct kvm_memslots *slots = kvm->memslots; | 560 | struct kvm_memslots *slots = kvm->memslots; |
561 | struct kvm_memory_slot *memslot; | ||
552 | 562 | ||
553 | for (i = 0; i < slots->nmemslots; ++i) | 563 | kvm_for_each_memslot(memslot, slots) |
554 | kvm_free_physmem_slot(&slots->memslots[i], NULL); | 564 | kvm_free_physmem_slot(memslot, NULL); |
555 | 565 | ||
556 | kfree(kvm->memslots); | 566 | kfree(kvm->memslots); |
557 | } | 567 | } |
@@ -625,10 +635,69 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) | |||
625 | return -ENOMEM; | 635 | return -ENOMEM; |
626 | 636 | ||
627 | memslot->dirty_bitmap_head = memslot->dirty_bitmap; | 637 | memslot->dirty_bitmap_head = memslot->dirty_bitmap; |
638 | memslot->nr_dirty_pages = 0; | ||
628 | return 0; | 639 | return 0; |
629 | } | 640 | } |
630 | #endif /* !CONFIG_S390 */ | 641 | #endif /* !CONFIG_S390 */ |
631 | 642 | ||
643 | static struct kvm_memory_slot * | ||
644 | search_memslots(struct kvm_memslots *slots, gfn_t gfn) | ||
645 | { | ||
646 | struct kvm_memory_slot *memslot; | ||
647 | |||
648 | kvm_for_each_memslot(memslot, slots) | ||
649 | if (gfn >= memslot->base_gfn && | ||
650 | gfn < memslot->base_gfn + memslot->npages) | ||
651 | return memslot; | ||
652 | |||
653 | return NULL; | ||
654 | } | ||
655 | |||
656 | static int cmp_memslot(const void *slot1, const void *slot2) | ||
657 | { | ||
658 | struct kvm_memory_slot *s1, *s2; | ||
659 | |||
660 | s1 = (struct kvm_memory_slot *)slot1; | ||
661 | s2 = (struct kvm_memory_slot *)slot2; | ||
662 | |||
663 | if (s1->npages < s2->npages) | ||
664 | return 1; | ||
665 | if (s1->npages > s2->npages) | ||
666 | return -1; | ||
667 | |||
668 | return 0; | ||
669 | } | ||
670 | |||
671 | /* | ||
672 | * Sort the memslots based on their size, so the larger slots | ||
673 | * will get a better fit. | ||
674 | */ | ||
675 | static void sort_memslots(struct kvm_memslots *slots) | ||
676 | { | ||
677 | int i; | ||
678 | |||
679 | sort(slots->memslots, KVM_MEM_SLOTS_NUM, | ||
680 | sizeof(struct kvm_memory_slot), cmp_memslot, NULL); | ||
681 | |||
682 | for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) | ||
683 | slots->id_to_index[slots->memslots[i].id] = i; | ||
684 | } | ||
685 | |||
686 | void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new) | ||
687 | { | ||
688 | if (new) { | ||
689 | int id = new->id; | ||
690 | struct kvm_memory_slot *old = id_to_memslot(slots, id); | ||
691 | unsigned long npages = old->npages; | ||
692 | |||
693 | *old = *new; | ||
694 | if (new->npages != npages) | ||
695 | sort_memslots(slots); | ||
696 | } | ||
697 | |||
698 | slots->generation++; | ||
699 | } | ||
700 | |||
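gfn-to-memslot translation stays a linear scan (search_memslots() above), so ordering the array by descending npages means the overwhelmingly common lookups into the large RAM slots are satisfied on the first probe or two, while update_memslots() skips the re-sort entirely when a slot's size is unchanged. A toy model of the ordering, using libc qsort() in place of the kernel's sort():

#include <stdio.h>
#include <stdlib.h>

struct slot { unsigned long base_gfn, npages; int id; };

/* Same ordering as cmp_memslot(): larger slots sort first. */
static int cmp_slot(const void *a, const void *b)
{
	const struct slot *s1 = a, *s2 = b;

	if (s1->npages < s2->npages)
		return 1;
	if (s1->npages > s2->npages)
		return -1;
	return 0;
}

int main(void)
{
	struct slot slots[] = {
		{ 0x0,   16,   0 },     /* small firmware slot */
		{ 0x100, 4096, 1 },     /* bulk of guest RAM */
		{ 0x10,  256,  2 },
	};
	int i;

	qsort(slots, 3, sizeof(slots[0]), cmp_slot);
	for (i = 0; i < 3; i++)
		printf("slot %d: %lu pages\n", slots[i].id, slots[i].npages);
	return 0;
}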
632 | /* | 701 | /* |
633 | * Allocate some memory and give it an address in the guest physical address | 702 | * Allocate some memory and give it an address in the guest physical address |
634 | * space. | 703 | * space. |
@@ -662,12 +731,12 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
662 | (void __user *)(unsigned long)mem->userspace_addr, | 731 | (void __user *)(unsigned long)mem->userspace_addr, |
663 | mem->memory_size))) | 732 | mem->memory_size))) |
664 | goto out; | 733 | goto out; |
665 | if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) | 734 | if (mem->slot >= KVM_MEM_SLOTS_NUM) |
666 | goto out; | 735 | goto out; |
667 | if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) | 736 | if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) |
668 | goto out; | 737 | goto out; |
669 | 738 | ||
670 | memslot = &kvm->memslots->memslots[mem->slot]; | 739 | memslot = id_to_memslot(kvm->memslots, mem->slot); |
671 | base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; | 740 | base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; |
672 | npages = mem->memory_size >> PAGE_SHIFT; | 741 | npages = mem->memory_size >> PAGE_SHIFT; |
673 | 742 | ||
@@ -774,15 +843,17 @@ skip_lpage: | |||
774 | #endif /* not defined CONFIG_S390 */ | 843 | #endif /* not defined CONFIG_S390 */ |
775 | 844 | ||
776 | if (!npages) { | 845 | if (!npages) { |
846 | struct kvm_memory_slot *slot; | ||
847 | |||
777 | r = -ENOMEM; | 848 | r = -ENOMEM; |
778 | slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); | 849 | slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), |
850 | GFP_KERNEL); | ||
779 | if (!slots) | 851 | if (!slots) |
780 | goto out_free; | 852 | goto out_free; |
781 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); | 853 | slot = id_to_memslot(slots, mem->slot); |
782 | if (mem->slot >= slots->nmemslots) | 854 | slot->flags |= KVM_MEMSLOT_INVALID; |
783 | slots->nmemslots = mem->slot + 1; | 855 | |
784 | slots->generation++; | 856 | update_memslots(slots, NULL); |
785 | slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; | ||
786 | 857 | ||
787 | old_memslots = kvm->memslots; | 858 | old_memslots = kvm->memslots; |
788 | rcu_assign_pointer(kvm->memslots, slots); | 859 | rcu_assign_pointer(kvm->memslots, slots); |
@@ -810,13 +881,10 @@ skip_lpage: | |||
810 | } | 881 | } |
811 | 882 | ||
812 | r = -ENOMEM; | 883 | r = -ENOMEM; |
813 | slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); | 884 | slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), |
885 | GFP_KERNEL); | ||
814 | if (!slots) | 886 | if (!slots) |
815 | goto out_free; | 887 | goto out_free; |
816 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); | ||
817 | if (mem->slot >= slots->nmemslots) | ||
818 | slots->nmemslots = mem->slot + 1; | ||
819 | slots->generation++; | ||
820 | 888 | ||
821 | /* actual memory is freed via old in kvm_free_physmem_slot below */ | 889 | /* actual memory is freed via old in kvm_free_physmem_slot below */ |
822 | if (!npages) { | 890 | if (!npages) { |
@@ -826,7 +894,7 @@ skip_lpage: | |||
826 | new.lpage_info[i] = NULL; | 894 | new.lpage_info[i] = NULL; |
827 | } | 895 | } |
828 | 896 | ||
829 | slots->memslots[mem->slot] = new; | 897 | update_memslots(slots, &new); |
830 | old_memslots = kvm->memslots; | 898 | old_memslots = kvm->memslots; |
831 | rcu_assign_pointer(kvm->memslots, slots); | 899 | rcu_assign_pointer(kvm->memslots, slots); |
832 | synchronize_srcu_expedited(&kvm->srcu); | 900 | synchronize_srcu_expedited(&kvm->srcu); |
@@ -888,7 +956,7 @@ int kvm_get_dirty_log(struct kvm *kvm, | |||
888 | if (log->slot >= KVM_MEMORY_SLOTS) | 956 | if (log->slot >= KVM_MEMORY_SLOTS) |
889 | goto out; | 957 | goto out; |
890 | 958 | ||
891 | memslot = &kvm->memslots->memslots[log->slot]; | 959 | memslot = id_to_memslot(kvm->memslots, log->slot); |
892 | r = -ENOENT; | 960 | r = -ENOENT; |
893 | if (!memslot->dirty_bitmap) | 961 | if (!memslot->dirty_bitmap) |
894 | goto out; | 962 | goto out; |
@@ -966,16 +1034,7 @@ EXPORT_SYMBOL_GPL(kvm_is_error_hva); | |||
966 | static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, | 1034 | static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, |
967 | gfn_t gfn) | 1035 | gfn_t gfn) |
968 | { | 1036 | { |
969 | int i; | 1037 | return search_memslots(slots, gfn); |
970 | |||
971 | for (i = 0; i < slots->nmemslots; ++i) { | ||
972 | struct kvm_memory_slot *memslot = &slots->memslots[i]; | ||
973 | |||
974 | if (gfn >= memslot->base_gfn | ||
975 | && gfn < memslot->base_gfn + memslot->npages) | ||
976 | return memslot; | ||
977 | } | ||
978 | return NULL; | ||
979 | } | 1038 | } |
980 | 1039 | ||
981 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | 1040 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) |
@@ -986,20 +1045,13 @@ EXPORT_SYMBOL_GPL(gfn_to_memslot); | |||
986 | 1045 | ||
987 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) | 1046 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) |
988 | { | 1047 | { |
989 | int i; | 1048 | struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); |
990 | struct kvm_memslots *slots = kvm_memslots(kvm); | ||
991 | 1049 | ||
992 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | 1050 | if (!memslot || memslot->id >= KVM_MEMORY_SLOTS || |
993 | struct kvm_memory_slot *memslot = &slots->memslots[i]; | 1051 | memslot->flags & KVM_MEMSLOT_INVALID) |
994 | 1052 | return 0; | |
995 | if (memslot->flags & KVM_MEMSLOT_INVALID) | ||
996 | continue; | ||
997 | 1053 | ||
998 | if (gfn >= memslot->base_gfn | 1054 | return 1; |
999 | && gfn < memslot->base_gfn + memslot->npages) | ||
1000 | return 1; | ||
1001 | } | ||
1002 | return 0; | ||
1003 | } | 1055 | } |
1004 | EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); | 1056 | EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); |
1005 | 1057 | ||
@@ -1491,7 +1543,8 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, | |||
1491 | if (memslot && memslot->dirty_bitmap) { | 1543 | if (memslot && memslot->dirty_bitmap) { |
1492 | unsigned long rel_gfn = gfn - memslot->base_gfn; | 1544 | unsigned long rel_gfn = gfn - memslot->base_gfn; |
1493 | 1545 | ||
1494 | __set_bit_le(rel_gfn, memslot->dirty_bitmap); | 1546 | if (!__test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap)) |
1547 | memslot->nr_dirty_pages++; | ||
1495 | } | 1548 | } |
1496 | } | 1549 | } |
1497 | 1550 | ||
@@ -1690,10 +1743,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) | |||
1690 | smp_wmb(); | 1743 | smp_wmb(); |
1691 | atomic_inc(&kvm->online_vcpus); | 1744 | atomic_inc(&kvm->online_vcpus); |
1692 | 1745 | ||
1693 | #ifdef CONFIG_KVM_APIC_ARCHITECTURE | ||
1694 | if (kvm->bsp_vcpu_id == id) | ||
1695 | kvm->bsp_vcpu = vcpu; | ||
1696 | #endif | ||
1697 | mutex_unlock(&kvm->lock); | 1746 | mutex_unlock(&kvm->lock); |
1698 | return r; | 1747 | return r; |
1699 | 1748 | ||
@@ -1768,12 +1817,11 @@ out_free1: | |||
1768 | struct kvm_regs *kvm_regs; | 1817 | struct kvm_regs *kvm_regs; |
1769 | 1818 | ||
1770 | r = -ENOMEM; | 1819 | r = -ENOMEM; |
1771 | kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); | 1820 | kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); |
1772 | if (!kvm_regs) | 1821 | if (IS_ERR(kvm_regs)) { |
1822 | r = PTR_ERR(kvm_regs); | ||
1773 | goto out; | 1823 | goto out; |
1774 | r = -EFAULT; | 1824 | } |
1775 | if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) | ||
1776 | goto out_free2; | ||
1777 | r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); | 1825 | r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); |
1778 | if (r) | 1826 | if (r) |
1779 | goto out_free2; | 1827 | goto out_free2; |
@@ -1797,13 +1845,11 @@ out_free2: | |||
1797 | break; | 1845 | break; |
1798 | } | 1846 | } |
1799 | case KVM_SET_SREGS: { | 1847 | case KVM_SET_SREGS: { |
1800 | kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); | 1848 | kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); |
1801 | r = -ENOMEM; | 1849 | if (IS_ERR(kvm_sregs)) { |
1802 | if (!kvm_sregs) | 1850 | r = PTR_ERR(kvm_sregs); |
1803 | goto out; | ||
1804 | r = -EFAULT; | ||
1805 | if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) | ||
1806 | goto out; | 1851 | goto out; |
1852 | } | ||
1807 | r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); | 1853 | r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); |
1808 | if (r) | 1854 | if (r) |
1809 | goto out; | 1855 | goto out; |
@@ -1899,13 +1945,11 @@ out_free2: | |||
1899 | break; | 1945 | break; |
1900 | } | 1946 | } |
1901 | case KVM_SET_FPU: { | 1947 | case KVM_SET_FPU: { |
1902 | fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); | 1948 | fpu = memdup_user(argp, sizeof(*fpu)); |
1903 | r = -ENOMEM; | 1949 | if (IS_ERR(fpu)) { |
1904 | if (!fpu) | 1950 | r = PTR_ERR(fpu); |
1905 | goto out; | ||
1906 | r = -EFAULT; | ||
1907 | if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) | ||
1908 | goto out; | 1951 | goto out; |
1952 | } | ||
1909 | r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); | 1953 | r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); |
1910 | if (r) | 1954 | if (r) |
1911 | goto out; | 1955 | goto out; |
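The three ioctl hunks above (KVM_SET_REGS, KVM_SET_SREGS, KVM_SET_FPU) are the same conversion: kzalloc() plus copy_from_user() collapses into memdup_user(), which returns an ERR_PTR instead of NULL on failure, hence the IS_ERR()/PTR_ERR() handling. The shape of the idiom, with the argument struct and consumer invented for the sketch:

static long handle_set_ioctl(void __user *argp)
{
	struct my_ioctl_arg *arg;               /* hypothetical ioctl payload */
	long r;

	arg = memdup_user(argp, sizeof(*arg));  /* allocate + copy in one step */
	if (IS_ERR(arg))
		return PTR_ERR(arg);            /* -ENOMEM or -EFAULT */

	r = apply_arg(arg);                     /* hypothetical consumer */
	kfree(arg);
	return r;
}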
@@ -2520,10 +2564,9 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, | |||
2520 | if (bus->dev_count > NR_IOBUS_DEVS-1) | 2564 | if (bus->dev_count > NR_IOBUS_DEVS-1) |
2521 | return -ENOSPC; | 2565 | return -ENOSPC; |
2522 | 2566 | ||
2523 | new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); | 2567 | new_bus = kmemdup(bus, sizeof(struct kvm_io_bus), GFP_KERNEL); |
2524 | if (!new_bus) | 2568 | if (!new_bus) |
2525 | return -ENOMEM; | 2569 | return -ENOMEM; |
2526 | memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); | ||
2527 | kvm_io_bus_insert_dev(new_bus, dev, addr, len); | 2570 | kvm_io_bus_insert_dev(new_bus, dev, addr, len); |
2528 | rcu_assign_pointer(kvm->buses[bus_idx], new_bus); | 2571 | rcu_assign_pointer(kvm->buses[bus_idx], new_bus); |
2529 | synchronize_srcu_expedited(&kvm->srcu); | 2572 | synchronize_srcu_expedited(&kvm->srcu); |
@@ -2539,13 +2582,12 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, | |||
2539 | int i, r; | 2582 | int i, r; |
2540 | struct kvm_io_bus *new_bus, *bus; | 2583 | struct kvm_io_bus *new_bus, *bus; |
2541 | 2584 | ||
2542 | new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); | 2585 | bus = kvm->buses[bus_idx]; |
2586 | |||
2587 | new_bus = kmemdup(bus, sizeof(*bus), GFP_KERNEL); | ||
2543 | if (!new_bus) | 2588 | if (!new_bus) |
2544 | return -ENOMEM; | 2589 | return -ENOMEM; |
2545 | 2590 | ||
2546 | bus = kvm->buses[bus_idx]; | ||
2547 | memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); | ||
2548 | |||
2549 | r = -ENOENT; | 2591 | r = -ENOENT; |
2550 | for (i = 0; i < new_bus->dev_count; i++) | 2592 | for (i = 0; i < new_bus->dev_count; i++) |
2551 | if (new_bus->range[i].dev == dev) { | 2593 | if (new_bus->range[i].dev == dev) { |
@@ -2612,15 +2654,29 @@ static const struct file_operations *stat_fops[] = { | |||
2612 | [KVM_STAT_VM] = &vm_stat_fops, | 2654 | [KVM_STAT_VM] = &vm_stat_fops, |
2613 | }; | 2655 | }; |
2614 | 2656 | ||
2615 | static void kvm_init_debug(void) | 2657 | static int kvm_init_debug(void) |
2616 | { | 2658 | { |
2659 | int r = -EFAULT; | ||
2617 | struct kvm_stats_debugfs_item *p; | 2660 | struct kvm_stats_debugfs_item *p; |
2618 | 2661 | ||
2619 | kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); | 2662 | kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); |
2620 | for (p = debugfs_entries; p->name; ++p) | 2663 | if (kvm_debugfs_dir == NULL) |
2664 | goto out; | ||
2665 | |||
2666 | for (p = debugfs_entries; p->name; ++p) { | ||
2621 | p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, | 2667 | p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, |
2622 | (void *)(long)p->offset, | 2668 | (void *)(long)p->offset, |
2623 | stat_fops[p->kind]); | 2669 | stat_fops[p->kind]); |
2670 | if (p->dentry == NULL) | ||
2671 | goto out_dir; | ||
2672 | } | ||
2673 | |||
2674 | return 0; | ||
2675 | |||
2676 | out_dir: | ||
2677 | debugfs_remove_recursive(kvm_debugfs_dir); | ||
2678 | out: | ||
2679 | return r; | ||
2624 | } | 2680 | } |
2625 | 2681 | ||
2626 | static void kvm_exit_debug(void) | 2682 | static void kvm_exit_debug(void) |
@@ -2764,10 +2820,16 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, | |||
2764 | kvm_preempt_ops.sched_in = kvm_sched_in; | 2820 | kvm_preempt_ops.sched_in = kvm_sched_in; |
2765 | kvm_preempt_ops.sched_out = kvm_sched_out; | 2821 | kvm_preempt_ops.sched_out = kvm_sched_out; |
2766 | 2822 | ||
2767 | kvm_init_debug(); | 2823 | r = kvm_init_debug(); |
2824 | if (r) { | ||
2825 | printk(KERN_ERR "kvm: create debugfs files failed\n"); | ||
2826 | goto out_undebugfs; | ||
2827 | } | ||
2768 | 2828 | ||
2769 | return 0; | 2829 | return 0; |
2770 | 2830 | ||
2831 | out_undebugfs: | ||
2832 | unregister_syscore_ops(&kvm_syscore_ops); | ||
2771 | out_unreg: | 2833 | out_unreg: |
2772 | kvm_async_pf_deinit(); | 2834 | kvm_async_pf_deinit(); |
2773 | out_free: | 2835 | out_free: |