Diffstat (limited to 'arch/x86')
 -rw-r--r--  arch/x86/include/asm/kvm_host.h |  15
 -rw-r--r--  arch/x86/kvm/Makefile           |  13
 -rw-r--r--  arch/x86/kvm/emulate.c          | 391
 -rw-r--r--  arch/x86/kvm/lapic.c            |   4
 -rw-r--r--  arch/x86/kvm/mmu.c              | 301
 -rw-r--r--  arch/x86/kvm/mmu.h              |  18
 -rw-r--r--  arch/x86/kvm/mmutrace.h         |  76
 -rw-r--r--  arch/x86/kvm/paging_tmpl.h      |  10
 -rw-r--r--  arch/x86/kvm/svm.c              |  10
 -rw-r--r--  arch/x86/kvm/trace.h            |  21
 -rw-r--r--  arch/x86/kvm/vmx.c              |  19
 -rw-r--r--  arch/x86/kvm/x86.c              |  80
 12 files changed, 567 insertions(+), 391 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index af9c5525434d..f87f7fcefa0a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -222,14 +222,22 @@ struct kvm_mmu_page { | |||
222 | int root_count; /* Currently serving as active root */ | 222 | int root_count; /* Currently serving as active root */ |
223 | unsigned int unsync_children; | 223 | unsigned int unsync_children; |
224 | unsigned long parent_ptes; /* Reverse mapping for parent_pte */ | 224 | unsigned long parent_ptes; /* Reverse mapping for parent_pte */ |
225 | |||
226 | /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */ | ||
227 | unsigned long mmu_valid_gen; | ||
228 | |||
225 | DECLARE_BITMAP(unsync_child_bitmap, 512); | 229 | DECLARE_BITMAP(unsync_child_bitmap, 512); |
226 | 230 | ||
227 | #ifdef CONFIG_X86_32 | 231 | #ifdef CONFIG_X86_32 |
232 | /* | ||
233 | * Used out of the mmu-lock to avoid reading spte values while an | ||
234 | * update is in progress; see the comments in __get_spte_lockless(). | ||
235 | */ | ||
228 | int clear_spte_count; | 236 | int clear_spte_count; |
229 | #endif | 237 | #endif |
230 | 238 | ||
239 | /* Number of writes since the last time traversal visited this page. */ | ||
231 | int write_flooding_count; | 240 | int write_flooding_count; |
232 | bool mmio_cached; | ||
233 | }; | 241 | }; |
234 | 242 | ||
235 | struct kvm_pio_request { | 243 | struct kvm_pio_request { |
@@ -529,11 +537,14 @@ struct kvm_arch { | |||
529 | unsigned int n_requested_mmu_pages; | 537 | unsigned int n_requested_mmu_pages; |
530 | unsigned int n_max_mmu_pages; | 538 | unsigned int n_max_mmu_pages; |
531 | unsigned int indirect_shadow_pages; | 539 | unsigned int indirect_shadow_pages; |
540 | unsigned long mmu_valid_gen; | ||
532 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; | 541 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; |
533 | /* | 542 | /* |
534 | * Hash table of struct kvm_mmu_page. | 543 | * Hash table of struct kvm_mmu_page. |
535 | */ | 544 | */ |
536 | struct list_head active_mmu_pages; | 545 | struct list_head active_mmu_pages; |
546 | struct list_head zapped_obsolete_pages; | ||
547 | |||
537 | struct list_head assigned_dev_head; | 548 | struct list_head assigned_dev_head; |
538 | struct iommu_domain *iommu_domain; | 549 | struct iommu_domain *iommu_domain; |
539 | int iommu_flags; | 550 | int iommu_flags; |
@@ -769,7 +780,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, | |||
769 | struct kvm_memory_slot *slot, | 780 | struct kvm_memory_slot *slot, |
770 | gfn_t gfn_offset, unsigned long mask); | 781 | gfn_t gfn_offset, unsigned long mask); |
771 | void kvm_mmu_zap_all(struct kvm *kvm); | 782 | void kvm_mmu_zap_all(struct kvm *kvm); |
772 | void kvm_mmu_zap_mmio_sptes(struct kvm *kvm); | 783 | void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); |
773 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); | 784 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); |
774 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); | 785 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); |
775 | 786 | ||
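Note on the kvm_host.h hunks above: every shadow page is now stamped at creation with the VM-wide mmu_valid_gen, and a page whose stamp no longer matches is considered obsolete; the new zapped_obsolete_pages list collects such pages so they can be freed lazily rather than in one long mmu_lock critical section. A minimal standalone sketch of the check (the real helper, is_obsolete_sp(), is added in the mmu.c part of this diff; the *_model names here are only for illustration):

struct kvm_arch_model     { unsigned long mmu_valid_gen; };
struct kvm_mmu_page_model { unsigned long mmu_valid_gen; };

/* A page created before the last bump of arch->mmu_valid_gen is obsolete:
 * it must not be reused for new translations and is only zapped lazily,
 * via the zapped_obsolete_pages list, not synchronously. */
static int sp_is_obsolete(const struct kvm_arch_model *arch,
                          const struct kvm_mmu_page_model *sp)
{
        return sp->mmu_valid_gen != arch->mmu_valid_gen;
}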
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d609e1d84048..bf4fb04d0112 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,13 @@ CFLAGS_x86.o := -I. | |||
5 | CFLAGS_svm.o := -I. | 5 | CFLAGS_svm.o := -I. |
6 | CFLAGS_vmx.o := -I. | 6 | CFLAGS_vmx.o := -I. |
7 | 7 | ||
8 | kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ | 8 | KVM := ../../../virt/kvm |
9 | coalesced_mmio.o irq_comm.o eventfd.o \ | 9 | |
10 | irqchip.o) | 10 | kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \ |
11 | kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(addprefix ../../../virt/kvm/, \ | 11 | $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \ |
12 | assigned-dev.o iommu.o) | 12 | $(KVM)/eventfd.o $(KVM)/irqchip.o |
13 | kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) | 13 | kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o |
14 | kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o | ||
14 | 15 | ||
15 | kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ | 16 | kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ |
16 | i8254.o cpuid.o pmu.o | 17 | i8254.o cpuid.o pmu.o |
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5953dcea752d..2bc1e81045b0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -61,6 +61,8 @@ | |||
61 | #define OpMem8 26ull /* 8-bit zero extended memory operand */ | 61 | #define OpMem8 26ull /* 8-bit zero extended memory operand */ |
62 | #define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */ | 62 | #define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */ |
63 | #define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */ | 63 | #define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */ |
64 | #define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */ | ||
65 | #define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */ | ||
64 | 66 | ||
65 | #define OpBits 5 /* Width of operand field */ | 67 | #define OpBits 5 /* Width of operand field */ |
66 | #define OpMask ((1ull << OpBits) - 1) | 68 | #define OpMask ((1ull << OpBits) - 1) |
@@ -86,6 +88,7 @@ | |||
86 | #define DstMem64 (OpMem64 << DstShift) | 88 | #define DstMem64 (OpMem64 << DstShift) |
87 | #define DstImmUByte (OpImmUByte << DstShift) | 89 | #define DstImmUByte (OpImmUByte << DstShift) |
88 | #define DstDX (OpDX << DstShift) | 90 | #define DstDX (OpDX << DstShift) |
91 | #define DstAccLo (OpAccLo << DstShift) | ||
89 | #define DstMask (OpMask << DstShift) | 92 | #define DstMask (OpMask << DstShift) |
90 | /* Source operand type. */ | 93 | /* Source operand type. */ |
91 | #define SrcShift 6 | 94 | #define SrcShift 6 |
@@ -108,6 +111,7 @@ | |||
108 | #define SrcImm64 (OpImm64 << SrcShift) | 111 | #define SrcImm64 (OpImm64 << SrcShift) |
109 | #define SrcDX (OpDX << SrcShift) | 112 | #define SrcDX (OpDX << SrcShift) |
110 | #define SrcMem8 (OpMem8 << SrcShift) | 113 | #define SrcMem8 (OpMem8 << SrcShift) |
114 | #define SrcAccHi (OpAccHi << SrcShift) | ||
111 | #define SrcMask (OpMask << SrcShift) | 115 | #define SrcMask (OpMask << SrcShift) |
112 | #define BitOp (1<<11) | 116 | #define BitOp (1<<11) |
113 | #define MemAbs (1<<12) /* Memory operand is absolute displacement */ | 117 | #define MemAbs (1<<12) /* Memory operand is absolute displacement */ |
@@ -138,6 +142,7 @@ | |||
138 | /* Source 2 operand type */ | 142 | /* Source 2 operand type */ |
139 | #define Src2Shift (31) | 143 | #define Src2Shift (31) |
140 | #define Src2None (OpNone << Src2Shift) | 144 | #define Src2None (OpNone << Src2Shift) |
145 | #define Src2Mem (OpMem << Src2Shift) | ||
141 | #define Src2CL (OpCL << Src2Shift) | 146 | #define Src2CL (OpCL << Src2Shift) |
142 | #define Src2ImmByte (OpImmByte << Src2Shift) | 147 | #define Src2ImmByte (OpImmByte << Src2Shift) |
143 | #define Src2One (OpOne << Src2Shift) | 148 | #define Src2One (OpOne << Src2Shift) |
@@ -155,6 +160,9 @@ | |||
155 | #define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ | 160 | #define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ |
156 | #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ | 161 | #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ |
157 | #define NoWrite ((u64)1 << 45) /* No writeback */ | 162 | #define NoWrite ((u64)1 << 45) /* No writeback */ |
163 | #define SrcWrite ((u64)1 << 46) /* Write back src operand */ | ||
164 | |||
165 | #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) | ||
158 | 166 | ||
159 | #define X2(x...) x, x | 167 | #define X2(x...) x, x |
160 | #define X3(x...) X2(x), x | 168 | #define X3(x...) X2(x), x |
@@ -171,10 +179,11 @@ | |||
171 | /* | 179 | /* |
172 | * fastop functions have a special calling convention: | 180 | * fastop functions have a special calling convention: |
173 | * | 181 | * |
174 | * dst: [rdx]:rax (in/out) | 182 | * dst: rax (in/out) |
175 | * src: rbx (in/out) | 183 | * src: rdx (in/out) |
176 | * src2: rcx (in) | 184 | * src2: rcx (in) |
177 | * flags: rflags (in/out) | 185 | * flags: rflags (in/out) |
186 | * ex: rsi (in:fastop pointer, out:zero if exception) | ||
178 | * | 187 | * |
179 | * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for | 188 | * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for |
180 | * different operand sizes can be reached by calculation, rather than a jump | 189 | * different operand sizes can be reached by calculation, rather than a jump |
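Note on the updated calling convention above: each FASTOP2(op) macro further down emits four fixed-size stubs (byte/word/long/quad), and fastop() reaches the right one by adding a multiple of FASTOP_SIZE to the start of the block. A rough sketch of what FASTOP2(add) now produces under the rax/rdx convention (simplified; the label fastop_add_sketch is invented, and the real macros also emit global/type directives and use em_add):

asm(".align " __stringify(FASTOP_SIZE) "\n"
    "fastop_add_sketch:\n"
    "   addb %dl, %al\n"        /* byte:  dst in AL,  src in DL  */
    "   ret\n"
    ".align " __stringify(FASTOP_SIZE) "\n"
    "   addw %dx, %ax\n"        /* word:  dst in AX,  src in DX  */
    "   ret\n"
    ".align " __stringify(FASTOP_SIZE) "\n"
    "   addl %edx, %eax\n"      /* long:  dst in EAX, src in EDX */
    "   ret\n"
    ".align " __stringify(FASTOP_SIZE) "\n"
    "   addq %rdx, %rax\n"      /* quad:  dst in RAX, src in RDX (64-bit only) */
    "   ret\n");

Moving the src operand from rbx to rdx lines the fastop src slot up with the implicit rDX half that MUL/DIV read and write, which is what the DstXacc conversion later in this patch relies on.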
@@ -276,174 +285,17 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) | |||
276 | } | 285 | } |
277 | 286 | ||
278 | /* | 287 | /* |
279 | * Instruction emulation: | ||
280 | * Most instructions are emulated directly via a fragment of inline assembly | ||
281 | * code. This allows us to save/restore EFLAGS and thus very easily pick up | ||
282 | * any modified flags. | ||
283 | */ | ||
284 | |||
285 | #if defined(CONFIG_X86_64) | ||
286 | #define _LO32 "k" /* force 32-bit operand */ | ||
287 | #define _STK "%%rsp" /* stack pointer */ | ||
288 | #elif defined(__i386__) | ||
289 | #define _LO32 "" /* force 32-bit operand */ | ||
290 | #define _STK "%%esp" /* stack pointer */ | ||
291 | #endif | ||
292 | |||
293 | /* | ||
294 | * These EFLAGS bits are restored from saved value during emulation, and | 288 | * These EFLAGS bits are restored from saved value during emulation, and |
295 | * any changes are written back to the saved value after emulation. | 289 | * any changes are written back to the saved value after emulation. |
296 | */ | 290 | */ |
297 | #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) | 291 | #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) |
298 | 292 | ||
299 | /* Before executing instruction: restore necessary bits in EFLAGS. */ | ||
300 | #define _PRE_EFLAGS(_sav, _msk, _tmp) \ | ||
301 | /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ | ||
302 | "movl %"_sav",%"_LO32 _tmp"; " \ | ||
303 | "push %"_tmp"; " \ | ||
304 | "push %"_tmp"; " \ | ||
305 | "movl %"_msk",%"_LO32 _tmp"; " \ | ||
306 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
307 | "pushf; " \ | ||
308 | "notl %"_LO32 _tmp"; " \ | ||
309 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
310 | "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ | ||
311 | "pop %"_tmp"; " \ | ||
312 | "orl %"_LO32 _tmp",("_STK"); " \ | ||
313 | "popf; " \ | ||
314 | "pop %"_sav"; " | ||
315 | |||
316 | /* After executing instruction: write-back necessary bits in EFLAGS. */ | ||
317 | #define _POST_EFLAGS(_sav, _msk, _tmp) \ | ||
318 | /* _sav |= EFLAGS & _msk; */ \ | ||
319 | "pushf; " \ | ||
320 | "pop %"_tmp"; " \ | ||
321 | "andl %"_msk",%"_LO32 _tmp"; " \ | ||
322 | "orl %"_LO32 _tmp",%"_sav"; " | ||
323 | |||
324 | #ifdef CONFIG_X86_64 | 293 | #ifdef CONFIG_X86_64 |
325 | #define ON64(x) x | 294 | #define ON64(x) x |
326 | #else | 295 | #else |
327 | #define ON64(x) | 296 | #define ON64(x) |
328 | #endif | 297 | #endif |
329 | 298 | ||
330 | #define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \ | ||
331 | do { \ | ||
332 | __asm__ __volatile__ ( \ | ||
333 | _PRE_EFLAGS("0", "4", "2") \ | ||
334 | _op _suffix " %"_x"3,%1; " \ | ||
335 | _POST_EFLAGS("0", "4", "2") \ | ||
336 | : "=m" ((ctxt)->eflags), \ | ||
337 | "+q" (*(_dsttype*)&(ctxt)->dst.val), \ | ||
338 | "=&r" (_tmp) \ | ||
339 | : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \ | ||
340 | } while (0) | ||
341 | |||
342 | |||
343 | /* Raw emulation: instruction has two explicit operands. */ | ||
344 | #define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
345 | do { \ | ||
346 | unsigned long _tmp; \ | ||
347 | \ | ||
348 | switch ((ctxt)->dst.bytes) { \ | ||
349 | case 2: \ | ||
350 | ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \ | ||
351 | break; \ | ||
352 | case 4: \ | ||
353 | ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \ | ||
354 | break; \ | ||
355 | case 8: \ | ||
356 | ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \ | ||
357 | break; \ | ||
358 | } \ | ||
359 | } while (0) | ||
360 | |||
361 | #define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
362 | do { \ | ||
363 | unsigned long _tmp; \ | ||
364 | switch ((ctxt)->dst.bytes) { \ | ||
365 | case 1: \ | ||
366 | ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \ | ||
367 | break; \ | ||
368 | default: \ | ||
369 | __emulate_2op_nobyte(ctxt, _op, \ | ||
370 | _wx, _wy, _lx, _ly, _qx, _qy); \ | ||
371 | break; \ | ||
372 | } \ | ||
373 | } while (0) | ||
374 | |||
375 | /* Source operand is byte-sized and may be restricted to just %cl. */ | ||
376 | #define emulate_2op_SrcB(ctxt, _op) \ | ||
377 | __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c") | ||
378 | |||
379 | /* Source operand is byte, word, long or quad sized. */ | ||
380 | #define emulate_2op_SrcV(ctxt, _op) \ | ||
381 | __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r") | ||
382 | |||
383 | /* Source operand is word, long or quad sized. */ | ||
384 | #define emulate_2op_SrcV_nobyte(ctxt, _op) \ | ||
385 | __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") | ||
386 | |||
387 | /* Instruction has three operands and one operand is stored in ECX register */ | ||
388 | #define __emulate_2op_cl(ctxt, _op, _suffix, _type) \ | ||
389 | do { \ | ||
390 | unsigned long _tmp; \ | ||
391 | _type _clv = (ctxt)->src2.val; \ | ||
392 | _type _srcv = (ctxt)->src.val; \ | ||
393 | _type _dstv = (ctxt)->dst.val; \ | ||
394 | \ | ||
395 | __asm__ __volatile__ ( \ | ||
396 | _PRE_EFLAGS("0", "5", "2") \ | ||
397 | _op _suffix " %4,%1 \n" \ | ||
398 | _POST_EFLAGS("0", "5", "2") \ | ||
399 | : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \ | ||
400 | : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ | ||
401 | ); \ | ||
402 | \ | ||
403 | (ctxt)->src2.val = (unsigned long) _clv; \ | ||
404 | (ctxt)->src2.val = (unsigned long) _srcv; \ | ||
405 | (ctxt)->dst.val = (unsigned long) _dstv; \ | ||
406 | } while (0) | ||
407 | |||
408 | #define emulate_2op_cl(ctxt, _op) \ | ||
409 | do { \ | ||
410 | switch ((ctxt)->dst.bytes) { \ | ||
411 | case 2: \ | ||
412 | __emulate_2op_cl(ctxt, _op, "w", u16); \ | ||
413 | break; \ | ||
414 | case 4: \ | ||
415 | __emulate_2op_cl(ctxt, _op, "l", u32); \ | ||
416 | break; \ | ||
417 | case 8: \ | ||
418 | ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \ | ||
419 | break; \ | ||
420 | } \ | ||
421 | } while (0) | ||
422 | |||
423 | #define __emulate_1op(ctxt, _op, _suffix) \ | ||
424 | do { \ | ||
425 | unsigned long _tmp; \ | ||
426 | \ | ||
427 | __asm__ __volatile__ ( \ | ||
428 | _PRE_EFLAGS("0", "3", "2") \ | ||
429 | _op _suffix " %1; " \ | ||
430 | _POST_EFLAGS("0", "3", "2") \ | ||
431 | : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \ | ||
432 | "=&r" (_tmp) \ | ||
433 | : "i" (EFLAGS_MASK)); \ | ||
434 | } while (0) | ||
435 | |||
436 | /* Instruction has only one explicit operand (no source operand). */ | ||
437 | #define emulate_1op(ctxt, _op) \ | ||
438 | do { \ | ||
439 | switch ((ctxt)->dst.bytes) { \ | ||
440 | case 1: __emulate_1op(ctxt, _op, "b"); break; \ | ||
441 | case 2: __emulate_1op(ctxt, _op, "w"); break; \ | ||
442 | case 4: __emulate_1op(ctxt, _op, "l"); break; \ | ||
443 | case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \ | ||
444 | } \ | ||
445 | } while (0) | ||
446 | |||
447 | static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); | 299 | static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); |
448 | 300 | ||
449 | #define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t" | 301 | #define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t" |
@@ -462,7 +314,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); | |||
462 | #define FOPNOP() FOP_ALIGN FOP_RET | 314 | #define FOPNOP() FOP_ALIGN FOP_RET |
463 | 315 | ||
464 | #define FOP1E(op, dst) \ | 316 | #define FOP1E(op, dst) \ |
465 | FOP_ALIGN #op " %" #dst " \n\t" FOP_RET | 317 | FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET |
318 | |||
319 | #define FOP1EEX(op, dst) \ | ||
320 | FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception) | ||
466 | 321 | ||
467 | #define FASTOP1(op) \ | 322 | #define FASTOP1(op) \ |
468 | FOP_START(op) \ | 323 | FOP_START(op) \ |
@@ -472,24 +327,42 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); | |||
472 | ON64(FOP1E(op##q, rax)) \ | 327 | ON64(FOP1E(op##q, rax)) \ |
473 | FOP_END | 328 | FOP_END |
474 | 329 | ||
330 | /* 1-operand, using src2 (for MUL/DIV r/m) */ | ||
331 | #define FASTOP1SRC2(op, name) \ | ||
332 | FOP_START(name) \ | ||
333 | FOP1E(op, cl) \ | ||
334 | FOP1E(op, cx) \ | ||
335 | FOP1E(op, ecx) \ | ||
336 | ON64(FOP1E(op, rcx)) \ | ||
337 | FOP_END | ||
338 | |||
339 | /* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */ | ||
340 | #define FASTOP1SRC2EX(op, name) \ | ||
341 | FOP_START(name) \ | ||
342 | FOP1EEX(op, cl) \ | ||
343 | FOP1EEX(op, cx) \ | ||
344 | FOP1EEX(op, ecx) \ | ||
345 | ON64(FOP1EEX(op, rcx)) \ | ||
346 | FOP_END | ||
347 | |||
475 | #define FOP2E(op, dst, src) \ | 348 | #define FOP2E(op, dst, src) \ |
476 | FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET | 349 | FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET |
477 | 350 | ||
478 | #define FASTOP2(op) \ | 351 | #define FASTOP2(op) \ |
479 | FOP_START(op) \ | 352 | FOP_START(op) \ |
480 | FOP2E(op##b, al, bl) \ | 353 | FOP2E(op##b, al, dl) \ |
481 | FOP2E(op##w, ax, bx) \ | 354 | FOP2E(op##w, ax, dx) \ |
482 | FOP2E(op##l, eax, ebx) \ | 355 | FOP2E(op##l, eax, edx) \ |
483 | ON64(FOP2E(op##q, rax, rbx)) \ | 356 | ON64(FOP2E(op##q, rax, rdx)) \ |
484 | FOP_END | 357 | FOP_END |
485 | 358 | ||
486 | /* 2 operand, word only */ | 359 | /* 2 operand, word only */ |
487 | #define FASTOP2W(op) \ | 360 | #define FASTOP2W(op) \ |
488 | FOP_START(op) \ | 361 | FOP_START(op) \ |
489 | FOPNOP() \ | 362 | FOPNOP() \ |
490 | FOP2E(op##w, ax, bx) \ | 363 | FOP2E(op##w, ax, dx) \ |
491 | FOP2E(op##l, eax, ebx) \ | 364 | FOP2E(op##l, eax, edx) \ |
492 | ON64(FOP2E(op##q, rax, rbx)) \ | 365 | ON64(FOP2E(op##q, rax, rdx)) \ |
493 | FOP_END | 366 | FOP_END |
494 | 367 | ||
495 | /* 2 operand, src is CL */ | 368 | /* 2 operand, src is CL */ |
@@ -508,14 +381,17 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); | |||
508 | #define FASTOP3WCL(op) \ | 381 | #define FASTOP3WCL(op) \ |
509 | FOP_START(op) \ | 382 | FOP_START(op) \ |
510 | FOPNOP() \ | 383 | FOPNOP() \ |
511 | FOP3E(op##w, ax, bx, cl) \ | 384 | FOP3E(op##w, ax, dx, cl) \ |
512 | FOP3E(op##l, eax, ebx, cl) \ | 385 | FOP3E(op##l, eax, edx, cl) \ |
513 | ON64(FOP3E(op##q, rax, rbx, cl)) \ | 386 | ON64(FOP3E(op##q, rax, rdx, cl)) \ |
514 | FOP_END | 387 | FOP_END |
515 | 388 | ||
516 | /* Special case for SETcc - 1 instruction per cc */ | 389 | /* Special case for SETcc - 1 instruction per cc */ |
517 | #define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t" | 390 | #define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t" |
518 | 391 | ||
392 | asm(".global kvm_fastop_exception \n" | ||
393 | "kvm_fastop_exception: xor %esi, %esi; ret"); | ||
394 | |||
519 | FOP_START(setcc) | 395 | FOP_START(setcc) |
520 | FOP_SETCC(seto) | 396 | FOP_SETCC(seto) |
521 | FOP_SETCC(setno) | 397 | FOP_SETCC(setno) |
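Note on kvm_fastop_exception above: it is the fixup target for the _ASM_EXTABLE entries that FOP1EEX attaches to the div/idiv stubs. A stub that raises #DE is redirected there, %esi (the register that carried the fastop pointer in) is cleared, and fastop() later maps a NULL pointer to emulate_de(). The divisor itself travels in the rCX family, which is why FASTOP1SRC2 builds its stubs on cl/cx/ecx/rcx and why group3 below gains Src2Mem. A standalone model of the 32-bit case, just to spell out the semantics being emulated (illustrative names only):

#include <stdint.h>
#include <stdio.h>

/* DIV r/m32: EDX:EAX / divisor -> quotient in EAX, remainder in EDX;
 * divide-by-zero or quotient overflow raises #DE instead of writing back. */
static int div32_model(uint32_t *eax, uint32_t *edx, uint32_t divisor)
{
        uint64_t dividend = ((uint64_t)*edx << 32) | *eax;

        if (divisor == 0 || dividend / divisor > UINT32_MAX)
                return -1;                      /* hardware raises #DE here */
        *eax = (uint32_t)(dividend / divisor);
        *edx = (uint32_t)(dividend % divisor);
        return 0;
}

int main(void)
{
        uint32_t eax = 7, edx = 0;

        if (div32_model(&eax, &edx, 2) == 0)
                printf("quotient %u, remainder %u\n", eax, edx);
        else
                printf("#DE would be injected into the guest\n");
        return 0;
}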
@@ -538,47 +414,6 @@ FOP_END; | |||
538 | FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET | 414 | FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET |
539 | FOP_END; | 415 | FOP_END; |
540 | 416 | ||
541 | #define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ | ||
542 | do { \ | ||
543 | unsigned long _tmp; \ | ||
544 | ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \ | ||
545 | ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \ | ||
546 | \ | ||
547 | __asm__ __volatile__ ( \ | ||
548 | _PRE_EFLAGS("0", "5", "1") \ | ||
549 | "1: \n\t" \ | ||
550 | _op _suffix " %6; " \ | ||
551 | "2: \n\t" \ | ||
552 | _POST_EFLAGS("0", "5", "1") \ | ||
553 | ".pushsection .fixup,\"ax\" \n\t" \ | ||
554 | "3: movb $1, %4 \n\t" \ | ||
555 | "jmp 2b \n\t" \ | ||
556 | ".popsection \n\t" \ | ||
557 | _ASM_EXTABLE(1b, 3b) \ | ||
558 | : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ | ||
559 | "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ | ||
560 | : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val)); \ | ||
561 | } while (0) | ||
562 | |||
563 | /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ | ||
564 | #define emulate_1op_rax_rdx(ctxt, _op, _ex) \ | ||
565 | do { \ | ||
566 | switch((ctxt)->src.bytes) { \ | ||
567 | case 1: \ | ||
568 | __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \ | ||
569 | break; \ | ||
570 | case 2: \ | ||
571 | __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \ | ||
572 | break; \ | ||
573 | case 4: \ | ||
574 | __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \ | ||
575 | break; \ | ||
576 | case 8: ON64( \ | ||
577 | __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \ | ||
578 | break; \ | ||
579 | } \ | ||
580 | } while (0) | ||
581 | |||
582 | static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, | 417 | static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, |
583 | enum x86_intercept intercept, | 418 | enum x86_intercept intercept, |
584 | enum x86_intercept_stage stage) | 419 | enum x86_intercept_stage stage) |
@@ -988,6 +823,11 @@ FASTOP2(xor); | |||
988 | FASTOP2(cmp); | 823 | FASTOP2(cmp); |
989 | FASTOP2(test); | 824 | FASTOP2(test); |
990 | 825 | ||
826 | FASTOP1SRC2(mul, mul_ex); | ||
827 | FASTOP1SRC2(imul, imul_ex); | ||
828 | FASTOP1SRC2EX(div, div_ex); | ||
829 | FASTOP1SRC2EX(idiv, idiv_ex); | ||
830 | |||
991 | FASTOP3WCL(shld); | 831 | FASTOP3WCL(shld); |
992 | FASTOP3WCL(shrd); | 832 | FASTOP3WCL(shrd); |
993 | 833 | ||
@@ -1013,6 +853,8 @@ FASTOP2W(bts); | |||
1013 | FASTOP2W(btr); | 853 | FASTOP2W(btr); |
1014 | FASTOP2W(btc); | 854 | FASTOP2W(btc); |
1015 | 855 | ||
856 | FASTOP2(xadd); | ||
857 | |||
1016 | static u8 test_cc(unsigned int condition, unsigned long flags) | 858 | static u8 test_cc(unsigned int condition, unsigned long flags) |
1017 | { | 859 | { |
1018 | u8 rc; | 860 | u8 rc; |
@@ -1726,45 +1568,42 @@ static void write_register_operand(struct operand *op) | |||
1726 | } | 1568 | } |
1727 | } | 1569 | } |
1728 | 1570 | ||
1729 | static int writeback(struct x86_emulate_ctxt *ctxt) | 1571 | static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) |
1730 | { | 1572 | { |
1731 | int rc; | 1573 | int rc; |
1732 | 1574 | ||
1733 | if (ctxt->d & NoWrite) | 1575 | switch (op->type) { |
1734 | return X86EMUL_CONTINUE; | ||
1735 | |||
1736 | switch (ctxt->dst.type) { | ||
1737 | case OP_REG: | 1576 | case OP_REG: |
1738 | write_register_operand(&ctxt->dst); | 1577 | write_register_operand(op); |
1739 | break; | 1578 | break; |
1740 | case OP_MEM: | 1579 | case OP_MEM: |
1741 | if (ctxt->lock_prefix) | 1580 | if (ctxt->lock_prefix) |
1742 | rc = segmented_cmpxchg(ctxt, | 1581 | rc = segmented_cmpxchg(ctxt, |
1743 | ctxt->dst.addr.mem, | 1582 | op->addr.mem, |
1744 | &ctxt->dst.orig_val, | 1583 | &op->orig_val, |
1745 | &ctxt->dst.val, | 1584 | &op->val, |
1746 | ctxt->dst.bytes); | 1585 | op->bytes); |
1747 | else | 1586 | else |
1748 | rc = segmented_write(ctxt, | 1587 | rc = segmented_write(ctxt, |
1749 | ctxt->dst.addr.mem, | 1588 | op->addr.mem, |
1750 | &ctxt->dst.val, | 1589 | &op->val, |
1751 | ctxt->dst.bytes); | 1590 | op->bytes); |
1752 | if (rc != X86EMUL_CONTINUE) | 1591 | if (rc != X86EMUL_CONTINUE) |
1753 | return rc; | 1592 | return rc; |
1754 | break; | 1593 | break; |
1755 | case OP_MEM_STR: | 1594 | case OP_MEM_STR: |
1756 | rc = segmented_write(ctxt, | 1595 | rc = segmented_write(ctxt, |
1757 | ctxt->dst.addr.mem, | 1596 | op->addr.mem, |
1758 | ctxt->dst.data, | 1597 | op->data, |
1759 | ctxt->dst.bytes * ctxt->dst.count); | 1598 | op->bytes * op->count); |
1760 | if (rc != X86EMUL_CONTINUE) | 1599 | if (rc != X86EMUL_CONTINUE) |
1761 | return rc; | 1600 | return rc; |
1762 | break; | 1601 | break; |
1763 | case OP_XMM: | 1602 | case OP_XMM: |
1764 | write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); | 1603 | write_sse_reg(ctxt, &op->vec_val, op->addr.xmm); |
1765 | break; | 1604 | break; |
1766 | case OP_MM: | 1605 | case OP_MM: |
1767 | write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm); | 1606 | write_mmx_reg(ctxt, &op->mm_val, op->addr.mm); |
1768 | break; | 1607 | break; |
1769 | case OP_NONE: | 1608 | case OP_NONE: |
1770 | /* no writeback */ | 1609 | /* no writeback */ |
@@ -2117,42 +1956,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) | |||
2117 | return X86EMUL_CONTINUE; | 1956 | return X86EMUL_CONTINUE; |
2118 | } | 1957 | } |
2119 | 1958 | ||
2120 | static int em_mul_ex(struct x86_emulate_ctxt *ctxt) | ||
2121 | { | ||
2122 | u8 ex = 0; | ||
2123 | |||
2124 | emulate_1op_rax_rdx(ctxt, "mul", ex); | ||
2125 | return X86EMUL_CONTINUE; | ||
2126 | } | ||
2127 | |||
2128 | static int em_imul_ex(struct x86_emulate_ctxt *ctxt) | ||
2129 | { | ||
2130 | u8 ex = 0; | ||
2131 | |||
2132 | emulate_1op_rax_rdx(ctxt, "imul", ex); | ||
2133 | return X86EMUL_CONTINUE; | ||
2134 | } | ||
2135 | |||
2136 | static int em_div_ex(struct x86_emulate_ctxt *ctxt) | ||
2137 | { | ||
2138 | u8 de = 0; | ||
2139 | |||
2140 | emulate_1op_rax_rdx(ctxt, "div", de); | ||
2141 | if (de) | ||
2142 | return emulate_de(ctxt); | ||
2143 | return X86EMUL_CONTINUE; | ||
2144 | } | ||
2145 | |||
2146 | static int em_idiv_ex(struct x86_emulate_ctxt *ctxt) | ||
2147 | { | ||
2148 | u8 de = 0; | ||
2149 | |||
2150 | emulate_1op_rax_rdx(ctxt, "idiv", de); | ||
2151 | if (de) | ||
2152 | return emulate_de(ctxt); | ||
2153 | return X86EMUL_CONTINUE; | ||
2154 | } | ||
2155 | |||
2156 | static int em_grp45(struct x86_emulate_ctxt *ctxt) | 1959 | static int em_grp45(struct x86_emulate_ctxt *ctxt) |
2157 | { | 1960 | { |
2158 | int rc = X86EMUL_CONTINUE; | 1961 | int rc = X86EMUL_CONTINUE; |
@@ -3734,10 +3537,10 @@ static const struct opcode group3[] = { | |||
3734 | F(DstMem | SrcImm | NoWrite, em_test), | 3537 | F(DstMem | SrcImm | NoWrite, em_test), |
3735 | F(DstMem | SrcNone | Lock, em_not), | 3538 | F(DstMem | SrcNone | Lock, em_not), |
3736 | F(DstMem | SrcNone | Lock, em_neg), | 3539 | F(DstMem | SrcNone | Lock, em_neg), |
3737 | I(SrcMem, em_mul_ex), | 3540 | F(DstXacc | Src2Mem, em_mul_ex), |
3738 | I(SrcMem, em_imul_ex), | 3541 | F(DstXacc | Src2Mem, em_imul_ex), |
3739 | I(SrcMem, em_div_ex), | 3542 | F(DstXacc | Src2Mem, em_div_ex), |
3740 | I(SrcMem, em_idiv_ex), | 3543 | F(DstXacc | Src2Mem, em_idiv_ex), |
3741 | }; | 3544 | }; |
3742 | 3545 | ||
3743 | static const struct opcode group4[] = { | 3546 | static const struct opcode group4[] = { |
@@ -4064,7 +3867,7 @@ static const struct opcode twobyte_table[256] = { | |||
4064 | F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), | 3867 | F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), |
4065 | D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), | 3868 | D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), |
4066 | /* 0xC0 - 0xC7 */ | 3869 | /* 0xC0 - 0xC7 */ |
4067 | D2bv(DstMem | SrcReg | ModRM | Lock), | 3870 | F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd), |
4068 | N, D(DstMem | SrcReg | ModRM | Mov), | 3871 | N, D(DstMem | SrcReg | ModRM | Mov), |
4069 | N, N, N, GD(0, &group9), | 3872 | N, N, N, GD(0, &group9), |
4070 | /* 0xC8 - 0xCF */ | 3873 | /* 0xC8 - 0xCF */ |
@@ -4172,6 +3975,24 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, | |||
4172 | fetch_register_operand(op); | 3975 | fetch_register_operand(op); |
4173 | op->orig_val = op->val; | 3976 | op->orig_val = op->val; |
4174 | break; | 3977 | break; |
3978 | case OpAccLo: | ||
3979 | op->type = OP_REG; | ||
3980 | op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes; | ||
3981 | op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); | ||
3982 | fetch_register_operand(op); | ||
3983 | op->orig_val = op->val; | ||
3984 | break; | ||
3985 | case OpAccHi: | ||
3986 | if (ctxt->d & ByteOp) { | ||
3987 | op->type = OP_NONE; | ||
3988 | break; | ||
3989 | } | ||
3990 | op->type = OP_REG; | ||
3991 | op->bytes = ctxt->op_bytes; | ||
3992 | op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); | ||
3993 | fetch_register_operand(op); | ||
3994 | op->orig_val = op->val; | ||
3995 | break; | ||
4175 | case OpDI: | 3996 | case OpDI: |
4176 | op->type = OP_MEM; | 3997 | op->type = OP_MEM; |
4177 | op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; | 3998 | op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
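Note on the OpAccLo/OpAccHi decode above: these are the implicit accumulator halves used by MUL/IMUL/DIV/IDIV. The byte case is the odd one out: MUL r/m8 writes its 16-bit product to AX, so OpAccLo is decoded 2 bytes wide and OpAccHi becomes OP_NONE. For the wider forms the product (or dividend) is split across rDX:rAX, which is why the high half is a source operand flagged SrcWrite, i.e. it must be written back as well. A standalone model of the 32-bit multiply, for reference (names invented):

#include <stdint.h>
#include <stdio.h>

static void mul32_model(uint32_t *eax, uint32_t *edx, uint32_t src)
{
        uint64_t product = (uint64_t)*eax * src;

        *eax = (uint32_t)product;               /* OpAccLo (DstAccLo)            */
        *edx = (uint32_t)(product >> 32);       /* OpAccHi (SrcAccHi | SrcWrite) */
}

int main(void)
{
        uint32_t eax = 0x89abcdef, edx = 0;

        mul32_model(&eax, &edx, 0x12345678);
        printf("edx:eax = %08x:%08x\n", edx, eax);
        return 0;
}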
@@ -4553,11 +4374,15 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, | |||
4553 | static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) | 4374 | static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) |
4554 | { | 4375 | { |
4555 | ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; | 4376 | ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; |
4556 | fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; | 4377 | if (!(ctxt->d & ByteOp)) |
4378 | fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; | ||
4557 | asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" | 4379 | asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" |
4558 | : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags) | 4380 | : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), |
4559 | : "c"(ctxt->src2.val), [fastop]"S"(fop)); | 4381 | [fastop]"+S"(fop) |
4382 | : "c"(ctxt->src2.val)); | ||
4560 | ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); | 4383 | ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); |
4384 | if (!fop) /* exception is returned in fop variable */ | ||
4385 | return emulate_de(ctxt); | ||
4561 | return X86EMUL_CONTINUE; | 4386 | return X86EMUL_CONTINUE; |
4562 | } | 4387 | } |
4563 | 4388 | ||
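Note on the fastop() changes above: the stub index is still __ffs(dst.bytes), but the new ByteOp test matters because, with DstXacc, a byte-sized MUL/DIV decodes its low accumulator as the 2-byte AX operand (see the OpAccLo case earlier in this diff), so dst.bytes alone would pick the word stub. The fastop pointer is now an in/out operand so that kvm_fastop_exception can zero it to report #DE. A tiny standalone check of the slot arithmetic (assumes the FASTOP_SIZE-aligned layout used in this file):

#include <stdio.h>
#include <strings.h>    /* ffs() */

static int stub_slot(int byte_op, int bytes)
{
        return byte_op ? 0 : ffs(bytes) - 1;    /* __ffs(x) == ffs(x) - 1 */
}

int main(void)
{
        printf("byte:%d word:%d long:%d quad:%d\n",
               stub_slot(1, 1), stub_slot(0, 2),
               stub_slot(0, 4), stub_slot(0, 8));
        /* prints: byte:0 word:1 long:2 quad:3 */
        return 0;
}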
@@ -4773,9 +4598,17 @@ special_insn: | |||
4773 | goto done; | 4598 | goto done; |
4774 | 4599 | ||
4775 | writeback: | 4600 | writeback: |
4776 | rc = writeback(ctxt); | 4601 | if (!(ctxt->d & NoWrite)) { |
4777 | if (rc != X86EMUL_CONTINUE) | 4602 | rc = writeback(ctxt, &ctxt->dst); |
4778 | goto done; | 4603 | if (rc != X86EMUL_CONTINUE) |
4604 | goto done; | ||
4605 | } | ||
4606 | if (ctxt->d & SrcWrite) { | ||
4607 | BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR); | ||
4608 | rc = writeback(ctxt, &ctxt->src); | ||
4609 | if (rc != X86EMUL_CONTINUE) | ||
4610 | goto done; | ||
4611 | } | ||
4779 | 4612 | ||
4780 | /* | 4613 | /* |
4781 | * restore dst type in case the decoding will be reused | 4614 | * restore dst type in case the decoding will be reused |
@@ -4872,12 +4705,6 @@ twobyte_insn: | |||
4872 | ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : | 4705 | ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : |
4873 | (s16) ctxt->src.val; | 4706 | (s16) ctxt->src.val; |
4874 | break; | 4707 | break; |
4875 | case 0xc0 ... 0xc1: /* xadd */ | ||
4876 | fastop(ctxt, em_add); | ||
4877 | /* Write back the register source. */ | ||
4878 | ctxt->src.val = ctxt->dst.orig_val; | ||
4879 | write_register_operand(&ctxt->src); | ||
4880 | break; | ||
4881 | case 0xc3: /* movnti */ | 4708 | case 0xc3: /* movnti */ |
4882 | ctxt->dst.bytes = ctxt->op_bytes; | 4709 | ctxt->dst.bytes = ctxt->op_bytes; |
4883 | ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : | 4710 | ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : |
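End of the emulate.c changes. The hand-rolled 0xC0/0xC1 XADD case deleted above is replaced by the em_xadd fastop plus the generic second writeback added earlier: the real xadd leaves the old destination value in the source register, and SrcWrite makes the common writeback path store that register back, instead of the open-coded write_register_operand() call removed here. A minimal model of the semantics (the source is never a memory operand, as the new BUG_ON in the writeback path enforces):

/* XADD dst, src: dst += src, and src receives the old dst (sketch). */
static void xadd_model(unsigned long *dst, unsigned long *src)
{
        unsigned long old_dst = *dst;

        *dst = old_dst + *src;  /* flags would come from the real instruction */
        *src = old_dst;         /* written back only because of SrcWrite */
}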
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0eee2c8b64d1..afc11245827c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1608,8 +1608,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) | |||
1608 | return; | 1608 | return; |
1609 | 1609 | ||
1610 | if (atomic_read(&apic->lapic_timer.pending) > 0) { | 1610 | if (atomic_read(&apic->lapic_timer.pending) > 0) { |
1611 | if (kvm_apic_local_deliver(apic, APIC_LVTT)) | 1611 | kvm_apic_local_deliver(apic, APIC_LVTT); |
1612 | atomic_dec(&apic->lapic_timer.pending); | 1612 | atomic_set(&apic->lapic_timer.pending, 0); |
1613 | } | 1613 | } |
1614 | } | 1614 | } |
1615 | 1615 | ||
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 004cc87b781c..0d094da49541 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,15 +197,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) | |||
197 | } | 197 | } |
198 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); | 198 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); |
199 | 199 | ||
200 | static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) | 200 | /* |
201 | * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number, | ||
202 | * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation | ||
203 | * number. | ||
204 | */ | ||
205 | #define MMIO_SPTE_GEN_LOW_SHIFT 3 | ||
206 | #define MMIO_SPTE_GEN_HIGH_SHIFT 52 | ||
207 | |||
208 | #define MMIO_GEN_SHIFT 19 | ||
209 | #define MMIO_GEN_LOW_SHIFT 9 | ||
210 | #define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1) | ||
211 | #define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) | ||
212 | #define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1) | ||
213 | |||
214 | static u64 generation_mmio_spte_mask(unsigned int gen) | ||
201 | { | 215 | { |
202 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | 216 | u64 mask; |
217 | |||
218 | WARN_ON(gen > MMIO_MAX_GEN); | ||
219 | |||
220 | mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; | ||
221 | mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; | ||
222 | return mask; | ||
223 | } | ||
224 | |||
225 | static unsigned int get_mmio_spte_generation(u64 spte) | ||
226 | { | ||
227 | unsigned int gen; | ||
228 | |||
229 | spte &= ~shadow_mmio_mask; | ||
230 | |||
231 | gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; | ||
232 | gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; | ||
233 | return gen; | ||
234 | } | ||
235 | |||
236 | static unsigned int kvm_current_mmio_generation(struct kvm *kvm) | ||
237 | { | ||
238 | /* | ||
239 | * Init kvm generation close to MMIO_MAX_GEN to easily test the | ||
240 | * code of handling generation number wrap-around. | ||
241 | */ | ||
242 | return (kvm_memslots(kvm)->generation + | ||
243 | MMIO_MAX_GEN - 150) & MMIO_GEN_MASK; | ||
244 | } | ||
245 | |||
246 | static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, | ||
247 | unsigned access) | ||
248 | { | ||
249 | unsigned int gen = kvm_current_mmio_generation(kvm); | ||
250 | u64 mask = generation_mmio_spte_mask(gen); | ||
203 | 251 | ||
204 | access &= ACC_WRITE_MASK | ACC_USER_MASK; | 252 | access &= ACC_WRITE_MASK | ACC_USER_MASK; |
253 | mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; | ||
205 | 254 | ||
206 | sp->mmio_cached = true; | 255 | trace_mark_mmio_spte(sptep, gfn, access, gen); |
207 | trace_mark_mmio_spte(sptep, gfn, access); | 256 | mmu_spte_set(sptep, mask); |
208 | mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); | ||
209 | } | 257 | } |
210 | 258 | ||
211 | static bool is_mmio_spte(u64 spte) | 259 | static bool is_mmio_spte(u64 spte) |
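Note on the constants above: a 19-bit generation number is packed into otherwise-unused spte bits, the low 9 bits into bits 3-11 and the high 10 bits into bits 52-61. A standalone round-trip check of that layout (user-space sketch; it deliberately leaves out shadow_mmio_mask and the gfn/access bits that mark_mmio_spte() also ORs in):

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define MMIO_SPTE_GEN_LOW_SHIFT    3
#define MMIO_SPTE_GEN_HIGH_SHIFT  52
#define MMIO_GEN_SHIFT            19
#define MMIO_GEN_LOW_SHIFT         9
#define MMIO_GEN_LOW_MASK         ((1u << MMIO_GEN_LOW_SHIFT) - 1)
#define MMIO_MAX_GEN              ((1u << MMIO_GEN_SHIFT) - 1)

static uint64_t gen_to_spte_bits(unsigned int gen)
{
        uint64_t mask;

        mask  = (uint64_t)(gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
        mask |= ((uint64_t)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
        return mask;
}

static unsigned int spte_bits_to_gen(uint64_t spte)
{
        unsigned int gen;

        gen  = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
        gen |= (unsigned int)(spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
        return gen;
}

int main(void)
{
        unsigned int gen;

        for (gen = 0; gen <= MMIO_MAX_GEN; gen++)
                assert(spte_bits_to_gen(gen_to_spte_bits(gen)) == gen);
        printf("all %u generations round-trip\n", MMIO_MAX_GEN + 1);
        return 0;
}

With this in place, a memslot update only has to bump the memslot generation: every cached MMIO spte then carries a stale generation, fails check_mmio_spte() on its next fault, and is rebuilt through the normal page-fault path, so no spte walk is needed to invalidate them.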
@@ -215,24 +263,38 @@ static bool is_mmio_spte(u64 spte) | |||
215 | 263 | ||
216 | static gfn_t get_mmio_spte_gfn(u64 spte) | 264 | static gfn_t get_mmio_spte_gfn(u64 spte) |
217 | { | 265 | { |
218 | return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; | 266 | u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; |
267 | return (spte & ~mask) >> PAGE_SHIFT; | ||
219 | } | 268 | } |
220 | 269 | ||
221 | static unsigned get_mmio_spte_access(u64 spte) | 270 | static unsigned get_mmio_spte_access(u64 spte) |
222 | { | 271 | { |
223 | return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; | 272 | u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; |
273 | return (spte & ~mask) & ~PAGE_MASK; | ||
224 | } | 274 | } |
225 | 275 | ||
226 | static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) | 276 | static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, |
277 | pfn_t pfn, unsigned access) | ||
227 | { | 278 | { |
228 | if (unlikely(is_noslot_pfn(pfn))) { | 279 | if (unlikely(is_noslot_pfn(pfn))) { |
229 | mark_mmio_spte(sptep, gfn, access); | 280 | mark_mmio_spte(kvm, sptep, gfn, access); |
230 | return true; | 281 | return true; |
231 | } | 282 | } |
232 | 283 | ||
233 | return false; | 284 | return false; |
234 | } | 285 | } |
235 | 286 | ||
287 | static bool check_mmio_spte(struct kvm *kvm, u64 spte) | ||
288 | { | ||
289 | unsigned int kvm_gen, spte_gen; | ||
290 | |||
291 | kvm_gen = kvm_current_mmio_generation(kvm); | ||
292 | spte_gen = get_mmio_spte_generation(spte); | ||
293 | |||
294 | trace_check_mmio_spte(spte, kvm_gen, spte_gen); | ||
295 | return likely(kvm_gen == spte_gen); | ||
296 | } | ||
297 | |||
236 | static inline u64 rsvd_bits(int s, int e) | 298 | static inline u64 rsvd_bits(int s, int e) |
237 | { | 299 | { |
238 | return ((1ULL << (e - s + 1)) - 1) << s; | 300 | return ((1ULL << (e - s + 1)) - 1) << s; |
@@ -404,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) | |||
404 | /* | 466 | /* |
405 | * The idea using the light way get the spte on x86_32 guest is from | 467 | * The idea using the light way get the spte on x86_32 guest is from |
406 | * gup_get_pte(arch/x86/mm/gup.c). | 468 | * gup_get_pte(arch/x86/mm/gup.c). |
407 | * The difference is we can not catch the spte tlb flush if we leave | 469 | * |
408 | * guest mode, so we emulate it by increase clear_spte_count when spte | 470 | * An spte tlb flush may be pending, because kvm_set_pte_rmapp |
409 | * is cleared. | 471 | * coalesces them and we are running out of the MMU lock. Therefore |
472 | * we need to protect against in-progress updates of the spte. | ||
473 | * | ||
474 | * Reading the spte while an update is in progress may get the old value | ||
475 | * for the high part of the spte. The race is fine for a present->non-present | ||
476 | * change (because the high part of the spte is ignored for non-present spte), | ||
477 | * but for a present->present change we must reread the spte. | ||
478 | * | ||
479 | * All such changes are done in two steps (present->non-present and | ||
480 | * non-present->present), hence it is enough to count the number of | ||
481 | * present->non-present updates: if it changed while reading the spte, | ||
482 | * we might have hit the race. This is done using clear_spte_count. | ||
410 | */ | 483 | */ |
411 | static u64 __get_spte_lockless(u64 *sptep) | 484 | static u64 __get_spte_lockless(u64 *sptep) |
412 | { | 485 | { |
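Note on the rewritten comment above: the reader it refers to, __get_spte_lockless(), is unchanged by this patch and therefore not shown here. Its shape, reduced to a standalone model (illustrative only; the kernel reads a split low/high representation and pairs smp_rmb()/smp_wmb() rather than using the compiler builtin below):

#include <stdint.h>

struct spte32_model {
        volatile uint32_t lo, hi;
        volatile int clear_count;       /* bumped on every present->non-present clear */
};

static uint64_t read_spte_lockless(struct spte32_model *s)
{
        uint32_t lo, hi;
        int count;

        do {
                count = s->clear_count;
                __sync_synchronize();
                lo = s->lo;
                hi = s->hi;
                __sync_synchronize();
                /* retry if the low half moved or a clear happened meanwhile */
        } while (lo != s->lo || count != s->clear_count);

        return ((uint64_t)hi << 32) | lo;
}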
@@ -1511,6 +1584,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | |||
1511 | if (!direct) | 1584 | if (!direct) |
1512 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); | 1585 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); |
1513 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | 1586 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); |
1587 | |||
1588 | /* | ||
1589 | * The active_mmu_pages list is the FIFO list, do not move the | ||
1590 | * page until it is zapped. kvm_zap_obsolete_pages depends on | ||
1591 | * this feature. See the comments in kvm_zap_obsolete_pages(). | ||
1592 | */ | ||
1514 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | 1593 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); |
1515 | sp->parent_ptes = 0; | 1594 | sp->parent_ptes = 0; |
1516 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | 1595 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); |
@@ -1648,6 +1727,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | |||
1648 | static void kvm_mmu_commit_zap_page(struct kvm *kvm, | 1727 | static void kvm_mmu_commit_zap_page(struct kvm *kvm, |
1649 | struct list_head *invalid_list); | 1728 | struct list_head *invalid_list); |
1650 | 1729 | ||
1730 | /* | ||
1731 | * NOTE: we should pay more attention on the zapped-obsolete page | ||
1732 | * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk | ||
1733 | * since it has been deleted from active_mmu_pages but still can be found | ||
1734 | * at hast list. | ||
1735 | * | ||
1736 | * for_each_gfn_indirect_valid_sp has skipped that kind of page and | ||
1737 | * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped | ||
1738 | * all the obsolete pages. | ||
1739 | */ | ||
1651 | #define for_each_gfn_sp(_kvm, _sp, _gfn) \ | 1740 | #define for_each_gfn_sp(_kvm, _sp, _gfn) \ |
1652 | hlist_for_each_entry(_sp, \ | 1741 | hlist_for_each_entry(_sp, \ |
1653 | &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ | 1742 | &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ |
@@ -1838,6 +1927,11 @@ static void clear_sp_write_flooding_count(u64 *spte) | |||
1838 | __clear_sp_write_flooding_count(sp); | 1927 | __clear_sp_write_flooding_count(sp); |
1839 | } | 1928 | } |
1840 | 1929 | ||
1930 | static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1931 | { | ||
1932 | return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); | ||
1933 | } | ||
1934 | |||
1841 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | 1935 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, |
1842 | gfn_t gfn, | 1936 | gfn_t gfn, |
1843 | gva_t gaddr, | 1937 | gva_t gaddr, |
@@ -1864,6 +1958,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1864 | role.quadrant = quadrant; | 1958 | role.quadrant = quadrant; |
1865 | } | 1959 | } |
1866 | for_each_gfn_sp(vcpu->kvm, sp, gfn) { | 1960 | for_each_gfn_sp(vcpu->kvm, sp, gfn) { |
1961 | if (is_obsolete_sp(vcpu->kvm, sp)) | ||
1962 | continue; | ||
1963 | |||
1867 | if (!need_sync && sp->unsync) | 1964 | if (!need_sync && sp->unsync) |
1868 | need_sync = true; | 1965 | need_sync = true; |
1869 | 1966 | ||
@@ -1900,6 +1997,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1900 | 1997 | ||
1901 | account_shadowed(vcpu->kvm, gfn); | 1998 | account_shadowed(vcpu->kvm, gfn); |
1902 | } | 1999 | } |
2000 | sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; | ||
1903 | init_shadow_page_table(sp); | 2001 | init_shadow_page_table(sp); |
1904 | trace_kvm_mmu_get_page(sp, true); | 2002 | trace_kvm_mmu_get_page(sp, true); |
1905 | return sp; | 2003 | return sp; |
@@ -2070,8 +2168,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | |||
2070 | ret = mmu_zap_unsync_children(kvm, sp, invalid_list); | 2168 | ret = mmu_zap_unsync_children(kvm, sp, invalid_list); |
2071 | kvm_mmu_page_unlink_children(kvm, sp); | 2169 | kvm_mmu_page_unlink_children(kvm, sp); |
2072 | kvm_mmu_unlink_parents(kvm, sp); | 2170 | kvm_mmu_unlink_parents(kvm, sp); |
2171 | |||
2073 | if (!sp->role.invalid && !sp->role.direct) | 2172 | if (!sp->role.invalid && !sp->role.direct) |
2074 | unaccount_shadowed(kvm, sp->gfn); | 2173 | unaccount_shadowed(kvm, sp->gfn); |
2174 | |||
2075 | if (sp->unsync) | 2175 | if (sp->unsync) |
2076 | kvm_unlink_unsync_page(kvm, sp); | 2176 | kvm_unlink_unsync_page(kvm, sp); |
2077 | if (!sp->root_count) { | 2177 | if (!sp->root_count) { |
@@ -2081,7 +2181,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | |||
2081 | kvm_mod_used_mmu_pages(kvm, -1); | 2181 | kvm_mod_used_mmu_pages(kvm, -1); |
2082 | } else { | 2182 | } else { |
2083 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | 2183 | list_move(&sp->link, &kvm->arch.active_mmu_pages); |
2084 | kvm_reload_remote_mmus(kvm); | 2184 | |
2185 | /* | ||
2186 | * The obsolete pages can not be used on any vcpus. | ||
2187 | * See the comments in kvm_mmu_invalidate_zap_all_pages(). | ||
2188 | */ | ||
2189 | if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) | ||
2190 | kvm_reload_remote_mmus(kvm); | ||
2085 | } | 2191 | } |
2086 | 2192 | ||
2087 | sp->role.invalid = 1; | 2193 | sp->role.invalid = 1; |
@@ -2331,7 +2437,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2331 | u64 spte; | 2437 | u64 spte; |
2332 | int ret = 0; | 2438 | int ret = 0; |
2333 | 2439 | ||
2334 | if (set_mmio_spte(sptep, gfn, pfn, pte_access)) | 2440 | if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access)) |
2335 | return 0; | 2441 | return 0; |
2336 | 2442 | ||
2337 | spte = PT_PRESENT_MASK; | 2443 | spte = PT_PRESENT_MASK; |
@@ -2869,22 +2975,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
2869 | 2975 | ||
2870 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | 2976 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) |
2871 | return; | 2977 | return; |
2872 | spin_lock(&vcpu->kvm->mmu_lock); | 2978 | |
2873 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && | 2979 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && |
2874 | (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || | 2980 | (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || |
2875 | vcpu->arch.mmu.direct_map)) { | 2981 | vcpu->arch.mmu.direct_map)) { |
2876 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2982 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2877 | 2983 | ||
2984 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2878 | sp = page_header(root); | 2985 | sp = page_header(root); |
2879 | --sp->root_count; | 2986 | --sp->root_count; |
2880 | if (!sp->root_count && sp->role.invalid) { | 2987 | if (!sp->root_count && sp->role.invalid) { |
2881 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); | 2988 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); |
2882 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | 2989 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); |
2883 | } | 2990 | } |
2884 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
2885 | spin_unlock(&vcpu->kvm->mmu_lock); | 2991 | spin_unlock(&vcpu->kvm->mmu_lock); |
2992 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
2886 | return; | 2993 | return; |
2887 | } | 2994 | } |
2995 | |||
2996 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2888 | for (i = 0; i < 4; ++i) { | 2997 | for (i = 0; i < 4; ++i) { |
2889 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | 2998 | hpa_t root = vcpu->arch.mmu.pae_root[i]; |
2890 | 2999 | ||
@@ -3148,17 +3257,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) | |||
3148 | return spte; | 3257 | return spte; |
3149 | } | 3258 | } |
3150 | 3259 | ||
3151 | /* | ||
3152 | * If it is a real mmio page fault, return 1 and emulat the instruction | ||
3153 | * directly, return 0 to let CPU fault again on the address, -1 is | ||
3154 | * returned if bug is detected. | ||
3155 | */ | ||
3156 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) | 3260 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) |
3157 | { | 3261 | { |
3158 | u64 spte; | 3262 | u64 spte; |
3159 | 3263 | ||
3160 | if (quickly_check_mmio_pf(vcpu, addr, direct)) | 3264 | if (quickly_check_mmio_pf(vcpu, addr, direct)) |
3161 | return 1; | 3265 | return RET_MMIO_PF_EMULATE; |
3162 | 3266 | ||
3163 | spte = walk_shadow_page_get_mmio_spte(vcpu, addr); | 3267 | spte = walk_shadow_page_get_mmio_spte(vcpu, addr); |
3164 | 3268 | ||
@@ -3166,12 +3270,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) | |||
3166 | gfn_t gfn = get_mmio_spte_gfn(spte); | 3270 | gfn_t gfn = get_mmio_spte_gfn(spte); |
3167 | unsigned access = get_mmio_spte_access(spte); | 3271 | unsigned access = get_mmio_spte_access(spte); |
3168 | 3272 | ||
3273 | if (!check_mmio_spte(vcpu->kvm, spte)) | ||
3274 | return RET_MMIO_PF_INVALID; | ||
3275 | |||
3169 | if (direct) | 3276 | if (direct) |
3170 | addr = 0; | 3277 | addr = 0; |
3171 | 3278 | ||
3172 | trace_handle_mmio_page_fault(addr, gfn, access); | 3279 | trace_handle_mmio_page_fault(addr, gfn, access); |
3173 | vcpu_cache_mmio_info(vcpu, addr, gfn, access); | 3280 | vcpu_cache_mmio_info(vcpu, addr, gfn, access); |
3174 | return 1; | 3281 | return RET_MMIO_PF_EMULATE; |
3175 | } | 3282 | } |
3176 | 3283 | ||
3177 | /* | 3284 | /* |
@@ -3179,13 +3286,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) | |||
3179 | * it's a BUG if the gfn is not a mmio page. | 3286 | * it's a BUG if the gfn is not a mmio page. |
3180 | */ | 3287 | */ |
3181 | if (direct && !check_direct_spte_mmio_pf(spte)) | 3288 | if (direct && !check_direct_spte_mmio_pf(spte)) |
3182 | return -1; | 3289 | return RET_MMIO_PF_BUG; |
3183 | 3290 | ||
3184 | /* | 3291 | /* |
3185 | * If the page table is zapped by other cpus, let CPU fault again on | 3292 | * If the page table is zapped by other cpus, let CPU fault again on |
3186 | * the address. | 3293 | * the address. |
3187 | */ | 3294 | */ |
3188 | return 0; | 3295 | return RET_MMIO_PF_RETRY; |
3189 | } | 3296 | } |
3190 | EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); | 3297 | EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); |
3191 | 3298 | ||
@@ -3195,7 +3302,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, | |||
3195 | int ret; | 3302 | int ret; |
3196 | 3303 | ||
3197 | ret = handle_mmio_page_fault_common(vcpu, addr, direct); | 3304 | ret = handle_mmio_page_fault_common(vcpu, addr, direct); |
3198 | WARN_ON(ret < 0); | 3305 | WARN_ON(ret == RET_MMIO_PF_BUG); |
3199 | return ret; | 3306 | return ret; |
3200 | } | 3307 | } |
3201 | 3308 | ||
@@ -3207,8 +3314,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |||
3207 | 3314 | ||
3208 | pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); | 3315 | pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); |
3209 | 3316 | ||
3210 | if (unlikely(error_code & PFERR_RSVD_MASK)) | 3317 | if (unlikely(error_code & PFERR_RSVD_MASK)) { |
3211 | return handle_mmio_page_fault(vcpu, gva, error_code, true); | 3318 | r = handle_mmio_page_fault(vcpu, gva, error_code, true); |
3319 | |||
3320 | if (likely(r != RET_MMIO_PF_INVALID)) | ||
3321 | return r; | ||
3322 | } | ||
3212 | 3323 | ||
3213 | r = mmu_topup_memory_caches(vcpu); | 3324 | r = mmu_topup_memory_caches(vcpu); |
3214 | if (r) | 3325 | if (r) |
@@ -3284,8 +3395,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | |||
3284 | ASSERT(vcpu); | 3395 | ASSERT(vcpu); |
3285 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 3396 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
3286 | 3397 | ||
3287 | if (unlikely(error_code & PFERR_RSVD_MASK)) | 3398 | if (unlikely(error_code & PFERR_RSVD_MASK)) { |
3288 | return handle_mmio_page_fault(vcpu, gpa, error_code, true); | 3399 | r = handle_mmio_page_fault(vcpu, gpa, error_code, true); |
3400 | |||
3401 | if (likely(r != RET_MMIO_PF_INVALID)) | ||
3402 | return r; | ||
3403 | } | ||
3289 | 3404 | ||
3290 | r = mmu_topup_memory_caches(vcpu); | 3405 | r = mmu_topup_memory_caches(vcpu); |
3291 | if (r) | 3406 | if (r) |
@@ -3391,8 +3506,8 @@ static inline void protect_clean_gpte(unsigned *access, unsigned gpte) | |||
3391 | *access &= mask; | 3506 | *access &= mask; |
3392 | } | 3507 | } |
3393 | 3508 | ||
3394 | static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, | 3509 | static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, |
3395 | int *nr_present) | 3510 | unsigned access, int *nr_present) |
3396 | { | 3511 | { |
3397 | if (unlikely(is_mmio_spte(*sptep))) { | 3512 | if (unlikely(is_mmio_spte(*sptep))) { |
3398 | if (gfn != get_mmio_spte_gfn(*sptep)) { | 3513 | if (gfn != get_mmio_spte_gfn(*sptep)) { |
@@ -3401,7 +3516,7 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, | |||
3401 | } | 3516 | } |
3402 | 3517 | ||
3403 | (*nr_present)++; | 3518 | (*nr_present)++; |
3404 | mark_mmio_spte(sptep, gfn, access); | 3519 | mark_mmio_spte(kvm, sptep, gfn, access); |
3405 | return true; | 3520 | return true; |
3406 | } | 3521 | } |
3407 | 3522 | ||
@@ -3764,9 +3879,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) | |||
3764 | if (r) | 3879 | if (r) |
3765 | goto out; | 3880 | goto out; |
3766 | r = mmu_alloc_roots(vcpu); | 3881 | r = mmu_alloc_roots(vcpu); |
3767 | spin_lock(&vcpu->kvm->mmu_lock); | 3882 | kvm_mmu_sync_roots(vcpu); |
3768 | mmu_sync_roots(vcpu); | ||
3769 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
3770 | if (r) | 3883 | if (r) |
3771 | goto out; | 3884 | goto out; |
3772 | /* set_cr3() should ensure TLB has been flushed */ | 3885 | /* set_cr3() should ensure TLB has been flushed */ |
@@ -4179,39 +4292,107 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
4179 | spin_unlock(&kvm->mmu_lock); | 4292 | spin_unlock(&kvm->mmu_lock); |
4180 | } | 4293 | } |
4181 | 4294 | ||
4182 | void kvm_mmu_zap_all(struct kvm *kvm) | 4295 | #define BATCH_ZAP_PAGES 10 |
4296 | static void kvm_zap_obsolete_pages(struct kvm *kvm) | ||
4183 | { | 4297 | { |
4184 | struct kvm_mmu_page *sp, *node; | 4298 | struct kvm_mmu_page *sp, *node; |
4185 | LIST_HEAD(invalid_list); | 4299 | int batch = 0; |
4186 | 4300 | ||
4187 | spin_lock(&kvm->mmu_lock); | ||
4188 | restart: | 4301 | restart: |
4189 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) | 4302 | list_for_each_entry_safe_reverse(sp, node, |
4190 | if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) | 4303 | &kvm->arch.active_mmu_pages, link) { |
4304 | int ret; | ||
4305 | |||
4306 | /* | ||
4307 | * No obsolete page exists before new created page since | ||
4308 | * active_mmu_pages is the FIFO list. | ||
4309 | */ | ||
4310 | if (!is_obsolete_sp(kvm, sp)) | ||
4311 | break; | ||
4312 | |||
4313 | /* | ||
4314 | * Since we are reversely walking the list and the invalid | ||
4315 | * list will be moved to the head, skip the invalid page | ||
4316 | * can help us to avoid the infinity list walking. | ||
4317 | */ | ||
4318 | if (sp->role.invalid) | ||
4319 | continue; | ||
4320 | |||
4321 | /* | ||
4322 | * Need not flush tlb since we only zap the sp with invalid | ||
4323 | * generation number. | ||
4324 | */ | ||
4325 | if (batch >= BATCH_ZAP_PAGES && | ||
4326 | cond_resched_lock(&kvm->mmu_lock)) { | ||
4327 | batch = 0; | ||
4328 | goto restart; | ||
4329 | } | ||
4330 | |||
4331 | ret = kvm_mmu_prepare_zap_page(kvm, sp, | ||
4332 | &kvm->arch.zapped_obsolete_pages); | ||
4333 | batch += ret; | ||
4334 | |||
4335 | if (ret) | ||
4191 | goto restart; | 4336 | goto restart; |
4337 | } | ||
4192 | 4338 | ||
4193 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | 4339 | /* |
4194 | spin_unlock(&kvm->mmu_lock); | 4340 | * Should flush tlb before free page tables since lockless-walking |
4341 | * may use the pages. | ||
4342 | */ | ||
4343 | kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); | ||
4195 | } | 4344 | } |
4196 | 4345 | ||
4197 | void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) | 4346 | /* |
4347 | * Fast invalidate all shadow pages and use lock-break technique | ||
4348 | * to zap obsolete pages. | ||
4349 | * | ||
4350 | * It's required when memslot is being deleted or VM is being | ||
4351 | * destroyed, in these cases, we should ensure that KVM MMU does | ||
4352 | * not use any resource of the being-deleted slot or all slots | ||
4353 | * after calling the function. | ||
4354 | */ | ||
4355 | void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) | ||
4198 | { | 4356 | { |
4199 | struct kvm_mmu_page *sp, *node; | ||
4200 | LIST_HEAD(invalid_list); | ||
4201 | |||
4202 | spin_lock(&kvm->mmu_lock); | 4357 | spin_lock(&kvm->mmu_lock); |
4203 | restart: | 4358 | trace_kvm_mmu_invalidate_zap_all_pages(kvm); |
4204 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { | 4359 | kvm->arch.mmu_valid_gen++; |
4205 | if (!sp->mmio_cached) | ||
4206 | continue; | ||
4207 | if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) | ||
4208 | goto restart; | ||
4209 | } | ||
4210 | 4360 | ||
4211 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | 4361 | /* |
4362 | * Notify all vcpus to reload their shadow page tables | ||
4363 | * and flush their TLBs. Every vcpu will then switch to a | ||
4364 | * new shadow page table carrying the new mmu_valid_gen. | ||
4365 | * | ||
4366 | * Note: this must be done under the protection of the | ||
4367 | * mmu-lock; otherwise a vcpu could purge a shadow page | ||
4368 | * but miss the TLB flush. | ||
4369 | */ | ||
4370 | kvm_reload_remote_mmus(kvm); | ||
4371 | |||
4372 | kvm_zap_obsolete_pages(kvm); | ||
4212 | spin_unlock(&kvm->mmu_lock); | 4373 | spin_unlock(&kvm->mmu_lock); |
4213 | } | 4374 | } |
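The whole scheme hinges on one comparison: each shadow page is stamped with kvm->arch.mmu_valid_gen at creation time (earlier in mmu.c, outside this excerpt), so bumping the counter under mmu_lock makes every existing page obsolete at once. The helper consulted at the top of the reverse walk above is presumably no more than the following (a sketch; its body is not part of this hunk):

static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	/* Obsolete pages carry a generation older than the current one. */
	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}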
4214 | 4375 | ||
4376 | static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) | ||
4377 | { | ||
4378 | return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); | ||
4379 | } | ||
4380 | |||
4381 | void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) | ||
4382 | { | ||
4383 | /* | ||
4384 | * The very rare case: the generation number has wrapped around, | ||
4385 | * so zap all shadow pages. | ||
4386 | * | ||
4387 | * The max value is MMIO_MAX_GEN - 1, since this function is not | ||
4388 | * called when a memslot is marked invalid. | ||
4389 | */ | ||
4390 | if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { | ||
4391 | printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); | ||
4392 | kvm_mmu_invalidate_zap_all_pages(kvm); | ||
4393 | } | ||
4394 | } | ||
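The MMIO side works the same way but per spte: the memslot generation is folded into the spte when it is marked (see the extended mark_mmio_spte tracepoint in mmutrace.h below), and the next MMIO fault compares the stored generation with the current one. The check itself lives in an mmu.c hunk outside this excerpt; assuming it matches the check_mmio_spte tracepoint added below, it boils down to:

static bool check_mmio_spte(struct kvm *kvm, u64 spte)
{
	unsigned int kvm_gen, spte_gen;

	kvm_gen = kvm_current_mmio_generation(kvm);
	spte_gen = get_mmio_spte_generation(spte);

	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
	return likely(kvm_gen == spte_gen);
}

get_mmio_spte_generation() is assumed here to be the decoding counterpart of whatever mark_mmio_spte() uses to encode the generation; only the tracepoints are visible in this excerpt.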
4395 | |||
4215 | static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) | 4396 | static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) |
4216 | { | 4397 | { |
4217 | struct kvm *kvm; | 4398 | struct kvm *kvm; |
@@ -4240,15 +4421,23 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) | |||
4240 | * want to shrink a VM that only started to populate its MMU | 4421 | * want to shrink a VM that only started to populate its MMU |
4241 | * anyway. | 4422 | * anyway. |
4242 | */ | 4423 | */ |
4243 | if (!kvm->arch.n_used_mmu_pages) | 4424 | if (!kvm->arch.n_used_mmu_pages && |
4425 | !kvm_has_zapped_obsolete_pages(kvm)) | ||
4244 | continue; | 4426 | continue; |
4245 | 4427 | ||
4246 | idx = srcu_read_lock(&kvm->srcu); | 4428 | idx = srcu_read_lock(&kvm->srcu); |
4247 | spin_lock(&kvm->mmu_lock); | 4429 | spin_lock(&kvm->mmu_lock); |
4248 | 4430 | ||
4431 | if (kvm_has_zapped_obsolete_pages(kvm)) { | ||
4432 | kvm_mmu_commit_zap_page(kvm, | ||
4433 | &kvm->arch.zapped_obsolete_pages); | ||
4434 | goto unlock; | ||
4435 | } | ||
4436 | |||
4249 | prepare_zap_oldest_mmu_page(kvm, &invalid_list); | 4437 | prepare_zap_oldest_mmu_page(kvm, &invalid_list); |
4250 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | 4438 | kvm_mmu_commit_zap_page(kvm, &invalid_list); |
4251 | 4439 | ||
4440 | unlock: | ||
4252 | spin_unlock(&kvm->mmu_lock); | 4441 | spin_unlock(&kvm->mmu_lock); |
4253 | srcu_read_unlock(&kvm->srcu, idx); | 4442 | srcu_read_unlock(&kvm->srcu, idx); |
4254 | 4443 | ||
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 2adcbc2cac6d..5b59c573aba7 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h | |||
@@ -52,6 +52,23 @@ | |||
52 | 52 | ||
53 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); | 53 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); |
54 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); | 54 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); |
55 | |||
56 | /* | ||
57 | * Return values of handle_mmio_page_fault_common: | ||
58 | * RET_MMIO_PF_EMULATE: it is a real mmio page fault; emulate the instruction | ||
59 | * directly. | ||
60 | * RET_MMIO_PF_INVALID: an invalid spte was detected; let the real page | ||
61 | * fault path update the mmio spte. | ||
62 | * RET_MMIO_PF_RETRY: let the CPU fault again on the address. | ||
63 | * RET_MMIO_PF_BUG: a bug was detected. | ||
64 | */ | ||
65 | enum { | ||
66 | RET_MMIO_PF_EMULATE = 1, | ||
67 | RET_MMIO_PF_INVALID = 2, | ||
68 | RET_MMIO_PF_RETRY = 0, | ||
69 | RET_MMIO_PF_BUG = -1 | ||
70 | }; | ||
71 | |||
55 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); | 72 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); |
56 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); | 73 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); |
57 | 74 | ||
@@ -97,4 +114,5 @@ static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access, | |||
97 | return (mmu->permissions[pfec >> 1] >> pte_access) & 1; | 114 | return (mmu->permissions[pfec >> 1] >> pte_access) & 1; |
98 | } | 115 | } |
99 | 116 | ||
117 | void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); | ||
100 | #endif | 118 | #endif |
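Callers are expected to handle all four values. The wrapper below mirrors the dispatch that handle_ept_misconfig() performs in the vmx.c hunk further down; the wrapper name is illustrative, the callees are the ones the patch actually uses:

static int handle_mmio_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
	int ret = handle_mmio_page_fault_common(vcpu, addr, direct);

	if (likely(ret == RET_MMIO_PF_EMULATE))
		return x86_emulate_instruction(vcpu, addr, 0, NULL, 0) ==
			EMULATE_DONE;

	/* Stale mmio spte: let the normal page fault path rebuild it. */
	if (unlikely(ret == RET_MMIO_PF_INVALID))
		return kvm_mmu_page_fault(vcpu, addr, 0, NULL, 0);

	if (unlikely(ret == RET_MMIO_PF_RETRY))
		return 1;

	/*
	 * RET_MMIO_PF_BUG: the EPT handler instead falls through to its
	 * real-misconfig path at this point.
	 */
	WARN_ON(1);
	return 0;
}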
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index b8f6172f4174..9d2e0ffcb190 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h | |||
@@ -7,16 +7,18 @@ | |||
7 | #undef TRACE_SYSTEM | 7 | #undef TRACE_SYSTEM |
8 | #define TRACE_SYSTEM kvmmmu | 8 | #define TRACE_SYSTEM kvmmmu |
9 | 9 | ||
10 | #define KVM_MMU_PAGE_FIELDS \ | 10 | #define KVM_MMU_PAGE_FIELDS \ |
11 | __field(__u64, gfn) \ | 11 | __field(unsigned long, mmu_valid_gen) \ |
12 | __field(__u32, role) \ | 12 | __field(__u64, gfn) \ |
13 | __field(__u32, root_count) \ | 13 | __field(__u32, role) \ |
14 | __field(__u32, root_count) \ | ||
14 | __field(bool, unsync) | 15 | __field(bool, unsync) |
15 | 16 | ||
16 | #define KVM_MMU_PAGE_ASSIGN(sp) \ | 17 | #define KVM_MMU_PAGE_ASSIGN(sp) \ |
17 | __entry->gfn = sp->gfn; \ | 18 | __entry->mmu_valid_gen = sp->mmu_valid_gen; \ |
18 | __entry->role = sp->role.word; \ | 19 | __entry->gfn = sp->gfn; \ |
19 | __entry->root_count = sp->root_count; \ | 20 | __entry->role = sp->role.word; \ |
21 | __entry->root_count = sp->root_count; \ | ||
20 | __entry->unsync = sp->unsync; | 22 | __entry->unsync = sp->unsync; |
21 | 23 | ||
22 | #define KVM_MMU_PAGE_PRINTK() ({ \ | 24 | #define KVM_MMU_PAGE_PRINTK() ({ \ |
@@ -28,8 +30,8 @@ | |||
28 | \ | 30 | \ |
29 | role.word = __entry->role; \ | 31 | role.word = __entry->role; \ |
30 | \ | 32 | \ |
31 | trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \ | 33 | trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \ |
32 | " %snxe root %u %s%c", \ | 34 | " %snxe root %u %s%c", __entry->mmu_valid_gen, \ |
33 | __entry->gfn, role.level, \ | 35 | __entry->gfn, role.level, \ |
34 | role.cr4_pae ? " pae" : "", \ | 36 | role.cr4_pae ? " pae" : "", \ |
35 | role.quadrant, \ | 37 | role.quadrant, \ |
@@ -197,23 +199,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, | |||
197 | 199 | ||
198 | TRACE_EVENT( | 200 | TRACE_EVENT( |
199 | mark_mmio_spte, | 201 | mark_mmio_spte, |
200 | TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), | 202 | TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen), |
201 | TP_ARGS(sptep, gfn, access), | 203 | TP_ARGS(sptep, gfn, access, gen), |
202 | 204 | ||
203 | TP_STRUCT__entry( | 205 | TP_STRUCT__entry( |
204 | __field(void *, sptep) | 206 | __field(void *, sptep) |
205 | __field(gfn_t, gfn) | 207 | __field(gfn_t, gfn) |
206 | __field(unsigned, access) | 208 | __field(unsigned, access) |
209 | __field(unsigned int, gen) | ||
207 | ), | 210 | ), |
208 | 211 | ||
209 | TP_fast_assign( | 212 | TP_fast_assign( |
210 | __entry->sptep = sptep; | 213 | __entry->sptep = sptep; |
211 | __entry->gfn = gfn; | 214 | __entry->gfn = gfn; |
212 | __entry->access = access; | 215 | __entry->access = access; |
216 | __entry->gen = gen; | ||
213 | ), | 217 | ), |
214 | 218 | ||
215 | TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn, | 219 | TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep, |
216 | __entry->access) | 220 | __entry->gfn, __entry->access, __entry->gen) |
217 | ); | 221 | ); |
218 | 222 | ||
219 | TRACE_EVENT( | 223 | TRACE_EVENT( |
@@ -274,6 +278,50 @@ TRACE_EVENT( | |||
274 | __spte_satisfied(old_spte), __spte_satisfied(new_spte) | 278 | __spte_satisfied(old_spte), __spte_satisfied(new_spte) |
275 | ) | 279 | ) |
276 | ); | 280 | ); |
281 | |||
282 | TRACE_EVENT( | ||
283 | kvm_mmu_invalidate_zap_all_pages, | ||
284 | TP_PROTO(struct kvm *kvm), | ||
285 | TP_ARGS(kvm), | ||
286 | |||
287 | TP_STRUCT__entry( | ||
288 | __field(unsigned long, mmu_valid_gen) | ||
289 | __field(unsigned int, mmu_used_pages) | ||
290 | ), | ||
291 | |||
292 | TP_fast_assign( | ||
293 | __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; | ||
294 | __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; | ||
295 | ), | ||
296 | |||
297 | TP_printk("kvm-mmu-valid-gen %lx used_pages %x", | ||
298 | __entry->mmu_valid_gen, __entry->mmu_used_pages | ||
299 | ) | ||
300 | ); | ||
301 | |||
302 | |||
303 | TRACE_EVENT( | ||
304 | check_mmio_spte, | ||
305 | TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), | ||
306 | TP_ARGS(spte, kvm_gen, spte_gen), | ||
307 | |||
308 | TP_STRUCT__entry( | ||
309 | __field(unsigned int, kvm_gen) | ||
310 | __field(unsigned int, spte_gen) | ||
311 | __field(u64, spte) | ||
312 | ), | ||
313 | |||
314 | TP_fast_assign( | ||
315 | __entry->kvm_gen = kvm_gen; | ||
316 | __entry->spte_gen = spte_gen; | ||
317 | __entry->spte = spte; | ||
318 | ), | ||
319 | |||
320 | TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte, | ||
321 | __entry->kvm_gen, __entry->spte_gen, | ||
322 | __entry->kvm_gen == __entry->spte_gen | ||
323 | ) | ||
324 | ); | ||
277 | #endif /* _TRACE_KVMMMU_H */ | 325 | #endif /* _TRACE_KVMMMU_H */ |
278 | 326 | ||
279 | #undef TRACE_INCLUDE_PATH | 327 | #undef TRACE_INCLUDE_PATH |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index da20860b457a..7769699d48a8 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -552,9 +552,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
552 | 552 | ||
553 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); | 553 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); |
554 | 554 | ||
555 | if (unlikely(error_code & PFERR_RSVD_MASK)) | 555 | if (unlikely(error_code & PFERR_RSVD_MASK)) { |
556 | return handle_mmio_page_fault(vcpu, addr, error_code, | 556 | r = handle_mmio_page_fault(vcpu, addr, error_code, |
557 | mmu_is_nested(vcpu)); | 557 | mmu_is_nested(vcpu)); |
558 | if (likely(r != RET_MMIO_PF_INVALID)) | ||
559 | return r; | ||
560 | } | ||
558 | 561 | ||
559 | r = mmu_topup_memory_caches(vcpu); | 562 | r = mmu_topup_memory_caches(vcpu); |
560 | if (r) | 563 | if (r) |
@@ -792,7 +795,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
792 | pte_access &= gpte_access(vcpu, gpte); | 795 | pte_access &= gpte_access(vcpu, gpte); |
793 | protect_clean_gpte(&pte_access, gpte); | 796 | protect_clean_gpte(&pte_access, gpte); |
794 | 797 | ||
795 | if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) | 798 | if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, |
799 | &nr_present)) | ||
796 | continue; | 800 | continue; |
797 | 801 | ||
798 | if (gfn != sp->gfns[i]) { | 802 | if (gfn != sp->gfns[i]) { |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a14a6eaf871d..c0bc80391e40 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -1026,7 +1026,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | |||
1026 | g_tsc_offset = svm->vmcb->control.tsc_offset - | 1026 | g_tsc_offset = svm->vmcb->control.tsc_offset - |
1027 | svm->nested.hsave->control.tsc_offset; | 1027 | svm->nested.hsave->control.tsc_offset; |
1028 | svm->nested.hsave->control.tsc_offset = offset; | 1028 | svm->nested.hsave->control.tsc_offset = offset; |
1029 | } | 1029 | } else |
1030 | trace_kvm_write_tsc_offset(vcpu->vcpu_id, | ||
1031 | svm->vmcb->control.tsc_offset, | ||
1032 | offset); | ||
1030 | 1033 | ||
1031 | svm->vmcb->control.tsc_offset = offset + g_tsc_offset; | 1034 | svm->vmcb->control.tsc_offset = offset + g_tsc_offset; |
1032 | 1035 | ||
@@ -1044,6 +1047,11 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho | |||
1044 | svm->vmcb->control.tsc_offset += adjustment; | 1047 | svm->vmcb->control.tsc_offset += adjustment; |
1045 | if (is_guest_mode(vcpu)) | 1048 | if (is_guest_mode(vcpu)) |
1046 | svm->nested.hsave->control.tsc_offset += adjustment; | 1049 | svm->nested.hsave->control.tsc_offset += adjustment; |
1050 | else | ||
1051 | trace_kvm_write_tsc_offset(vcpu->vcpu_id, | ||
1052 | svm->vmcb->control.tsc_offset - adjustment, | ||
1053 | svm->vmcb->control.tsc_offset); | ||
1054 | |||
1047 | mark_dirty(svm->vmcb, VMCB_INTERCEPTS); | 1055 | mark_dirty(svm->vmcb, VMCB_INTERCEPTS); |
1048 | } | 1056 | } |
1049 | 1057 | ||
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index fe5e00ed7036..545245d7cc63 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h | |||
@@ -756,6 +756,27 @@ TRACE_EVENT( | |||
756 | __entry->gpa_match ? "GPA" : "GVA") | 756 | __entry->gpa_match ? "GPA" : "GVA") |
757 | ); | 757 | ); |
758 | 758 | ||
759 | TRACE_EVENT(kvm_write_tsc_offset, | ||
760 | TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset, | ||
761 | __u64 next_tsc_offset), | ||
762 | TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset), | ||
763 | |||
764 | TP_STRUCT__entry( | ||
765 | __field( unsigned int, vcpu_id ) | ||
766 | __field( __u64, previous_tsc_offset ) | ||
767 | __field( __u64, next_tsc_offset ) | ||
768 | ), | ||
769 | |||
770 | TP_fast_assign( | ||
771 | __entry->vcpu_id = vcpu_id; | ||
772 | __entry->previous_tsc_offset = previous_tsc_offset; | ||
773 | __entry->next_tsc_offset = next_tsc_offset; | ||
774 | ), | ||
775 | |||
776 | TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id, | ||
777 | __entry->previous_tsc_offset, __entry->next_tsc_offset) | ||
778 | ); | ||
779 | |||
759 | #ifdef CONFIG_X86_64 | 780 | #ifdef CONFIG_X86_64 |
760 | 781 | ||
761 | #define host_clocks \ | 782 | #define host_clocks \ |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b30f5a54a2ab..a7e18551c968 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -2096,6 +2096,8 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | |||
2096 | (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? | 2096 | (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? |
2097 | vmcs12->tsc_offset : 0)); | 2097 | vmcs12->tsc_offset : 0)); |
2098 | } else { | 2098 | } else { |
2099 | trace_kvm_write_tsc_offset(vcpu->vcpu_id, | ||
2100 | vmcs_read64(TSC_OFFSET), offset); | ||
2099 | vmcs_write64(TSC_OFFSET, offset); | 2101 | vmcs_write64(TSC_OFFSET, offset); |
2100 | } | 2102 | } |
2101 | } | 2103 | } |
@@ -2103,11 +2105,14 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | |||
2103 | static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) | 2105 | static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) |
2104 | { | 2106 | { |
2105 | u64 offset = vmcs_read64(TSC_OFFSET); | 2107 | u64 offset = vmcs_read64(TSC_OFFSET); |
2108 | |||
2106 | vmcs_write64(TSC_OFFSET, offset + adjustment); | 2109 | vmcs_write64(TSC_OFFSET, offset + adjustment); |
2107 | if (is_guest_mode(vcpu)) { | 2110 | if (is_guest_mode(vcpu)) { |
2108 | /* Even when running L2, the adjustment needs to apply to L1 */ | 2111 | /* Even when running L2, the adjustment needs to apply to L1 */ |
2109 | to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; | 2112 | to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; |
2110 | } | 2113 | } else |
2114 | trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset, | ||
2115 | offset + adjustment); | ||
2111 | } | 2116 | } |
2112 | 2117 | ||
2113 | static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) | 2118 | static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) |
@@ -4176,10 +4181,10 @@ static void ept_set_mmio_spte_mask(void) | |||
4176 | /* | 4181 | /* |
4177 | * EPT Misconfigurations can be generated if the value of bits 2:0 | 4182 | * EPT Misconfigurations can be generated if the value of bits 2:0 |
4178 | * of an EPT paging-structure entry is 110b (write/execute). | 4183 | * of an EPT paging-structure entry is 110b (write/execute). |
4179 | * Also, magic bits (0xffull << 49) are set to quickly identify mmio | 4184 | * Also, magic bits (0x3ull << 62) are set to quickly identify mmio |
4180 | * spte. | 4185 | * spte. |
4181 | */ | 4186 | */ |
4182 | kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); | 4187 | kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); |
4183 | } | 4188 | } |
4184 | 4189 | ||
4185 | /* | 4190 | /* |
@@ -5366,10 +5371,14 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) | |||
5366 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | 5371 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); |
5367 | 5372 | ||
5368 | ret = handle_mmio_page_fault_common(vcpu, gpa, true); | 5373 | ret = handle_mmio_page_fault_common(vcpu, gpa, true); |
5369 | if (likely(ret == 1)) | 5374 | if (likely(ret == RET_MMIO_PF_EMULATE)) |
5370 | return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == | 5375 | return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == |
5371 | EMULATE_DONE; | 5376 | EMULATE_DONE; |
5372 | if (unlikely(!ret)) | 5377 | |
5378 | if (unlikely(ret == RET_MMIO_PF_INVALID)) | ||
5379 | return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); | ||
5380 | |||
5381 | if (unlikely(ret == RET_MMIO_PF_RETRY)) | ||
5373 | return 1; | 5382 | return 1; |
5374 | 5383 | ||
5375 | /* It is the real ept misconfig */ | 5384 | /* It is the real ept misconfig */ |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 292e6ca89f42..d21bce505315 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -1193,20 +1193,37 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) | |||
1193 | elapsed = ns - kvm->arch.last_tsc_nsec; | 1193 | elapsed = ns - kvm->arch.last_tsc_nsec; |
1194 | 1194 | ||
1195 | if (vcpu->arch.virtual_tsc_khz) { | 1195 | if (vcpu->arch.virtual_tsc_khz) { |
1196 | int faulted = 0; | ||
1197 | |||
1196 | /* n.b - signed multiplication and division required */ | 1198 | /* n.b - signed multiplication and division required */ |
1197 | usdiff = data - kvm->arch.last_tsc_write; | 1199 | usdiff = data - kvm->arch.last_tsc_write; |
1198 | #ifdef CONFIG_X86_64 | 1200 | #ifdef CONFIG_X86_64 |
1199 | usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; | 1201 | usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; |
1200 | #else | 1202 | #else |
1201 | /* do_div() only does unsigned */ | 1203 | /* do_div() only does unsigned */ |
1202 | asm("idivl %2; xor %%edx, %%edx" | 1204 | asm("1: idivl %[divisor]\n" |
1203 | : "=A"(usdiff) | 1205 | "2: xor %%edx, %%edx\n" |
1204 | : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); | 1206 | " movl $0, %[faulted]\n" |
1207 | "3:\n" | ||
1208 | ".section .fixup,\"ax\"\n" | ||
1209 | "4: movl $1, %[faulted]\n" | ||
1210 | " jmp 3b\n" | ||
1211 | ".previous\n" | ||
1212 | |||
1213 | _ASM_EXTABLE(1b, 4b) | ||
1214 | |||
1215 | : "=A"(usdiff), [faulted] "=r" (faulted) | ||
1216 | : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz)); | ||
1217 | |||
1205 | #endif | 1218 | #endif |
1206 | do_div(elapsed, 1000); | 1219 | do_div(elapsed, 1000); |
1207 | usdiff -= elapsed; | 1220 | usdiff -= elapsed; |
1208 | if (usdiff < 0) | 1221 | if (usdiff < 0) |
1209 | usdiff = -usdiff; | 1222 | usdiff = -usdiff; |
1223 | |||
1224 | /* idivl overflow => difference is larger than USEC_PER_SEC */ | ||
1225 | if (faulted) | ||
1226 | usdiff = USEC_PER_SEC; | ||
1210 | } else | 1227 | } else |
1211 | usdiff = USEC_PER_SEC; /* disable TSC match window below */ | 1228 | usdiff = USEC_PER_SEC; /* disable TSC match window below */ |
1212 | 1229 | ||
@@ -1587,6 +1604,30 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
1587 | return 0; | 1604 | return 0; |
1588 | } | 1605 | } |
1589 | 1606 | ||
1607 | /* | ||
1608 | * kvmclock updates that are isolated to a given vcpu, such as a | ||
1609 | * vcpu->cpu migration, should not leave the system_timestamp of | ||
1610 | * the other vcpus static; otherwise NTP frequency correction | ||
1611 | * would apply to one vcpu's system_timestamp but not to the | ||
1612 | * others. | ||
1613 | * | ||
1614 | * So in those cases, request a kvmclock update for all vcpus. | ||
1615 | * The worst-case time for a remote vcpu to update its kvmclock | ||
1616 | * is then bounded by the maximum nohz sleep latency. | ||
1617 | */ | ||
1618 | |||
1619 | static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) | ||
1620 | { | ||
1621 | int i; | ||
1622 | struct kvm *kvm = v->kvm; | ||
1623 | struct kvm_vcpu *vcpu; | ||
1624 | |||
1625 | kvm_for_each_vcpu(i, vcpu, kvm) { | ||
1626 | set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); | ||
1627 | kvm_vcpu_kick(vcpu); | ||
1628 | } | ||
1629 | } | ||
1630 | |||
1590 | static bool msr_mtrr_valid(unsigned msr) | 1631 | static bool msr_mtrr_valid(unsigned msr) |
1591 | { | 1632 | { |
1592 | switch (msr) { | 1633 | switch (msr) { |
@@ -1984,7 +2025,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | |||
1984 | kvmclock_reset(vcpu); | 2025 | kvmclock_reset(vcpu); |
1985 | 2026 | ||
1986 | vcpu->arch.time = data; | 2027 | vcpu->arch.time = data; |
1987 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | 2028 | kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); |
1988 | 2029 | ||
1989 | /* we verify if the enable bit is set... */ | 2030 | /* we verify if the enable bit is set... */ |
1990 | if (!(data & 1)) | 2031 | if (!(data & 1)) |
@@ -2701,7 +2742,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
2701 | * kvmclock on vcpu->cpu migration | 2742 | * kvmclock on vcpu->cpu migration |
2702 | */ | 2743 | */ |
2703 | if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) | 2744 | if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) |
2704 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | 2745 | kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); |
2705 | if (vcpu->cpu != cpu) | 2746 | if (vcpu->cpu != cpu) |
2706 | kvm_migrate_timers(vcpu); | 2747 | kvm_migrate_timers(vcpu); |
2707 | vcpu->cpu = cpu; | 2748 | vcpu->cpu = cpu; |
@@ -5238,7 +5279,13 @@ static void kvm_set_mmio_spte_mask(void) | |||
5238 | * Set the reserved bits and the present bit of a paging-structure | 5279 | * Set the reserved bits and the present bit of a paging-structure |
5239 | * entry to generate a page fault with PFER.RSV = 1. | 5280 | * entry to generate a page fault with PFER.RSV = 1. |
5240 | */ | 5281 | */ |
5241 | mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; | 5282 | /* Mask the reserved physical address bits. */ |
5283 | mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr; | ||
5284 | |||
5285 | /* Bit 62 is always reserved for 32bit host. */ | ||
5286 | mask |= 0x3ull << 62; | ||
5287 | |||
5288 | /* Set the present bit. */ | ||
5242 | mask |= 1ull; | 5289 | mask |= 1ull; |
5243 | 5290 | ||
5244 | #ifdef CONFIG_X86_64 | 5291 | #ifdef CONFIG_X86_64 |
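The rewritten mask covers the reserved physical-address bits up to bit 51 plus the always-reserved bits 63:62, matching the (0x3ull << 62) magic that ept_set_mmio_spte_mask() now uses in the vmx.c hunk above. A worked example, assuming maxphyaddr is 46 (a common MAXPHYADDR value):

	mask  = ((1ull << (51 - 46 + 1)) - 1) << 46; /* bits 51:46  -> 0x000fc00000000000 */
	mask |= 0x3ull << 62;                        /* bits 63:62  -> 0xc000000000000000 */
	mask |= 1ull;                                /* present bit -> 0x0000000000000001 */
	                                             /* final mask   = 0xc00fc00000000001 */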
@@ -5498,13 +5545,6 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) | |||
5498 | char instruction[3]; | 5545 | char instruction[3]; |
5499 | unsigned long rip = kvm_rip_read(vcpu); | 5546 | unsigned long rip = kvm_rip_read(vcpu); |
5500 | 5547 | ||
5501 | /* | ||
5502 | * Blow out the MMU to ensure that no other VCPU has an active mapping | ||
5503 | * to ensure that the updated hypercall appears atomically across all | ||
5504 | * VCPUs. | ||
5505 | */ | ||
5506 | kvm_mmu_zap_all(vcpu->kvm); | ||
5507 | |||
5508 | kvm_x86_ops->patch_hypercall(vcpu, instruction); | 5548 | kvm_x86_ops->patch_hypercall(vcpu, instruction); |
5509 | 5549 | ||
5510 | return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); | 5550 | return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); |
@@ -5702,6 +5742,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5702 | __kvm_migrate_timers(vcpu); | 5742 | __kvm_migrate_timers(vcpu); |
5703 | if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) | 5743 | if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) |
5704 | kvm_gen_update_masterclock(vcpu->kvm); | 5744 | kvm_gen_update_masterclock(vcpu->kvm); |
5745 | if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) | ||
5746 | kvm_gen_kvmclock_update(vcpu); | ||
5705 | if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { | 5747 | if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { |
5706 | r = kvm_guest_time_update(vcpu); | 5748 | r = kvm_guest_time_update(vcpu); |
5707 | if (unlikely(r)) | 5749 | if (unlikely(r)) |
@@ -6812,6 +6854,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) | |||
6812 | return -EINVAL; | 6854 | return -EINVAL; |
6813 | 6855 | ||
6814 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | 6856 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
6857 | INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); | ||
6815 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | 6858 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); |
6816 | 6859 | ||
6817 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ | 6860 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ |
@@ -7040,22 +7083,18 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
7040 | * If a memory slot is created or moved, we need to clear all | 7083 | * If a memory slot is created or moved, we need to clear all |
7041 | * mmio sptes. | 7084 | * mmio sptes. |
7042 | */ | 7085 | */ |
7043 | if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { | 7086 | kvm_mmu_invalidate_mmio_sptes(kvm); |
7044 | kvm_mmu_zap_mmio_sptes(kvm); | ||
7045 | kvm_reload_remote_mmus(kvm); | ||
7046 | } | ||
7047 | } | 7087 | } |
7048 | 7088 | ||
7049 | void kvm_arch_flush_shadow_all(struct kvm *kvm) | 7089 | void kvm_arch_flush_shadow_all(struct kvm *kvm) |
7050 | { | 7090 | { |
7051 | kvm_mmu_zap_all(kvm); | 7091 | kvm_mmu_invalidate_zap_all_pages(kvm); |
7052 | kvm_reload_remote_mmus(kvm); | ||
7053 | } | 7092 | } |
7054 | 7093 | ||
7055 | void kvm_arch_flush_shadow_memslot(struct kvm *kvm, | 7094 | void kvm_arch_flush_shadow_memslot(struct kvm *kvm, |
7056 | struct kvm_memory_slot *slot) | 7095 | struct kvm_memory_slot *slot) |
7057 | { | 7096 | { |
7058 | kvm_arch_flush_shadow_all(kvm); | 7097 | kvm_mmu_invalidate_zap_all_pages(kvm); |
7059 | } | 7098 | } |
7060 | 7099 | ||
7061 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | 7100 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) |
@@ -7263,3 +7302,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); | |||
7263 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); | 7302 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); |
7264 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); | 7303 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); |
7265 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); | 7304 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); |
7305 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); | ||