Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--   arch/x86/kvm/Makefile       |  13
-rw-r--r--   arch/x86/kvm/emulate.c      | 391
-rw-r--r--   arch/x86/kvm/lapic.c        |   4
-rw-r--r--   arch/x86/kvm/mmu.c          | 301
-rw-r--r--   arch/x86/kvm/mmu.h          |  18
-rw-r--r--   arch/x86/kvm/mmutrace.h     |  76
-rw-r--r--   arch/x86/kvm/paging_tmpl.h  |  10
-rw-r--r--   arch/x86/kvm/svm.c          |  10
-rw-r--r--   arch/x86/kvm/trace.h        |  21
-rw-r--r--   arch/x86/kvm/vmx.c          |  21
-rw-r--r--   arch/x86/kvm/x86.c          |  87
11 files changed, 558 insertions(+), 394 deletions(-)
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d609e1d84048..bf4fb04d0112 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,13 @@ CFLAGS_x86.o := -I.
 CFLAGS_svm.o := -I.
 CFLAGS_vmx.o := -I.
 
-kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o irq_comm.o eventfd.o \
- irqchip.o)
-kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(addprefix ../../../virt/kvm/, \
- assigned-dev.o iommu.o)
-kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
+KVM := ../../../virt/kvm
+
+kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \
+ $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \
+ $(KVM)/eventfd.o $(KVM)/irqchip.o
+kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o
+kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
 
 kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
  i8254.o cpuid.o pmu.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5953dcea752d..2bc1e81045b0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -61,6 +61,8 @@
 #define OpMem8 26ull /* 8-bit zero extended memory operand */
 #define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */
 #define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */
+#define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */
+#define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */
 
 #define OpBits 5 /* Width of operand field */
 #define OpMask ((1ull << OpBits) - 1)
@@ -86,6 +88,7 @@
 #define DstMem64 (OpMem64 << DstShift)
 #define DstImmUByte (OpImmUByte << DstShift)
 #define DstDX (OpDX << DstShift)
+#define DstAccLo (OpAccLo << DstShift)
 #define DstMask (OpMask << DstShift)
 /* Source operand type. */
 #define SrcShift 6
@@ -108,6 +111,7 @@
 #define SrcImm64 (OpImm64 << SrcShift)
 #define SrcDX (OpDX << SrcShift)
 #define SrcMem8 (OpMem8 << SrcShift)
+#define SrcAccHi (OpAccHi << SrcShift)
 #define SrcMask (OpMask << SrcShift)
 #define BitOp (1<<11)
 #define MemAbs (1<<12) /* Memory operand is absolute displacement */
@@ -138,6 +142,7 @@
 /* Source 2 operand type */
 #define Src2Shift (31)
 #define Src2None (OpNone << Src2Shift)
+#define Src2Mem (OpMem << Src2Shift)
 #define Src2CL (OpCL << Src2Shift)
 #define Src2ImmByte (OpImmByte << Src2Shift)
 #define Src2One (OpOne << Src2Shift)
@@ -155,6 +160,9 @@
 #define Avx ((u64)1 << 43) /* Advanced Vector Extensions */
 #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
 #define NoWrite ((u64)1 << 45) /* No writeback */
+#define SrcWrite ((u64)1 << 46) /* Write back src operand */
+
+#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
 
 #define X2(x...) x, x
 #define X3(x...) X2(x), x
@@ -171,10 +179,11 @@
 /*
  * fastop functions have a special calling convention:
  *
- * dst: [rdx]:rax (in/out)
- * src: rbx (in/out)
+ * dst: rax (in/out)
+ * src: rdx (in/out)
  * src2: rcx (in)
  * flags: rflags (in/out)
+ * ex: rsi (in:fastop pointer, out:zero if exception)
  *
  * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
  * different operand sizes can be reached by calculation, rather than a jump
@@ -276,174 +285,17 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
 }
 
 /*
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
- */
-
-#if defined(CONFIG_X86_64)
-#define _LO32 "k" /* force 32-bit operand */
-#define _STK "%%rsp" /* stack pointer */
-#elif defined(__i386__)
-#define _LO32 "" /* force 32-bit operand */
-#define _STK "%%esp" /* stack pointer */
-#endif
-
-/*
  * These EFLAGS bits are restored from saved value during emulation, and
  * any changes are written back to the saved value after emulation.
  */
 #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
 
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp) \
- /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
- "movl %"_sav",%"_LO32 _tmp"; " \
- "push %"_tmp"; " \
- "push %"_tmp"; " \
- "movl %"_msk",%"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "pushf; " \
- "notl %"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
- "pop %"_tmp"; " \
- "orl %"_LO32 _tmp",("_STK"); " \
- "popf; " \
- "pop %"_sav"; "
-
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
- /* _sav |= EFLAGS & _msk; */ \
- "pushf; " \
- "pop %"_tmp"; " \
- "andl %"_msk",%"_LO32 _tmp"; " \
- "orl %"_LO32 _tmp",%"_sav"; "
-
 #ifdef CONFIG_X86_64
 #define ON64(x) x
 #else
 #define ON64(x)
 #endif
 
-#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \
- do { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "2") \
- _op _suffix " %"_x"3,%1; " \
- _POST_EFLAGS("0", "4", "2") \
- : "=m" ((ctxt)->eflags), \
- "+q" (*(_dsttype*)&(ctxt)->dst.val), \
- "=&r" (_tmp) \
- : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \
- } while (0)
-
-
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- \
- switch ((ctxt)->dst.bytes) { \
- case 2: \
- ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \
- break; \
- case 4: \
- ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \
- break; \
- case 8: \
- ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \
- break; \
- } \
- } while (0)
-
-#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- switch ((ctxt)->dst.bytes) { \
- case 1: \
- ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \
- break; \
- default: \
- __emulate_2op_nobyte(ctxt, _op, \
- _wx, _wy, _lx, _ly, _qx, _qy); \
- break; \
- } \
- } while (0)
-
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(ctxt, _op) \
- __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c")
-
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(ctxt, _op) \
- __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r")
-
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(ctxt, _op) \
- __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r")
-
-/* Instruction has three operands and one operand is stored in ECX register */
-#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \
- do { \
- unsigned long _tmp; \
- _type _clv = (ctxt)->src2.val; \
- _type _srcv = (ctxt)->src.val; \
- _type _dstv = (ctxt)->dst.val; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "5", "2") \
- _op _suffix " %4,%1 \n" \
- _POST_EFLAGS("0", "5", "2") \
- : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \
- : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
- ); \
- \
- (ctxt)->src2.val = (unsigned long) _clv; \
- (ctxt)->src2.val = (unsigned long) _srcv; \
- (ctxt)->dst.val = (unsigned long) _dstv; \
- } while (0)
-
-#define emulate_2op_cl(ctxt, _op) \
- do { \
- switch ((ctxt)->dst.bytes) { \
- case 2: \
- __emulate_2op_cl(ctxt, _op, "w", u16); \
- break; \
- case 4: \
- __emulate_2op_cl(ctxt, _op, "l", u32); \
- break; \
- case 8: \
- ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \
- break; \
- } \
- } while (0)
-
-#define __emulate_1op(ctxt, _op, _suffix) \
- do { \
- unsigned long _tmp; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "3", "2") \
- _op _suffix " %1; " \
- _POST_EFLAGS("0", "3", "2") \
- : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK)); \
- } while (0)
-
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(ctxt, _op) \
- do { \
- switch ((ctxt)->dst.bytes) { \
- case 1: __emulate_1op(ctxt, _op, "b"); break; \
- case 2: __emulate_1op(ctxt, _op, "w"); break; \
- case 4: __emulate_1op(ctxt, _op, "l"); break; \
- case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \
- } \
- } while (0)
-
 static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 
 #define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
@@ -462,7 +314,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 #define FOPNOP() FOP_ALIGN FOP_RET
 
 #define FOP1E(op, dst) \
- FOP_ALIGN #op " %" #dst " \n\t" FOP_RET
+ FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET
+
+#define FOP1EEX(op, dst) \
+ FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)
 
 #define FASTOP1(op) \
 FOP_START(op) \
@@ -472,24 +327,42 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 ON64(FOP1E(op##q, rax)) \
 FOP_END
 
+/* 1-operand, using src2 (for MUL/DIV r/m) */
+#define FASTOP1SRC2(op, name) \
+FOP_START(name) \
+FOP1E(op, cl) \
+FOP1E(op, cx) \
+FOP1E(op, ecx) \
+ON64(FOP1E(op, rcx)) \
+FOP_END
+
+/* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */
+#define FASTOP1SRC2EX(op, name) \
+FOP_START(name) \
+FOP1EEX(op, cl) \
+FOP1EEX(op, cx) \
+FOP1EEX(op, ecx) \
+ON64(FOP1EEX(op, rcx)) \
+FOP_END
+
 #define FOP2E(op, dst, src) \
 FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
 
 #define FASTOP2(op) \
 FOP_START(op) \
-FOP2E(op##b, al, bl) \
-FOP2E(op##w, ax, bx) \
-FOP2E(op##l, eax, ebx) \
-ON64(FOP2E(op##q, rax, rbx)) \
+FOP2E(op##b, al, dl) \
+FOP2E(op##w, ax, dx) \
+FOP2E(op##l, eax, edx) \
+ON64(FOP2E(op##q, rax, rdx)) \
 FOP_END
 
 /* 2 operand, word only */
 #define FASTOP2W(op) \
 FOP_START(op) \
 FOPNOP() \
-FOP2E(op##w, ax, bx) \
-FOP2E(op##l, eax, ebx) \
-ON64(FOP2E(op##q, rax, rbx)) \
+FOP2E(op##w, ax, dx) \
+FOP2E(op##l, eax, edx) \
+ON64(FOP2E(op##q, rax, rdx)) \
 FOP_END
 
 /* 2 operand, src is CL */
@@ -508,14 +381,17 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 #define FASTOP3WCL(op) \
 FOP_START(op) \
 FOPNOP() \
-FOP3E(op##w, ax, bx, cl) \
-FOP3E(op##l, eax, ebx, cl) \
-ON64(FOP3E(op##q, rax, rbx, cl)) \
+FOP3E(op##w, ax, dx, cl) \
+FOP3E(op##l, eax, edx, cl) \
+ON64(FOP3E(op##q, rax, rdx, cl)) \
 FOP_END
 
 /* Special case for SETcc - 1 instruction per cc */
 #define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
 
+asm(".global kvm_fastop_exception \n"
+ "kvm_fastop_exception: xor %esi, %esi; ret");
+
 FOP_START(setcc)
 FOP_SETCC(seto)
 FOP_SETCC(setno)
@@ -538,47 +414,6 @@ FOP_END;
 FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET
 FOP_END;
 
-#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
- do { \
- unsigned long _tmp; \
- ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \
- ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "5", "1") \
- "1: \n\t" \
- _op _suffix " %6; " \
- "2: \n\t" \
- _POST_EFLAGS("0", "5", "1") \
- ".pushsection .fixup,\"ax\" \n\t" \
- "3: movb $1, %4 \n\t" \
- "jmp 2b \n\t" \
- ".popsection \n\t" \
- _ASM_EXTABLE(1b, 3b) \
- : "=m" ((ctxt)->eflags), "=&r" (_tmp), \
- "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \
- : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val)); \
- } while (0)
-
-/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
-#define emulate_1op_rax_rdx(ctxt, _op, _ex) \
- do { \
- switch((ctxt)->src.bytes) { \
- case 1: \
- __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \
- break; \
- case 2: \
- __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \
- break; \
- case 4: \
- __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \
- break; \
- case 8: ON64( \
- __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \
- break; \
- } \
- } while (0)
-
 static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
  enum x86_intercept intercept,
  enum x86_intercept_stage stage)
@@ -988,6 +823,11 @@ FASTOP2(xor);
 FASTOP2(cmp);
 FASTOP2(test);
 
+FASTOP1SRC2(mul, mul_ex);
+FASTOP1SRC2(imul, imul_ex);
+FASTOP1SRC2EX(div, div_ex);
+FASTOP1SRC2EX(idiv, idiv_ex);
+
 FASTOP3WCL(shld);
 FASTOP3WCL(shrd);
 
@@ -1013,6 +853,8 @@ FASTOP2W(bts);
 FASTOP2W(btr);
 FASTOP2W(btc);
 
+FASTOP2(xadd);
+
 static u8 test_cc(unsigned int condition, unsigned long flags)
 {
  u8 rc;
@@ -1726,45 +1568,42 @@ static void write_register_operand(struct operand *op)
 }
 }
 
-static int writeback(struct x86_emulate_ctxt *ctxt)
+static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
 {
  int rc;
 
- if (ctxt->d & NoWrite)
- return X86EMUL_CONTINUE;
-
- switch (ctxt->dst.type) {
+ switch (op->type) {
  case OP_REG:
- write_register_operand(&ctxt->dst);
+ write_register_operand(op);
  break;
  case OP_MEM:
  if (ctxt->lock_prefix)
  rc = segmented_cmpxchg(ctxt,
- ctxt->dst.addr.mem,
- &ctxt->dst.orig_val,
- &ctxt->dst.val,
- ctxt->dst.bytes);
+ op->addr.mem,
+ &op->orig_val,
+ &op->val,
+ op->bytes);
  else
  rc = segmented_write(ctxt,
- ctxt->dst.addr.mem,
- &ctxt->dst.val,
- ctxt->dst.bytes);
+ op->addr.mem,
+ &op->val,
+ op->bytes);
  if (rc != X86EMUL_CONTINUE)
  return rc;
  break;
  case OP_MEM_STR:
  rc = segmented_write(ctxt,
- ctxt->dst.addr.mem,
- ctxt->dst.data,
- ctxt->dst.bytes * ctxt->dst.count);
+ op->addr.mem,
+ op->data,
+ op->bytes * op->count);
  if (rc != X86EMUL_CONTINUE)
  return rc;
  break;
  case OP_XMM:
- write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
+ write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
  break;
  case OP_MM:
- write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm);
+ write_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
  break;
  case OP_NONE:
  /* no writeback */
@@ -2117,42 +1956,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 return X86EMUL_CONTINUE;
 }
 
-static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
-{
- u8 ex = 0;
-
- emulate_1op_rax_rdx(ctxt, "mul", ex);
- return X86EMUL_CONTINUE;
-}
-
-static int em_imul_ex(struct x86_emulate_ctxt *ctxt)
-{
- u8 ex = 0;
-
- emulate_1op_rax_rdx(ctxt, "imul", ex);
- return X86EMUL_CONTINUE;
-}
-
-static int em_div_ex(struct x86_emulate_ctxt *ctxt)
-{
- u8 de = 0;
-
- emulate_1op_rax_rdx(ctxt, "div", de);
- if (de)
- return emulate_de(ctxt);
- return X86EMUL_CONTINUE;
-}
-
-static int em_idiv_ex(struct x86_emulate_ctxt *ctxt)
-{
- u8 de = 0;
-
- emulate_1op_rax_rdx(ctxt, "idiv", de);
- if (de)
- return emulate_de(ctxt);
- return X86EMUL_CONTINUE;
-}
-
 static int em_grp45(struct x86_emulate_ctxt *ctxt)
 {
  int rc = X86EMUL_CONTINUE;
@@ -3734,10 +3537,10 @@ static const struct opcode group3[] = {
 F(DstMem | SrcImm | NoWrite, em_test),
 F(DstMem | SrcNone | Lock, em_not),
 F(DstMem | SrcNone | Lock, em_neg),
- I(SrcMem, em_mul_ex),
- I(SrcMem, em_imul_ex),
- I(SrcMem, em_div_ex),
- I(SrcMem, em_idiv_ex),
+ F(DstXacc | Src2Mem, em_mul_ex),
+ F(DstXacc | Src2Mem, em_imul_ex),
+ F(DstXacc | Src2Mem, em_div_ex),
+ F(DstXacc | Src2Mem, em_idiv_ex),
 };
 
 static const struct opcode group4[] = {
@@ -4064,7 +3867,7 @@ static const struct opcode twobyte_table[256] = {
 F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 /* 0xC0 - 0xC7 */
- D2bv(DstMem | SrcReg | ModRM | Lock),
+ F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
 N, D(DstMem | SrcReg | ModRM | Mov),
 N, N, N, GD(0, &group9),
 /* 0xC8 - 0xCF */
@@ -4172,6 +3975,24 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 fetch_register_operand(op);
 op->orig_val = op->val;
 break;
+ case OpAccLo:
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
+ case OpAccHi:
+ if (ctxt->d & ByteOp) {
+ op->type = OP_NONE;
+ break;
+ }
+ op->type = OP_REG;
+ op->bytes = ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
 case OpDI:
 op->type = OP_MEM;
 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
@@ -4553,11 +4374,15 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
 static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
 {
  ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
- fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
+ if (!(ctxt->d & ByteOp))
+ fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
 asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
- : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags)
- : "c"(ctxt->src2.val), [fastop]"S"(fop));
+ : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
+ [fastop]"+S"(fop)
+ : "c"(ctxt->src2.val));
  ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
+ if (!fop) /* exception is returned in fop variable */
+ return emulate_de(ctxt);
  return X86EMUL_CONTINUE;
 }
 
@@ -4773,9 +4598,17 @@ special_insn:
 goto done;
 
 writeback:
- rc = writeback(ctxt);
- if (rc != X86EMUL_CONTINUE)
- goto done;
+ if (!(ctxt->d & NoWrite)) {
+ rc = writeback(ctxt, &ctxt->dst);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+ if (ctxt->d & SrcWrite) {
+ BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
+ rc = writeback(ctxt, &ctxt->src);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
 
 /*
  * restore dst type in case the decoding will be reused
@@ -4872,12 +4705,6 @@ twobyte_insn:
 ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
 (s16) ctxt->src.val;
 break;
- case 0xc0 ... 0xc1: /* xadd */
- fastop(ctxt, em_add);
- /* Write back the register source. */
- ctxt->src.val = ctxt->dst.orig_val;
- write_register_operand(&ctxt->src);
- break;
 case 0xc3: /* movnti */
 ctxt->dst.bytes = ctxt->op_bytes;
 ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0eee2c8b64d1..afc11245827c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1608,8 +1608,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 return;
 
 if (atomic_read(&apic->lapic_timer.pending) > 0) {
- if (kvm_apic_local_deliver(apic, APIC_LVTT))
- atomic_dec(&apic->lapic_timer.pending);
+ kvm_apic_local_deliver(apic, APIC_LVTT);
+ atomic_set(&apic->lapic_timer.pending, 0);
 }
 }
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 004cc87b781c..0d094da49541 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,15 +197,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
-static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+/*
+ * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
+ * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
+ * number.
+ */
+#define MMIO_SPTE_GEN_LOW_SHIFT 3
+#define MMIO_SPTE_GEN_HIGH_SHIFT 52
+
+#define MMIO_GEN_SHIFT 19
+#define MMIO_GEN_LOW_SHIFT 9
+#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
+#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1)
+
+static u64 generation_mmio_spte_mask(unsigned int gen)
 {
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ u64 mask;
+
+ WARN_ON(gen > MMIO_MAX_GEN);
+
+ mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
+ mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+ return mask;
+}
+
+static unsigned int get_mmio_spte_generation(u64 spte)
+{
+ unsigned int gen;
+
+ spte &= ~shadow_mmio_mask;
+
+ gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
+ gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+ return gen;
+}
+
+static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+ /*
+ * Init kvm generation close to MMIO_MAX_GEN to easily test the
+ * code of handling generation number wrap-around.
+ */
+ return (kvm_memslots(kvm)->generation +
+ MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+}
+
+static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+ unsigned access)
+{
+ unsigned int gen = kvm_current_mmio_generation(kvm);
+ u64 mask = generation_mmio_spte_mask(gen);
 
 access &= ACC_WRITE_MASK | ACC_USER_MASK;
+ mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
 
- sp->mmio_cached = true;
- trace_mark_mmio_spte(sptep, gfn, access);
- mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+ trace_mark_mmio_spte(sptep, gfn, access, gen);
+ mmu_spte_set(sptep, mask);
 }
 
 static bool is_mmio_spte(u64 spte)
@@ -215,24 +263,38 @@ static bool is_mmio_spte(u64 spte)
 
 static gfn_t get_mmio_spte_gfn(u64 spte)
 {
- return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+ u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+ return (spte & ~mask) >> PAGE_SHIFT;
 }
 
 static unsigned get_mmio_spte_access(u64 spte)
 {
- return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+ u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+ return (spte & ~mask) & ~PAGE_MASK;
 }
 
-static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+ pfn_t pfn, unsigned access)
 {
 if (unlikely(is_noslot_pfn(pfn))) {
- mark_mmio_spte(sptep, gfn, access);
+ mark_mmio_spte(kvm, sptep, gfn, access);
 return true;
 }
 
 return false;
 }
 
+static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+{
+ unsigned int kvm_gen, spte_gen;
+
+ kvm_gen = kvm_current_mmio_generation(kvm);
+ spte_gen = get_mmio_spte_generation(spte);
+
+ trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+ return likely(kvm_gen == spte_gen);
+}
+
 static inline u64 rsvd_bits(int s, int e)
 {
 return ((1ULL << (e - s + 1)) - 1) << s;
@@ -404,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 /*
  * The idea using the light way get the spte on x86_32 guest is from
  * gup_get_pte(arch/x86/mm/gup.c).
- * The difference is we can not catch the spte tlb flush if we leave
- * guest mode, so we emulate it by increase clear_spte_count when spte
- * is cleared.
+ *
+ * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * coalesces them and we are running out of the MMU lock.  Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte.  The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race.  This is done using clear_spte_count.
  */
 static u64 __get_spte_lockless(u64 *sptep)
 {
@@ -1511,6 +1584,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 if (!direct)
 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ /*
+ * The active_mmu_pages list is the FIFO list, do not move the
+ * page until it is zapped. kvm_zap_obsolete_pages depends on
+ * this feature. See the comments in kvm_zap_obsolete_pages().
+ */
 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 sp->parent_ptes = 0;
 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
@@ -1648,6 +1727,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
  struct list_head *invalid_list);
 
+/*
+ * NOTE: we should pay more attention on the zapped-obsolete page
+ * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
+ * since it has been deleted from active_mmu_pages but still can be found
+ * at hast list.
+ *
+ * for_each_gfn_indirect_valid_sp has skipped that kind of page and
+ * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped
+ * all the obsolete pages.
+ */
 #define for_each_gfn_sp(_kvm, _sp, _gfn) \
 hlist_for_each_entry(_sp, \
 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -1838,6 +1927,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
 __clear_sp_write_flooding_count(sp);
 }
 
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
  gfn_t gfn,
  gva_t gaddr,
@@ -1864,6 +1958,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 role.quadrant = quadrant;
 }
 for_each_gfn_sp(vcpu->kvm, sp, gfn) {
+ if (is_obsolete_sp(vcpu->kvm, sp))
+ continue;
+
 if (!need_sync && sp->unsync)
 need_sync = true;
 
@@ -1900,6 +1997,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
 account_shadowed(vcpu->kvm, gfn);
 }
+ sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
 init_shadow_page_table(sp);
 trace_kvm_mmu_get_page(sp, true);
 return sp;
@@ -2070,8 +2168,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
 kvm_mmu_page_unlink_children(kvm, sp);
 kvm_mmu_unlink_parents(kvm, sp);
+
 if (!sp->role.invalid && !sp->role.direct)
 unaccount_shadowed(kvm, sp->gfn);
+
 if (sp->unsync)
 kvm_unlink_unsync_page(kvm, sp);
 if (!sp->root_count) {
@@ -2081,7 +2181,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 kvm_mod_used_mmu_pages(kvm, -1);
 } else {
 list_move(&sp->link, &kvm->arch.active_mmu_pages);
- kvm_reload_remote_mmus(kvm);
+
+ /*
+ * The obsolete pages can not be used on any vcpus.
+ * See the comments in kvm_mmu_invalidate_zap_all_pages().
+ */
+ if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+ kvm_reload_remote_mmus(kvm);
 }
 
 sp->role.invalid = 1;
@@ -2331,7 +2437,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 u64 spte;
 int ret = 0;
 
- if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+ if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))
 return 0;
 
 spte = PT_PRESENT_MASK;
@@ -2869,22 +2975,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 
 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 return;
- spin_lock(&vcpu->kvm->mmu_lock);
+
 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
 vcpu->arch.mmu.direct_map)) {
 hpa_t root = vcpu->arch.mmu.root_hpa;
 
+ spin_lock(&vcpu->kvm->mmu_lock);
 sp = page_header(root);
 --sp->root_count;
 if (!sp->root_count && sp->role.invalid) {
 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 return;
 }
+
+ spin_lock(&vcpu->kvm->mmu_lock);
 for (i = 0; i < 4; ++i) {
 hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -3148,17 +3257,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
 return spte;
 }
 
-/*
- * If it is a real mmio page fault, return 1 and emulat the instruction
- * directly, return 0 to let CPU fault again on the address, -1 is
- * returned if bug is detected.
- */
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 {
 u64 spte;
 
 if (quickly_check_mmio_pf(vcpu, addr, direct))
- return 1;
+ return RET_MMIO_PF_EMULATE;
 
 spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
 
@@ -3166,12 +3270,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 gfn_t gfn = get_mmio_spte_gfn(spte);
 unsigned access = get_mmio_spte_access(spte);
 
+ if (!check_mmio_spte(vcpu->kvm, spte))
+ return RET_MMIO_PF_INVALID;
+
 if (direct)
 addr = 0;
 
 trace_handle_mmio_page_fault(addr, gfn, access);
 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
- return 1;
+ return RET_MMIO_PF_EMULATE;
 }
 
 /*
@@ -3179,13 +3286,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 * it's a BUG if the gfn is not a mmio page.
 */
 if (direct && !check_direct_spte_mmio_pf(spte))
- return -1;
+ return RET_MMIO_PF_BUG;
 
 /*
 * If the page table is zapped by other cpus, let CPU fault again on
 * the address.
 */
- return 0;
+ return RET_MMIO_PF_RETRY;
 }
 EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
 
@@ -3195,7 +3302,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
 int ret;
 
 ret = handle_mmio_page_fault_common(vcpu, addr, direct);
- WARN_ON(ret < 0);
+ WARN_ON(ret == RET_MMIO_PF_BUG);
 return ret;
 }
 
@@ -3207,8 +3314,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 
 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
 
- if (unlikely(error_code & PFERR_RSVD_MASK))
- return handle_mmio_page_fault(vcpu, gva, error_code, true);
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+
+ if (likely(r != RET_MMIO_PF_INVALID))
+ return r;
+ }
 
 r = mmu_topup_memory_caches(vcpu);
 if (r)
@@ -3284,8 +3395,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 ASSERT(vcpu);
 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
- if (unlikely(error_code & PFERR_RSVD_MASK))
- return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
+ if (likely(r != RET_MMIO_PF_INVALID))
+ return r;
+ }
 
 r = mmu_topup_memory_caches(vcpu);
 if (r)
@@ -3391,8 +3506,8 @@ static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
 *access &= mask;
 }
 
-static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
- int *nr_present)
+static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+ unsigned access, int *nr_present)
 {
 if (unlikely(is_mmio_spte(*sptep))) {
 if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -3401,7 +3516,7 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
 }
 
 (*nr_present)++;
- mark_mmio_spte(sptep, gfn, access);
+ mark_mmio_spte(kvm, sptep, gfn, access);
 return true;
 }
 
@@ -3764,9 +3879,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 if (r)
 goto out;
 r = mmu_alloc_roots(vcpu);
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_sync_roots(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_sync_roots(vcpu);
 if (r)
 goto out;
 /* set_cr3() should ensure TLB has been flushed */
@@ -4179,39 +4292,107 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 spin_unlock(&kvm->mmu_lock);
 }
 
-void kvm_mmu_zap_all(struct kvm *kvm)
+#define BATCH_ZAP_PAGES 10
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
 struct kvm_mmu_page *sp, *node;
- LIST_HEAD(invalid_list);
+ int batch = 0;
 
- spin_lock(&kvm->mmu_lock);
 restart:
- list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+ list_for_each_entry_safe_reverse(sp, node,
+ &kvm->arch.active_mmu_pages, link) {
+ int ret;
+
+ /*
+ * No obsolete page exists before new created page since
+ * active_mmu_pages is the FIFO list.
+ */
+ if (!is_obsolete_sp(kvm, sp))
+ break;
+
+ /*
+ * Since we are reversely walking the list and the invalid
+ * list will be moved to the head, skip the invalid page
+ * can help us to avoid the infinity list walking.
+ */
+ if (sp->role.invalid)
+ continue;
+
+ /*
+ * Need not flush tlb since we only zap the sp with invalid
+ * generation number.
+ */
+ if (batch >= BATCH_ZAP_PAGES &&
+ cond_resched_lock(&kvm->mmu_lock)) {
+ batch = 0;
+ goto restart;
+ }
+
+ ret = kvm_mmu_prepare_zap_page(kvm, sp,
+ &kvm->arch.zapped_obsolete_pages);
+ batch += ret;
+
+ if (ret)
 goto restart;
+ }
 
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
+ /*
+ * Should flush tlb before free page tables since lockless-walking
+ * may use the pages.
+ */
+ kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
 }
 
-void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
+/*
+ * Fast invalidate all shadow pages and use lock-break technique
+ * to zap obsolete pages.
+ *
+ * It's required when memslot is being deleted or VM is being
+ * destroyed, in these cases, we should ensure that KVM MMU does
+ * not use any resource of the being-deleted slot or all slots
+ * after calling the function.
+ */
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
 {
- struct kvm_mmu_page *sp, *node;
- LIST_HEAD(invalid_list);
-
 spin_lock(&kvm->mmu_lock);
-restart:
- list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
- if (!sp->mmio_cached)
- continue;
- if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
- goto restart;
- }
+ trace_kvm_mmu_invalidate_zap_all_pages(kvm);
+ kvm->arch.mmu_valid_gen++;
 
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ /*
+ * Notify all vcpus to reload its shadow page table
+ * and flush TLB. Then all vcpus will switch to new
+ * shadow page table with the new mmu_valid_gen.
+ *
+ * Note: we should do this under the protection of
+ * mmu-lock, otherwise, vcpu would purge shadow page
+ * but miss tlb flush.
+ */
+ kvm_reload_remote_mmus(kvm);
+
+ kvm_zap_obsolete_pages(kvm);
 spin_unlock(&kvm->mmu_lock);
 }
 
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+ return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
+
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
+{
+ /*
+ * The very rare case: if the generation-number is round,
+ * zap all shadow pages.
+ *
+ * The max value is MMIO_MAX_GEN - 1 since it is not called
+ * when mark memslot invalid.
+ */
+ if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
+ printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+ }
+}
+
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 struct kvm *kvm;
@@ -4240,15 +4421,23 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 * want to shrink a VM that only started to populate its MMU
 * anyway.
 */
- if (!kvm->arch.n_used_mmu_pages)
+ if (!kvm->arch.n_used_mmu_pages &&
+ !kvm_has_zapped_obsolete_pages(kvm))
 continue;
 
 idx = srcu_read_lock(&kvm->srcu);
 spin_lock(&kvm->mmu_lock);
 
+ if (kvm_has_zapped_obsolete_pages(kvm)) {
+ kvm_mmu_commit_zap_page(kvm,
+ &kvm->arch.zapped_obsolete_pages);
+ goto unlock;
+ }
+
 prepare_zap_oldest_mmu_page(kvm, &invalid_list);
 kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
+unlock:
 spin_unlock(&kvm->mmu_lock);
 srcu_read_unlock(&kvm->srcu, idx);
 
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2adcbc2cac6d..5b59c573aba7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,6 +52,23 @@
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+
+/*
+ * Return values of handle_mmio_page_fault_common:
+ * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
+ *			directly.
+ * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
+ *			fault path update the mmio spte.
+ * RET_MMIO_PF_RETRY: let CPU fault again on the address.
+ * RET_MMIO_PF_BUG: bug is detected.
+ */
+enum {
+ RET_MMIO_PF_EMULATE = 1,
+ RET_MMIO_PF_INVALID = 2,
+ RET_MMIO_PF_RETRY = 0,
+ RET_MMIO_PF_BUG = -1
+};
+
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
@@ -97,4 +114,5 @@ static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
 return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
 }
 
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
 #endif
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b8f6172f4174..9d2e0ffcb190 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -7,16 +7,18 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
 
 #define KVM_MMU_PAGE_FIELDS \
- __field(__u64, gfn) \
- __field(__u32, role) \
- __field(__u32, root_count) \
+ __field(unsigned long, mmu_valid_gen) \
+ __field(__u64, gfn) \
+ __field(__u32, role) \
+ __field(__u32, root_count) \
 __field(bool, unsync)
 
 #define KVM_MMU_PAGE_ASSIGN(sp) \
- __entry->gfn = sp->gfn; \
- __entry->role = sp->role.word; \
- __entry->root_count = sp->root_count; \
+ __entry->mmu_valid_gen = sp->mmu_valid_gen; \
+ __entry->gfn = sp->gfn; \
+ __entry->role = sp->role.word; \
+ __entry->root_count = sp->root_count; \
 __entry->unsync = sp->unsync;
 
 #define KVM_MMU_PAGE_PRINTK() ({ \
@@ -28,8 +30,8 @@
 \
 role.word = __entry->role; \
 \
- trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \
- " %snxe root %u %s%c", \
+ trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \
+ " %snxe root %u %s%c", __entry->mmu_valid_gen, \
 __entry->gfn, role.level, \
 role.cr4_pae ? " pae" : "", \
 role.quadrant, \
@@ -197,23 +199,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
 
 TRACE_EVENT(
 mark_mmio_spte,
- TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
- TP_ARGS(sptep, gfn, access),
+ TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen),
+ TP_ARGS(sptep, gfn, access, gen),
 
 TP_STRUCT__entry(
 __field(void *, sptep)
 __field(gfn_t, gfn)
 __field(unsigned, access)
+ __field(unsigned int, gen)
 ),
 
 TP_fast_assign(
 __entry->sptep = sptep;
 __entry->gfn = gfn;
 __entry->access = access;
+ __entry->gen = gen;
 ),
 
- TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
- __entry->access)
+ TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep,
+ __entry->gfn, __entry->access, __entry->gen)
 );
 
 TRACE_EVENT(
@@ -274,6 +278,50 @@ TRACE_EVENT(
 __spte_satisfied(old_spte), __spte_satisfied(new_spte)
 )
 );
+
+TRACE_EVENT(
+ kvm_mmu_invalidate_zap_all_pages,
+ TP_PROTO(struct kvm *kvm),
+ TP_ARGS(kvm),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, mmu_valid_gen)
+ __field(unsigned int, mmu_used_pages)
+ ),
+
+ TP_fast_assign(
+ __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+ __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+ ),
+
+ TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
+ __entry->mmu_valid_gen, __entry->mmu_used_pages
+ )
+);
+
+
+TRACE_EVENT(
+ check_mmio_spte,
+ TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
+ TP_ARGS(spte, kvm_gen, spte_gen),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, kvm_gen)
+ __field(unsigned int, spte_gen)
+ __field(u64, spte)
+ ),
+
+ TP_fast_assign(
+ __entry->kvm_gen = kvm_gen;
+ __entry->spte_gen = spte_gen;
+ __entry->spte = spte;
+ ),
+
+ TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte,
+ __entry->kvm_gen, __entry->spte_gen,
+ __entry->kvm_gen == __entry->spte_gen
+ )
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index da20860b457a..7769699d48a8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -552,9 +552,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
- if (unlikely(error_code & PFERR_RSVD_MASK))
- return handle_mmio_page_fault(vcpu, addr, error_code,
- mmu_is_nested(vcpu));
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, addr, error_code,
+ mmu_is_nested(vcpu));
+ if (likely(r != RET_MMIO_PF_INVALID))
+ return r;
+ };
 
 r = mmu_topup_memory_caches(vcpu);
 if (r)
@@ -792,7 +795,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 pte_access &= gpte_access(vcpu, gpte);
 protect_clean_gpte(&pte_access, gpte);
 
- if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+ if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
+ &nr_present))
 continue;
 
 if (gfn != sp->gfns[i]) {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a14a6eaf871d..c0bc80391e40 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1026,7 +1026,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 g_tsc_offset = svm->vmcb->control.tsc_offset -
 svm->nested.hsave->control.tsc_offset;
 svm->nested.hsave->control.tsc_offset = offset;
- }
+ } else
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ svm->vmcb->control.tsc_offset,
+ offset);
 
 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
 
@@ -1044,6 +1047,11 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
 svm->vmcb->control.tsc_offset += adjustment;
 if (is_guest_mode(vcpu))
 svm->nested.hsave->control.tsc_offset += adjustment;
+ else
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ svm->vmcb->control.tsc_offset - adjustment,
+ svm->vmcb->control.tsc_offset);
+
 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index fe5e00ed7036..545245d7cc63 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -756,6 +756,27 @@ TRACE_EVENT(
756 __entry->gpa_match ? "GPA" : "GVA") 756 __entry->gpa_match ? "GPA" : "GVA")
757); 757);
758 758
759TRACE_EVENT(kvm_write_tsc_offset,
760 TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset,
761 __u64 next_tsc_offset),
762 TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset),
763
764 TP_STRUCT__entry(
765 __field( unsigned int, vcpu_id )
766 __field( __u64, previous_tsc_offset )
767 __field( __u64, next_tsc_offset )
768 ),
769
770 TP_fast_assign(
771 __entry->vcpu_id = vcpu_id;
772 __entry->previous_tsc_offset = previous_tsc_offset;
773 __entry->next_tsc_offset = next_tsc_offset;
774 ),
775
776 TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id,
777 __entry->previous_tsc_offset, __entry->next_tsc_offset)
778);
779
759#ifdef CONFIG_X86_64 780#ifdef CONFIG_X86_64
760 781
761#define host_clocks \ 782#define host_clocks \
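
The new kvm_write_tsc_offset event is enabled like any other kvm tracepoint (it should show up under events/kvm/ in tracefs once this patch is applied). Given the TP_printk format above, a captured line would look roughly like the following; the numbers are invented for illustration:

	kvm_write_tsc_offset: vcpu=0 prev=0 next=123456789
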
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 260a91939555..a7e18551c968 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2096,6 +2096,8 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2096 (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? 2096 (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
2097 vmcs12->tsc_offset : 0)); 2097 vmcs12->tsc_offset : 0));
2098 } else { 2098 } else {
2099 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2100 vmcs_read64(TSC_OFFSET), offset);
2099 vmcs_write64(TSC_OFFSET, offset); 2101 vmcs_write64(TSC_OFFSET, offset);
2100 } 2102 }
2101} 2103}
@@ -2103,11 +2105,14 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2103static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) 2105static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
2104{ 2106{
2105 u64 offset = vmcs_read64(TSC_OFFSET); 2107 u64 offset = vmcs_read64(TSC_OFFSET);
2108
2106 vmcs_write64(TSC_OFFSET, offset + adjustment); 2109 vmcs_write64(TSC_OFFSET, offset + adjustment);
2107 if (is_guest_mode(vcpu)) { 2110 if (is_guest_mode(vcpu)) {
2108 /* Even when running L2, the adjustment needs to apply to L1 */ 2111 /* Even when running L2, the adjustment needs to apply to L1 */
2109 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; 2112 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
2110 } 2113 } else
2114 trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset,
2115 offset + adjustment);
2111} 2116}
2112 2117
2113static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 2118static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
@@ -4176,10 +4181,10 @@ static void ept_set_mmio_spte_mask(void)
4176 /* 4181 /*
4177 * EPT Misconfigurations can be generated if the value of bits 2:0 4182 * EPT Misconfigurations can be generated if the value of bits 2:0
4178 * of an EPT paging-structure entry is 110b (write/execute). 4183 * of an EPT paging-structure entry is 110b (write/execute).
4179 * Also, magic bits (0xffull << 49) is set to quickly identify mmio 4184 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
4180 * spte. 4185 * spte.
4181 */ 4186 */
4182 kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); 4187 kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
4183} 4188}
4184 4189
4185/* 4190/*
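
The replacement EPT MMIO mask keeps the marker bits out of the physical-address range. A quick standalone check of the resulting bit layout, for illustration only:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t mask = (0x3ull << 62) | 0x6ull;

	/* bits 2:0 = 110b: write+execute without read, which is guaranteed
	 * to cause an EPT misconfiguration exit; bits 63:62 = 11b: marker
	 * bits that let KVM recognise an MMIO spte at a glance. */
	printf("%#llx\n", (unsigned long long)mask); /* 0xc000000000000006 */
	return 0;
}
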
@@ -5366,10 +5371,14 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5366 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5371 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5367 5372
5368 ret = handle_mmio_page_fault_common(vcpu, gpa, true); 5373 ret = handle_mmio_page_fault_common(vcpu, gpa, true);
5369 if (likely(ret == 1)) 5374 if (likely(ret == RET_MMIO_PF_EMULATE))
5370 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == 5375 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
5371 EMULATE_DONE; 5376 EMULATE_DONE;
5372 if (unlikely(!ret)) 5377
5378 if (unlikely(ret == RET_MMIO_PF_INVALID))
5379 return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
5380
5381 if (unlikely(ret == RET_MMIO_PF_RETRY))
5373 return 1; 5382 return 1;
5374 5383
5375 /* It is the real ept misconfig */ 5384 /* It is the real ept misconfig */
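
handle_ept_misconfig() now distinguishes three outcomes of handle_mmio_page_fault_common() before falling through to the genuine-misconfig path. A self-contained restatement of that dispatch; the RET_MMIO_PF_* names mirror the hunk, but the numeric values and the helper are mine:

#include <stdio.h>

enum {
	RET_MMIO_PF_EMULATE,   /* values here are illustrative only */
	RET_MMIO_PF_INVALID,
	RET_MMIO_PF_RETRY,
	RET_MMIO_PF_MISCONFIG
};

static const char *ept_misconfig_action(int ret)
{
	switch (ret) {
	case RET_MMIO_PF_EMULATE: return "cached MMIO spte: emulate the access";
	case RET_MMIO_PF_INVALID: return "stale MMIO generation: go through kvm_mmu_page_fault()";
	case RET_MMIO_PF_RETRY:   return "spte already fixed: just re-enter the guest";
	default:                  return "genuine EPT misconfiguration: handled after the hunk";
	}
}

int main(void)
{
	for (int r = RET_MMIO_PF_EMULATE; r <= RET_MMIO_PF_MISCONFIG; r++)
		printf("%d -> %s\n", r, ept_misconfig_action(r));
	return 0;
}
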
@@ -7942,7 +7951,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
7942 7951
7943 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); 7952 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
7944 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); 7953 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
7945 vmx_set_rflags(vcpu, X86_EFLAGS_BIT1); 7954 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
7946 /* 7955 /*
7947 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 7956 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
7948 * actually changed, because it depends on the current state of 7957 * actually changed, because it depends on the current state of
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 094b5d96ab14..d21bce505315 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -582,8 +582,6 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
582 if (index != XCR_XFEATURE_ENABLED_MASK) 582 if (index != XCR_XFEATURE_ENABLED_MASK)
583 return 1; 583 return 1;
584 xcr0 = xcr; 584 xcr0 = xcr;
585 if (kvm_x86_ops->get_cpl(vcpu) != 0)
586 return 1;
587 if (!(xcr0 & XSTATE_FP)) 585 if (!(xcr0 & XSTATE_FP))
588 return 1; 586 return 1;
589 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) 587 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
@@ -597,7 +595,8 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
597 595
598int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 596int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
599{ 597{
600 if (__kvm_set_xcr(vcpu, index, xcr)) { 598 if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
599 __kvm_set_xcr(vcpu, index, xcr)) {
601 kvm_inject_gp(vcpu, 0); 600 kvm_inject_gp(vcpu, 0);
602 return 1; 601 return 1;
603 } 602 }
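
The privilege check now happens in kvm_set_xcr(), so a non-zero CPL injects #GP before the value is even examined, while __kvm_set_xcr() keeps the architectural XCR0 rules shown above. A self-contained restatement of those rules; the bit positions are the standard XSTATE_FP/SSE/YMM values:

#include <stdbool.h>
#include <stdio.h>

#define XSTATE_FP  (1ull << 0)  /* x87 state */
#define XSTATE_SSE (1ull << 1)  /* SSE state */
#define XSTATE_YMM (1ull << 2)  /* AVX state */

static bool xcr0_valid(unsigned long long xcr0)
{
	if (!(xcr0 & XSTATE_FP))
		return false;  /* x87 state can never be disabled in XCR0 */
	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
		return false;  /* AVX state requires SSE state            */
	return true;
}

int main(void)
{
	printf("%d %d %d\n",
	       xcr0_valid(XSTATE_FP),                           /* 1 */
	       xcr0_valid(XSTATE_FP | XSTATE_SSE | XSTATE_YMM), /* 1 */
	       xcr0_valid(XSTATE_FP | XSTATE_YMM));             /* 0 */
	return 0;
}
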
@@ -619,7 +618,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
619 if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) 618 if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
620 return 1; 619 return 1;
621 620
622 if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS)) 621 if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
623 return 1; 622 return 1;
624 623
625 if (is_long_mode(vcpu)) { 624 if (is_long_mode(vcpu)) {
@@ -1194,20 +1193,37 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1194 elapsed = ns - kvm->arch.last_tsc_nsec; 1193 elapsed = ns - kvm->arch.last_tsc_nsec;
1195 1194
1196 if (vcpu->arch.virtual_tsc_khz) { 1195 if (vcpu->arch.virtual_tsc_khz) {
1196 int faulted = 0;
1197
1197 /* n.b - signed multiplication and division required */ 1198 /* n.b - signed multiplication and division required */
1198 usdiff = data - kvm->arch.last_tsc_write; 1199 usdiff = data - kvm->arch.last_tsc_write;
1199#ifdef CONFIG_X86_64 1200#ifdef CONFIG_X86_64
1200 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; 1201 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
1201#else 1202#else
1202 /* do_div() only does unsigned */ 1203 /* do_div() only does unsigned */
1203 asm("idivl %2; xor %%edx, %%edx" 1204 asm("1: idivl %[divisor]\n"
1204 : "=A"(usdiff) 1205 "2: xor %%edx, %%edx\n"
1205 : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); 1206 " movl $0, %[faulted]\n"
1207 "3:\n"
1208 ".section .fixup,\"ax\"\n"
1209 "4: movl $1, %[faulted]\n"
1210 " jmp 3b\n"
1211 ".previous\n"
1212
1213 _ASM_EXTABLE(1b, 4b)
1214
1215 : "=A"(usdiff), [faulted] "=r" (faulted)
1216 : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
1217
1206#endif 1218#endif
1207 do_div(elapsed, 1000); 1219 do_div(elapsed, 1000);
1208 usdiff -= elapsed; 1220 usdiff -= elapsed;
1209 if (usdiff < 0) 1221 if (usdiff < 0)
1210 usdiff = -usdiff; 1222 usdiff = -usdiff;
1223
1224 /* idivl overflow => difference is larger than USEC_PER_SEC */
1225 if (faulted)
1226 usdiff = USEC_PER_SEC;
1211 } else 1227 } else
1212 usdiff = USEC_PER_SEC; /* disable TSC match window below */ 1228 usdiff = USEC_PER_SEC; /* disable TSC match window below */
1213 1229
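
The fixup section is needed because idivl divides the 64-bit EDX:EAX pair by a 32-bit divisor and raises a divide error when the quotient does not fit in 32 bits; the exception-table entry turns that fault into faulted = 1 so the caller can clamp usdiff to USEC_PER_SEC. A userspace model of the overflow condition, with example numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t tsc_delta = 3000000000000LL; /* example guest TSC jump         */
	int64_t tsc_khz   = 1000000;         /* example virtual_tsc_khz, 1 GHz */
	int64_t quotient  = (tsc_delta * 1000) / tsc_khz;

	/* quotient = 3,000,000,000 microseconds: too large for a signed
	 * 32-bit register, so the real idivl would fault and the fixup
	 * path reports the difference as "larger than USEC_PER_SEC". */
	printf("quotient=%lld fits32=%d\n", (long long)quotient,
	       quotient >= INT32_MIN && quotient <= INT32_MAX);
	return 0;
}
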
@@ -1588,6 +1604,30 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1588 return 0; 1604 return 0;
1589} 1605}
1590 1606
1607/*
1608 * kvmclock updates which are isolated to a given vcpu, such as
1609 * vcpu->cpu migration, should not allow system_timestamp from
1610 * the rest of the vcpus to remain static. Otherwise ntp frequency
1611 * correction applies to one vcpu's system_timestamp but not
1612 * the others.
1613 *
1614 * So in those cases, request a kvmclock update for all vcpus.
1615 * The worst case for a remote vcpu to update its kvmclock
1616 * is then bounded by maximum nohz sleep latency.
1617 */
1618
1619static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
1620{
1621 int i;
1622 struct kvm *kvm = v->kvm;
1623 struct kvm_vcpu *vcpu;
1624
1625 kvm_for_each_vcpu(i, vcpu, kvm) {
1626 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
1627 kvm_vcpu_kick(vcpu);
1628 }
1629}
1630
1591static bool msr_mtrr_valid(unsigned msr) 1631static bool msr_mtrr_valid(unsigned msr)
1592{ 1632{
1593 switch (msr) { 1633 switch (msr) {
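
kvm_gen_kvmclock_update() is the handler for the new KVM_REQ_GLOBAL_CLOCK_UPDATE request that the later hunks raise from the kvmclock MSR write path and from vcpu load: one vcpu consumes the global request and fans it out as an ordinary clock update to every vcpu. A toy model of that flow; the bit numbers and two-vcpu array are illustrative only:

#include <stdio.h>

#define REQ_CLOCK_UPDATE        (1u << 0) /* illustrative bit numbers */
#define REQ_GLOBAL_CLOCK_UPDATE (1u << 1)

int main(void)
{
	unsigned int requests[2] = { REQ_GLOBAL_CLOCK_UPDATE, 0 };

	/* vcpu_enter_guest() on vcpu 0: consume the global request ... */
	if (requests[0] & REQ_GLOBAL_CLOCK_UPDATE) {
		requests[0] &= ~REQ_GLOBAL_CLOCK_UPDATE;
		/* ... and broadcast a plain clock update to every vcpu,
		 * so ntp frequency correction reaches all of them. */
		for (int i = 0; i < 2; i++)
			requests[i] |= REQ_CLOCK_UPDATE;
	}

	printf("vcpu0=%#x vcpu1=%#x\n", requests[0], requests[1]); /* 0x1 0x1 */
	return 0;
}
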
@@ -1985,7 +2025,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1985 kvmclock_reset(vcpu); 2025 kvmclock_reset(vcpu);
1986 2026
1987 vcpu->arch.time = data; 2027 vcpu->arch.time = data;
1988 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2028 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
1989 2029
1990 /* we verify if the enable bit is set... */ 2030 /* we verify if the enable bit is set... */
1991 if (!(data & 1)) 2031 if (!(data & 1))
@@ -2702,7 +2742,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2702 * kvmclock on vcpu->cpu migration 2742 * kvmclock on vcpu->cpu migration
2703 */ 2743 */
2704 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) 2744 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
2705 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2745 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2706 if (vcpu->cpu != cpu) 2746 if (vcpu->cpu != cpu)
2707 kvm_migrate_timers(vcpu); 2747 kvm_migrate_timers(vcpu);
2708 vcpu->cpu = cpu; 2748 vcpu->cpu = cpu;
@@ -5239,7 +5279,13 @@ static void kvm_set_mmio_spte_mask(void)
5239 * Set the reserved bits and the present bit of an paging-structure 5279 * Set the reserved bits and the present bit of an paging-structure
5240 * entry to generate page fault with PFER.RSV = 1. 5280 * entry to generate page fault with PFER.RSV = 1.
5241 */ 5281 */
5242 mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; 5282 /* Mask the reserved physical address bits. */
5283 mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
5284
5285 /* Bit 62 is always reserved for 32bit host. */
5286 mask |= 0x3ull << 62;
5287
5288 /* Set the present bit. */
5243 mask |= 1ull; 5289 mask |= 1ull;
5244 5290
5245#ifdef CONFIG_X86_64 5291#ifdef CONFIG_X86_64
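
With the split above, the host-side MMIO mask is built from three pieces: the reserved physical-address bits up to bit 51, bits 63:62 (bit 62 being reserved even on 32-bit hosts), and the present bit. A worked value assuming maxphyaddr = 40, which is an example and not taken from the patch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int maxphyaddr = 40; /* example value only */
	uint64_t mask;

	mask  = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
	mask |= 0x3ull << 62; /* bits 63:62  */
	mask |= 1ull;         /* present bit */

	printf("mask=%#llx\n", (unsigned long long)mask);
	/* prints mask=0xc00fff0000000001: PA bits 51:40, bits 63:62, bit 0 */
	return 0;
}
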
@@ -5499,13 +5545,6 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5499 char instruction[3]; 5545 char instruction[3];
5500 unsigned long rip = kvm_rip_read(vcpu); 5546 unsigned long rip = kvm_rip_read(vcpu);
5501 5547
5502 /*
5503 * Blow out the MMU to ensure that no other VCPU has an active mapping
5504 * to ensure that the updated hypercall appears atomically across all
5505 * VCPUs.
5506 */
5507 kvm_mmu_zap_all(vcpu->kvm);
5508
5509 kvm_x86_ops->patch_hypercall(vcpu, instruction); 5548 kvm_x86_ops->patch_hypercall(vcpu, instruction);
5510 5549
5511 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); 5550 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
@@ -5703,6 +5742,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5703 __kvm_migrate_timers(vcpu); 5742 __kvm_migrate_timers(vcpu);
5704 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) 5743 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
5705 kvm_gen_update_masterclock(vcpu->kvm); 5744 kvm_gen_update_masterclock(vcpu->kvm);
5745 if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
5746 kvm_gen_kvmclock_update(vcpu);
5706 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { 5747 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
5707 r = kvm_guest_time_update(vcpu); 5748 r = kvm_guest_time_update(vcpu);
5708 if (unlikely(r)) 5749 if (unlikely(r))
@@ -6813,6 +6854,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
6813 return -EINVAL; 6854 return -EINVAL;
6814 6855
6815 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6856 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
6857 INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
6816 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6858 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
6817 6859
6818 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6860 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -7041,22 +7083,18 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7041 * If memory slot is created, or moved, we need to clear all 7083 * If memory slot is created, or moved, we need to clear all
7042 * mmio sptes. 7084 * mmio sptes.
7043 */ 7085 */
7044 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 7086 kvm_mmu_invalidate_mmio_sptes(kvm);
7045 kvm_mmu_zap_mmio_sptes(kvm);
7046 kvm_reload_remote_mmus(kvm);
7047 }
7048} 7087}
7049 7088
7050void kvm_arch_flush_shadow_all(struct kvm *kvm) 7089void kvm_arch_flush_shadow_all(struct kvm *kvm)
7051{ 7090{
7052 kvm_mmu_zap_all(kvm); 7091 kvm_mmu_invalidate_zap_all_pages(kvm);
7053 kvm_reload_remote_mmus(kvm);
7054} 7092}
7055 7093
7056void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 7094void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
7057 struct kvm_memory_slot *slot) 7095 struct kvm_memory_slot *slot)
7058{ 7096{
7059 kvm_arch_flush_shadow_all(kvm); 7097 kvm_mmu_invalidate_zap_all_pages(kvm);
7060} 7098}
7061 7099
7062int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 7100int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
@@ -7264,3 +7302,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
7264EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 7302EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
7265EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 7303EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
7266EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); 7304EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
7305EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);