path: root/arch/x86/kvm
author     Linus Torvalds <torvalds@linux-foundation.org>   2012-05-24 19:17:30 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-05-24 19:17:30 -0400
commit     07acfc2a9349a8ce45b236c2624dad452001966b (patch)
tree       c40f3eaac18a8320e65af220979223b5cd632b1b /arch/x86/kvm
parent     b5f4035adfffbcc6b478de5b8c44b618b3124aff (diff)
parent     322728e55aa7834e2fab2786b76df183c4843a12 (diff)
Merge branch 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM changes from Avi Kivity:
 "Changes include additional instruction emulation, page-crossing MMIO,
  faster dirty logging, preventing the watchdog from killing a stopped
  guest, module autoload, a new MSI ABI, and some minor optimizations
  and fixes.  Outside x86 we have a small s390 and a very large ppc
  update.

  Regarding the new (for kvm) rebaseless workflow, some of the patches
  that were merged before we switched trees had to be rebased, while
  others are true pulls.  In either case the signoffs should be correct
  now."

Fix up trivial conflicts in Documentation/feature-removal-schedule.txt,
arch/powerpc/kvm/book3s_segment.S and arch/x86/include/asm/kvm_para.h.

I suspect the kvm_para.h resolution ends up doing the "do I have cpuid"
check effectively twice (it was done differently in two different
commits), but better safe than sorry ;)

* 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (125 commits)
  KVM: make asm-generic/kvm_para.h have an ifdef __KERNEL__ block
  KVM: s390: onereg for timer related registers
  KVM: s390: epoch difference and TOD programmable field
  KVM: s390: KVM_GET/SET_ONEREG for s390
  KVM: s390: add capability indicating COW support
  KVM: Fix mmu_reload() clash with nested vmx event injection
  KVM: MMU: Don't use RCU for lockless shadow walking
  KVM: VMX: Optimize %ds, %es reload
  KVM: VMX: Fix %ds/%es clobber
  KVM: x86 emulator: convert bsf/bsr instructions to emulate_2op_SrcV_nobyte()
  KVM: VMX: unlike vmcs on fail path
  KVM: PPC: Emulator: clean up SPR reads and writes
  KVM: PPC: Emulator: clean up instruction parsing
  kvm/powerpc: Add new ioctl to retreive server MMU infos
  kvm/book3s: Make kernel emulated H_PUT_TCE available for "PR" KVM
  KVM: PPC: bookehv: Fix r8/r13 storing in level exception handler
  KVM: PPC: Book3S: Enable IRQs during exit handling
  KVM: PPC: Fix PR KVM on POWER7 bare metal
  KVM: PPC: Fix stbux emulation
  KVM: PPC: bookehv: Use lwz/stw instead of PPC_LL/PPC_STL for 32-bit fields
  ...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--   arch/x86/kvm/Kconfig          1
-rw-r--r--   arch/x86/kvm/cpuid.c          5
-rw-r--r--   arch/x86/kvm/emulate.c      293
-rw-r--r--   arch/x86/kvm/i8254.c         31
-rw-r--r--   arch/x86/kvm/i8254.h          7
-rw-r--r--   arch/x86/kvm/lapic.c         31
-rw-r--r--   arch/x86/kvm/mmu.c          345
-rw-r--r--   arch/x86/kvm/mmu_audit.c     10
-rw-r--r--   arch/x86/kvm/paging_tmpl.h    2
-rw-r--r--   arch/x86/kvm/svm.c            9
-rw-r--r--   arch/x86/kvm/vmx.c           41
-rw-r--r--   arch/x86/kvm/x86.c          280
-rw-r--r--   arch/x86/kvm/x86.h            2
13 files changed, 649 insertions, 408 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 1a7fe868f37..a28f338843e 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -36,6 +36,7 @@ config KVM
36 select TASKSTATS 36 select TASKSTATS
37 select TASK_DELAY_ACCT 37 select TASK_DELAY_ACCT
38 select PERF_EVENTS 38 select PERF_EVENTS
39 select HAVE_KVM_MSI
39 ---help--- 40 ---help---
40 Support hosting fully virtualized guest machines using hardware 41 Support hosting fully virtualized guest machines using hardware
41 virtualization extensions. You will need a fairly recent 42 virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9fed5bedaad..7df1c6d839f 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -247,7 +247,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
247 247
248 /* cpuid 7.0.ebx */ 248 /* cpuid 7.0.ebx */
249 const u32 kvm_supported_word9_x86_features = 249 const u32 kvm_supported_word9_x86_features =
250 F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS); 250 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
251 F(BMI2) | F(ERMS) | F(RTM);
251 252
252 /* all calls to cpuid_count() should be made on the same cpu */ 253 /* all calls to cpuid_count() should be made on the same cpu */
253 get_cpu(); 254 get_cpu();
@@ -397,7 +398,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
397 case KVM_CPUID_SIGNATURE: { 398 case KVM_CPUID_SIGNATURE: {
398 char signature[12] = "KVMKVMKVM\0\0"; 399 char signature[12] = "KVMKVMKVM\0\0";
399 u32 *sigptr = (u32 *)signature; 400 u32 *sigptr = (u32 *)signature;
400 entry->eax = 0; 401 entry->eax = KVM_CPUID_FEATURES;
401 entry->ebx = sigptr[0]; 402 entry->ebx = sigptr[0];
402 entry->ecx = sigptr[1]; 403 entry->ecx = sigptr[1];
403 entry->edx = sigptr[2]; 404 entry->edx = sigptr[2];
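
The KVM_CPUID_SIGNATURE change above makes EAX of leaf 0x40000000 report the top paravirt leaf (KVM_CPUID_FEATURES) instead of 0, so guests can bound their feature probing. A minimal guest-side probe of that leaf might look like this (user-space sketch using GCC's <cpuid.h>; run it inside a KVM guest to see the "KVMKVMKVM" signature):

#include <stdio.h>
#include <string.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	char sig[13];

	/* Hypervisor signature leaf; KVM fills EBX/ECX/EDX with "KVMKVMKVM"
	 * and, after the patch above, EAX with KVM_CPUID_FEATURES. */
	__cpuid(0x40000000, eax, ebx, ecx, edx);
	memcpy(sig + 0, &ebx, 4);
	memcpy(sig + 4, &ecx, 4);
	memcpy(sig + 8, &edx, 4);
	sig[12] = '\0';

	printf("hypervisor signature: %s\n", sig);
	printf("max paravirt leaf:    %#x\n", eax);
	return 0;
}
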
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 83756223f8a..f95d242ee9f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -142,6 +142,10 @@
142#define Src2FS (OpFS << Src2Shift) 142#define Src2FS (OpFS << Src2Shift)
143#define Src2GS (OpGS << Src2Shift) 143#define Src2GS (OpGS << Src2Shift)
144#define Src2Mask (OpMask << Src2Shift) 144#define Src2Mask (OpMask << Src2Shift)
145#define Mmx ((u64)1 << 40) /* MMX Vector instruction */
146#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
147#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */
148#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */
145 149
146#define X2(x...) x, x 150#define X2(x...) x, x
147#define X3(x...) X2(x), x 151#define X3(x...) X2(x), x
@@ -557,6 +561,29 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
557 ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); 561 ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
558} 562}
559 563
564/*
565 * x86 defines three classes of vector instructions: explicitly
566 * aligned, explicitly unaligned, and the rest, which change behaviour
567 * depending on whether they're AVX encoded or not.
568 *
569 * Also included is CMPXCHG16B which is not a vector instruction, yet it is
570 * subject to the same check.
571 */
572static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size)
573{
574 if (likely(size < 16))
575 return false;
576
577 if (ctxt->d & Aligned)
578 return true;
579 else if (ctxt->d & Unaligned)
580 return false;
581 else if (ctxt->d & Avx)
582 return false;
583 else
584 return true;
585}
586
560static int __linearize(struct x86_emulate_ctxt *ctxt, 587static int __linearize(struct x86_emulate_ctxt *ctxt,
561 struct segmented_address addr, 588 struct segmented_address addr,
562 unsigned size, bool write, bool fetch, 589 unsigned size, bool write, bool fetch,
@@ -621,6 +648,8 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
621 } 648 }
622 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8) 649 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
623 la &= (u32)-1; 650 la &= (u32)-1;
651 if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
652 return emulate_gp(ctxt, 0);
624 *linear = la; 653 *linear = la;
625 return X86EMUL_CONTINUE; 654 return X86EMUL_CONTINUE;
626bad: 655bad:
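
The new insn_aligned()/__linearize() logic above injects #GP(0) for misaligned 16-byte vector accesses unless the instruction is explicitly unaligned or AVX-encoded. A standalone sketch of the same decision (the hypothetical F_* flags stand in for the emulator's Aligned/Unaligned/Avx bits in ctxt->d):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define F_ALIGNED   (1u << 0)	/* e.g. MOVDQA */
#define F_UNALIGNED (1u << 1)	/* e.g. MOVDQU */
#define F_AVX       (1u << 2)	/* VEX-encoded */

/* Mirrors insn_aligned(): only 16-byte accesses are checked; explicitly
 * unaligned and AVX-encoded forms are exempt, everything else (including
 * F_ALIGNED) must be naturally aligned. */
static bool needs_alignment(uint32_t flags, unsigned size)
{
	if (size < 16)
		return false;
	if (flags & F_UNALIGNED)
		return false;
	if (flags & F_AVX)
		return false;
	return true;
}

/* Mirrors the new check in __linearize(): misaligned access -> #GP(0). */
static bool raises_gp(uint32_t flags, uint64_t la, unsigned size)
{
	return needs_alignment(flags, size) && (la & (size - 1)) != 0;
}

int main(void)
{
	printf("movdqa-style store to 0x1008: #GP=%d\n",
	       raises_gp(F_ALIGNED, 0x1008, 16));
	printf("movdqu-style store to 0x1008: #GP=%d\n",
	       raises_gp(F_UNALIGNED, 0x1008, 16));
	return 0;
}
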
@@ -859,6 +888,40 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
859 ctxt->ops->put_fpu(ctxt); 888 ctxt->ops->put_fpu(ctxt);
860} 889}
861 890
891static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
892{
893 ctxt->ops->get_fpu(ctxt);
894 switch (reg) {
895 case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
896 case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
897 case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
898 case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
899 case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
900 case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
901 case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
902 case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
903 default: BUG();
904 }
905 ctxt->ops->put_fpu(ctxt);
906}
907
908static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
909{
910 ctxt->ops->get_fpu(ctxt);
911 switch (reg) {
912 case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
913 case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
914 case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
915 case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
916 case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
917 case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
918 case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
919 case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
920 default: BUG();
921 }
922 ctxt->ops->put_fpu(ctxt);
923}
924
862static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 925static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
863 struct operand *op) 926 struct operand *op)
864{ 927{
@@ -875,6 +938,13 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
875 read_sse_reg(ctxt, &op->vec_val, reg); 938 read_sse_reg(ctxt, &op->vec_val, reg);
876 return; 939 return;
877 } 940 }
941 if (ctxt->d & Mmx) {
942 reg &= 7;
943 op->type = OP_MM;
944 op->bytes = 8;
945 op->addr.mm = reg;
946 return;
947 }
878 948
879 op->type = OP_REG; 949 op->type = OP_REG;
880 if (ctxt->d & ByteOp) { 950 if (ctxt->d & ByteOp) {
@@ -902,7 +972,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
902 ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ 972 ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
903 } 973 }
904 974
905 ctxt->modrm = insn_fetch(u8, ctxt);
906 ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; 975 ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
907 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; 976 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
908 ctxt->modrm_rm |= (ctxt->modrm & 0x07); 977 ctxt->modrm_rm |= (ctxt->modrm & 0x07);
@@ -920,6 +989,12 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
920 read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); 989 read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
921 return rc; 990 return rc;
922 } 991 }
992 if (ctxt->d & Mmx) {
993 op->type = OP_MM;
994 op->bytes = 8;
995 op->addr.xmm = ctxt->modrm_rm & 7;
996 return rc;
997 }
923 fetch_register_operand(op); 998 fetch_register_operand(op);
924 return rc; 999 return rc;
925 } 1000 }
@@ -1387,6 +1462,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1387 case OP_XMM: 1462 case OP_XMM:
1388 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); 1463 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
1389 break; 1464 break;
1465 case OP_MM:
1466 write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm);
1467 break;
1390 case OP_NONE: 1468 case OP_NONE:
1391 /* no writeback */ 1469 /* no writeback */
1392 break; 1470 break;
@@ -2790,7 +2868,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
2790 2868
2791static int em_mov(struct x86_emulate_ctxt *ctxt) 2869static int em_mov(struct x86_emulate_ctxt *ctxt)
2792{ 2870{
2793 ctxt->dst.val = ctxt->src.val; 2871 memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
2794 return X86EMUL_CONTINUE; 2872 return X86EMUL_CONTINUE;
2795} 2873}
2796 2874
@@ -2870,12 +2948,6 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
2870 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); 2948 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
2871} 2949}
2872 2950
2873static int em_movdqu(struct x86_emulate_ctxt *ctxt)
2874{
2875 memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
2876 return X86EMUL_CONTINUE;
2877}
2878
2879static int em_invlpg(struct x86_emulate_ctxt *ctxt) 2951static int em_invlpg(struct x86_emulate_ctxt *ctxt)
2880{ 2952{
2881 int rc; 2953 int rc;
@@ -3061,35 +3133,13 @@ static int em_btc(struct x86_emulate_ctxt *ctxt)
3061 3133
3062static int em_bsf(struct x86_emulate_ctxt *ctxt) 3134static int em_bsf(struct x86_emulate_ctxt *ctxt)
3063{ 3135{
3064 u8 zf; 3136 emulate_2op_SrcV_nobyte(ctxt, "bsf");
3065
3066 __asm__ ("bsf %2, %0; setz %1"
3067 : "=r"(ctxt->dst.val), "=q"(zf)
3068 : "r"(ctxt->src.val));
3069
3070 ctxt->eflags &= ~X86_EFLAGS_ZF;
3071 if (zf) {
3072 ctxt->eflags |= X86_EFLAGS_ZF;
3073 /* Disable writeback. */
3074 ctxt->dst.type = OP_NONE;
3075 }
3076 return X86EMUL_CONTINUE; 3137 return X86EMUL_CONTINUE;
3077} 3138}
3078 3139
3079static int em_bsr(struct x86_emulate_ctxt *ctxt) 3140static int em_bsr(struct x86_emulate_ctxt *ctxt)
3080{ 3141{
3081 u8 zf; 3142 emulate_2op_SrcV_nobyte(ctxt, "bsr");
3082
3083 __asm__ ("bsr %2, %0; setz %1"
3084 : "=r"(ctxt->dst.val), "=q"(zf)
3085 : "r"(ctxt->src.val));
3086
3087 ctxt->eflags &= ~X86_EFLAGS_ZF;
3088 if (zf) {
3089 ctxt->eflags |= X86_EFLAGS_ZF;
3090 /* Disable writeback. */
3091 ctxt->dst.type = OP_NONE;
3092 }
3093 return X86EMUL_CONTINUE; 3143 return X86EMUL_CONTINUE;
3094} 3144}
3095 3145
@@ -3286,8 +3336,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3286 .check_perm = (_p) } 3336 .check_perm = (_p) }
3287#define N D(0) 3337#define N D(0)
3288#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } 3338#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
3289#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } 3339#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
3290#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) } 3340#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
3291#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } 3341#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
3292#define II(_f, _e, _i) \ 3342#define II(_f, _e, _i) \
3293 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } 3343 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
@@ -3307,25 +3357,25 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3307 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) 3357 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
3308 3358
3309static struct opcode group7_rm1[] = { 3359static struct opcode group7_rm1[] = {
3310 DI(SrcNone | ModRM | Priv, monitor), 3360 DI(SrcNone | Priv, monitor),
3311 DI(SrcNone | ModRM | Priv, mwait), 3361 DI(SrcNone | Priv, mwait),
3312 N, N, N, N, N, N, 3362 N, N, N, N, N, N,
3313}; 3363};
3314 3364
3315static struct opcode group7_rm3[] = { 3365static struct opcode group7_rm3[] = {
3316 DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), 3366 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
3317 II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall), 3367 II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall),
3318 DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), 3368 DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
3319 DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), 3369 DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
3320 DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), 3370 DIP(SrcNone | Prot | Priv, stgi, check_svme),
3321 DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme), 3371 DIP(SrcNone | Prot | Priv, clgi, check_svme),
3322 DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme), 3372 DIP(SrcNone | Prot | Priv, skinit, check_svme),
3323 DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), 3373 DIP(SrcNone | Prot | Priv, invlpga, check_svme),
3324}; 3374};
3325 3375
3326static struct opcode group7_rm7[] = { 3376static struct opcode group7_rm7[] = {
3327 N, 3377 N,
3328 DIP(SrcNone | ModRM, rdtscp, check_rdtsc), 3378 DIP(SrcNone, rdtscp, check_rdtsc),
3329 N, N, N, N, N, N, 3379 N, N, N, N, N, N,
3330}; 3380};
3331 3381
@@ -3341,81 +3391,86 @@ static struct opcode group1[] = {
3341}; 3391};
3342 3392
3343static struct opcode group1A[] = { 3393static struct opcode group1A[] = {
3344 I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N, 3394 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
3345}; 3395};
3346 3396
3347static struct opcode group3[] = { 3397static struct opcode group3[] = {
3348 I(DstMem | SrcImm | ModRM, em_test), 3398 I(DstMem | SrcImm, em_test),
3349 I(DstMem | SrcImm | ModRM, em_test), 3399 I(DstMem | SrcImm, em_test),
3350 I(DstMem | SrcNone | ModRM | Lock, em_not), 3400 I(DstMem | SrcNone | Lock, em_not),
3351 I(DstMem | SrcNone | ModRM | Lock, em_neg), 3401 I(DstMem | SrcNone | Lock, em_neg),
3352 I(SrcMem | ModRM, em_mul_ex), 3402 I(SrcMem, em_mul_ex),
3353 I(SrcMem | ModRM, em_imul_ex), 3403 I(SrcMem, em_imul_ex),
3354 I(SrcMem | ModRM, em_div_ex), 3404 I(SrcMem, em_div_ex),
3355 I(SrcMem | ModRM, em_idiv_ex), 3405 I(SrcMem, em_idiv_ex),
3356}; 3406};
3357 3407
3358static struct opcode group4[] = { 3408static struct opcode group4[] = {
3359 I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), 3409 I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
3360 I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), 3410 I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
3361 N, N, N, N, N, N, 3411 N, N, N, N, N, N,
3362}; 3412};
3363 3413
3364static struct opcode group5[] = { 3414static struct opcode group5[] = {
3365 I(DstMem | SrcNone | ModRM | Lock, em_grp45), 3415 I(DstMem | SrcNone | Lock, em_grp45),
3366 I(DstMem | SrcNone | ModRM | Lock, em_grp45), 3416 I(DstMem | SrcNone | Lock, em_grp45),
3367 I(SrcMem | ModRM | Stack, em_grp45), 3417 I(SrcMem | Stack, em_grp45),
3368 I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), 3418 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
3369 I(SrcMem | ModRM | Stack, em_grp45), 3419 I(SrcMem | Stack, em_grp45),
3370 I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45), 3420 I(SrcMemFAddr | ImplicitOps, em_grp45),
3371 I(SrcMem | ModRM | Stack, em_grp45), N, 3421 I(SrcMem | Stack, em_grp45), N,
3372}; 3422};
3373 3423
3374static struct opcode group6[] = { 3424static struct opcode group6[] = {
3375 DI(ModRM | Prot, sldt), 3425 DI(Prot, sldt),
3376 DI(ModRM | Prot, str), 3426 DI(Prot, str),
3377 DI(ModRM | Prot | Priv, lldt), 3427 DI(Prot | Priv, lldt),
3378 DI(ModRM | Prot | Priv, ltr), 3428 DI(Prot | Priv, ltr),
3379 N, N, N, N, 3429 N, N, N, N,
3380}; 3430};
3381 3431
3382static struct group_dual group7 = { { 3432static struct group_dual group7 = { {
3383 DI(ModRM | Mov | DstMem | Priv, sgdt), 3433 DI(Mov | DstMem | Priv, sgdt),
3384 DI(ModRM | Mov | DstMem | Priv, sidt), 3434 DI(Mov | DstMem | Priv, sidt),
3385 II(ModRM | SrcMem | Priv, em_lgdt, lgdt), 3435 II(SrcMem | Priv, em_lgdt, lgdt),
3386 II(ModRM | SrcMem | Priv, em_lidt, lidt), 3436 II(SrcMem | Priv, em_lidt, lidt),
3387 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, 3437 II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
3388 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), 3438 II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
3389 II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg), 3439 II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
3390}, { 3440}, {
3391 I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall), 3441 I(SrcNone | Priv | VendorSpecific, em_vmcall),
3392 EXT(0, group7_rm1), 3442 EXT(0, group7_rm1),
3393 N, EXT(0, group7_rm3), 3443 N, EXT(0, group7_rm3),
3394 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, 3444 II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
3395 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), 3445 II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
3446 EXT(0, group7_rm7),
3396} }; 3447} };
3397 3448
3398static struct opcode group8[] = { 3449static struct opcode group8[] = {
3399 N, N, N, N, 3450 N, N, N, N,
3400 I(DstMem | SrcImmByte | ModRM, em_bt), 3451 I(DstMem | SrcImmByte, em_bt),
3401 I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts), 3452 I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
3402 I(DstMem | SrcImmByte | ModRM | Lock, em_btr), 3453 I(DstMem | SrcImmByte | Lock, em_btr),
3403 I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc), 3454 I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
3404}; 3455};
3405 3456
3406static struct group_dual group9 = { { 3457static struct group_dual group9 = { {
3407 N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, 3458 N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
3408}, { 3459}, {
3409 N, N, N, N, N, N, N, N, 3460 N, N, N, N, N, N, N, N,
3410} }; 3461} };
3411 3462
3412static struct opcode group11[] = { 3463static struct opcode group11[] = {
3413 I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov), 3464 I(DstMem | SrcImm | Mov | PageTable, em_mov),
3414 X7(D(Undefined)), 3465 X7(D(Undefined)),
3415}; 3466};
3416 3467
3417static struct gprefix pfx_0f_6f_0f_7f = { 3468static struct gprefix pfx_0f_6f_0f_7f = {
3418 N, N, N, I(Sse, em_movdqu), 3469 I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
3470};
3471
3472static struct gprefix pfx_vmovntpx = {
3473 I(0, em_mov), N, N, N,
3419}; 3474};
3420 3475
3421static struct opcode opcode_table[256] = { 3476static struct opcode opcode_table[256] = {
@@ -3464,10 +3519,10 @@ static struct opcode opcode_table[256] = {
3464 /* 0x70 - 0x7F */ 3519 /* 0x70 - 0x7F */
3465 X16(D(SrcImmByte)), 3520 X16(D(SrcImmByte)),
3466 /* 0x80 - 0x87 */ 3521 /* 0x80 - 0x87 */
3467 G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), 3522 G(ByteOp | DstMem | SrcImm, group1),
3468 G(DstMem | SrcImm | ModRM | Group, group1), 3523 G(DstMem | SrcImm, group1),
3469 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), 3524 G(ByteOp | DstMem | SrcImm | No64, group1),
3470 G(DstMem | SrcImmByte | ModRM | Group, group1), 3525 G(DstMem | SrcImmByte, group1),
3471 I2bv(DstMem | SrcReg | ModRM, em_test), 3526 I2bv(DstMem | SrcReg | ModRM, em_test),
3472 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), 3527 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
3473 /* 0x88 - 0x8F */ 3528 /* 0x88 - 0x8F */
@@ -3549,7 +3604,8 @@ static struct opcode twobyte_table[256] = {
3549 IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), 3604 IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
3550 IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), 3605 IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
3551 N, N, N, N, 3606 N, N, N, N,
3552 N, N, N, N, N, N, N, N, 3607 N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
3608 N, N, N, N,
3553 /* 0x30 - 0x3F */ 3609 /* 0x30 - 0x3F */
3554 II(ImplicitOps | Priv, em_wrmsr, wrmsr), 3610 II(ImplicitOps | Priv, em_wrmsr, wrmsr),
3555 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), 3611 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
@@ -3897,17 +3953,16 @@ done_prefixes:
3897 } 3953 }
3898 ctxt->d = opcode.flags; 3954 ctxt->d = opcode.flags;
3899 3955
3956 if (ctxt->d & ModRM)
3957 ctxt->modrm = insn_fetch(u8, ctxt);
3958
3900 while (ctxt->d & GroupMask) { 3959 while (ctxt->d & GroupMask) {
3901 switch (ctxt->d & GroupMask) { 3960 switch (ctxt->d & GroupMask) {
3902 case Group: 3961 case Group:
3903 ctxt->modrm = insn_fetch(u8, ctxt);
3904 --ctxt->_eip;
3905 goffset = (ctxt->modrm >> 3) & 7; 3962 goffset = (ctxt->modrm >> 3) & 7;
3906 opcode = opcode.u.group[goffset]; 3963 opcode = opcode.u.group[goffset];
3907 break; 3964 break;
3908 case GroupDual: 3965 case GroupDual:
3909 ctxt->modrm = insn_fetch(u8, ctxt);
3910 --ctxt->_eip;
3911 goffset = (ctxt->modrm >> 3) & 7; 3966 goffset = (ctxt->modrm >> 3) & 7;
3912 if ((ctxt->modrm >> 6) == 3) 3967 if ((ctxt->modrm >> 6) == 3)
3913 opcode = opcode.u.gdual->mod3[goffset]; 3968 opcode = opcode.u.gdual->mod3[goffset];
@@ -3960,6 +4015,8 @@ done_prefixes:
3960 4015
3961 if (ctxt->d & Sse) 4016 if (ctxt->d & Sse)
3962 ctxt->op_bytes = 16; 4017 ctxt->op_bytes = 16;
4018 else if (ctxt->d & Mmx)
4019 ctxt->op_bytes = 8;
3963 4020
3964 /* ModRM and SIB bytes. */ 4021 /* ModRM and SIB bytes. */
3965 if (ctxt->d & ModRM) { 4022 if (ctxt->d & ModRM) {
@@ -4030,6 +4087,35 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
4030 return false; 4087 return false;
4031} 4088}
4032 4089
4090static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
4091{
4092 bool fault = false;
4093
4094 ctxt->ops->get_fpu(ctxt);
4095 asm volatile("1: fwait \n\t"
4096 "2: \n\t"
4097 ".pushsection .fixup,\"ax\" \n\t"
4098 "3: \n\t"
4099 "movb $1, %[fault] \n\t"
4100 "jmp 2b \n\t"
4101 ".popsection \n\t"
4102 _ASM_EXTABLE(1b, 3b)
4103 : [fault]"+qm"(fault));
4104 ctxt->ops->put_fpu(ctxt);
4105
4106 if (unlikely(fault))
4107 return emulate_exception(ctxt, MF_VECTOR, 0, false);
4108
4109 return X86EMUL_CONTINUE;
4110}
4111
4112static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
4113 struct operand *op)
4114{
4115 if (op->type == OP_MM)
4116 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
4117}
4118
4033int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) 4119int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4034{ 4120{
4035 struct x86_emulate_ops *ops = ctxt->ops; 4121 struct x86_emulate_ops *ops = ctxt->ops;
@@ -4054,18 +4140,31 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4054 goto done; 4140 goto done;
4055 } 4141 }
4056 4142
4057 if ((ctxt->d & Sse) 4143 if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
4058 && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) 4144 || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
4059 || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
4060 rc = emulate_ud(ctxt); 4145 rc = emulate_ud(ctxt);
4061 goto done; 4146 goto done;
4062 } 4147 }
4063 4148
4064 if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { 4149 if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
4065 rc = emulate_nm(ctxt); 4150 rc = emulate_nm(ctxt);
4066 goto done; 4151 goto done;
4067 } 4152 }
4068 4153
4154 if (ctxt->d & Mmx) {
4155 rc = flush_pending_x87_faults(ctxt);
4156 if (rc != X86EMUL_CONTINUE)
4157 goto done;
4158 /*
4159 * Now that we know the fpu is exception safe, we can fetch
4160 * operands from it.
4161 */
4162 fetch_possible_mmx_operand(ctxt, &ctxt->src);
4163 fetch_possible_mmx_operand(ctxt, &ctxt->src2);
4164 if (!(ctxt->d & Mov))
4165 fetch_possible_mmx_operand(ctxt, &ctxt->dst);
4166 }
4167
4069 if (unlikely(ctxt->guest_mode) && ctxt->intercept) { 4168 if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
4070 rc = emulator_check_intercept(ctxt, ctxt->intercept, 4169 rc = emulator_check_intercept(ctxt, ctxt->intercept,
4071 X86_ICPT_PRE_EXCEPT); 4170 X86_ICPT_PRE_EXCEPT);
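
Taken together, the Mmx decode flag, read_mmx_reg()/write_mmx_reg() and flush_pending_x87_faults() let the emulator execute MMX instructions on the host FPU while forwarding any pending x87 fault to the guest as #MF. The register round trip, stripped of the kernel-only get_fpu()/put_fpu() bracketing, can be tried in user space (x86 with GCC inline asm; a sketch, not emulator code):

#include <stdint.h>
#include <stdio.h>

/* Read/write one MMX register by number, as read_mmx_reg()/write_mmx_reg()
 * do, minus the FPU ownership handling that only matters in the kernel. */
static void mmx_read(int reg, uint64_t *data)
{
	switch (reg) {
	case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
	case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
	default: *data = 0; break;	/* sketch: only mm0/mm1 handled */
	}
}

static void mmx_write(int reg, const uint64_t *data)
{
	switch (reg) {
	case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
	case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
	}
}

int main(void)
{
	uint64_t v = 0x1122334455667788ull, out = 0;

	mmx_write(0, &v);	/* operand fetch would happen here ...      */
	mmx_read(0, &out);	/* ... and writeback here, as in em_mov()   */
	asm volatile("emms");	/* leave the x87/MMX state clean            */
	printf("mm0 round trip: %#llx\n", (unsigned long long)out);
	return 0;
}
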
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index d68f99df690..adba28f88d1 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -34,7 +34,6 @@
34 34
35#include <linux/kvm_host.h> 35#include <linux/kvm_host.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/workqueue.h>
38 37
39#include "irq.h" 38#include "irq.h"
40#include "i8254.h" 39#include "i8254.h"
@@ -249,7 +248,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
249 /* in this case, we had multiple outstanding pit interrupts 248 /* in this case, we had multiple outstanding pit interrupts
250 * that we needed to inject. Reinject 249 * that we needed to inject. Reinject
251 */ 250 */
252 queue_work(ps->pit->wq, &ps->pit->expired); 251 queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
253 ps->irq_ack = 1; 252 ps->irq_ack = 1;
254 spin_unlock(&ps->inject_lock); 253 spin_unlock(&ps->inject_lock);
255} 254}
@@ -270,7 +269,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
270static void destroy_pit_timer(struct kvm_pit *pit) 269static void destroy_pit_timer(struct kvm_pit *pit)
271{ 270{
272 hrtimer_cancel(&pit->pit_state.pit_timer.timer); 271 hrtimer_cancel(&pit->pit_state.pit_timer.timer);
273 cancel_work_sync(&pit->expired); 272 flush_kthread_work(&pit->expired);
274} 273}
275 274
276static bool kpit_is_periodic(struct kvm_timer *ktimer) 275static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -284,7 +283,7 @@ static struct kvm_timer_ops kpit_ops = {
284 .is_periodic = kpit_is_periodic, 283 .is_periodic = kpit_is_periodic,
285}; 284};
286 285
287static void pit_do_work(struct work_struct *work) 286static void pit_do_work(struct kthread_work *work)
288{ 287{
289 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); 288 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
290 struct kvm *kvm = pit->kvm; 289 struct kvm *kvm = pit->kvm;
@@ -328,7 +327,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
328 327
329 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 328 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
330 atomic_inc(&ktimer->pending); 329 atomic_inc(&ktimer->pending);
331 queue_work(pt->wq, &pt->expired); 330 queue_kthread_work(&pt->worker, &pt->expired);
332 } 331 }
333 332
334 if (ktimer->t_ops->is_periodic(ktimer)) { 333 if (ktimer->t_ops->is_periodic(ktimer)) {
@@ -353,7 +352,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
353 352
354 /* TODO The new value only affected after the retriggered */ 353 /* TODO The new value only affected after the retriggered */
355 hrtimer_cancel(&pt->timer); 354 hrtimer_cancel(&pt->timer);
356 cancel_work_sync(&ps->pit->expired); 355 flush_kthread_work(&ps->pit->expired);
357 pt->period = interval; 356 pt->period = interval;
358 ps->is_periodic = is_period; 357 ps->is_periodic = is_period;
359 358
@@ -669,6 +668,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
669{ 668{
670 struct kvm_pit *pit; 669 struct kvm_pit *pit;
671 struct kvm_kpit_state *pit_state; 670 struct kvm_kpit_state *pit_state;
671 struct pid *pid;
672 pid_t pid_nr;
672 int ret; 673 int ret;
673 674
674 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); 675 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
@@ -685,14 +686,20 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
685 mutex_lock(&pit->pit_state.lock); 686 mutex_lock(&pit->pit_state.lock);
686 spin_lock_init(&pit->pit_state.inject_lock); 687 spin_lock_init(&pit->pit_state.inject_lock);
687 688
688 pit->wq = create_singlethread_workqueue("kvm-pit-wq"); 689 pid = get_pid(task_tgid(current));
689 if (!pit->wq) { 690 pid_nr = pid_vnr(pid);
691 put_pid(pid);
692
693 init_kthread_worker(&pit->worker);
694 pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
695 "kvm-pit/%d", pid_nr);
696 if (IS_ERR(pit->worker_task)) {
690 mutex_unlock(&pit->pit_state.lock); 697 mutex_unlock(&pit->pit_state.lock);
691 kvm_free_irq_source_id(kvm, pit->irq_source_id); 698 kvm_free_irq_source_id(kvm, pit->irq_source_id);
692 kfree(pit); 699 kfree(pit);
693 return NULL; 700 return NULL;
694 } 701 }
695 INIT_WORK(&pit->expired, pit_do_work); 702 init_kthread_work(&pit->expired, pit_do_work);
696 703
697 kvm->arch.vpit = pit; 704 kvm->arch.vpit = pit;
698 pit->kvm = kvm; 705 pit->kvm = kvm;
@@ -736,7 +743,7 @@ fail:
736 kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); 743 kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
737 kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); 744 kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
738 kvm_free_irq_source_id(kvm, pit->irq_source_id); 745 kvm_free_irq_source_id(kvm, pit->irq_source_id);
739 destroy_workqueue(pit->wq); 746 kthread_stop(pit->worker_task);
740 kfree(pit); 747 kfree(pit);
741 return NULL; 748 return NULL;
742} 749}
@@ -756,10 +763,10 @@ void kvm_free_pit(struct kvm *kvm)
756 mutex_lock(&kvm->arch.vpit->pit_state.lock); 763 mutex_lock(&kvm->arch.vpit->pit_state.lock);
757 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 764 timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
758 hrtimer_cancel(timer); 765 hrtimer_cancel(timer);
759 cancel_work_sync(&kvm->arch.vpit->expired); 766 flush_kthread_work(&kvm->arch.vpit->expired);
767 kthread_stop(kvm->arch.vpit->worker_task);
760 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); 768 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
761 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 769 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
762 destroy_workqueue(kvm->arch.vpit->wq);
763 kfree(kvm->arch.vpit); 770 kfree(kvm->arch.vpit);
764 } 771 }
765} 772}
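
The i8254 changes above replace the per-PIT workqueue with a dedicated kthread_worker, so interrupt reinjection runs in an identifiable "kvm-pit/<pid>" thread. The lifecycle reduces to the skeleton below (a sketch of a 3.x-era kernel module using only the helpers that appear in the diff; expired_fn and the "demo-worker" name are placeholders):

#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct kthread_worker worker;
static struct task_struct *worker_task;
static struct kthread_work expired;

static void expired_fn(struct kthread_work *work)
{
	pr_info("pit-style work ran in %s\n", current->comm);
}

static int __init demo_init(void)
{
	init_kthread_worker(&worker);
	worker_task = kthread_run(kthread_worker_fn, &worker, "demo-worker");
	if (IS_ERR(worker_task))
		return PTR_ERR(worker_task);

	init_kthread_work(&expired, expired_fn);
	queue_kthread_work(&worker, &expired);	/* like the hrtimer/ack paths */
	return 0;
}

static void __exit demo_exit(void)
{
	flush_kthread_work(&expired);	/* like destroy_pit_timer() */
	kthread_stop(worker_task);	/* like kvm_free_pit() */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
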
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 51a97426e79..fdf40425ea1 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -1,6 +1,8 @@
1#ifndef __I8254_H 1#ifndef __I8254_H
2#define __I8254_H 2#define __I8254_H
3 3
4#include <linux/kthread.h>
5
4#include "iodev.h" 6#include "iodev.h"
5 7
6struct kvm_kpit_channel_state { 8struct kvm_kpit_channel_state {
@@ -39,8 +41,9 @@ struct kvm_pit {
39 struct kvm_kpit_state pit_state; 41 struct kvm_kpit_state pit_state;
40 int irq_source_id; 42 int irq_source_id;
41 struct kvm_irq_mask_notifier mask_notifier; 43 struct kvm_irq_mask_notifier mask_notifier;
42 struct workqueue_struct *wq; 44 struct kthread_worker worker;
43 struct work_struct expired; 45 struct task_struct *worker_task;
46 struct kthread_work expired;
44}; 47};
45 48
46#define KVM_PIT_BASE_ADDRESS 0x40 49#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 858432287ab..93c15743f1e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -92,6 +92,11 @@ static inline int apic_test_and_clear_vector(int vec, void *bitmap)
92 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 92 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
93} 93}
94 94
95static inline int apic_test_vector(int vec, void *bitmap)
96{
97 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
98}
99
95static inline void apic_set_vector(int vec, void *bitmap) 100static inline void apic_set_vector(int vec, void *bitmap)
96{ 101{
97 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 102 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -480,7 +485,6 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
480static void apic_set_eoi(struct kvm_lapic *apic) 485static void apic_set_eoi(struct kvm_lapic *apic)
481{ 486{
482 int vector = apic_find_highest_isr(apic); 487 int vector = apic_find_highest_isr(apic);
483 int trigger_mode;
484 /* 488 /*
485 * Not every write EOI will has corresponding ISR, 489 * Not every write EOI will has corresponding ISR,
486 * one example is when Kernel check timer on setup_IO_APIC 490 * one example is when Kernel check timer on setup_IO_APIC
@@ -491,12 +495,15 @@ static void apic_set_eoi(struct kvm_lapic *apic)
491 apic_clear_vector(vector, apic->regs + APIC_ISR); 495 apic_clear_vector(vector, apic->regs + APIC_ISR);
492 apic_update_ppr(apic); 496 apic_update_ppr(apic);
493 497
494 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) 498 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
495 trigger_mode = IOAPIC_LEVEL_TRIG; 499 kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
496 else 500 int trigger_mode;
497 trigger_mode = IOAPIC_EDGE_TRIG; 501 if (apic_test_vector(vector, apic->regs + APIC_TMR))
498 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) 502 trigger_mode = IOAPIC_LEVEL_TRIG;
503 else
504 trigger_mode = IOAPIC_EDGE_TRIG;
499 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 505 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
506 }
500 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 507 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
501} 508}
502 509
@@ -1081,6 +1088,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1081 apic_update_ppr(apic); 1088 apic_update_ppr(apic);
1082 1089
1083 vcpu->arch.apic_arb_prio = 0; 1090 vcpu->arch.apic_arb_prio = 0;
1091 vcpu->arch.apic_attention = 0;
1084 1092
1085 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" 1093 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
1086 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, 1094 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
@@ -1280,7 +1288,7 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1280 u32 data; 1288 u32 data;
1281 void *vapic; 1289 void *vapic;
1282 1290
1283 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) 1291 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1284 return; 1292 return;
1285 1293
1286 vapic = kmap_atomic(vcpu->arch.apic->vapic_page); 1294 vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
@@ -1297,7 +1305,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1297 struct kvm_lapic *apic; 1305 struct kvm_lapic *apic;
1298 void *vapic; 1306 void *vapic;
1299 1307
1300 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) 1308 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1301 return; 1309 return;
1302 1310
1303 apic = vcpu->arch.apic; 1311 apic = vcpu->arch.apic;
@@ -1317,10 +1325,11 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1317 1325
1318void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) 1326void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
1319{ 1327{
1320 if (!irqchip_in_kernel(vcpu->kvm))
1321 return;
1322
1323 vcpu->arch.apic->vapic_addr = vapic_addr; 1328 vcpu->arch.apic->vapic_addr = vapic_addr;
1329 if (vapic_addr)
1330 __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
1331 else
1332 __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
1324} 1333}
1325 1334
1326int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1335int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4cb16426884..72102e0ab7c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -135,8 +135,6 @@ module_param(dbg, bool, 0644);
135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
136 | PT64_NX_MASK) 136 | PT64_NX_MASK)
137 137
138#define PTE_LIST_EXT 4
139
140#define ACC_EXEC_MASK 1 138#define ACC_EXEC_MASK 1
141#define ACC_WRITE_MASK PT_WRITABLE_MASK 139#define ACC_WRITE_MASK PT_WRITABLE_MASK
142#define ACC_USER_MASK PT_USER_MASK 140#define ACC_USER_MASK PT_USER_MASK
@@ -151,6 +149,9 @@ module_param(dbg, bool, 0644);
151 149
152#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 150#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
153 151
152/* make pte_list_desc fit well in cache line */
153#define PTE_LIST_EXT 3
154
154struct pte_list_desc { 155struct pte_list_desc {
155 u64 *sptes[PTE_LIST_EXT]; 156 u64 *sptes[PTE_LIST_EXT];
156 struct pte_list_desc *more; 157 struct pte_list_desc *more;
@@ -550,19 +551,29 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
550 551
551static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) 552static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
552{ 553{
553 rcu_read_lock(); 554 /*
554 atomic_inc(&vcpu->kvm->arch.reader_counter); 555 * Prevent page table teardown by making any free-er wait during
555 556 * kvm_flush_remote_tlbs() IPI to all active vcpus.
556 /* Increase the counter before walking shadow page table */ 557 */
557 smp_mb__after_atomic_inc(); 558 local_irq_disable();
559 vcpu->mode = READING_SHADOW_PAGE_TABLES;
560 /*
561 * Make sure a following spte read is not reordered ahead of the write
562 * to vcpu->mode.
563 */
564 smp_mb();
558} 565}
559 566
560static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) 567static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
561{ 568{
562 /* Decrease the counter after walking shadow page table finished */ 569 /*
563 smp_mb__before_atomic_dec(); 570 * Make sure the write to vcpu->mode is not reordered in front of
564 atomic_dec(&vcpu->kvm->arch.reader_counter); 571 * reads to sptes. If it does, kvm_commit_zap_page() can see us
565 rcu_read_unlock(); 572 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
573 */
574 smp_mb();
575 vcpu->mode = OUTSIDE_GUEST_MODE;
576 local_irq_enable();
566} 577}
567 578
568static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 579static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
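
The hunk above drops RCU from the lockless shadow-page walk: the walker publishes vcpu->mode = READING_SHADOW_PAGE_TABLES with interrupts off, and the zapping side may only free pages after kvm_flush_remote_tlbs(), whose IPI cannot complete while any vcpu is still inside a walk. A user-space analogue of that handshake, with C11 atomics and a polling loop standing in for the IPI wait (illustrative only):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Analogue of vcpu->mode (0 = OUTSIDE_GUEST_MODE,
 * 1 = READING_SHADOW_PAGE_TABLES) and of the table being torn down. */
static atomic_int reader_mode;
static _Atomic(int *) table;

static void *lockless_walker(void *arg)
{
	/* walk_shadow_page_lockless_begin(): announce first (seq_cst gives
	 * the full barrier), only then follow the table pointer. */
	atomic_store(&reader_mode, 1);
	int *p = atomic_load(&table);
	printf("walker read %d\n", *p);
	/* walk_shadow_page_lockless_end(). */
	atomic_store(&reader_mode, 0);
	return NULL;
}

int main(void)
{
	int *old = malloc(sizeof(*old)), *new = malloc(sizeof(*new));
	*old = 1;
	*new = 2;
	atomic_store(&table, old);

	pthread_t t;
	pthread_create(&t, NULL, lockless_walker, NULL);

	/* kvm_mmu_commit_zap_page(): unlink the page first ... */
	atomic_store(&table, new);
	/* ... then wait until no vcpu is inside a lockless walk.  In KVM the
	 * kvm_flush_remote_tlbs() IPI provides this wait, because the walker
	 * keeps interrupts disabled for the whole walk. */
	while (atomic_load(&reader_mode))
		usleep(100);
	free(old);		/* now safe to free */

	pthread_join(t, NULL);
	free(new);
	return 0;
}
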
@@ -841,32 +852,6 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
841 return count; 852 return count;
842} 853}
843 854
844static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
845{
846 struct pte_list_desc *desc;
847 u64 *prev_spte;
848 int i;
849
850 if (!*pte_list)
851 return NULL;
852 else if (!(*pte_list & 1)) {
853 if (!spte)
854 return (u64 *)*pte_list;
855 return NULL;
856 }
857 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
858 prev_spte = NULL;
859 while (desc) {
860 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
861 if (prev_spte == spte)
862 return desc->sptes[i];
863 prev_spte = desc->sptes[i];
864 }
865 desc = desc->more;
866 }
867 return NULL;
868}
869
870static void 855static void
871pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, 856pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
872 int i, struct pte_list_desc *prev_desc) 857 int i, struct pte_list_desc *prev_desc)
@@ -987,11 +972,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
987 return pte_list_add(vcpu, spte, rmapp); 972 return pte_list_add(vcpu, spte, rmapp);
988} 973}
989 974
990static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
991{
992 return pte_list_next(rmapp, spte);
993}
994
995static void rmap_remove(struct kvm *kvm, u64 *spte) 975static void rmap_remove(struct kvm *kvm, u64 *spte)
996{ 976{
997 struct kvm_mmu_page *sp; 977 struct kvm_mmu_page *sp;
@@ -1004,106 +984,201 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
1004 pte_list_remove(spte, rmapp); 984 pte_list_remove(spte, rmapp);
1005} 985}
1006 986
987/*
988 * Used by the following functions to iterate through the sptes linked by a
989 * rmap. All fields are private and not assumed to be used outside.
990 */
991struct rmap_iterator {
992 /* private fields */
993 struct pte_list_desc *desc; /* holds the sptep if not NULL */
994 int pos; /* index of the sptep */
995};
996
997/*
998 * Iteration must be started by this function. This should also be used after
999 * removing/dropping sptes from the rmap link because in such cases the
1000 * information in the itererator may not be valid.
1001 *
1002 * Returns sptep if found, NULL otherwise.
1003 */
1004static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
1005{
1006 if (!rmap)
1007 return NULL;
1008
1009 if (!(rmap & 1)) {
1010 iter->desc = NULL;
1011 return (u64 *)rmap;
1012 }
1013
1014 iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
1015 iter->pos = 0;
1016 return iter->desc->sptes[iter->pos];
1017}
1018
1019/*
1020 * Must be used with a valid iterator: e.g. after rmap_get_first().
1021 *
1022 * Returns sptep if found, NULL otherwise.
1023 */
1024static u64 *rmap_get_next(struct rmap_iterator *iter)
1025{
1026 if (iter->desc) {
1027 if (iter->pos < PTE_LIST_EXT - 1) {
1028 u64 *sptep;
1029
1030 ++iter->pos;
1031 sptep = iter->desc->sptes[iter->pos];
1032 if (sptep)
1033 return sptep;
1034 }
1035
1036 iter->desc = iter->desc->more;
1037
1038 if (iter->desc) {
1039 iter->pos = 0;
1040 /* desc->sptes[0] cannot be NULL */
1041 return iter->desc->sptes[iter->pos];
1042 }
1043 }
1044
1045 return NULL;
1046}
1047
1007static void drop_spte(struct kvm *kvm, u64 *sptep) 1048static void drop_spte(struct kvm *kvm, u64 *sptep)
1008{ 1049{
1009 if (mmu_spte_clear_track_bits(sptep)) 1050 if (mmu_spte_clear_track_bits(sptep))
1010 rmap_remove(kvm, sptep); 1051 rmap_remove(kvm, sptep);
1011} 1052}
1012 1053
1013int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, 1054static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
1014 struct kvm_memory_slot *slot)
1015{ 1055{
1016 unsigned long *rmapp; 1056 u64 *sptep;
1017 u64 *spte; 1057 struct rmap_iterator iter;
1018 int i, write_protected = 0; 1058 int write_protected = 0;
1019 1059
1020 rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot); 1060 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1021 spte = rmap_next(rmapp, NULL); 1061 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1022 while (spte) { 1062 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1023 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1063
1024 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 1064 if (!is_writable_pte(*sptep)) {
1025 if (is_writable_pte(*spte)) { 1065 sptep = rmap_get_next(&iter);
1026 mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); 1066 continue;
1027 write_protected = 1;
1028 } 1067 }
1029 spte = rmap_next(rmapp, spte);
1030 }
1031 1068
1032 /* check for huge page mappings */ 1069 if (level == PT_PAGE_TABLE_LEVEL) {
1033 for (i = PT_DIRECTORY_LEVEL; 1070 mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
1034 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 1071 sptep = rmap_get_next(&iter);
1035 rmapp = __gfn_to_rmap(gfn, i, slot); 1072 } else {
1036 spte = rmap_next(rmapp, NULL); 1073 BUG_ON(!is_large_pte(*sptep));
1037 while (spte) { 1074 drop_spte(kvm, sptep);
1038 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1075 --kvm->stat.lpages;
1039 BUG_ON(!is_large_pte(*spte)); 1076 sptep = rmap_get_first(*rmapp, &iter);
1040 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
1041 if (is_writable_pte(*spte)) {
1042 drop_spte(kvm, spte);
1043 --kvm->stat.lpages;
1044 spte = NULL;
1045 write_protected = 1;
1046 }
1047 spte = rmap_next(rmapp, spte);
1048 } 1077 }
1078
1079 write_protected = 1;
1049 } 1080 }
1050 1081
1051 return write_protected; 1082 return write_protected;
1052} 1083}
1053 1084
1085/**
1086 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1087 * @kvm: kvm instance
1088 * @slot: slot to protect
1089 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1090 * @mask: indicates which pages we should protect
1091 *
1092 * Used when we do not need to care about huge page mappings: e.g. during dirty
1093 * logging we do not have any such mappings.
1094 */
1095void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1096 struct kvm_memory_slot *slot,
1097 gfn_t gfn_offset, unsigned long mask)
1098{
1099 unsigned long *rmapp;
1100
1101 while (mask) {
1102 rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
1103 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
1104
1105 /* clear the first set bit */
1106 mask &= mask - 1;
1107 }
1108}
1109
1054static int rmap_write_protect(struct kvm *kvm, u64 gfn) 1110static int rmap_write_protect(struct kvm *kvm, u64 gfn)
1055{ 1111{
1056 struct kvm_memory_slot *slot; 1112 struct kvm_memory_slot *slot;
1113 unsigned long *rmapp;
1114 int i;
1115 int write_protected = 0;
1057 1116
1058 slot = gfn_to_memslot(kvm, gfn); 1117 slot = gfn_to_memslot(kvm, gfn);
1059 return kvm_mmu_rmap_write_protect(kvm, gfn, slot); 1118
1119 for (i = PT_PAGE_TABLE_LEVEL;
1120 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
1121 rmapp = __gfn_to_rmap(gfn, i, slot);
1122 write_protected |= __rmap_write_protect(kvm, rmapp, i);
1123 }
1124
1125 return write_protected;
1060} 1126}
1061 1127
1062static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, 1128static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1063 unsigned long data) 1129 unsigned long data)
1064{ 1130{
1065 u64 *spte; 1131 u64 *sptep;
1132 struct rmap_iterator iter;
1066 int need_tlb_flush = 0; 1133 int need_tlb_flush = 0;
1067 1134
1068 while ((spte = rmap_next(rmapp, NULL))) { 1135 while ((sptep = rmap_get_first(*rmapp, &iter))) {
1069 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1136 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1070 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 1137 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
1071 drop_spte(kvm, spte); 1138
1139 drop_spte(kvm, sptep);
1072 need_tlb_flush = 1; 1140 need_tlb_flush = 1;
1073 } 1141 }
1142
1074 return need_tlb_flush; 1143 return need_tlb_flush;
1075} 1144}
1076 1145
1077static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, 1146static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1078 unsigned long data) 1147 unsigned long data)
1079{ 1148{
1149 u64 *sptep;
1150 struct rmap_iterator iter;
1080 int need_flush = 0; 1151 int need_flush = 0;
1081 u64 *spte, new_spte; 1152 u64 new_spte;
1082 pte_t *ptep = (pte_t *)data; 1153 pte_t *ptep = (pte_t *)data;
1083 pfn_t new_pfn; 1154 pfn_t new_pfn;
1084 1155
1085 WARN_ON(pte_huge(*ptep)); 1156 WARN_ON(pte_huge(*ptep));
1086 new_pfn = pte_pfn(*ptep); 1157 new_pfn = pte_pfn(*ptep);
1087 spte = rmap_next(rmapp, NULL); 1158
1088 while (spte) { 1159 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1089 BUG_ON(!is_shadow_present_pte(*spte)); 1160 BUG_ON(!is_shadow_present_pte(*sptep));
1090 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 1161 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
1162
1091 need_flush = 1; 1163 need_flush = 1;
1164
1092 if (pte_write(*ptep)) { 1165 if (pte_write(*ptep)) {
1093 drop_spte(kvm, spte); 1166 drop_spte(kvm, sptep);
1094 spte = rmap_next(rmapp, NULL); 1167 sptep = rmap_get_first(*rmapp, &iter);
1095 } else { 1168 } else {
1096 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); 1169 new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1097 new_spte |= (u64)new_pfn << PAGE_SHIFT; 1170 new_spte |= (u64)new_pfn << PAGE_SHIFT;
1098 1171
1099 new_spte &= ~PT_WRITABLE_MASK; 1172 new_spte &= ~PT_WRITABLE_MASK;
1100 new_spte &= ~SPTE_HOST_WRITEABLE; 1173 new_spte &= ~SPTE_HOST_WRITEABLE;
1101 new_spte &= ~shadow_accessed_mask; 1174 new_spte &= ~shadow_accessed_mask;
1102 mmu_spte_clear_track_bits(spte); 1175
1103 mmu_spte_set(spte, new_spte); 1176 mmu_spte_clear_track_bits(sptep);
1104 spte = rmap_next(rmapp, spte); 1177 mmu_spte_set(sptep, new_spte);
1178 sptep = rmap_get_next(&iter);
1105 } 1179 }
1106 } 1180 }
1181
1107 if (need_flush) 1182 if (need_flush)
1108 kvm_flush_remote_tlbs(kvm); 1183 kvm_flush_remote_tlbs(kvm);
1109 1184
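
rmap_get_first()/rmap_get_next() replace the old rmap_next() cursor; callers that drop sptes restart from rmap_get_first() because removal reshuffles the descriptor arrays. The encoding and the iteration pattern can be exercised in user space with this simplified re-implementation (a sketch; PTE_LIST_EXT and the low-bit tagging follow the patch):

#include <stdio.h>
#include <stdint.h>

#define PTE_LIST_EXT 3	/* as in the patch: pte_list_desc fits a cache line */

struct pte_list_desc {
	uint64_t *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

struct rmap_iterator {
	struct pte_list_desc *desc;
	int pos;
};

/* rmap encoding (as in KVM): 0 = empty, low bit clear = a single spte
 * pointer, low bit set = pointer to a pte_list_desc chain. */
static uint64_t *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
{
	if (!rmap)
		return NULL;
	if (!(rmap & 1)) {
		iter->desc = NULL;
		return (uint64_t *)rmap;
	}
	iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
	iter->pos = 0;
	return iter->desc->sptes[0];
}

static uint64_t *rmap_get_next(struct rmap_iterator *iter)
{
	if (!iter->desc)
		return NULL;	/* single-spte case: nothing more */
	if (iter->pos < PTE_LIST_EXT - 1 && iter->desc->sptes[iter->pos + 1])
		return iter->desc->sptes[++iter->pos];
	iter->desc = iter->desc->more;
	iter->pos = 0;
	return iter->desc ? iter->desc->sptes[0] : NULL;
}

int main(void)
{
	uint64_t a = 1, b = 2, c = 3, d = 4;
	struct pte_list_desc d2 = { { &d, NULL, NULL }, NULL };
	struct pte_list_desc d1 = { { &a, &b, &c }, &d2 };
	unsigned long rmap = (unsigned long)&d1 | 1;
	struct rmap_iterator iter;

	for (uint64_t *sptep = rmap_get_first(rmap, &iter); sptep;
	     sptep = rmap_get_next(&iter))
		printf("spte = %llu\n", (unsigned long long)*sptep);
	return 0;
}
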
@@ -1162,7 +1237,8 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1162static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1237static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1163 unsigned long data) 1238 unsigned long data)
1164{ 1239{
1165 u64 *spte; 1240 u64 *sptep;
1241 struct rmap_iterator iter;
1166 int young = 0; 1242 int young = 0;
1167 1243
1168 /* 1244 /*
@@ -1175,25 +1251,24 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1175 if (!shadow_accessed_mask) 1251 if (!shadow_accessed_mask)
1176 return kvm_unmap_rmapp(kvm, rmapp, data); 1252 return kvm_unmap_rmapp(kvm, rmapp, data);
1177 1253
1178 spte = rmap_next(rmapp, NULL); 1254 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1179 while (spte) { 1255 sptep = rmap_get_next(&iter)) {
1180 int _young; 1256 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1181 u64 _spte = *spte; 1257
1182 BUG_ON(!(_spte & PT_PRESENT_MASK)); 1258 if (*sptep & PT_ACCESSED_MASK) {
1183 _young = _spte & PT_ACCESSED_MASK;
1184 if (_young) {
1185 young = 1; 1259 young = 1;
1186 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); 1260 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep);
1187 } 1261 }
1188 spte = rmap_next(rmapp, spte);
1189 } 1262 }
1263
1190 return young; 1264 return young;
1191} 1265}
1192 1266
1193static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1267static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1194 unsigned long data) 1268 unsigned long data)
1195{ 1269{
1196 u64 *spte; 1270 u64 *sptep;
1271 struct rmap_iterator iter;
1197 int young = 0; 1272 int young = 0;
1198 1273
1199 /* 1274 /*
@@ -1204,16 +1279,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1204 if (!shadow_accessed_mask) 1279 if (!shadow_accessed_mask)
1205 goto out; 1280 goto out;
1206 1281
1207 spte = rmap_next(rmapp, NULL); 1282 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1208 while (spte) { 1283 sptep = rmap_get_next(&iter)) {
1209 u64 _spte = *spte; 1284 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1210 BUG_ON(!(_spte & PT_PRESENT_MASK)); 1285
1211 young = _spte & PT_ACCESSED_MASK; 1286 if (*sptep & PT_ACCESSED_MASK) {
1212 if (young) {
1213 young = 1; 1287 young = 1;
1214 break; 1288 break;
1215 } 1289 }
1216 spte = rmap_next(rmapp, spte);
1217 } 1290 }
1218out: 1291out:
1219 return young; 1292 return young;
@@ -1865,10 +1938,11 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1865 1938
1866static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 1939static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1867{ 1940{
1868 u64 *parent_pte; 1941 u64 *sptep;
1942 struct rmap_iterator iter;
1869 1943
1870 while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) 1944 while ((sptep = rmap_get_first(sp->parent_ptes, &iter)))
1871 drop_parent_pte(sp, parent_pte); 1945 drop_parent_pte(sp, sptep);
1872} 1946}
1873 1947
1874static int mmu_zap_unsync_children(struct kvm *kvm, 1948static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -1925,30 +1999,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1925 return ret; 1999 return ret;
1926} 2000}
1927 2001
1928static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
1929{
1930 struct kvm_mmu_page *sp;
1931
1932 list_for_each_entry(sp, invalid_list, link)
1933 kvm_mmu_isolate_page(sp);
1934}
1935
1936static void free_pages_rcu(struct rcu_head *head)
1937{
1938 struct kvm_mmu_page *next, *sp;
1939
1940 sp = container_of(head, struct kvm_mmu_page, rcu);
1941 while (sp) {
1942 if (!list_empty(&sp->link))
1943 next = list_first_entry(&sp->link,
1944 struct kvm_mmu_page, link);
1945 else
1946 next = NULL;
1947 kvm_mmu_free_page(sp);
1948 sp = next;
1949 }
1950}
1951
1952static void kvm_mmu_commit_zap_page(struct kvm *kvm, 2002static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1953 struct list_head *invalid_list) 2003 struct list_head *invalid_list)
1954{ 2004{
@@ -1957,17 +2007,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1957 if (list_empty(invalid_list)) 2007 if (list_empty(invalid_list))
1958 return; 2008 return;
1959 2009
1960 kvm_flush_remote_tlbs(kvm); 2010 /*
1961 2011 * wmb: make sure everyone sees our modifications to the page tables
1962 if (atomic_read(&kvm->arch.reader_counter)) { 2012 * rmb: make sure we see changes to vcpu->mode
1963 kvm_mmu_isolate_pages(invalid_list); 2013 */
1964 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 2014 smp_mb();
1965 list_del_init(invalid_list);
1966 2015
1967 trace_kvm_mmu_delay_free_pages(sp); 2016 /*
1968 call_rcu(&sp->rcu, free_pages_rcu); 2017 * Wait for all vcpus to exit guest mode and/or lockless shadow
1969 return; 2018 * page table walks.
1970 } 2019 */
2020 kvm_flush_remote_tlbs(kvm);
1971 2021
1972 do { 2022 do {
1973 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 2023 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
@@ -1975,7 +2025,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1975 kvm_mmu_isolate_page(sp); 2025 kvm_mmu_isolate_page(sp);
1976 kvm_mmu_free_page(sp); 2026 kvm_mmu_free_page(sp);
1977 } while (!list_empty(invalid_list)); 2027 } while (!list_empty(invalid_list));
1978
1979} 2028}
1980 2029
1981/* 2030/*
@@ -3554,7 +3603,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
3554 * Skip write-flooding detected for the sp whose level is 1, because 3603 * Skip write-flooding detected for the sp whose level is 1, because
3555 * it can become unsync, then the guest page is not write-protected. 3604 * it can become unsync, then the guest page is not write-protected.
3556 */ 3605 */
3557 if (sp->role.level == 1) 3606 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
3558 return false; 3607 return false;
3559 3608
3560 return ++sp->write_flooding_count >= 3; 3609 return ++sp->write_flooding_count >= 3;
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 715da5a19a5..7d7d0b9e23e 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -192,7 +192,8 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
192{ 192{
193 struct kvm_memory_slot *slot; 193 struct kvm_memory_slot *slot;
194 unsigned long *rmapp; 194 unsigned long *rmapp;
195 u64 *spte; 195 u64 *sptep;
196 struct rmap_iterator iter;
196 197
197 if (sp->role.direct || sp->unsync || sp->role.invalid) 198 if (sp->role.direct || sp->unsync || sp->role.invalid)
198 return; 199 return;
@@ -200,13 +201,12 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
200 slot = gfn_to_memslot(kvm, sp->gfn); 201 slot = gfn_to_memslot(kvm, sp->gfn);
201 rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; 202 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
202 203
203 spte = rmap_next(rmapp, NULL); 204 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
204 while (spte) { 205 sptep = rmap_get_next(&iter)) {
205 if (is_writable_pte(*spte)) 206 if (is_writable_pte(*sptep))
206 audit_printk(kvm, "shadow page has writable " 207 audit_printk(kvm, "shadow page has writable "
207 "mappings: gfn %llx role %x\n", 208 "mappings: gfn %llx role %x\n",
208 sp->gfn, sp->role.word); 209 sp->gfn, sp->role.word);
209 spte = rmap_next(rmapp, spte);
210 } 210 }
211} 211}
212 212
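The mmu_audit.c hunk above converts the open-coded rmap_next() cursor loop to the new rmap_get_first()/rmap_get_next() iterator with an on-stack struct rmap_iterator. As a usage sketch only, reusing exactly the calls shown in the hunk: a hypothetical helper that counts writable sptes on one rmap chain. It assumes the same mmu-internal environment as mmu_audit.c and is not a standalone program.

/* Hypothetical helper; same iterator idiom as audit_write_protection() above. */
static int count_writable_sptes(unsigned long *rmapp)
{
	struct rmap_iterator iter;
	u64 *sptep;
	int writable = 0;

	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
	     sptep = rmap_get_next(&iter))
		if (is_writable_pte(*sptep))
			writable++;

	return writable;
}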
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df5a70311be..34f970937ef 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -658,7 +658,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
658{ 658{
659 int offset = 0; 659 int offset = 0;
660 660
661 WARN_ON(sp->role.level != 1); 661 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
662 662
663 if (PTTYPE == 32) 663 if (PTTYPE == 32)
664 offset = sp->role.quadrant << PT64_LEVEL_BITS; 664 offset = sp->role.quadrant << PT64_LEVEL_BITS;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e334389e1c7..f75af406b26 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -22,6 +22,7 @@
22#include "x86.h" 22#include "x86.h"
23 23
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/mod_devicetable.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
27#include <linux/highmem.h> 28#include <linux/highmem.h>
@@ -42,6 +43,12 @@
42MODULE_AUTHOR("Qumranet"); 43MODULE_AUTHOR("Qumranet");
43MODULE_LICENSE("GPL"); 44MODULE_LICENSE("GPL");
44 45
46static const struct x86_cpu_id svm_cpu_id[] = {
47 X86_FEATURE_MATCH(X86_FEATURE_SVM),
48 {}
49};
50MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
51
45#define IOPM_ALLOC_ORDER 2 52#define IOPM_ALLOC_ORDER 2
46#define MSRPM_ALLOC_ORDER 1 53#define MSRPM_ALLOC_ORDER 1
47 54
@@ -3240,6 +3247,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
3240 svm_clear_vintr(svm); 3247 svm_clear_vintr(svm);
3241 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 3248 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3242 mark_dirty(svm->vmcb, VMCB_INTR); 3249 mark_dirty(svm->vmcb, VMCB_INTR);
3250 ++svm->vcpu.stat.irq_window_exits;
3243 /* 3251 /*
3244 * If the user space waits to inject interrupts, exit as soon as 3252 * If the user space waits to inject interrupts, exit as soon as
3245 * possible 3253 * possible
@@ -3247,7 +3255,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
3247 if (!irqchip_in_kernel(svm->vcpu.kvm) && 3255 if (!irqchip_in_kernel(svm->vcpu.kvm) &&
3248 kvm_run->request_interrupt_window && 3256 kvm_run->request_interrupt_window &&
3249 !kvm_cpu_has_interrupt(&svm->vcpu)) { 3257 !kvm_cpu_has_interrupt(&svm->vcpu)) {
3250 ++svm->vcpu.stat.irq_window_exits;
3251 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 3258 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3252 return 0; 3259 return 0;
3253 } 3260 }
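The svm.c hunks add a CPU-feature device table so kvm-amd can be autoloaded by udev on SVM-capable processors (the vmx.c diff below adds the matching table for kvm-intel). A sketch of the same pattern for a hypothetical module follows; the module name and the explicit x86_match_cpu() check are illustrative additions, while X86_FEATURE_MATCH() and MODULE_DEVICE_TABLE(x86cpu, ...) are used exactly as in the hunk.

/* Sketch of feature-based module autoload for a hypothetical module. */
#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <asm/cpu_device_id.h>

static const struct x86_cpu_id example_cpu_ids[] = {
	X86_FEATURE_MATCH(X86_FEATURE_SVM),	/* match CPUs advertising SVM */
	{}
};
MODULE_DEVICE_TABLE(x86cpu, example_cpu_ids);	/* exported for modprobe/udev */

static int __init example_init(void)
{
	/* Optional belt-and-braces check when loaded by hand. */
	if (!x86_match_cpu(example_cpu_ids))
		return -ENODEV;
	return 0;
}

static void __exit example_exit(void)
{
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");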
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4ff0ab9bc3c..32eb5886629 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/moduleparam.h> 29#include <linux/moduleparam.h>
30#include <linux/mod_devicetable.h>
30#include <linux/ftrace_event.h> 31#include <linux/ftrace_event.h>
31#include <linux/slab.h> 32#include <linux/slab.h>
32#include <linux/tboot.h> 33#include <linux/tboot.h>
@@ -51,6 +52,12 @@
51MODULE_AUTHOR("Qumranet"); 52MODULE_AUTHOR("Qumranet");
52MODULE_LICENSE("GPL"); 53MODULE_LICENSE("GPL");
53 54
55static const struct x86_cpu_id vmx_cpu_id[] = {
56 X86_FEATURE_MATCH(X86_FEATURE_VMX),
57 {}
58};
59MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
60
54static bool __read_mostly enable_vpid = 1; 61static bool __read_mostly enable_vpid = 1;
55module_param_named(vpid, enable_vpid, bool, 0444); 62module_param_named(vpid, enable_vpid, bool, 0444);
56 63
@@ -386,6 +393,9 @@ struct vcpu_vmx {
386 struct { 393 struct {
387 int loaded; 394 int loaded;
388 u16 fs_sel, gs_sel, ldt_sel; 395 u16 fs_sel, gs_sel, ldt_sel;
396#ifdef CONFIG_X86_64
397 u16 ds_sel, es_sel;
398#endif
389 int gs_ldt_reload_needed; 399 int gs_ldt_reload_needed;
390 int fs_reload_needed; 400 int fs_reload_needed;
391 } host_state; 401 } host_state;
@@ -1411,6 +1421,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
1411 } 1421 }
1412 1422
1413#ifdef CONFIG_X86_64 1423#ifdef CONFIG_X86_64
1424 savesegment(ds, vmx->host_state.ds_sel);
1425 savesegment(es, vmx->host_state.es_sel);
1426#endif
1427
1428#ifdef CONFIG_X86_64
1414 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); 1429 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1415 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); 1430 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1416#else 1431#else
@@ -1450,6 +1465,19 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
1450 } 1465 }
1451 if (vmx->host_state.fs_reload_needed) 1466 if (vmx->host_state.fs_reload_needed)
1452 loadsegment(fs, vmx->host_state.fs_sel); 1467 loadsegment(fs, vmx->host_state.fs_sel);
1468#ifdef CONFIG_X86_64
1469 if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
1470 loadsegment(ds, vmx->host_state.ds_sel);
1471 loadsegment(es, vmx->host_state.es_sel);
1472 }
1473#else
1474 /*
1475 * The sysexit path does not restore ds/es, so we must set them to
1476 * a reasonable value ourselves.
1477 */
1478 loadsegment(ds, __USER_DS);
1479 loadsegment(es, __USER_DS);
1480#endif
1453 reload_tss(); 1481 reload_tss();
1454#ifdef CONFIG_X86_64 1482#ifdef CONFIG_X86_64
1455 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 1483 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
@@ -3633,8 +3661,18 @@ static void vmx_set_constant_host_state(void)
3633 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 3661 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
3634 3662
3635 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 3663 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
3664#ifdef CONFIG_X86_64
3665 /*
3666 * Load null selectors, so we can avoid reloading them in
3667 * __vmx_load_host_state(), in case userspace uses the null selectors
3668 * too (the expected case).
3669 */
3670 vmcs_write16(HOST_DS_SELECTOR, 0);
3671 vmcs_write16(HOST_ES_SELECTOR, 0);
3672#else
3636 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3673 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3637 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3674 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3675#endif
3638 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3676 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3639 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 3677 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
3640 3678
@@ -6256,7 +6294,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6256 } 6294 }
6257 } 6295 }
6258 6296
6259 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
6260 vmx->loaded_vmcs->launched = 1; 6297 vmx->loaded_vmcs->launched = 1;
6261 6298
6262 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 6299 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
@@ -6343,7 +6380,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
6343 return &vmx->vcpu; 6380 return &vmx->vcpu;
6344 6381
6345free_vmcs: 6382free_vmcs:
6346 free_vmcs(vmx->loaded_vmcs->vmcs); 6383 free_loaded_vmcs(vmx->loaded_vmcs);
6347free_msrs: 6384free_msrs:
6348 kfree(vmx->guest_msrs); 6385 kfree(vmx->guest_msrs);
6349uninit_vcpu: 6386uninit_vcpu:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 185a2b823a2..be6d54929fa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2147,6 +2147,7 @@ int kvm_dev_ioctl_check_extension(long ext)
2147 case KVM_CAP_ASYNC_PF: 2147 case KVM_CAP_ASYNC_PF:
2148 case KVM_CAP_GET_TSC_KHZ: 2148 case KVM_CAP_GET_TSC_KHZ:
2149 case KVM_CAP_PCI_2_3: 2149 case KVM_CAP_PCI_2_3:
2150 case KVM_CAP_KVMCLOCK_CTRL:
2150 r = 1; 2151 r = 1;
2151 break; 2152 break;
2152 case KVM_CAP_COALESCED_MMIO: 2153 case KVM_CAP_COALESCED_MMIO:
@@ -2597,6 +2598,23 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
2597 return r; 2598 return r;
2598} 2599}
2599 2600
2601/*
2602 * kvm_set_guest_paused() indicates to the guest kernel that it has been
2603 * stopped by the hypervisor. This function will be called from the host only.
2604 * EINVAL is returned when the host attempts to set the flag for a guest that
2605 * does not support pv clocks.
2606 */
2607static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
2608{
2609 struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
2610 if (!vcpu->arch.time_page)
2611 return -EINVAL;
2612 src->flags |= PVCLOCK_GUEST_STOPPED;
2613 mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
2614 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2615 return 0;
2616}
2617
2600long kvm_arch_vcpu_ioctl(struct file *filp, 2618long kvm_arch_vcpu_ioctl(struct file *filp,
2601 unsigned int ioctl, unsigned long arg) 2619 unsigned int ioctl, unsigned long arg)
2602{ 2620{
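The new kvm_set_guest_paused() above backs the KVM_KVMCLOCK_CTRL vcpu ioctl (wired up in the next hunk): it sets PVCLOCK_GUEST_STOPPED in the guest's pvclock page so a deliberately stopped guest can skip its soft-lockup/watchdog report, and it fails with -EINVAL when the guest has not registered a pv clock. A minimal userspace sketch of the calling side; the vcpu file descriptors are assumed to come from the usual KVM_CREATE_VCPU setup.

/*
 * After pausing a guest (e.g. for a snapshot), mark each vcpu as
 * deliberately stopped before resuming it. vcpu_fds[]/nr_vcpus are
 * assumptions standing in for a real VMM's bookkeeping.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <stdio.h>

static void mark_vcpus_paused(const int *vcpu_fds, int nr_vcpus)
{
	for (int i = 0; i < nr_vcpus; i++)
		/* No argument; EINVAL means the guest has no kvmclock. */
		if (ioctl(vcpu_fds[i], KVM_KVMCLOCK_CTRL, 0) < 0)
			perror("KVM_KVMCLOCK_CTRL");
}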
@@ -2873,6 +2891,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2873 r = vcpu->arch.virtual_tsc_khz; 2891 r = vcpu->arch.virtual_tsc_khz;
2874 goto out; 2892 goto out;
2875 } 2893 }
2894 case KVM_KVMCLOCK_CTRL: {
2895 r = kvm_set_guest_paused(vcpu);
2896 goto out;
2897 }
2876 default: 2898 default:
2877 r = -EINVAL; 2899 r = -EINVAL;
2878 } 2900 }
@@ -3045,57 +3067,32 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3045} 3067}
3046 3068
3047/** 3069/**
3048 * write_protect_slot - write protect a slot for dirty logging 3070 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
3049 * @kvm: the kvm instance 3071 * @kvm: kvm instance
3050 * @memslot: the slot we protect 3072 * @log: slot id and address to which we copy the log
3051 * @dirty_bitmap: the bitmap indicating which pages are dirty
3052 * @nr_dirty_pages: the number of dirty pages
3053 * 3073 *
3054 * We have two ways to find all sptes to protect: 3074 * We need to keep it in mind that VCPU threads can write to the bitmap
3055 * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and 3075 * concurrently. So, to avoid losing data, we keep the following order for
3056 * checks ones that have a spte mapping a page in the slot. 3076 * each bit:
3057 * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
3058 * 3077 *
3059 * Generally speaking, if there are not so many dirty pages compared to the 3078 * 1. Take a snapshot of the bit and clear it if needed.
3060 * number of shadow pages, we should use the latter. 3079 * 2. Write protect the corresponding page.
3080 * 3. Flush TLB's if needed.
3081 * 4. Copy the snapshot to the userspace.
3061 * 3082 *
3062 * Note that letting others write into a page marked dirty in the old bitmap 3083 * Between 2 and 3, the guest may write to the page using the remaining TLB
3063 * by using the remaining tlb entry is not a problem. That page will become 3084 * entry. This is not a problem because the page will be reported dirty at
3064 * write protected again when we flush the tlb and then be reported dirty to 3085 * step 4 using the snapshot taken before and step 3 ensures that successive
3065 * the user space by copying the old bitmap. 3086 * writes will be logged for the next call.
3066 */
3067static void write_protect_slot(struct kvm *kvm,
3068 struct kvm_memory_slot *memslot,
3069 unsigned long *dirty_bitmap,
3070 unsigned long nr_dirty_pages)
3071{
3072 spin_lock(&kvm->mmu_lock);
3073
3074 /* Not many dirty pages compared to # of shadow pages. */
3075 if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
3076 unsigned long gfn_offset;
3077
3078 for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
3079 unsigned long gfn = memslot->base_gfn + gfn_offset;
3080
3081 kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
3082 }
3083 kvm_flush_remote_tlbs(kvm);
3084 } else
3085 kvm_mmu_slot_remove_write_access(kvm, memslot->id);
3086
3087 spin_unlock(&kvm->mmu_lock);
3088}
3089
3090/*
3091 * Get (and clear) the dirty memory log for a memory slot.
3092 */ 3087 */
3093int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 3088int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3094 struct kvm_dirty_log *log)
3095{ 3089{
3096 int r; 3090 int r;
3097 struct kvm_memory_slot *memslot; 3091 struct kvm_memory_slot *memslot;
3098 unsigned long n, nr_dirty_pages; 3092 unsigned long n, i;
3093 unsigned long *dirty_bitmap;
3094 unsigned long *dirty_bitmap_buffer;
3095 bool is_dirty = false;
3099 3096
3100 mutex_lock(&kvm->slots_lock); 3097 mutex_lock(&kvm->slots_lock);
3101 3098
@@ -3104,49 +3101,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3104 goto out; 3101 goto out;
3105 3102
3106 memslot = id_to_memslot(kvm->memslots, log->slot); 3103 memslot = id_to_memslot(kvm->memslots, log->slot);
3104
3105 dirty_bitmap = memslot->dirty_bitmap;
3107 r = -ENOENT; 3106 r = -ENOENT;
3108 if (!memslot->dirty_bitmap) 3107 if (!dirty_bitmap)
3109 goto out; 3108 goto out;
3110 3109
3111 n = kvm_dirty_bitmap_bytes(memslot); 3110 n = kvm_dirty_bitmap_bytes(memslot);
3112 nr_dirty_pages = memslot->nr_dirty_pages;
3113 3111
3114 /* If nothing is dirty, don't bother messing with page tables. */ 3112 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
3115 if (nr_dirty_pages) { 3113 memset(dirty_bitmap_buffer, 0, n);
3116 struct kvm_memslots *slots, *old_slots;
3117 unsigned long *dirty_bitmap, *dirty_bitmap_head;
3118 3114
3119 dirty_bitmap = memslot->dirty_bitmap; 3115 spin_lock(&kvm->mmu_lock);
3120 dirty_bitmap_head = memslot->dirty_bitmap_head;
3121 if (dirty_bitmap == dirty_bitmap_head)
3122 dirty_bitmap_head += n / sizeof(long);
3123 memset(dirty_bitmap_head, 0, n);
3124 3116
3125 r = -ENOMEM; 3117 for (i = 0; i < n / sizeof(long); i++) {
3126 slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL); 3118 unsigned long mask;
3127 if (!slots) 3119 gfn_t offset;
3128 goto out;
3129 3120
3130 memslot = id_to_memslot(slots, log->slot); 3121 if (!dirty_bitmap[i])
3131 memslot->nr_dirty_pages = 0; 3122 continue;
3132 memslot->dirty_bitmap = dirty_bitmap_head;
3133 update_memslots(slots, NULL);
3134 3123
3135 old_slots = kvm->memslots; 3124 is_dirty = true;
3136 rcu_assign_pointer(kvm->memslots, slots);
3137 synchronize_srcu_expedited(&kvm->srcu);
3138 kfree(old_slots);
3139 3125
3140 write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages); 3126 mask = xchg(&dirty_bitmap[i], 0);
3127 dirty_bitmap_buffer[i] = mask;
3141 3128
3142 r = -EFAULT; 3129 offset = i * BITS_PER_LONG;
3143 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) 3130 kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
3144 goto out;
3145 } else {
3146 r = -EFAULT;
3147 if (clear_user(log->dirty_bitmap, n))
3148 goto out;
3149 } 3131 }
3132 if (is_dirty)
3133 kvm_flush_remote_tlbs(kvm);
3134
3135 spin_unlock(&kvm->mmu_lock);
3136
3137 r = -EFAULT;
3138 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
3139 goto out;
3150 3140
3151 r = 0; 3141 r = 0;
3152out: 3142out:
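The rewritten kvm_vm_ioctl_get_dirty_log() above snapshots and clears each bitmap word with xchg(), write-protects only the pages in that word's mask, flushes TLBs once if anything was dirty, and then copies the snapshot to userspace, replacing the old memslot-duplication scheme. The userspace ABI is unchanged; a minimal sketch of the consuming side, where vm_fd, the slot id and the slot's page count are assumptions standing in for a real VMM's state.

/* Fetch (and implicitly clear) the dirty bitmap for one memory slot. */
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

static unsigned long *get_dirty_log(int vm_fd, unsigned int slot,
				    unsigned long npages)
{
	size_t bytes = ((npages + 63) / 64) * 8;	/* one bit per page, long-aligned */
	unsigned long *bitmap = calloc(1, bytes);
	struct kvm_dirty_log log;

	if (!bitmap)
		return NULL;

	memset(&log, 0, sizeof(log));
	log.slot = slot;
	log.dirty_bitmap = bitmap;

	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
		perror("KVM_GET_DIRTY_LOG");
		free(bitmap);
		return NULL;
	}
	return bitmap;	/* caller scans the bits and frees it */
}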
@@ -3728,9 +3718,8 @@ struct read_write_emulator_ops {
3728static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) 3718static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
3729{ 3719{
3730 if (vcpu->mmio_read_completed) { 3720 if (vcpu->mmio_read_completed) {
3731 memcpy(val, vcpu->mmio_data, bytes);
3732 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 3721 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
3733 vcpu->mmio_phys_addr, *(u64 *)val); 3722 vcpu->mmio_fragments[0].gpa, *(u64 *)val);
3734 vcpu->mmio_read_completed = 0; 3723 vcpu->mmio_read_completed = 0;
3735 return 1; 3724 return 1;
3736 } 3725 }
@@ -3766,8 +3755,9 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
3766static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, 3755static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
3767 void *val, int bytes) 3756 void *val, int bytes)
3768{ 3757{
3769 memcpy(vcpu->mmio_data, val, bytes); 3758 struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
3770 memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); 3759
3760 memcpy(vcpu->run->mmio.data, frag->data, frag->len);
3771 return X86EMUL_CONTINUE; 3761 return X86EMUL_CONTINUE;
3772} 3762}
3773 3763
@@ -3794,10 +3784,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
3794 gpa_t gpa; 3784 gpa_t gpa;
3795 int handled, ret; 3785 int handled, ret;
3796 bool write = ops->write; 3786 bool write = ops->write;
3797 3787 struct kvm_mmio_fragment *frag;
3798 if (ops->read_write_prepare &&
3799 ops->read_write_prepare(vcpu, val, bytes))
3800 return X86EMUL_CONTINUE;
3801 3788
3802 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); 3789 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
3803 3790
@@ -3823,15 +3810,19 @@ mmio:
3823 bytes -= handled; 3810 bytes -= handled;
3824 val += handled; 3811 val += handled;
3825 3812
3826 vcpu->mmio_needed = 1; 3813 while (bytes) {
3827 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3814 unsigned now = min(bytes, 8U);
3828 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3829 vcpu->mmio_size = bytes;
3830 vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
3831 vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
3832 vcpu->mmio_index = 0;
3833 3815
3834 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); 3816 frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
3817 frag->gpa = gpa;
3818 frag->data = val;
3819 frag->len = now;
3820
3821 gpa += now;
3822 val += now;
3823 bytes -= now;
3824 }
3825 return X86EMUL_CONTINUE;
3835} 3826}
3836 3827
3837int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, 3828int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
@@ -3840,10 +3831,18 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
3840 struct read_write_emulator_ops *ops) 3831 struct read_write_emulator_ops *ops)
3841{ 3832{
3842 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3833 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3834 gpa_t gpa;
3835 int rc;
3836
3837 if (ops->read_write_prepare &&
3838 ops->read_write_prepare(vcpu, val, bytes))
3839 return X86EMUL_CONTINUE;
3840
3841 vcpu->mmio_nr_fragments = 0;
3843 3842
3844 /* Crossing a page boundary? */ 3843 /* Crossing a page boundary? */
3845 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3844 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
3846 int rc, now; 3845 int now;
3847 3846
3848 now = -addr & ~PAGE_MASK; 3847 now = -addr & ~PAGE_MASK;
3849 rc = emulator_read_write_onepage(addr, val, now, exception, 3848 rc = emulator_read_write_onepage(addr, val, now, exception,
@@ -3856,8 +3855,25 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
3856 bytes -= now; 3855 bytes -= now;
3857 } 3856 }
3858 3857
3859 return emulator_read_write_onepage(addr, val, bytes, exception, 3858 rc = emulator_read_write_onepage(addr, val, bytes, exception,
3860 vcpu, ops); 3859 vcpu, ops);
3860 if (rc != X86EMUL_CONTINUE)
3861 return rc;
3862
3863 if (!vcpu->mmio_nr_fragments)
3864 return rc;
3865
3866 gpa = vcpu->mmio_fragments[0].gpa;
3867
3868 vcpu->mmio_needed = 1;
3869 vcpu->mmio_cur_fragment = 0;
3870
3871 vcpu->run->mmio.len = vcpu->mmio_fragments[0].len;
3872 vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
3873 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3874 vcpu->run->mmio.phys_addr = gpa;
3875
3876 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
3861} 3877}
3862 3878
3863static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, 3879static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
@@ -5263,10 +5279,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5263 kvm_deliver_pmi(vcpu); 5279 kvm_deliver_pmi(vcpu);
5264 } 5280 }
5265 5281
5266 r = kvm_mmu_reload(vcpu);
5267 if (unlikely(r))
5268 goto out;
5269
5270 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5282 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5271 inject_pending_event(vcpu); 5283 inject_pending_event(vcpu);
5272 5284
@@ -5282,6 +5294,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5282 } 5294 }
5283 } 5295 }
5284 5296
5297 r = kvm_mmu_reload(vcpu);
5298 if (unlikely(r)) {
5299 kvm_x86_ops->cancel_injection(vcpu);
5300 goto out;
5301 }
5302
5285 preempt_disable(); 5303 preempt_disable();
5286 5304
5287 kvm_x86_ops->prepare_guest_switch(vcpu); 5305 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -5456,33 +5474,55 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5456 return r; 5474 return r;
5457} 5475}
5458 5476
5477/*
5478 * Implements the following, as a state machine:
5479 *
5480 * read:
5481 * for each fragment
5482 * write gpa, len
5483 * exit
5484 * copy data
5485 * execute insn
5486 *
5487 * write:
5488 * for each fragment
5489 * write gpa, len
5490 * copy data
5491 * exit
5492 */
5459static int complete_mmio(struct kvm_vcpu *vcpu) 5493static int complete_mmio(struct kvm_vcpu *vcpu)
5460{ 5494{
5461 struct kvm_run *run = vcpu->run; 5495 struct kvm_run *run = vcpu->run;
5496 struct kvm_mmio_fragment *frag;
5462 int r; 5497 int r;
5463 5498
5464 if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) 5499 if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
5465 return 1; 5500 return 1;
5466 5501
5467 if (vcpu->mmio_needed) { 5502 if (vcpu->mmio_needed) {
5468 vcpu->mmio_needed = 0; 5503 /* Complete previous fragment */
5504 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
5469 if (!vcpu->mmio_is_write) 5505 if (!vcpu->mmio_is_write)
5470 memcpy(vcpu->mmio_data + vcpu->mmio_index, 5506 memcpy(frag->data, run->mmio.data, frag->len);
5471 run->mmio.data, 8); 5507 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
5472 vcpu->mmio_index += 8; 5508 vcpu->mmio_needed = 0;
5473 if (vcpu->mmio_index < vcpu->mmio_size) { 5509 if (vcpu->mmio_is_write)
5474 run->exit_reason = KVM_EXIT_MMIO; 5510 return 1;
5475 run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; 5511 vcpu->mmio_read_completed = 1;
5476 memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); 5512 goto done;
5477 run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
5478 run->mmio.is_write = vcpu->mmio_is_write;
5479 vcpu->mmio_needed = 1;
5480 return 0;
5481 } 5513 }
5514 /* Initiate next fragment */
5515 ++frag;
5516 run->exit_reason = KVM_EXIT_MMIO;
5517 run->mmio.phys_addr = frag->gpa;
5482 if (vcpu->mmio_is_write) 5518 if (vcpu->mmio_is_write)
5483 return 1; 5519 memcpy(run->mmio.data, frag->data, frag->len);
5484 vcpu->mmio_read_completed = 1; 5520 run->mmio.len = frag->len;
5521 run->mmio.is_write = vcpu->mmio_is_write;
5522 return 0;
5523
5485 } 5524 }
5525done:
5486 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5526 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5487 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); 5527 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5488 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5528 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
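The rewritten complete_mmio() above walks the vcpu's mmio_fragments[] array, so an emulated access that crosses a page boundary is simply delivered to userspace as consecutive KVM_EXIT_MMIO exits, one per fragment, with the kvm_run MMIO layout untouched. A minimal sketch of the userspace side; vcpu_fd, run (the mmap'ed kvm_run area) and the emulate_mmio_*() callbacks are assumptions.

/* Run loop fragment: page-crossing MMIO needs no special casing here. */
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <stdio.h>

extern void emulate_mmio_read(unsigned long long gpa, void *data, unsigned int len);
extern void emulate_mmio_write(unsigned long long gpa, const void *data, unsigned int len);

static int run_vcpu(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
			perror("KVM_RUN");
			return -1;
		}

		switch (run->exit_reason) {
		case KVM_EXIT_MMIO:
			if (run->mmio.is_write)
				emulate_mmio_write(run->mmio.phys_addr,
						   run->mmio.data, run->mmio.len);
			else
				emulate_mmio_read(run->mmio.phys_addr,
						  run->mmio.data, run->mmio.len);
			break;	/* re-enter KVM_RUN; the next fragment, if any, follows */
		default:
			return run->exit_reason;	/* let the caller handle other exits */
		}
	}
}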
@@ -6399,21 +6439,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6399 kvm_cpu_has_interrupt(vcpu)); 6439 kvm_cpu_has_interrupt(vcpu));
6400} 6440}
6401 6441
6402void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 6442int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
6403{ 6443{
6404 int me; 6444 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
6405 int cpu = vcpu->cpu;
6406
6407 if (waitqueue_active(&vcpu->wq)) {
6408 wake_up_interruptible(&vcpu->wq);
6409 ++vcpu->stat.halt_wakeup;
6410 }
6411
6412 me = get_cpu();
6413 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
6414 if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
6415 smp_send_reschedule(cpu);
6416 put_cpu();
6417} 6445}
6418 6446
6419int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) 6447int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cb80c293cdd..3d1134ddb88 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -64,7 +64,7 @@ static inline int is_pse(struct kvm_vcpu *vcpu)
64 64
65static inline int is_paging(struct kvm_vcpu *vcpu) 65static inline int is_paging(struct kvm_vcpu *vcpu)
66{ 66{
67 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 67 return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
68} 68}
69 69
70static inline u32 bit(int bitno) 70static inline u32 bit(int bitno)