Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/Kconfig        |   1
-rw-r--r--  arch/x86/kvm/cpuid.c        |   5
-rw-r--r--  arch/x86/kvm/emulate.c      | 293
-rw-r--r--  arch/x86/kvm/i8254.c        |  31
-rw-r--r--  arch/x86/kvm/i8254.h        |   7
-rw-r--r--  arch/x86/kvm/lapic.c        |  31
-rw-r--r--  arch/x86/kvm/mmu.c          | 345
-rw-r--r--  arch/x86/kvm/mmu_audit.c    |  10
-rw-r--r--  arch/x86/kvm/paging_tmpl.h  |   2
-rw-r--r--  arch/x86/kvm/svm.c          |   9
-rw-r--r--  arch/x86/kvm/vmx.c          |  41
-rw-r--r--  arch/x86/kvm/x86.c          | 280
-rw-r--r--  arch/x86/kvm/x86.h          |   2
13 files changed, 649 insertions(+), 408 deletions(-)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 1a7fe868f375..a28f338843ea 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -36,6 +36,7 @@ config KVM
36 select TASKSTATS 36 select TASKSTATS
37 select TASK_DELAY_ACCT 37 select TASK_DELAY_ACCT
38 select PERF_EVENTS 38 select PERF_EVENTS
39 select HAVE_KVM_MSI
39 ---help--- 40 ---help---
40 Support hosting fully virtualized guest machines using hardware 41 Support hosting fully virtualized guest machines using hardware
41 virtualization extensions. You will need a fairly recent 42 virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9fed5bedaad6..7df1c6d839fb 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -247,7 +247,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
247 247
248 /* cpuid 7.0.ebx */ 248 /* cpuid 7.0.ebx */
249 const u32 kvm_supported_word9_x86_features = 249 const u32 kvm_supported_word9_x86_features =
250 F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS); 250 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
251 F(BMI2) | F(ERMS) | F(RTM);
251 252
252 /* all calls to cpuid_count() should be made on the same cpu */ 253 /* all calls to cpuid_count() should be made on the same cpu */
253 get_cpu(); 254 get_cpu();
@@ -397,7 +398,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
397 case KVM_CPUID_SIGNATURE: { 398 case KVM_CPUID_SIGNATURE: {
398 char signature[12] = "KVMKVMKVM\0\0"; 399 char signature[12] = "KVMKVMKVM\0\0";
399 u32 *sigptr = (u32 *)signature; 400 u32 *sigptr = (u32 *)signature;
400 entry->eax = 0; 401 entry->eax = KVM_CPUID_FEATURES;
401 entry->ebx = sigptr[0]; 402 entry->ebx = sigptr[0];
402 entry->ecx = sigptr[1]; 403 entry->ecx = sigptr[1];
403 entry->edx = sigptr[2]; 404 entry->edx = sigptr[2];
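
Aside (not from the patch): the cpuid.c hunks above advertise the HLE/RTM (TSX) bits in CPUID leaf 7 when the host has them, and make the KVM signature leaf report KVM_CPUID_FEATURES (0x40000001) in EAX instead of 0, so a guest can tell how far the 0x4000000x range extends. A minimal guest-side sketch of how that leaf is consumed; it assumes GCC/clang's <cpuid.h> and only prints something meaningful when actually run inside a KVM guest:

#include <stdio.h>
#include <string.h>
#include <cpuid.h>              /* compiler helper around the raw CPUID instruction */

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        char sig[13] = { 0 };

        /* 0x40000000 is the conventional hypervisor signature leaf. */
        __cpuid(0x40000000, eax, ebx, ecx, edx);
        memcpy(sig + 0, &ebx, 4);
        memcpy(sig + 4, &ecx, 4);
        memcpy(sig + 8, &edx, 4);

        /*
         * Under KVM, sig reads "KVMKVMKVM" and, with the change above, eax
         * holds the highest supported leaf (KVM_CPUID_FEATURES, 0x40000001)
         * rather than 0.
         */
        printf("hypervisor signature: %s, max leaf: 0x%x\n", sig, eax);
        return 0;
}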
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 83756223f8aa..f95d242ee9f7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -142,6 +142,10 @@
142#define Src2FS (OpFS << Src2Shift) 142#define Src2FS (OpFS << Src2Shift)
143#define Src2GS (OpGS << Src2Shift) 143#define Src2GS (OpGS << Src2Shift)
144#define Src2Mask (OpMask << Src2Shift) 144#define Src2Mask (OpMask << Src2Shift)
145#define Mmx ((u64)1 << 40) /* MMX Vector instruction */
146#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
147#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */
148#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */
145 149
146#define X2(x...) x, x 150#define X2(x...) x, x
147#define X3(x...) X2(x), x 151#define X3(x...) X2(x), x
@@ -557,6 +561,29 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
557 ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); 561 ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
558} 562}
559 563
564/*
565 * x86 defines three classes of vector instructions: explicitly
566 * aligned, explicitly unaligned, and the rest, which change behaviour
567 * depending on whether they're AVX encoded or not.
568 *
569 * Also included is CMPXCHG16B which is not a vector instruction, yet it is
570 * subject to the same check.
571 */
572static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size)
573{
574 if (likely(size < 16))
575 return false;
576
577 if (ctxt->d & Aligned)
578 return true;
579 else if (ctxt->d & Unaligned)
580 return false;
581 else if (ctxt->d & Avx)
582 return false;
583 else
584 return true;
585}
586
560static int __linearize(struct x86_emulate_ctxt *ctxt, 587static int __linearize(struct x86_emulate_ctxt *ctxt,
561 struct segmented_address addr, 588 struct segmented_address addr,
562 unsigned size, bool write, bool fetch, 589 unsigned size, bool write, bool fetch,
@@ -621,6 +648,8 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
621 } 648 }
622 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8) 649 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
623 la &= (u32)-1; 650 la &= (u32)-1;
651 if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
652 return emulate_gp(ctxt, 0);
624 *linear = la; 653 *linear = la;
625 return X86EMUL_CONTINUE; 654 return X86EMUL_CONTINUE;
626bad: 655bad:
@@ -859,6 +888,40 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
859 ctxt->ops->put_fpu(ctxt); 888 ctxt->ops->put_fpu(ctxt);
860} 889}
861 890
891static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
892{
893 ctxt->ops->get_fpu(ctxt);
894 switch (reg) {
895 case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
896 case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
897 case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
898 case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
899 case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
900 case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
901 case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
902 case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
903 default: BUG();
904 }
905 ctxt->ops->put_fpu(ctxt);
906}
907
908static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
909{
910 ctxt->ops->get_fpu(ctxt);
911 switch (reg) {
912 case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
913 case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
914 case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
915 case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
916 case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
917 case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
918 case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
919 case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
920 default: BUG();
921 }
922 ctxt->ops->put_fpu(ctxt);
923}
924
862static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 925static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
863 struct operand *op) 926 struct operand *op)
864{ 927{
@@ -875,6 +938,13 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
875 read_sse_reg(ctxt, &op->vec_val, reg); 938 read_sse_reg(ctxt, &op->vec_val, reg);
876 return; 939 return;
877 } 940 }
941 if (ctxt->d & Mmx) {
942 reg &= 7;
943 op->type = OP_MM;
944 op->bytes = 8;
945 op->addr.mm = reg;
946 return;
947 }
878 948
879 op->type = OP_REG; 949 op->type = OP_REG;
880 if (ctxt->d & ByteOp) { 950 if (ctxt->d & ByteOp) {
@@ -902,7 +972,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
902 ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ 972 ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
903 } 973 }
904 974
905 ctxt->modrm = insn_fetch(u8, ctxt);
906 ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; 975 ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
907 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; 976 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
908 ctxt->modrm_rm |= (ctxt->modrm & 0x07); 977 ctxt->modrm_rm |= (ctxt->modrm & 0x07);
@@ -920,6 +989,12 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
920 read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); 989 read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
921 return rc; 990 return rc;
922 } 991 }
992 if (ctxt->d & Mmx) {
993 op->type = OP_MM;
994 op->bytes = 8;
995 op->addr.xmm = ctxt->modrm_rm & 7;
996 return rc;
997 }
923 fetch_register_operand(op); 998 fetch_register_operand(op);
924 return rc; 999 return rc;
925 } 1000 }
@@ -1387,6 +1462,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1387 case OP_XMM: 1462 case OP_XMM:
1388 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); 1463 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
1389 break; 1464 break;
1465 case OP_MM:
1466 write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm);
1467 break;
1390 case OP_NONE: 1468 case OP_NONE:
1391 /* no writeback */ 1469 /* no writeback */
1392 break; 1470 break;
@@ -2790,7 +2868,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
2790 2868
2791static int em_mov(struct x86_emulate_ctxt *ctxt) 2869static int em_mov(struct x86_emulate_ctxt *ctxt)
2792{ 2870{
2793 ctxt->dst.val = ctxt->src.val; 2871 memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
2794 return X86EMUL_CONTINUE; 2872 return X86EMUL_CONTINUE;
2795} 2873}
2796 2874
@@ -2870,12 +2948,6 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
2870 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); 2948 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
2871} 2949}
2872 2950
2873static int em_movdqu(struct x86_emulate_ctxt *ctxt)
2874{
2875 memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
2876 return X86EMUL_CONTINUE;
2877}
2878
2879static int em_invlpg(struct x86_emulate_ctxt *ctxt) 2951static int em_invlpg(struct x86_emulate_ctxt *ctxt)
2880{ 2952{
2881 int rc; 2953 int rc;
@@ -3061,35 +3133,13 @@ static int em_btc(struct x86_emulate_ctxt *ctxt)
3061 3133
3062static int em_bsf(struct x86_emulate_ctxt *ctxt) 3134static int em_bsf(struct x86_emulate_ctxt *ctxt)
3063{ 3135{
3064 u8 zf; 3136 emulate_2op_SrcV_nobyte(ctxt, "bsf");
3065
3066 __asm__ ("bsf %2, %0; setz %1"
3067 : "=r"(ctxt->dst.val), "=q"(zf)
3068 : "r"(ctxt->src.val));
3069
3070 ctxt->eflags &= ~X86_EFLAGS_ZF;
3071 if (zf) {
3072 ctxt->eflags |= X86_EFLAGS_ZF;
3073 /* Disable writeback. */
3074 ctxt->dst.type = OP_NONE;
3075 }
3076 return X86EMUL_CONTINUE; 3137 return X86EMUL_CONTINUE;
3077} 3138}
3078 3139
3079static int em_bsr(struct x86_emulate_ctxt *ctxt) 3140static int em_bsr(struct x86_emulate_ctxt *ctxt)
3080{ 3141{
3081 u8 zf; 3142 emulate_2op_SrcV_nobyte(ctxt, "bsr");
3082
3083 __asm__ ("bsr %2, %0; setz %1"
3084 : "=r"(ctxt->dst.val), "=q"(zf)
3085 : "r"(ctxt->src.val));
3086
3087 ctxt->eflags &= ~X86_EFLAGS_ZF;
3088 if (zf) {
3089 ctxt->eflags |= X86_EFLAGS_ZF;
3090 /* Disable writeback. */
3091 ctxt->dst.type = OP_NONE;
3092 }
3093 return X86EMUL_CONTINUE; 3143 return X86EMUL_CONTINUE;
3094} 3144}
3095 3145
@@ -3286,8 +3336,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3286 .check_perm = (_p) } 3336 .check_perm = (_p) }
3287#define N D(0) 3337#define N D(0)
3288#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } 3338#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
3289#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } 3339#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
3290#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) } 3340#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
3291#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } 3341#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
3292#define II(_f, _e, _i) \ 3342#define II(_f, _e, _i) \
3293 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } 3343 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
@@ -3307,25 +3357,25 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3307 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) 3357 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
3308 3358
3309static struct opcode group7_rm1[] = { 3359static struct opcode group7_rm1[] = {
3310 DI(SrcNone | ModRM | Priv, monitor), 3360 DI(SrcNone | Priv, monitor),
3311 DI(SrcNone | ModRM | Priv, mwait), 3361 DI(SrcNone | Priv, mwait),
3312 N, N, N, N, N, N, 3362 N, N, N, N, N, N,
3313}; 3363};
3314 3364
3315static struct opcode group7_rm3[] = { 3365static struct opcode group7_rm3[] = {
3316 DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), 3366 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
3317 II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall), 3367 II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall),
3318 DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), 3368 DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
3319 DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), 3369 DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
3320 DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), 3370 DIP(SrcNone | Prot | Priv, stgi, check_svme),
3321 DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme), 3371 DIP(SrcNone | Prot | Priv, clgi, check_svme),
3322 DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme), 3372 DIP(SrcNone | Prot | Priv, skinit, check_svme),
3323 DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), 3373 DIP(SrcNone | Prot | Priv, invlpga, check_svme),
3324}; 3374};
3325 3375
3326static struct opcode group7_rm7[] = { 3376static struct opcode group7_rm7[] = {
3327 N, 3377 N,
3328 DIP(SrcNone | ModRM, rdtscp, check_rdtsc), 3378 DIP(SrcNone, rdtscp, check_rdtsc),
3329 N, N, N, N, N, N, 3379 N, N, N, N, N, N,
3330}; 3380};
3331 3381
@@ -3341,81 +3391,86 @@ static struct opcode group1[] = {
3341}; 3391};
3342 3392
3343static struct opcode group1A[] = { 3393static struct opcode group1A[] = {
3344 I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N, 3394 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
3345}; 3395};
3346 3396
3347static struct opcode group3[] = { 3397static struct opcode group3[] = {
3348 I(DstMem | SrcImm | ModRM, em_test), 3398 I(DstMem | SrcImm, em_test),
3349 I(DstMem | SrcImm | ModRM, em_test), 3399 I(DstMem | SrcImm, em_test),
3350 I(DstMem | SrcNone | ModRM | Lock, em_not), 3400 I(DstMem | SrcNone | Lock, em_not),
3351 I(DstMem | SrcNone | ModRM | Lock, em_neg), 3401 I(DstMem | SrcNone | Lock, em_neg),
3352 I(SrcMem | ModRM, em_mul_ex), 3402 I(SrcMem, em_mul_ex),
3353 I(SrcMem | ModRM, em_imul_ex), 3403 I(SrcMem, em_imul_ex),
3354 I(SrcMem | ModRM, em_div_ex), 3404 I(SrcMem, em_div_ex),
3355 I(SrcMem | ModRM, em_idiv_ex), 3405 I(SrcMem, em_idiv_ex),
3356}; 3406};
3357 3407
3358static struct opcode group4[] = { 3408static struct opcode group4[] = {
3359 I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), 3409 I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
3360 I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), 3410 I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
3361 N, N, N, N, N, N, 3411 N, N, N, N, N, N,
3362}; 3412};
3363 3413
3364static struct opcode group5[] = { 3414static struct opcode group5[] = {
3365 I(DstMem | SrcNone | ModRM | Lock, em_grp45), 3415 I(DstMem | SrcNone | Lock, em_grp45),
3366 I(DstMem | SrcNone | ModRM | Lock, em_grp45), 3416 I(DstMem | SrcNone | Lock, em_grp45),
3367 I(SrcMem | ModRM | Stack, em_grp45), 3417 I(SrcMem | Stack, em_grp45),
3368 I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), 3418 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
3369 I(SrcMem | ModRM | Stack, em_grp45), 3419 I(SrcMem | Stack, em_grp45),
3370 I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45), 3420 I(SrcMemFAddr | ImplicitOps, em_grp45),
3371 I(SrcMem | ModRM | Stack, em_grp45), N, 3421 I(SrcMem | Stack, em_grp45), N,
3372}; 3422};
3373 3423
3374static struct opcode group6[] = { 3424static struct opcode group6[] = {
3375 DI(ModRM | Prot, sldt), 3425 DI(Prot, sldt),
3376 DI(ModRM | Prot, str), 3426 DI(Prot, str),
3377 DI(ModRM | Prot | Priv, lldt), 3427 DI(Prot | Priv, lldt),
3378 DI(ModRM | Prot | Priv, ltr), 3428 DI(Prot | Priv, ltr),
3379 N, N, N, N, 3429 N, N, N, N,
3380}; 3430};
3381 3431
3382static struct group_dual group7 = { { 3432static struct group_dual group7 = { {
3383 DI(ModRM | Mov | DstMem | Priv, sgdt), 3433 DI(Mov | DstMem | Priv, sgdt),
3384 DI(ModRM | Mov | DstMem | Priv, sidt), 3434 DI(Mov | DstMem | Priv, sidt),
3385 II(ModRM | SrcMem | Priv, em_lgdt, lgdt), 3435 II(SrcMem | Priv, em_lgdt, lgdt),
3386 II(ModRM | SrcMem | Priv, em_lidt, lidt), 3436 II(SrcMem | Priv, em_lidt, lidt),
3387 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, 3437 II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
3388 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), 3438 II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
3389 II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg), 3439 II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
3390}, { 3440}, {
3391 I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall), 3441 I(SrcNone | Priv | VendorSpecific, em_vmcall),
3392 EXT(0, group7_rm1), 3442 EXT(0, group7_rm1),
3393 N, EXT(0, group7_rm3), 3443 N, EXT(0, group7_rm3),
3394 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, 3444 II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
3395 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), 3445 II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
3446 EXT(0, group7_rm7),
3396} }; 3447} };
3397 3448
3398static struct opcode group8[] = { 3449static struct opcode group8[] = {
3399 N, N, N, N, 3450 N, N, N, N,
3400 I(DstMem | SrcImmByte | ModRM, em_bt), 3451 I(DstMem | SrcImmByte, em_bt),
3401 I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts), 3452 I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
3402 I(DstMem | SrcImmByte | ModRM | Lock, em_btr), 3453 I(DstMem | SrcImmByte | Lock, em_btr),
3403 I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc), 3454 I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
3404}; 3455};
3405 3456
3406static struct group_dual group9 = { { 3457static struct group_dual group9 = { {
3407 N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, 3458 N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
3408}, { 3459}, {
3409 N, N, N, N, N, N, N, N, 3460 N, N, N, N, N, N, N, N,
3410} }; 3461} };
3411 3462
3412static struct opcode group11[] = { 3463static struct opcode group11[] = {
3413 I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov), 3464 I(DstMem | SrcImm | Mov | PageTable, em_mov),
3414 X7(D(Undefined)), 3465 X7(D(Undefined)),
3415}; 3466};
3416 3467
3417static struct gprefix pfx_0f_6f_0f_7f = { 3468static struct gprefix pfx_0f_6f_0f_7f = {
3418 N, N, N, I(Sse, em_movdqu), 3469 I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
3470};
3471
3472static struct gprefix pfx_vmovntpx = {
3473 I(0, em_mov), N, N, N,
3419}; 3474};
3420 3475
3421static struct opcode opcode_table[256] = { 3476static struct opcode opcode_table[256] = {
@@ -3464,10 +3519,10 @@ static struct opcode opcode_table[256] = {
3464 /* 0x70 - 0x7F */ 3519 /* 0x70 - 0x7F */
3465 X16(D(SrcImmByte)), 3520 X16(D(SrcImmByte)),
3466 /* 0x80 - 0x87 */ 3521 /* 0x80 - 0x87 */
3467 G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), 3522 G(ByteOp | DstMem | SrcImm, group1),
3468 G(DstMem | SrcImm | ModRM | Group, group1), 3523 G(DstMem | SrcImm, group1),
3469 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), 3524 G(ByteOp | DstMem | SrcImm | No64, group1),
3470 G(DstMem | SrcImmByte | ModRM | Group, group1), 3525 G(DstMem | SrcImmByte, group1),
3471 I2bv(DstMem | SrcReg | ModRM, em_test), 3526 I2bv(DstMem | SrcReg | ModRM, em_test),
3472 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), 3527 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
3473 /* 0x88 - 0x8F */ 3528 /* 0x88 - 0x8F */
@@ -3549,7 +3604,8 @@ static struct opcode twobyte_table[256] = {
3549 IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), 3604 IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
3550 IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), 3605 IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
3551 N, N, N, N, 3606 N, N, N, N,
3552 N, N, N, N, N, N, N, N, 3607 N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
3608 N, N, N, N,
3553 /* 0x30 - 0x3F */ 3609 /* 0x30 - 0x3F */
3554 II(ImplicitOps | Priv, em_wrmsr, wrmsr), 3610 II(ImplicitOps | Priv, em_wrmsr, wrmsr),
3555 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), 3611 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
@@ -3897,17 +3953,16 @@ done_prefixes:
3897 } 3953 }
3898 ctxt->d = opcode.flags; 3954 ctxt->d = opcode.flags;
3899 3955
3956 if (ctxt->d & ModRM)
3957 ctxt->modrm = insn_fetch(u8, ctxt);
3958
3900 while (ctxt->d & GroupMask) { 3959 while (ctxt->d & GroupMask) {
3901 switch (ctxt->d & GroupMask) { 3960 switch (ctxt->d & GroupMask) {
3902 case Group: 3961 case Group:
3903 ctxt->modrm = insn_fetch(u8, ctxt);
3904 --ctxt->_eip;
3905 goffset = (ctxt->modrm >> 3) & 7; 3962 goffset = (ctxt->modrm >> 3) & 7;
3906 opcode = opcode.u.group[goffset]; 3963 opcode = opcode.u.group[goffset];
3907 break; 3964 break;
3908 case GroupDual: 3965 case GroupDual:
3909 ctxt->modrm = insn_fetch(u8, ctxt);
3910 --ctxt->_eip;
3911 goffset = (ctxt->modrm >> 3) & 7; 3966 goffset = (ctxt->modrm >> 3) & 7;
3912 if ((ctxt->modrm >> 6) == 3) 3967 if ((ctxt->modrm >> 6) == 3)
3913 opcode = opcode.u.gdual->mod3[goffset]; 3968 opcode = opcode.u.gdual->mod3[goffset];
@@ -3960,6 +4015,8 @@ done_prefixes:
3960 4015
3961 if (ctxt->d & Sse) 4016 if (ctxt->d & Sse)
3962 ctxt->op_bytes = 16; 4017 ctxt->op_bytes = 16;
4018 else if (ctxt->d & Mmx)
4019 ctxt->op_bytes = 8;
3963 4020
3964 /* ModRM and SIB bytes. */ 4021 /* ModRM and SIB bytes. */
3965 if (ctxt->d & ModRM) { 4022 if (ctxt->d & ModRM) {
@@ -4030,6 +4087,35 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
4030 return false; 4087 return false;
4031} 4088}
4032 4089
4090static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
4091{
4092 bool fault = false;
4093
4094 ctxt->ops->get_fpu(ctxt);
4095 asm volatile("1: fwait \n\t"
4096 "2: \n\t"
4097 ".pushsection .fixup,\"ax\" \n\t"
4098 "3: \n\t"
4099 "movb $1, %[fault] \n\t"
4100 "jmp 2b \n\t"
4101 ".popsection \n\t"
4102 _ASM_EXTABLE(1b, 3b)
4103 : [fault]"+qm"(fault));
4104 ctxt->ops->put_fpu(ctxt);
4105
4106 if (unlikely(fault))
4107 return emulate_exception(ctxt, MF_VECTOR, 0, false);
4108
4109 return X86EMUL_CONTINUE;
4110}
4111
4112static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
4113 struct operand *op)
4114{
4115 if (op->type == OP_MM)
4116 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
4117}
4118
4033int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) 4119int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4034{ 4120{
4035 struct x86_emulate_ops *ops = ctxt->ops; 4121 struct x86_emulate_ops *ops = ctxt->ops;
@@ -4054,18 +4140,31 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4054 goto done; 4140 goto done;
4055 } 4141 }
4056 4142
4057 if ((ctxt->d & Sse) 4143 if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
4058 && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) 4144 || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
4059 || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
4060 rc = emulate_ud(ctxt); 4145 rc = emulate_ud(ctxt);
4061 goto done; 4146 goto done;
4062 } 4147 }
4063 4148
4064 if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { 4149 if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
4065 rc = emulate_nm(ctxt); 4150 rc = emulate_nm(ctxt);
4066 goto done; 4151 goto done;
4067 } 4152 }
4068 4153
4154 if (ctxt->d & Mmx) {
4155 rc = flush_pending_x87_faults(ctxt);
4156 if (rc != X86EMUL_CONTINUE)
4157 goto done;
4158 /*
4159 * Now that we know the fpu is exception safe, we can fetch
4160 * operands from it.
4161 */
4162 fetch_possible_mmx_operand(ctxt, &ctxt->src);
4163 fetch_possible_mmx_operand(ctxt, &ctxt->src2);
4164 if (!(ctxt->d & Mov))
4165 fetch_possible_mmx_operand(ctxt, &ctxt->dst);
4166 }
4167
4069 if (unlikely(ctxt->guest_mode) && ctxt->intercept) { 4168 if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
4070 rc = emulator_check_intercept(ctxt, ctxt->intercept, 4169 rc = emulator_check_intercept(ctxt, ctxt->intercept,
4071 X86_ICPT_PRE_EXCEPT); 4170 X86_ICPT_PRE_EXCEPT);
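
Aside (not from the patch): the new insn_aligned()/__linearize() check encodes the SSE/AVX alignment rule — a 16-byte legacy-SSE memory operand must be naturally aligned unless the instruction is an explicitly unaligned form (MOVDQU) or VEX-encoded, while explicitly aligned forms (MOVDQA, MOVNTPS) are always checked. A standalone sketch of the same decision order, with made-up flag values, purely to show the precedence:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Flag bits mirroring the emulator's decode flags; the values are illustrative. */
#define FLAG_ALIGNED   (1ULL << 0)  /* e.g. MOVDQA, MOVNTPS */
#define FLAG_UNALIGNED (1ULL << 1)  /* e.g. MOVDQU */
#define FLAG_AVX       (1ULL << 2)  /* VEX-encoded operands */

/*
 * Same order as insn_aligned() in the patch: accesses narrower than 16 bytes
 * are never checked; explicit Aligned wins; Unaligned and Avx skip the check;
 * everything else (plain legacy SSE) must be naturally aligned.
 */
static bool needs_alignment_check(uint64_t flags, unsigned size)
{
        if (size < 16)
                return false;
        if (flags & FLAG_ALIGNED)
                return true;
        if (flags & (FLAG_UNALIGNED | FLAG_AVX))
                return false;
        return true;
}

int main(void)
{
        printf("movdqa : %d\n", needs_alignment_check(FLAG_ALIGNED, 16));   /* 1 */
        printf("movdqu : %d\n", needs_alignment_check(FLAG_UNALIGNED, 16)); /* 0 */
        printf("vex op : %d\n", needs_alignment_check(FLAG_AVX, 16));       /* 0 */
        printf("mmx 8B : %d\n", needs_alignment_check(0, 8));               /* 0 */
        return 0;
}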
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index d68f99df690c..adba28f88d1a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -34,7 +34,6 @@
34 34
35#include <linux/kvm_host.h> 35#include <linux/kvm_host.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/workqueue.h>
38 37
39#include "irq.h" 38#include "irq.h"
40#include "i8254.h" 39#include "i8254.h"
@@ -249,7 +248,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
249 /* in this case, we had multiple outstanding pit interrupts 248 /* in this case, we had multiple outstanding pit interrupts
250 * that we needed to inject. Reinject 249 * that we needed to inject. Reinject
251 */ 250 */
252 queue_work(ps->pit->wq, &ps->pit->expired); 251 queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
253 ps->irq_ack = 1; 252 ps->irq_ack = 1;
254 spin_unlock(&ps->inject_lock); 253 spin_unlock(&ps->inject_lock);
255} 254}
@@ -270,7 +269,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
270static void destroy_pit_timer(struct kvm_pit *pit) 269static void destroy_pit_timer(struct kvm_pit *pit)
271{ 270{
272 hrtimer_cancel(&pit->pit_state.pit_timer.timer); 271 hrtimer_cancel(&pit->pit_state.pit_timer.timer);
273 cancel_work_sync(&pit->expired); 272 flush_kthread_work(&pit->expired);
274} 273}
275 274
276static bool kpit_is_periodic(struct kvm_timer *ktimer) 275static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -284,7 +283,7 @@ static struct kvm_timer_ops kpit_ops = {
284 .is_periodic = kpit_is_periodic, 283 .is_periodic = kpit_is_periodic,
285}; 284};
286 285
287static void pit_do_work(struct work_struct *work) 286static void pit_do_work(struct kthread_work *work)
288{ 287{
289 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); 288 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
290 struct kvm *kvm = pit->kvm; 289 struct kvm *kvm = pit->kvm;
@@ -328,7 +327,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
328 327
329 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 328 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
330 atomic_inc(&ktimer->pending); 329 atomic_inc(&ktimer->pending);
331 queue_work(pt->wq, &pt->expired); 330 queue_kthread_work(&pt->worker, &pt->expired);
332 } 331 }
333 332
334 if (ktimer->t_ops->is_periodic(ktimer)) { 333 if (ktimer->t_ops->is_periodic(ktimer)) {
@@ -353,7 +352,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
353 352
354 /* TODO The new value only affected after the retriggered */ 353 /* TODO The new value only affected after the retriggered */
355 hrtimer_cancel(&pt->timer); 354 hrtimer_cancel(&pt->timer);
356 cancel_work_sync(&ps->pit->expired); 355 flush_kthread_work(&ps->pit->expired);
357 pt->period = interval; 356 pt->period = interval;
358 ps->is_periodic = is_period; 357 ps->is_periodic = is_period;
359 358
@@ -669,6 +668,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
669{ 668{
670 struct kvm_pit *pit; 669 struct kvm_pit *pit;
671 struct kvm_kpit_state *pit_state; 670 struct kvm_kpit_state *pit_state;
671 struct pid *pid;
672 pid_t pid_nr;
672 int ret; 673 int ret;
673 674
674 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); 675 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
@@ -685,14 +686,20 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
685 mutex_lock(&pit->pit_state.lock); 686 mutex_lock(&pit->pit_state.lock);
686 spin_lock_init(&pit->pit_state.inject_lock); 687 spin_lock_init(&pit->pit_state.inject_lock);
687 688
688 pit->wq = create_singlethread_workqueue("kvm-pit-wq"); 689 pid = get_pid(task_tgid(current));
689 if (!pit->wq) { 690 pid_nr = pid_vnr(pid);
691 put_pid(pid);
692
693 init_kthread_worker(&pit->worker);
694 pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
695 "kvm-pit/%d", pid_nr);
696 if (IS_ERR(pit->worker_task)) {
690 mutex_unlock(&pit->pit_state.lock); 697 mutex_unlock(&pit->pit_state.lock);
691 kvm_free_irq_source_id(kvm, pit->irq_source_id); 698 kvm_free_irq_source_id(kvm, pit->irq_source_id);
692 kfree(pit); 699 kfree(pit);
693 return NULL; 700 return NULL;
694 } 701 }
695 INIT_WORK(&pit->expired, pit_do_work); 702 init_kthread_work(&pit->expired, pit_do_work);
696 703
697 kvm->arch.vpit = pit; 704 kvm->arch.vpit = pit;
698 pit->kvm = kvm; 705 pit->kvm = kvm;
@@ -736,7 +743,7 @@ fail:
736 kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); 743 kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
737 kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); 744 kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
738 kvm_free_irq_source_id(kvm, pit->irq_source_id); 745 kvm_free_irq_source_id(kvm, pit->irq_source_id);
739 destroy_workqueue(pit->wq); 746 kthread_stop(pit->worker_task);
740 kfree(pit); 747 kfree(pit);
741 return NULL; 748 return NULL;
742} 749}
@@ -756,10 +763,10 @@ void kvm_free_pit(struct kvm *kvm)
756 mutex_lock(&kvm->arch.vpit->pit_state.lock); 763 mutex_lock(&kvm->arch.vpit->pit_state.lock);
757 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 764 timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
758 hrtimer_cancel(timer); 765 hrtimer_cancel(timer);
759 cancel_work_sync(&kvm->arch.vpit->expired); 766 flush_kthread_work(&kvm->arch.vpit->expired);
767 kthread_stop(kvm->arch.vpit->worker_task);
760 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); 768 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
761 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 769 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
762 destroy_workqueue(kvm->arch.vpit->wq);
763 kfree(kvm->arch.vpit); 770 kfree(kvm->arch.vpit);
764 } 771 }
765} 772}
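
Aside (not from the patch): the i8254.c changes replace the per-PIT workqueue with a dedicated kthread_worker, so the injection work runs in a thread of its own, named after the owning process. A minimal, illustrative module showing the same API sequence the PIT code now uses — the demo_* names are invented, and newer kernels rename these helpers (kthread_init_worker(), kthread_queue_work(), kthread_flush_work()):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>

static struct kthread_worker demo_worker;
static struct task_struct *demo_task;
static struct kthread_work demo_work;

static void demo_fn(struct kthread_work *work)
{
        pr_info("running in the dedicated worker thread\n");
}

static int __init demo_init(void)
{
        init_kthread_worker(&demo_worker);
        demo_task = kthread_run(kthread_worker_fn, &demo_worker, "demo-worker");
        if (IS_ERR(demo_task))
                return PTR_ERR(demo_task);

        init_kthread_work(&demo_work, demo_fn);
        queue_kthread_work(&demo_worker, &demo_work);  /* counterpart of queue_work() */
        flush_kthread_work(&demo_work);                /* counterpart of flush/cancel_work_sync() */
        return 0;
}

static void __exit demo_exit(void)
{
        kthread_stop(demo_task);        /* ends kthread_worker_fn()'s loop */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");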
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 51a97426e791..fdf40425ea1d 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -1,6 +1,8 @@
1#ifndef __I8254_H 1#ifndef __I8254_H
2#define __I8254_H 2#define __I8254_H
3 3
4#include <linux/kthread.h>
5
4#include "iodev.h" 6#include "iodev.h"
5 7
6struct kvm_kpit_channel_state { 8struct kvm_kpit_channel_state {
@@ -39,8 +41,9 @@ struct kvm_pit {
39 struct kvm_kpit_state pit_state; 41 struct kvm_kpit_state pit_state;
40 int irq_source_id; 42 int irq_source_id;
41 struct kvm_irq_mask_notifier mask_notifier; 43 struct kvm_irq_mask_notifier mask_notifier;
42 struct workqueue_struct *wq; 44 struct kthread_worker worker;
43 struct work_struct expired; 45 struct task_struct *worker_task;
46 struct kthread_work expired;
44}; 47};
45 48
46#define KVM_PIT_BASE_ADDRESS 0x40 49#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 858432287ab6..93c15743f1ee 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -92,6 +92,11 @@ static inline int apic_test_and_clear_vector(int vec, void *bitmap)
92 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 92 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
93} 93}
94 94
95static inline int apic_test_vector(int vec, void *bitmap)
96{
97 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
98}
99
95static inline void apic_set_vector(int vec, void *bitmap) 100static inline void apic_set_vector(int vec, void *bitmap)
96{ 101{
97 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 102 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -480,7 +485,6 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
480static void apic_set_eoi(struct kvm_lapic *apic) 485static void apic_set_eoi(struct kvm_lapic *apic)
481{ 486{
482 int vector = apic_find_highest_isr(apic); 487 int vector = apic_find_highest_isr(apic);
483 int trigger_mode;
484 /* 488 /*
485 * Not every write EOI will has corresponding ISR, 489 * Not every write EOI will has corresponding ISR,
486 * one example is when Kernel check timer on setup_IO_APIC 490 * one example is when Kernel check timer on setup_IO_APIC
@@ -491,12 +495,15 @@ static void apic_set_eoi(struct kvm_lapic *apic)
491 apic_clear_vector(vector, apic->regs + APIC_ISR); 495 apic_clear_vector(vector, apic->regs + APIC_ISR);
492 apic_update_ppr(apic); 496 apic_update_ppr(apic);
493 497
494 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) 498 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
495 trigger_mode = IOAPIC_LEVEL_TRIG; 499 kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
496 else 500 int trigger_mode;
497 trigger_mode = IOAPIC_EDGE_TRIG; 501 if (apic_test_vector(vector, apic->regs + APIC_TMR))
498 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) 502 trigger_mode = IOAPIC_LEVEL_TRIG;
503 else
504 trigger_mode = IOAPIC_EDGE_TRIG;
499 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 505 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
506 }
500 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 507 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
501} 508}
502 509
@@ -1081,6 +1088,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1081 apic_update_ppr(apic); 1088 apic_update_ppr(apic);
1082 1089
1083 vcpu->arch.apic_arb_prio = 0; 1090 vcpu->arch.apic_arb_prio = 0;
1091 vcpu->arch.apic_attention = 0;
1084 1092
1085 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" 1093 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
1086 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, 1094 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
@@ -1280,7 +1288,7 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1280 u32 data; 1288 u32 data;
1281 void *vapic; 1289 void *vapic;
1282 1290
1283 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) 1291 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1284 return; 1292 return;
1285 1293
1286 vapic = kmap_atomic(vcpu->arch.apic->vapic_page); 1294 vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
@@ -1297,7 +1305,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1297 struct kvm_lapic *apic; 1305 struct kvm_lapic *apic;
1298 void *vapic; 1306 void *vapic;
1299 1307
1300 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) 1308 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1301 return; 1309 return;
1302 1310
1303 apic = vcpu->arch.apic; 1311 apic = vcpu->arch.apic;
@@ -1317,10 +1325,11 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1317 1325
1318void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) 1326void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
1319{ 1327{
1320 if (!irqchip_in_kernel(vcpu->kvm))
1321 return;
1322
1323 vcpu->arch.apic->vapic_addr = vapic_addr; 1328 vcpu->arch.apic->vapic_addr = vapic_addr;
1329 if (vapic_addr)
1330 __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
1331 else
1332 __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
1324} 1333}
1325 1334
1326int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1335int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4cb164268846..72102e0ab7cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -135,8 +135,6 @@ module_param(dbg, bool, 0644);
135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
136 | PT64_NX_MASK) 136 | PT64_NX_MASK)
137 137
138#define PTE_LIST_EXT 4
139
140#define ACC_EXEC_MASK 1 138#define ACC_EXEC_MASK 1
141#define ACC_WRITE_MASK PT_WRITABLE_MASK 139#define ACC_WRITE_MASK PT_WRITABLE_MASK
142#define ACC_USER_MASK PT_USER_MASK 140#define ACC_USER_MASK PT_USER_MASK
@@ -151,6 +149,9 @@ module_param(dbg, bool, 0644);
151 149
152#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 150#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
153 151
152/* make pte_list_desc fit well in cache line */
153#define PTE_LIST_EXT 3
154
154struct pte_list_desc { 155struct pte_list_desc {
155 u64 *sptes[PTE_LIST_EXT]; 156 u64 *sptes[PTE_LIST_EXT];
156 struct pte_list_desc *more; 157 struct pte_list_desc *more;
@@ -550,19 +551,29 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
550 551
551static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) 552static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
552{ 553{
553 rcu_read_lock(); 554 /*
554 atomic_inc(&vcpu->kvm->arch.reader_counter); 555 * Prevent page table teardown by making any free-er wait during
555 556 * kvm_flush_remote_tlbs() IPI to all active vcpus.
556 /* Increase the counter before walking shadow page table */ 557 */
557 smp_mb__after_atomic_inc(); 558 local_irq_disable();
559 vcpu->mode = READING_SHADOW_PAGE_TABLES;
560 /*
561 * Make sure a following spte read is not reordered ahead of the write
562 * to vcpu->mode.
563 */
564 smp_mb();
558} 565}
559 566
560static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) 567static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
561{ 568{
562 /* Decrease the counter after walking shadow page table finished */ 569 /*
563 smp_mb__before_atomic_dec(); 570 * Make sure the write to vcpu->mode is not reordered in front of
564 atomic_dec(&vcpu->kvm->arch.reader_counter); 571 * reads to sptes. If it does, kvm_commit_zap_page() can see us
565 rcu_read_unlock(); 572 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
573 */
574 smp_mb();
575 vcpu->mode = OUTSIDE_GUEST_MODE;
576 local_irq_enable();
566} 577}
567 578
568static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 579static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -841,32 +852,6 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
841 return count; 852 return count;
842} 853}
843 854
844static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
845{
846 struct pte_list_desc *desc;
847 u64 *prev_spte;
848 int i;
849
850 if (!*pte_list)
851 return NULL;
852 else if (!(*pte_list & 1)) {
853 if (!spte)
854 return (u64 *)*pte_list;
855 return NULL;
856 }
857 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
858 prev_spte = NULL;
859 while (desc) {
860 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
861 if (prev_spte == spte)
862 return desc->sptes[i];
863 prev_spte = desc->sptes[i];
864 }
865 desc = desc->more;
866 }
867 return NULL;
868}
869
870static void 855static void
871pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, 856pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
872 int i, struct pte_list_desc *prev_desc) 857 int i, struct pte_list_desc *prev_desc)
@@ -987,11 +972,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
987 return pte_list_add(vcpu, spte, rmapp); 972 return pte_list_add(vcpu, spte, rmapp);
988} 973}
989 974
990static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
991{
992 return pte_list_next(rmapp, spte);
993}
994
995static void rmap_remove(struct kvm *kvm, u64 *spte) 975static void rmap_remove(struct kvm *kvm, u64 *spte)
996{ 976{
997 struct kvm_mmu_page *sp; 977 struct kvm_mmu_page *sp;
@@ -1004,106 +984,201 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
1004 pte_list_remove(spte, rmapp); 984 pte_list_remove(spte, rmapp);
1005} 985}
1006 986
987/*
988 * Used by the following functions to iterate through the sptes linked by a
989 * rmap. All fields are private and not assumed to be used outside.
990 */
991struct rmap_iterator {
992 /* private fields */
993 struct pte_list_desc *desc; /* holds the sptep if not NULL */
994 int pos; /* index of the sptep */
995};
996
997/*
998 * Iteration must be started by this function. This should also be used after
999 * removing/dropping sptes from the rmap link because in such cases the
1000 * information in the itererator may not be valid.
1001 *
1002 * Returns sptep if found, NULL otherwise.
1003 */
1004static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
1005{
1006 if (!rmap)
1007 return NULL;
1008
1009 if (!(rmap & 1)) {
1010 iter->desc = NULL;
1011 return (u64 *)rmap;
1012 }
1013
1014 iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
1015 iter->pos = 0;
1016 return iter->desc->sptes[iter->pos];
1017}
1018
1019/*
1020 * Must be used with a valid iterator: e.g. after rmap_get_first().
1021 *
1022 * Returns sptep if found, NULL otherwise.
1023 */
1024static u64 *rmap_get_next(struct rmap_iterator *iter)
1025{
1026 if (iter->desc) {
1027 if (iter->pos < PTE_LIST_EXT - 1) {
1028 u64 *sptep;
1029
1030 ++iter->pos;
1031 sptep = iter->desc->sptes[iter->pos];
1032 if (sptep)
1033 return sptep;
1034 }
1035
1036 iter->desc = iter->desc->more;
1037
1038 if (iter->desc) {
1039 iter->pos = 0;
1040 /* desc->sptes[0] cannot be NULL */
1041 return iter->desc->sptes[iter->pos];
1042 }
1043 }
1044
1045 return NULL;
1046}
1047
1007static void drop_spte(struct kvm *kvm, u64 *sptep) 1048static void drop_spte(struct kvm *kvm, u64 *sptep)
1008{ 1049{
1009 if (mmu_spte_clear_track_bits(sptep)) 1050 if (mmu_spte_clear_track_bits(sptep))
1010 rmap_remove(kvm, sptep); 1051 rmap_remove(kvm, sptep);
1011} 1052}
1012 1053
1013int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, 1054static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
1014 struct kvm_memory_slot *slot)
1015{ 1055{
1016 unsigned long *rmapp; 1056 u64 *sptep;
1017 u64 *spte; 1057 struct rmap_iterator iter;
1018 int i, write_protected = 0; 1058 int write_protected = 0;
1019 1059
1020 rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot); 1060 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1021 spte = rmap_next(rmapp, NULL); 1061 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1022 while (spte) { 1062 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1023 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1063
1024 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 1064 if (!is_writable_pte(*sptep)) {
1025 if (is_writable_pte(*spte)) { 1065 sptep = rmap_get_next(&iter);
1026 mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); 1066 continue;
1027 write_protected = 1;
1028 } 1067 }
1029 spte = rmap_next(rmapp, spte);
1030 }
1031 1068
1032 /* check for huge page mappings */ 1069 if (level == PT_PAGE_TABLE_LEVEL) {
1033 for (i = PT_DIRECTORY_LEVEL; 1070 mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
1034 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 1071 sptep = rmap_get_next(&iter);
1035 rmapp = __gfn_to_rmap(gfn, i, slot); 1072 } else {
1036 spte = rmap_next(rmapp, NULL); 1073 BUG_ON(!is_large_pte(*sptep));
1037 while (spte) { 1074 drop_spte(kvm, sptep);
1038 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1075 --kvm->stat.lpages;
1039 BUG_ON(!is_large_pte(*spte)); 1076 sptep = rmap_get_first(*rmapp, &iter);
1040 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
1041 if (is_writable_pte(*spte)) {
1042 drop_spte(kvm, spte);
1043 --kvm->stat.lpages;
1044 spte = NULL;
1045 write_protected = 1;
1046 }
1047 spte = rmap_next(rmapp, spte);
1048 } 1077 }
1078
1079 write_protected = 1;
1049 } 1080 }
1050 1081
1051 return write_protected; 1082 return write_protected;
1052} 1083}
1053 1084
1085/**
1086 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1087 * @kvm: kvm instance
1088 * @slot: slot to protect
1089 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1090 * @mask: indicates which pages we should protect
1091 *
1092 * Used when we do not need to care about huge page mappings: e.g. during dirty
1093 * logging we do not have any such mappings.
1094 */
1095void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1096 struct kvm_memory_slot *slot,
1097 gfn_t gfn_offset, unsigned long mask)
1098{
1099 unsigned long *rmapp;
1100
1101 while (mask) {
1102 rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
1103 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
1104
1105 /* clear the first set bit */
1106 mask &= mask - 1;
1107 }
1108}
1109
1054static int rmap_write_protect(struct kvm *kvm, u64 gfn) 1110static int rmap_write_protect(struct kvm *kvm, u64 gfn)
1055{ 1111{
1056 struct kvm_memory_slot *slot; 1112 struct kvm_memory_slot *slot;
1113 unsigned long *rmapp;
1114 int i;
1115 int write_protected = 0;
1057 1116
1058 slot = gfn_to_memslot(kvm, gfn); 1117 slot = gfn_to_memslot(kvm, gfn);
1059 return kvm_mmu_rmap_write_protect(kvm, gfn, slot); 1118
1119 for (i = PT_PAGE_TABLE_LEVEL;
1120 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
1121 rmapp = __gfn_to_rmap(gfn, i, slot);
1122 write_protected |= __rmap_write_protect(kvm, rmapp, i);
1123 }
1124
1125 return write_protected;
1060} 1126}
1061 1127
1062static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, 1128static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1063 unsigned long data) 1129 unsigned long data)
1064{ 1130{
1065 u64 *spte; 1131 u64 *sptep;
1132 struct rmap_iterator iter;
1066 int need_tlb_flush = 0; 1133 int need_tlb_flush = 0;
1067 1134
1068 while ((spte = rmap_next(rmapp, NULL))) { 1135 while ((sptep = rmap_get_first(*rmapp, &iter))) {
1069 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1136 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1070 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 1137 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
1071 drop_spte(kvm, spte); 1138
1139 drop_spte(kvm, sptep);
1072 need_tlb_flush = 1; 1140 need_tlb_flush = 1;
1073 } 1141 }
1142
1074 return need_tlb_flush; 1143 return need_tlb_flush;
1075} 1144}
1076 1145
1077static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, 1146static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1078 unsigned long data) 1147 unsigned long data)
1079{ 1148{
1149 u64 *sptep;
1150 struct rmap_iterator iter;
1080 int need_flush = 0; 1151 int need_flush = 0;
1081 u64 *spte, new_spte; 1152 u64 new_spte;
1082 pte_t *ptep = (pte_t *)data; 1153 pte_t *ptep = (pte_t *)data;
1083 pfn_t new_pfn; 1154 pfn_t new_pfn;
1084 1155
1085 WARN_ON(pte_huge(*ptep)); 1156 WARN_ON(pte_huge(*ptep));
1086 new_pfn = pte_pfn(*ptep); 1157 new_pfn = pte_pfn(*ptep);
1087 spte = rmap_next(rmapp, NULL); 1158
1088 while (spte) { 1159 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
1089 BUG_ON(!is_shadow_present_pte(*spte)); 1160 BUG_ON(!is_shadow_present_pte(*sptep));
1090 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 1161 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
1162
1091 need_flush = 1; 1163 need_flush = 1;
1164
1092 if (pte_write(*ptep)) { 1165 if (pte_write(*ptep)) {
1093 drop_spte(kvm, spte); 1166 drop_spte(kvm, sptep);
1094 spte = rmap_next(rmapp, NULL); 1167 sptep = rmap_get_first(*rmapp, &iter);
1095 } else { 1168 } else {
1096 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); 1169 new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1097 new_spte |= (u64)new_pfn << PAGE_SHIFT; 1170 new_spte |= (u64)new_pfn << PAGE_SHIFT;
1098 1171
1099 new_spte &= ~PT_WRITABLE_MASK; 1172 new_spte &= ~PT_WRITABLE_MASK;
1100 new_spte &= ~SPTE_HOST_WRITEABLE; 1173 new_spte &= ~SPTE_HOST_WRITEABLE;
1101 new_spte &= ~shadow_accessed_mask; 1174 new_spte &= ~shadow_accessed_mask;
1102 mmu_spte_clear_track_bits(spte); 1175
1103 mmu_spte_set(spte, new_spte); 1176 mmu_spte_clear_track_bits(sptep);
1104 spte = rmap_next(rmapp, spte); 1177 mmu_spte_set(sptep, new_spte);
1178 sptep = rmap_get_next(&iter);
1105 } 1179 }
1106 } 1180 }
1181
1107 if (need_flush) 1182 if (need_flush)
1108 kvm_flush_remote_tlbs(kvm); 1183 kvm_flush_remote_tlbs(kvm);
1109 1184
@@ -1162,7 +1237,8 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1162static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1237static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1163 unsigned long data) 1238 unsigned long data)
1164{ 1239{
1165 u64 *spte; 1240 u64 *sptep;
1241 struct rmap_iterator iter;
1166 int young = 0; 1242 int young = 0;
1167 1243
1168 /* 1244 /*
@@ -1175,25 +1251,24 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1175 if (!shadow_accessed_mask) 1251 if (!shadow_accessed_mask)
1176 return kvm_unmap_rmapp(kvm, rmapp, data); 1252 return kvm_unmap_rmapp(kvm, rmapp, data);
1177 1253
1178 spte = rmap_next(rmapp, NULL); 1254 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1179 while (spte) { 1255 sptep = rmap_get_next(&iter)) {
1180 int _young; 1256 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1181 u64 _spte = *spte; 1257
1182 BUG_ON(!(_spte & PT_PRESENT_MASK)); 1258 if (*sptep & PT_ACCESSED_MASK) {
1183 _young = _spte & PT_ACCESSED_MASK;
1184 if (_young) {
1185 young = 1; 1259 young = 1;
1186 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); 1260 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep);
1187 } 1261 }
1188 spte = rmap_next(rmapp, spte);
1189 } 1262 }
1263
1190 return young; 1264 return young;
1191} 1265}
1192 1266
1193static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1267static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1194 unsigned long data) 1268 unsigned long data)
1195{ 1269{
1196 u64 *spte; 1270 u64 *sptep;
1271 struct rmap_iterator iter;
1197 int young = 0; 1272 int young = 0;
1198 1273
1199 /* 1274 /*
@@ -1204,16 +1279,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1204 if (!shadow_accessed_mask) 1279 if (!shadow_accessed_mask)
1205 goto out; 1280 goto out;
1206 1281
1207 spte = rmap_next(rmapp, NULL); 1282 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1208 while (spte) { 1283 sptep = rmap_get_next(&iter)) {
1209 u64 _spte = *spte; 1284 BUG_ON(!(*sptep & PT_PRESENT_MASK));
1210 BUG_ON(!(_spte & PT_PRESENT_MASK)); 1285
1211 young = _spte & PT_ACCESSED_MASK; 1286 if (*sptep & PT_ACCESSED_MASK) {
1212 if (young) {
1213 young = 1; 1287 young = 1;
1214 break; 1288 break;
1215 } 1289 }
1216 spte = rmap_next(rmapp, spte);
1217 } 1290 }
1218out: 1291out:
1219 return young; 1292 return young;
@@ -1865,10 +1938,11 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1865 1938
1866static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 1939static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1867{ 1940{
1868 u64 *parent_pte; 1941 u64 *sptep;
1942 struct rmap_iterator iter;
1869 1943
1870 while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) 1944 while ((sptep = rmap_get_first(sp->parent_ptes, &iter)))
1871 drop_parent_pte(sp, parent_pte); 1945 drop_parent_pte(sp, sptep);
1872} 1946}
1873 1947
1874static int mmu_zap_unsync_children(struct kvm *kvm, 1948static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -1925,30 +1999,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1925 return ret; 1999 return ret;
1926} 2000}
1927 2001
1928static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
1929{
1930 struct kvm_mmu_page *sp;
1931
1932 list_for_each_entry(sp, invalid_list, link)
1933 kvm_mmu_isolate_page(sp);
1934}
1935
1936static void free_pages_rcu(struct rcu_head *head)
1937{
1938 struct kvm_mmu_page *next, *sp;
1939
1940 sp = container_of(head, struct kvm_mmu_page, rcu);
1941 while (sp) {
1942 if (!list_empty(&sp->link))
1943 next = list_first_entry(&sp->link,
1944 struct kvm_mmu_page, link);
1945 else
1946 next = NULL;
1947 kvm_mmu_free_page(sp);
1948 sp = next;
1949 }
1950}
1951
1952static void kvm_mmu_commit_zap_page(struct kvm *kvm, 2002static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1953 struct list_head *invalid_list) 2003 struct list_head *invalid_list)
1954{ 2004{
@@ -1957,17 +2007,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1957 if (list_empty(invalid_list)) 2007 if (list_empty(invalid_list))
1958 return; 2008 return;
1959 2009
1960 kvm_flush_remote_tlbs(kvm); 2010 /*
1961 2011 * wmb: make sure everyone sees our modifications to the page tables
1962 if (atomic_read(&kvm->arch.reader_counter)) { 2012 * rmb: make sure we see changes to vcpu->mode
1963 kvm_mmu_isolate_pages(invalid_list); 2013 */
1964 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 2014 smp_mb();
1965 list_del_init(invalid_list);
1966 2015
1967 trace_kvm_mmu_delay_free_pages(sp); 2016 /*
1968 call_rcu(&sp->rcu, free_pages_rcu); 2017 * Wait for all vcpus to exit guest mode and/or lockless shadow
1969 return; 2018 * page table walks.
1970 } 2019 */
2020 kvm_flush_remote_tlbs(kvm);
1971 2021
1972 do { 2022 do {
1973 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 2023 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
@@ -1975,7 +2025,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1975 kvm_mmu_isolate_page(sp); 2025 kvm_mmu_isolate_page(sp);
1976 kvm_mmu_free_page(sp); 2026 kvm_mmu_free_page(sp);
1977 } while (!list_empty(invalid_list)); 2027 } while (!list_empty(invalid_list));
1978
1979} 2028}
1980 2029
1981/* 2030/*
@@ -3554,7 +3603,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
3554 * Skip write-flooding detected for the sp whose level is 1, because 3603 * Skip write-flooding detected for the sp whose level is 1, because
3555 * it can become unsync, then the guest page is not write-protected. 3604 * it can become unsync, then the guest page is not write-protected.
3556 */ 3605 */
3557 if (sp->role.level == 1) 3606 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
3558 return false; 3607 return false;
3559 3608
3560 return ++sp->write_flooding_count >= 3; 3609 return ++sp->write_flooding_count >= 3;
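
Aside (not from the patch): kvm_mmu_write_protect_pt_masked() above visits only the pages whose bits are set in the dirty-log mask, using the classic "clear the lowest set bit" idiom. A user-space sketch of that loop, illustrative only:

#include <stdio.h>

int main(void)
{
        /* Pretend these pages within a 64-page block were reported dirty. */
        unsigned long mask = 0x2a5;     /* bits 0, 2, 5, 7, 9 */

        while (mask) {
                unsigned long offset = __builtin_ctzl(mask);   /* same role as __ffs() */
                printf("write-protect page at gfn_offset + %lu\n", offset);
                mask &= mask - 1;       /* clear the lowest set bit */
        }
        return 0;
}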
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 715da5a19a5b..7d7d0b9e23eb 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -192,7 +192,8 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
192{ 192{
193 struct kvm_memory_slot *slot; 193 struct kvm_memory_slot *slot;
194 unsigned long *rmapp; 194 unsigned long *rmapp;
195 u64 *spte; 195 u64 *sptep;
196 struct rmap_iterator iter;
196 197
197 if (sp->role.direct || sp->unsync || sp->role.invalid) 198 if (sp->role.direct || sp->unsync || sp->role.invalid)
198 return; 199 return;
@@ -200,13 +201,12 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
200 slot = gfn_to_memslot(kvm, sp->gfn); 201 slot = gfn_to_memslot(kvm, sp->gfn);
201 rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; 202 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
202 203
203 spte = rmap_next(rmapp, NULL); 204 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
204 while (spte) { 205 sptep = rmap_get_next(&iter)) {
205 if (is_writable_pte(*spte)) 206 if (is_writable_pte(*sptep))
206 audit_printk(kvm, "shadow page has writable " 207 audit_printk(kvm, "shadow page has writable "
207 "mappings: gfn %llx role %x\n", 208 "mappings: gfn %llx role %x\n",
208 sp->gfn, sp->role.word); 209 sp->gfn, sp->role.word);
209 spte = rmap_next(rmapp, spte);
210 } 210 }
211} 211}
212 212
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df5a70311be8..34f970937ef1 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -658,7 +658,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
658{ 658{
659 int offset = 0; 659 int offset = 0;
660 660
661 WARN_ON(sp->role.level != 1); 661 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
662 662
663 if (PTTYPE == 32) 663 if (PTTYPE == 32)
664 offset = sp->role.quadrant << PT64_LEVEL_BITS; 664 offset = sp->role.quadrant << PT64_LEVEL_BITS;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e334389e1c75..f75af406b268 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -22,6 +22,7 @@
22#include "x86.h" 22#include "x86.h"
23 23
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/mod_devicetable.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
27#include <linux/highmem.h> 28#include <linux/highmem.h>
@@ -42,6 +43,12 @@
42MODULE_AUTHOR("Qumranet"); 43MODULE_AUTHOR("Qumranet");
43MODULE_LICENSE("GPL"); 44MODULE_LICENSE("GPL");
44 45
46static const struct x86_cpu_id svm_cpu_id[] = {
47 X86_FEATURE_MATCH(X86_FEATURE_SVM),
48 {}
49};
50MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
51
45#define IOPM_ALLOC_ORDER 2 52#define IOPM_ALLOC_ORDER 2
46#define MSRPM_ALLOC_ORDER 1 53#define MSRPM_ALLOC_ORDER 1
47 54
@@ -3240,6 +3247,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
3240 svm_clear_vintr(svm); 3247 svm_clear_vintr(svm);
3241 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 3248 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3242 mark_dirty(svm->vmcb, VMCB_INTR); 3249 mark_dirty(svm->vmcb, VMCB_INTR);
3250 ++svm->vcpu.stat.irq_window_exits;
3243 /* 3251 /*
3244 * If the user space waits to inject interrupts, exit as soon as 3252 * If the user space waits to inject interrupts, exit as soon as
3245 * possible 3253 * possible
@@ -3247,7 +3255,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
3247 if (!irqchip_in_kernel(svm->vcpu.kvm) && 3255 if (!irqchip_in_kernel(svm->vcpu.kvm) &&
3248 kvm_run->request_interrupt_window && 3256 kvm_run->request_interrupt_window &&
3249 !kvm_cpu_has_interrupt(&svm->vcpu)) { 3257 !kvm_cpu_has_interrupt(&svm->vcpu)) {
3250 ++svm->vcpu.stat.irq_window_exits;
3251 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 3258 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3252 return 0; 3259 return 0;
3253 } 3260 }
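
Two independent svm.c changes are visible here: the irq_window_exits counter is now bumped on every interrupt-window intercept rather than only when exiting to userspace, and a CPU-feature device table is added so udev can autoload kvm-amd on SVM-capable processors. The autoload pattern generalizes to any CPU feature bit; a minimal sketch for a hypothetical module keyed on a different flag:

#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <asm/cpu_device_id.h>

/* Match any CPU advertising the AES-NI feature bit (illustrative only). */
static const struct x86_cpu_id example_cpu_ids[] = {
	X86_FEATURE_MATCH(X86_FEATURE_AES),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, example_cpu_ids);

static int __init example_init(void)
{
	/* Refuse to load if the running CPU lacks the feature. */
	if (!x86_match_cpu(example_cpu_ids))
		return -ENODEV;
	return 0;
}
module_init(example_init);
MODULE_LICENSE("GPL");
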
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4ff0ab9bc3c8..32eb58866292 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/moduleparam.h> 29#include <linux/moduleparam.h>
30#include <linux/mod_devicetable.h>
30#include <linux/ftrace_event.h> 31#include <linux/ftrace_event.h>
31#include <linux/slab.h> 32#include <linux/slab.h>
32#include <linux/tboot.h> 33#include <linux/tboot.h>
@@ -51,6 +52,12 @@
51MODULE_AUTHOR("Qumranet"); 52MODULE_AUTHOR("Qumranet");
52MODULE_LICENSE("GPL"); 53MODULE_LICENSE("GPL");
53 54
55static const struct x86_cpu_id vmx_cpu_id[] = {
56 X86_FEATURE_MATCH(X86_FEATURE_VMX),
57 {}
58};
59MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
60
54static bool __read_mostly enable_vpid = 1; 61static bool __read_mostly enable_vpid = 1;
55module_param_named(vpid, enable_vpid, bool, 0444); 62module_param_named(vpid, enable_vpid, bool, 0444);
56 63
@@ -386,6 +393,9 @@ struct vcpu_vmx {
386 struct { 393 struct {
387 int loaded; 394 int loaded;
388 u16 fs_sel, gs_sel, ldt_sel; 395 u16 fs_sel, gs_sel, ldt_sel;
396#ifdef CONFIG_X86_64
397 u16 ds_sel, es_sel;
398#endif
389 int gs_ldt_reload_needed; 399 int gs_ldt_reload_needed;
390 int fs_reload_needed; 400 int fs_reload_needed;
391 } host_state; 401 } host_state;
@@ -1411,6 +1421,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
1411 } 1421 }
1412 1422
1413#ifdef CONFIG_X86_64 1423#ifdef CONFIG_X86_64
1424 savesegment(ds, vmx->host_state.ds_sel);
1425 savesegment(es, vmx->host_state.es_sel);
1426#endif
1427
1428#ifdef CONFIG_X86_64
1414 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); 1429 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1415 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); 1430 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1416#else 1431#else
@@ -1450,6 +1465,19 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
1450 } 1465 }
1451 if (vmx->host_state.fs_reload_needed) 1466 if (vmx->host_state.fs_reload_needed)
1452 loadsegment(fs, vmx->host_state.fs_sel); 1467 loadsegment(fs, vmx->host_state.fs_sel);
1468#ifdef CONFIG_X86_64
1469 if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
1470 loadsegment(ds, vmx->host_state.ds_sel);
1471 loadsegment(es, vmx->host_state.es_sel);
1472 }
1473#else
1474 /*
1475 * The sysexit path does not restore ds/es, so we must set them to
1476 * a reasonable value ourselves.
1477 */
1478 loadsegment(ds, __USER_DS);
1479 loadsegment(es, __USER_DS);
1480#endif
1453 reload_tss(); 1481 reload_tss();
1454#ifdef CONFIG_X86_64 1482#ifdef CONFIG_X86_64
1455 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 1483 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
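
On 64-bit hosts the VMX code now records the host ds/es selectors at vcpu load and reloads them on the way back only if either was non-null; on 32-bit hosts they are unconditionally reset to __USER_DS because the sysexit return path does not restore them. A condensed view of the 64-bit save/restore pair added above (in the kernel these live in separate functions; the comments state the assumption that the VMCS host selectors are set to 0, as the next hunk does):

#ifdef CONFIG_X86_64
/* vcpu load: remember what the host had in ds/es (cheap register reads). */
savesegment(ds, vmx->host_state.ds_sel);
savesegment(es, vmx->host_state.es_sel);

/*
 * vcpu put: HOST_DS_SELECTOR/HOST_ES_SELECTOR in the VMCS are 0, so a VM
 * exit leaves ds/es null; reload only if the host was actually using
 * non-null selectors, which is the uncommon case.
 */
if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
	loadsegment(ds, vmx->host_state.ds_sel);
	loadsegment(es, vmx->host_state.es_sel);
}
#endif
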
@@ -3633,8 +3661,18 @@ static void vmx_set_constant_host_state(void)
3633 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 3661 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
3634 3662
3635 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 3663 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
3664#ifdef CONFIG_X86_64
3665 /*
3666 * Load null selectors, so we can avoid reloading them in
3667 * __vmx_load_host_state(), in case userspace uses the null selectors
3668 * too (the expected case).
3669 */
3670 vmcs_write16(HOST_DS_SELECTOR, 0);
3671 vmcs_write16(HOST_ES_SELECTOR, 0);
3672#else
3636 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3673 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3637 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3674 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3675#endif
3638 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3676 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3639 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 3677 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
3640 3678
@@ -6256,7 +6294,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6256 } 6294 }
6257 } 6295 }
6258 6296
6259 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
6260 vmx->loaded_vmcs->launched = 1; 6297 vmx->loaded_vmcs->launched = 1;
6261 6298
6262 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 6299 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
@@ -6343,7 +6380,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
6343 return &vmx->vcpu; 6380 return &vmx->vcpu;
6344 6381
6345free_vmcs: 6382free_vmcs:
6346 free_vmcs(vmx->loaded_vmcs->vmcs); 6383 free_loaded_vmcs(vmx->loaded_vmcs);
6347free_msrs: 6384free_msrs:
6348 kfree(vmx->guest_msrs); 6385 kfree(vmx->guest_msrs);
6349uninit_vcpu: 6386uninit_vcpu:
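
The last vmx.c hunk fixes the vcpu creation error path: freeing only the raw VMCS region with free_vmcs() left the loaded_vmcs bookkeeping (including its place on the per-CPU loaded-VMCS list) behind. free_loaded_vmcs() itself is not shown in this diff; as an assumption, its shape in this era is roughly:

/* Approximate shape only -- the real helper lives earlier in vmx.c. */
static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
	if (!loaded_vmcs->vmcs)
		return;
	loaded_vmcs_clear(loaded_vmcs);	/* VMCLEAR + unlink from per-CPU list */
	free_vmcs(loaded_vmcs->vmcs);	/* return the VMCS page */
	loaded_vmcs->vmcs = NULL;
}
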
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 185a2b823a2d..be6d54929fa7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2147,6 +2147,7 @@ int kvm_dev_ioctl_check_extension(long ext)
2147 case KVM_CAP_ASYNC_PF: 2147 case KVM_CAP_ASYNC_PF:
2148 case KVM_CAP_GET_TSC_KHZ: 2148 case KVM_CAP_GET_TSC_KHZ:
2149 case KVM_CAP_PCI_2_3: 2149 case KVM_CAP_PCI_2_3:
2150 case KVM_CAP_KVMCLOCK_CTRL:
2150 r = 1; 2151 r = 1;
2151 break; 2152 break;
2152 case KVM_CAP_COALESCED_MMIO: 2153 case KVM_CAP_COALESCED_MMIO:
@@ -2597,6 +2598,23 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
2597 return r; 2598 return r;
2598} 2599}
2599 2600
2601/*
2602 * kvm_set_guest_paused() indicates to the guest kernel that it has been
2603 * stopped by the hypervisor. This function will be called from the host only.
2604 * EINVAL is returned when the host attempts to set the flag for a guest that
2605 * does not support pv clocks.
2606 */
2607static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
2608{
2609 struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
2610 if (!vcpu->arch.time_page)
2611 return -EINVAL;
2612 src->flags |= PVCLOCK_GUEST_STOPPED;
2613 mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
2614 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2615 return 0;
2616}
2617
2600long kvm_arch_vcpu_ioctl(struct file *filp, 2618long kvm_arch_vcpu_ioctl(struct file *filp,
2601 unsigned int ioctl, unsigned long arg) 2619 unsigned int ioctl, unsigned long arg)
2602{ 2620{
@@ -2873,6 +2891,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2873 r = vcpu->arch.virtual_tsc_khz; 2891 r = vcpu->arch.virtual_tsc_khz;
2874 goto out; 2892 goto out;
2875 } 2893 }
2894 case KVM_KVMCLOCK_CTRL: {
2895 r = kvm_set_guest_paused(vcpu);
2896 goto out;
2897 }
2876 default: 2898 default:
2877 r = -EINVAL; 2899 r = -EINVAL;
2878 } 2900 }
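
KVM_KVMCLOCK_CTRL is a new per-vcpu ioctl: userspace calls it after it has deliberately kept the guest stopped (for example across a debugger pause or a migration brownout) so that kvm_set_guest_paused() can set PVCLOCK_GUEST_STOPPED and the guest's soft-lockup/watchdog code can ignore the apparent stall. A minimal usage sketch from the VMM side, assuming vcpu_fd is an open vcpu file descriptor:

#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <errno.h>
#include <stdio.h>

/* Tell one vcpu that the guest was deliberately paused by the host. */
static void notify_guest_paused(int vcpu_fd)
{
	if (ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0) < 0) {
		if (errno == EINVAL)
			fprintf(stderr, "guest has no pvclock page registered\n");
		else
			perror("KVM_KVMCLOCK_CTRL");
	}
}
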
@@ -3045,57 +3067,32 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3045} 3067}
3046 3068
3047/** 3069/**
3048 * write_protect_slot - write protect a slot for dirty logging 3070 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
3049 * @kvm: the kvm instance 3071 * @kvm: kvm instance
3050 * @memslot: the slot we protect 3072 * @log: slot id and address to which we copy the log
3051 * @dirty_bitmap: the bitmap indicating which pages are dirty
3052 * @nr_dirty_pages: the number of dirty pages
3053 * 3073 *
3054 * We have two ways to find all sptes to protect: 3074 * We need to keep it in mind that VCPU threads can write to the bitmap
3055 * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and 3075 * concurrently. So, to avoid losing data, we keep the following order for
3056 * checks ones that have a spte mapping a page in the slot. 3076 * each bit:
3057 * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
3058 * 3077 *
3059 * Generally speaking, if there are not so many dirty pages compared to the 3078 * 1. Take a snapshot of the bit and clear it if needed.
3060 * number of shadow pages, we should use the latter. 3079 * 2. Write protect the corresponding page.
3080 * 3. Flush TLB's if needed.
3081 * 4. Copy the snapshot to the userspace.
3061 * 3082 *
3062 * Note that letting others write into a page marked dirty in the old bitmap 3083 * Between 2 and 3, the guest may write to the page using the remaining TLB
3063 * by using the remaining tlb entry is not a problem. That page will become 3084 * entry. This is not a problem because the page will be reported dirty at
3064 * write protected again when we flush the tlb and then be reported dirty to 3085 * step 4 using the snapshot taken before and step 3 ensures that successive
3065 * the user space by copying the old bitmap. 3086 * writes will be logged for the next call.
3066 */
3067static void write_protect_slot(struct kvm *kvm,
3068 struct kvm_memory_slot *memslot,
3069 unsigned long *dirty_bitmap,
3070 unsigned long nr_dirty_pages)
3071{
3072 spin_lock(&kvm->mmu_lock);
3073
3074 /* Not many dirty pages compared to # of shadow pages. */
3075 if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
3076 unsigned long gfn_offset;
3077
3078 for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
3079 unsigned long gfn = memslot->base_gfn + gfn_offset;
3080
3081 kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
3082 }
3083 kvm_flush_remote_tlbs(kvm);
3084 } else
3085 kvm_mmu_slot_remove_write_access(kvm, memslot->id);
3086
3087 spin_unlock(&kvm->mmu_lock);
3088}
3089
3090/*
3091 * Get (and clear) the dirty memory log for a memory slot.
3092 */ 3087 */
3093int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 3088int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3094 struct kvm_dirty_log *log)
3095{ 3089{
3096 int r; 3090 int r;
3097 struct kvm_memory_slot *memslot; 3091 struct kvm_memory_slot *memslot;
3098 unsigned long n, nr_dirty_pages; 3092 unsigned long n, i;
3093 unsigned long *dirty_bitmap;
3094 unsigned long *dirty_bitmap_buffer;
3095 bool is_dirty = false;
3099 3096
3100 mutex_lock(&kvm->slots_lock); 3097 mutex_lock(&kvm->slots_lock);
3101 3098
@@ -3104,49 +3101,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3104 goto out; 3101 goto out;
3105 3102
3106 memslot = id_to_memslot(kvm->memslots, log->slot); 3103 memslot = id_to_memslot(kvm->memslots, log->slot);
3104
3105 dirty_bitmap = memslot->dirty_bitmap;
3107 r = -ENOENT; 3106 r = -ENOENT;
3108 if (!memslot->dirty_bitmap) 3107 if (!dirty_bitmap)
3109 goto out; 3108 goto out;
3110 3109
3111 n = kvm_dirty_bitmap_bytes(memslot); 3110 n = kvm_dirty_bitmap_bytes(memslot);
3112 nr_dirty_pages = memslot->nr_dirty_pages;
3113 3111
3114 /* If nothing is dirty, don't bother messing with page tables. */ 3112 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
3115 if (nr_dirty_pages) { 3113 memset(dirty_bitmap_buffer, 0, n);
3116 struct kvm_memslots *slots, *old_slots;
3117 unsigned long *dirty_bitmap, *dirty_bitmap_head;
3118 3114
3119 dirty_bitmap = memslot->dirty_bitmap; 3115 spin_lock(&kvm->mmu_lock);
3120 dirty_bitmap_head = memslot->dirty_bitmap_head;
3121 if (dirty_bitmap == dirty_bitmap_head)
3122 dirty_bitmap_head += n / sizeof(long);
3123 memset(dirty_bitmap_head, 0, n);
3124 3116
3125 r = -ENOMEM; 3117 for (i = 0; i < n / sizeof(long); i++) {
3126 slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL); 3118 unsigned long mask;
3127 if (!slots) 3119 gfn_t offset;
3128 goto out;
3129 3120
3130 memslot = id_to_memslot(slots, log->slot); 3121 if (!dirty_bitmap[i])
3131 memslot->nr_dirty_pages = 0; 3122 continue;
3132 memslot->dirty_bitmap = dirty_bitmap_head;
3133 update_memslots(slots, NULL);
3134 3123
3135 old_slots = kvm->memslots; 3124 is_dirty = true;
3136 rcu_assign_pointer(kvm->memslots, slots);
3137 synchronize_srcu_expedited(&kvm->srcu);
3138 kfree(old_slots);
3139 3125
3140 write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages); 3126 mask = xchg(&dirty_bitmap[i], 0);
3127 dirty_bitmap_buffer[i] = mask;
3141 3128
3142 r = -EFAULT; 3129 offset = i * BITS_PER_LONG;
3143 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) 3130 kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
3144 goto out;
3145 } else {
3146 r = -EFAULT;
3147 if (clear_user(log->dirty_bitmap, n))
3148 goto out;
3149 } 3131 }
3132 if (is_dirty)
3133 kvm_flush_remote_tlbs(kvm);
3134
3135 spin_unlock(&kvm->mmu_lock);
3136
3137 r = -EFAULT;
3138 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
3139 goto out;
3150 3140
3151 r = 0; 3141 r = 0;
3152out: 3142out:
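
The rewritten kvm_vm_ioctl_get_dirty_log() keeps a single mmu_lock critical section: for every non-zero word of the dirty bitmap it xchg()s the word to zero, stashes the snapshot in a second buffer placed right after the live bitmap, write-protects exactly the pages in that mask, flushes TLBs once if anything was dirty, and only then copies the snapshot out. The userspace interface is unchanged; a typical caller looks roughly like this sketch (vm_fd and the slot id are assumptions of the example):

#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <string.h>

/* Fetch-and-clear the dirty bitmap of one memslot; returns 0 on success. */
static int fetch_dirty_log(int vm_fd, int slot, void *bitmap)
{
	struct kvm_dirty_log log;

	memset(&log, 0, sizeof(log));
	log.slot = slot;
	log.dirty_bitmap = bitmap;	/* must hold one bit per guest page */

	return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}
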
@@ -3728,9 +3718,8 @@ struct read_write_emulator_ops {
3728static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) 3718static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
3729{ 3719{
3730 if (vcpu->mmio_read_completed) { 3720 if (vcpu->mmio_read_completed) {
3731 memcpy(val, vcpu->mmio_data, bytes);
3732 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 3721 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
3733 vcpu->mmio_phys_addr, *(u64 *)val); 3722 vcpu->mmio_fragments[0].gpa, *(u64 *)val);
3734 vcpu->mmio_read_completed = 0; 3723 vcpu->mmio_read_completed = 0;
3735 return 1; 3724 return 1;
3736 } 3725 }
@@ -3766,8 +3755,9 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
3766static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, 3755static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
3767 void *val, int bytes) 3756 void *val, int bytes)
3768{ 3757{
3769 memcpy(vcpu->mmio_data, val, bytes); 3758 struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
3770 memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); 3759
3760 memcpy(vcpu->run->mmio.data, frag->data, frag->len);
3771 return X86EMUL_CONTINUE; 3761 return X86EMUL_CONTINUE;
3772} 3762}
3773 3763
@@ -3794,10 +3784,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
3794 gpa_t gpa; 3784 gpa_t gpa;
3795 int handled, ret; 3785 int handled, ret;
3796 bool write = ops->write; 3786 bool write = ops->write;
3797 3787 struct kvm_mmio_fragment *frag;
3798 if (ops->read_write_prepare &&
3799 ops->read_write_prepare(vcpu, val, bytes))
3800 return X86EMUL_CONTINUE;
3801 3788
3802 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); 3789 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
3803 3790
@@ -3823,15 +3810,19 @@ mmio:
3823 bytes -= handled; 3810 bytes -= handled;
3824 val += handled; 3811 val += handled;
3825 3812
3826 vcpu->mmio_needed = 1; 3813 while (bytes) {
3827 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3814 unsigned now = min(bytes, 8U);
3828 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3829 vcpu->mmio_size = bytes;
3830 vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
3831 vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
3832 vcpu->mmio_index = 0;
3833 3815
3834 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); 3816 frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
3817 frag->gpa = gpa;
3818 frag->data = val;
3819 frag->len = now;
3820
3821 gpa += now;
3822 val += now;
3823 bytes -= now;
3824 }
3825 return X86EMUL_CONTINUE;
3835} 3826}
3836 3827
3837int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, 3828int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
@@ -3840,10 +3831,18 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
3840 struct read_write_emulator_ops *ops) 3831 struct read_write_emulator_ops *ops)
3841{ 3832{
3842 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3833 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3834 gpa_t gpa;
3835 int rc;
3836
3837 if (ops->read_write_prepare &&
3838 ops->read_write_prepare(vcpu, val, bytes))
3839 return X86EMUL_CONTINUE;
3840
3841 vcpu->mmio_nr_fragments = 0;
3843 3842
3844 /* Crossing a page boundary? */ 3843 /* Crossing a page boundary? */
3845 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3844 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
3846 int rc, now; 3845 int now;
3847 3846
3848 now = -addr & ~PAGE_MASK; 3847 now = -addr & ~PAGE_MASK;
3849 rc = emulator_read_write_onepage(addr, val, now, exception, 3848 rc = emulator_read_write_onepage(addr, val, now, exception,
@@ -3856,8 +3855,25 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
3856 bytes -= now; 3855 bytes -= now;
3857 } 3856 }
3858 3857
3859 return emulator_read_write_onepage(addr, val, bytes, exception, 3858 rc = emulator_read_write_onepage(addr, val, bytes, exception,
3860 vcpu, ops); 3859 vcpu, ops);
3860 if (rc != X86EMUL_CONTINUE)
3861 return rc;
3862
3863 if (!vcpu->mmio_nr_fragments)
3864 return rc;
3865
3866 gpa = vcpu->mmio_fragments[0].gpa;
3867
3868 vcpu->mmio_needed = 1;
3869 vcpu->mmio_cur_fragment = 0;
3870
3871 vcpu->run->mmio.len = vcpu->mmio_fragments[0].len;
3872 vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
3873 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3874 vcpu->run->mmio.phys_addr = gpa;
3875
3876 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
3861} 3877}
3862 3878
3863static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, 3879static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
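
emulator_read_write_onepage() no longer fills in the kvm_run area itself; it just appends (gpa, data, len) fragments capped at 8 bytes each, and emulator_read_write() turns the first fragment into the KVM_EXIT_MMIO exit. The splitting step is simple enough to show in isolation; this sketch mirrors the while (bytes) loop shown above, with a local fragment type so it compiles stand-alone (the names are illustrative, not the kernel's):

#include <stdint.h>

struct frag { uint64_t gpa; void *data; unsigned len; };

/* Split an unhandled MMIO access into <=8-byte fragments, as the hunk does. */
static unsigned split_mmio(uint64_t gpa, void *val, unsigned bytes,
			   struct frag *frags)
{
	unsigned n = 0;

	while (bytes) {
		unsigned now = bytes < 8 ? bytes : 8;

		frags[n].gpa = gpa;
		frags[n].data = val;
		frags[n].len = now;
		n++;

		gpa += now;
		val = (uint8_t *)val + now;
		bytes -= now;
	}
	return n;	/* e.g. a 16-byte access yields two 8-byte fragments */
}
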
@@ -5263,10 +5279,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5263 kvm_deliver_pmi(vcpu); 5279 kvm_deliver_pmi(vcpu);
5264 } 5280 }
5265 5281
5266 r = kvm_mmu_reload(vcpu);
5267 if (unlikely(r))
5268 goto out;
5269
5270 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5282 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5271 inject_pending_event(vcpu); 5283 inject_pending_event(vcpu);
5272 5284
@@ -5282,6 +5294,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5282 } 5294 }
5283 } 5295 }
5284 5296
5297 r = kvm_mmu_reload(vcpu);
5298 if (unlikely(r)) {
5299 kvm_x86_ops->cancel_injection(vcpu);
5300 goto out;
5301 }
5302
5285 preempt_disable(); 5303 preempt_disable();
5286 5304
5287 kvm_x86_ops->prepare_guest_switch(vcpu); 5305 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -5456,33 +5474,55 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5456 return r; 5474 return r;
5457} 5475}
5458 5476
5477/*
5478 * Implements the following, as a state machine:
5479 *
5480 * read:
5481 * for each fragment
5482 * write gpa, len
5483 * exit
5484 * copy data
5485 * execute insn
5486 *
5487 * write:
5488 * for each fragment
5489 * write gpa, len
5490 * copy data
5491 * exit
5492 */
5459static int complete_mmio(struct kvm_vcpu *vcpu) 5493static int complete_mmio(struct kvm_vcpu *vcpu)
5460{ 5494{
5461 struct kvm_run *run = vcpu->run; 5495 struct kvm_run *run = vcpu->run;
5496 struct kvm_mmio_fragment *frag;
5462 int r; 5497 int r;
5463 5498
5464 if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) 5499 if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
5465 return 1; 5500 return 1;
5466 5501
5467 if (vcpu->mmio_needed) { 5502 if (vcpu->mmio_needed) {
5468 vcpu->mmio_needed = 0; 5503 /* Complete previous fragment */
5504 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
5469 if (!vcpu->mmio_is_write) 5505 if (!vcpu->mmio_is_write)
5470 memcpy(vcpu->mmio_data + vcpu->mmio_index, 5506 memcpy(frag->data, run->mmio.data, frag->len);
5471 run->mmio.data, 8); 5507 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
5472 vcpu->mmio_index += 8; 5508 vcpu->mmio_needed = 0;
5473 if (vcpu->mmio_index < vcpu->mmio_size) { 5509 if (vcpu->mmio_is_write)
5474 run->exit_reason = KVM_EXIT_MMIO; 5510 return 1;
5475 run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; 5511 vcpu->mmio_read_completed = 1;
5476 memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); 5512 goto done;
5477 run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
5478 run->mmio.is_write = vcpu->mmio_is_write;
5479 vcpu->mmio_needed = 1;
5480 return 0;
5481 } 5513 }
5514 /* Initiate next fragment */
5515 ++frag;
5516 run->exit_reason = KVM_EXIT_MMIO;
5517 run->mmio.phys_addr = frag->gpa;
5482 if (vcpu->mmio_is_write) 5518 if (vcpu->mmio_is_write)
5483 return 1; 5519 memcpy(run->mmio.data, frag->data, frag->len);
5484 vcpu->mmio_read_completed = 1; 5520 run->mmio.len = frag->len;
5521 run->mmio.is_write = vcpu->mmio_is_write;
5522 return 0;
5523
5485 } 5524 }
5525done:
5486 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5526 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5487 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); 5527 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5488 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5528 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
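
With complete_mmio() walking mmio_fragments[] one entry per exit, userspace still sees one KVM_EXIT_MMIO per fragment and needs no changes. A sketch of the classic VMM dispatch loop this state machine is written against (handle_mmio() is a placeholder for the device model, not a KVM API):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Device-model hook (placeholder supplied by the VMM). */
void handle_mmio(__u64 gpa, void *data, __u32 len, int is_write);

/* One KVM_RUN round trip per MMIO fragment; read data flows back via run. */
static void run_vcpu(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			break;

		switch (run->exit_reason) {
		case KVM_EXIT_MMIO:
			/* For reads, run->mmio.data is consumed on the next KVM_RUN. */
			handle_mmio(run->mmio.phys_addr, run->mmio.data,
				    run->mmio.len, run->mmio.is_write);
			break;
		default:
			return;
		}
	}
}
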
@@ -6399,21 +6439,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6399 kvm_cpu_has_interrupt(vcpu)); 6439 kvm_cpu_has_interrupt(vcpu));
6400} 6440}
6401 6441
6402void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 6442int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
6403{ 6443{
6404 int me; 6444 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
6405 int cpu = vcpu->cpu;
6406
6407 if (waitqueue_active(&vcpu->wq)) {
6408 wake_up_interruptible(&vcpu->wq);
6409 ++vcpu->stat.halt_wakeup;
6410 }
6411
6412 me = get_cpu();
6413 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
6414 if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
6415 smp_send_reschedule(cpu);
6416 put_cpu();
6417} 6445}
6418 6446
6419int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) 6447int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
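
The x86-specific kvm_vcpu_kick() body is gone; only the architecture-dependent predicate kvm_arch_vcpu_should_kick() remains, with the wake-up/IPI mechanics presumably moving to common code. Judging from the deleted lines, the generic helper plausibly ends up shaped like this (a reconstruction, not quoted from virt/kvm):

void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int me;
	int cpu = vcpu->cpu;

	/* Wake a vcpu thread sleeping in HLT emulation. */
	if (waitqueue_active(&vcpu->wq)) {
		wake_up_interruptible(&vcpu->wq);
		++vcpu->stat.halt_wakeup;
	}

	/* Send an IPI only if the arch says the vcpu is really in guest mode. */
	me = get_cpu();
	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
		if (kvm_arch_vcpu_should_kick(vcpu))
			smp_send_reschedule(cpu);
	put_cpu();
}
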
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cb80c293cdd8..3d1134ddb885 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -64,7 +64,7 @@ static inline int is_pse(struct kvm_vcpu *vcpu)
64 64
65static inline int is_paging(struct kvm_vcpu *vcpu) 65static inline int is_paging(struct kvm_vcpu *vcpu)
66{ 66{
67 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 67 return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
68} 68}
69 69
70static inline u32 bit(int bitno) 70static inline u32 bit(int bitno)