Diffstat (limited to 'arch/x86/kvm')

 arch/x86/kvm/Kconfig       |   1
 arch/x86/kvm/cpuid.c       |   5
 arch/x86/kvm/emulate.c     | 293
 arch/x86/kvm/i8254.c       |  31
 arch/x86/kvm/i8254.h       |   7
 arch/x86/kvm/lapic.c       |  31
 arch/x86/kvm/mmu.c         | 345
 arch/x86/kvm/mmu_audit.c   |  10
 arch/x86/kvm/paging_tmpl.h |   2
 arch/x86/kvm/svm.c         |   9
 arch/x86/kvm/vmx.c         |  41
 arch/x86/kvm/x86.c         | 280
 arch/x86/kvm/x86.h         |   2
 13 files changed, 649 insertions(+), 408 deletions(-)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 1a7fe868f375..a28f338843ea 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -36,6 +36,7 @@ config KVM
 	select TASKSTATS
 	select TASK_DELAY_ACCT
 	select PERF_EVENTS
+	select HAVE_KVM_MSI
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9fed5bedaad6..7df1c6d839fb 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -247,7 +247,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 7.0.ebx */
 	const u32 kvm_supported_word9_x86_features =
-		F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS);
+		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+		F(BMI2) | F(ERMS) | F(RTM);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
@@ -397,7 +398,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	case KVM_CPUID_SIGNATURE: {
 		char signature[12] = "KVMKVMKVM\0\0";
 		u32 *sigptr = (u32 *)signature;
-		entry->eax = 0;
+		entry->eax = KVM_CPUID_FEATURES;
 		entry->ebx = sigptr[0];
 		entry->ecx = sigptr[1];
 		entry->edx = sigptr[2];
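With the change above, eax of the KVM signature leaf reports the highest KVM CPUID leaf (KVM_CPUID_FEATURES, 0x40000001) instead of 0, matching how hypervisor signature leaves conventionally behave. A userspace sketch of how a guest probes this leaf; the 0x40000000 leaf number and the "KVMKVMKVM" register layout are standard for KVM, the rest is illustrative:

    #include <stdio.h>
    #include <string.h>
    #include <cpuid.h> /* GCC/Clang helper for the CPUID instruction */

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;
        char sig[13];

        /* KVM places its signature at the 0x40000000 hypervisor leaf. */
        __cpuid(0x40000000, eax, ebx, ecx, edx);

        memcpy(sig + 0, &ebx, 4);
        memcpy(sig + 4, &ecx, 4);
        memcpy(sig + 8, &edx, 4);
        sig[12] = '\0';

        /* After this patch, eax reports the highest KVM leaf
         * (KVM_CPUID_FEATURES == 0x40000001) rather than 0. */
        printf("hypervisor: %s, max leaf: 0x%x\n", sig, eax);
        return strcmp(sig, "KVMKVMKVM") ? 1 : 0;
    }

A guest that finds the KVM signature can then walk up to the leaf reported in eax for the paravirtual feature bits.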
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 83756223f8aa..f95d242ee9f7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -142,6 +142,10 @@
 #define Src2FS      (OpFS << Src2Shift)
 #define Src2GS      (OpGS << Src2Shift)
 #define Src2Mask    (OpMask << Src2Shift)
+#define Mmx         ((u64)1 << 40)  /* MMX Vector instruction */
+#define Aligned     ((u64)1 << 41)  /* Explicitly aligned (e.g. MOVDQA) */
+#define Unaligned   ((u64)1 << 42)  /* Explicitly unaligned (e.g. MOVDQU) */
+#define Avx         ((u64)1 << 43)  /* Advanced Vector Extensions */
 
 #define X2(x...) x, x
 #define X3(x...) X2(x), x
@@ -557,6 +561,29 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
 	ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
 }
 
+/*
+ * x86 defines three classes of vector instructions: explicitly
+ * aligned, explicitly unaligned, and the rest, which change behaviour
+ * depending on whether they're AVX encoded or not.
+ *
+ * Also included is CMPXCHG16B which is not a vector instruction, yet it is
+ * subject to the same check.
+ */
+static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size)
+{
+	if (likely(size < 16))
+		return false;
+
+	if (ctxt->d & Aligned)
+		return true;
+	else if (ctxt->d & Unaligned)
+		return false;
+	else if (ctxt->d & Avx)
+		return false;
+	else
+		return true;
+}
+
 static int __linearize(struct x86_emulate_ctxt *ctxt,
 		       struct segmented_address addr,
 		       unsigned size, bool write, bool fetch,
@@ -621,6 +648,8 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 	}
 	if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
 		la &= (u32)-1;
+	if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
+		return emulate_gp(ctxt, 0);
 	*linear = la;
 	return X86EMUL_CONTINUE;
 bad:
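Taken together, insn_aligned() and the new check in __linearize() mean a 16-byte access faults with #GP(0) unless the instruction is explicitly unaligned or AVX-encoded. A standalone sketch of that decision; the bit positions mirror the flags added above, everything else is illustrative:

    /* Standalone sketch of the insn_aligned() decision. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define ALIGNED   ((uint64_t)1 << 41)
    #define UNALIGNED ((uint64_t)1 << 42)
    #define AVX       ((uint64_t)1 << 43)

    static bool insn_aligned(uint64_t flags, unsigned size)
    {
        if (size < 16)                 /* 8-byte MMX ops are never checked */
            return false;
        if (flags & ALIGNED)           /* e.g. MOVDQA: always checked */
            return true;
        if (flags & (UNALIGNED | AVX)) /* e.g. MOVDQU, or AVX-encoded */
            return false;
        return true;                   /* legacy SSE default: checked */
    }

    int main(void)
    {
        uint64_t la = 0x1008;          /* 16-byte access at an 8-byte boundary */
        unsigned size = 16;

        /* A MOVDQA-like access here would take #GP(0)... */
        printf("aligned-op faults: %d\n",
               insn_aligned(ALIGNED, size) && (la & (size - 1)) != 0);
        /* ...while a MOVDQU-like access would not. */
        printf("unaligned-op faults: %d\n",
               insn_aligned(UNALIGNED, size) && (la & (size - 1)) != 0);
        return 0;
    }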
@@ -859,6 +888,40 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 	ctxt->ops->put_fpu(ctxt);
 }
 
+static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+{
+	ctxt->ops->get_fpu(ctxt);
+	switch (reg) {
+	case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
+	case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
+	case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
+	case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
+	case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
+	case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
+	case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
+	case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
+	default: BUG();
+	}
+	ctxt->ops->put_fpu(ctxt);
+}
+
+static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+{
+	ctxt->ops->get_fpu(ctxt);
+	switch (reg) {
+	case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
+	case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
+	case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
+	case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
+	case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
+	case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
+	case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
+	case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
+	default: BUG();
+	}
+	ctxt->ops->put_fpu(ctxt);
+}
+
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 				    struct operand *op)
 {
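The switch statements above look redundant, but they are forced by the instruction encoding: an inline-asm operand cannot name an MMX register chosen at runtime, so each of mm0-mm7 needs its own movq. A hedged userspace analogue of the same pattern, assuming an x86 build with MMX; get_fpu/put_fpu have no analogue here:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t read_mm(int reg)
    {
        uint64_t data = 0;

        /* Register names must be fixed at compile time, hence one asm
         * statement per register. */
        switch (reg) {
        case 0: asm volatile("movq %%mm0, %0" : "=m"(data)); break;
        case 1: asm volatile("movq %%mm1, %0" : "=m"(data)); break;
        /* ...cases 2-7 follow the same shape... */
        }
        asm volatile("emms");  /* leave MMX so the x87 stack is usable again */
        return data;
    }

    int main(void)
    {
        uint64_t v = 0x1122334455667788ULL;

        asm volatile("movq %0, %%mm0" : : "m"(v));
        printf("mm0 = %#llx\n", (unsigned long long)read_mm(0));
        return 0;
    }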
@@ -875,6 +938,13 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 		read_sse_reg(ctxt, &op->vec_val, reg);
 		return;
 	}
+	if (ctxt->d & Mmx) {
+		reg &= 7;
+		op->type = OP_MM;
+		op->bytes = 8;
+		op->addr.mm = reg;
+		return;
+	}
 
 	op->type = OP_REG;
 	if (ctxt->d & ByteOp) {
@@ -902,7 +972,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 		ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
 	}
 
-	ctxt->modrm = insn_fetch(u8, ctxt);
 	ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
 	ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
 	ctxt->modrm_rm |= (ctxt->modrm & 0x07);
@@ -920,6 +989,12 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
 			return rc;
 		}
+		if (ctxt->d & Mmx) {
+			op->type = OP_MM;
+			op->bytes = 8;
+			op->addr.xmm = ctxt->modrm_rm & 7;
+			return rc;
+		}
 		fetch_register_operand(op);
 		return rc;
 	}
@@ -1387,6 +1462,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
 	case OP_XMM:
 		write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
 		break;
+	case OP_MM:
+		write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm);
+		break;
 	case OP_NONE:
 		/* no writeback */
 		break;
@@ -2790,7 +2868,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
 
 static int em_mov(struct x86_emulate_ctxt *ctxt)
 {
-	ctxt->dst.val = ctxt->src.val;
+	memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
 	return X86EMUL_CONTINUE;
 }
 
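Switching em_mov() from a u64 assignment to memcpy() over valptr is what lets one handler serve both GPR-sized moves and the 8/16-byte MMX/SSE moves routed to em_mov in the tables further down. A sketch of why the plain assignment was no longer enough; the union here is illustrative, not the kernel's exact operand layout:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    struct operand {
        unsigned bytes;              /* 1, 2, 4, 8 or 16 */
        union {
            uint64_t val;            /* GPR-sized payloads only */
            uint8_t  valptr[16];     /* full vector payload */
        };
    };

    static void mov(struct operand *dst, const struct operand *src,
                    unsigned op_bytes)
    {
        /* A "dst->val = src->val" here would silently drop the top
         * 8 bytes of a 16-byte SSE operand; byte copy works for all. */
        memcpy(dst->valptr, src->valptr, op_bytes);
    }

    int main(void)
    {
        struct operand src = { .bytes = 16 }, dst = { .bytes = 16 };

        memset(src.valptr, 0xab, 16);
        mov(&dst, &src, 16);
        printf("top byte copied: %#x\n", dst.valptr[15]);  /* 0xab */
        return 0;
    }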
@@ -2870,12 +2948,6 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
 	return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
 }
 
-static int em_movdqu(struct x86_emulate_ctxt *ctxt)
-{
-	memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
-	return X86EMUL_CONTINUE;
-}
-
 static int em_invlpg(struct x86_emulate_ctxt *ctxt)
 {
 	int rc;
@@ -3061,35 +3133,13 @@ static int em_btc(struct x86_emulate_ctxt *ctxt)
 
 static int em_bsf(struct x86_emulate_ctxt *ctxt)
 {
-	u8 zf;
-
-	__asm__ ("bsf %2, %0; setz %1"
-		 : "=r"(ctxt->dst.val), "=q"(zf)
-		 : "r"(ctxt->src.val));
-
-	ctxt->eflags &= ~X86_EFLAGS_ZF;
-	if (zf) {
-		ctxt->eflags |= X86_EFLAGS_ZF;
-		/* Disable writeback. */
-		ctxt->dst.type = OP_NONE;
-	}
+	emulate_2op_SrcV_nobyte(ctxt, "bsf");
 	return X86EMUL_CONTINUE;
 }
 
 static int em_bsr(struct x86_emulate_ctxt *ctxt)
 {
-	u8 zf;
-
-	__asm__ ("bsr %2, %0; setz %1"
-		 : "=r"(ctxt->dst.val), "=q"(zf)
-		 : "r"(ctxt->src.val));
-
-	ctxt->eflags &= ~X86_EFLAGS_ZF;
-	if (zf) {
-		ctxt->eflags |= X86_EFLAGS_ZF;
-		/* Disable writeback. */
-		ctxt->dst.type = OP_NONE;
-	}
+	emulate_2op_SrcV_nobyte(ctxt, "bsr");
 	return X86EMUL_CONTINUE;
 }
 
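Both helpers now defer to the emulate_2op_SrcV_nobyte() macro, which executes the instruction and captures EFLAGS wholesale instead of reassembling ZF by hand. The behaviour that must survive the rewrite, sketched in plain C; assumption: x86 host, with __builtin_ctzll standing in for the hardware scan:

    #include <stdint.h>
    #include <stdio.h>

    static int bsf(uint64_t src, uint64_t *dst)
    {
        if (src == 0)
            return 1;                  /* ZF=1, *dst left untouched */
        *dst = __builtin_ctzll(src);   /* index of lowest set bit */
        return 0;                      /* ZF=0 */
    }

    int main(void)
    {
        uint64_t d = 0xdead;
        int zf;

        zf = bsf(0, &d);
        printf("bsf(0)    -> zf=%d dst=%#llx\n", zf,
               (unsigned long long)d);   /* zf=1, dst still 0xdead */
        zf = bsf(0x48, &d);
        printf("bsf(0x48) -> zf=%d dst=%llu\n", zf,
               (unsigned long long)d);   /* zf=0, dst=3 */
        return 0;
    }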
@@ -3286,8 +3336,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 		.check_perm = (_p) }
 #define N    D(0)
 #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
-#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
-#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) }
+#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
 #define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
 #define II(_f, _e, _i) \
 	{ .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
@@ -3307,25 +3357,25 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 	I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
 
 static struct opcode group7_rm1[] = {
-	DI(SrcNone | ModRM | Priv, monitor),
-	DI(SrcNone | ModRM | Priv, mwait),
+	DI(SrcNone | Priv, monitor),
+	DI(SrcNone | Priv, mwait),
 	N, N, N, N, N, N,
 };
 
 static struct opcode group7_rm3[] = {
-	DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa),
-	II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall),
-	DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa),
-	DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa),
-	DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme),
-	DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme),
-	DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme),
-	DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme),
+	DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
+	II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall),
+	DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
+	DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
+	DIP(SrcNone | Prot | Priv, stgi, check_svme),
+	DIP(SrcNone | Prot | Priv, clgi, check_svme),
+	DIP(SrcNone | Prot | Priv, skinit, check_svme),
+	DIP(SrcNone | Prot | Priv, invlpga, check_svme),
 };
 
 static struct opcode group7_rm7[] = {
 	N,
-	DIP(SrcNone | ModRM, rdtscp, check_rdtsc),
+	DIP(SrcNone, rdtscp, check_rdtsc),
 	N, N, N, N, N, N,
 };
 
@@ -3341,81 +3391,86 @@ static struct opcode group1[] = {
 };
 
 static struct opcode group1A[] = {
-	I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N,
+	I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
 };
 
 static struct opcode group3[] = {
-	I(DstMem | SrcImm | ModRM, em_test),
-	I(DstMem | SrcImm | ModRM, em_test),
-	I(DstMem | SrcNone | ModRM | Lock, em_not),
-	I(DstMem | SrcNone | ModRM | Lock, em_neg),
-	I(SrcMem | ModRM, em_mul_ex),
-	I(SrcMem | ModRM, em_imul_ex),
-	I(SrcMem | ModRM, em_div_ex),
-	I(SrcMem | ModRM, em_idiv_ex),
+	I(DstMem | SrcImm, em_test),
+	I(DstMem | SrcImm, em_test),
+	I(DstMem | SrcNone | Lock, em_not),
+	I(DstMem | SrcNone | Lock, em_neg),
+	I(SrcMem, em_mul_ex),
+	I(SrcMem, em_imul_ex),
+	I(SrcMem, em_div_ex),
+	I(SrcMem, em_idiv_ex),
 };
 
 static struct opcode group4[] = {
-	I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
-	I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
+	I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
+	I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
 	N, N, N, N, N, N,
 };
 
 static struct opcode group5[] = {
-	I(DstMem | SrcNone | ModRM | Lock, em_grp45),
-	I(DstMem | SrcNone | ModRM | Lock, em_grp45),
-	I(SrcMem | ModRM | Stack, em_grp45),
-	I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
-	I(SrcMem | ModRM | Stack, em_grp45),
-	I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45),
-	I(SrcMem | ModRM | Stack, em_grp45), N,
+	I(DstMem | SrcNone | Lock, em_grp45),
+	I(DstMem | SrcNone | Lock, em_grp45),
+	I(SrcMem | Stack, em_grp45),
+	I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
+	I(SrcMem | Stack, em_grp45),
+	I(SrcMemFAddr | ImplicitOps, em_grp45),
+	I(SrcMem | Stack, em_grp45), N,
 };
 
 static struct opcode group6[] = {
-	DI(ModRM | Prot, sldt),
-	DI(ModRM | Prot, str),
-	DI(ModRM | Prot | Priv, lldt),
-	DI(ModRM | Prot | Priv, ltr),
+	DI(Prot, sldt),
+	DI(Prot, str),
+	DI(Prot | Priv, lldt),
+	DI(Prot | Priv, ltr),
 	N, N, N, N,
 };
 
 static struct group_dual group7 = { {
-	DI(ModRM | Mov | DstMem | Priv, sgdt),
-	DI(ModRM | Mov | DstMem | Priv, sidt),
-	II(ModRM | SrcMem | Priv, em_lgdt, lgdt),
-	II(ModRM | SrcMem | Priv, em_lidt, lidt),
-	II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
-	II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw),
-	II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
+	DI(Mov | DstMem | Priv, sgdt),
+	DI(Mov | DstMem | Priv, sidt),
+	II(SrcMem | Priv, em_lgdt, lgdt),
+	II(SrcMem | Priv, em_lidt, lidt),
+	II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+	II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+	II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
 }, {
-	I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall),
+	I(SrcNone | Priv | VendorSpecific, em_vmcall),
 	EXT(0, group7_rm1),
 	N, EXT(0, group7_rm3),
-	II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
-	II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7),
+	II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+	II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+	EXT(0, group7_rm7),
 } };
 
 static struct opcode group8[] = {
 	N, N, N, N,
-	I(DstMem | SrcImmByte | ModRM, em_bt),
-	I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts),
-	I(DstMem | SrcImmByte | ModRM | Lock, em_btr),
-	I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc),
+	I(DstMem | SrcImmByte, em_bt),
+	I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
+	I(DstMem | SrcImmByte | Lock, em_btr),
+	I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
 };
 
 static struct group_dual group9 = { {
-	N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
+	N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
 }, {
 	N, N, N, N, N, N, N, N,
 } };
 
 static struct opcode group11[] = {
-	I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov),
+	I(DstMem | SrcImm | Mov | PageTable, em_mov),
 	X7(D(Undefined)),
 };
 
 static struct gprefix pfx_0f_6f_0f_7f = {
-	N, N, N, I(Sse, em_movdqu),
+	I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
+};
+
+static struct gprefix pfx_vmovntpx = {
+	I(0, em_mov), N, N, N,
 };
 
 static struct opcode opcode_table[256] = {
@@ -3464,10 +3519,10 @@ static struct opcode opcode_table[256] = {
 	/* 0x70 - 0x7F */
 	X16(D(SrcImmByte)),
 	/* 0x80 - 0x87 */
-	G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
-	G(DstMem | SrcImm | ModRM | Group, group1),
-	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
-	G(DstMem | SrcImmByte | ModRM | Group, group1),
+	G(ByteOp | DstMem | SrcImm, group1),
+	G(DstMem | SrcImm, group1),
+	G(ByteOp | DstMem | SrcImm | No64, group1),
+	G(DstMem | SrcImmByte, group1),
 	I2bv(DstMem | SrcReg | ModRM, em_test),
 	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
 	/* 0x88 - 0x8F */
@@ -3549,7 +3604,8 @@ static struct opcode twobyte_table[256] = {
 	IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
 	IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
 	N, N, N, N,
-	N, N, N, N, N, N, N, N,
+	N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
+	N, N, N, N,
 	/* 0x30 - 0x3F */
 	II(ImplicitOps | Priv, em_wrmsr, wrmsr),
 	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
@@ -3897,17 +3953,16 @@ done_prefixes:
 	}
 	ctxt->d = opcode.flags;
 
+	if (ctxt->d & ModRM)
+		ctxt->modrm = insn_fetch(u8, ctxt);
+
 	while (ctxt->d & GroupMask) {
 		switch (ctxt->d & GroupMask) {
 		case Group:
-			ctxt->modrm = insn_fetch(u8, ctxt);
-			--ctxt->_eip;
 			goffset = (ctxt->modrm >> 3) & 7;
 			opcode = opcode.u.group[goffset];
 			break;
		case GroupDual:
-			ctxt->modrm = insn_fetch(u8, ctxt);
-			--ctxt->_eip;
 			goffset = (ctxt->modrm >> 3) & 7;
 			if ((ctxt->modrm >> 6) == 3)
 				opcode = opcode.u.gdual->mod3[goffset];
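Because G() and GD() now imply ModRM, the decoder can fetch the ModRM byte exactly once, before group resolution, instead of having each Group/GroupDual case fetch it and rewind _eip. A simplified sketch of the resulting control flow; the field names mirror the emulator's, but this is illustrative, not the kernel code:

    #include <stdint.h>
    #include <stdio.h>

    struct decode {
        const uint8_t *ip;
        uint8_t modrm;
    };

    static uint8_t fetch_u8(struct decode *d) { return *d->ip++; }

    static int decode_group(struct decode *d, int has_modrm)
    {
        if (has_modrm)
            d->modrm = fetch_u8(d);   /* fetched exactly once */

        /* Both the group index and a later mod==3 check reuse the cached
         * byte instead of re-fetching and rewinding the instruction
         * pointer. */
        int goffset = (d->modrm >> 3) & 7;
        int is_reg_form = (d->modrm >> 6) == 3;

        printf("goffset=%d reg-form=%d\n", goffset, is_reg_form);
        return goffset;
    }

    int main(void)
    {
        uint8_t insn[] = { 0xd8 };    /* mod=11 reg=011 rm=000 */
        struct decode d = { .ip = insn };

        decode_group(&d, 1);          /* -> goffset=3 reg-form=1 */
        return 0;
    }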
@@ -3960,6 +4015,8 @@ done_prefixes:
 
 	if (ctxt->d & Sse)
 		ctxt->op_bytes = 16;
+	else if (ctxt->d & Mmx)
+		ctxt->op_bytes = 8;
 
 	/* ModRM and SIB bytes. */
 	if (ctxt->d & ModRM) {
@@ -4030,6 +4087,35 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 	return false;
 }
 
+static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
+{
+	bool fault = false;
+
+	ctxt->ops->get_fpu(ctxt);
+	asm volatile("1: fwait \n\t"
+		     "2: \n\t"
+		     ".pushsection .fixup,\"ax\" \n\t"
+		     "3: \n\t"
+		     "movb $1, %[fault] \n\t"
+		     "jmp 2b \n\t"
+		     ".popsection \n\t"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [fault]"+qm"(fault));
+	ctxt->ops->put_fpu(ctxt);
+
+	if (unlikely(fault))
+		return emulate_exception(ctxt, MF_VECTOR, 0, false);
+
+	return X86EMUL_CONTINUE;
+}
+
+static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
+				       struct operand *op)
+{
+	if (op->type == OP_MM)
+		read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
+}
+
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -4054,18 +4140,31 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 		goto done;
 	}
 
-	if ((ctxt->d & Sse)
-	    && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
-		|| !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+	if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
+	    || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
-	if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+	if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
 		rc = emulate_nm(ctxt);
 		goto done;
 	}
 
+	if (ctxt->d & Mmx) {
+		rc = flush_pending_x87_faults(ctxt);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+		/*
+		 * Now that we know the fpu is exception safe, we can fetch
+		 * operands from it.
+		 */
+		fetch_possible_mmx_operand(ctxt, &ctxt->src);
+		fetch_possible_mmx_operand(ctxt, &ctxt->src2);
+		if (!(ctxt->d & Mov))
+			fetch_possible_mmx_operand(ctxt, &ctxt->dst);
+	}
+
 	if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
 		rc = emulator_check_intercept(ctxt, ctxt->intercept,
 					      X86_ICPT_PRE_EXCEPT);
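The fault ordering established here is: #UD if CR0.EM forbids the unit (or, for SSE only, CR4.OSFXSR is clear), then #NM if CR0.TS is set, and only then, for MMX, an fwait-style flush that can raise #MF before any operand is read from the FPU. A condensed sketch of that ladder; the constants and return codes are illustrative, not the kernel's:

    #include <stdbool.h>
    #include <stdio.h>

    enum { OK, UD, NM, MF };

    static int vector_insn_checks(bool is_sse, bool is_mmx,
                                  bool cr0_em, bool cr0_ts,
                                  bool cr4_osfxsr, bool x87_fault_pending)
    {
        /* CR0.EM forbids both MMX and SSE; CR4.OSFXSR gates SSE only. */
        if (((is_sse || is_mmx) && cr0_em) || (is_sse && !cr4_osfxsr))
            return UD;
        if ((is_sse || is_mmx) && cr0_ts)   /* lazy FPU switch pending */
            return NM;
        /* MMX only: the fwait-style flush may raise #MF before any
         * operand is fetched from the FPU. */
        if (is_mmx && x87_fault_pending)
            return MF;
        return OK;
    }

    int main(void)
    {
        printf("%d\n", vector_insn_checks(true, false, false, false,
                                          false, false));  /* UD: no OSFXSR */
        printf("%d\n", vector_insn_checks(false, true, false, false,
                                          false, true));   /* MF: pending fault */
        return 0;
    }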
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index d68f99df690c..adba28f88d1a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -34,7 +34,6 @@
 
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
-#include <linux/workqueue.h>
 
 #include "irq.h"
 #include "i8254.h"
@@ -249,7 +248,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 		/* in this case, we had multiple outstanding pit interrupts
 		 * that we needed to inject.  Reinject
 		 */
-		queue_work(ps->pit->wq, &ps->pit->expired);
+		queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
 	ps->irq_ack = 1;
 	spin_unlock(&ps->inject_lock);
 }
@@ -270,7 +269,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 static void destroy_pit_timer(struct kvm_pit *pit)
 {
 	hrtimer_cancel(&pit->pit_state.pit_timer.timer);
-	cancel_work_sync(&pit->expired);
+	flush_kthread_work(&pit->expired);
 }
 
 static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -284,7 +283,7 @@ static struct kvm_timer_ops kpit_ops = {
 	.is_periodic = kpit_is_periodic,
 };
 
-static void pit_do_work(struct work_struct *work)
+static void pit_do_work(struct kthread_work *work)
 {
 	struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
 	struct kvm *kvm = pit->kvm;
@@ -328,7 +327,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 
 	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
 		atomic_inc(&ktimer->pending);
-		queue_work(pt->wq, &pt->expired);
+		queue_kthread_work(&pt->worker, &pt->expired);
 	}
 
 	if (ktimer->t_ops->is_periodic(ktimer)) {
@@ -353,7 +352,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 
 	/* TODO The new value only affected after the retriggered */
 	hrtimer_cancel(&pt->timer);
-	cancel_work_sync(&ps->pit->expired);
+	flush_kthread_work(&ps->pit->expired);
 	pt->period = interval;
 	ps->is_periodic = is_period;
 
@@ -669,6 +668,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 {
 	struct kvm_pit *pit;
 	struct kvm_kpit_state *pit_state;
+	struct pid *pid;
+	pid_t pid_nr;
 	int ret;
 
 	pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
@@ -685,14 +686,20 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	mutex_lock(&pit->pit_state.lock);
 	spin_lock_init(&pit->pit_state.inject_lock);
 
-	pit->wq = create_singlethread_workqueue("kvm-pit-wq");
-	if (!pit->wq) {
+	pid = get_pid(task_tgid(current));
+	pid_nr = pid_vnr(pid);
+	put_pid(pid);
+
+	init_kthread_worker(&pit->worker);
+	pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
+				       "kvm-pit/%d", pid_nr);
+	if (IS_ERR(pit->worker_task)) {
 		mutex_unlock(&pit->pit_state.lock);
 		kvm_free_irq_source_id(kvm, pit->irq_source_id);
 		kfree(pit);
 		return NULL;
 	}
-	INIT_WORK(&pit->expired, pit_do_work);
+	init_kthread_work(&pit->expired, pit_do_work);
 
 	kvm->arch.vpit = pit;
 	pit->kvm = kvm;
@@ -736,7 +743,7 @@ fail:
 	kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
 	kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
 	kvm_free_irq_source_id(kvm, pit->irq_source_id);
-	destroy_workqueue(pit->wq);
+	kthread_stop(pit->worker_task);
 	kfree(pit);
 	return NULL;
 }
@@ -756,10 +763,10 @@ void kvm_free_pit(struct kvm *kvm)
 		mutex_lock(&kvm->arch.vpit->pit_state.lock);
 		timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
 		hrtimer_cancel(timer);
-		cancel_work_sync(&kvm->arch.vpit->expired);
+		flush_kthread_work(&kvm->arch.vpit->expired);
+		kthread_stop(kvm->arch.vpit->worker_task);
 		kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
 		mutex_unlock(&kvm->arch.vpit->pit_state.lock);
-		destroy_workqueue(kvm->arch.vpit->wq);
 		kfree(kvm->arch.vpit);
 	}
 }
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 51a97426e791..fdf40425ea1d 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -1,6 +1,8 @@
 #ifndef __I8254_H
 #define __I8254_H
 
+#include <linux/kthread.h>
+
 #include "iodev.h"
 
 struct kvm_kpit_channel_state {
@@ -39,8 +41,9 @@ struct kvm_pit {
 	struct kvm_kpit_state pit_state;
 	int irq_source_id;
 	struct kvm_irq_mask_notifier mask_notifier;
-	struct workqueue_struct *wq;
-	struct work_struct expired;
+	struct kthread_worker worker;
+	struct task_struct *worker_task;
+	struct kthread_work expired;
 };
 
 #define KVM_PIT_BASE_ADDRESS	0x40
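The conversion replaces the per-PIT workqueue with a dedicated kthread_worker, so interrupt reinjection runs in a real task (named kvm-pit/<pid> after the owning VM process) whose scheduling can be observed and tuned like any other thread. A minimal module-style sketch of the same API sequence, using the pre-rename helper names this patch uses (init_kthread_worker() and friends):

    #include <linux/kthread.h>
    #include <linux/module.h>

    static struct kthread_worker worker;
    static struct task_struct *worker_task;
    static struct kthread_work work;

    static void do_work(struct kthread_work *w)
    {
        pr_info("running in a dedicated, schedulable kernel thread\n");
    }

    static int __init demo_init(void)
    {
        init_kthread_worker(&worker);
        worker_task = kthread_run(kthread_worker_fn, &worker, "demo-worker");
        if (IS_ERR(worker_task))
            return PTR_ERR(worker_task);

        init_kthread_work(&work, do_work);
        queue_kthread_work(&worker, &work);  /* like the PIT's ->expired */
        flush_kthread_work(&work);           /* wait for completion */
        return 0;
    }

    static void __exit demo_exit(void)
    {
        kthread_stop(worker_task);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");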
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 858432287ab6..93c15743f1ee 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -92,6 +92,11 @@ static inline int apic_test_and_clear_vector(int vec, void *bitmap)
 	return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
 }
 
+static inline int apic_test_vector(int vec, void *bitmap)
+{
+	return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
 static inline void apic_set_vector(int vec, void *bitmap)
 {
 	set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -480,7 +485,6 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 static void apic_set_eoi(struct kvm_lapic *apic)
 {
 	int vector = apic_find_highest_isr(apic);
-	int trigger_mode;
 	/*
 	 * Not every write EOI will has corresponding ISR,
 	 * one example is when Kernel check timer on setup_IO_APIC
@@ -491,12 +495,15 @@ static void apic_set_eoi(struct kvm_lapic *apic)
 	apic_clear_vector(vector, apic->regs + APIC_ISR);
 	apic_update_ppr(apic);
 
-	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
-		trigger_mode = IOAPIC_LEVEL_TRIG;
-	else
-		trigger_mode = IOAPIC_EDGE_TRIG;
-	if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
+	if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+	    kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
+		int trigger_mode;
+		if (apic_test_vector(vector, apic->regs + APIC_TMR))
+			trigger_mode = IOAPIC_LEVEL_TRIG;
+		else
+			trigger_mode = IOAPIC_EDGE_TRIG;
 		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+	}
 	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
 }
 
@@ -1081,6 +1088,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 	apic_update_ppr(apic);
 
 	vcpu->arch.apic_arb_prio = 0;
+	vcpu->arch.apic_attention = 0;
 
 	apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
 		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
@@ -1280,7 +1288,7 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
 	u32 data;
 	void *vapic;
 
-	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
 		return;
 
 	vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
@@ -1297,7 +1305,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *apic;
 	void *vapic;
 
-	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
 		return;
 
 	apic = vcpu->arch.apic;
@@ -1317,10 +1325,11 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
 
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
 {
-	if (!irqchip_in_kernel(vcpu->kvm))
-		return;
-
 	vcpu->arch.apic->vapic_addr = vapic_addr;
+	if (vapic_addr)
+		__set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+	else
+		__clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
 }
 
 int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
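The two vapic sync paths above now test one cached "attention" bit instead of re-deriving irqchip_in_kernel() && vapic_addr on every exit; the bit is maintained at the single place the address changes. A userspace sketch of the pattern; names are illustrative:

    #include <stdio.h>

    #define CHECK_VAPIC 0

    static unsigned long attention;
    static unsigned long vapic_addr;

    static void set_vapic_addr(unsigned long addr)
    {
        vapic_addr = addr;
        if (addr)
            attention |= 1UL << CHECK_VAPIC;    /* __set_bit() */
        else
            attention &= ~(1UL << CHECK_VAPIC); /* __clear_bit() */
    }

    static void sync_from_vapic(void)
    {
        if (!(attention & (1UL << CHECK_VAPIC)))  /* one test on the hot path */
            return;
        printf("syncing vapic state at %#lx\n", vapic_addr);
    }

    int main(void)
    {
        sync_from_vapic();        /* nothing: bit clear */
        set_vapic_addr(0x1000);
        sync_from_vapic();        /* syncs */
        return 0;
    }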
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4cb164268846..72102e0ab7cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -135,8 +135,6 @@ module_param(dbg, bool, 0644);
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
 			| PT64_NX_MASK)
 
-#define PTE_LIST_EXT 4
-
 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
 #define ACC_USER_MASK    PT_USER_MASK
@@ -151,6 +149,9 @@ module_param(dbg, bool, 0644);
 
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
+/* make pte_list_desc fit well in cache line */
+#define PTE_LIST_EXT 3
+
 struct pte_list_desc {
 	u64 *sptes[PTE_LIST_EXT];
 	struct pte_list_desc *more;
@@ -550,19 +551,29 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 
 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 {
-	rcu_read_lock();
-	atomic_inc(&vcpu->kvm->arch.reader_counter);
-
-	/* Increase the counter before walking shadow page table */
-	smp_mb__after_atomic_inc();
+	/*
+	 * Prevent page table teardown by making any free-er wait during
+	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
+	 */
+	local_irq_disable();
+	vcpu->mode = READING_SHADOW_PAGE_TABLES;
+	/*
+	 * Make sure a following spte read is not reordered ahead of the write
+	 * to vcpu->mode.
+	 */
+	smp_mb();
 }
 
 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 {
-	/* Decrease the counter after walking shadow page table finished */
-	smp_mb__before_atomic_dec();
-	atomic_dec(&vcpu->kvm->arch.reader_counter);
-	rcu_read_unlock();
+	/*
+	 * Make sure the write to vcpu->mode is not reordered in front of
+	 * reads to sptes.  If it does, kvm_commit_zap_page() can see us
+	 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+	 */
+	smp_mb();
+	vcpu->mode = OUTSIDE_GUEST_MODE;
+	local_irq_enable();
 }
 
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
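The lockless walk now relies on a store followed by a full barrier on the reader side, paired with kvm_flush_remote_tlbs()'s IPI on the zapper side: a vcpu still in READING_SHADOW_PAGE_TABLES (with IRQs off, so it cannot acknowledge the IPI) holds off the free. A rough C11-atomics sketch of just the reader-side ordering; local_irq_disable() has no userspace analogue and is omitted:

    #include <stdatomic.h>
    #include <stdio.h>

    enum mode { OUTSIDE_GUEST_MODE, READING_SHADOW_PAGE_TABLES };
    static _Atomic enum mode vcpu_mode = OUTSIDE_GUEST_MODE;

    static void walk_begin(void)
    {
        atomic_store_explicit(&vcpu_mode, READING_SHADOW_PAGE_TABLES,
                              memory_order_relaxed);
        /* Publish the mode before any spte read, like smp_mb(). */
        atomic_thread_fence(memory_order_seq_cst);
    }

    static void walk_end(void)
    {
        /* Order the spte reads before the mode write, like smp_mb(). */
        atomic_thread_fence(memory_order_seq_cst);
        atomic_store_explicit(&vcpu_mode, OUTSIDE_GUEST_MODE,
                              memory_order_relaxed);
    }

    int main(void)
    {
        walk_begin();
        /* ...read sptes safely here... */
        walk_end();
        printf("mode=%d\n", (int)atomic_load(&vcpu_mode));
        return 0;
    }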
@@ -841,32 +852,6 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
 	return count;
 }
 
-static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
-{
-	struct pte_list_desc *desc;
-	u64 *prev_spte;
-	int i;
-
-	if (!*pte_list)
-		return NULL;
-	else if (!(*pte_list & 1)) {
-		if (!spte)
-			return (u64 *)*pte_list;
-		return NULL;
-	}
-	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
-	prev_spte = NULL;
-	while (desc) {
-		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
-			if (prev_spte == spte)
-				return desc->sptes[i];
-			prev_spte = desc->sptes[i];
-		}
-		desc = desc->more;
-	}
-	return NULL;
-}
-
 static void
 pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
 			   int i, struct pte_list_desc *prev_desc)
@@ -987,11 +972,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	return pte_list_add(vcpu, spte, rmapp);
 }
 
-static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
-{
-	return pte_list_next(rmapp, spte);
-}
-
 static void rmap_remove(struct kvm *kvm, u64 *spte)
 {
 	struct kvm_mmu_page *sp;
@@ -1004,106 +984,201 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	pte_list_remove(spte, rmapp);
 }
 
+/*
+ * Used by the following functions to iterate through the sptes linked by a
+ * rmap.  All fields are private and not assumed to be used outside.
+ */
+struct rmap_iterator {
+	/* private fields */
+	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
+	int pos;			/* index of the sptep */
+};
+
+/*
+ * Iteration must be started by this function.  This should also be used after
+ * removing/dropping sptes from the rmap link because in such cases the
+ * information in the itererator may not be valid.
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
+{
+	if (!rmap)
+		return NULL;
+
+	if (!(rmap & 1)) {
+		iter->desc = NULL;
+		return (u64 *)rmap;
+	}
+
+	iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
+	iter->pos = 0;
+	return iter->desc->sptes[iter->pos];
+}
+
+/*
+ * Must be used with a valid iterator: e.g. after rmap_get_first().
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_next(struct rmap_iterator *iter)
+{
+	if (iter->desc) {
+		if (iter->pos < PTE_LIST_EXT - 1) {
+			u64 *sptep;
+
+			++iter->pos;
+			sptep = iter->desc->sptes[iter->pos];
+			if (sptep)
+				return sptep;
+		}
+
+		iter->desc = iter->desc->more;
+
+		if (iter->desc) {
+			iter->pos = 0;
+			/* desc->sptes[0] cannot be NULL */
+			return iter->desc->sptes[iter->pos];
+		}
+	}
+
+	return NULL;
+}
+
 static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
 	if (mmu_spte_clear_track_bits(sptep))
 		rmap_remove(kvm, sptep);
 }
 
-int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
-			       struct kvm_memory_slot *slot)
+static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 {
-	unsigned long *rmapp;
-	u64 *spte;
-	int i, write_protected = 0;
+	u64 *sptep;
+	struct rmap_iterator iter;
+	int write_protected = 0;
 
-	rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		BUG_ON(!(*spte & PT_PRESENT_MASK));
-		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
-		if (is_writable_pte(*spte)) {
-			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
-			write_protected = 1;
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+
+		if (!is_writable_pte(*sptep)) {
+			sptep = rmap_get_next(&iter);
+			continue;
 		}
-		spte = rmap_next(rmapp, spte);
-	}
 
-	/* check for huge page mappings */
-	for (i = PT_DIRECTORY_LEVEL;
-	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		rmapp = __gfn_to_rmap(gfn, i, slot);
-		spte = rmap_next(rmapp, NULL);
-		while (spte) {
-			BUG_ON(!(*spte & PT_PRESENT_MASK));
-			BUG_ON(!is_large_pte(*spte));
-			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
-			if (is_writable_pte(*spte)) {
-				drop_spte(kvm, spte);
-				--kvm->stat.lpages;
-				spte = NULL;
-				write_protected = 1;
-			}
-			spte = rmap_next(rmapp, spte);
+		if (level == PT_PAGE_TABLE_LEVEL) {
+			mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
+			sptep = rmap_get_next(&iter);
+		} else {
+			BUG_ON(!is_large_pte(*sptep));
+			drop_spte(kvm, sptep);
+			--kvm->stat.lpages;
+			sptep = rmap_get_first(*rmapp, &iter);
 		}
+
+		write_protected = 1;
 	}
 
 	return write_protected;
 }
 
+/**
+ * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
+ * @kvm: kvm instance
+ * @slot: slot to protect
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should protect
+ *
+ * Used when we do not need to care about huge page mappings: e.g. during dirty
+ * logging we do not have any such mappings.
+ */
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+				     struct kvm_memory_slot *slot,
+				     gfn_t gfn_offset, unsigned long mask)
+{
+	unsigned long *rmapp;
+
+	while (mask) {
+		rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
+		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
+
+		/* clear the first set bit */
+		mask &= mask - 1;
+	}
+}
+
 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
 	struct kvm_memory_slot *slot;
+	unsigned long *rmapp;
+	int i;
+	int write_protected = 0;
 
 	slot = gfn_to_memslot(kvm, gfn);
-	return kvm_mmu_rmap_write_protect(kvm, gfn, slot);
+
+	for (i = PT_PAGE_TABLE_LEVEL;
+	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+		rmapp = __gfn_to_rmap(gfn, i, slot);
+		write_protected |= __rmap_write_protect(kvm, rmapp, i);
+	}
+
+	return write_protected;
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			   unsigned long data)
 {
-	u64 *spte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int need_tlb_flush = 0;
 
-	while ((spte = rmap_next(rmapp, NULL))) {
-		BUG_ON(!(*spte & PT_PRESENT_MASK));
-		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-		drop_spte(kvm, spte);
+	while ((sptep = rmap_get_first(*rmapp, &iter))) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
+
+		drop_spte(kvm, sptep);
 		need_tlb_flush = 1;
 	}
+
 	return need_tlb_flush;
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			     unsigned long data)
 {
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int need_flush = 0;
-	u64 *spte, new_spte;
+	u64 new_spte;
 	pte_t *ptep = (pte_t *)data;
 	pfn_t new_pfn;
 
 	WARN_ON(pte_huge(*ptep));
 	new_pfn = pte_pfn(*ptep);
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		BUG_ON(!is_shadow_present_pte(*spte));
-		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+		BUG_ON(!is_shadow_present_pte(*sptep));
+		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
+
 		need_flush = 1;
+
 		if (pte_write(*ptep)) {
-			drop_spte(kvm, spte);
-			spte = rmap_next(rmapp, NULL);
+			drop_spte(kvm, sptep);
+			sptep = rmap_get_first(*rmapp, &iter);
 		} else {
-			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
 			new_spte |= (u64)new_pfn << PAGE_SHIFT;
 
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
 			new_spte &= ~shadow_accessed_mask;
-			mmu_spte_clear_track_bits(spte);
-			mmu_spte_set(spte, new_spte);
-			spte = rmap_next(rmapp, spte);
+
+			mmu_spte_clear_track_bits(sptep);
+			mmu_spte_set(sptep, new_spte);
+			sptep = rmap_get_next(&iter);
 		}
 	}
+
 	if (need_flush)
 		kvm_flush_remote_tlbs(kvm);
 
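rmap_get_first()/rmap_get_next() replace rmap_next(), which had to rescan the list from the head to find its position on every step; the iterator keeps the current descriptor and index instead. A self-contained sketch of the encoding being walked, with bit 0 of the head word tagging a descriptor chain (PTE_LIST_EXT is 3 to match the change above; the rest is illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define PTE_LIST_EXT 3

    struct pte_list_desc {
        uint64_t *sptes[PTE_LIST_EXT];
        struct pte_list_desc *more;
    };

    struct rmap_iterator {
        struct pte_list_desc *desc;
        int pos;
    };

    static uint64_t *rmap_get_first(unsigned long rmap, struct rmap_iterator *it)
    {
        if (!rmap)
            return NULL;
        if (!(rmap & 1)) {           /* untagged word: exactly one spte */
            it->desc = NULL;
            return (uint64_t *)rmap;
        }
        it->desc = (struct pte_list_desc *)(rmap & ~1ul);
        it->pos = 0;
        return it->desc->sptes[0];
    }

    static uint64_t *rmap_get_next(struct rmap_iterator *it)
    {
        if (!it->desc)
            return NULL;             /* single-spte case has no next */
        if (it->pos < PTE_LIST_EXT - 1 && it->desc->sptes[++it->pos])
            return it->desc->sptes[it->pos];
        it->desc = it->desc->more;   /* advance to the next descriptor */
        it->pos = 0;
        return it->desc ? it->desc->sptes[0] : NULL;
    }

    int main(void)
    {
        uint64_t a = 1, b = 2, c = 3, d = 4;
        struct pte_list_desc d2 = { { &d, NULL, NULL }, NULL };
        struct pte_list_desc d1 = { { &a, &b, &c }, &d2 };
        struct rmap_iterator it;

        for (uint64_t *s = rmap_get_first((unsigned long)&d1 | 1, &it);
             s; s = rmap_get_next(&it))
            printf("spte value %llu\n", (unsigned long long)*s);
        return 0;
    }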
@@ -1162,7 +1237,8 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) | |||
1162 | static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | 1237 | static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, |
1163 | unsigned long data) | 1238 | unsigned long data) |
1164 | { | 1239 | { |
1165 | u64 *spte; | 1240 | u64 *sptep; |
1241 | struct rmap_iterator iter; | ||
1166 | int young = 0; | 1242 | int young = 0; |
1167 | 1243 | ||
1168 | /* | 1244 | /* |
@@ -1175,25 +1251,24 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
1175 | if (!shadow_accessed_mask) | 1251 | if (!shadow_accessed_mask) |
1176 | return kvm_unmap_rmapp(kvm, rmapp, data); | 1252 | return kvm_unmap_rmapp(kvm, rmapp, data); |
1177 | 1253 | ||
1178 | spte = rmap_next(rmapp, NULL); | 1254 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; |
1179 | while (spte) { | 1255 | sptep = rmap_get_next(&iter)) { |
1180 | int _young; | 1256 | BUG_ON(!(*sptep & PT_PRESENT_MASK)); |
1181 | u64 _spte = *spte; | 1257 | |
1182 | BUG_ON(!(_spte & PT_PRESENT_MASK)); | 1258 | if (*sptep & PT_ACCESSED_MASK) { |
1183 | _young = _spte & PT_ACCESSED_MASK; | ||
1184 | if (_young) { | ||
1185 | young = 1; | 1259 | young = 1; |
1186 | clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); | 1260 | clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); |
1187 | } | 1261 | } |
1188 | spte = rmap_next(rmapp, spte); | ||
1189 | } | 1262 | } |
1263 | |||
1190 | return young; | 1264 | return young; |
1191 | } | 1265 | } |
1192 | 1266 | ||
1193 | static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | 1267 | static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, |
1194 | unsigned long data) | 1268 | unsigned long data) |
1195 | { | 1269 | { |
1196 | u64 *spte; | 1270 | u64 *sptep; |
1271 | struct rmap_iterator iter; | ||
1197 | int young = 0; | 1272 | int young = 0; |
1198 | 1273 | ||
1199 | /* | 1274 | /* |
@@ -1204,16 +1279,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
1204 | if (!shadow_accessed_mask) | 1279 | if (!shadow_accessed_mask) |
1205 | goto out; | 1280 | goto out; |
1206 | 1281 | ||
1207 | spte = rmap_next(rmapp, NULL); | 1282 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; |
1208 | while (spte) { | 1283 | sptep = rmap_get_next(&iter)) { |
1209 | u64 _spte = *spte; | 1284 | BUG_ON(!(*sptep & PT_PRESENT_MASK)); |
1210 | BUG_ON(!(_spte & PT_PRESENT_MASK)); | 1285 | |
1211 | young = _spte & PT_ACCESSED_MASK; | 1286 | if (*sptep & PT_ACCESSED_MASK) { |
1212 | if (young) { | ||
1213 | young = 1; | 1287 | young = 1; |
1214 | break; | 1288 | break; |
1215 | } | 1289 | } |
1216 | spte = rmap_next(rmapp, spte); | ||
1217 | } | 1290 | } |
1218 | out: | 1291 | out: |
1219 | return young; | 1292 | return young; |
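[Editor's note: both aging callbacks now test the accessed bit directly on *sptep instead of staging it in a local _spte. When shadow_accessed_mask is zero (EPT without accessed/dirty bits), there is no cheap way to sample young-ness, so kvm_age_rmapp() falls back to kvm_unmap_rmapp() and lets the fault path rebuild the mapping. Where the bit does exist, the harvest is an atomic test-and-clear; a sketch:

	if (*sptep & PT_ACCESSED_MASK) {
		young = 1;
		/* clear_bit() is atomic, so bits concurrently set in the
		 * spte by a hardware walker are not lost */
		clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep);
	}
]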
@@ -1865,10 +1938,11 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) | |||
1865 | 1938 | ||
1866 | static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) | 1939 | static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) |
1867 | { | 1940 | { |
1868 | u64 *parent_pte; | 1941 | u64 *sptep; |
1942 | struct rmap_iterator iter; | ||
1869 | 1943 | ||
1870 | while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) | 1944 | while ((sptep = rmap_get_first(sp->parent_ptes, &iter))) |
1871 | drop_parent_pte(sp, parent_pte); | 1945 | drop_parent_pte(sp, sptep); |
1872 | } | 1946 | } |
1873 | 1947 | ||
1874 | static int mmu_zap_unsync_children(struct kvm *kvm, | 1948 | static int mmu_zap_unsync_children(struct kvm *kvm, |
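[Editor's note: kvm_mmu_unlink_parents() keeps re-fetching the head via rmap_get_first() rather than iterating, because drop_parent_pte() removes the entry just returned and thereby invalidates any iterator state; the loop terminates once the parent_ptes chain is empty. The pattern, under the same assumptions as above:

	u64 *sptep;
	struct rmap_iterator iter;

	/* each pass removes the head entry, so always re-fetch the head */
	while ((sptep = rmap_get_first(sp->parent_ptes, &iter)))
		drop_parent_pte(sp, sptep);
]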
@@ -1925,30 +1999,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | |||
1925 | return ret; | 1999 | return ret; |
1926 | } | 2000 | } |
1927 | 2001 | ||
1928 | static void kvm_mmu_isolate_pages(struct list_head *invalid_list) | ||
1929 | { | ||
1930 | struct kvm_mmu_page *sp; | ||
1931 | |||
1932 | list_for_each_entry(sp, invalid_list, link) | ||
1933 | kvm_mmu_isolate_page(sp); | ||
1934 | } | ||
1935 | |||
1936 | static void free_pages_rcu(struct rcu_head *head) | ||
1937 | { | ||
1938 | struct kvm_mmu_page *next, *sp; | ||
1939 | |||
1940 | sp = container_of(head, struct kvm_mmu_page, rcu); | ||
1941 | while (sp) { | ||
1942 | if (!list_empty(&sp->link)) | ||
1943 | next = list_first_entry(&sp->link, | ||
1944 | struct kvm_mmu_page, link); | ||
1945 | else | ||
1946 | next = NULL; | ||
1947 | kvm_mmu_free_page(sp); | ||
1948 | sp = next; | ||
1949 | } | ||
1950 | } | ||
1951 | |||
1952 | static void kvm_mmu_commit_zap_page(struct kvm *kvm, | 2002 | static void kvm_mmu_commit_zap_page(struct kvm *kvm, |
1953 | struct list_head *invalid_list) | 2003 | struct list_head *invalid_list) |
1954 | { | 2004 | { |
@@ -1957,17 +2007,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, | |||
1957 | if (list_empty(invalid_list)) | 2007 | if (list_empty(invalid_list)) |
1958 | return; | 2008 | return; |
1959 | 2009 | ||
1960 | kvm_flush_remote_tlbs(kvm); | 2010 | /* |
1961 | 2011 | * wmb: make sure everyone sees our modifications to the page tables | |
1962 | if (atomic_read(&kvm->arch.reader_counter)) { | 2012 | * rmb: make sure we see changes to vcpu->mode |
1963 | kvm_mmu_isolate_pages(invalid_list); | 2013 | */ |
1964 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); | 2014 | smp_mb(); |
1965 | list_del_init(invalid_list); | ||
1966 | 2015 | ||
1967 | trace_kvm_mmu_delay_free_pages(sp); | 2016 | /* |
1968 | call_rcu(&sp->rcu, free_pages_rcu); | 2017 | * Wait for all vcpus to exit guest mode and/or lockless shadow |
1969 | return; | 2018 | * page table walks. |
1970 | } | 2019 | */ |
2020 | kvm_flush_remote_tlbs(kvm); | ||
1971 | 2021 | ||
1972 | do { | 2022 | do { |
1973 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); | 2023 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); |
@@ -1975,7 +2025,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, | |||
1975 | kvm_mmu_isolate_page(sp); | 2025 | kvm_mmu_isolate_page(sp); |
1976 | kvm_mmu_free_page(sp); | 2026 | kvm_mmu_free_page(sp); |
1977 | } while (!list_empty(invalid_list)); | 2027 | } while (!list_empty(invalid_list)); |
1978 | |||
1979 | } | 2028 | } |
1980 | 2029 | ||
1981 | /* | 2030 | /* |
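[Editor's note: the RCU-deferred freeing removed above is replaced by a plain barrier plus TLB flush: kvm_flush_remote_tlbs() IPIs every vcpu still in guest mode, which also kicks any lockless shadow-page-table walk in progress, so the zapped pages can be freed immediately. A rough sketch of the reader side this smp_mb() pairs with, assuming the walk_shadow_page_lockless_begin()/end() helpers in mmu.c:

	/* reader (lockless walk) side, roughly: */
	vcpu->mode = READING_SHADOW_PAGE_TABLES;
	/* pairs with the smp_mb() in kvm_mmu_commit_zap_page(): the zapper
	 * either sees this mode and sends an IPI, or we see its PTE writes */
	smp_mb();

	/* ... walk sptes without holding mmu_lock ... */

	smp_mb();
	vcpu->mode = OUTSIDE_GUEST_MODE;
]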
@@ -3554,7 +3603,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp) | |||
3554 | * Skip write-flooding detection for the sp whose level is 1, because | 3603 | * Skip write-flooding detection for the sp whose level is 1, because |
3555 | * it can become unsync, and then the guest page is no longer write-protected. | 3604 | * it can become unsync, and then the guest page is no longer write-protected. |
3556 | */ | 3605 | */ |
3557 | if (sp->role.level == 1) | 3606 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) |
3558 | return false; | 3607 | return false; |
3559 | 3608 | ||
3560 | return ++sp->write_flooding_count >= 3; | 3609 | return ++sp->write_flooding_count >= 3; |
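[Editor's note: the magic level 1 is now spelled PT_PAGE_TABLE_LEVEL here and in paging_tmpl.h below. For reference, the shadow-MMU level constants as defined in arch/x86/include/asm/kvm_host.h at this point:

	#define PT_PAGE_TABLE_LEVEL	1	/* 4K leaf PTEs */
	#define PT_DIRECTORY_LEVEL	2	/* 2M/4M large pages */
	#define PT_PDPE_LEVEL		3	/* 1G pages */
]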
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 715da5a19a5b..7d7d0b9e23eb 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c | |||
@@ -192,7 +192,8 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
192 | { | 192 | { |
193 | struct kvm_memory_slot *slot; | 193 | struct kvm_memory_slot *slot; |
194 | unsigned long *rmapp; | 194 | unsigned long *rmapp; |
195 | u64 *spte; | 195 | u64 *sptep; |
196 | struct rmap_iterator iter; | ||
196 | 197 | ||
197 | if (sp->role.direct || sp->unsync || sp->role.invalid) | 198 | if (sp->role.direct || sp->unsync || sp->role.invalid) |
198 | return; | 199 | return; |
@@ -200,13 +201,12 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
200 | slot = gfn_to_memslot(kvm, sp->gfn); | 201 | slot = gfn_to_memslot(kvm, sp->gfn); |
201 | rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; | 202 | rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; |
202 | 203 | ||
203 | spte = rmap_next(rmapp, NULL); | 204 | for (sptep = rmap_get_first(*rmapp, &iter); sptep; |
204 | while (spte) { | 205 | sptep = rmap_get_next(&iter)) { |
205 | if (is_writable_pte(*spte)) | 206 | if (is_writable_pte(*sptep)) |
206 | audit_printk(kvm, "shadow page has writable " | 207 | audit_printk(kvm, "shadow page has writable " |
207 | "mappings: gfn %llx role %x\n", | 208 | "mappings: gfn %llx role %x\n", |
208 | sp->gfn, sp->role.word); | 209 | sp->gfn, sp->role.word); |
209 | spte = rmap_next(rmapp, spte); | ||
210 | } | 210 | } |
211 | } | 211 | } |
212 | 212 | ||
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index df5a70311be8..34f970937ef1 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -658,7 +658,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) | |||
658 | { | 658 | { |
659 | int offset = 0; | 659 | int offset = 0; |
660 | 660 | ||
661 | WARN_ON(sp->role.level != 1); | 661 | WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); |
662 | 662 | ||
663 | if (PTTYPE == 32) | 663 | if (PTTYPE == 32) |
664 | offset = sp->role.quadrant << PT64_LEVEL_BITS; | 664 | offset = sp->role.quadrant << PT64_LEVEL_BITS; |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e334389e1c75..f75af406b268 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include "x86.h" | 22 | #include "x86.h" |
23 | 23 | ||
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/mod_devicetable.h> | ||
25 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
26 | #include <linux/vmalloc.h> | 27 | #include <linux/vmalloc.h> |
27 | #include <linux/highmem.h> | 28 | #include <linux/highmem.h> |
@@ -42,6 +43,12 @@ | |||
42 | MODULE_AUTHOR("Qumranet"); | 43 | MODULE_AUTHOR("Qumranet"); |
43 | MODULE_LICENSE("GPL"); | 44 | MODULE_LICENSE("GPL"); |
44 | 45 | ||
46 | static const struct x86_cpu_id svm_cpu_id[] = { | ||
47 | X86_FEATURE_MATCH(X86_FEATURE_SVM), | ||
48 | {} | ||
49 | }; | ||
50 | MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); | ||
51 | |||
45 | #define IOPM_ALLOC_ORDER 2 | 52 | #define IOPM_ALLOC_ORDER 2 |
46 | #define MSRPM_ALLOC_ORDER 1 | 53 | #define MSRPM_ALLOC_ORDER 1 |
47 | 54 | ||
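[Editor's note: the new table lets udev autoload kvm-amd on any CPU advertising SVM: MODULE_DEVICE_TABLE(x86cpu, ...) emits a modalias that is matched against the x86cpu alias the kernel generates from CPUID. A sketch of what X86_FEATURE_MATCH() expands to, assuming the definitions in <asm/cpu_device_id.h>:

	/* match any vendor/family/model, keyed only on the feature bit */
	#define X86_FEATURE_MATCH(x) {			\
		.vendor  = X86_VENDOR_ANY,		\
		.family  = X86_FAMILY_ANY,		\
		.model   = X86_MODEL_ANY,		\
		.feature = x,				\
	}

The identical pattern appears for kvm-intel with X86_FEATURE_VMX in the vmx.c hunk below.]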
@@ -3240,6 +3247,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) | |||
3240 | svm_clear_vintr(svm); | 3247 | svm_clear_vintr(svm); |
3241 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; | 3248 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; |
3242 | mark_dirty(svm->vmcb, VMCB_INTR); | 3249 | mark_dirty(svm->vmcb, VMCB_INTR); |
3250 | ++svm->vcpu.stat.irq_window_exits; | ||
3243 | /* | 3251 | /* |
3244 | * If the user space waits to inject interrupts, exit as soon as | 3252 | * If the user space waits to inject interrupts, exit as soon as |
3245 | * possible | 3253 | * possible |
@@ -3247,7 +3255,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm) | |||
3247 | if (!irqchip_in_kernel(svm->vcpu.kvm) && | 3255 | if (!irqchip_in_kernel(svm->vcpu.kvm) && |
3248 | kvm_run->request_interrupt_window && | 3256 | kvm_run->request_interrupt_window && |
3249 | !kvm_cpu_has_interrupt(&svm->vcpu)) { | 3257 | !kvm_cpu_has_interrupt(&svm->vcpu)) { |
3250 | ++svm->vcpu.stat.irq_window_exits; | ||
3251 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | 3258 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; |
3252 | return 0; | 3259 | return 0; |
3253 | } | 3260 | } |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4ff0ab9bc3c8..32eb58866292 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/highmem.h> | 27 | #include <linux/highmem.h> |
28 | #include <linux/sched.h> | 28 | #include <linux/sched.h> |
29 | #include <linux/moduleparam.h> | 29 | #include <linux/moduleparam.h> |
30 | #include <linux/mod_devicetable.h> | ||
30 | #include <linux/ftrace_event.h> | 31 | #include <linux/ftrace_event.h> |
31 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
32 | #include <linux/tboot.h> | 33 | #include <linux/tboot.h> |
@@ -51,6 +52,12 @@ | |||
51 | MODULE_AUTHOR("Qumranet"); | 52 | MODULE_AUTHOR("Qumranet"); |
52 | MODULE_LICENSE("GPL"); | 53 | MODULE_LICENSE("GPL"); |
53 | 54 | ||
55 | static const struct x86_cpu_id vmx_cpu_id[] = { | ||
56 | X86_FEATURE_MATCH(X86_FEATURE_VMX), | ||
57 | {} | ||
58 | }; | ||
59 | MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); | ||
60 | |||
54 | static bool __read_mostly enable_vpid = 1; | 61 | static bool __read_mostly enable_vpid = 1; |
55 | module_param_named(vpid, enable_vpid, bool, 0444); | 62 | module_param_named(vpid, enable_vpid, bool, 0444); |
56 | 63 | ||
@@ -386,6 +393,9 @@ struct vcpu_vmx { | |||
386 | struct { | 393 | struct { |
387 | int loaded; | 394 | int loaded; |
388 | u16 fs_sel, gs_sel, ldt_sel; | 395 | u16 fs_sel, gs_sel, ldt_sel; |
396 | #ifdef CONFIG_X86_64 | ||
397 | u16 ds_sel, es_sel; | ||
398 | #endif | ||
389 | int gs_ldt_reload_needed; | 399 | int gs_ldt_reload_needed; |
390 | int fs_reload_needed; | 400 | int fs_reload_needed; |
391 | } host_state; | 401 | } host_state; |
@@ -1411,6 +1421,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) | |||
1411 | } | 1421 | } |
1412 | 1422 | ||
1413 | #ifdef CONFIG_X86_64 | 1423 | #ifdef CONFIG_X86_64 |
1424 | savesegment(ds, vmx->host_state.ds_sel); | ||
1425 | savesegment(es, vmx->host_state.es_sel); | ||
1426 | #endif | ||
1427 | |||
1428 | #ifdef CONFIG_X86_64 | ||
1414 | vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); | 1429 | vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); |
1415 | vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); | 1430 | vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); |
1416 | #else | 1431 | #else |
@@ -1450,6 +1465,19 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) | |||
1450 | } | 1465 | } |
1451 | if (vmx->host_state.fs_reload_needed) | 1466 | if (vmx->host_state.fs_reload_needed) |
1452 | loadsegment(fs, vmx->host_state.fs_sel); | 1467 | loadsegment(fs, vmx->host_state.fs_sel); |
1468 | #ifdef CONFIG_X86_64 | ||
1469 | if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { | ||
1470 | loadsegment(ds, vmx->host_state.ds_sel); | ||
1471 | loadsegment(es, vmx->host_state.es_sel); | ||
1472 | } | ||
1473 | #else | ||
1474 | /* | ||
1475 | * The sysexit path does not restore ds/es, so we must set them to | ||
1476 | * a reasonable value ourselves. | ||
1477 | */ | ||
1478 | loadsegment(ds, __USER_DS); | ||
1479 | loadsegment(es, __USER_DS); | ||
1480 | #endif | ||
1453 | reload_tss(); | 1481 | reload_tss(); |
1454 | #ifdef CONFIG_X86_64 | 1482 | #ifdef CONFIG_X86_64 |
1455 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); | 1483 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); |
@@ -3633,8 +3661,18 @@ static void vmx_set_constant_host_state(void) | |||
3633 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ | 3661 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ |
3634 | 3662 | ||
3635 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ | 3663 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ |
3664 | #ifdef CONFIG_X86_64 | ||
3665 | /* | ||
3666 | * Load null selectors, so we can avoid reloading them in | ||
3667 | * __vmx_load_host_state(), in case userspace uses the null selectors | ||
3668 | * too (the expected case). | ||
3669 | */ | ||
3670 | vmcs_write16(HOST_DS_SELECTOR, 0); | ||
3671 | vmcs_write16(HOST_ES_SELECTOR, 0); | ||
3672 | #else | ||
3636 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | 3673 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ |
3637 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | 3674 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ |
3675 | #endif | ||
3638 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | 3676 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ |
3639 | vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ | 3677 | vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ |
3640 | 3678 | ||
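[Editor's note: taken together, the three vmx hunks above implement one optimization: on 64-bit hosts userspace normally runs with null ds/es, so the VMCS host fields can hold null selectors and the segment reloads on the heavyweight host-switch path disappear; the per-vmexit mov to %ds/%es in vmx_vcpu_run() (removed just below) becomes unnecessary as well. The fast path reduces to a single test, a sketch:

	#ifdef CONFIG_X86_64
		/* expected case: both selectors are null, nothing to reload */
		if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
			loadsegment(ds, vmx->host_state.ds_sel);
			loadsegment(es, vmx->host_state.es_sel);
		}
	#endif
]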
@@ -6256,7 +6294,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
6256 | } | 6294 | } |
6257 | } | 6295 | } |
6258 | 6296 | ||
6259 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | ||
6260 | vmx->loaded_vmcs->launched = 1; | 6297 | vmx->loaded_vmcs->launched = 1; |
6261 | 6298 | ||
6262 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); | 6299 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); |
@@ -6343,7 +6380,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
6343 | return &vmx->vcpu; | 6380 | return &vmx->vcpu; |
6344 | 6381 | ||
6345 | free_vmcs: | 6382 | free_vmcs: |
6346 | free_vmcs(vmx->loaded_vmcs->vmcs); | 6383 | free_loaded_vmcs(vmx->loaded_vmcs); |
6347 | free_msrs: | 6384 | free_msrs: |
6348 | kfree(vmx->guest_msrs); | 6385 | kfree(vmx->guest_msrs); |
6349 | uninit_vcpu: | 6386 | uninit_vcpu: |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 185a2b823a2d..be6d54929fa7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -2147,6 +2147,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
2147 | case KVM_CAP_ASYNC_PF: | 2147 | case KVM_CAP_ASYNC_PF: |
2148 | case KVM_CAP_GET_TSC_KHZ: | 2148 | case KVM_CAP_GET_TSC_KHZ: |
2149 | case KVM_CAP_PCI_2_3: | 2149 | case KVM_CAP_PCI_2_3: |
2150 | case KVM_CAP_KVMCLOCK_CTRL: | ||
2150 | r = 1; | 2151 | r = 1; |
2151 | break; | 2152 | break; |
2152 | case KVM_CAP_COALESCED_MMIO: | 2153 | case KVM_CAP_COALESCED_MMIO: |
@@ -2597,6 +2598,23 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, | |||
2597 | return r; | 2598 | return r; |
2598 | } | 2599 | } |
2599 | 2600 | ||
2601 | /* | ||
2602 | * kvm_set_guest_paused() indicates to the guest kernel that it has been | ||
2603 | * stopped by the hypervisor. This function will be called from the host only. | ||
2604 | * -EINVAL is returned when the host attempts to set the flag for a guest that | ||
2605 | * does not support pv clocks. | ||
2606 | */ | ||
2607 | static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) | ||
2608 | { | ||
2609 | struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock; | ||
2610 | if (!vcpu->arch.time_page) | ||
2611 | return -EINVAL; | ||
2612 | src->flags |= PVCLOCK_GUEST_STOPPED; | ||
2613 | mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT); | ||
2614 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | ||
2615 | return 0; | ||
2616 | } | ||
2617 | |||
2600 | long kvm_arch_vcpu_ioctl(struct file *filp, | 2618 | long kvm_arch_vcpu_ioctl(struct file *filp, |
2601 | unsigned int ioctl, unsigned long arg) | 2619 | unsigned int ioctl, unsigned long arg) |
2602 | { | 2620 | { |
@@ -2873,6 +2891,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
2873 | r = vcpu->arch.virtual_tsc_khz; | 2891 | r = vcpu->arch.virtual_tsc_khz; |
2874 | goto out; | 2892 | goto out; |
2875 | } | 2893 | } |
2894 | case KVM_KVMCLOCK_CTRL: { | ||
2895 | r = kvm_set_guest_paused(vcpu); | ||
2896 | goto out; | ||
2897 | } | ||
2876 | default: | 2898 | default: |
2877 | r = -EINVAL; | 2899 | r = -EINVAL; |
2878 | } | 2900 | } |
@@ -3045,57 +3067,32 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, | |||
3045 | } | 3067 | } |
3046 | 3068 | ||
3047 | /** | 3069 | /** |
3048 | * write_protect_slot - write protect a slot for dirty logging | 3070 | * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot |
3049 | * @kvm: the kvm instance | 3071 | * @kvm: kvm instance |
3050 | * @memslot: the slot we protect | 3072 | * @log: slot id and address to which we copy the log |
3051 | * @dirty_bitmap: the bitmap indicating which pages are dirty | ||
3052 | * @nr_dirty_pages: the number of dirty pages | ||
3053 | * | 3073 | * |
3054 | * We have two ways to find all sptes to protect: | 3074 | * We need to keep it in mind that VCPU threads can write to the bitmap |
3055 | * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and | 3075 | * concurrently. So, to avoid losing data, we keep the following order for |
3056 | * checks ones that have a spte mapping a page in the slot. | 3076 | * each bit: |
3057 | * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap. | ||
3058 | * | 3077 | * |
3059 | * Generally speaking, if there are not so many dirty pages compared to the | 3078 | * 1. Take a snapshot of the bit and clear it if needed. |
3060 | * number of shadow pages, we should use the latter. | 3079 | * 2. Write protect the corresponding page. |
3080 | * 3. Flush TLB's if needed. | ||
3081 | * 4. Copy the snapshot to the userspace. | ||
3061 | * | 3082 | * |
3062 | * Note that letting others write into a page marked dirty in the old bitmap | 3083 | * Between 2 and 3, the guest may write to the page using the remaining TLB |
3063 | * by using the remaining tlb entry is not a problem. That page will become | 3084 | * entry. This is not a problem because the page will be reported dirty at |
3064 | * write protected again when we flush the tlb and then be reported dirty to | 3085 | * step 4 using the snapshot taken before and step 3 ensures that successive |
3065 | * the user space by copying the old bitmap. | 3086 | * writes will be logged for the next call. |
3066 | */ | ||
3067 | static void write_protect_slot(struct kvm *kvm, | ||
3068 | struct kvm_memory_slot *memslot, | ||
3069 | unsigned long *dirty_bitmap, | ||
3070 | unsigned long nr_dirty_pages) | ||
3071 | { | ||
3072 | spin_lock(&kvm->mmu_lock); | ||
3073 | |||
3074 | /* Not many dirty pages compared to # of shadow pages. */ | ||
3075 | if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { | ||
3076 | unsigned long gfn_offset; | ||
3077 | |||
3078 | for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { | ||
3079 | unsigned long gfn = memslot->base_gfn + gfn_offset; | ||
3080 | |||
3081 | kvm_mmu_rmap_write_protect(kvm, gfn, memslot); | ||
3082 | } | ||
3083 | kvm_flush_remote_tlbs(kvm); | ||
3084 | } else | ||
3085 | kvm_mmu_slot_remove_write_access(kvm, memslot->id); | ||
3086 | |||
3087 | spin_unlock(&kvm->mmu_lock); | ||
3088 | } | ||
3089 | |||
3090 | /* | ||
3091 | * Get (and clear) the dirty memory log for a memory slot. | ||
3092 | */ | 3087 | */ |
3093 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | 3088 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) |
3094 | struct kvm_dirty_log *log) | ||
3095 | { | 3089 | { |
3096 | int r; | 3090 | int r; |
3097 | struct kvm_memory_slot *memslot; | 3091 | struct kvm_memory_slot *memslot; |
3098 | unsigned long n, nr_dirty_pages; | 3092 | unsigned long n, i; |
3093 | unsigned long *dirty_bitmap; | ||
3094 | unsigned long *dirty_bitmap_buffer; | ||
3095 | bool is_dirty = false; | ||
3099 | 3096 | ||
3100 | mutex_lock(&kvm->slots_lock); | 3097 | mutex_lock(&kvm->slots_lock); |
3101 | 3098 | ||
@@ -3104,49 +3101,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
3104 | goto out; | 3101 | goto out; |
3105 | 3102 | ||
3106 | memslot = id_to_memslot(kvm->memslots, log->slot); | 3103 | memslot = id_to_memslot(kvm->memslots, log->slot); |
3104 | |||
3105 | dirty_bitmap = memslot->dirty_bitmap; | ||
3107 | r = -ENOENT; | 3106 | r = -ENOENT; |
3108 | if (!memslot->dirty_bitmap) | 3107 | if (!dirty_bitmap) |
3109 | goto out; | 3108 | goto out; |
3110 | 3109 | ||
3111 | n = kvm_dirty_bitmap_bytes(memslot); | 3110 | n = kvm_dirty_bitmap_bytes(memslot); |
3112 | nr_dirty_pages = memslot->nr_dirty_pages; | ||
3113 | 3111 | ||
3114 | /* If nothing is dirty, don't bother messing with page tables. */ | 3112 | dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); |
3115 | if (nr_dirty_pages) { | 3113 | memset(dirty_bitmap_buffer, 0, n); |
3116 | struct kvm_memslots *slots, *old_slots; | ||
3117 | unsigned long *dirty_bitmap, *dirty_bitmap_head; | ||
3118 | 3114 | ||
3119 | dirty_bitmap = memslot->dirty_bitmap; | 3115 | spin_lock(&kvm->mmu_lock); |
3120 | dirty_bitmap_head = memslot->dirty_bitmap_head; | ||
3121 | if (dirty_bitmap == dirty_bitmap_head) | ||
3122 | dirty_bitmap_head += n / sizeof(long); | ||
3123 | memset(dirty_bitmap_head, 0, n); | ||
3124 | 3116 | ||
3125 | r = -ENOMEM; | 3117 | for (i = 0; i < n / sizeof(long); i++) { |
3126 | slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL); | 3118 | unsigned long mask; |
3127 | if (!slots) | 3119 | gfn_t offset; |
3128 | goto out; | ||
3129 | 3120 | ||
3130 | memslot = id_to_memslot(slots, log->slot); | 3121 | if (!dirty_bitmap[i]) |
3131 | memslot->nr_dirty_pages = 0; | 3122 | continue; |
3132 | memslot->dirty_bitmap = dirty_bitmap_head; | ||
3133 | update_memslots(slots, NULL); | ||
3134 | 3123 | ||
3135 | old_slots = kvm->memslots; | 3124 | is_dirty = true; |
3136 | rcu_assign_pointer(kvm->memslots, slots); | ||
3137 | synchronize_srcu_expedited(&kvm->srcu); | ||
3138 | kfree(old_slots); | ||
3139 | 3125 | ||
3140 | write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages); | 3126 | mask = xchg(&dirty_bitmap[i], 0); |
3127 | dirty_bitmap_buffer[i] = mask; | ||
3141 | 3128 | ||
3142 | r = -EFAULT; | 3129 | offset = i * BITS_PER_LONG; |
3143 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) | 3130 | kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); |
3144 | goto out; | ||
3145 | } else { | ||
3146 | r = -EFAULT; | ||
3147 | if (clear_user(log->dirty_bitmap, n)) | ||
3148 | goto out; | ||
3149 | } | 3131 | } |
3132 | if (is_dirty) | ||
3133 | kvm_flush_remote_tlbs(kvm); | ||
3134 | |||
3135 | spin_unlock(&kvm->mmu_lock); | ||
3136 | |||
3137 | r = -EFAULT; | ||
3138 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) | ||
3139 | goto out; | ||
3150 | 3140 | ||
3151 | r = 0; | 3141 | r = 0; |
3152 | out: | 3142 | out: |
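[Editor's note: the rewrite drops the memslot duplication and SRCU dance entirely: the second half of the dirty_bitmap allocation doubles as the copy-out buffer (dirty_bitmap + n / sizeof(long)), and each bitmap word is harvested with an atomic xchg() under mmu_lock. The core loop, as a sketch with the function's locals assumed in scope:

	for (i = 0; i < n / sizeof(long); i++) {
		unsigned long mask;

		if (!dirty_bitmap[i])		/* cheap peek before the xchg */
			continue;

		mask = xchg(&dirty_bitmap[i], 0);	/* grab-and-clear */
		dirty_bitmap_buffer[i] = mask;

		/* write-protect exactly the pages named by this word */
		kvm_mmu_write_protect_pt_masked(kvm, memslot,
						i * BITS_PER_LONG, mask);
	}
]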
@@ -3728,9 +3718,8 @@ struct read_write_emulator_ops { | |||
3728 | static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) | 3718 | static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) |
3729 | { | 3719 | { |
3730 | if (vcpu->mmio_read_completed) { | 3720 | if (vcpu->mmio_read_completed) { |
3731 | memcpy(val, vcpu->mmio_data, bytes); | ||
3732 | trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, | 3721 | trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, |
3733 | vcpu->mmio_phys_addr, *(u64 *)val); | 3722 | vcpu->mmio_fragments[0].gpa, *(u64 *)val); |
3734 | vcpu->mmio_read_completed = 0; | 3723 | vcpu->mmio_read_completed = 0; |
3735 | return 1; | 3724 | return 1; |
3736 | } | 3725 | } |
@@ -3766,8 +3755,9 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
3766 | static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, | 3755 | static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, |
3767 | void *val, int bytes) | 3756 | void *val, int bytes) |
3768 | { | 3757 | { |
3769 | memcpy(vcpu->mmio_data, val, bytes); | 3758 | struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; |
3770 | memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); | 3759 | |
3760 | memcpy(vcpu->run->mmio.data, frag->data, frag->len); | ||
3771 | return X86EMUL_CONTINUE; | 3761 | return X86EMUL_CONTINUE; |
3772 | } | 3762 | } |
3773 | 3763 | ||
@@ -3794,10 +3784,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, | |||
3794 | gpa_t gpa; | 3784 | gpa_t gpa; |
3795 | int handled, ret; | 3785 | int handled, ret; |
3796 | bool write = ops->write; | 3786 | bool write = ops->write; |
3797 | 3787 | struct kvm_mmio_fragment *frag; | |
3798 | if (ops->read_write_prepare && | ||
3799 | ops->read_write_prepare(vcpu, val, bytes)) | ||
3800 | return X86EMUL_CONTINUE; | ||
3801 | 3788 | ||
3802 | ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); | 3789 | ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); |
3803 | 3790 | ||
@@ -3823,15 +3810,19 @@ mmio: | |||
3823 | bytes -= handled; | 3810 | bytes -= handled; |
3824 | val += handled; | 3811 | val += handled; |
3825 | 3812 | ||
3826 | vcpu->mmio_needed = 1; | 3813 | while (bytes) { |
3827 | vcpu->run->exit_reason = KVM_EXIT_MMIO; | 3814 | unsigned now = min(bytes, 8U); |
3828 | vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; | ||
3829 | vcpu->mmio_size = bytes; | ||
3830 | vcpu->run->mmio.len = min(vcpu->mmio_size, 8); | ||
3831 | vcpu->run->mmio.is_write = vcpu->mmio_is_write = write; | ||
3832 | vcpu->mmio_index = 0; | ||
3833 | 3815 | ||
3834 | return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); | 3816 | frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; |
3817 | frag->gpa = gpa; | ||
3818 | frag->data = val; | ||
3819 | frag->len = now; | ||
3820 | |||
3821 | gpa += now; | ||
3822 | val += now; | ||
3823 | bytes -= now; | ||
3824 | } | ||
3825 | return X86EMUL_CONTINUE; | ||
3835 | } | 3826 | } |
3836 | 3827 | ||
3837 | int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, | 3828 | int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, |
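[Editor's note: the fragment bookkeeping assumed by these hunks lives in the per-vcpu state; its shape, as added elsewhere in this series in kvm_host.h, is roughly:

	struct kvm_mmio_fragment {
		gpa_t gpa;	/* guest-physical address of this piece */
		void *data;	/* emulator buffer backing this piece */
		unsigned len;	/* at most 8 bytes: the kvm_run mmio window */
	};

emulator_read_write_onepage() now only queues fragments; the hunk below moves the actual KVM_EXIT_MMIO setup into emulator_read_write(), after all pages of the access have been split up.]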
@@ -3840,10 +3831,18 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, | |||
3840 | struct read_write_emulator_ops *ops) | 3831 | struct read_write_emulator_ops *ops) |
3841 | { | 3832 | { |
3842 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | 3833 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
3834 | gpa_t gpa; | ||
3835 | int rc; | ||
3836 | |||
3837 | if (ops->read_write_prepare && | ||
3838 | ops->read_write_prepare(vcpu, val, bytes)) | ||
3839 | return X86EMUL_CONTINUE; | ||
3840 | |||
3841 | vcpu->mmio_nr_fragments = 0; | ||
3843 | 3842 | ||
3844 | /* Crossing a page boundary? */ | 3843 | /* Crossing a page boundary? */ |
3845 | if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { | 3844 | if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { |
3846 | int rc, now; | 3845 | int now; |
3847 | 3846 | ||
3848 | now = -addr & ~PAGE_MASK; | 3847 | now = -addr & ~PAGE_MASK; |
3849 | rc = emulator_read_write_onepage(addr, val, now, exception, | 3848 | rc = emulator_read_write_onepage(addr, val, now, exception, |
@@ -3856,8 +3855,25 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, | |||
3856 | bytes -= now; | 3855 | bytes -= now; |
3857 | } | 3856 | } |
3858 | 3857 | ||
3859 | return emulator_read_write_onepage(addr, val, bytes, exception, | 3858 | rc = emulator_read_write_onepage(addr, val, bytes, exception, |
3860 | vcpu, ops); | 3859 | vcpu, ops); |
3860 | if (rc != X86EMUL_CONTINUE) | ||
3861 | return rc; | ||
3862 | |||
3863 | if (!vcpu->mmio_nr_fragments) | ||
3864 | return rc; | ||
3865 | |||
3866 | gpa = vcpu->mmio_fragments[0].gpa; | ||
3867 | |||
3868 | vcpu->mmio_needed = 1; | ||
3869 | vcpu->mmio_cur_fragment = 0; | ||
3870 | |||
3871 | vcpu->run->mmio.len = vcpu->mmio_fragments[0].len; | ||
3872 | vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; | ||
3873 | vcpu->run->exit_reason = KVM_EXIT_MMIO; | ||
3874 | vcpu->run->mmio.phys_addr = gpa; | ||
3875 | |||
3876 | return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); | ||
3861 | } | 3877 | } |
3862 | 3878 | ||
3863 | static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, | 3879 | static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, |
@@ -5263,10 +5279,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5263 | kvm_deliver_pmi(vcpu); | 5279 | kvm_deliver_pmi(vcpu); |
5264 | } | 5280 | } |
5265 | 5281 | ||
5266 | r = kvm_mmu_reload(vcpu); | ||
5267 | if (unlikely(r)) | ||
5268 | goto out; | ||
5269 | |||
5270 | if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { | 5282 | if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { |
5271 | inject_pending_event(vcpu); | 5283 | inject_pending_event(vcpu); |
5272 | 5284 | ||
@@ -5282,6 +5294,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5282 | } | 5294 | } |
5283 | } | 5295 | } |
5284 | 5296 | ||
5297 | r = kvm_mmu_reload(vcpu); | ||
5298 | if (unlikely(r)) { | ||
5299 | kvm_x86_ops->cancel_injection(vcpu); | ||
5300 | goto out; | ||
5301 | } | ||
5302 | |||
5285 | preempt_disable(); | 5303 | preempt_disable(); |
5286 | 5304 | ||
5287 | kvm_x86_ops->prepare_guest_switch(vcpu); | 5305 | kvm_x86_ops->prepare_guest_switch(vcpu); |
@@ -5456,33 +5474,55 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
5456 | return r; | 5474 | return r; |
5457 | } | 5475 | } |
5458 | 5476 | ||
5477 | /* | ||
5478 | * Implements the following, as a state machine: | ||
5479 | * | ||
5480 | * read: | ||
5481 | * for each fragment | ||
5482 | * write gpa, len | ||
5483 | * exit | ||
5484 | * copy data | ||
5485 | * execute insn | ||
5486 | * | ||
5487 | * write: | ||
5488 | * for each fragment | ||
5489 | * write gpa, len | ||
5490 | * copy data | ||
5491 | * exit | ||
5492 | */ | ||
5459 | static int complete_mmio(struct kvm_vcpu *vcpu) | 5493 | static int complete_mmio(struct kvm_vcpu *vcpu) |
5460 | { | 5494 | { |
5461 | struct kvm_run *run = vcpu->run; | 5495 | struct kvm_run *run = vcpu->run; |
5496 | struct kvm_mmio_fragment *frag; | ||
5462 | int r; | 5497 | int r; |
5463 | 5498 | ||
5464 | if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) | 5499 | if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) |
5465 | return 1; | 5500 | return 1; |
5466 | 5501 | ||
5467 | if (vcpu->mmio_needed) { | 5502 | if (vcpu->mmio_needed) { |
5468 | vcpu->mmio_needed = 0; | 5503 | /* Complete previous fragment */ |
5504 | frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; | ||
5469 | if (!vcpu->mmio_is_write) | 5505 | if (!vcpu->mmio_is_write) |
5470 | memcpy(vcpu->mmio_data + vcpu->mmio_index, | 5506 | memcpy(frag->data, run->mmio.data, frag->len); |
5471 | run->mmio.data, 8); | 5507 | if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { |
5472 | vcpu->mmio_index += 8; | 5508 | vcpu->mmio_needed = 0; |
5473 | if (vcpu->mmio_index < vcpu->mmio_size) { | 5509 | if (vcpu->mmio_is_write) |
5474 | run->exit_reason = KVM_EXIT_MMIO; | 5510 | return 1; |
5475 | run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; | 5511 | vcpu->mmio_read_completed = 1; |
5476 | memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); | 5512 | goto done; |
5477 | run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8); | ||
5478 | run->mmio.is_write = vcpu->mmio_is_write; | ||
5479 | vcpu->mmio_needed = 1; | ||
5480 | return 0; | ||
5481 | } | 5513 | } |
5514 | /* Initiate next fragment */ | ||
5515 | ++frag; | ||
5516 | run->exit_reason = KVM_EXIT_MMIO; | ||
5517 | run->mmio.phys_addr = frag->gpa; | ||
5482 | if (vcpu->mmio_is_write) | 5518 | if (vcpu->mmio_is_write) |
5483 | return 1; | 5519 | memcpy(run->mmio.data, frag->data, frag->len); |
5484 | vcpu->mmio_read_completed = 1; | 5520 | run->mmio.len = frag->len; |
5521 | run->mmio.is_write = vcpu->mmio_is_write; | ||
5522 | return 0; | ||
5523 | |||
5485 | } | 5524 | } |
5525 | done: | ||
5486 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); | 5526 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); |
5487 | r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); | 5527 | r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); |
5488 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | 5528 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); |
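[Editor's note: on the userspace side the state machine is invisible beyond the fact that a single emulated access may now produce several KVM_EXIT_MMIO round trips (e.g. a 16-byte SSE access crossing a page boundary). A hypothetical VMM loop; handle_mmio() is a placeholder for the device model, run is the vcpu's mmap()ed kvm_run area:

	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			break;
		if (run->exit_reason != KVM_EXIT_MMIO)
			break;			/* other exits handled elsewhere */
		/* one fragment per exit, at most 8 bytes */
		handle_mmio(run->mmio.phys_addr, run->mmio.data,
			    run->mmio.len, run->mmio.is_write);
	}
]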
@@ -6399,21 +6439,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | |||
6399 | kvm_cpu_has_interrupt(vcpu)); | 6439 | kvm_cpu_has_interrupt(vcpu)); |
6400 | } | 6440 | } |
6401 | 6441 | ||
6402 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) | 6442 | int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) |
6403 | { | 6443 | { |
6404 | int me; | 6444 | return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; |
6405 | int cpu = vcpu->cpu; | ||
6406 | |||
6407 | if (waitqueue_active(&vcpu->wq)) { | ||
6408 | wake_up_interruptible(&vcpu->wq); | ||
6409 | ++vcpu->stat.halt_wakeup; | ||
6410 | } | ||
6411 | |||
6412 | me = get_cpu(); | ||
6413 | if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) | ||
6414 | if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE) | ||
6415 | smp_send_reschedule(cpu); | ||
6416 | put_cpu(); | ||
6417 | } | 6445 | } |
6418 | 6446 | ||
6419 | int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) | 6447 | int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) |
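[Editor's note: kvm_vcpu_kick() itself moves to common code in virt/kvm/kvm_main.c, keeping only the arch-neutral parts; the x86-specific "is it in guest mode" test stays behind as kvm_arch_vcpu_should_kick(). The generic side now reads roughly:

	void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
	{
		int me, cpu = vcpu->cpu;

		if (waitqueue_active(&vcpu->wq)) {
			wake_up_interruptible(&vcpu->wq);
			++vcpu->stat.halt_wakeup;
		}

		me = get_cpu();
		if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
			if (kvm_arch_vcpu_should_kick(vcpu))
				smp_send_reschedule(cpu);
		put_cpu();
	}
]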
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cb80c293cdd8..3d1134ddb885 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -64,7 +64,7 @@ static inline int is_pse(struct kvm_vcpu *vcpu) | |||
64 | 64 | ||
65 | static inline int is_paging(struct kvm_vcpu *vcpu) | 65 | static inline int is_paging(struct kvm_vcpu *vcpu) |
66 | { | 66 | { |
67 | return kvm_read_cr0_bits(vcpu, X86_CR0_PG); | 67 | return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG)); |
68 | } | 68 | } |
69 | 69 | ||
70 | static inline u32 bit(int bitno) | 70 | static inline u32 bit(int bitno) |