aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-07-24 12:07:03 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-07-24 12:07:03 -0400
commit5fabc487c96819dd12ddb9414835d170fd9cd6d5 (patch)
tree01532d492e5074b0d3add29bf92ebf9a9d161e9e /arch/x86/kvm
parentc61264f98c1a974ee6f545f61a4ab33b141d6bda (diff)
parent3f68b0318bbbd61bf08478ab99a149f0d9e5156e (diff)
Merge branch 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits) KVM: IOMMU: Disable device assignment without interrupt remapping KVM: MMU: trace mmio page fault KVM: MMU: mmio page fault support KVM: MMU: reorganize struct kvm_shadow_walk_iterator KVM: MMU: lockless walking shadow page table KVM: MMU: do not need atomicly to set/clear spte KVM: MMU: introduce the rules to modify shadow page table KVM: MMU: abstract some functions to handle fault pfn KVM: MMU: filter out the mmio pfn from the fault pfn KVM: MMU: remove bypass_guest_pf KVM: MMU: split kvm_mmu_free_page KVM: MMU: count used shadow pages on prepareing path KVM: MMU: rename 'pt_write' to 'emulate' KVM: MMU: cleanup for FNAME(fetch) KVM: MMU: optimize to handle dirty bit KVM: MMU: cache mmio info on page fault path KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup the code KVM: MMU: do not update slot bitmap if spte is nonpresent KVM: MMU: fix walking shadow page table KVM guest: KVM Steal time registration ...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/emulate.c1749
-rw-r--r--arch/x86/kvm/mmu.c1226
-rw-r--r--arch/x86/kvm/mmu.h25
-rw-r--r--arch/x86/kvm/mmu_audit.c12
-rw-r--r--arch/x86/kvm/mmutrace.h48
-rw-r--r--arch/x86/kvm/paging_tmpl.h258
-rw-r--r--arch/x86/kvm/svm.c6
-rw-r--r--arch/x86/kvm/trace.h31
-rw-r--r--arch/x86/kvm/vmx.c2784
-rw-r--r--arch/x86/kvm/x86.c374
-rw-r--r--arch/x86/kvm/x86.h44
12 files changed, 4744 insertions, 1814 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 65cf8233d25c..988724b236b6 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -31,6 +31,7 @@ config KVM
31 select KVM_ASYNC_PF 31 select KVM_ASYNC_PF
32 select USER_RETURN_NOTIFIER 32 select USER_RETURN_NOTIFIER
33 select KVM_MMIO 33 select KVM_MMIO
34 select TASK_DELAY_ACCT
34 ---help--- 35 ---help---
35 Support hosting fully virtualized guest machines using hardware 36 Support hosting fully virtualized guest machines using hardware
36 virtualization extensions. You will need a fairly recent 37 virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index adc98675cda0..6f08bc940fa8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -407,76 +407,59 @@ struct gprefix {
407 } \ 407 } \
408 } while (0) 408 } while (0)
409 409
410/* Fetch next part of the instruction being emulated. */
411#define insn_fetch(_type, _size, _eip) \
412({ unsigned long _x; \
413 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
414 if (rc != X86EMUL_CONTINUE) \
415 goto done; \
416 (_eip) += (_size); \
417 (_type)_x; \
418})
419
420#define insn_fetch_arr(_arr, _size, _eip) \
421({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
422 if (rc != X86EMUL_CONTINUE) \
423 goto done; \
424 (_eip) += (_size); \
425})
426
427static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, 410static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
428 enum x86_intercept intercept, 411 enum x86_intercept intercept,
429 enum x86_intercept_stage stage) 412 enum x86_intercept_stage stage)
430{ 413{
431 struct x86_instruction_info info = { 414 struct x86_instruction_info info = {
432 .intercept = intercept, 415 .intercept = intercept,
433 .rep_prefix = ctxt->decode.rep_prefix, 416 .rep_prefix = ctxt->rep_prefix,
434 .modrm_mod = ctxt->decode.modrm_mod, 417 .modrm_mod = ctxt->modrm_mod,
435 .modrm_reg = ctxt->decode.modrm_reg, 418 .modrm_reg = ctxt->modrm_reg,
436 .modrm_rm = ctxt->decode.modrm_rm, 419 .modrm_rm = ctxt->modrm_rm,
437 .src_val = ctxt->decode.src.val64, 420 .src_val = ctxt->src.val64,
438 .src_bytes = ctxt->decode.src.bytes, 421 .src_bytes = ctxt->src.bytes,
439 .dst_bytes = ctxt->decode.dst.bytes, 422 .dst_bytes = ctxt->dst.bytes,
440 .ad_bytes = ctxt->decode.ad_bytes, 423 .ad_bytes = ctxt->ad_bytes,
441 .next_rip = ctxt->eip, 424 .next_rip = ctxt->eip,
442 }; 425 };
443 426
444 return ctxt->ops->intercept(ctxt, &info, stage); 427 return ctxt->ops->intercept(ctxt, &info, stage);
445} 428}
446 429
447static inline unsigned long ad_mask(struct decode_cache *c) 430static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
448{ 431{
449 return (1UL << (c->ad_bytes << 3)) - 1; 432 return (1UL << (ctxt->ad_bytes << 3)) - 1;
450} 433}
451 434
452/* Access/update address held in a register, based on addressing mode. */ 435/* Access/update address held in a register, based on addressing mode. */
453static inline unsigned long 436static inline unsigned long
454address_mask(struct decode_cache *c, unsigned long reg) 437address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
455{ 438{
456 if (c->ad_bytes == sizeof(unsigned long)) 439 if (ctxt->ad_bytes == sizeof(unsigned long))
457 return reg; 440 return reg;
458 else 441 else
459 return reg & ad_mask(c); 442 return reg & ad_mask(ctxt);
460} 443}
461 444
462static inline unsigned long 445static inline unsigned long
463register_address(struct decode_cache *c, unsigned long reg) 446register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
464{ 447{
465 return address_mask(c, reg); 448 return address_mask(ctxt, reg);
466} 449}
467 450
468static inline void 451static inline void
469register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) 452register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
470{ 453{
471 if (c->ad_bytes == sizeof(unsigned long)) 454 if (ctxt->ad_bytes == sizeof(unsigned long))
472 *reg += inc; 455 *reg += inc;
473 else 456 else
474 *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c)); 457 *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));
475} 458}
476 459
477static inline void jmp_rel(struct decode_cache *c, int rel) 460static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
478{ 461{
479 register_address_increment(c, &c->eip, rel); 462 register_address_increment(ctxt, &ctxt->_eip, rel);
480} 463}
481 464
482static u32 desc_limit_scaled(struct desc_struct *desc) 465static u32 desc_limit_scaled(struct desc_struct *desc)
@@ -486,28 +469,26 @@ static u32 desc_limit_scaled(struct desc_struct *desc)
486 return desc->g ? (limit << 12) | 0xfff : limit; 469 return desc->g ? (limit << 12) | 0xfff : limit;
487} 470}
488 471
489static void set_seg_override(struct decode_cache *c, int seg) 472static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)
490{ 473{
491 c->has_seg_override = true; 474 ctxt->has_seg_override = true;
492 c->seg_override = seg; 475 ctxt->seg_override = seg;
493} 476}
494 477
495static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, 478static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
496 struct x86_emulate_ops *ops, int seg)
497{ 479{
498 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) 480 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
499 return 0; 481 return 0;
500 482
501 return ops->get_cached_segment_base(ctxt, seg); 483 return ctxt->ops->get_cached_segment_base(ctxt, seg);
502} 484}
503 485
504static unsigned seg_override(struct x86_emulate_ctxt *ctxt, 486static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
505 struct decode_cache *c)
506{ 487{
507 if (!c->has_seg_override) 488 if (!ctxt->has_seg_override)
508 return 0; 489 return 0;
509 490
510 return c->seg_override; 491 return ctxt->seg_override;
511} 492}
512 493
513static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, 494static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
@@ -579,7 +560,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
579 unsigned size, bool write, bool fetch, 560 unsigned size, bool write, bool fetch,
580 ulong *linear) 561 ulong *linear)
581{ 562{
582 struct decode_cache *c = &ctxt->decode;
583 struct desc_struct desc; 563 struct desc_struct desc;
584 bool usable; 564 bool usable;
585 ulong la; 565 ulong la;
@@ -587,7 +567,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
587 u16 sel; 567 u16 sel;
588 unsigned cpl, rpl; 568 unsigned cpl, rpl;
589 569
590 la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; 570 la = seg_base(ctxt, addr.seg) + addr.ea;
591 switch (ctxt->mode) { 571 switch (ctxt->mode) {
592 case X86EMUL_MODE_REAL: 572 case X86EMUL_MODE_REAL:
593 break; 573 break;
@@ -637,7 +617,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
637 } 617 }
638 break; 618 break;
639 } 619 }
640 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8) 620 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
641 la &= (u32)-1; 621 la &= (u32)-1;
642 *linear = la; 622 *linear = la;
643 return X86EMUL_CONTINUE; 623 return X86EMUL_CONTINUE;
@@ -671,11 +651,10 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
671 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); 651 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
672} 652}
673 653
674static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 654static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
675 struct x86_emulate_ops *ops,
676 unsigned long eip, u8 *dest) 655 unsigned long eip, u8 *dest)
677{ 656{
678 struct fetch_cache *fc = &ctxt->decode.fetch; 657 struct fetch_cache *fc = &ctxt->fetch;
679 int rc; 658 int rc;
680 int size, cur_size; 659 int size, cur_size;
681 660
@@ -687,8 +666,8 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
687 rc = __linearize(ctxt, addr, size, false, true, &linear); 666 rc = __linearize(ctxt, addr, size, false, true, &linear);
688 if (rc != X86EMUL_CONTINUE) 667 if (rc != X86EMUL_CONTINUE)
689 return rc; 668 return rc;
690 rc = ops->fetch(ctxt, linear, fc->data + cur_size, 669 rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
691 size, &ctxt->exception); 670 size, &ctxt->exception);
692 if (rc != X86EMUL_CONTINUE) 671 if (rc != X86EMUL_CONTINUE)
693 return rc; 672 return rc;
694 fc->end += size; 673 fc->end += size;
@@ -698,7 +677,6 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
698} 677}
699 678
700static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, 679static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
701 struct x86_emulate_ops *ops,
702 unsigned long eip, void *dest, unsigned size) 680 unsigned long eip, void *dest, unsigned size)
703{ 681{
704 int rc; 682 int rc;
@@ -707,13 +685,30 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
707 if (eip + size - ctxt->eip > 15) 685 if (eip + size - ctxt->eip > 15)
708 return X86EMUL_UNHANDLEABLE; 686 return X86EMUL_UNHANDLEABLE;
709 while (size--) { 687 while (size--) {
710 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 688 rc = do_insn_fetch_byte(ctxt, eip++, dest++);
711 if (rc != X86EMUL_CONTINUE) 689 if (rc != X86EMUL_CONTINUE)
712 return rc; 690 return rc;
713 } 691 }
714 return X86EMUL_CONTINUE; 692 return X86EMUL_CONTINUE;
715} 693}
716 694
695/* Fetch next part of the instruction being emulated. */
696#define insn_fetch(_type, _size, _eip) \
697({ unsigned long _x; \
698 rc = do_insn_fetch(ctxt, (_eip), &_x, (_size)); \
699 if (rc != X86EMUL_CONTINUE) \
700 goto done; \
701 (_eip) += (_size); \
702 (_type)_x; \
703})
704
705#define insn_fetch_arr(_arr, _size, _eip) \
706({ rc = do_insn_fetch(ctxt, (_eip), _arr, (_size)); \
707 if (rc != X86EMUL_CONTINUE) \
708 goto done; \
709 (_eip) += (_size); \
710})
711
717/* 712/*
718 * Given the 'reg' portion of a ModRM byte, and a register block, return a 713 * Given the 'reg' portion of a ModRM byte, and a register block, return a
719 * pointer into the block that addresses the relevant register. 714 * pointer into the block that addresses the relevant register.
@@ -857,16 +852,15 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
857 852
858static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 853static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
859 struct operand *op, 854 struct operand *op,
860 struct decode_cache *c,
861 int inhibit_bytereg) 855 int inhibit_bytereg)
862{ 856{
863 unsigned reg = c->modrm_reg; 857 unsigned reg = ctxt->modrm_reg;
864 int highbyte_regs = c->rex_prefix == 0; 858 int highbyte_regs = ctxt->rex_prefix == 0;
865 859
866 if (!(c->d & ModRM)) 860 if (!(ctxt->d & ModRM))
867 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); 861 reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
868 862
869 if (c->d & Sse) { 863 if (ctxt->d & Sse) {
870 op->type = OP_XMM; 864 op->type = OP_XMM;
871 op->bytes = 16; 865 op->bytes = 16;
872 op->addr.xmm = reg; 866 op->addr.xmm = reg;
@@ -875,49 +869,47 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
875 } 869 }
876 870
877 op->type = OP_REG; 871 op->type = OP_REG;
878 if ((c->d & ByteOp) && !inhibit_bytereg) { 872 if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
879 op->addr.reg = decode_register(reg, c->regs, highbyte_regs); 873 op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
880 op->bytes = 1; 874 op->bytes = 1;
881 } else { 875 } else {
882 op->addr.reg = decode_register(reg, c->regs, 0); 876 op->addr.reg = decode_register(reg, ctxt->regs, 0);
883 op->bytes = c->op_bytes; 877 op->bytes = ctxt->op_bytes;
884 } 878 }
885 fetch_register_operand(op); 879 fetch_register_operand(op);
886 op->orig_val = op->val; 880 op->orig_val = op->val;
887} 881}
888 882
889static int decode_modrm(struct x86_emulate_ctxt *ctxt, 883static int decode_modrm(struct x86_emulate_ctxt *ctxt,
890 struct x86_emulate_ops *ops,
891 struct operand *op) 884 struct operand *op)
892{ 885{
893 struct decode_cache *c = &ctxt->decode;
894 u8 sib; 886 u8 sib;
895 int index_reg = 0, base_reg = 0, scale; 887 int index_reg = 0, base_reg = 0, scale;
896 int rc = X86EMUL_CONTINUE; 888 int rc = X86EMUL_CONTINUE;
897 ulong modrm_ea = 0; 889 ulong modrm_ea = 0;
898 890
899 if (c->rex_prefix) { 891 if (ctxt->rex_prefix) {
900 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ 892 ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */
901 index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ 893 index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */
902 c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ 894 ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
903 } 895 }
904 896
905 c->modrm = insn_fetch(u8, 1, c->eip); 897 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
906 c->modrm_mod |= (c->modrm & 0xc0) >> 6; 898 ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
907 c->modrm_reg |= (c->modrm & 0x38) >> 3; 899 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
908 c->modrm_rm |= (c->modrm & 0x07); 900 ctxt->modrm_rm |= (ctxt->modrm & 0x07);
909 c->modrm_seg = VCPU_SREG_DS; 901 ctxt->modrm_seg = VCPU_SREG_DS;
910 902
911 if (c->modrm_mod == 3) { 903 if (ctxt->modrm_mod == 3) {
912 op->type = OP_REG; 904 op->type = OP_REG;
913 op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 905 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
914 op->addr.reg = decode_register(c->modrm_rm, 906 op->addr.reg = decode_register(ctxt->modrm_rm,
915 c->regs, c->d & ByteOp); 907 ctxt->regs, ctxt->d & ByteOp);
916 if (c->d & Sse) { 908 if (ctxt->d & Sse) {
917 op->type = OP_XMM; 909 op->type = OP_XMM;
918 op->bytes = 16; 910 op->bytes = 16;
919 op->addr.xmm = c->modrm_rm; 911 op->addr.xmm = ctxt->modrm_rm;
920 read_sse_reg(ctxt, &op->vec_val, c->modrm_rm); 912 read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
921 return rc; 913 return rc;
922 } 914 }
923 fetch_register_operand(op); 915 fetch_register_operand(op);
@@ -926,26 +918,26 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
926 918
927 op->type = OP_MEM; 919 op->type = OP_MEM;
928 920
929 if (c->ad_bytes == 2) { 921 if (ctxt->ad_bytes == 2) {
930 unsigned bx = c->regs[VCPU_REGS_RBX]; 922 unsigned bx = ctxt->regs[VCPU_REGS_RBX];
931 unsigned bp = c->regs[VCPU_REGS_RBP]; 923 unsigned bp = ctxt->regs[VCPU_REGS_RBP];
932 unsigned si = c->regs[VCPU_REGS_RSI]; 924 unsigned si = ctxt->regs[VCPU_REGS_RSI];
933 unsigned di = c->regs[VCPU_REGS_RDI]; 925 unsigned di = ctxt->regs[VCPU_REGS_RDI];
934 926
935 /* 16-bit ModR/M decode. */ 927 /* 16-bit ModR/M decode. */
936 switch (c->modrm_mod) { 928 switch (ctxt->modrm_mod) {
937 case 0: 929 case 0:
938 if (c->modrm_rm == 6) 930 if (ctxt->modrm_rm == 6)
939 modrm_ea += insn_fetch(u16, 2, c->eip); 931 modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
940 break; 932 break;
941 case 1: 933 case 1:
942 modrm_ea += insn_fetch(s8, 1, c->eip); 934 modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
943 break; 935 break;
944 case 2: 936 case 2:
945 modrm_ea += insn_fetch(u16, 2, c->eip); 937 modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
946 break; 938 break;
947 } 939 }
948 switch (c->modrm_rm) { 940 switch (ctxt->modrm_rm) {
949 case 0: 941 case 0:
950 modrm_ea += bx + si; 942 modrm_ea += bx + si;
951 break; 943 break;
@@ -965,46 +957,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
965 modrm_ea += di; 957 modrm_ea += di;
966 break; 958 break;
967 case 6: 959 case 6:
968 if (c->modrm_mod != 0) 960 if (ctxt->modrm_mod != 0)
969 modrm_ea += bp; 961 modrm_ea += bp;
970 break; 962 break;
971 case 7: 963 case 7:
972 modrm_ea += bx; 964 modrm_ea += bx;
973 break; 965 break;
974 } 966 }
975 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 967 if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
976 (c->modrm_rm == 6 && c->modrm_mod != 0)) 968 (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
977 c->modrm_seg = VCPU_SREG_SS; 969 ctxt->modrm_seg = VCPU_SREG_SS;
978 modrm_ea = (u16)modrm_ea; 970 modrm_ea = (u16)modrm_ea;
979 } else { 971 } else {
980 /* 32/64-bit ModR/M decode. */ 972 /* 32/64-bit ModR/M decode. */
981 if ((c->modrm_rm & 7) == 4) { 973 if ((ctxt->modrm_rm & 7) == 4) {
982 sib = insn_fetch(u8, 1, c->eip); 974 sib = insn_fetch(u8, 1, ctxt->_eip);
983 index_reg |= (sib >> 3) & 7; 975 index_reg |= (sib >> 3) & 7;
984 base_reg |= sib & 7; 976 base_reg |= sib & 7;
985 scale = sib >> 6; 977 scale = sib >> 6;
986 978
987 if ((base_reg & 7) == 5 && c->modrm_mod == 0) 979 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
988 modrm_ea += insn_fetch(s32, 4, c->eip); 980 modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
989 else 981 else
990 modrm_ea += c->regs[base_reg]; 982 modrm_ea += ctxt->regs[base_reg];
991 if (index_reg != 4) 983 if (index_reg != 4)
992 modrm_ea += c->regs[index_reg] << scale; 984 modrm_ea += ctxt->regs[index_reg] << scale;
993 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { 985 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
994 if (ctxt->mode == X86EMUL_MODE_PROT64) 986 if (ctxt->mode == X86EMUL_MODE_PROT64)
995 c->rip_relative = 1; 987 ctxt->rip_relative = 1;
996 } else 988 } else
997 modrm_ea += c->regs[c->modrm_rm]; 989 modrm_ea += ctxt->regs[ctxt->modrm_rm];
998 switch (c->modrm_mod) { 990 switch (ctxt->modrm_mod) {
999 case 0: 991 case 0:
1000 if (c->modrm_rm == 5) 992 if (ctxt->modrm_rm == 5)
1001 modrm_ea += insn_fetch(s32, 4, c->eip); 993 modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
1002 break; 994 break;
1003 case 1: 995 case 1:
1004 modrm_ea += insn_fetch(s8, 1, c->eip); 996 modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
1005 break; 997 break;
1006 case 2: 998 case 2:
1007 modrm_ea += insn_fetch(s32, 4, c->eip); 999 modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
1008 break; 1000 break;
1009 } 1001 }
1010 } 1002 }
@@ -1014,53 +1006,50 @@ done:
1014} 1006}
1015 1007
1016static int decode_abs(struct x86_emulate_ctxt *ctxt, 1008static int decode_abs(struct x86_emulate_ctxt *ctxt,
1017 struct x86_emulate_ops *ops,
1018 struct operand *op) 1009 struct operand *op)
1019{ 1010{
1020 struct decode_cache *c = &ctxt->decode;
1021 int rc = X86EMUL_CONTINUE; 1011 int rc = X86EMUL_CONTINUE;
1022 1012
1023 op->type = OP_MEM; 1013 op->type = OP_MEM;
1024 switch (c->ad_bytes) { 1014 switch (ctxt->ad_bytes) {
1025 case 2: 1015 case 2:
1026 op->addr.mem.ea = insn_fetch(u16, 2, c->eip); 1016 op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip);
1027 break; 1017 break;
1028 case 4: 1018 case 4:
1029 op->addr.mem.ea = insn_fetch(u32, 4, c->eip); 1019 op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip);
1030 break; 1020 break;
1031 case 8: 1021 case 8:
1032 op->addr.mem.ea = insn_fetch(u64, 8, c->eip); 1022 op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip);
1033 break; 1023 break;
1034 } 1024 }
1035done: 1025done:
1036 return rc; 1026 return rc;
1037} 1027}
1038 1028
1039static void fetch_bit_operand(struct decode_cache *c) 1029static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
1040{ 1030{
1041 long sv = 0, mask; 1031 long sv = 0, mask;
1042 1032
1043 if (c->dst.type == OP_MEM && c->src.type == OP_REG) { 1033 if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
1044 mask = ~(c->dst.bytes * 8 - 1); 1034 mask = ~(ctxt->dst.bytes * 8 - 1);
1045 1035
1046 if (c->src.bytes == 2) 1036 if (ctxt->src.bytes == 2)
1047 sv = (s16)c->src.val & (s16)mask; 1037 sv = (s16)ctxt->src.val & (s16)mask;
1048 else if (c->src.bytes == 4) 1038 else if (ctxt->src.bytes == 4)
1049 sv = (s32)c->src.val & (s32)mask; 1039 sv = (s32)ctxt->src.val & (s32)mask;
1050 1040
1051 c->dst.addr.mem.ea += (sv >> 3); 1041 ctxt->dst.addr.mem.ea += (sv >> 3);
1052 } 1042 }
1053 1043
1054 /* only subword offset */ 1044 /* only subword offset */
1055 c->src.val &= (c->dst.bytes << 3) - 1; 1045 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
1056} 1046}
1057 1047
1058static int read_emulated(struct x86_emulate_ctxt *ctxt, 1048static int read_emulated(struct x86_emulate_ctxt *ctxt,
1059 struct x86_emulate_ops *ops,
1060 unsigned long addr, void *dest, unsigned size) 1049 unsigned long addr, void *dest, unsigned size)
1061{ 1050{
1062 int rc; 1051 int rc;
1063 struct read_cache *mc = &ctxt->decode.mem_read; 1052 struct read_cache *mc = &ctxt->mem_read;
1064 1053
1065 while (size) { 1054 while (size) {
1066 int n = min(size, 8u); 1055 int n = min(size, 8u);
@@ -1068,8 +1057,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1068 if (mc->pos < mc->end) 1057 if (mc->pos < mc->end)
1069 goto read_cached; 1058 goto read_cached;
1070 1059
1071 rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n, 1060 rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
1072 &ctxt->exception); 1061 &ctxt->exception);
1073 if (rc != X86EMUL_CONTINUE) 1062 if (rc != X86EMUL_CONTINUE)
1074 return rc; 1063 return rc;
1075 mc->end += n; 1064 mc->end += n;
@@ -1094,7 +1083,7 @@ static int segmented_read(struct x86_emulate_ctxt *ctxt,
1094 rc = linearize(ctxt, addr, size, false, &linear); 1083 rc = linearize(ctxt, addr, size, false, &linear);
1095 if (rc != X86EMUL_CONTINUE) 1084 if (rc != X86EMUL_CONTINUE)
1096 return rc; 1085 return rc;
1097 return read_emulated(ctxt, ctxt->ops, linear, data, size); 1086 return read_emulated(ctxt, linear, data, size);
1098} 1087}
1099 1088
1100static int segmented_write(struct x86_emulate_ctxt *ctxt, 1089static int segmented_write(struct x86_emulate_ctxt *ctxt,
@@ -1128,26 +1117,24 @@ static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
1128} 1117}
1129 1118
1130static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, 1119static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1131 struct x86_emulate_ops *ops,
1132 unsigned int size, unsigned short port, 1120 unsigned int size, unsigned short port,
1133 void *dest) 1121 void *dest)
1134{ 1122{
1135 struct read_cache *rc = &ctxt->decode.io_read; 1123 struct read_cache *rc = &ctxt->io_read;
1136 1124
1137 if (rc->pos == rc->end) { /* refill pio read ahead */ 1125 if (rc->pos == rc->end) { /* refill pio read ahead */
1138 struct decode_cache *c = &ctxt->decode;
1139 unsigned int in_page, n; 1126 unsigned int in_page, n;
1140 unsigned int count = c->rep_prefix ? 1127 unsigned int count = ctxt->rep_prefix ?
1141 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1; 1128 address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1;
1142 in_page = (ctxt->eflags & EFLG_DF) ? 1129 in_page = (ctxt->eflags & EFLG_DF) ?
1143 offset_in_page(c->regs[VCPU_REGS_RDI]) : 1130 offset_in_page(ctxt->regs[VCPU_REGS_RDI]) :
1144 PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]); 1131 PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]);
1145 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, 1132 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
1146 count); 1133 count);
1147 if (n == 0) 1134 if (n == 0)
1148 n = 1; 1135 n = 1;
1149 rc->pos = rc->end = 0; 1136 rc->pos = rc->end = 0;
1150 if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n)) 1137 if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
1151 return 0; 1138 return 0;
1152 rc->end = n * size; 1139 rc->end = n * size;
1153 } 1140 }
@@ -1158,9 +1145,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1158} 1145}
1159 1146
1160static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, 1147static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1161 struct x86_emulate_ops *ops,
1162 u16 selector, struct desc_ptr *dt) 1148 u16 selector, struct desc_ptr *dt)
1163{ 1149{
1150 struct x86_emulate_ops *ops = ctxt->ops;
1151
1164 if (selector & 1 << 2) { 1152 if (selector & 1 << 2) {
1165 struct desc_struct desc; 1153 struct desc_struct desc;
1166 u16 sel; 1154 u16 sel;
@@ -1177,48 +1165,42 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1177 1165
1178/* allowed just for 8 bytes segments */ 1166/* allowed just for 8 bytes segments */
1179static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1167static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1180 struct x86_emulate_ops *ops,
1181 u16 selector, struct desc_struct *desc) 1168 u16 selector, struct desc_struct *desc)
1182{ 1169{
1183 struct desc_ptr dt; 1170 struct desc_ptr dt;
1184 u16 index = selector >> 3; 1171 u16 index = selector >> 3;
1185 int ret;
1186 ulong addr; 1172 ulong addr;
1187 1173
1188 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1174 get_descriptor_table_ptr(ctxt, selector, &dt);
1189 1175
1190 if (dt.size < index * 8 + 7) 1176 if (dt.size < index * 8 + 7)
1191 return emulate_gp(ctxt, selector & 0xfffc); 1177 return emulate_gp(ctxt, selector & 0xfffc);
1192 addr = dt.address + index * 8;
1193 ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
1194 1178
1195 return ret; 1179 addr = dt.address + index * 8;
1180 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
1181 &ctxt->exception);
1196} 1182}
1197 1183
1198/* allowed just for 8 bytes segments */ 1184/* allowed just for 8 bytes segments */
1199static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1185static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1200 struct x86_emulate_ops *ops,
1201 u16 selector, struct desc_struct *desc) 1186 u16 selector, struct desc_struct *desc)
1202{ 1187{
1203 struct desc_ptr dt; 1188 struct desc_ptr dt;
1204 u16 index = selector >> 3; 1189 u16 index = selector >> 3;
1205 ulong addr; 1190 ulong addr;
1206 int ret;
1207 1191
1208 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1192 get_descriptor_table_ptr(ctxt, selector, &dt);
1209 1193
1210 if (dt.size < index * 8 + 7) 1194 if (dt.size < index * 8 + 7)
1211 return emulate_gp(ctxt, selector & 0xfffc); 1195 return emulate_gp(ctxt, selector & 0xfffc);
1212 1196
1213 addr = dt.address + index * 8; 1197 addr = dt.address + index * 8;
1214 ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); 1198 return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
1215 1199 &ctxt->exception);
1216 return ret;
1217} 1200}
1218 1201
1219/* Does not support long mode */ 1202/* Does not support long mode */
1220static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1203static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1221 struct x86_emulate_ops *ops,
1222 u16 selector, int seg) 1204 u16 selector, int seg)
1223{ 1205{
1224 struct desc_struct seg_desc; 1206 struct desc_struct seg_desc;
@@ -1253,7 +1235,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1253 if (null_selector) /* for NULL selector skip all following checks */ 1235 if (null_selector) /* for NULL selector skip all following checks */
1254 goto load; 1236 goto load;
1255 1237
1256 ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc); 1238 ret = read_segment_descriptor(ctxt, selector, &seg_desc);
1257 if (ret != X86EMUL_CONTINUE) 1239 if (ret != X86EMUL_CONTINUE)
1258 return ret; 1240 return ret;
1259 1241
@@ -1271,7 +1253,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1271 1253
1272 rpl = selector & 3; 1254 rpl = selector & 3;
1273 dpl = seg_desc.dpl; 1255 dpl = seg_desc.dpl;
1274 cpl = ops->cpl(ctxt); 1256 cpl = ctxt->ops->cpl(ctxt);
1275 1257
1276 switch (seg) { 1258 switch (seg) {
1277 case VCPU_SREG_SS: 1259 case VCPU_SREG_SS:
@@ -1322,12 +1304,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1322 if (seg_desc.s) { 1304 if (seg_desc.s) {
1323 /* mark segment as accessed */ 1305 /* mark segment as accessed */
1324 seg_desc.type |= 1; 1306 seg_desc.type |= 1;
1325 ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc); 1307 ret = write_segment_descriptor(ctxt, selector, &seg_desc);
1326 if (ret != X86EMUL_CONTINUE) 1308 if (ret != X86EMUL_CONTINUE)
1327 return ret; 1309 return ret;
1328 } 1310 }
1329load: 1311load:
1330 ops->set_segment(ctxt, selector, &seg_desc, 0, seg); 1312 ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
1331 return X86EMUL_CONTINUE; 1313 return X86EMUL_CONTINUE;
1332exception: 1314exception:
1333 emulate_exception(ctxt, err_vec, err_code, true); 1315 emulate_exception(ctxt, err_vec, err_code, true);
@@ -1356,29 +1338,28 @@ static void write_register_operand(struct operand *op)
1356static int writeback(struct x86_emulate_ctxt *ctxt) 1338static int writeback(struct x86_emulate_ctxt *ctxt)
1357{ 1339{
1358 int rc; 1340 int rc;
1359 struct decode_cache *c = &ctxt->decode;
1360 1341
1361 switch (c->dst.type) { 1342 switch (ctxt->dst.type) {
1362 case OP_REG: 1343 case OP_REG:
1363 write_register_operand(&c->dst); 1344 write_register_operand(&ctxt->dst);
1364 break; 1345 break;
1365 case OP_MEM: 1346 case OP_MEM:
1366 if (c->lock_prefix) 1347 if (ctxt->lock_prefix)
1367 rc = segmented_cmpxchg(ctxt, 1348 rc = segmented_cmpxchg(ctxt,
1368 c->dst.addr.mem, 1349 ctxt->dst.addr.mem,
1369 &c->dst.orig_val, 1350 &ctxt->dst.orig_val,
1370 &c->dst.val, 1351 &ctxt->dst.val,
1371 c->dst.bytes); 1352 ctxt->dst.bytes);
1372 else 1353 else
1373 rc = segmented_write(ctxt, 1354 rc = segmented_write(ctxt,
1374 c->dst.addr.mem, 1355 ctxt->dst.addr.mem,
1375 &c->dst.val, 1356 &ctxt->dst.val,
1376 c->dst.bytes); 1357 ctxt->dst.bytes);
1377 if (rc != X86EMUL_CONTINUE) 1358 if (rc != X86EMUL_CONTINUE)
1378 return rc; 1359 return rc;
1379 break; 1360 break;
1380 case OP_XMM: 1361 case OP_XMM:
1381 write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm); 1362 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
1382 break; 1363 break;
1383 case OP_NONE: 1364 case OP_NONE:
1384 /* no writeback */ 1365 /* no writeback */
@@ -1391,50 +1372,45 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1391 1372
1392static int em_push(struct x86_emulate_ctxt *ctxt) 1373static int em_push(struct x86_emulate_ctxt *ctxt)
1393{ 1374{
1394 struct decode_cache *c = &ctxt->decode;
1395 struct segmented_address addr; 1375 struct segmented_address addr;
1396 1376
1397 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1377 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes);
1398 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); 1378 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
1399 addr.seg = VCPU_SREG_SS; 1379 addr.seg = VCPU_SREG_SS;
1400 1380
1401 /* Disable writeback. */ 1381 /* Disable writeback. */
1402 c->dst.type = OP_NONE; 1382 ctxt->dst.type = OP_NONE;
1403 return segmented_write(ctxt, addr, &c->src.val, c->op_bytes); 1383 return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes);
1404} 1384}
1405 1385
1406static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1386static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1407 void *dest, int len) 1387 void *dest, int len)
1408{ 1388{
1409 struct decode_cache *c = &ctxt->decode;
1410 int rc; 1389 int rc;
1411 struct segmented_address addr; 1390 struct segmented_address addr;
1412 1391
1413 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); 1392 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
1414 addr.seg = VCPU_SREG_SS; 1393 addr.seg = VCPU_SREG_SS;
1415 rc = segmented_read(ctxt, addr, dest, len); 1394 rc = segmented_read(ctxt, addr, dest, len);
1416 if (rc != X86EMUL_CONTINUE) 1395 if (rc != X86EMUL_CONTINUE)
1417 return rc; 1396 return rc;
1418 1397
1419 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); 1398 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len);
1420 return rc; 1399 return rc;
1421} 1400}
1422 1401
1423static int em_pop(struct x86_emulate_ctxt *ctxt) 1402static int em_pop(struct x86_emulate_ctxt *ctxt)
1424{ 1403{
1425 struct decode_cache *c = &ctxt->decode; 1404 return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
1426
1427 return emulate_pop(ctxt, &c->dst.val, c->op_bytes);
1428} 1405}
1429 1406
1430static int emulate_popf(struct x86_emulate_ctxt *ctxt, 1407static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1431 struct x86_emulate_ops *ops, 1408 void *dest, int len)
1432 void *dest, int len)
1433{ 1409{
1434 int rc; 1410 int rc;
1435 unsigned long val, change_mask; 1411 unsigned long val, change_mask;
1436 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1412 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1437 int cpl = ops->cpl(ctxt); 1413 int cpl = ctxt->ops->cpl(ctxt);
1438 1414
1439 rc = emulate_pop(ctxt, &val, len); 1415 rc = emulate_pop(ctxt, &val, len);
1440 if (rc != X86EMUL_CONTINUE) 1416 if (rc != X86EMUL_CONTINUE)
@@ -1470,49 +1446,41 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1470 1446
1471static int em_popf(struct x86_emulate_ctxt *ctxt) 1447static int em_popf(struct x86_emulate_ctxt *ctxt)
1472{ 1448{
1473 struct decode_cache *c = &ctxt->decode; 1449 ctxt->dst.type = OP_REG;
1474 1450 ctxt->dst.addr.reg = &ctxt->eflags;
1475 c->dst.type = OP_REG; 1451 ctxt->dst.bytes = ctxt->op_bytes;
1476 c->dst.addr.reg = &ctxt->eflags; 1452 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
1477 c->dst.bytes = c->op_bytes;
1478 return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
1479} 1453}
1480 1454
1481static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, 1455static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1482 struct x86_emulate_ops *ops, int seg)
1483{ 1456{
1484 struct decode_cache *c = &ctxt->decode; 1457 ctxt->src.val = get_segment_selector(ctxt, seg);
1485
1486 c->src.val = get_segment_selector(ctxt, seg);
1487 1458
1488 return em_push(ctxt); 1459 return em_push(ctxt);
1489} 1460}
1490 1461
1491static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, 1462static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1492 struct x86_emulate_ops *ops, int seg)
1493{ 1463{
1494 struct decode_cache *c = &ctxt->decode;
1495 unsigned long selector; 1464 unsigned long selector;
1496 int rc; 1465 int rc;
1497 1466
1498 rc = emulate_pop(ctxt, &selector, c->op_bytes); 1467 rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
1499 if (rc != X86EMUL_CONTINUE) 1468 if (rc != X86EMUL_CONTINUE)
1500 return rc; 1469 return rc;
1501 1470
1502 rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg); 1471 rc = load_segment_descriptor(ctxt, (u16)selector, seg);
1503 return rc; 1472 return rc;
1504} 1473}
1505 1474
1506static int em_pusha(struct x86_emulate_ctxt *ctxt) 1475static int em_pusha(struct x86_emulate_ctxt *ctxt)
1507{ 1476{
1508 struct decode_cache *c = &ctxt->decode; 1477 unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP];
1509 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1510 int rc = X86EMUL_CONTINUE; 1478 int rc = X86EMUL_CONTINUE;
1511 int reg = VCPU_REGS_RAX; 1479 int reg = VCPU_REGS_RAX;
1512 1480
1513 while (reg <= VCPU_REGS_RDI) { 1481 while (reg <= VCPU_REGS_RDI) {
1514 (reg == VCPU_REGS_RSP) ? 1482 (reg == VCPU_REGS_RSP) ?
1515 (c->src.val = old_esp) : (c->src.val = c->regs[reg]); 1483 (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]);
1516 1484
1517 rc = em_push(ctxt); 1485 rc = em_push(ctxt);
1518 if (rc != X86EMUL_CONTINUE) 1486 if (rc != X86EMUL_CONTINUE)
@@ -1526,26 +1494,23 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
1526 1494
1527static int em_pushf(struct x86_emulate_ctxt *ctxt) 1495static int em_pushf(struct x86_emulate_ctxt *ctxt)
1528{ 1496{
1529 struct decode_cache *c = &ctxt->decode; 1497 ctxt->src.val = (unsigned long)ctxt->eflags;
1530
1531 c->src.val = (unsigned long)ctxt->eflags;
1532 return em_push(ctxt); 1498 return em_push(ctxt);
1533} 1499}
1534 1500
1535static int em_popa(struct x86_emulate_ctxt *ctxt) 1501static int em_popa(struct x86_emulate_ctxt *ctxt)
1536{ 1502{
1537 struct decode_cache *c = &ctxt->decode;
1538 int rc = X86EMUL_CONTINUE; 1503 int rc = X86EMUL_CONTINUE;
1539 int reg = VCPU_REGS_RDI; 1504 int reg = VCPU_REGS_RDI;
1540 1505
1541 while (reg >= VCPU_REGS_RAX) { 1506 while (reg >= VCPU_REGS_RAX) {
1542 if (reg == VCPU_REGS_RSP) { 1507 if (reg == VCPU_REGS_RSP) {
1543 register_address_increment(c, &c->regs[VCPU_REGS_RSP], 1508 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP],
1544 c->op_bytes); 1509 ctxt->op_bytes);
1545 --reg; 1510 --reg;
1546 } 1511 }
1547 1512
1548 rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes); 1513 rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes);
1549 if (rc != X86EMUL_CONTINUE) 1514 if (rc != X86EMUL_CONTINUE)
1550 break; 1515 break;
1551 --reg; 1516 --reg;
@@ -1553,10 +1518,9 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
1553 return rc; 1518 return rc;
1554} 1519}
1555 1520
1556int emulate_int_real(struct x86_emulate_ctxt *ctxt, 1521int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
1557 struct x86_emulate_ops *ops, int irq)
1558{ 1522{
1559 struct decode_cache *c = &ctxt->decode; 1523 struct x86_emulate_ops *ops = ctxt->ops;
1560 int rc; 1524 int rc;
1561 struct desc_ptr dt; 1525 struct desc_ptr dt;
1562 gva_t cs_addr; 1526 gva_t cs_addr;
@@ -1564,19 +1528,19 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1564 u16 cs, eip; 1528 u16 cs, eip;
1565 1529
1566 /* TODO: Add limit checks */ 1530 /* TODO: Add limit checks */
1567 c->src.val = ctxt->eflags; 1531 ctxt->src.val = ctxt->eflags;
1568 rc = em_push(ctxt); 1532 rc = em_push(ctxt);
1569 if (rc != X86EMUL_CONTINUE) 1533 if (rc != X86EMUL_CONTINUE)
1570 return rc; 1534 return rc;
1571 1535
1572 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); 1536 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
1573 1537
1574 c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); 1538 ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
1575 rc = em_push(ctxt); 1539 rc = em_push(ctxt);
1576 if (rc != X86EMUL_CONTINUE) 1540 if (rc != X86EMUL_CONTINUE)
1577 return rc; 1541 return rc;
1578 1542
1579 c->src.val = c->eip; 1543 ctxt->src.val = ctxt->_eip;
1580 rc = em_push(ctxt); 1544 rc = em_push(ctxt);
1581 if (rc != X86EMUL_CONTINUE) 1545 if (rc != X86EMUL_CONTINUE)
1582 return rc; 1546 return rc;
@@ -1594,21 +1558,20 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1594 if (rc != X86EMUL_CONTINUE) 1558 if (rc != X86EMUL_CONTINUE)
1595 return rc; 1559 return rc;
1596 1560
1597 rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS); 1561 rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
1598 if (rc != X86EMUL_CONTINUE) 1562 if (rc != X86EMUL_CONTINUE)
1599 return rc; 1563 return rc;
1600 1564
1601 c->eip = eip; 1565 ctxt->_eip = eip;
1602 1566
1603 return rc; 1567 return rc;
1604} 1568}
1605 1569
1606static int emulate_int(struct x86_emulate_ctxt *ctxt, 1570static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
1607 struct x86_emulate_ops *ops, int irq)
1608{ 1571{
1609 switch(ctxt->mode) { 1572 switch(ctxt->mode) {
1610 case X86EMUL_MODE_REAL: 1573 case X86EMUL_MODE_REAL:
1611 return emulate_int_real(ctxt, ops, irq); 1574 return emulate_int_real(ctxt, irq);
1612 case X86EMUL_MODE_VM86: 1575 case X86EMUL_MODE_VM86:
1613 case X86EMUL_MODE_PROT16: 1576 case X86EMUL_MODE_PROT16:
1614 case X86EMUL_MODE_PROT32: 1577 case X86EMUL_MODE_PROT32:
@@ -1619,10 +1582,8 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt,
1619 } 1582 }
1620} 1583}
1621 1584
1622static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, 1585static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
1623 struct x86_emulate_ops *ops)
1624{ 1586{
1625 struct decode_cache *c = &ctxt->decode;
1626 int rc = X86EMUL_CONTINUE; 1587 int rc = X86EMUL_CONTINUE;
1627 unsigned long temp_eip = 0; 1588 unsigned long temp_eip = 0;
1628 unsigned long temp_eflags = 0; 1589 unsigned long temp_eflags = 0;
@@ -1634,7 +1595,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1634 1595
1635 /* TODO: Add stack limit check */ 1596 /* TODO: Add stack limit check */
1636 1597
1637 rc = emulate_pop(ctxt, &temp_eip, c->op_bytes); 1598 rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
1638 1599
1639 if (rc != X86EMUL_CONTINUE) 1600 if (rc != X86EMUL_CONTINUE)
1640 return rc; 1601 return rc;
@@ -1642,27 +1603,27 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1642 if (temp_eip & ~0xffff) 1603 if (temp_eip & ~0xffff)
1643 return emulate_gp(ctxt, 0); 1604 return emulate_gp(ctxt, 0);
1644 1605
1645 rc = emulate_pop(ctxt, &cs, c->op_bytes); 1606 rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
1646 1607
1647 if (rc != X86EMUL_CONTINUE) 1608 if (rc != X86EMUL_CONTINUE)
1648 return rc; 1609 return rc;
1649 1610
1650 rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes); 1611 rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
1651 1612
1652 if (rc != X86EMUL_CONTINUE) 1613 if (rc != X86EMUL_CONTINUE)
1653 return rc; 1614 return rc;
1654 1615
1655 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); 1616 rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
1656 1617
1657 if (rc != X86EMUL_CONTINUE) 1618 if (rc != X86EMUL_CONTINUE)
1658 return rc; 1619 return rc;
1659 1620
1660 c->eip = temp_eip; 1621 ctxt->_eip = temp_eip;
1661 1622
1662 1623
1663 if (c->op_bytes == 4) 1624 if (ctxt->op_bytes == 4)
1664 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); 1625 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
1665 else if (c->op_bytes == 2) { 1626 else if (ctxt->op_bytes == 2) {
1666 ctxt->eflags &= ~0xffff; 1627 ctxt->eflags &= ~0xffff;
1667 ctxt->eflags |= temp_eflags; 1628 ctxt->eflags |= temp_eflags;
1668 } 1629 }
@@ -1673,12 +1634,11 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1673 return rc; 1634 return rc;
1674} 1635}
1675 1636
1676static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, 1637static int em_iret(struct x86_emulate_ctxt *ctxt)
1677 struct x86_emulate_ops* ops)
1678{ 1638{
1679 switch(ctxt->mode) { 1639 switch(ctxt->mode) {
1680 case X86EMUL_MODE_REAL: 1640 case X86EMUL_MODE_REAL:
1681 return emulate_iret_real(ctxt, ops); 1641 return emulate_iret_real(ctxt);
1682 case X86EMUL_MODE_VM86: 1642 case X86EMUL_MODE_VM86:
1683 case X86EMUL_MODE_PROT16: 1643 case X86EMUL_MODE_PROT16:
1684 case X86EMUL_MODE_PROT32: 1644 case X86EMUL_MODE_PROT32:
@@ -1691,53 +1651,49 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
1691 1651
1692static int em_jmp_far(struct x86_emulate_ctxt *ctxt) 1652static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
1693{ 1653{
1694 struct decode_cache *c = &ctxt->decode;
1695 int rc; 1654 int rc;
1696 unsigned short sel; 1655 unsigned short sel;
1697 1656
1698 memcpy(&sel, c->src.valptr + c->op_bytes, 2); 1657 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
1699 1658
1700 rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS); 1659 rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS);
1701 if (rc != X86EMUL_CONTINUE) 1660 if (rc != X86EMUL_CONTINUE)
1702 return rc; 1661 return rc;
1703 1662
1704 c->eip = 0; 1663 ctxt->_eip = 0;
1705 memcpy(&c->eip, c->src.valptr, c->op_bytes); 1664 memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
1706 return X86EMUL_CONTINUE; 1665 return X86EMUL_CONTINUE;
1707} 1666}
1708 1667
1709static int em_grp1a(struct x86_emulate_ctxt *ctxt) 1668static int em_grp1a(struct x86_emulate_ctxt *ctxt)
1710{ 1669{
1711 struct decode_cache *c = &ctxt->decode; 1670 return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes);
1712
1713 return emulate_pop(ctxt, &c->dst.val, c->dst.bytes);
1714} 1671}
1715 1672
1716static int em_grp2(struct x86_emulate_ctxt *ctxt) 1673static int em_grp2(struct x86_emulate_ctxt *ctxt)
1717{ 1674{
1718 struct decode_cache *c = &ctxt->decode; 1675 switch (ctxt->modrm_reg) {
1719 switch (c->modrm_reg) {
1720 case 0: /* rol */ 1676 case 0: /* rol */
1721 emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); 1677 emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags);
1722 break; 1678 break;
1723 case 1: /* ror */ 1679 case 1: /* ror */
1724 emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); 1680 emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags);
1725 break; 1681 break;
1726 case 2: /* rcl */ 1682 case 2: /* rcl */
1727 emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); 1683 emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags);
1728 break; 1684 break;
1729 case 3: /* rcr */ 1685 case 3: /* rcr */
1730 emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); 1686 emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags);
1731 break; 1687 break;
1732 case 4: /* sal/shl */ 1688 case 4: /* sal/shl */
1733 case 6: /* sal/shl */ 1689 case 6: /* sal/shl */
1734 emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); 1690 emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags);
1735 break; 1691 break;
1736 case 5: /* shr */ 1692 case 5: /* shr */
1737 emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); 1693 emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags);
1738 break; 1694 break;
1739 case 7: /* sar */ 1695 case 7: /* sar */
1740 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); 1696 emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags);
1741 break; 1697 break;
1742 } 1698 }
1743 return X86EMUL_CONTINUE; 1699 return X86EMUL_CONTINUE;
@@ -1745,33 +1701,32 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt)
1745 1701
1746static int em_grp3(struct x86_emulate_ctxt *ctxt) 1702static int em_grp3(struct x86_emulate_ctxt *ctxt)
1747{ 1703{
1748 struct decode_cache *c = &ctxt->decode; 1704 unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX];
1749 unsigned long *rax = &c->regs[VCPU_REGS_RAX]; 1705 unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX];
1750 unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
1751 u8 de = 0; 1706 u8 de = 0;
1752 1707
1753 switch (c->modrm_reg) { 1708 switch (ctxt->modrm_reg) {
1754 case 0 ... 1: /* test */ 1709 case 0 ... 1: /* test */
1755 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 1710 emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
1756 break; 1711 break;
1757 case 2: /* not */ 1712 case 2: /* not */
1758 c->dst.val = ~c->dst.val; 1713 ctxt->dst.val = ~ctxt->dst.val;
1759 break; 1714 break;
1760 case 3: /* neg */ 1715 case 3: /* neg */
1761 emulate_1op("neg", c->dst, ctxt->eflags); 1716 emulate_1op("neg", ctxt->dst, ctxt->eflags);
1762 break; 1717 break;
1763 case 4: /* mul */ 1718 case 4: /* mul */
1764 emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags); 1719 emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags);
1765 break; 1720 break;
1766 case 5: /* imul */ 1721 case 5: /* imul */
1767 emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); 1722 emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags);
1768 break; 1723 break;
1769 case 6: /* div */ 1724 case 6: /* div */
1770 emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx, 1725 emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx,
1771 ctxt->eflags, de); 1726 ctxt->eflags, de);
1772 break; 1727 break;
1773 case 7: /* idiv */ 1728 case 7: /* idiv */
1774 emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx, 1729 emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx,
1775 ctxt->eflags, de); 1730 ctxt->eflags, de);
1776 break; 1731 break;
1777 default: 1732 default:
@@ -1784,26 +1739,25 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt)
1784 1739
1785static int em_grp45(struct x86_emulate_ctxt *ctxt) 1740static int em_grp45(struct x86_emulate_ctxt *ctxt)
1786{ 1741{
1787 struct decode_cache *c = &ctxt->decode;
1788 int rc = X86EMUL_CONTINUE; 1742 int rc = X86EMUL_CONTINUE;
1789 1743
1790 switch (c->modrm_reg) { 1744 switch (ctxt->modrm_reg) {
1791 case 0: /* inc */ 1745 case 0: /* inc */
1792 emulate_1op("inc", c->dst, ctxt->eflags); 1746 emulate_1op("inc", ctxt->dst, ctxt->eflags);
1793 break; 1747 break;
1794 case 1: /* dec */ 1748 case 1: /* dec */
1795 emulate_1op("dec", c->dst, ctxt->eflags); 1749 emulate_1op("dec", ctxt->dst, ctxt->eflags);
1796 break; 1750 break;
1797 case 2: /* call near abs */ { 1751 case 2: /* call near abs */ {
1798 long int old_eip; 1752 long int old_eip;
1799 old_eip = c->eip; 1753 old_eip = ctxt->_eip;
1800 c->eip = c->src.val; 1754 ctxt->_eip = ctxt->src.val;
1801 c->src.val = old_eip; 1755 ctxt->src.val = old_eip;
1802 rc = em_push(ctxt); 1756 rc = em_push(ctxt);
1803 break; 1757 break;
1804 } 1758 }
1805 case 4: /* jmp abs */ 1759 case 4: /* jmp abs */
1806 c->eip = c->src.val; 1760 ctxt->_eip = ctxt->src.val;
1807 break; 1761 break;
1808 case 5: /* jmp far */ 1762 case 5: /* jmp far */
1809 rc = em_jmp_far(ctxt); 1763 rc = em_jmp_far(ctxt);
@@ -1817,68 +1771,70 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
1817 1771
1818static int em_grp9(struct x86_emulate_ctxt *ctxt) 1772static int em_grp9(struct x86_emulate_ctxt *ctxt)
1819{ 1773{
1820 struct decode_cache *c = &ctxt->decode; 1774 u64 old = ctxt->dst.orig_val64;
1821 u64 old = c->dst.orig_val64;
1822 1775
1823 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || 1776 if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) ||
1824 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { 1777 ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) {
1825 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); 1778 ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1826 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); 1779 ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1827 ctxt->eflags &= ~EFLG_ZF; 1780 ctxt->eflags &= ~EFLG_ZF;
1828 } else { 1781 } else {
1829 c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) | 1782 ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) |
1830 (u32) c->regs[VCPU_REGS_RBX]; 1783 (u32) ctxt->regs[VCPU_REGS_RBX];
1831 1784
1832 ctxt->eflags |= EFLG_ZF; 1785 ctxt->eflags |= EFLG_ZF;
1833 } 1786 }
1834 return X86EMUL_CONTINUE; 1787 return X86EMUL_CONTINUE;
1835} 1788}
1836 1789
1837static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, 1790static int em_ret(struct x86_emulate_ctxt *ctxt)
1838 struct x86_emulate_ops *ops) 1791{
1792 ctxt->dst.type = OP_REG;
1793 ctxt->dst.addr.reg = &ctxt->_eip;
1794 ctxt->dst.bytes = ctxt->op_bytes;
1795 return em_pop(ctxt);
1796}
1797
1798static int em_ret_far(struct x86_emulate_ctxt *ctxt)
1839{ 1799{
1840 struct decode_cache *c = &ctxt->decode;
1841 int rc; 1800 int rc;
1842 unsigned long cs; 1801 unsigned long cs;
1843 1802
1844 rc = emulate_pop(ctxt, &c->eip, c->op_bytes); 1803 rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
1845 if (rc != X86EMUL_CONTINUE) 1804 if (rc != X86EMUL_CONTINUE)
1846 return rc; 1805 return rc;
1847 if (c->op_bytes == 4) 1806 if (ctxt->op_bytes == 4)
1848 c->eip = (u32)c->eip; 1807 ctxt->_eip = (u32)ctxt->_eip;
1849 rc = emulate_pop(ctxt, &cs, c->op_bytes); 1808 rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
1850 if (rc != X86EMUL_CONTINUE) 1809 if (rc != X86EMUL_CONTINUE)
1851 return rc; 1810 return rc;
1852 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); 1811 rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
1853 return rc; 1812 return rc;
1854} 1813}
1855 1814
1856static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, 1815static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg)
1857 struct x86_emulate_ops *ops, int seg)
1858{ 1816{
1859 struct decode_cache *c = &ctxt->decode;
1860 unsigned short sel; 1817 unsigned short sel;
1861 int rc; 1818 int rc;
1862 1819
1863 memcpy(&sel, c->src.valptr + c->op_bytes, 2); 1820 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
1864 1821
1865 rc = load_segment_descriptor(ctxt, ops, sel, seg); 1822 rc = load_segment_descriptor(ctxt, sel, seg);
1866 if (rc != X86EMUL_CONTINUE) 1823 if (rc != X86EMUL_CONTINUE)
1867 return rc; 1824 return rc;
1868 1825
1869 c->dst.val = c->src.val; 1826 ctxt->dst.val = ctxt->src.val;
1870 return rc; 1827 return rc;
1871} 1828}
1872 1829
1873static inline void 1830static void
1874setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1831setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1875 struct x86_emulate_ops *ops, struct desc_struct *cs, 1832 struct desc_struct *cs, struct desc_struct *ss)
1876 struct desc_struct *ss)
1877{ 1833{
1878 u16 selector; 1834 u16 selector;
1879 1835
1880 memset(cs, 0, sizeof(struct desc_struct)); 1836 memset(cs, 0, sizeof(struct desc_struct));
1881 ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); 1837 ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
1882 memset(ss, 0, sizeof(struct desc_struct)); 1838 memset(ss, 0, sizeof(struct desc_struct));
1883 1839
1884 cs->l = 0; /* will be adjusted later */ 1840 cs->l = 0; /* will be adjusted later */
@@ -1901,10 +1857,9 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1901 ss->p = 1; 1857 ss->p = 1;
1902} 1858}
1903 1859
1904static int 1860static int em_syscall(struct x86_emulate_ctxt *ctxt)
1905emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1906{ 1861{
1907 struct decode_cache *c = &ctxt->decode; 1862 struct x86_emulate_ops *ops = ctxt->ops;
1908 struct desc_struct cs, ss; 1863 struct desc_struct cs, ss;
1909 u64 msr_data; 1864 u64 msr_data;
1910 u16 cs_sel, ss_sel; 1865 u16 cs_sel, ss_sel;
@@ -1916,7 +1871,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1916 return emulate_ud(ctxt); 1871 return emulate_ud(ctxt);
1917 1872
1918 ops->get_msr(ctxt, MSR_EFER, &efer); 1873 ops->get_msr(ctxt, MSR_EFER, &efer);
1919 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1874 setup_syscalls_segments(ctxt, &cs, &ss);
1920 1875
1921 ops->get_msr(ctxt, MSR_STAR, &msr_data); 1876 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1922 msr_data >>= 32; 1877 msr_data >>= 32;
@@ -1930,15 +1885,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1930 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); 1885 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
1931 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 1886 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
1932 1887
1933 c->regs[VCPU_REGS_RCX] = c->eip; 1888 ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip;
1934 if (efer & EFER_LMA) { 1889 if (efer & EFER_LMA) {
1935#ifdef CONFIG_X86_64 1890#ifdef CONFIG_X86_64
1936 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 1891 ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1937 1892
1938 ops->get_msr(ctxt, 1893 ops->get_msr(ctxt,
1939 ctxt->mode == X86EMUL_MODE_PROT64 ? 1894 ctxt->mode == X86EMUL_MODE_PROT64 ?
1940 MSR_LSTAR : MSR_CSTAR, &msr_data); 1895 MSR_LSTAR : MSR_CSTAR, &msr_data);
1941 c->eip = msr_data; 1896 ctxt->_eip = msr_data;
1942 1897
1943 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); 1898 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
1944 ctxt->eflags &= ~(msr_data | EFLG_RF); 1899 ctxt->eflags &= ~(msr_data | EFLG_RF);
@@ -1946,7 +1901,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1946 } else { 1901 } else {
1947 /* legacy mode */ 1902 /* legacy mode */
1948 ops->get_msr(ctxt, MSR_STAR, &msr_data); 1903 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1949 c->eip = (u32)msr_data; 1904 ctxt->_eip = (u32)msr_data;
1950 1905
1951 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1906 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1952 } 1907 }
@@ -1954,16 +1909,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1954 return X86EMUL_CONTINUE; 1909 return X86EMUL_CONTINUE;
1955} 1910}
1956 1911
1957static int 1912static int em_sysenter(struct x86_emulate_ctxt *ctxt)
1958emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1959{ 1913{
1960 struct decode_cache *c = &ctxt->decode; 1914 struct x86_emulate_ops *ops = ctxt->ops;
1961 struct desc_struct cs, ss; 1915 struct desc_struct cs, ss;
1962 u64 msr_data; 1916 u64 msr_data;
1963 u16 cs_sel, ss_sel; 1917 u16 cs_sel, ss_sel;
1964 u64 efer = 0; 1918 u64 efer = 0;
1965 1919
1966 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); 1920 ops->get_msr(ctxt, MSR_EFER, &efer);
1967 /* inject #GP if in real mode */ 1921 /* inject #GP if in real mode */
1968 if (ctxt->mode == X86EMUL_MODE_REAL) 1922 if (ctxt->mode == X86EMUL_MODE_REAL)
1969 return emulate_gp(ctxt, 0); 1923 return emulate_gp(ctxt, 0);
@@ -1974,7 +1928,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1974 if (ctxt->mode == X86EMUL_MODE_PROT64) 1928 if (ctxt->mode == X86EMUL_MODE_PROT64)
1975 return emulate_ud(ctxt); 1929 return emulate_ud(ctxt);
1976 1930
1977 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1931 setup_syscalls_segments(ctxt, &cs, &ss);
1978 1932
1979 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); 1933 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
1980 switch (ctxt->mode) { 1934 switch (ctxt->mode) {
@@ -2002,31 +1956,30 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2002 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 1956 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2003 1957
2004 ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); 1958 ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
2005 c->eip = msr_data; 1959 ctxt->_eip = msr_data;
2006 1960
2007 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); 1961 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
2008 c->regs[VCPU_REGS_RSP] = msr_data; 1962 ctxt->regs[VCPU_REGS_RSP] = msr_data;
2009 1963
2010 return X86EMUL_CONTINUE; 1964 return X86EMUL_CONTINUE;
2011} 1965}
2012 1966
2013static int 1967static int em_sysexit(struct x86_emulate_ctxt *ctxt)
2014emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2015{ 1968{
2016 struct decode_cache *c = &ctxt->decode; 1969 struct x86_emulate_ops *ops = ctxt->ops;
2017 struct desc_struct cs, ss; 1970 struct desc_struct cs, ss;
2018 u64 msr_data; 1971 u64 msr_data;
2019 int usermode; 1972 int usermode;
2020 u16 cs_sel, ss_sel; 1973 u16 cs_sel = 0, ss_sel = 0;
2021 1974
2022 /* inject #GP if in real mode or Virtual 8086 mode */ 1975 /* inject #GP if in real mode or Virtual 8086 mode */
2023 if (ctxt->mode == X86EMUL_MODE_REAL || 1976 if (ctxt->mode == X86EMUL_MODE_REAL ||
2024 ctxt->mode == X86EMUL_MODE_VM86) 1977 ctxt->mode == X86EMUL_MODE_VM86)
2025 return emulate_gp(ctxt, 0); 1978 return emulate_gp(ctxt, 0);
2026 1979
2027 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1980 setup_syscalls_segments(ctxt, &cs, &ss);
2028 1981
2029 if ((c->rex_prefix & 0x8) != 0x0) 1982 if ((ctxt->rex_prefix & 0x8) != 0x0)
2030 usermode = X86EMUL_MODE_PROT64; 1983 usermode = X86EMUL_MODE_PROT64;
2031 else 1984 else
2032 usermode = X86EMUL_MODE_PROT32; 1985 usermode = X86EMUL_MODE_PROT32;
@@ -2056,14 +2009,13 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2056 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); 2009 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
2057 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 2010 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2058 2011
2059 c->eip = c->regs[VCPU_REGS_RDX]; 2012 ctxt->_eip = ctxt->regs[VCPU_REGS_RDX];
2060 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; 2013 ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX];
2061 2014
2062 return X86EMUL_CONTINUE; 2015 return X86EMUL_CONTINUE;
2063} 2016}
2064 2017
2065static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, 2018static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
2066 struct x86_emulate_ops *ops)
2067{ 2019{
2068 int iopl; 2020 int iopl;
2069 if (ctxt->mode == X86EMUL_MODE_REAL) 2021 if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -2071,13 +2023,13 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
2071 if (ctxt->mode == X86EMUL_MODE_VM86) 2023 if (ctxt->mode == X86EMUL_MODE_VM86)
2072 return true; 2024 return true;
2073 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2025 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
2074 return ops->cpl(ctxt) > iopl; 2026 return ctxt->ops->cpl(ctxt) > iopl;
2075} 2027}
2076 2028
2077static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2029static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2078 struct x86_emulate_ops *ops,
2079 u16 port, u16 len) 2030 u16 port, u16 len)
2080{ 2031{
2032 struct x86_emulate_ops *ops = ctxt->ops;
2081 struct desc_struct tr_seg; 2033 struct desc_struct tr_seg;
2082 u32 base3; 2034 u32 base3;
2083 int r; 2035 int r;
@@ -2108,14 +2060,13 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2108} 2060}
2109 2061
2110static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, 2062static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2111 struct x86_emulate_ops *ops,
2112 u16 port, u16 len) 2063 u16 port, u16 len)
2113{ 2064{
2114 if (ctxt->perm_ok) 2065 if (ctxt->perm_ok)
2115 return true; 2066 return true;
2116 2067
2117 if (emulator_bad_iopl(ctxt, ops)) 2068 if (emulator_bad_iopl(ctxt))
2118 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 2069 if (!emulator_io_port_access_allowed(ctxt, port, len))
2119 return false; 2070 return false;
2120 2071
2121 ctxt->perm_ok = true; 2072 ctxt->perm_ok = true;
@@ -2124,21 +2075,18 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2124} 2075}
2125 2076
2126static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, 2077static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2127 struct x86_emulate_ops *ops,
2128 struct tss_segment_16 *tss) 2078 struct tss_segment_16 *tss)
2129{ 2079{
2130 struct decode_cache *c = &ctxt->decode; 2080 tss->ip = ctxt->_eip;
2131
2132 tss->ip = c->eip;
2133 tss->flag = ctxt->eflags; 2081 tss->flag = ctxt->eflags;
2134 tss->ax = c->regs[VCPU_REGS_RAX]; 2082 tss->ax = ctxt->regs[VCPU_REGS_RAX];
2135 tss->cx = c->regs[VCPU_REGS_RCX]; 2083 tss->cx = ctxt->regs[VCPU_REGS_RCX];
2136 tss->dx = c->regs[VCPU_REGS_RDX]; 2084 tss->dx = ctxt->regs[VCPU_REGS_RDX];
2137 tss->bx = c->regs[VCPU_REGS_RBX]; 2085 tss->bx = ctxt->regs[VCPU_REGS_RBX];
2138 tss->sp = c->regs[VCPU_REGS_RSP]; 2086 tss->sp = ctxt->regs[VCPU_REGS_RSP];
2139 tss->bp = c->regs[VCPU_REGS_RBP]; 2087 tss->bp = ctxt->regs[VCPU_REGS_RBP];
2140 tss->si = c->regs[VCPU_REGS_RSI]; 2088 tss->si = ctxt->regs[VCPU_REGS_RSI];
2141 tss->di = c->regs[VCPU_REGS_RDI]; 2089 tss->di = ctxt->regs[VCPU_REGS_RDI];
2142 2090
2143 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); 2091 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2144 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2092 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2148,22 +2096,20 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2148} 2096}
2149 2097
2150static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, 2098static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2151 struct x86_emulate_ops *ops,
2152 struct tss_segment_16 *tss) 2099 struct tss_segment_16 *tss)
2153{ 2100{
2154 struct decode_cache *c = &ctxt->decode;
2155 int ret; 2101 int ret;
2156 2102
2157 c->eip = tss->ip; 2103 ctxt->_eip = tss->ip;
2158 ctxt->eflags = tss->flag | 2; 2104 ctxt->eflags = tss->flag | 2;
2159 c->regs[VCPU_REGS_RAX] = tss->ax; 2105 ctxt->regs[VCPU_REGS_RAX] = tss->ax;
2160 c->regs[VCPU_REGS_RCX] = tss->cx; 2106 ctxt->regs[VCPU_REGS_RCX] = tss->cx;
2161 c->regs[VCPU_REGS_RDX] = tss->dx; 2107 ctxt->regs[VCPU_REGS_RDX] = tss->dx;
2162 c->regs[VCPU_REGS_RBX] = tss->bx; 2108 ctxt->regs[VCPU_REGS_RBX] = tss->bx;
2163 c->regs[VCPU_REGS_RSP] = tss->sp; 2109 ctxt->regs[VCPU_REGS_RSP] = tss->sp;
2164 c->regs[VCPU_REGS_RBP] = tss->bp; 2110 ctxt->regs[VCPU_REGS_RBP] = tss->bp;
2165 c->regs[VCPU_REGS_RSI] = tss->si; 2111 ctxt->regs[VCPU_REGS_RSI] = tss->si;
2166 c->regs[VCPU_REGS_RDI] = tss->di; 2112 ctxt->regs[VCPU_REGS_RDI] = tss->di;
2167 2113
2168 /* 2114 /*
2169 * SDM says that segment selectors are loaded before segment 2115 * SDM says that segment selectors are loaded before segment
@@ -2179,19 +2125,19 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2179 * Now load segment descriptors. If fault happenes at this stage 2125 * Now load segment descriptors. If fault happenes at this stage
2180 * it is handled in a context of new task 2126 * it is handled in a context of new task
2181 */ 2127 */
2182 ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR); 2128 ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
2183 if (ret != X86EMUL_CONTINUE) 2129 if (ret != X86EMUL_CONTINUE)
2184 return ret; 2130 return ret;
2185 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); 2131 ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
2186 if (ret != X86EMUL_CONTINUE) 2132 if (ret != X86EMUL_CONTINUE)
2187 return ret; 2133 return ret;
2188 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); 2134 ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
2189 if (ret != X86EMUL_CONTINUE) 2135 if (ret != X86EMUL_CONTINUE)
2190 return ret; 2136 return ret;
2191 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); 2137 ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
2192 if (ret != X86EMUL_CONTINUE) 2138 if (ret != X86EMUL_CONTINUE)
2193 return ret; 2139 return ret;
2194 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); 2140 ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
2195 if (ret != X86EMUL_CONTINUE) 2141 if (ret != X86EMUL_CONTINUE)
2196 return ret; 2142 return ret;
2197 2143
@@ -2199,10 +2145,10 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2199} 2145}
2200 2146
2201static int task_switch_16(struct x86_emulate_ctxt *ctxt, 2147static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2202 struct x86_emulate_ops *ops,
2203 u16 tss_selector, u16 old_tss_sel, 2148 u16 tss_selector, u16 old_tss_sel,
2204 ulong old_tss_base, struct desc_struct *new_desc) 2149 ulong old_tss_base, struct desc_struct *new_desc)
2205{ 2150{
2151 struct x86_emulate_ops *ops = ctxt->ops;
2206 struct tss_segment_16 tss_seg; 2152 struct tss_segment_16 tss_seg;
2207 int ret; 2153 int ret;
2208 u32 new_tss_base = get_desc_base(new_desc); 2154 u32 new_tss_base = get_desc_base(new_desc);
@@ -2213,7 +2159,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2213 /* FIXME: need to provide precise fault address */ 2159 /* FIXME: need to provide precise fault address */
2214 return ret; 2160 return ret;
2215 2161
2216 save_state_to_tss16(ctxt, ops, &tss_seg); 2162 save_state_to_tss16(ctxt, &tss_seg);
2217 2163
2218 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, 2164 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2219 &ctxt->exception); 2165 &ctxt->exception);
@@ -2239,26 +2185,23 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2239 return ret; 2185 return ret;
2240 } 2186 }
2241 2187
2242 return load_state_from_tss16(ctxt, ops, &tss_seg); 2188 return load_state_from_tss16(ctxt, &tss_seg);
2243} 2189}
2244 2190
2245static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, 2191static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2246 struct x86_emulate_ops *ops,
2247 struct tss_segment_32 *tss) 2192 struct tss_segment_32 *tss)
2248{ 2193{
2249 struct decode_cache *c = &ctxt->decode; 2194 tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
2250 2195 tss->eip = ctxt->_eip;
2251 tss->cr3 = ops->get_cr(ctxt, 3);
2252 tss->eip = c->eip;
2253 tss->eflags = ctxt->eflags; 2196 tss->eflags = ctxt->eflags;
2254 tss->eax = c->regs[VCPU_REGS_RAX]; 2197 tss->eax = ctxt->regs[VCPU_REGS_RAX];
2255 tss->ecx = c->regs[VCPU_REGS_RCX]; 2198 tss->ecx = ctxt->regs[VCPU_REGS_RCX];
2256 tss->edx = c->regs[VCPU_REGS_RDX]; 2199 tss->edx = ctxt->regs[VCPU_REGS_RDX];
2257 tss->ebx = c->regs[VCPU_REGS_RBX]; 2200 tss->ebx = ctxt->regs[VCPU_REGS_RBX];
2258 tss->esp = c->regs[VCPU_REGS_RSP]; 2201 tss->esp = ctxt->regs[VCPU_REGS_RSP];
2259 tss->ebp = c->regs[VCPU_REGS_RBP]; 2202 tss->ebp = ctxt->regs[VCPU_REGS_RBP];
2260 tss->esi = c->regs[VCPU_REGS_RSI]; 2203 tss->esi = ctxt->regs[VCPU_REGS_RSI];
2261 tss->edi = c->regs[VCPU_REGS_RDI]; 2204 tss->edi = ctxt->regs[VCPU_REGS_RDI];
2262 2205
2263 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); 2206 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2264 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2207 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2270,24 +2213,22 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2270} 2213}
2271 2214
2272static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, 2215static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2273 struct x86_emulate_ops *ops,
2274 struct tss_segment_32 *tss) 2216 struct tss_segment_32 *tss)
2275{ 2217{
2276 struct decode_cache *c = &ctxt->decode;
2277 int ret; 2218 int ret;
2278 2219
2279 if (ops->set_cr(ctxt, 3, tss->cr3)) 2220 if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
2280 return emulate_gp(ctxt, 0); 2221 return emulate_gp(ctxt, 0);
2281 c->eip = tss->eip; 2222 ctxt->_eip = tss->eip;
2282 ctxt->eflags = tss->eflags | 2; 2223 ctxt->eflags = tss->eflags | 2;
2283 c->regs[VCPU_REGS_RAX] = tss->eax; 2224 ctxt->regs[VCPU_REGS_RAX] = tss->eax;
2284 c->regs[VCPU_REGS_RCX] = tss->ecx; 2225 ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
2285 c->regs[VCPU_REGS_RDX] = tss->edx; 2226 ctxt->regs[VCPU_REGS_RDX] = tss->edx;
2286 c->regs[VCPU_REGS_RBX] = tss->ebx; 2227 ctxt->regs[VCPU_REGS_RBX] = tss->ebx;
2287 c->regs[VCPU_REGS_RSP] = tss->esp; 2228 ctxt->regs[VCPU_REGS_RSP] = tss->esp;
2288 c->regs[VCPU_REGS_RBP] = tss->ebp; 2229 ctxt->regs[VCPU_REGS_RBP] = tss->ebp;
2289 c->regs[VCPU_REGS_RSI] = tss->esi; 2230 ctxt->regs[VCPU_REGS_RSI] = tss->esi;
2290 c->regs[VCPU_REGS_RDI] = tss->edi; 2231 ctxt->regs[VCPU_REGS_RDI] = tss->edi;
2291 2232
2292 /* 2233 /*
2293 * SDM says that segment selectors are loaded before segment 2234 * SDM says that segment selectors are loaded before segment
@@ -2305,25 +2246,25 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2305 * Now load segment descriptors. If fault happenes at this stage 2246 * Now load segment descriptors. If fault happenes at this stage
2306 * it is handled in a context of new task 2247 * it is handled in a context of new task
2307 */ 2248 */
2308 ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR); 2249 ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
2309 if (ret != X86EMUL_CONTINUE) 2250 if (ret != X86EMUL_CONTINUE)
2310 return ret; 2251 return ret;
2311 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); 2252 ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
2312 if (ret != X86EMUL_CONTINUE) 2253 if (ret != X86EMUL_CONTINUE)
2313 return ret; 2254 return ret;
2314 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); 2255 ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
2315 if (ret != X86EMUL_CONTINUE) 2256 if (ret != X86EMUL_CONTINUE)
2316 return ret; 2257 return ret;
2317 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); 2258 ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
2318 if (ret != X86EMUL_CONTINUE) 2259 if (ret != X86EMUL_CONTINUE)
2319 return ret; 2260 return ret;
2320 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); 2261 ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
2321 if (ret != X86EMUL_CONTINUE) 2262 if (ret != X86EMUL_CONTINUE)
2322 return ret; 2263 return ret;
2323 ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS); 2264 ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS);
2324 if (ret != X86EMUL_CONTINUE) 2265 if (ret != X86EMUL_CONTINUE)
2325 return ret; 2266 return ret;
2326 ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS); 2267 ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS);
2327 if (ret != X86EMUL_CONTINUE) 2268 if (ret != X86EMUL_CONTINUE)
2328 return ret; 2269 return ret;
2329 2270
@@ -2331,10 +2272,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2331} 2272}
2332 2273
2333static int task_switch_32(struct x86_emulate_ctxt *ctxt, 2274static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2334 struct x86_emulate_ops *ops,
2335 u16 tss_selector, u16 old_tss_sel, 2275 u16 tss_selector, u16 old_tss_sel,
2336 ulong old_tss_base, struct desc_struct *new_desc) 2276 ulong old_tss_base, struct desc_struct *new_desc)
2337{ 2277{
2278 struct x86_emulate_ops *ops = ctxt->ops;
2338 struct tss_segment_32 tss_seg; 2279 struct tss_segment_32 tss_seg;
2339 int ret; 2280 int ret;
2340 u32 new_tss_base = get_desc_base(new_desc); 2281 u32 new_tss_base = get_desc_base(new_desc);
@@ -2345,7 +2286,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2345 /* FIXME: need to provide precise fault address */ 2286 /* FIXME: need to provide precise fault address */
2346 return ret; 2287 return ret;
2347 2288
2348 save_state_to_tss32(ctxt, ops, &tss_seg); 2289 save_state_to_tss32(ctxt, &tss_seg);
2349 2290
2350 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, 2291 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2351 &ctxt->exception); 2292 &ctxt->exception);
@@ -2371,14 +2312,14 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2371 return ret; 2312 return ret;
2372 } 2313 }
2373 2314
2374 return load_state_from_tss32(ctxt, ops, &tss_seg); 2315 return load_state_from_tss32(ctxt, &tss_seg);
2375} 2316}
2376 2317
2377static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, 2318static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2378 struct x86_emulate_ops *ops,
2379 u16 tss_selector, int reason, 2319 u16 tss_selector, int reason,
2380 bool has_error_code, u32 error_code) 2320 bool has_error_code, u32 error_code)
2381{ 2321{
2322 struct x86_emulate_ops *ops = ctxt->ops;
2382 struct desc_struct curr_tss_desc, next_tss_desc; 2323 struct desc_struct curr_tss_desc, next_tss_desc;
2383 int ret; 2324 int ret;
2384 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); 2325 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
@@ -2388,10 +2329,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2388 2329
2389 /* FIXME: old_tss_base == ~0 ? */ 2330 /* FIXME: old_tss_base == ~0 ? */
2390 2331
2391 ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc); 2332 ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
2392 if (ret != X86EMUL_CONTINUE) 2333 if (ret != X86EMUL_CONTINUE)
2393 return ret; 2334 return ret;
2394 ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc); 2335 ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
2395 if (ret != X86EMUL_CONTINUE) 2336 if (ret != X86EMUL_CONTINUE)
2396 return ret; 2337 return ret;
2397 2338
@@ -2413,8 +2354,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2413 2354
2414 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 2355 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
2415 curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ 2356 curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
2416 write_segment_descriptor(ctxt, ops, old_tss_sel, 2357 write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
2417 &curr_tss_desc);
2418 } 2358 }
2419 2359
2420 if (reason == TASK_SWITCH_IRET) 2360 if (reason == TASK_SWITCH_IRET)
@@ -2426,10 +2366,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2426 old_tss_sel = 0xffff; 2366 old_tss_sel = 0xffff;
2427 2367
2428 if (next_tss_desc.type & 8) 2368 if (next_tss_desc.type & 8)
2429 ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel, 2369 ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
2430 old_tss_base, &next_tss_desc); 2370 old_tss_base, &next_tss_desc);
2431 else 2371 else
2432 ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel, 2372 ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
2433 old_tss_base, &next_tss_desc); 2373 old_tss_base, &next_tss_desc);
2434 if (ret != X86EMUL_CONTINUE) 2374 if (ret != X86EMUL_CONTINUE)
2435 return ret; 2375 return ret;
@@ -2439,19 +2379,16 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2439 2379
2440 if (reason != TASK_SWITCH_IRET) { 2380 if (reason != TASK_SWITCH_IRET) {
2441 next_tss_desc.type |= (1 << 1); /* set busy flag */ 2381 next_tss_desc.type |= (1 << 1); /* set busy flag */
2442 write_segment_descriptor(ctxt, ops, tss_selector, 2382 write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
2443 &next_tss_desc);
2444 } 2383 }
2445 2384
2446 ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); 2385 ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
2447 ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); 2386 ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
2448 2387
2449 if (has_error_code) { 2388 if (has_error_code) {
2450 struct decode_cache *c = &ctxt->decode; 2389 ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2451 2390 ctxt->lock_prefix = 0;
2452 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; 2391 ctxt->src.val = (unsigned long) error_code;
2453 c->lock_prefix = 0;
2454 c->src.val = (unsigned long) error_code;
2455 ret = em_push(ctxt); 2392 ret = em_push(ctxt);
2456 } 2393 }
2457 2394
@@ -2462,18 +2399,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2462 u16 tss_selector, int reason, 2399 u16 tss_selector, int reason,
2463 bool has_error_code, u32 error_code) 2400 bool has_error_code, u32 error_code)
2464{ 2401{
2465 struct x86_emulate_ops *ops = ctxt->ops;
2466 struct decode_cache *c = &ctxt->decode;
2467 int rc; 2402 int rc;
2468 2403
2469 c->eip = ctxt->eip; 2404 ctxt->_eip = ctxt->eip;
2470 c->dst.type = OP_NONE; 2405 ctxt->dst.type = OP_NONE;
2471 2406
2472 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, 2407 rc = emulator_do_task_switch(ctxt, tss_selector, reason,
2473 has_error_code, error_code); 2408 has_error_code, error_code);
2474 2409
2475 if (rc == X86EMUL_CONTINUE) 2410 if (rc == X86EMUL_CONTINUE)
2476 ctxt->eip = c->eip; 2411 ctxt->eip = ctxt->_eip;
2477 2412
2478 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 2413 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
2479} 2414}
@@ -2481,22 +2416,20 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2481static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, 2416static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
2482 int reg, struct operand *op) 2417 int reg, struct operand *op)
2483{ 2418{
2484 struct decode_cache *c = &ctxt->decode;
2485 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2419 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2486 2420
2487 register_address_increment(c, &c->regs[reg], df * op->bytes); 2421 register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
2488 op->addr.mem.ea = register_address(c, c->regs[reg]); 2422 op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
2489 op->addr.mem.seg = seg; 2423 op->addr.mem.seg = seg;
2490} 2424}
2491 2425
2492static int em_das(struct x86_emulate_ctxt *ctxt) 2426static int em_das(struct x86_emulate_ctxt *ctxt)
2493{ 2427{
2494 struct decode_cache *c = &ctxt->decode;
2495 u8 al, old_al; 2428 u8 al, old_al;
2496 bool af, cf, old_cf; 2429 bool af, cf, old_cf;
2497 2430
2498 cf = ctxt->eflags & X86_EFLAGS_CF; 2431 cf = ctxt->eflags & X86_EFLAGS_CF;
2499 al = c->dst.val; 2432 al = ctxt->dst.val;
2500 2433
2501 old_al = al; 2434 old_al = al;
2502 old_cf = cf; 2435 old_cf = cf;
@@ -2514,12 +2447,12 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2514 cf = true; 2447 cf = true;
2515 } 2448 }
2516 2449
2517 c->dst.val = al; 2450 ctxt->dst.val = al;
2518 /* Set PF, ZF, SF */ 2451 /* Set PF, ZF, SF */
2519 c->src.type = OP_IMM; 2452 ctxt->src.type = OP_IMM;
2520 c->src.val = 0; 2453 ctxt->src.val = 0;
2521 c->src.bytes = 1; 2454 ctxt->src.bytes = 1;
2522 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 2455 emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
2523 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); 2456 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
2524 if (cf) 2457 if (cf)
2525 ctxt->eflags |= X86_EFLAGS_CF; 2458 ctxt->eflags |= X86_EFLAGS_CF;
@@ -2530,175 +2463,189 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2530 2463
2531static int em_call_far(struct x86_emulate_ctxt *ctxt) 2464static int em_call_far(struct x86_emulate_ctxt *ctxt)
2532{ 2465{
2533 struct decode_cache *c = &ctxt->decode;
2534 u16 sel, old_cs; 2466 u16 sel, old_cs;
2535 ulong old_eip; 2467 ulong old_eip;
2536 int rc; 2468 int rc;
2537 2469
2538 old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2470 old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
2539 old_eip = c->eip; 2471 old_eip = ctxt->_eip;
2540 2472
2541 memcpy(&sel, c->src.valptr + c->op_bytes, 2); 2473 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
2542 if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) 2474 if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS))
2543 return X86EMUL_CONTINUE; 2475 return X86EMUL_CONTINUE;
2544 2476
2545 c->eip = 0; 2477 ctxt->_eip = 0;
2546 memcpy(&c->eip, c->src.valptr, c->op_bytes); 2478 memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
2547 2479
2548 c->src.val = old_cs; 2480 ctxt->src.val = old_cs;
2549 rc = em_push(ctxt); 2481 rc = em_push(ctxt);
2550 if (rc != X86EMUL_CONTINUE) 2482 if (rc != X86EMUL_CONTINUE)
2551 return rc; 2483 return rc;
2552 2484
2553 c->src.val = old_eip; 2485 ctxt->src.val = old_eip;
2554 return em_push(ctxt); 2486 return em_push(ctxt);
2555} 2487}
2556 2488
2557static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) 2489static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2558{ 2490{
2559 struct decode_cache *c = &ctxt->decode;
2560 int rc; 2491 int rc;
2561 2492
2562 c->dst.type = OP_REG; 2493 ctxt->dst.type = OP_REG;
2563 c->dst.addr.reg = &c->eip; 2494 ctxt->dst.addr.reg = &ctxt->_eip;
2564 c->dst.bytes = c->op_bytes; 2495 ctxt->dst.bytes = ctxt->op_bytes;
2565 rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes); 2496 rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
2566 if (rc != X86EMUL_CONTINUE) 2497 if (rc != X86EMUL_CONTINUE)
2567 return rc; 2498 return rc;
2568 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); 2499 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val);
2569 return X86EMUL_CONTINUE; 2500 return X86EMUL_CONTINUE;
2570} 2501}
2571 2502
2572static int em_add(struct x86_emulate_ctxt *ctxt) 2503static int em_add(struct x86_emulate_ctxt *ctxt)
2573{ 2504{
2574 struct decode_cache *c = &ctxt->decode; 2505 emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
2575
2576 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2577 return X86EMUL_CONTINUE; 2506 return X86EMUL_CONTINUE;
2578} 2507}
2579 2508
2580static int em_or(struct x86_emulate_ctxt *ctxt) 2509static int em_or(struct x86_emulate_ctxt *ctxt)
2581{ 2510{
2582 struct decode_cache *c = &ctxt->decode; 2511 emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
2583
2584 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2585 return X86EMUL_CONTINUE; 2512 return X86EMUL_CONTINUE;
2586} 2513}
2587 2514
2588static int em_adc(struct x86_emulate_ctxt *ctxt) 2515static int em_adc(struct x86_emulate_ctxt *ctxt)
2589{ 2516{
2590 struct decode_cache *c = &ctxt->decode; 2517 emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags);
2591
2592 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2593 return X86EMUL_CONTINUE; 2518 return X86EMUL_CONTINUE;
2594} 2519}
2595 2520
2596static int em_sbb(struct x86_emulate_ctxt *ctxt) 2521static int em_sbb(struct x86_emulate_ctxt *ctxt)
2597{ 2522{
2598 struct decode_cache *c = &ctxt->decode; 2523 emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags);
2599
2600 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2601 return X86EMUL_CONTINUE; 2524 return X86EMUL_CONTINUE;
2602} 2525}
2603 2526
2604static int em_and(struct x86_emulate_ctxt *ctxt) 2527static int em_and(struct x86_emulate_ctxt *ctxt)
2605{ 2528{
2606 struct decode_cache *c = &ctxt->decode; 2529 emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags);
2607
2608 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
2609 return X86EMUL_CONTINUE; 2530 return X86EMUL_CONTINUE;
2610} 2531}
2611 2532
2612static int em_sub(struct x86_emulate_ctxt *ctxt) 2533static int em_sub(struct x86_emulate_ctxt *ctxt)
2613{ 2534{
2614 struct decode_cache *c = &ctxt->decode; 2535 emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags);
2615
2616 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
2617 return X86EMUL_CONTINUE; 2536 return X86EMUL_CONTINUE;
2618} 2537}
2619 2538
2620static int em_xor(struct x86_emulate_ctxt *ctxt) 2539static int em_xor(struct x86_emulate_ctxt *ctxt)
2621{ 2540{
2622 struct decode_cache *c = &ctxt->decode; 2541 emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags);
2623
2624 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
2625 return X86EMUL_CONTINUE; 2542 return X86EMUL_CONTINUE;
2626} 2543}
2627 2544
2628static int em_cmp(struct x86_emulate_ctxt *ctxt) 2545static int em_cmp(struct x86_emulate_ctxt *ctxt)
2629{ 2546{
2630 struct decode_cache *c = &ctxt->decode; 2547 emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
2631
2632 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2633 /* Disable writeback. */ 2548 /* Disable writeback. */
2634 c->dst.type = OP_NONE; 2549 ctxt->dst.type = OP_NONE;
2635 return X86EMUL_CONTINUE; 2550 return X86EMUL_CONTINUE;
2636} 2551}
2637 2552
2638static int em_imul(struct x86_emulate_ctxt *ctxt) 2553static int em_test(struct x86_emulate_ctxt *ctxt)
2554{
2555 emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
2556 return X86EMUL_CONTINUE;
2557}
2558
2559static int em_xchg(struct x86_emulate_ctxt *ctxt)
2639{ 2560{
2640 struct decode_cache *c = &ctxt->decode; 2561 /* Write back the register source. */
2562 ctxt->src.val = ctxt->dst.val;
2563 write_register_operand(&ctxt->src);
2641 2564
2642 emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); 2565 /* Write back the memory destination with implicit LOCK prefix. */
2566 ctxt->dst.val = ctxt->src.orig_val;
2567 ctxt->lock_prefix = 1;
2643 return X86EMUL_CONTINUE; 2568 return X86EMUL_CONTINUE;
2644} 2569}
2645 2570
2646static int em_imul_3op(struct x86_emulate_ctxt *ctxt) 2571static int em_imul(struct x86_emulate_ctxt *ctxt)
2647{ 2572{
2648 struct decode_cache *c = &ctxt->decode; 2573 emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags);
2574 return X86EMUL_CONTINUE;
2575}
2649 2576
2650 c->dst.val = c->src2.val; 2577static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
2578{
2579 ctxt->dst.val = ctxt->src2.val;
2651 return em_imul(ctxt); 2580 return em_imul(ctxt);
2652} 2581}
2653 2582
2654static int em_cwd(struct x86_emulate_ctxt *ctxt) 2583static int em_cwd(struct x86_emulate_ctxt *ctxt)
2655{ 2584{
2656 struct decode_cache *c = &ctxt->decode; 2585 ctxt->dst.type = OP_REG;
2657 2586 ctxt->dst.bytes = ctxt->src.bytes;
2658 c->dst.type = OP_REG; 2587 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
2659 c->dst.bytes = c->src.bytes; 2588 ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
2660 c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
2661 c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
2662 2589
2663 return X86EMUL_CONTINUE; 2590 return X86EMUL_CONTINUE;
2664} 2591}
2665 2592
2666static int em_rdtsc(struct x86_emulate_ctxt *ctxt) 2593static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2667{ 2594{
2668 struct decode_cache *c = &ctxt->decode;
2669 u64 tsc = 0; 2595 u64 tsc = 0;
2670 2596
2671 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); 2597 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
2672 c->regs[VCPU_REGS_RAX] = (u32)tsc; 2598 ctxt->regs[VCPU_REGS_RAX] = (u32)tsc;
2673 c->regs[VCPU_REGS_RDX] = tsc >> 32; 2599 ctxt->regs[VCPU_REGS_RDX] = tsc >> 32;
2674 return X86EMUL_CONTINUE; 2600 return X86EMUL_CONTINUE;
2675} 2601}
2676 2602
2677static int em_mov(struct x86_emulate_ctxt *ctxt) 2603static int em_mov(struct x86_emulate_ctxt *ctxt)
2678{ 2604{
2679 struct decode_cache *c = &ctxt->decode; 2605 ctxt->dst.val = ctxt->src.val;
2680 c->dst.val = c->src.val;
2681 return X86EMUL_CONTINUE; 2606 return X86EMUL_CONTINUE;
2682} 2607}
2683 2608
2609static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
2610{
2611 if (ctxt->modrm_reg > VCPU_SREG_GS)
2612 return emulate_ud(ctxt);
2613
2614 ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg);
2615 return X86EMUL_CONTINUE;
2616}
2617
2618static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
2619{
2620 u16 sel = ctxt->src.val;
2621
2622 if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS)
2623 return emulate_ud(ctxt);
2624
2625 if (ctxt->modrm_reg == VCPU_SREG_SS)
2626 ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
2627
2628 /* Disable writeback. */
2629 ctxt->dst.type = OP_NONE;
2630 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
2631}
2632
2684static int em_movdqu(struct x86_emulate_ctxt *ctxt) 2633static int em_movdqu(struct x86_emulate_ctxt *ctxt)
2685{ 2634{
2686 struct decode_cache *c = &ctxt->decode; 2635 memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
2687 memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes);
2688 return X86EMUL_CONTINUE; 2636 return X86EMUL_CONTINUE;
2689} 2637}
2690 2638
2691static int em_invlpg(struct x86_emulate_ctxt *ctxt) 2639static int em_invlpg(struct x86_emulate_ctxt *ctxt)
2692{ 2640{
2693 struct decode_cache *c = &ctxt->decode;
2694 int rc; 2641 int rc;
2695 ulong linear; 2642 ulong linear;
2696 2643
2697 rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear); 2644 rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear);
2698 if (rc == X86EMUL_CONTINUE) 2645 if (rc == X86EMUL_CONTINUE)
2699 ctxt->ops->invlpg(ctxt, linear); 2646 ctxt->ops->invlpg(ctxt, linear);
2700 /* Disable writeback. */ 2647 /* Disable writeback. */
2701 c->dst.type = OP_NONE; 2648 ctxt->dst.type = OP_NONE;
2702 return X86EMUL_CONTINUE; 2649 return X86EMUL_CONTINUE;
2703} 2650}
2704 2651
@@ -2714,10 +2661,9 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
2714 2661
2715static int em_vmcall(struct x86_emulate_ctxt *ctxt) 2662static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2716{ 2663{
2717 struct decode_cache *c = &ctxt->decode;
2718 int rc; 2664 int rc;
2719 2665
2720 if (c->modrm_mod != 3 || c->modrm_rm != 1) 2666 if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1)
2721 return X86EMUL_UNHANDLEABLE; 2667 return X86EMUL_UNHANDLEABLE;
2722 2668
2723 rc = ctxt->ops->fix_hypercall(ctxt); 2669 rc = ctxt->ops->fix_hypercall(ctxt);
@@ -2725,73 +2671,104 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2725 return rc; 2671 return rc;
2726 2672
2727 /* Let the processor re-execute the fixed hypercall */ 2673 /* Let the processor re-execute the fixed hypercall */
2728 c->eip = ctxt->eip; 2674 ctxt->_eip = ctxt->eip;
2729 /* Disable writeback. */ 2675 /* Disable writeback. */
2730 c->dst.type = OP_NONE; 2676 ctxt->dst.type = OP_NONE;
2731 return X86EMUL_CONTINUE; 2677 return X86EMUL_CONTINUE;
2732} 2678}
2733 2679
2734static int em_lgdt(struct x86_emulate_ctxt *ctxt) 2680static int em_lgdt(struct x86_emulate_ctxt *ctxt)
2735{ 2681{
2736 struct decode_cache *c = &ctxt->decode;
2737 struct desc_ptr desc_ptr; 2682 struct desc_ptr desc_ptr;
2738 int rc; 2683 int rc;
2739 2684
2740 rc = read_descriptor(ctxt, c->src.addr.mem, 2685 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
2741 &desc_ptr.size, &desc_ptr.address, 2686 &desc_ptr.size, &desc_ptr.address,
2742 c->op_bytes); 2687 ctxt->op_bytes);
2743 if (rc != X86EMUL_CONTINUE) 2688 if (rc != X86EMUL_CONTINUE)
2744 return rc; 2689 return rc;
2745 ctxt->ops->set_gdt(ctxt, &desc_ptr); 2690 ctxt->ops->set_gdt(ctxt, &desc_ptr);
2746 /* Disable writeback. */ 2691 /* Disable writeback. */
2747 c->dst.type = OP_NONE; 2692 ctxt->dst.type = OP_NONE;
2748 return X86EMUL_CONTINUE; 2693 return X86EMUL_CONTINUE;
2749} 2694}
2750 2695
2751static int em_vmmcall(struct x86_emulate_ctxt *ctxt) 2696static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
2752{ 2697{
2753 struct decode_cache *c = &ctxt->decode;
2754 int rc; 2698 int rc;
2755 2699
2756 rc = ctxt->ops->fix_hypercall(ctxt); 2700 rc = ctxt->ops->fix_hypercall(ctxt);
2757 2701
2758 /* Disable writeback. */ 2702 /* Disable writeback. */
2759 c->dst.type = OP_NONE; 2703 ctxt->dst.type = OP_NONE;
2760 return rc; 2704 return rc;
2761} 2705}
2762 2706
2763static int em_lidt(struct x86_emulate_ctxt *ctxt) 2707static int em_lidt(struct x86_emulate_ctxt *ctxt)
2764{ 2708{
2765 struct decode_cache *c = &ctxt->decode;
2766 struct desc_ptr desc_ptr; 2709 struct desc_ptr desc_ptr;
2767 int rc; 2710 int rc;
2768 2711
2769 rc = read_descriptor(ctxt, c->src.addr.mem, 2712 rc = read_descriptor(ctxt, ctxt->src.addr.mem,
2770 &desc_ptr.size, &desc_ptr.address, 2713 &desc_ptr.size, &desc_ptr.address,
2771 c->op_bytes); 2714 ctxt->op_bytes);
2772 if (rc != X86EMUL_CONTINUE) 2715 if (rc != X86EMUL_CONTINUE)
2773 return rc; 2716 return rc;
2774 ctxt->ops->set_idt(ctxt, &desc_ptr); 2717 ctxt->ops->set_idt(ctxt, &desc_ptr);
2775 /* Disable writeback. */ 2718 /* Disable writeback. */
2776 c->dst.type = OP_NONE; 2719 ctxt->dst.type = OP_NONE;
2777 return X86EMUL_CONTINUE; 2720 return X86EMUL_CONTINUE;
2778} 2721}
2779 2722
2780static int em_smsw(struct x86_emulate_ctxt *ctxt) 2723static int em_smsw(struct x86_emulate_ctxt *ctxt)
2781{ 2724{
2782 struct decode_cache *c = &ctxt->decode; 2725 ctxt->dst.bytes = 2;
2783 2726 ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
2784 c->dst.bytes = 2;
2785 c->dst.val = ctxt->ops->get_cr(ctxt, 0);
2786 return X86EMUL_CONTINUE; 2727 return X86EMUL_CONTINUE;
2787} 2728}
2788 2729
2789static int em_lmsw(struct x86_emulate_ctxt *ctxt) 2730static int em_lmsw(struct x86_emulate_ctxt *ctxt)
2790{ 2731{
2791 struct decode_cache *c = &ctxt->decode;
2792 ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) 2732 ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
2793 | (c->src.val & 0x0f)); 2733 | (ctxt->src.val & 0x0f));
2794 c->dst.type = OP_NONE; 2734 ctxt->dst.type = OP_NONE;
2735 return X86EMUL_CONTINUE;
2736}
2737
2738static int em_loop(struct x86_emulate_ctxt *ctxt)
2739{
2740 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
2741 if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) &&
2742 (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
2743 jmp_rel(ctxt, ctxt->src.val);
2744
2745 return X86EMUL_CONTINUE;
2746}
2747
2748static int em_jcxz(struct x86_emulate_ctxt *ctxt)
2749{
2750 if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0)
2751 jmp_rel(ctxt, ctxt->src.val);
2752
2753 return X86EMUL_CONTINUE;
2754}
2755
2756static int em_cli(struct x86_emulate_ctxt *ctxt)
2757{
2758 if (emulator_bad_iopl(ctxt))
2759 return emulate_gp(ctxt, 0);
2760
2761 ctxt->eflags &= ~X86_EFLAGS_IF;
2762 return X86EMUL_CONTINUE;
2763}
2764
2765static int em_sti(struct x86_emulate_ctxt *ctxt)
2766{
2767 if (emulator_bad_iopl(ctxt))
2768 return emulate_gp(ctxt, 0);
2769
2770 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
2771 ctxt->eflags |= X86_EFLAGS_IF;
2795 return X86EMUL_CONTINUE; 2772 return X86EMUL_CONTINUE;
2796} 2773}
2797 2774
@@ -2809,9 +2786,7 @@ static bool valid_cr(int nr)
2809 2786
2810static int check_cr_read(struct x86_emulate_ctxt *ctxt) 2787static int check_cr_read(struct x86_emulate_ctxt *ctxt)
2811{ 2788{
2812 struct decode_cache *c = &ctxt->decode; 2789 if (!valid_cr(ctxt->modrm_reg))
2813
2814 if (!valid_cr(c->modrm_reg))
2815 return emulate_ud(ctxt); 2790 return emulate_ud(ctxt);
2816 2791
2817 return X86EMUL_CONTINUE; 2792 return X86EMUL_CONTINUE;
@@ -2819,9 +2794,8 @@ static int check_cr_read(struct x86_emulate_ctxt *ctxt)
2819 2794
2820static int check_cr_write(struct x86_emulate_ctxt *ctxt) 2795static int check_cr_write(struct x86_emulate_ctxt *ctxt)
2821{ 2796{
2822 struct decode_cache *c = &ctxt->decode; 2797 u64 new_val = ctxt->src.val64;
2823 u64 new_val = c->src.val64; 2798 int cr = ctxt->modrm_reg;
2824 int cr = c->modrm_reg;
2825 u64 efer = 0; 2799 u64 efer = 0;
2826 2800
2827 static u64 cr_reserved_bits[] = { 2801 static u64 cr_reserved_bits[] = {
@@ -2898,8 +2872,7 @@ static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
2898 2872
2899static int check_dr_read(struct x86_emulate_ctxt *ctxt) 2873static int check_dr_read(struct x86_emulate_ctxt *ctxt)
2900{ 2874{
2901 struct decode_cache *c = &ctxt->decode; 2875 int dr = ctxt->modrm_reg;
2902 int dr = c->modrm_reg;
2903 u64 cr4; 2876 u64 cr4;
2904 2877
2905 if (dr > 7) 2878 if (dr > 7)
@@ -2917,9 +2890,8 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
2917 2890
2918static int check_dr_write(struct x86_emulate_ctxt *ctxt) 2891static int check_dr_write(struct x86_emulate_ctxt *ctxt)
2919{ 2892{
2920 struct decode_cache *c = &ctxt->decode; 2893 u64 new_val = ctxt->src.val64;
2921 u64 new_val = c->src.val64; 2894 int dr = ctxt->modrm_reg;
2922 int dr = c->modrm_reg;
2923 2895
2924 if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) 2896 if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
2925 return emulate_gp(ctxt, 0); 2897 return emulate_gp(ctxt, 0);
@@ -2941,7 +2913,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt)
2941 2913
2942static int check_svme_pa(struct x86_emulate_ctxt *ctxt) 2914static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
2943{ 2915{
2944 u64 rax = ctxt->decode.regs[VCPU_REGS_RAX]; 2916 u64 rax = ctxt->regs[VCPU_REGS_RAX];
2945 2917
2946 /* Valid physical address? */ 2918 /* Valid physical address? */
2947 if (rax & 0xffff000000000000ULL) 2919 if (rax & 0xffff000000000000ULL)
@@ -2963,7 +2935,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
2963static int check_rdpmc(struct x86_emulate_ctxt *ctxt) 2935static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
2964{ 2936{
2965 u64 cr4 = ctxt->ops->get_cr(ctxt, 4); 2937 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
2966 u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX]; 2938 u64 rcx = ctxt->regs[VCPU_REGS_RCX];
2967 2939
2968 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || 2940 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
2969 (rcx > 3)) 2941 (rcx > 3))
@@ -2974,10 +2946,8 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
2974 2946
2975static int check_perm_in(struct x86_emulate_ctxt *ctxt) 2947static int check_perm_in(struct x86_emulate_ctxt *ctxt)
2976{ 2948{
2977 struct decode_cache *c = &ctxt->decode; 2949 ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
2978 2950 if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes))
2979 c->dst.bytes = min(c->dst.bytes, 4u);
2980 if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes))
2981 return emulate_gp(ctxt, 0); 2951 return emulate_gp(ctxt, 0);
2982 2952
2983 return X86EMUL_CONTINUE; 2953 return X86EMUL_CONTINUE;
@@ -2985,10 +2955,8 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt)
2985 2955
2986static int check_perm_out(struct x86_emulate_ctxt *ctxt) 2956static int check_perm_out(struct x86_emulate_ctxt *ctxt)
2987{ 2957{
2988 struct decode_cache *c = &ctxt->decode; 2958 ctxt->src.bytes = min(ctxt->src.bytes, 4u);
2989 2959 if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes))
2990 c->src.bytes = min(c->src.bytes, 4u);
2991 if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes))
2992 return emulate_gp(ctxt, 0); 2960 return emulate_gp(ctxt, 0);
2993 2961
2994 return X86EMUL_CONTINUE; 2962 return X86EMUL_CONTINUE;
@@ -3165,12 +3133,15 @@ static struct opcode opcode_table[256] = {
3165 G(DstMem | SrcImm | ModRM | Group, group1), 3133 G(DstMem | SrcImm | ModRM | Group, group1),
3166 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), 3134 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
3167 G(DstMem | SrcImmByte | ModRM | Group, group1), 3135 G(DstMem | SrcImmByte | ModRM | Group, group1),
3168 D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), 3136 I2bv(DstMem | SrcReg | ModRM, em_test),
3137 I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg),
3169 /* 0x88 - 0x8F */ 3138 /* 0x88 - 0x8F */
3170 I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), 3139 I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
3171 I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), 3140 I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
3172 D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), 3141 I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg),
3173 D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), 3142 D(ModRM | SrcMem | NoAccess | DstReg),
3143 I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
3144 G(0, group1A),
3174 /* 0x90 - 0x97 */ 3145 /* 0x90 - 0x97 */
3175 DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), 3146 DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
3176 /* 0x98 - 0x9F */ 3147 /* 0x98 - 0x9F */
@@ -3184,7 +3155,7 @@ static struct opcode opcode_table[256] = {
3184 I2bv(SrcSI | DstDI | Mov | String, em_mov), 3155 I2bv(SrcSI | DstDI | Mov | String, em_mov),
3185 I2bv(SrcSI | DstDI | String, em_cmp), 3156 I2bv(SrcSI | DstDI | String, em_cmp),
3186 /* 0xA8 - 0xAF */ 3157 /* 0xA8 - 0xAF */
3187 D2bv(DstAcc | SrcImm), 3158 I2bv(DstAcc | SrcImm, em_test),
3188 I2bv(SrcAcc | DstDI | Mov | String, em_mov), 3159 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
3189 I2bv(SrcSI | DstAcc | Mov | String, em_mov), 3160 I2bv(SrcSI | DstAcc | Mov | String, em_mov),
3190 I2bv(SrcAcc | DstDI | String, em_cmp), 3161 I2bv(SrcAcc | DstDI | String, em_cmp),
@@ -3195,25 +3166,26 @@ static struct opcode opcode_table[256] = {
3195 /* 0xC0 - 0xC7 */ 3166 /* 0xC0 - 0xC7 */
3196 D2bv(DstMem | SrcImmByte | ModRM), 3167 D2bv(DstMem | SrcImmByte | ModRM),
3197 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), 3168 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
3198 D(ImplicitOps | Stack), 3169 I(ImplicitOps | Stack, em_ret),
3199 D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), 3170 D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
3200 G(ByteOp, group11), G(0, group11), 3171 G(ByteOp, group11), G(0, group11),
3201 /* 0xC8 - 0xCF */ 3172 /* 0xC8 - 0xCF */
3202 N, N, N, D(ImplicitOps | Stack), 3173 N, N, N, I(ImplicitOps | Stack, em_ret_far),
3203 D(ImplicitOps), DI(SrcImmByte, intn), 3174 D(ImplicitOps), DI(SrcImmByte, intn),
3204 D(ImplicitOps | No64), DI(ImplicitOps, iret), 3175 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
3205 /* 0xD0 - 0xD7 */ 3176 /* 0xD0 - 0xD7 */
3206 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), 3177 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
3207 N, N, N, N, 3178 N, N, N, N,
3208 /* 0xD8 - 0xDF */ 3179 /* 0xD8 - 0xDF */
3209 N, N, N, N, N, N, N, N, 3180 N, N, N, N, N, N, N, N,
3210 /* 0xE0 - 0xE7 */ 3181 /* 0xE0 - 0xE7 */
3211 X4(D(SrcImmByte)), 3182 X3(I(SrcImmByte, em_loop)),
3183 I(SrcImmByte, em_jcxz),
3212 D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), 3184 D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in),
3213 D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), 3185 D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
3214 /* 0xE8 - 0xEF */ 3186 /* 0xE8 - 0xEF */
3215 D(SrcImm | Stack), D(SrcImm | ImplicitOps), 3187 D(SrcImm | Stack), D(SrcImm | ImplicitOps),
3216 D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), 3188 I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
3217 D2bvIP(SrcDX | DstAcc, in, check_perm_in), 3189 D2bvIP(SrcDX | DstAcc, in, check_perm_in),
3218 D2bvIP(SrcAcc | DstDX, out, check_perm_out), 3190 D2bvIP(SrcAcc | DstDX, out, check_perm_out),
3219 /* 0xF0 - 0xF7 */ 3191 /* 0xF0 - 0xF7 */
@@ -3221,14 +3193,16 @@ static struct opcode opcode_table[256] = {
3221 DI(ImplicitOps | Priv, hlt), D(ImplicitOps), 3193 DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
3222 G(ByteOp, group3), G(0, group3), 3194 G(ByteOp, group3), G(0, group3),
3223 /* 0xF8 - 0xFF */ 3195 /* 0xF8 - 0xFF */
3224 D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), 3196 D(ImplicitOps), D(ImplicitOps),
3197 I(ImplicitOps, em_cli), I(ImplicitOps, em_sti),
3225 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), 3198 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
3226}; 3199};
3227 3200
3228static struct opcode twobyte_table[256] = { 3201static struct opcode twobyte_table[256] = {
3229 /* 0x00 - 0x0F */ 3202 /* 0x00 - 0x0F */
3230 G(0, group6), GD(0, &group7), N, N, 3203 G(0, group6), GD(0, &group7), N, N,
3231 N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N, 3204 N, I(ImplicitOps | VendorSpecific, em_syscall),
3205 II(ImplicitOps | Priv, em_clts, clts), N,
3232 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, 3206 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
3233 N, D(ImplicitOps | ModRM), N, N, 3207 N, D(ImplicitOps | ModRM), N, N,
3234 /* 0x10 - 0x1F */ 3208 /* 0x10 - 0x1F */
@@ -3245,7 +3219,8 @@ static struct opcode twobyte_table[256] = {
3245 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), 3219 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
3246 DI(ImplicitOps | Priv, rdmsr), 3220 DI(ImplicitOps | Priv, rdmsr),
3247 DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), 3221 DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
3248 D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), 3222 I(ImplicitOps | VendorSpecific, em_sysenter),
3223 I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
3249 N, N, 3224 N, N,
3250 N, N, N, N, N, N, N, N, 3225 N, N, N, N, N, N, N, N,
3251 /* 0x40 - 0x4F */ 3226 /* 0x40 - 0x4F */
@@ -3313,11 +3288,11 @@ static struct opcode twobyte_table[256] = {
3313#undef I2bv 3288#undef I2bv
3314#undef I6ALU 3289#undef I6ALU
3315 3290
3316static unsigned imm_size(struct decode_cache *c) 3291static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
3317{ 3292{
3318 unsigned size; 3293 unsigned size;
3319 3294
3320 size = (c->d & ByteOp) ? 1 : c->op_bytes; 3295 size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3321 if (size == 8) 3296 if (size == 8)
3322 size = 4; 3297 size = 4;
3323 return size; 3298 return size;
@@ -3326,23 +3301,21 @@ static unsigned imm_size(struct decode_cache *c)
3326static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, 3301static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
3327 unsigned size, bool sign_extension) 3302 unsigned size, bool sign_extension)
3328{ 3303{
3329 struct decode_cache *c = &ctxt->decode;
3330 struct x86_emulate_ops *ops = ctxt->ops;
3331 int rc = X86EMUL_CONTINUE; 3304 int rc = X86EMUL_CONTINUE;
3332 3305
3333 op->type = OP_IMM; 3306 op->type = OP_IMM;
3334 op->bytes = size; 3307 op->bytes = size;
3335 op->addr.mem.ea = c->eip; 3308 op->addr.mem.ea = ctxt->_eip;
3336 /* NB. Immediates are sign-extended as necessary. */ 3309 /* NB. Immediates are sign-extended as necessary. */
3337 switch (op->bytes) { 3310 switch (op->bytes) {
3338 case 1: 3311 case 1:
3339 op->val = insn_fetch(s8, 1, c->eip); 3312 op->val = insn_fetch(s8, 1, ctxt->_eip);
3340 break; 3313 break;
3341 case 2: 3314 case 2:
3342 op->val = insn_fetch(s16, 2, c->eip); 3315 op->val = insn_fetch(s16, 2, ctxt->_eip);
3343 break; 3316 break;
3344 case 4: 3317 case 4:
3345 op->val = insn_fetch(s32, 4, c->eip); 3318 op->val = insn_fetch(s32, 4, ctxt->_eip);
3346 break; 3319 break;
3347 } 3320 }
3348 if (!sign_extension) { 3321 if (!sign_extension) {
@@ -3362,11 +3335,8 @@ done:
3362 return rc; 3335 return rc;
3363} 3336}
3364 3337
3365int 3338int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3366x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3367{ 3339{
3368 struct x86_emulate_ops *ops = ctxt->ops;
3369 struct decode_cache *c = &ctxt->decode;
3370 int rc = X86EMUL_CONTINUE; 3340 int rc = X86EMUL_CONTINUE;
3371 int mode = ctxt->mode; 3341 int mode = ctxt->mode;
3372 int def_op_bytes, def_ad_bytes, goffset, simd_prefix; 3342 int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
@@ -3374,11 +3344,11 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3374 struct opcode opcode; 3344 struct opcode opcode;
3375 struct operand memop = { .type = OP_NONE }, *memopp = NULL; 3345 struct operand memop = { .type = OP_NONE }, *memopp = NULL;
3376 3346
3377 c->eip = ctxt->eip; 3347 ctxt->_eip = ctxt->eip;
3378 c->fetch.start = c->eip; 3348 ctxt->fetch.start = ctxt->_eip;
3379 c->fetch.end = c->fetch.start + insn_len; 3349 ctxt->fetch.end = ctxt->fetch.start + insn_len;
3380 if (insn_len > 0) 3350 if (insn_len > 0)
3381 memcpy(c->fetch.data, insn, insn_len); 3351 memcpy(ctxt->fetch.data, insn, insn_len);
3382 3352
3383 switch (mode) { 3353 switch (mode) {
3384 case X86EMUL_MODE_REAL: 3354 case X86EMUL_MODE_REAL:
@@ -3399,46 +3369,46 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3399 return -1; 3369 return -1;
3400 } 3370 }
3401 3371
3402 c->op_bytes = def_op_bytes; 3372 ctxt->op_bytes = def_op_bytes;
3403 c->ad_bytes = def_ad_bytes; 3373 ctxt->ad_bytes = def_ad_bytes;
3404 3374
3405 /* Legacy prefixes. */ 3375 /* Legacy prefixes. */
3406 for (;;) { 3376 for (;;) {
3407 switch (c->b = insn_fetch(u8, 1, c->eip)) { 3377 switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) {
3408 case 0x66: /* operand-size override */ 3378 case 0x66: /* operand-size override */
3409 op_prefix = true; 3379 op_prefix = true;
3410 /* switch between 2/4 bytes */ 3380 /* switch between 2/4 bytes */
3411 c->op_bytes = def_op_bytes ^ 6; 3381 ctxt->op_bytes = def_op_bytes ^ 6;
3412 break; 3382 break;
3413 case 0x67: /* address-size override */ 3383 case 0x67: /* address-size override */
3414 if (mode == X86EMUL_MODE_PROT64) 3384 if (mode == X86EMUL_MODE_PROT64)
3415 /* switch between 4/8 bytes */ 3385 /* switch between 4/8 bytes */
3416 c->ad_bytes = def_ad_bytes ^ 12; 3386 ctxt->ad_bytes = def_ad_bytes ^ 12;
3417 else 3387 else
3418 /* switch between 2/4 bytes */ 3388 /* switch between 2/4 bytes */
3419 c->ad_bytes = def_ad_bytes ^ 6; 3389 ctxt->ad_bytes = def_ad_bytes ^ 6;
3420 break; 3390 break;
3421 case 0x26: /* ES override */ 3391 case 0x26: /* ES override */
3422 case 0x2e: /* CS override */ 3392 case 0x2e: /* CS override */
3423 case 0x36: /* SS override */ 3393 case 0x36: /* SS override */
3424 case 0x3e: /* DS override */ 3394 case 0x3e: /* DS override */
3425 set_seg_override(c, (c->b >> 3) & 3); 3395 set_seg_override(ctxt, (ctxt->b >> 3) & 3);
3426 break; 3396 break;
3427 case 0x64: /* FS override */ 3397 case 0x64: /* FS override */
3428 case 0x65: /* GS override */ 3398 case 0x65: /* GS override */
3429 set_seg_override(c, c->b & 7); 3399 set_seg_override(ctxt, ctxt->b & 7);
3430 break; 3400 break;
3431 case 0x40 ... 0x4f: /* REX */ 3401 case 0x40 ... 0x4f: /* REX */
3432 if (mode != X86EMUL_MODE_PROT64) 3402 if (mode != X86EMUL_MODE_PROT64)
3433 goto done_prefixes; 3403 goto done_prefixes;
3434 c->rex_prefix = c->b; 3404 ctxt->rex_prefix = ctxt->b;
3435 continue; 3405 continue;
3436 case 0xf0: /* LOCK */ 3406 case 0xf0: /* LOCK */
3437 c->lock_prefix = 1; 3407 ctxt->lock_prefix = 1;
3438 break; 3408 break;
3439 case 0xf2: /* REPNE/REPNZ */ 3409 case 0xf2: /* REPNE/REPNZ */
3440 case 0xf3: /* REP/REPE/REPZ */ 3410 case 0xf3: /* REP/REPE/REPZ */
3441 c->rep_prefix = c->b; 3411 ctxt->rep_prefix = ctxt->b;
3442 break; 3412 break;
3443 default: 3413 default:
3444 goto done_prefixes; 3414 goto done_prefixes;
@@ -3446,50 +3416,50 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3446 3416
3447 /* Any legacy prefix after a REX prefix nullifies its effect. */ 3417 /* Any legacy prefix after a REX prefix nullifies its effect. */
3448 3418
3449 c->rex_prefix = 0; 3419 ctxt->rex_prefix = 0;
3450 } 3420 }
3451 3421
3452done_prefixes: 3422done_prefixes:
3453 3423
3454 /* REX prefix. */ 3424 /* REX prefix. */
3455 if (c->rex_prefix & 8) 3425 if (ctxt->rex_prefix & 8)
3456 c->op_bytes = 8; /* REX.W */ 3426 ctxt->op_bytes = 8; /* REX.W */
3457 3427
3458 /* Opcode byte(s). */ 3428 /* Opcode byte(s). */
3459 opcode = opcode_table[c->b]; 3429 opcode = opcode_table[ctxt->b];
3460 /* Two-byte opcode? */ 3430 /* Two-byte opcode? */
3461 if (c->b == 0x0f) { 3431 if (ctxt->b == 0x0f) {
3462 c->twobyte = 1; 3432 ctxt->twobyte = 1;
3463 c->b = insn_fetch(u8, 1, c->eip); 3433 ctxt->b = insn_fetch(u8, 1, ctxt->_eip);
3464 opcode = twobyte_table[c->b]; 3434 opcode = twobyte_table[ctxt->b];
3465 } 3435 }
3466 c->d = opcode.flags; 3436 ctxt->d = opcode.flags;
3467 3437
3468 while (c->d & GroupMask) { 3438 while (ctxt->d & GroupMask) {
3469 switch (c->d & GroupMask) { 3439 switch (ctxt->d & GroupMask) {
3470 case Group: 3440 case Group:
3471 c->modrm = insn_fetch(u8, 1, c->eip); 3441 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
3472 --c->eip; 3442 --ctxt->_eip;
3473 goffset = (c->modrm >> 3) & 7; 3443 goffset = (ctxt->modrm >> 3) & 7;
3474 opcode = opcode.u.group[goffset]; 3444 opcode = opcode.u.group[goffset];
3475 break; 3445 break;
3476 case GroupDual: 3446 case GroupDual:
3477 c->modrm = insn_fetch(u8, 1, c->eip); 3447 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
3478 --c->eip; 3448 --ctxt->_eip;
3479 goffset = (c->modrm >> 3) & 7; 3449 goffset = (ctxt->modrm >> 3) & 7;
3480 if ((c->modrm >> 6) == 3) 3450 if ((ctxt->modrm >> 6) == 3)
3481 opcode = opcode.u.gdual->mod3[goffset]; 3451 opcode = opcode.u.gdual->mod3[goffset];
3482 else 3452 else
3483 opcode = opcode.u.gdual->mod012[goffset]; 3453 opcode = opcode.u.gdual->mod012[goffset];
3484 break; 3454 break;
3485 case RMExt: 3455 case RMExt:
3486 goffset = c->modrm & 7; 3456 goffset = ctxt->modrm & 7;
3487 opcode = opcode.u.group[goffset]; 3457 opcode = opcode.u.group[goffset];
3488 break; 3458 break;
3489 case Prefix: 3459 case Prefix:
3490 if (c->rep_prefix && op_prefix) 3460 if (ctxt->rep_prefix && op_prefix)
3491 return X86EMUL_UNHANDLEABLE; 3461 return X86EMUL_UNHANDLEABLE;
3492 simd_prefix = op_prefix ? 0x66 : c->rep_prefix; 3462 simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
3493 switch (simd_prefix) { 3463 switch (simd_prefix) {
3494 case 0x00: opcode = opcode.u.gprefix->pfx_no; break; 3464 case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
3495 case 0x66: opcode = opcode.u.gprefix->pfx_66; break; 3465 case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
@@ -3501,61 +3471,61 @@ done_prefixes:
3501 return X86EMUL_UNHANDLEABLE; 3471 return X86EMUL_UNHANDLEABLE;
3502 } 3472 }
3503 3473
3504 c->d &= ~GroupMask; 3474 ctxt->d &= ~GroupMask;
3505 c->d |= opcode.flags; 3475 ctxt->d |= opcode.flags;
3506 } 3476 }
3507 3477
3508 c->execute = opcode.u.execute; 3478 ctxt->execute = opcode.u.execute;
3509 c->check_perm = opcode.check_perm; 3479 ctxt->check_perm = opcode.check_perm;
3510 c->intercept = opcode.intercept; 3480 ctxt->intercept = opcode.intercept;
3511 3481
3512 /* Unrecognised? */ 3482 /* Unrecognised? */
3513 if (c->d == 0 || (c->d & Undefined)) 3483 if (ctxt->d == 0 || (ctxt->d & Undefined))
3514 return -1; 3484 return -1;
3515 3485
3516 if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn) 3486 if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
3517 return -1; 3487 return -1;
3518 3488
3519 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 3489 if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
3520 c->op_bytes = 8; 3490 ctxt->op_bytes = 8;
3521 3491
3522 if (c->d & Op3264) { 3492 if (ctxt->d & Op3264) {
3523 if (mode == X86EMUL_MODE_PROT64) 3493 if (mode == X86EMUL_MODE_PROT64)
3524 c->op_bytes = 8; 3494 ctxt->op_bytes = 8;
3525 else 3495 else
3526 c->op_bytes = 4; 3496 ctxt->op_bytes = 4;
3527 } 3497 }
3528 3498
3529 if (c->d & Sse) 3499 if (ctxt->d & Sse)
3530 c->op_bytes = 16; 3500 ctxt->op_bytes = 16;
3531 3501
3532 /* ModRM and SIB bytes. */ 3502 /* ModRM and SIB bytes. */
3533 if (c->d & ModRM) { 3503 if (ctxt->d & ModRM) {
3534 rc = decode_modrm(ctxt, ops, &memop); 3504 rc = decode_modrm(ctxt, &memop);
3535 if (!c->has_seg_override) 3505 if (!ctxt->has_seg_override)
3536 set_seg_override(c, c->modrm_seg); 3506 set_seg_override(ctxt, ctxt->modrm_seg);
3537 } else if (c->d & MemAbs) 3507 } else if (ctxt->d & MemAbs)
3538 rc = decode_abs(ctxt, ops, &memop); 3508 rc = decode_abs(ctxt, &memop);
3539 if (rc != X86EMUL_CONTINUE) 3509 if (rc != X86EMUL_CONTINUE)
3540 goto done; 3510 goto done;
3541 3511
3542 if (!c->has_seg_override) 3512 if (!ctxt->has_seg_override)
3543 set_seg_override(c, VCPU_SREG_DS); 3513 set_seg_override(ctxt, VCPU_SREG_DS);
3544 3514
3545 memop.addr.mem.seg = seg_override(ctxt, c); 3515 memop.addr.mem.seg = seg_override(ctxt);
3546 3516
3547 if (memop.type == OP_MEM && c->ad_bytes != 8) 3517 if (memop.type == OP_MEM && ctxt->ad_bytes != 8)
3548 memop.addr.mem.ea = (u32)memop.addr.mem.ea; 3518 memop.addr.mem.ea = (u32)memop.addr.mem.ea;
3549 3519
3550 /* 3520 /*
3551 * Decode and fetch the source operand: register, memory 3521 * Decode and fetch the source operand: register, memory
3552 * or immediate. 3522 * or immediate.
3553 */ 3523 */
3554 switch (c->d & SrcMask) { 3524 switch (ctxt->d & SrcMask) {
3555 case SrcNone: 3525 case SrcNone:
3556 break; 3526 break;
3557 case SrcReg: 3527 case SrcReg:
3558 decode_register_operand(ctxt, &c->src, c, 0); 3528 decode_register_operand(ctxt, &ctxt->src, 0);
3559 break; 3529 break;
3560 case SrcMem16: 3530 case SrcMem16:
3561 memop.bytes = 2; 3531 memop.bytes = 2;
@@ -3564,60 +3534,60 @@ done_prefixes:
3564 memop.bytes = 4; 3534 memop.bytes = 4;
3565 goto srcmem_common; 3535 goto srcmem_common;
3566 case SrcMem: 3536 case SrcMem:
3567 memop.bytes = (c->d & ByteOp) ? 1 : 3537 memop.bytes = (ctxt->d & ByteOp) ? 1 :
3568 c->op_bytes; 3538 ctxt->op_bytes;
3569 srcmem_common: 3539 srcmem_common:
3570 c->src = memop; 3540 ctxt->src = memop;
3571 memopp = &c->src; 3541 memopp = &ctxt->src;
3572 break; 3542 break;
3573 case SrcImmU16: 3543 case SrcImmU16:
3574 rc = decode_imm(ctxt, &c->src, 2, false); 3544 rc = decode_imm(ctxt, &ctxt->src, 2, false);
3575 break; 3545 break;
3576 case SrcImm: 3546 case SrcImm:
3577 rc = decode_imm(ctxt, &c->src, imm_size(c), true); 3547 rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true);
3578 break; 3548 break;
3579 case SrcImmU: 3549 case SrcImmU:
3580 rc = decode_imm(ctxt, &c->src, imm_size(c), false); 3550 rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false);
3581 break; 3551 break;
3582 case SrcImmByte: 3552 case SrcImmByte:
3583 rc = decode_imm(ctxt, &c->src, 1, true); 3553 rc = decode_imm(ctxt, &ctxt->src, 1, true);
3584 break; 3554 break;
3585 case SrcImmUByte: 3555 case SrcImmUByte:
3586 rc = decode_imm(ctxt, &c->src, 1, false); 3556 rc = decode_imm(ctxt, &ctxt->src, 1, false);
3587 break; 3557 break;
3588 case SrcAcc: 3558 case SrcAcc:
3589 c->src.type = OP_REG; 3559 ctxt->src.type = OP_REG;
3590 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3560 ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3591 c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; 3561 ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
3592 fetch_register_operand(&c->src); 3562 fetch_register_operand(&ctxt->src);
3593 break; 3563 break;
3594 case SrcOne: 3564 case SrcOne:
3595 c->src.bytes = 1; 3565 ctxt->src.bytes = 1;
3596 c->src.val = 1; 3566 ctxt->src.val = 1;
3597 break; 3567 break;
3598 case SrcSI: 3568 case SrcSI:
3599 c->src.type = OP_MEM; 3569 ctxt->src.type = OP_MEM;
3600 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3570 ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3601 c->src.addr.mem.ea = 3571 ctxt->src.addr.mem.ea =
3602 register_address(c, c->regs[VCPU_REGS_RSI]); 3572 register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
3603 c->src.addr.mem.seg = seg_override(ctxt, c); 3573 ctxt->src.addr.mem.seg = seg_override(ctxt);
3604 c->src.val = 0; 3574 ctxt->src.val = 0;
3605 break; 3575 break;
3606 case SrcImmFAddr: 3576 case SrcImmFAddr:
3607 c->src.type = OP_IMM; 3577 ctxt->src.type = OP_IMM;
3608 c->src.addr.mem.ea = c->eip; 3578 ctxt->src.addr.mem.ea = ctxt->_eip;
3609 c->src.bytes = c->op_bytes + 2; 3579 ctxt->src.bytes = ctxt->op_bytes + 2;
3610 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); 3580 insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip);
3611 break; 3581 break;
3612 case SrcMemFAddr: 3582 case SrcMemFAddr:
3613 memop.bytes = c->op_bytes + 2; 3583 memop.bytes = ctxt->op_bytes + 2;
3614 goto srcmem_common; 3584 goto srcmem_common;
3615 break; 3585 break;
3616 case SrcDX: 3586 case SrcDX:
3617 c->src.type = OP_REG; 3587 ctxt->src.type = OP_REG;
3618 c->src.bytes = 2; 3588 ctxt->src.bytes = 2;
3619 c->src.addr.reg = &c->regs[VCPU_REGS_RDX]; 3589 ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
3620 fetch_register_operand(&c->src); 3590 fetch_register_operand(&ctxt->src);
3621 break; 3591 break;
3622 } 3592 }
3623 3593
@@ -3628,22 +3598,22 @@ done_prefixes:
3628 * Decode and fetch the second source operand: register, memory 3598 * Decode and fetch the second source operand: register, memory
3629 * or immediate. 3599 * or immediate.
3630 */ 3600 */
3631 switch (c->d & Src2Mask) { 3601 switch (ctxt->d & Src2Mask) {
3632 case Src2None: 3602 case Src2None:
3633 break; 3603 break;
3634 case Src2CL: 3604 case Src2CL:
3635 c->src2.bytes = 1; 3605 ctxt->src2.bytes = 1;
3636 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; 3606 ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0x8;
3637 break; 3607 break;
3638 case Src2ImmByte: 3608 case Src2ImmByte:
3639 rc = decode_imm(ctxt, &c->src2, 1, true); 3609 rc = decode_imm(ctxt, &ctxt->src2, 1, true);
3640 break; 3610 break;
3641 case Src2One: 3611 case Src2One:
3642 c->src2.bytes = 1; 3612 ctxt->src2.bytes = 1;
3643 c->src2.val = 1; 3613 ctxt->src2.val = 1;
3644 break; 3614 break;
3645 case Src2Imm: 3615 case Src2Imm:
3646 rc = decode_imm(ctxt, &c->src2, imm_size(c), true); 3616 rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true);
3647 break; 3617 break;
3648 } 3618 }
3649 3619
@@ -3651,68 +3621,66 @@ done_prefixes:
3651 goto done; 3621 goto done;
3652 3622
3653 /* Decode and fetch the destination operand: register or memory. */ 3623 /* Decode and fetch the destination operand: register or memory. */
3654 switch (c->d & DstMask) { 3624 switch (ctxt->d & DstMask) {
3655 case DstReg: 3625 case DstReg:
3656 decode_register_operand(ctxt, &c->dst, c, 3626 decode_register_operand(ctxt, &ctxt->dst,
3657 c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); 3627 ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
3658 break; 3628 break;
3659 case DstImmUByte: 3629 case DstImmUByte:
3660 c->dst.type = OP_IMM; 3630 ctxt->dst.type = OP_IMM;
3661 c->dst.addr.mem.ea = c->eip; 3631 ctxt->dst.addr.mem.ea = ctxt->_eip;
3662 c->dst.bytes = 1; 3632 ctxt->dst.bytes = 1;
3663 c->dst.val = insn_fetch(u8, 1, c->eip); 3633 ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip);
3664 break; 3634 break;
3665 case DstMem: 3635 case DstMem:
3666 case DstMem64: 3636 case DstMem64:
3667 c->dst = memop; 3637 ctxt->dst = memop;
3668 memopp = &c->dst; 3638 memopp = &ctxt->dst;
3669 if ((c->d & DstMask) == DstMem64) 3639 if ((ctxt->d & DstMask) == DstMem64)
3670 c->dst.bytes = 8; 3640 ctxt->dst.bytes = 8;
3671 else 3641 else
3672 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3642 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3673 if (c->d & BitOp) 3643 if (ctxt->d & BitOp)
3674 fetch_bit_operand(c); 3644 fetch_bit_operand(ctxt);
3675 c->dst.orig_val = c->dst.val; 3645 ctxt->dst.orig_val = ctxt->dst.val;
3676 break; 3646 break;
3677 case DstAcc: 3647 case DstAcc:
3678 c->dst.type = OP_REG; 3648 ctxt->dst.type = OP_REG;
3679 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3649 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3680 c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; 3650 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
3681 fetch_register_operand(&c->dst); 3651 fetch_register_operand(&ctxt->dst);
3682 c->dst.orig_val = c->dst.val; 3652 ctxt->dst.orig_val = ctxt->dst.val;
3683 break; 3653 break;
3684 case DstDI: 3654 case DstDI:
3685 c->dst.type = OP_MEM; 3655 ctxt->dst.type = OP_MEM;
3686 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3656 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3687 c->dst.addr.mem.ea = 3657 ctxt->dst.addr.mem.ea =
3688 register_address(c, c->regs[VCPU_REGS_RDI]); 3658 register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
3689 c->dst.addr.mem.seg = VCPU_SREG_ES; 3659 ctxt->dst.addr.mem.seg = VCPU_SREG_ES;
3690 c->dst.val = 0; 3660 ctxt->dst.val = 0;
3691 break; 3661 break;
3692 case DstDX: 3662 case DstDX:
3693 c->dst.type = OP_REG; 3663 ctxt->dst.type = OP_REG;
3694 c->dst.bytes = 2; 3664 ctxt->dst.bytes = 2;
3695 c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; 3665 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
3696 fetch_register_operand(&c->dst); 3666 fetch_register_operand(&ctxt->dst);
3697 break; 3667 break;
3698 case ImplicitOps: 3668 case ImplicitOps:
3699 /* Special instructions do their own operand decoding. */ 3669 /* Special instructions do their own operand decoding. */
3700 default: 3670 default:
3701 c->dst.type = OP_NONE; /* Disable writeback. */ 3671 ctxt->dst.type = OP_NONE; /* Disable writeback. */
3702 break; 3672 break;
3703 } 3673 }
3704 3674
3705done: 3675done:
3706 if (memopp && memopp->type == OP_MEM && c->rip_relative) 3676 if (memopp && memopp->type == OP_MEM && ctxt->rip_relative)
3707 memopp->addr.mem.ea += c->eip; 3677 memopp->addr.mem.ea += ctxt->_eip;
3708 3678
3709 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 3679 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3710} 3680}
3711 3681
3712static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) 3682static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
3713{ 3683{
3714 struct decode_cache *c = &ctxt->decode;
3715
3716 /* The second termination condition only applies for REPE 3684 /* The second termination condition only applies for REPE
3717 * and REPNE. Test if the repeat string operation prefix is 3685 * and REPNE. Test if the repeat string operation prefix is
3718 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the 3686 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
@@ -3720,304 +3688,232 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
3720 * - if REPE/REPZ and ZF = 0 then done 3688 * - if REPE/REPZ and ZF = 0 then done
3721 * - if REPNE/REPNZ and ZF = 1 then done 3689 * - if REPNE/REPNZ and ZF = 1 then done
3722 */ 3690 */
3723 if (((c->b == 0xa6) || (c->b == 0xa7) || 3691 if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
3724 (c->b == 0xae) || (c->b == 0xaf)) 3692 (ctxt->b == 0xae) || (ctxt->b == 0xaf))
3725 && (((c->rep_prefix == REPE_PREFIX) && 3693 && (((ctxt->rep_prefix == REPE_PREFIX) &&
3726 ((ctxt->eflags & EFLG_ZF) == 0)) 3694 ((ctxt->eflags & EFLG_ZF) == 0))
3727 || ((c->rep_prefix == REPNE_PREFIX) && 3695 || ((ctxt->rep_prefix == REPNE_PREFIX) &&
3728 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) 3696 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
3729 return true; 3697 return true;
3730 3698
3731 return false; 3699 return false;
3732} 3700}
3733 3701
3734int 3702int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3735x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3736{ 3703{
3737 struct x86_emulate_ops *ops = ctxt->ops; 3704 struct x86_emulate_ops *ops = ctxt->ops;
3738 u64 msr_data; 3705 u64 msr_data;
3739 struct decode_cache *c = &ctxt->decode;
3740 int rc = X86EMUL_CONTINUE; 3706 int rc = X86EMUL_CONTINUE;
3741 int saved_dst_type = c->dst.type; 3707 int saved_dst_type = ctxt->dst.type;
3742 int irq; /* Used for int 3, int, and into */
3743 3708
3744 ctxt->decode.mem_read.pos = 0; 3709 ctxt->mem_read.pos = 0;
3745 3710
3746 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 3711 if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) {
3747 rc = emulate_ud(ctxt); 3712 rc = emulate_ud(ctxt);
3748 goto done; 3713 goto done;
3749 } 3714 }
3750 3715
3751 /* LOCK prefix is allowed only with some instructions */ 3716 /* LOCK prefix is allowed only with some instructions */
3752 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 3717 if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
3753 rc = emulate_ud(ctxt); 3718 rc = emulate_ud(ctxt);
3754 goto done; 3719 goto done;
3755 } 3720 }
3756 3721
3757 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { 3722 if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) {
3758 rc = emulate_ud(ctxt); 3723 rc = emulate_ud(ctxt);
3759 goto done; 3724 goto done;
3760 } 3725 }
3761 3726
3762 if ((c->d & Sse) 3727 if ((ctxt->d & Sse)
3763 && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) 3728 && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
3764 || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { 3729 || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
3765 rc = emulate_ud(ctxt); 3730 rc = emulate_ud(ctxt);
3766 goto done; 3731 goto done;
3767 } 3732 }
3768 3733
3769 if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { 3734 if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
3770 rc = emulate_nm(ctxt); 3735 rc = emulate_nm(ctxt);
3771 goto done; 3736 goto done;
3772 } 3737 }
3773 3738
3774 if (unlikely(ctxt->guest_mode) && c->intercept) { 3739 if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
3775 rc = emulator_check_intercept(ctxt, c->intercept, 3740 rc = emulator_check_intercept(ctxt, ctxt->intercept,
3776 X86_ICPT_PRE_EXCEPT); 3741 X86_ICPT_PRE_EXCEPT);
3777 if (rc != X86EMUL_CONTINUE) 3742 if (rc != X86EMUL_CONTINUE)
3778 goto done; 3743 goto done;
3779 } 3744 }
3780 3745
3781 /* Privileged instruction can be executed only in CPL=0 */ 3746 /* Privileged instruction can be executed only in CPL=0 */
3782 if ((c->d & Priv) && ops->cpl(ctxt)) { 3747 if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
3783 rc = emulate_gp(ctxt, 0); 3748 rc = emulate_gp(ctxt, 0);
3784 goto done; 3749 goto done;
3785 } 3750 }
3786 3751
3787 /* Instruction can only be executed in protected mode */ 3752 /* Instruction can only be executed in protected mode */
3788 if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { 3753 if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
3789 rc = emulate_ud(ctxt); 3754 rc = emulate_ud(ctxt);
3790 goto done; 3755 goto done;
3791 } 3756 }
3792 3757
3793 /* Do instruction specific permission checks */ 3758 /* Do instruction specific permission checks */
3794 if (c->check_perm) { 3759 if (ctxt->check_perm) {
3795 rc = c->check_perm(ctxt); 3760 rc = ctxt->check_perm(ctxt);
3796 if (rc != X86EMUL_CONTINUE) 3761 if (rc != X86EMUL_CONTINUE)
3797 goto done; 3762 goto done;
3798 } 3763 }
3799 3764
3800 if (unlikely(ctxt->guest_mode) && c->intercept) { 3765 if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
3801 rc = emulator_check_intercept(ctxt, c->intercept, 3766 rc = emulator_check_intercept(ctxt, ctxt->intercept,
3802 X86_ICPT_POST_EXCEPT); 3767 X86_ICPT_POST_EXCEPT);
3803 if (rc != X86EMUL_CONTINUE) 3768 if (rc != X86EMUL_CONTINUE)
3804 goto done; 3769 goto done;
3805 } 3770 }
3806 3771
3807 if (c->rep_prefix && (c->d & String)) { 3772 if (ctxt->rep_prefix && (ctxt->d & String)) {
3808 /* All REP prefixes have the same first termination condition */ 3773 /* All REP prefixes have the same first termination condition */
3809 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 3774 if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) {
3810 ctxt->eip = c->eip; 3775 ctxt->eip = ctxt->_eip;
3811 goto done; 3776 goto done;
3812 } 3777 }
3813 } 3778 }
3814 3779
3815 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { 3780 if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) {
3816 rc = segmented_read(ctxt, c->src.addr.mem, 3781 rc = segmented_read(ctxt, ctxt->src.addr.mem,
3817 c->src.valptr, c->src.bytes); 3782 ctxt->src.valptr, ctxt->src.bytes);
3818 if (rc != X86EMUL_CONTINUE) 3783 if (rc != X86EMUL_CONTINUE)
3819 goto done; 3784 goto done;
3820 c->src.orig_val64 = c->src.val64; 3785 ctxt->src.orig_val64 = ctxt->src.val64;
3821 } 3786 }
3822 3787
3823 if (c->src2.type == OP_MEM) { 3788 if (ctxt->src2.type == OP_MEM) {
3824 rc = segmented_read(ctxt, c->src2.addr.mem, 3789 rc = segmented_read(ctxt, ctxt->src2.addr.mem,
3825 &c->src2.val, c->src2.bytes); 3790 &ctxt->src2.val, ctxt->src2.bytes);
3826 if (rc != X86EMUL_CONTINUE) 3791 if (rc != X86EMUL_CONTINUE)
3827 goto done; 3792 goto done;
3828 } 3793 }
3829 3794
3830 if ((c->d & DstMask) == ImplicitOps) 3795 if ((ctxt->d & DstMask) == ImplicitOps)
3831 goto special_insn; 3796 goto special_insn;
3832 3797
3833 3798
3834 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3799 if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) {
3835 /* optimisation - avoid slow emulated read if Mov */ 3800 /* optimisation - avoid slow emulated read if Mov */
3836 rc = segmented_read(ctxt, c->dst.addr.mem, 3801 rc = segmented_read(ctxt, ctxt->dst.addr.mem,
3837 &c->dst.val, c->dst.bytes); 3802 &ctxt->dst.val, ctxt->dst.bytes);
3838 if (rc != X86EMUL_CONTINUE) 3803 if (rc != X86EMUL_CONTINUE)
3839 goto done; 3804 goto done;
3840 } 3805 }
3841 c->dst.orig_val = c->dst.val; 3806 ctxt->dst.orig_val = ctxt->dst.val;
3842 3807
3843special_insn: 3808special_insn:
3844 3809
3845 if (unlikely(ctxt->guest_mode) && c->intercept) { 3810 if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
3846 rc = emulator_check_intercept(ctxt, c->intercept, 3811 rc = emulator_check_intercept(ctxt, ctxt->intercept,
3847 X86_ICPT_POST_MEMACCESS); 3812 X86_ICPT_POST_MEMACCESS);
3848 if (rc != X86EMUL_CONTINUE) 3813 if (rc != X86EMUL_CONTINUE)
3849 goto done; 3814 goto done;
3850 } 3815 }
3851 3816
3852 if (c->execute) { 3817 if (ctxt->execute) {
3853 rc = c->execute(ctxt); 3818 rc = ctxt->execute(ctxt);
3854 if (rc != X86EMUL_CONTINUE) 3819 if (rc != X86EMUL_CONTINUE)
3855 goto done; 3820 goto done;
3856 goto writeback; 3821 goto writeback;
3857 } 3822 }
3858 3823
3859 if (c->twobyte) 3824 if (ctxt->twobyte)
3860 goto twobyte_insn; 3825 goto twobyte_insn;
3861 3826
3862 switch (c->b) { 3827 switch (ctxt->b) {
3863 case 0x06: /* push es */ 3828 case 0x06: /* push es */
3864 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); 3829 rc = emulate_push_sreg(ctxt, VCPU_SREG_ES);
3865 break; 3830 break;
3866 case 0x07: /* pop es */ 3831 case 0x07: /* pop es */
3867 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 3832 rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES);
3868 break; 3833 break;
3869 case 0x0e: /* push cs */ 3834 case 0x0e: /* push cs */
3870 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); 3835 rc = emulate_push_sreg(ctxt, VCPU_SREG_CS);
3871 break; 3836 break;
3872 case 0x16: /* push ss */ 3837 case 0x16: /* push ss */
3873 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); 3838 rc = emulate_push_sreg(ctxt, VCPU_SREG_SS);
3874 break; 3839 break;
3875 case 0x17: /* pop ss */ 3840 case 0x17: /* pop ss */
3876 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 3841 rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS);
3877 break; 3842 break;
3878 case 0x1e: /* push ds */ 3843 case 0x1e: /* push ds */
3879 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); 3844 rc = emulate_push_sreg(ctxt, VCPU_SREG_DS);
3880 break; 3845 break;
3881 case 0x1f: /* pop ds */ 3846 case 0x1f: /* pop ds */
3882 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 3847 rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS);
3883 break; 3848 break;
3884 case 0x40 ... 0x47: /* inc r16/r32 */ 3849 case 0x40 ... 0x47: /* inc r16/r32 */
3885 emulate_1op("inc", c->dst, ctxt->eflags); 3850 emulate_1op("inc", ctxt->dst, ctxt->eflags);
3886 break; 3851 break;
3887 case 0x48 ... 0x4f: /* dec r16/r32 */ 3852 case 0x48 ... 0x4f: /* dec r16/r32 */
3888 emulate_1op("dec", c->dst, ctxt->eflags); 3853 emulate_1op("dec", ctxt->dst, ctxt->eflags);
3889 break; 3854 break;
3890 case 0x63: /* movsxd */ 3855 case 0x63: /* movsxd */
3891 if (ctxt->mode != X86EMUL_MODE_PROT64) 3856 if (ctxt->mode != X86EMUL_MODE_PROT64)
3892 goto cannot_emulate; 3857 goto cannot_emulate;
3893 c->dst.val = (s32) c->src.val; 3858 ctxt->dst.val = (s32) ctxt->src.val;
3894 break; 3859 break;
3895 case 0x6c: /* insb */ 3860 case 0x6c: /* insb */
3896 case 0x6d: /* insw/insd */ 3861 case 0x6d: /* insw/insd */
3897 c->src.val = c->regs[VCPU_REGS_RDX]; 3862 ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
3898 goto do_io_in; 3863 goto do_io_in;
3899 case 0x6e: /* outsb */ 3864 case 0x6e: /* outsb */
3900 case 0x6f: /* outsw/outsd */ 3865 case 0x6f: /* outsw/outsd */
3901 c->dst.val = c->regs[VCPU_REGS_RDX]; 3866 ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
3902 goto do_io_out; 3867 goto do_io_out;
3903 break; 3868 break;
3904 case 0x70 ... 0x7f: /* jcc (short) */ 3869 case 0x70 ... 0x7f: /* jcc (short) */
3905 if (test_cc(c->b, ctxt->eflags)) 3870 if (test_cc(ctxt->b, ctxt->eflags))
3906 jmp_rel(c, c->src.val); 3871 jmp_rel(ctxt, ctxt->src.val);
3907 break;
3908 case 0x84 ... 0x85:
3909 test:
3910 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
3911 break;
3912 case 0x86 ... 0x87: /* xchg */
3913 xchg:
3914 /* Write back the register source. */
3915 c->src.val = c->dst.val;
3916 write_register_operand(&c->src);
3917 /*
3918 * Write back the memory destination with implicit LOCK
3919 * prefix.
3920 */
3921 c->dst.val = c->src.orig_val;
3922 c->lock_prefix = 1;
3923 break;
3924 case 0x8c: /* mov r/m, sreg */
3925 if (c->modrm_reg > VCPU_SREG_GS) {
3926 rc = emulate_ud(ctxt);
3927 goto done;
3928 }
3929 c->dst.val = get_segment_selector(ctxt, c->modrm_reg);
3930 break; 3872 break;
3931 case 0x8d: /* lea r16/r32, m */ 3873 case 0x8d: /* lea r16/r32, m */
3932 c->dst.val = c->src.addr.mem.ea; 3874 ctxt->dst.val = ctxt->src.addr.mem.ea;
3933 break; 3875 break;
3934 case 0x8e: { /* mov seg, r/m16 */
3935 uint16_t sel;
3936
3937 sel = c->src.val;
3938
3939 if (c->modrm_reg == VCPU_SREG_CS ||
3940 c->modrm_reg > VCPU_SREG_GS) {
3941 rc = emulate_ud(ctxt);
3942 goto done;
3943 }
3944
3945 if (c->modrm_reg == VCPU_SREG_SS)
3946 ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
3947
3948 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
3949
3950 c->dst.type = OP_NONE; /* Disable writeback. */
3951 break;
3952 }
3953 case 0x8f: /* pop (sole member of Grp1a) */ 3876 case 0x8f: /* pop (sole member of Grp1a) */
3954 rc = em_grp1a(ctxt); 3877 rc = em_grp1a(ctxt);
3955 break; 3878 break;
3956 case 0x90 ... 0x97: /* nop / xchg reg, rax */ 3879 case 0x90 ... 0x97: /* nop / xchg reg, rax */
3957 if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) 3880 if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
3958 break; 3881 break;
3959 goto xchg; 3882 rc = em_xchg(ctxt);
3883 break;
3960 case 0x98: /* cbw/cwde/cdqe */ 3884 case 0x98: /* cbw/cwde/cdqe */
3961 switch (c->op_bytes) { 3885 switch (ctxt->op_bytes) {
3962 case 2: c->dst.val = (s8)c->dst.val; break; 3886 case 2: ctxt->dst.val = (s8)ctxt->dst.val; break;
3963 case 4: c->dst.val = (s16)c->dst.val; break; 3887 case 4: ctxt->dst.val = (s16)ctxt->dst.val; break;
3964 case 8: c->dst.val = (s32)c->dst.val; break; 3888 case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
3965 } 3889 }
3966 break; 3890 break;
3967 case 0xa8 ... 0xa9: /* test ax, imm */
3968 goto test;
3969 case 0xc0 ... 0xc1: 3891 case 0xc0 ... 0xc1:
3970 rc = em_grp2(ctxt); 3892 rc = em_grp2(ctxt);
3971 break; 3893 break;
3972 case 0xc3: /* ret */
3973 c->dst.type = OP_REG;
3974 c->dst.addr.reg = &c->eip;
3975 c->dst.bytes = c->op_bytes;
3976 rc = em_pop(ctxt);
3977 break;
3978 case 0xc4: /* les */ 3894 case 0xc4: /* les */
3979 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); 3895 rc = emulate_load_segment(ctxt, VCPU_SREG_ES);
3980 break; 3896 break;
3981 case 0xc5: /* lds */ 3897 case 0xc5: /* lds */
3982 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); 3898 rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
3983 break;
3984 case 0xcb: /* ret far */
3985 rc = emulate_ret_far(ctxt, ops);
3986 break; 3899 break;
3987 case 0xcc: /* int3 */ 3900 case 0xcc: /* int3 */
3988 irq = 3; 3901 rc = emulate_int(ctxt, 3);
3989 goto do_interrupt; 3902 break;
3990 case 0xcd: /* int n */ 3903 case 0xcd: /* int n */
3991 irq = c->src.val; 3904 rc = emulate_int(ctxt, ctxt->src.val);
3992 do_interrupt:
3993 rc = emulate_int(ctxt, ops, irq);
3994 break; 3905 break;
3995 case 0xce: /* into */ 3906 case 0xce: /* into */
3996 if (ctxt->eflags & EFLG_OF) { 3907 if (ctxt->eflags & EFLG_OF)
3997 irq = 4; 3908 rc = emulate_int(ctxt, 4);
3998 goto do_interrupt;
3999 }
4000 break;
4001 case 0xcf: /* iret */
4002 rc = emulate_iret(ctxt, ops);
4003 break; 3909 break;
4004 case 0xd0 ... 0xd1: /* Grp2 */ 3910 case 0xd0 ... 0xd1: /* Grp2 */
4005 rc = em_grp2(ctxt); 3911 rc = em_grp2(ctxt);
4006 break; 3912 break;
4007 case 0xd2 ... 0xd3: /* Grp2 */ 3913 case 0xd2 ... 0xd3: /* Grp2 */
4008 c->src.val = c->regs[VCPU_REGS_RCX]; 3914 ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
4009 rc = em_grp2(ctxt); 3915 rc = em_grp2(ctxt);
4010 break; 3916 break;
4011 case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
4012 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
4013 if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
4014 (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
4015 jmp_rel(c, c->src.val);
4016 break;
4017 case 0xe3: /* jcxz/jecxz/jrcxz */
4018 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
4019 jmp_rel(c, c->src.val);
4020 break;
4021 case 0xe4: /* inb */ 3917 case 0xe4: /* inb */
4022 case 0xe5: /* in */ 3918 case 0xe5: /* in */
4023 goto do_io_in; 3919 goto do_io_in;
@@ -4025,35 +3921,30 @@ special_insn:
4025 case 0xe7: /* out */ 3921 case 0xe7: /* out */
4026 goto do_io_out; 3922 goto do_io_out;
4027 case 0xe8: /* call (near) */ { 3923 case 0xe8: /* call (near) */ {
4028 long int rel = c->src.val; 3924 long int rel = ctxt->src.val;
4029 c->src.val = (unsigned long) c->eip; 3925 ctxt->src.val = (unsigned long) ctxt->_eip;
4030 jmp_rel(c, rel); 3926 jmp_rel(ctxt, rel);
4031 rc = em_push(ctxt); 3927 rc = em_push(ctxt);
4032 break; 3928 break;
4033 } 3929 }
4034 case 0xe9: /* jmp rel */ 3930 case 0xe9: /* jmp rel */
4035 goto jmp; 3931 case 0xeb: /* jmp rel short */
4036 case 0xea: /* jmp far */ 3932 jmp_rel(ctxt, ctxt->src.val);
4037 rc = em_jmp_far(ctxt); 3933 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4038 break;
4039 case 0xeb:
4040 jmp: /* jmp rel short */
4041 jmp_rel(c, c->src.val);
4042 c->dst.type = OP_NONE; /* Disable writeback. */
4043 break; 3934 break;
4044 case 0xec: /* in al,dx */ 3935 case 0xec: /* in al,dx */
4045 case 0xed: /* in (e/r)ax,dx */ 3936 case 0xed: /* in (e/r)ax,dx */
4046 do_io_in: 3937 do_io_in:
4047 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 3938 if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
4048 &c->dst.val)) 3939 &ctxt->dst.val))
4049 goto done; /* IO is needed */ 3940 goto done; /* IO is needed */
4050 break; 3941 break;
4051 case 0xee: /* out dx,al */ 3942 case 0xee: /* out dx,al */
4052 case 0xef: /* out dx,(e/r)ax */ 3943 case 0xef: /* out dx,(e/r)ax */
4053 do_io_out: 3944 do_io_out:
4054 ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val, 3945 ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
4055 &c->src.val, 1); 3946 &ctxt->src.val, 1);
4056 c->dst.type = OP_NONE; /* Disable writeback. */ 3947 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4057 break; 3948 break;
4058 case 0xf4: /* hlt */ 3949 case 0xf4: /* hlt */
4059 ctxt->ops->halt(ctxt); 3950 ctxt->ops->halt(ctxt);
@@ -4071,22 +3962,6 @@ special_insn:
4071 case 0xf9: /* stc */ 3962 case 0xf9: /* stc */
4072 ctxt->eflags |= EFLG_CF; 3963 ctxt->eflags |= EFLG_CF;
4073 break; 3964 break;
4074 case 0xfa: /* cli */
4075 if (emulator_bad_iopl(ctxt, ops)) {
4076 rc = emulate_gp(ctxt, 0);
4077 goto done;
4078 } else
4079 ctxt->eflags &= ~X86_EFLAGS_IF;
4080 break;
4081 case 0xfb: /* sti */
4082 if (emulator_bad_iopl(ctxt, ops)) {
4083 rc = emulate_gp(ctxt, 0);
4084 goto done;
4085 } else {
4086 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
4087 ctxt->eflags |= X86_EFLAGS_IF;
4088 }
4089 break;
4090 case 0xfc: /* cld */ 3965 case 0xfc: /* cld */
4091 ctxt->eflags &= ~EFLG_DF; 3966 ctxt->eflags &= ~EFLG_DF;
4092 break; 3967 break;
@@ -4115,40 +3990,40 @@ writeback:
4115 * restore dst type in case the decoding will be reused 3990 * restore dst type in case the decoding will be reused
4116 * (happens for string instruction ) 3991 * (happens for string instruction )
4117 */ 3992 */
4118 c->dst.type = saved_dst_type; 3993 ctxt->dst.type = saved_dst_type;
4119 3994
4120 if ((c->d & SrcMask) == SrcSI) 3995 if ((ctxt->d & SrcMask) == SrcSI)
4121 string_addr_inc(ctxt, seg_override(ctxt, c), 3996 string_addr_inc(ctxt, seg_override(ctxt),
4122 VCPU_REGS_RSI, &c->src); 3997 VCPU_REGS_RSI, &ctxt->src);
4123 3998
4124 if ((c->d & DstMask) == DstDI) 3999 if ((ctxt->d & DstMask) == DstDI)
4125 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, 4000 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
4126 &c->dst); 4001 &ctxt->dst);
4127 4002
4128 if (c->rep_prefix && (c->d & String)) { 4003 if (ctxt->rep_prefix && (ctxt->d & String)) {
4129 struct read_cache *r = &ctxt->decode.io_read; 4004 struct read_cache *r = &ctxt->io_read;
4130 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); 4005 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
4131 4006
4132 if (!string_insn_completed(ctxt)) { 4007 if (!string_insn_completed(ctxt)) {
4133 /* 4008 /*
4134 * Re-enter guest when pio read ahead buffer is empty 4009 * Re-enter guest when pio read ahead buffer is empty
4135 * or, if it is not used, after each 1024 iteration. 4010 * or, if it is not used, after each 1024 iteration.
4136 */ 4011 */
4137 if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && 4012 if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) &&
4138 (r->end == 0 || r->end != r->pos)) { 4013 (r->end == 0 || r->end != r->pos)) {
4139 /* 4014 /*
4140 * Reset read cache. Usually happens before 4015 * Reset read cache. Usually happens before
4141 * decode, but since instruction is restarted 4016 * decode, but since instruction is restarted
4142 * we have to do it here. 4017 * we have to do it here.
4143 */ 4018 */
4144 ctxt->decode.mem_read.end = 0; 4019 ctxt->mem_read.end = 0;
4145 return EMULATION_RESTART; 4020 return EMULATION_RESTART;
4146 } 4021 }
4147 goto done; /* skip rip writeback */ 4022 goto done; /* skip rip writeback */
4148 } 4023 }
4149 } 4024 }
4150 4025
4151 ctxt->eip = c->eip; 4026 ctxt->eip = ctxt->_eip;
4152 4027
4153done: 4028done:
4154 if (rc == X86EMUL_PROPAGATE_FAULT) 4029 if (rc == X86EMUL_PROPAGATE_FAULT)
@@ -4159,13 +4034,7 @@ done:
4159 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 4034 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
4160 4035
4161twobyte_insn: 4036twobyte_insn:
4162 switch (c->b) { 4037 switch (ctxt->b) {
4163 case 0x05: /* syscall */
4164 rc = emulate_syscall(ctxt, ops);
4165 break;
4166 case 0x06:
4167 rc = em_clts(ctxt);
4168 break;
4169 case 0x09: /* wbinvd */ 4038 case 0x09: /* wbinvd */
4170 (ctxt->ops->wbinvd)(ctxt); 4039 (ctxt->ops->wbinvd)(ctxt);
4171 break; 4040 break;
@@ -4174,21 +4043,21 @@ twobyte_insn:
4174 case 0x18: /* Grp16 (prefetch/nop) */ 4043 case 0x18: /* Grp16 (prefetch/nop) */
4175 break; 4044 break;
4176 case 0x20: /* mov cr, reg */ 4045 case 0x20: /* mov cr, reg */
4177 c->dst.val = ops->get_cr(ctxt, c->modrm_reg); 4046 ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
4178 break; 4047 break;
4179 case 0x21: /* mov from dr to reg */ 4048 case 0x21: /* mov from dr to reg */
4180 ops->get_dr(ctxt, c->modrm_reg, &c->dst.val); 4049 ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
4181 break; 4050 break;
4182 case 0x22: /* mov reg, cr */ 4051 case 0x22: /* mov reg, cr */
4183 if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) { 4052 if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) {
4184 emulate_gp(ctxt, 0); 4053 emulate_gp(ctxt, 0);
4185 rc = X86EMUL_PROPAGATE_FAULT; 4054 rc = X86EMUL_PROPAGATE_FAULT;
4186 goto done; 4055 goto done;
4187 } 4056 }
4188 c->dst.type = OP_NONE; 4057 ctxt->dst.type = OP_NONE;
4189 break; 4058 break;
4190 case 0x23: /* mov from reg to dr */ 4059 case 0x23: /* mov from reg to dr */
4191 if (ops->set_dr(ctxt, c->modrm_reg, c->src.val & 4060 if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val &
4192 ((ctxt->mode == X86EMUL_MODE_PROT64) ? 4061 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
4193 ~0ULL : ~0U)) < 0) { 4062 ~0ULL : ~0U)) < 0) {
4194 /* #UD condition is already handled by the code above */ 4063 /* #UD condition is already handled by the code above */
@@ -4197,13 +4066,13 @@ twobyte_insn:
4197 goto done; 4066 goto done;
4198 } 4067 }
4199 4068
4200 c->dst.type = OP_NONE; /* no writeback */ 4069 ctxt->dst.type = OP_NONE; /* no writeback */
4201 break; 4070 break;
4202 case 0x30: 4071 case 0x30:
4203 /* wrmsr */ 4072 /* wrmsr */
4204 msr_data = (u32)c->regs[VCPU_REGS_RAX] 4073 msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
4205 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 4074 | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
4206 if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) { 4075 if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) {
4207 emulate_gp(ctxt, 0); 4076 emulate_gp(ctxt, 0);
4208 rc = X86EMUL_PROPAGATE_FAULT; 4077 rc = X86EMUL_PROPAGATE_FAULT;
4209 goto done; 4078 goto done;
@@ -4212,64 +4081,58 @@ twobyte_insn:
4212 break; 4081 break;
4213 case 0x32: 4082 case 0x32:
4214 /* rdmsr */ 4083 /* rdmsr */
4215 if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) { 4084 if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) {
4216 emulate_gp(ctxt, 0); 4085 emulate_gp(ctxt, 0);
4217 rc = X86EMUL_PROPAGATE_FAULT; 4086 rc = X86EMUL_PROPAGATE_FAULT;
4218 goto done; 4087 goto done;
4219 } else { 4088 } else {
4220 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 4089 ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
4221 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 4090 ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
4222 } 4091 }
4223 rc = X86EMUL_CONTINUE; 4092 rc = X86EMUL_CONTINUE;
4224 break; 4093 break;
4225 case 0x34: /* sysenter */
4226 rc = emulate_sysenter(ctxt, ops);
4227 break;
4228 case 0x35: /* sysexit */
4229 rc = emulate_sysexit(ctxt, ops);
4230 break;
4231 case 0x40 ... 0x4f: /* cmov */ 4094 case 0x40 ... 0x4f: /* cmov */
4232 c->dst.val = c->dst.orig_val = c->src.val; 4095 ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
4233 if (!test_cc(c->b, ctxt->eflags)) 4096 if (!test_cc(ctxt->b, ctxt->eflags))
4234 c->dst.type = OP_NONE; /* no writeback */ 4097 ctxt->dst.type = OP_NONE; /* no writeback */
4235 break; 4098 break;
4236 case 0x80 ... 0x8f: /* jnz rel, etc*/ 4099 case 0x80 ... 0x8f: /* jnz rel, etc*/
4237 if (test_cc(c->b, ctxt->eflags)) 4100 if (test_cc(ctxt->b, ctxt->eflags))
4238 jmp_rel(c, c->src.val); 4101 jmp_rel(ctxt, ctxt->src.val);
4239 break; 4102 break;
4240 case 0x90 ... 0x9f: /* setcc r/m8 */ 4103 case 0x90 ... 0x9f: /* setcc r/m8 */
4241 c->dst.val = test_cc(c->b, ctxt->eflags); 4104 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
4242 break; 4105 break;
4243 case 0xa0: /* push fs */ 4106 case 0xa0: /* push fs */
4244 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); 4107 rc = emulate_push_sreg(ctxt, VCPU_SREG_FS);
4245 break; 4108 break;
4246 case 0xa1: /* pop fs */ 4109 case 0xa1: /* pop fs */
4247 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 4110 rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS);
4248 break; 4111 break;
4249 case 0xa3: 4112 case 0xa3:
4250 bt: /* bt */ 4113 bt: /* bt */
4251 c->dst.type = OP_NONE; 4114 ctxt->dst.type = OP_NONE;
4252 /* only subword offset */ 4115 /* only subword offset */
4253 c->src.val &= (c->dst.bytes << 3) - 1; 4116 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
4254 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); 4117 emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags);
4255 break; 4118 break;
4256 case 0xa4: /* shld imm8, r, r/m */ 4119 case 0xa4: /* shld imm8, r, r/m */
4257 case 0xa5: /* shld cl, r, r/m */ 4120 case 0xa5: /* shld cl, r, r/m */
4258 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 4121 emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
4259 break; 4122 break;
4260 case 0xa8: /* push gs */ 4123 case 0xa8: /* push gs */
4261 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); 4124 rc = emulate_push_sreg(ctxt, VCPU_SREG_GS);
4262 break; 4125 break;
4263 case 0xa9: /* pop gs */ 4126 case 0xa9: /* pop gs */
4264 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 4127 rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS);
4265 break; 4128 break;
4266 case 0xab: 4129 case 0xab:
4267 bts: /* bts */ 4130 bts: /* bts */
4268 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 4131 emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags);
4269 break; 4132 break;
4270 case 0xac: /* shrd imm8, r, r/m */ 4133 case 0xac: /* shrd imm8, r, r/m */
4271 case 0xad: /* shrd cl, r, r/m */ 4134 case 0xad: /* shrd cl, r, r/m */
4272 emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); 4135 emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
4273 break; 4136 break;
4274 case 0xae: /* clflush */ 4137 case 0xae: /* clflush */
4275 break; 4138 break;
@@ -4278,38 +4141,38 @@ twobyte_insn:
4278 * Save real source value, then compare EAX against 4141 * Save real source value, then compare EAX against
4279 * destination. 4142 * destination.
4280 */ 4143 */
4281 c->src.orig_val = c->src.val; 4144 ctxt->src.orig_val = ctxt->src.val;
4282 c->src.val = c->regs[VCPU_REGS_RAX]; 4145 ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
4283 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); 4146 emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
4284 if (ctxt->eflags & EFLG_ZF) { 4147 if (ctxt->eflags & EFLG_ZF) {
4285 /* Success: write back to memory. */ 4148 /* Success: write back to memory. */
4286 c->dst.val = c->src.orig_val; 4149 ctxt->dst.val = ctxt->src.orig_val;
4287 } else { 4150 } else {
4288 /* Failure: write the value we saw to EAX. */ 4151 /* Failure: write the value we saw to EAX. */
4289 c->dst.type = OP_REG; 4152 ctxt->dst.type = OP_REG;
4290 c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 4153 ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
4291 } 4154 }
4292 break; 4155 break;
4293 case 0xb2: /* lss */ 4156 case 0xb2: /* lss */
4294 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); 4157 rc = emulate_load_segment(ctxt, VCPU_SREG_SS);
4295 break; 4158 break;
4296 case 0xb3: 4159 case 0xb3:
4297 btr: /* btr */ 4160 btr: /* btr */
4298 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); 4161 emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags);
4299 break; 4162 break;
4300 case 0xb4: /* lfs */ 4163 case 0xb4: /* lfs */
4301 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); 4164 rc = emulate_load_segment(ctxt, VCPU_SREG_FS);
4302 break; 4165 break;
4303 case 0xb5: /* lgs */ 4166 case 0xb5: /* lgs */
4304 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); 4167 rc = emulate_load_segment(ctxt, VCPU_SREG_GS);
4305 break; 4168 break;
4306 case 0xb6 ... 0xb7: /* movzx */ 4169 case 0xb6 ... 0xb7: /* movzx */
4307 c->dst.bytes = c->op_bytes; 4170 ctxt->dst.bytes = ctxt->op_bytes;
4308 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val 4171 ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
4309 : (u16) c->src.val; 4172 : (u16) ctxt->src.val;
4310 break; 4173 break;
4311 case 0xba: /* Grp8 */ 4174 case 0xba: /* Grp8 */
4312 switch (c->modrm_reg & 3) { 4175 switch (ctxt->modrm_reg & 3) {
4313 case 0: 4176 case 0:
4314 goto bt; 4177 goto bt;
4315 case 1: 4178 case 1:
@@ -4322,47 +4185,47 @@ twobyte_insn:
4322 break; 4185 break;
4323 case 0xbb: 4186 case 0xbb:
4324 btc: /* btc */ 4187 btc: /* btc */
4325 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); 4188 emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags);
4326 break; 4189 break;
4327 case 0xbc: { /* bsf */ 4190 case 0xbc: { /* bsf */
4328 u8 zf; 4191 u8 zf;
4329 __asm__ ("bsf %2, %0; setz %1" 4192 __asm__ ("bsf %2, %0; setz %1"
4330 : "=r"(c->dst.val), "=q"(zf) 4193 : "=r"(ctxt->dst.val), "=q"(zf)
4331 : "r"(c->src.val)); 4194 : "r"(ctxt->src.val));
4332 ctxt->eflags &= ~X86_EFLAGS_ZF; 4195 ctxt->eflags &= ~X86_EFLAGS_ZF;
4333 if (zf) { 4196 if (zf) {
4334 ctxt->eflags |= X86_EFLAGS_ZF; 4197 ctxt->eflags |= X86_EFLAGS_ZF;
4335 c->dst.type = OP_NONE; /* Disable writeback. */ 4198 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4336 } 4199 }
4337 break; 4200 break;
4338 } 4201 }
4339 case 0xbd: { /* bsr */ 4202 case 0xbd: { /* bsr */
4340 u8 zf; 4203 u8 zf;
4341 __asm__ ("bsr %2, %0; setz %1" 4204 __asm__ ("bsr %2, %0; setz %1"
4342 : "=r"(c->dst.val), "=q"(zf) 4205 : "=r"(ctxt->dst.val), "=q"(zf)
4343 : "r"(c->src.val)); 4206 : "r"(ctxt->src.val));
4344 ctxt->eflags &= ~X86_EFLAGS_ZF; 4207 ctxt->eflags &= ~X86_EFLAGS_ZF;
4345 if (zf) { 4208 if (zf) {
4346 ctxt->eflags |= X86_EFLAGS_ZF; 4209 ctxt->eflags |= X86_EFLAGS_ZF;
4347 c->dst.type = OP_NONE; /* Disable writeback. */ 4210 ctxt->dst.type = OP_NONE; /* Disable writeback. */
4348 } 4211 }
4349 break; 4212 break;
4350 } 4213 }
4351 case 0xbe ... 0xbf: /* movsx */ 4214 case 0xbe ... 0xbf: /* movsx */
4352 c->dst.bytes = c->op_bytes; 4215 ctxt->dst.bytes = ctxt->op_bytes;
4353 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : 4216 ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
4354 (s16) c->src.val; 4217 (s16) ctxt->src.val;
4355 break; 4218 break;
4356 case 0xc0 ... 0xc1: /* xadd */ 4219 case 0xc0 ... 0xc1: /* xadd */
4357 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 4220 emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
4358 /* Write back the register source. */ 4221 /* Write back the register source. */
4359 c->src.val = c->dst.orig_val; 4222 ctxt->src.val = ctxt->dst.orig_val;
4360 write_register_operand(&c->src); 4223 write_register_operand(&ctxt->src);
4361 break; 4224 break;
4362 case 0xc3: /* movnti */ 4225 case 0xc3: /* movnti */
4363 c->dst.bytes = c->op_bytes; 4226 ctxt->dst.bytes = ctxt->op_bytes;
4364 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : 4227 ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
4365 (u64) c->src.val; 4228 (u64) ctxt->src.val;
4366 break; 4229 break;
4367 case 0xc7: /* Grp9 (cmpxchg8b) */ 4230 case 0xc7: /* Grp9 (cmpxchg8b) */
4368 rc = em_grp9(ctxt); 4231 rc = em_grp9(ctxt);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aee38623b768..9335e1bf72ad 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,7 @@ module_param(oos_shadow, bool, 0644);
148#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 148#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
149 | PT64_NX_MASK) 149 | PT64_NX_MASK)
150 150
151#define RMAP_EXT 4 151#define PTE_LIST_EXT 4
152 152
153#define ACC_EXEC_MASK 1 153#define ACC_EXEC_MASK 1
154#define ACC_WRITE_MASK PT_WRITABLE_MASK 154#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -164,16 +164,16 @@ module_param(oos_shadow, bool, 0644);
164 164
165#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 165#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
166 166
167struct kvm_rmap_desc { 167struct pte_list_desc {
168 u64 *sptes[RMAP_EXT]; 168 u64 *sptes[PTE_LIST_EXT];
169 struct kvm_rmap_desc *more; 169 struct pte_list_desc *more;
170}; 170};
171 171
172struct kvm_shadow_walk_iterator { 172struct kvm_shadow_walk_iterator {
173 u64 addr; 173 u64 addr;
174 hpa_t shadow_addr; 174 hpa_t shadow_addr;
175 int level;
176 u64 *sptep; 175 u64 *sptep;
176 int level;
177 unsigned index; 177 unsigned index;
178}; 178};
179 179
@@ -182,32 +182,68 @@ struct kvm_shadow_walk_iterator {
182 shadow_walk_okay(&(_walker)); \ 182 shadow_walk_okay(&(_walker)); \
183 shadow_walk_next(&(_walker))) 183 shadow_walk_next(&(_walker)))
184 184
185typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); 185#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
186 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
187 shadow_walk_okay(&(_walker)) && \
188 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
189 __shadow_walk_next(&(_walker), spte))
186 190
187static struct kmem_cache *pte_chain_cache; 191static struct kmem_cache *pte_list_desc_cache;
188static struct kmem_cache *rmap_desc_cache;
189static struct kmem_cache *mmu_page_header_cache; 192static struct kmem_cache *mmu_page_header_cache;
190static struct percpu_counter kvm_total_used_mmu_pages; 193static struct percpu_counter kvm_total_used_mmu_pages;
191 194
192static u64 __read_mostly shadow_trap_nonpresent_pte;
193static u64 __read_mostly shadow_notrap_nonpresent_pte;
194static u64 __read_mostly shadow_nx_mask; 195static u64 __read_mostly shadow_nx_mask;
195static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ 196static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
196static u64 __read_mostly shadow_user_mask; 197static u64 __read_mostly shadow_user_mask;
197static u64 __read_mostly shadow_accessed_mask; 198static u64 __read_mostly shadow_accessed_mask;
198static u64 __read_mostly shadow_dirty_mask; 199static u64 __read_mostly shadow_dirty_mask;
200static u64 __read_mostly shadow_mmio_mask;
199 201
200static inline u64 rsvd_bits(int s, int e) 202static void mmu_spte_set(u64 *sptep, u64 spte);
203
204void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
201{ 205{
202 return ((1ULL << (e - s + 1)) - 1) << s; 206 shadow_mmio_mask = mmio_mask;
207}
208EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
209
210static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
211{
212 access &= ACC_WRITE_MASK | ACC_USER_MASK;
213
214 trace_mark_mmio_spte(sptep, gfn, access);
215 mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
203} 216}
204 217
205void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) 218static bool is_mmio_spte(u64 spte)
206{ 219{
207 shadow_trap_nonpresent_pte = trap_pte; 220 return (spte & shadow_mmio_mask) == shadow_mmio_mask;
208 shadow_notrap_nonpresent_pte = notrap_pte; 221}
222
223static gfn_t get_mmio_spte_gfn(u64 spte)
224{
225 return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
226}
227
228static unsigned get_mmio_spte_access(u64 spte)
229{
230 return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
231}
232
233static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
234{
235 if (unlikely(is_noslot_pfn(pfn))) {
236 mark_mmio_spte(sptep, gfn, access);
237 return true;
238 }
239
240 return false;
241}
242
243static inline u64 rsvd_bits(int s, int e)
244{
245 return ((1ULL << (e - s + 1)) - 1) << s;
209} 246}
210EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
211 247
212void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 248void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
213 u64 dirty_mask, u64 nx_mask, u64 x_mask) 249 u64 dirty_mask, u64 nx_mask, u64 x_mask)
@@ -220,11 +256,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
220} 256}
221EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 257EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
222 258
223static bool is_write_protection(struct kvm_vcpu *vcpu)
224{
225 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
226}
227
228static int is_cpuid_PSE36(void) 259static int is_cpuid_PSE36(void)
229{ 260{
230 return 1; 261 return 1;
@@ -237,8 +268,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
237 268
238static int is_shadow_present_pte(u64 pte) 269static int is_shadow_present_pte(u64 pte)
239{ 270{
240 return pte != shadow_trap_nonpresent_pte 271 return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
241 && pte != shadow_notrap_nonpresent_pte;
242} 272}
243 273
244static int is_large_pte(u64 pte) 274static int is_large_pte(u64 pte)
@@ -246,11 +276,6 @@ static int is_large_pte(u64 pte)
246 return pte & PT_PAGE_SIZE_MASK; 276 return pte & PT_PAGE_SIZE_MASK;
247} 277}
248 278
249static int is_writable_pte(unsigned long pte)
250{
251 return pte & PT_WRITABLE_MASK;
252}
253
254static int is_dirty_gpte(unsigned long pte) 279static int is_dirty_gpte(unsigned long pte)
255{ 280{
256 return pte & PT_DIRTY_MASK; 281 return pte & PT_DIRTY_MASK;
@@ -282,26 +307,154 @@ static gfn_t pse36_gfn_delta(u32 gpte)
282 return (gpte & PT32_DIR_PSE36_MASK) << shift; 307 return (gpte & PT32_DIR_PSE36_MASK) << shift;
283} 308}
284 309
310#ifdef CONFIG_X86_64
285static void __set_spte(u64 *sptep, u64 spte) 311static void __set_spte(u64 *sptep, u64 spte)
286{ 312{
287 set_64bit(sptep, spte); 313 *sptep = spte;
288} 314}
289 315
290static u64 __xchg_spte(u64 *sptep, u64 new_spte) 316static void __update_clear_spte_fast(u64 *sptep, u64 spte)
291{ 317{
292#ifdef CONFIG_X86_64 318 *sptep = spte;
293 return xchg(sptep, new_spte); 319}
320
321static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
322{
323 return xchg(sptep, spte);
324}
325
326static u64 __get_spte_lockless(u64 *sptep)
327{
328 return ACCESS_ONCE(*sptep);
329}
330
331static bool __check_direct_spte_mmio_pf(u64 spte)
332{
333 /* It is valid if the spte is zapped. */
334 return spte == 0ull;
335}
294#else 336#else
295 u64 old_spte; 337union split_spte {
338 struct {
339 u32 spte_low;
340 u32 spte_high;
341 };
342 u64 spte;
343};
296 344
297 do { 345static void count_spte_clear(u64 *sptep, u64 spte)
298 old_spte = *sptep; 346{
299 } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); 347 struct kvm_mmu_page *sp = page_header(__pa(sptep));
300 348
301 return old_spte; 349 if (is_shadow_present_pte(spte))
302#endif 350 return;
351
352 /* Ensure the spte is completely set before we increase the count */
353 smp_wmb();
354 sp->clear_spte_count++;
355}
356
357static void __set_spte(u64 *sptep, u64 spte)
358{
359 union split_spte *ssptep, sspte;
360
361 ssptep = (union split_spte *)sptep;
362 sspte = (union split_spte)spte;
363
364 ssptep->spte_high = sspte.spte_high;
365
366 /*
367 * If we map the spte from nonpresent to present, We should store
368 * the high bits firstly, then set present bit, so cpu can not
369 * fetch this spte while we are setting the spte.
370 */
371 smp_wmb();
372
373 ssptep->spte_low = sspte.spte_low;
303} 374}
304 375
376static void __update_clear_spte_fast(u64 *sptep, u64 spte)
377{
378 union split_spte *ssptep, sspte;
379
380 ssptep = (union split_spte *)sptep;
381 sspte = (union split_spte)spte;
382
383 ssptep->spte_low = sspte.spte_low;
384
385 /*
386 * If we map the spte from present to nonpresent, we should clear
387 * present bit firstly to avoid vcpu fetch the old high bits.
388 */
389 smp_wmb();
390
391 ssptep->spte_high = sspte.spte_high;
392 count_spte_clear(sptep, spte);
393}
394
395static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
396{
397 union split_spte *ssptep, sspte, orig;
398
399 ssptep = (union split_spte *)sptep;
400 sspte = (union split_spte)spte;
401
402 /* xchg acts as a barrier before the setting of the high bits */
403 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
404 orig.spte_high = ssptep->spte_high = sspte.spte_high;
405 count_spte_clear(sptep, spte);
406
407 return orig.spte;
408}
409
410/*
411 * The idea using the light way get the spte on x86_32 guest is from
412 * gup_get_pte(arch/x86/mm/gup.c).
413 * The difference is we can not catch the spte tlb flush if we leave
414 * guest mode, so we emulate it by increase clear_spte_count when spte
415 * is cleared.
416 */
417static u64 __get_spte_lockless(u64 *sptep)
418{
419 struct kvm_mmu_page *sp = page_header(__pa(sptep));
420 union split_spte spte, *orig = (union split_spte *)sptep;
421 int count;
422
423retry:
424 count = sp->clear_spte_count;
425 smp_rmb();
426
427 spte.spte_low = orig->spte_low;
428 smp_rmb();
429
430 spte.spte_high = orig->spte_high;
431 smp_rmb();
432
433 if (unlikely(spte.spte_low != orig->spte_low ||
434 count != sp->clear_spte_count))
435 goto retry;
436
437 return spte.spte;
438}
439
440static bool __check_direct_spte_mmio_pf(u64 spte)
441{
442 union split_spte sspte = (union split_spte)spte;
443 u32 high_mmio_mask = shadow_mmio_mask >> 32;
444
445 /* It is valid if the spte is zapped. */
446 if (spte == 0ull)
447 return true;
448
449 /* It is valid if the spte is being zapped. */
450 if (sspte.spte_low == 0ull &&
451 (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
452 return true;
453
454 return false;
455}
456#endif
457
305static bool spte_has_volatile_bits(u64 spte) 458static bool spte_has_volatile_bits(u64 spte)
306{ 459{
307 if (!shadow_accessed_mask) 460 if (!shadow_accessed_mask)
@@ -322,12 +475,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
322 return (old_spte & bit_mask) && !(new_spte & bit_mask); 475 return (old_spte & bit_mask) && !(new_spte & bit_mask);
323} 476}
324 477
325static void update_spte(u64 *sptep, u64 new_spte) 478/* Rules for using mmu_spte_set:
479 * Set the sptep from nonpresent to present.
480 * Note: the sptep being assigned *must* be either not present
481 * or in a state where the hardware will not attempt to update
482 * the spte.
483 */
484static void mmu_spte_set(u64 *sptep, u64 new_spte)
485{
486 WARN_ON(is_shadow_present_pte(*sptep));
487 __set_spte(sptep, new_spte);
488}
489
490/* Rules for using mmu_spte_update:
491 * Update the state bits, it means the mapped pfn is not changged.
492 */
493static void mmu_spte_update(u64 *sptep, u64 new_spte)
326{ 494{
327 u64 mask, old_spte = *sptep; 495 u64 mask, old_spte = *sptep;
328 496
329 WARN_ON(!is_rmap_spte(new_spte)); 497 WARN_ON(!is_rmap_spte(new_spte));
330 498
499 if (!is_shadow_present_pte(old_spte))
500 return mmu_spte_set(sptep, new_spte);
501
331 new_spte |= old_spte & shadow_dirty_mask; 502 new_spte |= old_spte & shadow_dirty_mask;
332 503
333 mask = shadow_accessed_mask; 504 mask = shadow_accessed_mask;
@@ -335,9 +506,9 @@ static void update_spte(u64 *sptep, u64 new_spte)
335 mask |= shadow_dirty_mask; 506 mask |= shadow_dirty_mask;
336 507
337 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) 508 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
338 __set_spte(sptep, new_spte); 509 __update_clear_spte_fast(sptep, new_spte);
339 else 510 else
340 old_spte = __xchg_spte(sptep, new_spte); 511 old_spte = __update_clear_spte_slow(sptep, new_spte);
341 512
342 if (!shadow_accessed_mask) 513 if (!shadow_accessed_mask)
343 return; 514 return;
@@ -348,6 +519,64 @@ static void update_spte(u64 *sptep, u64 new_spte)
348 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 519 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
349} 520}
350 521
522/*
523 * Rules for using mmu_spte_clear_track_bits:
524 * It sets the sptep from present to nonpresent, and track the
525 * state bits, it is used to clear the last level sptep.
526 */
527static int mmu_spte_clear_track_bits(u64 *sptep)
528{
529 pfn_t pfn;
530 u64 old_spte = *sptep;
531
532 if (!spte_has_volatile_bits(old_spte))
533 __update_clear_spte_fast(sptep, 0ull);
534 else
535 old_spte = __update_clear_spte_slow(sptep, 0ull);
536
537 if (!is_rmap_spte(old_spte))
538 return 0;
539
540 pfn = spte_to_pfn(old_spte);
541 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
542 kvm_set_pfn_accessed(pfn);
543 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
544 kvm_set_pfn_dirty(pfn);
545 return 1;
546}
547
548/*
549 * Rules for using mmu_spte_clear_no_track:
550 * Directly clear spte without caring the state bits of sptep,
551 * it is used to set the upper level spte.
552 */
553static void mmu_spte_clear_no_track(u64 *sptep)
554{
555 __update_clear_spte_fast(sptep, 0ull);
556}
557
558static u64 mmu_spte_get_lockless(u64 *sptep)
559{
560 return __get_spte_lockless(sptep);
561}
562
563static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
564{
565 rcu_read_lock();
566 atomic_inc(&vcpu->kvm->arch.reader_counter);
567
568 /* Increase the counter before walking shadow page table */
569 smp_mb__after_atomic_inc();
570}
571
572static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
573{
574 /* Decrease the counter after walking shadow page table finished */
575 smp_mb__before_atomic_dec();
576 atomic_dec(&vcpu->kvm->arch.reader_counter);
577 rcu_read_unlock();
578}
579
351static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 580static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
352 struct kmem_cache *base_cache, int min) 581 struct kmem_cache *base_cache, int min)
353{ 582{
@@ -397,12 +626,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
397{ 626{
398 int r; 627 int r;
399 628
400 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, 629 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
401 pte_chain_cache, 4); 630 pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
402 if (r)
403 goto out;
404 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
405 rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
406 if (r) 631 if (r)
407 goto out; 632 goto out;
408 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); 633 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -416,8 +641,8 @@ out:
416 641
417static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 642static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
418{ 643{
419 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); 644 mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
420 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); 645 pte_list_desc_cache);
421 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); 646 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
422 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, 647 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
423 mmu_page_header_cache); 648 mmu_page_header_cache);
@@ -433,26 +658,15 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
433 return p; 658 return p;
434} 659}
435 660
436static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) 661static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
437{
438 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
439 sizeof(struct kvm_pte_chain));
440}
441
442static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
443{ 662{
444 kmem_cache_free(pte_chain_cache, pc); 663 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
664 sizeof(struct pte_list_desc));
445} 665}
446 666
447static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) 667static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
448{ 668{
449 return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, 669 kmem_cache_free(pte_list_desc_cache, pte_list_desc);
450 sizeof(struct kvm_rmap_desc));
451}
452
453static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
454{
455 kmem_cache_free(rmap_desc_cache, rd);
456} 670}
457 671
458static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) 672static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
@@ -498,6 +712,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
498 linfo = lpage_info_slot(gfn, slot, i); 712 linfo = lpage_info_slot(gfn, slot, i);
499 linfo->write_count += 1; 713 linfo->write_count += 1;
500 } 714 }
715 kvm->arch.indirect_shadow_pages++;
501} 716}
502 717
503static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 718static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -513,6 +728,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
513 linfo->write_count -= 1; 728 linfo->write_count -= 1;
514 WARN_ON(linfo->write_count < 0); 729 WARN_ON(linfo->write_count < 0);
515 } 730 }
731 kvm->arch.indirect_shadow_pages--;
516} 732}
517 733
518static int has_wrprotected_page(struct kvm *kvm, 734static int has_wrprotected_page(struct kvm *kvm,
@@ -588,67 +804,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
588} 804}
589 805
590/* 806/*
591 * Take gfn and return the reverse mapping to it. 807 * Pte mapping structures:
592 */
593
594static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
595{
596 struct kvm_memory_slot *slot;
597 struct kvm_lpage_info *linfo;
598
599 slot = gfn_to_memslot(kvm, gfn);
600 if (likely(level == PT_PAGE_TABLE_LEVEL))
601 return &slot->rmap[gfn - slot->base_gfn];
602
603 linfo = lpage_info_slot(gfn, slot, level);
604
605 return &linfo->rmap_pde;
606}
607
608/*
609 * Reverse mapping data structures:
610 * 808 *
611 * If rmapp bit zero is zero, then rmapp point to the shadw page table entry 809 * If pte_list bit zero is zero, then pte_list point to the spte.
612 * that points to page_address(page).
613 * 810 *
614 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc 811 * If pte_list bit zero is one, (then pte_list & ~1) points to a struct
615 * containing more mappings. 812 * pte_list_desc containing more mappings.
616 * 813 *
617 * Returns the number of rmap entries before the spte was added or zero if 814 * Returns the number of pte entries before the spte was added or zero if
618 * the spte was not added. 815 * the spte was not added.
619 * 816 *
620 */ 817 */
621static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 818static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
819 unsigned long *pte_list)
622{ 820{
623 struct kvm_mmu_page *sp; 821 struct pte_list_desc *desc;
624 struct kvm_rmap_desc *desc;
625 unsigned long *rmapp;
626 int i, count = 0; 822 int i, count = 0;
627 823
628 if (!is_rmap_spte(*spte)) 824 if (!*pte_list) {
629 return count; 825 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
630 sp = page_header(__pa(spte)); 826 *pte_list = (unsigned long)spte;
631 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); 827 } else if (!(*pte_list & 1)) {
632 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 828 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
633 if (!*rmapp) { 829 desc = mmu_alloc_pte_list_desc(vcpu);
634 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 830 desc->sptes[0] = (u64 *)*pte_list;
635 *rmapp = (unsigned long)spte;
636 } else if (!(*rmapp & 1)) {
637 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
638 desc = mmu_alloc_rmap_desc(vcpu);
639 desc->sptes[0] = (u64 *)*rmapp;
640 desc->sptes[1] = spte; 831 desc->sptes[1] = spte;
641 *rmapp = (unsigned long)desc | 1; 832 *pte_list = (unsigned long)desc | 1;
642 ++count; 833 ++count;
643 } else { 834 } else {
644 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 835 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
645 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 836 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
646 while (desc->sptes[RMAP_EXT-1] && desc->more) { 837 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
647 desc = desc->more; 838 desc = desc->more;
648 count += RMAP_EXT; 839 count += PTE_LIST_EXT;
649 } 840 }
650 if (desc->sptes[RMAP_EXT-1]) { 841 if (desc->sptes[PTE_LIST_EXT-1]) {
651 desc->more = mmu_alloc_rmap_desc(vcpu); 842 desc->more = mmu_alloc_pte_list_desc(vcpu);
652 desc = desc->more; 843 desc = desc->more;
653 } 844 }
654 for (i = 0; desc->sptes[i]; ++i) 845 for (i = 0; desc->sptes[i]; ++i)
@@ -658,59 +849,78 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
658 return count; 849 return count;
659} 850}
660 851
661static void rmap_desc_remove_entry(unsigned long *rmapp, 852static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
662 struct kvm_rmap_desc *desc, 853{
663 int i, 854 struct pte_list_desc *desc;
664 struct kvm_rmap_desc *prev_desc) 855 u64 *prev_spte;
856 int i;
857
858 if (!*pte_list)
859 return NULL;
860 else if (!(*pte_list & 1)) {
861 if (!spte)
862 return (u64 *)*pte_list;
863 return NULL;
864 }
865 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
866 prev_spte = NULL;
867 while (desc) {
868 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
869 if (prev_spte == spte)
870 return desc->sptes[i];
871 prev_spte = desc->sptes[i];
872 }
873 desc = desc->more;
874 }
875 return NULL;
876}
877
878static void
879pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
880 int i, struct pte_list_desc *prev_desc)
665{ 881{
666 int j; 882 int j;
667 883
668 for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) 884 for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
669 ; 885 ;
670 desc->sptes[i] = desc->sptes[j]; 886 desc->sptes[i] = desc->sptes[j];
671 desc->sptes[j] = NULL; 887 desc->sptes[j] = NULL;
672 if (j != 0) 888 if (j != 0)
673 return; 889 return;
674 if (!prev_desc && !desc->more) 890 if (!prev_desc && !desc->more)
675 *rmapp = (unsigned long)desc->sptes[0]; 891 *pte_list = (unsigned long)desc->sptes[0];
676 else 892 else
677 if (prev_desc) 893 if (prev_desc)
678 prev_desc->more = desc->more; 894 prev_desc->more = desc->more;
679 else 895 else
680 *rmapp = (unsigned long)desc->more | 1; 896 *pte_list = (unsigned long)desc->more | 1;
681 mmu_free_rmap_desc(desc); 897 mmu_free_pte_list_desc(desc);
682} 898}
683 899
684static void rmap_remove(struct kvm *kvm, u64 *spte) 900static void pte_list_remove(u64 *spte, unsigned long *pte_list)
685{ 901{
686 struct kvm_rmap_desc *desc; 902 struct pte_list_desc *desc;
687 struct kvm_rmap_desc *prev_desc; 903 struct pte_list_desc *prev_desc;
688 struct kvm_mmu_page *sp;
689 gfn_t gfn;
690 unsigned long *rmapp;
691 int i; 904 int i;
692 905
693 sp = page_header(__pa(spte)); 906 if (!*pte_list) {
694 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 907 printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
695 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
696 if (!*rmapp) {
697 printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
698 BUG(); 908 BUG();
699 } else if (!(*rmapp & 1)) { 909 } else if (!(*pte_list & 1)) {
700 rmap_printk("rmap_remove: %p 1->0\n", spte); 910 rmap_printk("pte_list_remove: %p 1->0\n", spte);
701 if ((u64 *)*rmapp != spte) { 911 if ((u64 *)*pte_list != spte) {
702 printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); 912 printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
703 BUG(); 913 BUG();
704 } 914 }
705 *rmapp = 0; 915 *pte_list = 0;
706 } else { 916 } else {
707 rmap_printk("rmap_remove: %p many->many\n", spte); 917 rmap_printk("pte_list_remove: %p many->many\n", spte);
708 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 918 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
709 prev_desc = NULL; 919 prev_desc = NULL;
710 while (desc) { 920 while (desc) {
711 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) 921 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
712 if (desc->sptes[i] == spte) { 922 if (desc->sptes[i] == spte) {
713 rmap_desc_remove_entry(rmapp, 923 pte_list_desc_remove_entry(pte_list,
714 desc, i, 924 desc, i,
715 prev_desc); 925 prev_desc);
716 return; 926 return;
@@ -718,62 +928,80 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
718 prev_desc = desc; 928 prev_desc = desc;
719 desc = desc->more; 929 desc = desc->more;
720 } 930 }
721 pr_err("rmap_remove: %p many->many\n", spte); 931 pr_err("pte_list_remove: %p many->many\n", spte);
722 BUG(); 932 BUG();
723 } 933 }
724} 934}
725 935
726static int set_spte_track_bits(u64 *sptep, u64 new_spte) 936typedef void (*pte_list_walk_fn) (u64 *spte);
937static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
727{ 938{
728 pfn_t pfn; 939 struct pte_list_desc *desc;
729 u64 old_spte = *sptep; 940 int i;
730 941
731 if (!spte_has_volatile_bits(old_spte)) 942 if (!*pte_list)
732 __set_spte(sptep, new_spte); 943 return;
733 else
734 old_spte = __xchg_spte(sptep, new_spte);
735 944
736 if (!is_rmap_spte(old_spte)) 945 if (!(*pte_list & 1))
737 return 0; 946 return fn((u64 *)*pte_list);
738 947
739 pfn = spte_to_pfn(old_spte); 948 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
740 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 949 while (desc) {
741 kvm_set_pfn_accessed(pfn); 950 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
742 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) 951 fn(desc->sptes[i]);
743 kvm_set_pfn_dirty(pfn); 952 desc = desc->more;
744 return 1; 953 }
745} 954}
746 955
747static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) 956/*
957 * Take gfn and return the reverse mapping to it.
958 */
959static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
748{ 960{
749 if (set_spte_track_bits(sptep, new_spte)) 961 struct kvm_memory_slot *slot;
750 rmap_remove(kvm, sptep); 962 struct kvm_lpage_info *linfo;
963
964 slot = gfn_to_memslot(kvm, gfn);
965 if (likely(level == PT_PAGE_TABLE_LEVEL))
966 return &slot->rmap[gfn - slot->base_gfn];
967
968 linfo = lpage_info_slot(gfn, slot, level);
969
970 return &linfo->rmap_pde;
971}
972
973static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
974{
975 struct kvm_mmu_page *sp;
976 unsigned long *rmapp;
977
978 sp = page_header(__pa(spte));
979 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
980 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
981 return pte_list_add(vcpu, spte, rmapp);
751} 982}
752 983
753static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 984static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
754{ 985{
755 struct kvm_rmap_desc *desc; 986 return pte_list_next(rmapp, spte);
756 u64 *prev_spte; 987}
757 int i;
758 988
759 if (!*rmapp) 989static void rmap_remove(struct kvm *kvm, u64 *spte)
760 return NULL; 990{
761 else if (!(*rmapp & 1)) { 991 struct kvm_mmu_page *sp;
762 if (!spte) 992 gfn_t gfn;
763 return (u64 *)*rmapp; 993 unsigned long *rmapp;
764 return NULL; 994
765 } 995 sp = page_header(__pa(spte));
766 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 996 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
767 prev_spte = NULL; 997 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
768 while (desc) { 998 pte_list_remove(spte, rmapp);
769 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { 999}
770 if (prev_spte == spte) 1000
771 return desc->sptes[i]; 1001static void drop_spte(struct kvm *kvm, u64 *sptep)
772 prev_spte = desc->sptes[i]; 1002{
773 } 1003 if (mmu_spte_clear_track_bits(sptep))
774 desc = desc->more; 1004 rmap_remove(kvm, sptep);
775 }
776 return NULL;
777} 1005}
778 1006
779static int rmap_write_protect(struct kvm *kvm, u64 gfn) 1007static int rmap_write_protect(struct kvm *kvm, u64 gfn)
@@ -790,7 +1018,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
790 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1018 BUG_ON(!(*spte & PT_PRESENT_MASK));
791 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 1019 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
792 if (is_writable_pte(*spte)) { 1020 if (is_writable_pte(*spte)) {
793 update_spte(spte, *spte & ~PT_WRITABLE_MASK); 1021 mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
794 write_protected = 1; 1022 write_protected = 1;
795 } 1023 }
796 spte = rmap_next(kvm, rmapp, spte); 1024 spte = rmap_next(kvm, rmapp, spte);
@@ -807,8 +1035,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
807 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 1035 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
808 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 1036 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
809 if (is_writable_pte(*spte)) { 1037 if (is_writable_pte(*spte)) {
810 drop_spte(kvm, spte, 1038 drop_spte(kvm, spte);
811 shadow_trap_nonpresent_pte);
812 --kvm->stat.lpages; 1039 --kvm->stat.lpages;
813 spte = NULL; 1040 spte = NULL;
814 write_protected = 1; 1041 write_protected = 1;
@@ -829,7 +1056,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
829 while ((spte = rmap_next(kvm, rmapp, NULL))) { 1056 while ((spte = rmap_next(kvm, rmapp, NULL))) {
830 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1057 BUG_ON(!(*spte & PT_PRESENT_MASK));
831 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 1058 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
832 drop_spte(kvm, spte, shadow_trap_nonpresent_pte); 1059 drop_spte(kvm, spte);
833 need_tlb_flush = 1; 1060 need_tlb_flush = 1;
834 } 1061 }
835 return need_tlb_flush; 1062 return need_tlb_flush;
@@ -851,7 +1078,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
851 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 1078 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
852 need_flush = 1; 1079 need_flush = 1;
853 if (pte_write(*ptep)) { 1080 if (pte_write(*ptep)) {
854 drop_spte(kvm, spte, shadow_trap_nonpresent_pte); 1081 drop_spte(kvm, spte);
855 spte = rmap_next(kvm, rmapp, NULL); 1082 spte = rmap_next(kvm, rmapp, NULL);
856 } else { 1083 } else {
857 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); 1084 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -860,7 +1087,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
860 new_spte &= ~PT_WRITABLE_MASK; 1087 new_spte &= ~PT_WRITABLE_MASK;
861 new_spte &= ~SPTE_HOST_WRITEABLE; 1088 new_spte &= ~SPTE_HOST_WRITEABLE;
862 new_spte &= ~shadow_accessed_mask; 1089 new_spte &= ~shadow_accessed_mask;
863 set_spte_track_bits(spte, new_spte); 1090 mmu_spte_clear_track_bits(spte);
1091 mmu_spte_set(spte, new_spte);
864 spte = rmap_next(kvm, rmapp, spte); 1092 spte = rmap_next(kvm, rmapp, spte);
865 } 1093 }
866 } 1094 }
@@ -1032,151 +1260,89 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1032 percpu_counter_add(&kvm_total_used_mmu_pages, nr); 1260 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1033} 1261}
1034 1262
1035static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1263/*
1264 * Remove the sp from shadow page cache, after call it,
1265 * we can not find this sp from the cache, and the shadow
1266 * page table is still valid.
1267 * It should be under the protection of mmu lock.
1268 */
1269static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
1036{ 1270{
1037 ASSERT(is_empty_shadow_page(sp->spt)); 1271 ASSERT(is_empty_shadow_page(sp->spt));
1038 hlist_del(&sp->hash_link); 1272 hlist_del(&sp->hash_link);
1039 list_del(&sp->link);
1040 free_page((unsigned long)sp->spt);
1041 if (!sp->role.direct) 1273 if (!sp->role.direct)
1042 free_page((unsigned long)sp->gfns); 1274 free_page((unsigned long)sp->gfns);
1043 kmem_cache_free(mmu_page_header_cache, sp);
1044 kvm_mod_used_mmu_pages(kvm, -1);
1045} 1275}
1046 1276
1047static unsigned kvm_page_table_hashfn(gfn_t gfn) 1277/*
1278 * Free the shadow page table and the sp, we can do it
1279 * out of the protection of mmu lock.
1280 */
1281static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1048{ 1282{
1049 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); 1283 list_del(&sp->link);
1284 free_page((unsigned long)sp->spt);
1285 kmem_cache_free(mmu_page_header_cache, sp);
1050} 1286}
1051 1287
1052static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 1288static unsigned kvm_page_table_hashfn(gfn_t gfn)
1053 u64 *parent_pte, int direct)
1054{ 1289{
1055 struct kvm_mmu_page *sp; 1290 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
1056
1057 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
1058 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1059 if (!direct)
1060 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
1061 PAGE_SIZE);
1062 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1063 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1064 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
1065 sp->multimapped = 0;
1066 sp->parent_pte = parent_pte;
1067 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1068 return sp;
1069} 1291}
1070 1292
1071static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, 1293static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
1072 struct kvm_mmu_page *sp, u64 *parent_pte) 1294 struct kvm_mmu_page *sp, u64 *parent_pte)
1073{ 1295{
1074 struct kvm_pte_chain *pte_chain;
1075 struct hlist_node *node;
1076 int i;
1077
1078 if (!parent_pte) 1296 if (!parent_pte)
1079 return; 1297 return;
1080 if (!sp->multimapped) {
1081 u64 *old = sp->parent_pte;
1082 1298
1083 if (!old) { 1299 pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
1084 sp->parent_pte = parent_pte;
1085 return;
1086 }
1087 sp->multimapped = 1;
1088 pte_chain = mmu_alloc_pte_chain(vcpu);
1089 INIT_HLIST_HEAD(&sp->parent_ptes);
1090 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1091 pte_chain->parent_ptes[0] = old;
1092 }
1093 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
1094 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
1095 continue;
1096 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
1097 if (!pte_chain->parent_ptes[i]) {
1098 pte_chain->parent_ptes[i] = parent_pte;
1099 return;
1100 }
1101 }
1102 pte_chain = mmu_alloc_pte_chain(vcpu);
1103 BUG_ON(!pte_chain);
1104 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1105 pte_chain->parent_ptes[0] = parent_pte;
1106} 1300}
1107 1301
1108static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, 1302static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1109 u64 *parent_pte) 1303 u64 *parent_pte)
1110{ 1304{
1111 struct kvm_pte_chain *pte_chain; 1305 pte_list_remove(parent_pte, &sp->parent_ptes);
1112 struct hlist_node *node;
1113 int i;
1114
1115 if (!sp->multimapped) {
1116 BUG_ON(sp->parent_pte != parent_pte);
1117 sp->parent_pte = NULL;
1118 return;
1119 }
1120 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1121 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1122 if (!pte_chain->parent_ptes[i])
1123 break;
1124 if (pte_chain->parent_ptes[i] != parent_pte)
1125 continue;
1126 while (i + 1 < NR_PTE_CHAIN_ENTRIES
1127 && pte_chain->parent_ptes[i + 1]) {
1128 pte_chain->parent_ptes[i]
1129 = pte_chain->parent_ptes[i + 1];
1130 ++i;
1131 }
1132 pte_chain->parent_ptes[i] = NULL;
1133 if (i == 0) {
1134 hlist_del(&pte_chain->link);
1135 mmu_free_pte_chain(pte_chain);
1136 if (hlist_empty(&sp->parent_ptes)) {
1137 sp->multimapped = 0;
1138 sp->parent_pte = NULL;
1139 }
1140 }
1141 return;
1142 }
1143 BUG();
1144} 1306}
1145 1307
1146static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) 1308static void drop_parent_pte(struct kvm_mmu_page *sp,
1309 u64 *parent_pte)
1147{ 1310{
1148 struct kvm_pte_chain *pte_chain; 1311 mmu_page_remove_parent_pte(sp, parent_pte);
1149 struct hlist_node *node; 1312 mmu_spte_clear_no_track(parent_pte);
1150 struct kvm_mmu_page *parent_sp; 1313}
1151 int i;
1152
1153 if (!sp->multimapped && sp->parent_pte) {
1154 parent_sp = page_header(__pa(sp->parent_pte));
1155 fn(parent_sp, sp->parent_pte);
1156 return;
1157 }
1158
1159 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1160 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1161 u64 *spte = pte_chain->parent_ptes[i];
1162 1314
1163 if (!spte) 1315static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1164 break; 1316 u64 *parent_pte, int direct)
1165 parent_sp = page_header(__pa(spte)); 1317{
1166 fn(parent_sp, spte); 1318 struct kvm_mmu_page *sp;
1167 } 1319 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
1320 sizeof *sp);
1321 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1322 if (!direct)
1323 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
1324 PAGE_SIZE);
1325 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1326 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1327 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
1328 sp->parent_ptes = 0;
1329 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1330 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1331 return sp;
1168} 1332}
1169 1333
1170static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); 1334static void mark_unsync(u64 *spte);
1171static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) 1335static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1172{ 1336{
1173 mmu_parent_walk(sp, mark_unsync); 1337 pte_list_walk(&sp->parent_ptes, mark_unsync);
1174} 1338}
1175 1339
1176static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) 1340static void mark_unsync(u64 *spte)
1177{ 1341{
1342 struct kvm_mmu_page *sp;
1178 unsigned int index; 1343 unsigned int index;
1179 1344
1345 sp = page_header(__pa(spte));
1180 index = spte - sp->spt; 1346 index = spte - sp->spt;
1181 if (__test_and_set_bit(index, sp->unsync_child_bitmap)) 1347 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1182 return; 1348 return;
@@ -1185,15 +1351,6 @@ static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1185 kvm_mmu_mark_parents_unsync(sp); 1351 kvm_mmu_mark_parents_unsync(sp);
1186} 1352}
1187 1353
1188static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1189 struct kvm_mmu_page *sp)
1190{
1191 int i;
1192
1193 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1194 sp->spt[i] = shadow_trap_nonpresent_pte;
1195}
1196
1197static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1354static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1198 struct kvm_mmu_page *sp) 1355 struct kvm_mmu_page *sp)
1199{ 1356{
@@ -1475,6 +1632,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1475 } 1632 }
1476} 1633}
1477 1634
1635static void init_shadow_page_table(struct kvm_mmu_page *sp)
1636{
1637 int i;
1638
1639 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1640 sp->spt[i] = 0ull;
1641}
1642
1478static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1643static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1479 gfn_t gfn, 1644 gfn_t gfn,
1480 gva_t gaddr, 1645 gva_t gaddr,
@@ -1537,10 +1702,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1537 1702
1538 account_shadowed(vcpu->kvm, gfn); 1703 account_shadowed(vcpu->kvm, gfn);
1539 } 1704 }
1540 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1705 init_shadow_page_table(sp);
1541 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1542 else
1543 nonpaging_prefetch_page(vcpu, sp);
1544 trace_kvm_mmu_get_page(sp, true); 1706 trace_kvm_mmu_get_page(sp, true);
1545 return sp; 1707 return sp;
1546} 1708}
@@ -1572,21 +1734,28 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1572 if (iterator->level < PT_PAGE_TABLE_LEVEL) 1734 if (iterator->level < PT_PAGE_TABLE_LEVEL)
1573 return false; 1735 return false;
1574 1736
1575 if (iterator->level == PT_PAGE_TABLE_LEVEL)
1576 if (is_large_pte(*iterator->sptep))
1577 return false;
1578
1579 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); 1737 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1580 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; 1738 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1581 return true; 1739 return true;
1582} 1740}
1583 1741
1584static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) 1742static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
1743 u64 spte)
1585{ 1744{
1586 iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; 1745 if (is_last_spte(spte, iterator->level)) {
1746 iterator->level = 0;
1747 return;
1748 }
1749
1750 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
1587 --iterator->level; 1751 --iterator->level;
1588} 1752}
1589 1753
1754static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1755{
1756 return __shadow_walk_next(iterator, *iterator->sptep);
1757}
1758
1590static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) 1759static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1591{ 1760{
1592 u64 spte; 1761 u64 spte;
@@ -1594,13 +1763,13 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1594 spte = __pa(sp->spt) 1763 spte = __pa(sp->spt)
1595 | PT_PRESENT_MASK | PT_ACCESSED_MASK 1764 | PT_PRESENT_MASK | PT_ACCESSED_MASK
1596 | PT_WRITABLE_MASK | PT_USER_MASK; 1765 | PT_WRITABLE_MASK | PT_USER_MASK;
1597 __set_spte(sptep, spte); 1766 mmu_spte_set(sptep, spte);
1598} 1767}
1599 1768
1600static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) 1769static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1601{ 1770{
1602 if (is_large_pte(*sptep)) { 1771 if (is_large_pte(*sptep)) {
1603 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 1772 drop_spte(vcpu->kvm, sptep);
1604 kvm_flush_remote_tlbs(vcpu->kvm); 1773 kvm_flush_remote_tlbs(vcpu->kvm);
1605 } 1774 }
1606} 1775}
@@ -1622,38 +1791,39 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1622 if (child->role.access == direct_access) 1791 if (child->role.access == direct_access)
1623 return; 1792 return;
1624 1793
1625 mmu_page_remove_parent_pte(child, sptep); 1794 drop_parent_pte(child, sptep);
1626 __set_spte(sptep, shadow_trap_nonpresent_pte);
1627 kvm_flush_remote_tlbs(vcpu->kvm); 1795 kvm_flush_remote_tlbs(vcpu->kvm);
1628 } 1796 }
1629} 1797}
1630 1798
1799static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
1800 u64 *spte)
1801{
1802 u64 pte;
1803 struct kvm_mmu_page *child;
1804
1805 pte = *spte;
1806 if (is_shadow_present_pte(pte)) {
1807 if (is_last_spte(pte, sp->role.level))
1808 drop_spte(kvm, spte);
1809 else {
1810 child = page_header(pte & PT64_BASE_ADDR_MASK);
1811 drop_parent_pte(child, spte);
1812 }
1813 } else if (is_mmio_spte(pte))
1814 mmu_spte_clear_no_track(spte);
1815
1816 if (is_large_pte(pte))
1817 --kvm->stat.lpages;
1818}
1819
1631static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1820static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1632 struct kvm_mmu_page *sp) 1821 struct kvm_mmu_page *sp)
1633{ 1822{
1634 unsigned i; 1823 unsigned i;
1635 u64 *pt; 1824
1636 u64 ent; 1825 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1637 1826 mmu_page_zap_pte(kvm, sp, sp->spt + i);
1638 pt = sp->spt;
1639
1640 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1641 ent = pt[i];
1642
1643 if (is_shadow_present_pte(ent)) {
1644 if (!is_last_spte(ent, sp->role.level)) {
1645 ent &= PT64_BASE_ADDR_MASK;
1646 mmu_page_remove_parent_pte(page_header(ent),
1647 &pt[i]);
1648 } else {
1649 if (is_large_pte(ent))
1650 --kvm->stat.lpages;
1651 drop_spte(kvm, &pt[i],
1652 shadow_trap_nonpresent_pte);
1653 }
1654 }
1655 pt[i] = shadow_trap_nonpresent_pte;
1656 }
1657} 1827}
1658 1828
1659static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) 1829static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -1674,20 +1844,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1674{ 1844{
1675 u64 *parent_pte; 1845 u64 *parent_pte;
1676 1846
1677 while (sp->multimapped || sp->parent_pte) { 1847 while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
1678 if (!sp->multimapped) 1848 drop_parent_pte(sp, parent_pte);
1679 parent_pte = sp->parent_pte;
1680 else {
1681 struct kvm_pte_chain *chain;
1682
1683 chain = container_of(sp->parent_ptes.first,
1684 struct kvm_pte_chain, link);
1685 parent_pte = chain->parent_ptes[0];
1686 }
1687 BUG_ON(!parent_pte);
1688 kvm_mmu_put_page(sp, parent_pte);
1689 __set_spte(parent_pte, shadow_trap_nonpresent_pte);
1690 }
1691} 1849}
1692 1850
1693static int mmu_zap_unsync_children(struct kvm *kvm, 1851static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -1734,6 +1892,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1734 /* Count self */ 1892 /* Count self */
1735 ret++; 1893 ret++;
1736 list_move(&sp->link, invalid_list); 1894 list_move(&sp->link, invalid_list);
1895 kvm_mod_used_mmu_pages(kvm, -1);
1737 } else { 1896 } else {
1738 list_move(&sp->link, &kvm->arch.active_mmu_pages); 1897 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1739 kvm_reload_remote_mmus(kvm); 1898 kvm_reload_remote_mmus(kvm);
@@ -1744,6 +1903,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1744 return ret; 1903 return ret;
1745} 1904}
1746 1905
1906static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
1907{
1908 struct kvm_mmu_page *sp;
1909
1910 list_for_each_entry(sp, invalid_list, link)
1911 kvm_mmu_isolate_page(sp);
1912}
1913
1914static void free_pages_rcu(struct rcu_head *head)
1915{
1916 struct kvm_mmu_page *next, *sp;
1917
1918 sp = container_of(head, struct kvm_mmu_page, rcu);
1919 while (sp) {
1920 if (!list_empty(&sp->link))
1921 next = list_first_entry(&sp->link,
1922 struct kvm_mmu_page, link);
1923 else
1924 next = NULL;
1925 kvm_mmu_free_page(sp);
1926 sp = next;
1927 }
1928}
1929
1747static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1930static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1748 struct list_head *invalid_list) 1931 struct list_head *invalid_list)
1749{ 1932{
@@ -1754,10 +1937,21 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1754 1937
1755 kvm_flush_remote_tlbs(kvm); 1938 kvm_flush_remote_tlbs(kvm);
1756 1939
1940 if (atomic_read(&kvm->arch.reader_counter)) {
1941 kvm_mmu_isolate_pages(invalid_list);
1942 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1943 list_del_init(invalid_list);
1944
1945 trace_kvm_mmu_delay_free_pages(sp);
1946 call_rcu(&sp->rcu, free_pages_rcu);
1947 return;
1948 }
1949
1757 do { 1950 do {
1758 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 1951 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1759 WARN_ON(!sp->role.invalid || sp->root_count); 1952 WARN_ON(!sp->role.invalid || sp->root_count);
1760 kvm_mmu_free_page(kvm, sp); 1953 kvm_mmu_isolate_page(sp);
1954 kvm_mmu_free_page(sp);
1761 } while (!list_empty(invalid_list)); 1955 } while (!list_empty(invalid_list));
1762 1956
1763} 1957}
@@ -1783,8 +1977,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1783 page = container_of(kvm->arch.active_mmu_pages.prev, 1977 page = container_of(kvm->arch.active_mmu_pages.prev,
1784 struct kvm_mmu_page, link); 1978 struct kvm_mmu_page, link);
1785 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); 1979 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1786 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1787 } 1980 }
1981 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1788 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; 1982 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1789 } 1983 }
1790 1984
@@ -1833,20 +2027,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1833 __set_bit(slot, sp->slot_bitmap); 2027 __set_bit(slot, sp->slot_bitmap);
1834} 2028}
1835 2029
1836static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1837{
1838 int i;
1839 u64 *pt = sp->spt;
1840
1841 if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1842 return;
1843
1844 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1845 if (pt[i] == shadow_notrap_nonpresent_pte)
1846 __set_spte(&pt[i], shadow_trap_nonpresent_pte);
1847 }
1848}
1849
1850/* 2030/*
1851 * The function is based on mtrr_type_lookup() in 2031 * The function is based on mtrr_type_lookup() in
1852 * arch/x86/kernel/cpu/mtrr/generic.c 2032 * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1959,7 +2139,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1959 sp->unsync = 1; 2139 sp->unsync = 1;
1960 2140
1961 kvm_mmu_mark_parents_unsync(sp); 2141 kvm_mmu_mark_parents_unsync(sp);
1962 mmu_convert_notrap(sp);
1963} 2142}
1964 2143
1965static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) 2144static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -2002,13 +2181,16 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2002 2181
2003static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2182static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2004 unsigned pte_access, int user_fault, 2183 unsigned pte_access, int user_fault,
2005 int write_fault, int dirty, int level, 2184 int write_fault, int level,
2006 gfn_t gfn, pfn_t pfn, bool speculative, 2185 gfn_t gfn, pfn_t pfn, bool speculative,
2007 bool can_unsync, bool host_writable) 2186 bool can_unsync, bool host_writable)
2008{ 2187{
2009 u64 spte, entry = *sptep; 2188 u64 spte, entry = *sptep;
2010 int ret = 0; 2189 int ret = 0;
2011 2190
2191 if (set_mmio_spte(sptep, gfn, pfn, pte_access))
2192 return 0;
2193
2012 /* 2194 /*
2013 * We don't set the accessed bit, since we sometimes want to see 2195 * We don't set the accessed bit, since we sometimes want to see
2014 * whether the guest actually used the pte (in order to detect 2196 * whether the guest actually used the pte (in order to detect
@@ -2017,8 +2199,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2017 spte = PT_PRESENT_MASK; 2199 spte = PT_PRESENT_MASK;
2018 if (!speculative) 2200 if (!speculative)
2019 spte |= shadow_accessed_mask; 2201 spte |= shadow_accessed_mask;
2020 if (!dirty) 2202
2021 pte_access &= ~ACC_WRITE_MASK;
2022 if (pte_access & ACC_EXEC_MASK) 2203 if (pte_access & ACC_EXEC_MASK)
2023 spte |= shadow_x_mask; 2204 spte |= shadow_x_mask;
2024 else 2205 else
@@ -2045,15 +2226,24 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2045 if (level > PT_PAGE_TABLE_LEVEL && 2226 if (level > PT_PAGE_TABLE_LEVEL &&
2046 has_wrprotected_page(vcpu->kvm, gfn, level)) { 2227 has_wrprotected_page(vcpu->kvm, gfn, level)) {
2047 ret = 1; 2228 ret = 1;
2048 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2229 drop_spte(vcpu->kvm, sptep);
2049 goto done; 2230 goto done;
2050 } 2231 }
2051 2232
2052 spte |= PT_WRITABLE_MASK; 2233 spte |= PT_WRITABLE_MASK;
2053 2234
2054 if (!vcpu->arch.mmu.direct_map 2235 if (!vcpu->arch.mmu.direct_map
2055 && !(pte_access & ACC_WRITE_MASK)) 2236 && !(pte_access & ACC_WRITE_MASK)) {
2056 spte &= ~PT_USER_MASK; 2237 spte &= ~PT_USER_MASK;
2238 /*
2239 * If we converted a user page to a kernel page,
2240 * so that the kernel can write to it when cr0.wp=0,
2241 * then we should prevent the kernel from executing it
2242 * if SMEP is enabled.
2243 */
2244 if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
2245 spte |= PT64_NX_MASK;
2246 }
2057 2247
2058 /* 2248 /*
2059 * Optimization: for pte sync, if spte was writable the hash 2249 * Optimization: for pte sync, if spte was writable the hash
@@ -2078,7 +2268,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2078 mark_page_dirty(vcpu->kvm, gfn); 2268 mark_page_dirty(vcpu->kvm, gfn);
2079 2269
2080set_pte: 2270set_pte:
2081 update_spte(sptep, spte); 2271 mmu_spte_update(sptep, spte);
2082 /* 2272 /*
2083 * If we overwrite a writable spte with a read-only one we 2273 * If we overwrite a writable spte with a read-only one we
2084 * should flush remote TLBs. Otherwise rmap_write_protect 2274 * should flush remote TLBs. Otherwise rmap_write_protect
@@ -2093,8 +2283,8 @@ done:
2093 2283
2094static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2284static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2095 unsigned pt_access, unsigned pte_access, 2285 unsigned pt_access, unsigned pte_access,
2096 int user_fault, int write_fault, int dirty, 2286 int user_fault, int write_fault,
2097 int *ptwrite, int level, gfn_t gfn, 2287 int *emulate, int level, gfn_t gfn,
2098 pfn_t pfn, bool speculative, 2288 pfn_t pfn, bool speculative,
2099 bool host_writable) 2289 bool host_writable)
2100{ 2290{
@@ -2117,26 +2307,28 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2117 u64 pte = *sptep; 2307 u64 pte = *sptep;
2118 2308
2119 child = page_header(pte & PT64_BASE_ADDR_MASK); 2309 child = page_header(pte & PT64_BASE_ADDR_MASK);
2120 mmu_page_remove_parent_pte(child, sptep); 2310 drop_parent_pte(child, sptep);
2121 __set_spte(sptep, shadow_trap_nonpresent_pte);
2122 kvm_flush_remote_tlbs(vcpu->kvm); 2311 kvm_flush_remote_tlbs(vcpu->kvm);
2123 } else if (pfn != spte_to_pfn(*sptep)) { 2312 } else if (pfn != spte_to_pfn(*sptep)) {
2124 pgprintk("hfn old %llx new %llx\n", 2313 pgprintk("hfn old %llx new %llx\n",
2125 spte_to_pfn(*sptep), pfn); 2314 spte_to_pfn(*sptep), pfn);
2126 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2315 drop_spte(vcpu->kvm, sptep);
2127 kvm_flush_remote_tlbs(vcpu->kvm); 2316 kvm_flush_remote_tlbs(vcpu->kvm);
2128 } else 2317 } else
2129 was_rmapped = 1; 2318 was_rmapped = 1;
2130 } 2319 }
2131 2320
2132 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2321 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2133 dirty, level, gfn, pfn, speculative, true, 2322 level, gfn, pfn, speculative, true,
2134 host_writable)) { 2323 host_writable)) {
2135 if (write_fault) 2324 if (write_fault)
2136 *ptwrite = 1; 2325 *emulate = 1;
2137 kvm_mmu_flush_tlb(vcpu); 2326 kvm_mmu_flush_tlb(vcpu);
2138 } 2327 }
2139 2328
2329 if (unlikely(is_mmio_spte(*sptep) && emulate))
2330 *emulate = 1;
2331
2140 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2332 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2141 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", 2333 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2142 is_large_pte(*sptep)? "2MB" : "4kB", 2334 is_large_pte(*sptep)? "2MB" : "4kB",
@@ -2145,11 +2337,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2145 if (!was_rmapped && is_large_pte(*sptep)) 2337 if (!was_rmapped && is_large_pte(*sptep))
2146 ++vcpu->kvm->stat.lpages; 2338 ++vcpu->kvm->stat.lpages;
2147 2339
2148 page_header_update_slot(vcpu->kvm, sptep, gfn); 2340 if (is_shadow_present_pte(*sptep)) {
2149 if (!was_rmapped) { 2341 page_header_update_slot(vcpu->kvm, sptep, gfn);
2150 rmap_count = rmap_add(vcpu, sptep, gfn); 2342 if (!was_rmapped) {
2151 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2343 rmap_count = rmap_add(vcpu, sptep, gfn);
2152 rmap_recycle(vcpu, sptep, gfn); 2344 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
2345 rmap_recycle(vcpu, sptep, gfn);
2346 }
2153 } 2347 }
2154 kvm_release_pfn_clean(pfn); 2348 kvm_release_pfn_clean(pfn);
2155 if (speculative) { 2349 if (speculative) {
@@ -2170,8 +2364,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2170 2364
2171 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); 2365 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2172 if (!slot) { 2366 if (!slot) {
2173 get_page(bad_page); 2367 get_page(fault_page);
2174 return page_to_pfn(bad_page); 2368 return page_to_pfn(fault_page);
2175 } 2369 }
2176 2370
2177 hva = gfn_to_hva_memslot(slot, gfn); 2371 hva = gfn_to_hva_memslot(slot, gfn);
@@ -2198,7 +2392,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2198 2392
2199 for (i = 0; i < ret; i++, gfn++, start++) 2393 for (i = 0; i < ret; i++, gfn++, start++)
2200 mmu_set_spte(vcpu, start, ACC_ALL, 2394 mmu_set_spte(vcpu, start, ACC_ALL,
2201 access, 0, 0, 1, NULL, 2395 access, 0, 0, NULL,
2202 sp->role.level, gfn, 2396 sp->role.level, gfn,
2203 page_to_pfn(pages[i]), true, true); 2397 page_to_pfn(pages[i]), true, true);
2204 2398
@@ -2217,7 +2411,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2217 spte = sp->spt + i; 2411 spte = sp->spt + i;
2218 2412
2219 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 2413 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2220 if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { 2414 if (is_shadow_present_pte(*spte) || spte == sptep) {
2221 if (!start) 2415 if (!start)
2222 continue; 2416 continue;
2223 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) 2417 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
@@ -2254,7 +2448,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2254{ 2448{
2255 struct kvm_shadow_walk_iterator iterator; 2449 struct kvm_shadow_walk_iterator iterator;
2256 struct kvm_mmu_page *sp; 2450 struct kvm_mmu_page *sp;
2257 int pt_write = 0; 2451 int emulate = 0;
2258 gfn_t pseudo_gfn; 2452 gfn_t pseudo_gfn;
2259 2453
2260 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2454 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -2262,14 +2456,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2262 unsigned pte_access = ACC_ALL; 2456 unsigned pte_access = ACC_ALL;
2263 2457
2264 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, 2458 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2265 0, write, 1, &pt_write, 2459 0, write, &emulate,
2266 level, gfn, pfn, prefault, map_writable); 2460 level, gfn, pfn, prefault, map_writable);
2267 direct_pte_prefetch(vcpu, iterator.sptep); 2461 direct_pte_prefetch(vcpu, iterator.sptep);
2268 ++vcpu->stat.pf_fixed; 2462 ++vcpu->stat.pf_fixed;
2269 break; 2463 break;
2270 } 2464 }
2271 2465
2272 if (*iterator.sptep == shadow_trap_nonpresent_pte) { 2466 if (!is_shadow_present_pte(*iterator.sptep)) {
2273 u64 base_addr = iterator.addr; 2467 u64 base_addr = iterator.addr;
2274 2468
2275 base_addr &= PT64_LVL_ADDR_MASK(iterator.level); 2469 base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
@@ -2283,14 +2477,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2283 return -ENOMEM; 2477 return -ENOMEM;
2284 } 2478 }
2285 2479
2286 __set_spte(iterator.sptep, 2480 mmu_spte_set(iterator.sptep,
2287 __pa(sp->spt) 2481 __pa(sp->spt)
2288 | PT_PRESENT_MASK | PT_WRITABLE_MASK 2482 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2289 | shadow_user_mask | shadow_x_mask 2483 | shadow_user_mask | shadow_x_mask
2290 | shadow_accessed_mask); 2484 | shadow_accessed_mask);
2291 } 2485 }
2292 } 2486 }
2293 return pt_write; 2487 return emulate;
2294} 2488}
2295 2489
2296static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) 2490static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -2306,16 +2500,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
2306 send_sig_info(SIGBUS, &info, tsk); 2500 send_sig_info(SIGBUS, &info, tsk);
2307} 2501}
2308 2502
2309static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 2503static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
2310{ 2504{
2311 kvm_release_pfn_clean(pfn); 2505 kvm_release_pfn_clean(pfn);
2312 if (is_hwpoison_pfn(pfn)) { 2506 if (is_hwpoison_pfn(pfn)) {
2313 kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); 2507 kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
2314 return 0; 2508 return 0;
2315 } else if (is_fault_pfn(pfn)) 2509 }
2316 return -EFAULT;
2317 2510
2318 return 1; 2511 return -EFAULT;
2319} 2512}
2320 2513
2321static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, 2514static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
@@ -2360,6 +2553,30 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2360 } 2553 }
2361} 2554}
2362 2555
2556static bool mmu_invalid_pfn(pfn_t pfn)
2557{
2558 return unlikely(is_invalid_pfn(pfn));
2559}
2560
2561static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
2562 pfn_t pfn, unsigned access, int *ret_val)
2563{
2564 bool ret = true;
2565
2566 /* The pfn is invalid, report the error! */
2567 if (unlikely(is_invalid_pfn(pfn))) {
2568 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
2569 goto exit;
2570 }
2571
2572 if (unlikely(is_noslot_pfn(pfn)))
2573 vcpu_cache_mmio_info(vcpu, gva, gfn, access);
2574
2575 ret = false;
2576exit:
2577 return ret;
2578}
2579
2363static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2580static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2364 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2581 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2365 2582
@@ -2394,9 +2611,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2394 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) 2611 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2395 return 0; 2612 return 0;
2396 2613
2397 /* mmio */ 2614 if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
2398 if (is_error_pfn(pfn)) 2615 return r;
2399 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2400 2616
2401 spin_lock(&vcpu->kvm->mmu_lock); 2617 spin_lock(&vcpu->kvm->mmu_lock);
2402 if (mmu_notifier_retry(vcpu, mmu_seq)) 2618 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2623,6 +2839,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2623 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2839 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2624 return; 2840 return;
2625 2841
2842 vcpu_clear_mmio_info(vcpu, ~0ul);
2626 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 2843 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2627 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 2844 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2628 hpa_t root = vcpu->arch.mmu.root_hpa; 2845 hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -2667,6 +2884,94 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2667 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); 2884 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2668} 2885}
2669 2886
2887static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
2888{
2889 if (direct)
2890 return vcpu_match_mmio_gpa(vcpu, addr);
2891
2892 return vcpu_match_mmio_gva(vcpu, addr);
2893}
2894
2895
2896/*
2897 * On direct hosts, the last spte is only allows two states
2898 * for mmio page fault:
2899 * - It is the mmio spte
2900 * - It is zapped or it is being zapped.
2901 *
2902 * This function completely checks the spte when the last spte
2903 * is not the mmio spte.
2904 */
2905static bool check_direct_spte_mmio_pf(u64 spte)
2906{
2907 return __check_direct_spte_mmio_pf(spte);
2908}
2909
2910static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
2911{
2912 struct kvm_shadow_walk_iterator iterator;
2913 u64 spte = 0ull;
2914
2915 walk_shadow_page_lockless_begin(vcpu);
2916 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
2917 if (!is_shadow_present_pte(spte))
2918 break;
2919 walk_shadow_page_lockless_end(vcpu);
2920
2921 return spte;
2922}
2923
2924/*
2925 * If it is a real mmio page fault, return 1 and emulat the instruction
2926 * directly, return 0 to let CPU fault again on the address, -1 is
2927 * returned if bug is detected.
2928 */
2929int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
2930{
2931 u64 spte;
2932
2933 if (quickly_check_mmio_pf(vcpu, addr, direct))
2934 return 1;
2935
2936 spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
2937
2938 if (is_mmio_spte(spte)) {
2939 gfn_t gfn = get_mmio_spte_gfn(spte);
2940 unsigned access = get_mmio_spte_access(spte);
2941
2942 if (direct)
2943 addr = 0;
2944
2945 trace_handle_mmio_page_fault(addr, gfn, access);
2946 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
2947 return 1;
2948 }
2949
2950 /*
2951 * It's ok if the gva is remapped by other cpus on shadow guest,
2952 * it's a BUG if the gfn is not a mmio page.
2953 */
2954 if (direct && !check_direct_spte_mmio_pf(spte))
2955 return -1;
2956
2957 /*
2958 * If the page table is zapped by other cpus, let CPU fault again on
2959 * the address.
2960 */
2961 return 0;
2962}
2963EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
2964
2965static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
2966 u32 error_code, bool direct)
2967{
2968 int ret;
2969
2970 ret = handle_mmio_page_fault_common(vcpu, addr, direct);
2971 WARN_ON(ret < 0);
2972 return ret;
2973}
2974
2670static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2975static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2671 u32 error_code, bool prefault) 2976 u32 error_code, bool prefault)
2672{ 2977{
@@ -2674,6 +2979,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2674 int r; 2979 int r;
2675 2980
2676 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); 2981 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
2982
2983 if (unlikely(error_code & PFERR_RSVD_MASK))
2984 return handle_mmio_page_fault(vcpu, gva, error_code, true);
2985
2677 r = mmu_topup_memory_caches(vcpu); 2986 r = mmu_topup_memory_caches(vcpu);
2678 if (r) 2987 if (r)
2679 return r; 2988 return r;
@@ -2750,6 +3059,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2750 ASSERT(vcpu); 3059 ASSERT(vcpu);
2751 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3060 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2752 3061
3062 if (unlikely(error_code & PFERR_RSVD_MASK))
3063 return handle_mmio_page_fault(vcpu, gpa, error_code, true);
3064
2753 r = mmu_topup_memory_caches(vcpu); 3065 r = mmu_topup_memory_caches(vcpu);
2754 if (r) 3066 if (r)
2755 return r; 3067 return r;
@@ -2767,9 +3079,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2767 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) 3079 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2768 return 0; 3080 return 0;
2769 3081
2770 /* mmio */ 3082 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
2771 if (is_error_pfn(pfn)) 3083 return r;
2772 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 3084
2773 spin_lock(&vcpu->kvm->mmu_lock); 3085 spin_lock(&vcpu->kvm->mmu_lock);
2774 if (mmu_notifier_retry(vcpu, mmu_seq)) 3086 if (mmu_notifier_retry(vcpu, mmu_seq))
2775 goto out_unlock; 3087 goto out_unlock;
@@ -2800,7 +3112,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2800 context->page_fault = nonpaging_page_fault; 3112 context->page_fault = nonpaging_page_fault;
2801 context->gva_to_gpa = nonpaging_gva_to_gpa; 3113 context->gva_to_gpa = nonpaging_gva_to_gpa;
2802 context->free = nonpaging_free; 3114 context->free = nonpaging_free;
2803 context->prefetch_page = nonpaging_prefetch_page;
2804 context->sync_page = nonpaging_sync_page; 3115 context->sync_page = nonpaging_sync_page;
2805 context->invlpg = nonpaging_invlpg; 3116 context->invlpg = nonpaging_invlpg;
2806 context->update_pte = nonpaging_update_pte; 3117 context->update_pte = nonpaging_update_pte;
@@ -2848,6 +3159,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2848 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; 3159 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2849} 3160}
2850 3161
3162static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
3163 int *nr_present)
3164{
3165 if (unlikely(is_mmio_spte(*sptep))) {
3166 if (gfn != get_mmio_spte_gfn(*sptep)) {
3167 mmu_spte_clear_no_track(sptep);
3168 return true;
3169 }
3170
3171 (*nr_present)++;
3172 mark_mmio_spte(sptep, gfn, access);
3173 return true;
3174 }
3175
3176 return false;
3177}
3178
2851#define PTTYPE 64 3179#define PTTYPE 64
2852#include "paging_tmpl.h" 3180#include "paging_tmpl.h"
2853#undef PTTYPE 3181#undef PTTYPE
@@ -2930,7 +3258,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2930 context->new_cr3 = paging_new_cr3; 3258 context->new_cr3 = paging_new_cr3;
2931 context->page_fault = paging64_page_fault; 3259 context->page_fault = paging64_page_fault;
2932 context->gva_to_gpa = paging64_gva_to_gpa; 3260 context->gva_to_gpa = paging64_gva_to_gpa;
2933 context->prefetch_page = paging64_prefetch_page;
2934 context->sync_page = paging64_sync_page; 3261 context->sync_page = paging64_sync_page;
2935 context->invlpg = paging64_invlpg; 3262 context->invlpg = paging64_invlpg;
2936 context->update_pte = paging64_update_pte; 3263 context->update_pte = paging64_update_pte;
@@ -2959,7 +3286,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
2959 context->page_fault = paging32_page_fault; 3286 context->page_fault = paging32_page_fault;
2960 context->gva_to_gpa = paging32_gva_to_gpa; 3287 context->gva_to_gpa = paging32_gva_to_gpa;
2961 context->free = paging_free; 3288 context->free = paging_free;
2962 context->prefetch_page = paging32_prefetch_page;
2963 context->sync_page = paging32_sync_page; 3289 context->sync_page = paging32_sync_page;
2964 context->invlpg = paging32_invlpg; 3290 context->invlpg = paging32_invlpg;
2965 context->update_pte = paging32_update_pte; 3291 context->update_pte = paging32_update_pte;
@@ -2984,7 +3310,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2984 context->new_cr3 = nonpaging_new_cr3; 3310 context->new_cr3 = nonpaging_new_cr3;
2985 context->page_fault = tdp_page_fault; 3311 context->page_fault = tdp_page_fault;
2986 context->free = nonpaging_free; 3312 context->free = nonpaging_free;
2987 context->prefetch_page = nonpaging_prefetch_page;
2988 context->sync_page = nonpaging_sync_page; 3313 context->sync_page = nonpaging_sync_page;
2989 context->invlpg = nonpaging_invlpg; 3314 context->invlpg = nonpaging_invlpg;
2990 context->update_pte = nonpaging_update_pte; 3315 context->update_pte = nonpaging_update_pte;
@@ -3023,6 +3348,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3023int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 3348int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3024{ 3349{
3025 int r; 3350 int r;
3351 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
3026 ASSERT(vcpu); 3352 ASSERT(vcpu);
3027 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3353 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3028 3354
@@ -3037,6 +3363,8 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3037 3363
3038 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3364 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
3039 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3365 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
3366 vcpu->arch.mmu.base_role.smep_andnot_wp
3367 = smep && !is_write_protection(vcpu);
3040 3368
3041 return r; 3369 return r;
3042} 3370}
@@ -3141,27 +3469,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
3141} 3469}
3142EXPORT_SYMBOL_GPL(kvm_mmu_unload); 3470EXPORT_SYMBOL_GPL(kvm_mmu_unload);
3143 3471
3144static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3145 struct kvm_mmu_page *sp,
3146 u64 *spte)
3147{
3148 u64 pte;
3149 struct kvm_mmu_page *child;
3150
3151 pte = *spte;
3152 if (is_shadow_present_pte(pte)) {
3153 if (is_last_spte(pte, sp->role.level))
3154 drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
3155 else {
3156 child = page_header(pte & PT64_BASE_ADDR_MASK);
3157 mmu_page_remove_parent_pte(child, spte);
3158 }
3159 }
3160 __set_spte(spte, shadow_trap_nonpresent_pte);
3161 if (is_large_pte(pte))
3162 --vcpu->kvm->stat.lpages;
3163}
3164
3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 3472static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3166 struct kvm_mmu_page *sp, u64 *spte, 3473 struct kvm_mmu_page *sp, u64 *spte,
3167 const void *new) 3474 const void *new)
@@ -3233,6 +3540,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3233 int level, npte, invlpg_counter, r, flooded = 0; 3540 int level, npte, invlpg_counter, r, flooded = 0;
3234 bool remote_flush, local_flush, zap_page; 3541 bool remote_flush, local_flush, zap_page;
3235 3542
3543 /*
3544 * If we don't have indirect shadow pages, it means no page is
3545 * write-protected, so we can exit simply.
3546 */
3547 if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
3548 return;
3549
3236 zap_page = remote_flush = local_flush = false; 3550 zap_page = remote_flush = local_flush = false;
3237 offset = offset_in_page(gpa); 3551 offset = offset_in_page(gpa);
3238 3552
@@ -3336,7 +3650,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3336 spte = &sp->spt[page_offset / sizeof(*spte)]; 3650 spte = &sp->spt[page_offset / sizeof(*spte)];
3337 while (npte--) { 3651 while (npte--) {
3338 entry = *spte; 3652 entry = *spte;
3339 mmu_pte_write_zap_pte(vcpu, sp, spte); 3653 mmu_page_zap_pte(vcpu->kvm, sp, spte);
3340 if (gentry && 3654 if (gentry &&
3341 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3655 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3342 & mask.word)) 3656 & mask.word))
@@ -3380,9 +3694,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3380 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 3694 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
3381 struct kvm_mmu_page, link); 3695 struct kvm_mmu_page, link);
3382 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 3696 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
3383 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3384 ++vcpu->kvm->stat.mmu_recycled; 3697 ++vcpu->kvm->stat.mmu_recycled;
3385 } 3698 }
3699 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3386} 3700}
3387 3701
3388int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, 3702int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
@@ -3506,15 +3820,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3506 continue; 3820 continue;
3507 3821
3508 if (is_large_pte(pt[i])) { 3822 if (is_large_pte(pt[i])) {
3509 drop_spte(kvm, &pt[i], 3823 drop_spte(kvm, &pt[i]);
3510 shadow_trap_nonpresent_pte);
3511 --kvm->stat.lpages; 3824 --kvm->stat.lpages;
3512 continue; 3825 continue;
3513 } 3826 }
3514 3827
3515 /* avoid RMW */ 3828 /* avoid RMW */
3516 if (is_writable_pte(pt[i])) 3829 if (is_writable_pte(pt[i]))
3517 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); 3830 mmu_spte_update(&pt[i],
3831 pt[i] & ~PT_WRITABLE_MASK);
3518 } 3832 }
3519 } 3833 }
3520 kvm_flush_remote_tlbs(kvm); 3834 kvm_flush_remote_tlbs(kvm);
@@ -3590,25 +3904,18 @@ static struct shrinker mmu_shrinker = {
3590 3904
3591static void mmu_destroy_caches(void) 3905static void mmu_destroy_caches(void)
3592{ 3906{
3593 if (pte_chain_cache) 3907 if (pte_list_desc_cache)
3594 kmem_cache_destroy(pte_chain_cache); 3908 kmem_cache_destroy(pte_list_desc_cache);
3595 if (rmap_desc_cache)
3596 kmem_cache_destroy(rmap_desc_cache);
3597 if (mmu_page_header_cache) 3909 if (mmu_page_header_cache)
3598 kmem_cache_destroy(mmu_page_header_cache); 3910 kmem_cache_destroy(mmu_page_header_cache);
3599} 3911}
3600 3912
3601int kvm_mmu_module_init(void) 3913int kvm_mmu_module_init(void)
3602{ 3914{
3603 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3915 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
3604 sizeof(struct kvm_pte_chain), 3916 sizeof(struct pte_list_desc),
3605 0, 0, NULL);
3606 if (!pte_chain_cache)
3607 goto nomem;
3608 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
3609 sizeof(struct kvm_rmap_desc),
3610 0, 0, NULL); 3917 0, 0, NULL);
3611 if (!rmap_desc_cache) 3918 if (!pte_list_desc_cache)
3612 goto nomem; 3919 goto nomem;
3613 3920
3614 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", 3921 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
@@ -3775,16 +4082,17 @@ out:
3775int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) 4082int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3776{ 4083{
3777 struct kvm_shadow_walk_iterator iterator; 4084 struct kvm_shadow_walk_iterator iterator;
4085 u64 spte;
3778 int nr_sptes = 0; 4086 int nr_sptes = 0;
3779 4087
3780 spin_lock(&vcpu->kvm->mmu_lock); 4088 walk_shadow_page_lockless_begin(vcpu);
3781 for_each_shadow_entry(vcpu, addr, iterator) { 4089 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
3782 sptes[iterator.level-1] = *iterator.sptep; 4090 sptes[iterator.level-1] = spte;
3783 nr_sptes++; 4091 nr_sptes++;
3784 if (!is_shadow_present_pte(*iterator.sptep)) 4092 if (!is_shadow_present_pte(spte))
3785 break; 4093 break;
3786 } 4094 }
3787 spin_unlock(&vcpu->kvm->mmu_lock); 4095 walk_shadow_page_lockless_end(vcpu);
3788 4096
3789 return nr_sptes; 4097 return nr_sptes;
3790} 4098}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca85d3e7..e374db9af021 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,6 +49,8 @@
49#define PFERR_FETCH_MASK (1U << 4) 49#define PFERR_FETCH_MASK (1U << 4)
50 50
51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); 51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
52void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
53int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
52int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 54int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
53 55
54static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 56static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
@@ -76,4 +78,27 @@ static inline int is_present_gpte(unsigned long pte)
76 return pte & PT_PRESENT_MASK; 78 return pte & PT_PRESENT_MASK;
77} 79}
78 80
81static inline int is_writable_pte(unsigned long pte)
82{
83 return pte & PT_WRITABLE_MASK;
84}
85
86static inline bool is_write_protection(struct kvm_vcpu *vcpu)
87{
88 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
89}
90
91static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
92 bool write_fault, bool user_fault,
93 unsigned long pte)
94{
95 if (unlikely(write_fault && !is_writable_pte(pte)
96 && (user_fault || is_write_protection(vcpu))))
97 return false;
98
99 if (unlikely(user_fault && !(pte & PT_USER_MASK)))
100 return false;
101
102 return true;
103}
79#endif 104#endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 5f6223b8bcf7..2460a265be23 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -99,18 +99,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
99 "level = %d\n", sp, level); 99 "level = %d\n", sp, level);
100 return; 100 return;
101 } 101 }
102
103 if (*sptep == shadow_notrap_nonpresent_pte) {
104 audit_printk(vcpu->kvm, "notrap spte in unsync "
105 "sp: %p\n", sp);
106 return;
107 }
108 }
109
110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
111 audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
112 sp);
113 return;
114 } 102 }
115 103
116 if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) 104 if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b60b4fdb3eda..eed67f34146d 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -196,6 +196,54 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
196 TP_ARGS(sp) 196 TP_ARGS(sp)
197); 197);
198 198
199DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
200 TP_PROTO(struct kvm_mmu_page *sp),
201
202 TP_ARGS(sp)
203);
204
205TRACE_EVENT(
206 mark_mmio_spte,
207 TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
208 TP_ARGS(sptep, gfn, access),
209
210 TP_STRUCT__entry(
211 __field(void *, sptep)
212 __field(gfn_t, gfn)
213 __field(unsigned, access)
214 ),
215
216 TP_fast_assign(
217 __entry->sptep = sptep;
218 __entry->gfn = gfn;
219 __entry->access = access;
220 ),
221
222 TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
223 __entry->access)
224);
225
226TRACE_EVENT(
227 handle_mmio_page_fault,
228 TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
229 TP_ARGS(addr, gfn, access),
230
231 TP_STRUCT__entry(
232 __field(u64, addr)
233 __field(gfn_t, gfn)
234 __field(unsigned, access)
235 ),
236
237 TP_fast_assign(
238 __entry->addr = addr;
239 __entry->gfn = gfn;
240 __entry->access = access;
241 ),
242
243 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
244 __entry->access)
245);
246
199TRACE_EVENT( 247TRACE_EVENT(
200 kvm_mmu_audit, 248 kvm_mmu_audit,
201 TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), 249 TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9d03ad4dd5ec..507e2b844cfa 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -101,11 +101,15 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
101 return (ret != orig_pte); 101 return (ret != orig_pte);
102} 102}
103 103
104static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) 104static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
105 bool last)
105{ 106{
106 unsigned access; 107 unsigned access;
107 108
108 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; 109 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
110 if (last && !is_dirty_gpte(gpte))
111 access &= ~ACC_WRITE_MASK;
112
109#if PTTYPE == 64 113#if PTTYPE == 64
110 if (vcpu->arch.mmu.nx) 114 if (vcpu->arch.mmu.nx)
111 access &= ~(gpte >> PT64_NX_SHIFT); 115 access &= ~(gpte >> PT64_NX_SHIFT);
@@ -113,6 +117,24 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
113 return access; 117 return access;
114} 118}
115 119
120static bool FNAME(is_last_gpte)(struct guest_walker *walker,
121 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
122 pt_element_t gpte)
123{
124 if (walker->level == PT_PAGE_TABLE_LEVEL)
125 return true;
126
127 if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
128 (PTTYPE == 64 || is_pse(vcpu)))
129 return true;
130
131 if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
132 (mmu->root_level == PT64_ROOT_LEVEL))
133 return true;
134
135 return false;
136}
137
116/* 138/*
117 * Fetch a guest pte for a guest virtual address 139 * Fetch a guest pte for a guest virtual address
118 */ 140 */
@@ -125,18 +147,17 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
125 gfn_t table_gfn; 147 gfn_t table_gfn;
126 unsigned index, pt_access, uninitialized_var(pte_access); 148 unsigned index, pt_access, uninitialized_var(pte_access);
127 gpa_t pte_gpa; 149 gpa_t pte_gpa;
128 bool eperm, present, rsvd_fault; 150 bool eperm;
129 int offset, write_fault, user_fault, fetch_fault; 151 int offset;
130 152 const int write_fault = access & PFERR_WRITE_MASK;
131 write_fault = access & PFERR_WRITE_MASK; 153 const int user_fault = access & PFERR_USER_MASK;
132 user_fault = access & PFERR_USER_MASK; 154 const int fetch_fault = access & PFERR_FETCH_MASK;
133 fetch_fault = access & PFERR_FETCH_MASK; 155 u16 errcode = 0;
134 156
135 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 157 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
136 fetch_fault); 158 fetch_fault);
137walk: 159retry_walk:
138 present = true; 160 eperm = false;
139 eperm = rsvd_fault = false;
140 walker->level = mmu->root_level; 161 walker->level = mmu->root_level;
141 pte = mmu->get_cr3(vcpu); 162 pte = mmu->get_cr3(vcpu);
142 163
@@ -144,10 +165,8 @@ walk:
144 if (walker->level == PT32E_ROOT_LEVEL) { 165 if (walker->level == PT32E_ROOT_LEVEL) {
145 pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); 166 pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
146 trace_kvm_mmu_paging_element(pte, walker->level); 167 trace_kvm_mmu_paging_element(pte, walker->level);
147 if (!is_present_gpte(pte)) { 168 if (!is_present_gpte(pte))
148 present = false;
149 goto error; 169 goto error;
150 }
151 --walker->level; 170 --walker->level;
152 } 171 }
153#endif 172#endif
@@ -170,42 +189,31 @@ walk:
170 189
171 real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), 190 real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
172 PFERR_USER_MASK|PFERR_WRITE_MASK); 191 PFERR_USER_MASK|PFERR_WRITE_MASK);
173 if (unlikely(real_gfn == UNMAPPED_GVA)) { 192 if (unlikely(real_gfn == UNMAPPED_GVA))
174 present = false; 193 goto error;
175 break;
176 }
177 real_gfn = gpa_to_gfn(real_gfn); 194 real_gfn = gpa_to_gfn(real_gfn);
178 195
179 host_addr = gfn_to_hva(vcpu->kvm, real_gfn); 196 host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
180 if (unlikely(kvm_is_error_hva(host_addr))) { 197 if (unlikely(kvm_is_error_hva(host_addr)))
181 present = false; 198 goto error;
182 break;
183 }
184 199
185 ptep_user = (pt_element_t __user *)((void *)host_addr + offset); 200 ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
186 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) { 201 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
187 present = false; 202 goto error;
188 break;
189 }
190 203
191 trace_kvm_mmu_paging_element(pte, walker->level); 204 trace_kvm_mmu_paging_element(pte, walker->level);
192 205
193 if (unlikely(!is_present_gpte(pte))) { 206 if (unlikely(!is_present_gpte(pte)))
194 present = false; 207 goto error;
195 break;
196 }
197 208
198 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, 209 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
199 walker->level))) { 210 walker->level))) {
200 rsvd_fault = true; 211 errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
201 break; 212 goto error;
202 } 213 }
203 214
204 if (unlikely(write_fault && !is_writable_pte(pte) 215 if (!check_write_user_access(vcpu, write_fault, user_fault,
205 && (user_fault || is_write_protection(vcpu)))) 216 pte))
206 eperm = true;
207
208 if (unlikely(user_fault && !(pte & PT_USER_MASK)))
209 eperm = true; 217 eperm = true;
210 218
211#if PTTYPE == 64 219#if PTTYPE == 64
@@ -213,39 +221,35 @@ walk:
213 eperm = true; 221 eperm = true;
214#endif 222#endif
215 223
216 if (!eperm && !rsvd_fault 224 if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
217 && unlikely(!(pte & PT_ACCESSED_MASK))) {
218 int ret; 225 int ret;
219 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 226 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
220 sizeof(pte)); 227 sizeof(pte));
221 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, 228 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
222 pte, pte|PT_ACCESSED_MASK); 229 pte, pte|PT_ACCESSED_MASK);
223 if (unlikely(ret < 0)) { 230 if (unlikely(ret < 0))
224 present = false; 231 goto error;
225 break; 232 else if (ret)
226 } else if (ret) 233 goto retry_walk;
227 goto walk;
228 234
229 mark_page_dirty(vcpu->kvm, table_gfn); 235 mark_page_dirty(vcpu->kvm, table_gfn);
230 pte |= PT_ACCESSED_MASK; 236 pte |= PT_ACCESSED_MASK;
231 } 237 }
232 238
233 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
234
235 walker->ptes[walker->level - 1] = pte; 239 walker->ptes[walker->level - 1] = pte;
236 240
237 if ((walker->level == PT_PAGE_TABLE_LEVEL) || 241 if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) {
238 ((walker->level == PT_DIRECTORY_LEVEL) &&
239 is_large_pte(pte) &&
240 (PTTYPE == 64 || is_pse(vcpu))) ||
241 ((walker->level == PT_PDPE_LEVEL) &&
242 is_large_pte(pte) &&
243 mmu->root_level == PT64_ROOT_LEVEL)) {
244 int lvl = walker->level; 242 int lvl = walker->level;
245 gpa_t real_gpa; 243 gpa_t real_gpa;
246 gfn_t gfn; 244 gfn_t gfn;
247 u32 ac; 245 u32 ac;
248 246
247 /* check if the kernel is fetching from user page */
248 if (unlikely(pte_access & PT_USER_MASK) &&
249 kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
250 if (fetch_fault && !user_fault)
251 eperm = true;
252
249 gfn = gpte_to_gfn_lvl(pte, lvl); 253 gfn = gpte_to_gfn_lvl(pte, lvl);
250 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; 254 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
251 255
@@ -266,12 +270,14 @@ walk:
266 break; 270 break;
267 } 271 }
268 272
269 pt_access = pte_access; 273 pt_access &= FNAME(gpte_access)(vcpu, pte, false);
270 --walker->level; 274 --walker->level;
271 } 275 }
272 276
273 if (unlikely(!present || eperm || rsvd_fault)) 277 if (unlikely(eperm)) {
278 errcode |= PFERR_PRESENT_MASK;
274 goto error; 279 goto error;
280 }
275 281
276 if (write_fault && unlikely(!is_dirty_gpte(pte))) { 282 if (write_fault && unlikely(!is_dirty_gpte(pte))) {
277 int ret; 283 int ret;
@@ -279,17 +285,17 @@ walk:
279 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 285 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
280 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, 286 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
281 pte, pte|PT_DIRTY_MASK); 287 pte, pte|PT_DIRTY_MASK);
282 if (unlikely(ret < 0)) { 288 if (unlikely(ret < 0))
283 present = false;
284 goto error; 289 goto error;
285 } else if (ret) 290 else if (ret)
286 goto walk; 291 goto retry_walk;
287 292
288 mark_page_dirty(vcpu->kvm, table_gfn); 293 mark_page_dirty(vcpu->kvm, table_gfn);
289 pte |= PT_DIRTY_MASK; 294 pte |= PT_DIRTY_MASK;
290 walker->ptes[walker->level - 1] = pte; 295 walker->ptes[walker->level - 1] = pte;
291 } 296 }
292 297
298 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
293 walker->pt_access = pt_access; 299 walker->pt_access = pt_access;
294 walker->pte_access = pte_access; 300 walker->pte_access = pte_access;
295 pgprintk("%s: pte %llx pte_access %x pt_access %x\n", 301 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
@@ -297,19 +303,14 @@ walk:
297 return 1; 303 return 1;
298 304
299error: 305error:
306 errcode |= write_fault | user_fault;
307 if (fetch_fault && (mmu->nx ||
308 kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
309 errcode |= PFERR_FETCH_MASK;
310
300 walker->fault.vector = PF_VECTOR; 311 walker->fault.vector = PF_VECTOR;
301 walker->fault.error_code_valid = true; 312 walker->fault.error_code_valid = true;
302 walker->fault.error_code = 0; 313 walker->fault.error_code = errcode;
303 if (present)
304 walker->fault.error_code |= PFERR_PRESENT_MASK;
305
306 walker->fault.error_code |= write_fault | user_fault;
307
308 if (fetch_fault && mmu->nx)
309 walker->fault.error_code |= PFERR_FETCH_MASK;
310 if (rsvd_fault)
311 walker->fault.error_code |= PFERR_RSVD_MASK;
312
313 walker->fault.address = addr; 314 walker->fault.address = addr;
314 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; 315 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
315 316
@@ -336,16 +337,11 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
336 struct kvm_mmu_page *sp, u64 *spte, 337 struct kvm_mmu_page *sp, u64 *spte,
337 pt_element_t gpte) 338 pt_element_t gpte)
338{ 339{
339 u64 nonpresent = shadow_trap_nonpresent_pte;
340
341 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) 340 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
342 goto no_present; 341 goto no_present;
343 342
344 if (!is_present_gpte(gpte)) { 343 if (!is_present_gpte(gpte))
345 if (!sp->unsync)
346 nonpresent = shadow_notrap_nonpresent_pte;
347 goto no_present; 344 goto no_present;
348 }
349 345
350 if (!(gpte & PT_ACCESSED_MASK)) 346 if (!(gpte & PT_ACCESSED_MASK))
351 goto no_present; 347 goto no_present;
@@ -353,7 +349,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
353 return false; 349 return false;
354 350
355no_present: 351no_present:
356 drop_spte(vcpu->kvm, spte, nonpresent); 352 drop_spte(vcpu->kvm, spte);
357 return true; 353 return true;
358} 354}
359 355
@@ -369,9 +365,9 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
369 return; 365 return;
370 366
371 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 367 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
372 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 368 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
373 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); 369 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
374 if (is_error_pfn(pfn)) { 370 if (mmu_invalid_pfn(pfn)) {
375 kvm_release_pfn_clean(pfn); 371 kvm_release_pfn_clean(pfn);
376 return; 372 return;
377 } 373 }
@@ -381,7 +377,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
381 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 377 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
382 */ 378 */
383 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 379 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
384 is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, 380 NULL, PT_PAGE_TABLE_LEVEL,
385 gpte_to_gfn(gpte), pfn, true, true); 381 gpte_to_gfn(gpte), pfn, true, true);
386} 382}
387 383
@@ -432,12 +428,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
432 unsigned pte_access; 428 unsigned pte_access;
433 gfn_t gfn; 429 gfn_t gfn;
434 pfn_t pfn; 430 pfn_t pfn;
435 bool dirty;
436 431
437 if (spte == sptep) 432 if (spte == sptep)
438 continue; 433 continue;
439 434
440 if (*spte != shadow_trap_nonpresent_pte) 435 if (is_shadow_present_pte(*spte))
441 continue; 436 continue;
442 437
443 gpte = gptep[i]; 438 gpte = gptep[i];
@@ -445,18 +440,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
445 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) 440 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
446 continue; 441 continue;
447 442
448 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 443 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
444 true);
449 gfn = gpte_to_gfn(gpte); 445 gfn = gpte_to_gfn(gpte);
450 dirty = is_dirty_gpte(gpte);
451 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 446 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
452 (pte_access & ACC_WRITE_MASK) && dirty); 447 pte_access & ACC_WRITE_MASK);
453 if (is_error_pfn(pfn)) { 448 if (mmu_invalid_pfn(pfn)) {
454 kvm_release_pfn_clean(pfn); 449 kvm_release_pfn_clean(pfn);
455 break; 450 break;
456 } 451 }
457 452
458 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 453 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
459 dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, 454 NULL, PT_PAGE_TABLE_LEVEL, gfn,
460 pfn, true, true); 455 pfn, true, true);
461 } 456 }
462} 457}
@@ -467,12 +462,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
467static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 462static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
468 struct guest_walker *gw, 463 struct guest_walker *gw,
469 int user_fault, int write_fault, int hlevel, 464 int user_fault, int write_fault, int hlevel,
470 int *ptwrite, pfn_t pfn, bool map_writable, 465 int *emulate, pfn_t pfn, bool map_writable,
471 bool prefault) 466 bool prefault)
472{ 467{
473 unsigned access = gw->pt_access; 468 unsigned access = gw->pt_access;
474 struct kvm_mmu_page *sp = NULL; 469 struct kvm_mmu_page *sp = NULL;
475 bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
476 int top_level; 470 int top_level;
477 unsigned direct_access; 471 unsigned direct_access;
478 struct kvm_shadow_walk_iterator it; 472 struct kvm_shadow_walk_iterator it;
@@ -480,9 +474,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
480 if (!is_present_gpte(gw->ptes[gw->level - 1])) 474 if (!is_present_gpte(gw->ptes[gw->level - 1]))
481 return NULL; 475 return NULL;
482 476
483 direct_access = gw->pt_access & gw->pte_access; 477 direct_access = gw->pte_access;
484 if (!dirty)
485 direct_access &= ~ACC_WRITE_MASK;
486 478
487 top_level = vcpu->arch.mmu.root_level; 479 top_level = vcpu->arch.mmu.root_level;
488 if (top_level == PT32E_ROOT_LEVEL) 480 if (top_level == PT32E_ROOT_LEVEL)
@@ -540,8 +532,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
540 link_shadow_page(it.sptep, sp); 532 link_shadow_page(it.sptep, sp);
541 } 533 }
542 534
543 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 535 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
544 user_fault, write_fault, dirty, ptwrite, it.level, 536 user_fault, write_fault, emulate, it.level,
545 gw->gfn, pfn, prefault, map_writable); 537 gw->gfn, pfn, prefault, map_writable);
546 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 538 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
547 539
@@ -575,7 +567,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
575 int user_fault = error_code & PFERR_USER_MASK; 567 int user_fault = error_code & PFERR_USER_MASK;
576 struct guest_walker walker; 568 struct guest_walker walker;
577 u64 *sptep; 569 u64 *sptep;
578 int write_pt = 0; 570 int emulate = 0;
579 int r; 571 int r;
580 pfn_t pfn; 572 pfn_t pfn;
581 int level = PT_PAGE_TABLE_LEVEL; 573 int level = PT_PAGE_TABLE_LEVEL;
@@ -585,6 +577,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
585 577
586 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 578 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
587 579
580 if (unlikely(error_code & PFERR_RSVD_MASK))
581 return handle_mmio_page_fault(vcpu, addr, error_code,
582 mmu_is_nested(vcpu));
583
588 r = mmu_topup_memory_caches(vcpu); 584 r = mmu_topup_memory_caches(vcpu);
589 if (r) 585 if (r)
590 return r; 586 return r;
@@ -623,9 +619,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
623 &map_writable)) 619 &map_writable))
624 return 0; 620 return 0;
625 621
626 /* mmio */ 622 if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
627 if (is_error_pfn(pfn)) 623 walker.gfn, pfn, walker.pte_access, &r))
628 return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); 624 return r;
629 625
630 spin_lock(&vcpu->kvm->mmu_lock); 626 spin_lock(&vcpu->kvm->mmu_lock);
631 if (mmu_notifier_retry(vcpu, mmu_seq)) 627 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -636,19 +632,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
636 if (!force_pt_level) 632 if (!force_pt_level)
637 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 633 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
638 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 634 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
639 level, &write_pt, pfn, map_writable, prefault); 635 level, &emulate, pfn, map_writable, prefault);
640 (void)sptep; 636 (void)sptep;
641 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 637 pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
642 sptep, *sptep, write_pt); 638 sptep, *sptep, emulate);
643 639
644 if (!write_pt) 640 if (!emulate)
645 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 641 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
646 642
647 ++vcpu->stat.pf_fixed; 643 ++vcpu->stat.pf_fixed;
648 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); 644 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
649 spin_unlock(&vcpu->kvm->mmu_lock); 645 spin_unlock(&vcpu->kvm->mmu_lock);
650 646
651 return write_pt; 647 return emulate;
652 648
653out_unlock: 649out_unlock:
654 spin_unlock(&vcpu->kvm->mmu_lock); 650 spin_unlock(&vcpu->kvm->mmu_lock);
@@ -665,6 +661,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
665 u64 *sptep; 661 u64 *sptep;
666 int need_flush = 0; 662 int need_flush = 0;
667 663
664 vcpu_clear_mmio_info(vcpu, gva);
665
668 spin_lock(&vcpu->kvm->mmu_lock); 666 spin_lock(&vcpu->kvm->mmu_lock);
669 667
670 for_each_shadow_entry(vcpu, gva, iterator) { 668 for_each_shadow_entry(vcpu, gva, iterator) {
@@ -688,11 +686,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
688 if (is_shadow_present_pte(*sptep)) { 686 if (is_shadow_present_pte(*sptep)) {
689 if (is_large_pte(*sptep)) 687 if (is_large_pte(*sptep))
690 --vcpu->kvm->stat.lpages; 688 --vcpu->kvm->stat.lpages;
691 drop_spte(vcpu->kvm, sptep, 689 drop_spte(vcpu->kvm, sptep);
692 shadow_trap_nonpresent_pte);
693 need_flush = 1; 690 need_flush = 1;
694 } else 691 } else if (is_mmio_spte(*sptep))
695 __set_spte(sptep, shadow_trap_nonpresent_pte); 692 mmu_spte_clear_no_track(sptep);
693
696 break; 694 break;
697 } 695 }
698 696
@@ -752,36 +750,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
752 return gpa; 750 return gpa;
753} 751}
754 752
755static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
756 struct kvm_mmu_page *sp)
757{
758 int i, j, offset, r;
759 pt_element_t pt[256 / sizeof(pt_element_t)];
760 gpa_t pte_gpa;
761
762 if (sp->role.direct
763 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
764 nonpaging_prefetch_page(vcpu, sp);
765 return;
766 }
767
768 pte_gpa = gfn_to_gpa(sp->gfn);
769 if (PTTYPE == 32) {
770 offset = sp->role.quadrant << PT64_LEVEL_BITS;
771 pte_gpa += offset * sizeof(pt_element_t);
772 }
773
774 for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
775 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
776 pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
777 for (j = 0; j < ARRAY_SIZE(pt); ++j)
778 if (r || is_present_gpte(pt[j]))
779 sp->spt[i+j] = shadow_trap_nonpresent_pte;
780 else
781 sp->spt[i+j] = shadow_notrap_nonpresent_pte;
782 }
783}
784
785/* 753/*
786 * Using the cached information from sp->gfns is safe because: 754 * Using the cached information from sp->gfns is safe because:
787 * - The spte has a reference to the struct page, so the pfn for a given gfn 755 * - The spte has a reference to the struct page, so the pfn for a given gfn
@@ -817,7 +785,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
817 gpa_t pte_gpa; 785 gpa_t pte_gpa;
818 gfn_t gfn; 786 gfn_t gfn;
819 787
820 if (!is_shadow_present_pte(sp->spt[i])) 788 if (!sp->spt[i])
821 continue; 789 continue;
822 790
823 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); 791 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
@@ -826,26 +794,30 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
826 sizeof(pt_element_t))) 794 sizeof(pt_element_t)))
827 return -EINVAL; 795 return -EINVAL;
828 796
829 gfn = gpte_to_gfn(gpte);
830
831 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { 797 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
832 vcpu->kvm->tlbs_dirty++; 798 vcpu->kvm->tlbs_dirty++;
833 continue; 799 continue;
834 } 800 }
835 801
802 gfn = gpte_to_gfn(gpte);
803 pte_access = sp->role.access;
804 pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
805
806 if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
807 continue;
808
836 if (gfn != sp->gfns[i]) { 809 if (gfn != sp->gfns[i]) {
837 drop_spte(vcpu->kvm, &sp->spt[i], 810 drop_spte(vcpu->kvm, &sp->spt[i]);
838 shadow_trap_nonpresent_pte);
839 vcpu->kvm->tlbs_dirty++; 811 vcpu->kvm->tlbs_dirty++;
840 continue; 812 continue;
841 } 813 }
842 814
843 nr_present++; 815 nr_present++;
844 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 816
845 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; 817 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
846 818
847 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 819 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
848 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, 820 PT_PAGE_TABLE_LEVEL, gfn,
849 spte_to_pfn(sp->spt[i]), true, false, 821 spte_to_pfn(sp->spt[i]), true, false,
850 host_writable); 822 host_writable);
851 } 823 }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 506e4fe23adc..475d1c948501 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1496,11 +1496,14 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1496 update_cr0_intercept(svm); 1496 update_cr0_intercept(svm);
1497} 1497}
1498 1498
1499static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1499static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1500{ 1500{
1501 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; 1501 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
1502 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; 1502 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1503 1503
1504 if (cr4 & X86_CR4_VMXE)
1505 return 1;
1506
1504 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) 1507 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1505 svm_flush_tlb(vcpu); 1508 svm_flush_tlb(vcpu);
1506 1509
@@ -1510,6 +1513,7 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1510 cr4 |= host_cr4_mce; 1513 cr4 |= host_cr4_mce;
1511 to_svm(vcpu)->vmcb->save.cr4 = cr4; 1514 to_svm(vcpu)->vmcb->save.cr4 = cr4;
1512 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); 1515 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1516 return 0;
1513} 1517}
1514 1518
1515static void svm_set_segment(struct kvm_vcpu *vcpu, 1519static void svm_set_segment(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index db932760ea82..3ff898c104f7 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -675,12 +675,12 @@ TRACE_EVENT(kvm_emulate_insn,
675 ), 675 ),
676 676
677 TP_fast_assign( 677 TP_fast_assign(
678 __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start; 678 __entry->rip = vcpu->arch.emulate_ctxt.fetch.start;
679 __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); 679 __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
680 __entry->len = vcpu->arch.emulate_ctxt.decode.eip 680 __entry->len = vcpu->arch.emulate_ctxt._eip
681 - vcpu->arch.emulate_ctxt.decode.fetch.start; 681 - vcpu->arch.emulate_ctxt.fetch.start;
682 memcpy(__entry->insn, 682 memcpy(__entry->insn,
683 vcpu->arch.emulate_ctxt.decode.fetch.data, 683 vcpu->arch.emulate_ctxt.fetch.data,
684 15); 684 15);
685 __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); 685 __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
686 __entry->failed = failed; 686 __entry->failed = failed;
@@ -698,6 +698,29 @@ TRACE_EVENT(kvm_emulate_insn,
698#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) 698#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
699#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) 699#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
700 700
701TRACE_EVENT(
702 vcpu_match_mmio,
703 TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
704 TP_ARGS(gva, gpa, write, gpa_match),
705
706 TP_STRUCT__entry(
707 __field(gva_t, gva)
708 __field(gpa_t, gpa)
709 __field(bool, write)
710 __field(bool, gpa_match)
711 ),
712
713 TP_fast_assign(
714 __entry->gva = gva;
715 __entry->gpa = gpa;
716 __entry->write = write;
717 __entry->gpa_match = gpa_match
718 ),
719
720 TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
721 __entry->write ? "Write" : "Read",
722 __entry->gpa_match ? "GPA" : "GVA")
723);
701#endif /* _TRACE_KVM_H */ 724#endif /* _TRACE_KVM_H */
702 725
703#undef TRACE_INCLUDE_PATH 726#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d48ec60ea421..e65a158dee64 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -43,13 +43,12 @@
43#include "trace.h" 43#include "trace.h"
44 44
45#define __ex(x) __kvm_handle_fault_on_reboot(x) 45#define __ex(x) __kvm_handle_fault_on_reboot(x)
46#define __ex_clear(x, reg) \
47 ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
46 48
47MODULE_AUTHOR("Qumranet"); 49MODULE_AUTHOR("Qumranet");
48MODULE_LICENSE("GPL"); 50MODULE_LICENSE("GPL");
49 51
50static int __read_mostly bypass_guest_pf = 1;
51module_param(bypass_guest_pf, bool, S_IRUGO);
52
53static int __read_mostly enable_vpid = 1; 52static int __read_mostly enable_vpid = 1;
54module_param_named(vpid, enable_vpid, bool, 0444); 53module_param_named(vpid, enable_vpid, bool, 0444);
55 54
@@ -72,6 +71,14 @@ module_param(vmm_exclusive, bool, S_IRUGO);
72static int __read_mostly yield_on_hlt = 1; 71static int __read_mostly yield_on_hlt = 1;
73module_param(yield_on_hlt, bool, S_IRUGO); 72module_param(yield_on_hlt, bool, S_IRUGO);
74 73
74/*
75 * If nested=1, nested virtualization is supported, i.e., guests may use
76 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
77 * use VMX instructions.
78 */
79static int __read_mostly nested = 0;
80module_param(nested, bool, S_IRUGO);
81
75#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 82#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
76 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 83 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
77#define KVM_GUEST_CR0_MASK \ 84#define KVM_GUEST_CR0_MASK \
@@ -109,6 +116,7 @@ static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
109module_param(ple_window, int, S_IRUGO); 116module_param(ple_window, int, S_IRUGO);
110 117
111#define NR_AUTOLOAD_MSRS 1 118#define NR_AUTOLOAD_MSRS 1
119#define VMCS02_POOL_SIZE 1
112 120
113struct vmcs { 121struct vmcs {
114 u32 revision_id; 122 u32 revision_id;
@@ -116,17 +124,237 @@ struct vmcs {
116 char data[0]; 124 char data[0];
117}; 125};
118 126
127/*
128 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
129 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
130 * loaded on this CPU (so we can clear them if the CPU goes down).
131 */
132struct loaded_vmcs {
133 struct vmcs *vmcs;
134 int cpu;
135 int launched;
136 struct list_head loaded_vmcss_on_cpu_link;
137};
138
119struct shared_msr_entry { 139struct shared_msr_entry {
120 unsigned index; 140 unsigned index;
121 u64 data; 141 u64 data;
122 u64 mask; 142 u64 mask;
123}; 143};
124 144
145/*
146 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
147 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
148 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
149 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
150 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
151 * More than one of these structures may exist, if L1 runs multiple L2 guests.
152 * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
153 * underlying hardware which will be used to run L2.
154 * This structure is packed to ensure that its layout is identical across
155 * machines (necessary for live migration).
156 * If there are changes in this struct, VMCS12_REVISION must be changed.
157 */
158typedef u64 natural_width;
159struct __packed vmcs12 {
160 /* According to the Intel spec, a VMCS region must start with the
161 * following two fields. Then follow implementation-specific data.
162 */
163 u32 revision_id;
164 u32 abort;
165
166 u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
167 u32 padding[7]; /* room for future expansion */
168
169 u64 io_bitmap_a;
170 u64 io_bitmap_b;
171 u64 msr_bitmap;
172 u64 vm_exit_msr_store_addr;
173 u64 vm_exit_msr_load_addr;
174 u64 vm_entry_msr_load_addr;
175 u64 tsc_offset;
176 u64 virtual_apic_page_addr;
177 u64 apic_access_addr;
178 u64 ept_pointer;
179 u64 guest_physical_address;
180 u64 vmcs_link_pointer;
181 u64 guest_ia32_debugctl;
182 u64 guest_ia32_pat;
183 u64 guest_ia32_efer;
184 u64 guest_ia32_perf_global_ctrl;
185 u64 guest_pdptr0;
186 u64 guest_pdptr1;
187 u64 guest_pdptr2;
188 u64 guest_pdptr3;
189 u64 host_ia32_pat;
190 u64 host_ia32_efer;
191 u64 host_ia32_perf_global_ctrl;
192 u64 padding64[8]; /* room for future expansion */
193 /*
194 * To allow migration of L1 (complete with its L2 guests) between
195 * machines of different natural widths (32 or 64 bit), we cannot have
196 * unsigned long fields with no explict size. We use u64 (aliased
197 * natural_width) instead. Luckily, x86 is little-endian.
198 */
199 natural_width cr0_guest_host_mask;
200 natural_width cr4_guest_host_mask;
201 natural_width cr0_read_shadow;
202 natural_width cr4_read_shadow;
203 natural_width cr3_target_value0;
204 natural_width cr3_target_value1;
205 natural_width cr3_target_value2;
206 natural_width cr3_target_value3;
207 natural_width exit_qualification;
208 natural_width guest_linear_address;
209 natural_width guest_cr0;
210 natural_width guest_cr3;
211 natural_width guest_cr4;
212 natural_width guest_es_base;
213 natural_width guest_cs_base;
214 natural_width guest_ss_base;
215 natural_width guest_ds_base;
216 natural_width guest_fs_base;
217 natural_width guest_gs_base;
218 natural_width guest_ldtr_base;
219 natural_width guest_tr_base;
220 natural_width guest_gdtr_base;
221 natural_width guest_idtr_base;
222 natural_width guest_dr7;
223 natural_width guest_rsp;
224 natural_width guest_rip;
225 natural_width guest_rflags;
226 natural_width guest_pending_dbg_exceptions;
227 natural_width guest_sysenter_esp;
228 natural_width guest_sysenter_eip;
229 natural_width host_cr0;
230 natural_width host_cr3;
231 natural_width host_cr4;
232 natural_width host_fs_base;
233 natural_width host_gs_base;
234 natural_width host_tr_base;
235 natural_width host_gdtr_base;
236 natural_width host_idtr_base;
237 natural_width host_ia32_sysenter_esp;
238 natural_width host_ia32_sysenter_eip;
239 natural_width host_rsp;
240 natural_width host_rip;
241 natural_width paddingl[8]; /* room for future expansion */
242 u32 pin_based_vm_exec_control;
243 u32 cpu_based_vm_exec_control;
244 u32 exception_bitmap;
245 u32 page_fault_error_code_mask;
246 u32 page_fault_error_code_match;
247 u32 cr3_target_count;
248 u32 vm_exit_controls;
249 u32 vm_exit_msr_store_count;
250 u32 vm_exit_msr_load_count;
251 u32 vm_entry_controls;
252 u32 vm_entry_msr_load_count;
253 u32 vm_entry_intr_info_field;
254 u32 vm_entry_exception_error_code;
255 u32 vm_entry_instruction_len;
256 u32 tpr_threshold;
257 u32 secondary_vm_exec_control;
258 u32 vm_instruction_error;
259 u32 vm_exit_reason;
260 u32 vm_exit_intr_info;
261 u32 vm_exit_intr_error_code;
262 u32 idt_vectoring_info_field;
263 u32 idt_vectoring_error_code;
264 u32 vm_exit_instruction_len;
265 u32 vmx_instruction_info;
266 u32 guest_es_limit;
267 u32 guest_cs_limit;
268 u32 guest_ss_limit;
269 u32 guest_ds_limit;
270 u32 guest_fs_limit;
271 u32 guest_gs_limit;
272 u32 guest_ldtr_limit;
273 u32 guest_tr_limit;
274 u32 guest_gdtr_limit;
275 u32 guest_idtr_limit;
276 u32 guest_es_ar_bytes;
277 u32 guest_cs_ar_bytes;
278 u32 guest_ss_ar_bytes;
279 u32 guest_ds_ar_bytes;
280 u32 guest_fs_ar_bytes;
281 u32 guest_gs_ar_bytes;
282 u32 guest_ldtr_ar_bytes;
283 u32 guest_tr_ar_bytes;
284 u32 guest_interruptibility_info;
285 u32 guest_activity_state;
286 u32 guest_sysenter_cs;
287 u32 host_ia32_sysenter_cs;
288 u32 padding32[8]; /* room for future expansion */
289 u16 virtual_processor_id;
290 u16 guest_es_selector;
291 u16 guest_cs_selector;
292 u16 guest_ss_selector;
293 u16 guest_ds_selector;
294 u16 guest_fs_selector;
295 u16 guest_gs_selector;
296 u16 guest_ldtr_selector;
297 u16 guest_tr_selector;
298 u16 host_es_selector;
299 u16 host_cs_selector;
300 u16 host_ss_selector;
301 u16 host_ds_selector;
302 u16 host_fs_selector;
303 u16 host_gs_selector;
304 u16 host_tr_selector;
305};
306
307/*
308 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
309 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
310 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
311 */
312#define VMCS12_REVISION 0x11e57ed0
313
314/*
315 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
316 * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
317 * current implementation, 4K are reserved to avoid future complications.
318 */
319#define VMCS12_SIZE 0x1000
320
321/* Used to remember the last vmcs02 used for some recently used vmcs12s */
322struct vmcs02_list {
323 struct list_head list;
324 gpa_t vmptr;
325 struct loaded_vmcs vmcs02;
326};
327
328/*
329 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
330 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
331 */
332struct nested_vmx {
333 /* Has the level1 guest done vmxon? */
334 bool vmxon;
335
336 /* The guest-physical address of the current VMCS L1 keeps for L2 */
337 gpa_t current_vmptr;
338 /* The host-usable pointer to the above */
339 struct page *current_vmcs12_page;
340 struct vmcs12 *current_vmcs12;
341
342 /* vmcs02_list cache of VMCSs recently used to run L2 guests */
343 struct list_head vmcs02_pool;
344 int vmcs02_num;
345 u64 vmcs01_tsc_offset;
346 /* L2 must run next, and mustn't decide to exit to L1. */
347 bool nested_run_pending;
348 /*
349 * Guest pages referred to in vmcs02 with host-physical pointers, so
350 * we must keep them pinned while L2 runs.
351 */
352 struct page *apic_access_page;
353};
354
125struct vcpu_vmx { 355struct vcpu_vmx {
126 struct kvm_vcpu vcpu; 356 struct kvm_vcpu vcpu;
127 struct list_head local_vcpus_link;
128 unsigned long host_rsp; 357 unsigned long host_rsp;
129 int launched;
130 u8 fail; 358 u8 fail;
131 u8 cpl; 359 u8 cpl;
132 bool nmi_known_unmasked; 360 bool nmi_known_unmasked;
@@ -140,7 +368,14 @@ struct vcpu_vmx {
140 u64 msr_host_kernel_gs_base; 368 u64 msr_host_kernel_gs_base;
141 u64 msr_guest_kernel_gs_base; 369 u64 msr_guest_kernel_gs_base;
142#endif 370#endif
143 struct vmcs *vmcs; 371 /*
372 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
373 * non-nested (L1) guest, it always points to vmcs01. For a nested
374 * guest (L2), it points to a different VMCS.
375 */
376 struct loaded_vmcs vmcs01;
377 struct loaded_vmcs *loaded_vmcs;
378 bool __launched; /* temporary, used in vmx_vcpu_run */
144 struct msr_autoload { 379 struct msr_autoload {
145 unsigned nr; 380 unsigned nr;
146 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; 381 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
@@ -176,6 +411,9 @@ struct vcpu_vmx {
176 u32 exit_reason; 411 u32 exit_reason;
177 412
178 bool rdtscp_enabled; 413 bool rdtscp_enabled;
414
415 /* Support for a guest hypervisor (nested VMX) */
416 struct nested_vmx nested;
179}; 417};
180 418
181enum segment_cache_field { 419enum segment_cache_field {
@@ -192,6 +430,174 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
192 return container_of(vcpu, struct vcpu_vmx, vcpu); 430 return container_of(vcpu, struct vcpu_vmx, vcpu);
193} 431}
194 432
433#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
434#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
435#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
436 [number##_HIGH] = VMCS12_OFFSET(name)+4
437
438static unsigned short vmcs_field_to_offset_table[] = {
439 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
440 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
441 FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
442 FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
443 FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
444 FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
445 FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
446 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
447 FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
448 FIELD(HOST_ES_SELECTOR, host_es_selector),
449 FIELD(HOST_CS_SELECTOR, host_cs_selector),
450 FIELD(HOST_SS_SELECTOR, host_ss_selector),
451 FIELD(HOST_DS_SELECTOR, host_ds_selector),
452 FIELD(HOST_FS_SELECTOR, host_fs_selector),
453 FIELD(HOST_GS_SELECTOR, host_gs_selector),
454 FIELD(HOST_TR_SELECTOR, host_tr_selector),
455 FIELD64(IO_BITMAP_A, io_bitmap_a),
456 FIELD64(IO_BITMAP_B, io_bitmap_b),
457 FIELD64(MSR_BITMAP, msr_bitmap),
458 FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
459 FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
460 FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
461 FIELD64(TSC_OFFSET, tsc_offset),
462 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
463 FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
464 FIELD64(EPT_POINTER, ept_pointer),
465 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
466 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
467 FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
468 FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
469 FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
470 FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
471 FIELD64(GUEST_PDPTR0, guest_pdptr0),
472 FIELD64(GUEST_PDPTR1, guest_pdptr1),
473 FIELD64(GUEST_PDPTR2, guest_pdptr2),
474 FIELD64(GUEST_PDPTR3, guest_pdptr3),
475 FIELD64(HOST_IA32_PAT, host_ia32_pat),
476 FIELD64(HOST_IA32_EFER, host_ia32_efer),
477 FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
478 FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
479 FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
480 FIELD(EXCEPTION_BITMAP, exception_bitmap),
481 FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
482 FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
483 FIELD(CR3_TARGET_COUNT, cr3_target_count),
484 FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
485 FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
486 FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
487 FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
488 FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
489 FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
490 FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
491 FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
492 FIELD(TPR_THRESHOLD, tpr_threshold),
493 FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
494 FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
495 FIELD(VM_EXIT_REASON, vm_exit_reason),
496 FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
497 FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
498 FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
499 FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
500 FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
501 FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
502 FIELD(GUEST_ES_LIMIT, guest_es_limit),
503 FIELD(GUEST_CS_LIMIT, guest_cs_limit),
504 FIELD(GUEST_SS_LIMIT, guest_ss_limit),
505 FIELD(GUEST_DS_LIMIT, guest_ds_limit),
506 FIELD(GUEST_FS_LIMIT, guest_fs_limit),
507 FIELD(GUEST_GS_LIMIT, guest_gs_limit),
508 FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
509 FIELD(GUEST_TR_LIMIT, guest_tr_limit),
510 FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
511 FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
512 FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
513 FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
514 FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
515 FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
516 FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
517 FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
518 FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
519 FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
520 FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
521 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
522 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
523 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
524 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
525 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
526 FIELD(CR0_READ_SHADOW, cr0_read_shadow),
527 FIELD(CR4_READ_SHADOW, cr4_read_shadow),
528 FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
529 FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
530 FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
531 FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
532 FIELD(EXIT_QUALIFICATION, exit_qualification),
533 FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
534 FIELD(GUEST_CR0, guest_cr0),
535 FIELD(GUEST_CR3, guest_cr3),
536 FIELD(GUEST_CR4, guest_cr4),
537 FIELD(GUEST_ES_BASE, guest_es_base),
538 FIELD(GUEST_CS_BASE, guest_cs_base),
539 FIELD(GUEST_SS_BASE, guest_ss_base),
540 FIELD(GUEST_DS_BASE, guest_ds_base),
541 FIELD(GUEST_FS_BASE, guest_fs_base),
542 FIELD(GUEST_GS_BASE, guest_gs_base),
543 FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
544 FIELD(GUEST_TR_BASE, guest_tr_base),
545 FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
546 FIELD(GUEST_IDTR_BASE, guest_idtr_base),
547 FIELD(GUEST_DR7, guest_dr7),
548 FIELD(GUEST_RSP, guest_rsp),
549 FIELD(GUEST_RIP, guest_rip),
550 FIELD(GUEST_RFLAGS, guest_rflags),
551 FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
552 FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
553 FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
554 FIELD(HOST_CR0, host_cr0),
555 FIELD(HOST_CR3, host_cr3),
556 FIELD(HOST_CR4, host_cr4),
557 FIELD(HOST_FS_BASE, host_fs_base),
558 FIELD(HOST_GS_BASE, host_gs_base),
559 FIELD(HOST_TR_BASE, host_tr_base),
560 FIELD(HOST_GDTR_BASE, host_gdtr_base),
561 FIELD(HOST_IDTR_BASE, host_idtr_base),
562 FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
563 FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
564 FIELD(HOST_RSP, host_rsp),
565 FIELD(HOST_RIP, host_rip),
566};
567static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
568
569static inline short vmcs_field_to_offset(unsigned long field)
570{
571 if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
572 return -1;
573 return vmcs_field_to_offset_table[field];
574}
575
576static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
577{
578 return to_vmx(vcpu)->nested.current_vmcs12;
579}
580
581static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
582{
583 struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
584 if (is_error_page(page)) {
585 kvm_release_page_clean(page);
586 return NULL;
587 }
588 return page;
589}
590
591static void nested_release_page(struct page *page)
592{
593 kvm_release_page_dirty(page);
594}
595
596static void nested_release_page_clean(struct page *page)
597{
598 kvm_release_page_clean(page);
599}
600
195static u64 construct_eptp(unsigned long root_hpa); 601static u64 construct_eptp(unsigned long root_hpa);
196static void kvm_cpu_vmxon(u64 addr); 602static void kvm_cpu_vmxon(u64 addr);
197static void kvm_cpu_vmxoff(void); 603static void kvm_cpu_vmxoff(void);
@@ -200,7 +606,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
200 606
201static DEFINE_PER_CPU(struct vmcs *, vmxarea); 607static DEFINE_PER_CPU(struct vmcs *, vmxarea);
202static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 608static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
203static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 609/*
610 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
611 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
612 */
613static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
204static DEFINE_PER_CPU(struct desc_ptr, host_gdt); 614static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
205 615
206static unsigned long *vmx_io_bitmap_a; 616static unsigned long *vmx_io_bitmap_a;
@@ -442,6 +852,35 @@ static inline bool report_flexpriority(void)
442 return flexpriority_enabled; 852 return flexpriority_enabled;
443} 853}
444 854
855static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
856{
857 return vmcs12->cpu_based_vm_exec_control & bit;
858}
859
860static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
861{
862 return (vmcs12->cpu_based_vm_exec_control &
863 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
864 (vmcs12->secondary_vm_exec_control & bit);
865}
866
867static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
868 struct kvm_vcpu *vcpu)
869{
870 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
871}
872
873static inline bool is_exception(u32 intr_info)
874{
875 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
876 == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
877}
878
879static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
880static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
881 struct vmcs12 *vmcs12,
882 u32 reason, unsigned long qualification);
883
445static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 884static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
446{ 885{
447 int i; 886 int i;
@@ -501,6 +940,13 @@ static void vmcs_clear(struct vmcs *vmcs)
501 vmcs, phys_addr); 940 vmcs, phys_addr);
502} 941}
503 942
943static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
944{
	/* VMCLEAR the region, then mark it as loaded on no CPU and never launched. */
945	vmcs_clear(loaded_vmcs->vmcs);
946	loaded_vmcs->cpu = -1;
947	loaded_vmcs->launched = 0;
948}
949
504static void vmcs_load(struct vmcs *vmcs) 950static void vmcs_load(struct vmcs *vmcs)
505{ 951{
506 u64 phys_addr = __pa(vmcs); 952 u64 phys_addr = __pa(vmcs);
@@ -510,29 +956,28 @@ static void vmcs_load(struct vmcs *vmcs)
510 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) 956 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
511 : "cc", "memory"); 957 : "cc", "memory");
512 if (error) 958 if (error)
513 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 959 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
514 vmcs, phys_addr); 960 vmcs, phys_addr);
515} 961}
516 962
517static void __vcpu_clear(void *arg) 963static void __loaded_vmcs_clear(void *arg)
518{ 964{
519 struct vcpu_vmx *vmx = arg; 965 struct loaded_vmcs *loaded_vmcs = arg;
520 int cpu = raw_smp_processor_id(); 966 int cpu = raw_smp_processor_id();
521 967
522 if (vmx->vcpu.cpu == cpu) 968 if (loaded_vmcs->cpu != cpu)
523 vmcs_clear(vmx->vmcs); 969 return; /* vcpu migration can race with cpu offline */
524 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 970 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
525 per_cpu(current_vmcs, cpu) = NULL; 971 per_cpu(current_vmcs, cpu) = NULL;
526 list_del(&vmx->local_vcpus_link); 972 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
527 vmx->vcpu.cpu = -1; 973 loaded_vmcs_init(loaded_vmcs);
528 vmx->launched = 0;
529} 974}
530 975
531static void vcpu_clear(struct vcpu_vmx *vmx) 976static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
532{ 977{
533 if (vmx->vcpu.cpu == -1) 978 if (loaded_vmcs->cpu != -1)
534 return; 979 smp_call_function_single(
535 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); 980 loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
536} 981}
537 982
538static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) 983static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -585,26 +1030,26 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
585 } 1030 }
586} 1031}
587 1032
588static unsigned long vmcs_readl(unsigned long field) 1033static __always_inline unsigned long vmcs_readl(unsigned long field)
589{ 1034{
590 unsigned long value = 0; 1035 unsigned long value;
591 1036
592 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) 1037 asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
593 : "+a"(value) : "d"(field) : "cc"); 1038 : "=a"(value) : "d"(field) : "cc");
594 return value; 1039 return value;
595} 1040}
596 1041
597static u16 vmcs_read16(unsigned long field) 1042static __always_inline u16 vmcs_read16(unsigned long field)
598{ 1043{
599 return vmcs_readl(field); 1044 return vmcs_readl(field);
600} 1045}
601 1046
602static u32 vmcs_read32(unsigned long field) 1047static __always_inline u32 vmcs_read32(unsigned long field)
603{ 1048{
604 return vmcs_readl(field); 1049 return vmcs_readl(field);
605} 1050}
606 1051
607static u64 vmcs_read64(unsigned long field) 1052static __always_inline u64 vmcs_read64(unsigned long field)
608{ 1053{
609#ifdef CONFIG_X86_64 1054#ifdef CONFIG_X86_64
610 return vmcs_readl(field); 1055 return vmcs_readl(field);
@@ -731,6 +1176,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
731 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 1176 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
732 if (vcpu->fpu_active) 1177 if (vcpu->fpu_active)
733 eb &= ~(1u << NM_VECTOR); 1178 eb &= ~(1u << NM_VECTOR);
1179
1180 /* When we are running a nested L2 guest and L1 specified for it a
1181 * certain exception bitmap, we must trap the same exceptions and pass
1182 * them to L1. When running L2, we will only handle the exceptions
1183 * specified above if L1 did not want them.
1184 */
1185 if (is_guest_mode(vcpu))
1186 eb |= get_vmcs12(vcpu)->exception_bitmap;
1187
734 vmcs_write32(EXCEPTION_BITMAP, eb); 1188 vmcs_write32(EXCEPTION_BITMAP, eb);
735} 1189}
736 1190
@@ -971,22 +1425,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
971 1425
972 if (!vmm_exclusive) 1426 if (!vmm_exclusive)
973 kvm_cpu_vmxon(phys_addr); 1427 kvm_cpu_vmxon(phys_addr);
974 else if (vcpu->cpu != cpu) 1428 else if (vmx->loaded_vmcs->cpu != cpu)
975 vcpu_clear(vmx); 1429 loaded_vmcs_clear(vmx->loaded_vmcs);
976 1430
977 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 1431 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
978 per_cpu(current_vmcs, cpu) = vmx->vmcs; 1432 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
979 vmcs_load(vmx->vmcs); 1433 vmcs_load(vmx->loaded_vmcs->vmcs);
980 } 1434 }
981 1435
982 if (vcpu->cpu != cpu) { 1436 if (vmx->loaded_vmcs->cpu != cpu) {
983 struct desc_ptr *gdt = &__get_cpu_var(host_gdt); 1437 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
984 unsigned long sysenter_esp; 1438 unsigned long sysenter_esp;
985 1439
986 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1440 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
987 local_irq_disable(); 1441 local_irq_disable();
988 list_add(&vmx->local_vcpus_link, 1442 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
989 &per_cpu(vcpus_on_cpu, cpu)); 1443 &per_cpu(loaded_vmcss_on_cpu, cpu));
990 local_irq_enable(); 1444 local_irq_enable();
991 1445
992 /* 1446 /*
@@ -998,6 +1452,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
998 1452
999 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 1453 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1000 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 1454 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1455 vmx->loaded_vmcs->cpu = cpu;
1001 } 1456 }
1002} 1457}
1003 1458
@@ -1005,7 +1460,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1005{ 1460{
1006 __vmx_load_host_state(to_vmx(vcpu)); 1461 __vmx_load_host_state(to_vmx(vcpu));
1007 if (!vmm_exclusive) { 1462 if (!vmm_exclusive) {
1008 __vcpu_clear(to_vmx(vcpu)); 1463 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
1464 vcpu->cpu = -1;
1009 kvm_cpu_vmxoff(); 1465 kvm_cpu_vmxoff();
1010 } 1466 }
1011} 1467}
@@ -1023,19 +1479,55 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
1023 vmcs_writel(GUEST_CR0, cr0); 1479 vmcs_writel(GUEST_CR0, cr0);
1024 update_exception_bitmap(vcpu); 1480 update_exception_bitmap(vcpu);
1025 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 1481 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
1482 if (is_guest_mode(vcpu))
1483 vcpu->arch.cr0_guest_owned_bits &=
1484 ~get_vmcs12(vcpu)->cr0_guest_host_mask;
1026 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 1485 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1027} 1486}
1028 1487
1029static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); 1488static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
1030 1489
1490/*
1491 * Return the cr0 value that a nested guest would read. This is a combination
1492 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
1493 * its hypervisor (cr0_read_shadow).
1494 */
1495static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
1496{
1497 return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
1498 (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
1499}
1500static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
1501{
1502 return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
1503 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
1504}
1505
1031static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) 1506static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
1032{ 1507{
1508 /* Note that there is no vcpu->fpu_active = 0 here. The caller must
1509 * set this *before* calling this function.
1510 */
1033 vmx_decache_cr0_guest_bits(vcpu); 1511 vmx_decache_cr0_guest_bits(vcpu);
1034 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); 1512 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
1035 update_exception_bitmap(vcpu); 1513 update_exception_bitmap(vcpu);
1036 vcpu->arch.cr0_guest_owned_bits = 0; 1514 vcpu->arch.cr0_guest_owned_bits = 0;
1037 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 1515 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1038 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 1516 if (is_guest_mode(vcpu)) {
1517 /*
1518 * L1's specified read shadow might not contain the TS bit,
1519 * so now that we turned on shadowing of this bit, we need to
1520 * set this bit of the shadow. Like in nested_vmx_run we need
1521 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
1522 * up-to-date here because we just decached cr0.TS (and we'll
1523 * only update vmcs12->guest_cr0 on nested exit).
1524 */
1525 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1526 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
1527 (vcpu->arch.cr0 & X86_CR0_TS);
1528 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
1529 } else
1530 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1039} 1531}
1040 1532
1041static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1533static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -1119,6 +1611,25 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1119 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 1611 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1120} 1612}
1121 1613
1614/*
1615 * KVM wants to inject page-faults which it got to the guest. This function
1616 * checks whether in a nested guest, we need to inject them to L1 or L2.
1617 * This function assumes it is called with the exit reason in vmcs02 being
1618 * a #PF exception (this is the only case in which KVM injects a #PF when L2
1619 * is running).
1620 */
1621static int nested_pf_handled(struct kvm_vcpu *vcpu)
1622{
1623 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1624
1625 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
1626 if (!(vmcs12->exception_bitmap & PF_VECTOR))
1627 return 0;
1628
1629 nested_vmx_vmexit(vcpu);
1630 return 1;
1631}
1632
1122static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 1633static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1123 bool has_error_code, u32 error_code, 1634 bool has_error_code, u32 error_code,
1124 bool reinject) 1635 bool reinject)
@@ -1126,6 +1637,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1126 struct vcpu_vmx *vmx = to_vmx(vcpu); 1637 struct vcpu_vmx *vmx = to_vmx(vcpu);
1127 u32 intr_info = nr | INTR_INFO_VALID_MASK; 1638 u32 intr_info = nr | INTR_INFO_VALID_MASK;
1128 1639
1640 if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
1641 nested_pf_handled(vcpu))
1642 return;
1643
1129 if (has_error_code) { 1644 if (has_error_code) {
1130 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); 1645 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1131 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 1646 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -1248,12 +1763,24 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1248static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1763static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1249{ 1764{
1250 vmcs_write64(TSC_OFFSET, offset); 1765 vmcs_write64(TSC_OFFSET, offset);
1766 if (is_guest_mode(vcpu))
1767 /*
1768 * We're here if L1 chose not to trap the TSC MSR. Since
1769 * prepare_vmcs12() does not copy tsc_offset, we need to also
1770 * set the vmcs12 field here.
1771 */
1772 get_vmcs12(vcpu)->tsc_offset = offset -
1773 to_vmx(vcpu)->nested.vmcs01_tsc_offset;
1251} 1774}
1252 1775
1253static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 1776static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
1254{ 1777{
1255 u64 offset = vmcs_read64(TSC_OFFSET); 1778 u64 offset = vmcs_read64(TSC_OFFSET);
1256 vmcs_write64(TSC_OFFSET, offset + adjustment); 1779 vmcs_write64(TSC_OFFSET, offset + adjustment);
1780 if (is_guest_mode(vcpu)) {
1781 /* Even when running L2, the adjustment needs to apply to L1 */
1782 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
1783 }
1257} 1784}
1258 1785
1259static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 1786static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
@@ -1261,6 +1788,236 @@ static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1261 return target_tsc - native_read_tsc(); 1788 return target_tsc - native_read_tsc();
1262} 1789}
1263 1790
1791static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
1792{
1793 struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
1794 return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
1795}
1796
1797/*
1798 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1799 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1800 * all guests if the "nested" module option is off, and can also be disabled
1801 * for a single guest by disabling its VMX cpuid bit.
1802 */
1803static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
1804{
1805 return nested && guest_cpuid_has_vmx(vcpu);
1806}
1807
1808/*
1809 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
1810 * returned for the various VMX controls MSRs when nested VMX is enabled.
1811 * The same values should also be used to verify that vmcs12 control fields are
1812 * valid during nested entry from L1 to L2.
1813 * Each of these control msrs has a low and high 32-bit half: A low bit is on
1814 * if the corresponding bit in the (32-bit) control field *must* be on, and a
1815 * bit in the high half is on if the corresponding bit in the control field
1816 * may be on. See also vmx_control_verify().
1817 * TODO: allow these variables to be modified (downgraded) by module options
1818 * or other means.
1819 */
/*
 * The values reported to L1 for the VMX capability control MSRs.  For each
 * control field, "low" holds the bits that must be 1 and "high" the bits
 * that may be 1 (see vmx_control_verify() and vmx_control_msr()).
 */
1820static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
1821static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
1822static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
1823static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
1824static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
1825static __init void nested_vmx_setup_ctls_msrs(void)
1826{
1827	/*
1828	 * Note that as a general rule, the high half of the MSRs (bits in
1829	 * the control fields which may be 1) should be initialized by the
1830	 * intersection of the underlying hardware's MSR (i.e., features which
1831	 * can be supported) and the list of features we want to expose -
1832	 * because they are known to be properly supported in our code.
1833	 * Also, usually, the low half of the MSRs (bits which must be 1) can
1834	 * be set to 0, meaning that L1 may turn off any of these bits. The
1835	 * reason is that if one of these bits is necessary, it will appear
1836	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
1837	 * fields of vmcs01 and vmcs02, will turn these bits off - and
1838	 * nested_vmx_exit_handled() will not pass related exits to L1.
1839	 * These rules have exceptions below.
1840	 */
1841
1842	/* pin-based controls */
1843	/*
1844	 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
1845	 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
1846	 */
1847	nested_vmx_pinbased_ctls_low = 0x16 ;
1848	nested_vmx_pinbased_ctls_high = 0x16 |
1849		PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
1850		PIN_BASED_VIRTUAL_NMIS;
1851
1852	/* exit controls */
1853	nested_vmx_exit_ctls_low = 0;
1854	/* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
1855#ifdef CONFIG_X86_64
1856	nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
1857#else
1858	nested_vmx_exit_ctls_high = 0;
1859#endif
1860
	/* For entry/proc/secondary controls: read the hardware MSR first,
	 * then mask "high" down to the features this code knows how to
	 * emulate for L1. */
1861	/* entry controls */
1862	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
1863		nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
1864	nested_vmx_entry_ctls_low = 0;
1865	nested_vmx_entry_ctls_high &=
1866		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
1867
1868	/* cpu-based controls */
1869	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
1870		nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
1871	nested_vmx_procbased_ctls_low = 0;
1872	nested_vmx_procbased_ctls_high &=
1873		CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
1874		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
1875		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
1876		CPU_BASED_CR3_STORE_EXITING |
1877#ifdef CONFIG_X86_64
1878		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
1879#endif
1880		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
1881		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
1882		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1883	/*
1884	 * We can allow some features even when not supported by the
1885	 * hardware. For example, L1 can specify an MSR bitmap - and we
1886	 * can use it to avoid exits to L1 - even when L0 runs L2
1887	 * without MSR bitmaps.
1888	 */
1889	nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
1890
1891	/* secondary cpu-based controls */
1892	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
1893		nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
1894	nested_vmx_secondary_ctls_low = 0;
1895	nested_vmx_secondary_ctls_high &=
1896		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1897}
1898
/*
 * Validate a 32-bit control value against its capability MSR halves:
 * every bit set in "low" must be 1 in control, and no bit outside "high"
 * may be 1.  Equivalently, masking control to the allowed-1 set and then
 * forcing the must-be-1 set must reproduce control exactly.
 */
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	u32 reconstructed = (control & high) | low;

	return reconstructed == control;
}
1906
/* Pack the required-1 (low) and allowed-1 (high) halves into one 64-bit MSR value. */
static inline u64 vmx_control_msr(u32 low, u32 high)
{
	u64 value = high;

	return (value << 32) | low;
}
1911
1912/*
1913 * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
1914 * also let it use VMX-specific MSRs.
1915 * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
1916 * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
1917 * like all other MSRs).
1918 */
1919static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1920{
	/*
	 * Reads of VMX capability MSRs by a guest whose VMX bit is off must
	 * #GP(0), per the spec; we queue the fault and report "handled".
	 */
1921	if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
1922		msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
1923		/*
1924		 * According to the spec, processors which do not support VMX
1925		 * should throw a #GP(0) when VMX capability MSRs are read.
1926		 */
1927		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
1928		return 1;
1929	}
1930
1931	switch (msr_index) {
1932	case MSR_IA32_FEATURE_CONTROL:
1933		*pdata = 0;
1934		break;
1935	case MSR_IA32_VMX_BASIC:
1936		/*
1937		 * This MSR reports some information about VMX support. We
1938		 * should return information about the VMX we emulate for the
1939		 * guest, and the VMCS structure we give it - not about the
1940		 * VMX support of the underlying hardware.
1941		 */
1942		*pdata = VMCS12_REVISION |
1943			((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
1944			(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
1945		break;
	/*
	 * TRUE_* and plain variants report the same values because bit 55 of
	 * our emulated VMX_BASIC is 0 (see nested_vmx_setup_ctls_msrs()).
	 */
1946	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1947	case MSR_IA32_VMX_PINBASED_CTLS:
1948		*pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
1949					nested_vmx_pinbased_ctls_high);
1950		break;
1951	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1952	case MSR_IA32_VMX_PROCBASED_CTLS:
1953		*pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
1954					nested_vmx_procbased_ctls_high);
1955		break;
1956	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1957	case MSR_IA32_VMX_EXIT_CTLS:
1958		*pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
1959					nested_vmx_exit_ctls_high);
1960		break;
1961	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1962	case MSR_IA32_VMX_ENTRY_CTLS:
1963		*pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
1964					nested_vmx_entry_ctls_high);
1965		break;
1966	case MSR_IA32_VMX_MISC:
1967		*pdata = 0;
1968		break;
1969	/*
1970	 * These MSRs specify bits which the guest must keep fixed (on or off)
1971	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
1972	 * We picked the standard core2 setting.
1973	 */
1974#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
1975#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
1976	case MSR_IA32_VMX_CR0_FIXED0:
1977		*pdata = VMXON_CR0_ALWAYSON;
1978		break;
1979	case MSR_IA32_VMX_CR0_FIXED1:
1980		*pdata = -1ULL;
1981		break;
1982	case MSR_IA32_VMX_CR4_FIXED0:
1983		*pdata = VMXON_CR4_ALWAYSON;
1984		break;
1985	case MSR_IA32_VMX_CR4_FIXED1:
1986		*pdata = -1ULL;
1987		break;
1988	case MSR_IA32_VMX_VMCS_ENUM:
		/* presumably the highest VMCS field index we enumerate — verify against SDM */
1989		*pdata = 0x1f;
1990		break;
1991	case MSR_IA32_VMX_PROCBASED_CTLS2:
1992		*pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
1993			nested_vmx_secondary_ctls_high);
1994		break;
1995	case MSR_IA32_VMX_EPT_VPID_CAP:
1996		/* Currently, no nested ept or nested vpid */
1997		*pdata = 0;
1998		break;
1999	default:
2000		return 0;
2001	}
2002
2003	return 1;
2004}
2005
2006static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2007{
2008 if (!nested_vmx_allowed(vcpu))
2009 return 0;
2010
2011 if (msr_index == MSR_IA32_FEATURE_CONTROL)
2012 /* TODO: the right thing. */
2013 return 1;
2014 /*
2015 * No need to treat VMX capability MSRs specially: If we don't handle
2016 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
2017 */
2018 return 0;
2019}
2020
1264/* 2021/*
1265 * Reads an msr value (of 'msr_index') into 'pdata'. 2022 * Reads an msr value (of 'msr_index') into 'pdata'.
1266 * Returns 0 on success, non-0 otherwise. 2023 * Returns 0 on success, non-0 otherwise.
@@ -1309,6 +2066,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1309 /* Otherwise falls through */ 2066 /* Otherwise falls through */
1310 default: 2067 default:
1311 vmx_load_host_state(to_vmx(vcpu)); 2068 vmx_load_host_state(to_vmx(vcpu));
2069 if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
2070 return 0;
1312 msr = find_msr_entry(to_vmx(vcpu), msr_index); 2071 msr = find_msr_entry(to_vmx(vcpu), msr_index);
1313 if (msr) { 2072 if (msr) {
1314 vmx_load_host_state(to_vmx(vcpu)); 2073 vmx_load_host_state(to_vmx(vcpu));
@@ -1380,6 +2139,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1380 return 1; 2139 return 1;
1381 /* Otherwise falls through */ 2140 /* Otherwise falls through */
1382 default: 2141 default:
2142 if (vmx_set_vmx_msr(vcpu, msr_index, data))
2143 break;
1383 msr = find_msr_entry(vmx, msr_index); 2144 msr = find_msr_entry(vmx, msr_index);
1384 if (msr) { 2145 if (msr) {
1385 vmx_load_host_state(vmx); 2146 vmx_load_host_state(vmx);
@@ -1469,7 +2230,7 @@ static int hardware_enable(void *garbage)
1469 if (read_cr4() & X86_CR4_VMXE) 2230 if (read_cr4() & X86_CR4_VMXE)
1470 return -EBUSY; 2231 return -EBUSY;
1471 2232
1472 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 2233 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
1473 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2234 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1474 2235
1475 test_bits = FEATURE_CONTROL_LOCKED; 2236 test_bits = FEATURE_CONTROL_LOCKED;
@@ -1493,14 +2254,14 @@ static int hardware_enable(void *garbage)
1493 return 0; 2254 return 0;
1494} 2255}
1495 2256
1496static void vmclear_local_vcpus(void) 2257static void vmclear_local_loaded_vmcss(void)
1497{ 2258{
1498 int cpu = raw_smp_processor_id(); 2259 int cpu = raw_smp_processor_id();
1499 struct vcpu_vmx *vmx, *n; 2260 struct loaded_vmcs *v, *n;
1500 2261
1501 list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), 2262 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
1502 local_vcpus_link) 2263 loaded_vmcss_on_cpu_link)
1503 __vcpu_clear(vmx); 2264 __loaded_vmcs_clear(v);
1504} 2265}
1505 2266
1506 2267
@@ -1515,7 +2276,7 @@ static void kvm_cpu_vmxoff(void)
1515static void hardware_disable(void *garbage) 2276static void hardware_disable(void *garbage)
1516{ 2277{
1517 if (vmm_exclusive) { 2278 if (vmm_exclusive) {
1518 vmclear_local_vcpus(); 2279 vmclear_local_loaded_vmcss();
1519 kvm_cpu_vmxoff(); 2280 kvm_cpu_vmxoff();
1520 } 2281 }
1521 write_cr4(read_cr4() & ~X86_CR4_VMXE); 2282 write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -1696,6 +2457,18 @@ static void free_vmcs(struct vmcs *vmcs)
1696 free_pages((unsigned long)vmcs, vmcs_config.order); 2457 free_pages((unsigned long)vmcs, vmcs_config.order);
1697} 2458}
1698 2459
2460/*
2461 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2462 */
2463static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2464{
2465 if (!loaded_vmcs->vmcs)
2466 return;
2467 loaded_vmcs_clear(loaded_vmcs);
2468 free_vmcs(loaded_vmcs->vmcs);
2469 loaded_vmcs->vmcs = NULL;
2470}
2471
1699static void free_kvm_area(void) 2472static void free_kvm_area(void)
1700{ 2473{
1701 int cpu; 2474 int cpu;
@@ -1756,6 +2529,9 @@ static __init int hardware_setup(void)
1756 if (!cpu_has_vmx_ple()) 2529 if (!cpu_has_vmx_ple())
1757 ple_gap = 0; 2530 ple_gap = 0;
1758 2531
2532 if (nested)
2533 nested_vmx_setup_ctls_msrs();
2534
1759 return alloc_kvm_area(); 2535 return alloc_kvm_area();
1760} 2536}
1761 2537
@@ -2041,7 +2817,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
2041 (unsigned long *)&vcpu->arch.regs_dirty); 2817 (unsigned long *)&vcpu->arch.regs_dirty);
2042} 2818}
2043 2819
2044static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 2820static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
2045 2821
2046static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, 2822static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
2047 unsigned long cr0, 2823 unsigned long cr0,
@@ -2139,11 +2915,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
2139 vmcs_writel(GUEST_CR3, guest_cr3); 2915 vmcs_writel(GUEST_CR3, guest_cr3);
2140} 2916}
2141 2917
2142static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 2918static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2143{ 2919{
2144 unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? 2920 unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
2145 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 2921 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
2146 2922
2923 if (cr4 & X86_CR4_VMXE) {
2924 /*
2925 * To use VMXON (and later other VMX instructions), a guest
2926 * must first be able to turn on cr4.VMXE (see handle_vmon()).
2927 * So basically the check on whether to allow nested VMX
2928 * is here.
2929 */
2930 if (!nested_vmx_allowed(vcpu))
2931 return 1;
2932 } else if (to_vmx(vcpu)->nested.vmxon)
2933 return 1;
2934
2147 vcpu->arch.cr4 = cr4; 2935 vcpu->arch.cr4 = cr4;
2148 if (enable_ept) { 2936 if (enable_ept) {
2149 if (!is_paging(vcpu)) { 2937 if (!is_paging(vcpu)) {
@@ -2156,6 +2944,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2156 2944
2157 vmcs_writel(CR4_READ_SHADOW, cr4); 2945 vmcs_writel(CR4_READ_SHADOW, cr4);
2158 vmcs_writel(GUEST_CR4, hw_cr4); 2946 vmcs_writel(GUEST_CR4, hw_cr4);
2947 return 0;
2159} 2948}
2160 2949
2161static void vmx_get_segment(struct kvm_vcpu *vcpu, 2950static void vmx_get_segment(struct kvm_vcpu *vcpu,
@@ -2721,18 +3510,110 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2721} 3510}
2722 3511
2723/* 3512/*
3513 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
3514 * will not change in the lifetime of the guest.
3515 * Note that host-state that does change is set elsewhere. E.g., host-state
3516 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
3517 */
	/*
	 * Program the host-state VMCS fields that never change for the life
	 * of the guest (per-CPU host state is set in vmx_vcpu_load() instead).
	 * NOTE(review): the "22.2.x" tags appear to reference Intel SDM
	 * sections on VM-entry host-state checks — confirm edition.
	 */
3518static void vmx_set_constant_host_state(void)
3519{
3520	u32 low32, high32;
3521	unsigned long tmpl;
3522	struct desc_ptr dt;
3523
3524	vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
3525	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
3526	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
3527
3528	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
3529	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
3530	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
3531	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
3532	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
3533
3534	native_store_idt(&dt);
3535	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
3536
	/* Host resumes at the .Lkvm_vmx_return label after every VM-exit. */
3537	asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
3538	vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
3539
3540	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
3541	vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
3542	rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
3543	vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
3544
3545	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
3546		rdmsr(MSR_IA32_CR_PAT, low32, high32);
3547		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
3548	}
3549}
3550
3551static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
3552{
3553 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
3554 if (enable_ept)
3555 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
3556 if (is_guest_mode(&vmx->vcpu))
3557 vmx->vcpu.arch.cr4_guest_owned_bits &=
3558 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
3559 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
3560}
3561
3562static u32 vmx_exec_control(struct vcpu_vmx *vmx)
3563{
3564 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
3565 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
3566 exec_control &= ~CPU_BASED_TPR_SHADOW;
3567#ifdef CONFIG_X86_64
3568 exec_control |= CPU_BASED_CR8_STORE_EXITING |
3569 CPU_BASED_CR8_LOAD_EXITING;
3570#endif
3571 }
3572 if (!enable_ept)
3573 exec_control |= CPU_BASED_CR3_STORE_EXITING |
3574 CPU_BASED_CR3_LOAD_EXITING |
3575 CPU_BASED_INVLPG_EXITING;
3576 return exec_control;
3577}
3578
	/*
	 * Compute the secondary VM-execution controls for this vcpu from the
	 * probed vmcs_config, dropping features that are disabled or
	 * unusable for this VM.
	 */
3579static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3580{
3581	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
3582	if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
3583		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
3584	if (vmx->vpid == 0)
3585		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
3586	if (!enable_ept) {
3587		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
		/* Side effect: unrestricted guest requires EPT, so the
		 * module-wide flag is cleared here for everyone. */
3588		enable_unrestricted_guest = 0;
3589	}
3590	if (!enable_unrestricted_guest)
3591		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
3592	if (!ple_gap)
3593		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
3594	return exec_control;
3595}
3596
	/* Tell the MMU which spte bit pattern marks an MMIO mapping under EPT. */
3597static void ept_set_mmio_spte_mask(void)
3598{
3599	/*
3600	 * EPT Misconfigurations can be generated if the value of bits 2:0
3601	 * of an EPT paging-structure entry is 110b (write/execute).
3602	 * Also, magic bits (0xffull << 49) is set to quickly identify mmio
3603	 * spte.
3604	 */
3605	kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
3606}
3607
3608/*
2724 * Sets up the vmcs for emulated real mode. 3609 * Sets up the vmcs for emulated real mode.
2725 */ 3610 */
2726static int vmx_vcpu_setup(struct vcpu_vmx *vmx) 3611static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2727{ 3612{
2728 u32 host_sysenter_cs, msr_low, msr_high; 3613#ifdef CONFIG_X86_64
2729 u32 junk;
2730 u64 host_pat;
2731 unsigned long a; 3614 unsigned long a;
2732 struct desc_ptr dt; 3615#endif
2733 int i; 3616 int i;
2734 unsigned long kvm_vmx_return;
2735 u32 exec_control;
2736 3617
2737 /* I/O */ 3618 /* I/O */
2738 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); 3619 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
@@ -2747,36 +3628,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2747 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, 3628 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
2748 vmcs_config.pin_based_exec_ctrl); 3629 vmcs_config.pin_based_exec_ctrl);
2749 3630
2750 exec_control = vmcs_config.cpu_based_exec_ctrl; 3631 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
2751 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
2752 exec_control &= ~CPU_BASED_TPR_SHADOW;
2753#ifdef CONFIG_X86_64
2754 exec_control |= CPU_BASED_CR8_STORE_EXITING |
2755 CPU_BASED_CR8_LOAD_EXITING;
2756#endif
2757 }
2758 if (!enable_ept)
2759 exec_control |= CPU_BASED_CR3_STORE_EXITING |
2760 CPU_BASED_CR3_LOAD_EXITING |
2761 CPU_BASED_INVLPG_EXITING;
2762 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
2763 3632
2764 if (cpu_has_secondary_exec_ctrls()) { 3633 if (cpu_has_secondary_exec_ctrls()) {
2765 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 3634 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
2766 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) 3635 vmx_secondary_exec_control(vmx));
2767 exec_control &=
2768 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2769 if (vmx->vpid == 0)
2770 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2771 if (!enable_ept) {
2772 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2773 enable_unrestricted_guest = 0;
2774 }
2775 if (!enable_unrestricted_guest)
2776 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2777 if (!ple_gap)
2778 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
2779 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2780 } 3636 }
2781 3637
2782 if (ple_gap) { 3638 if (ple_gap) {
@@ -2784,20 +3640,13 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2784 vmcs_write32(PLE_WINDOW, ple_window); 3640 vmcs_write32(PLE_WINDOW, ple_window);
2785 } 3641 }
2786 3642
2787 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); 3643 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2788 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 3644 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2789 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 3645 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
2790 3646
2791 vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
2792 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
2793 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
2794
2795 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
2796 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
2797 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
2798 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 3647 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
2799 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 3648 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
2800 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3649 vmx_set_constant_host_state();
2801#ifdef CONFIG_X86_64 3650#ifdef CONFIG_X86_64
2802 rdmsrl(MSR_FS_BASE, a); 3651 rdmsrl(MSR_FS_BASE, a);
2803 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ 3652 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -2808,32 +3657,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2808 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 3657 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
2809#endif 3658#endif
2810 3659
2811 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
2812
2813 native_store_idt(&dt);
2814 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
2815
2816 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
2817 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
2818 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 3660 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
2819 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3661 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2820 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); 3662 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
2821 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3663 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2822 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); 3664 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
2823 3665
2824 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
2825 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
2826 rdmsrl(MSR_IA32_SYSENTER_ESP, a);
2827 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
2828 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
2829 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
2830
2831 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
2832 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2833 host_pat = msr_low | ((u64) msr_high << 32);
2834 vmcs_write64(HOST_IA32_PAT, host_pat);
2835 }
2836 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 3666 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
3667 u32 msr_low, msr_high;
3668 u64 host_pat;
2837 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); 3669 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2838 host_pat = msr_low | ((u64) msr_high << 32); 3670 host_pat = msr_low | ((u64) msr_high << 32);
2839 /* Write the default value follow host pat */ 3671 /* Write the default value follow host pat */
@@ -2863,10 +3695,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2863 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); 3695 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2864 3696
2865 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 3697 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2866 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; 3698 set_cr4_guest_host_mask(vmx);
2867 if (enable_ept)
2868 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2869 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2870 3699
2871 kvm_write_tsc(&vmx->vcpu, 0); 3700 kvm_write_tsc(&vmx->vcpu, 0);
2872 3701
@@ -2990,9 +3819,25 @@ out:
2990 return ret; 3819 return ret;
2991} 3820}
2992 3821
3822/*
3823 * In nested virtualization, check if L1 asked to exit on external interrupts.
3824 * For most existing hypervisors, this will always return true.
3825 */
3826static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
3827{
3828 return get_vmcs12(vcpu)->pin_based_vm_exec_control &
3829 PIN_BASED_EXT_INTR_MASK;
3830}
3831
2993static void enable_irq_window(struct kvm_vcpu *vcpu) 3832static void enable_irq_window(struct kvm_vcpu *vcpu)
2994{ 3833{
2995 u32 cpu_based_vm_exec_control; 3834 u32 cpu_based_vm_exec_control;
3835 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
3836 /* We can get here when nested_run_pending caused
3837 * vmx_interrupt_allowed() to return false. In this case, do
3838 * nothing - the interrupt will be injected later.
3839 */
3840 return;
2996 3841
2997 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 3842 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2998 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; 3843 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -3049,6 +3894,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
3049{ 3894{
3050 struct vcpu_vmx *vmx = to_vmx(vcpu); 3895 struct vcpu_vmx *vmx = to_vmx(vcpu);
3051 3896
3897 if (is_guest_mode(vcpu))
3898 return;
3899
3052 if (!cpu_has_virtual_nmis()) { 3900 if (!cpu_has_virtual_nmis()) {
3053 /* 3901 /*
3054 * Tracking the NMI-blocked state in software is built upon 3902 * Tracking the NMI-blocked state in software is built upon
@@ -3115,6 +3963,17 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3115 3963
3116static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 3964static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
3117{ 3965{
3966 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
3967 struct vmcs12 *vmcs12;
3968 if (to_vmx(vcpu)->nested.nested_run_pending)
3969 return 0;
3970 nested_vmx_vmexit(vcpu);
3971 vmcs12 = get_vmcs12(vcpu);
3972 vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
3973 vmcs12->vm_exit_intr_info = 0;
3974 /* fall through to normal code, but now in L1, not L2 */
3975 }
3976
3118 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 3977 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
3119 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3978 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3120 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 3979 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -3356,6 +4215,58 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3356 hypercall[2] = 0xc1; 4215 hypercall[2] = 0xc1;
3357} 4216}
3358 4217
4218/* called to set cr0 as approriate for a mov-to-cr0 exit. */
4219static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4220{
4221 if (to_vmx(vcpu)->nested.vmxon &&
4222 ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
4223 return 1;
4224
4225 if (is_guest_mode(vcpu)) {
4226 /*
4227 * We get here when L2 changed cr0 in a way that did not change
4228 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
4229 * but did change L0 shadowed bits. This can currently happen
4230 * with the TS bit: L0 may want to leave TS on (for lazy fpu
4231 * loading) while pretending to allow the guest to change it.
4232 */
4233 if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
4234 (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
4235 return 1;
4236 vmcs_writel(CR0_READ_SHADOW, val);
4237 return 0;
4238 } else
4239 return kvm_set_cr0(vcpu, val);
4240}
4241
4242static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
4243{
4244 if (is_guest_mode(vcpu)) {
4245 if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
4246 (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
4247 return 1;
4248 vmcs_writel(CR4_READ_SHADOW, val);
4249 return 0;
4250 } else
4251 return kvm_set_cr4(vcpu, val);
4252}
4253
4254/* called to set cr0 as approriate for clts instruction exit. */
4255static void handle_clts(struct kvm_vcpu *vcpu)
4256{
4257 if (is_guest_mode(vcpu)) {
4258 /*
4259 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
4260 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
4261 * just pretend it's off (also in arch.cr0 for fpu_activate).
4262 */
4263 vmcs_writel(CR0_READ_SHADOW,
4264 vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
4265 vcpu->arch.cr0 &= ~X86_CR0_TS;
4266 } else
4267 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
4268}
4269
3359static int handle_cr(struct kvm_vcpu *vcpu) 4270static int handle_cr(struct kvm_vcpu *vcpu)
3360{ 4271{
3361 unsigned long exit_qualification, val; 4272 unsigned long exit_qualification, val;
@@ -3372,7 +4283,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3372 trace_kvm_cr_write(cr, val); 4283 trace_kvm_cr_write(cr, val);
3373 switch (cr) { 4284 switch (cr) {
3374 case 0: 4285 case 0:
3375 err = kvm_set_cr0(vcpu, val); 4286 err = handle_set_cr0(vcpu, val);
3376 kvm_complete_insn_gp(vcpu, err); 4287 kvm_complete_insn_gp(vcpu, err);
3377 return 1; 4288 return 1;
3378 case 3: 4289 case 3:
@@ -3380,7 +4291,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3380 kvm_complete_insn_gp(vcpu, err); 4291 kvm_complete_insn_gp(vcpu, err);
3381 return 1; 4292 return 1;
3382 case 4: 4293 case 4:
3383 err = kvm_set_cr4(vcpu, val); 4294 err = handle_set_cr4(vcpu, val);
3384 kvm_complete_insn_gp(vcpu, err); 4295 kvm_complete_insn_gp(vcpu, err);
3385 return 1; 4296 return 1;
3386 case 8: { 4297 case 8: {
@@ -3398,7 +4309,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3398 }; 4309 };
3399 break; 4310 break;
3400 case 2: /* clts */ 4311 case 2: /* clts */
3401 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 4312 handle_clts(vcpu);
3402 trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); 4313 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
3403 skip_emulated_instruction(vcpu); 4314 skip_emulated_instruction(vcpu);
3404 vmx_fpu_activate(vcpu); 4315 vmx_fpu_activate(vcpu);
@@ -3574,12 +4485,6 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
3574 return 1; 4485 return 1;
3575} 4486}
3576 4487
3577static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3578{
3579 kvm_queue_exception(vcpu, UD_VECTOR);
3580 return 1;
3581}
3582
3583static int handle_invd(struct kvm_vcpu *vcpu) 4488static int handle_invd(struct kvm_vcpu *vcpu)
3584{ 4489{
3585 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 4490 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
@@ -3777,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3777static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 4682static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
3778{ 4683{
3779 u64 sptes[4]; 4684 u64 sptes[4];
3780 int nr_sptes, i; 4685 int nr_sptes, i, ret;
3781 gpa_t gpa; 4686 gpa_t gpa;
3782 4687
3783 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 4688 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3784 4689
4690 ret = handle_mmio_page_fault_common(vcpu, gpa, true);
4691 if (likely(ret == 1))
4692 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
4693 EMULATE_DONE;
4694 if (unlikely(!ret))
4695 return 1;
4696
4697 /* It is the real ept misconfig */
3785 printk(KERN_ERR "EPT: Misconfiguration.\n"); 4698 printk(KERN_ERR "EPT: Misconfiguration.\n");
3786 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); 4699 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
3787 4700
@@ -3866,6 +4779,639 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
3866} 4779}
3867 4780
3868/* 4781/*
4782 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
4783 * We could reuse a single VMCS for all the L2 guests, but we also want the
4784 * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
4785 * allows keeping them loaded on the processor, and in the future will allow
4786 * optimizations where prepare_vmcs02 doesn't need to set all the fields on
4787 * every entry if they never change.
4788 * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
4789 * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
4790 *
4791 * The following functions allocate and free a vmcs02 in this pool.
4792 */
4793
4794/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
4795static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
4796{
4797 struct vmcs02_list *item;
4798 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
4799 if (item->vmptr == vmx->nested.current_vmptr) {
4800 list_move(&item->list, &vmx->nested.vmcs02_pool);
4801 return &item->vmcs02;
4802 }
4803
4804 if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
4805 /* Recycle the least recently used VMCS. */
4806 item = list_entry(vmx->nested.vmcs02_pool.prev,
4807 struct vmcs02_list, list);
4808 item->vmptr = vmx->nested.current_vmptr;
4809 list_move(&item->list, &vmx->nested.vmcs02_pool);
4810 return &item->vmcs02;
4811 }
4812
4813 /* Create a new VMCS */
4814 item = (struct vmcs02_list *)
4815 kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
4816 if (!item)
4817 return NULL;
4818 item->vmcs02.vmcs = alloc_vmcs();
4819 if (!item->vmcs02.vmcs) {
4820 kfree(item);
4821 return NULL;
4822 }
4823 loaded_vmcs_init(&item->vmcs02);
4824 item->vmptr = vmx->nested.current_vmptr;
4825 list_add(&(item->list), &(vmx->nested.vmcs02_pool));
4826 vmx->nested.vmcs02_num++;
4827 return &item->vmcs02;
4828}
4829
4830/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
4831static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
4832{
4833 struct vmcs02_list *item;
4834 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
4835 if (item->vmptr == vmptr) {
4836 free_loaded_vmcs(&item->vmcs02);
4837 list_del(&item->list);
4838 kfree(item);
4839 vmx->nested.vmcs02_num--;
4840 return;
4841 }
4842}
4843
4844/*
4845 * Free all VMCSs saved for this vcpu, except the one pointed by
4846 * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
4847 * currently used, if running L2), and vmcs01 when running L2.
4848 */
4849static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
4850{
4851 struct vmcs02_list *item, *n;
4852 list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
4853 if (vmx->loaded_vmcs != &item->vmcs02)
4854 free_loaded_vmcs(&item->vmcs02);
4855 list_del(&item->list);
4856 kfree(item);
4857 }
4858 vmx->nested.vmcs02_num = 0;
4859
4860 if (vmx->loaded_vmcs != &vmx->vmcs01)
4861 free_loaded_vmcs(&vmx->vmcs01);
4862}
4863
4864/*
4865 * Emulate the VMXON instruction.
4866 * Currently, we just remember that VMX is active, and do not save or even
4867 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4868 * do not currently need to store anything in that guest-allocated memory
4869 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
4870 * argument is different from the VMXON pointer (which the spec says they do).
4871 */
4872static int handle_vmon(struct kvm_vcpu *vcpu)
4873{
4874 struct kvm_segment cs;
4875 struct vcpu_vmx *vmx = to_vmx(vcpu);
4876
4877 /* The Intel VMX Instruction Reference lists a bunch of bits that
4878 * are prerequisite to running VMXON, most notably cr4.VMXE must be
4879 * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
4880 * Otherwise, we should fail with #UD. We test these now:
4881 */
4882 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
4883 !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
4884 (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
4885 kvm_queue_exception(vcpu, UD_VECTOR);
4886 return 1;
4887 }
4888
4889 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4890 if (is_long_mode(vcpu) && !cs.l) {
4891 kvm_queue_exception(vcpu, UD_VECTOR);
4892 return 1;
4893 }
4894
4895 if (vmx_get_cpl(vcpu)) {
4896 kvm_inject_gp(vcpu, 0);
4897 return 1;
4898 }
4899
4900 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
4901 vmx->nested.vmcs02_num = 0;
4902
4903 vmx->nested.vmxon = true;
4904
4905 skip_emulated_instruction(vcpu);
4906 return 1;
4907}
4908
4909/*
4910 * Intel's VMX Instruction Reference specifies a common set of prerequisites
4911 * for running VMX instructions (except VMXON, whose prerequisites are
4912 * slightly different). It also specifies what exception to inject otherwise.
4913 */
4914static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
4915{
4916 struct kvm_segment cs;
4917 struct vcpu_vmx *vmx = to_vmx(vcpu);
4918
4919 if (!vmx->nested.vmxon) {
4920 kvm_queue_exception(vcpu, UD_VECTOR);
4921 return 0;
4922 }
4923
4924 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4925 if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
4926 (is_long_mode(vcpu) && !cs.l)) {
4927 kvm_queue_exception(vcpu, UD_VECTOR);
4928 return 0;
4929 }
4930
4931 if (vmx_get_cpl(vcpu)) {
4932 kvm_inject_gp(vcpu, 0);
4933 return 0;
4934 }
4935
4936 return 1;
4937}
4938
4939/*
4940 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
4941 * just stops using VMX.
4942 */
4943static void free_nested(struct vcpu_vmx *vmx)
4944{
4945 if (!vmx->nested.vmxon)
4946 return;
4947 vmx->nested.vmxon = false;
4948 if (vmx->nested.current_vmptr != -1ull) {
4949 kunmap(vmx->nested.current_vmcs12_page);
4950 nested_release_page(vmx->nested.current_vmcs12_page);
4951 vmx->nested.current_vmptr = -1ull;
4952 vmx->nested.current_vmcs12 = NULL;
4953 }
4954 /* Unpin physical memory we referred to in current vmcs02 */
4955 if (vmx->nested.apic_access_page) {
4956 nested_release_page(vmx->nested.apic_access_page);
4957 vmx->nested.apic_access_page = 0;
4958 }
4959
4960 nested_free_all_saved_vmcss(vmx);
4961}
4962
4963/* Emulate the VMXOFF instruction */
4964static int handle_vmoff(struct kvm_vcpu *vcpu)
4965{
4966 if (!nested_vmx_check_permission(vcpu))
4967 return 1;
4968 free_nested(to_vmx(vcpu));
4969 skip_emulated_instruction(vcpu);
4970 return 1;
4971}
4972
4973/*
4974 * Decode the memory-address operand of a vmx instruction, as recorded on an
4975 * exit caused by such an instruction (run by a guest hypervisor).
4976 * On success, returns 0. When the operand is invalid, returns 1 and throws
4977 * #UD or #GP.
4978 */
4979static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
4980 unsigned long exit_qualification,
4981 u32 vmx_instruction_info, gva_t *ret)
4982{
4983 /*
4984 * According to Vol. 3B, "Information for VM Exits Due to Instruction
4985 * Execution", on an exit, vmx_instruction_info holds most of the
4986 * addressing components of the operand. Only the displacement part
4987 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4988 * For how an actual address is calculated from all these components,
4989 * refer to Vol. 1, "Operand Addressing".
4990 */
4991 int scaling = vmx_instruction_info & 3;
4992 int addr_size = (vmx_instruction_info >> 7) & 7;
4993 bool is_reg = vmx_instruction_info & (1u << 10);
4994 int seg_reg = (vmx_instruction_info >> 15) & 7;
4995 int index_reg = (vmx_instruction_info >> 18) & 0xf;
4996 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4997 int base_reg = (vmx_instruction_info >> 23) & 0xf;
4998 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
4999
5000 if (is_reg) {
5001 kvm_queue_exception(vcpu, UD_VECTOR);
5002 return 1;
5003 }
5004
5005 /* Addr = segment_base + offset */
5006 /* offset = base + [index * scale] + displacement */
5007 *ret = vmx_get_segment_base(vcpu, seg_reg);
5008 if (base_is_valid)
5009 *ret += kvm_register_read(vcpu, base_reg);
5010 if (index_is_valid)
5011 *ret += kvm_register_read(vcpu, index_reg)<<scaling;
5012 *ret += exit_qualification; /* holds the displacement */
5013
5014 if (addr_size == 1) /* 32 bit */
5015 *ret &= 0xffffffff;
5016
5017 /*
5018 * TODO: throw #GP (and return 1) in various cases that the VM*
5019 * instructions require it - e.g., offset beyond segment limit,
5020 * unusable or unreadable/unwritable segment, non-canonical 64-bit
5021 * address, and so on. Currently these are not checked.
5022 */
5023 return 0;
5024}
5025
5026/*
5027 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5028 * set the success or error code of an emulated VMX instruction, as specified
5029 * by Vol 2B, VMX Instruction Reference, "Conventions".
5030 */
5031static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5032{
5033 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5034 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5035 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5036}
5037
5038static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5039{
5040 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5041 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5042 X86_EFLAGS_SF | X86_EFLAGS_OF))
5043 | X86_EFLAGS_CF);
5044}
5045
5046static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5047 u32 vm_instruction_error)
5048{
5049 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5050 /*
5051 * failValid writes the error number to the current VMCS, which
5052 * can't be done there isn't a current VMCS.
5053 */
5054 nested_vmx_failInvalid(vcpu);
5055 return;
5056 }
5057 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5058 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5059 X86_EFLAGS_SF | X86_EFLAGS_OF))
5060 | X86_EFLAGS_ZF);
5061 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5062}
5063
5064/* Emulate the VMCLEAR instruction */
5065static int handle_vmclear(struct kvm_vcpu *vcpu)
5066{
5067 struct vcpu_vmx *vmx = to_vmx(vcpu);
5068 gva_t gva;
5069 gpa_t vmptr;
5070 struct vmcs12 *vmcs12;
5071 struct page *page;
5072 struct x86_exception e;
5073
5074 if (!nested_vmx_check_permission(vcpu))
5075 return 1;
5076
5077 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5078 vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
5079 return 1;
5080
5081 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
5082 sizeof(vmptr), &e)) {
5083 kvm_inject_page_fault(vcpu, &e);
5084 return 1;
5085 }
5086
5087 if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
5088 nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5089 skip_emulated_instruction(vcpu);
5090 return 1;
5091 }
5092
5093 if (vmptr == vmx->nested.current_vmptr) {
5094 kunmap(vmx->nested.current_vmcs12_page);
5095 nested_release_page(vmx->nested.current_vmcs12_page);
5096 vmx->nested.current_vmptr = -1ull;
5097 vmx->nested.current_vmcs12 = NULL;
5098 }
5099
5100 page = nested_get_page(vcpu, vmptr);
5101 if (page == NULL) {
5102 /*
5103 * For accurate processor emulation, VMCLEAR beyond available
5104 * physical memory should do nothing at all. However, it is
5105 * possible that a nested vmx bug, not a guest hypervisor bug,
5106 * resulted in this case, so let's shut down before doing any
5107 * more damage:
5108 */
5109 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5110 return 1;
5111 }
5112 vmcs12 = kmap(page);
5113 vmcs12->launch_state = 0;
5114 kunmap(page);
5115 nested_release_page(page);
5116
5117 nested_free_vmcs02(vmx, vmptr);
5118
5119 skip_emulated_instruction(vcpu);
5120 nested_vmx_succeed(vcpu);
5121 return 1;
5122}
5123
5124static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
5125
5126/* Emulate the VMLAUNCH instruction */
5127static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5128{
5129 return nested_vmx_run(vcpu, true);
5130}
5131
5132/* Emulate the VMRESUME instruction */
5133static int handle_vmresume(struct kvm_vcpu *vcpu)
5134{
5135
5136 return nested_vmx_run(vcpu, false);
5137}
5138
5139enum vmcs_field_type {
5140 VMCS_FIELD_TYPE_U16 = 0,
5141 VMCS_FIELD_TYPE_U64 = 1,
5142 VMCS_FIELD_TYPE_U32 = 2,
5143 VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
5144};
5145
5146static inline int vmcs_field_type(unsigned long field)
5147{
5148 if (0x1 & field) /* the *_HIGH fields are all 32 bit */
5149 return VMCS_FIELD_TYPE_U32;
5150 return (field >> 13) & 0x3 ;
5151}
5152
5153static inline int vmcs_field_readonly(unsigned long field)
5154{
5155 return (((field >> 10) & 0x3) == 1);
5156}
5157
5158/*
5159 * Read a vmcs12 field. Since these can have varying lengths and we return
5160 * one type, we chose the biggest type (u64) and zero-extend the return value
5161 * to that size. Note that the caller, handle_vmread, might need to use only
5162 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
5163 * 64-bit fields are to be returned).
5164 */
5165static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
5166 unsigned long field, u64 *ret)
5167{
5168 short offset = vmcs_field_to_offset(field);
5169 char *p;
5170
5171 if (offset < 0)
5172 return 0;
5173
5174 p = ((char *)(get_vmcs12(vcpu))) + offset;
5175
5176 switch (vmcs_field_type(field)) {
5177 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5178 *ret = *((natural_width *)p);
5179 return 1;
5180 case VMCS_FIELD_TYPE_U16:
5181 *ret = *((u16 *)p);
5182 return 1;
5183 case VMCS_FIELD_TYPE_U32:
5184 *ret = *((u32 *)p);
5185 return 1;
5186 case VMCS_FIELD_TYPE_U64:
5187 *ret = *((u64 *)p);
5188 return 1;
5189 default:
5190 return 0; /* can never happen. */
5191 }
5192}
5193
5194/*
5195 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
5196 * used before) all generate the same failure when it is missing.
5197 */
5198static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
5199{
5200 struct vcpu_vmx *vmx = to_vmx(vcpu);
5201 if (vmx->nested.current_vmptr == -1ull) {
5202 nested_vmx_failInvalid(vcpu);
5203 skip_emulated_instruction(vcpu);
5204 return 0;
5205 }
5206 return 1;
5207}
5208
5209static int handle_vmread(struct kvm_vcpu *vcpu)
5210{
5211 unsigned long field;
5212 u64 field_value;
5213 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5214 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5215 gva_t gva = 0;
5216
5217 if (!nested_vmx_check_permission(vcpu) ||
5218 !nested_vmx_check_vmcs12(vcpu))
5219 return 1;
5220
5221 /* Decode instruction info and find the field to read */
5222 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5223 /* Read the field, zero-extended to a u64 field_value */
5224 if (!vmcs12_read_any(vcpu, field, &field_value)) {
5225 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5226 skip_emulated_instruction(vcpu);
5227 return 1;
5228 }
5229 /*
5230 * Now copy part of this value to register or memory, as requested.
5231 * Note that the number of bits actually copied is 32 or 64 depending
5232 * on the guest's mode (32 or 64 bit), not on the given field's length.
5233 */
5234 if (vmx_instruction_info & (1u << 10)) {
5235 kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
5236 field_value);
5237 } else {
5238 if (get_vmx_mem_address(vcpu, exit_qualification,
5239 vmx_instruction_info, &gva))
5240 return 1;
5241 /* _system ok, as nested_vmx_check_permission verified cpl=0 */
5242 kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
5243 &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
5244 }
5245
5246 nested_vmx_succeed(vcpu);
5247 skip_emulated_instruction(vcpu);
5248 return 1;
5249}
5250
5251
5252static int handle_vmwrite(struct kvm_vcpu *vcpu)
5253{
5254 unsigned long field;
5255 gva_t gva;
5256 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5257 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5258 char *p;
5259 short offset;
5260 /* The value to write might be 32 or 64 bits, depending on L1's long
5261 * mode, and eventually we need to write that into a field of several
5262 * possible lengths. The code below first zero-extends the value to 64
5263 * bit (field_value), and then copies only the approriate number of
5264 * bits into the vmcs12 field.
5265 */
5266 u64 field_value = 0;
5267 struct x86_exception e;
5268
5269 if (!nested_vmx_check_permission(vcpu) ||
5270 !nested_vmx_check_vmcs12(vcpu))
5271 return 1;
5272
5273 if (vmx_instruction_info & (1u << 10))
5274 field_value = kvm_register_read(vcpu,
5275 (((vmx_instruction_info) >> 3) & 0xf));
5276 else {
5277 if (get_vmx_mem_address(vcpu, exit_qualification,
5278 vmx_instruction_info, &gva))
5279 return 1;
5280 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
5281 &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
5282 kvm_inject_page_fault(vcpu, &e);
5283 return 1;
5284 }
5285 }
5286
5287
5288 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5289 if (vmcs_field_readonly(field)) {
5290 nested_vmx_failValid(vcpu,
5291 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
5292 skip_emulated_instruction(vcpu);
5293 return 1;
5294 }
5295
5296 offset = vmcs_field_to_offset(field);
5297 if (offset < 0) {
5298 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5299 skip_emulated_instruction(vcpu);
5300 return 1;
5301 }
5302 p = ((char *) get_vmcs12(vcpu)) + offset;
5303
5304 switch (vmcs_field_type(field)) {
5305 case VMCS_FIELD_TYPE_U16:
5306 *(u16 *)p = field_value;
5307 break;
5308 case VMCS_FIELD_TYPE_U32:
5309 *(u32 *)p = field_value;
5310 break;
5311 case VMCS_FIELD_TYPE_U64:
5312 *(u64 *)p = field_value;
5313 break;
5314 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5315 *(natural_width *)p = field_value;
5316 break;
5317 default:
5318 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5319 skip_emulated_instruction(vcpu);
5320 return 1;
5321 }
5322
5323 nested_vmx_succeed(vcpu);
5324 skip_emulated_instruction(vcpu);
5325 return 1;
5326}
5327
5328/* Emulate the VMPTRLD instruction */
5329static int handle_vmptrld(struct kvm_vcpu *vcpu)
5330{
5331 struct vcpu_vmx *vmx = to_vmx(vcpu);
5332 gva_t gva;
5333 gpa_t vmptr;
5334 struct x86_exception e;
5335
5336 if (!nested_vmx_check_permission(vcpu))
5337 return 1;
5338
5339 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5340 vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
5341 return 1;
5342
5343 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
5344 sizeof(vmptr), &e)) {
5345 kvm_inject_page_fault(vcpu, &e);
5346 return 1;
5347 }
5348
5349 if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
5350 nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
5351 skip_emulated_instruction(vcpu);
5352 return 1;
5353 }
5354
5355 if (vmx->nested.current_vmptr != vmptr) {
5356 struct vmcs12 *new_vmcs12;
5357 struct page *page;
5358 page = nested_get_page(vcpu, vmptr);
5359 if (page == NULL) {
5360 nested_vmx_failInvalid(vcpu);
5361 skip_emulated_instruction(vcpu);
5362 return 1;
5363 }
5364 new_vmcs12 = kmap(page);
5365 if (new_vmcs12->revision_id != VMCS12_REVISION) {
5366 kunmap(page);
5367 nested_release_page_clean(page);
5368 nested_vmx_failValid(vcpu,
5369 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5370 skip_emulated_instruction(vcpu);
5371 return 1;
5372 }
5373 if (vmx->nested.current_vmptr != -1ull) {
5374 kunmap(vmx->nested.current_vmcs12_page);
5375 nested_release_page(vmx->nested.current_vmcs12_page);
5376 }
5377
5378 vmx->nested.current_vmptr = vmptr;
5379 vmx->nested.current_vmcs12 = new_vmcs12;
5380 vmx->nested.current_vmcs12_page = page;
5381 }
5382
5383 nested_vmx_succeed(vcpu);
5384 skip_emulated_instruction(vcpu);
5385 return 1;
5386}
5387
5388/* Emulate the VMPTRST instruction */
5389static int handle_vmptrst(struct kvm_vcpu *vcpu)
5390{
5391 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5392 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5393 gva_t vmcs_gva;
5394 struct x86_exception e;
5395
5396 if (!nested_vmx_check_permission(vcpu))
5397 return 1;
5398
5399 if (get_vmx_mem_address(vcpu, exit_qualification,
5400 vmx_instruction_info, &vmcs_gva))
5401 return 1;
5402 /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
5403 if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
5404 (void *)&to_vmx(vcpu)->nested.current_vmptr,
5405 sizeof(u64), &e)) {
5406 kvm_inject_page_fault(vcpu, &e);
5407 return 1;
5408 }
5409 nested_vmx_succeed(vcpu);
5410 skip_emulated_instruction(vcpu);
5411 return 1;
5412}
5413
5414/*
3869 * The exit handlers return 1 if the exit was handled fully and guest execution 5415 * The exit handlers return 1 if the exit was handled fully and guest execution
3870 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 5416 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
3871 * to be done to userspace and return 0. 5417 * to be done to userspace and return 0.
@@ -3886,15 +5432,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3886 [EXIT_REASON_INVD] = handle_invd, 5432 [EXIT_REASON_INVD] = handle_invd,
3887 [EXIT_REASON_INVLPG] = handle_invlpg, 5433 [EXIT_REASON_INVLPG] = handle_invlpg,
3888 [EXIT_REASON_VMCALL] = handle_vmcall, 5434 [EXIT_REASON_VMCALL] = handle_vmcall,
3889 [EXIT_REASON_VMCLEAR] = handle_vmx_insn, 5435 [EXIT_REASON_VMCLEAR] = handle_vmclear,
3890 [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, 5436 [EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
3891 [EXIT_REASON_VMPTRLD] = handle_vmx_insn, 5437 [EXIT_REASON_VMPTRLD] = handle_vmptrld,
3892 [EXIT_REASON_VMPTRST] = handle_vmx_insn, 5438 [EXIT_REASON_VMPTRST] = handle_vmptrst,
3893 [EXIT_REASON_VMREAD] = handle_vmx_insn, 5439 [EXIT_REASON_VMREAD] = handle_vmread,
3894 [EXIT_REASON_VMRESUME] = handle_vmx_insn, 5440 [EXIT_REASON_VMRESUME] = handle_vmresume,
3895 [EXIT_REASON_VMWRITE] = handle_vmx_insn, 5441 [EXIT_REASON_VMWRITE] = handle_vmwrite,
3896 [EXIT_REASON_VMOFF] = handle_vmx_insn, 5442 [EXIT_REASON_VMOFF] = handle_vmoff,
3897 [EXIT_REASON_VMON] = handle_vmx_insn, 5443 [EXIT_REASON_VMON] = handle_vmon,
3898 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 5444 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
3899 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 5445 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
3900 [EXIT_REASON_WBINVD] = handle_wbinvd, 5446 [EXIT_REASON_WBINVD] = handle_wbinvd,
@@ -3911,6 +5457,229 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3911static const int kvm_vmx_max_exit_handlers = 5457static const int kvm_vmx_max_exit_handlers =
3912 ARRAY_SIZE(kvm_vmx_exit_handlers); 5458 ARRAY_SIZE(kvm_vmx_exit_handlers);
3913 5459
5460/*
5461 * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
5462 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5463 * disinterest in the current event (read or write a specific MSR) by using an
5464 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5465 */
5466static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5467 struct vmcs12 *vmcs12, u32 exit_reason)
5468{
5469 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
5470 gpa_t bitmap;
5471
5472 if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
5473 return 1;
5474
5475 /*
5476 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5477 * for the four combinations of read/write and low/high MSR numbers.
5478 * First we need to figure out which of the four to use:
5479 */
5480 bitmap = vmcs12->msr_bitmap;
5481 if (exit_reason == EXIT_REASON_MSR_WRITE)
5482 bitmap += 2048;
5483 if (msr_index >= 0xc0000000) {
5484 msr_index -= 0xc0000000;
5485 bitmap += 1024;
5486 }
5487
5488 /* Then read the msr_index'th bit from this bitmap: */
5489 if (msr_index < 1024*8) {
5490 unsigned char b;
5491 kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
5492 return 1 & (b >> (msr_index & 7));
5493 } else
5494 return 1; /* let L1 handle the wrong parameter */
5495}
5496
/*
 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
 * intercept (via guest_host_mask etc.) the current event.
 */
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
	struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	/* Decode the exit qualification: bits 3:0 = CR number, bits 11:8 =
	 * the GPR involved, bits 5:4 = access type (see case labels below). */
	int cr = exit_qualification & 15;
	int reg = (exit_qualification >> 8) & 15;
	unsigned long val = kvm_register_read(vcpu, reg);

	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
		switch (cr) {
		case 0:
			/* Exit to L1 only if the write changes a CR0 bit that
			 * L1 owns (set in cr0_guest_host_mask) relative to
			 * L1's read shadow. */
			if (vmcs12->cr0_guest_host_mask &
			    (val ^ vmcs12->cr0_read_shadow))
				return 1;
			break;
		case 3:
			/* A mov-to-cr3 matching one of L1's configured
			 * CR3-target values never exits, regardless of
			 * CR3_LOAD_EXITING — so this check must come first. */
			if ((vmcs12->cr3_target_count >= 1 &&
					vmcs12->cr3_target_value0 == val) ||
				(vmcs12->cr3_target_count >= 2 &&
					vmcs12->cr3_target_value1 == val) ||
				(vmcs12->cr3_target_count >= 3 &&
					vmcs12->cr3_target_value2 == val) ||
				(vmcs12->cr3_target_count >= 4 &&
					vmcs12->cr3_target_value3 == val))
				return 0;
			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
				return 1;
			break;
		case 4:
			/* Same owned-bit-changed logic as CR0, for CR4. */
			if (vmcs12->cr4_guest_host_mask &
			    (vmcs12->cr4_read_shadow ^ val))
				return 1;
			break;
		case 8:
			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
				return 1;
			break;
		}
		break;
	case 2: /* clts */
		/* clts clears only CR0.TS; exit iff L1 owns TS and its read
		 * shadow has TS set (i.e. clts would change what L1 sees). */
		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
			return 1;
		break;
	case 1: /* mov from cr */
		switch (cr) {
		case 3:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR3_STORE_EXITING)
				return 1;
			break;
		case 8:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR8_STORE_EXITING)
				return 1;
			break;
		}
		break;
	case 3: /* lmsw */
		/*
		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
		 * cr0. Other attempted changes are ignored, with no exit.
		 */
		/* Exit if an L1-owned bit among 1..3 would change... */
		if (vmcs12->cr0_guest_host_mask & 0xe &
		    (val ^ vmcs12->cr0_read_shadow))
			return 1;
		/* ...or if L1 owns PE (bit 0), its shadow has PE clear, and
		 * lmsw attempts to set it (lmsw can only set, never clear). */
		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
		    !(vmcs12->cr0_read_shadow & 0x1) &&
		    (val & 0x1))
			return 1;
		break;
	}
	return 0;
}
5577
/*
 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
 * should handle it ourselves in L0 (and then continue L2). Only call this
 * when in is_guest_mode (L2).
 */
static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
{
	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/* While a nested run is still pending, all exits belong to L0. */
	if (vmx->nested.nested_run_pending)
		return 0;

	/* Hardware refused the entry; report the error and punt to L1. */
	if (unlikely(vmx->fail)) {
		printk(KERN_INFO "%s failed vm entry %x\n",
		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
		return 1;
	}

	switch (exit_reason) {
	case EXIT_REASON_EXCEPTION_NMI:
		if (!is_exception(intr_info))
			return 0;
		else if (is_page_fault(intr_info))
			/* With EPT, L0 doesn't intercept page faults (see
			 * prepare_vmcs02), so a #PF exit is for L1; without
			 * EPT, L0's shadow paging must handle every #PF. */
			return enable_ept;
		/* Other exceptions go to L1 iff its exception bitmap has the
		 * vector's bit set. */
		return vmcs12->exception_bitmap &
				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return 0;
	case EXIT_REASON_TRIPLE_FAULT:
		return 1;
	case EXIT_REASON_PENDING_INTERRUPT:
	case EXIT_REASON_NMI_WINDOW:
		/*
		 * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
		 * (aka Interrupt Window Exiting) only when L1 turned it on,
		 * so if we got a PENDING_INTERRUPT exit, this must be for L1.
		 * Same for NMI Window Exiting.
		 */
		return 1;
	case EXIT_REASON_TASK_SWITCH:
		return 1;
	case EXIT_REASON_CPUID:
		return 1;
	case EXIT_REASON_HLT:
		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
	case EXIT_REASON_INVD:
		return 1;
	case EXIT_REASON_INVLPG:
		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_RDPMC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
	case EXIT_REASON_RDTSC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
		/*
		 * VMX instructions trap unconditionally. This allows L1 to
		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
		 */
		return 1;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		/* TODO: support IO bitmaps */
		return 1;
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return 1;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		/* PAUSE can be intercepted via the primary control or via
		 * pause-loop exiting in the secondary controls. */
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
			nested_cpu_has2(vmcs12,
				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return 0;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return 1;
	case EXIT_REASON_APIC_ACCESS:
		return nested_cpu_has2(vmcs12,
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
	case EXIT_REASON_EPT_VIOLATION:
	case EXIT_REASON_EPT_MISCONFIG:
		/* EPT is an L0 mechanism here; these are never L1's. */
		return 0;
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return 1;
	default:
		/* Unrecognized exit reasons are conservatively sent to L1. */
		return 1;
	}
}
5682
3914static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) 5683static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3915{ 5684{
3916 *info1 = vmcs_readl(EXIT_QUALIFICATION); 5685 *info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -3933,6 +5702,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3933 if (vmx->emulation_required && emulate_invalid_guest_state) 5702 if (vmx->emulation_required && emulate_invalid_guest_state)
3934 return handle_invalid_guest_state(vcpu); 5703 return handle_invalid_guest_state(vcpu);
3935 5704
5705 /*
5706 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
5707 * we did not inject a still-pending event to L1 now because of
5708 * nested_run_pending, we need to re-enable this bit.
5709 */
5710 if (vmx->nested.nested_run_pending)
5711 kvm_make_request(KVM_REQ_EVENT, vcpu);
5712
5713 if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
5714 exit_reason == EXIT_REASON_VMRESUME))
5715 vmx->nested.nested_run_pending = 1;
5716 else
5717 vmx->nested.nested_run_pending = 0;
5718
5719 if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
5720 nested_vmx_vmexit(vcpu);
5721 return 1;
5722 }
5723
3936 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 5724 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3937 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 5725 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3938 vcpu->run->fail_entry.hardware_entry_failure_reason 5726 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3955,7 +5743,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3955 "(0x%x) and exit reason is 0x%x\n", 5743 "(0x%x) and exit reason is 0x%x\n",
3956 __func__, vectoring_info, exit_reason); 5744 __func__, vectoring_info, exit_reason);
3957 5745
3958 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { 5746 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
5747 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
5748 get_vmcs12(vcpu), vcpu)))) {
3959 if (vmx_interrupt_allowed(vcpu)) { 5749 if (vmx_interrupt_allowed(vcpu)) {
3960 vmx->soft_vnmi_blocked = 0; 5750 vmx->soft_vnmi_blocked = 0;
3961 } else if (vmx->vnmi_blocked_time > 1000000000LL && 5751 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -4118,6 +5908,8 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
4118 5908
4119static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 5909static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
4120{ 5910{
5911 if (is_guest_mode(&vmx->vcpu))
5912 return;
4121 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, 5913 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
4122 VM_EXIT_INSTRUCTION_LEN, 5914 VM_EXIT_INSTRUCTION_LEN,
4123 IDT_VECTORING_ERROR_CODE); 5915 IDT_VECTORING_ERROR_CODE);
@@ -4125,6 +5917,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
4125 5917
4126static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 5918static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
4127{ 5919{
5920 if (is_guest_mode(vcpu))
5921 return;
4128 __vmx_complete_interrupts(to_vmx(vcpu), 5922 __vmx_complete_interrupts(to_vmx(vcpu),
4129 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 5923 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
4130 VM_ENTRY_INSTRUCTION_LEN, 5924 VM_ENTRY_INSTRUCTION_LEN,
@@ -4145,6 +5939,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4145{ 5939{
4146 struct vcpu_vmx *vmx = to_vmx(vcpu); 5940 struct vcpu_vmx *vmx = to_vmx(vcpu);
4147 5941
5942 if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
5943 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5944 if (vmcs12->idt_vectoring_info_field &
5945 VECTORING_INFO_VALID_MASK) {
5946 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
5947 vmcs12->idt_vectoring_info_field);
5948 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
5949 vmcs12->vm_exit_instruction_len);
5950 if (vmcs12->idt_vectoring_info_field &
5951 VECTORING_INFO_DELIVER_CODE_MASK)
5952 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
5953 vmcs12->idt_vectoring_error_code);
5954 }
5955 }
5956
4148 /* Record the guest's net vcpu time for enforced NMI injections. */ 5957 /* Record the guest's net vcpu time for enforced NMI injections. */
4149 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 5958 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
4150 vmx->entry_time = ktime_get(); 5959 vmx->entry_time = ktime_get();
@@ -4167,6 +5976,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4167 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5976 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
4168 vmx_set_interrupt_shadow(vcpu, 0); 5977 vmx_set_interrupt_shadow(vcpu, 0);
4169 5978
5979 vmx->__launched = vmx->loaded_vmcs->launched;
4170 asm( 5980 asm(
4171 /* Store host registers */ 5981 /* Store host registers */
4172 "push %%"R"dx; push %%"R"bp;" 5982 "push %%"R"dx; push %%"R"bp;"
@@ -4237,7 +6047,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4237 "pop %%"R"bp; pop %%"R"dx \n\t" 6047 "pop %%"R"bp; pop %%"R"dx \n\t"
4238 "setbe %c[fail](%0) \n\t" 6048 "setbe %c[fail](%0) \n\t"
4239 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 6049 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
4240 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 6050 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
4241 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 6051 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
4242 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), 6052 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
4243 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), 6053 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
@@ -4276,8 +6086,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4276 6086
4277 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 6087 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
4278 6088
6089 if (is_guest_mode(vcpu)) {
6090 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6091 vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
6092 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
6093 vmcs12->idt_vectoring_error_code =
6094 vmcs_read32(IDT_VECTORING_ERROR_CODE);
6095 vmcs12->vm_exit_instruction_len =
6096 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6097 }
6098 }
6099
4279 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 6100 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
4280 vmx->launched = 1; 6101 vmx->loaded_vmcs->launched = 1;
4281 6102
4282 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 6103 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
4283 6104
@@ -4289,41 +6110,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4289#undef R 6110#undef R
4290#undef Q 6111#undef Q
4291 6112
/*
 * Release the VMCS backing this vcpu, if one was allocated. vcpu_clear()
 * is called first — presumably to flush the VMCS from any CPU it is still
 * loaded on before the memory is freed (TODO confirm against vcpu_clear's
 * definition). vmx->vmcs is NULLed so a repeated call is a no-op.
 */
static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vmx->vmcs) {
		vcpu_clear(vmx);
		free_vmcs(vmx->vmcs);
		vmx->vmcs = NULL;
	}
}
4302
4303static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 6113static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
4304{ 6114{
4305 struct vcpu_vmx *vmx = to_vmx(vcpu); 6115 struct vcpu_vmx *vmx = to_vmx(vcpu);
4306 6116
4307 free_vpid(vmx); 6117 free_vpid(vmx);
4308 vmx_free_vmcs(vcpu); 6118 free_nested(vmx);
6119 free_loaded_vmcs(vmx->loaded_vmcs);
4309 kfree(vmx->guest_msrs); 6120 kfree(vmx->guest_msrs);
4310 kvm_vcpu_uninit(vcpu); 6121 kvm_vcpu_uninit(vcpu);
4311 kmem_cache_free(kvm_vcpu_cache, vmx); 6122 kmem_cache_free(kvm_vcpu_cache, vmx);
4312} 6123}
4313 6124
/*
 * Initialize a freshly-allocated VMCS by executing VMCLEAR on it.
 * When vmm_exclusive is not set, VMX operation is not left permanently
 * enabled on the CPU, so temporarily enter VMX (vmxon on this CPU's
 * vmxarea) around the vmcs_clear and leave it again afterwards.
 */
static inline void vmcs_init(struct vmcs *vmcs)
{
	u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));

	if (!vmm_exclusive)
		kvm_cpu_vmxon(phys_addr);

	vmcs_clear(vmcs);

	if (!vmm_exclusive)
		kvm_cpu_vmxoff();
}
4326
4327static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 6125static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4328{ 6126{
4329 int err; 6127 int err;
@@ -4345,11 +6143,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4345 goto uninit_vcpu; 6143 goto uninit_vcpu;
4346 } 6144 }
4347 6145
4348 vmx->vmcs = alloc_vmcs(); 6146 vmx->loaded_vmcs = &vmx->vmcs01;
4349 if (!vmx->vmcs) 6147 vmx->loaded_vmcs->vmcs = alloc_vmcs();
6148 if (!vmx->loaded_vmcs->vmcs)
4350 goto free_msrs; 6149 goto free_msrs;
4351 6150 if (!vmm_exclusive)
4352 vmcs_init(vmx->vmcs); 6151 kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
6152 loaded_vmcs_init(vmx->loaded_vmcs);
6153 if (!vmm_exclusive)
6154 kvm_cpu_vmxoff();
4353 6155
4354 cpu = get_cpu(); 6156 cpu = get_cpu();
4355 vmx_vcpu_load(&vmx->vcpu, cpu); 6157 vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4375,10 +6177,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4375 goto free_vmcs; 6177 goto free_vmcs;
4376 } 6178 }
4377 6179
6180 vmx->nested.current_vmptr = -1ull;
6181 vmx->nested.current_vmcs12 = NULL;
6182
4378 return &vmx->vcpu; 6183 return &vmx->vcpu;
4379 6184
4380free_vmcs: 6185free_vmcs:
4381 free_vmcs(vmx->vmcs); 6186 free_vmcs(vmx->loaded_vmcs->vmcs);
4382free_msrs: 6187free_msrs:
4383 kfree(vmx->guest_msrs); 6188 kfree(vmx->guest_msrs);
4384uninit_vcpu: 6189uninit_vcpu:
@@ -4512,6 +6317,650 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4512 6317
4513static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 6318static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4514{ 6319{
6320 if (func == 1 && nested)
6321 entry->ecx |= bit(X86_FEATURE_VMX);
6322}
6323
/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 * guest in a way that will both be appropriate to L1's requests, and our
 * needs. In addition to modifying the active vmcs (which is vmcs02), this
 * function also has additional necessary side-effects, like setting various
 * vcpu->arch fields.
 */
static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 exec_control;

	/*
	 * Copy L1's guest-state area for L2 verbatim into vmcs02: the
	 * selector, limit, access-rights and base of every segment and
	 * descriptor-table register.
	 */
	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
	vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
	vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
	vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
	vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
	vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
	vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
	vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
	vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
	vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
	vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
	vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
	vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
	vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
	vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
	vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
	vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
	vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
	vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
	vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
	vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
	vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
	vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
	vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
	vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
	vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
	vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
	vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
	vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
	vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);

	/* More guest state taken directly from vmcs12, including any event
	 * injection L1 requested for this entry (intr info/error code/len). */
	vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
		vmcs12->vm_entry_intr_info_field);
	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
		vmcs12->vm_entry_exception_error_code);
	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
		vmcs12->vm_entry_instruction_len);
	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
		vmcs12->guest_interruptibility_info);
	vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
	vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
	vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
		vmcs12->guest_pending_dbg_exceptions);
	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

	/* vmcs02 never uses VMCS shadowing: link pointer stays invalid. */
	vmcs_write64(VMCS_LINK_POINTER, -1ull);

	/* Pin-based controls: the union of L0's and L1's requirements. */
	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
		(vmcs_config.pin_based_exec_ctrl |
		 vmcs12->pin_based_vm_exec_control));

	/*
	 * Whether page-faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
	 * If enable_ept, L0 doesn't care about page faults and we should
	 * set all of these to L1's desires. However, if !enable_ept, L0 does
	 * care about (at least some) page faults, and because it is not easy
	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
	 * to exit on each and every L2 page fault. This is done by setting
	 * MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 *
	 * A problem with this approach (when !enable_ept) is that L1 may be
	 * injected with more page faults than it asked for. This could have
	 * caused problems, but in practice existing hypervisors don't care.
	 * To fix this, we will need to emulate the PFEC checking (on the L1
	 * page tables), using walk_addr(), when injecting PFs to L1.
	 */
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
		enable_ept ? vmcs12->page_fault_error_code_match : 0);

	if (cpu_has_secondary_exec_ctrls()) {
		/* NOTE(review): this inner exec_control shadows the outer
		 * declaration; the two are used for different control words
		 * (secondary here, primary below). */
		u32 exec_control = vmx_secondary_exec_control(vmx);
		if (!vmx->rdtscp_enabled)
			exec_control &= ~SECONDARY_EXEC_RDTSCP;
		/* Take the following fields only from vmcs12 */
		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
		if (nested_cpu_has(vmcs12,
				CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
			exec_control |= vmcs12->secondary_vm_exec_control;

		if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
			/*
			 * Translate L1 physical address to host physical
			 * address for vmcs02. Keep the page pinned, so this
			 * physical address remains valid. We keep a reference
			 * to it so we can release it later.
			 */
			if (vmx->nested.apic_access_page) /* shouldn't happen */
				nested_release_page(vmx->nested.apic_access_page);
			vmx->nested.apic_access_page =
				nested_get_page(vcpu, vmcs12->apic_access_addr);
			/*
			 * If translation failed, no matter: This feature asks
			 * to exit when accessing the given address, and if it
			 * can never be accessed, this feature won't do
			 * anything anyway.
			 */
			if (!vmx->nested.apic_access_page)
				exec_control &=
				    ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
			else
				vmcs_write64(APIC_ACCESS_ADDR,
				       page_to_phys(vmx->nested.apic_access_page));
		}

		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
	}


	/*
	 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
	 * Some constant fields are set here by vmx_set_constant_host_state().
	 * Other fields are different per CPU, and will be set later when
	 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
	 */
	vmx_set_constant_host_state();

	/*
	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
	 * entry, but only if the current (host) sp changed from the value
	 * we wrote last (vmx->host_rsp). This cache is no longer relevant
	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
	 * here we just force the write to happen on entry.
	 */
	vmx->host_rsp = 0;

	/* Primary processor-based controls: start from L0's, drop the bits
	 * that must not leak into L2, then or-in L1's requests. */
	exec_control = vmx_exec_control(vmx); /* L0's desires */
	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
	exec_control &= ~CPU_BASED_TPR_SHADOW;
	exec_control |= vmcs12->cpu_based_vm_exec_control;
	/*
	 * Merging of IO and MSR bitmaps not currently supported.
	 * Rather, exit every time.
	 */
	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
	exec_control |= CPU_BASED_UNCOND_IO_EXITING;

	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);

	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	/* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
	vmcs_write32(VM_EXIT_CONTROLS,
		vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
	vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));

	/* PAT: use L1's value if it asked to load it, else L0's cached pat. */
	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
	else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);


	set_cr4_guest_host_mask(vmx);

	/* L2's TSC offset is L0's offset for L1 plus L1's offset for L2. */
	vmcs_write64(TSC_OFFSET,
		vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);

	if (enable_vpid) {
		/*
		 * Trivially support vpid by letting L2s share their parent
		 * L1's vpid. TODO: move to a more elaborate solution, giving
		 * each L2 its own vpid and exposing the vpid feature to L1.
		 */
		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
		vmx_flush_tlb(vcpu);
	}

	/* Derive L2's EFER: from vmcs12 if L1 loads it on entry, with
	 * LMA/LME forced to match the IA32E-mode entry control. */
	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
		vcpu->arch.efer = vmcs12->guest_ia32_efer;
	if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
	else
		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
	/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
	vmx_set_efer(vcpu, vcpu->arch.efer);

	/*
	 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
	 * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
	 * The CR0_READ_SHADOW is what L2 should have expected to read given
	 * the specifications by L1; It's not enough to take
	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
	 * have more bits than L1 expected.
	 */
	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

	/* shadow page tables on either EPT or shadow page tables */
	kvm_set_cr3(vcpu, vmcs12->guest_cr3);
	kvm_mmu_reset_context(vcpu);

	/* Finally, L2's RSP/RIP as specified by L1. */
	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
}
6560
6561/*
6562 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
6563 * for running an L2 nested guest.
6564 */
6565static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
6566{
6567 struct vmcs12 *vmcs12;
6568 struct vcpu_vmx *vmx = to_vmx(vcpu);
6569 int cpu;
6570 struct loaded_vmcs *vmcs02;
6571
6572 if (!nested_vmx_check_permission(vcpu) ||
6573 !nested_vmx_check_vmcs12(vcpu))
6574 return 1;
6575
6576 skip_emulated_instruction(vcpu);
6577 vmcs12 = get_vmcs12(vcpu);
6578
6579 /*
6580 * The nested entry process starts with enforcing various prerequisites
6581 * on vmcs12 as required by the Intel SDM, and act appropriately when
6582 * they fail: As the SDM explains, some conditions should cause the
6583 * instruction to fail, while others will cause the instruction to seem
6584 * to succeed, but return an EXIT_REASON_INVALID_STATE.
6585 * To speed up the normal (success) code path, we should avoid checking
6586 * for misconfigurations which will anyway be caught by the processor
6587 * when using the merged vmcs02.
6588 */
6589 if (vmcs12->launch_state == launch) {
6590 nested_vmx_failValid(vcpu,
6591 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
6592 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
6593 return 1;
6594 }
6595
6596 if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
6597 !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
6598 /*TODO: Also verify bits beyond physical address width are 0*/
6599 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6600 return 1;
6601 }
6602
6603 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
6604 !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
6605 /*TODO: Also verify bits beyond physical address width are 0*/
6606 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6607 return 1;
6608 }
6609
6610 if (vmcs12->vm_entry_msr_load_count > 0 ||
6611 vmcs12->vm_exit_msr_load_count > 0 ||
6612 vmcs12->vm_exit_msr_store_count > 0) {
6613 if (printk_ratelimit())
6614 printk(KERN_WARNING
6615 "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
6616 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6617 return 1;
6618 }
6619
6620 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
6621 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
6622 !vmx_control_verify(vmcs12->secondary_vm_exec_control,
6623 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
6624 !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
6625 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
6626 !vmx_control_verify(vmcs12->vm_exit_controls,
6627 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
6628 !vmx_control_verify(vmcs12->vm_entry_controls,
6629 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
6630 {
6631 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6632 return 1;
6633 }
6634
6635 if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
6636 ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
6637 nested_vmx_failValid(vcpu,
6638 VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
6639 return 1;
6640 }
6641
6642 if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
6643 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
6644 nested_vmx_entry_failure(vcpu, vmcs12,
6645 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
6646 return 1;
6647 }
6648 if (vmcs12->vmcs_link_pointer != -1ull) {
6649 nested_vmx_entry_failure(vcpu, vmcs12,
6650 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
6651 return 1;
6652 }
6653
6654 /*
6655 * We're finally done with prerequisite checking, and can start with
6656 * the nested entry.
6657 */
6658
6659 vmcs02 = nested_get_current_vmcs02(vmx);
6660 if (!vmcs02)
6661 return -ENOMEM;
6662
6663 enter_guest_mode(vcpu);
6664
6665 vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
6666
6667 cpu = get_cpu();
6668 vmx->loaded_vmcs = vmcs02;
6669 vmx_vcpu_put(vcpu);
6670 vmx_vcpu_load(vcpu, cpu);
6671 vcpu->cpu = cpu;
6672 put_cpu();
6673
6674 vmcs12->launch_state = 1;
6675
6676 prepare_vmcs02(vcpu, vmcs12);
6677
6678 /*
6679 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
6680 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
6681 * returned as far as L1 is concerned. It will only return (and set
6682 * the success flag) when L2 exits (see nested_vmx_vmexit()).
6683 */
6684 return 1;
6685}
6686
6687/*
6688 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
6689 * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
6690 * This function returns the new value we should put in vmcs12.guest_cr0.
6691 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
6692 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
6693 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
6694 * didn't trap the bit, because if L1 did, so would L0).
6695 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
6696 * been modified by L2, and L1 knows it. So just leave the old value of
6697 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
6698 * isn't relevant, because if L0 traps this bit it can set it to anything.
6699 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
6700 * changed these bits, and therefore they need to be updated, but L0
6701 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
6702 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
6703 */
6704static inline unsigned long
6705vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6706{
6707 return
6708 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
6709 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
6710 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
6711 vcpu->arch.cr0_guest_owned_bits));
6712}
6713
6714static inline unsigned long
6715vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6716{
6717 return
6718 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
6719 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
6720 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
6721 vcpu->arch.cr4_guest_owned_bits));
6722}
6723
6724/*
6725 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
6726 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
6727 * and this function updates it to reflect the changes to the guest state while
6728 * L2 was running (and perhaps made some exits which were handled directly by L0
6729 * without going back to L1), and to reflect the exit reason.
6730 * Note that we do not have to copy here all VMCS fields, just those that
6731 * could have changed by the L2 guest or the exit - i.e., the guest-state and
6732 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
6733 * which already writes to vmcs12 directly.
6734 */
6735void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6736{
6737 /* update guest state fields: */
6738 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
6739 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
6740
6741 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
6742 vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
6743 vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
6744 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
6745
6746 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
6747 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
6748 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
6749 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
6750 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
6751 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
6752 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
6753 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
6754 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
6755 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
6756 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
6757 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
6758 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
6759 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
6760 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
6761 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
6762 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
6763 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
6764 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
6765 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
6766 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
6767 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
6768 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
6769 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
6770 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
6771 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
6772 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
6773 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
6774 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
6775 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
6776 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
6777 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
6778 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
6779 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
6780 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
6781 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
6782
6783 vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
6784 vmcs12->guest_interruptibility_info =
6785 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
6786 vmcs12->guest_pending_dbg_exceptions =
6787 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
6788
6789 /* TODO: These cannot have changed unless we have MSR bitmaps and
6790 * the relevant bit asks not to trap the change */
6791 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
6792 if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT)
6793 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
6794 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
6795 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
6796 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
6797
6798 /* update exit information fields: */
6799
6800 vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
6801 vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6802
6803 vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6804 vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6805 vmcs12->idt_vectoring_info_field =
6806 vmcs_read32(IDT_VECTORING_INFO_FIELD);
6807 vmcs12->idt_vectoring_error_code =
6808 vmcs_read32(IDT_VECTORING_ERROR_CODE);
6809 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6810 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6811
6812 /* clear vm-entry fields which are to be cleared on exit */
6813 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
6814 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
6815}
6816
/*
 * A part of what we need to do when the nested L2 guest exits and we want to
 * run its L1 parent, is to reset L1's guest state to the host state specified
 * in vmcs12.
 * This function is to be called not only on normal nested exit, but also on
 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
 * Failures During or After Loading Guest State").
 * This function should be called when the active VMCS is L1's (vmcs01).
 *
 * NOTE: statement order below is significant - e.g. efer must be set before
 * cr0, and cr0 before the exception bitmap / guest-owned-bits updates.
 */
void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	/*
	 * Restore EFER first: either from vmcs12's host_ia32_efer (if the
	 * exit control asks for it), and in any case force LMA/LME to match
	 * the "host address-space size" exit control.
	 */
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
		vcpu->arch.efer = vmcs12->host_ia32_efer;
	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
	else
		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
	vmx_set_efer(vcpu, vcpu->arch.efer);

	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
	/*
	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
	 * actually changed, because it depends on the current state of
	 * fpu_active (which may have changed).
	 * Note that vmx_set_cr0 refers to efer set above.
	 */
	kvm_set_cr0(vcpu, vmcs12->host_cr0);
	/*
	 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
	 * to apply the same changes to L1's vmcs. We just set cr0 correctly,
	 * but we also need to update cr0_guest_host_mask and exception_bitmap.
	 */
	update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	/*
	 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
	 * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
	 */
	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
	kvm_set_cr4(vcpu, vmcs12->host_cr4);

	/* shadow page tables on either EPT or shadow page tables */
	kvm_set_cr3(vcpu, vmcs12->host_cr3);
	kvm_mmu_reset_context(vcpu);

	if (enable_vpid) {
		/*
		 * Trivially support vpid by letting L2s share their parent
		 * L1's vpid. TODO: move to a more elaborate solution, giving
		 * each L2 its own vpid and exposing the vpid feature to L1.
		 */
		vmx_flush_tlb(vcpu);
	}

	/* Load the remaining host state fields into vmcs01's guest state. */
	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
	vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
	vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
	vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);

	/* PAT and perf-global-ctrl are loaded only if the exit controls ask */
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
			vmcs12->host_ia32_perf_global_ctrl);
}
6897
6898/*
6899 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
6900 * and modify vmcs12 to make it see what it would expect to see there if
6901 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
6902 */
6903static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
6904{
6905 struct vcpu_vmx *vmx = to_vmx(vcpu);
6906 int cpu;
6907 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6908
6909 leave_guest_mode(vcpu);
6910 prepare_vmcs12(vcpu, vmcs12);
6911
6912 cpu = get_cpu();
6913 vmx->loaded_vmcs = &vmx->vmcs01;
6914 vmx_vcpu_put(vcpu);
6915 vmx_vcpu_load(vcpu, cpu);
6916 vcpu->cpu = cpu;
6917 put_cpu();
6918
6919 /* if no vmcs02 cache requested, remove the one we used */
6920 if (VMCS02_POOL_SIZE == 0)
6921 nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
6922
6923 load_vmcs12_host_state(vcpu, vmcs12);
6924
6925 /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
6926 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
6927
6928 /* This is needed for same reason as it was needed in prepare_vmcs02 */
6929 vmx->host_rsp = 0;
6930
6931 /* Unpin physical memory we referred to in vmcs02 */
6932 if (vmx->nested.apic_access_page) {
6933 nested_release_page(vmx->nested.apic_access_page);
6934 vmx->nested.apic_access_page = 0;
6935 }
6936
6937 /*
6938 * Exiting from L2 to L1, we're now back to L1 which thinks it just
6939 * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
6940 * success or failure flag accordingly.
6941 */
6942 if (unlikely(vmx->fail)) {
6943 vmx->fail = 0;
6944 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
6945 } else
6946 nested_vmx_succeed(vcpu);
6947}
6948
6949/*
6950 * L1's failure to enter L2 is a subset of a normal exit, as explained in
6951 * 23.7 "VM-entry failures during or after loading guest state" (this also
6952 * lists the acceptable exit-reason and exit-qualification parameters).
6953 * It should only be called before L2 actually succeeded to run, and when
6954 * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss).
6955 */
6956static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
6957 struct vmcs12 *vmcs12,
6958 u32 reason, unsigned long qualification)
6959{
6960 load_vmcs12_host_state(vcpu, vmcs12);
6961 vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
6962 vmcs12->exit_qualification = qualification;
6963 nested_vmx_succeed(vcpu);
4515} 6964}
4516 6965
4517static int vmx_check_intercept(struct kvm_vcpu *vcpu, 6966static int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -4670,16 +7119,13 @@ static int __init vmx_init(void)
4670 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 7119 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
4671 7120
4672 if (enable_ept) { 7121 if (enable_ept) {
4673 bypass_guest_pf = 0;
4674 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 7122 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
4675 VMX_EPT_EXECUTABLE_MASK); 7123 VMX_EPT_EXECUTABLE_MASK);
7124 ept_set_mmio_spte_mask();
4676 kvm_enable_tdp(); 7125 kvm_enable_tdp();
4677 } else 7126 } else
4678 kvm_disable_tdp(); 7127 kvm_disable_tdp();
4679 7128
4680 if (bypass_guest_pf)
4681 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
4682
4683 return 0; 7129 return 0;
4684 7130
4685out3: 7131out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 77c9d8673dc4..84a28ea45fa4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -347,6 +347,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
347 vcpu->arch.cr2 = fault->address; 347 vcpu->arch.cr2 = fault->address;
348 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); 348 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
349} 349}
350EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
350 351
351void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) 352void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
352{ 353{
@@ -579,6 +580,22 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
579 return best && (best->ecx & bit(X86_FEATURE_XSAVE)); 580 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
580} 581}
581 582
583static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
584{
585 struct kvm_cpuid_entry2 *best;
586
587 best = kvm_find_cpuid_entry(vcpu, 7, 0);
588 return best && (best->ebx & bit(X86_FEATURE_SMEP));
589}
590
591static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
592{
593 struct kvm_cpuid_entry2 *best;
594
595 best = kvm_find_cpuid_entry(vcpu, 7, 0);
596 return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
597}
598
582static void update_cpuid(struct kvm_vcpu *vcpu) 599static void update_cpuid(struct kvm_vcpu *vcpu)
583{ 600{
584 struct kvm_cpuid_entry2 *best; 601 struct kvm_cpuid_entry2 *best;
@@ -598,14 +615,20 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
598int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 615int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
599{ 616{
600 unsigned long old_cr4 = kvm_read_cr4(vcpu); 617 unsigned long old_cr4 = kvm_read_cr4(vcpu);
601 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 618 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
602 619 X86_CR4_PAE | X86_CR4_SMEP;
603 if (cr4 & CR4_RESERVED_BITS) 620 if (cr4 & CR4_RESERVED_BITS)
604 return 1; 621 return 1;
605 622
606 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) 623 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
607 return 1; 624 return 1;
608 625
626 if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
627 return 1;
628
629 if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
630 return 1;
631
609 if (is_long_mode(vcpu)) { 632 if (is_long_mode(vcpu)) {
610 if (!(cr4 & X86_CR4_PAE)) 633 if (!(cr4 & X86_CR4_PAE))
611 return 1; 634 return 1;
@@ -615,11 +638,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
615 kvm_read_cr3(vcpu))) 638 kvm_read_cr3(vcpu)))
616 return 1; 639 return 1;
617 640
618 if (cr4 & X86_CR4_VMXE) 641 if (kvm_x86_ops->set_cr4(vcpu, cr4))
619 return 1; 642 return 1;
620 643
621 kvm_x86_ops->set_cr4(vcpu, cr4);
622
623 if ((cr4 ^ old_cr4) & pdptr_bits) 644 if ((cr4 ^ old_cr4) & pdptr_bits)
624 kvm_mmu_reset_context(vcpu); 645 kvm_mmu_reset_context(vcpu);
625 646
@@ -787,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
787 * kvm-specific. Those are put in the beginning of the list. 808 * kvm-specific. Those are put in the beginning of the list.
788 */ 809 */
789 810
790#define KVM_SAVE_MSRS_BEGIN 8 811#define KVM_SAVE_MSRS_BEGIN 9
791static u32 msrs_to_save[] = { 812static u32 msrs_to_save[] = {
792 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 813 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
793 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 814 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
794 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 815 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
795 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, 816 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
796 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 817 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
797 MSR_STAR, 818 MSR_STAR,
798#ifdef CONFIG_X86_64 819#ifdef CONFIG_X86_64
@@ -1388,7 +1409,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1388 return 1; 1409 return 1;
1389 kvm_x86_ops->patch_hypercall(vcpu, instructions); 1410 kvm_x86_ops->patch_hypercall(vcpu, instructions);
1390 ((unsigned char *)instructions)[3] = 0xc3; /* ret */ 1411 ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1391 if (copy_to_user((void __user *)addr, instructions, 4)) 1412 if (__copy_to_user((void __user *)addr, instructions, 4))
1392 return 1; 1413 return 1;
1393 kvm->arch.hv_hypercall = data; 1414 kvm->arch.hv_hypercall = data;
1394 break; 1415 break;
@@ -1415,7 +1436,7 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1415 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); 1436 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1416 if (kvm_is_error_hva(addr)) 1437 if (kvm_is_error_hva(addr))
1417 return 1; 1438 return 1;
1418 if (clear_user((void __user *)addr, PAGE_SIZE)) 1439 if (__clear_user((void __user *)addr, PAGE_SIZE))
1419 return 1; 1440 return 1;
1420 vcpu->arch.hv_vapic = data; 1441 vcpu->arch.hv_vapic = data;
1421 break; 1442 break;
@@ -1467,6 +1488,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
1467 } 1488 }
1468} 1489}
1469 1490
1491static void accumulate_steal_time(struct kvm_vcpu *vcpu)
1492{
1493 u64 delta;
1494
1495 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1496 return;
1497
1498 delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
1499 vcpu->arch.st.last_steal = current->sched_info.run_delay;
1500 vcpu->arch.st.accum_steal = delta;
1501}
1502
1503static void record_steal_time(struct kvm_vcpu *vcpu)
1504{
1505 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1506 return;
1507
1508 if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1509 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
1510 return;
1511
1512 vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
1513 vcpu->arch.st.steal.version += 2;
1514 vcpu->arch.st.accum_steal = 0;
1515
1516 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1517 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
1518}
1519
1470int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1520int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1471{ 1521{
1472 switch (msr) { 1522 switch (msr) {
@@ -1549,6 +1599,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1549 if (kvm_pv_enable_async_pf(vcpu, data)) 1599 if (kvm_pv_enable_async_pf(vcpu, data))
1550 return 1; 1600 return 1;
1551 break; 1601 break;
1602 case MSR_KVM_STEAL_TIME:
1603
1604 if (unlikely(!sched_info_on()))
1605 return 1;
1606
1607 if (data & KVM_STEAL_RESERVED_MASK)
1608 return 1;
1609
1610 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
1611 data & KVM_STEAL_VALID_BITS))
1612 return 1;
1613
1614 vcpu->arch.st.msr_val = data;
1615
1616 if (!(data & KVM_MSR_ENABLED))
1617 break;
1618
1619 vcpu->arch.st.last_steal = current->sched_info.run_delay;
1620
1621 preempt_disable();
1622 accumulate_steal_time(vcpu);
1623 preempt_enable();
1624
1625 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
1626
1627 break;
1628
1552 case MSR_IA32_MCG_CTL: 1629 case MSR_IA32_MCG_CTL:
1553 case MSR_IA32_MCG_STATUS: 1630 case MSR_IA32_MCG_STATUS:
1554 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1631 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1834,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1834 case MSR_KVM_ASYNC_PF_EN: 1911 case MSR_KVM_ASYNC_PF_EN:
1835 data = vcpu->arch.apf.msr_val; 1912 data = vcpu->arch.apf.msr_val;
1836 break; 1913 break;
1914 case MSR_KVM_STEAL_TIME:
1915 data = vcpu->arch.st.msr_val;
1916 break;
1837 case MSR_IA32_P5_MC_ADDR: 1917 case MSR_IA32_P5_MC_ADDR:
1838 case MSR_IA32_P5_MC_TYPE: 1918 case MSR_IA32_P5_MC_TYPE:
1839 case MSR_IA32_MCG_CAP: 1919 case MSR_IA32_MCG_CAP:
@@ -2145,6 +2225,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2145 kvm_migrate_timers(vcpu); 2225 kvm_migrate_timers(vcpu);
2146 vcpu->cpu = cpu; 2226 vcpu->cpu = cpu;
2147 } 2227 }
2228
2229 accumulate_steal_time(vcpu);
2230 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2148} 2231}
2149 2232
2150void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 2233void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2283,6 +2366,13 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2283 entry->flags = 0; 2366 entry->flags = 0;
2284} 2367}
2285 2368
2369static bool supported_xcr0_bit(unsigned bit)
2370{
2371 u64 mask = ((u64)1 << bit);
2372
2373 return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
2374}
2375
2286#define F(x) bit(X86_FEATURE_##x) 2376#define F(x) bit(X86_FEATURE_##x)
2287 2377
2288static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 2378static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
@@ -2328,7 +2418,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2328 0 /* Reserved, DCA */ | F(XMM4_1) | 2418 0 /* Reserved, DCA */ | F(XMM4_1) |
2329 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 2419 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
2330 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | 2420 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
2331 F(F16C); 2421 F(F16C) | F(RDRAND);
2332 /* cpuid 0x80000001.ecx */ 2422 /* cpuid 0x80000001.ecx */
2333 const u32 kvm_supported_word6_x86_features = 2423 const u32 kvm_supported_word6_x86_features =
2334 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | 2424 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
@@ -2342,6 +2432,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2342 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | 2432 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
2343 F(PMM) | F(PMM_EN); 2433 F(PMM) | F(PMM_EN);
2344 2434
2435 /* cpuid 7.0.ebx */
2436 const u32 kvm_supported_word9_x86_features =
2437 F(SMEP) | F(FSGSBASE) | F(ERMS);
2438
2345 /* all calls to cpuid_count() should be made on the same cpu */ 2439 /* all calls to cpuid_count() should be made on the same cpu */
2346 get_cpu(); 2440 get_cpu();
2347 do_cpuid_1_ent(entry, function, index); 2441 do_cpuid_1_ent(entry, function, index);
@@ -2376,7 +2470,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2376 } 2470 }
2377 break; 2471 break;
2378 } 2472 }
2379 /* function 4 and 0xb have additional index. */ 2473 /* function 4 has additional index. */
2380 case 4: { 2474 case 4: {
2381 int i, cache_type; 2475 int i, cache_type;
2382 2476
@@ -2393,6 +2487,22 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2393 } 2487 }
2394 break; 2488 break;
2395 } 2489 }
2490 case 7: {
2491 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2492 /* Mask ebx against host capbability word 9 */
2493 if (index == 0) {
2494 entry->ebx &= kvm_supported_word9_x86_features;
2495 cpuid_mask(&entry->ebx, 9);
2496 } else
2497 entry->ebx = 0;
2498 entry->eax = 0;
2499 entry->ecx = 0;
2500 entry->edx = 0;
2501 break;
2502 }
2503 case 9:
2504 break;
2505 /* function 0xb has additional index. */
2396 case 0xb: { 2506 case 0xb: {
2397 int i, level_type; 2507 int i, level_type;
2398 2508
@@ -2410,16 +2520,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2410 break; 2520 break;
2411 } 2521 }
2412 case 0xd: { 2522 case 0xd: {
2413 int i; 2523 int idx, i;
2414 2524
2415 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2525 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2416 for (i = 1; *nent < maxnent && i < 64; ++i) { 2526 for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
2417 if (entry[i].eax == 0) 2527 do_cpuid_1_ent(&entry[i], function, idx);
2528 if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
2418 continue; 2529 continue;
2419 do_cpuid_1_ent(&entry[i], function, i);
2420 entry[i].flags |= 2530 entry[i].flags |=
2421 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2531 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2422 ++*nent; 2532 ++*nent;
2533 ++i;
2423 } 2534 }
2424 break; 2535 break;
2425 } 2536 }
@@ -2438,6 +2549,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2438 (1 << KVM_FEATURE_CLOCKSOURCE2) | 2549 (1 << KVM_FEATURE_CLOCKSOURCE2) |
2439 (1 << KVM_FEATURE_ASYNC_PF) | 2550 (1 << KVM_FEATURE_ASYNC_PF) |
2440 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 2551 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
2552
2553 if (sched_info_on())
2554 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
2555
2441 entry->ebx = 0; 2556 entry->ebx = 0;
2442 entry->ecx = 0; 2557 entry->ecx = 0;
2443 entry->edx = 0; 2558 entry->edx = 0;
@@ -2451,6 +2566,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2451 entry->ecx &= kvm_supported_word6_x86_features; 2566 entry->ecx &= kvm_supported_word6_x86_features;
2452 cpuid_mask(&entry->ecx, 6); 2567 cpuid_mask(&entry->ecx, 6);
2453 break; 2568 break;
2569 case 0x80000008: {
2570 unsigned g_phys_as = (entry->eax >> 16) & 0xff;
2571 unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
2572 unsigned phys_as = entry->eax & 0xff;
2573
2574 if (!g_phys_as)
2575 g_phys_as = phys_as;
2576 entry->eax = g_phys_as | (virt_as << 8);
2577 entry->ebx = entry->edx = 0;
2578 break;
2579 }
2580 case 0x80000019:
2581 entry->ecx = entry->edx = 0;
2582 break;
2583 case 0x8000001a:
2584 break;
2585 case 0x8000001d:
2586 break;
2454 /*Add support for Centaur's CPUID instruction*/ 2587 /*Add support for Centaur's CPUID instruction*/
2455 case 0xC0000000: 2588 case 0xC0000000:
2456 /*Just support up to 0xC0000004 now*/ 2589 /*Just support up to 0xC0000004 now*/
@@ -2460,10 +2593,16 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2460 entry->edx &= kvm_supported_word5_x86_features; 2593 entry->edx &= kvm_supported_word5_x86_features;
2461 cpuid_mask(&entry->edx, 5); 2594 cpuid_mask(&entry->edx, 5);
2462 break; 2595 break;
2596 case 3: /* Processor serial number */
2597 case 5: /* MONITOR/MWAIT */
2598 case 6: /* Thermal management */
2599 case 0xA: /* Architectural Performance Monitoring */
2600 case 0x80000007: /* Advanced power management */
2463 case 0xC0000002: 2601 case 0xC0000002:
2464 case 0xC0000003: 2602 case 0xC0000003:
2465 case 0xC0000004: 2603 case 0xC0000004:
2466 /*Now nothing to do, reserved for the future*/ 2604 default:
2605 entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
2467 break; 2606 break;
2468 } 2607 }
2469 2608
@@ -3817,7 +3956,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
3817 exception); 3956 exception);
3818} 3957}
3819 3958
3820static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, 3959int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3821 gva_t addr, void *val, unsigned int bytes, 3960 gva_t addr, void *val, unsigned int bytes,
3822 struct x86_exception *exception) 3961 struct x86_exception *exception)
3823{ 3962{
@@ -3827,6 +3966,7 @@ static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3827 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3966 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3828 exception); 3967 exception);
3829} 3968}
3969EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
3830 3970
3831static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, 3971static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3832 gva_t addr, void *val, unsigned int bytes, 3972 gva_t addr, void *val, unsigned int bytes,
@@ -3836,7 +3976,7 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3836 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); 3976 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3837} 3977}
3838 3978
3839static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, 3979int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3840 gva_t addr, void *val, 3980 gva_t addr, void *val,
3841 unsigned int bytes, 3981 unsigned int bytes,
3842 struct x86_exception *exception) 3982 struct x86_exception *exception)
@@ -3868,6 +4008,42 @@ static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3868out: 4008out:
3869 return r; 4009 return r;
3870} 4010}
4011EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
4012
4013static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4014 gpa_t *gpa, struct x86_exception *exception,
4015 bool write)
4016{
4017 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4018
4019 if (vcpu_match_mmio_gva(vcpu, gva) &&
4020 check_write_user_access(vcpu, write, access,
4021 vcpu->arch.access)) {
4022 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
4023 (gva & (PAGE_SIZE - 1));
4024 trace_vcpu_match_mmio(gva, *gpa, write, false);
4025 return 1;
4026 }
4027
4028 if (write)
4029 access |= PFERR_WRITE_MASK;
4030
4031 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4032
4033 if (*gpa == UNMAPPED_GVA)
4034 return -1;
4035
4036 /* For APIC access vmexit */
4037 if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4038 return 1;
4039
4040 if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
4041 trace_vcpu_match_mmio(gva, *gpa, write, true);
4042 return 1;
4043 }
4044
4045 return 0;
4046}
3871 4047
3872static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, 4048static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
3873 unsigned long addr, 4049 unsigned long addr,
@@ -3876,8 +4052,8 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
3876 struct x86_exception *exception) 4052 struct x86_exception *exception)
3877{ 4053{
3878 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4054 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3879 gpa_t gpa; 4055 gpa_t gpa;
3880 int handled; 4056 int handled, ret;
3881 4057
3882 if (vcpu->mmio_read_completed) { 4058 if (vcpu->mmio_read_completed) {
3883 memcpy(val, vcpu->mmio_data, bytes); 4059 memcpy(val, vcpu->mmio_data, bytes);
@@ -3887,13 +4063,12 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
3887 return X86EMUL_CONTINUE; 4063 return X86EMUL_CONTINUE;
3888 } 4064 }
3889 4065
3890 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); 4066 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);
3891 4067
3892 if (gpa == UNMAPPED_GVA) 4068 if (ret < 0)
3893 return X86EMUL_PROPAGATE_FAULT; 4069 return X86EMUL_PROPAGATE_FAULT;
3894 4070
3895 /* For APIC access vmexit */ 4071 if (ret)
3896 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3897 goto mmio; 4072 goto mmio;
3898 4073
3899 if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) 4074 if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
@@ -3944,16 +4119,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
3944 struct x86_exception *exception, 4119 struct x86_exception *exception,
3945 struct kvm_vcpu *vcpu) 4120 struct kvm_vcpu *vcpu)
3946{ 4121{
3947 gpa_t gpa; 4122 gpa_t gpa;
3948 int handled; 4123 int handled, ret;
3949 4124
3950 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); 4125 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);
3951 4126
3952 if (gpa == UNMAPPED_GVA) 4127 if (ret < 0)
3953 return X86EMUL_PROPAGATE_FAULT; 4128 return X86EMUL_PROPAGATE_FAULT;
3954 4129
3955 /* For APIC access vmexit */ 4130 /* For APIC access vmexit */
3956 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 4131 if (ret)
3957 goto mmio; 4132 goto mmio;
3958 4133
3959 if (emulator_write_phys(vcpu, gpa, val, bytes)) 4134 if (emulator_write_phys(vcpu, gpa, val, bytes))
@@ -4473,9 +4648,24 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4473 kvm_queue_exception(vcpu, ctxt->exception.vector); 4648 kvm_queue_exception(vcpu, ctxt->exception.vector);
4474} 4649}
4475 4650
4651static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
4652 const unsigned long *regs)
4653{
4654 memset(&ctxt->twobyte, 0,
4655 (void *)&ctxt->regs - (void *)&ctxt->twobyte);
4656 memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
4657
4658 ctxt->fetch.start = 0;
4659 ctxt->fetch.end = 0;
4660 ctxt->io_read.pos = 0;
4661 ctxt->io_read.end = 0;
4662 ctxt->mem_read.pos = 0;
4663 ctxt->mem_read.end = 0;
4664}
4665
4476static void init_emulate_ctxt(struct kvm_vcpu *vcpu) 4666static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4477{ 4667{
4478 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4668 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4479 int cs_db, cs_l; 4669 int cs_db, cs_l;
4480 4670
4481 /* 4671 /*
@@ -4488,40 +4678,38 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4488 4678
4489 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 4679 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4490 4680
4491 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); 4681 ctxt->eflags = kvm_get_rflags(vcpu);
4492 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); 4682 ctxt->eip = kvm_rip_read(vcpu);
4493 vcpu->arch.emulate_ctxt.mode = 4683 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4494 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 4684 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
4495 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 4685 cs_l ? X86EMUL_MODE_PROT64 :
4496 ? X86EMUL_MODE_VM86 : cs_l 4686 cs_db ? X86EMUL_MODE_PROT32 :
4497 ? X86EMUL_MODE_PROT64 : cs_db 4687 X86EMUL_MODE_PROT16;
4498 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 4688 ctxt->guest_mode = is_guest_mode(vcpu);
4499 vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); 4689
4500 memset(c, 0, sizeof(struct decode_cache)); 4690 init_decode_cache(ctxt, vcpu->arch.regs);
4501 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4502 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4691 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4503} 4692}
4504 4693
4505int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) 4694int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4506{ 4695{
4507 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4696 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4508 int ret; 4697 int ret;
4509 4698
4510 init_emulate_ctxt(vcpu); 4699 init_emulate_ctxt(vcpu);
4511 4700
4512 vcpu->arch.emulate_ctxt.decode.op_bytes = 2; 4701 ctxt->op_bytes = 2;
4513 vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; 4702 ctxt->ad_bytes = 2;
4514 vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + 4703 ctxt->_eip = ctxt->eip + inc_eip;
4515 inc_eip; 4704 ret = emulate_int_real(ctxt, irq);
4516 ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
4517 4705
4518 if (ret != X86EMUL_CONTINUE) 4706 if (ret != X86EMUL_CONTINUE)
4519 return EMULATE_FAIL; 4707 return EMULATE_FAIL;
4520 4708
4521 vcpu->arch.emulate_ctxt.eip = c->eip; 4709 ctxt->eip = ctxt->_eip;
4522 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 4710 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4523 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 4711 kvm_rip_write(vcpu, ctxt->eip);
4524 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 4712 kvm_set_rflags(vcpu, ctxt->eflags);
4525 4713
4526 if (irq == NMI_VECTOR) 4714 if (irq == NMI_VECTOR)
4527 vcpu->arch.nmi_pending = false; 4715 vcpu->arch.nmi_pending = false;
@@ -4582,21 +4770,21 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4582 int insn_len) 4770 int insn_len)
4583{ 4771{
4584 int r; 4772 int r;
4585 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4773 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4586 bool writeback = true; 4774 bool writeback = true;
4587 4775
4588 kvm_clear_exception_queue(vcpu); 4776 kvm_clear_exception_queue(vcpu);
4589 4777
4590 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4778 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4591 init_emulate_ctxt(vcpu); 4779 init_emulate_ctxt(vcpu);
4592 vcpu->arch.emulate_ctxt.interruptibility = 0; 4780 ctxt->interruptibility = 0;
4593 vcpu->arch.emulate_ctxt.have_exception = false; 4781 ctxt->have_exception = false;
4594 vcpu->arch.emulate_ctxt.perm_ok = false; 4782 ctxt->perm_ok = false;
4595 4783
4596 vcpu->arch.emulate_ctxt.only_vendor_specific_insn 4784 ctxt->only_vendor_specific_insn
4597 = emulation_type & EMULTYPE_TRAP_UD; 4785 = emulation_type & EMULTYPE_TRAP_UD;
4598 4786
4599 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); 4787 r = x86_decode_insn(ctxt, insn, insn_len);
4600 4788
4601 trace_kvm_emulate_insn_start(vcpu); 4789 trace_kvm_emulate_insn_start(vcpu);
4602 ++vcpu->stat.insn_emulation; 4790 ++vcpu->stat.insn_emulation;
@@ -4612,7 +4800,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4612 } 4800 }
4613 4801
4614 if (emulation_type & EMULTYPE_SKIP) { 4802 if (emulation_type & EMULTYPE_SKIP) {
4615 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); 4803 kvm_rip_write(vcpu, ctxt->_eip);
4616 return EMULATE_DONE; 4804 return EMULATE_DONE;
4617 } 4805 }
4618 4806
@@ -4620,11 +4808,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4620 changes registers values during IO operation */ 4808 changes registers values during IO operation */
4621 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { 4809 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
4622 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4810 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4623 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 4811 memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
4624 } 4812 }
4625 4813
4626restart: 4814restart:
4627 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); 4815 r = x86_emulate_insn(ctxt);
4628 4816
4629 if (r == EMULATION_INTERCEPTED) 4817 if (r == EMULATION_INTERCEPTED)
4630 return EMULATE_DONE; 4818 return EMULATE_DONE;
@@ -4636,7 +4824,7 @@ restart:
4636 return handle_emulation_failure(vcpu); 4824 return handle_emulation_failure(vcpu);
4637 } 4825 }
4638 4826
4639 if (vcpu->arch.emulate_ctxt.have_exception) { 4827 if (ctxt->have_exception) {
4640 inject_emulated_exception(vcpu); 4828 inject_emulated_exception(vcpu);
4641 r = EMULATE_DONE; 4829 r = EMULATE_DONE;
4642 } else if (vcpu->arch.pio.count) { 4830 } else if (vcpu->arch.pio.count) {
@@ -4655,13 +4843,12 @@ restart:
4655 r = EMULATE_DONE; 4843 r = EMULATE_DONE;
4656 4844
4657 if (writeback) { 4845 if (writeback) {
4658 toggle_interruptibility(vcpu, 4846 toggle_interruptibility(vcpu, ctxt->interruptibility);
4659 vcpu->arch.emulate_ctxt.interruptibility); 4847 kvm_set_rflags(vcpu, ctxt->eflags);
4660 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4661 kvm_make_request(KVM_REQ_EVENT, vcpu); 4848 kvm_make_request(KVM_REQ_EVENT, vcpu);
4662 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 4849 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4663 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 4850 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
4664 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 4851 kvm_rip_write(vcpu, ctxt->eip);
4665 } else 4852 } else
4666 vcpu->arch.emulate_regs_need_sync_to_vcpu = true; 4853 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
4667 4854
@@ -4878,6 +5065,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
4878} 5065}
4879EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); 5066EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
4880 5067
5068static void kvm_set_mmio_spte_mask(void)
5069{
5070 u64 mask;
5071 int maxphyaddr = boot_cpu_data.x86_phys_bits;
5072
5073 /*
5074 * Set the reserved bits and the present bit of an paging-structure
5075 * entry to generate page fault with PFER.RSV = 1.
5076 */
5077 mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
5078 mask |= 1ull;
5079
5080#ifdef CONFIG_X86_64
5081 /*
5082 * If reserved bit is not supported, clear the present bit to disable
5083 * mmio page fault.
5084 */
5085 if (maxphyaddr == 52)
5086 mask &= ~1ull;
5087#endif
5088
5089 kvm_mmu_set_mmio_spte_mask(mask);
5090}
5091
4881int kvm_arch_init(void *opaque) 5092int kvm_arch_init(void *opaque)
4882{ 5093{
4883 int r; 5094 int r;
@@ -4904,10 +5115,10 @@ int kvm_arch_init(void *opaque)
4904 if (r) 5115 if (r)
4905 goto out; 5116 goto out;
4906 5117
5118 kvm_set_mmio_spte_mask();
4907 kvm_init_msr_list(); 5119 kvm_init_msr_list();
4908 5120
4909 kvm_x86_ops = ops; 5121 kvm_x86_ops = ops;
4910 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
4911 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 5122 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
4912 PT_DIRTY_MASK, PT64_NX_MASK, 0); 5123 PT_DIRTY_MASK, PT64_NX_MASK, 0);
4913 5124
@@ -5082,8 +5293,7 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5082 5293
5083 kvm_x86_ops->patch_hypercall(vcpu, instruction); 5294 kvm_x86_ops->patch_hypercall(vcpu, instruction);
5084 5295
5085 return emulator_write_emulated(&vcpu->arch.emulate_ctxt, 5296 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
5086 rip, instruction, 3, NULL);
5087} 5297}
5088 5298
5089static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 5299static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
@@ -5384,6 +5594,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5384 r = 1; 5594 r = 1;
5385 goto out; 5595 goto out;
5386 } 5596 }
5597 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
5598 record_steal_time(vcpu);
5599
5387 } 5600 }
5388 5601
5389 r = kvm_mmu_reload(vcpu); 5602 r = kvm_mmu_reload(vcpu);
@@ -5671,8 +5884,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5671 * that usually, but some bad designed PV devices (vmware 5884 * that usually, but some bad designed PV devices (vmware
5672 * backdoor interface) need this to work 5885 * backdoor interface) need this to work
5673 */ 5886 */
5674 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 5887 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5675 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 5888 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
5676 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 5889 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5677 } 5890 }
5678 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 5891 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -5801,21 +6014,20 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
5801int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 6014int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5802 bool has_error_code, u32 error_code) 6015 bool has_error_code, u32 error_code)
5803{ 6016{
5804 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 6017 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5805 int ret; 6018 int ret;
5806 6019
5807 init_emulate_ctxt(vcpu); 6020 init_emulate_ctxt(vcpu);
5808 6021
5809 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, 6022 ret = emulator_task_switch(ctxt, tss_selector, reason,
5810 tss_selector, reason, has_error_code, 6023 has_error_code, error_code);
5811 error_code);
5812 6024
5813 if (ret) 6025 if (ret)
5814 return EMULATE_FAIL; 6026 return EMULATE_FAIL;
5815 6027
5816 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 6028 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
5817 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 6029 kvm_rip_write(vcpu, ctxt->eip);
5818 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 6030 kvm_set_rflags(vcpu, ctxt->eflags);
5819 kvm_make_request(KVM_REQ_EVENT, vcpu); 6031 kvm_make_request(KVM_REQ_EVENT, vcpu);
5820 return EMULATE_DONE; 6032 return EMULATE_DONE;
5821} 6033}
@@ -6093,12 +6305,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6093 if (r == 0) 6305 if (r == 0)
6094 r = kvm_mmu_setup(vcpu); 6306 r = kvm_mmu_setup(vcpu);
6095 vcpu_put(vcpu); 6307 vcpu_put(vcpu);
6096 if (r < 0)
6097 goto free_vcpu;
6098 6308
6099 return 0;
6100free_vcpu:
6101 kvm_x86_ops->vcpu_free(vcpu);
6102 return r; 6309 return r;
6103} 6310}
6104 6311
@@ -6126,6 +6333,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
6126 6333
6127 kvm_make_request(KVM_REQ_EVENT, vcpu); 6334 kvm_make_request(KVM_REQ_EVENT, vcpu);
6128 vcpu->arch.apf.msr_val = 0; 6335 vcpu->arch.apf.msr_val = 0;
6336 vcpu->arch.st.msr_val = 0;
6129 6337
6130 kvmclock_reset(vcpu); 6338 kvmclock_reset(vcpu);
6131 6339
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e407ed3df817..d36fe237c665 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -75,10 +75,54 @@ static inline u32 bit(int bitno)
75 return 1 << (bitno & 31); 75 return 1 << (bitno & 31);
76} 76}
77 77
78static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
79 gva_t gva, gfn_t gfn, unsigned access)
80{
81 vcpu->arch.mmio_gva = gva & PAGE_MASK;
82 vcpu->arch.access = access;
83 vcpu->arch.mmio_gfn = gfn;
84}
85
86/*
87 * Clear the mmio cache info for the given gva,
88 * specially, if gva is ~0ul, we clear all mmio cache info.
89 */
90static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
91{
92 if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
93 return;
94
95 vcpu->arch.mmio_gva = 0;
96}
97
98static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
99{
100 if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
101 return true;
102
103 return false;
104}
105
106static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
107{
108 if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
109 return true;
110
111 return false;
112}
113
78void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 114void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
79void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 115void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
80int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); 116int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
81 117
82void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); 118void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
83 119
120int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
121 gva_t addr, void *val, unsigned int bytes,
122 struct x86_exception *exception);
123
124int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
125 gva_t addr, void *val, unsigned int bytes,
126 struct x86_exception *exception);
127
84#endif 128#endif