author     Linus Torvalds <torvalds@linux-foundation.org>  2011-07-24 12:07:03 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2011-07-24 12:07:03 -0400
commit     5fabc487c96819dd12ddb9414835d170fd9cd6d5
tree       01532d492e5074b0d3add29bf92ebf9a9d161e9e /arch/x86/kvm
parent     c61264f98c1a974ee6f545f61a4ab33b141d6bda
parent     3f68b0318bbbd61bf08478ab99a149f0d9e5156e
Merge branch 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits)
KVM: IOMMU: Disable device assignment without interrupt remapping
KVM: MMU: trace mmio page fault
KVM: MMU: mmio page fault support
KVM: MMU: reorganize struct kvm_shadow_walk_iterator
KVM: MMU: lockless walking shadow page table
KVM: MMU: do not need atomicly to set/clear spte
KVM: MMU: introduce the rules to modify shadow page table
KVM: MMU: abstract some functions to handle fault pfn
KVM: MMU: filter out the mmio pfn from the fault pfn
KVM: MMU: remove bypass_guest_pf
KVM: MMU: split kvm_mmu_free_page
KVM: MMU: count used shadow pages on prepareing path
KVM: MMU: rename 'pt_write' to 'emulate'
KVM: MMU: cleanup for FNAME(fetch)
KVM: MMU: optimize to handle dirty bit
KVM: MMU: cache mmio info on page fault path
KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup the code
KVM: MMU: do not update slot bitmap if spte is nonpresent
KVM: MMU: fix walking shadow page table
KVM guest: KVM Steal time registration
...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/Kconfig        |    1
-rw-r--r--  arch/x86/kvm/emulate.c      | 1749
-rw-r--r--  arch/x86/kvm/mmu.c          | 1226
-rw-r--r--  arch/x86/kvm/mmu.h          |   25
-rw-r--r--  arch/x86/kvm/mmu_audit.c    |   12
-rw-r--r--  arch/x86/kvm/mmutrace.h     |   48
-rw-r--r--  arch/x86/kvm/paging_tmpl.h  |  258
-rw-r--r--  arch/x86/kvm/svm.c          |    6
-rw-r--r--  arch/x86/kvm/trace.h        |   31
-rw-r--r--  arch/x86/kvm/vmx.c          | 2784
-rw-r--r--  arch/x86/kvm/x86.c          |  374
-rw-r--r--  arch/x86/kvm/x86.h          |   44
12 files changed, 4744 insertions(+), 1814 deletions(-)
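
The emulate.c portion of this diff is dominated by one mechanical refactoring: the decode state that previously lived in a struct decode_cache embedded as ctxt->decode is folded directly into struct x86_emulate_ctxt, helpers stop threading a separate struct x86_emulate_ops *ops parameter (the ops table is already reachable as ctxt->ops), and the decoder's instruction pointer is renamed from c->eip to ctxt->_eip to distinguish it from the architectural ctxt->eip. A minimal before/after sketch of the pattern, using hypothetical trimmed-down struct layouts rather than the kernel's real definitions:

/*
 * Editor's sketch, not part of the patch: hypothetical, trimmed-down
 * layouts illustrating the ctxt->decode flattening performed below.
 */
struct x86_emulate_ops;

struct old_ctxt {
	struct x86_emulate_ops *ops;
	struct decode_cache {
		int ad_bytes;		/* effective address size, in bytes */
		/* ... modrm_*, src, dst, regs[], fetch, ... */
	} decode;
};

struct new_ctxt {
	struct x86_emulate_ops *ops;
	int ad_bytes;			/* hoisted out of decode_cache */
	/* ... same fields, now directly on the context ... */
};

/* Before: helpers took decode_cache (and often a separate ops pointer). */
static inline unsigned long old_ad_mask(struct old_ctxt *ctxt)
{
	return (1UL << (ctxt->decode.ad_bytes << 3)) - 1;
}

/* After: one ctxt argument; ops is reached as ctxt->ops when needed. */
static inline unsigned long new_ad_mask(struct new_ctxt *ctxt)
{
	return (1UL << (ctxt->ad_bytes << 3)) - 1;
}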
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 65cf8233d25c..988724b236b6 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -31,6 +31,7 @@ config KVM
 	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
+	select TASK_DELAY_ACCT
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index adc98675cda0..6f08bc940fa8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -407,76 +407,59 @@ struct gprefix {
 	} \
 } while (0)
 
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip) \
-({ unsigned long _x; \
-	rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
-	if (rc != X86EMUL_CONTINUE) \
-		goto done; \
-	(_eip) += (_size); \
-	(_type)_x; \
-})
-
-#define insn_fetch_arr(_arr, _size, _eip) \
-({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
-	if (rc != X86EMUL_CONTINUE) \
-		goto done; \
-	(_eip) += (_size); \
-})
-
 static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
 				    enum x86_intercept intercept,
 				    enum x86_intercept_stage stage)
 {
 	struct x86_instruction_info info = {
 		.intercept = intercept,
-		.rep_prefix = ctxt->decode.rep_prefix,
-		.modrm_mod = ctxt->decode.modrm_mod,
-		.modrm_reg = ctxt->decode.modrm_reg,
-		.modrm_rm = ctxt->decode.modrm_rm,
-		.src_val = ctxt->decode.src.val64,
-		.src_bytes = ctxt->decode.src.bytes,
-		.dst_bytes = ctxt->decode.dst.bytes,
-		.ad_bytes = ctxt->decode.ad_bytes,
+		.rep_prefix = ctxt->rep_prefix,
+		.modrm_mod = ctxt->modrm_mod,
+		.modrm_reg = ctxt->modrm_reg,
+		.modrm_rm = ctxt->modrm_rm,
+		.src_val = ctxt->src.val64,
+		.src_bytes = ctxt->src.bytes,
+		.dst_bytes = ctxt->dst.bytes,
+		.ad_bytes = ctxt->ad_bytes,
 		.next_rip = ctxt->eip,
 	};
 
 	return ctxt->ops->intercept(ctxt, &info, stage);
 }
 
-static inline unsigned long ad_mask(struct decode_cache *c)
+static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
 {
-	return (1UL << (c->ad_bytes << 3)) - 1;
+	return (1UL << (ctxt->ad_bytes << 3)) - 1;
 }
 
 /* Access/update address held in a register, based on addressing mode. */
 static inline unsigned long
-address_mask(struct decode_cache *c, unsigned long reg)
+address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
 {
-	if (c->ad_bytes == sizeof(unsigned long))
+	if (ctxt->ad_bytes == sizeof(unsigned long))
 		return reg;
 	else
-		return reg & ad_mask(c);
+		return reg & ad_mask(ctxt);
 }
 
 static inline unsigned long
-register_address(struct decode_cache *c, unsigned long reg)
+register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
 {
-	return address_mask(c, reg);
+	return address_mask(ctxt, reg);
 }
 
 static inline void
-register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
+register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
 {
-	if (c->ad_bytes == sizeof(unsigned long))
+	if (ctxt->ad_bytes == sizeof(unsigned long))
 		*reg += inc;
 	else
-		*reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
+		*reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));
 }
 
-static inline void jmp_rel(struct decode_cache *c, int rel)
+static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
 {
-	register_address_increment(c, &c->eip, rel);
+	register_address_increment(ctxt, &ctxt->_eip, rel);
 }
 
 static u32 desc_limit_scaled(struct desc_struct *desc)
@@ -486,28 +469,26 @@ static u32 desc_limit_scaled(struct desc_struct *desc)
 	return desc->g ? (limit << 12) | 0xfff : limit;
 }
 
-static void set_seg_override(struct decode_cache *c, int seg)
+static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)
 {
-	c->has_seg_override = true;
-	c->seg_override = seg;
+	ctxt->has_seg_override = true;
+	ctxt->seg_override = seg;
 }
 
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
-			      struct x86_emulate_ops *ops, int seg)
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
 {
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
 		return 0;
 
-	return ops->get_cached_segment_base(ctxt, seg);
+	return ctxt->ops->get_cached_segment_base(ctxt, seg);
 }
 
-static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
-			     struct decode_cache *c)
+static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
 {
-	if (!c->has_seg_override)
+	if (!ctxt->has_seg_override)
 		return 0;
 
-	return c->seg_override;
+	return ctxt->seg_override;
 }
 
 static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
@@ -579,7 +560,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 			    unsigned size, bool write, bool fetch,
 			    ulong *linear)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct desc_struct desc;
 	bool usable;
 	ulong la;
@@ -587,7 +567,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 	u16 sel;
 	unsigned cpl, rpl;
 
-	la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
+	la = seg_base(ctxt, addr.seg) + addr.ea;
 	switch (ctxt->mode) {
 	case X86EMUL_MODE_REAL:
 		break;
@@ -637,7 +617,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 		}
 		break;
 	}
-	if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8)
+	if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
 		la &= (u32)-1;
 	*linear = la;
 	return X86EMUL_CONTINUE;
@@ -671,11 +651,10 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
 	return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
 }
 
-static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
-			      struct x86_emulate_ops *ops,
+static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
 			      unsigned long eip, u8 *dest)
 {
-	struct fetch_cache *fc = &ctxt->decode.fetch;
+	struct fetch_cache *fc = &ctxt->fetch;
 	int rc;
 	int size, cur_size;
 
@@ -687,8 +666,8 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 		rc = __linearize(ctxt, addr, size, false, true, &linear);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
-		rc = ops->fetch(ctxt, linear, fc->data + cur_size,
-				size, &ctxt->exception);
+		rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
+				      size, &ctxt->exception);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		fc->end += size;
@@ -698,7 +677,6 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 }
 
 static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
-			 struct x86_emulate_ops *ops,
 			 unsigned long eip, void *dest, unsigned size)
 {
 	int rc;
@@ -707,13 +685,30 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 	if (eip + size - ctxt->eip > 15)
 		return X86EMUL_UNHANDLEABLE;
 	while (size--) {
-		rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
+		rc = do_insn_fetch_byte(ctxt, eip++, dest++);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 	}
 	return X86EMUL_CONTINUE;
 }
 
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _size, _eip) \
+({ unsigned long _x; \
+	rc = do_insn_fetch(ctxt, (_eip), &_x, (_size)); \
+	if (rc != X86EMUL_CONTINUE) \
+		goto done; \
+	(_eip) += (_size); \
+	(_type)_x; \
+})
+
+#define insn_fetch_arr(_arr, _size, _eip) \
+({ rc = do_insn_fetch(ctxt, (_eip), _arr, (_size)); \
+	if (rc != X86EMUL_CONTINUE) \
+		goto done; \
+	(_eip) += (_size); \
+})
+
 /*
  * Given the 'reg' portion of a ModRM byte, and a register block, return a
  * pointer into the block that addresses the relevant register.
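
The two insn_fetch macros reappear above, moved below do_insn_fetch() so they expand to a direct call without the dropped ops argument. do_insn_fetch() enforces x86's 15-byte instruction-length limit and reads ahead through a small per-instruction cache. A self-contained user-space model of that byte-at-a-time read-ahead (hypothetical names; the kernel refills the cache in larger chunks and linearizes eip through the code segment first):

#include <stdio.h>

/*
 * Editor's sketch, not part of the patch: a toy model of the
 * read-ahead in do_insn_fetch_byte()/do_insn_fetch().
 */
struct fetch_cache {
	unsigned char data[15];	/* x86 instructions are at most 15 bytes */
	unsigned long start;	/* guest address cached at data[0] */
	unsigned long end;	/* first guest address not yet cached */
};

static const unsigned char guest[] = { 0x89, 0xd8, 0xc3 }; /* mov %ebx,%eax; ret */

static int fetch_byte(struct fetch_cache *fc, unsigned long eip,
		      unsigned char *dest)
{
	if (eip >= fc->end) {				/* miss: refill */
		if (fc->end - fc->start >= sizeof(fc->data))
			return -1;			/* >15 bytes: fail */
		fc->data[fc->end - fc->start] = guest[fc->end];
		fc->end++;
	}
	*dest = fc->data[eip - fc->start];		/* hit: serve cached */
	return 0;
}

int main(void)
{
	struct fetch_cache fc = { .start = 0, .end = 0 };
	unsigned char b;

	for (unsigned long eip = 0; eip < sizeof(guest); eip++)
		if (fetch_byte(&fc, eip, &b) == 0)
			printf("byte %lu: %#x\n", eip, b);
	return 0;
}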
@@ -857,16 +852,15 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 				    struct operand *op,
-				    struct decode_cache *c,
 				    int inhibit_bytereg)
 {
-	unsigned reg = c->modrm_reg;
-	int highbyte_regs = c->rex_prefix == 0;
+	unsigned reg = ctxt->modrm_reg;
+	int highbyte_regs = ctxt->rex_prefix == 0;
 
-	if (!(c->d & ModRM))
-		reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+	if (!(ctxt->d & ModRM))
+		reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
 
-	if (c->d & Sse) {
+	if (ctxt->d & Sse) {
 		op->type = OP_XMM;
 		op->bytes = 16;
 		op->addr.xmm = reg;
@@ -875,49 +869,47 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 	}
 
 	op->type = OP_REG;
-	if ((c->d & ByteOp) && !inhibit_bytereg) {
-		op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
+	if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
+		op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
 		op->bytes = 1;
 	} else {
-		op->addr.reg = decode_register(reg, c->regs, 0);
-		op->bytes = c->op_bytes;
+		op->addr.reg = decode_register(reg, ctxt->regs, 0);
+		op->bytes = ctxt->op_bytes;
 	}
 	fetch_register_operand(op);
 	op->orig_val = op->val;
 }
 
 static int decode_modrm(struct x86_emulate_ctxt *ctxt,
-			struct x86_emulate_ops *ops,
 			struct operand *op)
 {
-	struct decode_cache *c = &ctxt->decode;
 	u8 sib;
 	int index_reg = 0, base_reg = 0, scale;
 	int rc = X86EMUL_CONTINUE;
 	ulong modrm_ea = 0;
 
-	if (c->rex_prefix) {
-		c->modrm_reg = (c->rex_prefix & 4) << 1;	/* REX.R */
-		index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
-		c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */
+	if (ctxt->rex_prefix) {
+		ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1;	/* REX.R */
+		index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */
+		ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
 	}
 
-	c->modrm = insn_fetch(u8, 1, c->eip);
-	c->modrm_mod |= (c->modrm & 0xc0) >> 6;
-	c->modrm_reg |= (c->modrm & 0x38) >> 3;
-	c->modrm_rm |= (c->modrm & 0x07);
-	c->modrm_seg = VCPU_SREG_DS;
+	ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+	ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
+	ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
+	ctxt->modrm_rm |= (ctxt->modrm & 0x07);
+	ctxt->modrm_seg = VCPU_SREG_DS;
 
-	if (c->modrm_mod == 3) {
+	if (ctxt->modrm_mod == 3) {
 		op->type = OP_REG;
-		op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		op->addr.reg = decode_register(c->modrm_rm,
-					       c->regs, c->d & ByteOp);
-		if (c->d & Sse) {
+		op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+		op->addr.reg = decode_register(ctxt->modrm_rm,
+					       ctxt->regs, ctxt->d & ByteOp);
+		if (ctxt->d & Sse) {
 			op->type = OP_XMM;
 			op->bytes = 16;
-			op->addr.xmm = c->modrm_rm;
-			read_sse_reg(ctxt, &op->vec_val, c->modrm_rm);
+			op->addr.xmm = ctxt->modrm_rm;
+			read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
 			return rc;
 		}
 		fetch_register_operand(op);
@@ -926,26 +918,26 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 
 	op->type = OP_MEM;
 
-	if (c->ad_bytes == 2) {
-		unsigned bx = c->regs[VCPU_REGS_RBX];
-		unsigned bp = c->regs[VCPU_REGS_RBP];
-		unsigned si = c->regs[VCPU_REGS_RSI];
-		unsigned di = c->regs[VCPU_REGS_RDI];
+	if (ctxt->ad_bytes == 2) {
+		unsigned bx = ctxt->regs[VCPU_REGS_RBX];
+		unsigned bp = ctxt->regs[VCPU_REGS_RBP];
+		unsigned si = ctxt->regs[VCPU_REGS_RSI];
+		unsigned di = ctxt->regs[VCPU_REGS_RDI];
 
 		/* 16-bit ModR/M decode. */
-		switch (c->modrm_mod) {
+		switch (ctxt->modrm_mod) {
 		case 0:
-			if (c->modrm_rm == 6)
-				modrm_ea += insn_fetch(u16, 2, c->eip);
+			if (ctxt->modrm_rm == 6)
+				modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
 			break;
 		case 1:
-			modrm_ea += insn_fetch(s8, 1, c->eip);
+			modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
 			break;
 		case 2:
-			modrm_ea += insn_fetch(u16, 2, c->eip);
+			modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
 			break;
 		}
-		switch (c->modrm_rm) {
+		switch (ctxt->modrm_rm) {
 		case 0:
 			modrm_ea += bx + si;
 			break;
@@ -965,46 +957,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			modrm_ea += di;
 			break;
 		case 6:
-			if (c->modrm_mod != 0)
+			if (ctxt->modrm_mod != 0)
 				modrm_ea += bp;
 			break;
 		case 7:
 			modrm_ea += bx;
 			break;
 		}
-		if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
-		    (c->modrm_rm == 6 && c->modrm_mod != 0))
-			c->modrm_seg = VCPU_SREG_SS;
+		if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
+		    (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
+			ctxt->modrm_seg = VCPU_SREG_SS;
 		modrm_ea = (u16)modrm_ea;
 	} else {
 		/* 32/64-bit ModR/M decode. */
-		if ((c->modrm_rm & 7) == 4) {
-			sib = insn_fetch(u8, 1, c->eip);
+		if ((ctxt->modrm_rm & 7) == 4) {
+			sib = insn_fetch(u8, 1, ctxt->_eip);
 			index_reg |= (sib >> 3) & 7;
 			base_reg |= sib & 7;
 			scale = sib >> 6;
 
-			if ((base_reg & 7) == 5 && c->modrm_mod == 0)
-				modrm_ea += insn_fetch(s32, 4, c->eip);
+			if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
+				modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
 			else
-				modrm_ea += c->regs[base_reg];
+				modrm_ea += ctxt->regs[base_reg];
 			if (index_reg != 4)
-				modrm_ea += c->regs[index_reg] << scale;
-		} else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
+				modrm_ea += ctxt->regs[index_reg] << scale;
+		} else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
 			if (ctxt->mode == X86EMUL_MODE_PROT64)
-				c->rip_relative = 1;
+				ctxt->rip_relative = 1;
 		} else
-			modrm_ea += c->regs[c->modrm_rm];
-		switch (c->modrm_mod) {
+			modrm_ea += ctxt->regs[ctxt->modrm_rm];
+		switch (ctxt->modrm_mod) {
 		case 0:
-			if (c->modrm_rm == 5)
-				modrm_ea += insn_fetch(s32, 4, c->eip);
+			if (ctxt->modrm_rm == 5)
+				modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
 			break;
 		case 1:
-			modrm_ea += insn_fetch(s8, 1, c->eip);
+			modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
 			break;
 		case 2:
-			modrm_ea += insn_fetch(s32, 4, c->eip);
+			modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
 			break;
 		}
 	}
@@ -1014,53 +1006,50 @@ done:
 }
 
 static int decode_abs(struct x86_emulate_ctxt *ctxt,
-		      struct x86_emulate_ops *ops,
 		      struct operand *op)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 
 	op->type = OP_MEM;
-	switch (c->ad_bytes) {
+	switch (ctxt->ad_bytes) {
 	case 2:
-		op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
+		op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip);
 		break;
 	case 4:
-		op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
+		op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip);
 		break;
 	case 8:
-		op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
+		op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip);
 		break;
 	}
 done:
 	return rc;
 }
 
-static void fetch_bit_operand(struct decode_cache *c)
+static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
 {
 	long sv = 0, mask;
 
-	if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
-		mask = ~(c->dst.bytes * 8 - 1);
+	if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
+		mask = ~(ctxt->dst.bytes * 8 - 1);
 
-		if (c->src.bytes == 2)
-			sv = (s16)c->src.val & (s16)mask;
-		else if (c->src.bytes == 4)
-			sv = (s32)c->src.val & (s32)mask;
+		if (ctxt->src.bytes == 2)
+			sv = (s16)ctxt->src.val & (s16)mask;
+		else if (ctxt->src.bytes == 4)
+			sv = (s32)ctxt->src.val & (s32)mask;
 
-		c->dst.addr.mem.ea += (sv >> 3);
+		ctxt->dst.addr.mem.ea += (sv >> 3);
 	}
 
 	/* only subword offset */
-	c->src.val &= (c->dst.bytes << 3) - 1;
+	ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
 }
 
 static int read_emulated(struct x86_emulate_ctxt *ctxt,
-			 struct x86_emulate_ops *ops,
 			 unsigned long addr, void *dest, unsigned size)
 {
 	int rc;
-	struct read_cache *mc = &ctxt->decode.mem_read;
+	struct read_cache *mc = &ctxt->mem_read;
 
 	while (size) {
 		int n = min(size, 8u);
@@ -1068,8 +1057,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 		if (mc->pos < mc->end)
 			goto read_cached;
 
-		rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
-					&ctxt->exception);
+		rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
+					      &ctxt->exception);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		mc->end += n;
@@ -1094,7 +1083,7 @@ static int segmented_read(struct x86_emulate_ctxt *ctxt,
 	rc = linearize(ctxt, addr, size, false, &linear);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	return read_emulated(ctxt, ctxt->ops, linear, data, size);
+	return read_emulated(ctxt, linear, data, size);
 }
 
 static int segmented_write(struct x86_emulate_ctxt *ctxt,
@@ -1128,26 +1117,24 @@ static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
 }
 
 static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
-			   struct x86_emulate_ops *ops,
 			   unsigned int size, unsigned short port,
 			   void *dest)
 {
-	struct read_cache *rc = &ctxt->decode.io_read;
+	struct read_cache *rc = &ctxt->io_read;
 
 	if (rc->pos == rc->end) { /* refill pio read ahead */
-		struct decode_cache *c = &ctxt->decode;
 		unsigned int in_page, n;
-		unsigned int count = c->rep_prefix ?
-			address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
+		unsigned int count = ctxt->rep_prefix ?
			address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1;
 		in_page = (ctxt->eflags & EFLG_DF) ?
-			offset_in_page(c->regs[VCPU_REGS_RDI]) :
-			PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
+			offset_in_page(ctxt->regs[VCPU_REGS_RDI]) :
+			PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]);
 		n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
 			count);
 		if (n == 0)
 			n = 1;
 		rc->pos = rc->end = 0;
-		if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n))
+		if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
 			return 0;
 		rc->end = n * size;
 	}
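
pio_in_emulated() above batches a REP IN string operation: on a cache miss it computes how many elements can be fetched in one host round trip, capped by the element count in RCX, the room left in the read cache, and the bytes remaining in the destination page (the direction depends on EFLAGS.DF). A small stand-alone model of that batch-size computation (IO_CACHE is a made-up stand-in for sizeof(rc->data)):

#include <stdio.h>

#define PAGE_SIZE 4096ul
#define IO_CACHE  1024ul	/* made-up stand-in for sizeof(rc->data) */

/*
 * Editor's sketch, not part of the patch: count comes from RCX (1
 * without a REP prefix), rdi is the flat destination address, size the
 * element width, df the EFLAGS direction flag.
 */
static unsigned long pio_batch(unsigned long count, unsigned long rdi,
			       unsigned long size, int df)
{
	unsigned long off = rdi & (PAGE_SIZE - 1);
	unsigned long in_page = df ? off : PAGE_SIZE - off;
	unsigned long room = in_page < IO_CACHE ? in_page : IO_CACHE;
	unsigned long n = room / size;

	if (n > count)
		n = count;
	return n ? n : 1;	/* always make forward progress */
}

int main(void)
{
	/* 1000 dwords, 8 bytes below a page boundary, DF clear: batch of 2 */
	printf("%lu\n", pio_batch(1000, 0x1ff8, 4, 0));
	return 0;
}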
@@ -1158,9 +1145,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 }
 
 static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
-				     struct x86_emulate_ops *ops,
 				     u16 selector, struct desc_ptr *dt)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
+
 	if (selector & 1 << 2) {
 		struct desc_struct desc;
 		u16 sel;
@@ -1177,48 +1165,42 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 
 /* allowed just for 8 bytes segments */
 static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				   struct x86_emulate_ops *ops,
 				   u16 selector, struct desc_struct *desc)
 {
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
-	int ret;
 	ulong addr;
 
-	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+	get_descriptor_table_ptr(ctxt, selector, &dt);
 
 	if (dt.size < index * 8 + 7)
 		return emulate_gp(ctxt, selector & 0xfffc);
-	addr = dt.address + index * 8;
-	ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
 
-	return ret;
+	addr = dt.address + index * 8;
+	return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+				   &ctxt->exception);
 }
 
 /* allowed just for 8 bytes segments */
 static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				    struct x86_emulate_ops *ops,
 				    u16 selector, struct desc_struct *desc)
 {
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
 	ulong addr;
-	int ret;
 
-	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+	get_descriptor_table_ptr(ctxt, selector, &dt);
 
 	if (dt.size < index * 8 + 7)
 		return emulate_gp(ctxt, selector & 0xfffc);
 
 	addr = dt.address + index * 8;
-	ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
-
-	return ret;
+	return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
+				    &ctxt->exception);
 }
 
 /* Does not support long mode */
 static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				   struct x86_emulate_ops *ops,
 				   u16 selector, int seg)
 {
 	struct desc_struct seg_desc;
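
read_segment_descriptor() and write_segment_descriptor() above both reduce to the same addressing rule: a selector's bits 15:3 index 8-byte descriptors from the table base fetched by get_descriptor_table_ptr() (bit 2 picks the LDT over the GDT), after checking that the whole descriptor fits under the table limit. A sketch of just that computation (returning 0 on the over-limit case is an illustrative convention; the kernel raises #GP(selector) instead):

#include <stdint.h>
#include <stdio.h>

struct desc_ptr {
	uint64_t address;	/* table base */
	uint16_t size;		/* table limit (last valid byte offset) */
};

/* Editor's sketch, not part of the patch. */
static uint64_t descriptor_addr(const struct desc_ptr *dt, uint16_t selector)
{
	uint16_t index = selector >> 3;		/* selector bits 15:3 */

	if (dt->size < index * 8 + 7)		/* whole 8 bytes must fit */
		return 0;
	return dt->address + index * 8;
}

int main(void)
{
	struct desc_ptr gdt = { .address = 0x1000, .size = 8 * 32 - 1 };

	/* selector 0x10: index 2, TI=0 (GDT), RPL=0 -> prints 0x1010 */
	printf("%#llx\n", (unsigned long long)descriptor_addr(&gdt, 0x10));
	return 0;
}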
@@ -1253,7 +1235,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (null_selector) /* for NULL selector skip all following checks */
 		goto load;
 
-	ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
+	ret = read_segment_descriptor(ctxt, selector, &seg_desc);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -1271,7 +1253,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 	rpl = selector & 3;
 	dpl = seg_desc.dpl;
-	cpl = ops->cpl(ctxt);
+	cpl = ctxt->ops->cpl(ctxt);
 
 	switch (seg) {
 	case VCPU_SREG_SS:
@@ -1322,12 +1304,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (seg_desc.s) {
 		/* mark segment as accessed */
 		seg_desc.type |= 1;
-		ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
+		ret = write_segment_descriptor(ctxt, selector, &seg_desc);
 		if (ret != X86EMUL_CONTINUE)
 			return ret;
 	}
 load:
-	ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
+	ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
 	return X86EMUL_CONTINUE;
 exception:
 	emulate_exception(ctxt, err_vec, err_code, true);
@@ -1356,29 +1338,28 @@ static void write_register_operand(struct operand *op)
 static int writeback(struct x86_emulate_ctxt *ctxt)
 {
 	int rc;
-	struct decode_cache *c = &ctxt->decode;
 
-	switch (c->dst.type) {
+	switch (ctxt->dst.type) {
 	case OP_REG:
-		write_register_operand(&c->dst);
+		write_register_operand(&ctxt->dst);
 		break;
 	case OP_MEM:
-		if (c->lock_prefix)
+		if (ctxt->lock_prefix)
 			rc = segmented_cmpxchg(ctxt,
-					       c->dst.addr.mem,
-					       &c->dst.orig_val,
-					       &c->dst.val,
-					       c->dst.bytes);
+					       ctxt->dst.addr.mem,
+					       &ctxt->dst.orig_val,
+					       &ctxt->dst.val,
+					       ctxt->dst.bytes);
 		else
 			rc = segmented_write(ctxt,
-					     c->dst.addr.mem,
-					     &c->dst.val,
-					     c->dst.bytes);
+					     ctxt->dst.addr.mem,
+					     &ctxt->dst.val,
+					     ctxt->dst.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
 	case OP_XMM:
-		write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm);
+		write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
 		break;
 	case OP_NONE:
 		/* no writeback */
@@ -1391,50 +1372,45 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
 
 static int em_push(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct segmented_address addr;
 
-	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-	addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes);
+	addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
 	addr.seg = VCPU_SREG_SS;
 
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
-	return segmented_write(ctxt, addr, &c->src.val, c->op_bytes);
+	ctxt->dst.type = OP_NONE;
+	return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes);
 }
 
 static int emulate_pop(struct x86_emulate_ctxt *ctxt,
 		       void *dest, int len)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 	struct segmented_address addr;
 
-	addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+	addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
 	addr.seg = VCPU_SREG_SS;
 	rc = segmented_read(ctxt, addr, dest, len);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
+	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len);
 	return rc;
 }
 
 static int em_pop(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	return emulate_pop(ctxt, &c->dst.val, c->op_bytes);
+	return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 }
 
 static int emulate_popf(struct x86_emulate_ctxt *ctxt,
-			struct x86_emulate_ops *ops,
-			void *dest, int len)
+			void *dest, int len)
 {
 	int rc;
 	unsigned long val, change_mask;
 	int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-	int cpl = ops->cpl(ctxt);
+	int cpl = ctxt->ops->cpl(ctxt);
 
 	rc = emulate_pop(ctxt, &val, len);
 	if (rc != X86EMUL_CONTINUE)
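
em_push() and emulate_pop() above move RSP through register_address_increment(), which only touches the low ad_bytes * 8 bits of the register, so 16- and 32-bit stacks wrap instead of borrowing into the untouched high bits. A runnable model of that masking (simplified; the real code also applies the SS segment base when forming the address):

#include <stdio.h>

/* Editor's sketch, not part of the patch. */
static unsigned long reg_add(unsigned long reg, long inc, int ad_bytes)
{
	if (ad_bytes == (int)sizeof(unsigned long))
		return reg + inc;
	else {
		unsigned long mask = (1UL << (ad_bytes * 8)) - 1;

		return (reg & ~mask) | ((reg + inc) & mask);
	}
}

int main(void)
{
	unsigned long rsp = 0x12340000;		/* SP == 0, high bits live */

	rsp = reg_add(rsp, -2, 2);		/* 16-bit push of 2 bytes */
	printf("%#lx\n", rsp);			/* prints 0x1234fffe */
	return 0;
}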
@@ -1470,49 +1446,41 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 
 static int em_popf(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->dst.type = OP_REG;
-	c->dst.addr.reg = &ctxt->eflags;
-	c->dst.bytes = c->op_bytes;
-	return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
+	ctxt->dst.type = OP_REG;
+	ctxt->dst.addr.reg = &ctxt->eflags;
+	ctxt->dst.bytes = ctxt->op_bytes;
+	return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 }
 
-static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops, int seg)
+static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->src.val = get_segment_selector(ctxt, seg);
+	ctxt->src.val = get_segment_selector(ctxt, seg);
 
 	return em_push(ctxt);
 }
 
-static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
-			    struct x86_emulate_ops *ops, int seg)
+static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg)
 {
-	struct decode_cache *c = &ctxt->decode;
 	unsigned long selector;
 	int rc;
 
-	rc = emulate_pop(ctxt, &selector, c->op_bytes);
+	rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
+	rc = load_segment_descriptor(ctxt, (u16)selector, seg);
 	return rc;
 }
 
 static int em_pusha(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+	unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP];
 	int rc = X86EMUL_CONTINUE;
 	int reg = VCPU_REGS_RAX;
 
 	while (reg <= VCPU_REGS_RDI) {
 		(reg == VCPU_REGS_RSP) ?
-		(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
+		(ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]);
 
 		rc = em_push(ctxt);
 		if (rc != X86EMUL_CONTINUE)
@@ -1526,26 +1494,23 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
 
 static int em_pushf(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->src.val = (unsigned long)ctxt->eflags;
+	ctxt->src.val = (unsigned long)ctxt->eflags;
 	return em_push(ctxt);
 }
 
 static int em_popa(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	int reg = VCPU_REGS_RDI;
 
 	while (reg >= VCPU_REGS_RAX) {
 		if (reg == VCPU_REGS_RSP) {
-			register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-							c->op_bytes);
+			register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP],
+						   ctxt->op_bytes);
 			--reg;
 		}
 
-		rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes);
+		rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes);
 		if (rc != X86EMUL_CONTINUE)
 			break;
 		--reg;
@@ -1553,10 +1518,9 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 	return rc;
 }
 
-int emulate_int_real(struct x86_emulate_ctxt *ctxt,
-		     struct x86_emulate_ops *ops, int irq)
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
 {
-	struct decode_cache *c = &ctxt->decode;
+	struct x86_emulate_ops *ops = ctxt->ops;
 	int rc;
 	struct desc_ptr dt;
 	gva_t cs_addr;
@@ -1564,19 +1528,19 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
 	u16 cs, eip;
 
 	/* TODO: Add limit checks */
-	c->src.val = ctxt->eflags;
+	ctxt->src.val = ctxt->eflags;
 	rc = em_push(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
 	ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
 
-	c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
+	ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
 	rc = em_push(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->src.val = c->eip;
+	ctxt->src.val = ctxt->_eip;
 	rc = em_push(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -1594,21 +1558,20 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->eip = eip;
+	ctxt->_eip = eip;
 
 	return rc;
 }
 
-static int emulate_int(struct x86_emulate_ctxt *ctxt,
-		       struct x86_emulate_ops *ops, int irq)
+static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
 {
 	switch(ctxt->mode) {
 	case X86EMUL_MODE_REAL:
-		return emulate_int_real(ctxt, ops, irq);
+		return emulate_int_real(ctxt, irq);
 	case X86EMUL_MODE_VM86:
 	case X86EMUL_MODE_PROT16:
 	case X86EMUL_MODE_PROT32:
@@ -1619,10 +1582,8 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt,
 	}
 }
 
-static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops)
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	unsigned long temp_eip = 0;
 	unsigned long temp_eflags = 0;
@@ -1634,7 +1595,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
 
 	/* TODO: Add stack limit check */
 
-	rc = emulate_pop(ctxt, &temp_eip, c->op_bytes);
+	rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
 
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -1642,27 +1603,27 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
 	if (temp_eip & ~0xffff)
 		return emulate_gp(ctxt, 0);
 
-	rc = emulate_pop(ctxt, &cs, c->op_bytes);
+	rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
 
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes);
+	rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
 
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
 
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->eip = temp_eip;
+	ctxt->_eip = temp_eip;
 
 
-	if (c->op_bytes == 4)
+	if (ctxt->op_bytes == 4)
 		ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
-	else if (c->op_bytes == 2) {
+	else if (ctxt->op_bytes == 2) {
 		ctxt->eflags &= ~0xffff;
 		ctxt->eflags |= temp_eflags;
 	}
@@ -1673,12 +1634,11 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
-static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops* ops)
+static int em_iret(struct x86_emulate_ctxt *ctxt)
 {
 	switch(ctxt->mode) {
 	case X86EMUL_MODE_REAL:
-		return emulate_iret_real(ctxt, ops);
+		return emulate_iret_real(ctxt);
 	case X86EMUL_MODE_VM86:
 	case X86EMUL_MODE_PROT16:
 	case X86EMUL_MODE_PROT32:
@@ -1691,53 +1651,49 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
 
 static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 	unsigned short sel;
 
-	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
 
-	rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->eip = 0;
-	memcpy(&c->eip, c->src.valptr, c->op_bytes);
+	ctxt->_eip = 0;
+	memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_grp1a(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	return emulate_pop(ctxt, &c->dst.val, c->dst.bytes);
+	return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes);
 }
 
 static int em_grp2(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	switch (c->modrm_reg) {
+	switch (ctxt->modrm_reg) {
 	case 0:	/* rol */
-		emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 1:	/* ror */
-		emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 2:	/* rcl */
-		emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 3:	/* rcr */
-		emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 4:	/* sal/shl */
 	case 6:	/* sal/shl */
-		emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 5:	/* shr */
-		emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 7:	/* sar */
-		emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	}
 	return X86EMUL_CONTINUE;
@@ -1745,33 +1701,32 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt)
 
 static int em_grp3(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	unsigned long *rax = &c->regs[VCPU_REGS_RAX];
-	unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
+	unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX];
+	unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX];
 	u8 de = 0;
 
-	switch (c->modrm_reg) {
+	switch (ctxt->modrm_reg) {
 	case 0 ... 1:	/* test */
-		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 2:	/* not */
-		c->dst.val = ~c->dst.val;
+		ctxt->dst.val = ~ctxt->dst.val;
 		break;
 	case 3:	/* neg */
-		emulate_1op("neg", c->dst, ctxt->eflags);
+		emulate_1op("neg", ctxt->dst, ctxt->eflags);
 		break;
 	case 4:	/* mul */
-		emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
+		emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags);
 		break;
 	case 5:	/* imul */
-		emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
+		emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags);
 		break;
 	case 6:	/* div */
-		emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
+		emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx,
 				       ctxt->eflags, de);
 		break;
 	case 7:	/* idiv */
-		emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
+		emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx,
 				       ctxt->eflags, de);
 		break;
 	default:
@@ -1784,26 +1739,25 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt)
 
 static int em_grp45(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 
-	switch (c->modrm_reg) {
+	switch (ctxt->modrm_reg) {
 	case 0:	/* inc */
-		emulate_1op("inc", c->dst, ctxt->eflags);
+		emulate_1op("inc", ctxt->dst, ctxt->eflags);
 		break;
 	case 1:	/* dec */
-		emulate_1op("dec", c->dst, ctxt->eflags);
+		emulate_1op("dec", ctxt->dst, ctxt->eflags);
 		break;
 	case 2: /* call near abs */ {
 		long int old_eip;
-		old_eip = c->eip;
-		c->eip = c->src.val;
-		c->src.val = old_eip;
+		old_eip = ctxt->_eip;
+		ctxt->_eip = ctxt->src.val;
+		ctxt->src.val = old_eip;
 		rc = em_push(ctxt);
 		break;
 	}
 	case 4: /* jmp abs */
-		c->eip = c->src.val;
+		ctxt->_eip = ctxt->src.val;
 		break;
 	case 5: /* jmp far */
 		rc = em_jmp_far(ctxt);
@@ -1817,68 +1771,70 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) | |||
1817 | 1771 | ||
1818 | static int em_grp9(struct x86_emulate_ctxt *ctxt) | 1772 | static int em_grp9(struct x86_emulate_ctxt *ctxt) |
1819 | { | 1773 | { |
1820 | struct decode_cache *c = &ctxt->decode; | 1774 | u64 old = ctxt->dst.orig_val64; |
1821 | u64 old = c->dst.orig_val64; | ||
1822 | 1775 | ||
1823 | if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || | 1776 | if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) || |
1824 | ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { | 1777 | ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) { |
1825 | c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); | 1778 | ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0); |
1826 | c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); | 1779 | ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32); |
1827 | ctxt->eflags &= ~EFLG_ZF; | 1780 | ctxt->eflags &= ~EFLG_ZF; |
1828 | } else { | 1781 | } else { |
1829 | c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) | | 1782 | ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) | |
1830 | (u32) c->regs[VCPU_REGS_RBX]; | 1783 | (u32) ctxt->regs[VCPU_REGS_RBX]; |
1831 | 1784 | ||
1832 | ctxt->eflags |= EFLG_ZF; | 1785 | ctxt->eflags |= EFLG_ZF; |
1833 | } | 1786 | } |
1834 | return X86EMUL_CONTINUE; | 1787 | return X86EMUL_CONTINUE; |
1835 | } | 1788 | } |
1836 | 1789 | ||
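For context, em_grp9 above is the CMPXCHG8B memory case: compare EDX:EAX with the 64-bit operand, then either store ECX:EBX into it (setting ZF) or load the old value back into EDX:EAX (clearing ZF). A minimal, non-atomic sketch of that semantics outside the emulator; the names are illustrative, not KVM's:

#include <stdbool.h>
#include <stdint.h>

/* CMPXCHG8B as em_grp9 emulates it; returns the resulting ZF.
 * Illustrative only: the real instruction is atomic when LOCKed. */
static bool cmpxchg8b(uint64_t *mem, uint32_t *eax, uint32_t *edx,
                      uint32_t ebx, uint32_t ecx)
{
        uint64_t old = *mem;

        if (*eax == (uint32_t)old && *edx == (uint32_t)(old >> 32)) {
                *mem = ((uint64_t)ecx << 32) | ebx;     /* match: ZF = 1 */
                return true;
        }
        *eax = (uint32_t)old;                           /* mismatch: ZF = 0 */
        *edx = (uint32_t)(old >> 32);
        return false;
}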
1837 | static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, | 1790 | static int em_ret(struct x86_emulate_ctxt *ctxt) |
1838 | struct x86_emulate_ops *ops) | 1791 | { |
1792 | ctxt->dst.type = OP_REG; | ||
1793 | ctxt->dst.addr.reg = &ctxt->_eip; | ||
1794 | ctxt->dst.bytes = ctxt->op_bytes; | ||
1795 | return em_pop(ctxt); | ||
1796 | } | ||
1797 | |||
1798 | static int em_ret_far(struct x86_emulate_ctxt *ctxt) | ||
1839 | { | 1799 | { |
1840 | struct decode_cache *c = &ctxt->decode; | ||
1841 | int rc; | 1800 | int rc; |
1842 | unsigned long cs; | 1801 | unsigned long cs; |
1843 | 1802 | ||
1844 | rc = emulate_pop(ctxt, &c->eip, c->op_bytes); | 1803 | rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes); |
1845 | if (rc != X86EMUL_CONTINUE) | 1804 | if (rc != X86EMUL_CONTINUE) |
1846 | return rc; | 1805 | return rc; |
1847 | if (c->op_bytes == 4) | 1806 | if (ctxt->op_bytes == 4) |
1848 | c->eip = (u32)c->eip; | 1807 | ctxt->_eip = (u32)ctxt->_eip; |
1849 | rc = emulate_pop(ctxt, &cs, c->op_bytes); | 1808 | rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); |
1850 | if (rc != X86EMUL_CONTINUE) | 1809 | if (rc != X86EMUL_CONTINUE) |
1851 | return rc; | 1810 | return rc; |
1852 | rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); | 1811 | rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); |
1853 | return rc; | 1812 | return rc; |
1854 | } | 1813 | } |
1855 | 1814 | ||
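The refactoring above also splits near and far returns into em_ret and em_ret_far: a near RET pops only the saved instruction pointer (em_ret reuses em_pop by aiming the destination operand at _eip), while a far RET pops IP first and then a CS selector that must pass through load_segment_descriptor(). A toy model of the two pop sequences, purely for illustration:

#include <stdint.h>

/* Toy little-endian stack; sp indexes 32-bit slots. Shows only the pop
 * order that em_ret and em_ret_far rely on. */
struct toy_cpu {
        uint32_t stack[16];
        unsigned sp;
        uint32_t eip;
        uint16_t cs;
};

static uint32_t toy_pop(struct toy_cpu *c)
{
        return c->stack[c->sp++];
}

static void toy_ret_near(struct toy_cpu *c)
{
        c->eip = toy_pop(c);            /* RET: pop return IP only */
}

static void toy_ret_far(struct toy_cpu *c)
{
        c->eip = toy_pop(c);            /* RETF: IP first ... */
        c->cs  = (uint16_t)toy_pop(c);  /* ... then CS, which the emulator
                                           validates via
                                           load_segment_descriptor() */
}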
1856 | static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, | 1815 | static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg) |
1857 | struct x86_emulate_ops *ops, int seg) | ||
1858 | { | 1816 | { |
1859 | struct decode_cache *c = &ctxt->decode; | ||
1860 | unsigned short sel; | 1817 | unsigned short sel; |
1861 | int rc; | 1818 | int rc; |
1862 | 1819 | ||
1863 | memcpy(&sel, c->src.valptr + c->op_bytes, 2); | 1820 | memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); |
1864 | 1821 | ||
1865 | rc = load_segment_descriptor(ctxt, ops, sel, seg); | 1822 | rc = load_segment_descriptor(ctxt, sel, seg); |
1866 | if (rc != X86EMUL_CONTINUE) | 1823 | if (rc != X86EMUL_CONTINUE) |
1867 | return rc; | 1824 | return rc; |
1868 | 1825 | ||
1869 | c->dst.val = c->src.val; | 1826 | ctxt->dst.val = ctxt->src.val; |
1870 | return rc; | 1827 | return rc; |
1871 | } | 1828 | } |
1872 | 1829 | ||
1873 | static inline void | 1830 | static void |
1874 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, | 1831 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, |
1875 | struct x86_emulate_ops *ops, struct desc_struct *cs, | 1832 | struct desc_struct *cs, struct desc_struct *ss) |
1876 | struct desc_struct *ss) | ||
1877 | { | 1833 | { |
1878 | u16 selector; | 1834 | u16 selector; |
1879 | 1835 | ||
1880 | memset(cs, 0, sizeof(struct desc_struct)); | 1836 | memset(cs, 0, sizeof(struct desc_struct)); |
1881 | ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); | 1837 | ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); |
1882 | memset(ss, 0, sizeof(struct desc_struct)); | 1838 | memset(ss, 0, sizeof(struct desc_struct)); |
1883 | 1839 | ||
1884 | cs->l = 0; /* will be adjusted later */ | 1840 | cs->l = 0; /* will be adjusted later */ |
@@ -1901,10 +1857,9 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, | |||
1901 | ss->p = 1; | 1857 | ss->p = 1; |
1902 | } | 1858 | } |
1903 | 1859 | ||
1904 | static int | 1860 | static int em_syscall(struct x86_emulate_ctxt *ctxt) |
1905 | emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
1906 | { | 1861 | { |
1907 | struct decode_cache *c = &ctxt->decode; | 1862 | struct x86_emulate_ops *ops = ctxt->ops; |
1908 | struct desc_struct cs, ss; | 1863 | struct desc_struct cs, ss; |
1909 | u64 msr_data; | 1864 | u64 msr_data; |
1910 | u16 cs_sel, ss_sel; | 1865 | u16 cs_sel, ss_sel; |
@@ -1916,7 +1871,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1916 | return emulate_ud(ctxt); | 1871 | return emulate_ud(ctxt); |
1917 | 1872 | ||
1918 | ops->get_msr(ctxt, MSR_EFER, &efer); | 1873 | ops->get_msr(ctxt, MSR_EFER, &efer); |
1919 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1874 | setup_syscalls_segments(ctxt, &cs, &ss); |
1920 | 1875 | ||
1921 | ops->get_msr(ctxt, MSR_STAR, &msr_data); | 1876 | ops->get_msr(ctxt, MSR_STAR, &msr_data); |
1922 | msr_data >>= 32; | 1877 | msr_data >>= 32; |
@@ -1930,15 +1885,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1930 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); | 1885 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); |
1931 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); | 1886 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); |
1932 | 1887 | ||
1933 | c->regs[VCPU_REGS_RCX] = c->eip; | 1888 | ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip; |
1934 | if (efer & EFER_LMA) { | 1889 | if (efer & EFER_LMA) { |
1935 | #ifdef CONFIG_X86_64 | 1890 | #ifdef CONFIG_X86_64 |
1936 | c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; | 1891 | ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; |
1937 | 1892 | ||
1938 | ops->get_msr(ctxt, | 1893 | ops->get_msr(ctxt, |
1939 | ctxt->mode == X86EMUL_MODE_PROT64 ? | 1894 | ctxt->mode == X86EMUL_MODE_PROT64 ? |
1940 | MSR_LSTAR : MSR_CSTAR, &msr_data); | 1895 | MSR_LSTAR : MSR_CSTAR, &msr_data); |
1941 | c->eip = msr_data; | 1896 | ctxt->_eip = msr_data; |
1942 | 1897 | ||
1943 | ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); | 1898 | ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); |
1944 | ctxt->eflags &= ~(msr_data | EFLG_RF); | 1899 | ctxt->eflags &= ~(msr_data | EFLG_RF); |
@@ -1946,7 +1901,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1946 | } else { | 1901 | } else { |
1947 | /* legacy mode */ | 1902 | /* legacy mode */ |
1948 | ops->get_msr(ctxt, MSR_STAR, &msr_data); | 1903 | ops->get_msr(ctxt, MSR_STAR, &msr_data); |
1949 | c->eip = (u32)msr_data; | 1904 | ctxt->_eip = (u32)msr_data; |
1950 | 1905 | ||
1951 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); | 1906 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); |
1952 | } | 1907 | } |
@@ -1954,16 +1909,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1954 | return X86EMUL_CONTINUE; | 1909 | return X86EMUL_CONTINUE; |
1955 | } | 1910 | } |
1956 | 1911 | ||
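Architecturally, em_syscall takes the new selectors from MSR_STAR: after the msr_data >>= 32 above, the low word holds the kernel CS selector, and SS is implied at CS + 8. A condensed sketch of that selector math under the usual SDM/APM layout; the helper name is made up:

#include <stdint.h>

/* Selector layout of IA32_STAR as consumed by SYSCALL (bits 47:32). */
static void star_selectors(uint64_t star, uint16_t *cs_sel, uint16_t *ss_sel)
{
        uint16_t base = (uint16_t)(star >> 32);

        *cs_sel = base & ~0x3;          /* RPL forced to 0 */
        *ss_sel = (base & ~0x3) + 8;    /* SS is the next descriptor */
}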
1957 | static int | 1912 | static int em_sysenter(struct x86_emulate_ctxt *ctxt) |
1958 | emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
1959 | { | 1913 | { |
1960 | struct decode_cache *c = &ctxt->decode; | 1914 | struct x86_emulate_ops *ops = ctxt->ops; |
1961 | struct desc_struct cs, ss; | 1915 | struct desc_struct cs, ss; |
1962 | u64 msr_data; | 1916 | u64 msr_data; |
1963 | u16 cs_sel, ss_sel; | 1917 | u16 cs_sel, ss_sel; |
1964 | u64 efer = 0; | 1918 | u64 efer = 0; |
1965 | 1919 | ||
1966 | ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); | 1920 | ops->get_msr(ctxt, MSR_EFER, &efer); |
1967 | /* inject #GP if in real mode */ | 1921 | /* inject #GP if in real mode */ |
1968 | if (ctxt->mode == X86EMUL_MODE_REAL) | 1922 | if (ctxt->mode == X86EMUL_MODE_REAL) |
1969 | return emulate_gp(ctxt, 0); | 1923 | return emulate_gp(ctxt, 0); |
@@ -1974,7 +1928,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1974 | if (ctxt->mode == X86EMUL_MODE_PROT64) | 1928 | if (ctxt->mode == X86EMUL_MODE_PROT64) |
1975 | return emulate_ud(ctxt); | 1929 | return emulate_ud(ctxt); |
1976 | 1930 | ||
1977 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1931 | setup_syscalls_segments(ctxt, &cs, &ss); |
1978 | 1932 | ||
1979 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); | 1933 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); |
1980 | switch (ctxt->mode) { | 1934 | switch (ctxt->mode) { |
@@ -2002,31 +1956,30 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2002 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); | 1956 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); |
2003 | 1957 | ||
2004 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); | 1958 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); |
2005 | c->eip = msr_data; | 1959 | ctxt->_eip = msr_data; |
2006 | 1960 | ||
2007 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); | 1961 | ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); |
2008 | c->regs[VCPU_REGS_RSP] = msr_data; | 1962 | ctxt->regs[VCPU_REGS_RSP] = msr_data; |
2009 | 1963 | ||
2010 | return X86EMUL_CONTINUE; | 1964 | return X86EMUL_CONTINUE; |
2011 | } | 1965 | } |
2012 | 1966 | ||
2013 | static int | 1967 | static int em_sysexit(struct x86_emulate_ctxt *ctxt) |
2014 | emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
2015 | { | 1968 | { |
2016 | struct decode_cache *c = &ctxt->decode; | 1969 | struct x86_emulate_ops *ops = ctxt->ops; |
2017 | struct desc_struct cs, ss; | 1970 | struct desc_struct cs, ss; |
2018 | u64 msr_data; | 1971 | u64 msr_data; |
2019 | int usermode; | 1972 | int usermode; |
2020 | u16 cs_sel, ss_sel; | 1973 | u16 cs_sel = 0, ss_sel = 0; |
2021 | 1974 | ||
2022 | /* inject #GP if in real mode or Virtual 8086 mode */ | 1975 | /* inject #GP if in real mode or Virtual 8086 mode */ |
2023 | if (ctxt->mode == X86EMUL_MODE_REAL || | 1976 | if (ctxt->mode == X86EMUL_MODE_REAL || |
2024 | ctxt->mode == X86EMUL_MODE_VM86) | 1977 | ctxt->mode == X86EMUL_MODE_VM86) |
2025 | return emulate_gp(ctxt, 0); | 1978 | return emulate_gp(ctxt, 0); |
2026 | 1979 | ||
2027 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1980 | setup_syscalls_segments(ctxt, &cs, &ss); |
2028 | 1981 | ||
2029 | if ((c->rex_prefix & 0x8) != 0x0) | 1982 | if ((ctxt->rex_prefix & 0x8) != 0x0) |
2030 | usermode = X86EMUL_MODE_PROT64; | 1983 | usermode = X86EMUL_MODE_PROT64; |
2031 | else | 1984 | else |
2032 | usermode = X86EMUL_MODE_PROT32; | 1985 | usermode = X86EMUL_MODE_PROT32; |
@@ -2056,14 +2009,13 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2056 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); | 2009 | ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); |
2057 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); | 2010 | ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); |
2058 | 2011 | ||
2059 | c->eip = c->regs[VCPU_REGS_RDX]; | 2012 | ctxt->_eip = ctxt->regs[VCPU_REGS_RDX]; |
2060 | c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; | 2013 | ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX]; |
2061 | 2014 | ||
2062 | return X86EMUL_CONTINUE; | 2015 | return X86EMUL_CONTINUE; |
2063 | } | 2016 | } |
2064 | 2017 | ||
2065 | static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, | 2018 | static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) |
2066 | struct x86_emulate_ops *ops) | ||
2067 | { | 2019 | { |
2068 | int iopl; | 2020 | int iopl; |
2069 | if (ctxt->mode == X86EMUL_MODE_REAL) | 2021 | if (ctxt->mode == X86EMUL_MODE_REAL) |
@@ -2071,13 +2023,13 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, | |||
2071 | if (ctxt->mode == X86EMUL_MODE_VM86) | 2023 | if (ctxt->mode == X86EMUL_MODE_VM86) |
2072 | return true; | 2024 | return true; |
2073 | iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | 2025 | iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; |
2074 | return ops->cpl(ctxt) > iopl; | 2026 | return ctxt->ops->cpl(ctxt) > iopl; |
2075 | } | 2027 | } |
2076 | 2028 | ||
2077 | static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, | 2029 | static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, |
2078 | struct x86_emulate_ops *ops, | ||
2079 | u16 port, u16 len) | 2030 | u16 port, u16 len) |
2080 | { | 2031 | { |
2032 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2081 | struct desc_struct tr_seg; | 2033 | struct desc_struct tr_seg; |
2082 | u32 base3; | 2034 | u32 base3; |
2083 | int r; | 2035 | int r; |
@@ -2108,14 +2060,13 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, | |||
2108 | } | 2060 | } |
2109 | 2061 | ||
2110 | static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, | 2062 | static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, |
2111 | struct x86_emulate_ops *ops, | ||
2112 | u16 port, u16 len) | 2063 | u16 port, u16 len) |
2113 | { | 2064 | { |
2114 | if (ctxt->perm_ok) | 2065 | if (ctxt->perm_ok) |
2115 | return true; | 2066 | return true; |
2116 | 2067 | ||
2117 | if (emulator_bad_iopl(ctxt, ops)) | 2068 | if (emulator_bad_iopl(ctxt)) |
2118 | if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) | 2069 | if (!emulator_io_port_access_allowed(ctxt, port, len)) |
2119 | return false; | 2070 | return false; |
2120 | 2071 | ||
2121 | ctxt->perm_ok = true; | 2072 | ctxt->perm_ok = true; |
@@ -2124,21 +2075,18 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, | |||
2124 | } | 2075 | } |
2125 | 2076 | ||
2126 | static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, | 2077 | static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, |
2127 | struct x86_emulate_ops *ops, | ||
2128 | struct tss_segment_16 *tss) | 2078 | struct tss_segment_16 *tss) |
2129 | { | 2079 | { |
2130 | struct decode_cache *c = &ctxt->decode; | 2080 | tss->ip = ctxt->_eip; |
2131 | |||
2132 | tss->ip = c->eip; | ||
2133 | tss->flag = ctxt->eflags; | 2081 | tss->flag = ctxt->eflags; |
2134 | tss->ax = c->regs[VCPU_REGS_RAX]; | 2082 | tss->ax = ctxt->regs[VCPU_REGS_RAX]; |
2135 | tss->cx = c->regs[VCPU_REGS_RCX]; | 2083 | tss->cx = ctxt->regs[VCPU_REGS_RCX]; |
2136 | tss->dx = c->regs[VCPU_REGS_RDX]; | 2084 | tss->dx = ctxt->regs[VCPU_REGS_RDX]; |
2137 | tss->bx = c->regs[VCPU_REGS_RBX]; | 2085 | tss->bx = ctxt->regs[VCPU_REGS_RBX]; |
2138 | tss->sp = c->regs[VCPU_REGS_RSP]; | 2086 | tss->sp = ctxt->regs[VCPU_REGS_RSP]; |
2139 | tss->bp = c->regs[VCPU_REGS_RBP]; | 2087 | tss->bp = ctxt->regs[VCPU_REGS_RBP]; |
2140 | tss->si = c->regs[VCPU_REGS_RSI]; | 2088 | tss->si = ctxt->regs[VCPU_REGS_RSI]; |
2141 | tss->di = c->regs[VCPU_REGS_RDI]; | 2089 | tss->di = ctxt->regs[VCPU_REGS_RDI]; |
2142 | 2090 | ||
2143 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); | 2091 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); |
2144 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); | 2092 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); |
@@ -2148,22 +2096,20 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, | |||
2148 | } | 2096 | } |
2149 | 2097 | ||
2150 | static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, | 2098 | static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, |
2151 | struct x86_emulate_ops *ops, | ||
2152 | struct tss_segment_16 *tss) | 2099 | struct tss_segment_16 *tss) |
2153 | { | 2100 | { |
2154 | struct decode_cache *c = &ctxt->decode; | ||
2155 | int ret; | 2101 | int ret; |
2156 | 2102 | ||
2157 | c->eip = tss->ip; | 2103 | ctxt->_eip = tss->ip; |
2158 | ctxt->eflags = tss->flag | 2; | 2104 | ctxt->eflags = tss->flag | 2; |
2159 | c->regs[VCPU_REGS_RAX] = tss->ax; | 2105 | ctxt->regs[VCPU_REGS_RAX] = tss->ax; |
2160 | c->regs[VCPU_REGS_RCX] = tss->cx; | 2106 | ctxt->regs[VCPU_REGS_RCX] = tss->cx; |
2161 | c->regs[VCPU_REGS_RDX] = tss->dx; | 2107 | ctxt->regs[VCPU_REGS_RDX] = tss->dx; |
2162 | c->regs[VCPU_REGS_RBX] = tss->bx; | 2108 | ctxt->regs[VCPU_REGS_RBX] = tss->bx; |
2163 | c->regs[VCPU_REGS_RSP] = tss->sp; | 2109 | ctxt->regs[VCPU_REGS_RSP] = tss->sp; |
2164 | c->regs[VCPU_REGS_RBP] = tss->bp; | 2110 | ctxt->regs[VCPU_REGS_RBP] = tss->bp; |
2165 | c->regs[VCPU_REGS_RSI] = tss->si; | 2111 | ctxt->regs[VCPU_REGS_RSI] = tss->si; |
2166 | c->regs[VCPU_REGS_RDI] = tss->di; | 2112 | ctxt->regs[VCPU_REGS_RDI] = tss->di; |
2167 | 2113 | ||
2168 | /* | 2114 | /* |
2169 | * SDM says that segment selectors are loaded before segment | 2115 | * SDM says that segment selectors are loaded before segment |
@@ -2179,19 +2125,19 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, | |||
2179 | * Now load segment descriptors. If a fault happens at this stage | 2125 | * Now load segment descriptors. If a fault happens at this stage |
2180 | * it is handled in the context of the new task | 2126 | * it is handled in the context of the new task |
2181 | */ | 2127 | */ |
2182 | ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR); | 2128 | ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); |
2183 | if (ret != X86EMUL_CONTINUE) | 2129 | if (ret != X86EMUL_CONTINUE) |
2184 | return ret; | 2130 | return ret; |
2185 | ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); | 2131 | ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); |
2186 | if (ret != X86EMUL_CONTINUE) | 2132 | if (ret != X86EMUL_CONTINUE) |
2187 | return ret; | 2133 | return ret; |
2188 | ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); | 2134 | ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); |
2189 | if (ret != X86EMUL_CONTINUE) | 2135 | if (ret != X86EMUL_CONTINUE) |
2190 | return ret; | 2136 | return ret; |
2191 | ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); | 2137 | ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); |
2192 | if (ret != X86EMUL_CONTINUE) | 2138 | if (ret != X86EMUL_CONTINUE) |
2193 | return ret; | 2139 | return ret; |
2194 | ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); | 2140 | ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); |
2195 | if (ret != X86EMUL_CONTINUE) | 2141 | if (ret != X86EMUL_CONTINUE) |
2196 | return ret; | 2142 | return ret; |
2197 | 2143 | ||
@@ -2199,10 +2145,10 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, | |||
2199 | } | 2145 | } |
2200 | 2146 | ||
2201 | static int task_switch_16(struct x86_emulate_ctxt *ctxt, | 2147 | static int task_switch_16(struct x86_emulate_ctxt *ctxt, |
2202 | struct x86_emulate_ops *ops, | ||
2203 | u16 tss_selector, u16 old_tss_sel, | 2148 | u16 tss_selector, u16 old_tss_sel, |
2204 | ulong old_tss_base, struct desc_struct *new_desc) | 2149 | ulong old_tss_base, struct desc_struct *new_desc) |
2205 | { | 2150 | { |
2151 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2206 | struct tss_segment_16 tss_seg; | 2152 | struct tss_segment_16 tss_seg; |
2207 | int ret; | 2153 | int ret; |
2208 | u32 new_tss_base = get_desc_base(new_desc); | 2154 | u32 new_tss_base = get_desc_base(new_desc); |
@@ -2213,7 +2159,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2213 | /* FIXME: need to provide precise fault address */ | 2159 | /* FIXME: need to provide precise fault address */ |
2214 | return ret; | 2160 | return ret; |
2215 | 2161 | ||
2216 | save_state_to_tss16(ctxt, ops, &tss_seg); | 2162 | save_state_to_tss16(ctxt, &tss_seg); |
2217 | 2163 | ||
2218 | ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, | 2164 | ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, |
2219 | &ctxt->exception); | 2165 | &ctxt->exception); |
@@ -2239,26 +2185,23 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2239 | return ret; | 2185 | return ret; |
2240 | } | 2186 | } |
2241 | 2187 | ||
2242 | return load_state_from_tss16(ctxt, ops, &tss_seg); | 2188 | return load_state_from_tss16(ctxt, &tss_seg); |
2243 | } | 2189 | } |
2244 | 2190 | ||
2245 | static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, | 2191 | static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, |
2246 | struct x86_emulate_ops *ops, | ||
2247 | struct tss_segment_32 *tss) | 2192 | struct tss_segment_32 *tss) |
2248 | { | 2193 | { |
2249 | struct decode_cache *c = &ctxt->decode; | 2194 | tss->cr3 = ctxt->ops->get_cr(ctxt, 3); |
2250 | 2195 | tss->eip = ctxt->_eip; | |
2251 | tss->cr3 = ops->get_cr(ctxt, 3); | ||
2252 | tss->eip = c->eip; | ||
2253 | tss->eflags = ctxt->eflags; | 2196 | tss->eflags = ctxt->eflags; |
2254 | tss->eax = c->regs[VCPU_REGS_RAX]; | 2197 | tss->eax = ctxt->regs[VCPU_REGS_RAX]; |
2255 | tss->ecx = c->regs[VCPU_REGS_RCX]; | 2198 | tss->ecx = ctxt->regs[VCPU_REGS_RCX]; |
2256 | tss->edx = c->regs[VCPU_REGS_RDX]; | 2199 | tss->edx = ctxt->regs[VCPU_REGS_RDX]; |
2257 | tss->ebx = c->regs[VCPU_REGS_RBX]; | 2200 | tss->ebx = ctxt->regs[VCPU_REGS_RBX]; |
2258 | tss->esp = c->regs[VCPU_REGS_RSP]; | 2201 | tss->esp = ctxt->regs[VCPU_REGS_RSP]; |
2259 | tss->ebp = c->regs[VCPU_REGS_RBP]; | 2202 | tss->ebp = ctxt->regs[VCPU_REGS_RBP]; |
2260 | tss->esi = c->regs[VCPU_REGS_RSI]; | 2203 | tss->esi = ctxt->regs[VCPU_REGS_RSI]; |
2261 | tss->edi = c->regs[VCPU_REGS_RDI]; | 2204 | tss->edi = ctxt->regs[VCPU_REGS_RDI]; |
2262 | 2205 | ||
2263 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); | 2206 | tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); |
2264 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); | 2207 | tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); |
@@ -2270,24 +2213,22 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, | |||
2270 | } | 2213 | } |
2271 | 2214 | ||
2272 | static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | 2215 | static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, |
2273 | struct x86_emulate_ops *ops, | ||
2274 | struct tss_segment_32 *tss) | 2216 | struct tss_segment_32 *tss) |
2275 | { | 2217 | { |
2276 | struct decode_cache *c = &ctxt->decode; | ||
2277 | int ret; | 2218 | int ret; |
2278 | 2219 | ||
2279 | if (ops->set_cr(ctxt, 3, tss->cr3)) | 2220 | if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) |
2280 | return emulate_gp(ctxt, 0); | 2221 | return emulate_gp(ctxt, 0); |
2281 | c->eip = tss->eip; | 2222 | ctxt->_eip = tss->eip; |
2282 | ctxt->eflags = tss->eflags | 2; | 2223 | ctxt->eflags = tss->eflags | 2; |
2283 | c->regs[VCPU_REGS_RAX] = tss->eax; | 2224 | ctxt->regs[VCPU_REGS_RAX] = tss->eax; |
2284 | c->regs[VCPU_REGS_RCX] = tss->ecx; | 2225 | ctxt->regs[VCPU_REGS_RCX] = tss->ecx; |
2285 | c->regs[VCPU_REGS_RDX] = tss->edx; | 2226 | ctxt->regs[VCPU_REGS_RDX] = tss->edx; |
2286 | c->regs[VCPU_REGS_RBX] = tss->ebx; | 2227 | ctxt->regs[VCPU_REGS_RBX] = tss->ebx; |
2287 | c->regs[VCPU_REGS_RSP] = tss->esp; | 2228 | ctxt->regs[VCPU_REGS_RSP] = tss->esp; |
2288 | c->regs[VCPU_REGS_RBP] = tss->ebp; | 2229 | ctxt->regs[VCPU_REGS_RBP] = tss->ebp; |
2289 | c->regs[VCPU_REGS_RSI] = tss->esi; | 2230 | ctxt->regs[VCPU_REGS_RSI] = tss->esi; |
2290 | c->regs[VCPU_REGS_RDI] = tss->edi; | 2231 | ctxt->regs[VCPU_REGS_RDI] = tss->edi; |
2291 | 2232 | ||
2292 | /* | 2233 | /* |
2293 | * SDM says that segment selectors are loaded before segment | 2234 | * SDM says that segment selectors are loaded before segment |
@@ -2305,25 +2246,25 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | |||
2305 | * Now load segment descriptors. If a fault happens at this stage | 2246 | * Now load segment descriptors. If a fault happens at this stage |
2306 | * it is handled in the context of the new task | 2247 | * it is handled in the context of the new task |
2307 | */ | 2248 | */ |
2308 | ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR); | 2249 | ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); |
2309 | if (ret != X86EMUL_CONTINUE) | 2250 | if (ret != X86EMUL_CONTINUE) |
2310 | return ret; | 2251 | return ret; |
2311 | ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); | 2252 | ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); |
2312 | if (ret != X86EMUL_CONTINUE) | 2253 | if (ret != X86EMUL_CONTINUE) |
2313 | return ret; | 2254 | return ret; |
2314 | ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); | 2255 | ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); |
2315 | if (ret != X86EMUL_CONTINUE) | 2256 | if (ret != X86EMUL_CONTINUE) |
2316 | return ret; | 2257 | return ret; |
2317 | ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); | 2258 | ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); |
2318 | if (ret != X86EMUL_CONTINUE) | 2259 | if (ret != X86EMUL_CONTINUE) |
2319 | return ret; | 2260 | return ret; |
2320 | ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); | 2261 | ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); |
2321 | if (ret != X86EMUL_CONTINUE) | 2262 | if (ret != X86EMUL_CONTINUE) |
2322 | return ret; | 2263 | return ret; |
2323 | ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS); | 2264 | ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS); |
2324 | if (ret != X86EMUL_CONTINUE) | 2265 | if (ret != X86EMUL_CONTINUE) |
2325 | return ret; | 2266 | return ret; |
2326 | ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS); | 2267 | ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS); |
2327 | if (ret != X86EMUL_CONTINUE) | 2268 | if (ret != X86EMUL_CONTINUE) |
2328 | return ret; | 2269 | return ret; |
2329 | 2270 | ||
@@ -2331,10 +2272,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | |||
2331 | } | 2272 | } |
2332 | 2273 | ||
2333 | static int task_switch_32(struct x86_emulate_ctxt *ctxt, | 2274 | static int task_switch_32(struct x86_emulate_ctxt *ctxt, |
2334 | struct x86_emulate_ops *ops, | ||
2335 | u16 tss_selector, u16 old_tss_sel, | 2275 | u16 tss_selector, u16 old_tss_sel, |
2336 | ulong old_tss_base, struct desc_struct *new_desc) | 2276 | ulong old_tss_base, struct desc_struct *new_desc) |
2337 | { | 2277 | { |
2278 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2338 | struct tss_segment_32 tss_seg; | 2279 | struct tss_segment_32 tss_seg; |
2339 | int ret; | 2280 | int ret; |
2340 | u32 new_tss_base = get_desc_base(new_desc); | 2281 | u32 new_tss_base = get_desc_base(new_desc); |
@@ -2345,7 +2286,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2345 | /* FIXME: need to provide precise fault address */ | 2286 | /* FIXME: need to provide precise fault address */ |
2346 | return ret; | 2287 | return ret; |
2347 | 2288 | ||
2348 | save_state_to_tss32(ctxt, ops, &tss_seg); | 2289 | save_state_to_tss32(ctxt, &tss_seg); |
2349 | 2290 | ||
2350 | ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, | 2291 | ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, |
2351 | &ctxt->exception); | 2292 | &ctxt->exception); |
@@ -2371,14 +2312,14 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2371 | return ret; | 2312 | return ret; |
2372 | } | 2313 | } |
2373 | 2314 | ||
2374 | return load_state_from_tss32(ctxt, ops, &tss_seg); | 2315 | return load_state_from_tss32(ctxt, &tss_seg); |
2375 | } | 2316 | } |
2376 | 2317 | ||
2377 | static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | 2318 | static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, |
2378 | struct x86_emulate_ops *ops, | ||
2379 | u16 tss_selector, int reason, | 2319 | u16 tss_selector, int reason, |
2380 | bool has_error_code, u32 error_code) | 2320 | bool has_error_code, u32 error_code) |
2381 | { | 2321 | { |
2322 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2382 | struct desc_struct curr_tss_desc, next_tss_desc; | 2323 | struct desc_struct curr_tss_desc, next_tss_desc; |
2383 | int ret; | 2324 | int ret; |
2384 | u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); | 2325 | u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); |
@@ -2388,10 +2329,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2388 | 2329 | ||
2389 | /* FIXME: old_tss_base == ~0 ? */ | 2330 | /* FIXME: old_tss_base == ~0 ? */ |
2390 | 2331 | ||
2391 | ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc); | 2332 | ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); |
2392 | if (ret != X86EMUL_CONTINUE) | 2333 | if (ret != X86EMUL_CONTINUE) |
2393 | return ret; | 2334 | return ret; |
2394 | ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc); | 2335 | ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); |
2395 | if (ret != X86EMUL_CONTINUE) | 2336 | if (ret != X86EMUL_CONTINUE) |
2396 | return ret; | 2337 | return ret; |
2397 | 2338 | ||
@@ -2413,8 +2354,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2413 | 2354 | ||
2414 | if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { | 2355 | if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { |
2415 | curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ | 2356 | curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ |
2416 | write_segment_descriptor(ctxt, ops, old_tss_sel, | 2357 | write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); |
2417 | &curr_tss_desc); | ||
2418 | } | 2358 | } |
2419 | 2359 | ||
2420 | if (reason == TASK_SWITCH_IRET) | 2360 | if (reason == TASK_SWITCH_IRET) |
@@ -2426,10 +2366,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2426 | old_tss_sel = 0xffff; | 2366 | old_tss_sel = 0xffff; |
2427 | 2367 | ||
2428 | if (next_tss_desc.type & 8) | 2368 | if (next_tss_desc.type & 8) |
2429 | ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel, | 2369 | ret = task_switch_32(ctxt, tss_selector, old_tss_sel, |
2430 | old_tss_base, &next_tss_desc); | 2370 | old_tss_base, &next_tss_desc); |
2431 | else | 2371 | else |
2432 | ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel, | 2372 | ret = task_switch_16(ctxt, tss_selector, old_tss_sel, |
2433 | old_tss_base, &next_tss_desc); | 2373 | old_tss_base, &next_tss_desc); |
2434 | if (ret != X86EMUL_CONTINUE) | 2374 | if (ret != X86EMUL_CONTINUE) |
2435 | return ret; | 2375 | return ret; |
@@ -2439,19 +2379,16 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2439 | 2379 | ||
2440 | if (reason != TASK_SWITCH_IRET) { | 2380 | if (reason != TASK_SWITCH_IRET) { |
2441 | next_tss_desc.type |= (1 << 1); /* set busy flag */ | 2381 | next_tss_desc.type |= (1 << 1); /* set busy flag */ |
2442 | write_segment_descriptor(ctxt, ops, tss_selector, | 2382 | write_segment_descriptor(ctxt, tss_selector, &next_tss_desc); |
2443 | &next_tss_desc); | ||
2444 | } | 2383 | } |
2445 | 2384 | ||
2446 | ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); | 2385 | ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); |
2447 | ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); | 2386 | ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); |
2448 | 2387 | ||
2449 | if (has_error_code) { | 2388 | if (has_error_code) { |
2450 | struct decode_cache *c = &ctxt->decode; | 2389 | ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; |
2451 | 2390 | ctxt->lock_prefix = 0; | |
2452 | c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; | 2391 | ctxt->src.val = (unsigned long) error_code; |
2453 | c->lock_prefix = 0; | ||
2454 | c->src.val = (unsigned long) error_code; | ||
2455 | ret = em_push(ctxt); | 2392 | ret = em_push(ctxt); |
2456 | } | 2393 | } |
2457 | 2394 | ||
@@ -2462,18 +2399,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2462 | u16 tss_selector, int reason, | 2399 | u16 tss_selector, int reason, |
2463 | bool has_error_code, u32 error_code) | 2400 | bool has_error_code, u32 error_code) |
2464 | { | 2401 | { |
2465 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2466 | struct decode_cache *c = &ctxt->decode; | ||
2467 | int rc; | 2402 | int rc; |
2468 | 2403 | ||
2469 | c->eip = ctxt->eip; | 2404 | ctxt->_eip = ctxt->eip; |
2470 | c->dst.type = OP_NONE; | 2405 | ctxt->dst.type = OP_NONE; |
2471 | 2406 | ||
2472 | rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, | 2407 | rc = emulator_do_task_switch(ctxt, tss_selector, reason, |
2473 | has_error_code, error_code); | 2408 | has_error_code, error_code); |
2474 | 2409 | ||
2475 | if (rc == X86EMUL_CONTINUE) | 2410 | if (rc == X86EMUL_CONTINUE) |
2476 | ctxt->eip = c->eip; | 2411 | ctxt->eip = ctxt->_eip; |
2477 | 2412 | ||
2478 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; | 2413 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; |
2479 | } | 2414 | } |
@@ -2481,22 +2416,20 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2481 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, | 2416 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, |
2482 | int reg, struct operand *op) | 2417 | int reg, struct operand *op) |
2483 | { | 2418 | { |
2484 | struct decode_cache *c = &ctxt->decode; | ||
2485 | int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; | 2419 | int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; |
2486 | 2420 | ||
2487 | register_address_increment(c, &c->regs[reg], df * op->bytes); | 2421 | register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes); |
2488 | op->addr.mem.ea = register_address(c, c->regs[reg]); | 2422 | op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]); |
2489 | op->addr.mem.seg = seg; | 2423 | op->addr.mem.seg = seg; |
2490 | } | 2424 | } |
2491 | 2425 | ||
2492 | static int em_das(struct x86_emulate_ctxt *ctxt) | 2426 | static int em_das(struct x86_emulate_ctxt *ctxt) |
2493 | { | 2427 | { |
2494 | struct decode_cache *c = &ctxt->decode; | ||
2495 | u8 al, old_al; | 2428 | u8 al, old_al; |
2496 | bool af, cf, old_cf; | 2429 | bool af, cf, old_cf; |
2497 | 2430 | ||
2498 | cf = ctxt->eflags & X86_EFLAGS_CF; | 2431 | cf = ctxt->eflags & X86_EFLAGS_CF; |
2499 | al = c->dst.val; | 2432 | al = ctxt->dst.val; |
2500 | 2433 | ||
2501 | old_al = al; | 2434 | old_al = al; |
2502 | old_cf = cf; | 2435 | old_cf = cf; |
@@ -2514,12 +2447,12 @@ static int em_das(struct x86_emulate_ctxt *ctxt) | |||
2514 | cf = true; | 2447 | cf = true; |
2515 | } | 2448 | } |
2516 | 2449 | ||
2517 | c->dst.val = al; | 2450 | ctxt->dst.val = al; |
2518 | /* Set PF, ZF, SF */ | 2451 | /* Set PF, ZF, SF */ |
2519 | c->src.type = OP_IMM; | 2452 | ctxt->src.type = OP_IMM; |
2520 | c->src.val = 0; | 2453 | ctxt->src.val = 0; |
2521 | c->src.bytes = 1; | 2454 | ctxt->src.bytes = 1; |
2522 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); | 2455 | emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); |
2523 | ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); | 2456 | ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); |
2524 | if (cf) | 2457 | if (cf) |
2525 | ctxt->eflags |= X86_EFLAGS_CF; | 2458 | ctxt->eflags |= X86_EFLAGS_CF; |
@@ -2530,175 +2463,189 @@ static int em_das(struct x86_emulate_ctxt *ctxt) | |||
2530 | 2463 | ||
2531 | static int em_call_far(struct x86_emulate_ctxt *ctxt) | 2464 | static int em_call_far(struct x86_emulate_ctxt *ctxt) |
2532 | { | 2465 | { |
2533 | struct decode_cache *c = &ctxt->decode; | ||
2534 | u16 sel, old_cs; | 2466 | u16 sel, old_cs; |
2535 | ulong old_eip; | 2467 | ulong old_eip; |
2536 | int rc; | 2468 | int rc; |
2537 | 2469 | ||
2538 | old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); | 2470 | old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); |
2539 | old_eip = c->eip; | 2471 | old_eip = ctxt->_eip; |
2540 | 2472 | ||
2541 | memcpy(&sel, c->src.valptr + c->op_bytes, 2); | 2473 | memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); |
2542 | if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) | 2474 | if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS)) |
2543 | return X86EMUL_CONTINUE; | 2475 | return X86EMUL_CONTINUE; |
2544 | 2476 | ||
2545 | c->eip = 0; | 2477 | ctxt->_eip = 0; |
2546 | memcpy(&c->eip, c->src.valptr, c->op_bytes); | 2478 | memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); |
2547 | 2479 | ||
2548 | c->src.val = old_cs; | 2480 | ctxt->src.val = old_cs; |
2549 | rc = em_push(ctxt); | 2481 | rc = em_push(ctxt); |
2550 | if (rc != X86EMUL_CONTINUE) | 2482 | if (rc != X86EMUL_CONTINUE) |
2551 | return rc; | 2483 | return rc; |
2552 | 2484 | ||
2553 | c->src.val = old_eip; | 2485 | ctxt->src.val = old_eip; |
2554 | return em_push(ctxt); | 2486 | return em_push(ctxt); |
2555 | } | 2487 | } |
2556 | 2488 | ||
2557 | static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) | 2489 | static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) |
2558 | { | 2490 | { |
2559 | struct decode_cache *c = &ctxt->decode; | ||
2560 | int rc; | 2491 | int rc; |
2561 | 2492 | ||
2562 | c->dst.type = OP_REG; | 2493 | ctxt->dst.type = OP_REG; |
2563 | c->dst.addr.reg = &c->eip; | 2494 | ctxt->dst.addr.reg = &ctxt->_eip; |
2564 | c->dst.bytes = c->op_bytes; | 2495 | ctxt->dst.bytes = ctxt->op_bytes; |
2565 | rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes); | 2496 | rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); |
2566 | if (rc != X86EMUL_CONTINUE) | 2497 | if (rc != X86EMUL_CONTINUE) |
2567 | return rc; | 2498 | return rc; |
2568 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); | 2499 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val); |
2569 | return X86EMUL_CONTINUE; | 2500 | return X86EMUL_CONTINUE; |
2570 | } | 2501 | } |
2571 | 2502 | ||
2572 | static int em_add(struct x86_emulate_ctxt *ctxt) | 2503 | static int em_add(struct x86_emulate_ctxt *ctxt) |
2573 | { | 2504 | { |
2574 | struct decode_cache *c = &ctxt->decode; | 2505 | emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); |
2575 | |||
2576 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); | ||
2577 | return X86EMUL_CONTINUE; | 2506 | return X86EMUL_CONTINUE; |
2578 | } | 2507 | } |
2579 | 2508 | ||
2580 | static int em_or(struct x86_emulate_ctxt *ctxt) | 2509 | static int em_or(struct x86_emulate_ctxt *ctxt) |
2581 | { | 2510 | { |
2582 | struct decode_cache *c = &ctxt->decode; | 2511 | emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); |
2583 | |||
2584 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); | ||
2585 | return X86EMUL_CONTINUE; | 2512 | return X86EMUL_CONTINUE; |
2586 | } | 2513 | } |
2587 | 2514 | ||
2588 | static int em_adc(struct x86_emulate_ctxt *ctxt) | 2515 | static int em_adc(struct x86_emulate_ctxt *ctxt) |
2589 | { | 2516 | { |
2590 | struct decode_cache *c = &ctxt->decode; | 2517 | emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags); |
2591 | |||
2592 | emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); | ||
2593 | return X86EMUL_CONTINUE; | 2518 | return X86EMUL_CONTINUE; |
2594 | } | 2519 | } |
2595 | 2520 | ||
2596 | static int em_sbb(struct x86_emulate_ctxt *ctxt) | 2521 | static int em_sbb(struct x86_emulate_ctxt *ctxt) |
2597 | { | 2522 | { |
2598 | struct decode_cache *c = &ctxt->decode; | 2523 | emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags); |
2599 | |||
2600 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); | ||
2601 | return X86EMUL_CONTINUE; | 2524 | return X86EMUL_CONTINUE; |
2602 | } | 2525 | } |
2603 | 2526 | ||
2604 | static int em_and(struct x86_emulate_ctxt *ctxt) | 2527 | static int em_and(struct x86_emulate_ctxt *ctxt) |
2605 | { | 2528 | { |
2606 | struct decode_cache *c = &ctxt->decode; | 2529 | emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags); |
2607 | |||
2608 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); | ||
2609 | return X86EMUL_CONTINUE; | 2530 | return X86EMUL_CONTINUE; |
2610 | } | 2531 | } |
2611 | 2532 | ||
2612 | static int em_sub(struct x86_emulate_ctxt *ctxt) | 2533 | static int em_sub(struct x86_emulate_ctxt *ctxt) |
2613 | { | 2534 | { |
2614 | struct decode_cache *c = &ctxt->decode; | 2535 | emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags); |
2615 | |||
2616 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); | ||
2617 | return X86EMUL_CONTINUE; | 2536 | return X86EMUL_CONTINUE; |
2618 | } | 2537 | } |
2619 | 2538 | ||
2620 | static int em_xor(struct x86_emulate_ctxt *ctxt) | 2539 | static int em_xor(struct x86_emulate_ctxt *ctxt) |
2621 | { | 2540 | { |
2622 | struct decode_cache *c = &ctxt->decode; | 2541 | emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags); |
2623 | |||
2624 | emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); | ||
2625 | return X86EMUL_CONTINUE; | 2542 | return X86EMUL_CONTINUE; |
2626 | } | 2543 | } |
2627 | 2544 | ||
2628 | static int em_cmp(struct x86_emulate_ctxt *ctxt) | 2545 | static int em_cmp(struct x86_emulate_ctxt *ctxt) |
2629 | { | 2546 | { |
2630 | struct decode_cache *c = &ctxt->decode; | 2547 | emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); |
2631 | |||
2632 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
2633 | /* Disable writeback. */ | 2548 | /* Disable writeback. */ |
2634 | c->dst.type = OP_NONE; | 2549 | ctxt->dst.type = OP_NONE; |
2635 | return X86EMUL_CONTINUE; | 2550 | return X86EMUL_CONTINUE; |
2636 | } | 2551 | } |
2637 | 2552 | ||
2638 | static int em_imul(struct x86_emulate_ctxt *ctxt) | 2553 | static int em_test(struct x86_emulate_ctxt *ctxt) |
2554 | { | ||
2555 | emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); | ||
2556 | return X86EMUL_CONTINUE; | ||
2557 | } | ||
2558 | |||
2559 | static int em_xchg(struct x86_emulate_ctxt *ctxt) | ||
2639 | { | 2560 | { |
2640 | struct decode_cache *c = &ctxt->decode; | 2561 | /* Write back the register source. */ |
2562 | ctxt->src.val = ctxt->dst.val; | ||
2563 | write_register_operand(&ctxt->src); | ||
2641 | 2564 | ||
2642 | emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); | 2565 | /* Write back the memory destination with implicit LOCK prefix. */ |
2566 | ctxt->dst.val = ctxt->src.orig_val; | ||
2567 | ctxt->lock_prefix = 1; | ||
2643 | return X86EMUL_CONTINUE; | 2568 | return X86EMUL_CONTINUE; |
2644 | } | 2569 | } |
2645 | 2570 | ||
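The new em_xchg above leans on the common writeback machinery: the register half is written back immediately with write_register_operand(), while the memory half goes out through the normal writeback path with lock_prefix forced to 1, because XCHG with a memory operand is implicitly locked on x86. In plain C the same guarantee comes from an atomic exchange; a small sketch using the GCC/Clang builtin:

#include <stdint.h>

/* XCHG reg, [mem] is implicitly locked; a C equivalent needs an atomic
 * exchange to preserve that. Returns the old memory value (what lands
 * in the register) and stores 'reg' into memory, atomically. */
static uint32_t xchg_like(uint32_t *mem, uint32_t reg)
{
        return __atomic_exchange_n(mem, reg, __ATOMIC_SEQ_CST);
}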
2646 | static int em_imul_3op(struct x86_emulate_ctxt *ctxt) | 2571 | static int em_imul(struct x86_emulate_ctxt *ctxt) |
2647 | { | 2572 | { |
2648 | struct decode_cache *c = &ctxt->decode; | 2573 | emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags); |
2574 | return X86EMUL_CONTINUE; | ||
2575 | } | ||
2649 | 2576 | ||
2650 | c->dst.val = c->src2.val; | 2577 | static int em_imul_3op(struct x86_emulate_ctxt *ctxt) |
2578 | { | ||
2579 | ctxt->dst.val = ctxt->src2.val; | ||
2651 | return em_imul(ctxt); | 2580 | return em_imul(ctxt); |
2652 | } | 2581 | } |
2653 | 2582 | ||
2654 | static int em_cwd(struct x86_emulate_ctxt *ctxt) | 2583 | static int em_cwd(struct x86_emulate_ctxt *ctxt) |
2655 | { | 2584 | { |
2656 | struct decode_cache *c = &ctxt->decode; | 2585 | ctxt->dst.type = OP_REG; |
2657 | 2586 | ctxt->dst.bytes = ctxt->src.bytes; | |
2658 | c->dst.type = OP_REG; | 2587 | ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; |
2659 | c->dst.bytes = c->src.bytes; | 2588 | ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1); |
2660 | c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; | ||
2661 | c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1); | ||
2662 | 2589 | ||
2663 | return X86EMUL_CONTINUE; | 2590 | return X86EMUL_CONTINUE; |
2664 | } | 2591 | } |
2665 | 2592 | ||
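The single expression em_cwd now uses is a branchless sign extension: shifting the source right by (bytes * 8 - 1) isolates its sign bit, subtracting 1 turns that into all-ones for a non-negative value, and the final ~ yields 0 or all-ones for the RDX half. Worked through for the 2-byte CWD case (AX -> DX:AX), as an illustration:

#include <assert.h>
#include <stdint.h>

/* Branchless CWD: DX = 0xffff if AX is negative, else 0 (mirrors the
 * expression in em_cwd for the 2-byte operand size). */
static uint16_t cwd_dx(uint16_t ax)
{
        unsigned long v = ax;

        return (uint16_t)~((v >> 15) - 1);
}

int main(void)
{
        assert(cwd_dx(0x1234) == 0x0000);   /* positive -> DX = 0 */
        assert(cwd_dx(0x8000) == 0xffff);   /* negative -> DX = all ones */
        return 0;
}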
2666 | static int em_rdtsc(struct x86_emulate_ctxt *ctxt) | 2593 | static int em_rdtsc(struct x86_emulate_ctxt *ctxt) |
2667 | { | 2594 | { |
2668 | struct decode_cache *c = &ctxt->decode; | ||
2669 | u64 tsc = 0; | 2595 | u64 tsc = 0; |
2670 | 2596 | ||
2671 | ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); | 2597 | ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); |
2672 | c->regs[VCPU_REGS_RAX] = (u32)tsc; | 2598 | ctxt->regs[VCPU_REGS_RAX] = (u32)tsc; |
2673 | c->regs[VCPU_REGS_RDX] = tsc >> 32; | 2599 | ctxt->regs[VCPU_REGS_RDX] = tsc >> 32; |
2674 | return X86EMUL_CONTINUE; | 2600 | return X86EMUL_CONTINUE; |
2675 | } | 2601 | } |
2676 | 2602 | ||
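em_rdtsc splits the 64-bit counter across EDX:EAX exactly as the hardware instruction does. For comparison, a userspace analogue using the compiler's RDTSC intrinsic (x86 GCC/Clang only, illustrative):

#include <stdint.h>
#include <x86intrin.h>

/* Read the TSC and show the EDX:EAX split that em_rdtsc emulates. */
static uint64_t read_tsc_split(uint32_t *eax, uint32_t *edx)
{
        uint64_t tsc = __rdtsc();       /* intrinsic combines EDX:EAX */

        *eax = (uint32_t)tsc;           /* low 32 bits -> RAX */
        *edx = (uint32_t)(tsc >> 32);   /* high 32 bits -> RDX */
        return tsc;
}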
2677 | static int em_mov(struct x86_emulate_ctxt *ctxt) | 2603 | static int em_mov(struct x86_emulate_ctxt *ctxt) |
2678 | { | 2604 | { |
2679 | struct decode_cache *c = &ctxt->decode; | 2605 | ctxt->dst.val = ctxt->src.val; |
2680 | c->dst.val = c->src.val; | ||
2681 | return X86EMUL_CONTINUE; | 2606 | return X86EMUL_CONTINUE; |
2682 | } | 2607 | } |
2683 | 2608 | ||
2609 | static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) | ||
2610 | { | ||
2611 | if (ctxt->modrm_reg > VCPU_SREG_GS) | ||
2612 | return emulate_ud(ctxt); | ||
2613 | |||
2614 | ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg); | ||
2615 | return X86EMUL_CONTINUE; | ||
2616 | } | ||
2617 | |||
2618 | static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) | ||
2619 | { | ||
2620 | u16 sel = ctxt->src.val; | ||
2621 | |||
2622 | if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS) | ||
2623 | return emulate_ud(ctxt); | ||
2624 | |||
2625 | if (ctxt->modrm_reg == VCPU_SREG_SS) | ||
2626 | ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; | ||
2627 | |||
2628 | /* Disable writeback. */ | ||
2629 | ctxt->dst.type = OP_NONE; | ||
2630 | return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); | ||
2631 | } | ||
2632 | |||
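em_mov_sreg_rm also records the architectural MOV-SS shadow: a MOV to SS inhibits interrupts and most traps for one instruction, so a following MOV to (E)SP can complete before any event is delivered, which is what setting interruptibility to KVM_X86_SHADOW_INT_MOV_SS communicates to the injection path. A sketch of how a consumer of such a flag might gate delivery; the types and helpers here are hypothetical, not KVM's:

#include <stdbool.h>

/* Hypothetical interruptibility tracking mirroring the MOV-SS/STI
 * shadows: events are held for one instruction. */
enum int_shadow { SHADOW_NONE, SHADOW_MOV_SS, SHADOW_STI };

struct vcpu_state {
        enum int_shadow shadow;
        bool irq_pending;
};

static bool can_inject_irq(const struct vcpu_state *v)
{
        /* Both shadows block delivery until the next instruction retires. */
        return v->irq_pending && v->shadow == SHADOW_NONE;
}

static void instruction_retired(struct vcpu_state *v)
{
        v->shadow = SHADOW_NONE;        /* a shadow lasts one instruction */
}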
2684 | static int em_movdqu(struct x86_emulate_ctxt *ctxt) | 2633 | static int em_movdqu(struct x86_emulate_ctxt *ctxt) |
2685 | { | 2634 | { |
2686 | struct decode_cache *c = &ctxt->decode; | 2635 | memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes); |
2687 | memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes); | ||
2688 | return X86EMUL_CONTINUE; | 2636 | return X86EMUL_CONTINUE; |
2689 | } | 2637 | } |
2690 | 2638 | ||
2691 | static int em_invlpg(struct x86_emulate_ctxt *ctxt) | 2639 | static int em_invlpg(struct x86_emulate_ctxt *ctxt) |
2692 | { | 2640 | { |
2693 | struct decode_cache *c = &ctxt->decode; | ||
2694 | int rc; | 2641 | int rc; |
2695 | ulong linear; | 2642 | ulong linear; |
2696 | 2643 | ||
2697 | rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear); | 2644 | rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear); |
2698 | if (rc == X86EMUL_CONTINUE) | 2645 | if (rc == X86EMUL_CONTINUE) |
2699 | ctxt->ops->invlpg(ctxt, linear); | 2646 | ctxt->ops->invlpg(ctxt, linear); |
2700 | /* Disable writeback. */ | 2647 | /* Disable writeback. */ |
2701 | c->dst.type = OP_NONE; | 2648 | ctxt->dst.type = OP_NONE; |
2702 | return X86EMUL_CONTINUE; | 2649 | return X86EMUL_CONTINUE; |
2703 | } | 2650 | } |
2704 | 2651 | ||
@@ -2714,10 +2661,9 @@ static int em_clts(struct x86_emulate_ctxt *ctxt) | |||
2714 | 2661 | ||
2715 | static int em_vmcall(struct x86_emulate_ctxt *ctxt) | 2662 | static int em_vmcall(struct x86_emulate_ctxt *ctxt) |
2716 | { | 2663 | { |
2717 | struct decode_cache *c = &ctxt->decode; | ||
2718 | int rc; | 2664 | int rc; |
2719 | 2665 | ||
2720 | if (c->modrm_mod != 3 || c->modrm_rm != 1) | 2666 | if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1) |
2721 | return X86EMUL_UNHANDLEABLE; | 2667 | return X86EMUL_UNHANDLEABLE; |
2722 | 2668 | ||
2723 | rc = ctxt->ops->fix_hypercall(ctxt); | 2669 | rc = ctxt->ops->fix_hypercall(ctxt); |
@@ -2725,73 +2671,104 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt) | |||
2725 | return rc; | 2671 | return rc; |
2726 | 2672 | ||
2727 | /* Let the processor re-execute the fixed hypercall */ | 2673 | /* Let the processor re-execute the fixed hypercall */ |
2728 | c->eip = ctxt->eip; | 2674 | ctxt->_eip = ctxt->eip; |
2729 | /* Disable writeback. */ | 2675 | /* Disable writeback. */ |
2730 | c->dst.type = OP_NONE; | 2676 | ctxt->dst.type = OP_NONE; |
2731 | return X86EMUL_CONTINUE; | 2677 | return X86EMUL_CONTINUE; |
2732 | } | 2678 | } |
2733 | 2679 | ||
2734 | static int em_lgdt(struct x86_emulate_ctxt *ctxt) | 2680 | static int em_lgdt(struct x86_emulate_ctxt *ctxt) |
2735 | { | 2681 | { |
2736 | struct decode_cache *c = &ctxt->decode; | ||
2737 | struct desc_ptr desc_ptr; | 2682 | struct desc_ptr desc_ptr; |
2738 | int rc; | 2683 | int rc; |
2739 | 2684 | ||
2740 | rc = read_descriptor(ctxt, c->src.addr.mem, | 2685 | rc = read_descriptor(ctxt, ctxt->src.addr.mem, |
2741 | &desc_ptr.size, &desc_ptr.address, | 2686 | &desc_ptr.size, &desc_ptr.address, |
2742 | c->op_bytes); | 2687 | ctxt->op_bytes); |
2743 | if (rc != X86EMUL_CONTINUE) | 2688 | if (rc != X86EMUL_CONTINUE) |
2744 | return rc; | 2689 | return rc; |
2745 | ctxt->ops->set_gdt(ctxt, &desc_ptr); | 2690 | ctxt->ops->set_gdt(ctxt, &desc_ptr); |
2746 | /* Disable writeback. */ | 2691 | /* Disable writeback. */ |
2747 | c->dst.type = OP_NONE; | 2692 | ctxt->dst.type = OP_NONE; |
2748 | return X86EMUL_CONTINUE; | 2693 | return X86EMUL_CONTINUE; |
2749 | } | 2694 | } |
2750 | 2695 | ||
2751 | static int em_vmmcall(struct x86_emulate_ctxt *ctxt) | 2696 | static int em_vmmcall(struct x86_emulate_ctxt *ctxt) |
2752 | { | 2697 | { |
2753 | struct decode_cache *c = &ctxt->decode; | ||
2754 | int rc; | 2698 | int rc; |
2755 | 2699 | ||
2756 | rc = ctxt->ops->fix_hypercall(ctxt); | 2700 | rc = ctxt->ops->fix_hypercall(ctxt); |
2757 | 2701 | ||
2758 | /* Disable writeback. */ | 2702 | /* Disable writeback. */ |
2759 | c->dst.type = OP_NONE; | 2703 | ctxt->dst.type = OP_NONE; |
2760 | return rc; | 2704 | return rc; |
2761 | } | 2705 | } |
2762 | 2706 | ||
2763 | static int em_lidt(struct x86_emulate_ctxt *ctxt) | 2707 | static int em_lidt(struct x86_emulate_ctxt *ctxt) |
2764 | { | 2708 | { |
2765 | struct decode_cache *c = &ctxt->decode; | ||
2766 | struct desc_ptr desc_ptr; | 2709 | struct desc_ptr desc_ptr; |
2767 | int rc; | 2710 | int rc; |
2768 | 2711 | ||
2769 | rc = read_descriptor(ctxt, c->src.addr.mem, | 2712 | rc = read_descriptor(ctxt, ctxt->src.addr.mem, |
2770 | &desc_ptr.size, &desc_ptr.address, | 2713 | &desc_ptr.size, &desc_ptr.address, |
2771 | c->op_bytes); | 2714 | ctxt->op_bytes); |
2772 | if (rc != X86EMUL_CONTINUE) | 2715 | if (rc != X86EMUL_CONTINUE) |
2773 | return rc; | 2716 | return rc; |
2774 | ctxt->ops->set_idt(ctxt, &desc_ptr); | 2717 | ctxt->ops->set_idt(ctxt, &desc_ptr); |
2775 | /* Disable writeback. */ | 2718 | /* Disable writeback. */ |
2776 | c->dst.type = OP_NONE; | 2719 | ctxt->dst.type = OP_NONE; |
2777 | return X86EMUL_CONTINUE; | 2720 | return X86EMUL_CONTINUE; |
2778 | } | 2721 | } |
2779 | 2722 | ||
2780 | static int em_smsw(struct x86_emulate_ctxt *ctxt) | 2723 | static int em_smsw(struct x86_emulate_ctxt *ctxt) |
2781 | { | 2724 | { |
2782 | struct decode_cache *c = &ctxt->decode; | 2725 | ctxt->dst.bytes = 2; |
2783 | 2726 | ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0); | |
2784 | c->dst.bytes = 2; | ||
2785 | c->dst.val = ctxt->ops->get_cr(ctxt, 0); | ||
2786 | return X86EMUL_CONTINUE; | 2727 | return X86EMUL_CONTINUE; |
2787 | } | 2728 | } |
2788 | 2729 | ||
2789 | static int em_lmsw(struct x86_emulate_ctxt *ctxt) | 2730 | static int em_lmsw(struct x86_emulate_ctxt *ctxt) |
2790 | { | 2731 | { |
2791 | struct decode_cache *c = &ctxt->decode; | ||
2792 | ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) | 2732 | ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) |
2793 | | (c->src.val & 0x0f)); | 2733 | | (ctxt->src.val & 0x0f)); |
2794 | c->dst.type = OP_NONE; | 2734 | ctxt->dst.type = OP_NONE; |
2735 | return X86EMUL_CONTINUE; | ||
2736 | } | ||
2737 | |||
2738 | static int em_loop(struct x86_emulate_ctxt *ctxt) | ||
2739 | { | ||
2740 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); | ||
2741 | if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) && | ||
2742 | (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) | ||
2743 | jmp_rel(ctxt, ctxt->src.val); | ||
2744 | |||
2745 | return X86EMUL_CONTINUE; | ||
2746 | } | ||
2747 | |||
2748 | static int em_jcxz(struct x86_emulate_ctxt *ctxt) | ||
2749 | { | ||
2750 | if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) | ||
2751 | jmp_rel(ctxt, ctxt->src.val); | ||
2752 | |||
2753 | return X86EMUL_CONTINUE; | ||
2754 | } | ||
2755 | |||
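The condition in em_loop folds LOOP, LOOPE and LOOPNE into one handler: XORing the opcode with 0x5 maps 0xe0 (LOOPNE) to condition code 5 (ZF clear) and 0xe1 (LOOPE) to condition code 4 (ZF set), while plain LOOP (0xe2) is short-circuited before test_cc() runs. A standalone check of that mapping, with test_cc reduced to the two condition codes involved:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define ZF (1u << 6)    /* x86 EFLAGS.ZF */

/* Simplified test_cc: only the two codes em_loop can reach
 * (4 = ZF set, 5 = ZF clear). */
static bool test_cc_zf(uint8_t cc, uint32_t eflags)
{
        bool zf = eflags & ZF;

        return (cc & 0xf) == 4 ? zf : !zf;
}

int main(void)
{
        /* LOOPNE (0xe0) ^ 0x5 -> cc 5: keep looping while ZF == 0 */
        assert(test_cc_zf(0xe0 ^ 0x5, 0) == true);
        assert(test_cc_zf(0xe0 ^ 0x5, ZF) == false);
        /* LOOPE (0xe1) ^ 0x5 -> cc 4: keep looping while ZF == 1 */
        assert(test_cc_zf(0xe1 ^ 0x5, ZF) == true);
        assert(test_cc_zf(0xe1 ^ 0x5, 0) == false);
        return 0;
}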
2756 | static int em_cli(struct x86_emulate_ctxt *ctxt) | ||
2757 | { | ||
2758 | if (emulator_bad_iopl(ctxt)) | ||
2759 | return emulate_gp(ctxt, 0); | ||
2760 | |||
2761 | ctxt->eflags &= ~X86_EFLAGS_IF; | ||
2762 | return X86EMUL_CONTINUE; | ||
2763 | } | ||
2764 | |||
2765 | static int em_sti(struct x86_emulate_ctxt *ctxt) | ||
2766 | { | ||
2767 | if (emulator_bad_iopl(ctxt)) | ||
2768 | return emulate_gp(ctxt, 0); | ||
2769 | |||
2770 | ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; | ||
2771 | ctxt->eflags |= X86_EFLAGS_IF; | ||
2795 | return X86EMUL_CONTINUE; | 2772 | return X86EMUL_CONTINUE; |
2796 | } | 2773 | } |
2797 | 2774 | ||
@@ -2809,9 +2786,7 @@ static bool valid_cr(int nr) | |||
2809 | 2786 | ||
2810 | static int check_cr_read(struct x86_emulate_ctxt *ctxt) | 2787 | static int check_cr_read(struct x86_emulate_ctxt *ctxt) |
2811 | { | 2788 | { |
2812 | struct decode_cache *c = &ctxt->decode; | 2789 | if (!valid_cr(ctxt->modrm_reg)) |
2813 | |||
2814 | if (!valid_cr(c->modrm_reg)) | ||
2815 | return emulate_ud(ctxt); | 2790 | return emulate_ud(ctxt); |
2816 | 2791 | ||
2817 | return X86EMUL_CONTINUE; | 2792 | return X86EMUL_CONTINUE; |
@@ -2819,9 +2794,8 @@ static int check_cr_read(struct x86_emulate_ctxt *ctxt) | |||
2819 | 2794 | ||
2820 | static int check_cr_write(struct x86_emulate_ctxt *ctxt) | 2795 | static int check_cr_write(struct x86_emulate_ctxt *ctxt) |
2821 | { | 2796 | { |
2822 | struct decode_cache *c = &ctxt->decode; | 2797 | u64 new_val = ctxt->src.val64; |
2823 | u64 new_val = c->src.val64; | 2798 | int cr = ctxt->modrm_reg; |
2824 | int cr = c->modrm_reg; | ||
2825 | u64 efer = 0; | 2799 | u64 efer = 0; |
2826 | 2800 | ||
2827 | static u64 cr_reserved_bits[] = { | 2801 | static u64 cr_reserved_bits[] = { |
@@ -2898,8 +2872,7 @@ static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) | |||
2898 | 2872 | ||
2899 | static int check_dr_read(struct x86_emulate_ctxt *ctxt) | 2873 | static int check_dr_read(struct x86_emulate_ctxt *ctxt) |
2900 | { | 2874 | { |
2901 | struct decode_cache *c = &ctxt->decode; | 2875 | int dr = ctxt->modrm_reg; |
2902 | int dr = c->modrm_reg; | ||
2903 | u64 cr4; | 2876 | u64 cr4; |
2904 | 2877 | ||
2905 | if (dr > 7) | 2878 | if (dr > 7) |
@@ -2917,9 +2890,8 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt) | |||
2917 | 2890 | ||
2918 | static int check_dr_write(struct x86_emulate_ctxt *ctxt) | 2891 | static int check_dr_write(struct x86_emulate_ctxt *ctxt) |
2919 | { | 2892 | { |
2920 | struct decode_cache *c = &ctxt->decode; | 2893 | u64 new_val = ctxt->src.val64; |
2921 | u64 new_val = c->src.val64; | 2894 | int dr = ctxt->modrm_reg; |
2922 | int dr = c->modrm_reg; | ||
2923 | 2895 | ||
2924 | if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) | 2896 | if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) |
2925 | return emulate_gp(ctxt, 0); | 2897 | return emulate_gp(ctxt, 0); |
@@ -2941,7 +2913,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt) | |||
2941 | 2913 | ||
2942 | static int check_svme_pa(struct x86_emulate_ctxt *ctxt) | 2914 | static int check_svme_pa(struct x86_emulate_ctxt *ctxt) |
2943 | { | 2915 | { |
2944 | u64 rax = ctxt->decode.regs[VCPU_REGS_RAX]; | 2916 | u64 rax = ctxt->regs[VCPU_REGS_RAX]; |
2945 | 2917 | ||
2946 | /* Valid physical address? */ | 2918 | /* Valid physical address? */ |
2947 | if (rax & 0xffff000000000000ULL) | 2919 | if (rax & 0xffff000000000000ULL) |
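check_svme_pa() guards the SVM instructions that take a guest-physical address in RAX (VMRUN, VMLOAD, VMSAVE and friends): any value with bits 63:48 set cannot be a valid physical address, so it is rejected before the intercept is even considered. Equivalently:

    /* Sketch of the mask test: 0xffff000000000000ULL keeps bits 63:48. */
    static int svme_pa_ok(unsigned long long rax)
    {
        return (rax & 0xffff000000000000ULL) == 0;
    }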
@@ -2963,7 +2935,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt) | |||
2963 | static int check_rdpmc(struct x86_emulate_ctxt *ctxt) | 2935 | static int check_rdpmc(struct x86_emulate_ctxt *ctxt) |
2964 | { | 2936 | { |
2965 | u64 cr4 = ctxt->ops->get_cr(ctxt, 4); | 2937 | u64 cr4 = ctxt->ops->get_cr(ctxt, 4); |
2966 | u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX]; | 2938 | u64 rcx = ctxt->regs[VCPU_REGS_RCX]; |
2967 | 2939 | ||
2968 | if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || | 2940 | if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || |
2969 | (rcx > 3)) | 2941 | (rcx > 3)) |
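check_rdpmc mirrors the hardware rule: RDPMC is permitted from user mode only when CR4.PCE is set, and the counter selector in RCX must name an existing counter (indices 0 through 3 here). Restated compactly (sketch; X86_CR4_PCE is bit 8):

    static int rdpmc_allowed(unsigned long cr4, int cpl, unsigned long rcx)
    {
        int pce = (cr4 >> 8) & 1;

        return (pce || cpl == 0) && rcx <= 3;
    }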
@@ -2974,10 +2946,8 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) | |||
2974 | 2946 | ||
2975 | static int check_perm_in(struct x86_emulate_ctxt *ctxt) | 2947 | static int check_perm_in(struct x86_emulate_ctxt *ctxt) |
2976 | { | 2948 | { |
2977 | struct decode_cache *c = &ctxt->decode; | 2949 | ctxt->dst.bytes = min(ctxt->dst.bytes, 4u); |
2978 | 2950 | if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes)) | |
2979 | c->dst.bytes = min(c->dst.bytes, 4u); | ||
2980 | if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes)) | ||
2981 | return emulate_gp(ctxt, 0); | 2951 | return emulate_gp(ctxt, 0); |
2982 | 2952 | ||
2983 | return X86EMUL_CONTINUE; | 2953 | return X86EMUL_CONTINUE; |
@@ -2985,10 +2955,8 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt) | |||
2985 | 2955 | ||
2986 | static int check_perm_out(struct x86_emulate_ctxt *ctxt) | 2956 | static int check_perm_out(struct x86_emulate_ctxt *ctxt) |
2987 | { | 2957 | { |
2988 | struct decode_cache *c = &ctxt->decode; | 2958 | ctxt->src.bytes = min(ctxt->src.bytes, 4u); |
2989 | 2959 | if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes)) | |
2990 | c->src.bytes = min(c->src.bytes, 4u); | ||
2991 | if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes)) | ||
2992 | return emulate_gp(ctxt, 0); | 2960 | return emulate_gp(ctxt, 0); |
2993 | 2961 | ||
2994 | return X86EMUL_CONTINUE; | 2962 | return X86EMUL_CONTINUE; |
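check_perm_in and check_perm_out clamp the access width to four bytes and then ask emulator_io_permited() (which, after this patch, no longer needs the ops argument) whether the port range is accessible. When the IOPL check fails, that decision ultimately comes down to the TSS I/O permission bitmap, where a set bit denies the port; a dword access therefore needs four consecutive clear bits. A self-contained sketch of that walk, with hypothetical parameters:

    static int io_bitmap_allows(const unsigned char *bitmap,
                                unsigned int port, unsigned int len)
    {
        unsigned int i;

        for (i = 0; i < len; i++) {
            unsigned int p = port + i;
            if (bitmap[p / 8] & (1 << (p % 8)))
                return 0;   /* set bit -> access denied */
        }
        return 1;
    }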
@@ -3165,12 +3133,15 @@ static struct opcode opcode_table[256] = { | |||
3165 | G(DstMem | SrcImm | ModRM | Group, group1), | 3133 | G(DstMem | SrcImm | ModRM | Group, group1), |
3166 | G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), | 3134 | G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), |
3167 | G(DstMem | SrcImmByte | ModRM | Group, group1), | 3135 | G(DstMem | SrcImmByte | ModRM | Group, group1), |
3168 | D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), | 3136 | I2bv(DstMem | SrcReg | ModRM, em_test), |
3137 | I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg), | ||
3169 | /* 0x88 - 0x8F */ | 3138 | /* 0x88 - 0x8F */ |
3170 | I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), | 3139 | I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), |
3171 | I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), | 3140 | I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), |
3172 | D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), | 3141 | I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg), |
3173 | D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), | 3142 | D(ModRM | SrcMem | NoAccess | DstReg), |
3143 | I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm), | ||
3144 | G(0, group1A), | ||
3174 | /* 0x90 - 0x97 */ | 3145 | /* 0x90 - 0x97 */ |
3175 | DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), | 3146 | DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), |
3176 | /* 0x98 - 0x9F */ | 3147 | /* 0x98 - 0x9F */ |
@@ -3184,7 +3155,7 @@ static struct opcode opcode_table[256] = { | |||
3184 | I2bv(SrcSI | DstDI | Mov | String, em_mov), | 3155 | I2bv(SrcSI | DstDI | Mov | String, em_mov), |
3185 | I2bv(SrcSI | DstDI | String, em_cmp), | 3156 | I2bv(SrcSI | DstDI | String, em_cmp), |
3186 | /* 0xA8 - 0xAF */ | 3157 | /* 0xA8 - 0xAF */ |
3187 | D2bv(DstAcc | SrcImm), | 3158 | I2bv(DstAcc | SrcImm, em_test), |
3188 | I2bv(SrcAcc | DstDI | Mov | String, em_mov), | 3159 | I2bv(SrcAcc | DstDI | Mov | String, em_mov), |
3189 | I2bv(SrcSI | DstAcc | Mov | String, em_mov), | 3160 | I2bv(SrcSI | DstAcc | Mov | String, em_mov), |
3190 | I2bv(SrcAcc | DstDI | String, em_cmp), | 3161 | I2bv(SrcAcc | DstDI | String, em_cmp), |
@@ -3195,25 +3166,26 @@ static struct opcode opcode_table[256] = { | |||
3195 | /* 0xC0 - 0xC7 */ | 3166 | /* 0xC0 - 0xC7 */ |
3196 | D2bv(DstMem | SrcImmByte | ModRM), | 3167 | D2bv(DstMem | SrcImmByte | ModRM), |
3197 | I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), | 3168 | I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), |
3198 | D(ImplicitOps | Stack), | 3169 | I(ImplicitOps | Stack, em_ret), |
3199 | D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), | 3170 | D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), |
3200 | G(ByteOp, group11), G(0, group11), | 3171 | G(ByteOp, group11), G(0, group11), |
3201 | /* 0xC8 - 0xCF */ | 3172 | /* 0xC8 - 0xCF */ |
3202 | N, N, N, D(ImplicitOps | Stack), | 3173 | N, N, N, I(ImplicitOps | Stack, em_ret_far), |
3203 | D(ImplicitOps), DI(SrcImmByte, intn), | 3174 | D(ImplicitOps), DI(SrcImmByte, intn), |
3204 | D(ImplicitOps | No64), DI(ImplicitOps, iret), | 3175 | D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), |
3205 | /* 0xD0 - 0xD7 */ | 3176 | /* 0xD0 - 0xD7 */ |
3206 | D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), | 3177 | D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), |
3207 | N, N, N, N, | 3178 | N, N, N, N, |
3208 | /* 0xD8 - 0xDF */ | 3179 | /* 0xD8 - 0xDF */ |
3209 | N, N, N, N, N, N, N, N, | 3180 | N, N, N, N, N, N, N, N, |
3210 | /* 0xE0 - 0xE7 */ | 3181 | /* 0xE0 - 0xE7 */ |
3211 | X4(D(SrcImmByte)), | 3182 | X3(I(SrcImmByte, em_loop)), |
3183 | I(SrcImmByte, em_jcxz), | ||
3212 | D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), | 3184 | D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), |
3213 | D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), | 3185 | D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), |
3214 | /* 0xE8 - 0xEF */ | 3186 | /* 0xE8 - 0xEF */ |
3215 | D(SrcImm | Stack), D(SrcImm | ImplicitOps), | 3187 | D(SrcImm | Stack), D(SrcImm | ImplicitOps), |
3216 | D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), | 3188 | I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps), |
3217 | D2bvIP(SrcDX | DstAcc, in, check_perm_in), | 3189 | D2bvIP(SrcDX | DstAcc, in, check_perm_in), |
3218 | D2bvIP(SrcAcc | DstDX, out, check_perm_out), | 3190 | D2bvIP(SrcAcc | DstDX, out, check_perm_out), |
3219 | /* 0xF0 - 0xF7 */ | 3191 | /* 0xF0 - 0xF7 */ |
@@ -3221,14 +3193,16 @@ static struct opcode opcode_table[256] = { | |||
3221 | DI(ImplicitOps | Priv, hlt), D(ImplicitOps), | 3193 | DI(ImplicitOps | Priv, hlt), D(ImplicitOps), |
3222 | G(ByteOp, group3), G(0, group3), | 3194 | G(ByteOp, group3), G(0, group3), |
3223 | /* 0xF8 - 0xFF */ | 3195 | /* 0xF8 - 0xFF */ |
3224 | D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), | 3196 | D(ImplicitOps), D(ImplicitOps), |
3197 | I(ImplicitOps, em_cli), I(ImplicitOps, em_sti), | ||
3225 | D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), | 3198 | D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), |
3226 | }; | 3199 | }; |
3227 | 3200 | ||
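The table rewrites above (D -> I, D2bv -> I2bv, DI -> II) are the heart of this cleanup: a D() entry only decodes, leaving execution to the big switch in x86_emulate_insn(), whereas an I() entry carries an ->execute callback, which is what lets the switch cases for test, xchg, mov to/from sreg, ret, ret far, iret, loop, jcxz, jmp far, cli and sti be deleted further down in this patch. In simplified form, an entry holds:

    /* Simplified sketch of an opcode-table slot (not the real types). */
    struct x86_emulate_ctxt;

    struct opcode_sketch {
        unsigned long flags;                            /* DstMem | SrcReg | ... */
        int (*execute)(struct x86_emulate_ctxt *ctxt);  /* NULL for D() entries */
    };

    #define D_(f)    { (f), NULL }
    #define I_(f, e) { (f), (e) }
    /* I2bv(f, e) expands to a ByteOp variant plus a word/long variant
     * that share the same handler e. */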
3228 | static struct opcode twobyte_table[256] = { | 3201 | static struct opcode twobyte_table[256] = { |
3229 | /* 0x00 - 0x0F */ | 3202 | /* 0x00 - 0x0F */ |
3230 | G(0, group6), GD(0, &group7), N, N, | 3203 | G(0, group6), GD(0, &group7), N, N, |
3231 | N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N, | 3204 | N, I(ImplicitOps | VendorSpecific, em_syscall), |
3205 | II(ImplicitOps | Priv, em_clts, clts), N, | ||
3232 | DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, | 3206 | DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, |
3233 | N, D(ImplicitOps | ModRM), N, N, | 3207 | N, D(ImplicitOps | ModRM), N, N, |
3234 | /* 0x10 - 0x1F */ | 3208 | /* 0x10 - 0x1F */ |
@@ -3245,7 +3219,8 @@ static struct opcode twobyte_table[256] = { | |||
3245 | IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), | 3219 | IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), |
3246 | DI(ImplicitOps | Priv, rdmsr), | 3220 | DI(ImplicitOps | Priv, rdmsr), |
3247 | DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), | 3221 | DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), |
3248 | D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), | 3222 | I(ImplicitOps | VendorSpecific, em_sysenter), |
3223 | I(ImplicitOps | Priv | VendorSpecific, em_sysexit), | ||
3249 | N, N, | 3224 | N, N, |
3250 | N, N, N, N, N, N, N, N, | 3225 | N, N, N, N, N, N, N, N, |
3251 | /* 0x40 - 0x4F */ | 3226 | /* 0x40 - 0x4F */ |
@@ -3313,11 +3288,11 @@ static struct opcode twobyte_table[256] = { | |||
3313 | #undef I2bv | 3288 | #undef I2bv |
3314 | #undef I6ALU | 3289 | #undef I6ALU |
3315 | 3290 | ||
3316 | static unsigned imm_size(struct decode_cache *c) | 3291 | static unsigned imm_size(struct x86_emulate_ctxt *ctxt) |
3317 | { | 3292 | { |
3318 | unsigned size; | 3293 | unsigned size; |
3319 | 3294 | ||
3320 | size = (c->d & ByteOp) ? 1 : c->op_bytes; | 3295 | size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3321 | if (size == 8) | 3296 | if (size == 8) |
3322 | size = 4; | 3297 | size = 4; |
3323 | return size; | 3298 | return size; |
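Note the cap in imm_size(): even at a 64-bit operand size the fetched immediate is four bytes, matching the x86 rule that 64-bit-operand instructions take a sign-extended imm32 (a full imm64 exists essentially only for MOV r64, imm64, which is decoded elsewhere). As a worked example:

    /* Sketch: immediate width by operand size for a non-ByteOp insn. */
    static unsigned int imm_size_sketch(unsigned int op_bytes)
    {
        return op_bytes == 8 ? 4 : op_bytes;  /* 2 -> 2, 4 -> 4, 8 -> 4 */
    }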
@@ -3326,23 +3301,21 @@ static unsigned imm_size(struct decode_cache *c) | |||
3326 | static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, | 3301 | static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, |
3327 | unsigned size, bool sign_extension) | 3302 | unsigned size, bool sign_extension) |
3328 | { | 3303 | { |
3329 | struct decode_cache *c = &ctxt->decode; | ||
3330 | struct x86_emulate_ops *ops = ctxt->ops; | ||
3331 | int rc = X86EMUL_CONTINUE; | 3304 | int rc = X86EMUL_CONTINUE; |
3332 | 3305 | ||
3333 | op->type = OP_IMM; | 3306 | op->type = OP_IMM; |
3334 | op->bytes = size; | 3307 | op->bytes = size; |
3335 | op->addr.mem.ea = c->eip; | 3308 | op->addr.mem.ea = ctxt->_eip; |
3336 | /* NB. Immediates are sign-extended as necessary. */ | 3309 | /* NB. Immediates are sign-extended as necessary. */ |
3337 | switch (op->bytes) { | 3310 | switch (op->bytes) { |
3338 | case 1: | 3311 | case 1: |
3339 | op->val = insn_fetch(s8, 1, c->eip); | 3312 | op->val = insn_fetch(s8, 1, ctxt->_eip); |
3340 | break; | 3313 | break; |
3341 | case 2: | 3314 | case 2: |
3342 | op->val = insn_fetch(s16, 2, c->eip); | 3315 | op->val = insn_fetch(s16, 2, ctxt->_eip); |
3343 | break; | 3316 | break; |
3344 | case 4: | 3317 | case 4: |
3345 | op->val = insn_fetch(s32, 4, c->eip); | 3318 | op->val = insn_fetch(s32, 4, ctxt->_eip); |
3346 | break; | 3319 | break; |
3347 | } | 3320 | } |
3348 | if (!sign_extension) { | 3321 | if (!sign_extension) { |
@@ -3362,11 +3335,8 @@ done: | |||
3362 | return rc; | 3335 | return rc; |
3363 | } | 3336 | } |
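decode_imm() always fetches sign-extended (note the s8/s16/s32 casts), and the !sign_extension branch at the end masks the value back down for the unsigned-immediate cases (SrcImmU, SrcImmU16, SrcImmUByte). That masking amounts to:

    /* Sketch: turn a sign-extended fetch into a zero-extended value. */
    static unsigned long zero_extend(unsigned long val, unsigned int bytes)
    {
        if (bytes >= sizeof(unsigned long))
            return val;
        return val & ((1UL << (bytes * 8)) - 1);
    }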
3364 | 3337 | ||
3365 | int | 3338 | int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) |
3366 | x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) | ||
3367 | { | 3339 | { |
3368 | struct x86_emulate_ops *ops = ctxt->ops; | ||
3369 | struct decode_cache *c = &ctxt->decode; | ||
3370 | int rc = X86EMUL_CONTINUE; | 3340 | int rc = X86EMUL_CONTINUE; |
3371 | int mode = ctxt->mode; | 3341 | int mode = ctxt->mode; |
3372 | int def_op_bytes, def_ad_bytes, goffset, simd_prefix; | 3342 | int def_op_bytes, def_ad_bytes, goffset, simd_prefix; |
@@ -3374,11 +3344,11 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) | |||
3374 | struct opcode opcode; | 3344 | struct opcode opcode; |
3375 | struct operand memop = { .type = OP_NONE }, *memopp = NULL; | 3345 | struct operand memop = { .type = OP_NONE }, *memopp = NULL; |
3376 | 3346 | ||
3377 | c->eip = ctxt->eip; | 3347 | ctxt->_eip = ctxt->eip; |
3378 | c->fetch.start = c->eip; | 3348 | ctxt->fetch.start = ctxt->_eip; |
3379 | c->fetch.end = c->fetch.start + insn_len; | 3349 | ctxt->fetch.end = ctxt->fetch.start + insn_len; |
3380 | if (insn_len > 0) | 3350 | if (insn_len > 0) |
3381 | memcpy(c->fetch.data, insn, insn_len); | 3351 | memcpy(ctxt->fetch.data, insn, insn_len); |
3382 | 3352 | ||
3383 | switch (mode) { | 3353 | switch (mode) { |
3384 | case X86EMUL_MODE_REAL: | 3354 | case X86EMUL_MODE_REAL: |
@@ -3399,46 +3369,46 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) | |||
3399 | return -1; | 3369 | return -1; |
3400 | } | 3370 | } |
3401 | 3371 | ||
3402 | c->op_bytes = def_op_bytes; | 3372 | ctxt->op_bytes = def_op_bytes; |
3403 | c->ad_bytes = def_ad_bytes; | 3373 | ctxt->ad_bytes = def_ad_bytes; |
3404 | 3374 | ||
3405 | /* Legacy prefixes. */ | 3375 | /* Legacy prefixes. */ |
3406 | for (;;) { | 3376 | for (;;) { |
3407 | switch (c->b = insn_fetch(u8, 1, c->eip)) { | 3377 | switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) { |
3408 | case 0x66: /* operand-size override */ | 3378 | case 0x66: /* operand-size override */ |
3409 | op_prefix = true; | 3379 | op_prefix = true; |
3410 | /* switch between 2/4 bytes */ | 3380 | /* switch between 2/4 bytes */ |
3411 | c->op_bytes = def_op_bytes ^ 6; | 3381 | ctxt->op_bytes = def_op_bytes ^ 6; |
3412 | break; | 3382 | break; |
3413 | case 0x67: /* address-size override */ | 3383 | case 0x67: /* address-size override */ |
3414 | if (mode == X86EMUL_MODE_PROT64) | 3384 | if (mode == X86EMUL_MODE_PROT64) |
3415 | /* switch between 4/8 bytes */ | 3385 | /* switch between 4/8 bytes */ |
3416 | c->ad_bytes = def_ad_bytes ^ 12; | 3386 | ctxt->ad_bytes = def_ad_bytes ^ 12; |
3417 | else | 3387 | else |
3418 | /* switch between 2/4 bytes */ | 3388 | /* switch between 2/4 bytes */ |
3419 | c->ad_bytes = def_ad_bytes ^ 6; | 3389 | ctxt->ad_bytes = def_ad_bytes ^ 6; |
3420 | break; | 3390 | break; |
3421 | case 0x26: /* ES override */ | 3391 | case 0x26: /* ES override */ |
3422 | case 0x2e: /* CS override */ | 3392 | case 0x2e: /* CS override */ |
3423 | case 0x36: /* SS override */ | 3393 | case 0x36: /* SS override */ |
3424 | case 0x3e: /* DS override */ | 3394 | case 0x3e: /* DS override */ |
3425 | set_seg_override(c, (c->b >> 3) & 3); | 3395 | set_seg_override(ctxt, (ctxt->b >> 3) & 3); |
3426 | break; | 3396 | break; |
3427 | case 0x64: /* FS override */ | 3397 | case 0x64: /* FS override */ |
3428 | case 0x65: /* GS override */ | 3398 | case 0x65: /* GS override */ |
3429 | set_seg_override(c, c->b & 7); | 3399 | set_seg_override(ctxt, ctxt->b & 7); |
3430 | break; | 3400 | break; |
3431 | case 0x40 ... 0x4f: /* REX */ | 3401 | case 0x40 ... 0x4f: /* REX */ |
3432 | if (mode != X86EMUL_MODE_PROT64) | 3402 | if (mode != X86EMUL_MODE_PROT64) |
3433 | goto done_prefixes; | 3403 | goto done_prefixes; |
3434 | c->rex_prefix = c->b; | 3404 | ctxt->rex_prefix = ctxt->b; |
3435 | continue; | 3405 | continue; |
3436 | case 0xf0: /* LOCK */ | 3406 | case 0xf0: /* LOCK */ |
3437 | c->lock_prefix = 1; | 3407 | ctxt->lock_prefix = 1; |
3438 | break; | 3408 | break; |
3439 | case 0xf2: /* REPNE/REPNZ */ | 3409 | case 0xf2: /* REPNE/REPNZ */ |
3440 | case 0xf3: /* REP/REPE/REPZ */ | 3410 | case 0xf3: /* REP/REPE/REPZ */ |
3441 | c->rep_prefix = c->b; | 3411 | ctxt->rep_prefix = ctxt->b; |
3442 | break; | 3412 | break; |
3443 | default: | 3413 | default: |
3444 | goto done_prefixes; | 3414 | goto done_prefixes; |
@@ -3446,50 +3416,50 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) | |||
3446 | 3416 | ||
3447 | /* Any legacy prefix after a REX prefix nullifies its effect. */ | 3417 | /* Any legacy prefix after a REX prefix nullifies its effect. */ |
3448 | 3418 | ||
3449 | c->rex_prefix = 0; | 3419 | ctxt->rex_prefix = 0; |
3450 | } | 3420 | } |
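A small trick in the prefix loop above deserves a note: the size overrides toggle by XOR rather than assignment, so one expression serves both default sizes. With a 32-bit default, 0x66 gives 16-bit operands; with a 16-bit default, it gives 32-bit ones, and likewise for 0x67 and addresses. Concretely:

    #include <assert.h>

    /* The XOR toggles behind the 0x66/0x67 prefix handling. */
    static void check_prefix_toggles(void)
    {
        assert((2 ^ 6) == 4 && (4 ^ 6) == 2);   /* op/ad size: 16 <-> 32 */
        assert((4 ^ 12) == 8 && (8 ^ 12) == 4); /* ad size in long mode: 32 <-> 64 */
    }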
3451 | 3421 | ||
3452 | done_prefixes: | 3422 | done_prefixes: |
3453 | 3423 | ||
3454 | /* REX prefix. */ | 3424 | /* REX prefix. */ |
3455 | if (c->rex_prefix & 8) | 3425 | if (ctxt->rex_prefix & 8) |
3456 | c->op_bytes = 8; /* REX.W */ | 3426 | ctxt->op_bytes = 8; /* REX.W */ |
3457 | 3427 | ||
3458 | /* Opcode byte(s). */ | 3428 | /* Opcode byte(s). */ |
3459 | opcode = opcode_table[c->b]; | 3429 | opcode = opcode_table[ctxt->b]; |
3460 | /* Two-byte opcode? */ | 3430 | /* Two-byte opcode? */ |
3461 | if (c->b == 0x0f) { | 3431 | if (ctxt->b == 0x0f) { |
3462 | c->twobyte = 1; | 3432 | ctxt->twobyte = 1; |
3463 | c->b = insn_fetch(u8, 1, c->eip); | 3433 | ctxt->b = insn_fetch(u8, 1, ctxt->_eip); |
3464 | opcode = twobyte_table[c->b]; | 3434 | opcode = twobyte_table[ctxt->b]; |
3465 | } | 3435 | } |
3466 | c->d = opcode.flags; | 3436 | ctxt->d = opcode.flags; |
3467 | 3437 | ||
3468 | while (c->d & GroupMask) { | 3438 | while (ctxt->d & GroupMask) { |
3469 | switch (c->d & GroupMask) { | 3439 | switch (ctxt->d & GroupMask) { |
3470 | case Group: | 3440 | case Group: |
3471 | c->modrm = insn_fetch(u8, 1, c->eip); | 3441 | ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); |
3472 | --c->eip; | 3442 | --ctxt->_eip; |
3473 | goffset = (c->modrm >> 3) & 7; | 3443 | goffset = (ctxt->modrm >> 3) & 7; |
3474 | opcode = opcode.u.group[goffset]; | 3444 | opcode = opcode.u.group[goffset]; |
3475 | break; | 3445 | break; |
3476 | case GroupDual: | 3446 | case GroupDual: |
3477 | c->modrm = insn_fetch(u8, 1, c->eip); | 3447 | ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); |
3478 | --c->eip; | 3448 | --ctxt->_eip; |
3479 | goffset = (c->modrm >> 3) & 7; | 3449 | goffset = (ctxt->modrm >> 3) & 7; |
3480 | if ((c->modrm >> 6) == 3) | 3450 | if ((ctxt->modrm >> 6) == 3) |
3481 | opcode = opcode.u.gdual->mod3[goffset]; | 3451 | opcode = opcode.u.gdual->mod3[goffset]; |
3482 | else | 3452 | else |
3483 | opcode = opcode.u.gdual->mod012[goffset]; | 3453 | opcode = opcode.u.gdual->mod012[goffset]; |
3484 | break; | 3454 | break; |
3485 | case RMExt: | 3455 | case RMExt: |
3486 | goffset = c->modrm & 7; | 3456 | goffset = ctxt->modrm & 7; |
3487 | opcode = opcode.u.group[goffset]; | 3457 | opcode = opcode.u.group[goffset]; |
3488 | break; | 3458 | break; |
3489 | case Prefix: | 3459 | case Prefix: |
3490 | if (c->rep_prefix && op_prefix) | 3460 | if (ctxt->rep_prefix && op_prefix) |
3491 | return X86EMUL_UNHANDLEABLE; | 3461 | return X86EMUL_UNHANDLEABLE; |
3492 | simd_prefix = op_prefix ? 0x66 : c->rep_prefix; | 3462 | simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix; |
3493 | switch (simd_prefix) { | 3463 | switch (simd_prefix) { |
3494 | case 0x00: opcode = opcode.u.gprefix->pfx_no; break; | 3464 | case 0x00: opcode = opcode.u.gprefix->pfx_no; break; |
3495 | case 0x66: opcode = opcode.u.gprefix->pfx_66; break; | 3465 | case 0x66: opcode = opcode.u.gprefix->pfx_66; break; |
@@ -3501,61 +3471,61 @@ done_prefixes: | |||
3501 | return X86EMUL_UNHANDLEABLE; | 3471 | return X86EMUL_UNHANDLEABLE; |
3502 | } | 3472 | } |
3503 | 3473 | ||
3504 | c->d &= ~GroupMask; | 3474 | ctxt->d &= ~GroupMask; |
3505 | c->d |= opcode.flags; | 3475 | ctxt->d |= opcode.flags; |
3506 | } | 3476 | } |
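The loop above resolves grouped opcodes: for Group and GroupDual the ModRM byte is peeked (fetched, then _eip rewound so the regular ModRM decode sees it again) and its reg field selects a sub-opcode, with GroupDual further splitting on whether mod == 3, i.e. whether the operand is a register. The field extraction is just:

    /* ModRM byte layout: mod (7:6), reg (5:3), rm (2:0). */
    static void modrm_fields(unsigned char modrm, int *mod, int *reg, int *rm)
    {
        *mod = (modrm >> 6) & 3;  /* 3 -> register form (gdual->mod3) */
        *reg = (modrm >> 3) & 7;  /* group sub-table index */
        *rm  =  modrm       & 7;  /* RMExt sub-table index */
    }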
3507 | 3477 | ||
3508 | c->execute = opcode.u.execute; | 3478 | ctxt->execute = opcode.u.execute; |
3509 | c->check_perm = opcode.check_perm; | 3479 | ctxt->check_perm = opcode.check_perm; |
3510 | c->intercept = opcode.intercept; | 3480 | ctxt->intercept = opcode.intercept; |
3511 | 3481 | ||
3512 | /* Unrecognised? */ | 3482 | /* Unrecognised? */ |
3513 | if (c->d == 0 || (c->d & Undefined)) | 3483 | if (ctxt->d == 0 || (ctxt->d & Undefined)) |
3514 | return -1; | 3484 | return -1; |
3515 | 3485 | ||
3516 | if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn) | 3486 | if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) |
3517 | return -1; | 3487 | return -1; |
3518 | 3488 | ||
3519 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) | 3489 | if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) |
3520 | c->op_bytes = 8; | 3490 | ctxt->op_bytes = 8; |
3521 | 3491 | ||
3522 | if (c->d & Op3264) { | 3492 | if (ctxt->d & Op3264) { |
3523 | if (mode == X86EMUL_MODE_PROT64) | 3493 | if (mode == X86EMUL_MODE_PROT64) |
3524 | c->op_bytes = 8; | 3494 | ctxt->op_bytes = 8; |
3525 | else | 3495 | else |
3526 | c->op_bytes = 4; | 3496 | ctxt->op_bytes = 4; |
3527 | } | 3497 | } |
3528 | 3498 | ||
3529 | if (c->d & Sse) | 3499 | if (ctxt->d & Sse) |
3530 | c->op_bytes = 16; | 3500 | ctxt->op_bytes = 16; |
3531 | 3501 | ||
3532 | /* ModRM and SIB bytes. */ | 3502 | /* ModRM and SIB bytes. */ |
3533 | if (c->d & ModRM) { | 3503 | if (ctxt->d & ModRM) { |
3534 | rc = decode_modrm(ctxt, ops, &memop); | 3504 | rc = decode_modrm(ctxt, &memop); |
3535 | if (!c->has_seg_override) | 3505 | if (!ctxt->has_seg_override) |
3536 | set_seg_override(c, c->modrm_seg); | 3506 | set_seg_override(ctxt, ctxt->modrm_seg); |
3537 | } else if (c->d & MemAbs) | 3507 | } else if (ctxt->d & MemAbs) |
3538 | rc = decode_abs(ctxt, ops, &memop); | 3508 | rc = decode_abs(ctxt, &memop); |
3539 | if (rc != X86EMUL_CONTINUE) | 3509 | if (rc != X86EMUL_CONTINUE) |
3540 | goto done; | 3510 | goto done; |
3541 | 3511 | ||
3542 | if (!c->has_seg_override) | 3512 | if (!ctxt->has_seg_override) |
3543 | set_seg_override(c, VCPU_SREG_DS); | 3513 | set_seg_override(ctxt, VCPU_SREG_DS); |
3544 | 3514 | ||
3545 | memop.addr.mem.seg = seg_override(ctxt, c); | 3515 | memop.addr.mem.seg = seg_override(ctxt); |
3546 | 3516 | ||
3547 | if (memop.type == OP_MEM && c->ad_bytes != 8) | 3517 | if (memop.type == OP_MEM && ctxt->ad_bytes != 8) |
3548 | memop.addr.mem.ea = (u32)memop.addr.mem.ea; | 3518 | memop.addr.mem.ea = (u32)memop.addr.mem.ea; |
3549 | 3519 | ||
3550 | /* | 3520 | /* |
3551 | * Decode and fetch the source operand: register, memory | 3521 | * Decode and fetch the source operand: register, memory |
3552 | * or immediate. | 3522 | * or immediate. |
3553 | */ | 3523 | */ |
3554 | switch (c->d & SrcMask) { | 3524 | switch (ctxt->d & SrcMask) { |
3555 | case SrcNone: | 3525 | case SrcNone: |
3556 | break; | 3526 | break; |
3557 | case SrcReg: | 3527 | case SrcReg: |
3558 | decode_register_operand(ctxt, &c->src, c, 0); | 3528 | decode_register_operand(ctxt, &ctxt->src, 0); |
3559 | break; | 3529 | break; |
3560 | case SrcMem16: | 3530 | case SrcMem16: |
3561 | memop.bytes = 2; | 3531 | memop.bytes = 2; |
@@ -3564,60 +3534,60 @@ done_prefixes: | |||
3564 | memop.bytes = 4; | 3534 | memop.bytes = 4; |
3565 | goto srcmem_common; | 3535 | goto srcmem_common; |
3566 | case SrcMem: | 3536 | case SrcMem: |
3567 | memop.bytes = (c->d & ByteOp) ? 1 : | 3537 | memop.bytes = (ctxt->d & ByteOp) ? 1 : |
3568 | c->op_bytes; | 3538 | ctxt->op_bytes; |
3569 | srcmem_common: | 3539 | srcmem_common: |
3570 | c->src = memop; | 3540 | ctxt->src = memop; |
3571 | memopp = &c->src; | 3541 | memopp = &ctxt->src; |
3572 | break; | 3542 | break; |
3573 | case SrcImmU16: | 3543 | case SrcImmU16: |
3574 | rc = decode_imm(ctxt, &c->src, 2, false); | 3544 | rc = decode_imm(ctxt, &ctxt->src, 2, false); |
3575 | break; | 3545 | break; |
3576 | case SrcImm: | 3546 | case SrcImm: |
3577 | rc = decode_imm(ctxt, &c->src, imm_size(c), true); | 3547 | rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true); |
3578 | break; | 3548 | break; |
3579 | case SrcImmU: | 3549 | case SrcImmU: |
3580 | rc = decode_imm(ctxt, &c->src, imm_size(c), false); | 3550 | rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false); |
3581 | break; | 3551 | break; |
3582 | case SrcImmByte: | 3552 | case SrcImmByte: |
3583 | rc = decode_imm(ctxt, &c->src, 1, true); | 3553 | rc = decode_imm(ctxt, &ctxt->src, 1, true); |
3584 | break; | 3554 | break; |
3585 | case SrcImmUByte: | 3555 | case SrcImmUByte: |
3586 | rc = decode_imm(ctxt, &c->src, 1, false); | 3556 | rc = decode_imm(ctxt, &ctxt->src, 1, false); |
3587 | break; | 3557 | break; |
3588 | case SrcAcc: | 3558 | case SrcAcc: |
3589 | c->src.type = OP_REG; | 3559 | ctxt->src.type = OP_REG; |
3590 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 3560 | ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3591 | c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; | 3561 | ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX]; |
3592 | fetch_register_operand(&c->src); | 3562 | fetch_register_operand(&ctxt->src); |
3593 | break; | 3563 | break; |
3594 | case SrcOne: | 3564 | case SrcOne: |
3595 | c->src.bytes = 1; | 3565 | ctxt->src.bytes = 1; |
3596 | c->src.val = 1; | 3566 | ctxt->src.val = 1; |
3597 | break; | 3567 | break; |
3598 | case SrcSI: | 3568 | case SrcSI: |
3599 | c->src.type = OP_MEM; | 3569 | ctxt->src.type = OP_MEM; |
3600 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 3570 | ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3601 | c->src.addr.mem.ea = | 3571 | ctxt->src.addr.mem.ea = |
3602 | register_address(c, c->regs[VCPU_REGS_RSI]); | 3572 | register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); |
3603 | c->src.addr.mem.seg = seg_override(ctxt, c); | 3573 | ctxt->src.addr.mem.seg = seg_override(ctxt); |
3604 | c->src.val = 0; | 3574 | ctxt->src.val = 0; |
3605 | break; | 3575 | break; |
3606 | case SrcImmFAddr: | 3576 | case SrcImmFAddr: |
3607 | c->src.type = OP_IMM; | 3577 | ctxt->src.type = OP_IMM; |
3608 | c->src.addr.mem.ea = c->eip; | 3578 | ctxt->src.addr.mem.ea = ctxt->_eip; |
3609 | c->src.bytes = c->op_bytes + 2; | 3579 | ctxt->src.bytes = ctxt->op_bytes + 2; |
3610 | insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); | 3580 | insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip); |
3611 | break; | 3581 | break; |
3612 | case SrcMemFAddr: | 3582 | case SrcMemFAddr: |
3613 | memop.bytes = c->op_bytes + 2; | 3583 | memop.bytes = ctxt->op_bytes + 2; |
3614 | goto srcmem_common; | 3584 | goto srcmem_common; |
3615 | break; | 3585 | break; |
3616 | case SrcDX: | 3586 | case SrcDX: |
3617 | c->src.type = OP_REG; | 3587 | ctxt->src.type = OP_REG; |
3618 | c->src.bytes = 2; | 3588 | ctxt->src.bytes = 2; |
3619 | c->src.addr.reg = &c->regs[VCPU_REGS_RDX]; | 3589 | ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; |
3620 | fetch_register_operand(&c->src); | 3590 | fetch_register_operand(&ctxt->src); |
3621 | break; | 3591 | break; |
3622 | } | 3592 | } |
3623 | 3593 | ||
@@ -3628,22 +3598,22 @@ done_prefixes: | |||
3628 | * Decode and fetch the second source operand: register, memory | 3598 | * Decode and fetch the second source operand: register, memory |
3629 | * or immediate. | 3599 | * or immediate. |
3630 | */ | 3600 | */ |
3631 | switch (c->d & Src2Mask) { | 3601 | switch (ctxt->d & Src2Mask) { |
3632 | case Src2None: | 3602 | case Src2None: |
3633 | break; | 3603 | break; |
3634 | case Src2CL: | 3604 | case Src2CL: |
3635 | c->src2.bytes = 1; | 3605 | ctxt->src2.bytes = 1; |
3636 | c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; | 3606 | ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0x8; |
3637 | break; | 3607 | break; |
3638 | case Src2ImmByte: | 3608 | case Src2ImmByte: |
3639 | rc = decode_imm(ctxt, &c->src2, 1, true); | 3609 | rc = decode_imm(ctxt, &ctxt->src2, 1, true); |
3640 | break; | 3610 | break; |
3641 | case Src2One: | 3611 | case Src2One: |
3642 | c->src2.bytes = 1; | 3612 | ctxt->src2.bytes = 1; |
3643 | c->src2.val = 1; | 3613 | ctxt->src2.val = 1; |
3644 | break; | 3614 | break; |
3645 | case Src2Imm: | 3615 | case Src2Imm: |
3646 | rc = decode_imm(ctxt, &c->src2, imm_size(c), true); | 3616 | rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true); |
3647 | break; | 3617 | break; |
3648 | } | 3618 | } |
3649 | 3619 | ||
@@ -3651,68 +3621,66 @@ done_prefixes: | |||
3651 | goto done; | 3621 | goto done; |
3652 | 3622 | ||
3653 | /* Decode and fetch the destination operand: register or memory. */ | 3623 | /* Decode and fetch the destination operand: register or memory. */ |
3654 | switch (c->d & DstMask) { | 3624 | switch (ctxt->d & DstMask) { |
3655 | case DstReg: | 3625 | case DstReg: |
3656 | decode_register_operand(ctxt, &c->dst, c, | 3626 | decode_register_operand(ctxt, &ctxt->dst, |
3657 | c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); | 3627 | ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); |
3658 | break; | 3628 | break; |
3659 | case DstImmUByte: | 3629 | case DstImmUByte: |
3660 | c->dst.type = OP_IMM; | 3630 | ctxt->dst.type = OP_IMM; |
3661 | c->dst.addr.mem.ea = c->eip; | 3631 | ctxt->dst.addr.mem.ea = ctxt->_eip; |
3662 | c->dst.bytes = 1; | 3632 | ctxt->dst.bytes = 1; |
3663 | c->dst.val = insn_fetch(u8, 1, c->eip); | 3633 | ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip); |
3664 | break; | 3634 | break; |
3665 | case DstMem: | 3635 | case DstMem: |
3666 | case DstMem64: | 3636 | case DstMem64: |
3667 | c->dst = memop; | 3637 | ctxt->dst = memop; |
3668 | memopp = &c->dst; | 3638 | memopp = &ctxt->dst; |
3669 | if ((c->d & DstMask) == DstMem64) | 3639 | if ((ctxt->d & DstMask) == DstMem64) |
3670 | c->dst.bytes = 8; | 3640 | ctxt->dst.bytes = 8; |
3671 | else | 3641 | else |
3672 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 3642 | ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3673 | if (c->d & BitOp) | 3643 | if (ctxt->d & BitOp) |
3674 | fetch_bit_operand(c); | 3644 | fetch_bit_operand(ctxt); |
3675 | c->dst.orig_val = c->dst.val; | 3645 | ctxt->dst.orig_val = ctxt->dst.val; |
3676 | break; | 3646 | break; |
3677 | case DstAcc: | 3647 | case DstAcc: |
3678 | c->dst.type = OP_REG; | 3648 | ctxt->dst.type = OP_REG; |
3679 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 3649 | ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3680 | c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; | 3650 | ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX]; |
3681 | fetch_register_operand(&c->dst); | 3651 | fetch_register_operand(&ctxt->dst); |
3682 | c->dst.orig_val = c->dst.val; | 3652 | ctxt->dst.orig_val = ctxt->dst.val; |
3683 | break; | 3653 | break; |
3684 | case DstDI: | 3654 | case DstDI: |
3685 | c->dst.type = OP_MEM; | 3655 | ctxt->dst.type = OP_MEM; |
3686 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 3656 | ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; |
3687 | c->dst.addr.mem.ea = | 3657 | ctxt->dst.addr.mem.ea = |
3688 | register_address(c, c->regs[VCPU_REGS_RDI]); | 3658 | register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); |
3689 | c->dst.addr.mem.seg = VCPU_SREG_ES; | 3659 | ctxt->dst.addr.mem.seg = VCPU_SREG_ES; |
3690 | c->dst.val = 0; | 3660 | ctxt->dst.val = 0; |
3691 | break; | 3661 | break; |
3692 | case DstDX: | 3662 | case DstDX: |
3693 | c->dst.type = OP_REG; | 3663 | ctxt->dst.type = OP_REG; |
3694 | c->dst.bytes = 2; | 3664 | ctxt->dst.bytes = 2; |
3695 | c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; | 3665 | ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; |
3696 | fetch_register_operand(&c->dst); | 3666 | fetch_register_operand(&ctxt->dst); |
3697 | break; | 3667 | break; |
3698 | case ImplicitOps: | 3668 | case ImplicitOps: |
3699 | /* Special instructions do their own operand decoding. */ | 3669 | /* Special instructions do their own operand decoding. */ |
3700 | default: | 3670 | default: |
3701 | c->dst.type = OP_NONE; /* Disable writeback. */ | 3671 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ |
3702 | break; | 3672 | break; |
3703 | } | 3673 | } |
3704 | 3674 | ||
3705 | done: | 3675 | done: |
3706 | if (memopp && memopp->type == OP_MEM && c->rip_relative) | 3676 | if (memopp && memopp->type == OP_MEM && ctxt->rip_relative) |
3707 | memopp->addr.mem.ea += c->eip; | 3677 | memopp->addr.mem.ea += ctxt->_eip; |
3708 | 3678 | ||
3709 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; | 3679 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; |
3710 | } | 3680 | } |
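The fixup under done: is worth a second look: a RIP-relative operand is encoded relative to the end of the instruction, but the displacement is decoded before the immediates are, so the effective address is kept displacement-only until _eip has advanced past the whole instruction, and only then is the base added in. In other words:

    /* RIP-relative resolves against the *next* instruction pointer. */
    static unsigned long riprel_ea(unsigned long insn_start,
                                   unsigned int insn_len, long disp32)
    {
        return insn_start + insn_len + disp32;
    }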
3711 | 3681 | ||
3712 | static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) | 3682 | static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) |
3713 | { | 3683 | { |
3714 | struct decode_cache *c = &ctxt->decode; | ||
3715 | |||
3716 | /* The second termination condition only applies for REPE | 3684 | /* The second termination condition only applies for REPE |
3717 | * and REPNE. Test if the repeat string operation prefix is | 3685 | * and REPNE. Test if the repeat string operation prefix is |
3718 | * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the | 3686 | * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the |
@@ -3720,304 +3688,232 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) | |||
3720 | * - if REPE/REPZ and ZF = 0 then done | 3688 | * - if REPE/REPZ and ZF = 0 then done |
3721 | * - if REPNE/REPNZ and ZF = 1 then done | 3689 | * - if REPNE/REPNZ and ZF = 1 then done |
3722 | */ | 3690 | */ |
3723 | if (((c->b == 0xa6) || (c->b == 0xa7) || | 3691 | if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) || |
3724 | (c->b == 0xae) || (c->b == 0xaf)) | 3692 | (ctxt->b == 0xae) || (ctxt->b == 0xaf)) |
3725 | && (((c->rep_prefix == REPE_PREFIX) && | 3693 | && (((ctxt->rep_prefix == REPE_PREFIX) && |
3726 | ((ctxt->eflags & EFLG_ZF) == 0)) | 3694 | ((ctxt->eflags & EFLG_ZF) == 0)) |
3727 | || ((c->rep_prefix == REPNE_PREFIX) && | 3695 | || ((ctxt->rep_prefix == REPNE_PREFIX) && |
3728 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) | 3696 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) |
3729 | return true; | 3697 | return true; |
3730 | 3698 | ||
3731 | return false; | 3699 | return false; |
3732 | } | 3700 | } |
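Only CMPS (0xa6/0xa7) and SCAS (0xae/0xaf) honor this ZF-based early exit; MOVS, LODS, STOS and the I/O string ops repeat purely on RCX even when encoded with 0xf2/0xf3. Condensed into a predicate (sketch, assuming REPE_PREFIX == 0xf3 and REPNE_PREFIX == 0xf2):

    static int repe_repne_done(unsigned char b, unsigned char rep, int zf)
    {
        int cmps_or_scas = ((b | 1) == 0xa7) || ((b | 1) == 0xaf);

        if (!cmps_or_scas)
            return 0;
        return (rep == 0xf3 && !zf) ||  /* REPE/REPZ stops on ZF == 0 */
               (rep == 0xf2 && zf);     /* REPNE/REPNZ stops on ZF == 1 */
    }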
3733 | 3701 | ||
3734 | int | 3702 | int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) |
3735 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | ||
3736 | { | 3703 | { |
3737 | struct x86_emulate_ops *ops = ctxt->ops; | 3704 | struct x86_emulate_ops *ops = ctxt->ops; |
3738 | u64 msr_data; | 3705 | u64 msr_data; |
3739 | struct decode_cache *c = &ctxt->decode; | ||
3740 | int rc = X86EMUL_CONTINUE; | 3706 | int rc = X86EMUL_CONTINUE; |
3741 | int saved_dst_type = c->dst.type; | 3707 | int saved_dst_type = ctxt->dst.type; |
3742 | int irq; /* Used for int 3, int, and into */ | ||
3743 | 3708 | ||
3744 | ctxt->decode.mem_read.pos = 0; | 3709 | ctxt->mem_read.pos = 0; |
3745 | 3710 | ||
3746 | if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { | 3711 | if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) { |
3747 | rc = emulate_ud(ctxt); | 3712 | rc = emulate_ud(ctxt); |
3748 | goto done; | 3713 | goto done; |
3749 | } | 3714 | } |
3750 | 3715 | ||
3751 | /* LOCK prefix is allowed only with some instructions */ | 3716 | /* LOCK prefix is allowed only with some instructions */ |
3752 | if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { | 3717 | if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) { |
3753 | rc = emulate_ud(ctxt); | 3718 | rc = emulate_ud(ctxt); |
3754 | goto done; | 3719 | goto done; |
3755 | } | 3720 | } |
3756 | 3721 | ||
3757 | if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { | 3722 | if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) { |
3758 | rc = emulate_ud(ctxt); | 3723 | rc = emulate_ud(ctxt); |
3759 | goto done; | 3724 | goto done; |
3760 | } | 3725 | } |
3761 | 3726 | ||
3762 | if ((c->d & Sse) | 3727 | if ((ctxt->d & Sse) |
3763 | && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) | 3728 | && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) |
3764 | || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { | 3729 | || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { |
3765 | rc = emulate_ud(ctxt); | 3730 | rc = emulate_ud(ctxt); |
3766 | goto done; | 3731 | goto done; |
3767 | } | 3732 | } |
3768 | 3733 | ||
3769 | if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { | 3734 | if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { |
3770 | rc = emulate_nm(ctxt); | 3735 | rc = emulate_nm(ctxt); |
3771 | goto done; | 3736 | goto done; |
3772 | } | 3737 | } |
3773 | 3738 | ||
3774 | if (unlikely(ctxt->guest_mode) && c->intercept) { | 3739 | if (unlikely(ctxt->guest_mode) && ctxt->intercept) { |
3775 | rc = emulator_check_intercept(ctxt, c->intercept, | 3740 | rc = emulator_check_intercept(ctxt, ctxt->intercept, |
3776 | X86_ICPT_PRE_EXCEPT); | 3741 | X86_ICPT_PRE_EXCEPT); |
3777 | if (rc != X86EMUL_CONTINUE) | 3742 | if (rc != X86EMUL_CONTINUE) |
3778 | goto done; | 3743 | goto done; |
3779 | } | 3744 | } |
3780 | 3745 | ||
3781 | /* Privileged instruction can be executed only in CPL=0 */ | 3746 | /* Privileged instruction can be executed only in CPL=0 */ |
3782 | if ((c->d & Priv) && ops->cpl(ctxt)) { | 3747 | if ((ctxt->d & Priv) && ops->cpl(ctxt)) { |
3783 | rc = emulate_gp(ctxt, 0); | 3748 | rc = emulate_gp(ctxt, 0); |
3784 | goto done; | 3749 | goto done; |
3785 | } | 3750 | } |
3786 | 3751 | ||
3787 | /* Instruction can only be executed in protected mode */ | 3752 | /* Instruction can only be executed in protected mode */ |
3788 | if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { | 3753 | if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { |
3789 | rc = emulate_ud(ctxt); | 3754 | rc = emulate_ud(ctxt); |
3790 | goto done; | 3755 | goto done; |
3791 | } | 3756 | } |
3792 | 3757 | ||
3793 | /* Do instruction specific permission checks */ | 3758 | /* Do instruction specific permission checks */ |
3794 | if (c->check_perm) { | 3759 | if (ctxt->check_perm) { |
3795 | rc = c->check_perm(ctxt); | 3760 | rc = ctxt->check_perm(ctxt); |
3796 | if (rc != X86EMUL_CONTINUE) | 3761 | if (rc != X86EMUL_CONTINUE) |
3797 | goto done; | 3762 | goto done; |
3798 | } | 3763 | } |
3799 | 3764 | ||
3800 | if (unlikely(ctxt->guest_mode) && c->intercept) { | 3765 | if (unlikely(ctxt->guest_mode) && ctxt->intercept) { |
3801 | rc = emulator_check_intercept(ctxt, c->intercept, | 3766 | rc = emulator_check_intercept(ctxt, ctxt->intercept, |
3802 | X86_ICPT_POST_EXCEPT); | 3767 | X86_ICPT_POST_EXCEPT); |
3803 | if (rc != X86EMUL_CONTINUE) | 3768 | if (rc != X86EMUL_CONTINUE) |
3804 | goto done; | 3769 | goto done; |
3805 | } | 3770 | } |
3806 | 3771 | ||
3807 | if (c->rep_prefix && (c->d & String)) { | 3772 | if (ctxt->rep_prefix && (ctxt->d & String)) { |
3808 | /* All REP prefixes have the same first termination condition */ | 3773 | /* All REP prefixes have the same first termination condition */ |
3809 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { | 3774 | if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) { |
3810 | ctxt->eip = c->eip; | 3775 | ctxt->eip = ctxt->_eip; |
3811 | goto done; | 3776 | goto done; |
3812 | } | 3777 | } |
3813 | } | 3778 | } |
3814 | 3779 | ||
3815 | if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { | 3780 | if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) { |
3816 | rc = segmented_read(ctxt, c->src.addr.mem, | 3781 | rc = segmented_read(ctxt, ctxt->src.addr.mem, |
3817 | c->src.valptr, c->src.bytes); | 3782 | ctxt->src.valptr, ctxt->src.bytes); |
3818 | if (rc != X86EMUL_CONTINUE) | 3783 | if (rc != X86EMUL_CONTINUE) |
3819 | goto done; | 3784 | goto done; |
3820 | c->src.orig_val64 = c->src.val64; | 3785 | ctxt->src.orig_val64 = ctxt->src.val64; |
3821 | } | 3786 | } |
3822 | 3787 | ||
3823 | if (c->src2.type == OP_MEM) { | 3788 | if (ctxt->src2.type == OP_MEM) { |
3824 | rc = segmented_read(ctxt, c->src2.addr.mem, | 3789 | rc = segmented_read(ctxt, ctxt->src2.addr.mem, |
3825 | &c->src2.val, c->src2.bytes); | 3790 | &ctxt->src2.val, ctxt->src2.bytes); |
3826 | if (rc != X86EMUL_CONTINUE) | 3791 | if (rc != X86EMUL_CONTINUE) |
3827 | goto done; | 3792 | goto done; |
3828 | } | 3793 | } |
3829 | 3794 | ||
3830 | if ((c->d & DstMask) == ImplicitOps) | 3795 | if ((ctxt->d & DstMask) == ImplicitOps) |
3831 | goto special_insn; | 3796 | goto special_insn; |
3832 | 3797 | ||
3833 | 3798 | ||
3834 | if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { | 3799 | if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) { |
3835 | /* optimisation - avoid slow emulated read if Mov */ | 3800 | /* optimisation - avoid slow emulated read if Mov */ |
3836 | rc = segmented_read(ctxt, c->dst.addr.mem, | 3801 | rc = segmented_read(ctxt, ctxt->dst.addr.mem, |
3837 | &c->dst.val, c->dst.bytes); | 3802 | &ctxt->dst.val, ctxt->dst.bytes); |
3838 | if (rc != X86EMUL_CONTINUE) | 3803 | if (rc != X86EMUL_CONTINUE) |
3839 | goto done; | 3804 | goto done; |
3840 | } | 3805 | } |
3841 | c->dst.orig_val = c->dst.val; | 3806 | ctxt->dst.orig_val = ctxt->dst.val; |
3842 | 3807 | ||
3843 | special_insn: | 3808 | special_insn: |
3844 | 3809 | ||
3845 | if (unlikely(ctxt->guest_mode) && c->intercept) { | 3810 | if (unlikely(ctxt->guest_mode) && ctxt->intercept) { |
3846 | rc = emulator_check_intercept(ctxt, c->intercept, | 3811 | rc = emulator_check_intercept(ctxt, ctxt->intercept, |
3847 | X86_ICPT_POST_MEMACCESS); | 3812 | X86_ICPT_POST_MEMACCESS); |
3848 | if (rc != X86EMUL_CONTINUE) | 3813 | if (rc != X86EMUL_CONTINUE) |
3849 | goto done; | 3814 | goto done; |
3850 | } | 3815 | } |
3851 | 3816 | ||
3852 | if (c->execute) { | 3817 | if (ctxt->execute) { |
3853 | rc = c->execute(ctxt); | 3818 | rc = ctxt->execute(ctxt); |
3854 | if (rc != X86EMUL_CONTINUE) | 3819 | if (rc != X86EMUL_CONTINUE) |
3855 | goto done; | 3820 | goto done; |
3856 | goto writeback; | 3821 | goto writeback; |
3857 | } | 3822 | } |
3858 | 3823 | ||
3859 | if (c->twobyte) | 3824 | if (ctxt->twobyte) |
3860 | goto twobyte_insn; | 3825 | goto twobyte_insn; |
3861 | 3826 | ||
3862 | switch (c->b) { | 3827 | switch (ctxt->b) { |
3863 | case 0x06: /* push es */ | 3828 | case 0x06: /* push es */ |
3864 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); | 3829 | rc = emulate_push_sreg(ctxt, VCPU_SREG_ES); |
3865 | break; | 3830 | break; |
3866 | case 0x07: /* pop es */ | 3831 | case 0x07: /* pop es */ |
3867 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); | 3832 | rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES); |
3868 | break; | 3833 | break; |
3869 | case 0x0e: /* push cs */ | 3834 | case 0x0e: /* push cs */ |
3870 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); | 3835 | rc = emulate_push_sreg(ctxt, VCPU_SREG_CS); |
3871 | break; | 3836 | break; |
3872 | case 0x16: /* push ss */ | 3837 | case 0x16: /* push ss */ |
3873 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); | 3838 | rc = emulate_push_sreg(ctxt, VCPU_SREG_SS); |
3874 | break; | 3839 | break; |
3875 | case 0x17: /* pop ss */ | 3840 | case 0x17: /* pop ss */ |
3876 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); | 3841 | rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS); |
3877 | break; | 3842 | break; |
3878 | case 0x1e: /* push ds */ | 3843 | case 0x1e: /* push ds */ |
3879 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); | 3844 | rc = emulate_push_sreg(ctxt, VCPU_SREG_DS); |
3880 | break; | 3845 | break; |
3881 | case 0x1f: /* pop ds */ | 3846 | case 0x1f: /* pop ds */ |
3882 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); | 3847 | rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS); |
3883 | break; | 3848 | break; |
3884 | case 0x40 ... 0x47: /* inc r16/r32 */ | 3849 | case 0x40 ... 0x47: /* inc r16/r32 */ |
3885 | emulate_1op("inc", c->dst, ctxt->eflags); | 3850 | emulate_1op("inc", ctxt->dst, ctxt->eflags); |
3886 | break; | 3851 | break; |
3887 | case 0x48 ... 0x4f: /* dec r16/r32 */ | 3852 | case 0x48 ... 0x4f: /* dec r16/r32 */ |
3888 | emulate_1op("dec", c->dst, ctxt->eflags); | 3853 | emulate_1op("dec", ctxt->dst, ctxt->eflags); |
3889 | break; | 3854 | break; |
3890 | case 0x63: /* movsxd */ | 3855 | case 0x63: /* movsxd */ |
3891 | if (ctxt->mode != X86EMUL_MODE_PROT64) | 3856 | if (ctxt->mode != X86EMUL_MODE_PROT64) |
3892 | goto cannot_emulate; | 3857 | goto cannot_emulate; |
3893 | c->dst.val = (s32) c->src.val; | 3858 | ctxt->dst.val = (s32) ctxt->src.val; |
3894 | break; | 3859 | break; |
3895 | case 0x6c: /* insb */ | 3860 | case 0x6c: /* insb */ |
3896 | case 0x6d: /* insw/insd */ | 3861 | case 0x6d: /* insw/insd */ |
3897 | c->src.val = c->regs[VCPU_REGS_RDX]; | 3862 | ctxt->src.val = ctxt->regs[VCPU_REGS_RDX]; |
3898 | goto do_io_in; | 3863 | goto do_io_in; |
3899 | case 0x6e: /* outsb */ | 3864 | case 0x6e: /* outsb */ |
3900 | case 0x6f: /* outsw/outsd */ | 3865 | case 0x6f: /* outsw/outsd */ |
3901 | c->dst.val = c->regs[VCPU_REGS_RDX]; | 3866 | ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX]; |
3902 | goto do_io_out; | 3867 | goto do_io_out; |
3903 | break; | 3868 | break; |
3904 | case 0x70 ... 0x7f: /* jcc (short) */ | 3869 | case 0x70 ... 0x7f: /* jcc (short) */ |
3905 | if (test_cc(c->b, ctxt->eflags)) | 3870 | if (test_cc(ctxt->b, ctxt->eflags)) |
3906 | jmp_rel(c, c->src.val); | 3871 | jmp_rel(ctxt, ctxt->src.val); |
3907 | break; | ||
3908 | case 0x84 ... 0x85: | ||
3909 | test: | ||
3910 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); | ||
3911 | break; | ||
3912 | case 0x86 ... 0x87: /* xchg */ | ||
3913 | xchg: | ||
3914 | /* Write back the register source. */ | ||
3915 | c->src.val = c->dst.val; | ||
3916 | write_register_operand(&c->src); | ||
3917 | /* | ||
3918 | * Write back the memory destination with implicit LOCK | ||
3919 | * prefix. | ||
3920 | */ | ||
3921 | c->dst.val = c->src.orig_val; | ||
3922 | c->lock_prefix = 1; | ||
3923 | break; | ||
3924 | case 0x8c: /* mov r/m, sreg */ | ||
3925 | if (c->modrm_reg > VCPU_SREG_GS) { | ||
3926 | rc = emulate_ud(ctxt); | ||
3927 | goto done; | ||
3928 | } | ||
3929 | c->dst.val = get_segment_selector(ctxt, c->modrm_reg); | ||
3930 | break; | 3872 | break; |
3931 | case 0x8d: /* lea r16/r32, m */ | 3873 | case 0x8d: /* lea r16/r32, m */ |
3932 | c->dst.val = c->src.addr.mem.ea; | 3874 | ctxt->dst.val = ctxt->src.addr.mem.ea; |
3933 | break; | 3875 | break; |
3934 | case 0x8e: { /* mov seg, r/m16 */ | ||
3935 | uint16_t sel; | ||
3936 | |||
3937 | sel = c->src.val; | ||
3938 | |||
3939 | if (c->modrm_reg == VCPU_SREG_CS || | ||
3940 | c->modrm_reg > VCPU_SREG_GS) { | ||
3941 | rc = emulate_ud(ctxt); | ||
3942 | goto done; | ||
3943 | } | ||
3944 | |||
3945 | if (c->modrm_reg == VCPU_SREG_SS) | ||
3946 | ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; | ||
3947 | |||
3948 | rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); | ||
3949 | |||
3950 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
3951 | break; | ||
3952 | } | ||
3953 | case 0x8f: /* pop (sole member of Grp1a) */ | 3876 | case 0x8f: /* pop (sole member of Grp1a) */ |
3954 | rc = em_grp1a(ctxt); | 3877 | rc = em_grp1a(ctxt); |
3955 | break; | 3878 | break; |
3956 | case 0x90 ... 0x97: /* nop / xchg reg, rax */ | 3879 | case 0x90 ... 0x97: /* nop / xchg reg, rax */ |
3957 | if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) | 3880 | if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) |
3958 | break; | 3881 | break; |
3959 | goto xchg; | 3882 | rc = em_xchg(ctxt); |
3883 | break; | ||
3960 | case 0x98: /* cbw/cwde/cdqe */ | 3884 | case 0x98: /* cbw/cwde/cdqe */ |
3961 | switch (c->op_bytes) { | 3885 | switch (ctxt->op_bytes) { |
3962 | case 2: c->dst.val = (s8)c->dst.val; break; | 3886 | case 2: ctxt->dst.val = (s8)ctxt->dst.val; break; |
3963 | case 4: c->dst.val = (s16)c->dst.val; break; | 3887 | case 4: ctxt->dst.val = (s16)ctxt->dst.val; break; |
3964 | case 8: c->dst.val = (s32)c->dst.val; break; | 3888 | case 8: ctxt->dst.val = (s32)ctxt->dst.val; break; |
3965 | } | 3889 | } |
3966 | break; | 3890 | break; |
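The 0x98 case sign-extends the low half of the accumulator into the full operand width; note that the cast is to the half-width signed type, and the normal write-back then widens it. Ignoring the emulator's partial-register write-back rules, the transform is:

    #include <stdint.h>

    /* cbw/cwde/cdqe by operand size (simplified; upper-register
     * preservation for 16-bit ops is handled by the write-back path). */
    static uint64_t cbw_family(uint64_t rax, int op_bytes)
    {
        switch (op_bytes) {
        case 2: return (uint16_t)(int8_t)rax;   /* cbw:  AL  -> AX  */
        case 4: return (uint32_t)(int16_t)rax;  /* cwde: AX  -> EAX */
        case 8: return (uint64_t)(int32_t)rax;  /* cdqe: EAX -> RAX */
        }
        return rax;
    }
    /* e.g. cwde with AX = 0xff80: (int16_t)0xff80 == -128 -> EAX = 0xffffff80 */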
3967 | case 0xa8 ... 0xa9: /* test ax, imm */ | ||
3968 | goto test; | ||
3969 | case 0xc0 ... 0xc1: | 3891 | case 0xc0 ... 0xc1: |
3970 | rc = em_grp2(ctxt); | 3892 | rc = em_grp2(ctxt); |
3971 | break; | 3893 | break; |
3972 | case 0xc3: /* ret */ | ||
3973 | c->dst.type = OP_REG; | ||
3974 | c->dst.addr.reg = &c->eip; | ||
3975 | c->dst.bytes = c->op_bytes; | ||
3976 | rc = em_pop(ctxt); | ||
3977 | break; | ||
3978 | case 0xc4: /* les */ | 3894 | case 0xc4: /* les */ |
3979 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); | 3895 | rc = emulate_load_segment(ctxt, VCPU_SREG_ES); |
3980 | break; | 3896 | break; |
3981 | case 0xc5: /* lds */ | 3897 | case 0xc5: /* lds */ |
3982 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); | 3898 | rc = emulate_load_segment(ctxt, VCPU_SREG_DS); |
3983 | break; | ||
3984 | case 0xcb: /* ret far */ | ||
3985 | rc = emulate_ret_far(ctxt, ops); | ||
3986 | break; | 3899 | break; |
3987 | case 0xcc: /* int3 */ | 3900 | case 0xcc: /* int3 */ |
3988 | irq = 3; | 3901 | rc = emulate_int(ctxt, 3); |
3989 | goto do_interrupt; | 3902 | break; |
3990 | case 0xcd: /* int n */ | 3903 | case 0xcd: /* int n */ |
3991 | irq = c->src.val; | 3904 | rc = emulate_int(ctxt, ctxt->src.val); |
3992 | do_interrupt: | ||
3993 | rc = emulate_int(ctxt, ops, irq); | ||
3994 | break; | 3905 | break; |
3995 | case 0xce: /* into */ | 3906 | case 0xce: /* into */ |
3996 | if (ctxt->eflags & EFLG_OF) { | 3907 | if (ctxt->eflags & EFLG_OF) |
3997 | irq = 4; | 3908 | rc = emulate_int(ctxt, 4); |
3998 | goto do_interrupt; | ||
3999 | } | ||
4000 | break; | ||
4001 | case 0xcf: /* iret */ | ||
4002 | rc = emulate_iret(ctxt, ops); | ||
4003 | break; | 3909 | break; |
4004 | case 0xd0 ... 0xd1: /* Grp2 */ | 3910 | case 0xd0 ... 0xd1: /* Grp2 */ |
4005 | rc = em_grp2(ctxt); | 3911 | rc = em_grp2(ctxt); |
4006 | break; | 3912 | break; |
4007 | case 0xd2 ... 0xd3: /* Grp2 */ | 3913 | case 0xd2 ... 0xd3: /* Grp2 */ |
4008 | c->src.val = c->regs[VCPU_REGS_RCX]; | 3914 | ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; |
4009 | rc = em_grp2(ctxt); | 3915 | rc = em_grp2(ctxt); |
4010 | break; | 3916 | break; |
4011 | case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ | ||
4012 | register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); | ||
4013 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 && | ||
4014 | (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) | ||
4015 | jmp_rel(c, c->src.val); | ||
4016 | break; | ||
4017 | case 0xe3: /* jcxz/jecxz/jrcxz */ | ||
4018 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) | ||
4019 | jmp_rel(c, c->src.val); | ||
4020 | break; | ||
4021 | case 0xe4: /* inb */ | 3917 | case 0xe4: /* inb */ |
4022 | case 0xe5: /* in */ | 3918 | case 0xe5: /* in */ |
4023 | goto do_io_in; | 3919 | goto do_io_in; |
@@ -4025,35 +3921,30 @@ special_insn: | |||
4025 | case 0xe7: /* out */ | 3921 | case 0xe7: /* out */ |
4026 | goto do_io_out; | 3922 | goto do_io_out; |
4027 | case 0xe8: /* call (near) */ { | 3923 | case 0xe8: /* call (near) */ { |
4028 | long int rel = c->src.val; | 3924 | long int rel = ctxt->src.val; |
4029 | c->src.val = (unsigned long) c->eip; | 3925 | ctxt->src.val = (unsigned long) ctxt->_eip; |
4030 | jmp_rel(c, rel); | 3926 | jmp_rel(ctxt, rel); |
4031 | rc = em_push(ctxt); | 3927 | rc = em_push(ctxt); |
4032 | break; | 3928 | break; |
4033 | } | 3929 | } |
4034 | case 0xe9: /* jmp rel */ | 3930 | case 0xe9: /* jmp rel */ |
4035 | goto jmp; | 3931 | case 0xeb: /* jmp rel short */ |
4036 | case 0xea: /* jmp far */ | 3932 | jmp_rel(ctxt, ctxt->src.val); |
4037 | rc = em_jmp_far(ctxt); | 3933 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ |
4038 | break; | ||
4039 | case 0xeb: | ||
4040 | jmp: /* jmp rel short */ | ||
4041 | jmp_rel(c, c->src.val); | ||
4042 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
4043 | break; | 3934 | break; |
4044 | case 0xec: /* in al,dx */ | 3935 | case 0xec: /* in al,dx */ |
4045 | case 0xed: /* in (e/r)ax,dx */ | 3936 | case 0xed: /* in (e/r)ax,dx */ |
4046 | do_io_in: | 3937 | do_io_in: |
4047 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, | 3938 | if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val, |
4048 | &c->dst.val)) | 3939 | &ctxt->dst.val)) |
4049 | goto done; /* IO is needed */ | 3940 | goto done; /* IO is needed */ |
4050 | break; | 3941 | break; |
4051 | case 0xee: /* out dx,al */ | 3942 | case 0xee: /* out dx,al */ |
4052 | case 0xef: /* out dx,(e/r)ax */ | 3943 | case 0xef: /* out dx,(e/r)ax */ |
4053 | do_io_out: | 3944 | do_io_out: |
4054 | ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val, | 3945 | ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val, |
4055 | &c->src.val, 1); | 3946 | &ctxt->src.val, 1); |
4056 | c->dst.type = OP_NONE; /* Disable writeback. */ | 3947 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ |
4057 | break; | 3948 | break; |
4058 | case 0xf4: /* hlt */ | 3949 | case 0xf4: /* hlt */ |
4059 | ctxt->ops->halt(ctxt); | 3950 | ctxt->ops->halt(ctxt); |
@@ -4071,22 +3962,6 @@ special_insn: | |||
4071 | case 0xf9: /* stc */ | 3962 | case 0xf9: /* stc */ |
4072 | ctxt->eflags |= EFLG_CF; | 3963 | ctxt->eflags |= EFLG_CF; |
4073 | break; | 3964 | break; |
4074 | case 0xfa: /* cli */ | ||
4075 | if (emulator_bad_iopl(ctxt, ops)) { | ||
4076 | rc = emulate_gp(ctxt, 0); | ||
4077 | goto done; | ||
4078 | } else | ||
4079 | ctxt->eflags &= ~X86_EFLAGS_IF; | ||
4080 | break; | ||
4081 | case 0xfb: /* sti */ | ||
4082 | if (emulator_bad_iopl(ctxt, ops)) { | ||
4083 | rc = emulate_gp(ctxt, 0); | ||
4084 | goto done; | ||
4085 | } else { | ||
4086 | ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; | ||
4087 | ctxt->eflags |= X86_EFLAGS_IF; | ||
4088 | } | ||
4089 | break; | ||
4090 | case 0xfc: /* cld */ | 3965 | case 0xfc: /* cld */ |
4091 | ctxt->eflags &= ~EFLG_DF; | 3966 | ctxt->eflags &= ~EFLG_DF; |
4092 | break; | 3967 | break; |
@@ -4115,40 +3990,40 @@ writeback: | |||
4115 | * restore dst type in case the decoding will be reused | 3990 | * restore dst type in case the decoding will be reused |
4116 | * (happens for string instructions) | 3991 | * (happens for string instructions) |
4117 | */ | 3992 | */ |
4118 | c->dst.type = saved_dst_type; | 3993 | ctxt->dst.type = saved_dst_type; |
4119 | 3994 | ||
4120 | if ((c->d & SrcMask) == SrcSI) | 3995 | if ((ctxt->d & SrcMask) == SrcSI) |
4121 | string_addr_inc(ctxt, seg_override(ctxt, c), | 3996 | string_addr_inc(ctxt, seg_override(ctxt), |
4122 | VCPU_REGS_RSI, &c->src); | 3997 | VCPU_REGS_RSI, &ctxt->src); |
4123 | 3998 | ||
4124 | if ((c->d & DstMask) == DstDI) | 3999 | if ((ctxt->d & DstMask) == DstDI) |
4125 | string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, | 4000 | string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, |
4126 | &c->dst); | 4001 | &ctxt->dst); |
4127 | 4002 | ||
4128 | if (c->rep_prefix && (c->d & String)) { | 4003 | if (ctxt->rep_prefix && (ctxt->d & String)) { |
4129 | struct read_cache *r = &ctxt->decode.io_read; | 4004 | struct read_cache *r = &ctxt->io_read; |
4130 | register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); | 4005 | register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); |
4131 | 4006 | ||
4132 | if (!string_insn_completed(ctxt)) { | 4007 | if (!string_insn_completed(ctxt)) { |
4133 | /* | 4008 | /* |
4134 | * Re-enter guest when pio read ahead buffer is empty | 4009 | * Re-enter guest when pio read ahead buffer is empty |
4135 | * or, if it is not used, after each 1024 iteration. | 4010 | * or, if it is not used, after each 1024 iteration. |
4136 | */ | 4011 | */ |
4137 | if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && | 4012 | if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) && |
4138 | (r->end == 0 || r->end != r->pos)) { | 4013 | (r->end == 0 || r->end != r->pos)) { |
4139 | /* | 4014 | /* |
4140 | * Reset read cache. Usually happens before | 4015 | * Reset read cache. Usually happens before |
4141 | * decode, but since instruction is restarted | 4016 | * decode, but since instruction is restarted |
4142 | * we have to do it here. | 4017 | * we have to do it here. |
4143 | */ | 4018 | */ |
4144 | ctxt->decode.mem_read.end = 0; | 4019 | ctxt->mem_read.end = 0; |
4145 | return EMULATION_RESTART; | 4020 | return EMULATION_RESTART; |
4146 | } | 4021 | } |
4147 | goto done; /* skip rip writeback */ | 4022 | goto done; /* skip rip writeback */ |
4148 | } | 4023 | } |
4149 | } | 4024 | } |
4150 | 4025 | ||
4151 | ctxt->eip = c->eip; | 4026 | ctxt->eip = ctxt->_eip; |
4152 | 4027 | ||
4153 | done: | 4028 | done: |
4154 | if (rc == X86EMUL_PROPAGATE_FAULT) | 4029 | if (rc == X86EMUL_PROPAGATE_FAULT) |
@@ -4159,13 +4034,7 @@ done: | |||
4159 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; | 4034 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; |
4160 | 4035 | ||
4161 | twobyte_insn: | 4036 | twobyte_insn: |
4162 | switch (c->b) { | 4037 | switch (ctxt->b) { |
4163 | case 0x05: /* syscall */ | ||
4164 | rc = emulate_syscall(ctxt, ops); | ||
4165 | break; | ||
4166 | case 0x06: | ||
4167 | rc = em_clts(ctxt); | ||
4168 | break; | ||
4169 | case 0x09: /* wbinvd */ | 4038 | case 0x09: /* wbinvd */ |
4170 | (ctxt->ops->wbinvd)(ctxt); | 4039 | (ctxt->ops->wbinvd)(ctxt); |
4171 | break; | 4040 | break; |
@@ -4174,21 +4043,21 @@ twobyte_insn: | |||
4174 | case 0x18: /* Grp16 (prefetch/nop) */ | 4043 | case 0x18: /* Grp16 (prefetch/nop) */ |
4175 | break; | 4044 | break; |
4176 | case 0x20: /* mov cr, reg */ | 4045 | case 0x20: /* mov cr, reg */ |
4177 | c->dst.val = ops->get_cr(ctxt, c->modrm_reg); | 4046 | ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg); |
4178 | break; | 4047 | break; |
4179 | case 0x21: /* mov from dr to reg */ | 4048 | case 0x21: /* mov from dr to reg */ |
4180 | ops->get_dr(ctxt, c->modrm_reg, &c->dst.val); | 4049 | ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); |
4181 | break; | 4050 | break; |
4182 | case 0x22: /* mov reg, cr */ | 4051 | case 0x22: /* mov reg, cr */ |
4183 | if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) { | 4052 | if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) { |
4184 | emulate_gp(ctxt, 0); | 4053 | emulate_gp(ctxt, 0); |
4185 | rc = X86EMUL_PROPAGATE_FAULT; | 4054 | rc = X86EMUL_PROPAGATE_FAULT; |
4186 | goto done; | 4055 | goto done; |
4187 | } | 4056 | } |
4188 | c->dst.type = OP_NONE; | 4057 | ctxt->dst.type = OP_NONE; |
4189 | break; | 4058 | break; |
4190 | case 0x23: /* mov from reg to dr */ | 4059 | case 0x23: /* mov from reg to dr */ |
4191 | if (ops->set_dr(ctxt, c->modrm_reg, c->src.val & | 4060 | if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val & |
4192 | ((ctxt->mode == X86EMUL_MODE_PROT64) ? | 4061 | ((ctxt->mode == X86EMUL_MODE_PROT64) ? |
4193 | ~0ULL : ~0U)) < 0) { | 4062 | ~0ULL : ~0U)) < 0) { |
4194 | /* #UD condition is already handled by the code above */ | 4063 | /* #UD condition is already handled by the code above */ |
@@ -4197,13 +4066,13 @@ twobyte_insn: | |||
4197 | goto done; | 4066 | goto done; |
4198 | } | 4067 | } |
4199 | 4068 | ||
4200 | c->dst.type = OP_NONE; /* no writeback */ | 4069 | ctxt->dst.type = OP_NONE; /* no writeback */ |
4201 | break; | 4070 | break; |
4202 | case 0x30: | 4071 | case 0x30: |
4203 | /* wrmsr */ | 4072 | /* wrmsr */ |
4204 | msr_data = (u32)c->regs[VCPU_REGS_RAX] | 4073 | msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] |
4205 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); | 4074 | | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); |
4206 | if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) { | 4075 | if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) { |
4207 | emulate_gp(ctxt, 0); | 4076 | emulate_gp(ctxt, 0); |
4208 | rc = X86EMUL_PROPAGATE_FAULT; | 4077 | rc = X86EMUL_PROPAGATE_FAULT; |
4209 | goto done; | 4078 | goto done; |
@@ -4212,64 +4081,58 @@ twobyte_insn: | |||
4212 | break; | 4081 | break; |
4213 | case 0x32: | 4082 | case 0x32: |
4214 | /* rdmsr */ | 4083 | /* rdmsr */ |
4215 | if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) { | 4084 | if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) { |
4216 | emulate_gp(ctxt, 0); | 4085 | emulate_gp(ctxt, 0); |
4217 | rc = X86EMUL_PROPAGATE_FAULT; | 4086 | rc = X86EMUL_PROPAGATE_FAULT; |
4218 | goto done; | 4087 | goto done; |
4219 | } else { | 4088 | } else { |
4220 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; | 4089 | ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; |
4221 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; | 4090 | ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; |
4222 | } | 4091 | } |
4223 | rc = X86EMUL_CONTINUE; | 4092 | rc = X86EMUL_CONTINUE; |
4224 | break; | 4093 | break; |
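
For illustration (not part of this patch), the EDX:EAX convention used by the wrmsr/rdmsr cases above, as a self-contained sketch:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t rax = 0x89abcdef;	/* low half, as for wrmsr */
	uint64_t rdx = 0x01234567;	/* high half */

	/* wrmsr: assemble the 64-bit MSR value from EDX:EAX. */
	uint64_t msr_data = (uint32_t)rax | ((uint64_t)rdx << 32);
	assert(msr_data == 0x0123456789abcdefull);

	/* rdmsr: split the value back into EDX:EAX. */
	assert((uint32_t)msr_data == rax);
	assert((msr_data >> 32) == rdx);
	return 0;
}
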
4225 | case 0x34: /* sysenter */ | ||
4226 | rc = emulate_sysenter(ctxt, ops); | ||
4227 | break; | ||
4228 | case 0x35: /* sysexit */ | ||
4229 | rc = emulate_sysexit(ctxt, ops); | ||
4230 | break; | ||
4231 | case 0x40 ... 0x4f: /* cmov */ | 4094 | case 0x40 ... 0x4f: /* cmov */ |
4232 | c->dst.val = c->dst.orig_val = c->src.val; | 4095 | ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val; |
4233 | if (!test_cc(c->b, ctxt->eflags)) | 4096 | if (!test_cc(ctxt->b, ctxt->eflags)) |
4234 | c->dst.type = OP_NONE; /* no writeback */ | 4097 | ctxt->dst.type = OP_NONE; /* no writeback */ |
4235 | break; | 4098 | break; |
4236 | case 0x80 ... 0x8f: /* jnz rel, etc*/ | 4099 | case 0x80 ... 0x8f: /* jnz rel, etc*/ |
4237 | if (test_cc(c->b, ctxt->eflags)) | 4100 | if (test_cc(ctxt->b, ctxt->eflags)) |
4238 | jmp_rel(c, c->src.val); | 4101 | jmp_rel(ctxt, ctxt->src.val); |
4239 | break; | 4102 | break; |
4240 | case 0x90 ... 0x9f: /* setcc r/m8 */ | 4103 | case 0x90 ... 0x9f: /* setcc r/m8 */ |
4241 | c->dst.val = test_cc(c->b, ctxt->eflags); | 4104 | ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); |
4242 | break; | 4105 | break; |
4243 | case 0xa0: /* push fs */ | 4106 | case 0xa0: /* push fs */ |
4244 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); | 4107 | rc = emulate_push_sreg(ctxt, VCPU_SREG_FS); |
4245 | break; | 4108 | break; |
4246 | case 0xa1: /* pop fs */ | 4109 | case 0xa1: /* pop fs */ |
4247 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); | 4110 | rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS); |
4248 | break; | 4111 | break; |
4249 | case 0xa3: | 4112 | case 0xa3: |
4250 | bt: /* bt */ | 4113 | bt: /* bt */ |
4251 | c->dst.type = OP_NONE; | 4114 | ctxt->dst.type = OP_NONE; |
4252 | /* only subword offset */ | 4115 | /* only subword offset */ |
4253 | c->src.val &= (c->dst.bytes << 3) - 1; | 4116 | ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; |
4254 | emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); | 4117 | emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags); |
4255 | break; | 4118 | break; |
4256 | case 0xa4: /* shld imm8, r, r/m */ | 4119 | case 0xa4: /* shld imm8, r, r/m */ |
4257 | case 0xa5: /* shld cl, r, r/m */ | 4120 | case 0xa5: /* shld cl, r, r/m */ |
4258 | emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); | 4121 | emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); |
4259 | break; | 4122 | break; |
4260 | case 0xa8: /* push gs */ | 4123 | case 0xa8: /* push gs */ |
4261 | rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); | 4124 | rc = emulate_push_sreg(ctxt, VCPU_SREG_GS); |
4262 | break; | 4125 | break; |
4263 | case 0xa9: /* pop gs */ | 4126 | case 0xa9: /* pop gs */ |
4264 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); | 4127 | rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS); |
4265 | break; | 4128 | break; |
4266 | case 0xab: | 4129 | case 0xab: |
4267 | bts: /* bts */ | 4130 | bts: /* bts */ |
4268 | emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); | 4131 | emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags); |
4269 | break; | 4132 | break; |
4270 | case 0xac: /* shrd imm8, r, r/m */ | 4133 | case 0xac: /* shrd imm8, r, r/m */ |
4271 | case 0xad: /* shrd cl, r, r/m */ | 4134 | case 0xad: /* shrd cl, r, r/m */ |
4272 | emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); | 4135 | emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); |
4273 | break; | 4136 | break; |
4274 | case 0xae: /* clflush */ | 4137 | case 0xae: /* clflush */ |
4275 | break; | 4138 | break; |
@@ -4278,38 +4141,38 @@ twobyte_insn: | |||
4278 | * Save real source value, then compare EAX against | 4141 | * Save real source value, then compare EAX against |
4279 | * destination. | 4142 | * destination. |
4280 | */ | 4143 | */ |
4281 | c->src.orig_val = c->src.val; | 4144 | ctxt->src.orig_val = ctxt->src.val; |
4282 | c->src.val = c->regs[VCPU_REGS_RAX]; | 4145 | ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; |
4283 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | 4146 | emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); |
4284 | if (ctxt->eflags & EFLG_ZF) { | 4147 | if (ctxt->eflags & EFLG_ZF) { |
4285 | /* Success: write back to memory. */ | 4148 | /* Success: write back to memory. */ |
4286 | c->dst.val = c->src.orig_val; | 4149 | ctxt->dst.val = ctxt->src.orig_val; |
4287 | } else { | 4150 | } else { |
4288 | /* Failure: write the value we saw to EAX. */ | 4151 | /* Failure: write the value we saw to EAX. */ |
4289 | c->dst.type = OP_REG; | 4152 | ctxt->dst.type = OP_REG; |
4290 | c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | 4153 | ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; |
4291 | } | 4154 | } |
4292 | break; | 4155 | break; |
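
For illustration (not part of this patch), the CMPXCHG semantics the case above implements, sketched as plain C with hypothetical names:

#include <stdbool.h>
#include <stdint.h>

/* Returns true when ZF would be set. */
static bool cmpxchg_like(uint64_t *rax, uint64_t *dst, uint64_t src)
{
	if (*rax == *dst) {
		*dst = src;	/* success: write the source back to memory */
		return true;
	}
	*rax = *dst;		/* failure: expose the value we saw in EAX */
	return false;
}
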
4293 | case 0xb2: /* lss */ | 4156 | case 0xb2: /* lss */ |
4294 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); | 4157 | rc = emulate_load_segment(ctxt, VCPU_SREG_SS); |
4295 | break; | 4158 | break; |
4296 | case 0xb3: | 4159 | case 0xb3: |
4297 | btr: /* btr */ | 4160 | btr: /* btr */ |
4298 | emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); | 4161 | emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags); |
4299 | break; | 4162 | break; |
4300 | case 0xb4: /* lfs */ | 4163 | case 0xb4: /* lfs */ |
4301 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); | 4164 | rc = emulate_load_segment(ctxt, VCPU_SREG_FS); |
4302 | break; | 4165 | break; |
4303 | case 0xb5: /* lgs */ | 4166 | case 0xb5: /* lgs */ |
4304 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); | 4167 | rc = emulate_load_segment(ctxt, VCPU_SREG_GS); |
4305 | break; | 4168 | break; |
4306 | case 0xb6 ... 0xb7: /* movzx */ | 4169 | case 0xb6 ... 0xb7: /* movzx */ |
4307 | c->dst.bytes = c->op_bytes; | 4170 | ctxt->dst.bytes = ctxt->op_bytes; |
4308 | c->dst.val = (c->d & ByteOp) ? (u8) c->src.val | 4171 | ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val |
4309 | : (u16) c->src.val; | 4172 | : (u16) ctxt->src.val; |
4310 | break; | 4173 | break; |
4311 | case 0xba: /* Grp8 */ | 4174 | case 0xba: /* Grp8 */ |
4312 | switch (c->modrm_reg & 3) { | 4175 | switch (ctxt->modrm_reg & 3) { |
4313 | case 0: | 4176 | case 0: |
4314 | goto bt; | 4177 | goto bt; |
4315 | case 1: | 4178 | case 1: |
@@ -4322,47 +4185,47 @@ twobyte_insn: | |||
4322 | break; | 4185 | break; |
4323 | case 0xbb: | 4186 | case 0xbb: |
4324 | btc: /* btc */ | 4187 | btc: /* btc */ |
4325 | emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); | 4188 | emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags); |
4326 | break; | 4189 | break; |
4327 | case 0xbc: { /* bsf */ | 4190 | case 0xbc: { /* bsf */ |
4328 | u8 zf; | 4191 | u8 zf; |
4329 | __asm__ ("bsf %2, %0; setz %1" | 4192 | __asm__ ("bsf %2, %0; setz %1" |
4330 | : "=r"(c->dst.val), "=q"(zf) | 4193 | : "=r"(ctxt->dst.val), "=q"(zf) |
4331 | : "r"(c->src.val)); | 4194 | : "r"(ctxt->src.val)); |
4332 | ctxt->eflags &= ~X86_EFLAGS_ZF; | 4195 | ctxt->eflags &= ~X86_EFLAGS_ZF; |
4333 | if (zf) { | 4196 | if (zf) { |
4334 | ctxt->eflags |= X86_EFLAGS_ZF; | 4197 | ctxt->eflags |= X86_EFLAGS_ZF; |
4335 | c->dst.type = OP_NONE; /* Disable writeback. */ | 4198 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ |
4336 | } | 4199 | } |
4337 | break; | 4200 | break; |
4338 | } | 4201 | } |
4339 | case 0xbd: { /* bsr */ | 4202 | case 0xbd: { /* bsr */ |
4340 | u8 zf; | 4203 | u8 zf; |
4341 | __asm__ ("bsr %2, %0; setz %1" | 4204 | __asm__ ("bsr %2, %0; setz %1" |
4342 | : "=r"(c->dst.val), "=q"(zf) | 4205 | : "=r"(ctxt->dst.val), "=q"(zf) |
4343 | : "r"(c->src.val)); | 4206 | : "r"(ctxt->src.val)); |
4344 | ctxt->eflags &= ~X86_EFLAGS_ZF; | 4207 | ctxt->eflags &= ~X86_EFLAGS_ZF; |
4345 | if (zf) { | 4208 | if (zf) { |
4346 | ctxt->eflags |= X86_EFLAGS_ZF; | 4209 | ctxt->eflags |= X86_EFLAGS_ZF; |
4347 | c->dst.type = OP_NONE; /* Disable writeback. */ | 4210 | ctxt->dst.type = OP_NONE; /* Disable writeback. */ |
4348 | } | 4211 | } |
4349 | break; | 4212 | break; |
4350 | } | 4213 | } |
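
For illustration (not part of this patch), the BSF/BSR semantics behind the two inline-asm cases above, sketched with GCC builtins (an assumption; the emulator uses the real instructions): when the source is zero, ZF is set and the destination is left untouched, which is why writeback is disabled in that case.

#include <stdbool.h>
#include <stdint.h>

static bool bsf_like(uint64_t src, unsigned *dst)	/* lowest set bit */
{
	if (!src)
		return false;			/* ZF set, *dst untouched */
	*dst = (unsigned)__builtin_ctzll(src);
	return true;
}

static bool bsr_like(uint64_t src, unsigned *dst)	/* highest set bit */
{
	if (!src)
		return false;
	*dst = 63u - (unsigned)__builtin_clzll(src);
	return true;
}
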
4351 | case 0xbe ... 0xbf: /* movsx */ | 4214 | case 0xbe ... 0xbf: /* movsx */ |
4352 | c->dst.bytes = c->op_bytes; | 4215 | ctxt->dst.bytes = ctxt->op_bytes; |
4353 | c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : | 4216 | ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : |
4354 | (s16) c->src.val; | 4217 | (s16) ctxt->src.val; |
4355 | break; | 4218 | break; |
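
For illustration (not part of this patch): the movzx/movsx cases above reduce to plain C casts, zero-extending through unsigned narrow types and sign-extending through signed ones.

#include <stdint.h>

static uint64_t movzx_b(uint64_t src) { return (uint8_t)src; }		/* 0xff -> 0xff */
static uint64_t movzx_w(uint64_t src) { return (uint16_t)src; }
static uint64_t movsx_b(uint64_t src) { return (uint64_t)(int8_t)src; }	/* 0xff -> ~0ull */
static uint64_t movsx_w(uint64_t src) { return (uint64_t)(int16_t)src; }
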
4356 | case 0xc0 ... 0xc1: /* xadd */ | 4219 | case 0xc0 ... 0xc1: /* xadd */ |
4357 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); | 4220 | emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); |
4358 | /* Write back the register source. */ | 4221 | /* Write back the register source. */ |
4359 | c->src.val = c->dst.orig_val; | 4222 | ctxt->src.val = ctxt->dst.orig_val; |
4360 | write_register_operand(&c->src); | 4223 | write_register_operand(&ctxt->src); |
4361 | break; | 4224 | break; |
4362 | case 0xc3: /* movnti */ | 4225 | case 0xc3: /* movnti */ |
4363 | c->dst.bytes = c->op_bytes; | 4226 | ctxt->dst.bytes = ctxt->op_bytes; |
4364 | c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : | 4227 | ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : |
4365 | (u64) c->src.val; | 4228 | (u64) ctxt->src.val; |
4366 | break; | 4229 | break; |
4367 | case 0xc7: /* Grp9 (cmpxchg8b) */ | 4230 | case 0xc7: /* Grp9 (cmpxchg8b) */ |
4368 | rc = em_grp9(ctxt); | 4231 | rc = em_grp9(ctxt); |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aee38623b768..9335e1bf72ad 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -148,7 +148,7 @@ module_param(oos_shadow, bool, 0644); | |||
148 | #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | 148 | #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ |
149 | | PT64_NX_MASK) | 149 | | PT64_NX_MASK) |
150 | 150 | ||
151 | #define RMAP_EXT 4 | 151 | #define PTE_LIST_EXT 4 |
152 | 152 | ||
153 | #define ACC_EXEC_MASK 1 | 153 | #define ACC_EXEC_MASK 1 |
154 | #define ACC_WRITE_MASK PT_WRITABLE_MASK | 154 | #define ACC_WRITE_MASK PT_WRITABLE_MASK |
@@ -164,16 +164,16 @@ module_param(oos_shadow, bool, 0644); | |||
164 | 164 | ||
165 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | 165 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
166 | 166 | ||
167 | struct kvm_rmap_desc { | 167 | struct pte_list_desc { |
168 | u64 *sptes[RMAP_EXT]; | 168 | u64 *sptes[PTE_LIST_EXT]; |
169 | struct kvm_rmap_desc *more; | 169 | struct pte_list_desc *more; |
170 | }; | 170 | }; |
171 | 171 | ||
172 | struct kvm_shadow_walk_iterator { | 172 | struct kvm_shadow_walk_iterator { |
173 | u64 addr; | 173 | u64 addr; |
174 | hpa_t shadow_addr; | 174 | hpa_t shadow_addr; |
175 | int level; | ||
176 | u64 *sptep; | 175 | u64 *sptep; |
176 | int level; | ||
177 | unsigned index; | 177 | unsigned index; |
178 | }; | 178 | }; |
179 | 179 | ||
@@ -182,32 +182,68 @@ struct kvm_shadow_walk_iterator { | |||
182 | shadow_walk_okay(&(_walker)); \ | 182 | shadow_walk_okay(&(_walker)); \ |
183 | shadow_walk_next(&(_walker))) | 183 | shadow_walk_next(&(_walker))) |
184 | 184 | ||
185 | typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); | 185 | #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \ |
186 | for (shadow_walk_init(&(_walker), _vcpu, _addr); \ | ||
187 | shadow_walk_okay(&(_walker)) && \ | ||
188 | ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \ | ||
189 | __shadow_walk_next(&(_walker), spte)) | ||
186 | 190 | ||
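
For illustration (not part of this patch), the shape of the new for_each_shadow_entry_lockless macro above, which folds the per-step spte load into the loop condition via a GNU C statement expression; a stand-alone sketch with a hypothetical walker type:

#include <stdio.h>

struct walker { int level; const int *table; };

static void walk_init(struct walker *w, const int *table)
{
	w->level = 4;		/* start at the root, as the MMU walk does */
	w->table = table;
}

#define for_each_entry(_w, _tbl, _val)					\
	for (walk_init(&(_w), (_tbl));					\
	     (_w).level > 0 &&						\
		({ (_val) = (_w).table[(_w).level]; 1; });		\
	     (_w).level--)

int main(void)
{
	const int levels[5] = { 0, 11, 22, 33, 44 };
	struct walker w;
	int val;

	for_each_entry(w, levels, val)
		printf("level %d -> %d\n", w.level, val);
	return 0;
}
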
187 | static struct kmem_cache *pte_chain_cache; | 191 | static struct kmem_cache *pte_list_desc_cache; |
188 | static struct kmem_cache *rmap_desc_cache; | ||
189 | static struct kmem_cache *mmu_page_header_cache; | 192 | static struct kmem_cache *mmu_page_header_cache; |
190 | static struct percpu_counter kvm_total_used_mmu_pages; | 193 | static struct percpu_counter kvm_total_used_mmu_pages; |
191 | 194 | ||
192 | static u64 __read_mostly shadow_trap_nonpresent_pte; | ||
193 | static u64 __read_mostly shadow_notrap_nonpresent_pte; | ||
194 | static u64 __read_mostly shadow_nx_mask; | 195 | static u64 __read_mostly shadow_nx_mask; |
195 | static u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */ | 196 | static u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */ |
196 | static u64 __read_mostly shadow_user_mask; | 197 | static u64 __read_mostly shadow_user_mask; |
197 | static u64 __read_mostly shadow_accessed_mask; | 198 | static u64 __read_mostly shadow_accessed_mask; |
198 | static u64 __read_mostly shadow_dirty_mask; | 199 | static u64 __read_mostly shadow_dirty_mask; |
200 | static u64 __read_mostly shadow_mmio_mask; | ||
199 | 201 | ||
200 | static inline u64 rsvd_bits(int s, int e) | 202 | static void mmu_spte_set(u64 *sptep, u64 spte); |
203 | |||
204 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) | ||
201 | { | 205 | { |
202 | return ((1ULL << (e - s + 1)) - 1) << s; | 206 | shadow_mmio_mask = mmio_mask; |
207 | } | ||
208 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); | ||
209 | |||
210 | static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) | ||
211 | { | ||
212 | access &= ACC_WRITE_MASK | ACC_USER_MASK; | ||
213 | |||
214 | trace_mark_mmio_spte(sptep, gfn, access); | ||
215 | mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); | ||
203 | } | 216 | } |
204 | 217 | ||
205 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) | 218 | static bool is_mmio_spte(u64 spte) |
206 | { | 219 | { |
207 | shadow_trap_nonpresent_pte = trap_pte; | 220 | return (spte & shadow_mmio_mask) == shadow_mmio_mask; |
208 | shadow_notrap_nonpresent_pte = notrap_pte; | 221 | } |
222 | |||
223 | static gfn_t get_mmio_spte_gfn(u64 spte) | ||
224 | { | ||
225 | return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; | ||
226 | } | ||
227 | |||
228 | static unsigned get_mmio_spte_access(u64 spte) | ||
229 | { | ||
230 | return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; | ||
231 | } | ||
232 | |||
233 | static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) | ||
234 | { | ||
235 | if (unlikely(is_noslot_pfn(pfn))) { | ||
236 | mark_mmio_spte(sptep, gfn, access); | ||
237 | return true; | ||
238 | } | ||
239 | |||
240 | return false; | ||
241 | } | ||
242 | |||
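
For illustration (not part of this patch), how the MMIO-spte helpers above pack and unpack a gfn and access bits around shadow_mmio_mask; the mask and access values here are illustrative, not the kernel's:

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1ull << PAGE_SHIFT) - 1))
#define ACC_WRITE	(1u << 1)	/* illustrative access bits */
#define ACC_USER	(1u << 2)

int main(void)
{
	uint64_t mmio_mask = 3ull << 52;	/* e.g. reserved PTE bits */
	uint64_t gfn = 0x1234;

	/* mark_mmio_spte(): mask | access | gfn << PAGE_SHIFT */
	uint64_t spte = mmio_mask | ACC_WRITE | (gfn << PAGE_SHIFT);

	assert((spte & mmio_mask) == mmio_mask);		  /* is_mmio_spte() */
	assert(((spte & ~mmio_mask) >> PAGE_SHIFT) == gfn);	  /* get_mmio_spte_gfn() */
	assert(((spte & ~mmio_mask) & ~PAGE_MASK) == ACC_WRITE); /* get_mmio_spte_access() */
	return 0;
}
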
243 | static inline u64 rsvd_bits(int s, int e) | ||
244 | { | ||
245 | return ((1ULL << (e - s + 1)) - 1) << s; | ||
209 | } | 246 | } |
210 | EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); | ||
211 | 247 | ||
212 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | 248 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, |
213 | u64 dirty_mask, u64 nx_mask, u64 x_mask) | 249 | u64 dirty_mask, u64 nx_mask, u64 x_mask) |
@@ -220,11 +256,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | |||
220 | } | 256 | } |
221 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); | 257 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); |
222 | 258 | ||
223 | static bool is_write_protection(struct kvm_vcpu *vcpu) | ||
224 | { | ||
225 | return kvm_read_cr0_bits(vcpu, X86_CR0_WP); | ||
226 | } | ||
227 | |||
228 | static int is_cpuid_PSE36(void) | 259 | static int is_cpuid_PSE36(void) |
229 | { | 260 | { |
230 | return 1; | 261 | return 1; |
@@ -237,8 +268,7 @@ static int is_nx(struct kvm_vcpu *vcpu) | |||
237 | 268 | ||
238 | static int is_shadow_present_pte(u64 pte) | 269 | static int is_shadow_present_pte(u64 pte) |
239 | { | 270 | { |
240 | return pte != shadow_trap_nonpresent_pte | 271 | return pte & PT_PRESENT_MASK && !is_mmio_spte(pte); |
241 | && pte != shadow_notrap_nonpresent_pte; | ||
242 | } | 272 | } |
243 | 273 | ||
244 | static int is_large_pte(u64 pte) | 274 | static int is_large_pte(u64 pte) |
@@ -246,11 +276,6 @@ static int is_large_pte(u64 pte) | |||
246 | return pte & PT_PAGE_SIZE_MASK; | 276 | return pte & PT_PAGE_SIZE_MASK; |
247 | } | 277 | } |
248 | 278 | ||
249 | static int is_writable_pte(unsigned long pte) | ||
250 | { | ||
251 | return pte & PT_WRITABLE_MASK; | ||
252 | } | ||
253 | |||
254 | static int is_dirty_gpte(unsigned long pte) | 279 | static int is_dirty_gpte(unsigned long pte) |
255 | { | 280 | { |
256 | return pte & PT_DIRTY_MASK; | 281 | return pte & PT_DIRTY_MASK; |
@@ -282,26 +307,154 @@ static gfn_t pse36_gfn_delta(u32 gpte) | |||
282 | return (gpte & PT32_DIR_PSE36_MASK) << shift; | 307 | return (gpte & PT32_DIR_PSE36_MASK) << shift; |
283 | } | 308 | } |
284 | 309 | ||
310 | #ifdef CONFIG_X86_64 | ||
285 | static void __set_spte(u64 *sptep, u64 spte) | 311 | static void __set_spte(u64 *sptep, u64 spte) |
286 | { | 312 | { |
287 | set_64bit(sptep, spte); | 313 | *sptep = spte; |
288 | } | 314 | } |
289 | 315 | ||
290 | static u64 __xchg_spte(u64 *sptep, u64 new_spte) | 316 | static void __update_clear_spte_fast(u64 *sptep, u64 spte) |
291 | { | 317 | { |
292 | #ifdef CONFIG_X86_64 | 318 | *sptep = spte; |
293 | return xchg(sptep, new_spte); | 319 | } |
320 | |||
321 | static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) | ||
322 | { | ||
323 | return xchg(sptep, spte); | ||
324 | } | ||
325 | |||
326 | static u64 __get_spte_lockless(u64 *sptep) | ||
327 | { | ||
328 | return ACCESS_ONCE(*sptep); | ||
329 | } | ||
330 | |||
331 | static bool __check_direct_spte_mmio_pf(u64 spte) | ||
332 | { | ||
333 | /* It is valid if the spte is zapped. */ | ||
334 | return spte == 0ull; | ||
335 | } | ||
294 | #else | 336 | #else |
295 | u64 old_spte; | 337 | union split_spte { |
338 | struct { | ||
339 | u32 spte_low; | ||
340 | u32 spte_high; | ||
341 | }; | ||
342 | u64 spte; | ||
343 | }; | ||
296 | 344 | ||
297 | do { | 345 | static void count_spte_clear(u64 *sptep, u64 spte) |
298 | old_spte = *sptep; | 346 | { |
299 | } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); | 347 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); |
300 | 348 | ||
301 | return old_spte; | 349 | if (is_shadow_present_pte(spte)) |
302 | #endif | 350 | return; |
351 | |||
352 | /* Ensure the spte is completely set before we increase the count */ | ||
353 | smp_wmb(); | ||
354 | sp->clear_spte_count++; | ||
355 | } | ||
356 | |||
357 | static void __set_spte(u64 *sptep, u64 spte) | ||
358 | { | ||
359 | union split_spte *ssptep, sspte; | ||
360 | |||
361 | ssptep = (union split_spte *)sptep; | ||
362 | sspte = (union split_spte)spte; | ||
363 | |||
364 | ssptep->spte_high = sspte.spte_high; | ||
365 | |||
366 | /* | ||
367 | * If we map the spte from nonpresent to present, we should store | ||
368 | * the high bits first, then set the present bit, so the cpu cannot | ||
369 | * fetch this spte while we are setting it. | ||
370 | */ | ||
371 | smp_wmb(); | ||
372 | |||
373 | ssptep->spte_low = sspte.spte_low; | ||
303 | } | 374 | } |
304 | 375 | ||
376 | static void __update_clear_spte_fast(u64 *sptep, u64 spte) | ||
377 | { | ||
378 | union split_spte *ssptep, sspte; | ||
379 | |||
380 | ssptep = (union split_spte *)sptep; | ||
381 | sspte = (union split_spte)spte; | ||
382 | |||
383 | ssptep->spte_low = sspte.spte_low; | ||
384 | |||
385 | /* | ||
386 | * If we map the spte from present to nonpresent, we should clear | ||
387 | * the present bit first so the vcpu cannot fetch the old high bits. | ||
388 | */ | ||
389 | smp_wmb(); | ||
390 | |||
391 | ssptep->spte_high = sspte.spte_high; | ||
392 | count_spte_clear(sptep, spte); | ||
393 | } | ||
394 | |||
395 | static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) | ||
396 | { | ||
397 | union split_spte *ssptep, sspte, orig; | ||
398 | |||
399 | ssptep = (union split_spte *)sptep; | ||
400 | sspte = (union split_spte)spte; | ||
401 | |||
402 | /* xchg acts as a barrier before the setting of the high bits */ | ||
403 | orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); | ||
404 | orig.spte_high = ssptep->spte_high = sspte.spte_high; | ||
405 | count_spte_clear(sptep, spte); | ||
406 | |||
407 | return orig.spte; | ||
408 | } | ||
409 | |||
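
For illustration (not part of this patch), the ordering discipline of the 32-bit __set_spte()/__update_clear_spte_fast() pair above, sketched in user space with a C11 release fence standing in for smp_wmb(): the present bit lives in the low word, so a set publishes the high word first and a clear retracts the low word first.

#include <stdatomic.h>
#include <stdint.h>

#define wmb()	atomic_thread_fence(memory_order_release)	/* smp_wmb() stand-in */

union split_u64 {
	struct { uint32_t lo, hi; };	/* spte_low / spte_high */
	uint64_t val;
};

static void split_set(volatile union split_u64 *s, uint64_t v)	 /* nonpresent -> present */
{
	union split_u64 n = { .val = v };

	s->hi = n.hi;
	wmb();		/* high word visible before the present bit */
	s->lo = n.lo;
}

static void split_clear(volatile union split_u64 *s, uint64_t v) /* present -> nonpresent */
{
	union split_u64 n = { .val = v };

	s->lo = n.lo;
	wmb();		/* present bit gone before the high word changes */
	s->hi = n.hi;
}
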
410 | /* | ||
411 | * The idea of reading the spte locklessly on x86_32 hosts comes from | ||
412 | * gup_get_pte(arch/x86/mm/gup.c). | ||
413 | * The difference is that we cannot catch the spte tlb flush if we | ||
414 | * leave guest mode, so we emulate it by increasing clear_spte_count | ||
415 | * when an spte is cleared. | ||
416 | */ | ||
417 | static u64 __get_spte_lockless(u64 *sptep) | ||
418 | { | ||
419 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | ||
420 | union split_spte spte, *orig = (union split_spte *)sptep; | ||
421 | int count; | ||
422 | |||
423 | retry: | ||
424 | count = sp->clear_spte_count; | ||
425 | smp_rmb(); | ||
426 | |||
427 | spte.spte_low = orig->spte_low; | ||
428 | smp_rmb(); | ||
429 | |||
430 | spte.spte_high = orig->spte_high; | ||
431 | smp_rmb(); | ||
432 | |||
433 | if (unlikely(spte.spte_low != orig->spte_low || | ||
434 | count != sp->clear_spte_count)) | ||
435 | goto retry; | ||
436 | |||
437 | return spte.spte; | ||
438 | } | ||
439 | |||
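
For illustration (not part of this patch), the reader side of the scheme above in user-space C11, with clear_spte_count modeled as an atomic generation counter bumped by the clearing side: a racing clear either tears spte_low (caught by the re-read) or completes entirely (caught by the counter check).

#include <stdatomic.h>
#include <stdint.h>

struct page_meta { atomic_uint clear_count; };	/* clear_spte_count stand-in */

static uint64_t get_split_lockless(const volatile uint32_t spte[2],
				   const struct page_meta *meta)
{
	uint32_t lo, hi;
	unsigned count;

retry:
	count = atomic_load(&meta->clear_count);
	atomic_thread_fence(memory_order_acquire);	/* smp_rmb() stand-in */
	lo = spte[0];
	atomic_thread_fence(memory_order_acquire);
	hi = spte[1];
	atomic_thread_fence(memory_order_acquire);
	if (lo != spte[0] || count != atomic_load(&meta->clear_count))
		goto retry;				/* torn or raced: retry */

	return ((uint64_t)hi << 32) | lo;
}
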
440 | static bool __check_direct_spte_mmio_pf(u64 spte) | ||
441 | { | ||
442 | union split_spte sspte = (union split_spte)spte; | ||
443 | u32 high_mmio_mask = shadow_mmio_mask >> 32; | ||
444 | |||
445 | /* It is valid if the spte is zapped. */ | ||
446 | if (spte == 0ull) | ||
447 | return true; | ||
448 | |||
449 | /* It is valid if the spte is being zapped. */ | ||
450 | if (sspte.spte_low == 0ull && | ||
451 | (sspte.spte_high & high_mmio_mask) == high_mmio_mask) | ||
452 | return true; | ||
453 | |||
454 | return false; | ||
455 | } | ||
456 | #endif | ||
457 | |||
305 | static bool spte_has_volatile_bits(u64 spte) | 458 | static bool spte_has_volatile_bits(u64 spte) |
306 | { | 459 | { |
307 | if (!shadow_accessed_mask) | 460 | if (!shadow_accessed_mask) |
@@ -322,12 +475,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) | |||
322 | return (old_spte & bit_mask) && !(new_spte & bit_mask); | 475 | return (old_spte & bit_mask) && !(new_spte & bit_mask); |
323 | } | 476 | } |
324 | 477 | ||
325 | static void update_spte(u64 *sptep, u64 new_spte) | 478 | /* Rules for using mmu_spte_set: |
479 | * Set the sptep from nonpresent to present. | ||
480 | * Note: the sptep being assigned *must* be either not present | ||
481 | * or in a state where the hardware will not attempt to update | ||
482 | * the spte. | ||
483 | */ | ||
484 | static void mmu_spte_set(u64 *sptep, u64 new_spte) | ||
485 | { | ||
486 | WARN_ON(is_shadow_present_pte(*sptep)); | ||
487 | __set_spte(sptep, new_spte); | ||
488 | } | ||
489 | |||
490 | /* Rules for using mmu_spte_update: | ||
491 | * Update the state bits; the mapped pfn is not changed. | ||
492 | */ | ||
493 | static void mmu_spte_update(u64 *sptep, u64 new_spte) | ||
326 | { | 494 | { |
327 | u64 mask, old_spte = *sptep; | 495 | u64 mask, old_spte = *sptep; |
328 | 496 | ||
329 | WARN_ON(!is_rmap_spte(new_spte)); | 497 | WARN_ON(!is_rmap_spte(new_spte)); |
330 | 498 | ||
499 | if (!is_shadow_present_pte(old_spte)) | ||
500 | return mmu_spte_set(sptep, new_spte); | ||
501 | |||
331 | new_spte |= old_spte & shadow_dirty_mask; | 502 | new_spte |= old_spte & shadow_dirty_mask; |
332 | 503 | ||
333 | mask = shadow_accessed_mask; | 504 | mask = shadow_accessed_mask; |
@@ -335,9 +506,9 @@ static void update_spte(u64 *sptep, u64 new_spte) | |||
335 | mask |= shadow_dirty_mask; | 506 | mask |= shadow_dirty_mask; |
336 | 507 | ||
337 | if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) | 508 | if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) |
338 | __set_spte(sptep, new_spte); | 509 | __update_clear_spte_fast(sptep, new_spte); |
339 | else | 510 | else |
340 | old_spte = __xchg_spte(sptep, new_spte); | 511 | old_spte = __update_clear_spte_slow(sptep, new_spte); |
341 | 512 | ||
342 | if (!shadow_accessed_mask) | 513 | if (!shadow_accessed_mask) |
343 | return; | 514 | return; |
@@ -348,6 +519,64 @@ static void update_spte(u64 *sptep, u64 new_spte) | |||
348 | kvm_set_pfn_dirty(spte_to_pfn(old_spte)); | 519 | kvm_set_pfn_dirty(spte_to_pfn(old_spte)); |
349 | } | 520 | } |
350 | 521 | ||
522 | /* | ||
523 | * Rules for using mmu_spte_clear_track_bits: | ||
524 | * It sets the sptep from present to nonpresent and tracks the | ||
525 | * state bits; it is used to clear a last-level sptep. | ||
526 | */ | ||
527 | static int mmu_spte_clear_track_bits(u64 *sptep) | ||
528 | { | ||
529 | pfn_t pfn; | ||
530 | u64 old_spte = *sptep; | ||
531 | |||
532 | if (!spte_has_volatile_bits(old_spte)) | ||
533 | __update_clear_spte_fast(sptep, 0ull); | ||
534 | else | ||
535 | old_spte = __update_clear_spte_slow(sptep, 0ull); | ||
536 | |||
537 | if (!is_rmap_spte(old_spte)) | ||
538 | return 0; | ||
539 | |||
540 | pfn = spte_to_pfn(old_spte); | ||
541 | if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) | ||
542 | kvm_set_pfn_accessed(pfn); | ||
543 | if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) | ||
544 | kvm_set_pfn_dirty(pfn); | ||
545 | return 1; | ||
546 | } | ||
547 | |||
548 | /* | ||
549 | * Rules for using mmu_spte_clear_no_track: | ||
550 | * Directly clear the spte without tracking the state bits of the sptep; | ||
551 | * it is used to clear an upper-level spte. | ||
552 | */ | ||
553 | static void mmu_spte_clear_no_track(u64 *sptep) | ||
554 | { | ||
555 | __update_clear_spte_fast(sptep, 0ull); | ||
556 | } | ||
557 | |||
558 | static u64 mmu_spte_get_lockless(u64 *sptep) | ||
559 | { | ||
560 | return __get_spte_lockless(sptep); | ||
561 | } | ||
562 | |||
563 | static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) | ||
564 | { | ||
565 | rcu_read_lock(); | ||
566 | atomic_inc(&vcpu->kvm->arch.reader_counter); | ||
567 | |||
568 | /* Increase the counter before walking the shadow page table */ | ||
569 | smp_mb__after_atomic_inc(); | ||
570 | } | ||
571 | |||
572 | static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) | ||
573 | { | ||
574 | /* Decrease the counter after the shadow page table walk has finished */ | ||
575 | smp_mb__before_atomic_dec(); | ||
576 | atomic_dec(&vcpu->kvm->arch.reader_counter); | ||
577 | rcu_read_unlock(); | ||
578 | } | ||
579 | |||
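
For illustration (not part of this patch), a user-space sketch of the bracket above, with a plain atomic standing in for both RCU and reader_counter: the full barriers pair with the zap path, which must observe the counter before freeing shadow pages.

#include <stdatomic.h>

static atomic_int reader_counter;	/* kvm->arch.reader_counter stand-in */

static void walk_lockless_begin(void)
{
	atomic_fetch_add(&reader_counter, 1);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_atomic_inc() */
	/* ... walk the shadow page tables ... */
}

static void walk_lockless_end(void)
{
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__before_atomic_dec() */
	atomic_fetch_sub(&reader_counter, 1);
}
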
351 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | 580 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, |
352 | struct kmem_cache *base_cache, int min) | 581 | struct kmem_cache *base_cache, int min) |
353 | { | 582 | { |
@@ -397,12 +626,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) | |||
397 | { | 626 | { |
398 | int r; | 627 | int r; |
399 | 628 | ||
400 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, | 629 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, |
401 | pte_chain_cache, 4); | 630 | pte_list_desc_cache, 8 + PTE_PREFETCH_NUM); |
402 | if (r) | ||
403 | goto out; | ||
404 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, | ||
405 | rmap_desc_cache, 4 + PTE_PREFETCH_NUM); | ||
406 | if (r) | 631 | if (r) |
407 | goto out; | 632 | goto out; |
408 | r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); | 633 | r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); |
@@ -416,8 +641,8 @@ out: | |||
416 | 641 | ||
417 | static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) | 642 | static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) |
418 | { | 643 | { |
419 | mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); | 644 | mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, |
420 | mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); | 645 | pte_list_desc_cache); |
421 | mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); | 646 | mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); |
422 | mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, | 647 | mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, |
423 | mmu_page_header_cache); | 648 | mmu_page_header_cache); |
@@ -433,26 +658,15 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, | |||
433 | return p; | 658 | return p; |
434 | } | 659 | } |
435 | 660 | ||
436 | static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) | 661 | static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) |
437 | { | ||
438 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, | ||
439 | sizeof(struct kvm_pte_chain)); | ||
440 | } | ||
441 | |||
442 | static void mmu_free_pte_chain(struct kvm_pte_chain *pc) | ||
443 | { | 662 | { |
444 | kmem_cache_free(pte_chain_cache, pc); | 663 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, |
664 | sizeof(struct pte_list_desc)); | ||
445 | } | 665 | } |
446 | 666 | ||
447 | static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) | 667 | static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) |
448 | { | 668 | { |
449 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, | 669 | kmem_cache_free(pte_list_desc_cache, pte_list_desc); |
450 | sizeof(struct kvm_rmap_desc)); | ||
451 | } | ||
452 | |||
453 | static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) | ||
454 | { | ||
455 | kmem_cache_free(rmap_desc_cache, rd); | ||
456 | } | 670 | } |
457 | 671 | ||
458 | static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) | 672 | static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) |
@@ -498,6 +712,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) | |||
498 | linfo = lpage_info_slot(gfn, slot, i); | 712 | linfo = lpage_info_slot(gfn, slot, i); |
499 | linfo->write_count += 1; | 713 | linfo->write_count += 1; |
500 | } | 714 | } |
715 | kvm->arch.indirect_shadow_pages++; | ||
501 | } | 716 | } |
502 | 717 | ||
503 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) | 718 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) |
@@ -513,6 +728,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) | |||
513 | linfo->write_count -= 1; | 728 | linfo->write_count -= 1; |
514 | WARN_ON(linfo->write_count < 0); | 729 | WARN_ON(linfo->write_count < 0); |
515 | } | 730 | } |
731 | kvm->arch.indirect_shadow_pages--; | ||
516 | } | 732 | } |
517 | 733 | ||
518 | static int has_wrprotected_page(struct kvm *kvm, | 734 | static int has_wrprotected_page(struct kvm *kvm, |
@@ -588,67 +804,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | |||
588 | } | 804 | } |
589 | 805 | ||
590 | /* | 806 | /* |
591 | * Take gfn and return the reverse mapping to it. | 807 | * Pte mapping structures: |
592 | */ | ||
593 | |||
594 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) | ||
595 | { | ||
596 | struct kvm_memory_slot *slot; | ||
597 | struct kvm_lpage_info *linfo; | ||
598 | |||
599 | slot = gfn_to_memslot(kvm, gfn); | ||
600 | if (likely(level == PT_PAGE_TABLE_LEVEL)) | ||
601 | return &slot->rmap[gfn - slot->base_gfn]; | ||
602 | |||
603 | linfo = lpage_info_slot(gfn, slot, level); | ||
604 | |||
605 | return &linfo->rmap_pde; | ||
606 | } | ||
607 | |||
608 | /* | ||
609 | * Reverse mapping data structures: | ||
610 | * | 808 | * |
611 | * If rmapp bit zero is zero, then rmapp points to the shadow page table entry | 809 | * If pte_list bit zero is zero, then pte_list points to the spte. |
612 | * that points to page_address(page). | ||
613 | * | 810 | * |
614 | * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc | 811 | * If pte_list bit zero is one, (then pte_list & ~1) points to a struct |
615 | * containing more mappings. | 812 | * pte_list_desc containing more mappings. |
616 | * | 813 | * |
617 | * Returns the number of rmap entries before the spte was added or zero if | 814 | * Returns the number of pte entries before the spte was added or zero if |
618 | * the spte was not added. | 815 | * the spte was not added. |
619 | * | 816 | * |
620 | */ | 817 | */ |
621 | static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | 818 | static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, |
819 | unsigned long *pte_list) | ||
622 | { | 820 | { |
623 | struct kvm_mmu_page *sp; | 821 | struct pte_list_desc *desc; |
624 | struct kvm_rmap_desc *desc; | ||
625 | unsigned long *rmapp; | ||
626 | int i, count = 0; | 822 | int i, count = 0; |
627 | 823 | ||
628 | if (!is_rmap_spte(*spte)) | 824 | if (!*pte_list) { |
629 | return count; | 825 | rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); |
630 | sp = page_header(__pa(spte)); | 826 | *pte_list = (unsigned long)spte; |
631 | kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); | 827 | } else if (!(*pte_list & 1)) { |
632 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); | 828 | rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); |
633 | if (!*rmapp) { | 829 | desc = mmu_alloc_pte_list_desc(vcpu); |
634 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); | 830 | desc->sptes[0] = (u64 *)*pte_list; |
635 | *rmapp = (unsigned long)spte; | ||
636 | } else if (!(*rmapp & 1)) { | ||
637 | rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); | ||
638 | desc = mmu_alloc_rmap_desc(vcpu); | ||
639 | desc->sptes[0] = (u64 *)*rmapp; | ||
640 | desc->sptes[1] = spte; | 831 | desc->sptes[1] = spte; |
641 | *rmapp = (unsigned long)desc | 1; | 832 | *pte_list = (unsigned long)desc | 1; |
642 | ++count; | 833 | ++count; |
643 | } else { | 834 | } else { |
644 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | 835 | rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); |
645 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 836 | desc = (struct pte_list_desc *)(*pte_list & ~1ul); |
646 | while (desc->sptes[RMAP_EXT-1] && desc->more) { | 837 | while (desc->sptes[PTE_LIST_EXT-1] && desc->more) { |
647 | desc = desc->more; | 838 | desc = desc->more; |
648 | count += RMAP_EXT; | 839 | count += PTE_LIST_EXT; |
649 | } | 840 | } |
650 | if (desc->sptes[RMAP_EXT-1]) { | 841 | if (desc->sptes[PTE_LIST_EXT-1]) { |
651 | desc->more = mmu_alloc_rmap_desc(vcpu); | 842 | desc->more = mmu_alloc_pte_list_desc(vcpu); |
652 | desc = desc->more; | 843 | desc = desc->more; |
653 | } | 844 | } |
654 | for (i = 0; desc->sptes[i]; ++i) | 845 | for (i = 0; desc->sptes[i]; ++i) |
@@ -658,59 +849,78 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | |||
658 | return count; | 849 | return count; |
659 | } | 850 | } |
660 | 851 | ||
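
For illustration (not part of this patch), the bit-zero tagging that pte_list_add() above relies on, in stand-alone form; it assumes, as the kernel does, that pointer alignment leaves bit zero free:

#include <assert.h>
#include <stdint.h>

#define PTE_LIST_EXT 4

struct pte_list_desc {
	uint64_t *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

int main(void)
{
	uint64_t spte;
	struct pte_list_desc desc = { .sptes = { &spte } };

	unsigned long one = (unsigned long)&spte;	/* 0 -> 1: bit zero clear */
	unsigned long many = (unsigned long)&desc | 1;	/* 1 -> many: bit zero set */

	assert(!(one & 1) && (uint64_t *)one == &spte);
	assert((many & 1) &&
	       ((struct pte_list_desc *)(many & ~1ul))->sptes[0] == &spte);
	return 0;
}
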
661 | static void rmap_desc_remove_entry(unsigned long *rmapp, | 852 | static u64 *pte_list_next(unsigned long *pte_list, u64 *spte) |
662 | struct kvm_rmap_desc *desc, | 853 | { |
663 | int i, | 854 | struct pte_list_desc *desc; |
664 | struct kvm_rmap_desc *prev_desc) | 855 | u64 *prev_spte; |
856 | int i; | ||
857 | |||
858 | if (!*pte_list) | ||
859 | return NULL; | ||
860 | else if (!(*pte_list & 1)) { | ||
861 | if (!spte) | ||
862 | return (u64 *)*pte_list; | ||
863 | return NULL; | ||
864 | } | ||
865 | desc = (struct pte_list_desc *)(*pte_list & ~1ul); | ||
866 | prev_spte = NULL; | ||
867 | while (desc) { | ||
868 | for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { | ||
869 | if (prev_spte == spte) | ||
870 | return desc->sptes[i]; | ||
871 | prev_spte = desc->sptes[i]; | ||
872 | } | ||
873 | desc = desc->more; | ||
874 | } | ||
875 | return NULL; | ||
876 | } | ||
877 | |||
878 | static void | ||
879 | pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, | ||
880 | int i, struct pte_list_desc *prev_desc) | ||
665 | { | 881 | { |
666 | int j; | 882 | int j; |
667 | 883 | ||
668 | for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) | 884 | for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j) |
669 | ; | 885 | ; |
670 | desc->sptes[i] = desc->sptes[j]; | 886 | desc->sptes[i] = desc->sptes[j]; |
671 | desc->sptes[j] = NULL; | 887 | desc->sptes[j] = NULL; |
672 | if (j != 0) | 888 | if (j != 0) |
673 | return; | 889 | return; |
674 | if (!prev_desc && !desc->more) | 890 | if (!prev_desc && !desc->more) |
675 | *rmapp = (unsigned long)desc->sptes[0]; | 891 | *pte_list = (unsigned long)desc->sptes[0]; |
676 | else | 892 | else |
677 | if (prev_desc) | 893 | if (prev_desc) |
678 | prev_desc->more = desc->more; | 894 | prev_desc->more = desc->more; |
679 | else | 895 | else |
680 | *rmapp = (unsigned long)desc->more | 1; | 896 | *pte_list = (unsigned long)desc->more | 1; |
681 | mmu_free_rmap_desc(desc); | 897 | mmu_free_pte_list_desc(desc); |
682 | } | 898 | } |
683 | 899 | ||
684 | static void rmap_remove(struct kvm *kvm, u64 *spte) | 900 | static void pte_list_remove(u64 *spte, unsigned long *pte_list) |
685 | { | 901 | { |
686 | struct kvm_rmap_desc *desc; | 902 | struct pte_list_desc *desc; |
687 | struct kvm_rmap_desc *prev_desc; | 903 | struct pte_list_desc *prev_desc; |
688 | struct kvm_mmu_page *sp; | ||
689 | gfn_t gfn; | ||
690 | unsigned long *rmapp; | ||
691 | int i; | 904 | int i; |
692 | 905 | ||
693 | sp = page_header(__pa(spte)); | 906 | if (!*pte_list) { |
694 | gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); | 907 | printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte); |
695 | rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); | ||
696 | if (!*rmapp) { | ||
697 | printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte); | ||
698 | BUG(); | 908 | BUG(); |
699 | } else if (!(*rmapp & 1)) { | 909 | } else if (!(*pte_list & 1)) { |
700 | rmap_printk("rmap_remove: %p 1->0\n", spte); | 910 | rmap_printk("pte_list_remove: %p 1->0\n", spte); |
701 | if ((u64 *)*rmapp != spte) { | 911 | if ((u64 *)*pte_list != spte) { |
702 | printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); | 912 | printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte); |
703 | BUG(); | 913 | BUG(); |
704 | } | 914 | } |
705 | *rmapp = 0; | 915 | *pte_list = 0; |
706 | } else { | 916 | } else { |
707 | rmap_printk("rmap_remove: %p many->many\n", spte); | 917 | rmap_printk("pte_list_remove: %p many->many\n", spte); |
708 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 918 | desc = (struct pte_list_desc *)(*pte_list & ~1ul); |
709 | prev_desc = NULL; | 919 | prev_desc = NULL; |
710 | while (desc) { | 920 | while (desc) { |
711 | for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) | 921 | for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) |
712 | if (desc->sptes[i] == spte) { | 922 | if (desc->sptes[i] == spte) { |
713 | rmap_desc_remove_entry(rmapp, | 923 | pte_list_desc_remove_entry(pte_list, |
714 | desc, i, | 924 | desc, i, |
715 | prev_desc); | 925 | prev_desc); |
716 | return; | 926 | return; |
@@ -718,62 +928,80 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
718 | prev_desc = desc; | 928 | prev_desc = desc; |
719 | desc = desc->more; | 929 | desc = desc->more; |
720 | } | 930 | } |
721 | pr_err("rmap_remove: %p many->many\n", spte); | 931 | pr_err("pte_list_remove: %p many->many\n", spte); |
722 | BUG(); | 932 | BUG(); |
723 | } | 933 | } |
724 | } | 934 | } |
725 | 935 | ||
726 | static int set_spte_track_bits(u64 *sptep, u64 new_spte) | 936 | typedef void (*pte_list_walk_fn) (u64 *spte); |
937 | static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) | ||
727 | { | 938 | { |
728 | pfn_t pfn; | 939 | struct pte_list_desc *desc; |
729 | u64 old_spte = *sptep; | 940 | int i; |
730 | 941 | ||
731 | if (!spte_has_volatile_bits(old_spte)) | 942 | if (!*pte_list) |
732 | __set_spte(sptep, new_spte); | 943 | return; |
733 | else | ||
734 | old_spte = __xchg_spte(sptep, new_spte); | ||
735 | 944 | ||
736 | if (!is_rmap_spte(old_spte)) | 945 | if (!(*pte_list & 1)) |
737 | return 0; | 946 | return fn((u64 *)*pte_list); |
738 | 947 | ||
739 | pfn = spte_to_pfn(old_spte); | 948 | desc = (struct pte_list_desc *)(*pte_list & ~1ul); |
740 | if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) | 949 | while (desc) { |
741 | kvm_set_pfn_accessed(pfn); | 950 | for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) |
742 | if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) | 951 | fn(desc->sptes[i]); |
743 | kvm_set_pfn_dirty(pfn); | 952 | desc = desc->more; |
744 | return 1; | 953 | } |
745 | } | 954 | } |
746 | 955 | ||
747 | static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) | 956 | /* |
957 | * Take gfn and return the reverse mapping to it. | ||
958 | */ | ||
959 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) | ||
748 | { | 960 | { |
749 | if (set_spte_track_bits(sptep, new_spte)) | 961 | struct kvm_memory_slot *slot; |
750 | rmap_remove(kvm, sptep); | 962 | struct kvm_lpage_info *linfo; |
963 | |||
964 | slot = gfn_to_memslot(kvm, gfn); | ||
965 | if (likely(level == PT_PAGE_TABLE_LEVEL)) | ||
966 | return &slot->rmap[gfn - slot->base_gfn]; | ||
967 | |||
968 | linfo = lpage_info_slot(gfn, slot, level); | ||
969 | |||
970 | return &linfo->rmap_pde; | ||
971 | } | ||
972 | |||
973 | static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | ||
974 | { | ||
975 | struct kvm_mmu_page *sp; | ||
976 | unsigned long *rmapp; | ||
977 | |||
978 | sp = page_header(__pa(spte)); | ||
979 | kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); | ||
980 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); | ||
981 | return pte_list_add(vcpu, spte, rmapp); | ||
751 | } | 982 | } |
752 | 983 | ||
753 | static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) | 984 | static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) |
754 | { | 985 | { |
755 | struct kvm_rmap_desc *desc; | 986 | return pte_list_next(rmapp, spte); |
756 | u64 *prev_spte; | 987 | } |
757 | int i; | ||
758 | 988 | ||
759 | if (!*rmapp) | 989 | static void rmap_remove(struct kvm *kvm, u64 *spte) |
760 | return NULL; | 990 | { |
761 | else if (!(*rmapp & 1)) { | 991 | struct kvm_mmu_page *sp; |
762 | if (!spte) | 992 | gfn_t gfn; |
763 | return (u64 *)*rmapp; | 993 | unsigned long *rmapp; |
764 | return NULL; | 994 | |
765 | } | 995 | sp = page_header(__pa(spte)); |
766 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 996 | gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); |
767 | prev_spte = NULL; | 997 | rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); |
768 | while (desc) { | 998 | pte_list_remove(spte, rmapp); |
769 | for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { | 999 | } |
770 | if (prev_spte == spte) | 1000 | |
771 | return desc->sptes[i]; | 1001 | static void drop_spte(struct kvm *kvm, u64 *sptep) |
772 | prev_spte = desc->sptes[i]; | 1002 | { |
773 | } | 1003 | if (mmu_spte_clear_track_bits(sptep)) |
774 | desc = desc->more; | 1004 | rmap_remove(kvm, sptep); |
775 | } | ||
776 | return NULL; | ||
777 | } | 1005 | } |
778 | 1006 | ||
779 | static int rmap_write_protect(struct kvm *kvm, u64 gfn) | 1007 | static int rmap_write_protect(struct kvm *kvm, u64 gfn) |
@@ -790,7 +1018,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
790 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 1018 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
791 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); | 1019 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); |
792 | if (is_writable_pte(*spte)) { | 1020 | if (is_writable_pte(*spte)) { |
793 | update_spte(spte, *spte & ~PT_WRITABLE_MASK); | 1021 | mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); |
794 | write_protected = 1; | 1022 | write_protected = 1; |
795 | } | 1023 | } |
796 | spte = rmap_next(kvm, rmapp, spte); | 1024 | spte = rmap_next(kvm, rmapp, spte); |
@@ -807,8 +1035,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
807 | BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); | 1035 | BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); |
808 | pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); | 1036 | pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); |
809 | if (is_writable_pte(*spte)) { | 1037 | if (is_writable_pte(*spte)) { |
810 | drop_spte(kvm, spte, | 1038 | drop_spte(kvm, spte); |
811 | shadow_trap_nonpresent_pte); | ||
812 | --kvm->stat.lpages; | 1039 | --kvm->stat.lpages; |
813 | spte = NULL; | 1040 | spte = NULL; |
814 | write_protected = 1; | 1041 | write_protected = 1; |
@@ -829,7 +1056,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
829 | while ((spte = rmap_next(kvm, rmapp, NULL))) { | 1056 | while ((spte = rmap_next(kvm, rmapp, NULL))) { |
830 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 1057 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
831 | rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); | 1058 | rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); |
832 | drop_spte(kvm, spte, shadow_trap_nonpresent_pte); | 1059 | drop_spte(kvm, spte); |
833 | need_tlb_flush = 1; | 1060 | need_tlb_flush = 1; |
834 | } | 1061 | } |
835 | return need_tlb_flush; | 1062 | return need_tlb_flush; |
@@ -851,7 +1078,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
851 | rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); | 1078 | rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); |
852 | need_flush = 1; | 1079 | need_flush = 1; |
853 | if (pte_write(*ptep)) { | 1080 | if (pte_write(*ptep)) { |
854 | drop_spte(kvm, spte, shadow_trap_nonpresent_pte); | 1081 | drop_spte(kvm, spte); |
855 | spte = rmap_next(kvm, rmapp, NULL); | 1082 | spte = rmap_next(kvm, rmapp, NULL); |
856 | } else { | 1083 | } else { |
857 | new_spte = *spte &~ (PT64_BASE_ADDR_MASK); | 1084 | new_spte = *spte &~ (PT64_BASE_ADDR_MASK); |
@@ -860,7 +1087,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
860 | new_spte &= ~PT_WRITABLE_MASK; | 1087 | new_spte &= ~PT_WRITABLE_MASK; |
861 | new_spte &= ~SPTE_HOST_WRITEABLE; | 1088 | new_spte &= ~SPTE_HOST_WRITEABLE; |
862 | new_spte &= ~shadow_accessed_mask; | 1089 | new_spte &= ~shadow_accessed_mask; |
863 | set_spte_track_bits(spte, new_spte); | 1090 | mmu_spte_clear_track_bits(spte); |
1091 | mmu_spte_set(spte, new_spte); | ||
864 | spte = rmap_next(kvm, rmapp, spte); | 1092 | spte = rmap_next(kvm, rmapp, spte); |
865 | } | 1093 | } |
866 | } | 1094 | } |
@@ -1032,151 +1260,89 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) | |||
1032 | percpu_counter_add(&kvm_total_used_mmu_pages, nr); | 1260 | percpu_counter_add(&kvm_total_used_mmu_pages, nr); |
1033 | } | 1261 | } |
1034 | 1262 | ||
1035 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1263 | /* |
1264 | * Remove the sp from the shadow page cache; after calling it, | ||
1265 | * the sp can no longer be found in the cache, but the shadow | ||
1266 | * page table is still valid. | ||
1267 | * It must be called under the protection of the mmu lock. | ||
1268 | */ | ||
1269 | static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp) | ||
1036 | { | 1270 | { |
1037 | ASSERT(is_empty_shadow_page(sp->spt)); | 1271 | ASSERT(is_empty_shadow_page(sp->spt)); |
1038 | hlist_del(&sp->hash_link); | 1272 | hlist_del(&sp->hash_link); |
1039 | list_del(&sp->link); | ||
1040 | free_page((unsigned long)sp->spt); | ||
1041 | if (!sp->role.direct) | 1273 | if (!sp->role.direct) |
1042 | free_page((unsigned long)sp->gfns); | 1274 | free_page((unsigned long)sp->gfns); |
1043 | kmem_cache_free(mmu_page_header_cache, sp); | ||
1044 | kvm_mod_used_mmu_pages(kvm, -1); | ||
1045 | } | 1275 | } |
1046 | 1276 | ||
1047 | static unsigned kvm_page_table_hashfn(gfn_t gfn) | 1277 | /* |
1278 | * Free the shadow page table and the sp; this can be done | ||
1279 | * outside the protection of the mmu lock. | ||
1280 | */ | ||
1281 | static void kvm_mmu_free_page(struct kvm_mmu_page *sp) | ||
1048 | { | 1282 | { |
1049 | return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); | 1283 | list_del(&sp->link); |
1284 | free_page((unsigned long)sp->spt); | ||
1285 | kmem_cache_free(mmu_page_header_cache, sp); | ||
1050 | } | 1286 | } |
1051 | 1287 | ||
1052 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | 1288 | static unsigned kvm_page_table_hashfn(gfn_t gfn) |
1053 | u64 *parent_pte, int direct) | ||
1054 | { | 1289 | { |
1055 | struct kvm_mmu_page *sp; | 1290 | return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); |
1056 | |||
1057 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); | ||
1058 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
1059 | if (!direct) | ||
1060 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, | ||
1061 | PAGE_SIZE); | ||
1062 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | ||
1063 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | ||
1064 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); | ||
1065 | sp->multimapped = 0; | ||
1066 | sp->parent_pte = parent_pte; | ||
1067 | kvm_mod_used_mmu_pages(vcpu->kvm, +1); | ||
1068 | return sp; | ||
1069 | } | 1291 | } |
1070 | 1292 | ||
1071 | static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, | 1293 | static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, |
1072 | struct kvm_mmu_page *sp, u64 *parent_pte) | 1294 | struct kvm_mmu_page *sp, u64 *parent_pte) |
1073 | { | 1295 | { |
1074 | struct kvm_pte_chain *pte_chain; | ||
1075 | struct hlist_node *node; | ||
1076 | int i; | ||
1077 | |||
1078 | if (!parent_pte) | 1296 | if (!parent_pte) |
1079 | return; | 1297 | return; |
1080 | if (!sp->multimapped) { | ||
1081 | u64 *old = sp->parent_pte; | ||
1082 | 1298 | ||
1083 | if (!old) { | 1299 | pte_list_add(vcpu, parent_pte, &sp->parent_ptes); |
1084 | sp->parent_pte = parent_pte; | ||
1085 | return; | ||
1086 | } | ||
1087 | sp->multimapped = 1; | ||
1088 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
1089 | INIT_HLIST_HEAD(&sp->parent_ptes); | ||
1090 | hlist_add_head(&pte_chain->link, &sp->parent_ptes); | ||
1091 | pte_chain->parent_ptes[0] = old; | ||
1092 | } | ||
1093 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { | ||
1094 | if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) | ||
1095 | continue; | ||
1096 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) | ||
1097 | if (!pte_chain->parent_ptes[i]) { | ||
1098 | pte_chain->parent_ptes[i] = parent_pte; | ||
1099 | return; | ||
1100 | } | ||
1101 | } | ||
1102 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
1103 | BUG_ON(!pte_chain); | ||
1104 | hlist_add_head(&pte_chain->link, &sp->parent_ptes); | ||
1105 | pte_chain->parent_ptes[0] = parent_pte; | ||
1106 | } | 1300 | } |
1107 | 1301 | ||
1108 | static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, | 1302 | static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, |
1109 | u64 *parent_pte) | 1303 | u64 *parent_pte) |
1110 | { | 1304 | { |
1111 | struct kvm_pte_chain *pte_chain; | 1305 | pte_list_remove(parent_pte, &sp->parent_ptes); |
1112 | struct hlist_node *node; | ||
1113 | int i; | ||
1114 | |||
1115 | if (!sp->multimapped) { | ||
1116 | BUG_ON(sp->parent_pte != parent_pte); | ||
1117 | sp->parent_pte = NULL; | ||
1118 | return; | ||
1119 | } | ||
1120 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
1121 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
1122 | if (!pte_chain->parent_ptes[i]) | ||
1123 | break; | ||
1124 | if (pte_chain->parent_ptes[i] != parent_pte) | ||
1125 | continue; | ||
1126 | while (i + 1 < NR_PTE_CHAIN_ENTRIES | ||
1127 | && pte_chain->parent_ptes[i + 1]) { | ||
1128 | pte_chain->parent_ptes[i] | ||
1129 | = pte_chain->parent_ptes[i + 1]; | ||
1130 | ++i; | ||
1131 | } | ||
1132 | pte_chain->parent_ptes[i] = NULL; | ||
1133 | if (i == 0) { | ||
1134 | hlist_del(&pte_chain->link); | ||
1135 | mmu_free_pte_chain(pte_chain); | ||
1136 | if (hlist_empty(&sp->parent_ptes)) { | ||
1137 | sp->multimapped = 0; | ||
1138 | sp->parent_pte = NULL; | ||
1139 | } | ||
1140 | } | ||
1141 | return; | ||
1142 | } | ||
1143 | BUG(); | ||
1144 | } | 1306 | } |
1145 | 1307 | ||
1146 | static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) | 1308 | static void drop_parent_pte(struct kvm_mmu_page *sp, |
1309 | u64 *parent_pte) | ||
1147 | { | 1310 | { |
1148 | struct kvm_pte_chain *pte_chain; | 1311 | mmu_page_remove_parent_pte(sp, parent_pte); |
1149 | struct hlist_node *node; | 1312 | mmu_spte_clear_no_track(parent_pte); |
1150 | struct kvm_mmu_page *parent_sp; | 1313 | } |
1151 | int i; | ||
1152 | |||
1153 | if (!sp->multimapped && sp->parent_pte) { | ||
1154 | parent_sp = page_header(__pa(sp->parent_pte)); | ||
1155 | fn(parent_sp, sp->parent_pte); | ||
1156 | return; | ||
1157 | } | ||
1158 | |||
1159 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
1160 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
1161 | u64 *spte = pte_chain->parent_ptes[i]; | ||
1162 | 1314 | ||
1163 | if (!spte) | 1315 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, |
1164 | break; | 1316 | u64 *parent_pte, int direct) |
1165 | parent_sp = page_header(__pa(spte)); | 1317 | { |
1166 | fn(parent_sp, spte); | 1318 | struct kvm_mmu_page *sp; |
1167 | } | 1319 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, |
1320 | sizeof *sp); | ||
1321 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
1322 | if (!direct) | ||
1323 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, | ||
1324 | PAGE_SIZE); | ||
1325 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | ||
1326 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | ||
1327 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); | ||
1328 | sp->parent_ptes = 0; | ||
1329 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | ||
1330 | kvm_mod_used_mmu_pages(vcpu->kvm, +1); | ||
1331 | return sp; | ||
1168 | } | 1332 | } |
1169 | 1333 | ||
1170 | static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); | 1334 | static void mark_unsync(u64 *spte); |
1171 | static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) | 1335 | static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) |
1172 | { | 1336 | { |
1173 | mmu_parent_walk(sp, mark_unsync); | 1337 | pte_list_walk(&sp->parent_ptes, mark_unsync); |
1174 | } | 1338 | } |
1175 | 1339 | ||
1176 | static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) | 1340 | static void mark_unsync(u64 *spte) |
1177 | { | 1341 | { |
1342 | struct kvm_mmu_page *sp; | ||
1178 | unsigned int index; | 1343 | unsigned int index; |
1179 | 1344 | ||
1345 | sp = page_header(__pa(spte)); | ||
1180 | index = spte - sp->spt; | 1346 | index = spte - sp->spt; |
1181 | if (__test_and_set_bit(index, sp->unsync_child_bitmap)) | 1347 | if (__test_and_set_bit(index, sp->unsync_child_bitmap)) |
1182 | return; | 1348 | return; |
@@ -1185,15 +1351,6 @@ static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) | |||
1185 | kvm_mmu_mark_parents_unsync(sp); | 1351 | kvm_mmu_mark_parents_unsync(sp); |
1186 | } | 1352 | } |
1187 | 1353 | ||
1188 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | ||
1189 | struct kvm_mmu_page *sp) | ||
1190 | { | ||
1191 | int i; | ||
1192 | |||
1193 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
1194 | sp->spt[i] = shadow_trap_nonpresent_pte; | ||
1195 | } | ||
1196 | |||
1197 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, | 1354 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, |
1198 | struct kvm_mmu_page *sp) | 1355 | struct kvm_mmu_page *sp) |
1199 | { | 1356 | { |
@@ -1475,6 +1632,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, | |||
1475 | } | 1632 | } |
1476 | } | 1633 | } |
1477 | 1634 | ||
1635 | static void init_shadow_page_table(struct kvm_mmu_page *sp) | ||
1636 | { | ||
1637 | int i; | ||
1638 | |||
1639 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
1640 | sp->spt[i] = 0ull; | ||
1641 | } | ||
1642 | |||
1478 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | 1643 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, |
1479 | gfn_t gfn, | 1644 | gfn_t gfn, |
1480 | gva_t gaddr, | 1645 | gva_t gaddr, |
@@ -1537,10 +1702,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1537 | 1702 | ||
1538 | account_shadowed(vcpu->kvm, gfn); | 1703 | account_shadowed(vcpu->kvm, gfn); |
1539 | } | 1704 | } |
1540 | if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) | 1705 | init_shadow_page_table(sp); |
1541 | vcpu->arch.mmu.prefetch_page(vcpu, sp); | ||
1542 | else | ||
1543 | nonpaging_prefetch_page(vcpu, sp); | ||
1544 | trace_kvm_mmu_get_page(sp, true); | 1706 | trace_kvm_mmu_get_page(sp, true); |
1545 | return sp; | 1707 | return sp; |
1546 | } | 1708 | } |
@@ -1572,21 +1734,28 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) | |||
1572 | if (iterator->level < PT_PAGE_TABLE_LEVEL) | 1734 | if (iterator->level < PT_PAGE_TABLE_LEVEL) |
1573 | return false; | 1735 | return false; |
1574 | 1736 | ||
1575 | if (iterator->level == PT_PAGE_TABLE_LEVEL) | ||
1576 | if (is_large_pte(*iterator->sptep)) | ||
1577 | return false; | ||
1578 | |||
1579 | iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); | 1737 | iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); |
1580 | iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; | 1738 | iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; |
1581 | return true; | 1739 | return true; |
1582 | } | 1740 | } |
1583 | 1741 | ||
1584 | static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) | 1742 | static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, |
1743 | u64 spte) | ||
1585 | { | 1744 | { |
1586 | iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; | 1745 | if (is_last_spte(spte, iterator->level)) { |
1746 | iterator->level = 0; | ||
1747 | return; | ||
1748 | } | ||
1749 | |||
1750 | iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; | ||
1587 | --iterator->level; | 1751 | --iterator->level; |
1588 | } | 1752 | } |
1589 | 1753 | ||
1754 | static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) | ||
1755 | { | ||
1756 | return __shadow_walk_next(iterator, *iterator->sptep); | ||
1757 | } | ||
1758 | |||
1590 | static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) | 1759 | static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) |
1591 | { | 1760 | { |
1592 | u64 spte; | 1761 | u64 spte; |
@@ -1594,13 +1763,13 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) | |||
1594 | spte = __pa(sp->spt) | 1763 | spte = __pa(sp->spt) |
1595 | | PT_PRESENT_MASK | PT_ACCESSED_MASK | 1764 | | PT_PRESENT_MASK | PT_ACCESSED_MASK |
1596 | | PT_WRITABLE_MASK | PT_USER_MASK; | 1765 | | PT_WRITABLE_MASK | PT_USER_MASK; |
1597 | __set_spte(sptep, spte); | 1766 | mmu_spte_set(sptep, spte); |
1598 | } | 1767 | } |
1599 | 1768 | ||
1600 | static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) | 1769 | static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) |
1601 | { | 1770 | { |
1602 | if (is_large_pte(*sptep)) { | 1771 | if (is_large_pte(*sptep)) { |
1603 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); | 1772 | drop_spte(vcpu->kvm, sptep); |
1604 | kvm_flush_remote_tlbs(vcpu->kvm); | 1773 | kvm_flush_remote_tlbs(vcpu->kvm); |
1605 | } | 1774 | } |
1606 | } | 1775 | } |
@@ -1622,38 +1791,39 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1622 | if (child->role.access == direct_access) | 1791 | if (child->role.access == direct_access) |
1623 | return; | 1792 | return; |
1624 | 1793 | ||
1625 | mmu_page_remove_parent_pte(child, sptep); | 1794 | drop_parent_pte(child, sptep); |
1626 | __set_spte(sptep, shadow_trap_nonpresent_pte); | ||
1627 | kvm_flush_remote_tlbs(vcpu->kvm); | 1795 | kvm_flush_remote_tlbs(vcpu->kvm); |
1628 | } | 1796 | } |
1629 | } | 1797 | } |
1630 | 1798 | ||
1799 | static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, | ||
1800 | u64 *spte) | ||
1801 | { | ||
1802 | u64 pte; | ||
1803 | struct kvm_mmu_page *child; | ||
1804 | |||
1805 | pte = *spte; | ||
1806 | if (is_shadow_present_pte(pte)) { | ||
1807 | if (is_last_spte(pte, sp->role.level)) | ||
1808 | drop_spte(kvm, spte); | ||
1809 | else { | ||
1810 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
1811 | drop_parent_pte(child, spte); | ||
1812 | } | ||
1813 | } else if (is_mmio_spte(pte)) | ||
1814 | mmu_spte_clear_no_track(spte); | ||
1815 | |||
1816 | if (is_large_pte(pte)) | ||
1817 | --kvm->stat.lpages; | ||
1818 | } | ||
1819 | |||
1631 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | 1820 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, |
1632 | struct kvm_mmu_page *sp) | 1821 | struct kvm_mmu_page *sp) |
1633 | { | 1822 | { |
1634 | unsigned i; | 1823 | unsigned i; |
1635 | u64 *pt; | 1824 | |
1636 | u64 ent; | 1825 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) |
1637 | 1826 | mmu_page_zap_pte(kvm, sp, sp->spt + i); | |
1638 | pt = sp->spt; | ||
1639 | |||
1640 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
1641 | ent = pt[i]; | ||
1642 | |||
1643 | if (is_shadow_present_pte(ent)) { | ||
1644 | if (!is_last_spte(ent, sp->role.level)) { | ||
1645 | ent &= PT64_BASE_ADDR_MASK; | ||
1646 | mmu_page_remove_parent_pte(page_header(ent), | ||
1647 | &pt[i]); | ||
1648 | } else { | ||
1649 | if (is_large_pte(ent)) | ||
1650 | --kvm->stat.lpages; | ||
1651 | drop_spte(kvm, &pt[i], | ||
1652 | shadow_trap_nonpresent_pte); | ||
1653 | } | ||
1654 | } | ||
1655 | pt[i] = shadow_trap_nonpresent_pte; | ||
1656 | } | ||
1657 | } | 1827 | } |
1658 | 1828 | ||
1659 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) | 1829 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) |
@@ -1674,20 +1844,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
1674 | { | 1844 | { |
1675 | u64 *parent_pte; | 1845 | u64 *parent_pte; |
1676 | 1846 | ||
1677 | while (sp->multimapped || sp->parent_pte) { | 1847 | while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) |
1678 | if (!sp->multimapped) | 1848 | drop_parent_pte(sp, parent_pte); |
1679 | parent_pte = sp->parent_pte; | ||
1680 | else { | ||
1681 | struct kvm_pte_chain *chain; | ||
1682 | |||
1683 | chain = container_of(sp->parent_ptes.first, | ||
1684 | struct kvm_pte_chain, link); | ||
1685 | parent_pte = chain->parent_ptes[0]; | ||
1686 | } | ||
1687 | BUG_ON(!parent_pte); | ||
1688 | kvm_mmu_put_page(sp, parent_pte); | ||
1689 | __set_spte(parent_pte, shadow_trap_nonpresent_pte); | ||
1690 | } | ||
1691 | } | 1849 | } |
1692 | 1850 | ||
1693 | static int mmu_zap_unsync_children(struct kvm *kvm, | 1851 | static int mmu_zap_unsync_children(struct kvm *kvm, |
@@ -1734,6 +1892,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | |||
1734 | /* Count self */ | 1892 | /* Count self */ |
1735 | ret++; | 1893 | ret++; |
1736 | list_move(&sp->link, invalid_list); | 1894 | list_move(&sp->link, invalid_list); |
1895 | kvm_mod_used_mmu_pages(kvm, -1); | ||
1737 | } else { | 1896 | } else { |
1738 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | 1897 | list_move(&sp->link, &kvm->arch.active_mmu_pages); |
1739 | kvm_reload_remote_mmus(kvm); | 1898 | kvm_reload_remote_mmus(kvm); |
@@ -1744,6 +1903,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | |||
1744 | return ret; | 1903 | return ret; |
1745 | } | 1904 | } |
1746 | 1905 | ||
1906 | static void kvm_mmu_isolate_pages(struct list_head *invalid_list) | ||
1907 | { | ||
1908 | struct kvm_mmu_page *sp; | ||
1909 | |||
1910 | list_for_each_entry(sp, invalid_list, link) | ||
1911 | kvm_mmu_isolate_page(sp); | ||
1912 | } | ||
1913 | |||
1914 | static void free_pages_rcu(struct rcu_head *head) | ||
1915 | { | ||
1916 | struct kvm_mmu_page *next, *sp; | ||
1917 | |||
1918 | sp = container_of(head, struct kvm_mmu_page, rcu); | ||
1919 | while (sp) { | ||
1920 | if (!list_empty(&sp->link)) | ||
1921 | next = list_first_entry(&sp->link, | ||
1922 | struct kvm_mmu_page, link); | ||
1923 | else | ||
1924 | next = NULL; | ||
1925 | kvm_mmu_free_page(sp); | ||
1926 | sp = next; | ||
1927 | } | ||
1928 | } | ||
1929 | |||
1747 | static void kvm_mmu_commit_zap_page(struct kvm *kvm, | 1930 | static void kvm_mmu_commit_zap_page(struct kvm *kvm, |
1748 | struct list_head *invalid_list) | 1931 | struct list_head *invalid_list) |
1749 | { | 1932 | { |
@@ -1754,10 +1937,21 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, | |||
1754 | 1937 | ||
1755 | kvm_flush_remote_tlbs(kvm); | 1938 | kvm_flush_remote_tlbs(kvm); |
1756 | 1939 | ||
1940 | if (atomic_read(&kvm->arch.reader_counter)) { | ||
1941 | kvm_mmu_isolate_pages(invalid_list); | ||
1942 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); | ||
1943 | list_del_init(invalid_list); | ||
1944 | |||
1945 | trace_kvm_mmu_delay_free_pages(sp); | ||
1946 | call_rcu(&sp->rcu, free_pages_rcu); | ||
1947 | return; | ||
1948 | } | ||
1949 | |||
1757 | do { | 1950 | do { |
1758 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); | 1951 | sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); |
1759 | WARN_ON(!sp->role.invalid || sp->root_count); | 1952 | WARN_ON(!sp->role.invalid || sp->root_count); |
1760 | kvm_mmu_free_page(kvm, sp); | 1953 | kvm_mmu_isolate_page(sp); |
1954 | kvm_mmu_free_page(sp); | ||
1761 | } while (!list_empty(invalid_list)); | 1955 | } while (!list_empty(invalid_list)); |
1762 | 1956 | ||
1763 | } | 1957 | } |
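
For context on the reader_counter branch above: when lockless walkers may still be traversing a zapped shadow page, freeing is deferred past an RCU grace period. A minimal sketch of that deferral pattern, using the real call_rcu() API but a hypothetical struct, might look like this:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct zapped_page {			/* hypothetical stand-in for kvm_mmu_page */
	struct rcu_head rcu;
	void *table;
};

static void zapped_page_free(struct rcu_head *head)
{
	struct zapped_page *zp = container_of(head, struct zapped_page, rcu);

	kfree(zp->table);
	kfree(zp);
}

static void retire_zapped_page(struct zapped_page *zp, bool lockless_readers)
{
	if (lockless_readers)
		call_rcu(&zp->rcu, zapped_page_free);	/* free after a grace period */
	else
		zapped_page_free(&zp->rcu);		/* no readers: free immediately */
}
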
@@ -1783,8 +1977,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) | |||
1783 | page = container_of(kvm->arch.active_mmu_pages.prev, | 1977 | page = container_of(kvm->arch.active_mmu_pages.prev, |
1784 | struct kvm_mmu_page, link); | 1978 | struct kvm_mmu_page, link); |
1785 | kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); | 1979 | kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); |
1786 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | ||
1787 | } | 1980 | } |
1981 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | ||
1788 | goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; | 1982 | goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; |
1789 | } | 1983 | } |
1790 | 1984 | ||
@@ -1833,20 +2027,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | |||
1833 | __set_bit(slot, sp->slot_bitmap); | 2027 | __set_bit(slot, sp->slot_bitmap); |
1834 | } | 2028 | } |
1835 | 2029 | ||
1836 | static void mmu_convert_notrap(struct kvm_mmu_page *sp) | ||
1837 | { | ||
1838 | int i; | ||
1839 | u64 *pt = sp->spt; | ||
1840 | |||
1841 | if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) | ||
1842 | return; | ||
1843 | |||
1844 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
1845 | if (pt[i] == shadow_notrap_nonpresent_pte) | ||
1846 | __set_spte(&pt[i], shadow_trap_nonpresent_pte); | ||
1847 | } | ||
1848 | } | ||
1849 | |||
1850 | /* | 2030 | /* |
1851 | * The function is based on mtrr_type_lookup() in | 2031 | * The function is based on mtrr_type_lookup() in |
1852 | * arch/x86/kernel/cpu/mtrr/generic.c | 2032 | * arch/x86/kernel/cpu/mtrr/generic.c |
@@ -1959,7 +2139,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
1959 | sp->unsync = 1; | 2139 | sp->unsync = 1; |
1960 | 2140 | ||
1961 | kvm_mmu_mark_parents_unsync(sp); | 2141 | kvm_mmu_mark_parents_unsync(sp); |
1962 | mmu_convert_notrap(sp); | ||
1963 | } | 2142 | } |
1964 | 2143 | ||
1965 | static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) | 2144 | static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) |
@@ -2002,13 +2181,16 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
2002 | 2181 | ||
2003 | static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | 2182 | static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
2004 | unsigned pte_access, int user_fault, | 2183 | unsigned pte_access, int user_fault, |
2005 | int write_fault, int dirty, int level, | 2184 | int write_fault, int level, |
2006 | gfn_t gfn, pfn_t pfn, bool speculative, | 2185 | gfn_t gfn, pfn_t pfn, bool speculative, |
2007 | bool can_unsync, bool host_writable) | 2186 | bool can_unsync, bool host_writable) |
2008 | { | 2187 | { |
2009 | u64 spte, entry = *sptep; | 2188 | u64 spte, entry = *sptep; |
2010 | int ret = 0; | 2189 | int ret = 0; |
2011 | 2190 | ||
2191 | if (set_mmio_spte(sptep, gfn, pfn, pte_access)) | ||
2192 | return 0; | ||
2193 | |||
2012 | /* | 2194 | /* |
2013 | * We don't set the accessed bit, since we sometimes want to see | 2195 | * We don't set the accessed bit, since we sometimes want to see |
2014 | * whether the guest actually used the pte (in order to detect | 2196 | * whether the guest actually used the pte (in order to detect |
@@ -2017,8 +2199,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2017 | spte = PT_PRESENT_MASK; | 2199 | spte = PT_PRESENT_MASK; |
2018 | if (!speculative) | 2200 | if (!speculative) |
2019 | spte |= shadow_accessed_mask; | 2201 | spte |= shadow_accessed_mask; |
2020 | if (!dirty) | 2202 | |
2021 | pte_access &= ~ACC_WRITE_MASK; | ||
2022 | if (pte_access & ACC_EXEC_MASK) | 2203 | if (pte_access & ACC_EXEC_MASK) |
2023 | spte |= shadow_x_mask; | 2204 | spte |= shadow_x_mask; |
2024 | else | 2205 | else |
@@ -2045,15 +2226,24 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2045 | if (level > PT_PAGE_TABLE_LEVEL && | 2226 | if (level > PT_PAGE_TABLE_LEVEL && |
2046 | has_wrprotected_page(vcpu->kvm, gfn, level)) { | 2227 | has_wrprotected_page(vcpu->kvm, gfn, level)) { |
2047 | ret = 1; | 2228 | ret = 1; |
2048 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); | 2229 | drop_spte(vcpu->kvm, sptep); |
2049 | goto done; | 2230 | goto done; |
2050 | } | 2231 | } |
2051 | 2232 | ||
2052 | spte |= PT_WRITABLE_MASK; | 2233 | spte |= PT_WRITABLE_MASK; |
2053 | 2234 | ||
2054 | if (!vcpu->arch.mmu.direct_map | 2235 | if (!vcpu->arch.mmu.direct_map |
2055 | && !(pte_access & ACC_WRITE_MASK)) | 2236 | && !(pte_access & ACC_WRITE_MASK)) { |
2056 | spte &= ~PT_USER_MASK; | 2237 | spte &= ~PT_USER_MASK; |
2238 | /* | ||
2239 | * If we converted a user page to a kernel page so | ||
2240 | * that the kernel can write to it when cr0.wp=0, we | ||
2241 | * should prevent the kernel from executing it if | ||
2242 | * SMEP is enabled. | ||
2243 | */ | ||
2244 | if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) | ||
2245 | spte |= PT64_NX_MASK; | ||
2246 | } | ||
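
To make the cr0.wp=0/SMEP rule above concrete, here is a small standalone sketch; the mask values are illustrative stand-ins, not the kernel's PTE bit definitions:

#include <stdint.h>
#include <stdbool.h>

#define SPTE_USER_MASK (1ULL << 2)	/* stand-in for PT_USER_MASK */
#define SPTE_NX_MASK   (1ULL << 63)	/* stand-in for PT64_NX_MASK */

/*
 * A guest-read-only page that the kernel must still write (cr0.wp=0)
 * loses its USER bit; with SMEP enabled it must also become NX, so the
 * kernel cannot execute what is now a kernel-only mapping.
 */
static uint64_t demote_user_spte(uint64_t spte, bool smep_enabled)
{
	spte &= ~SPTE_USER_MASK;
	if (smep_enabled)
		spte |= SPTE_NX_MASK;
	return spte;
}
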
2057 | 2247 | ||
2058 | /* | 2248 | /* |
2059 | * Optimization: for pte sync, if spte was writable the hash | 2249 | * Optimization: for pte sync, if spte was writable the hash |
@@ -2078,7 +2268,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2078 | mark_page_dirty(vcpu->kvm, gfn); | 2268 | mark_page_dirty(vcpu->kvm, gfn); |
2079 | 2269 | ||
2080 | set_pte: | 2270 | set_pte: |
2081 | update_spte(sptep, spte); | 2271 | mmu_spte_update(sptep, spte); |
2082 | /* | 2272 | /* |
2083 | * If we overwrite a writable spte with a read-only one we | 2273 | * If we overwrite a writable spte with a read-only one we |
2084 | * should flush remote TLBs. Otherwise rmap_write_protect | 2274 | * should flush remote TLBs. Otherwise rmap_write_protect |
@@ -2093,8 +2283,8 @@ done: | |||
2093 | 2283 | ||
2094 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | 2284 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
2095 | unsigned pt_access, unsigned pte_access, | 2285 | unsigned pt_access, unsigned pte_access, |
2096 | int user_fault, int write_fault, int dirty, | 2286 | int user_fault, int write_fault, |
2097 | int *ptwrite, int level, gfn_t gfn, | 2287 | int *emulate, int level, gfn_t gfn, |
2098 | pfn_t pfn, bool speculative, | 2288 | pfn_t pfn, bool speculative, |
2099 | bool host_writable) | 2289 | bool host_writable) |
2100 | { | 2290 | { |
@@ -2117,26 +2307,28 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2117 | u64 pte = *sptep; | 2307 | u64 pte = *sptep; |
2118 | 2308 | ||
2119 | child = page_header(pte & PT64_BASE_ADDR_MASK); | 2309 | child = page_header(pte & PT64_BASE_ADDR_MASK); |
2120 | mmu_page_remove_parent_pte(child, sptep); | 2310 | drop_parent_pte(child, sptep); |
2121 | __set_spte(sptep, shadow_trap_nonpresent_pte); | ||
2122 | kvm_flush_remote_tlbs(vcpu->kvm); | 2311 | kvm_flush_remote_tlbs(vcpu->kvm); |
2123 | } else if (pfn != spte_to_pfn(*sptep)) { | 2312 | } else if (pfn != spte_to_pfn(*sptep)) { |
2124 | pgprintk("hfn old %llx new %llx\n", | 2313 | pgprintk("hfn old %llx new %llx\n", |
2125 | spte_to_pfn(*sptep), pfn); | 2314 | spte_to_pfn(*sptep), pfn); |
2126 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); | 2315 | drop_spte(vcpu->kvm, sptep); |
2127 | kvm_flush_remote_tlbs(vcpu->kvm); | 2316 | kvm_flush_remote_tlbs(vcpu->kvm); |
2128 | } else | 2317 | } else |
2129 | was_rmapped = 1; | 2318 | was_rmapped = 1; |
2130 | } | 2319 | } |
2131 | 2320 | ||
2132 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, | 2321 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, |
2133 | dirty, level, gfn, pfn, speculative, true, | 2322 | level, gfn, pfn, speculative, true, |
2134 | host_writable)) { | 2323 | host_writable)) { |
2135 | if (write_fault) | 2324 | if (write_fault) |
2136 | *ptwrite = 1; | 2325 | *emulate = 1; |
2137 | kvm_mmu_flush_tlb(vcpu); | 2326 | kvm_mmu_flush_tlb(vcpu); |
2138 | } | 2327 | } |
2139 | 2328 | ||
2329 | if (unlikely(is_mmio_spte(*sptep) && emulate)) | ||
2330 | *emulate = 1; | ||
2331 | |||
2140 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); | 2332 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); |
2141 | pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", | 2333 | pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", |
2142 | is_large_pte(*sptep)? "2MB" : "4kB", | 2334 | is_large_pte(*sptep)? "2MB" : "4kB", |
@@ -2145,11 +2337,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2145 | if (!was_rmapped && is_large_pte(*sptep)) | 2337 | if (!was_rmapped && is_large_pte(*sptep)) |
2146 | ++vcpu->kvm->stat.lpages; | 2338 | ++vcpu->kvm->stat.lpages; |
2147 | 2339 | ||
2148 | page_header_update_slot(vcpu->kvm, sptep, gfn); | 2340 | if (is_shadow_present_pte(*sptep)) { |
2149 | if (!was_rmapped) { | 2341 | page_header_update_slot(vcpu->kvm, sptep, gfn); |
2150 | rmap_count = rmap_add(vcpu, sptep, gfn); | 2342 | if (!was_rmapped) { |
2151 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) | 2343 | rmap_count = rmap_add(vcpu, sptep, gfn); |
2152 | rmap_recycle(vcpu, sptep, gfn); | 2344 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) |
2345 | rmap_recycle(vcpu, sptep, gfn); | ||
2346 | } | ||
2153 | } | 2347 | } |
2154 | kvm_release_pfn_clean(pfn); | 2348 | kvm_release_pfn_clean(pfn); |
2155 | if (speculative) { | 2349 | if (speculative) { |
@@ -2170,8 +2364,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
2170 | 2364 | ||
2171 | slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); | 2365 | slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); |
2172 | if (!slot) { | 2366 | if (!slot) { |
2173 | get_page(bad_page); | 2367 | get_page(fault_page); |
2174 | return page_to_pfn(bad_page); | 2368 | return page_to_pfn(fault_page); |
2175 | } | 2369 | } |
2176 | 2370 | ||
2177 | hva = gfn_to_hva_memslot(slot, gfn); | 2371 | hva = gfn_to_hva_memslot(slot, gfn); |
@@ -2198,7 +2392,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, | |||
2198 | 2392 | ||
2199 | for (i = 0; i < ret; i++, gfn++, start++) | 2393 | for (i = 0; i < ret; i++, gfn++, start++) |
2200 | mmu_set_spte(vcpu, start, ACC_ALL, | 2394 | mmu_set_spte(vcpu, start, ACC_ALL, |
2201 | access, 0, 0, 1, NULL, | 2395 | access, 0, 0, NULL, |
2202 | sp->role.level, gfn, | 2396 | sp->role.level, gfn, |
2203 | page_to_pfn(pages[i]), true, true); | 2397 | page_to_pfn(pages[i]), true, true); |
2204 | 2398 | ||
@@ -2217,7 +2411,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, | |||
2217 | spte = sp->spt + i; | 2411 | spte = sp->spt + i; |
2218 | 2412 | ||
2219 | for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { | 2413 | for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { |
2220 | if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { | 2414 | if (is_shadow_present_pte(*spte) || spte == sptep) { |
2221 | if (!start) | 2415 | if (!start) |
2222 | continue; | 2416 | continue; |
2223 | if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) | 2417 | if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) |
@@ -2254,7 +2448,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2254 | { | 2448 | { |
2255 | struct kvm_shadow_walk_iterator iterator; | 2449 | struct kvm_shadow_walk_iterator iterator; |
2256 | struct kvm_mmu_page *sp; | 2450 | struct kvm_mmu_page *sp; |
2257 | int pt_write = 0; | 2451 | int emulate = 0; |
2258 | gfn_t pseudo_gfn; | 2452 | gfn_t pseudo_gfn; |
2259 | 2453 | ||
2260 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { | 2454 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { |
@@ -2262,14 +2456,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2262 | unsigned pte_access = ACC_ALL; | 2456 | unsigned pte_access = ACC_ALL; |
2263 | 2457 | ||
2264 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, | 2458 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, |
2265 | 0, write, 1, &pt_write, | 2459 | 0, write, &emulate, |
2266 | level, gfn, pfn, prefault, map_writable); | 2460 | level, gfn, pfn, prefault, map_writable); |
2267 | direct_pte_prefetch(vcpu, iterator.sptep); | 2461 | direct_pte_prefetch(vcpu, iterator.sptep); |
2268 | ++vcpu->stat.pf_fixed; | 2462 | ++vcpu->stat.pf_fixed; |
2269 | break; | 2463 | break; |
2270 | } | 2464 | } |
2271 | 2465 | ||
2272 | if (*iterator.sptep == shadow_trap_nonpresent_pte) { | 2466 | if (!is_shadow_present_pte(*iterator.sptep)) { |
2273 | u64 base_addr = iterator.addr; | 2467 | u64 base_addr = iterator.addr; |
2274 | 2468 | ||
2275 | base_addr &= PT64_LVL_ADDR_MASK(iterator.level); | 2469 | base_addr &= PT64_LVL_ADDR_MASK(iterator.level); |
@@ -2283,14 +2477,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2283 | return -ENOMEM; | 2477 | return -ENOMEM; |
2284 | } | 2478 | } |
2285 | 2479 | ||
2286 | __set_spte(iterator.sptep, | 2480 | mmu_spte_set(iterator.sptep, |
2287 | __pa(sp->spt) | 2481 | __pa(sp->spt) |
2288 | | PT_PRESENT_MASK | PT_WRITABLE_MASK | 2482 | | PT_PRESENT_MASK | PT_WRITABLE_MASK |
2289 | | shadow_user_mask | shadow_x_mask | 2483 | | shadow_user_mask | shadow_x_mask |
2290 | | shadow_accessed_mask); | 2484 | | shadow_accessed_mask); |
2291 | } | 2485 | } |
2292 | } | 2486 | } |
2293 | return pt_write; | 2487 | return emulate; |
2294 | } | 2488 | } |
2295 | 2489 | ||
2296 | static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) | 2490 | static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) |
@@ -2306,16 +2500,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * | |||
2306 | send_sig_info(SIGBUS, &info, tsk); | 2500 | send_sig_info(SIGBUS, &info, tsk); |
2307 | } | 2501 | } |
2308 | 2502 | ||
2309 | static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) | 2503 | static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) |
2310 | { | 2504 | { |
2311 | kvm_release_pfn_clean(pfn); | 2505 | kvm_release_pfn_clean(pfn); |
2312 | if (is_hwpoison_pfn(pfn)) { | 2506 | if (is_hwpoison_pfn(pfn)) { |
2313 | kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); | 2507 | kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); |
2314 | return 0; | 2508 | return 0; |
2315 | } else if (is_fault_pfn(pfn)) | 2509 | } |
2316 | return -EFAULT; | ||
2317 | 2510 | ||
2318 | return 1; | 2511 | return -EFAULT; |
2319 | } | 2512 | } |
2320 | 2513 | ||
2321 | static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, | 2514 | static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, |
@@ -2360,6 +2553,30 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, | |||
2360 | } | 2553 | } |
2361 | } | 2554 | } |
2362 | 2555 | ||
2556 | static bool mmu_invalid_pfn(pfn_t pfn) | ||
2557 | { | ||
2558 | return unlikely(is_invalid_pfn(pfn)); | ||
2559 | } | ||
2560 | |||
2561 | static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, | ||
2562 | pfn_t pfn, unsigned access, int *ret_val) | ||
2563 | { | ||
2564 | bool ret = true; | ||
2565 | |||
2566 | /* The pfn is invalid; report the error! */ | ||
2567 | if (unlikely(is_invalid_pfn(pfn))) { | ||
2568 | *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); | ||
2569 | goto exit; | ||
2570 | } | ||
2571 | |||
2572 | if (unlikely(is_noslot_pfn(pfn))) | ||
2573 | vcpu_cache_mmio_info(vcpu, gva, gfn, access); | ||
2574 | |||
2575 | ret = false; | ||
2576 | exit: | ||
2577 | return ret; | ||
2578 | } | ||
2579 | |||
2363 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | 2580 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, |
2364 | gva_t gva, pfn_t *pfn, bool write, bool *writable); | 2581 | gva_t gva, pfn_t *pfn, bool write, bool *writable); |
2365 | 2582 | ||
@@ -2394,9 +2611,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, | |||
2394 | if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) | 2611 | if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) |
2395 | return 0; | 2612 | return 0; |
2396 | 2613 | ||
2397 | /* mmio */ | 2614 | if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) |
2398 | if (is_error_pfn(pfn)) | 2615 | return r; |
2399 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); | ||
2400 | 2616 | ||
2401 | spin_lock(&vcpu->kvm->mmu_lock); | 2617 | spin_lock(&vcpu->kvm->mmu_lock); |
2402 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 2618 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
@@ -2623,6 +2839,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2623 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | 2839 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) |
2624 | return; | 2840 | return; |
2625 | 2841 | ||
2842 | vcpu_clear_mmio_info(vcpu, ~0ul); | ||
2626 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); | 2843 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); |
2627 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | 2844 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { |
2628 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2845 | hpa_t root = vcpu->arch.mmu.root_hpa; |
@@ -2667,6 +2884,94 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, | |||
2667 | return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); | 2884 | return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); |
2668 | } | 2885 | } |
2669 | 2886 | ||
2887 | static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) | ||
2888 | { | ||
2889 | if (direct) | ||
2890 | return vcpu_match_mmio_gpa(vcpu, addr); | ||
2891 | |||
2892 | return vcpu_match_mmio_gva(vcpu, addr); | ||
2893 | } | ||
2894 | |||
2895 | |||
2896 | /* | ||
2897 | * On direct hosts, the last spte only allows two states | ||
2898 | * on an mmio page fault: | ||
2899 | * - It is the mmio spte | ||
2900 | * - It is zapped or it is being zapped. | ||
2901 | * | ||
2902 | * This function completely checks the spte when the last spte | ||
2903 | * is not the mmio spte. | ||
2904 | */ | ||
2905 | static bool check_direct_spte_mmio_pf(u64 spte) | ||
2906 | { | ||
2907 | return __check_direct_spte_mmio_pf(spte); | ||
2908 | } | ||
2909 | |||
2910 | static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) | ||
2911 | { | ||
2912 | struct kvm_shadow_walk_iterator iterator; | ||
2913 | u64 spte = 0ull; | ||
2914 | |||
2915 | walk_shadow_page_lockless_begin(vcpu); | ||
2916 | for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) | ||
2917 | if (!is_shadow_present_pte(spte)) | ||
2918 | break; | ||
2919 | walk_shadow_page_lockless_end(vcpu); | ||
2920 | |||
2921 | return spte; | ||
2922 | } | ||
2923 | |||
2924 | /* | ||
2925 | * If it is a real mmio page fault, return 1 and emulate the | ||
2926 | * instruction directly; return 0 to let the CPU fault again on | ||
2927 | * the address; -1 is returned if a bug is detected. | ||
2928 | */ | ||
2929 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) | ||
2930 | { | ||
2931 | u64 spte; | ||
2932 | |||
2933 | if (quickly_check_mmio_pf(vcpu, addr, direct)) | ||
2934 | return 1; | ||
2935 | |||
2936 | spte = walk_shadow_page_get_mmio_spte(vcpu, addr); | ||
2937 | |||
2938 | if (is_mmio_spte(spte)) { | ||
2939 | gfn_t gfn = get_mmio_spte_gfn(spte); | ||
2940 | unsigned access = get_mmio_spte_access(spte); | ||
2941 | |||
2942 | if (direct) | ||
2943 | addr = 0; | ||
2944 | |||
2945 | trace_handle_mmio_page_fault(addr, gfn, access); | ||
2946 | vcpu_cache_mmio_info(vcpu, addr, gfn, access); | ||
2947 | return 1; | ||
2948 | } | ||
2949 | |||
2950 | /* | ||
2951 | * It's ok if the gva is remapped by other cpus on a shadow | ||
2952 | * guest, but it's a BUG if the gfn is not an mmio page. | ||
2953 | */ | ||
2954 | if (direct && !check_direct_spte_mmio_pf(spte)) | ||
2955 | return -1; | ||
2956 | |||
2957 | /* | ||
2958 | * If the page table is zapped by other cpus, let the CPU fault | ||
2959 | * again on the address. | ||
2960 | */ | ||
2961 | return 0; | ||
2962 | } | ||
2963 | EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); | ||
2964 | |||
2965 | static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, | ||
2966 | u32 error_code, bool direct) | ||
2967 | { | ||
2968 | int ret; | ||
2969 | |||
2970 | ret = handle_mmio_page_fault_common(vcpu, addr, direct); | ||
2971 | WARN_ON(ret < 0); | ||
2972 | return ret; | ||
2973 | } | ||
2974 | |||
2670 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | 2975 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, |
2671 | u32 error_code, bool prefault) | 2976 | u32 error_code, bool prefault) |
2672 | { | 2977 | { |
@@ -2674,6 +2979,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |||
2674 | int r; | 2979 | int r; |
2675 | 2980 | ||
2676 | pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); | 2981 | pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); |
2982 | |||
2983 | if (unlikely(error_code & PFERR_RSVD_MASK)) | ||
2984 | return handle_mmio_page_fault(vcpu, gva, error_code, true); | ||
2985 | |||
2677 | r = mmu_topup_memory_caches(vcpu); | 2986 | r = mmu_topup_memory_caches(vcpu); |
2678 | if (r) | 2987 | if (r) |
2679 | return r; | 2988 | return r; |
@@ -2750,6 +3059,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | |||
2750 | ASSERT(vcpu); | 3059 | ASSERT(vcpu); |
2751 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 3060 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
2752 | 3061 | ||
3062 | if (unlikely(error_code & PFERR_RSVD_MASK)) | ||
3063 | return handle_mmio_page_fault(vcpu, gpa, error_code, true); | ||
3064 | |||
2753 | r = mmu_topup_memory_caches(vcpu); | 3065 | r = mmu_topup_memory_caches(vcpu); |
2754 | if (r) | 3066 | if (r) |
2755 | return r; | 3067 | return r; |
@@ -2767,9 +3079,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | |||
2767 | if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) | 3079 | if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) |
2768 | return 0; | 3080 | return 0; |
2769 | 3081 | ||
2770 | /* mmio */ | 3082 | if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) |
2771 | if (is_error_pfn(pfn)) | 3083 | return r; |
2772 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); | 3084 | |
2773 | spin_lock(&vcpu->kvm->mmu_lock); | 3085 | spin_lock(&vcpu->kvm->mmu_lock); |
2774 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 3086 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
2775 | goto out_unlock; | 3087 | goto out_unlock; |
@@ -2800,7 +3112,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, | |||
2800 | context->page_fault = nonpaging_page_fault; | 3112 | context->page_fault = nonpaging_page_fault; |
2801 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 3113 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
2802 | context->free = nonpaging_free; | 3114 | context->free = nonpaging_free; |
2803 | context->prefetch_page = nonpaging_prefetch_page; | ||
2804 | context->sync_page = nonpaging_sync_page; | 3115 | context->sync_page = nonpaging_sync_page; |
2805 | context->invlpg = nonpaging_invlpg; | 3116 | context->invlpg = nonpaging_invlpg; |
2806 | context->update_pte = nonpaging_update_pte; | 3117 | context->update_pte = nonpaging_update_pte; |
@@ -2848,6 +3159,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) | |||
2848 | return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; | 3159 | return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; |
2849 | } | 3160 | } |
2850 | 3161 | ||
3162 | static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, | ||
3163 | int *nr_present) | ||
3164 | { | ||
3165 | if (unlikely(is_mmio_spte(*sptep))) { | ||
3166 | if (gfn != get_mmio_spte_gfn(*sptep)) { | ||
3167 | mmu_spte_clear_no_track(sptep); | ||
3168 | return true; | ||
3169 | } | ||
3170 | |||
3171 | (*nr_present)++; | ||
3172 | mark_mmio_spte(sptep, gfn, access); | ||
3173 | return true; | ||
3174 | } | ||
3175 | |||
3176 | return false; | ||
3177 | } | ||
3178 | |||
2851 | #define PTTYPE 64 | 3179 | #define PTTYPE 64 |
2852 | #include "paging_tmpl.h" | 3180 | #include "paging_tmpl.h" |
2853 | #undef PTTYPE | 3181 | #undef PTTYPE |
@@ -2930,7 +3258,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, | |||
2930 | context->new_cr3 = paging_new_cr3; | 3258 | context->new_cr3 = paging_new_cr3; |
2931 | context->page_fault = paging64_page_fault; | 3259 | context->page_fault = paging64_page_fault; |
2932 | context->gva_to_gpa = paging64_gva_to_gpa; | 3260 | context->gva_to_gpa = paging64_gva_to_gpa; |
2933 | context->prefetch_page = paging64_prefetch_page; | ||
2934 | context->sync_page = paging64_sync_page; | 3261 | context->sync_page = paging64_sync_page; |
2935 | context->invlpg = paging64_invlpg; | 3262 | context->invlpg = paging64_invlpg; |
2936 | context->update_pte = paging64_update_pte; | 3263 | context->update_pte = paging64_update_pte; |
@@ -2959,7 +3286,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, | |||
2959 | context->page_fault = paging32_page_fault; | 3286 | context->page_fault = paging32_page_fault; |
2960 | context->gva_to_gpa = paging32_gva_to_gpa; | 3287 | context->gva_to_gpa = paging32_gva_to_gpa; |
2961 | context->free = paging_free; | 3288 | context->free = paging_free; |
2962 | context->prefetch_page = paging32_prefetch_page; | ||
2963 | context->sync_page = paging32_sync_page; | 3289 | context->sync_page = paging32_sync_page; |
2964 | context->invlpg = paging32_invlpg; | 3290 | context->invlpg = paging32_invlpg; |
2965 | context->update_pte = paging32_update_pte; | 3291 | context->update_pte = paging32_update_pte; |
@@ -2984,7 +3310,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
2984 | context->new_cr3 = nonpaging_new_cr3; | 3310 | context->new_cr3 = nonpaging_new_cr3; |
2985 | context->page_fault = tdp_page_fault; | 3311 | context->page_fault = tdp_page_fault; |
2986 | context->free = nonpaging_free; | 3312 | context->free = nonpaging_free; |
2987 | context->prefetch_page = nonpaging_prefetch_page; | ||
2988 | context->sync_page = nonpaging_sync_page; | 3313 | context->sync_page = nonpaging_sync_page; |
2989 | context->invlpg = nonpaging_invlpg; | 3314 | context->invlpg = nonpaging_invlpg; |
2990 | context->update_pte = nonpaging_update_pte; | 3315 | context->update_pte = nonpaging_update_pte; |
@@ -3023,6 +3348,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
3023 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) | 3348 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) |
3024 | { | 3349 | { |
3025 | int r; | 3350 | int r; |
3351 | bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); | ||
3026 | ASSERT(vcpu); | 3352 | ASSERT(vcpu); |
3027 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 3353 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
3028 | 3354 | ||
@@ -3037,6 +3363,8 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) | |||
3037 | 3363 | ||
3038 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); | 3364 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); |
3039 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); | 3365 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); |
3366 | vcpu->arch.mmu.base_role.smep_andnot_wp | ||
3367 | = smep && !is_write_protection(vcpu); | ||
3040 | 3368 | ||
3041 | return r; | 3369 | return r; |
3042 | } | 3370 | } |
@@ -3141,27 +3469,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) | |||
3141 | } | 3469 | } |
3142 | EXPORT_SYMBOL_GPL(kvm_mmu_unload); | 3470 | EXPORT_SYMBOL_GPL(kvm_mmu_unload); |
3143 | 3471 | ||
3144 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | ||
3145 | struct kvm_mmu_page *sp, | ||
3146 | u64 *spte) | ||
3147 | { | ||
3148 | u64 pte; | ||
3149 | struct kvm_mmu_page *child; | ||
3150 | |||
3151 | pte = *spte; | ||
3152 | if (is_shadow_present_pte(pte)) { | ||
3153 | if (is_last_spte(pte, sp->role.level)) | ||
3154 | drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); | ||
3155 | else { | ||
3156 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
3157 | mmu_page_remove_parent_pte(child, spte); | ||
3158 | } | ||
3159 | } | ||
3160 | __set_spte(spte, shadow_trap_nonpresent_pte); | ||
3161 | if (is_large_pte(pte)) | ||
3162 | --vcpu->kvm->stat.lpages; | ||
3163 | } | ||
3164 | |||
3165 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | 3472 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, |
3166 | struct kvm_mmu_page *sp, u64 *spte, | 3473 | struct kvm_mmu_page *sp, u64 *spte, |
3167 | const void *new) | 3474 | const void *new) |
@@ -3233,6 +3540,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
3233 | int level, npte, invlpg_counter, r, flooded = 0; | 3540 | int level, npte, invlpg_counter, r, flooded = 0; |
3234 | bool remote_flush, local_flush, zap_page; | 3541 | bool remote_flush, local_flush, zap_page; |
3235 | 3542 | ||
3543 | /* | ||
3544 | * If we don't have indirect shadow pages, it means no page is | ||
3545 | * write-protected, so we can simply exit. | ||
3546 | */ | ||
3547 | if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) | ||
3548 | return; | ||
3549 | |||
3236 | zap_page = remote_flush = local_flush = false; | 3550 | zap_page = remote_flush = local_flush = false; |
3237 | offset = offset_in_page(gpa); | 3551 | offset = offset_in_page(gpa); |
3238 | 3552 | ||
@@ -3336,7 +3650,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
3336 | spte = &sp->spt[page_offset / sizeof(*spte)]; | 3650 | spte = &sp->spt[page_offset / sizeof(*spte)]; |
3337 | while (npte--) { | 3651 | while (npte--) { |
3338 | entry = *spte; | 3652 | entry = *spte; |
3339 | mmu_pte_write_zap_pte(vcpu, sp, spte); | 3653 | mmu_page_zap_pte(vcpu->kvm, sp, spte); |
3340 | if (gentry && | 3654 | if (gentry && |
3341 | !((sp->role.word ^ vcpu->arch.mmu.base_role.word) | 3655 | !((sp->role.word ^ vcpu->arch.mmu.base_role.word) |
3342 | & mask.word)) | 3656 | & mask.word)) |
@@ -3380,9 +3694,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | |||
3380 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, | 3694 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, |
3381 | struct kvm_mmu_page, link); | 3695 | struct kvm_mmu_page, link); |
3382 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); | 3696 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); |
3383 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
3384 | ++vcpu->kvm->stat.mmu_recycled; | 3697 | ++vcpu->kvm->stat.mmu_recycled; |
3385 | } | 3698 | } |
3699 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
3386 | } | 3700 | } |
3387 | 3701 | ||
3388 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, | 3702 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, |
@@ -3506,15 +3820,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
3506 | continue; | 3820 | continue; |
3507 | 3821 | ||
3508 | if (is_large_pte(pt[i])) { | 3822 | if (is_large_pte(pt[i])) { |
3509 | drop_spte(kvm, &pt[i], | 3823 | drop_spte(kvm, &pt[i]); |
3510 | shadow_trap_nonpresent_pte); | ||
3511 | --kvm->stat.lpages; | 3824 | --kvm->stat.lpages; |
3512 | continue; | 3825 | continue; |
3513 | } | 3826 | } |
3514 | 3827 | ||
3515 | /* avoid RMW */ | 3828 | /* avoid RMW */ |
3516 | if (is_writable_pte(pt[i])) | 3829 | if (is_writable_pte(pt[i])) |
3517 | update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); | 3830 | mmu_spte_update(&pt[i], |
3831 | pt[i] & ~PT_WRITABLE_MASK); | ||
3518 | } | 3832 | } |
3519 | } | 3833 | } |
3520 | kvm_flush_remote_tlbs(kvm); | 3834 | kvm_flush_remote_tlbs(kvm); |
@@ -3590,25 +3904,18 @@ static struct shrinker mmu_shrinker = { | |||
3590 | 3904 | ||
3591 | static void mmu_destroy_caches(void) | 3905 | static void mmu_destroy_caches(void) |
3592 | { | 3906 | { |
3593 | if (pte_chain_cache) | 3907 | if (pte_list_desc_cache) |
3594 | kmem_cache_destroy(pte_chain_cache); | 3908 | kmem_cache_destroy(pte_list_desc_cache); |
3595 | if (rmap_desc_cache) | ||
3596 | kmem_cache_destroy(rmap_desc_cache); | ||
3597 | if (mmu_page_header_cache) | 3909 | if (mmu_page_header_cache) |
3598 | kmem_cache_destroy(mmu_page_header_cache); | 3910 | kmem_cache_destroy(mmu_page_header_cache); |
3599 | } | 3911 | } |
3600 | 3912 | ||
3601 | int kvm_mmu_module_init(void) | 3913 | int kvm_mmu_module_init(void) |
3602 | { | 3914 | { |
3603 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | 3915 | pte_list_desc_cache = kmem_cache_create("pte_list_desc", |
3604 | sizeof(struct kvm_pte_chain), | 3916 | sizeof(struct pte_list_desc), |
3605 | 0, 0, NULL); | ||
3606 | if (!pte_chain_cache) | ||
3607 | goto nomem; | ||
3608 | rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", | ||
3609 | sizeof(struct kvm_rmap_desc), | ||
3610 | 0, 0, NULL); | 3917 | 0, 0, NULL); |
3611 | if (!rmap_desc_cache) | 3918 | if (!pte_list_desc_cache) |
3612 | goto nomem; | 3919 | goto nomem; |
3613 | 3920 | ||
3614 | mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", | 3921 | mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", |
@@ -3775,16 +4082,17 @@ out: | |||
3775 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) | 4082 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) |
3776 | { | 4083 | { |
3777 | struct kvm_shadow_walk_iterator iterator; | 4084 | struct kvm_shadow_walk_iterator iterator; |
4085 | u64 spte; | ||
3778 | int nr_sptes = 0; | 4086 | int nr_sptes = 0; |
3779 | 4087 | ||
3780 | spin_lock(&vcpu->kvm->mmu_lock); | 4088 | walk_shadow_page_lockless_begin(vcpu); |
3781 | for_each_shadow_entry(vcpu, addr, iterator) { | 4089 | for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { |
3782 | sptes[iterator.level-1] = *iterator.sptep; | 4090 | sptes[iterator.level-1] = spte; |
3783 | nr_sptes++; | 4091 | nr_sptes++; |
3784 | if (!is_shadow_present_pte(*iterator.sptep)) | 4092 | if (!is_shadow_present_pte(spte)) |
3785 | break; | 4093 | break; |
3786 | } | 4094 | } |
3787 | spin_unlock(&vcpu->kvm->mmu_lock); | 4095 | walk_shadow_page_lockless_end(vcpu); |
3788 | 4096 | ||
3789 | return nr_sptes; | 4097 | return nr_sptes; |
3790 | } | 4098 | } |
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 7086ca85d3e7..e374db9af021 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h | |||
@@ -49,6 +49,8 @@ | |||
49 | #define PFERR_FETCH_MASK (1U << 4) | 49 | #define PFERR_FETCH_MASK (1U << 4) |
50 | 50 | ||
51 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); | 51 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); |
52 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); | ||
53 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); | ||
52 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); | 54 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); |
53 | 55 | ||
54 | static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) | 56 | static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) |
@@ -76,4 +78,27 @@ static inline int is_present_gpte(unsigned long pte) | |||
76 | return pte & PT_PRESENT_MASK; | 78 | return pte & PT_PRESENT_MASK; |
77 | } | 79 | } |
78 | 80 | ||
81 | static inline int is_writable_pte(unsigned long pte) | ||
82 | { | ||
83 | return pte & PT_WRITABLE_MASK; | ||
84 | } | ||
85 | |||
86 | static inline bool is_write_protection(struct kvm_vcpu *vcpu) | ||
87 | { | ||
88 | return kvm_read_cr0_bits(vcpu, X86_CR0_WP); | ||
89 | } | ||
90 | |||
91 | static inline bool check_write_user_access(struct kvm_vcpu *vcpu, | ||
92 | bool write_fault, bool user_fault, | ||
93 | unsigned long pte) | ||
94 | { | ||
95 | if (unlikely(write_fault && !is_writable_pte(pte) | ||
96 | && (user_fault || is_write_protection(vcpu)))) | ||
97 | return false; | ||
98 | |||
99 | if (unlikely(user_fault && !(pte & PT_USER_MASK))) | ||
100 | return false; | ||
101 | |||
102 | return true; | ||
103 | } | ||
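
A self-contained, user-space illustration of the permission semantics check_write_user_access() encodes; the masks and the explicit cr0.wp flag are stand-ins introduced for the example:

#include <stdbool.h>
#include <stdio.h>

#define XPT_WRITABLE (1UL << 1)	/* stand-in for PT_WRITABLE_MASK */
#define XPT_USER     (1UL << 2)	/* stand-in for PT_USER_MASK */

/* Mirror of the helper's logic, with cr0.wp passed explicitly. */
static bool write_user_ok(bool write_fault, bool user_fault, bool cr0_wp,
			  unsigned long pte)
{
	if (write_fault && !(pte & XPT_WRITABLE) && (user_fault || cr0_wp))
		return false;	/* write to a read-only pte faults */
	if (user_fault && !(pte & XPT_USER))
		return false;	/* user access to a supervisor pte faults */
	return true;
}

int main(void)
{
	/* kernel write, read-only pte, cr0.wp=0: allowed (supervisor override) */
	printf("%d\n", write_user_ok(true, false, false, XPT_USER));
	/* user write to the same pte: always denied */
	printf("%d\n", write_user_ok(true, true, false, XPT_USER));
	return 0;
}
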
79 | #endif | 104 | #endif |
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 5f6223b8bcf7..2460a265be23 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c | |||
@@ -99,18 +99,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) | |||
99 | "level = %d\n", sp, level); | 99 | "level = %d\n", sp, level); |
100 | return; | 100 | return; |
101 | } | 101 | } |
102 | |||
103 | if (*sptep == shadow_notrap_nonpresent_pte) { | ||
104 | audit_printk(vcpu->kvm, "notrap spte in unsync " | ||
105 | "sp: %p\n", sp); | ||
106 | return; | ||
107 | } | ||
108 | } | ||
109 | |||
110 | if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { | ||
111 | audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n", | ||
112 | sp); | ||
113 | return; | ||
114 | } | 102 | } |
115 | 103 | ||
116 | if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) | 104 | if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) |
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index b60b4fdb3eda..eed67f34146d 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h | |||
@@ -196,6 +196,54 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, | |||
196 | TP_ARGS(sp) | 196 | TP_ARGS(sp) |
197 | ); | 197 | ); |
198 | 198 | ||
199 | DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages, | ||
200 | TP_PROTO(struct kvm_mmu_page *sp), | ||
201 | |||
202 | TP_ARGS(sp) | ||
203 | ); | ||
204 | |||
205 | TRACE_EVENT( | ||
206 | mark_mmio_spte, | ||
207 | TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), | ||
208 | TP_ARGS(sptep, gfn, access), | ||
209 | |||
210 | TP_STRUCT__entry( | ||
211 | __field(void *, sptep) | ||
212 | __field(gfn_t, gfn) | ||
213 | __field(unsigned, access) | ||
214 | ), | ||
215 | |||
216 | TP_fast_assign( | ||
217 | __entry->sptep = sptep; | ||
218 | __entry->gfn = gfn; | ||
219 | __entry->access = access; | ||
220 | ), | ||
221 | |||
222 | TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn, | ||
223 | __entry->access) | ||
224 | ); | ||
225 | |||
226 | TRACE_EVENT( | ||
227 | handle_mmio_page_fault, | ||
228 | TP_PROTO(u64 addr, gfn_t gfn, unsigned access), | ||
229 | TP_ARGS(addr, gfn, access), | ||
230 | |||
231 | TP_STRUCT__entry( | ||
232 | __field(u64, addr) | ||
233 | __field(gfn_t, gfn) | ||
234 | __field(unsigned, access) | ||
235 | ), | ||
236 | |||
237 | TP_fast_assign( | ||
238 | __entry->addr = addr; | ||
239 | __entry->gfn = gfn; | ||
240 | __entry->access = access; | ||
241 | ), | ||
242 | |||
243 | TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, | ||
244 | __entry->access) | ||
245 | ); | ||
246 | |||
199 | TRACE_EVENT( | 247 | TRACE_EVENT( |
200 | kvm_mmu_audit, | 248 | kvm_mmu_audit, |
201 | TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), | 249 | TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9d03ad4dd5ec..507e2b844cfa 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -101,11 +101,15 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | |||
101 | return (ret != orig_pte); | 101 | return (ret != orig_pte); |
102 | } | 102 | } |
103 | 103 | ||
104 | static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) | 104 | static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte, |
105 | bool last) | ||
105 | { | 106 | { |
106 | unsigned access; | 107 | unsigned access; |
107 | 108 | ||
108 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; | 109 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; |
110 | if (last && !is_dirty_gpte(gpte)) | ||
111 | access &= ~ACC_WRITE_MASK; | ||
112 | |||
109 | #if PTTYPE == 64 | 113 | #if PTTYPE == 64 |
110 | if (vcpu->arch.mmu.nx) | 114 | if (vcpu->arch.mmu.nx) |
111 | access &= ~(gpte >> PT64_NX_SHIFT); | 115 | access &= ~(gpte >> PT64_NX_SHIFT); |
@@ -113,6 +117,24 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) | |||
113 | return access; | 117 | return access; |
114 | } | 118 | } |
115 | 119 | ||
120 | static bool FNAME(is_last_gpte)(struct guest_walker *walker, | ||
121 | struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | ||
122 | pt_element_t gpte) | ||
123 | { | ||
124 | if (walker->level == PT_PAGE_TABLE_LEVEL) | ||
125 | return true; | ||
126 | |||
127 | if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) && | ||
128 | (PTTYPE == 64 || is_pse(vcpu))) | ||
129 | return true; | ||
130 | |||
131 | if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) && | ||
132 | (mmu->root_level == PT64_ROOT_LEVEL)) | ||
133 | return true; | ||
134 | |||
135 | return false; | ||
136 | } | ||
137 | |||
116 | /* | 138 | /* |
117 | * Fetch a guest pte for a guest virtual address | 139 | * Fetch a guest pte for a guest virtual address |
118 | */ | 140 | */ |
@@ -125,18 +147,17 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, | |||
125 | gfn_t table_gfn; | 147 | gfn_t table_gfn; |
126 | unsigned index, pt_access, uninitialized_var(pte_access); | 148 | unsigned index, pt_access, uninitialized_var(pte_access); |
127 | gpa_t pte_gpa; | 149 | gpa_t pte_gpa; |
128 | bool eperm, present, rsvd_fault; | 150 | bool eperm; |
129 | int offset, write_fault, user_fault, fetch_fault; | 151 | int offset; |
130 | 152 | const int write_fault = access & PFERR_WRITE_MASK; | |
131 | write_fault = access & PFERR_WRITE_MASK; | 153 | const int user_fault = access & PFERR_USER_MASK; |
132 | user_fault = access & PFERR_USER_MASK; | 154 | const int fetch_fault = access & PFERR_FETCH_MASK; |
133 | fetch_fault = access & PFERR_FETCH_MASK; | 155 | u16 errcode = 0; |
134 | 156 | ||
135 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, | 157 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, |
136 | fetch_fault); | 158 | fetch_fault); |
137 | walk: | 159 | retry_walk: |
138 | present = true; | 160 | eperm = false; |
139 | eperm = rsvd_fault = false; | ||
140 | walker->level = mmu->root_level; | 161 | walker->level = mmu->root_level; |
141 | pte = mmu->get_cr3(vcpu); | 162 | pte = mmu->get_cr3(vcpu); |
142 | 163 | ||
@@ -144,10 +165,8 @@ walk: | |||
144 | if (walker->level == PT32E_ROOT_LEVEL) { | 165 | if (walker->level == PT32E_ROOT_LEVEL) { |
145 | pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); | 166 | pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); |
146 | trace_kvm_mmu_paging_element(pte, walker->level); | 167 | trace_kvm_mmu_paging_element(pte, walker->level); |
147 | if (!is_present_gpte(pte)) { | 168 | if (!is_present_gpte(pte)) |
148 | present = false; | ||
149 | goto error; | 169 | goto error; |
150 | } | ||
151 | --walker->level; | 170 | --walker->level; |
152 | } | 171 | } |
153 | #endif | 172 | #endif |
@@ -170,42 +189,31 @@ walk: | |||
170 | 189 | ||
171 | real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), | 190 | real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), |
172 | PFERR_USER_MASK|PFERR_WRITE_MASK); | 191 | PFERR_USER_MASK|PFERR_WRITE_MASK); |
173 | if (unlikely(real_gfn == UNMAPPED_GVA)) { | 192 | if (unlikely(real_gfn == UNMAPPED_GVA)) |
174 | present = false; | 193 | goto error; |
175 | break; | ||
176 | } | ||
177 | real_gfn = gpa_to_gfn(real_gfn); | 194 | real_gfn = gpa_to_gfn(real_gfn); |
178 | 195 | ||
179 | host_addr = gfn_to_hva(vcpu->kvm, real_gfn); | 196 | host_addr = gfn_to_hva(vcpu->kvm, real_gfn); |
180 | if (unlikely(kvm_is_error_hva(host_addr))) { | 197 | if (unlikely(kvm_is_error_hva(host_addr))) |
181 | present = false; | 198 | goto error; |
182 | break; | ||
183 | } | ||
184 | 199 | ||
185 | ptep_user = (pt_element_t __user *)((void *)host_addr + offset); | 200 | ptep_user = (pt_element_t __user *)((void *)host_addr + offset); |
186 | if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) { | 201 | if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) |
187 | present = false; | 202 | goto error; |
188 | break; | ||
189 | } | ||
190 | 203 | ||
191 | trace_kvm_mmu_paging_element(pte, walker->level); | 204 | trace_kvm_mmu_paging_element(pte, walker->level); |
192 | 205 | ||
193 | if (unlikely(!is_present_gpte(pte))) { | 206 | if (unlikely(!is_present_gpte(pte))) |
194 | present = false; | 207 | goto error; |
195 | break; | ||
196 | } | ||
197 | 208 | ||
198 | if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, | 209 | if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, |
199 | walker->level))) { | 210 | walker->level))) { |
200 | rsvd_fault = true; | 211 | errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; |
201 | break; | 212 | goto error; |
202 | } | 213 | } |
203 | 214 | ||
204 | if (unlikely(write_fault && !is_writable_pte(pte) | 215 | if (!check_write_user_access(vcpu, write_fault, user_fault, |
205 | && (user_fault || is_write_protection(vcpu)))) | 216 | pte)) |
206 | eperm = true; | ||
207 | |||
208 | if (unlikely(user_fault && !(pte & PT_USER_MASK))) | ||
209 | eperm = true; | 217 | eperm = true; |
210 | 218 | ||
211 | #if PTTYPE == 64 | 219 | #if PTTYPE == 64 |
@@ -213,39 +221,35 @@ walk: | |||
213 | eperm = true; | 221 | eperm = true; |
214 | #endif | 222 | #endif |
215 | 223 | ||
216 | if (!eperm && !rsvd_fault | 224 | if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { |
217 | && unlikely(!(pte & PT_ACCESSED_MASK))) { | ||
218 | int ret; | 225 | int ret; |
219 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, | 226 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, |
220 | sizeof(pte)); | 227 | sizeof(pte)); |
221 | ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, | 228 | ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, |
222 | pte, pte|PT_ACCESSED_MASK); | 229 | pte, pte|PT_ACCESSED_MASK); |
223 | if (unlikely(ret < 0)) { | 230 | if (unlikely(ret < 0)) |
224 | present = false; | 231 | goto error; |
225 | break; | 232 | else if (ret) |
226 | } else if (ret) | 233 | goto retry_walk; |
227 | goto walk; | ||
228 | 234 | ||
229 | mark_page_dirty(vcpu->kvm, table_gfn); | 235 | mark_page_dirty(vcpu->kvm, table_gfn); |
230 | pte |= PT_ACCESSED_MASK; | 236 | pte |= PT_ACCESSED_MASK; |
231 | } | 237 | } |
232 | 238 | ||
233 | pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); | ||
234 | |||
235 | walker->ptes[walker->level - 1] = pte; | 239 | walker->ptes[walker->level - 1] = pte; |
236 | 240 | ||
237 | if ((walker->level == PT_PAGE_TABLE_LEVEL) || | 241 | if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) { |
238 | ((walker->level == PT_DIRECTORY_LEVEL) && | ||
239 | is_large_pte(pte) && | ||
240 | (PTTYPE == 64 || is_pse(vcpu))) || | ||
241 | ((walker->level == PT_PDPE_LEVEL) && | ||
242 | is_large_pte(pte) && | ||
243 | mmu->root_level == PT64_ROOT_LEVEL)) { | ||
244 | int lvl = walker->level; | 242 | int lvl = walker->level; |
245 | gpa_t real_gpa; | 243 | gpa_t real_gpa; |
246 | gfn_t gfn; | 244 | gfn_t gfn; |
247 | u32 ac; | 245 | u32 ac; |
248 | 246 | ||
247 | /* check whether the kernel is fetching from a user page */ | ||
248 | if (unlikely(pte_access & PT_USER_MASK) && | ||
249 | kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) | ||
250 | if (fetch_fault && !user_fault) | ||
251 | eperm = true; | ||
252 | |||
249 | gfn = gpte_to_gfn_lvl(pte, lvl); | 253 | gfn = gpte_to_gfn_lvl(pte, lvl); |
250 | gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; | 254 | gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; |
251 | 255 | ||
@@ -266,12 +270,14 @@ walk: | |||
266 | break; | 270 | break; |
267 | } | 271 | } |
268 | 272 | ||
269 | pt_access = pte_access; | 273 | pt_access &= FNAME(gpte_access)(vcpu, pte, false); |
270 | --walker->level; | 274 | --walker->level; |
271 | } | 275 | } |
272 | 276 | ||
273 | if (unlikely(!present || eperm || rsvd_fault)) | 277 | if (unlikely(eperm)) { |
278 | errcode |= PFERR_PRESENT_MASK; | ||
274 | goto error; | 279 | goto error; |
280 | } | ||
275 | 281 | ||
276 | if (write_fault && unlikely(!is_dirty_gpte(pte))) { | 282 | if (write_fault && unlikely(!is_dirty_gpte(pte))) { |
277 | int ret; | 283 | int ret; |
@@ -279,17 +285,17 @@ walk: | |||
279 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); | 285 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); |
280 | ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, | 286 | ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, |
281 | pte, pte|PT_DIRTY_MASK); | 287 | pte, pte|PT_DIRTY_MASK); |
282 | if (unlikely(ret < 0)) { | 288 | if (unlikely(ret < 0)) |
283 | present = false; | ||
284 | goto error; | 289 | goto error; |
285 | } else if (ret) | 290 | else if (ret) |
286 | goto walk; | 291 | goto retry_walk; |
287 | 292 | ||
288 | mark_page_dirty(vcpu->kvm, table_gfn); | 293 | mark_page_dirty(vcpu->kvm, table_gfn); |
289 | pte |= PT_DIRTY_MASK; | 294 | pte |= PT_DIRTY_MASK; |
290 | walker->ptes[walker->level - 1] = pte; | 295 | walker->ptes[walker->level - 1] = pte; |
291 | } | 296 | } |
292 | 297 | ||
298 | pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true); | ||
293 | walker->pt_access = pt_access; | 299 | walker->pt_access = pt_access; |
294 | walker->pte_access = pte_access; | 300 | walker->pte_access = pte_access; |
295 | pgprintk("%s: pte %llx pte_access %x pt_access %x\n", | 301 | pgprintk("%s: pte %llx pte_access %x pt_access %x\n", |
@@ -297,19 +303,14 @@ walk: | |||
297 | return 1; | 303 | return 1; |
298 | 304 | ||
299 | error: | 305 | error: |
306 | errcode |= write_fault | user_fault; | ||
307 | if (fetch_fault && (mmu->nx || | ||
308 | kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))) | ||
309 | errcode |= PFERR_FETCH_MASK; | ||
310 | |||
300 | walker->fault.vector = PF_VECTOR; | 311 | walker->fault.vector = PF_VECTOR; |
301 | walker->fault.error_code_valid = true; | 312 | walker->fault.error_code_valid = true; |
302 | walker->fault.error_code = 0; | 313 | walker->fault.error_code = errcode; |
303 | if (present) | ||
304 | walker->fault.error_code |= PFERR_PRESENT_MASK; | ||
305 | |||
306 | walker->fault.error_code |= write_fault | user_fault; | ||
307 | |||
308 | if (fetch_fault && mmu->nx) | ||
309 | walker->fault.error_code |= PFERR_FETCH_MASK; | ||
310 | if (rsvd_fault) | ||
311 | walker->fault.error_code |= PFERR_RSVD_MASK; | ||
312 | |||
313 | walker->fault.address = addr; | 314 | walker->fault.address = addr; |
314 | walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; | 315 | walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; |
315 | 316 | ||
@@ -336,16 +337,11 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, | |||
336 | struct kvm_mmu_page *sp, u64 *spte, | 337 | struct kvm_mmu_page *sp, u64 *spte, |
337 | pt_element_t gpte) | 338 | pt_element_t gpte) |
338 | { | 339 | { |
339 | u64 nonpresent = shadow_trap_nonpresent_pte; | ||
340 | |||
341 | if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) | 340 | if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) |
342 | goto no_present; | 341 | goto no_present; |
343 | 342 | ||
344 | if (!is_present_gpte(gpte)) { | 343 | if (!is_present_gpte(gpte)) |
345 | if (!sp->unsync) | ||
346 | nonpresent = shadow_notrap_nonpresent_pte; | ||
347 | goto no_present; | 344 | goto no_present; |
348 | } | ||
349 | 345 | ||
350 | if (!(gpte & PT_ACCESSED_MASK)) | 346 | if (!(gpte & PT_ACCESSED_MASK)) |
351 | goto no_present; | 347 | goto no_present; |
@@ -353,7 +349,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, | |||
353 | return false; | 349 | return false; |
354 | 350 | ||
355 | no_present: | 351 | no_present: |
356 | drop_spte(vcpu->kvm, spte, nonpresent); | 352 | drop_spte(vcpu->kvm, spte); |
357 | return true; | 353 | return true; |
358 | } | 354 | } |
359 | 355 | ||
@@ -369,9 +365,9 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
369 | return; | 365 | return; |
370 | 366 | ||
371 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); | 367 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); |
372 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 368 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); |
373 | pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); | 369 | pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); |
374 | if (is_error_pfn(pfn)) { | 370 | if (mmu_invalid_pfn(pfn)) { |
375 | kvm_release_pfn_clean(pfn); | 371 | kvm_release_pfn_clean(pfn); |
376 | return; | 372 | return; |
377 | } | 373 | } |
@@ -381,7 +377,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
381 | * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). | 377 | * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). |
382 | */ | 378 | */ |
383 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, | 379 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, |
384 | is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, | 380 | NULL, PT_PAGE_TABLE_LEVEL, |
385 | gpte_to_gfn(gpte), pfn, true, true); | 381 | gpte_to_gfn(gpte), pfn, true, true); |
386 | } | 382 | } |
387 | 383 | ||
@@ -432,12 +428,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
432 | unsigned pte_access; | 428 | unsigned pte_access; |
433 | gfn_t gfn; | 429 | gfn_t gfn; |
434 | pfn_t pfn; | 430 | pfn_t pfn; |
435 | bool dirty; | ||
436 | 431 | ||
437 | if (spte == sptep) | 432 | if (spte == sptep) |
438 | continue; | 433 | continue; |
439 | 434 | ||
440 | if (*spte != shadow_trap_nonpresent_pte) | 435 | if (is_shadow_present_pte(*spte)) |
441 | continue; | 436 | continue; |
442 | 437 | ||
443 | gpte = gptep[i]; | 438 | gpte = gptep[i]; |
@@ -445,18 +440,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
445 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) | 440 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) |
446 | continue; | 441 | continue; |
447 | 442 | ||
448 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 443 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, |
444 | true); | ||
449 | gfn = gpte_to_gfn(gpte); | 445 | gfn = gpte_to_gfn(gpte); |
450 | dirty = is_dirty_gpte(gpte); | ||
451 | pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, | 446 | pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, |
452 | (pte_access & ACC_WRITE_MASK) && dirty); | 447 | pte_access & ACC_WRITE_MASK); |
453 | if (is_error_pfn(pfn)) { | 448 | if (mmu_invalid_pfn(pfn)) { |
454 | kvm_release_pfn_clean(pfn); | 449 | kvm_release_pfn_clean(pfn); |
455 | break; | 450 | break; |
456 | } | 451 | } |
457 | 452 | ||
458 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, | 453 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, |
459 | dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, | 454 | NULL, PT_PAGE_TABLE_LEVEL, gfn, |
460 | pfn, true, true); | 455 | pfn, true, true); |
461 | } | 456 | } |
462 | } | 457 | } |
@@ -467,12 +462,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
467 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 462 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, |
468 | struct guest_walker *gw, | 463 | struct guest_walker *gw, |
469 | int user_fault, int write_fault, int hlevel, | 464 | int user_fault, int write_fault, int hlevel, |
470 | int *ptwrite, pfn_t pfn, bool map_writable, | 465 | int *emulate, pfn_t pfn, bool map_writable, |
471 | bool prefault) | 466 | bool prefault) |
472 | { | 467 | { |
473 | unsigned access = gw->pt_access; | 468 | unsigned access = gw->pt_access; |
474 | struct kvm_mmu_page *sp = NULL; | 469 | struct kvm_mmu_page *sp = NULL; |
475 | bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); | ||
476 | int top_level; | 470 | int top_level; |
477 | unsigned direct_access; | 471 | unsigned direct_access; |
478 | struct kvm_shadow_walk_iterator it; | 472 | struct kvm_shadow_walk_iterator it; |
@@ -480,9 +474,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
480 | if (!is_present_gpte(gw->ptes[gw->level - 1])) | 474 | if (!is_present_gpte(gw->ptes[gw->level - 1])) |
481 | return NULL; | 475 | return NULL; |
482 | 476 | ||
483 | direct_access = gw->pt_access & gw->pte_access; | 477 | direct_access = gw->pte_access; |
484 | if (!dirty) | ||
485 | direct_access &= ~ACC_WRITE_MASK; | ||
486 | 478 | ||
487 | top_level = vcpu->arch.mmu.root_level; | 479 | top_level = vcpu->arch.mmu.root_level; |
488 | if (top_level == PT32E_ROOT_LEVEL) | 480 | if (top_level == PT32E_ROOT_LEVEL) |
@@ -540,8 +532,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
540 | link_shadow_page(it.sptep, sp); | 532 | link_shadow_page(it.sptep, sp); |
541 | } | 533 | } |
542 | 534 | ||
543 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, | 535 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, |
544 | user_fault, write_fault, dirty, ptwrite, it.level, | 536 | user_fault, write_fault, emulate, it.level, |
545 | gw->gfn, pfn, prefault, map_writable); | 537 | gw->gfn, pfn, prefault, map_writable); |
546 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); | 538 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); |
547 | 539 | ||
@@ -575,7 +567,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
575 | int user_fault = error_code & PFERR_USER_MASK; | 567 | int user_fault = error_code & PFERR_USER_MASK; |
576 | struct guest_walker walker; | 568 | struct guest_walker walker; |
577 | u64 *sptep; | 569 | u64 *sptep; |
578 | int write_pt = 0; | 570 | int emulate = 0; |
579 | int r; | 571 | int r; |
580 | pfn_t pfn; | 572 | pfn_t pfn; |
581 | int level = PT_PAGE_TABLE_LEVEL; | 573 | int level = PT_PAGE_TABLE_LEVEL; |
@@ -585,6 +577,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
585 | 577 | ||
586 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); | 578 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); |
587 | 579 | ||
580 | if (unlikely(error_code & PFERR_RSVD_MASK)) | ||
581 | return handle_mmio_page_fault(vcpu, addr, error_code, | ||
582 | mmu_is_nested(vcpu)); | ||
583 | |||
588 | r = mmu_topup_memory_caches(vcpu); | 584 | r = mmu_topup_memory_caches(vcpu); |
589 | if (r) | 585 | if (r) |
590 | return r; | 586 | return r; |
@@ -623,9 +619,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
623 | &map_writable)) | 619 | &map_writable)) |
624 | return 0; | 620 | return 0; |
625 | 621 | ||
626 | /* mmio */ | 622 | if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr, |
627 | if (is_error_pfn(pfn)) | 623 | walker.gfn, pfn, walker.pte_access, &r)) |
628 | return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); | 624 | return r; |
629 | 625 | ||
630 | spin_lock(&vcpu->kvm->mmu_lock); | 626 | spin_lock(&vcpu->kvm->mmu_lock); |
631 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 627 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
@@ -636,19 +632,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
636 | if (!force_pt_level) | 632 | if (!force_pt_level) |
637 | transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); | 633 | transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); |
638 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 634 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, |
639 | level, &write_pt, pfn, map_writable, prefault); | 635 | level, &emulate, pfn, map_writable, prefault); |
640 | (void)sptep; | 636 | (void)sptep; |
641 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, | 637 | pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, |
642 | sptep, *sptep, write_pt); | 638 | sptep, *sptep, emulate); |
643 | 639 | ||
644 | if (!write_pt) | 640 | if (!emulate) |
645 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | 641 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ |
646 | 642 | ||
647 | ++vcpu->stat.pf_fixed; | 643 | ++vcpu->stat.pf_fixed; |
648 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); | 644 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); |
649 | spin_unlock(&vcpu->kvm->mmu_lock); | 645 | spin_unlock(&vcpu->kvm->mmu_lock); |
650 | 646 | ||
651 | return write_pt; | 647 | return emulate; |
652 | 648 | ||
653 | out_unlock: | 649 | out_unlock: |
654 | spin_unlock(&vcpu->kvm->mmu_lock); | 650 | spin_unlock(&vcpu->kvm->mmu_lock); |
@@ -665,6 +661,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
665 | u64 *sptep; | 661 | u64 *sptep; |
666 | int need_flush = 0; | 662 | int need_flush = 0; |
667 | 663 | ||
664 | vcpu_clear_mmio_info(vcpu, gva); | ||
665 | |||
668 | spin_lock(&vcpu->kvm->mmu_lock); | 666 | spin_lock(&vcpu->kvm->mmu_lock); |
669 | 667 | ||
670 | for_each_shadow_entry(vcpu, gva, iterator) { | 668 | for_each_shadow_entry(vcpu, gva, iterator) { |
@@ -688,11 +686,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
688 | if (is_shadow_present_pte(*sptep)) { | 686 | if (is_shadow_present_pte(*sptep)) { |
689 | if (is_large_pte(*sptep)) | 687 | if (is_large_pte(*sptep)) |
690 | --vcpu->kvm->stat.lpages; | 688 | --vcpu->kvm->stat.lpages; |
691 | drop_spte(vcpu->kvm, sptep, | 689 | drop_spte(vcpu->kvm, sptep); |
692 | shadow_trap_nonpresent_pte); | ||
693 | need_flush = 1; | 690 | need_flush = 1; |
694 | } else | 691 | } else if (is_mmio_spte(*sptep)) |
695 | __set_spte(sptep, shadow_trap_nonpresent_pte); | 692 | mmu_spte_clear_no_track(sptep); |
693 | |||
696 | break; | 694 | break; |
697 | } | 695 | } |
698 | 696 | ||
@@ -752,36 +750,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, | |||
752 | return gpa; | 750 | return gpa; |
753 | } | 751 | } |
754 | 752 | ||
755 | static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | ||
756 | struct kvm_mmu_page *sp) | ||
757 | { | ||
758 | int i, j, offset, r; | ||
759 | pt_element_t pt[256 / sizeof(pt_element_t)]; | ||
760 | gpa_t pte_gpa; | ||
761 | |||
762 | if (sp->role.direct | ||
763 | || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { | ||
764 | nonpaging_prefetch_page(vcpu, sp); | ||
765 | return; | ||
766 | } | ||
767 | |||
768 | pte_gpa = gfn_to_gpa(sp->gfn); | ||
769 | if (PTTYPE == 32) { | ||
770 | offset = sp->role.quadrant << PT64_LEVEL_BITS; | ||
771 | pte_gpa += offset * sizeof(pt_element_t); | ||
772 | } | ||
773 | |||
774 | for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) { | ||
775 | r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); | ||
776 | pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); | ||
777 | for (j = 0; j < ARRAY_SIZE(pt); ++j) | ||
778 | if (r || is_present_gpte(pt[j])) | ||
779 | sp->spt[i+j] = shadow_trap_nonpresent_pte; | ||
780 | else | ||
781 | sp->spt[i+j] = shadow_notrap_nonpresent_pte; | ||
782 | } | ||
783 | } | ||
784 | |||
785 | /* | 753 | /* |
786 | * Using the cached information from sp->gfns is safe because: | 754 | * Using the cached information from sp->gfns is safe because: |
787 | * - The spte has a reference to the struct page, so the pfn for a given gfn | 755 | * - The spte has a reference to the struct page, so the pfn for a given gfn |
@@ -817,7 +785,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
817 | gpa_t pte_gpa; | 785 | gpa_t pte_gpa; |
818 | gfn_t gfn; | 786 | gfn_t gfn; |
819 | 787 | ||
820 | if (!is_shadow_present_pte(sp->spt[i])) | 788 | if (!sp->spt[i]) |
821 | continue; | 789 | continue; |
822 | 790 | ||
823 | pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); | 791 | pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); |
@@ -826,26 +794,30 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
826 | sizeof(pt_element_t))) | 794 | sizeof(pt_element_t))) |
827 | return -EINVAL; | 795 | return -EINVAL; |
828 | 796 | ||
829 | gfn = gpte_to_gfn(gpte); | ||
830 | |||
831 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { | 797 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { |
832 | vcpu->kvm->tlbs_dirty++; | 798 | vcpu->kvm->tlbs_dirty++; |
833 | continue; | 799 | continue; |
834 | } | 800 | } |
835 | 801 | ||
802 | gfn = gpte_to_gfn(gpte); | ||
803 | pte_access = sp->role.access; | ||
804 | pte_access &= FNAME(gpte_access)(vcpu, gpte, true); | ||
805 | |||
806 | if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) | ||
807 | continue; | ||
808 | |||
836 | if (gfn != sp->gfns[i]) { | 809 | if (gfn != sp->gfns[i]) { |
837 | drop_spte(vcpu->kvm, &sp->spt[i], | 810 | drop_spte(vcpu->kvm, &sp->spt[i]); |
838 | shadow_trap_nonpresent_pte); | ||
839 | vcpu->kvm->tlbs_dirty++; | 811 | vcpu->kvm->tlbs_dirty++; |
840 | continue; | 812 | continue; |
841 | } | 813 | } |
842 | 814 | ||
843 | nr_present++; | 815 | nr_present++; |
844 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 816 | |
845 | host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; | 817 | host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; |
846 | 818 | ||
847 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, | 819 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, |
848 | is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, | 820 | PT_PAGE_TABLE_LEVEL, gfn, |
849 | spte_to_pfn(sp->spt[i]), true, false, | 821 | spte_to_pfn(sp->spt[i]), true, false, |
850 | host_writable); | 822 | host_writable); |
851 | } | 823 | } |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 506e4fe23adc..475d1c948501 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -1496,11 +1496,14 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1496 | update_cr0_intercept(svm); | 1496 | update_cr0_intercept(svm); |
1497 | } | 1497 | } |
1498 | 1498 | ||
1499 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 1499 | static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
1500 | { | 1500 | { |
1501 | unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; | 1501 | unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; |
1502 | unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; | 1502 | unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; |
1503 | 1503 | ||
1504 | if (cr4 & X86_CR4_VMXE) | ||
1505 | return 1; | ||
1506 | |||
1504 | if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) | 1507 | if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) |
1505 | svm_flush_tlb(vcpu); | 1508 | svm_flush_tlb(vcpu); |
1506 | 1509 | ||
@@ -1510,6 +1513,7 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
1510 | cr4 |= host_cr4_mce; | 1513 | cr4 |= host_cr4_mce; |
1511 | to_svm(vcpu)->vmcb->save.cr4 = cr4; | 1514 | to_svm(vcpu)->vmcb->save.cr4 = cr4; |
1512 | mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); | 1515 | mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); |
1516 | return 0; | ||
1513 | } | 1517 | } |
1514 | 1518 | ||
1515 | static void svm_set_segment(struct kvm_vcpu *vcpu, | 1519 | static void svm_set_segment(struct kvm_vcpu *vcpu, |
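The void-to-int change to svm_set_cr4() above lets the vendor hook veto CR4 values it cannot honour; SVM has no VMX, so a guest attempt to set CR4.VMXE must fail. A hedged sketch of the caller side in common code (kvm_set_cr4() in x86.c follows this shape; the surrounding reserved-bit and long-mode validity checks are elided):

	int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
	{
		/* ... reserved-bit, PAE and long-mode validity checks ... */
		if (kvm_x86_ops->set_cr4(vcpu, cr4))
			return 1;	/* rejected: the caller injects #GP(0) */
		/* ... post-write bookkeeping (MMU/TLB updates as needed) ... */
		return 0;
	}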
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index db932760ea82..3ff898c104f7 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h | |||
@@ -675,12 +675,12 @@ TRACE_EVENT(kvm_emulate_insn, | |||
675 | ), | 675 | ), |
676 | 676 | ||
677 | TP_fast_assign( | 677 | TP_fast_assign( |
678 | __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start; | 678 | __entry->rip = vcpu->arch.emulate_ctxt.fetch.start; |
679 | __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); | 679 | __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); |
680 | __entry->len = vcpu->arch.emulate_ctxt.decode.eip | 680 | __entry->len = vcpu->arch.emulate_ctxt._eip |
681 | - vcpu->arch.emulate_ctxt.decode.fetch.start; | 681 | - vcpu->arch.emulate_ctxt.fetch.start; |
682 | memcpy(__entry->insn, | 682 | memcpy(__entry->insn, |
683 | vcpu->arch.emulate_ctxt.decode.fetch.data, | 683 | vcpu->arch.emulate_ctxt.fetch.data, |
684 | 15); | 684 | 15); |
685 | __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); | 685 | __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); |
686 | __entry->failed = failed; | 686 | __entry->failed = failed; |
@@ -698,6 +698,29 @@ TRACE_EVENT(kvm_emulate_insn, | |||
698 | #define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) | 698 | #define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) |
699 | #define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) | 699 | #define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) |
700 | 700 | ||
701 | TRACE_EVENT( | ||
702 | vcpu_match_mmio, | ||
703 | TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match), | ||
704 | TP_ARGS(gva, gpa, write, gpa_match), | ||
705 | |||
706 | TP_STRUCT__entry( | ||
707 | __field(gva_t, gva) | ||
708 | __field(gpa_t, gpa) | ||
709 | __field(bool, write) | ||
710 | __field(bool, gpa_match) | ||
711 | ), | ||
712 | |||
713 | TP_fast_assign( | ||
714 | __entry->gva = gva; | ||
715 | __entry->gpa = gpa; | ||
716 | __entry->write = write; | ||
717 | __entry->gpa_match = gpa_match | ||
718 | ), | ||
719 | |||
720 | TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa, | ||
721 | __entry->write ? "Write" : "Read", | ||
722 | __entry->gpa_match ? "GPA" : "GVA") | ||
723 | ); | ||
701 | #endif /* _TRACE_KVM_H */ | 724 | #endif /* _TRACE_KVM_H */ |
702 | 725 | ||
703 | #undef TRACE_INCLUDE_PATH | 726 | #undef TRACE_INCLUDE_PATH |
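The new vcpu_match_mmio tracepoint reports hits in the per-vcpu cached mmio translation introduced elsewhere in this series. A sketch of how it is fired (the real call sites live in vcpu_mmio_gva_to_gpa() in x86.c; the access checks are elided):

	/* A cached-GVA hit is traced with gpa_match == false ("GVA"),
	 * a cached-GPA hit with gpa_match == true ("GPA"), matching the
	 * TP_printk() format above. */
	if (vcpu_match_mmio_gva(vcpu, gva)) {
		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
		       (gva & (PAGE_SIZE - 1));
		trace_vcpu_match_mmio(gva, *gpa, write, false);
		return 1;
	}
	if (vcpu_match_mmio_gpa(vcpu, *gpa))
		trace_vcpu_match_mmio(gva, *gpa, write, true);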
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d48ec60ea421..e65a158dee64 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -43,13 +43,12 @@ | |||
43 | #include "trace.h" | 43 | #include "trace.h" |
44 | 44 | ||
45 | #define __ex(x) __kvm_handle_fault_on_reboot(x) | 45 | #define __ex(x) __kvm_handle_fault_on_reboot(x) |
46 | #define __ex_clear(x, reg) \ | ||
47 | ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg) | ||
46 | 48 | ||
47 | MODULE_AUTHOR("Qumranet"); | 49 | MODULE_AUTHOR("Qumranet"); |
48 | MODULE_LICENSE("GPL"); | 50 | MODULE_LICENSE("GPL"); |
49 | 51 | ||
50 | static int __read_mostly bypass_guest_pf = 1; | ||
51 | module_param(bypass_guest_pf, bool, S_IRUGO); | ||
52 | |||
53 | static int __read_mostly enable_vpid = 1; | 52 | static int __read_mostly enable_vpid = 1; |
54 | module_param_named(vpid, enable_vpid, bool, 0444); | 53 | module_param_named(vpid, enable_vpid, bool, 0444); |
55 | 54 | ||
@@ -72,6 +71,14 @@ module_param(vmm_exclusive, bool, S_IRUGO); | |||
72 | static int __read_mostly yield_on_hlt = 1; | 71 | static int __read_mostly yield_on_hlt = 1; |
73 | module_param(yield_on_hlt, bool, S_IRUGO); | 72 | module_param(yield_on_hlt, bool, S_IRUGO); |
74 | 73 | ||
74 | /* | ||
75 | * If nested=1, nested virtualization is supported, i.e., guests may use | ||
76 | * VMX and be a hypervisor for their own guests. If nested=0, guests may not | ||
77 | * use VMX instructions. | ||
78 | */ | ||
79 | static int __read_mostly nested = 0; | ||
80 | module_param(nested, bool, S_IRUGO); | ||
81 | |||
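In practice this is a load-time switch: on Intel hosts something like "modprobe kvm_intel nested=1" enables nested VMX, and it stays off by default.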
75 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ | 82 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ |
76 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) | 83 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) |
77 | #define KVM_GUEST_CR0_MASK \ | 84 | #define KVM_GUEST_CR0_MASK \ |
@@ -109,6 +116,7 @@ static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; | |||
109 | module_param(ple_window, int, S_IRUGO); | 116 | module_param(ple_window, int, S_IRUGO); |
110 | 117 | ||
111 | #define NR_AUTOLOAD_MSRS 1 | 118 | #define NR_AUTOLOAD_MSRS 1 |
119 | #define VMCS02_POOL_SIZE 1 | ||
112 | 120 | ||
113 | struct vmcs { | 121 | struct vmcs { |
114 | u32 revision_id; | 122 | u32 revision_id; |
@@ -116,17 +124,237 @@ struct vmcs { | |||
116 | char data[0]; | 124 | char data[0]; |
117 | }; | 125 | }; |
118 | 126 | ||
127 | /* | ||
128 | * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also | ||
129 | * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs | ||
130 | * loaded on this CPU (so we can clear them if the CPU goes down). | ||
131 | */ | ||
132 | struct loaded_vmcs { | ||
133 | struct vmcs *vmcs; | ||
134 | int cpu; | ||
135 | int launched; | ||
136 | struct list_head loaded_vmcss_on_cpu_link; | ||
137 | }; | ||
138 | |||
119 | struct shared_msr_entry { | 139 | struct shared_msr_entry { |
120 | unsigned index; | 140 | unsigned index; |
121 | u64 data; | 141 | u64 data; |
122 | u64 mask; | 142 | u64 mask; |
123 | }; | 143 | }; |
124 | 144 | ||
145 | /* | ||
146 | * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a | ||
147 | * single nested guest (L2), hence the name vmcs12. Any VMX implementation has | ||
148 | * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is | ||
149 | * stored in guest memory specified by VMPTRLD, but is opaque to the guest, | ||
150 | * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. | ||
151 | * More than one of these structures may exist, if L1 runs multiple L2 guests. | ||
152 | * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the | ||
153 | * underlying hardware which will be used to run L2. | ||
154 | * This structure is packed to ensure that its layout is identical across | ||
155 | * machines (necessary for live migration). | ||
156 | * If there are changes in this struct, VMCS12_REVISION must be changed. | ||
157 | */ | ||
158 | typedef u64 natural_width; | ||
159 | struct __packed vmcs12 { | ||
160 | /* According to the Intel spec, a VMCS region must start with the | ||
161 | * following two fields. Then follow implementation-specific data. | ||
162 | */ | ||
163 | u32 revision_id; | ||
164 | u32 abort; | ||
165 | |||
166 | u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ | ||
167 | u32 padding[7]; /* room for future expansion */ | ||
168 | |||
169 | u64 io_bitmap_a; | ||
170 | u64 io_bitmap_b; | ||
171 | u64 msr_bitmap; | ||
172 | u64 vm_exit_msr_store_addr; | ||
173 | u64 vm_exit_msr_load_addr; | ||
174 | u64 vm_entry_msr_load_addr; | ||
175 | u64 tsc_offset; | ||
176 | u64 virtual_apic_page_addr; | ||
177 | u64 apic_access_addr; | ||
178 | u64 ept_pointer; | ||
179 | u64 guest_physical_address; | ||
180 | u64 vmcs_link_pointer; | ||
181 | u64 guest_ia32_debugctl; | ||
182 | u64 guest_ia32_pat; | ||
183 | u64 guest_ia32_efer; | ||
184 | u64 guest_ia32_perf_global_ctrl; | ||
185 | u64 guest_pdptr0; | ||
186 | u64 guest_pdptr1; | ||
187 | u64 guest_pdptr2; | ||
188 | u64 guest_pdptr3; | ||
189 | u64 host_ia32_pat; | ||
190 | u64 host_ia32_efer; | ||
191 | u64 host_ia32_perf_global_ctrl; | ||
192 | u64 padding64[8]; /* room for future expansion */ | ||
193 | /* | ||
194 | * To allow migration of L1 (complete with its L2 guests) between | ||
195 | * machines of different natural widths (32 or 64 bit), we cannot have | ||
196 | * unsigned long fields with no explicit size. We use u64 (aliased | ||
197 | * natural_width) instead. Luckily, x86 is little-endian. | ||
198 | */ | ||
199 | natural_width cr0_guest_host_mask; | ||
200 | natural_width cr4_guest_host_mask; | ||
201 | natural_width cr0_read_shadow; | ||
202 | natural_width cr4_read_shadow; | ||
203 | natural_width cr3_target_value0; | ||
204 | natural_width cr3_target_value1; | ||
205 | natural_width cr3_target_value2; | ||
206 | natural_width cr3_target_value3; | ||
207 | natural_width exit_qualification; | ||
208 | natural_width guest_linear_address; | ||
209 | natural_width guest_cr0; | ||
210 | natural_width guest_cr3; | ||
211 | natural_width guest_cr4; | ||
212 | natural_width guest_es_base; | ||
213 | natural_width guest_cs_base; | ||
214 | natural_width guest_ss_base; | ||
215 | natural_width guest_ds_base; | ||
216 | natural_width guest_fs_base; | ||
217 | natural_width guest_gs_base; | ||
218 | natural_width guest_ldtr_base; | ||
219 | natural_width guest_tr_base; | ||
220 | natural_width guest_gdtr_base; | ||
221 | natural_width guest_idtr_base; | ||
222 | natural_width guest_dr7; | ||
223 | natural_width guest_rsp; | ||
224 | natural_width guest_rip; | ||
225 | natural_width guest_rflags; | ||
226 | natural_width guest_pending_dbg_exceptions; | ||
227 | natural_width guest_sysenter_esp; | ||
228 | natural_width guest_sysenter_eip; | ||
229 | natural_width host_cr0; | ||
230 | natural_width host_cr3; | ||
231 | natural_width host_cr4; | ||
232 | natural_width host_fs_base; | ||
233 | natural_width host_gs_base; | ||
234 | natural_width host_tr_base; | ||
235 | natural_width host_gdtr_base; | ||
236 | natural_width host_idtr_base; | ||
237 | natural_width host_ia32_sysenter_esp; | ||
238 | natural_width host_ia32_sysenter_eip; | ||
239 | natural_width host_rsp; | ||
240 | natural_width host_rip; | ||
241 | natural_width paddingl[8]; /* room for future expansion */ | ||
242 | u32 pin_based_vm_exec_control; | ||
243 | u32 cpu_based_vm_exec_control; | ||
244 | u32 exception_bitmap; | ||
245 | u32 page_fault_error_code_mask; | ||
246 | u32 page_fault_error_code_match; | ||
247 | u32 cr3_target_count; | ||
248 | u32 vm_exit_controls; | ||
249 | u32 vm_exit_msr_store_count; | ||
250 | u32 vm_exit_msr_load_count; | ||
251 | u32 vm_entry_controls; | ||
252 | u32 vm_entry_msr_load_count; | ||
253 | u32 vm_entry_intr_info_field; | ||
254 | u32 vm_entry_exception_error_code; | ||
255 | u32 vm_entry_instruction_len; | ||
256 | u32 tpr_threshold; | ||
257 | u32 secondary_vm_exec_control; | ||
258 | u32 vm_instruction_error; | ||
259 | u32 vm_exit_reason; | ||
260 | u32 vm_exit_intr_info; | ||
261 | u32 vm_exit_intr_error_code; | ||
262 | u32 idt_vectoring_info_field; | ||
263 | u32 idt_vectoring_error_code; | ||
264 | u32 vm_exit_instruction_len; | ||
265 | u32 vmx_instruction_info; | ||
266 | u32 guest_es_limit; | ||
267 | u32 guest_cs_limit; | ||
268 | u32 guest_ss_limit; | ||
269 | u32 guest_ds_limit; | ||
270 | u32 guest_fs_limit; | ||
271 | u32 guest_gs_limit; | ||
272 | u32 guest_ldtr_limit; | ||
273 | u32 guest_tr_limit; | ||
274 | u32 guest_gdtr_limit; | ||
275 | u32 guest_idtr_limit; | ||
276 | u32 guest_es_ar_bytes; | ||
277 | u32 guest_cs_ar_bytes; | ||
278 | u32 guest_ss_ar_bytes; | ||
279 | u32 guest_ds_ar_bytes; | ||
280 | u32 guest_fs_ar_bytes; | ||
281 | u32 guest_gs_ar_bytes; | ||
282 | u32 guest_ldtr_ar_bytes; | ||
283 | u32 guest_tr_ar_bytes; | ||
284 | u32 guest_interruptibility_info; | ||
285 | u32 guest_activity_state; | ||
286 | u32 guest_sysenter_cs; | ||
287 | u32 host_ia32_sysenter_cs; | ||
288 | u32 padding32[8]; /* room for future expansion */ | ||
289 | u16 virtual_processor_id; | ||
290 | u16 guest_es_selector; | ||
291 | u16 guest_cs_selector; | ||
292 | u16 guest_ss_selector; | ||
293 | u16 guest_ds_selector; | ||
294 | u16 guest_fs_selector; | ||
295 | u16 guest_gs_selector; | ||
296 | u16 guest_ldtr_selector; | ||
297 | u16 guest_tr_selector; | ||
298 | u16 host_es_selector; | ||
299 | u16 host_cs_selector; | ||
300 | u16 host_ss_selector; | ||
301 | u16 host_ds_selector; | ||
302 | u16 host_fs_selector; | ||
303 | u16 host_gs_selector; | ||
304 | u16 host_tr_selector; | ||
305 | }; | ||
306 | |||
307 | /* | ||
308 | * VMCS12_REVISION is an arbitrary id that should be changed if the content or | ||
309 | * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and | ||
310 | * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. | ||
311 | */ | ||
312 | #define VMCS12_REVISION 0x11e57ed0 | ||
313 | |||
314 | /* | ||
315 | * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region | ||
316 | * and any VMCS region. Although only sizeof(struct vmcs12) is used by the | ||
317 | * current implementation, 4K are reserved to avoid future complications. | ||
318 | */ | ||
319 | #define VMCS12_SIZE 0x1000 | ||
320 | |||
321 | /* Used to remember the last vmcs02 used for some recently used vmcs12s */ | ||
322 | struct vmcs02_list { | ||
323 | struct list_head list; | ||
324 | gpa_t vmptr; | ||
325 | struct loaded_vmcs vmcs02; | ||
326 | }; | ||
327 | |||
328 | /* | ||
329 | * The nested_vmx structure is part of vcpu_vmx, and holds information we need | ||
330 | * for correct emulation of VMX (i.e., nested VMX) on this vcpu. | ||
331 | */ | ||
332 | struct nested_vmx { | ||
333 | /* Has the level1 guest done vmxon? */ | ||
334 | bool vmxon; | ||
335 | |||
336 | /* The guest-physical address of the current VMCS L1 keeps for L2 */ | ||
337 | gpa_t current_vmptr; | ||
338 | /* The host-usable pointer to the above */ | ||
339 | struct page *current_vmcs12_page; | ||
340 | struct vmcs12 *current_vmcs12; | ||
341 | |||
342 | /* vmcs02_list cache of VMCSs recently used to run L2 guests */ | ||
343 | struct list_head vmcs02_pool; | ||
344 | int vmcs02_num; | ||
345 | u64 vmcs01_tsc_offset; | ||
346 | /* L2 must run next, and mustn't decide to exit to L1. */ | ||
347 | bool nested_run_pending; | ||
348 | /* | ||
349 | * Guest pages referred to in vmcs02 with host-physical pointers, so | ||
350 | * we must keep them pinned while L2 runs. | ||
351 | */ | ||
352 | struct page *apic_access_page; | ||
353 | }; | ||
354 | |||
125 | struct vcpu_vmx { | 355 | struct vcpu_vmx { |
126 | struct kvm_vcpu vcpu; | 356 | struct kvm_vcpu vcpu; |
127 | struct list_head local_vcpus_link; | ||
128 | unsigned long host_rsp; | 357 | unsigned long host_rsp; |
129 | int launched; | ||
130 | u8 fail; | 358 | u8 fail; |
131 | u8 cpl; | 359 | u8 cpl; |
132 | bool nmi_known_unmasked; | 360 | bool nmi_known_unmasked; |
@@ -140,7 +368,14 @@ struct vcpu_vmx { | |||
140 | u64 msr_host_kernel_gs_base; | 368 | u64 msr_host_kernel_gs_base; |
141 | u64 msr_guest_kernel_gs_base; | 369 | u64 msr_guest_kernel_gs_base; |
142 | #endif | 370 | #endif |
143 | struct vmcs *vmcs; | 371 | /* |
372 | * loaded_vmcs points to the VMCS currently used in this vcpu. For a | ||
373 | * non-nested (L1) guest, it always points to vmcs01. For a nested | ||
374 | * guest (L2), it points to a different VMCS. | ||
375 | */ | ||
376 | struct loaded_vmcs vmcs01; | ||
377 | struct loaded_vmcs *loaded_vmcs; | ||
378 | bool __launched; /* temporary, used in vmx_vcpu_run */ | ||
144 | struct msr_autoload { | 379 | struct msr_autoload { |
145 | unsigned nr; | 380 | unsigned nr; |
146 | struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; | 381 | struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; |
@@ -176,6 +411,9 @@ struct vcpu_vmx { | |||
176 | u32 exit_reason; | 411 | u32 exit_reason; |
177 | 412 | ||
178 | bool rdtscp_enabled; | 413 | bool rdtscp_enabled; |
414 | |||
415 | /* Support for a guest hypervisor (nested VMX) */ | ||
416 | struct nested_vmx nested; | ||
179 | }; | 417 | }; |
180 | 418 | ||
181 | enum segment_cache_field { | 419 | enum segment_cache_field { |
@@ -192,6 +430,174 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | |||
192 | return container_of(vcpu, struct vcpu_vmx, vcpu); | 430 | return container_of(vcpu, struct vcpu_vmx, vcpu); |
193 | } | 431 | } |
194 | 432 | ||
433 | #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) | ||
434 | #define FIELD(number, name) [number] = VMCS12_OFFSET(name) | ||
435 | #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ | ||
436 | [number##_HIGH] = VMCS12_OFFSET(name)+4 | ||
437 | |||
438 | static unsigned short vmcs_field_to_offset_table[] = { | ||
439 | FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), | ||
440 | FIELD(GUEST_ES_SELECTOR, guest_es_selector), | ||
441 | FIELD(GUEST_CS_SELECTOR, guest_cs_selector), | ||
442 | FIELD(GUEST_SS_SELECTOR, guest_ss_selector), | ||
443 | FIELD(GUEST_DS_SELECTOR, guest_ds_selector), | ||
444 | FIELD(GUEST_FS_SELECTOR, guest_fs_selector), | ||
445 | FIELD(GUEST_GS_SELECTOR, guest_gs_selector), | ||
446 | FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), | ||
447 | FIELD(GUEST_TR_SELECTOR, guest_tr_selector), | ||
448 | FIELD(HOST_ES_SELECTOR, host_es_selector), | ||
449 | FIELD(HOST_CS_SELECTOR, host_cs_selector), | ||
450 | FIELD(HOST_SS_SELECTOR, host_ss_selector), | ||
451 | FIELD(HOST_DS_SELECTOR, host_ds_selector), | ||
452 | FIELD(HOST_FS_SELECTOR, host_fs_selector), | ||
453 | FIELD(HOST_GS_SELECTOR, host_gs_selector), | ||
454 | FIELD(HOST_TR_SELECTOR, host_tr_selector), | ||
455 | FIELD64(IO_BITMAP_A, io_bitmap_a), | ||
456 | FIELD64(IO_BITMAP_B, io_bitmap_b), | ||
457 | FIELD64(MSR_BITMAP, msr_bitmap), | ||
458 | FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), | ||
459 | FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), | ||
460 | FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), | ||
461 | FIELD64(TSC_OFFSET, tsc_offset), | ||
462 | FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), | ||
463 | FIELD64(APIC_ACCESS_ADDR, apic_access_addr), | ||
464 | FIELD64(EPT_POINTER, ept_pointer), | ||
465 | FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), | ||
466 | FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), | ||
467 | FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), | ||
468 | FIELD64(GUEST_IA32_PAT, guest_ia32_pat), | ||
469 | FIELD64(GUEST_IA32_EFER, guest_ia32_efer), | ||
470 | FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), | ||
471 | FIELD64(GUEST_PDPTR0, guest_pdptr0), | ||
472 | FIELD64(GUEST_PDPTR1, guest_pdptr1), | ||
473 | FIELD64(GUEST_PDPTR2, guest_pdptr2), | ||
474 | FIELD64(GUEST_PDPTR3, guest_pdptr3), | ||
475 | FIELD64(HOST_IA32_PAT, host_ia32_pat), | ||
476 | FIELD64(HOST_IA32_EFER, host_ia32_efer), | ||
477 | FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), | ||
478 | FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), | ||
479 | FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), | ||
480 | FIELD(EXCEPTION_BITMAP, exception_bitmap), | ||
481 | FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), | ||
482 | FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), | ||
483 | FIELD(CR3_TARGET_COUNT, cr3_target_count), | ||
484 | FIELD(VM_EXIT_CONTROLS, vm_exit_controls), | ||
485 | FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), | ||
486 | FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), | ||
487 | FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), | ||
488 | FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), | ||
489 | FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), | ||
490 | FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), | ||
491 | FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), | ||
492 | FIELD(TPR_THRESHOLD, tpr_threshold), | ||
493 | FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), | ||
494 | FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), | ||
495 | FIELD(VM_EXIT_REASON, vm_exit_reason), | ||
496 | FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), | ||
497 | FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), | ||
498 | FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), | ||
499 | FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), | ||
500 | FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), | ||
501 | FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), | ||
502 | FIELD(GUEST_ES_LIMIT, guest_es_limit), | ||
503 | FIELD(GUEST_CS_LIMIT, guest_cs_limit), | ||
504 | FIELD(GUEST_SS_LIMIT, guest_ss_limit), | ||
505 | FIELD(GUEST_DS_LIMIT, guest_ds_limit), | ||
506 | FIELD(GUEST_FS_LIMIT, guest_fs_limit), | ||
507 | FIELD(GUEST_GS_LIMIT, guest_gs_limit), | ||
508 | FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), | ||
509 | FIELD(GUEST_TR_LIMIT, guest_tr_limit), | ||
510 | FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), | ||
511 | FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), | ||
512 | FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), | ||
513 | FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), | ||
514 | FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), | ||
515 | FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), | ||
516 | FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), | ||
517 | FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), | ||
518 | FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), | ||
519 | FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), | ||
520 | FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), | ||
521 | FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), | ||
522 | FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), | ||
523 | FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), | ||
524 | FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), | ||
525 | FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), | ||
526 | FIELD(CR0_READ_SHADOW, cr0_read_shadow), | ||
527 | FIELD(CR4_READ_SHADOW, cr4_read_shadow), | ||
528 | FIELD(CR3_TARGET_VALUE0, cr3_target_value0), | ||
529 | FIELD(CR3_TARGET_VALUE1, cr3_target_value1), | ||
530 | FIELD(CR3_TARGET_VALUE2, cr3_target_value2), | ||
531 | FIELD(CR3_TARGET_VALUE3, cr3_target_value3), | ||
532 | FIELD(EXIT_QUALIFICATION, exit_qualification), | ||
533 | FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), | ||
534 | FIELD(GUEST_CR0, guest_cr0), | ||
535 | FIELD(GUEST_CR3, guest_cr3), | ||
536 | FIELD(GUEST_CR4, guest_cr4), | ||
537 | FIELD(GUEST_ES_BASE, guest_es_base), | ||
538 | FIELD(GUEST_CS_BASE, guest_cs_base), | ||
539 | FIELD(GUEST_SS_BASE, guest_ss_base), | ||
540 | FIELD(GUEST_DS_BASE, guest_ds_base), | ||
541 | FIELD(GUEST_FS_BASE, guest_fs_base), | ||
542 | FIELD(GUEST_GS_BASE, guest_gs_base), | ||
543 | FIELD(GUEST_LDTR_BASE, guest_ldtr_base), | ||
544 | FIELD(GUEST_TR_BASE, guest_tr_base), | ||
545 | FIELD(GUEST_GDTR_BASE, guest_gdtr_base), | ||
546 | FIELD(GUEST_IDTR_BASE, guest_idtr_base), | ||
547 | FIELD(GUEST_DR7, guest_dr7), | ||
548 | FIELD(GUEST_RSP, guest_rsp), | ||
549 | FIELD(GUEST_RIP, guest_rip), | ||
550 | FIELD(GUEST_RFLAGS, guest_rflags), | ||
551 | FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), | ||
552 | FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), | ||
553 | FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), | ||
554 | FIELD(HOST_CR0, host_cr0), | ||
555 | FIELD(HOST_CR3, host_cr3), | ||
556 | FIELD(HOST_CR4, host_cr4), | ||
557 | FIELD(HOST_FS_BASE, host_fs_base), | ||
558 | FIELD(HOST_GS_BASE, host_gs_base), | ||
559 | FIELD(HOST_TR_BASE, host_tr_base), | ||
560 | FIELD(HOST_GDTR_BASE, host_gdtr_base), | ||
561 | FIELD(HOST_IDTR_BASE, host_idtr_base), | ||
562 | FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), | ||
563 | FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), | ||
564 | FIELD(HOST_RSP, host_rsp), | ||
565 | FIELD(HOST_RIP, host_rip), | ||
566 | }; | ||
567 | static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table); | ||
568 | |||
569 | static inline short vmcs_field_to_offset(unsigned long field) | ||
570 | { | ||
571 | if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0) | ||
572 | return -1; | ||
573 | return vmcs_field_to_offset_table[field]; | ||
574 | } | ||
575 | |||
576 | static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) | ||
577 | { | ||
578 | return to_vmx(vcpu)->nested.current_vmcs12; | ||
579 | } | ||
580 | |||
581 | static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) | ||
582 | { | ||
583 | struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); | ||
584 | if (is_error_page(page)) { | ||
585 | kvm_release_page_clean(page); | ||
586 | return NULL; | ||
587 | } | ||
588 | return page; | ||
589 | } | ||
590 | |||
591 | static void nested_release_page(struct page *page) | ||
592 | { | ||
593 | kvm_release_page_dirty(page); | ||
594 | } | ||
595 | |||
596 | static void nested_release_page_clean(struct page *page) | ||
597 | { | ||
598 | kvm_release_page_clean(page); | ||
599 | } | ||
600 | |||
195 | static u64 construct_eptp(unsigned long root_hpa); | 601 | static u64 construct_eptp(unsigned long root_hpa); |
196 | static void kvm_cpu_vmxon(u64 addr); | 602 | static void kvm_cpu_vmxon(u64 addr); |
197 | static void kvm_cpu_vmxoff(void); | 603 | static void kvm_cpu_vmxoff(void); |
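The FIELD()/FIELD64() construction earlier in this hunk maps each supported VMCS field encoding to the byte offset of its backing member in struct vmcs12, which is what lets emulated VMREAD/VMWRITE be a simple table lookup. One detail worth spelling out: a FIELD64() entry fills two slots, because every 64-bit field is also addressable through its _HIGH encoding (field number + 1), which refers to the upper half. An illustrative expansion, plus a sketch of how an emulated VMREAD can resolve a field (the real accessor also dispatches on the field's width):

	/* FIELD64(IO_BITMAP_A, io_bitmap_a) expands to the pair:
	 *   [IO_BITMAP_A]      = offsetof(struct vmcs12, io_bitmap_a),
	 *   [IO_BITMAP_A_HIGH] = offsetof(struct vmcs12, io_bitmap_a) + 4,
	 * so the _HIGH encoding accesses the upper 32 bits in place. */
	short offset = vmcs_field_to_offset(field);
	if (offset < 0)
		return 0;	/* unsupported field: fail the emulated VMREAD */
	*value = *(u32 *)((char *)get_vmcs12(vcpu) + offset);	/* 32-bit case shown */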
@@ -200,7 +606,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); | |||
200 | 606 | ||
201 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | 607 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); |
202 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | 608 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); |
203 | static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); | 609 | /* |
610 | * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed | ||
611 | * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. | ||
612 | */ | ||
613 | static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); | ||
204 | static DEFINE_PER_CPU(struct desc_ptr, host_gdt); | 614 | static DEFINE_PER_CPU(struct desc_ptr, host_gdt); |
205 | 615 | ||
206 | static unsigned long *vmx_io_bitmap_a; | 616 | static unsigned long *vmx_io_bitmap_a; |
@@ -442,6 +852,35 @@ static inline bool report_flexpriority(void) | |||
442 | return flexpriority_enabled; | 852 | return flexpriority_enabled; |
443 | } | 853 | } |
444 | 854 | ||
855 | static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) | ||
856 | { | ||
857 | return vmcs12->cpu_based_vm_exec_control & bit; | ||
858 | } | ||
859 | |||
860 | static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) | ||
861 | { | ||
862 | return (vmcs12->cpu_based_vm_exec_control & | ||
863 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && | ||
864 | (vmcs12->secondary_vm_exec_control & bit); | ||
865 | } | ||
866 | |||
867 | static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, | ||
868 | struct kvm_vcpu *vcpu) | ||
869 | { | ||
870 | return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; | ||
871 | } | ||
872 | |||
873 | static inline bool is_exception(u32 intr_info) | ||
874 | { | ||
875 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | ||
876 | == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); | ||
877 | } | ||
878 | |||
879 | static void nested_vmx_vmexit(struct kvm_vcpu *vcpu); | ||
880 | static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, | ||
881 | struct vmcs12 *vmcs12, | ||
882 | u32 reason, unsigned long qualification); | ||
883 | |||
445 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) | 884 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) |
446 | { | 885 | { |
447 | int i; | 886 | int i; |
@@ -501,6 +940,13 @@ static void vmcs_clear(struct vmcs *vmcs) | |||
501 | vmcs, phys_addr); | 940 | vmcs, phys_addr); |
502 | } | 941 | } |
503 | 942 | ||
943 | static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) | ||
944 | { | ||
945 | vmcs_clear(loaded_vmcs->vmcs); | ||
946 | loaded_vmcs->cpu = -1; | ||
947 | loaded_vmcs->launched = 0; | ||
948 | } | ||
949 | |||
504 | static void vmcs_load(struct vmcs *vmcs) | 950 | static void vmcs_load(struct vmcs *vmcs) |
505 | { | 951 | { |
506 | u64 phys_addr = __pa(vmcs); | 952 | u64 phys_addr = __pa(vmcs); |
@@ -510,29 +956,28 @@ static void vmcs_load(struct vmcs *vmcs) | |||
510 | : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) | 956 | : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) |
511 | : "cc", "memory"); | 957 | : "cc", "memory"); |
512 | if (error) | 958 | if (error) |
513 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", | 959 | printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", |
514 | vmcs, phys_addr); | 960 | vmcs, phys_addr); |
515 | } | 961 | } |
516 | 962 | ||
517 | static void __vcpu_clear(void *arg) | 963 | static void __loaded_vmcs_clear(void *arg) |
518 | { | 964 | { |
519 | struct vcpu_vmx *vmx = arg; | 965 | struct loaded_vmcs *loaded_vmcs = arg; |
520 | int cpu = raw_smp_processor_id(); | 966 | int cpu = raw_smp_processor_id(); |
521 | 967 | ||
522 | if (vmx->vcpu.cpu == cpu) | 968 | if (loaded_vmcs->cpu != cpu) |
523 | vmcs_clear(vmx->vmcs); | 969 | return; /* vcpu migration can race with cpu offline */ |
524 | if (per_cpu(current_vmcs, cpu) == vmx->vmcs) | 970 | if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) |
525 | per_cpu(current_vmcs, cpu) = NULL; | 971 | per_cpu(current_vmcs, cpu) = NULL; |
526 | list_del(&vmx->local_vcpus_link); | 972 | list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); |
527 | vmx->vcpu.cpu = -1; | 973 | loaded_vmcs_init(loaded_vmcs); |
528 | vmx->launched = 0; | ||
529 | } | 974 | } |
530 | 975 | ||
531 | static void vcpu_clear(struct vcpu_vmx *vmx) | 976 | static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) |
532 | { | 977 | { |
533 | if (vmx->vcpu.cpu == -1) | 978 | if (loaded_vmcs->cpu != -1) |
534 | return; | 979 | smp_call_function_single( |
535 | smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); | 980 | loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); |
536 | } | 981 | } |
537 | 982 | ||
538 | static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) | 983 | static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) |
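One consequence of tracking loaded VMCSs per CPU rather than per vcpu: when a CPU is taken offline, every VMCS still loaded on it has to be VMCLEARed before VMXOFF. A sketch of that path under the new scheme (this commit introduces a helper of roughly this shape in place of the old per-vcpu walk):

	static void vmclear_local_loaded_vmcss(void)
	{
		int cpu = raw_smp_processor_id();
		struct loaded_vmcs *v, *n;

		/* clear every VMCS this CPU still holds, unlinking as we go */
		list_for_each_entry_safe(v, n,
				&per_cpu(loaded_vmcss_on_cpu, cpu),
				loaded_vmcss_on_cpu_link)
			__loaded_vmcs_clear(v);
	}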
@@ -585,26 +1030,26 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) | |||
585 | } | 1030 | } |
586 | } | 1031 | } |
587 | 1032 | ||
588 | static unsigned long vmcs_readl(unsigned long field) | 1033 | static __always_inline unsigned long vmcs_readl(unsigned long field) |
589 | { | 1034 | { |
590 | unsigned long value = 0; | 1035 | unsigned long value; |
591 | 1036 | ||
592 | asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) | 1037 | asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0") |
593 | : "+a"(value) : "d"(field) : "cc"); | 1038 | : "=a"(value) : "d"(field) : "cc"); |
594 | return value; | 1039 | return value; |
595 | } | 1040 | } |
596 | 1041 | ||
597 | static u16 vmcs_read16(unsigned long field) | 1042 | static __always_inline u16 vmcs_read16(unsigned long field) |
598 | { | 1043 | { |
599 | return vmcs_readl(field); | 1044 | return vmcs_readl(field); |
600 | } | 1045 | } |
601 | 1046 | ||
602 | static u32 vmcs_read32(unsigned long field) | 1047 | static __always_inline u32 vmcs_read32(unsigned long field) |
603 | { | 1048 | { |
604 | return vmcs_readl(field); | 1049 | return vmcs_readl(field); |
605 | } | 1050 | } |
606 | 1051 | ||
607 | static u64 vmcs_read64(unsigned long field) | 1052 | static __always_inline u64 vmcs_read64(unsigned long field) |
608 | { | 1053 | { |
609 | #ifdef CONFIG_X86_64 | 1054 | #ifdef CONFIG_X86_64 |
610 | return vmcs_readl(field); | 1055 | return vmcs_readl(field); |
@@ -731,6 +1176,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) | |||
731 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ | 1176 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ |
732 | if (vcpu->fpu_active) | 1177 | if (vcpu->fpu_active) |
733 | eb &= ~(1u << NM_VECTOR); | 1178 | eb &= ~(1u << NM_VECTOR); |
1179 | |||
1180 | /* When we are running a nested L2 guest and L1 specified for it a | ||
1181 | * certain exception bitmap, we must trap the same exceptions and pass | ||
1182 | * them to L1. When running L2, we will only handle the exceptions | ||
1183 | * specified above if L1 did not want them. | ||
1184 | */ | ||
1185 | if (is_guest_mode(vcpu)) | ||
1186 | eb |= get_vmcs12(vcpu)->exception_bitmap; | ||
1187 | |||
734 | vmcs_write32(EXCEPTION_BITMAP, eb); | 1188 | vmcs_write32(EXCEPTION_BITMAP, eb); |
735 | } | 1189 | } |
736 | 1190 | ||
@@ -971,22 +1425,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
971 | 1425 | ||
972 | if (!vmm_exclusive) | 1426 | if (!vmm_exclusive) |
973 | kvm_cpu_vmxon(phys_addr); | 1427 | kvm_cpu_vmxon(phys_addr); |
974 | else if (vcpu->cpu != cpu) | 1428 | else if (vmx->loaded_vmcs->cpu != cpu) |
975 | vcpu_clear(vmx); | 1429 | loaded_vmcs_clear(vmx->loaded_vmcs); |
976 | 1430 | ||
977 | if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { | 1431 | if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { |
978 | per_cpu(current_vmcs, cpu) = vmx->vmcs; | 1432 | per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; |
979 | vmcs_load(vmx->vmcs); | 1433 | vmcs_load(vmx->loaded_vmcs->vmcs); |
980 | } | 1434 | } |
981 | 1435 | ||
982 | if (vcpu->cpu != cpu) { | 1436 | if (vmx->loaded_vmcs->cpu != cpu) { |
983 | struct desc_ptr *gdt = &__get_cpu_var(host_gdt); | 1437 | struct desc_ptr *gdt = &__get_cpu_var(host_gdt); |
984 | unsigned long sysenter_esp; | 1438 | unsigned long sysenter_esp; |
985 | 1439 | ||
986 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | 1440 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); |
987 | local_irq_disable(); | 1441 | local_irq_disable(); |
988 | list_add(&vmx->local_vcpus_link, | 1442 | list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, |
989 | &per_cpu(vcpus_on_cpu, cpu)); | 1443 | &per_cpu(loaded_vmcss_on_cpu, cpu)); |
990 | local_irq_enable(); | 1444 | local_irq_enable(); |
991 | 1445 | ||
992 | /* | 1446 | /* |
@@ -998,6 +1452,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
998 | 1452 | ||
999 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); | 1453 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); |
1000 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ | 1454 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ |
1455 | vmx->loaded_vmcs->cpu = cpu; | ||
1001 | } | 1456 | } |
1002 | } | 1457 | } |
1003 | 1458 | ||
@@ -1005,7 +1460,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) | |||
1005 | { | 1460 | { |
1006 | __vmx_load_host_state(to_vmx(vcpu)); | 1461 | __vmx_load_host_state(to_vmx(vcpu)); |
1007 | if (!vmm_exclusive) { | 1462 | if (!vmm_exclusive) { |
1008 | __vcpu_clear(to_vmx(vcpu)); | 1463 | __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); |
1464 | vcpu->cpu = -1; | ||
1009 | kvm_cpu_vmxoff(); | 1465 | kvm_cpu_vmxoff(); |
1010 | } | 1466 | } |
1011 | } | 1467 | } |
@@ -1023,19 +1479,55 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) | |||
1023 | vmcs_writel(GUEST_CR0, cr0); | 1479 | vmcs_writel(GUEST_CR0, cr0); |
1024 | update_exception_bitmap(vcpu); | 1480 | update_exception_bitmap(vcpu); |
1025 | vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; | 1481 | vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; |
1482 | if (is_guest_mode(vcpu)) | ||
1483 | vcpu->arch.cr0_guest_owned_bits &= | ||
1484 | ~get_vmcs12(vcpu)->cr0_guest_host_mask; | ||
1026 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | 1485 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); |
1027 | } | 1486 | } |
1028 | 1487 | ||
1029 | static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); | 1488 | static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); |
1030 | 1489 | ||
1490 | /* | ||
1491 | * Return the cr0 value that a nested guest would read. This is a combination | ||
1492 | * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by | ||
1493 | * its hypervisor (cr0_read_shadow). | ||
1494 | */ | ||
1495 | static inline unsigned long nested_read_cr0(struct vmcs12 *fields) | ||
1496 | { | ||
1497 | return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | | ||
1498 | (fields->cr0_read_shadow & fields->cr0_guest_host_mask); | ||
1499 | } | ||
1500 | static inline unsigned long nested_read_cr4(struct vmcs12 *fields) | ||
1501 | { | ||
1502 | return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | | ||
1503 | (fields->cr4_read_shadow & fields->cr4_guest_host_mask); | ||
1504 | } | ||
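As a concrete illustration of this combination (a standalone sketch with hypothetical register values; only the masking expression mirrors nested_read_cr0()): if L1 shadows only CR0.TS, L2 reads TS from L1's shadow while every other bit comes from the cr0 actually in use.

#include <stdio.h>

int main(void)
{
	unsigned long guest_cr0       = 0x8000003bUL; /* cr0 used to run L2: TS=1 (lazy FPU) */
	unsigned long cr0_read_shadow = 0x80000033UL; /* what L1 wants L2 to observe: TS=0 */
	unsigned long mask            = 0x8UL;        /* cr0_guest_host_mask: only X86_CR0_TS */

	/* same expression as nested_read_cr0() */
	unsigned long l2_view = (guest_cr0 & ~mask) | (cr0_read_shadow & mask);
	printf("%#lx\n", l2_view); /* 0x80000033: L2 sees TS clear despite real TS=1 */
	return 0;
}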
1505 | |||
1031 | static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) | 1506 | static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) |
1032 | { | 1507 | { |
1508 | /* Note that there is no vcpu->fpu_active = 0 here. The caller must | ||
1509 | * set this *before* calling this function. | ||
1510 | */ | ||
1033 | vmx_decache_cr0_guest_bits(vcpu); | 1511 | vmx_decache_cr0_guest_bits(vcpu); |
1034 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); | 1512 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); |
1035 | update_exception_bitmap(vcpu); | 1513 | update_exception_bitmap(vcpu); |
1036 | vcpu->arch.cr0_guest_owned_bits = 0; | 1514 | vcpu->arch.cr0_guest_owned_bits = 0; |
1037 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | 1515 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); |
1038 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); | 1516 | if (is_guest_mode(vcpu)) { |
1517 | /* | ||
1518 | * L1's specified read shadow might not contain the TS bit, | ||
1519 | * so now that we turned on shadowing of this bit, we need to | ||
1520 | * set this bit of the shadow. Like in nested_vmx_run we need | ||
1521 | * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet | ||
1522 | * up-to-date here because we just decached cr0.TS (and we'll | ||
1523 | * only update vmcs12->guest_cr0 on nested exit). | ||
1524 | */ | ||
1525 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
1526 | vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | | ||
1527 | (vcpu->arch.cr0 & X86_CR0_TS); | ||
1528 | vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); | ||
1529 | } else | ||
1530 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); | ||
1039 | } | 1531 | } |
1040 | 1532 | ||
1041 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | 1533 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) |
@@ -1119,6 +1611,25 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu) | |||
1119 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); | 1611 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); |
1120 | } | 1612 | } |
1121 | 1613 | ||
1614 | /* | ||
1615 | * KVM wants to inject page faults that it received into the guest. This | ||
1616 | * function checks whether, in a nested guest, they need to go to L1 or L2. | ||
1617 | * This function assumes it is called with the exit reason in vmcs02 being | ||
1618 | * a #PF exception (this is the only case in which KVM injects a #PF when L2 | ||
1619 | * is running). | ||
1620 | */ | ||
1621 | static int nested_pf_handled(struct kvm_vcpu *vcpu) | ||
1622 | { | ||
1623 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
1624 | |||
1625 | /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ | ||
1626 | if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR))) | ||
1627 | return 0; | ||
1628 | |||
1629 | nested_vmx_vmexit(vcpu); | ||
1630 | return 1; | ||
1631 | } | ||
1632 | |||
1122 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | 1633 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
1123 | bool has_error_code, u32 error_code, | 1634 | bool has_error_code, u32 error_code, |
1124 | bool reinject) | 1635 | bool reinject) |
@@ -1126,6 +1637,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
1126 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1637 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1127 | u32 intr_info = nr | INTR_INFO_VALID_MASK; | 1638 | u32 intr_info = nr | INTR_INFO_VALID_MASK; |
1128 | 1639 | ||
1640 | if (nr == PF_VECTOR && is_guest_mode(vcpu) && | ||
1641 | nested_pf_handled(vcpu)) | ||
1642 | return; | ||
1643 | |||
1129 | if (has_error_code) { | 1644 | if (has_error_code) { |
1130 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | 1645 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); |
1131 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; | 1646 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; |
@@ -1248,12 +1763,24 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) | |||
1248 | static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | 1763 | static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) |
1249 | { | 1764 | { |
1250 | vmcs_write64(TSC_OFFSET, offset); | 1765 | vmcs_write64(TSC_OFFSET, offset); |
1766 | if (is_guest_mode(vcpu)) | ||
1767 | /* | ||
1768 | * We're here if L1 chose not to trap the TSC MSR. Since | ||
1769 | * prepare_vmcs12() does not copy tsc_offset, we need to also | ||
1770 | * set the vmcs12 field here. | ||
1771 | */ | ||
1772 | get_vmcs12(vcpu)->tsc_offset = offset - | ||
1773 | to_vmx(vcpu)->nested.vmcs01_tsc_offset; | ||
1251 | } | 1774 | } |
1252 | 1775 | ||
1253 | static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) | 1776 | static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) |
1254 | { | 1777 | { |
1255 | u64 offset = vmcs_read64(TSC_OFFSET); | 1778 | u64 offset = vmcs_read64(TSC_OFFSET); |
1256 | vmcs_write64(TSC_OFFSET, offset + adjustment); | 1779 | vmcs_write64(TSC_OFFSET, offset + adjustment); |
1780 | if (is_guest_mode(vcpu)) { | ||
1781 | /* Even when running L2, the adjustment needs to apply to L1 */ | ||
1782 | to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; | ||
1783 | } | ||
1257 | } | 1784 | } |
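A toy sketch of the arithmetic above (numbers are invented; the additive composition, where vmcs02's hardware offset is vmcs01_tsc_offset plus vmcs12's offset, is the assumption the subtraction in vmx_write_tsc_offset() relies on):

#include <stdio.h>

int main(void)
{
	long long vmcs01_tsc_offset = -1000; /* L0's offset for running L1 */
	long long vmcs12_tsc_offset =  -200; /* L1's offset for running L2 */

	/* while L2 runs, the hardware TSC_OFFSET is the sum of the two */
	long long vmcs02_offset = vmcs01_tsc_offset + vmcs12_tsc_offset; /* -1200 */

	/* vmx_write_tsc_offset() receives a new total and recovers the
	 * vmcs12 part by subtracting vmcs01_tsc_offset */
	long long new_total  = -1500;
	long long new_vmcs12 = new_total - vmcs01_tsc_offset; /* -500 */

	printf("%lld %lld\n", vmcs02_offset, new_vmcs12);
	return 0;
}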
1258 | 1785 | ||
1259 | static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) | 1786 | static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) |
@@ -1261,6 +1788,236 @@ static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) | |||
1261 | return target_tsc - native_read_tsc(); | 1788 | return target_tsc - native_read_tsc(); |
1262 | } | 1789 | } |
1263 | 1790 | ||
1791 | static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) | ||
1792 | { | ||
1793 | struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); | ||
1794 | return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31))); | ||
1795 | } | ||
1796 | |||
1797 | /* | ||
1798 | * nested_vmx_allowed() checks whether a guest should be allowed to use VMX | ||
1799 | * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for | ||
1800 | * all guests if the "nested" module option is off, and can also be disabled | ||
1801 | * for a single guest by disabling its VMX cpuid bit. | ||
1802 | */ | ||
1803 | static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) | ||
1804 | { | ||
1805 | return nested && guest_cpuid_has_vmx(vcpu); | ||
1806 | } | ||
1807 | |||
1808 | /* | ||
1809 | * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be | ||
1810 | * returned for the various VMX controls MSRs when nested VMX is enabled. | ||
1811 | * The same values should also be used to verify that vmcs12 control fields are | ||
1812 | * valid during nested entry from L1 to L2. | ||
1813 | * Each of these control msrs has a low and high 32-bit half: A low bit is on | ||
1814 | * if the corresponding bit in the (32-bit) control field *must* be on, and a | ||
1815 | * bit in the high half is on if the corresponding bit in the control field | ||
1816 | * may be on. See also vmx_control_verify(). | ||
1817 | * TODO: allow these variables to be modified (downgraded) by module options | ||
1818 | * or other means. | ||
1819 | */ | ||
1820 | static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; | ||
1821 | static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; | ||
1822 | static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; | ||
1823 | static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; | ||
1824 | static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; | ||
1825 | static __init void nested_vmx_setup_ctls_msrs(void) | ||
1826 | { | ||
1827 | /* | ||
1828 | * Note that as a general rule, the high half of the MSRs (bits in | ||
1829 | * the control fields which may be 1) should be initialized by the | ||
1830 | * intersection of the underlying hardware's MSR (i.e., features which | ||
1831 | * can be supported) and the list of features we want to expose - | ||
1832 | * because they are known to be properly supported in our code. | ||
1833 | * Also, usually, the low half of the MSRs (bits which must be 1) can | ||
1834 | * be set to 0, meaning that L1 may turn off any of these bits. The | ||
1835 | * reason is that if one of these bits is necessary, it will appear | ||
1836 | * in vmcs01, and prepare_vmcs02, when it bitwise-or's the control | ||
1837 | * fields of vmcs01 and vmcs12, will turn these bits back on - and | ||
1838 | * nested_vmx_exit_handled() will not pass related exits to L1. | ||
1839 | * These rules have exceptions below. | ||
1840 | */ | ||
1841 | |||
1842 | /* pin-based controls */ | ||
1843 | /* | ||
1844 | * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is | ||
1845 | * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. | ||
1846 | */ | ||
1847 | nested_vmx_pinbased_ctls_low = 0x16; | ||
1848 | nested_vmx_pinbased_ctls_high = 0x16 | | ||
1849 | PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | | ||
1850 | PIN_BASED_VIRTUAL_NMIS; | ||
1851 | |||
1852 | /* exit controls */ | ||
1853 | nested_vmx_exit_ctls_low = 0; | ||
1854 | /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ | ||
1855 | #ifdef CONFIG_X86_64 | ||
1856 | nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; | ||
1857 | #else | ||
1858 | nested_vmx_exit_ctls_high = 0; | ||
1859 | #endif | ||
1860 | |||
1861 | /* entry controls */ | ||
1862 | rdmsr(MSR_IA32_VMX_ENTRY_CTLS, | ||
1863 | nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); | ||
1864 | nested_vmx_entry_ctls_low = 0; | ||
1865 | nested_vmx_entry_ctls_high &= | ||
1866 | VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; | ||
1867 | |||
1868 | /* cpu-based controls */ | ||
1869 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, | ||
1870 | nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); | ||
1871 | nested_vmx_procbased_ctls_low = 0; | ||
1872 | nested_vmx_procbased_ctls_high &= | ||
1873 | CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING | | ||
1874 | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | | ||
1875 | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | | ||
1876 | CPU_BASED_CR3_STORE_EXITING | | ||
1877 | #ifdef CONFIG_X86_64 | ||
1878 | CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | | ||
1879 | #endif | ||
1880 | CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | | ||
1881 | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | | ||
1882 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | ||
1883 | /* | ||
1884 | * We can allow some features even when not supported by the | ||
1885 | * hardware. For example, L1 can specify an MSR bitmap - and we | ||
1886 | * can use it to avoid exits to L1 - even when L0 runs L2 | ||
1887 | * without MSR bitmaps. | ||
1888 | */ | ||
1889 | nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS; | ||
1890 | |||
1891 | /* secondary cpu-based controls */ | ||
1892 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, | ||
1893 | nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); | ||
1894 | nested_vmx_secondary_ctls_low = 0; | ||
1895 | nested_vmx_secondary_ctls_high &= | ||
1896 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
1897 | } | ||
1898 | |||
1899 | static inline bool vmx_control_verify(u32 control, u32 low, u32 high) | ||
1900 | { | ||
1901 | /* | ||
1902 | * Bits that are 0 in high must be 0, and bits that are 1 in low must be 1. | ||
1903 | */ | ||
1904 | return ((control & high) | low) == control; | ||
1905 | } | ||
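A worked example may make the check concrete (values are invented, but 0x16 matches the pin-based low half set above; the test expression is the same one as in vmx_control_verify()):

#include <stdbool.h>
#include <stdio.h>

static bool verify(unsigned int control, unsigned int low, unsigned int high)
{
	/* same test as vmx_control_verify() */
	return ((control & high) | low) == control;
}

int main(void)
{
	unsigned int low = 0x16, high = 0x3f;

	printf("%d\n", verify(0x16, low, high)); /* 1: exactly the required bits */
	printf("%d\n", verify(0x36, low, high)); /* 1: extra bit 5 is allowed by high */
	printf("%d\n", verify(0x56, low, high)); /* 0: bit 6 is not allowed by high */
	printf("%d\n", verify(0x14, low, high)); /* 0: required bit 1 is cleared */
	return 0;
}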
1906 | |||
1907 | static inline u64 vmx_control_msr(u32 low, u32 high) | ||
1908 | { | ||
1909 | return low | ((u64)high << 32); | ||
1910 | } | ||
1911 | |||
1912 | /* | ||
1913 | * If we allow our guest to use VMX instructions (i.e., nested VMX), we should | ||
1914 | * also let it use VMX-specific MSRs. | ||
1915 | * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a | ||
1916 | * VMX-specific MSR, or 0 when we haven't (and the caller should handle it | ||
1917 | * like all other MSRs). | ||
1918 | */ | ||
1919 | static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | ||
1920 | { | ||
1921 | if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC && | ||
1922 | msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) { | ||
1923 | /* | ||
1924 | * According to the spec, processors which do not support VMX | ||
1925 | * should throw a #GP(0) when VMX capability MSRs are read. | ||
1926 | */ | ||
1927 | kvm_queue_exception_e(vcpu, GP_VECTOR, 0); | ||
1928 | return 1; | ||
1929 | } | ||
1930 | |||
1931 | switch (msr_index) { | ||
1932 | case MSR_IA32_FEATURE_CONTROL: | ||
1933 | *pdata = 0; | ||
1934 | break; | ||
1935 | case MSR_IA32_VMX_BASIC: | ||
1936 | /* | ||
1937 | * This MSR reports some information about VMX support. We | ||
1938 | * should return information about the VMX we emulate for the | ||
1939 | * guest, and the VMCS structure we give it - not about the | ||
1940 | * VMX support of the underlying hardware. | ||
1941 | */ | ||
1942 | *pdata = VMCS12_REVISION | | ||
1943 | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | | ||
1944 | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); | ||
1945 | break; | ||
1946 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: | ||
1947 | case MSR_IA32_VMX_PINBASED_CTLS: | ||
1948 | *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, | ||
1949 | nested_vmx_pinbased_ctls_high); | ||
1950 | break; | ||
1951 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: | ||
1952 | case MSR_IA32_VMX_PROCBASED_CTLS: | ||
1953 | *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, | ||
1954 | nested_vmx_procbased_ctls_high); | ||
1955 | break; | ||
1956 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: | ||
1957 | case MSR_IA32_VMX_EXIT_CTLS: | ||
1958 | *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, | ||
1959 | nested_vmx_exit_ctls_high); | ||
1960 | break; | ||
1961 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: | ||
1962 | case MSR_IA32_VMX_ENTRY_CTLS: | ||
1963 | *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, | ||
1964 | nested_vmx_entry_ctls_high); | ||
1965 | break; | ||
1966 | case MSR_IA32_VMX_MISC: | ||
1967 | *pdata = 0; | ||
1968 | break; | ||
1969 | /* | ||
1970 | * These MSRs specify bits which the guest must keep fixed (on or off) | ||
1971 | * while L1 is in VMXON mode (in L1's root mode, or running an L2). | ||
1972 | * We picked the standard core2 setting. | ||
1973 | */ | ||
1974 | #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) | ||
1975 | #define VMXON_CR4_ALWAYSON X86_CR4_VMXE | ||
1976 | case MSR_IA32_VMX_CR0_FIXED0: | ||
1977 | *pdata = VMXON_CR0_ALWAYSON; | ||
1978 | break; | ||
1979 | case MSR_IA32_VMX_CR0_FIXED1: | ||
1980 | *pdata = -1ULL; | ||
1981 | break; | ||
1982 | case MSR_IA32_VMX_CR4_FIXED0: | ||
1983 | *pdata = VMXON_CR4_ALWAYSON; | ||
1984 | break; | ||
1985 | case MSR_IA32_VMX_CR4_FIXED1: | ||
1986 | *pdata = -1ULL; | ||
1987 | break; | ||
1988 | case MSR_IA32_VMX_VMCS_ENUM: | ||
1989 | *pdata = 0x1f; | ||
1990 | break; | ||
1991 | case MSR_IA32_VMX_PROCBASED_CTLS2: | ||
1992 | *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, | ||
1993 | nested_vmx_secondary_ctls_high); | ||
1994 | break; | ||
1995 | case MSR_IA32_VMX_EPT_VPID_CAP: | ||
1996 | /* Currently, no nested ept or nested vpid */ | ||
1997 | *pdata = 0; | ||
1998 | break; | ||
1999 | default: | ||
2000 | return 0; | ||
2001 | } | ||
2002 | |||
2003 | return 1; | ||
2004 | } | ||
2005 | |||
2006 | static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
2007 | { | ||
2008 | if (!nested_vmx_allowed(vcpu)) | ||
2009 | return 0; | ||
2010 | |||
2011 | if (msr_index == MSR_IA32_FEATURE_CONTROL) | ||
2012 | /* TODO: the right thing. */ | ||
2013 | return 1; | ||
2014 | /* | ||
2015 | * No need to treat VMX capability MSRs specially: If we don't handle | ||
2016 | * them, handle_wrmsr will #GP(0), which is correct (they are readonly) | ||
2017 | */ | ||
2018 | return 0; | ||
2019 | } | ||
2020 | |||
1264 | /* | 2021 | /* |
1265 | * Reads an msr value (of 'msr_index') into 'pdata'. | 2022 | * Reads an msr value (of 'msr_index') into 'pdata'. |
1266 | * Returns 0 on success, non-0 otherwise. | 2023 | * Returns 0 on success, non-0 otherwise. |
@@ -1309,6 +2066,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
1309 | /* Otherwise falls through */ | 2066 | /* Otherwise falls through */ |
1310 | default: | 2067 | default: |
1311 | vmx_load_host_state(to_vmx(vcpu)); | 2068 | vmx_load_host_state(to_vmx(vcpu)); |
2069 | if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) | ||
2070 | return 0; | ||
1312 | msr = find_msr_entry(to_vmx(vcpu), msr_index); | 2071 | msr = find_msr_entry(to_vmx(vcpu), msr_index); |
1313 | if (msr) { | 2072 | if (msr) { |
1314 | vmx_load_host_state(to_vmx(vcpu)); | 2073 | vmx_load_host_state(to_vmx(vcpu)); |
@@ -1380,6 +2139,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
1380 | return 1; | 2139 | return 1; |
1381 | /* Otherwise falls through */ | 2140 | /* Otherwise falls through */ |
1382 | default: | 2141 | default: |
2142 | if (vmx_set_vmx_msr(vcpu, msr_index, data)) | ||
2143 | break; | ||
1383 | msr = find_msr_entry(vmx, msr_index); | 2144 | msr = find_msr_entry(vmx, msr_index); |
1384 | if (msr) { | 2145 | if (msr) { |
1385 | vmx_load_host_state(vmx); | 2146 | vmx_load_host_state(vmx); |
@@ -1469,7 +2230,7 @@ static int hardware_enable(void *garbage) | |||
1469 | if (read_cr4() & X86_CR4_VMXE) | 2230 | if (read_cr4() & X86_CR4_VMXE) |
1470 | return -EBUSY; | 2231 | return -EBUSY; |
1471 | 2232 | ||
1472 | INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); | 2233 | INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); |
1473 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); | 2234 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); |
1474 | 2235 | ||
1475 | test_bits = FEATURE_CONTROL_LOCKED; | 2236 | test_bits = FEATURE_CONTROL_LOCKED; |
@@ -1493,14 +2254,14 @@ static int hardware_enable(void *garbage) | |||
1493 | return 0; | 2254 | return 0; |
1494 | } | 2255 | } |
1495 | 2256 | ||
1496 | static void vmclear_local_vcpus(void) | 2257 | static void vmclear_local_loaded_vmcss(void) |
1497 | { | 2258 | { |
1498 | int cpu = raw_smp_processor_id(); | 2259 | int cpu = raw_smp_processor_id(); |
1499 | struct vcpu_vmx *vmx, *n; | 2260 | struct loaded_vmcs *v, *n; |
1500 | 2261 | ||
1501 | list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), | 2262 | list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), |
1502 | local_vcpus_link) | 2263 | loaded_vmcss_on_cpu_link) |
1503 | __vcpu_clear(vmx); | 2264 | __loaded_vmcs_clear(v); |
1504 | } | 2265 | } |
1505 | 2266 | ||
1506 | 2267 | ||
@@ -1515,7 +2276,7 @@ static void kvm_cpu_vmxoff(void) | |||
1515 | static void hardware_disable(void *garbage) | 2276 | static void hardware_disable(void *garbage) |
1516 | { | 2277 | { |
1517 | if (vmm_exclusive) { | 2278 | if (vmm_exclusive) { |
1518 | vmclear_local_vcpus(); | 2279 | vmclear_local_loaded_vmcss(); |
1519 | kvm_cpu_vmxoff(); | 2280 | kvm_cpu_vmxoff(); |
1520 | } | 2281 | } |
1521 | write_cr4(read_cr4() & ~X86_CR4_VMXE); | 2282 | write_cr4(read_cr4() & ~X86_CR4_VMXE); |
@@ -1696,6 +2457,18 @@ static void free_vmcs(struct vmcs *vmcs) | |||
1696 | free_pages((unsigned long)vmcs, vmcs_config.order); | 2457 | free_pages((unsigned long)vmcs, vmcs_config.order); |
1697 | } | 2458 | } |
1698 | 2459 | ||
2460 | /* | ||
2461 | * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded | ||
2462 | */ | ||
2463 | static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) | ||
2464 | { | ||
2465 | if (!loaded_vmcs->vmcs) | ||
2466 | return; | ||
2467 | loaded_vmcs_clear(loaded_vmcs); | ||
2468 | free_vmcs(loaded_vmcs->vmcs); | ||
2469 | loaded_vmcs->vmcs = NULL; | ||
2470 | } | ||
2471 | |||
1699 | static void free_kvm_area(void) | 2472 | static void free_kvm_area(void) |
1700 | { | 2473 | { |
1701 | int cpu; | 2474 | int cpu; |
@@ -1756,6 +2529,9 @@ static __init int hardware_setup(void) | |||
1756 | if (!cpu_has_vmx_ple()) | 2529 | if (!cpu_has_vmx_ple()) |
1757 | ple_gap = 0; | 2530 | ple_gap = 0; |
1758 | 2531 | ||
2532 | if (nested) | ||
2533 | nested_vmx_setup_ctls_msrs(); | ||
2534 | |||
1759 | return alloc_kvm_area(); | 2535 | return alloc_kvm_area(); |
1760 | } | 2536 | } |
1761 | 2537 | ||
@@ -2041,7 +2817,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) | |||
2041 | (unsigned long *)&vcpu->arch.regs_dirty); | 2817 | (unsigned long *)&vcpu->arch.regs_dirty); |
2042 | } | 2818 | } |
2043 | 2819 | ||
2044 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); | 2820 | static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); |
2045 | 2821 | ||
2046 | static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | 2822 | static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, |
2047 | unsigned long cr0, | 2823 | unsigned long cr0, |
@@ -2139,11 +2915,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
2139 | vmcs_writel(GUEST_CR3, guest_cr3); | 2915 | vmcs_writel(GUEST_CR3, guest_cr3); |
2140 | } | 2916 | } |
2141 | 2917 | ||
2142 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 2918 | static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
2143 | { | 2919 | { |
2144 | unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? | 2920 | unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? |
2145 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); | 2921 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); |
2146 | 2922 | ||
2923 | if (cr4 & X86_CR4_VMXE) { | ||
2924 | /* | ||
2925 | * To use VMXON (and later other VMX instructions), a guest | ||
2926 | * must first be able to turn on cr4.VMXE (see handle_vmon()). | ||
2927 | * So basically the check on whether to allow nested VMX | ||
2928 | * is here. | ||
2929 | */ | ||
2930 | if (!nested_vmx_allowed(vcpu)) | ||
2931 | return 1; | ||
2932 | } else if (to_vmx(vcpu)->nested.vmxon) | ||
2933 | return 1; | ||
2934 | |||
2147 | vcpu->arch.cr4 = cr4; | 2935 | vcpu->arch.cr4 = cr4; |
2148 | if (enable_ept) { | 2936 | if (enable_ept) { |
2149 | if (!is_paging(vcpu)) { | 2937 | if (!is_paging(vcpu)) { |
@@ -2156,6 +2944,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
2156 | 2944 | ||
2157 | vmcs_writel(CR4_READ_SHADOW, cr4); | 2945 | vmcs_writel(CR4_READ_SHADOW, cr4); |
2158 | vmcs_writel(GUEST_CR4, hw_cr4); | 2946 | vmcs_writel(GUEST_CR4, hw_cr4); |
2947 | return 0; | ||
2159 | } | 2948 | } |
2160 | 2949 | ||
2161 | static void vmx_get_segment(struct kvm_vcpu *vcpu, | 2950 | static void vmx_get_segment(struct kvm_vcpu *vcpu, |
@@ -2721,18 +3510,110 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) | |||
2721 | } | 3510 | } |
2722 | 3511 | ||
2723 | /* | 3512 | /* |
3513 | * Set up the vmcs's constant host-state fields, i.e., host-state fields that | ||
3514 | * will not change in the lifetime of the guest. | ||
3515 | * Note that host-state that does change is set elsewhere. E.g., host-state | ||
3516 | * that is set differently for each CPU is set in vmx_vcpu_load(), not here. | ||
3517 | */ | ||
3518 | static void vmx_set_constant_host_state(void) | ||
3519 | { | ||
3520 | u32 low32, high32; | ||
3521 | unsigned long tmpl; | ||
3522 | struct desc_ptr dt; | ||
3523 | |||
3524 | vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ | ||
3525 | vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ | ||
3526 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ | ||
3527 | |||
3528 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ | ||
3529 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
3530 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
3531 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
3532 | vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ | ||
3533 | |||
3534 | native_store_idt(&dt); | ||
3535 | vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ | ||
3536 | |||
3537 | asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl)); | ||
3538 | vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */ | ||
3539 | |||
3540 | rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); | ||
3541 | vmcs_write32(HOST_IA32_SYSENTER_CS, low32); | ||
3542 | rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); | ||
3543 | vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ | ||
3544 | |||
3545 | if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { | ||
3546 | rdmsr(MSR_IA32_CR_PAT, low32, high32); | ||
3547 | vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); | ||
3548 | } | ||
3549 | } | ||
3550 | |||
3551 | static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) | ||
3552 | { | ||
3553 | vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; | ||
3554 | if (enable_ept) | ||
3555 | vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; | ||
3556 | if (is_guest_mode(&vmx->vcpu)) | ||
3557 | vmx->vcpu.arch.cr4_guest_owned_bits &= | ||
3558 | ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; | ||
3559 | vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); | ||
3560 | } | ||
3561 | |||
3562 | static u32 vmx_exec_control(struct vcpu_vmx *vmx) | ||
3563 | { | ||
3564 | u32 exec_control = vmcs_config.cpu_based_exec_ctrl; | ||
3565 | if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { | ||
3566 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
3567 | #ifdef CONFIG_X86_64 | ||
3568 | exec_control |= CPU_BASED_CR8_STORE_EXITING | | ||
3569 | CPU_BASED_CR8_LOAD_EXITING; | ||
3570 | #endif | ||
3571 | } | ||
3572 | if (!enable_ept) | ||
3573 | exec_control |= CPU_BASED_CR3_STORE_EXITING | | ||
3574 | CPU_BASED_CR3_LOAD_EXITING | | ||
3575 | CPU_BASED_INVLPG_EXITING; | ||
3576 | return exec_control; | ||
3577 | } | ||
3578 | |||
3579 | static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) | ||
3580 | { | ||
3581 | u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; | ||
3582 | if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) | ||
3583 | exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
3584 | if (vmx->vpid == 0) | ||
3585 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; | ||
3586 | if (!enable_ept) { | ||
3587 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | ||
3588 | enable_unrestricted_guest = 0; | ||
3589 | } | ||
3590 | if (!enable_unrestricted_guest) | ||
3591 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
3592 | if (!ple_gap) | ||
3593 | exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; | ||
3594 | return exec_control; | ||
3595 | } | ||
3596 | |||
3597 | static void ept_set_mmio_spte_mask(void) | ||
3598 | { | ||
3599 | /* | ||
3600 | * EPT Misconfigurations can be generated if the value of bits 2:0 | ||
3601 | * of an EPT paging-structure entry is 110b (write/execute). | ||
3602 | * Also, magic bits (0xffull << 49) are set to quickly identify mmio | ||
3603 | * sptes. | ||
3604 | */ | ||
3605 | kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); | ||
3606 | } | ||
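For reference, a two-line sketch (not in the patch) that just evaluates the constant passed to kvm_mmu_set_mmio_spte_mask():

#include <stdio.h>

int main(void)
{
	/* bits 2:0 = 110b (write+execute without read -> guaranteed EPT
	 * misconfig) plus the 0xff magic run at bits 56:49 */
	unsigned long long mask = (0xffULL << 49) | 0x6ULL;
	printf("%#llx\n", mask); /* 0x1fe000000000006 */
	return 0;
}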
3607 | |||
3608 | /* | ||
2724 | * Sets up the vmcs for emulated real mode. | 3609 | * Sets up the vmcs for emulated real mode. |
2725 | */ | 3610 | */ |
2726 | static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | 3611 | static int vmx_vcpu_setup(struct vcpu_vmx *vmx) |
2727 | { | 3612 | { |
2728 | u32 host_sysenter_cs, msr_low, msr_high; | 3613 | #ifdef CONFIG_X86_64 |
2729 | u32 junk; | ||
2730 | u64 host_pat; | ||
2731 | unsigned long a; | 3614 | unsigned long a; |
2732 | struct desc_ptr dt; | 3615 | #endif |
2733 | int i; | 3616 | int i; |
2734 | unsigned long kvm_vmx_return; | ||
2735 | u32 exec_control; | ||
2736 | 3617 | ||
2737 | /* I/O */ | 3618 | /* I/O */ |
2738 | vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); | 3619 | vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); |
@@ -2747,36 +3628,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2747 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, | 3628 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, |
2748 | vmcs_config.pin_based_exec_ctrl); | 3629 | vmcs_config.pin_based_exec_ctrl); |
2749 | 3630 | ||
2750 | exec_control = vmcs_config.cpu_based_exec_ctrl; | 3631 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); |
2751 | if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { | ||
2752 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
2753 | #ifdef CONFIG_X86_64 | ||
2754 | exec_control |= CPU_BASED_CR8_STORE_EXITING | | ||
2755 | CPU_BASED_CR8_LOAD_EXITING; | ||
2756 | #endif | ||
2757 | } | ||
2758 | if (!enable_ept) | ||
2759 | exec_control |= CPU_BASED_CR3_STORE_EXITING | | ||
2760 | CPU_BASED_CR3_LOAD_EXITING | | ||
2761 | CPU_BASED_INVLPG_EXITING; | ||
2762 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | ||
2763 | 3632 | ||
2764 | if (cpu_has_secondary_exec_ctrls()) { | 3633 | if (cpu_has_secondary_exec_ctrls()) { |
2765 | exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; | 3634 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, |
2766 | if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) | 3635 | vmx_secondary_exec_control(vmx)); |
2767 | exec_control &= | ||
2768 | ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
2769 | if (vmx->vpid == 0) | ||
2770 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; | ||
2771 | if (!enable_ept) { | ||
2772 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | ||
2773 | enable_unrestricted_guest = 0; | ||
2774 | } | ||
2775 | if (!enable_unrestricted_guest) | ||
2776 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
2777 | if (!ple_gap) | ||
2778 | exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; | ||
2779 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | ||
2780 | } | 3636 | } |
2781 | 3637 | ||
2782 | if (ple_gap) { | 3638 | if (ple_gap) { |
@@ -2784,20 +3640,13 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2784 | vmcs_write32(PLE_WINDOW, ple_window); | 3640 | vmcs_write32(PLE_WINDOW, ple_window); |
2785 | } | 3641 | } |
2786 | 3642 | ||
2787 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); | 3643 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); |
2788 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); | 3644 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); |
2789 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ | 3645 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ |
2790 | 3646 | ||
2791 | vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ | ||
2792 | vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ | ||
2793 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ | ||
2794 | |||
2795 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ | ||
2796 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
2797 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
2798 | vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ | 3647 | vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ |
2799 | vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ | 3648 | vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ |
2800 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | 3649 | vmx_set_constant_host_state(); |
2801 | #ifdef CONFIG_X86_64 | 3650 | #ifdef CONFIG_X86_64 |
2802 | rdmsrl(MSR_FS_BASE, a); | 3651 | rdmsrl(MSR_FS_BASE, a); |
2803 | vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ | 3652 | vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ |
@@ -2808,32 +3657,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2808 | vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ | 3657 | vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ |
2809 | #endif | 3658 | #endif |
2810 | 3659 | ||
2811 | vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ | ||
2812 | |||
2813 | native_store_idt(&dt); | ||
2814 | vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ | ||
2815 | |||
2816 | asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); | ||
2817 | vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ | ||
2818 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); | 3660 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); |
2819 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | 3661 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); |
2820 | vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); | 3662 | vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); |
2821 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); | 3663 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); |
2822 | vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); | 3664 | vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); |
2823 | 3665 | ||
2824 | rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); | ||
2825 | vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); | ||
2826 | rdmsrl(MSR_IA32_SYSENTER_ESP, a); | ||
2827 | vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ | ||
2828 | rdmsrl(MSR_IA32_SYSENTER_EIP, a); | ||
2829 | vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ | ||
2830 | |||
2831 | if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { | ||
2832 | rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); | ||
2833 | host_pat = msr_low | ((u64) msr_high << 32); | ||
2834 | vmcs_write64(HOST_IA32_PAT, host_pat); | ||
2835 | } | ||
2836 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | 3666 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { |
3667 | u32 msr_low, msr_high; | ||
3668 | u64 host_pat; | ||
2837 | rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); | 3669 | rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); |
2838 | host_pat = msr_low | ((u64) msr_high << 32); | 3670 | host_pat = msr_low | ((u64) msr_high << 32); |
2839 | /* Write the default value following the host PAT */ | 3671 | /* Write the default value following the host PAT */ |
@@ -2863,10 +3695,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2863 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); | 3695 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); |
2864 | 3696 | ||
2865 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); | 3697 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); |
2866 | vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; | 3698 | set_cr4_guest_host_mask(vmx); |
2867 | if (enable_ept) | ||
2868 | vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; | ||
2869 | vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); | ||
2870 | 3699 | ||
2871 | kvm_write_tsc(&vmx->vcpu, 0); | 3700 | kvm_write_tsc(&vmx->vcpu, 0); |
2872 | 3701 | ||
@@ -2990,9 +3819,25 @@ out: | |||
2990 | return ret; | 3819 | return ret; |
2991 | } | 3820 | } |
2992 | 3821 | ||
3822 | /* | ||
3823 | * In nested virtualization, check if L1 asked to exit on external interrupts. | ||
3824 | * For most existing hypervisors, this will always return true. | ||
3825 | */ | ||
3826 | static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) | ||
3827 | { | ||
3828 | return get_vmcs12(vcpu)->pin_based_vm_exec_control & | ||
3829 | PIN_BASED_EXT_INTR_MASK; | ||
3830 | } | ||
3831 | |||
2993 | static void enable_irq_window(struct kvm_vcpu *vcpu) | 3832 | static void enable_irq_window(struct kvm_vcpu *vcpu) |
2994 | { | 3833 | { |
2995 | u32 cpu_based_vm_exec_control; | 3834 | u32 cpu_based_vm_exec_control; |
3835 | if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) | ||
3836 | /* We can get here when nested_run_pending caused | ||
3837 | * vmx_interrupt_allowed() to return false. In this case, do | ||
3838 | * nothing - the interrupt will be injected later. | ||
3839 | */ | ||
3840 | return; | ||
2996 | 3841 | ||
2997 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 3842 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
2998 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; | 3843 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; |
@@ -3049,6 +3894,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
3049 | { | 3894 | { |
3050 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3895 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3051 | 3896 | ||
3897 | if (is_guest_mode(vcpu)) | ||
3898 | return; | ||
3899 | |||
3052 | if (!cpu_has_virtual_nmis()) { | 3900 | if (!cpu_has_virtual_nmis()) { |
3053 | /* | 3901 | /* |
3054 | * Tracking the NMI-blocked state in software is built upon | 3902 | * Tracking the NMI-blocked state in software is built upon |
@@ -3115,6 +3963,17 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) | |||
3115 | 3963 | ||
3116 | static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) | 3964 | static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) |
3117 | { | 3965 | { |
3966 | if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { | ||
3967 | struct vmcs12 *vmcs12; | ||
3968 | if (to_vmx(vcpu)->nested.nested_run_pending) | ||
3969 | return 0; | ||
3970 | nested_vmx_vmexit(vcpu); | ||
3971 | vmcs12 = get_vmcs12(vcpu); | ||
3972 | vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; | ||
3973 | vmcs12->vm_exit_intr_info = 0; | ||
3974 | /* fall through to normal code, but now in L1, not L2 */ | ||
3975 | } | ||
3976 | |||
3118 | return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | 3977 | return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && |
3119 | !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | 3978 | !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & |
3120 | (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); | 3979 | (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); |
@@ -3356,6 +4215,58 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
3356 | hypercall[2] = 0xc1; | 4215 | hypercall[2] = 0xc1; |
3357 | } | 4216 | } |
3358 | 4217 | ||
4218 | /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ | ||
4219 | static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) | ||
4220 | { | ||
4221 | if (to_vmx(vcpu)->nested.vmxon && | ||
4222 | ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) | ||
4223 | return 1; | ||
4224 | |||
4225 | if (is_guest_mode(vcpu)) { | ||
4226 | /* | ||
4227 | * We get here when L2 changed cr0 in a way that did not change | ||
4228 | * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), | ||
4229 | * but did change L0 shadowed bits. This can currently happen | ||
4230 | * with the TS bit: L0 may want to leave TS on (for lazy fpu | ||
4231 | * loading) while pretending to allow the guest to change it. | ||
4232 | */ | ||
4233 | if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) | | ||
4234 | (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits))) | ||
4235 | return 1; | ||
4236 | vmcs_writel(CR0_READ_SHADOW, val); | ||
4237 | return 0; | ||
4238 | } else | ||
4239 | return kvm_set_cr0(vcpu, val); | ||
4240 | } | ||
4241 | |||
4242 | static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) | ||
4243 | { | ||
4244 | if (is_guest_mode(vcpu)) { | ||
4245 | if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) | | ||
4246 | (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits))) | ||
4247 | return 1; | ||
4248 | vmcs_writel(CR4_READ_SHADOW, val); | ||
4249 | return 0; | ||
4250 | } else | ||
4251 | return kvm_set_cr4(vcpu, val); | ||
4252 | } | ||
4253 | |||
4254 | /* called to set cr0 as appropriate for a clts instruction exit. */ | ||
4255 | static void handle_clts(struct kvm_vcpu *vcpu) | ||
4256 | { | ||
4257 | if (is_guest_mode(vcpu)) { | ||
4258 | /* | ||
4259 | * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS | ||
4260 | * but we did (!fpu_active). We need to keep GUEST_CR0.TS on, | ||
4261 | * and just pretend it's off (also in arch.cr0 for fpu_activate). | ||
4262 | */ | ||
4263 | vmcs_writel(CR0_READ_SHADOW, | ||
4264 | vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); | ||
4265 | vcpu->arch.cr0 &= ~X86_CR0_TS; | ||
4266 | } else | ||
4267 | vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); | ||
4268 | } | ||
4269 | |||
3359 | static int handle_cr(struct kvm_vcpu *vcpu) | 4270 | static int handle_cr(struct kvm_vcpu *vcpu) |
3360 | { | 4271 | { |
3361 | unsigned long exit_qualification, val; | 4272 | unsigned long exit_qualification, val; |
@@ -3372,7 +4283,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3372 | trace_kvm_cr_write(cr, val); | 4283 | trace_kvm_cr_write(cr, val); |
3373 | switch (cr) { | 4284 | switch (cr) { |
3374 | case 0: | 4285 | case 0: |
3375 | err = kvm_set_cr0(vcpu, val); | 4286 | err = handle_set_cr0(vcpu, val); |
3376 | kvm_complete_insn_gp(vcpu, err); | 4287 | kvm_complete_insn_gp(vcpu, err); |
3377 | return 1; | 4288 | return 1; |
3378 | case 3: | 4289 | case 3: |
@@ -3380,7 +4291,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3380 | kvm_complete_insn_gp(vcpu, err); | 4291 | kvm_complete_insn_gp(vcpu, err); |
3381 | return 1; | 4292 | return 1; |
3382 | case 4: | 4293 | case 4: |
3383 | err = kvm_set_cr4(vcpu, val); | 4294 | err = handle_set_cr4(vcpu, val); |
3384 | kvm_complete_insn_gp(vcpu, err); | 4295 | kvm_complete_insn_gp(vcpu, err); |
3385 | return 1; | 4296 | return 1; |
3386 | case 8: { | 4297 | case 8: { |
@@ -3398,7 +4309,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3398 | }; | 4309 | }; |
3399 | break; | 4310 | break; |
3400 | case 2: /* clts */ | 4311 | case 2: /* clts */ |
3401 | vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); | 4312 | handle_clts(vcpu); |
3402 | trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); | 4313 | trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); |
3403 | skip_emulated_instruction(vcpu); | 4314 | skip_emulated_instruction(vcpu); |
3404 | vmx_fpu_activate(vcpu); | 4315 | vmx_fpu_activate(vcpu); |
@@ -3574,12 +4485,6 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) | |||
3574 | return 1; | 4485 | return 1; |
3575 | } | 4486 | } |
3576 | 4487 | ||
3577 | static int handle_vmx_insn(struct kvm_vcpu *vcpu) | ||
3578 | { | ||
3579 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
3580 | return 1; | ||
3581 | } | ||
3582 | |||
3583 | static int handle_invd(struct kvm_vcpu *vcpu) | 4488 | static int handle_invd(struct kvm_vcpu *vcpu) |
3584 | { | 4489 | { |
3585 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; | 4490 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; |
@@ -3777,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, | |||
3777 | static int handle_ept_misconfig(struct kvm_vcpu *vcpu) | 4682 | static int handle_ept_misconfig(struct kvm_vcpu *vcpu) |
3778 | { | 4683 | { |
3779 | u64 sptes[4]; | 4684 | u64 sptes[4]; |
3780 | int nr_sptes, i; | 4685 | int nr_sptes, i, ret; |
3781 | gpa_t gpa; | 4686 | gpa_t gpa; |
3782 | 4687 | ||
3783 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | 4688 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); |
3784 | 4689 | ||
4690 | ret = handle_mmio_page_fault_common(vcpu, gpa, true); | ||
4691 | if (likely(ret == 1)) | ||
4692 | return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == | ||
4693 | EMULATE_DONE; | ||
4694 | if (unlikely(!ret)) | ||
4695 | return 1; | ||
4696 | |||
4697 | /* It is a genuine EPT misconfiguration */ | ||
3785 | printk(KERN_ERR "EPT: Misconfiguration.\n"); | 4698 | printk(KERN_ERR "EPT: Misconfiguration.\n"); |
3786 | printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); | 4699 | printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); |
3787 | 4700 | ||
@@ -3866,6 +4779,639 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu) | |||
3866 | } | 4779 | } |
3867 | 4780 | ||
3868 | /* | 4781 | /* |
4782 | * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. | ||
4783 | * We could reuse a single VMCS for all the L2 guests, but we also want the | ||
4784 | * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this | ||
4785 | * allows keeping them loaded on the processor, and in the future will allow | ||
4786 | * optimizations where prepare_vmcs02 doesn't need to set all the fields on | ||
4787 | * every entry if they never change. | ||
4788 | * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE | ||
4789 | * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first. | ||
4790 | * | ||
4791 | * The following functions allocate and free a vmcs02 in this pool. | ||
4792 | */ | ||
4793 | |||
4794 | /* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ | ||
4795 | static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) | ||
4796 | { | ||
4797 | struct vmcs02_list *item; | ||
4798 | list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) | ||
4799 | if (item->vmptr == vmx->nested.current_vmptr) { | ||
4800 | list_move(&item->list, &vmx->nested.vmcs02_pool); | ||
4801 | return &item->vmcs02; | ||
4802 | } | ||
4803 | |||
4804 | if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { | ||
4805 | /* Recycle the least recently used VMCS. */ | ||
4806 | item = list_entry(vmx->nested.vmcs02_pool.prev, | ||
4807 | struct vmcs02_list, list); | ||
4808 | item->vmptr = vmx->nested.current_vmptr; | ||
4809 | list_move(&item->list, &vmx->nested.vmcs02_pool); | ||
4810 | return &item->vmcs02; | ||
4811 | } | ||
4812 | |||
4813 | /* Create a new VMCS */ | ||
4814 | item = (struct vmcs02_list *) | ||
4815 | kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); | ||
4816 | if (!item) | ||
4817 | return NULL; | ||
4818 | item->vmcs02.vmcs = alloc_vmcs(); | ||
4819 | if (!item->vmcs02.vmcs) { | ||
4820 | kfree(item); | ||
4821 | return NULL; | ||
4822 | } | ||
4823 | loaded_vmcs_init(&item->vmcs02); | ||
4824 | item->vmptr = vmx->nested.current_vmptr; | ||
4825 | list_add(&(item->list), &(vmx->nested.vmcs02_pool)); | ||
4826 | vmx->nested.vmcs02_num++; | ||
4827 | return &item->vmcs02; | ||
4828 | } | ||
4829 | |||
4830 | /* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ | ||
4831 | static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) | ||
4832 | { | ||
4833 | struct vmcs02_list *item; | ||
4834 | list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) | ||
4835 | if (item->vmptr == vmptr) { | ||
4836 | free_loaded_vmcs(&item->vmcs02); | ||
4837 | list_del(&item->list); | ||
4838 | kfree(item); | ||
4839 | vmx->nested.vmcs02_num--; | ||
4840 | return; | ||
4841 | } | ||
4842 | } | ||
4843 | |||
4844 | /* | ||
4845 | * Free all VMCSs saved for this vcpu, except the one pointed to by | ||
4846 | * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one | ||
4847 | * currently used, if running L2), and vmcs01 when running L2. | ||
4848 | */ | ||
4849 | static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) | ||
4850 | { | ||
4851 | struct vmcs02_list *item, *n; | ||
4852 | list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { | ||
4853 | if (vmx->loaded_vmcs != &item->vmcs02) | ||
4854 | free_loaded_vmcs(&item->vmcs02); | ||
4855 | list_del(&item->list); | ||
4856 | kfree(item); | ||
4857 | } | ||
4858 | vmx->nested.vmcs02_num = 0; | ||
4859 | |||
4860 | if (vmx->loaded_vmcs != &vmx->vmcs01) | ||
4861 | free_loaded_vmcs(&vmx->vmcs01); | ||
4862 | } | ||
4863 | |||
4864 | /* | ||
4865 | * Emulate the VMXON instruction. | ||
4866 | * Currently, we just remember that VMX is active, and do not save or even | ||
4867 | * inspect the argument to VMXON (the so-called "VMXON pointer") because we | ||
4868 | * do not currently need to store anything in that guest-allocated memory | ||
4869 | * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their | ||
4870 | * argument is different from the VMXON pointer (which the spec says they do). | ||
4871 | */ | ||
4872 | static int handle_vmon(struct kvm_vcpu *vcpu) | ||
4873 | { | ||
4874 | struct kvm_segment cs; | ||
4875 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
4876 | |||
4877 | /* The Intel VMX Instruction Reference lists a bunch of bits that | ||
4878 | * are prerequisite to running VMXON, most notably cr4.VMXE must be | ||
4879 | * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). | ||
4880 | * Otherwise, we should fail with #UD. We test these now: | ||
4881 | */ | ||
4882 | if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || | ||
4883 | !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || | ||
4884 | (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { | ||
4885 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
4886 | return 1; | ||
4887 | } | ||
4888 | |||
4889 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
4890 | if (is_long_mode(vcpu) && !cs.l) { | ||
4891 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
4892 | return 1; | ||
4893 | } | ||
4894 | |||
4895 | if (vmx_get_cpl(vcpu)) { | ||
4896 | kvm_inject_gp(vcpu, 0); | ||
4897 | return 1; | ||
4898 | } | ||
4899 | |||
4900 | INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); | ||
4901 | vmx->nested.vmcs02_num = 0; | ||
4902 | |||
4903 | vmx->nested.vmxon = true; | ||
4904 | |||
4905 | skip_emulated_instruction(vcpu); | ||
4906 | return 1; | ||
4907 | } | ||
4908 | |||
4909 | /* | ||
4910 | * Intel's VMX Instruction Reference specifies a common set of prerequisites | ||
4911 | * for running VMX instructions (except VMXON, whose prerequisites are | ||
4912 | * slightly different). It also specifies what exception to inject otherwise. | ||
4913 | */ | ||
4914 | static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) | ||
4915 | { | ||
4916 | struct kvm_segment cs; | ||
4917 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
4918 | |||
4919 | if (!vmx->nested.vmxon) { | ||
4920 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
4921 | return 0; | ||
4922 | } | ||
4923 | |||
4924 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
4925 | if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || | ||
4926 | (is_long_mode(vcpu) && !cs.l)) { | ||
4927 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
4928 | return 0; | ||
4929 | } | ||
4930 | |||
4931 | if (vmx_get_cpl(vcpu)) { | ||
4932 | kvm_inject_gp(vcpu, 0); | ||
4933 | return 0; | ||
4934 | } | ||
4935 | |||
4936 | return 1; | ||
4937 | } | ||
4938 | |||
4939 | /* | ||
4940 | * Free whatever needs to be freed from vmx->nested when L1 goes down, or | ||
4941 | * just stops using VMX. | ||
4942 | */ | ||
4943 | static void free_nested(struct vcpu_vmx *vmx) | ||
4944 | { | ||
4945 | if (!vmx->nested.vmxon) | ||
4946 | return; | ||
4947 | vmx->nested.vmxon = false; | ||
4948 | if (vmx->nested.current_vmptr != -1ull) { | ||
4949 | kunmap(vmx->nested.current_vmcs12_page); | ||
4950 | nested_release_page(vmx->nested.current_vmcs12_page); | ||
4951 | vmx->nested.current_vmptr = -1ull; | ||
4952 | vmx->nested.current_vmcs12 = NULL; | ||
4953 | } | ||
4954 | /* Unpin physical memory we referred to in current vmcs02 */ | ||
4955 | if (vmx->nested.apic_access_page) { | ||
4956 | nested_release_page(vmx->nested.apic_access_page); | ||
4957 | vmx->nested.apic_access_page = NULL; | ||
4958 | } | ||
4959 | |||
4960 | nested_free_all_saved_vmcss(vmx); | ||
4961 | } | ||
4962 | |||
4963 | /* Emulate the VMXOFF instruction */ | ||
4964 | static int handle_vmoff(struct kvm_vcpu *vcpu) | ||
4965 | { | ||
4966 | if (!nested_vmx_check_permission(vcpu)) | ||
4967 | return 1; | ||
4968 | free_nested(to_vmx(vcpu)); | ||
4969 | skip_emulated_instruction(vcpu); | ||
4970 | return 1; | ||
4971 | } | ||
4972 | |||
4973 | /* | ||
4974 | * Decode the memory-address operand of a vmx instruction, as recorded on an | ||
4975 | * exit caused by such an instruction (run by a guest hypervisor). | ||
4976 | * On success, returns 0. When the operand is invalid, returns 1 and throws | ||
4977 | * #UD or #GP. | ||
4978 | */ | ||
4979 | static int get_vmx_mem_address(struct kvm_vcpu *vcpu, | ||
4980 | unsigned long exit_qualification, | ||
4981 | u32 vmx_instruction_info, gva_t *ret) | ||
4982 | { | ||
4983 | /* | ||
4984 | * According to Vol. 3B, "Information for VM Exits Due to Instruction | ||
4985 | * Execution", on an exit, vmx_instruction_info holds most of the | ||
4986 | * addressing components of the operand. Only the displacement part | ||
4987 | * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). | ||
4988 | * For how an actual address is calculated from all these components, | ||
4989 | * refer to Vol. 1, "Operand Addressing". | ||
4990 | */ | ||
4991 | int scaling = vmx_instruction_info & 3; | ||
4992 | int addr_size = (vmx_instruction_info >> 7) & 7; | ||
4993 | bool is_reg = vmx_instruction_info & (1u << 10); | ||
4994 | int seg_reg = (vmx_instruction_info >> 15) & 7; | ||
4995 | int index_reg = (vmx_instruction_info >> 18) & 0xf; | ||
4996 | bool index_is_valid = !(vmx_instruction_info & (1u << 22)); | ||
4997 | int base_reg = (vmx_instruction_info >> 23) & 0xf; | ||
4998 | bool base_is_valid = !(vmx_instruction_info & (1u << 27)); | ||
4999 | |||
5000 | if (is_reg) { | ||
5001 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
5002 | return 1; | ||
5003 | } | ||
5004 | |||
5005 | /* Addr = segment_base + offset */ | ||
5006 | /* offset = base + [index * scale] + displacement */ | ||
5007 | *ret = vmx_get_segment_base(vcpu, seg_reg); | ||
5008 | if (base_is_valid) | ||
5009 | *ret += kvm_register_read(vcpu, base_reg); | ||
5010 | if (index_is_valid) | ||
5011 | *ret += kvm_register_read(vcpu, index_reg) << scaling; | ||
5012 | *ret += exit_qualification; /* holds the displacement */ | ||
5013 | |||
5014 | if (addr_size == 1) /* 32 bit */ | ||
5015 | *ret &= 0xffffffff; | ||
5016 | |||
5017 | /* | ||
5018 | * TODO: throw #GP (and return 1) in various cases that the VM* | ||
5019 | * instructions require it - e.g., offset beyond segment limit, | ||
5020 | * unusable or unreadable/unwritable segment, non-canonical 64-bit | ||
5021 | * address, and so on. Currently these are not checked. | ||
5022 | */ | ||
5023 | return 0; | ||
5024 | } | ||
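To see the field layout in action, here is a decoding sketch with a made-up vmx_instruction_info value (the shifts and masks mirror the function above; the operand this value encodes would be DS:[rax + rcx*4 + disp] with a 64-bit address size):

#include <stdio.h>

int main(void)
{
	unsigned int info = 0x58102; /* hypothetical vmx_instruction_info */

	int scaling   = info & 3;            /* 2 -> index scaled by 4 */
	int addr_size = (info >> 7) & 7;     /* 2 -> 64-bit address size */
	int is_reg    = (info >> 10) & 1;    /* 0 -> memory operand */
	int seg_reg   = (info >> 15) & 7;    /* 3 -> DS */
	int index_reg = (info >> 18) & 0xf;  /* 1 -> RCX */
	int index_ok  = !((info >> 22) & 1); /* 1 -> index field is valid */
	int base_reg  = (info >> 23) & 0xf;  /* 0 -> RAX */
	int base_ok   = !((info >> 27) & 1); /* 1 -> base field is valid */

	printf("%d %d %d %d %d %d %d %d\n", scaling, addr_size, is_reg,
	       seg_reg, index_reg, index_ok, base_reg, base_ok);
	return 0;
}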
5025 | |||
5026 | /* | ||
5027 | * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), | ||
5028 | * set the success or error code of an emulated VMX instruction, as specified | ||
5029 | * by Vol 2B, VMX Instruction Reference, "Conventions". | ||
5030 | */ | ||
5031 | static void nested_vmx_succeed(struct kvm_vcpu *vcpu) | ||
5032 | { | ||
5033 | vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) | ||
5034 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
5035 | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); | ||
5036 | } | ||
5037 | |||
5038 | static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) | ||
5039 | { | ||
5040 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
5041 | & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | | ||
5042 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
5043 | | X86_EFLAGS_CF); | ||
5044 | } | ||
5045 | |||
5046 | static void nested_vmx_failValid(struct kvm_vcpu *vcpu, | ||
5047 | u32 vm_instruction_error) | ||
5048 | { | ||
5049 | if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { | ||
5050 | /* | ||
5051 | * failValid writes the error number to the current VMCS, which | ||
5052 | * can't be done when there isn't a current VMCS. | ||
5053 | */ | ||
5054 | nested_vmx_failInvalid(vcpu); | ||
5055 | return; | ||
5056 | } | ||
5057 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
5058 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
5059 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
5060 | | X86_EFLAGS_ZF); | ||
5061 | get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; | ||
5062 | } | ||
5063 | |||
5064 | /* Emulate the VMCLEAR instruction */ | ||
5065 | static int handle_vmclear(struct kvm_vcpu *vcpu) | ||
5066 | { | ||
5067 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
5068 | gva_t gva; | ||
5069 | gpa_t vmptr; | ||
5070 | struct vmcs12 *vmcs12; | ||
5071 | struct page *page; | ||
5072 | struct x86_exception e; | ||
5073 | |||
5074 | if (!nested_vmx_check_permission(vcpu)) | ||
5075 | return 1; | ||
5076 | |||
5077 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
5078 | vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) | ||
5079 | return 1; | ||
5080 | |||
5081 | if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, | ||
5082 | sizeof(vmptr), &e)) { | ||
5083 | kvm_inject_page_fault(vcpu, &e); | ||
5084 | return 1; | ||
5085 | } | ||
5086 | |||
5087 | if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { | ||
5088 | nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); | ||
5089 | skip_emulated_instruction(vcpu); | ||
5090 | return 1; | ||
5091 | } | ||
5092 | |||
5093 | if (vmptr == vmx->nested.current_vmptr) { | ||
5094 | kunmap(vmx->nested.current_vmcs12_page); | ||
5095 | nested_release_page(vmx->nested.current_vmcs12_page); | ||
5096 | vmx->nested.current_vmptr = -1ull; | ||
5097 | vmx->nested.current_vmcs12 = NULL; | ||
5098 | } | ||
5099 | |||
5100 | page = nested_get_page(vcpu, vmptr); | ||
5101 | if (page == NULL) { | ||
5102 | /* | ||
5103 | * For accurate processor emulation, VMCLEAR beyond available | ||
5104 | * physical memory should do nothing at all. However, it is | ||
5105 | * possible that a nested vmx bug, not a guest hypervisor bug, | ||
5106 | * resulted in this case, so let's shut down before doing any | ||
5107 | * more damage: | ||
5108 | */ | ||
5109 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); | ||
5110 | return 1; | ||
5111 | } | ||
5112 | vmcs12 = kmap(page); | ||
5113 | vmcs12->launch_state = 0; | ||
5114 | kunmap(page); | ||
5115 | nested_release_page(page); | ||
5116 | |||
5117 | nested_free_vmcs02(vmx, vmptr); | ||
5118 | |||
5119 | skip_emulated_instruction(vcpu); | ||
5120 | nested_vmx_succeed(vcpu); | ||
5121 | return 1; | ||
5122 | } | ||
5123 | |||
5124 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); | ||
5125 | |||
5126 | /* Emulate the VMLAUNCH instruction */ | ||
5127 | static int handle_vmlaunch(struct kvm_vcpu *vcpu) | ||
5128 | { | ||
5129 | return nested_vmx_run(vcpu, true); | ||
5130 | } | ||
5131 | |||
5132 | /* Emulate the VMRESUME instruction */ | ||
5133 | static int handle_vmresume(struct kvm_vcpu *vcpu) | ||
5134 | { | ||
5135 | |||
5136 | return nested_vmx_run(vcpu, false); | ||
5137 | } | ||
5138 | |||
5139 | enum vmcs_field_type { | ||
5140 | VMCS_FIELD_TYPE_U16 = 0, | ||
5141 | VMCS_FIELD_TYPE_U64 = 1, | ||
5142 | VMCS_FIELD_TYPE_U32 = 2, | ||
5143 | VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 | ||
5144 | }; | ||
5145 | |||
5146 | static inline int vmcs_field_type(unsigned long field) | ||
5147 | { | ||
5148 | if (0x1 & field) /* the *_HIGH fields are all 32 bit */ | ||
5149 | return VMCS_FIELD_TYPE_U32; | ||
5150 | return (field >> 13) & 0x3; | ||
5151 | } | ||
5152 | |||
5153 | static inline int vmcs_field_readonly(unsigned long field) | ||
5154 | { | ||
5155 | return (((field >> 10) & 0x3) == 1); | ||
5156 | } | ||
5157 | |||
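A few worked examples make the encoding concrete. The field numbers below are the standard VMCS encodings from the SDM: bits 14:13 select the width, bit 0 marks the 32-bit high half of a 64-bit field, and bits 11:10 equal to 1 mark a read-only (exit-information) field, which is what vmcs_field_readonly() tests:

	/*
	 *   GUEST_ES_SELECTOR    = 0x0800: (0x0800 >> 13) & 3 == 0 -> U16
	 *   IO_BITMAP_A          = 0x2000: (0x2000 >> 13) & 3 == 1 -> U64
	 *   IO_BITMAP_A_HIGH     = 0x2001: odd encoding            -> U32 (high half)
	 *   GUEST_ES_LIMIT       = 0x4800: (0x4800 >> 13) & 3 == 2 -> U32
	 *   CR0_GUEST_HOST_MASK  = 0x6000: (0x6000 >> 13) & 3 == 3 -> natural width
	 *   VM_INSTRUCTION_ERROR = 0x4400: (0x4400 >> 10) & 3 == 1 -> read-only
	 */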
5158 | /* | ||
5159 | * Read a vmcs12 field. Since these can have varying lengths and we return | ||
5160 | * one type, we choose the biggest type (u64) and zero-extend the return value | ||
5161 | * to that size. Note that the caller, handle_vmread, might need to use only | ||
5162 | * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of | ||
5163 | * 64-bit fields are to be returned). | ||
5164 | */ | ||
5165 | static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu, | ||
5166 | unsigned long field, u64 *ret) | ||
5167 | { | ||
5168 | short offset = vmcs_field_to_offset(field); | ||
5169 | char *p; | ||
5170 | |||
5171 | if (offset < 0) | ||
5172 | return 0; | ||
5173 | |||
5174 | p = ((char *)(get_vmcs12(vcpu))) + offset; | ||
5175 | |||
5176 | switch (vmcs_field_type(field)) { | ||
5177 | case VMCS_FIELD_TYPE_NATURAL_WIDTH: | ||
5178 | *ret = *((natural_width *)p); | ||
5179 | return 1; | ||
5180 | case VMCS_FIELD_TYPE_U16: | ||
5181 | *ret = *((u16 *)p); | ||
5182 | return 1; | ||
5183 | case VMCS_FIELD_TYPE_U32: | ||
5184 | *ret = *((u32 *)p); | ||
5185 | return 1; | ||
5186 | case VMCS_FIELD_TYPE_U64: | ||
5187 | *ret = *((u64 *)p); | ||
5188 | return 1; | ||
5189 | default: | ||
5190 | return 0; /* can never happen. */ | ||
5191 | } | ||
5192 | } | ||
5193 | |||
5194 | /* | ||
5195 | * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was | ||
5196 | * used before) all generate the same failure when it is missing. | ||
5197 | */ | ||
5198 | static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) | ||
5199 | { | ||
5200 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
5201 | if (vmx->nested.current_vmptr == -1ull) { | ||
5202 | nested_vmx_failInvalid(vcpu); | ||
5203 | skip_emulated_instruction(vcpu); | ||
5204 | return 0; | ||
5205 | } | ||
5206 | return 1; | ||
5207 | } | ||
5208 | |||
5209 | static int handle_vmread(struct kvm_vcpu *vcpu) | ||
5210 | { | ||
5211 | unsigned long field; | ||
5212 | u64 field_value; | ||
5213 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
5214 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
5215 | gva_t gva = 0; | ||
5216 | |||
5217 | if (!nested_vmx_check_permission(vcpu) || | ||
5218 | !nested_vmx_check_vmcs12(vcpu)) | ||
5219 | return 1; | ||
5220 | |||
5221 | /* Decode instruction info and find the field to read */ | ||
5222 | field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | ||
5223 | /* Read the field, zero-extended to a u64 field_value */ | ||
5224 | if (!vmcs12_read_any(vcpu, field, &field_value)) { | ||
5225 | nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); | ||
5226 | skip_emulated_instruction(vcpu); | ||
5227 | return 1; | ||
5228 | } | ||
5229 | /* | ||
5230 | * Now copy part of this value to register or memory, as requested. | ||
5231 | * Note that the number of bits actually copied is 32 or 64 depending | ||
5232 | * on the guest's mode (32 or 64 bit), not on the given field's length. | ||
5233 | */ | ||
5234 | if (vmx_instruction_info & (1u << 10)) { | ||
5235 | kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf), | ||
5236 | field_value); | ||
5237 | } else { | ||
5238 | if (get_vmx_mem_address(vcpu, exit_qualification, | ||
5239 | vmx_instruction_info, &gva)) | ||
5240 | return 1; | ||
5241 | /* _system ok, as nested_vmx_check_permission verified cpl=0 */ | ||
5242 | kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, | ||
5243 | &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL); | ||
5244 | } | ||
5245 | |||
5246 | nested_vmx_succeed(vcpu); | ||
5247 | skip_emulated_instruction(vcpu); | ||
5248 | return 1; | ||
5249 | } | ||
5250 | |||
5251 | |||
5252 | static int handle_vmwrite(struct kvm_vcpu *vcpu) | ||
5253 | { | ||
5254 | unsigned long field; | ||
5255 | gva_t gva; | ||
5256 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
5257 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
5258 | char *p; | ||
5259 | short offset; | ||
5260 | /* The value to write might be 32 or 64 bits, depending on L1's long | ||
5261 | * mode, and eventually we need to write that into a field of several | ||
5262 | * possible lengths. The code below first zero-extends the value to 64 | ||
5263 | * bit (field_value), and then copies only the appropriate number of | ||
5264 | * bits into the vmcs12 field. | ||
5265 | */ | ||
5266 | u64 field_value = 0; | ||
5267 | struct x86_exception e; | ||
5268 | |||
5269 | if (!nested_vmx_check_permission(vcpu) || | ||
5270 | !nested_vmx_check_vmcs12(vcpu)) | ||
5271 | return 1; | ||
5272 | |||
5273 | if (vmx_instruction_info & (1u << 10)) | ||
5274 | field_value = kvm_register_read(vcpu, | ||
5275 | (((vmx_instruction_info) >> 3) & 0xf)); | ||
5276 | else { | ||
5277 | if (get_vmx_mem_address(vcpu, exit_qualification, | ||
5278 | vmx_instruction_info, &gva)) | ||
5279 | return 1; | ||
5280 | if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, | ||
5281 | &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) { | ||
5282 | kvm_inject_page_fault(vcpu, &e); | ||
5283 | return 1; | ||
5284 | } | ||
5285 | } | ||
5286 | |||
5287 | |||
5288 | field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); | ||
5289 | if (vmcs_field_readonly(field)) { | ||
5290 | nested_vmx_failValid(vcpu, | ||
5291 | VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); | ||
5292 | skip_emulated_instruction(vcpu); | ||
5293 | return 1; | ||
5294 | } | ||
5295 | |||
5296 | offset = vmcs_field_to_offset(field); | ||
5297 | if (offset < 0) { | ||
5298 | nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); | ||
5299 | skip_emulated_instruction(vcpu); | ||
5300 | return 1; | ||
5301 | } | ||
5302 | p = ((char *) get_vmcs12(vcpu)) + offset; | ||
5303 | |||
5304 | switch (vmcs_field_type(field)) { | ||
5305 | case VMCS_FIELD_TYPE_U16: | ||
5306 | *(u16 *)p = field_value; | ||
5307 | break; | ||
5308 | case VMCS_FIELD_TYPE_U32: | ||
5309 | *(u32 *)p = field_value; | ||
5310 | break; | ||
5311 | case VMCS_FIELD_TYPE_U64: | ||
5312 | *(u64 *)p = field_value; | ||
5313 | break; | ||
5314 | case VMCS_FIELD_TYPE_NATURAL_WIDTH: | ||
5315 | *(natural_width *)p = field_value; | ||
5316 | break; | ||
5317 | default: | ||
5318 | nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); | ||
5319 | skip_emulated_instruction(vcpu); | ||
5320 | return 1; | ||
5321 | } | ||
5322 | |||
5323 | nested_vmx_succeed(vcpu); | ||
5324 | skip_emulated_instruction(vcpu); | ||
5325 | return 1; | ||
5326 | } | ||
5327 | |||
5328 | /* Emulate the VMPTRLD instruction */ | ||
5329 | static int handle_vmptrld(struct kvm_vcpu *vcpu) | ||
5330 | { | ||
5331 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
5332 | gva_t gva; | ||
5333 | gpa_t vmptr; | ||
5334 | struct x86_exception e; | ||
5335 | |||
5336 | if (!nested_vmx_check_permission(vcpu)) | ||
5337 | return 1; | ||
5338 | |||
5339 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
5340 | vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) | ||
5341 | return 1; | ||
5342 | |||
5343 | if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, | ||
5344 | sizeof(vmptr), &e)) { | ||
5345 | kvm_inject_page_fault(vcpu, &e); | ||
5346 | return 1; | ||
5347 | } | ||
5348 | |||
5349 | if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { | ||
5350 | nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); | ||
5351 | skip_emulated_instruction(vcpu); | ||
5352 | return 1; | ||
5353 | } | ||
5354 | |||
5355 | if (vmx->nested.current_vmptr != vmptr) { | ||
5356 | struct vmcs12 *new_vmcs12; | ||
5357 | struct page *page; | ||
5358 | page = nested_get_page(vcpu, vmptr); | ||
5359 | if (page == NULL) { | ||
5360 | nested_vmx_failInvalid(vcpu); | ||
5361 | skip_emulated_instruction(vcpu); | ||
5362 | return 1; | ||
5363 | } | ||
5364 | new_vmcs12 = kmap(page); | ||
5365 | if (new_vmcs12->revision_id != VMCS12_REVISION) { | ||
5366 | kunmap(page); | ||
5367 | nested_release_page_clean(page); | ||
5368 | nested_vmx_failValid(vcpu, | ||
5369 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); | ||
5370 | skip_emulated_instruction(vcpu); | ||
5371 | return 1; | ||
5372 | } | ||
5373 | if (vmx->nested.current_vmptr != -1ull) { | ||
5374 | kunmap(vmx->nested.current_vmcs12_page); | ||
5375 | nested_release_page(vmx->nested.current_vmcs12_page); | ||
5376 | } | ||
5377 | |||
5378 | vmx->nested.current_vmptr = vmptr; | ||
5379 | vmx->nested.current_vmcs12 = new_vmcs12; | ||
5380 | vmx->nested.current_vmcs12_page = page; | ||
5381 | } | ||
5382 | |||
5383 | nested_vmx_succeed(vcpu); | ||
5384 | skip_emulated_instruction(vcpu); | ||
5385 | return 1; | ||
5386 | } | ||
5387 | |||
5388 | /* Emulate the VMPTRST instruction */ | ||
5389 | static int handle_vmptrst(struct kvm_vcpu *vcpu) | ||
5390 | { | ||
5391 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
5392 | u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
5393 | gva_t vmcs_gva; | ||
5394 | struct x86_exception e; | ||
5395 | |||
5396 | if (!nested_vmx_check_permission(vcpu)) | ||
5397 | return 1; | ||
5398 | |||
5399 | if (get_vmx_mem_address(vcpu, exit_qualification, | ||
5400 | vmx_instruction_info, &vmcs_gva)) | ||
5401 | return 1; | ||
5402 | /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ | ||
5403 | if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, | ||
5404 | (void *)&to_vmx(vcpu)->nested.current_vmptr, | ||
5405 | sizeof(u64), &e)) { | ||
5406 | kvm_inject_page_fault(vcpu, &e); | ||
5407 | return 1; | ||
5408 | } | ||
5409 | nested_vmx_succeed(vcpu); | ||
5410 | skip_emulated_instruction(vcpu); | ||
5411 | return 1; | ||
5412 | } | ||
5413 | |||
5414 | /* | ||
3869 | * The exit handlers return 1 if the exit was handled fully and guest execution | 5415 | * The exit handlers return 1 if the exit was handled fully and guest execution |
3870 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | 5416 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs |
3871 | * to be done to userspace and return 0. | 5417 | * to be done to userspace and return 0. |
@@ -3886,15 +5432,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3886 | [EXIT_REASON_INVD] = handle_invd, | 5432 | [EXIT_REASON_INVD] = handle_invd, |
3887 | [EXIT_REASON_INVLPG] = handle_invlpg, | 5433 | [EXIT_REASON_INVLPG] = handle_invlpg, |
3888 | [EXIT_REASON_VMCALL] = handle_vmcall, | 5434 | [EXIT_REASON_VMCALL] = handle_vmcall, |
3889 | [EXIT_REASON_VMCLEAR] = handle_vmx_insn, | 5435 | [EXIT_REASON_VMCLEAR] = handle_vmclear, |
3890 | [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, | 5436 | [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, |
3891 | [EXIT_REASON_VMPTRLD] = handle_vmx_insn, | 5437 | [EXIT_REASON_VMPTRLD] = handle_vmptrld, |
3892 | [EXIT_REASON_VMPTRST] = handle_vmx_insn, | 5438 | [EXIT_REASON_VMPTRST] = handle_vmptrst, |
3893 | [EXIT_REASON_VMREAD] = handle_vmx_insn, | 5439 | [EXIT_REASON_VMREAD] = handle_vmread, |
3894 | [EXIT_REASON_VMRESUME] = handle_vmx_insn, | 5440 | [EXIT_REASON_VMRESUME] = handle_vmresume, |
3895 | [EXIT_REASON_VMWRITE] = handle_vmx_insn, | 5441 | [EXIT_REASON_VMWRITE] = handle_vmwrite, |
3896 | [EXIT_REASON_VMOFF] = handle_vmx_insn, | 5442 | [EXIT_REASON_VMOFF] = handle_vmoff, |
3897 | [EXIT_REASON_VMON] = handle_vmx_insn, | 5443 | [EXIT_REASON_VMON] = handle_vmon, |
3898 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, | 5444 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, |
3899 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | 5445 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, |
3900 | [EXIT_REASON_WBINVD] = handle_wbinvd, | 5446 | [EXIT_REASON_WBINVD] = handle_wbinvd, |
@@ -3911,6 +5457,229 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3911 | static const int kvm_vmx_max_exit_handlers = | 5457 | static const int kvm_vmx_max_exit_handlers = |
3912 | ARRAY_SIZE(kvm_vmx_exit_handlers); | 5458 | ARRAY_SIZE(kvm_vmx_exit_handlers); |
3913 | 5459 | ||
5460 | /* | ||
5461 | * Return 1 if we should exit from L2 to L1 to handle an MSR access, | ||
5462 | * rather than handle it ourselves in L0, i.e., check whether L1 asked, | ||
5463 | * via its MSR bitmap, to intercept the current event (a read or write of a | ||
5464 | * specific MSR). This may be the case even when L0 doesn't use MSR bitmaps. | ||
5465 | */ | ||
5466 | static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, | ||
5467 | struct vmcs12 *vmcs12, u32 exit_reason) | ||
5468 | { | ||
5469 | u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
5470 | gpa_t bitmap; | ||
5471 | |||
5472 | if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS)) | ||
5473 | return 1; | ||
5474 | |||
5475 | /* | ||
5476 | * The MSR_BITMAP page is divided into four 1024-byte bitmaps, | ||
5477 | * for the four combinations of read/write and low/high MSR numbers. | ||
5478 | * First we need to figure out which of the four to use: | ||
5479 | */ | ||
5480 | bitmap = vmcs12->msr_bitmap; | ||
5481 | if (exit_reason == EXIT_REASON_MSR_WRITE) | ||
5482 | bitmap += 2048; | ||
5483 | if (msr_index >= 0xc0000000) { | ||
5484 | msr_index -= 0xc0000000; | ||
5485 | bitmap += 1024; | ||
5486 | } | ||
5487 | |||
5488 | /* Then read the msr_index'th bit from this bitmap: */ | ||
5489 | if (msr_index < 1024*8) { | ||
5490 | unsigned char b; | ||
5491 | kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1); | ||
5492 | return 1 & (b >> (msr_index & 7)); | ||
5493 | } else | ||
5494 | return 1; /* let L1 handle the wrong parameter */ | ||
5495 | } | ||
5496 | |||
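The 4 KB bitmap page this function walks has the layout sketched below; the helper is hypothetical, shown only to illustrate the offset arithmetic in nested_vmx_exit_handled_msr():

	/*
	 * Byte offsets within the MSR-bitmap page:
	 *      0..1023  read  bitmap for MSRs 0x00000000..0x00001fff
	 *   1024..2047  read  bitmap for MSRs 0xc0000000..0xc0001fff
	 *   2048..3071  write bitmap for MSRs 0x00000000..0x00001fff
	 *   3072..4095  write bitmap for MSRs 0xc0000000..0xc0001fff
	 */
	static unsigned int msr_bitmap_byte(u32 msr, bool write)
	{
		unsigned int off = write ? 2048 : 0;

		if (msr >= 0xc0000000) {
			msr -= 0xc0000000;
			off += 1024;
		}
		return off + msr / 8;	/* bit (msr & 7) of this byte */
	}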
5497 | /* | ||
5498 | * Return 1 if we should exit from L2 to L1 to handle a CR access exit, | ||
5499 | * rather than handle it ourselves in L0. I.e., check if L1 wanted to | ||
5500 | * intercept (via guest_host_mask etc.) the current event. | ||
5501 | */ | ||
5502 | static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, | ||
5503 | struct vmcs12 *vmcs12) | ||
5504 | { | ||
5505 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
5506 | int cr = exit_qualification & 15; | ||
5507 | int reg = (exit_qualification >> 8) & 15; | ||
5508 | unsigned long val = kvm_register_read(vcpu, reg); | ||
5509 | |||
5510 | switch ((exit_qualification >> 4) & 3) { | ||
5511 | case 0: /* mov to cr */ | ||
5512 | switch (cr) { | ||
5513 | case 0: | ||
5514 | if (vmcs12->cr0_guest_host_mask & | ||
5515 | (val ^ vmcs12->cr0_read_shadow)) | ||
5516 | return 1; | ||
5517 | break; | ||
5518 | case 3: | ||
5519 | if ((vmcs12->cr3_target_count >= 1 && | ||
5520 | vmcs12->cr3_target_value0 == val) || | ||
5521 | (vmcs12->cr3_target_count >= 2 && | ||
5522 | vmcs12->cr3_target_value1 == val) || | ||
5523 | (vmcs12->cr3_target_count >= 3 && | ||
5524 | vmcs12->cr3_target_value2 == val) || | ||
5525 | (vmcs12->cr3_target_count >= 4 && | ||
5526 | vmcs12->cr3_target_value3 == val)) | ||
5527 | return 0; | ||
5528 | if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) | ||
5529 | return 1; | ||
5530 | break; | ||
5531 | case 4: | ||
5532 | if (vmcs12->cr4_guest_host_mask & | ||
5533 | (vmcs12->cr4_read_shadow ^ val)) | ||
5534 | return 1; | ||
5535 | break; | ||
5536 | case 8: | ||
5537 | if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) | ||
5538 | return 1; | ||
5539 | break; | ||
5540 | } | ||
5541 | break; | ||
5542 | case 2: /* clts */ | ||
5543 | if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && | ||
5544 | (vmcs12->cr0_read_shadow & X86_CR0_TS)) | ||
5545 | return 1; | ||
5546 | break; | ||
5547 | case 1: /* mov from cr */ | ||
5548 | switch (cr) { | ||
5549 | case 3: | ||
5550 | if (vmcs12->cpu_based_vm_exec_control & | ||
5551 | CPU_BASED_CR3_STORE_EXITING) | ||
5552 | return 1; | ||
5553 | break; | ||
5554 | case 8: | ||
5555 | if (vmcs12->cpu_based_vm_exec_control & | ||
5556 | CPU_BASED_CR8_STORE_EXITING) | ||
5557 | return 1; | ||
5558 | break; | ||
5559 | } | ||
5560 | break; | ||
5561 | case 3: /* lmsw */ | ||
5562 | /* | ||
5563 | * lmsw can change bits 1..3 of cr0, and only set bit 0 of | ||
5564 | * cr0. Other attempted changes are ignored, with no exit. | ||
5565 | */ | ||
5566 | if (vmcs12->cr0_guest_host_mask & 0xe & | ||
5567 | (val ^ vmcs12->cr0_read_shadow)) | ||
5568 | return 1; | ||
5569 | if ((vmcs12->cr0_guest_host_mask & 0x1) && | ||
5570 | !(vmcs12->cr0_read_shadow & 0x1) && | ||
5571 | (val & 0x1)) | ||
5572 | return 1; | ||
5573 | break; | ||
5574 | } | ||
5575 | return 0; | ||
5576 | } | ||
5577 | |||
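To make the exit-qualification decode above concrete: bits 3:0 give the control register number, bits 5:4 the access type (0 = mov to cr, 1 = mov from cr, 2 = clts, 3 = lmsw), and bits 11:8 the general-purpose register involved. Two hand-computed illustrations (using KVM's GPR numbering, RAX = 0, RBX = 3):

	/*
	 * "mov %rax, %cr3": cr = 3, access type = 0, reg = RAX (0)
	 *     exit_qualification = 0x003
	 * "mov %cr8, %rbx": cr = 8, access type = 1, reg = RBX (3)
	 *     exit_qualification = (3 << 8) | (1 << 4) | 8 = 0x318
	 */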
5578 | /* | ||
5579 | * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we | ||
5580 | * should handle it ourselves in L0 (and then continue L2). Only call this | ||
5581 | * when in is_guest_mode (L2). | ||
5582 | */ | ||
5583 | static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | ||
5584 | { | ||
5585 | u32 exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
5586 | u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
5587 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
5588 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
5589 | |||
5590 | if (vmx->nested.nested_run_pending) | ||
5591 | return 0; | ||
5592 | |||
5593 | if (unlikely(vmx->fail)) { | ||
5594 | printk(KERN_INFO "%s failed vm entry %x\n", | ||
5595 | __func__, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
5596 | return 1; | ||
5597 | } | ||
5598 | |||
5599 | switch (exit_reason) { | ||
5600 | case EXIT_REASON_EXCEPTION_NMI: | ||
5601 | if (!is_exception(intr_info)) | ||
5602 | return 0; | ||
5603 | else if (is_page_fault(intr_info)) | ||
5604 | return enable_ept; | ||
5605 | return vmcs12->exception_bitmap & | ||
5606 | (1u << (intr_info & INTR_INFO_VECTOR_MASK)); | ||
5607 | case EXIT_REASON_EXTERNAL_INTERRUPT: | ||
5608 | return 0; | ||
5609 | case EXIT_REASON_TRIPLE_FAULT: | ||
5610 | return 1; | ||
5611 | case EXIT_REASON_PENDING_INTERRUPT: | ||
5612 | case EXIT_REASON_NMI_WINDOW: | ||
5613 | /* | ||
5614 | * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit | ||
5615 | * (aka Interrupt Window Exiting) only when L1 turned it on, | ||
5616 | * so if we got a PENDING_INTERRUPT exit, this must be for L1. | ||
5617 | * Same for NMI Window Exiting. | ||
5618 | */ | ||
5619 | return 1; | ||
5620 | case EXIT_REASON_TASK_SWITCH: | ||
5621 | return 1; | ||
5622 | case EXIT_REASON_CPUID: | ||
5623 | return 1; | ||
5624 | case EXIT_REASON_HLT: | ||
5625 | return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); | ||
5626 | case EXIT_REASON_INVD: | ||
5627 | return 1; | ||
5628 | case EXIT_REASON_INVLPG: | ||
5629 | return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); | ||
5630 | case EXIT_REASON_RDPMC: | ||
5631 | return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); | ||
5632 | case EXIT_REASON_RDTSC: | ||
5633 | return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); | ||
5634 | case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: | ||
5635 | case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: | ||
5636 | case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: | ||
5637 | case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: | ||
5638 | case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: | ||
5639 | /* | ||
5640 | * VMX instructions trap unconditionally. This allows L1 to | ||
5641 | * emulate them for its L2 guest, i.e., allows 3-level nesting! | ||
5642 | */ | ||
5643 | return 1; | ||
5644 | case EXIT_REASON_CR_ACCESS: | ||
5645 | return nested_vmx_exit_handled_cr(vcpu, vmcs12); | ||
5646 | case EXIT_REASON_DR_ACCESS: | ||
5647 | return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); | ||
5648 | case EXIT_REASON_IO_INSTRUCTION: | ||
5649 | /* TODO: support IO bitmaps */ | ||
5650 | return 1; | ||
5651 | case EXIT_REASON_MSR_READ: | ||
5652 | case EXIT_REASON_MSR_WRITE: | ||
5653 | return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); | ||
5654 | case EXIT_REASON_INVALID_STATE: | ||
5655 | return 1; | ||
5656 | case EXIT_REASON_MWAIT_INSTRUCTION: | ||
5657 | return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); | ||
5658 | case EXIT_REASON_MONITOR_INSTRUCTION: | ||
5659 | return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); | ||
5660 | case EXIT_REASON_PAUSE_INSTRUCTION: | ||
5661 | return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || | ||
5662 | nested_cpu_has2(vmcs12, | ||
5663 | SECONDARY_EXEC_PAUSE_LOOP_EXITING); | ||
5664 | case EXIT_REASON_MCE_DURING_VMENTRY: | ||
5665 | return 0; | ||
5666 | case EXIT_REASON_TPR_BELOW_THRESHOLD: | ||
5667 | return 1; | ||
5668 | case EXIT_REASON_APIC_ACCESS: | ||
5669 | return nested_cpu_has2(vmcs12, | ||
5670 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | ||
5671 | case EXIT_REASON_EPT_VIOLATION: | ||
5672 | case EXIT_REASON_EPT_MISCONFIG: | ||
5673 | return 0; | ||
5674 | case EXIT_REASON_WBINVD: | ||
5675 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); | ||
5676 | case EXIT_REASON_XSETBV: | ||
5677 | return 1; | ||
5678 | default: | ||
5679 | return 1; | ||
5680 | } | ||
5681 | } | ||
5682 | |||
3914 | static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) | 5683 | static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) |
3915 | { | 5684 | { |
3916 | *info1 = vmcs_readl(EXIT_QUALIFICATION); | 5685 | *info1 = vmcs_readl(EXIT_QUALIFICATION); |
@@ -3933,6 +5702,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
3933 | if (vmx->emulation_required && emulate_invalid_guest_state) | 5702 | if (vmx->emulation_required && emulate_invalid_guest_state) |
3934 | return handle_invalid_guest_state(vcpu); | 5703 | return handle_invalid_guest_state(vcpu); |
3935 | 5704 | ||
5705 | /* | ||
5706 | * the KVM_REQ_EVENT optimization bit is only on for one entry, and if | ||
5707 | * we did not inject a still-pending event to L1 now because of | ||
5708 | * nested_run_pending, we need to re-enable this bit. | ||
5709 | */ | ||
5710 | if (vmx->nested.nested_run_pending) | ||
5711 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5712 | |||
5713 | if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH || | ||
5714 | exit_reason == EXIT_REASON_VMRESUME)) | ||
5715 | vmx->nested.nested_run_pending = 1; | ||
5716 | else | ||
5717 | vmx->nested.nested_run_pending = 0; | ||
5718 | |||
5719 | if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { | ||
5720 | nested_vmx_vmexit(vcpu); | ||
5721 | return 1; | ||
5722 | } | ||
5723 | |||
3936 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { | 5724 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { |
3937 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 5725 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
3938 | vcpu->run->fail_entry.hardware_entry_failure_reason | 5726 | vcpu->run->fail_entry.hardware_entry_failure_reason |
@@ -3955,7 +5743,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
3955 | "(0x%x) and exit reason is 0x%x\n", | 5743 | "(0x%x) and exit reason is 0x%x\n", |
3956 | __func__, vectoring_info, exit_reason); | 5744 | __func__, vectoring_info, exit_reason); |
3957 | 5745 | ||
3958 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { | 5746 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && |
5747 | !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( | ||
5748 | get_vmcs12(vcpu), vcpu)))) { | ||
3959 | if (vmx_interrupt_allowed(vcpu)) { | 5749 | if (vmx_interrupt_allowed(vcpu)) { |
3960 | vmx->soft_vnmi_blocked = 0; | 5750 | vmx->soft_vnmi_blocked = 0; |
3961 | } else if (vmx->vnmi_blocked_time > 1000000000LL && | 5751 | } else if (vmx->vnmi_blocked_time > 1000000000LL && |
@@ -4118,6 +5908,8 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, | |||
4118 | 5908 | ||
4119 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | 5909 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) |
4120 | { | 5910 | { |
5911 | if (is_guest_mode(&vmx->vcpu)) | ||
5912 | return; | ||
4121 | __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, | 5913 | __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, |
4122 | VM_EXIT_INSTRUCTION_LEN, | 5914 | VM_EXIT_INSTRUCTION_LEN, |
4123 | IDT_VECTORING_ERROR_CODE); | 5915 | IDT_VECTORING_ERROR_CODE); |
@@ -4125,6 +5917,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
4125 | 5917 | ||
4126 | static void vmx_cancel_injection(struct kvm_vcpu *vcpu) | 5918 | static void vmx_cancel_injection(struct kvm_vcpu *vcpu) |
4127 | { | 5919 | { |
5920 | if (is_guest_mode(vcpu)) | ||
5921 | return; | ||
4128 | __vmx_complete_interrupts(to_vmx(vcpu), | 5922 | __vmx_complete_interrupts(to_vmx(vcpu), |
4129 | vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), | 5923 | vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), |
4130 | VM_ENTRY_INSTRUCTION_LEN, | 5924 | VM_ENTRY_INSTRUCTION_LEN, |
@@ -4145,6 +5939,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4145 | { | 5939 | { |
4146 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 5940 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4147 | 5941 | ||
5942 | if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { | ||
5943 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
5944 | if (vmcs12->idt_vectoring_info_field & | ||
5945 | VECTORING_INFO_VALID_MASK) { | ||
5946 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
5947 | vmcs12->idt_vectoring_info_field); | ||
5948 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
5949 | vmcs12->vm_exit_instruction_len); | ||
5950 | if (vmcs12->idt_vectoring_info_field & | ||
5951 | VECTORING_INFO_DELIVER_CODE_MASK) | ||
5952 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | ||
5953 | vmcs12->idt_vectoring_error_code); | ||
5954 | } | ||
5955 | } | ||
5956 | |||
4148 | /* Record the guest's net vcpu time for enforced NMI injections. */ | 5957 | /* Record the guest's net vcpu time for enforced NMI injections. */ |
4149 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) | 5958 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) |
4150 | vmx->entry_time = ktime_get(); | 5959 | vmx->entry_time = ktime_get(); |
@@ -4167,6 +5976,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4167 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | 5976 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) |
4168 | vmx_set_interrupt_shadow(vcpu, 0); | 5977 | vmx_set_interrupt_shadow(vcpu, 0); |
4169 | 5978 | ||
5979 | vmx->__launched = vmx->loaded_vmcs->launched; | ||
4170 | asm( | 5980 | asm( |
4171 | /* Store host registers */ | 5981 | /* Store host registers */ |
4172 | "push %%"R"dx; push %%"R"bp;" | 5982 | "push %%"R"dx; push %%"R"bp;" |
@@ -4237,7 +6047,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4237 | "pop %%"R"bp; pop %%"R"dx \n\t" | 6047 | "pop %%"R"bp; pop %%"R"dx \n\t" |
4238 | "setbe %c[fail](%0) \n\t" | 6048 | "setbe %c[fail](%0) \n\t" |
4239 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), | 6049 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), |
4240 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), | 6050 | [launched]"i"(offsetof(struct vcpu_vmx, __launched)), |
4241 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | 6051 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), |
4242 | [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), | 6052 | [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), |
4243 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), | 6053 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), |
@@ -4276,8 +6086,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4276 | 6086 | ||
4277 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 6087 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
4278 | 6088 | ||
6089 | if (is_guest_mode(vcpu)) { | ||
6090 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
6091 | vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info; | ||
6092 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { | ||
6093 | vmcs12->idt_vectoring_error_code = | ||
6094 | vmcs_read32(IDT_VECTORING_ERROR_CODE); | ||
6095 | vmcs12->vm_exit_instruction_len = | ||
6096 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
6097 | } | ||
6098 | } | ||
6099 | |||
4279 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | 6100 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); |
4280 | vmx->launched = 1; | 6101 | vmx->loaded_vmcs->launched = 1; |
4281 | 6102 | ||
4282 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); | 6103 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); |
4283 | 6104 | ||
@@ -4289,41 +6110,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4289 | #undef R | 6110 | #undef R |
4290 | #undef Q | 6111 | #undef Q |
4291 | 6112 | ||
4292 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) | ||
4293 | { | ||
4294 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
4295 | |||
4296 | if (vmx->vmcs) { | ||
4297 | vcpu_clear(vmx); | ||
4298 | free_vmcs(vmx->vmcs); | ||
4299 | vmx->vmcs = NULL; | ||
4300 | } | ||
4301 | } | ||
4302 | |||
4303 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) | 6113 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) |
4304 | { | 6114 | { |
4305 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 6115 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4306 | 6116 | ||
4307 | free_vpid(vmx); | 6117 | free_vpid(vmx); |
4308 | vmx_free_vmcs(vcpu); | 6118 | free_nested(vmx); |
6119 | free_loaded_vmcs(vmx->loaded_vmcs); | ||
4309 | kfree(vmx->guest_msrs); | 6120 | kfree(vmx->guest_msrs); |
4310 | kvm_vcpu_uninit(vcpu); | 6121 | kvm_vcpu_uninit(vcpu); |
4311 | kmem_cache_free(kvm_vcpu_cache, vmx); | 6122 | kmem_cache_free(kvm_vcpu_cache, vmx); |
4312 | } | 6123 | } |
4313 | 6124 | ||
4314 | static inline void vmcs_init(struct vmcs *vmcs) | ||
4315 | { | ||
4316 | u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); | ||
4317 | |||
4318 | if (!vmm_exclusive) | ||
4319 | kvm_cpu_vmxon(phys_addr); | ||
4320 | |||
4321 | vmcs_clear(vmcs); | ||
4322 | |||
4323 | if (!vmm_exclusive) | ||
4324 | kvm_cpu_vmxoff(); | ||
4325 | } | ||
4326 | |||
4327 | static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | 6125 | static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) |
4328 | { | 6126 | { |
4329 | int err; | 6127 | int err; |
@@ -4345,11 +6143,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
4345 | goto uninit_vcpu; | 6143 | goto uninit_vcpu; |
4346 | } | 6144 | } |
4347 | 6145 | ||
4348 | vmx->vmcs = alloc_vmcs(); | 6146 | vmx->loaded_vmcs = &vmx->vmcs01; |
4349 | if (!vmx->vmcs) | 6147 | vmx->loaded_vmcs->vmcs = alloc_vmcs(); |
6148 | if (!vmx->loaded_vmcs->vmcs) | ||
4350 | goto free_msrs; | 6149 | goto free_msrs; |
4351 | 6150 | if (!vmm_exclusive) | |
4352 | vmcs_init(vmx->vmcs); | 6151 | kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); |
6152 | loaded_vmcs_init(vmx->loaded_vmcs); | ||
6153 | if (!vmm_exclusive) | ||
6154 | kvm_cpu_vmxoff(); | ||
4353 | 6155 | ||
4354 | cpu = get_cpu(); | 6156 | cpu = get_cpu(); |
4355 | vmx_vcpu_load(&vmx->vcpu, cpu); | 6157 | vmx_vcpu_load(&vmx->vcpu, cpu); |
@@ -4375,10 +6177,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
4375 | goto free_vmcs; | 6177 | goto free_vmcs; |
4376 | } | 6178 | } |
4377 | 6179 | ||
6180 | vmx->nested.current_vmptr = -1ull; | ||
6181 | vmx->nested.current_vmcs12 = NULL; | ||
6182 | |||
4378 | return &vmx->vcpu; | 6183 | return &vmx->vcpu; |
4379 | 6184 | ||
4380 | free_vmcs: | 6185 | free_vmcs: |
4381 | free_vmcs(vmx->vmcs); | 6186 | free_vmcs(vmx->loaded_vmcs->vmcs); |
4382 | free_msrs: | 6187 | free_msrs: |
4383 | kfree(vmx->guest_msrs); | 6188 | kfree(vmx->guest_msrs); |
4384 | uninit_vcpu: | 6189 | uninit_vcpu: |
@@ -4512,6 +6317,650 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) | |||
4512 | 6317 | ||
4513 | static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | 6318 | static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) |
4514 | { | 6319 | { |
6320 | if (func == 1 && nested) | ||
6321 | entry->ecx |= bit(X86_FEATURE_VMX); | ||
6322 | } | ||
6323 | |||
6324 | /* | ||
6325 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested | ||
6326 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it | ||
6327 | * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 | ||
6328 | * guest in a way that will both be appropriate to L1's requests, and our | ||
6329 | * needs. In addition to modifying the active vmcs (which is vmcs02), this | ||
6330 | * function also has additional necessary side-effects, like setting various | ||
6331 | * vcpu->arch fields. | ||
6332 | */ | ||
6333 | static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6334 | { | ||
6335 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
6336 | u32 exec_control; | ||
6337 | |||
6338 | vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); | ||
6339 | vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); | ||
6340 | vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); | ||
6341 | vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); | ||
6342 | vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); | ||
6343 | vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); | ||
6344 | vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); | ||
6345 | vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); | ||
6346 | vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); | ||
6347 | vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); | ||
6348 | vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); | ||
6349 | vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); | ||
6350 | vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); | ||
6351 | vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); | ||
6352 | vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); | ||
6353 | vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); | ||
6354 | vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); | ||
6355 | vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); | ||
6356 | vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); | ||
6357 | vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); | ||
6358 | vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); | ||
6359 | vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); | ||
6360 | vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); | ||
6361 | vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); | ||
6362 | vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); | ||
6363 | vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); | ||
6364 | vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); | ||
6365 | vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); | ||
6366 | vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); | ||
6367 | vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); | ||
6368 | vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); | ||
6369 | vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); | ||
6370 | vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); | ||
6371 | vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); | ||
6372 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); | ||
6373 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); | ||
6374 | |||
6375 | vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); | ||
6376 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
6377 | vmcs12->vm_entry_intr_info_field); | ||
6378 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | ||
6379 | vmcs12->vm_entry_exception_error_code); | ||
6380 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
6381 | vmcs12->vm_entry_instruction_len); | ||
6382 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | ||
6383 | vmcs12->guest_interruptibility_info); | ||
6384 | vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state); | ||
6385 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); | ||
6386 | vmcs_writel(GUEST_DR7, vmcs12->guest_dr7); | ||
6387 | vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); | ||
6388 | vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, | ||
6389 | vmcs12->guest_pending_dbg_exceptions); | ||
6390 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); | ||
6391 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); | ||
6392 | |||
6393 | vmcs_write64(VMCS_LINK_POINTER, -1ull); | ||
6394 | |||
6395 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, | ||
6396 | (vmcs_config.pin_based_exec_ctrl | | ||
6397 | vmcs12->pin_based_vm_exec_control)); | ||
6398 | |||
6399 | /* | ||
6400 | * Whether page-faults are trapped is determined by a combination of | ||
6401 | * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. | ||
6402 | * If enable_ept, L0 doesn't care about page faults and we should | ||
6403 | * set all of these to L1's desires. However, if !enable_ept, L0 does | ||
6404 | * care about (at least some) page faults, and because it is not easy | ||
6405 | * (if at all possible?) to merge L0 and L1's desires, we simply ask | ||
6406 | * to exit on each and every L2 page fault. This is done by setting | ||
6407 | * MASK=MATCH=0 and (see below) EB.PF=1. | ||
6408 | * Note that below we don't need special code to set EB.PF beyond the | ||
6409 | * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, | ||
6410 | * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when | ||
6411 | * !enable_ept, EB.PF is 1, so the "or" will always be 1. | ||
6412 | * | ||
6413 | * A problem with this approach (when !enable_ept) is that L1 may be | ||
6414 | * injected with more page faults than it asked for. This could have | ||
6415 | * caused problems, but in practice existing hypervisors don't care. | ||
6416 | * To fix this, we will need to emulate the PFEC checking (on the L1 | ||
6417 | * page tables), using walk_addr(), when injecting PFs to L1. | ||
6418 | */ | ||
6419 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, | ||
6420 | enable_ept ? vmcs12->page_fault_error_code_mask : 0); | ||
6421 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, | ||
6422 | enable_ept ? vmcs12->page_fault_error_code_match : 0); | ||
6423 | |||
6424 | if (cpu_has_secondary_exec_ctrls()) { | ||
6425 | u32 exec_control = vmx_secondary_exec_control(vmx); | ||
6426 | if (!vmx->rdtscp_enabled) | ||
6427 | exec_control &= ~SECONDARY_EXEC_RDTSCP; | ||
6428 | /* Take the following fields only from vmcs12 */ | ||
6429 | exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
6430 | if (nested_cpu_has(vmcs12, | ||
6431 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) | ||
6432 | exec_control |= vmcs12->secondary_vm_exec_control; | ||
6433 | |||
6434 | if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { | ||
6435 | /* | ||
6436 | * Translate L1 physical address to host physical | ||
6437 | * address for vmcs02. Keep the page pinned, so this | ||
6438 | * physical address remains valid. We keep a reference | ||
6439 | * to it so we can release it later. | ||
6440 | */ | ||
6441 | if (vmx->nested.apic_access_page) /* shouldn't happen */ | ||
6442 | nested_release_page(vmx->nested.apic_access_page); | ||
6443 | vmx->nested.apic_access_page = | ||
6444 | nested_get_page(vcpu, vmcs12->apic_access_addr); | ||
6445 | /* | ||
6446 | * If translation failed, no matter: This feature asks | ||
6447 | * to exit when accessing the given address, and if it | ||
6448 | * can never be accessed, this feature won't do | ||
6449 | * anything anyway. | ||
6450 | */ | ||
6451 | if (!vmx->nested.apic_access_page) | ||
6452 | exec_control &= | ||
6453 | ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
6454 | else | ||
6455 | vmcs_write64(APIC_ACCESS_ADDR, | ||
6456 | page_to_phys(vmx->nested.apic_access_page)); | ||
6457 | } | ||
6458 | |||
6459 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | ||
6460 | } | ||
6461 | |||
6462 | |||
6463 | /* | ||
6464 | * Set host-state according to L0's settings (vmcs12 is irrelevant here) | ||
6465 | * Some constant fields are set here by vmx_set_constant_host_state(). | ||
6466 | * Other fields are different per CPU, and will be set later when | ||
6467 | * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. | ||
6468 | */ | ||
6469 | vmx_set_constant_host_state(); | ||
6470 | |||
6471 | /* | ||
6472 | * HOST_RSP is normally set correctly in vmx_vcpu_run() just before | ||
6473 | * entry, but only if the current (host) sp changed from the value | ||
6474 | * we wrote last (vmx->host_rsp). This cache is no longer relevant | ||
6475 | * if we switch vmcs, and rather than hold a separate cache per vmcs, | ||
6476 | * here we just force the write to happen on entry. | ||
6477 | */ | ||
6478 | vmx->host_rsp = 0; | ||
6479 | |||
6480 | exec_control = vmx_exec_control(vmx); /* L0's desires */ | ||
6481 | exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | ||
6482 | exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; | ||
6483 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
6484 | exec_control |= vmcs12->cpu_based_vm_exec_control; | ||
6485 | /* | ||
6486 | * Merging of IO and MSR bitmaps not currently supported. | ||
6487 | * Rather, exit every time. | ||
6488 | */ | ||
6489 | exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; | ||
6490 | exec_control &= ~CPU_BASED_USE_IO_BITMAPS; | ||
6491 | exec_control |= CPU_BASED_UNCOND_IO_EXITING; | ||
6492 | |||
6493 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | ||
6494 | |||
6495 | /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the | ||
6496 | * bitwise-or of what L1 wants to trap for L2, and what we want to | ||
6497 | * trap. Note that CR0.TS also needs updating - we do this later. | ||
6498 | */ | ||
6499 | update_exception_bitmap(vcpu); | ||
6500 | vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; | ||
6501 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | ||
6502 | |||
6503 | /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ | ||
6504 | vmcs_write32(VM_EXIT_CONTROLS, | ||
6505 | vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); | ||
6506 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | | ||
6507 | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); | ||
6508 | |||
6509 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) | ||
6510 | vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); | ||
6511 | else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) | ||
6512 | vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); | ||
6513 | |||
6514 | |||
6515 | set_cr4_guest_host_mask(vmx); | ||
6516 | |||
6517 | vmcs_write64(TSC_OFFSET, | ||
6518 | vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); | ||
6519 | |||
6520 | if (enable_vpid) { | ||
6521 | /* | ||
6522 | * Trivially support vpid by letting L2s share their parent | ||
6523 | * L1's vpid. TODO: move to a more elaborate solution, giving | ||
6524 | * each L2 its own vpid and exposing the vpid feature to L1. | ||
6525 | */ | ||
6526 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); | ||
6527 | vmx_flush_tlb(vcpu); | ||
6528 | } | ||
6529 | |||
6530 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) | ||
6531 | vcpu->arch.efer = vmcs12->guest_ia32_efer; | ||
6532 | if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) | ||
6533 | vcpu->arch.efer |= (EFER_LMA | EFER_LME); | ||
6534 | else | ||
6535 | vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); | ||
6536 | /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ | ||
6537 | vmx_set_efer(vcpu, vcpu->arch.efer); | ||
6538 | |||
6539 | /* | ||
6540 | * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified | ||
6541 | * TS bit (for lazy fpu) and bits which we consider mandatory enabled. | ||
6542 | * The CR0_READ_SHADOW is what L2 should have expected to read given | ||
6543 | * the specifications by L1; it's not enough to take | ||
6544 | * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may | ||
6545 | * have more bits than L1 expected. | ||
6546 | */ | ||
6547 | vmx_set_cr0(vcpu, vmcs12->guest_cr0); | ||
6548 | vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); | ||
6549 | |||
6550 | vmx_set_cr4(vcpu, vmcs12->guest_cr4); | ||
6551 | vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); | ||
6552 | |||
6553 | /* shadow page tables on either EPT or shadow page tables */ | ||
6554 | kvm_set_cr3(vcpu, vmcs12->guest_cr3); | ||
6555 | kvm_mmu_reset_context(vcpu); | ||
6556 | |||
6557 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); | ||
6558 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); | ||
6559 | } | ||
6560 | |||
6561 | /* | ||
6562 | * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 | ||
6563 | * for running an L2 nested guest. | ||
6564 | */ | ||
6565 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | ||
6566 | { | ||
6567 | struct vmcs12 *vmcs12; | ||
6568 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
6569 | int cpu; | ||
6570 | struct loaded_vmcs *vmcs02; | ||
6571 | |||
6572 | if (!nested_vmx_check_permission(vcpu) || | ||
6573 | !nested_vmx_check_vmcs12(vcpu)) | ||
6574 | return 1; | ||
6575 | |||
6576 | skip_emulated_instruction(vcpu); | ||
6577 | vmcs12 = get_vmcs12(vcpu); | ||
6578 | |||
6579 | /* | ||
6580 | * The nested entry process starts with enforcing various prerequisites | ||
6581 | * on vmcs12 as required by the Intel SDM, and acting appropriately when | ||
6582 | * they fail: as the SDM explains, some conditions should cause the | ||
6583 | * instruction to fail, while others will cause the instruction to seem | ||
6584 | * to succeed, but return an EXIT_REASON_INVALID_STATE. | ||
6585 | * To speed up the normal (success) code path, we should avoid checking | ||
6586 | * for misconfigurations which will anyway be caught by the processor | ||
6587 | * when using the merged vmcs02. | ||
6588 | */ | ||
6589 | if (vmcs12->launch_state == launch) { | ||
6590 | nested_vmx_failValid(vcpu, | ||
6591 | launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS | ||
6592 | : VMXERR_VMRESUME_NONLAUNCHED_VMCS); | ||
6593 | return 1; | ||
6594 | } | ||
6595 | |||
6596 | if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && | ||
6597 | !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) { | ||
6598 | /* TODO: also verify bits beyond physical address width are 0 */ | ||
6599 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
6600 | return 1; | ||
6601 | } | ||
6602 | |||
6603 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && | ||
6604 | !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) { | ||
6605 | /* TODO: also verify bits beyond physical address width are 0 */ | ||
6606 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
6607 | return 1; | ||
6608 | } | ||
6609 | |||
6610 | if (vmcs12->vm_entry_msr_load_count > 0 || | ||
6611 | vmcs12->vm_exit_msr_load_count > 0 || | ||
6612 | vmcs12->vm_exit_msr_store_count > 0) { | ||
6613 | if (printk_ratelimit()) | ||
6614 | printk(KERN_WARNING | ||
6615 | "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__); | ||
6616 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
6617 | return 1; | ||
6618 | } | ||
6619 | |||
6620 | if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, | ||
6621 | nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) || | ||
6622 | !vmx_control_verify(vmcs12->secondary_vm_exec_control, | ||
6623 | nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || | ||
6624 | !vmx_control_verify(vmcs12->pin_based_vm_exec_control, | ||
6625 | nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || | ||
6626 | !vmx_control_verify(vmcs12->vm_exit_controls, | ||
6627 | nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) || | ||
6628 | !vmx_control_verify(vmcs12->vm_entry_controls, | ||
6629 | nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high)) | ||
6630 | { | ||
6631 | nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); | ||
6632 | return 1; | ||
6633 | } | ||
6634 | |||
6635 | if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || | ||
6636 | ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { | ||
6637 | nested_vmx_failValid(vcpu, | ||
6638 | VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); | ||
6639 | return 1; | ||
6640 | } | ||
6641 | |||
6642 | if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || | ||
6643 | ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { | ||
6644 | nested_vmx_entry_failure(vcpu, vmcs12, | ||
6645 | EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); | ||
6646 | return 1; | ||
6647 | } | ||
6648 | if (vmcs12->vmcs_link_pointer != -1ull) { | ||
6649 | nested_vmx_entry_failure(vcpu, vmcs12, | ||
6650 | EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); | ||
6651 | return 1; | ||
6652 | } | ||
6653 | |||
6654 | /* | ||
6655 | * We're finally done with prerequisite checking, and can start with | ||
6656 | * the nested entry. | ||
6657 | */ | ||
6658 | |||
6659 | vmcs02 = nested_get_current_vmcs02(vmx); | ||
6660 | if (!vmcs02) | ||
6661 | return -ENOMEM; | ||
6662 | |||
6663 | enter_guest_mode(vcpu); | ||
6664 | |||
6665 | vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); | ||
6666 | |||
6667 | cpu = get_cpu(); | ||
6668 | vmx->loaded_vmcs = vmcs02; | ||
6669 | vmx_vcpu_put(vcpu); | ||
6670 | vmx_vcpu_load(vcpu, cpu); | ||
6671 | vcpu->cpu = cpu; | ||
6672 | put_cpu(); | ||
6673 | |||
6674 | vmcs12->launch_state = 1; | ||
6675 | |||
6676 | prepare_vmcs02(vcpu, vmcs12); | ||
6677 | |||
6678 | /* | ||
6679 | * Note no nested_vmx_succeed or nested_vmx_fail here. At this point | ||
6680 | * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet | ||
6681 | * returned as far as L1 is concerned. It will only return (and set | ||
6682 | * the success flag) when L2 exits (see nested_vmx_vmexit()). | ||
6683 | */ | ||
6684 | return 1; | ||
6685 | } | ||
6686 | |||
6687 | /* | ||
6688 | * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date | ||
6689 | * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). | ||
6690 | * This function returns the new value we should put in vmcs12.guest_cr0. | ||
6691 | * It's not enough to just return the vmcs02 GUEST_CR0. Rather, | ||
6692 | * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now | ||
6693 | * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 | ||
6694 | * didn't trap the bit, because if L1 did, so would L0). | ||
6695 | * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have | ||
6696 | * been modified by L2, and L1 knows it. So just leave the old value of | ||
6697 | * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 | ||
6698 | * isn't relevant, because if L0 traps this bit it can set it to anything. | ||
6699 | * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have | ||
6700 | * changed these bits, and therefore they need to be updated, but L0 | ||
6701 | * didn't necessarily allow them to be changed in GUEST_CR0 - and rather | ||
6702 | * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. | ||
6703 | */ | ||
6704 | static inline unsigned long | ||
6705 | vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6706 | { | ||
6707 | return | ||
6708 | /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | | ||
6709 | /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | | ||
6710 | /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | | ||
6711 | vcpu->arch.cr0_guest_owned_bits)); | ||
6712 | } | ||
6713 | |||
6714 | static inline unsigned long | ||
6715 | vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6716 | { | ||
6717 | return | ||
6718 | /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | | ||
6719 | /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | | ||
6720 | /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | | ||
6721 | vcpu->arch.cr4_guest_owned_bits)); | ||
6722 | } | ||
6723 | |||
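Per bit, the three-way merge in vmcs12_guest_cr0()/vmcs12_guest_cr4() reduces to a small table. This informal recap follows the numbered comment above ("trapped" meaning the bit is in the respective guest/host mask):

	/*
	 *   L1 trapped bit?  L0 trapped bit?  take the bit from
	 *   ---------------  ---------------  ---------------------------
	 *        no               no          vmcs02 GUEST_CR0       (case 1)
	 *        yes              yes         vmcs12->guest_cr0      (case 2)
	 *        no               yes         vmcs02 CR0_READ_SHADOW (case 3)
	 *
	 * (L1 trapped but L0 didn't cannot happen: L0's mask includes L1's.)
	 */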
6724 | /* | ||
6725 | * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits | ||
6726 | * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), | ||
6727 | * and this function updates it to reflect the changes to the guest state while | ||
6728 | * L2 was running (and perhaps made some exits which were handled directly by L0 | ||
6729 | * without going back to L1), and to reflect the exit reason. | ||
6730 | * Note that we do not have to copy here all VMCS fields, just those that | ||
6731 | * could have changed by the L2 guest or the exit - i.e., the guest-state and | ||
6732 | * exit-information fields only. Other fields are modified by L1 with VMWRITE, | ||
6733 | * which already writes to vmcs12 directly. | ||
6734 | */ | ||
6735 | void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6736 | { | ||
6737 | /* update guest state fields: */ | ||
6738 | vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); | ||
6739 | vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); | ||
6740 | |||
6741 | kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); | ||
6742 | vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); | ||
6743 | vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); | ||
6744 | vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); | ||
6745 | |||
6746 | vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); | ||
6747 | vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); | ||
6748 | vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); | ||
6749 | vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); | ||
6750 | vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); | ||
6751 | vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); | ||
6752 | vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); | ||
6753 | vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); | ||
6754 | vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); | ||
6755 | vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); | ||
6756 | vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); | ||
6757 | vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); | ||
6758 | vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); | ||
6759 | vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); | ||
6760 | vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); | ||
6761 | vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); | ||
6762 | vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); | ||
6763 | vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); | ||
6764 | vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); | ||
6765 | vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); | ||
6766 | vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); | ||
6767 | vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); | ||
6768 | vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); | ||
6769 | vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); | ||
6770 | vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); | ||
6771 | vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); | ||
6772 | vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); | ||
6773 | vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); | ||
6774 | vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); | ||
6775 | vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); | ||
6776 | vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); | ||
6777 | vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); | ||
6778 | vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); | ||
6779 | vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); | ||
6780 | vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); | ||
6781 | vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); | ||
6782 | |||
6783 | vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE); | ||
6784 | vmcs12->guest_interruptibility_info = | ||
6785 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
6786 | vmcs12->guest_pending_dbg_exceptions = | ||
6787 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); | ||
6788 | |||
6789 | /* TODO: These cannot have changed unless we have MSR bitmaps and | ||
6790 | * the relevant bit asks not to trap the change */ | ||
6791 | vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); | ||
6792 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) | ||
6793 | vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); | ||
6794 | vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); | ||
6795 | vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); | ||
6796 | vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); | ||
6797 | |||
6798 | /* update exit information fields: */ | ||
6799 | |||
6800 | vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
6801 | vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
6802 | |||
6803 | vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
6804 | vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | ||
6805 | vmcs12->idt_vectoring_info_field = | ||
6806 | vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
6807 | vmcs12->idt_vectoring_error_code = | ||
6808 | vmcs_read32(IDT_VECTORING_ERROR_CODE); | ||
6809 | vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
6810 | vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
6811 | |||
6812 | /* clear vm-entry fields which are to be cleared on exit */ | ||
6813 | if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) | ||
6814 | vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; | ||
6815 | } | ||
6816 | |||
6817 | /* | ||
6818 | * A part of what we need to do when the nested L2 guest exits and we want to | ||
6819 | * run its L1 parent, is to reset L1's guest state to the host state specified | ||
6820 | * in vmcs12. | ||
6821 | * This function is to be called not only on normal nested exit, but also on | ||
6822 | * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry | ||
6823 | * Failures During or After Loading Guest State"). | ||
6824 | * This function should be called when the active VMCS is L1's (vmcs01). | ||
6825 | */ | ||
6826 | void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | ||
6827 | { | ||
6828 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) | ||
6829 | vcpu->arch.efer = vmcs12->host_ia32_efer; | ||
6830 | if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) | ||
6831 | vcpu->arch.efer |= (EFER_LMA | EFER_LME); | ||
6832 | else | ||
6833 | vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); | ||
6834 | vmx_set_efer(vcpu, vcpu->arch.efer); | ||
6835 | |||
6836 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); | ||
6837 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); | ||
6838 | /* | ||
6839 | * Note that calling vmx_set_cr0 is important, even if cr0 hasn't | ||
6840 | * actually changed, because it depends on the current state of | ||
6841 | * fpu_active (which may have changed). | ||
6842 | * Note that vmx_set_cr0 refers to efer set above. | ||
6843 | */ | ||
6844 | kvm_set_cr0(vcpu, vmcs12->host_cr0); | ||
6845 | /* | ||
6846 | * If we did fpu_activate()/fpu_deactivate() during L2's run, we need | ||
6847 | * to apply the same changes to L1's vmcs. We just set cr0 correctly, | ||
6848 | * but we also need to update cr0_guest_host_mask and exception_bitmap. | ||
6849 | */ | ||
6850 | update_exception_bitmap(vcpu); | ||
6851 | vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); | ||
6852 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | ||
6853 | |||
6854 | /* | ||
6855 | * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 | ||
6856 | * (KVM doesn't change it) - no reason to call set_cr4_guest_host_mask(); | ||
6857 | */ | ||
6858 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); | ||
6859 | kvm_set_cr4(vcpu, vmcs12->host_cr4); | ||
6860 | |||
6861 | /* Restore L1's CR3 - needed whether we use EPT or shadow page tables */ | ||
6862 | kvm_set_cr3(vcpu, vmcs12->host_cr3); | ||
6863 | kvm_mmu_reset_context(vcpu); | ||
6864 | |||
6865 | if (enable_vpid) { | ||
6866 | /* | ||
6867 | * Trivially support vpid by letting L2s share their parent | ||
6868 | * L1's vpid. TODO: move to a more elaborate solution, giving | ||
6869 | * each L2 its own vpid and exposing the vpid feature to L1. | ||
6870 | */ | ||
6871 | vmx_flush_tlb(vcpu); | ||
6872 | } | ||
6873 | |||
6875 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); | ||
6876 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); | ||
6877 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); | ||
6878 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); | ||
6879 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); | ||
6880 | vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); | ||
6881 | vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); | ||
6882 | vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); | ||
6883 | vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); | ||
6884 | vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); | ||
6885 | vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); | ||
6886 | vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); | ||
6887 | vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); | ||
6888 | vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); | ||
6889 | vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); | ||
6890 | |||
6891 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) | ||
6892 | vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); | ||
6893 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) | ||
6894 | vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, | ||
6895 | vmcs12->host_ia32_perf_global_ctrl); | ||
6896 | } | ||
6897 | |||
6898 | /* | ||
6899 | * Emulate an exit from the nested guest (L2) to L1, i.e., prepare to run L1 | ||
6900 | * and modify vmcs12 to make it see what it would expect to see there if | ||
6901 | * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) | ||
6902 | */ | ||
6903 | static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) | ||
6904 | { | ||
6905 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
6906 | int cpu; | ||
6907 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
6908 | |||
6909 | leave_guest_mode(vcpu); | ||
6910 | prepare_vmcs12(vcpu, vmcs12); | ||
6911 | |||
6912 | cpu = get_cpu(); | ||
6913 | vmx->loaded_vmcs = &vmx->vmcs01; | ||
6914 | vmx_vcpu_put(vcpu); | ||
6915 | vmx_vcpu_load(vcpu, cpu); | ||
6916 | vcpu->cpu = cpu; | ||
6917 | put_cpu(); | ||
6918 | |||
6919 | /* if no vmcs02 cache requested, remove the one we used */ | ||
6920 | if (VMCS02_POOL_SIZE == 0) | ||
6921 | nested_free_vmcs02(vmx, vmx->nested.current_vmptr); | ||
6922 | |||
6923 | load_vmcs12_host_state(vcpu, vmcs12); | ||
6924 | |||
6925 | /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */ | ||
6926 | vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); | ||
6927 | |||
6928 | /* This is needed for the same reason as it was needed in prepare_vmcs02 */ | ||
6929 | vmx->host_rsp = 0; | ||
6930 | |||
6931 | /* Unpin physical memory we referred to in vmcs02 */ | ||
6932 | if (vmx->nested.apic_access_page) { | ||
6933 | nested_release_page(vmx->nested.apic_access_page); | ||
6934 | vmx->nested.apic_access_page = 0; | ||
6935 | } | ||
6936 | |||
6937 | /* | ||
6938 | * Exiting from L2 to L1, we're now back to L1 which thinks it just | ||
6939 | * finished a VMLAUNCH or VMRESUME instruction, so we need to set the | ||
6940 | * success or failure flag accordingly. | ||
6941 | */ | ||
6942 | if (unlikely(vmx->fail)) { | ||
6943 | vmx->fail = 0; | ||
6944 | nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
6945 | } else | ||
6946 | nested_vmx_succeed(vcpu); | ||
6947 | } | ||
6948 | |||
6949 | /* | ||
6950 | * L1's failure to enter L2 is a subset of a normal exit, as explained in | ||
6951 | * 23.7 "VM-entry failures during or after loading guest state" (this also | ||
6952 | * lists the acceptable exit-reason and exit-qualification parameters). | ||
6953 | * It should only be called before L2 has actually started to run, and while | ||
6954 | * vmcs01 is current (it neither calls leave_guest_mode() nor switches VMCSs). | ||
6955 | */ | ||
6956 | static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, | ||
6957 | struct vmcs12 *vmcs12, | ||
6958 | u32 reason, unsigned long qualification) | ||
6959 | { | ||
6960 | load_vmcs12_host_state(vcpu, vmcs12); | ||
6961 | vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; | ||
6962 | vmcs12->exit_qualification = qualification; | ||
6963 | nested_vmx_succeed(vcpu); | ||
4515 | } | 6964 | } |
4516 | 6965 | ||
4517 | static int vmx_check_intercept(struct kvm_vcpu *vcpu, | 6966 | static int vmx_check_intercept(struct kvm_vcpu *vcpu, |
@@ -4670,16 +7119,13 @@ static int __init vmx_init(void) | |||
4670 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); | 7119 | vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); |
4671 | 7120 | ||
4672 | if (enable_ept) { | 7121 | if (enable_ept) { |
4673 | bypass_guest_pf = 0; | ||
4674 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, | 7122 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, |
4675 | VMX_EPT_EXECUTABLE_MASK); | 7123 | VMX_EPT_EXECUTABLE_MASK); |
7124 | ept_set_mmio_spte_mask(); | ||
4676 | kvm_enable_tdp(); | 7125 | kvm_enable_tdp(); |
4677 | } else | 7126 | } else |
4678 | kvm_disable_tdp(); | 7127 | kvm_disable_tdp(); |
4679 | 7128 | ||
4680 | if (bypass_guest_pf) | ||
4681 | kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); | ||
4682 | |||
4683 | return 0; | 7129 | return 0; |
4684 | 7130 | ||
4685 | out3: | 7131 | out3: |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 77c9d8673dc4..84a28ea45fa4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -347,6 +347,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) | |||
347 | vcpu->arch.cr2 = fault->address; | 347 | vcpu->arch.cr2 = fault->address; |
348 | kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); | 348 | kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); |
349 | } | 349 | } |
350 | EXPORT_SYMBOL_GPL(kvm_inject_page_fault); | ||
350 | 351 | ||
351 | void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) | 352 | void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) |
352 | { | 353 | { |
@@ -579,6 +580,22 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) | |||
579 | return best && (best->ecx & bit(X86_FEATURE_XSAVE)); | 580 | return best && (best->ecx & bit(X86_FEATURE_XSAVE)); |
580 | } | 581 | } |
581 | 582 | ||
583 | static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) | ||
584 | { | ||
585 | struct kvm_cpuid_entry2 *best; | ||
586 | |||
587 | best = kvm_find_cpuid_entry(vcpu, 7, 0); | ||
588 | return best && (best->ebx & bit(X86_FEATURE_SMEP)); | ||
589 | } | ||
590 | |||
591 | static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) | ||
592 | { | ||
593 | struct kvm_cpuid_entry2 *best; | ||
594 | |||
595 | best = kvm_find_cpuid_entry(vcpu, 7, 0); | ||
596 | return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); | ||
597 | } | ||
598 | |||
582 | static void update_cpuid(struct kvm_vcpu *vcpu) | 599 | static void update_cpuid(struct kvm_vcpu *vcpu) |
583 | { | 600 | { |
584 | struct kvm_cpuid_entry2 *best; | 601 | struct kvm_cpuid_entry2 *best; |
@@ -598,14 +615,20 @@ static void update_cpuid(struct kvm_vcpu *vcpu) | |||
598 | int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 615 | int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
599 | { | 616 | { |
600 | unsigned long old_cr4 = kvm_read_cr4(vcpu); | 617 | unsigned long old_cr4 = kvm_read_cr4(vcpu); |
601 | unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; | 618 | unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | |
602 | 619 | X86_CR4_PAE | X86_CR4_SMEP; | |
603 | if (cr4 & CR4_RESERVED_BITS) | 620 | if (cr4 & CR4_RESERVED_BITS) |
604 | return 1; | 621 | return 1; |
605 | 622 | ||
606 | if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) | 623 | if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) |
607 | return 1; | 624 | return 1; |
608 | 625 | ||
626 | if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) | ||
627 | return 1; | ||
628 | |||
629 | if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS)) | ||
630 | return 1; | ||
631 | |||
609 | if (is_long_mode(vcpu)) { | 632 | if (is_long_mode(vcpu)) { |
610 | if (!(cr4 & X86_CR4_PAE)) | 633 | if (!(cr4 & X86_CR4_PAE)) |
611 | return 1; | 634 | return 1; |
@@ -615,11 +638,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
615 | kvm_read_cr3(vcpu))) | 638 | kvm_read_cr3(vcpu))) |
616 | return 1; | 639 | return 1; |
617 | 640 | ||
618 | if (cr4 & X86_CR4_VMXE) | 641 | if (kvm_x86_ops->set_cr4(vcpu, cr4)) |
619 | return 1; | 642 | return 1; |
620 | 643 | ||
621 | kvm_x86_ops->set_cr4(vcpu, cr4); | ||
622 | |||
623 | if ((cr4 ^ old_cr4) & pdptr_bits) | 644 | if ((cr4 ^ old_cr4) & pdptr_bits) |
624 | kvm_mmu_reset_context(vcpu); | 645 | kvm_mmu_reset_context(vcpu); |
625 | 646 | ||
@@ -787,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); | |||
787 | * kvm-specific. Those are put in the beginning of the list. | 808 | * kvm-specific. Those are put in the beginning of the list. |
788 | */ | 809 | */ |
789 | 810 | ||
790 | #define KVM_SAVE_MSRS_BEGIN 8 | 811 | #define KVM_SAVE_MSRS_BEGIN 9 |
791 | static u32 msrs_to_save[] = { | 812 | static u32 msrs_to_save[] = { |
792 | MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, | 813 | MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, |
793 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, | 814 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, |
794 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, | 815 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, |
795 | HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, | 816 | HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, |
796 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | 817 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, |
797 | MSR_STAR, | 818 | MSR_STAR, |
798 | #ifdef CONFIG_X86_64 | 819 | #ifdef CONFIG_X86_64 |
@@ -1388,7 +1409,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1388 | return 1; | 1409 | return 1; |
1389 | kvm_x86_ops->patch_hypercall(vcpu, instructions); | 1410 | kvm_x86_ops->patch_hypercall(vcpu, instructions); |
1390 | ((unsigned char *)instructions)[3] = 0xc3; /* ret */ | 1411 | ((unsigned char *)instructions)[3] = 0xc3; /* ret */ |
1391 | if (copy_to_user((void __user *)addr, instructions, 4)) | 1412 | if (__copy_to_user((void __user *)addr, instructions, 4)) |
1392 | return 1; | 1413 | return 1; |
1393 | kvm->arch.hv_hypercall = data; | 1414 | kvm->arch.hv_hypercall = data; |
1394 | break; | 1415 | break; |
@@ -1415,7 +1436,7 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1415 | HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); | 1436 | HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); |
1416 | if (kvm_is_error_hva(addr)) | 1437 | if (kvm_is_error_hva(addr)) |
1417 | return 1; | 1438 | return 1; |
1418 | if (clear_user((void __user *)addr, PAGE_SIZE)) | 1439 | if (__clear_user((void __user *)addr, PAGE_SIZE)) |
1419 | return 1; | 1440 | return 1; |
1420 | vcpu->arch.hv_vapic = data; | 1441 | vcpu->arch.hv_vapic = data; |
1421 | break; | 1442 | break; |
@@ -1467,6 +1488,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) | |||
1467 | } | 1488 | } |
1468 | } | 1489 | } |
1469 | 1490 | ||
1491 | static void accumulate_steal_time(struct kvm_vcpu *vcpu) | ||
1492 | { | ||
1493 | u64 delta; | ||
1494 | |||
1495 | if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) | ||
1496 | return; | ||
1497 | |||
1498 | delta = current->sched_info.run_delay - vcpu->arch.st.last_steal; | ||
1499 | vcpu->arch.st.last_steal = current->sched_info.run_delay; | ||
1500 | vcpu->arch.st.accum_steal = delta; | ||
1501 | } | ||
1502 | |||
1503 | static void record_steal_time(struct kvm_vcpu *vcpu) | ||
1504 | { | ||
1505 | if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) | ||
1506 | return; | ||
1507 | |||
1508 | if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, | ||
1509 | &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) | ||
1510 | return; | ||
1511 | |||
1512 | vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal; | ||
1513 | vcpu->arch.st.steal.version += 2; | ||
1514 | vcpu->arch.st.accum_steal = 0; | ||
1515 | |||
1516 | kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, | ||
1517 | &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); | ||
1518 | } | ||
1519 | |||
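record_steal_time() bumps the version field by 2 on every publish, which gives the guest a seqcount-style consistency check when it reads the shared record. A hedged sketch of what a guest-side reader could look like (the 'src' mapping and the helper name are assumptions, not part of this patch; rmb() is the usual kernel read barrier):

/* Hypothetical guest-side reader for struct kvm_steal_time. */
static u64 read_steal_time(struct kvm_steal_time *src)
{
	u64 steal;
	u32 version;

	do {
		version = src->version;
		rmb();				/* version before payload */
		steal = src->steal;
		rmb();				/* payload before re-check */
	} while ((version & 1) || (version != src->version));

	return steal;
}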
1470 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | 1520 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) |
1471 | { | 1521 | { |
1472 | switch (msr) { | 1522 | switch (msr) { |
@@ -1549,6 +1599,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1549 | if (kvm_pv_enable_async_pf(vcpu, data)) | 1599 | if (kvm_pv_enable_async_pf(vcpu, data)) |
1550 | return 1; | 1600 | return 1; |
1551 | break; | 1601 | break; |
1602 | case MSR_KVM_STEAL_TIME: | ||
1603 | |||
1604 | if (unlikely(!sched_info_on())) | ||
1605 | return 1; | ||
1606 | |||
1607 | if (data & KVM_STEAL_RESERVED_MASK) | ||
1608 | return 1; | ||
1609 | |||
1610 | if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, | ||
1611 | data & KVM_STEAL_VALID_BITS)) | ||
1612 | return 1; | ||
1613 | |||
1614 | vcpu->arch.st.msr_val = data; | ||
1615 | |||
1616 | if (!(data & KVM_MSR_ENABLED)) | ||
1617 | break; | ||
1618 | |||
1619 | vcpu->arch.st.last_steal = current->sched_info.run_delay; | ||
1620 | |||
1621 | preempt_disable(); | ||
1622 | accumulate_steal_time(vcpu); | ||
1623 | preempt_enable(); | ||
1624 | |||
1625 | kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); | ||
1626 | |||
1627 | break; | ||
1628 | |||
1552 | case MSR_IA32_MCG_CTL: | 1629 | case MSR_IA32_MCG_CTL: |
1553 | case MSR_IA32_MCG_STATUS: | 1630 | case MSR_IA32_MCG_STATUS: |
1554 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: | 1631 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: |
@@ -1834,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1834 | case MSR_KVM_ASYNC_PF_EN: | 1911 | case MSR_KVM_ASYNC_PF_EN: |
1835 | data = vcpu->arch.apf.msr_val; | 1912 | data = vcpu->arch.apf.msr_val; |
1836 | break; | 1913 | break; |
1914 | case MSR_KVM_STEAL_TIME: | ||
1915 | data = vcpu->arch.st.msr_val; | ||
1916 | break; | ||
1837 | case MSR_IA32_P5_MC_ADDR: | 1917 | case MSR_IA32_P5_MC_ADDR: |
1838 | case MSR_IA32_P5_MC_TYPE: | 1918 | case MSR_IA32_P5_MC_TYPE: |
1839 | case MSR_IA32_MCG_CAP: | 1919 | case MSR_IA32_MCG_CAP: |
@@ -2145,6 +2225,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
2145 | kvm_migrate_timers(vcpu); | 2225 | kvm_migrate_timers(vcpu); |
2146 | vcpu->cpu = cpu; | 2226 | vcpu->cpu = cpu; |
2147 | } | 2227 | } |
2228 | |||
2229 | accumulate_steal_time(vcpu); | ||
2230 | kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); | ||
2148 | } | 2231 | } |
2149 | 2232 | ||
2150 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) | 2233 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) |
@@ -2283,6 +2366,13 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2283 | entry->flags = 0; | 2366 | entry->flags = 0; |
2284 | } | 2367 | } |
2285 | 2368 | ||
2369 | static bool supported_xcr0_bit(unsigned bit) | ||
2370 | { | ||
2371 | u64 mask = ((u64)1 << bit); | ||
2372 | |||
2373 | return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; | ||
2374 | } | ||
2375 | |||
2286 | #define F(x) bit(X86_FEATURE_##x) | 2376 | #define F(x) bit(X86_FEATURE_##x) |
2287 | 2377 | ||
2288 | static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | 2378 | static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, |
@@ -2328,7 +2418,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2328 | 0 /* Reserved, DCA */ | F(XMM4_1) | | 2418 | 0 /* Reserved, DCA */ | F(XMM4_1) | |
2329 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | | 2419 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | |
2330 | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | | 2420 | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | |
2331 | F(F16C); | 2421 | F(F16C) | F(RDRAND); |
2332 | /* cpuid 0x80000001.ecx */ | 2422 | /* cpuid 0x80000001.ecx */ |
2333 | const u32 kvm_supported_word6_x86_features = | 2423 | const u32 kvm_supported_word6_x86_features = |
2334 | F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | | 2424 | F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | |
@@ -2342,6 +2432,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2342 | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | | 2432 | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | |
2343 | F(PMM) | F(PMM_EN); | 2433 | F(PMM) | F(PMM_EN); |
2344 | 2434 | ||
2435 | /* cpuid 7.0.ebx */ | ||
2436 | const u32 kvm_supported_word9_x86_features = | ||
2437 | F(SMEP) | F(FSGSBASE) | F(ERMS); | ||
2438 | |||
2345 | /* all calls to cpuid_count() should be made on the same cpu */ | 2439 | /* all calls to cpuid_count() should be made on the same cpu */ |
2346 | get_cpu(); | 2440 | get_cpu(); |
2347 | do_cpuid_1_ent(entry, function, index); | 2441 | do_cpuid_1_ent(entry, function, index); |
@@ -2376,7 +2470,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2376 | } | 2470 | } |
2377 | break; | 2471 | break; |
2378 | } | 2472 | } |
2379 | /* function 4 and 0xb have additional index. */ | 2473 | /* function 4 has additional index. */ |
2380 | case 4: { | 2474 | case 4: { |
2381 | int i, cache_type; | 2475 | int i, cache_type; |
2382 | 2476 | ||
@@ -2393,6 +2487,22 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2393 | } | 2487 | } |
2394 | break; | 2488 | break; |
2395 | } | 2489 | } |
2490 | case 7: { | ||
2491 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
2492 | /* Mask ebx against host capability word 9 */ | ||
2493 | if (index == 0) { | ||
2494 | entry->ebx &= kvm_supported_word9_x86_features; | ||
2495 | cpuid_mask(&entry->ebx, 9); | ||
2496 | } else | ||
2497 | entry->ebx = 0; | ||
2498 | entry->eax = 0; | ||
2499 | entry->ecx = 0; | ||
2500 | entry->edx = 0; | ||
2501 | break; | ||
2502 | } | ||
2503 | case 9: | ||
2504 | break; | ||
2505 | /* function 0xb has additional index. */ | ||
2396 | case 0xb: { | 2506 | case 0xb: { |
2397 | int i, level_type; | 2507 | int i, level_type; |
2398 | 2508 | ||
@@ -2410,16 +2520,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2410 | break; | 2520 | break; |
2411 | } | 2521 | } |
2412 | case 0xd: { | 2522 | case 0xd: { |
2413 | int i; | 2523 | int idx, i; |
2414 | 2524 | ||
2415 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | 2525 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; |
2416 | for (i = 1; *nent < maxnent && i < 64; ++i) { | 2526 | for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) { |
2417 | if (entry[i].eax == 0) | 2527 | do_cpuid_1_ent(&entry[i], function, idx); |
2528 | if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) | ||
2418 | continue; | 2529 | continue; |
2419 | do_cpuid_1_ent(&entry[i], function, i); | ||
2420 | entry[i].flags |= | 2530 | entry[i].flags |= |
2421 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | 2531 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; |
2422 | ++*nent; | 2532 | ++*nent; |
2533 | ++i; | ||
2423 | } | 2534 | } |
2424 | break; | 2535 | break; |
2425 | } | 2536 | } |
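The reworked 0xd loop above decouples the subleaf being queried (idx) from the output slot being written (i): each subleaf is tentatively decoded into entry[i], and the slot is only committed when the corresponding XCR0 bit is actually supported, so unsupported state components no longer leave holes in the returned array. Reduced to a sketch (fill_entry(), valid() and max_out are hypothetical stand-ins):

/* Compaction pattern, illustrative only. */
for (idx = 1, i = 1; i < max_out && idx < 64; ++idx) {
	fill_entry(&out[i], idx);	/* tentative fill */
	if (!valid(idx))
		continue;		/* out[i] is reused for the next idx */
	++i;				/* commit the slot */
}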
@@ -2438,6 +2549,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2438 | (1 << KVM_FEATURE_CLOCKSOURCE2) | | 2549 | (1 << KVM_FEATURE_CLOCKSOURCE2) | |
2439 | (1 << KVM_FEATURE_ASYNC_PF) | | 2550 | (1 << KVM_FEATURE_ASYNC_PF) | |
2440 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); | 2551 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); |
2552 | |||
2553 | if (sched_info_on()) | ||
2554 | entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); | ||
2555 | |||
2441 | entry->ebx = 0; | 2556 | entry->ebx = 0; |
2442 | entry->ecx = 0; | 2557 | entry->ecx = 0; |
2443 | entry->edx = 0; | 2558 | entry->edx = 0; |
@@ -2451,6 +2566,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2451 | entry->ecx &= kvm_supported_word6_x86_features; | 2566 | entry->ecx &= kvm_supported_word6_x86_features; |
2452 | cpuid_mask(&entry->ecx, 6); | 2567 | cpuid_mask(&entry->ecx, 6); |
2453 | break; | 2568 | break; |
2569 | case 0x80000008: { | ||
2570 | unsigned g_phys_as = (entry->eax >> 16) & 0xff; | ||
2571 | unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); | ||
2572 | unsigned phys_as = entry->eax & 0xff; | ||
2573 | |||
2574 | if (!g_phys_as) | ||
2575 | g_phys_as = phys_as; | ||
2576 | entry->eax = g_phys_as | (virt_as << 8); | ||
2577 | entry->ebx = entry->edx = 0; | ||
2578 | break; | ||
2579 | } | ||
2580 | case 0x80000019: | ||
2581 | entry->ecx = entry->edx = 0; | ||
2582 | break; | ||
2583 | case 0x8000001a: | ||
2584 | break; | ||
2585 | case 0x8000001d: | ||
2586 | break; | ||
2454 | /*Add support for Centaur's CPUID instruction*/ | 2587 | /*Add support for Centaur's CPUID instruction*/ |
2455 | case 0xC0000000: | 2588 | case 0xC0000000: |
2456 | /*Just support up to 0xC0000004 now*/ | 2589 | /*Just support up to 0xC0000004 now*/ |
@@ -2460,10 +2593,16 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2460 | entry->edx &= kvm_supported_word5_x86_features; | 2593 | entry->edx &= kvm_supported_word5_x86_features; |
2461 | cpuid_mask(&entry->edx, 5); | 2594 | cpuid_mask(&entry->edx, 5); |
2462 | break; | 2595 | break; |
2596 | case 3: /* Processor serial number */ | ||
2597 | case 5: /* MONITOR/MWAIT */ | ||
2598 | case 6: /* Thermal management */ | ||
2599 | case 0xA: /* Architectural Performance Monitoring */ | ||
2600 | case 0x80000007: /* Advanced power management */ | ||
2463 | case 0xC0000002: | 2601 | case 0xC0000002: |
2464 | case 0xC0000003: | 2602 | case 0xC0000003: |
2465 | case 0xC0000004: | 2603 | case 0xC0000004: |
2466 | /*Now nothing to do, reserved for the future*/ | 2604 | default: |
2605 | entry->eax = entry->ebx = entry->ecx = entry->edx = 0; | ||
2467 | break; | 2606 | break; |
2468 | } | 2607 | } |
2469 | 2608 | ||
@@ -3817,7 +3956,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, | |||
3817 | exception); | 3956 | exception); |
3818 | } | 3957 | } |
3819 | 3958 | ||
3820 | static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, | 3959 | int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, |
3821 | gva_t addr, void *val, unsigned int bytes, | 3960 | gva_t addr, void *val, unsigned int bytes, |
3822 | struct x86_exception *exception) | 3961 | struct x86_exception *exception) |
3823 | { | 3962 | { |
@@ -3827,6 +3966,7 @@ static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, | |||
3827 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, | 3966 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, |
3828 | exception); | 3967 | exception); |
3829 | } | 3968 | } |
3969 | EXPORT_SYMBOL_GPL(kvm_read_guest_virt); | ||
3830 | 3970 | ||
3831 | static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, | 3971 | static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, |
3832 | gva_t addr, void *val, unsigned int bytes, | 3972 | gva_t addr, void *val, unsigned int bytes, |
@@ -3836,7 +3976,7 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, | |||
3836 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); | 3976 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); |
3837 | } | 3977 | } |
3838 | 3978 | ||
3839 | static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, | 3979 | int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, |
3840 | gva_t addr, void *val, | 3980 | gva_t addr, void *val, |
3841 | unsigned int bytes, | 3981 | unsigned int bytes, |
3842 | struct x86_exception *exception) | 3982 | struct x86_exception *exception) |
@@ -3868,6 +4008,42 @@ static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, | |||
3868 | out: | 4008 | out: |
3869 | return r; | 4009 | return r; |
3870 | } | 4010 | } |
4011 | EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); | ||
4012 | |||
4013 | static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, | ||
4014 | gpa_t *gpa, struct x86_exception *exception, | ||
4015 | bool write) | ||
4016 | { | ||
4017 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | ||
4018 | |||
4019 | if (vcpu_match_mmio_gva(vcpu, gva) && | ||
4020 | check_write_user_access(vcpu, write, access, | ||
4021 | vcpu->arch.access)) { | ||
4022 | *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | | ||
4023 | (gva & (PAGE_SIZE - 1)); | ||
4024 | trace_vcpu_match_mmio(gva, *gpa, write, false); | ||
4025 | return 1; | ||
4026 | } | ||
4027 | |||
4028 | if (write) | ||
4029 | access |= PFERR_WRITE_MASK; | ||
4030 | |||
4031 | *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); | ||
4032 | |||
4033 | if (*gpa == UNMAPPED_GVA) | ||
4034 | return -1; | ||
4035 | |||
4036 | /* For APIC access vmexit */ | ||
4037 | if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | ||
4038 | return 1; | ||
4039 | |||
4040 | if (vcpu_match_mmio_gpa(vcpu, *gpa)) { | ||
4041 | trace_vcpu_match_mmio(gva, *gpa, write, true); | ||
4042 | return 1; | ||
4043 | } | ||
4044 | |||
4045 | return 0; | ||
4046 | } | ||
3871 | 4047 | ||
3872 | static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, | 4048 | static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, |
3873 | unsigned long addr, | 4049 | unsigned long addr, |
@@ -3876,8 +4052,8 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, | |||
3876 | struct x86_exception *exception) | 4052 | struct x86_exception *exception) |
3877 | { | 4053 | { |
3878 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | 4054 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
3879 | gpa_t gpa; | 4055 | gpa_t gpa; |
3880 | int handled; | 4056 | int handled, ret; |
3881 | 4057 | ||
3882 | if (vcpu->mmio_read_completed) { | 4058 | if (vcpu->mmio_read_completed) { |
3883 | memcpy(val, vcpu->mmio_data, bytes); | 4059 | memcpy(val, vcpu->mmio_data, bytes); |
@@ -3887,13 +4063,12 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, | |||
3887 | return X86EMUL_CONTINUE; | 4063 | return X86EMUL_CONTINUE; |
3888 | } | 4064 | } |
3889 | 4065 | ||
3890 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); | 4066 | ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false); |
3891 | 4067 | ||
3892 | if (gpa == UNMAPPED_GVA) | 4068 | if (ret < 0) |
3893 | return X86EMUL_PROPAGATE_FAULT; | 4069 | return X86EMUL_PROPAGATE_FAULT; |
3894 | 4070 | ||
3895 | /* For APIC access vmexit */ | 4071 | if (ret) |
3896 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | ||
3897 | goto mmio; | 4072 | goto mmio; |
3898 | 4073 | ||
3899 | if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) | 4074 | if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) |
@@ -3944,16 +4119,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, | |||
3944 | struct x86_exception *exception, | 4119 | struct x86_exception *exception, |
3945 | struct kvm_vcpu *vcpu) | 4120 | struct kvm_vcpu *vcpu) |
3946 | { | 4121 | { |
3947 | gpa_t gpa; | 4122 | gpa_t gpa; |
3948 | int handled; | 4123 | int handled, ret; |
3949 | 4124 | ||
3950 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); | 4125 | ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true); |
3951 | 4126 | ||
3952 | if (gpa == UNMAPPED_GVA) | 4127 | if (ret < 0) |
3953 | return X86EMUL_PROPAGATE_FAULT; | 4128 | return X86EMUL_PROPAGATE_FAULT; |
3954 | 4129 | ||
3955 | /* For APIC access vmexit */ | 4130 | /* For APIC access vmexit */ |
3956 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | 4131 | if (ret) |
3957 | goto mmio; | 4132 | goto mmio; |
3958 | 4133 | ||
3959 | if (emulator_write_phys(vcpu, gpa, val, bytes)) | 4134 | if (emulator_write_phys(vcpu, gpa, val, bytes)) |
@@ -4473,9 +4648,24 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) | |||
4473 | kvm_queue_exception(vcpu, ctxt->exception.vector); | 4648 | kvm_queue_exception(vcpu, ctxt->exception.vector); |
4474 | } | 4649 | } |
4475 | 4650 | ||
4651 | static void init_decode_cache(struct x86_emulate_ctxt *ctxt, | ||
4652 | const unsigned long *regs) | ||
4653 | { | ||
4654 | memset(&ctxt->twobyte, 0, | ||
4655 | (void *)&ctxt->regs - (void *)&ctxt->twobyte); | ||
4656 | memcpy(ctxt->regs, regs, sizeof(ctxt->regs)); | ||
4657 | |||
4658 | ctxt->fetch.start = 0; | ||
4659 | ctxt->fetch.end = 0; | ||
4660 | ctxt->io_read.pos = 0; | ||
4661 | ctxt->io_read.end = 0; | ||
4662 | ctxt->mem_read.pos = 0; | ||
4663 | ctxt->mem_read.end = 0; | ||
4664 | } | ||
4665 | |||
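init_decode_cache() leans on the layout of struct x86_emulate_ctxt: every per-instruction decode field is assumed to sit between the 'twobyte' and 'regs' members, so a single memset() can wipe them all. The same assumption spelled with offsetof(), shown only to make the trick explicit:

/* Equivalent formulation of the memset above (layout assumption). */
memset((char *)ctxt + offsetof(struct x86_emulate_ctxt, twobyte), 0,
       offsetof(struct x86_emulate_ctxt, regs) -
       offsetof(struct x86_emulate_ctxt, twobyte));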
4476 | static void init_emulate_ctxt(struct kvm_vcpu *vcpu) | 4666 | static void init_emulate_ctxt(struct kvm_vcpu *vcpu) |
4477 | { | 4667 | { |
4478 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 4668 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
4479 | int cs_db, cs_l; | 4669 | int cs_db, cs_l; |
4480 | 4670 | ||
4481 | /* | 4671 | /* |
@@ -4488,40 +4678,38 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) | |||
4488 | 4678 | ||
4489 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | 4679 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); |
4490 | 4680 | ||
4491 | vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); | 4681 | ctxt->eflags = kvm_get_rflags(vcpu); |
4492 | vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); | 4682 | ctxt->eip = kvm_rip_read(vcpu); |
4493 | vcpu->arch.emulate_ctxt.mode = | 4683 | ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : |
4494 | (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : | 4684 | (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : |
4495 | (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) | 4685 | cs_l ? X86EMUL_MODE_PROT64 : |
4496 | ? X86EMUL_MODE_VM86 : cs_l | 4686 | cs_db ? X86EMUL_MODE_PROT32 : |
4497 | ? X86EMUL_MODE_PROT64 : cs_db | 4687 | X86EMUL_MODE_PROT16; |
4498 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | 4688 | ctxt->guest_mode = is_guest_mode(vcpu); |
4499 | vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); | 4689 | |
4500 | memset(c, 0, sizeof(struct decode_cache)); | 4690 | init_decode_cache(ctxt, vcpu->arch.regs); |
4501 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | ||
4502 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; | 4691 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; |
4503 | } | 4692 | } |
4504 | 4693 | ||
4505 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) | 4694 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) |
4506 | { | 4695 | { |
4507 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 4696 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
4508 | int ret; | 4697 | int ret; |
4509 | 4698 | ||
4510 | init_emulate_ctxt(vcpu); | 4699 | init_emulate_ctxt(vcpu); |
4511 | 4700 | ||
4512 | vcpu->arch.emulate_ctxt.decode.op_bytes = 2; | 4701 | ctxt->op_bytes = 2; |
4513 | vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; | 4702 | ctxt->ad_bytes = 2; |
4514 | vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + | 4703 | ctxt->_eip = ctxt->eip + inc_eip; |
4515 | inc_eip; | 4704 | ret = emulate_int_real(ctxt, irq); |
4516 | ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); | ||
4517 | 4705 | ||
4518 | if (ret != X86EMUL_CONTINUE) | 4706 | if (ret != X86EMUL_CONTINUE) |
4519 | return EMULATE_FAIL; | 4707 | return EMULATE_FAIL; |
4520 | 4708 | ||
4521 | vcpu->arch.emulate_ctxt.eip = c->eip; | 4709 | ctxt->eip = ctxt->_eip; |
4522 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | 4710 | memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); |
4523 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | 4711 | kvm_rip_write(vcpu, ctxt->eip); |
4524 | kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | 4712 | kvm_set_rflags(vcpu, ctxt->eflags); |
4525 | 4713 | ||
4526 | if (irq == NMI_VECTOR) | 4714 | if (irq == NMI_VECTOR) |
4527 | vcpu->arch.nmi_pending = false; | 4715 | vcpu->arch.nmi_pending = false; |
@@ -4582,21 +4770,21 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, | |||
4582 | int insn_len) | 4770 | int insn_len) |
4583 | { | 4771 | { |
4584 | int r; | 4772 | int r; |
4585 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 4773 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
4586 | bool writeback = true; | 4774 | bool writeback = true; |
4587 | 4775 | ||
4588 | kvm_clear_exception_queue(vcpu); | 4776 | kvm_clear_exception_queue(vcpu); |
4589 | 4777 | ||
4590 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { | 4778 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { |
4591 | init_emulate_ctxt(vcpu); | 4779 | init_emulate_ctxt(vcpu); |
4592 | vcpu->arch.emulate_ctxt.interruptibility = 0; | 4780 | ctxt->interruptibility = 0; |
4593 | vcpu->arch.emulate_ctxt.have_exception = false; | 4781 | ctxt->have_exception = false; |
4594 | vcpu->arch.emulate_ctxt.perm_ok = false; | 4782 | ctxt->perm_ok = false; |
4595 | 4783 | ||
4596 | vcpu->arch.emulate_ctxt.only_vendor_specific_insn | 4784 | ctxt->only_vendor_specific_insn |
4597 | = emulation_type & EMULTYPE_TRAP_UD; | 4785 | = emulation_type & EMULTYPE_TRAP_UD; |
4598 | 4786 | ||
4599 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); | 4787 | r = x86_decode_insn(ctxt, insn, insn_len); |
4600 | 4788 | ||
4601 | trace_kvm_emulate_insn_start(vcpu); | 4789 | trace_kvm_emulate_insn_start(vcpu); |
4602 | ++vcpu->stat.insn_emulation; | 4790 | ++vcpu->stat.insn_emulation; |
@@ -4612,7 +4800,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, | |||
4612 | } | 4800 | } |
4613 | 4801 | ||
4614 | if (emulation_type & EMULTYPE_SKIP) { | 4802 | if (emulation_type & EMULTYPE_SKIP) { |
4615 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); | 4803 | kvm_rip_write(vcpu, ctxt->_eip); |
4616 | return EMULATE_DONE; | 4804 | return EMULATE_DONE; |
4617 | } | 4805 | } |
4618 | 4806 | ||
@@ -4620,11 +4808,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, | |||
4620 | changes registers values during IO operation */ | 4808 | changes registers values during IO operation */ |
4621 | if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { | 4809 | if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { |
4622 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; | 4810 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; |
4623 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | 4811 | memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs); |
4624 | } | 4812 | } |
4625 | 4813 | ||
4626 | restart: | 4814 | restart: |
4627 | r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); | 4815 | r = x86_emulate_insn(ctxt); |
4628 | 4816 | ||
4629 | if (r == EMULATION_INTERCEPTED) | 4817 | if (r == EMULATION_INTERCEPTED) |
4630 | return EMULATE_DONE; | 4818 | return EMULATE_DONE; |
@@ -4636,7 +4824,7 @@ restart: | |||
4636 | return handle_emulation_failure(vcpu); | 4824 | return handle_emulation_failure(vcpu); |
4637 | } | 4825 | } |
4638 | 4826 | ||
4639 | if (vcpu->arch.emulate_ctxt.have_exception) { | 4827 | if (ctxt->have_exception) { |
4640 | inject_emulated_exception(vcpu); | 4828 | inject_emulated_exception(vcpu); |
4641 | r = EMULATE_DONE; | 4829 | r = EMULATE_DONE; |
4642 | } else if (vcpu->arch.pio.count) { | 4830 | } else if (vcpu->arch.pio.count) { |
@@ -4655,13 +4843,12 @@ restart: | |||
4655 | r = EMULATE_DONE; | 4843 | r = EMULATE_DONE; |
4656 | 4844 | ||
4657 | if (writeback) { | 4845 | if (writeback) { |
4658 | toggle_interruptibility(vcpu, | 4846 | toggle_interruptibility(vcpu, ctxt->interruptibility); |
4659 | vcpu->arch.emulate_ctxt.interruptibility); | 4847 | kvm_set_rflags(vcpu, ctxt->eflags); |
4660 | kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | ||
4661 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 4848 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
4662 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | 4849 | memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); |
4663 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; | 4850 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; |
4664 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | 4851 | kvm_rip_write(vcpu, ctxt->eip); |
4665 | } else | 4852 | } else |
4666 | vcpu->arch.emulate_regs_need_sync_to_vcpu = true; | 4853 | vcpu->arch.emulate_regs_need_sync_to_vcpu = true; |
4667 | 4854 | ||
@@ -4878,6 +5065,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) | |||
4878 | } | 5065 | } |
4879 | EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); | 5066 | EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); |
4880 | 5067 | ||
5068 | static void kvm_set_mmio_spte_mask(void) | ||
5069 | { | ||
5070 | u64 mask; | ||
5071 | int maxphyaddr = boot_cpu_data.x86_phys_bits; | ||
5072 | |||
5073 | /* | ||
5074 | * Set the reserved bits and the present bit of a paging-structure | ||
5075 | * entry to generate a page fault with PFERR.RSVD = 1. | ||
5076 | */ | ||
5077 | mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; | ||
5078 | mask |= 1ull; | ||
5079 | |||
5080 | #ifdef CONFIG_X86_64 | ||
5081 | /* | ||
5082 | * If no reserved bit is available (maxphyaddr == 52), clear the present | ||
5083 | * bit to disable mmio page faults. | ||
5084 | */ | ||
5085 | if (maxphyaddr == 52) | ||
5086 | mask &= ~1ull; | ||
5087 | #endif | ||
5088 | |||
5089 | kvm_mmu_set_mmio_spte_mask(mask); | ||
5090 | } | ||
5091 | |||
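To put numbers on the mask: with maxphyaddr = 40, a plausible value for hardware of this era, the expression sets the reserved physical-address bits 40..62 plus the present bit. A small standalone sketch that just prints the result:

#include <stdio.h>

int main(void)
{
	int maxphyaddr = 40;	/* stand-in for boot_cpu_data.x86_phys_bits */
	unsigned long long mask;

	mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
	mask |= 1ull;		/* present bit */

	printf("mmio spte mask = %#llx\n", mask); /* 0x7fffff0000000001 */
	return 0;
}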
4881 | int kvm_arch_init(void *opaque) | 5092 | int kvm_arch_init(void *opaque) |
4882 | { | 5093 | { |
4883 | int r; | 5094 | int r; |
@@ -4904,10 +5115,10 @@ int kvm_arch_init(void *opaque) | |||
4904 | if (r) | 5115 | if (r) |
4905 | goto out; | 5116 | goto out; |
4906 | 5117 | ||
5118 | kvm_set_mmio_spte_mask(); | ||
4907 | kvm_init_msr_list(); | 5119 | kvm_init_msr_list(); |
4908 | 5120 | ||
4909 | kvm_x86_ops = ops; | 5121 | kvm_x86_ops = ops; |
4910 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); | ||
4911 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, | 5122 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, |
4912 | PT_DIRTY_MASK, PT64_NX_MASK, 0); | 5123 | PT_DIRTY_MASK, PT64_NX_MASK, 0); |
4913 | 5124 | ||
@@ -5082,8 +5293,7 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) | |||
5082 | 5293 | ||
5083 | kvm_x86_ops->patch_hypercall(vcpu, instruction); | 5294 | kvm_x86_ops->patch_hypercall(vcpu, instruction); |
5084 | 5295 | ||
5085 | return emulator_write_emulated(&vcpu->arch.emulate_ctxt, | 5296 | return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); |
5086 | rip, instruction, 3, NULL); | ||
5087 | } | 5297 | } |
5088 | 5298 | ||
5089 | static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) | 5299 | static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) |
@@ -5384,6 +5594,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5384 | r = 1; | 5594 | r = 1; |
5385 | goto out; | 5595 | goto out; |
5386 | } | 5596 | } |
5597 | if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) | ||
5598 | record_steal_time(vcpu); | ||
5599 | |||
5387 | } | 5600 | } |
5388 | 5601 | ||
5389 | r = kvm_mmu_reload(vcpu); | 5602 | r = kvm_mmu_reload(vcpu); |
@@ -5671,8 +5884,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
5671 | * that usually, but some bad designed PV devices (vmware | 5884 | * that usually, but some bad designed PV devices (vmware |
5672 | * backdoor interface) need this to work | 5885 | * backdoor interface) need this to work |
5673 | */ | 5886 | */ |
5674 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 5887 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
5675 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | 5888 | memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); |
5676 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; | 5889 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; |
5677 | } | 5890 | } |
5678 | regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); | 5891 | regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
@@ -5801,21 +6014,20 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, | |||
5801 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | 6014 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, |
5802 | bool has_error_code, u32 error_code) | 6015 | bool has_error_code, u32 error_code) |
5803 | { | 6016 | { |
5804 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 6017 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
5805 | int ret; | 6018 | int ret; |
5806 | 6019 | ||
5807 | init_emulate_ctxt(vcpu); | 6020 | init_emulate_ctxt(vcpu); |
5808 | 6021 | ||
5809 | ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, | 6022 | ret = emulator_task_switch(ctxt, tss_selector, reason, |
5810 | tss_selector, reason, has_error_code, | 6023 | has_error_code, error_code); |
5811 | error_code); | ||
5812 | 6024 | ||
5813 | if (ret) | 6025 | if (ret) |
5814 | return EMULATE_FAIL; | 6026 | return EMULATE_FAIL; |
5815 | 6027 | ||
5816 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | 6028 | memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); |
5817 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | 6029 | kvm_rip_write(vcpu, ctxt->eip); |
5818 | kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | 6030 | kvm_set_rflags(vcpu, ctxt->eflags); |
5819 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 6031 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
5820 | return EMULATE_DONE; | 6032 | return EMULATE_DONE; |
5821 | } | 6033 | } |
@@ -6093,12 +6305,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | |||
6093 | if (r == 0) | 6305 | if (r == 0) |
6094 | r = kvm_mmu_setup(vcpu); | 6306 | r = kvm_mmu_setup(vcpu); |
6095 | vcpu_put(vcpu); | 6307 | vcpu_put(vcpu); |
6096 | if (r < 0) | ||
6097 | goto free_vcpu; | ||
6098 | 6308 | ||
6099 | return 0; | ||
6100 | free_vcpu: | ||
6101 | kvm_x86_ops->vcpu_free(vcpu); | ||
6102 | return r; | 6309 | return r; |
6103 | } | 6310 | } |
6104 | 6311 | ||
@@ -6126,6 +6333,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) | |||
6126 | 6333 | ||
6127 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 6334 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
6128 | vcpu->arch.apf.msr_val = 0; | 6335 | vcpu->arch.apf.msr_val = 0; |
6336 | vcpu->arch.st.msr_val = 0; | ||
6129 | 6337 | ||
6130 | kvmclock_reset(vcpu); | 6338 | kvmclock_reset(vcpu); |
6131 | 6339 | ||
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e407ed3df817..d36fe237c665 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -75,10 +75,54 @@ static inline u32 bit(int bitno) | |||
75 | return 1 << (bitno & 31); | 75 | return 1 << (bitno & 31); |
76 | } | 76 | } |
77 | 77 | ||
78 | static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, | ||
79 | gva_t gva, gfn_t gfn, unsigned access) | ||
80 | { | ||
81 | vcpu->arch.mmio_gva = gva & PAGE_MASK; | ||
82 | vcpu->arch.access = access; | ||
83 | vcpu->arch.mmio_gfn = gfn; | ||
84 | } | ||
85 | |||
86 | /* | ||
87 | * Clear the mmio cache info for the given gva; | ||
88 | * in particular, if gva is ~0ul, we clear all mmio cache info. | ||
89 | */ | ||
90 | static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva) | ||
91 | { | ||
92 | if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK)) | ||
93 | return; | ||
94 | |||
95 | vcpu->arch.mmio_gva = 0; | ||
96 | } | ||
97 | |||
98 | static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva) | ||
99 | { | ||
100 | if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK)) | ||
101 | return true; | ||
102 | |||
103 | return false; | ||
104 | } | ||
105 | |||
106 | static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) | ||
107 | { | ||
108 | if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT) | ||
109 | return true; | ||
110 | |||
111 | return false; | ||
112 | } | ||
113 | |||
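These inline helpers implement a small cache protocol: the MMU records (gva, gfn, access) when it identifies an MMIO access, vcpu_mmio_gva_to_gpa() in x86.c probes the cache through the match helpers on later emulated accesses, and invalidation clears it. A hedged sketch of the expected call pattern (the call sites are assumptions, not shown in this hunk):

/* On the MMU's MMIO fault path: */
vcpu_cache_mmio_info(vcpu, gva, gpa >> PAGE_SHIFT, access);

/* On INVLPG for one address, and on a full flush, respectively: */
vcpu_clear_mmio_info(vcpu, gva);
vcpu_clear_mmio_info(vcpu, ~0ul);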
78 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); | 114 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); |
79 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); | 115 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); |
80 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); | 116 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); |
81 | 117 | ||
82 | void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); | 118 | void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); |
83 | 119 | ||
120 | int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, | ||
121 | gva_t addr, void *val, unsigned int bytes, | ||
122 | struct x86_exception *exception); | ||
123 | |||
124 | int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, | ||
125 | gva_t addr, void *val, unsigned int bytes, | ||
126 | struct x86_exception *exception); | ||
127 | |||
84 | #endif | 128 | #endif |