diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-05-21 20:16:21 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-05-21 20:16:21 -0400 |
commit | 98edb6ca4174f17a64890a02f44c211c8b44fb3c (patch) | |
tree | 033bc5f7da410046d28dd1cefcd2d63cda33d25b /Documentation | |
parent | a8251096b427283c47e7d8f9568be6b388dd68ec (diff) | |
parent | 8fbf065d625617bbbf6b72d5f78f84ad13c8b547 (diff) |
Merge branch 'kvm-updates/2.6.35' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/2.6.35' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (269 commits)
KVM: x86: Add missing locking to arch specific vcpu ioctls
KVM: PPC: Add missing vcpu_load()/vcpu_put() in vcpu ioctls
KVM: MMU: Segregate shadow pages with different cr0.wp
KVM: x86: Check LMA bit before set_efer
KVM: Don't allow lmsw to clear cr0.pe
KVM: Add cpuid.txt file
KVM: x86: Tell the guest we'll warn it about tsc stability
x86, paravirt: don't compute pvclock adjustments if we trust the tsc
x86: KVM guest: Try using new kvm clock msrs
KVM: x86: export paravirtual cpuid flags in KVM_GET_SUPPORTED_CPUID
KVM: x86: add new KVMCLOCK cpuid feature
KVM: x86: change msr numbers for kvmclock
x86, paravirt: Add a global synchronization point for pvclock
x86, paravirt: Enable pvclock flags in vcpu_time_info structure
KVM: x86: Inject #GP with the right rip on efer writes
KVM: SVM: Don't allow nested guest to VMMCALL into host
KVM: x86: Fix exception reinjection forced to true
KVM: Fix wallclock version writing race
KVM: MMU: Don't read pdptrs with mmu spinlock held in mmu_alloc_roots
KVM: VMX: enable VMXON check with SMX enabled (Intel TXT)
...
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/kvm/api.txt | 208 | ||||
-rw-r--r-- | Documentation/kvm/cpuid.txt | 42 | ||||
-rw-r--r-- | Documentation/kvm/mmu.txt | 304 |
3 files changed, 552 insertions, 2 deletions
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index c6416a398163..a237518e51b9 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt | |||
@@ -656,6 +656,7 @@ struct kvm_clock_data { | |||
656 | 4.29 KVM_GET_VCPU_EVENTS | 656 | 4.29 KVM_GET_VCPU_EVENTS |
657 | 657 | ||
658 | Capability: KVM_CAP_VCPU_EVENTS | 658 | Capability: KVM_CAP_VCPU_EVENTS |
659 | Extended by: KVM_CAP_INTR_SHADOW | ||
659 | Architectures: x86 | 660 | Architectures: x86 |
660 | Type: vm ioctl | 661 | Type: vm ioctl |
661 | Parameters: struct kvm_vcpu_event (out) | 662 | Parameters: struct kvm_vcpu_event (out) |
@@ -676,7 +677,7 @@ struct kvm_vcpu_events { | |||
676 | __u8 injected; | 677 | __u8 injected; |
677 | __u8 nr; | 678 | __u8 nr; |
678 | __u8 soft; | 679 | __u8 soft; |
679 | __u8 pad; | 680 | __u8 shadow; |
680 | } interrupt; | 681 | } interrupt; |
681 | struct { | 682 | struct { |
682 | __u8 injected; | 683 | __u8 injected; |
@@ -688,9 +689,13 @@ struct kvm_vcpu_events { | |||
688 | __u32 flags; | 689 | __u32 flags; |
689 | }; | 690 | }; |
690 | 691 | ||
692 | KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that | ||
693 | interrupt.shadow contains a valid state. Otherwise, this field is undefined. | ||
694 | |||
691 | 4.30 KVM_SET_VCPU_EVENTS | 695 | 4.30 KVM_SET_VCPU_EVENTS |
692 | 696 | ||
693 | Capability: KVM_CAP_VCPU_EVENTS | 697 | Capability: KVM_CAP_VCPU_EVENTS |
698 | Extended by: KVM_CAP_INTR_SHADOW | ||
694 | Architectures: x86 | 699 | Architectures: x86 |
695 | Type: vm ioctl | 700 | Type: vm ioctl |
696 | Parameters: struct kvm_vcpu_event (in) | 701 | Parameters: struct kvm_vcpu_event (in) |
@@ -709,6 +714,183 @@ current in-kernel state. The bits are: | |||
709 | KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel | 714 | KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel |
710 | KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector | 715 | KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector |
711 | 716 | ||
717 | If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in | ||
718 | the flags field to signal that interrupt.shadow contains a valid state and | ||
719 | shall be written into the VCPU. | ||
720 | |||
721 | 4.32 KVM_GET_DEBUGREGS | ||
722 | |||
723 | Capability: KVM_CAP_DEBUGREGS | ||
724 | Architectures: x86 | ||
725 | Type: vm ioctl | ||
726 | Parameters: struct kvm_debugregs (out) | ||
727 | Returns: 0 on success, -1 on error | ||
728 | |||
729 | Reads debug registers from the vcpu. | ||
730 | |||
731 | struct kvm_debugregs { | ||
732 | __u64 db[4]; | ||
733 | __u64 dr6; | ||
734 | __u64 dr7; | ||
735 | __u64 flags; | ||
736 | __u64 reserved[9]; | ||
737 | }; | ||
738 | |||
739 | 4.33 KVM_SET_DEBUGREGS | ||
740 | |||
741 | Capability: KVM_CAP_DEBUGREGS | ||
742 | Architectures: x86 | ||
743 | Type: vm ioctl | ||
744 | Parameters: struct kvm_debugregs (in) | ||
745 | Returns: 0 on success, -1 on error | ||
746 | |||
747 | Writes debug registers into the vcpu. | ||
748 | |||
749 | See KVM_GET_DEBUGREGS for the data structure. The flags field is unused | ||
750 | yet and must be cleared on entry. | ||
751 | |||
752 | 4.34 KVM_SET_USER_MEMORY_REGION | ||
753 | |||
754 | Capability: KVM_CAP_USER_MEM | ||
755 | Architectures: all | ||
756 | Type: vm ioctl | ||
757 | Parameters: struct kvm_userspace_memory_region (in) | ||
758 | Returns: 0 on success, -1 on error | ||
759 | |||
760 | struct kvm_userspace_memory_region { | ||
761 | __u32 slot; | ||
762 | __u32 flags; | ||
763 | __u64 guest_phys_addr; | ||
764 | __u64 memory_size; /* bytes */ | ||
765 | __u64 userspace_addr; /* start of the userspace allocated memory */ | ||
766 | }; | ||
767 | |||
768 | /* for kvm_memory_region::flags */ | ||
769 | #define KVM_MEM_LOG_DIRTY_PAGES 1UL | ||
770 | |||
771 | This ioctl allows the user to create or modify a guest physical memory | ||
772 | slot. When changing an existing slot, it may be moved in the guest | ||
773 | physical memory space, or its flags may be modified. It may not be | ||
774 | resized. Slots may not overlap in guest physical address space. | ||
775 | |||
776 | Memory for the region is taken starting at the address denoted by the | ||
777 | field userspace_addr, which must point at user addressable memory for | ||
778 | the entire memory slot size. Any object may back this memory, including | ||
779 | anonymous memory, ordinary files, and hugetlbfs. | ||
780 | |||
781 | It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr | ||
782 | be identical. This allows large pages in the guest to be backed by large | ||
783 | pages in the host. | ||
784 | |||
785 | The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which | ||
786 | instructs kvm to keep track of writes to memory within the slot. See | ||
787 | the KVM_GET_DIRTY_LOG ioctl. | ||
788 | |||
789 | When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory | ||
790 | region are automatically reflected into the guest. For example, an mmap() | ||
791 | that affects the region will be made visible immediately. Another example | ||
792 | is madvise(MADV_DROP). | ||
793 | |||
794 | It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. | ||
795 | The KVM_SET_MEMORY_REGION does not allow fine grained control over memory | ||
796 | allocation and is deprecated. | ||
797 | |||
798 | 4.35 KVM_SET_TSS_ADDR | ||
799 | |||
800 | Capability: KVM_CAP_SET_TSS_ADDR | ||
801 | Architectures: x86 | ||
802 | Type: vm ioctl | ||
803 | Parameters: unsigned long tss_address (in) | ||
804 | Returns: 0 on success, -1 on error | ||
805 | |||
806 | This ioctl defines the physical address of a three-page region in the guest | ||
807 | physical address space. The region must be within the first 4GB of the | ||
808 | guest physical address space and must not conflict with any memory slot | ||
809 | or any mmio address. The guest may malfunction if it accesses this memory | ||
810 | region. | ||
811 | |||
812 | This ioctl is required on Intel-based hosts. This is needed on Intel hardware | ||
813 | because of a quirk in the virtualization implementation (see the internals | ||
814 | documentation when it pops into existence). | ||
815 | |||
816 | 4.36 KVM_ENABLE_CAP | ||
817 | |||
818 | Capability: KVM_CAP_ENABLE_CAP | ||
819 | Architectures: ppc | ||
820 | Type: vcpu ioctl | ||
821 | Parameters: struct kvm_enable_cap (in) | ||
822 | Returns: 0 on success; -1 on error | ||
823 | |||
824 | +Not all extensions are enabled by default. Using this ioctl the application | ||
825 | can enable an extension, making it available to the guest. | ||
826 | |||
827 | On systems that do not support this ioctl, it always fails. On systems that | ||
828 | do support it, it only works for extensions that are supported for enablement. | ||
829 | |||
830 | To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should | ||
831 | be used. | ||
832 | |||
833 | struct kvm_enable_cap { | ||
834 | /* in */ | ||
835 | __u32 cap; | ||
836 | |||
837 | The capability that is supposed to get enabled. | ||
838 | |||
839 | __u32 flags; | ||
840 | |||
841 | A bitfield indicating future enhancements. Has to be 0 for now. | ||
842 | |||
843 | __u64 args[4]; | ||
844 | |||
845 | Arguments for enabling a feature. If a feature needs initial values to | ||
846 | function properly, this is the place to put them. | ||
847 | |||
848 | __u8 pad[64]; | ||
849 | }; | ||
850 | |||
851 | 4.37 KVM_GET_MP_STATE | ||
852 | |||
853 | Capability: KVM_CAP_MP_STATE | ||
854 | Architectures: x86, ia64 | ||
855 | Type: vcpu ioctl | ||
856 | Parameters: struct kvm_mp_state (out) | ||
857 | Returns: 0 on success; -1 on error | ||
858 | |||
859 | struct kvm_mp_state { | ||
860 | __u32 mp_state; | ||
861 | }; | ||
862 | |||
863 | Returns the vcpu's current "multiprocessing state" (though also valid on | ||
864 | uniprocessor guests). | ||
865 | |||
866 | Possible values are: | ||
867 | |||
868 | - KVM_MP_STATE_RUNNABLE: the vcpu is currently running | ||
869 | - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP) | ||
870 | which has not yet received an INIT signal | ||
871 | - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is | ||
872 | now ready for a SIPI | ||
873 | - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and | ||
874 | is waiting for an interrupt | ||
875 | - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector | ||
876 | accesible via KVM_GET_VCPU_EVENTS) | ||
877 | |||
878 | This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel | ||
879 | irqchip, the multiprocessing state must be maintained by userspace. | ||
880 | |||
881 | 4.38 KVM_SET_MP_STATE | ||
882 | |||
883 | Capability: KVM_CAP_MP_STATE | ||
884 | Architectures: x86, ia64 | ||
885 | Type: vcpu ioctl | ||
886 | Parameters: struct kvm_mp_state (in) | ||
887 | Returns: 0 on success; -1 on error | ||
888 | |||
889 | Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for | ||
890 | arguments. | ||
891 | |||
892 | This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel | ||
893 | irqchip, the multiprocessing state must be maintained by userspace. | ||
712 | 894 | ||
713 | 5. The kvm_run structure | 895 | 5. The kvm_run structure |
714 | 896 | ||
@@ -820,6 +1002,13 @@ executed a memory-mapped I/O instruction which could not be satisfied | |||
820 | by kvm. The 'data' member contains the written data if 'is_write' is | 1002 | by kvm. The 'data' member contains the written data if 'is_write' is |
821 | true, and should be filled by application code otherwise. | 1003 | true, and should be filled by application code otherwise. |
822 | 1004 | ||
1005 | NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding | ||
1006 | operations are complete (and guest state is consistent) only after userspace | ||
1007 | has re-entered the kernel with KVM_RUN. The kernel side will first finish | ||
1008 | incomplete operations and then check for pending signals. Userspace | ||
1009 | can re-enter the guest with an unmasked signal pending to complete | ||
1010 | pending operations. | ||
1011 | |||
823 | /* KVM_EXIT_HYPERCALL */ | 1012 | /* KVM_EXIT_HYPERCALL */ |
824 | struct { | 1013 | struct { |
825 | __u64 nr; | 1014 | __u64 nr; |
@@ -829,7 +1018,9 @@ true, and should be filled by application code otherwise. | |||
829 | __u32 pad; | 1018 | __u32 pad; |
830 | } hypercall; | 1019 | } hypercall; |
831 | 1020 | ||
832 | Unused. | 1021 | Unused. This was once used for 'hypercall to userspace'. To implement |
1022 | such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390). | ||
1023 | Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO. | ||
833 | 1024 | ||
834 | /* KVM_EXIT_TPR_ACCESS */ | 1025 | /* KVM_EXIT_TPR_ACCESS */ |
835 | struct { | 1026 | struct { |
@@ -870,6 +1061,19 @@ s390 specific. | |||
870 | 1061 | ||
871 | powerpc specific. | 1062 | powerpc specific. |
872 | 1063 | ||
1064 | /* KVM_EXIT_OSI */ | ||
1065 | struct { | ||
1066 | __u64 gprs[32]; | ||
1067 | } osi; | ||
1068 | |||
1069 | MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch | ||
1070 | hypercalls and exit with this exit struct that contains all the guest gprs. | ||
1071 | |||
1072 | If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall. | ||
1073 | Userspace can now handle the hypercall and when it's done modify the gprs as | ||
1074 | necessary. Upon guest entry all guest GPRs will then be replaced by the values | ||
1075 | in this struct. | ||
1076 | |||
873 | /* Fix the size of the union. */ | 1077 | /* Fix the size of the union. */ |
874 | char padding[256]; | 1078 | char padding[256]; |
875 | }; | 1079 | }; |
diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt new file mode 100644 index 000000000000..14a12ea92b7f --- /dev/null +++ b/Documentation/kvm/cpuid.txt | |||
@@ -0,0 +1,42 @@ | |||
1 | KVM CPUID bits | ||
2 | Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010 | ||
3 | ===================================================== | ||
4 | |||
5 | A guest running on a kvm host, can check some of its features using | ||
6 | cpuid. This is not always guaranteed to work, since userspace can | ||
7 | mask-out some, or even all KVM-related cpuid features before launching | ||
8 | a guest. | ||
9 | |||
10 | KVM cpuid functions are: | ||
11 | |||
12 | function: KVM_CPUID_SIGNATURE (0x40000000) | ||
13 | returns : eax = 0, | ||
14 | ebx = 0x4b4d564b, | ||
15 | ecx = 0x564b4d56, | ||
16 | edx = 0x4d. | ||
17 | Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM". | ||
18 | This function queries the presence of KVM cpuid leafs. | ||
19 | |||
20 | |||
21 | function: define KVM_CPUID_FEATURES (0x40000001) | ||
22 | returns : ebx, ecx, edx = 0 | ||
23 | eax = and OR'ed group of (1 << flag), where each flags is: | ||
24 | |||
25 | |||
26 | flag || value || meaning | ||
27 | ============================================================================= | ||
28 | KVM_FEATURE_CLOCKSOURCE || 0 || kvmclock available at msrs | ||
29 | || || 0x11 and 0x12. | ||
30 | ------------------------------------------------------------------------------ | ||
31 | KVM_FEATURE_NOP_IO_DELAY || 1 || not necessary to perform delays | ||
32 | || || on PIO operations. | ||
33 | ------------------------------------------------------------------------------ | ||
34 | KVM_FEATURE_MMU_OP || 2 || deprecated. | ||
35 | ------------------------------------------------------------------------------ | ||
36 | KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs | ||
37 | || || 0x4b564d00 and 0x4b564d01 | ||
38 | ------------------------------------------------------------------------------ | ||
39 | KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side | ||
40 | || || per-cpu warps are expected in | ||
41 | || || kvmclock. | ||
42 | ------------------------------------------------------------------------------ | ||
diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt new file mode 100644 index 000000000000..aaed6ab9d7ab --- /dev/null +++ b/Documentation/kvm/mmu.txt | |||
@@ -0,0 +1,304 @@ | |||
1 | The x86 kvm shadow mmu | ||
2 | ====================== | ||
3 | |||
4 | The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible | ||
5 | for presenting a standard x86 mmu to the guest, while translating guest | ||
6 | physical addresses to host physical addresses. | ||
7 | |||
8 | The mmu code attempts to satisfy the following requirements: | ||
9 | |||
10 | - correctness: the guest should not be able to determine that it is running | ||
11 | on an emulated mmu except for timing (we attempt to comply | ||
12 | with the specification, not emulate the characteristics of | ||
13 | a particular implementation such as tlb size) | ||
14 | - security: the guest must not be able to touch host memory not assigned | ||
15 | to it | ||
16 | - performance: minimize the performance penalty imposed by the mmu | ||
17 | - scaling: need to scale to large memory and large vcpu guests | ||
18 | - hardware: support the full range of x86 virtualization hardware | ||
19 | - integration: Linux memory management code must be in control of guest memory | ||
20 | so that swapping, page migration, page merging, transparent | ||
21 | hugepages, and similar features work without change | ||
22 | - dirty tracking: report writes to guest memory to enable live migration | ||
23 | and framebuffer-based displays | ||
24 | - footprint: keep the amount of pinned kernel memory low (most memory | ||
25 | should be shrinkable) | ||
26 | - reliablity: avoid multipage or GFP_ATOMIC allocations | ||
27 | |||
28 | Acronyms | ||
29 | ======== | ||
30 | |||
31 | pfn host page frame number | ||
32 | hpa host physical address | ||
33 | hva host virtual address | ||
34 | gfn guest frame number | ||
35 | gpa guest physical address | ||
36 | gva guest virtual address | ||
37 | ngpa nested guest physical address | ||
38 | ngva nested guest virtual address | ||
39 | pte page table entry (used also to refer generically to paging structure | ||
40 | entries) | ||
41 | gpte guest pte (referring to gfns) | ||
42 | spte shadow pte (referring to pfns) | ||
43 | tdp two dimensional paging (vendor neutral term for NPT and EPT) | ||
44 | |||
45 | Virtual and real hardware supported | ||
46 | =================================== | ||
47 | |||
48 | The mmu supports first-generation mmu hardware, which allows an atomic switch | ||
49 | of the current paging mode and cr3 during guest entry, as well as | ||
50 | two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware | ||
51 | it exposes is the traditional 2/3/4 level x86 mmu, with support for global | ||
52 | pages, pae, pse, pse36, cr0.wp, and 1GB pages. Work is in progress to support | ||
53 | exposing NPT capable hardware on NPT capable hosts. | ||
54 | |||
55 | Translation | ||
56 | =========== | ||
57 | |||
58 | The primary job of the mmu is to program the processor's mmu to translate | ||
59 | addresses for the guest. Different translations are required at different | ||
60 | times: | ||
61 | |||
62 | - when guest paging is disabled, we translate guest physical addresses to | ||
63 | host physical addresses (gpa->hpa) | ||
64 | - when guest paging is enabled, we translate guest virtual addresses, to | ||
65 | guest physical addresses, to host physical addresses (gva->gpa->hpa) | ||
66 | - when the guest launches a guest of its own, we translate nested guest | ||
67 | virtual addresses, to nested guest physical addresses, to guest physical | ||
68 | addresses, to host physical addresses (ngva->ngpa->gpa->hpa) | ||
69 | |||
70 | The primary challenge is to encode between 1 and 3 translations into hardware | ||
71 | that support only 1 (traditional) and 2 (tdp) translations. When the | ||
72 | number of required translations matches the hardware, the mmu operates in | ||
73 | direct mode; otherwise it operates in shadow mode (see below). | ||
74 | |||
75 | Memory | ||
76 | ====== | ||
77 | |||
78 | Guest memory (gpa) is part of the user address space of the process that is | ||
79 | using kvm. Userspace defines the translation between guest addresses and user | ||
80 | addresses (gpa->hva); note that two gpas may alias to the same gva, but not | ||
81 | vice versa. | ||
82 | |||
83 | These gvas may be backed using any method available to the host: anonymous | ||
84 | memory, file backed memory, and device memory. Memory might be paged by the | ||
85 | host at any time. | ||
86 | |||
87 | Events | ||
88 | ====== | ||
89 | |||
90 | The mmu is driven by events, some from the guest, some from the host. | ||
91 | |||
92 | Guest generated events: | ||
93 | - writes to control registers (especially cr3) | ||
94 | - invlpg/invlpga instruction execution | ||
95 | - access to missing or protected translations | ||
96 | |||
97 | Host generated events: | ||
98 | - changes in the gpa->hpa translation (either through gpa->hva changes or | ||
99 | through hva->hpa changes) | ||
100 | - memory pressure (the shrinker) | ||
101 | |||
102 | Shadow pages | ||
103 | ============ | ||
104 | |||
105 | The principal data structure is the shadow page, 'struct kvm_mmu_page'. A | ||
106 | shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A | ||
107 | shadow page may contain a mix of leaf and nonleaf sptes. | ||
108 | |||
109 | A nonleaf spte allows the hardware mmu to reach the leaf pages and | ||
110 | is not related to a translation directly. It points to other shadow pages. | ||
111 | |||
112 | A leaf spte corresponds to either one or two translations encoded into | ||
113 | one paging structure entry. These are always the lowest level of the | ||
114 | translation stack, with optional higher level translations left to NPT/EPT. | ||
115 | Leaf ptes point at guest pages. | ||
116 | |||
117 | The following table shows translations encoded by leaf ptes, with higher-level | ||
118 | translations in parentheses: | ||
119 | |||
120 | Non-nested guests: | ||
121 | nonpaging: gpa->hpa | ||
122 | paging: gva->gpa->hpa | ||
123 | paging, tdp: (gva->)gpa->hpa | ||
124 | Nested guests: | ||
125 | non-tdp: ngva->gpa->hpa (*) | ||
126 | tdp: (ngva->)ngpa->gpa->hpa | ||
127 | |||
128 | (*) the guest hypervisor will encode the ngva->gpa translation into its page | ||
129 | tables if npt is not present | ||
130 | |||
131 | Shadow pages contain the following information: | ||
132 | role.level: | ||
133 | The level in the shadow paging hierarchy that this shadow page belongs to. | ||
134 | 1=4k sptes, 2=2M sptes, 3=1G sptes, etc. | ||
135 | role.direct: | ||
136 | If set, leaf sptes reachable from this page are for a linear range. | ||
137 | Examples include real mode translation, large guest pages backed by small | ||
138 | host pages, and gpa->hpa translations when NPT or EPT is active. | ||
139 | The linear range starts at (gfn << PAGE_SHIFT) and its size is determined | ||
140 | by role.level (2MB for first level, 1GB for second level, 0.5TB for third | ||
141 | level, 256TB for fourth level) | ||
142 | If clear, this page corresponds to a guest page table denoted by the gfn | ||
143 | field. | ||
144 | role.quadrant: | ||
145 | When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit | ||
146 | sptes. That means a guest page table contains more ptes than the host, | ||
147 | so multiple shadow pages are needed to shadow one guest page. | ||
148 | For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the | ||
149 | first or second 512-gpte block in the guest page table. For second-level | ||
150 | page tables, each 32-bit gpte is converted to two 64-bit sptes | ||
151 | (since each first-level guest page is shadowed by two first-level | ||
152 | shadow pages) so role.quadrant takes values in the range 0..3. Each | ||
153 | quadrant maps 1GB virtual address space. | ||
154 | role.access: | ||
155 | Inherited guest access permissions in the form uwx. Note execute | ||
156 | permission is positive, not negative. | ||
157 | role.invalid: | ||
158 | The page is invalid and should not be used. It is a root page that is | ||
159 | currently pinned (by a cpu hardware register pointing to it); once it is | ||
160 | unpinned it will be destroyed. | ||
161 | role.cr4_pae: | ||
162 | Contains the value of cr4.pae for which the page is valid (e.g. whether | ||
163 | 32-bit or 64-bit gptes are in use). | ||
164 | role.cr4_nxe: | ||
165 | Contains the value of efer.nxe for which the page is valid. | ||
166 | role.cr0_wp: | ||
167 | Contains the value of cr0.wp for which the page is valid. | ||
168 | gfn: | ||
169 | Either the guest page table containing the translations shadowed by this | ||
170 | page, or the base page frame for linear translations. See role.direct. | ||
171 | spt: | ||
172 | A pageful of 64-bit sptes containing the translations for this page. | ||
173 | Accessed by both kvm and hardware. | ||
174 | The page pointed to by spt will have its page->private pointing back | ||
175 | at the shadow page structure. | ||
176 | sptes in spt point either at guest pages, or at lower-level shadow pages. | ||
177 | Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point | ||
178 | at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte. | ||
179 | The spt array forms a DAG structure with the shadow page as a node, and | ||
180 | guest pages as leaves. | ||
181 | gfns: | ||
182 | An array of 512 guest frame numbers, one for each present pte. Used to | ||
183 | perform a reverse map from a pte to a gfn. | ||
184 | slot_bitmap: | ||
185 | A bitmap containing one bit per memory slot. If the page contains a pte | ||
186 | mapping a page from memory slot n, then bit n of slot_bitmap will be set | ||
187 | (if a page is aliased among several slots, then it is not guaranteed that | ||
188 | all slots will be marked). | ||
189 | Used during dirty logging to avoid scanning a shadow page if none if its | ||
190 | pages need tracking. | ||
191 | root_count: | ||
192 | A counter keeping track of how many hardware registers (guest cr3 or | ||
193 | pdptrs) are now pointing at the page. While this counter is nonzero, the | ||
194 | page cannot be destroyed. See role.invalid. | ||
195 | multimapped: | ||
196 | Whether there exist multiple sptes pointing at this page. | ||
197 | parent_pte/parent_ptes: | ||
198 | If multimapped is zero, parent_pte points at the single spte that points at | ||
199 | this page's spt. Otherwise, parent_ptes points at a data structure | ||
200 | with a list of parent_ptes. | ||
201 | unsync: | ||
202 | If true, then the translations in this page may not match the guest's | ||
203 | translation. This is equivalent to the state of the tlb when a pte is | ||
204 | changed but before the tlb entry is flushed. Accordingly, unsync ptes | ||
205 | are synchronized when the guest executes invlpg or flushes its tlb by | ||
206 | other means. Valid for leaf pages. | ||
207 | unsync_children: | ||
208 | How many sptes in the page point at pages that are unsync (or have | ||
209 | unsynchronized children). | ||
210 | unsync_child_bitmap: | ||
211 | A bitmap indicating which sptes in spt point (directly or indirectly) at | ||
212 | pages that may be unsynchronized. Used to quickly locate all unsychronized | ||
213 | pages reachable from a given page. | ||
214 | |||
215 | Reverse map | ||
216 | =========== | ||
217 | |||
218 | The mmu maintains a reverse mapping whereby all ptes mapping a page can be | ||
219 | reached given its gfn. This is used, for example, when swapping out a page. | ||
220 | |||
221 | Synchronized and unsynchronized pages | ||
222 | ===================================== | ||
223 | |||
224 | The guest uses two events to synchronize its tlb and page tables: tlb flushes | ||
225 | and page invalidations (invlpg). | ||
226 | |||
227 | A tlb flush means that we need to synchronize all sptes reachable from the | ||
228 | guest's cr3. This is expensive, so we keep all guest page tables write | ||
229 | protected, and synchronize sptes to gptes when a gpte is written. | ||
230 | |||
231 | A special case is when a guest page table is reachable from the current | ||
232 | guest cr3. In this case, the guest is obliged to issue an invlpg instruction | ||
233 | before using the translation. We take advantage of that by removing write | ||
234 | protection from the guest page, and allowing the guest to modify it freely. | ||
235 | We synchronize modified gptes when the guest invokes invlpg. This reduces | ||
236 | the amount of emulation we have to do when the guest modifies multiple gptes, | ||
237 | or when the a guest page is no longer used as a page table and is used for | ||
238 | random guest data. | ||
239 | |||
240 | As a side effect we have to resynchronize all reachable unsynchronized shadow | ||
241 | pages on a tlb flush. | ||
242 | |||
243 | |||
244 | Reaction to events | ||
245 | ================== | ||
246 | |||
247 | - guest page fault (or npt page fault, or ept violation) | ||
248 | |||
249 | This is the most complicated event. The cause of a page fault can be: | ||
250 | |||
251 | - a true guest fault (the guest translation won't allow the access) (*) | ||
252 | - access to a missing translation | ||
253 | - access to a protected translation | ||
254 | - when logging dirty pages, memory is write protected | ||
255 | - synchronized shadow pages are write protected (*) | ||
256 | - access to untranslatable memory (mmio) | ||
257 | |||
258 | (*) not applicable in direct mode | ||
259 | |||
260 | Handling a page fault is performed as follows: | ||
261 | |||
262 | - if needed, walk the guest page tables to determine the guest translation | ||
263 | (gva->gpa or ngpa->gpa) | ||
264 | - if permissions are insufficient, reflect the fault back to the guest | ||
265 | - determine the host page | ||
266 | - if this is an mmio request, there is no host page; call the emulator | ||
267 | to emulate the instruction instead | ||
268 | - walk the shadow page table to find the spte for the translation, | ||
269 | instantiating missing intermediate page tables as necessary | ||
270 | - try to unsynchronize the page | ||
271 | - if successful, we can let the guest continue and modify the gpte | ||
272 | - emulate the instruction | ||
273 | - if failed, unshadow the page and let the guest continue | ||
274 | - update any translations that were modified by the instruction | ||
275 | |||
276 | invlpg handling: | ||
277 | |||
278 | - walk the shadow page hierarchy and drop affected translations | ||
279 | - try to reinstantiate the indicated translation in the hope that the | ||
280 | guest will use it in the near future | ||
281 | |||
282 | Guest control register updates: | ||
283 | |||
284 | - mov to cr3 | ||
285 | - look up new shadow roots | ||
286 | - synchronize newly reachable shadow pages | ||
287 | |||
288 | - mov to cr0/cr4/efer | ||
289 | - set up mmu context for new paging mode | ||
290 | - look up new shadow roots | ||
291 | - synchronize newly reachable shadow pages | ||
292 | |||
293 | Host translation updates: | ||
294 | |||
295 | - mmu notifier called with updated hva | ||
296 | - look up affected sptes through reverse map | ||
297 | - drop (or update) translations | ||
298 | |||
299 | Further reading | ||
300 | =============== | ||
301 | |||
302 | - NPT presentation from KVM Forum 2008 | ||
303 | http://www.linux-kvm.org/wiki/images/c/c8/KvmForum2008%24kdf2008_21.pdf | ||
304 | |||