diff options
Diffstat (limited to 'Documentation/virtual/kvm/msr.txt')
-rw-r--r-- | Documentation/virtual/kvm/msr.txt | 187 |
1 files changed, 187 insertions, 0 deletions
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt new file mode 100644 index 000000000000..d079aed27e03 --- /dev/null +++ b/Documentation/virtual/kvm/msr.txt | |||
@@ -0,0 +1,187 @@ | |||
1 | KVM-specific MSRs. | ||
2 | Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010 | ||
3 | ===================================================== | ||
4 | |||
5 | KVM makes use of some custom MSRs to service some requests. | ||
6 | |||
7 | Custom MSRs have a range reserved for them, that goes from | ||
8 | 0x4b564d00 to 0x4b564dff. There are MSRs outside this area, | ||
9 | but they are deprecated and their use is discouraged. | ||
10 | |||
11 | Custom MSR list | ||
12 | -------- | ||
13 | |||
14 | The currently supported custom MSR list is: | ||
15 | |||
16 | MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 | ||
17 | |||
18 | data: 4-byte aligned physical address of a memory area which must be | ||
19 | in guest RAM. This memory is expected to hold a copy of the following | ||
20 | structure: | ||
21 | |||
22 | struct pvclock_wall_clock { | ||
23 | u32 version; | ||
24 | u32 sec; | ||
25 | u32 nsec; | ||
26 | } __attribute__((__packed__)); | ||
27 | |||
28 | whose data will be filled in by the hypervisor. The hypervisor is only | ||
29 | guaranteed to update this data at the moment of MSR write. | ||
30 | Users that want to reliably query this information more than once have | ||
31 | to write more than once to this MSR. Fields have the following meanings: | ||
32 | |||
33 | version: guest has to check version before and after grabbing | ||
34 | time information and check that they are both equal and even. | ||
35 | An odd version indicates an in-progress update. | ||
36 | |||
37 | sec: number of seconds for wallclock. | ||
38 | |||
39 | nsec: number of nanoseconds for wallclock. | ||
40 | |||
41 | Note that although MSRs are per-CPU entities, the effect of this | ||
42 | particular MSR is global. | ||
43 | |||
44 | Availability of this MSR must be checked via bit 3 in 0x40000001 cpuid | ||
45 | leaf prior to usage. | ||
46 | |||
47 | MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 | ||
48 | |||
49 | data: 4-byte aligned physical address of a memory area which must be in | ||
50 | guest RAM, plus an enable bit in bit 0. This memory is expected to hold | ||
51 | a copy of the following structure: | ||
52 | |||
53 | struct pvclock_vcpu_time_info { | ||
54 | u32 version; | ||
55 | u32 pad0; | ||
56 | u64 tsc_timestamp; | ||
57 | u64 system_time; | ||
58 | u32 tsc_to_system_mul; | ||
59 | s8 tsc_shift; | ||
60 | u8 flags; | ||
61 | u8 pad[2]; | ||
62 | } __attribute__((__packed__)); /* 32 bytes */ | ||
63 | |||
64 | whose data will be filled in by the hypervisor periodically. Only one | ||
65 | write, or registration, is needed for each VCPU. The interval between | ||
66 | updates of this structure is arbitrary and implementation-dependent. | ||
67 | The hypervisor may update this structure at any time it sees fit until | ||
68 | a value with bit 0 == 0 is written to this MSR. | ||
69 | |||
70 | Fields have the following meanings: | ||
71 | |||
72 | version: guest has to check version before and after grabbing | ||
73 | time information and check that they are both equal and even. | ||
74 | An odd version indicates an in-progress update. | ||
75 | |||
76 | tsc_timestamp: the tsc value at the current VCPU at the time | ||
77 | of the update of this structure. Guests can subtract this value | ||
78 | from current tsc to derive a notion of elapsed time since the | ||
79 | structure update. | ||
80 | |||
81 | system_time: a host notion of monotonic time, including sleep | ||
82 | time at the time this structure was last updated. Unit is | ||
83 | nanoseconds. | ||
84 | |||
85 | tsc_to_system_mul: a function of the tsc frequency. One has | ||
86 | to multiply any tsc-related quantity by this value to get | ||
87 | a value in nanoseconds, besides dividing by 2^tsc_shift | ||
88 | |||
89 | tsc_shift: cycle to nanosecond divider, as a power of two, to | ||
90 | allow for shift rights. One has to shift right any tsc-related | ||
91 | quantity by this value to get a value in nanoseconds, besides | ||
92 | multiplying by tsc_to_system_mul. | ||
93 | |||
94 | With this information, guests can derive per-CPU time by | ||
95 | doing: | ||
96 | |||
97 | time = (current_tsc - tsc_timestamp) | ||
98 | time = (time * tsc_to_system_mul) >> tsc_shift | ||
99 | time = time + system_time | ||
100 | |||
101 | flags: bits in this field indicate extended capabilities | ||
102 | coordinated between the guest and the hypervisor. Availability | ||
103 | of specific flags has to be checked in 0x40000001 cpuid leaf. | ||
104 | Current flags are: | ||
105 | |||
106 | flag bit | cpuid bit | meaning | ||
107 | ------------------------------------------------------------- | ||
108 | | | time measures taken across | ||
109 | 0 | 24 | multiple cpus are guaranteed to | ||
110 | | | be monotonic | ||
111 | ------------------------------------------------------------- | ||
112 | |||
113 | Availability of this MSR must be checked via bit 3 in 0x40000001 cpuid | ||
114 | leaf prior to usage. | ||
115 | |||
116 | |||
117 | MSR_KVM_WALL_CLOCK: 0x11 | ||
118 | |||
119 | data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead. | ||
120 | |||
121 | This MSR falls outside the reserved KVM range and may be removed in the | ||
122 | future. Its usage is deprecated. | ||
123 | |||
124 | Availability of this MSR must be checked via bit 0 in 0x40000001 cpuid | ||
125 | leaf prior to usage. | ||
126 | |||
127 | MSR_KVM_SYSTEM_TIME: 0x12 | ||
128 | |||
129 | data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead. | ||
130 | |||
131 | This MSR falls outside the reserved KVM range and may be removed in the | ||
132 | future. Its usage is deprecated. | ||
133 | |||
134 | Availability of this MSR must be checked via bit 0 in 0x40000001 cpuid | ||
135 | leaf prior to usage. | ||
136 | |||
137 | The suggested algorithm for detecting kvmclock presence is then: | ||
138 | |||
139 | if (!kvm_para_available()) /* refer to cpuid.txt */ | ||
140 | return NON_PRESENT; | ||
141 | |||
142 | flags = cpuid_eax(0x40000001); | ||
143 | if (flags & (1 << 3)) { | ||
144 | msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; | ||
145 | msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; | ||
146 | return PRESENT; | ||
147 | } else if (flags & (1 << 0)) { | ||
148 | msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; | ||
149 | msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; | ||
150 | return PRESENT; | ||
151 | } else | ||
152 | return NON_PRESENT; | ||
153 | |||
154 | MSR_KVM_ASYNC_PF_EN: 0x4b564d02 | ||
155 | data: Bits 63-6 hold the 64-byte aligned physical address of a | ||
156 | 64-byte memory area which must be in guest RAM and must be | ||
157 | zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1 | ||
158 | when asynchronous page faults are enabled on the vcpu, 0 when | ||
159 | disabled. Bit 1 is 1 if asynchronous page faults can be injected | ||
160 | when the vcpu is in cpl == 0. | ||
161 | |||
162 | The first 4 bytes of the 64-byte memory location will be written | ||
163 | by the hypervisor at the time of asynchronous page fault (APF) | ||
164 | injection to indicate the type of asynchronous page fault. A value | ||
165 | of 1 means that the page referred to by the page fault is not | ||
166 | present. A value of 2 means that the page is now available. Disabling | ||
167 | interrupts inhibits APFs. The guest must not enable interrupts | ||
168 | before the reason is read, or it may be overwritten by another | ||
169 | APF. Since APF uses the same exception vector as a regular page | ||
170 | fault, the guest must reset the reason to 0 before it does | ||
171 | something that can generate a normal page fault. If during a page | ||
172 | fault the APF reason is 0, it means that this is a regular page | ||
173 | fault. | ||
174 | |||
175 | During delivery of type 1 APF cr2 contains a token that will | ||
176 | be used to notify a guest when missing page becomes | ||
177 | available. When page becomes available type 2 APF is sent with | ||
178 | cr2 set to the token associated with the page. There is a special | ||
179 | kind of token, 0xffffffff, which tells the vcpu that it should wake | ||
180 | up all processes waiting for APFs and no individual type 2 APFs | ||
181 | will be sent. | ||
182 | |||
183 | If APF is disabled while there are outstanding APFs, they will | ||
184 | not be delivered. | ||
185 | |||
186 | Currently type 2 APF will always be delivered on the same vcpu as | ||
187 | type 1 was, but guest should not rely on that. | ||