diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-24 12:07:03 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-24 12:07:03 -0400 |
commit | 5fabc487c96819dd12ddb9414835d170fd9cd6d5 (patch) | |
tree | 01532d492e5074b0d3add29bf92ebf9a9d161e9e /Documentation | |
parent | c61264f98c1a974ee6f545f61a4ab33b141d6bda (diff) | |
parent | 3f68b0318bbbd61bf08478ab99a149f0d9e5156e (diff) |
Merge branch 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits)
KVM: IOMMU: Disable device assignment without interrupt remapping
KVM: MMU: trace mmio page fault
KVM: MMU: mmio page fault support
KVM: MMU: reorganize struct kvm_shadow_walk_iterator
KVM: MMU: lockless walking shadow page table
KVM: MMU: do not need atomicly to set/clear spte
KVM: MMU: introduce the rules to modify shadow page table
KVM: MMU: abstract some functions to handle fault pfn
KVM: MMU: filter out the mmio pfn from the fault pfn
KVM: MMU: remove bypass_guest_pf
KVM: MMU: split kvm_mmu_free_page
KVM: MMU: count used shadow pages on prepareing path
KVM: MMU: rename 'pt_write' to 'emulate'
KVM: MMU: cleanup for FNAME(fetch)
KVM: MMU: optimize to handle dirty bit
KVM: MMU: cache mmio info on page fault path
KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup the code
KVM: MMU: do not update slot bitmap if spte is nonpresent
KVM: MMU: fix walking shadow page table
KVM guest: KVM Steal time registration
...
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/kernel-parameters.txt | 8 | ||||
-rw-r--r-- | Documentation/virtual/kvm/api.txt | 172 | ||||
-rw-r--r-- | Documentation/virtual/kvm/mmu.txt | 18 | ||||
-rw-r--r-- | Documentation/virtual/kvm/msr.txt | 34 | ||||
-rw-r--r-- | Documentation/virtual/kvm/nested-vmx.txt | 251 | ||||
-rw-r--r-- | Documentation/virtual/kvm/ppc-pv.txt | 8 |
6 files changed, 476 insertions, 15 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index aa47be71df4c..40cc653984ee 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1159,10 +1159,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
1159 | for all guests. | 1159 | for all guests. |
1160 | Default is 1 (enabled) if in 64bit or 32bit-PAE mode | 1160 | Default is 1 (enabled) if in 64bit or 32bit-PAE mode |
1161 | 1161 | ||
1162 | kvm-intel.bypass_guest_pf= | ||
1163 | [KVM,Intel] Disables bypassing of guest page faults | ||
1164 | on Intel chips. Default is 1 (enabled) | ||
1165 | |||
1166 | kvm-intel.ept= [KVM,Intel] Disable extended page tables | 1162 | kvm-intel.ept= [KVM,Intel] Disable extended page tables |
1167 | (virtualized MMU) support on capable Intel chips. | 1163 | (virtualized MMU) support on capable Intel chips. |
1168 | Default is 1 (enabled) | 1164 | Default is 1 (enabled) |
@@ -1737,6 +1733,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
1737 | no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page | 1733 | no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page |
1738 | fault handling. | 1734 | fault handling. |
1739 | 1735 | ||
1736 | no-steal-acc [X86,KVM] Disable paravirtualized steal time accounting. | ||
1737 | Steal time is still computed, but won't influence scheduler | ||
1738 | behaviour. | ||
1739 | |||
1740 | nolapic [X86-32,APIC] Do not enable or use the local APIC. | 1740 | nolapic [X86-32,APIC] Do not enable or use the local APIC. |
1741 | 1741 | ||
1742 | nolapic_timer [X86-32,APIC] Do not use the local APIC timer. | 1742 | nolapic_timer [X86-32,APIC] Do not use the local APIC timer. |
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 42542eb802ca..b0e4b9cd6a66 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt | |||
@@ -180,6 +180,19 @@ KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time. | |||
180 | If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 | 180 | If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 |
181 | cpus max. | 181 | cpus max. |
182 | 182 | ||
183 | On powerpc using book3s_hv mode, the vcpus are mapped onto virtual | ||
184 | threads in one or more virtual CPU cores. (This is because the | ||
185 | hardware requires all the hardware threads in a CPU core to be in the | ||
186 | same partition.) The KVM_CAP_PPC_SMT capability indicates the number | ||
187 | of vcpus per virtual core (vcore). The vcore id is obtained by | ||
188 | dividing the vcpu id by the number of vcpus per vcore. The vcpus in a | ||
189 | given vcore will always be in the same physical core as each other | ||
190 | (though that might be a different physical core from time to time). | ||
191 | Userspace can control the threading (SMT) mode of the guest by its | ||
192 | allocation of vcpu ids. For example, if userspace wants | ||
193 | single-threaded guest vcpus, it should make all vcpu ids be a multiple | ||
194 | of the number of vcpus per vcore. | ||
195 | |||
183 | 4.8 KVM_GET_DIRTY_LOG (vm ioctl) | 196 | 4.8 KVM_GET_DIRTY_LOG (vm ioctl) |
184 | 197 | ||
185 | Capability: basic | 198 | Capability: basic |
@@ -1143,15 +1156,10 @@ Assigns an IRQ to a passed-through device. | |||
1143 | 1156 | ||
1144 | struct kvm_assigned_irq { | 1157 | struct kvm_assigned_irq { |
1145 | __u32 assigned_dev_id; | 1158 | __u32 assigned_dev_id; |
1146 | __u32 host_irq; | 1159 | __u32 host_irq; /* ignored (legacy field) */ |
1147 | __u32 guest_irq; | 1160 | __u32 guest_irq; |
1148 | __u32 flags; | 1161 | __u32 flags; |
1149 | union { | 1162 | union { |
1150 | struct { | ||
1151 | __u32 addr_lo; | ||
1152 | __u32 addr_hi; | ||
1153 | __u32 data; | ||
1154 | } guest_msi; | ||
1155 | __u32 reserved[12]; | 1163 | __u32 reserved[12]; |
1156 | }; | 1164 | }; |
1157 | }; | 1165 | }; |
@@ -1239,8 +1247,10 @@ Type: vm ioctl | |||
1239 | Parameters: struct kvm_assigned_msix_nr (in) | 1247 | Parameters: struct kvm_assigned_msix_nr (in) |
1240 | Returns: 0 on success, -1 on error | 1248 | Returns: 0 on success, -1 on error |
1241 | 1249 | ||
1242 | Set the number of MSI-X interrupts for an assigned device. This service can | 1250 | Set the number of MSI-X interrupts for an assigned device. The number is |
1243 | only be called once in the lifetime of an assigned device. | 1251 | reset again by terminating the MSI-X assignment of the device via |
1252 | KVM_DEASSIGN_DEV_IRQ. Calling this service more than once at any earlier | ||
1253 | point will fail. | ||
1244 | 1254 | ||
1245 | struct kvm_assigned_msix_nr { | 1255 | struct kvm_assigned_msix_nr { |
1246 | __u32 assigned_dev_id; | 1256 | __u32 assigned_dev_id; |
@@ -1291,6 +1301,135 @@ Returns the tsc frequency of the guest. The unit of the return value is | |||
1291 | KHz. If the host has unstable tsc this ioctl returns -EIO instead as an | 1301 | KHz. If the host has unstable tsc this ioctl returns -EIO instead as an |
1292 | error. | 1302 | error. |
1293 | 1303 | ||
1304 | 4.56 KVM_GET_LAPIC | ||
1305 | |||
1306 | Capability: KVM_CAP_IRQCHIP | ||
1307 | Architectures: x86 | ||
1308 | Type: vcpu ioctl | ||
1309 | Parameters: struct kvm_lapic_state (out) | ||
1310 | Returns: 0 on success, -1 on error | ||
1311 | |||
1312 | #define KVM_APIC_REG_SIZE 0x400 | ||
1313 | struct kvm_lapic_state { | ||
1314 | char regs[KVM_APIC_REG_SIZE]; | ||
1315 | }; | ||
1316 | |||
1317 | Reads the Local APIC registers and copies them into the input argument. The | ||
1318 | data format and layout are the same as documented in the architecture manual. | ||
1319 | |||
1320 | 4.57 KVM_SET_LAPIC | ||
1321 | |||
1322 | Capability: KVM_CAP_IRQCHIP | ||
1323 | Architectures: x86 | ||
1324 | Type: vcpu ioctl | ||
1325 | Parameters: struct kvm_lapic_state (in) | ||
1326 | Returns: 0 on success, -1 on error | ||
1327 | |||
1328 | #define KVM_APIC_REG_SIZE 0x400 | ||
1329 | struct kvm_lapic_state { | ||
1330 | char regs[KVM_APIC_REG_SIZE]; | ||
1331 | }; | ||
1332 | |||
1333 | Copies the input argument into the Local APIC registers. The data format | ||
1334 | and layout are the same as documented in the architecture manual. | ||
1335 | |||
1336 | 4.58 KVM_IOEVENTFD | ||
1337 | |||
1338 | Capability: KVM_CAP_IOEVENTFD | ||
1339 | Architectures: all | ||
1340 | Type: vm ioctl | ||
1341 | Parameters: struct kvm_ioeventfd (in) | ||
1342 | Returns: 0 on success, !0 on error | ||
1343 | |||
1344 | This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address | ||
1345 | within the guest. A guest write in the registered address will signal the | ||
1346 | provided event instead of triggering an exit. | ||
1347 | |||
1348 | struct kvm_ioeventfd { | ||
1349 | __u64 datamatch; | ||
1350 | __u64 addr; /* legal pio/mmio address */ | ||
1351 | __u32 len; /* 1, 2, 4, or 8 bytes */ | ||
1352 | __s32 fd; | ||
1353 | __u32 flags; | ||
1354 | __u8 pad[36]; | ||
1355 | }; | ||
1356 | |||
1357 | The following flags are defined: | ||
1358 | |||
1359 | #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) | ||
1360 | #define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) | ||
1361 | #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) | ||
1362 | |||
1363 | If datamatch flag is set, the event will be signaled only if the written value | ||
1364 | to the registered address is equal to datamatch in struct kvm_ioeventfd. | ||
1365 | |||
1366 | 4.62 KVM_CREATE_SPAPR_TCE | ||
1367 | |||
1368 | Capability: KVM_CAP_SPAPR_TCE | ||
1369 | Architectures: powerpc | ||
1370 | Type: vm ioctl | ||
1371 | Parameters: struct kvm_create_spapr_tce (in) | ||
1372 | Returns: file descriptor for manipulating the created TCE table | ||
1373 | |||
1374 | This creates a virtual TCE (translation control entry) table, which | ||
1375 | is an IOMMU for PAPR-style virtual I/O. It is used to translate | ||
1376 | logical addresses used in virtual I/O into guest physical addresses, | ||
1377 | and provides a scatter/gather capability for PAPR virtual I/O. | ||
1378 | |||
1379 | /* for KVM_CAP_SPAPR_TCE */ | ||
1380 | struct kvm_create_spapr_tce { | ||
1381 | __u64 liobn; | ||
1382 | __u32 window_size; | ||
1383 | }; | ||
1384 | |||
1385 | The liobn field gives the logical IO bus number for which to create a | ||
1386 | TCE table. The window_size field specifies the size of the DMA window | ||
1387 | which this TCE table will translate - the table will contain one 64 | ||
1388 | bit TCE entry for every 4kiB of the DMA window. | ||
1389 | |||
1390 | When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE | ||
1391 | table has been created using this ioctl(), the kernel will handle it | ||
1392 | in real mode, updating the TCE table. H_PUT_TCE calls for other | ||
1393 | liobns will cause a vm exit and must be handled by userspace. | ||
1394 | |||
1395 | The return value is a file descriptor which can be passed to mmap(2) | ||
1396 | to map the created TCE table into userspace. This lets userspace read | ||
1397 | the entries written by kernel-handled H_PUT_TCE calls, and also lets | ||
1398 | userspace update the TCE table directly which is useful in some | ||
1399 | circumstances. | ||
1400 | |||
1401 | 4.63 KVM_ALLOCATE_RMA | ||
1402 | |||
1403 | Capability: KVM_CAP_PPC_RMA | ||
1404 | Architectures: powerpc | ||
1405 | Type: vm ioctl | ||
1406 | Parameters: struct kvm_allocate_rma (out) | ||
1407 | Returns: file descriptor for mapping the allocated RMA | ||
1408 | |||
1409 | This allocates a Real Mode Area (RMA) from the pool allocated at boot | ||
1410 | time by the kernel. An RMA is a physically-contiguous, aligned region | ||
1411 | of memory used on older POWER processors to provide the memory which | ||
1412 | will be accessed by real-mode (MMU off) accesses in a KVM guest. | ||
1413 | POWER processors support a set of sizes for the RMA that usually | ||
1414 | includes 64MB, 128MB, 256MB and some larger powers of two. | ||
1415 | |||
1416 | /* for KVM_ALLOCATE_RMA */ | ||
1417 | struct kvm_allocate_rma { | ||
1418 | __u64 rma_size; | ||
1419 | }; | ||
1420 | |||
1421 | The return value is a file descriptor which can be passed to mmap(2) | ||
1422 | to map the allocated RMA into userspace. The mapped area can then be | ||
1423 | passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the | ||
1424 | RMA for a virtual machine. The size of the RMA in bytes (which is | ||
1425 | fixed at host kernel boot time) is returned in the rma_size field of | ||
1426 | the argument structure. | ||
1427 | |||
1428 | The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl | ||
1429 | is supported; 2 if the processor requires all virtual machines to have | ||
1430 | an RMA, or 1 if the processor can use an RMA but doesn't require it, | ||
1431 | because it supports the Virtual RMA (VRMA) facility. | ||
1432 | |||
1294 | 5. The kvm_run structure | 1433 | 5. The kvm_run structure |
1295 | 1434 | ||
1296 | Application code obtains a pointer to the kvm_run structure by | 1435 | Application code obtains a pointer to the kvm_run structure by |
@@ -1473,6 +1612,23 @@ Userspace can now handle the hypercall and when it's done modify the gprs as | |||
1473 | necessary. Upon guest entry all guest GPRs will then be replaced by the values | 1612 | necessary. Upon guest entry all guest GPRs will then be replaced by the values |
1474 | in this struct. | 1613 | in this struct. |
1475 | 1614 | ||
1615 | /* KVM_EXIT_PAPR_HCALL */ | ||
1616 | struct { | ||
1617 | __u64 nr; | ||
1618 | __u64 ret; | ||
1619 | __u64 args[9]; | ||
1620 | } papr_hcall; | ||
1621 | |||
1622 | This is used on 64-bit PowerPC when emulating a pSeries partition, | ||
1623 | e.g. with the 'pseries' machine type in qemu. It occurs when the | ||
1624 | guest does a hypercall using the 'sc 1' instruction. The 'nr' field | ||
1625 | contains the hypercall number (from the guest R3), and 'args' contains | ||
1626 | the arguments (from the guest R4 - R12). Userspace should put the | ||
1627 | return code in 'ret' and any extra returned values in args[]. | ||
1628 | The possible hypercalls are defined in the Power Architecture Platform | ||
1629 | Requirements (PAPR) document available from www.power.org (free | ||
1630 | developer registration required to access it). | ||
1631 | |||
1476 | /* Fix the size of the union. */ | 1632 | /* Fix the size of the union. */ |
1477 | char padding[256]; | 1633 | char padding[256]; |
1478 | }; | 1634 | }; |
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index f46aa58389ca..5dc972c09b55 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt | |||
@@ -165,6 +165,10 @@ Shadow pages contain the following information: | |||
165 | Contains the value of efer.nxe for which the page is valid. | 165 | Contains the value of efer.nxe for which the page is valid. |
166 | role.cr0_wp: | 166 | role.cr0_wp: |
167 | Contains the value of cr0.wp for which the page is valid. | 167 | Contains the value of cr0.wp for which the page is valid. |
168 | role.smep_andnot_wp: | ||
169 | Contains the value of cr4.smep && !cr0.wp for which the page is valid | ||
170 | (pages for which this is true are different from other pages; see the | ||
171 | treatment of cr0.wp=0 below). | ||
168 | gfn: | 172 | gfn: |
169 | Either the guest page table containing the translations shadowed by this | 173 | Either the guest page table containing the translations shadowed by this |
170 | page, or the base page frame for linear translations. See role.direct. | 174 | page, or the base page frame for linear translations. See role.direct. |
@@ -317,6 +321,20 @@ on fault type: | |||
317 | 321 | ||
318 | (user write faults generate a #PF) | 322 | (user write faults generate a #PF) |
319 | 323 | ||
324 | In the first case there is an additional complication if CR4.SMEP is | ||
325 | enabled: since we've turned the page into a kernel page, the kernel may now | ||
326 | execute it. We handle this by also setting spte.nx. If we get a user | ||
327 | fetch or read fault, we'll change spte.u=1 and spte.nx=gpte.nx back. | ||
328 | |||
329 | To prevent an spte that was converted into a kernel page with cr0.wp=0 | ||
330 | from being written by the kernel after cr0.wp has changed to 1, we make | ||
331 | the value of cr0.wp part of the page role. This means that an spte created | ||
332 | with one value of cr0.wp cannot be used when cr0.wp has a different value - | ||
333 | it will simply be missed by the shadow page lookup code. A similar issue | ||
334 | exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after | ||
335 | changing cr4.smep to 1. To avoid this, the value of !cr0.wp && cr4.smep | ||
336 | is also made a part of the page role. | ||
337 | |||
320 | Large pages | 338 | Large pages |
321 | =========== | 339 | =========== |
322 | 340 | ||
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index d079aed27e03..50317809113d 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt | |||
@@ -185,3 +185,37 @@ MSR_KVM_ASYNC_PF_EN: 0x4b564d02 | |||
185 | 185 | ||
186 | Currently type 2 APF will be always delivered on the same vcpu as | 186 | Currently type 2 APF will be always delivered on the same vcpu as |
187 | type 1 was, but guest should not rely on that. | 187 | type 1 was, but guest should not rely on that. |
188 | |||
189 | MSR_KVM_STEAL_TIME: 0x4b564d03 | ||
190 | |||
191 | data: 64-byte aligned physical address of a memory area which must be | ||
192 | in guest RAM, plus an enable bit in bit 0. This memory is expected to | ||
193 | hold a copy of the following structure: | ||
194 | |||
195 | struct kvm_steal_time { | ||
196 | __u64 steal; | ||
197 | __u32 version; | ||
198 | __u32 flags; | ||
199 | __u32 pad[12]; | ||
200 | }; | ||
201 | |||
202 | whose data will be filled in by the hypervisor periodically. Only one | ||
203 | write, or registration, is needed for each VCPU. The interval between | ||
204 | updates of this structure is arbitrary and implementation-dependent. | ||
205 | The hypervisor may update this structure at any time it sees fit until | ||
206 | anything with bit0 == 0 is written to it. Guest is required to make sure | ||
207 | this structure is initialized to zero. | ||
208 | |||
209 | Fields have the following meanings: | ||
210 | |||
211 | version: a sequence counter. In other words, guest has to check | ||
212 | this field before and after grabbing time information and make | ||
213 | sure they are both equal and even. An odd version indicates an | ||
214 | in-progress update. | ||
215 | |||
216 | flags: At this point, always zero. May be used to indicate | ||
217 | changes in this structure in the future. | ||
218 | |||
219 | steal: the amount of time in which this vCPU did not run, in | ||
220 | nanoseconds. Time during which the vcpu is idle will not be | ||
221 | reported as steal time. | ||
diff --git a/Documentation/virtual/kvm/nested-vmx.txt b/Documentation/virtual/kvm/nested-vmx.txt new file mode 100644 index 000000000000..8ed937de1163 --- /dev/null +++ b/Documentation/virtual/kvm/nested-vmx.txt | |||
@@ -0,0 +1,251 @@ | |||
1 | Nested VMX | ||
2 | ========== | ||
3 | |||
4 | Overview | ||
5 | --------- | ||
6 | |||
7 | On Intel processors, KVM uses Intel's VMX (Virtual-Machine eXtensions) | ||
8 | to easily and efficiently run guest operating systems. Normally, these guests | ||
9 | *cannot* themselves be hypervisors running their own guests, because in VMX, | ||
10 | guests cannot use VMX instructions. | ||
11 | |||
12 | The "Nested VMX" feature adds this missing capability - of running guest | ||
13 | hypervisors (which use VMX) with their own nested guests. It does so by | ||
14 | allowing a guest to use VMX instructions, and correctly and efficiently | ||
15 | emulating them using the single level of VMX available in the hardware. | ||
16 | |||
17 | We describe in much greater detail the theory behind the nested VMX feature, | ||
18 | its implementation and its performance characteristics, in the OSDI 2010 paper | ||
19 | "The Turtles Project: Design and Implementation of Nested Virtualization", | ||
20 | available at: | ||
21 | |||
22 | http://www.usenix.org/events/osdi10/tech/full_papers/Ben-Yehuda.pdf | ||
23 | |||
24 | |||
25 | Terminology | ||
26 | ----------- | ||
27 | |||
28 | Single-level virtualization has two levels - the host (KVM) and the guests. | ||
29 | In nested virtualization, we have three levels: The host (KVM), which we call | ||
30 | L0, the guest hypervisor, which we call L1, and its nested guest, which we | ||
31 | call L2. | ||
32 | |||
33 | |||
34 | Known limitations | ||
35 | ----------------- | ||
36 | |||
37 | The current code supports running Linux guests under KVM guests. | ||
38 | Only 64-bit guest hypervisors are supported. | ||
39 | |||
40 | Additional patches for running Windows under guest KVM, and Linux under | ||
41 | guest VMware server, and support for nested EPT, are currently running in | ||
42 | the lab, and will be sent as follow-on patchsets. | ||
43 | |||
44 | |||
45 | Running nested VMX | ||
46 | ------------------ | ||
47 | |||
48 | The nested VMX feature is disabled by default. It can be enabled by giving | ||
49 | the "nested=1" option to the kvm-intel module. | ||
50 | |||
51 | No modifications are required to user space (qemu). However, qemu's default | ||
52 | emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be | ||
53 | explicitly enabled, by giving qemu one of the following options: | ||
54 | |||
55 | -cpu host (emulated CPU has all features of the real CPU) | ||
56 | |||
57 | -cpu qemu64,+vmx (add just the vmx feature to a named CPU type) | ||
58 | |||
59 | |||
60 | ABIs | ||
61 | ---- | ||
62 | |||
63 | Nested VMX aims to present a standard and (eventually) fully-functional VMX | ||
64 | implementation for a guest hypervisor to use. As such, the official | ||
65 | specification of the ABI that it provides is Intel's VMX specification, | ||
66 | namely volume 3B of their "Intel 64 and IA-32 Architectures Software | ||
67 | Developer's Manual". Not all of VMX's features are currently fully supported, | ||
68 | but the goal is to eventually support them all, starting with the VMX features | ||
69 | which are used in practice by popular hypervisors (KVM and others). | ||
70 | |||
71 | As a VMX implementation, nested VMX presents a VMCS structure to L1. | ||
72 | As mandated by the spec, other than the two fields revision_id and abort, | ||
73 | this structure is *opaque* to its user, who is not supposed to know or care | ||
74 | about its internal structure. Rather, the structure is accessed through the | ||
75 | VMREAD and VMWRITE instructions. | ||
76 | Still, for debugging purposes, KVM developers might be interested to know the | ||
77 | internals of this structure; this is struct vmcs12 from arch/x86/kvm/vmx.c. | ||
78 | |||
79 | The name "vmcs12" refers to the VMCS that L1 builds for L2. In the code we | ||
80 | also have "vmcs01", the VMCS that L0 built for L1, and "vmcs02" is the VMCS | ||
81 | which L0 builds to actually run L2 - how this is done is explained in the | ||
82 | aforementioned paper. | ||
83 | |||
84 | For convenience, we repeat the content of struct vmcs12 here. If the internals | ||
85 | of this structure change, this can break live migration across KVM versions. | ||
86 | VMCS12_REVISION (from vmx.c) should be changed if struct vmcs12 or its inner | ||
87 | struct shadow_vmcs is ever changed. | ||
88 | |||
89 | typedef u64 natural_width; | ||
90 | struct __packed vmcs12 { | ||
91 | /* According to the Intel spec, a VMCS region must start with | ||
92 | * these two user-visible fields */ | ||
93 | u32 revision_id; | ||
94 | u32 abort; | ||
95 | |||
96 | u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ | ||
97 | u32 padding[7]; /* room for future expansion */ | ||
98 | |||
99 | u64 io_bitmap_a; | ||
100 | u64 io_bitmap_b; | ||
101 | u64 msr_bitmap; | ||
102 | u64 vm_exit_msr_store_addr; | ||
103 | u64 vm_exit_msr_load_addr; | ||
104 | u64 vm_entry_msr_load_addr; | ||
105 | u64 tsc_offset; | ||
106 | u64 virtual_apic_page_addr; | ||
107 | u64 apic_access_addr; | ||
108 | u64 ept_pointer; | ||
109 | u64 guest_physical_address; | ||
110 | u64 vmcs_link_pointer; | ||
111 | u64 guest_ia32_debugctl; | ||
112 | u64 guest_ia32_pat; | ||
113 | u64 guest_ia32_efer; | ||
114 | u64 guest_pdptr0; | ||
115 | u64 guest_pdptr1; | ||
116 | u64 guest_pdptr2; | ||
117 | u64 guest_pdptr3; | ||
118 | u64 host_ia32_pat; | ||
119 | u64 host_ia32_efer; | ||
120 | u64 padding64[8]; /* room for future expansion */ | ||
121 | natural_width cr0_guest_host_mask; | ||
122 | natural_width cr4_guest_host_mask; | ||
123 | natural_width cr0_read_shadow; | ||
124 | natural_width cr4_read_shadow; | ||
125 | natural_width cr3_target_value0; | ||
126 | natural_width cr3_target_value1; | ||
127 | natural_width cr3_target_value2; | ||
128 | natural_width cr3_target_value3; | ||
129 | natural_width exit_qualification; | ||
130 | natural_width guest_linear_address; | ||
131 | natural_width guest_cr0; | ||
132 | natural_width guest_cr3; | ||
133 | natural_width guest_cr4; | ||
134 | natural_width guest_es_base; | ||
135 | natural_width guest_cs_base; | ||
136 | natural_width guest_ss_base; | ||
137 | natural_width guest_ds_base; | ||
138 | natural_width guest_fs_base; | ||
139 | natural_width guest_gs_base; | ||
140 | natural_width guest_ldtr_base; | ||
141 | natural_width guest_tr_base; | ||
142 | natural_width guest_gdtr_base; | ||
143 | natural_width guest_idtr_base; | ||
144 | natural_width guest_dr7; | ||
145 | natural_width guest_rsp; | ||
146 | natural_width guest_rip; | ||
147 | natural_width guest_rflags; | ||
148 | natural_width guest_pending_dbg_exceptions; | ||
149 | natural_width guest_sysenter_esp; | ||
150 | natural_width guest_sysenter_eip; | ||
151 | natural_width host_cr0; | ||
152 | natural_width host_cr3; | ||
153 | natural_width host_cr4; | ||
154 | natural_width host_fs_base; | ||
155 | natural_width host_gs_base; | ||
156 | natural_width host_tr_base; | ||
157 | natural_width host_gdtr_base; | ||
158 | natural_width host_idtr_base; | ||
159 | natural_width host_ia32_sysenter_esp; | ||
160 | natural_width host_ia32_sysenter_eip; | ||
161 | natural_width host_rsp; | ||
162 | natural_width host_rip; | ||
163 | natural_width paddingl[8]; /* room for future expansion */ | ||
164 | u32 pin_based_vm_exec_control; | ||
165 | u32 cpu_based_vm_exec_control; | ||
166 | u32 exception_bitmap; | ||
167 | u32 page_fault_error_code_mask; | ||
168 | u32 page_fault_error_code_match; | ||
169 | u32 cr3_target_count; | ||
170 | u32 vm_exit_controls; | ||
171 | u32 vm_exit_msr_store_count; | ||
172 | u32 vm_exit_msr_load_count; | ||
173 | u32 vm_entry_controls; | ||
174 | u32 vm_entry_msr_load_count; | ||
175 | u32 vm_entry_intr_info_field; | ||
176 | u32 vm_entry_exception_error_code; | ||
177 | u32 vm_entry_instruction_len; | ||
178 | u32 tpr_threshold; | ||
179 | u32 secondary_vm_exec_control; | ||
180 | u32 vm_instruction_error; | ||
181 | u32 vm_exit_reason; | ||
182 | u32 vm_exit_intr_info; | ||
183 | u32 vm_exit_intr_error_code; | ||
184 | u32 idt_vectoring_info_field; | ||
185 | u32 idt_vectoring_error_code; | ||
186 | u32 vm_exit_instruction_len; | ||
187 | u32 vmx_instruction_info; | ||
188 | u32 guest_es_limit; | ||
189 | u32 guest_cs_limit; | ||
190 | u32 guest_ss_limit; | ||
191 | u32 guest_ds_limit; | ||
192 | u32 guest_fs_limit; | ||
193 | u32 guest_gs_limit; | ||
194 | u32 guest_ldtr_limit; | ||
195 | u32 guest_tr_limit; | ||
196 | u32 guest_gdtr_limit; | ||
197 | u32 guest_idtr_limit; | ||
198 | u32 guest_es_ar_bytes; | ||
199 | u32 guest_cs_ar_bytes; | ||
200 | u32 guest_ss_ar_bytes; | ||
201 | u32 guest_ds_ar_bytes; | ||
202 | u32 guest_fs_ar_bytes; | ||
203 | u32 guest_gs_ar_bytes; | ||
204 | u32 guest_ldtr_ar_bytes; | ||
205 | u32 guest_tr_ar_bytes; | ||
206 | u32 guest_interruptibility_info; | ||
207 | u32 guest_activity_state; | ||
208 | u32 guest_sysenter_cs; | ||
209 | u32 host_ia32_sysenter_cs; | ||
210 | u32 padding32[8]; /* room for future expansion */ | ||
211 | u16 virtual_processor_id; | ||
212 | u16 guest_es_selector; | ||
213 | u16 guest_cs_selector; | ||
214 | u16 guest_ss_selector; | ||
215 | u16 guest_ds_selector; | ||
216 | u16 guest_fs_selector; | ||
217 | u16 guest_gs_selector; | ||
218 | u16 guest_ldtr_selector; | ||
219 | u16 guest_tr_selector; | ||
220 | u16 host_es_selector; | ||
221 | u16 host_cs_selector; | ||
222 | u16 host_ss_selector; | ||
223 | u16 host_ds_selector; | ||
224 | u16 host_fs_selector; | ||
225 | u16 host_gs_selector; | ||
226 | u16 host_tr_selector; | ||
227 | }; | ||
228 | |||
229 | |||
230 | Authors | ||
231 | ------- | ||
232 | |||
233 | These patches were written by: | ||
234 | Abel Gordon, abelg <at> il.ibm.com | ||
235 | Nadav Har'El, nyh <at> il.ibm.com | ||
236 | Orit Wasserman, oritw <at> il.ibm.com | ||
237 | Ben-Ami Yassor, benami <at> il.ibm.com | ||
238 | Muli Ben-Yehuda, muli <at> il.ibm.com | ||
239 | |||
240 | With contributions by: | ||
241 | Anthony Liguori, aliguori <at> us.ibm.com | ||
242 | Mike Day, mdday <at> us.ibm.com | ||
243 | Michael Factor, factor <at> il.ibm.com | ||
244 | Zvi Dubitzky, dubi <at> il.ibm.com | ||
245 | |||
246 | And valuable reviews by: | ||
247 | Avi Kivity, avi <at> redhat.com | ||
248 | Gleb Natapov, gleb <at> redhat.com | ||
249 | Marcelo Tosatti, mtosatti <at> redhat.com | ||
250 | Kevin Tian, kevin.tian <at> intel.com | ||
251 | and others. | ||
diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt index 3ab969c59046..2b7ce190cde4 100644 --- a/Documentation/virtual/kvm/ppc-pv.txt +++ b/Documentation/virtual/kvm/ppc-pv.txt | |||
@@ -68,9 +68,11 @@ page that contains parts of supervisor visible register state. The guest can | |||
68 | map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. | 68 | map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. |
69 | 69 | ||
70 | With this hypercall issued the guest always gets the magic page mapped at the | 70 | With this hypercall issued the guest always gets the magic page mapped at the |
71 | desired location in effective and physical address space. For now, we always | 71 | desired location. The first parameter indicates the effective address when the |
72 | map the page to -4096. This way we can access it using absolute load and store | 72 | MMU is enabled. The second parameter indicates the address in real mode, if |
73 | functions. The following instruction reads the first field of the magic page: | 73 | applicable to the target. For now, we always map the page to -4096. This way we |
74 | can access it using absolute load and store functions. The following | ||
75 | instruction reads the first field of the magic page: | ||
74 | 76 | ||
75 | ld rX, -4096(0) | 77 | ld rX, -4096(0) |
76 | 78 | ||