diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-28 17:35:31 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-28 17:35:31 -0400 |
commit | 2e7580b0e75d771d93e24e681031a165b1d31071 (patch) | |
tree | d9449702609eeaab28913a43b5a4434667e09d43 /Documentation | |
parent | d25413efa9536e2f425ea45c7720598035c597bc (diff) | |
parent | cf9eeac46350b8b43730b7dc5e999757bed089a4 (diff) |
Merge branch 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Avi Kivity:
"Changes include timekeeping improvements, support for assigning host
PCI devices that share interrupt lines, s390 user-controlled guests, a
large ppc update, and random fixes."
This is with the sign-off's fixed, hopefully next merge window we won't
have rebased commits.
* 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits)
KVM: Convert intx_mask_lock to spin lock
KVM: x86: fix kvm_write_tsc() TSC matching thinko
x86: kvmclock: abstract save/restore sched_clock_state
KVM: nVMX: Fix erroneous exception bitmap check
KVM: Ignore the writes to MSR_K7_HWCR(3)
KVM: MMU: make use of ->root_level in reset_rsvds_bits_mask
KVM: PMU: add proper support for fixed counter 2
KVM: PMU: Fix raw event check
KVM: PMU: warn when pin control is set in eventsel msr
KVM: VMX: Fix delayed load of shared MSRs
KVM: use correct tlbs dirty type in cmpxchg
KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
KVM: Ensure all vcpus are consistent with in-kernel irqchip settings
KVM: x86 emulator: Allow PM/VM86 switch during task switch
KVM: SVM: Fix CPL updates
KVM: x86 emulator: VM86 segments must have DPL 3
KVM: x86 emulator: Fix task switch privilege checks
arch/powerpc/kvm/book3s_hv.c: included linux/sched.h twice
KVM: x86 emulator: correctly mask pmc index bits in RDPMC instruction emulation
KVM: mmu_notifier: Flush TLBs before releasing mmu_lock
...
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/virtual/kvm/api.txt | 259 | ||||
-rw-r--r-- | Documentation/virtual/kvm/ppc-pv.txt | 24 |
2 files changed, 260 insertions, 23 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e1d94bf4056e..6386f8c0482e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt | |||
@@ -95,7 +95,7 @@ described as 'basic' will be available. | |||
95 | Capability: basic | 95 | Capability: basic |
96 | Architectures: all | 96 | Architectures: all |
97 | Type: system ioctl | 97 | Type: system ioctl |
98 | Parameters: none | 98 | Parameters: machine type identifier (KVM_VM_*) |
99 | Returns: a VM fd that can be used to control the new virtual machine. | 99 | Returns: a VM fd that can be used to control the new virtual machine. |
100 | 100 | ||
101 | The new VM has no virtual cpus and no memory. An mmap() of a VM fd | 101 | The new VM has no virtual cpus and no memory. An mmap() of a VM fd |
@@ -103,6 +103,11 @@ will access the virtual machine's physical address space; offset zero | |||
103 | corresponds to guest physical address zero. Use of mmap() on a VM fd | 103 | corresponds to guest physical address zero. Use of mmap() on a VM fd |
104 | is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is | 104 | is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is |
105 | available. | 105 | available. |
106 | You most certainly want to use 0 as machine type. | ||
107 | |||
108 | In order to create user controlled virtual machines on S390, check | ||
109 | KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as | ||
110 | privileged user (CAP_SYS_ADMIN). | ||
106 | 111 | ||
107 | 4.3 KVM_GET_MSR_INDEX_LIST | 112 | 4.3 KVM_GET_MSR_INDEX_LIST |
108 | 113 | ||
@@ -213,6 +218,11 @@ allocation of vcpu ids. For example, if userspace wants | |||
213 | single-threaded guest vcpus, it should make all vcpu ids be a multiple | 218 | single-threaded guest vcpus, it should make all vcpu ids be a multiple |
214 | of the number of vcpus per vcore. | 219 | of the number of vcpus per vcore. |
215 | 220 | ||
221 | For virtual cpus that have been created with S390 user controlled virtual | ||
222 | machines, the resulting vcpu fd can be memory mapped at page offset | ||
223 | KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual | ||
224 | cpu's hardware control block. | ||
225 | |||
216 | 4.8 KVM_GET_DIRTY_LOG (vm ioctl) | 226 | 4.8 KVM_GET_DIRTY_LOG (vm ioctl) |
217 | 227 | ||
218 | Capability: basic | 228 | Capability: basic |
@@ -1159,6 +1169,14 @@ following flags are specified: | |||
1159 | 1169 | ||
1160 | /* Depends on KVM_CAP_IOMMU */ | 1170 | /* Depends on KVM_CAP_IOMMU */ |
1161 | #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) | 1171 | #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) |
1172 | /* The following two depend on KVM_CAP_PCI_2_3 */ | ||
1173 | #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) | ||
1174 | #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) | ||
1175 | |||
1176 | If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts | ||
1177 | via the PCI-2.3-compliant device-level mask, thus enabling IRQ sharing with other | ||
1178 | assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the | ||
1179 | guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details. | ||
1162 | 1180 | ||
1163 | The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure | 1181 | The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure |
1164 | isolation of the device. Usages not specifying this flag are deprecated. | 1182 | isolation of the device. Usages not specifying this flag are deprecated. |
@@ -1399,6 +1417,71 @@ The following flags are defined: | |||
1399 | If datamatch flag is set, the event will be signaled only if the written value | 1417 | If datamatch flag is set, the event will be signaled only if the written value |
1400 | to the registered address is equal to datamatch in struct kvm_ioeventfd. | 1418 | to the registered address is equal to datamatch in struct kvm_ioeventfd. |
1401 | 1419 | ||
1420 | 4.59 KVM_DIRTY_TLB | ||
1421 | |||
1422 | Capability: KVM_CAP_SW_TLB | ||
1423 | Architectures: ppc | ||
1424 | Type: vcpu ioctl | ||
1425 | Parameters: struct kvm_dirty_tlb (in) | ||
1426 | Returns: 0 on success, -1 on error | ||
1427 | |||
1428 | struct kvm_dirty_tlb { | ||
1429 | __u64 bitmap; | ||
1430 | __u32 num_dirty; | ||
1431 | }; | ||
1432 | |||
1433 | This must be called whenever userspace has changed an entry in the shared | ||
1434 | TLB, prior to calling KVM_RUN on the associated vcpu. | ||
1435 | |||
1436 | The "bitmap" field is the userspace address of an array. This array | ||
1437 | consists of a number of bits, equal to the total number of TLB entries as | ||
1438 | determined by the last successful call to KVM_CONFIG_TLB, rounded up to the | ||
1439 | nearest multiple of 64. | ||
1440 | |||
1441 | Each bit corresponds to one TLB entry, ordered the same as in the shared TLB | ||
1442 | array. | ||
1443 | |||
1444 | The array is little-endian: the bit 0 is the least significant bit of the | ||
1445 | first byte, bit 8 is the least significant bit of the second byte, etc. | ||
1446 | This avoids any complications with differing word sizes. | ||
1447 | |||
1448 | The "num_dirty" field is a performance hint for KVM to determine whether it | ||
1449 | should skip processing the bitmap and just invalidate everything. It must | ||
1450 | be set to the number of set bits in the bitmap. | ||
1451 | |||
1452 | 4.60 KVM_ASSIGN_SET_INTX_MASK | ||
1453 | |||
1454 | Capability: KVM_CAP_PCI_2_3 | ||
1455 | Architectures: x86 | ||
1456 | Type: vm ioctl | ||
1457 | Parameters: struct kvm_assigned_pci_dev (in) | ||
1458 | Returns: 0 on success, -1 on error | ||
1459 | |||
1460 | Allows userspace to mask PCI INTx interrupts from the assigned device. The | ||
1461 | kernel will not deliver INTx interrupts to the guest between setting and | ||
1462 | clearing of KVM_ASSIGN_SET_INTX_MASK via this interface. This enables use of | ||
1463 | and emulation of PCI 2.3 INTx disable command register behavior. | ||
1464 | |||
1465 | This may be used for both PCI 2.3 devices supporting INTx disable natively and | ||
1466 | older devices lacking this support. Userspace is responsible for emulating the | ||
1467 | read value of the INTx disable bit in the guest visible PCI command register. | ||
1468 | When modifying the INTx disable state, userspace should precede updating the | ||
1469 | physical device command register by calling this ioctl to inform the kernel of | ||
1470 | the new intended INTx mask state. | ||
1471 | |||
1472 | Note that the kernel uses the device INTx disable bit to internally manage the | ||
1473 | device interrupt state for PCI 2.3 devices. Reads of this register may | ||
1474 | therefore not match the expected value. Writes should always use the guest | ||
1475 | intended INTx disable value rather than attempting to read-copy-update the | ||
1476 | current physical device state. Races between user and kernel updates to the | ||
1477 | INTx disable bit are handled lazily in the kernel. It's possible the device | ||
1478 | may generate unintended interrupts, but they will not be injected into the | ||
1479 | guest. | ||
1480 | |||
1481 | See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified | ||
1482 | by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is | ||
1483 | evaluated. | ||
1484 | |||
1402 | 4.62 KVM_CREATE_SPAPR_TCE | 1485 | 4.62 KVM_CREATE_SPAPR_TCE |
1403 | 1486 | ||
1404 | Capability: KVM_CAP_SPAPR_TCE | 1487 | Capability: KVM_CAP_SPAPR_TCE |
@@ -1491,6 +1574,101 @@ following algorithm: | |||
1491 | Some guests configure the LINT1 NMI input to cause a panic, aiding in | 1574 | Some guests configure the LINT1 NMI input to cause a panic, aiding in |
1492 | debugging. | 1575 | debugging. |
1493 | 1576 | ||
1577 | 4.65 KVM_S390_UCAS_MAP | ||
1578 | |||
1579 | Capability: KVM_CAP_S390_UCONTROL | ||
1580 | Architectures: s390 | ||
1581 | Type: vcpu ioctl | ||
1582 | Parameters: struct kvm_s390_ucas_mapping (in) | ||
1583 | Returns: 0 in case of success | ||
1584 | |||
1585 | The parameter is defined like this: | ||
1586 | struct kvm_s390_ucas_mapping { | ||
1587 | __u64 user_addr; | ||
1588 | __u64 vcpu_addr; | ||
1589 | __u64 length; | ||
1590 | }; | ||
1591 | |||
1592 | This ioctl maps the memory at "user_addr" with the length "length" to | ||
1593 | the vcpu's address space starting at "vcpu_addr". All parameters need to | ||
1594 | be aligned by 1 megabyte. | ||
1595 | |||
1596 | 4.66 KVM_S390_UCAS_UNMAP | ||
1597 | |||
1598 | Capability: KVM_CAP_S390_UCONTROL | ||
1599 | Architectures: s390 | ||
1600 | Type: vcpu ioctl | ||
1601 | Parameters: struct kvm_s390_ucas_mapping (in) | ||
1602 | Returns: 0 in case of success | ||
1603 | |||
1604 | The parameter is defined like this: | ||
1605 | struct kvm_s390_ucas_mapping { | ||
1606 | __u64 user_addr; | ||
1607 | __u64 vcpu_addr; | ||
1608 | __u64 length; | ||
1609 | }; | ||
1610 | |||
1611 | This ioctl unmaps the memory in the vcpu's address space starting at | ||
1612 | "vcpu_addr" with the length "length". The field "user_addr" is ignored. | ||
1613 | All parameters need to be aligned by 1 megabyte. | ||
1614 | |||
1615 | 4.67 KVM_S390_VCPU_FAULT | ||
1616 | |||
1617 | Capability: KVM_CAP_S390_UCONTROL | ||
1618 | Architectures: s390 | ||
1619 | Type: vcpu ioctl | ||
1620 | Parameters: vcpu absolute address (in) | ||
1621 | Returns: 0 in case of success | ||
1622 | |||
1623 | This call creates a page table entry on the virtual cpu's address space | ||
1624 | (for user controlled virtual machines) or the virtual machine's address | ||
1625 | space (for regular virtual machines). This only works for minor faults, | ||
1626 | thus it's recommended to access subject memory page via the user page | ||
1627 | table upfront. This is useful to handle validity intercepts for user | ||
1628 | controlled virtual machines to fault in the virtual cpu's lowcore pages | ||
1629 | prior to calling the KVM_RUN ioctl. | ||
1630 | |||
1631 | 4.68 KVM_SET_ONE_REG | ||
1632 | |||
1633 | Capability: KVM_CAP_ONE_REG | ||
1634 | Architectures: all | ||
1635 | Type: vcpu ioctl | ||
1636 | Parameters: struct kvm_one_reg (in) | ||
1637 | Returns: 0 on success, negative value on failure | ||
1638 | |||
1639 | struct kvm_one_reg { | ||
1640 | __u64 id; | ||
1641 | __u64 addr; | ||
1642 | }; | ||
1643 | |||
1644 | Using this ioctl, a single vcpu register can be set to a specific value | ||
1645 | defined by user space with the passed in struct kvm_one_reg, where id | ||
1646 | refers to the register identifier as described below and addr is a pointer | ||
1647 | to a variable with the respective size. There can be architecture agnostic | ||
1648 | and architecture specific registers. Each has its own range of operation | ||
1649 | and its own constants and width. To keep track of the implemented | ||
1650 | registers, find a list below: | ||
1651 | |||
1652 | Arch | Register | Width (bits) | ||
1653 | | | | ||
1654 | PPC | KVM_REG_PPC_HIOR | 64 | ||
1655 | |||
1656 | 4.69 KVM_GET_ONE_REG | ||
1657 | |||
1658 | Capability: KVM_CAP_ONE_REG | ||
1659 | Architectures: all | ||
1660 | Type: vcpu ioctl | ||
1661 | Parameters: struct kvm_one_reg (in and out) | ||
1662 | Returns: 0 on success, negative value on failure | ||
1663 | |||
1664 | This ioctl allows userspace to read the value of a single register implemented | ||
1665 | in a vcpu. The register to read is indicated by the "id" field of the | ||
1666 | kvm_one_reg struct passed in. On success, the register value can be found | ||
1667 | at the memory location pointed to by "addr". | ||
1668 | |||
1669 | The list of registers accessible using this interface is identical to the | ||
1670 | list in 4.68. | ||
1671 | |||
1494 | 5. The kvm_run structure | 1672 | 5. The kvm_run structure |
1495 | 1673 | ||
1496 | Application code obtains a pointer to the kvm_run structure by | 1674 | Application code obtains a pointer to the kvm_run structure by |
@@ -1651,6 +1829,20 @@ s390 specific. | |||
1651 | 1829 | ||
1652 | s390 specific. | 1830 | s390 specific. |
1653 | 1831 | ||
1832 | /* KVM_EXIT_S390_UCONTROL */ | ||
1833 | struct { | ||
1834 | __u64 trans_exc_code; | ||
1835 | __u32 pgm_code; | ||
1836 | } s390_ucontrol; | ||
1837 | |||
1838 | s390 specific. A page fault has occurred for a user controlled virtual | ||
1839 | machine (KVM_VM_S390_UCONTROL) on its host page table that cannot be | ||
1840 | resolved by the kernel. | ||
1841 | The program code and the translation exception code that were placed | ||
1842 | in the cpu's lowcore are presented here as defined by the z Architecture | ||
1843 | Principles of Operation Book in the Chapter for Dynamic Address Translation | ||
1844 | (DAT) | ||
1845 | |||
1654 | /* KVM_EXIT_DCR */ | 1846 | /* KVM_EXIT_DCR */ |
1655 | struct { | 1847 | struct { |
1656 | __u32 dcrn; | 1848 | __u32 dcrn; |
@@ -1693,6 +1885,29 @@ developer registration required to access it). | |||
1693 | /* Fix the size of the union. */ | 1885 | /* Fix the size of the union. */ |
1694 | char padding[256]; | 1886 | char padding[256]; |
1695 | }; | 1887 | }; |
1888 | |||
1889 | /* | ||
1890 | * shared registers between kvm and userspace. | ||
1891 | * kvm_valid_regs specifies the register classes set by the host | ||
1892 | * kvm_dirty_regs specifies the register classes dirtied by userspace | ||
1893 | * struct kvm_sync_regs is architecture specific, as well as the | ||
1894 | * bits for kvm_valid_regs and kvm_dirty_regs | ||
1895 | */ | ||
1896 | __u64 kvm_valid_regs; | ||
1897 | __u64 kvm_dirty_regs; | ||
1898 | union { | ||
1899 | struct kvm_sync_regs regs; | ||
1900 | char padding[1024]; | ||
1901 | } s; | ||
1902 | |||
1903 | If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access | ||
1904 | certain guest registers without having to call SET/GET_*REGS. Thus we can | ||
1905 | avoid some system call overhead if userspace has to handle the exit. | ||
1906 | Userspace can query the validity of the structure by checking | ||
1907 | kvm_valid_regs for specific bits. These bits are architecture specific | ||
1908 | and usually define the validity of a group of registers. (e.g. one bit | ||
1909 | for general purpose registers) | ||
1910 | |||
1696 | }; | 1911 | }; |
1697 | 1912 | ||
1698 | 6. Capabilities that can be enabled | 1913 | 6. Capabilities that can be enabled |
@@ -1741,3 +1956,45 @@ HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the | |||
1741 | HTAB invisible to the guest. | 1956 | HTAB invisible to the guest. |
1742 | 1957 | ||
1743 | When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur. | 1958 | When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur. |
1959 | |||
1960 | 6.3 KVM_CAP_SW_TLB | ||
1961 | |||
1962 | Architectures: ppc | ||
1963 | Parameters: args[0] is the address of a struct kvm_config_tlb | ||
1964 | Returns: 0 on success; -1 on error | ||
1965 | |||
1966 | struct kvm_config_tlb { | ||
1967 | __u64 params; | ||
1968 | __u64 array; | ||
1969 | __u32 mmu_type; | ||
1970 | __u32 array_len; | ||
1971 | }; | ||
1972 | |||
1973 | Configures the virtual CPU's TLB array, establishing a shared memory area | ||
1974 | between userspace and KVM. The "params" and "array" fields are userspace | ||
1975 | addresses of mmu-type-specific data structures. The "array_len" field is a | ||
1976 | safety mechanism, and should be set to the size in bytes of the memory that | ||
1977 | userspace has reserved for the array. It must be at least the size dictated | ||
1978 | by "mmu_type" and "params". | ||
1979 | |||
1980 | While KVM_RUN is active, the shared region is under control of KVM. Its | ||
1981 | contents are undefined, and any modification by userspace results in | ||
1982 | boundedly undefined behavior. | ||
1983 | |||
1984 | On return from KVM_RUN, the shared region will reflect the current state of | ||
1985 | the guest's TLB. If userspace makes any changes, it must call KVM_DIRTY_TLB | ||
1986 | to tell KVM which entries have been changed, prior to calling KVM_RUN again | ||
1987 | on this vcpu. | ||
1988 | |||
1989 | For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV: | ||
1990 | - The "params" field is of type "struct kvm_book3e_206_tlb_params". | ||
1991 | - The "array" field points to an array of type "struct | ||
1992 | kvm_book3e_206_tlb_entry". | ||
1993 | - The array consists of all entries in the first TLB, followed by all | ||
1994 | entries in the second TLB. | ||
1995 | - Within a TLB, entries are ordered first by increasing set number. Within a | ||
1996 | set, entries are ordered by way (increasing ESEL). | ||
1997 | - The hash for determining set number in TLB0 is: (MAS2 >> 12) & (num_sets - 1) | ||
1998 | where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value. | ||
1999 | - The tsize field of mas1 shall be set to 4K on TLB0, even though the | ||
2000 | hardware ignores this value for TLB0. | ||
diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt index 2b7ce190cde4..6e7c37050930 100644 --- a/Documentation/virtual/kvm/ppc-pv.txt +++ b/Documentation/virtual/kvm/ppc-pv.txt | |||
@@ -81,28 +81,8 @@ additional registers to the magic page. If you add fields to the magic page, | |||
81 | also define a new hypercall feature to indicate that the host can give you more | 81 | also define a new hypercall feature to indicate that the host can give you more |
82 | registers. Only if the host supports the additional features, make use of them. | 82 | registers. Only if the host supports the additional features, make use of them. |
83 | 83 | ||
84 | The magic page has the following layout as described in | 84 | The magic page layout is described by struct kvm_vcpu_arch_shared |
85 | arch/powerpc/include/asm/kvm_para.h: | 85 | in arch/powerpc/include/asm/kvm_para.h. |
86 | |||
87 | struct kvm_vcpu_arch_shared { | ||
88 | __u64 scratch1; | ||
89 | __u64 scratch2; | ||
90 | __u64 scratch3; | ||
91 | __u64 critical; /* Guest may not get interrupts if == r1 */ | ||
92 | __u64 sprg0; | ||
93 | __u64 sprg1; | ||
94 | __u64 sprg2; | ||
95 | __u64 sprg3; | ||
96 | __u64 srr0; | ||
97 | __u64 srr1; | ||
98 | __u64 dar; | ||
99 | __u64 msr; | ||
100 | __u32 dsisr; | ||
101 | __u32 int_pending; /* Tells the guest if we have an interrupt */ | ||
102 | }; | ||
103 | |||
104 | Additions to the page must only occur at the end. Struct fields are always 32 | ||
105 | or 64 bit aligned, depending on them being 32 or 64 bit wide respectively. | ||
106 | 86 | ||
107 | Magic page features | 87 | Magic page features |
108 | =================== | 88 | =================== |