diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-24 15:47:25 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-24 15:47:25 -0400 |
commit | 1765a1fe5d6f82c0eceb1ad10594cfc83759b6d0 (patch) | |
tree | a701020f0fa3a1932a36d174c5fffd20496303a9 /Documentation | |
parent | bdaf12b41235b0c59949914de022341e77907461 (diff) | |
parent | 2a31339aa014c0d0b97c57d3ebc997732f8f47fc (diff) |
Merge branch 'kvm-updates/2.6.37' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/2.6.37' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (321 commits)
KVM: Drop CONFIG_DMAR dependency around kvm_iommu_map_pages
KVM: Fix signature of kvm_iommu_map_pages stub
KVM: MCE: Send SRAR SIGBUS directly
KVM: MCE: Add MCG_SER_P into KVM_MCE_CAP_SUPPORTED
KVM: fix typo in copyright notice
KVM: Disable interrupts around get_kernel_ns()
KVM: MMU: Avoid sign extension in mmu_alloc_direct_roots() pae root address
KVM: MMU: move access code parsing to FNAME(walk_addr) function
KVM: MMU: audit: check whether have unsync sps after root sync
KVM: MMU: audit: introduce audit_printk to cleanup audit code
KVM: MMU: audit: unregister audit tracepoints before module unloaded
KVM: MMU: audit: fix vcpu's spte walking
KVM: MMU: set access bit for direct mapping
KVM: MMU: cleanup for error mask set while walk guest page table
KVM: MMU: update 'root_hpa' out of loop in PAE shadow path
KVM: x86 emulator: Eliminate compilation warning in x86_decode_insn()
KVM: x86: Fix constant type in kvm_get_time_scale
KVM: VMX: Add AX to list of registers clobbered by guest switch
KVM guest: Move a printk that's using the clock before it's ready
KVM: x86: TSC catchup mode
...
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/kernel-parameters.txt | 8 | ||||
-rw-r--r-- | Documentation/kvm/api.txt | 61 | ||||
-rw-r--r-- | Documentation/kvm/ppc-pv.txt | 196 | ||||
-rw-r--r-- | Documentation/kvm/timekeeping.txt | 612 |
4 files changed, 872 insertions, 5 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4cd8b86e00ea..9533af74a127 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1131,9 +1131,13 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1131 | kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging. | 1131 | kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging. |
1132 | Default is 1 (enabled) | 1132 | Default is 1 (enabled) |
1133 | 1133 | ||
1134 | kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM. | 1134 | kvm.mmu_audit= [KVM] This is a R/W parameter which allows auditing |
1135 | the KVM MMU at runtime. | ||
1135 | Default is 0 (off) | 1136 | Default is 0 (off) |
1136 | 1137 | ||
1138 | kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM. | ||
1139 | Default is 1 (enabled) | ||
1140 | |||
1137 | kvm-amd.npt= [KVM,AMD] Disable nested paging (virtualized MMU) | 1141 | kvm-amd.npt= [KVM,AMD] Disable nested paging (virtualized MMU) |
1138 | for all guests. | 1142 | for all guests. |
1139 | Default is 1 (enabled) if in 64bit or 32bit-PAE mode | 1143 | Default is 1 (enabled) if in 64bit or 32bit-PAE mode |
@@ -1698,6 +1702,8 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1698 | 1702 | ||
1699 | nojitter [IA64] Disables jitter checking for ITC timers. | 1703 | nojitter [IA64] Disables jitter checking for ITC timers. |
1700 | 1704 | ||
1705 | no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver | ||
1706 | |||
1701 | nolapic [X86-32,APIC] Do not enable or use the local APIC. | 1707 | nolapic [X86-32,APIC] Do not enable or use the local APIC. |
1702 | 1708 | ||
1703 | nolapic_timer [X86-32,APIC] Do not use the local APIC timer. | 1709 | nolapic_timer [X86-32,APIC] Do not use the local APIC timer. |
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index 5f5b64982b1a..b336266bea5e 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt | |||
@@ -320,13 +320,13 @@ struct kvm_translation { | |||
320 | 4.15 KVM_INTERRUPT | 320 | 4.15 KVM_INTERRUPT |
321 | 321 | ||
322 | Capability: basic | 322 | Capability: basic |
323 | Architectures: x86 | 323 | Architectures: x86, ppc |
324 | Type: vcpu ioctl | 324 | Type: vcpu ioctl |
325 | Parameters: struct kvm_interrupt (in) | 325 | Parameters: struct kvm_interrupt (in) |
326 | Returns: 0 on success, -1 on error | 326 | Returns: 0 on success, -1 on error |
327 | 327 | ||
328 | Queues a hardware interrupt vector to be injected. This is only | 328 | Queues a hardware interrupt vector to be injected. This is only |
329 | useful if in-kernel local APIC is not used. | 329 | useful if in-kernel local APIC or equivalent is not used. |
330 | 330 | ||
331 | /* for KVM_INTERRUPT */ | 331 | /* for KVM_INTERRUPT */ |
332 | struct kvm_interrupt { | 332 | struct kvm_interrupt { |
@@ -334,8 +334,37 @@ struct kvm_interrupt { | |||
334 | __u32 irq; | 334 | __u32 irq; |
335 | }; | 335 | }; |
336 | 336 | ||
337 | X86: | ||
338 | |||
337 | Note 'irq' is an interrupt vector, not an interrupt pin or line. | 339 | Note 'irq' is an interrupt vector, not an interrupt pin or line. |
338 | 340 | ||
341 | PPC: | ||
342 | |||
343 | Queues an external interrupt to be injected. This ioctl is overloaded | ||
344 | with 3 different irq values: | ||
345 | |||
346 | a) KVM_INTERRUPT_SET | ||
347 | |||
348 | This injects an edge type external interrupt into the guest once it's ready | ||
349 | to receive interrupts. When injected, the interrupt is done. | ||
350 | |||
351 | b) KVM_INTERRUPT_UNSET | ||
352 | |||
353 | This unsets any pending interrupt. | ||
354 | |||
355 | Only available with KVM_CAP_PPC_UNSET_IRQ. | ||
356 | |||
357 | c) KVM_INTERRUPT_SET_LEVEL | ||
358 | |||
359 | This injects a level type external interrupt into the guest context. The | ||
360 | interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET | ||
361 | is triggered. | ||
362 | |||
363 | Only available with KVM_CAP_PPC_IRQ_LEVEL. | ||
364 | |||
365 | Note that any value for 'irq' other than the ones stated above is invalid | ||
366 | and incurs unexpected behavior. | ||
367 | |||
339 | 4.16 KVM_DEBUG_GUEST | 368 | 4.16 KVM_DEBUG_GUEST |
340 | 369 | ||
341 | Capability: basic | 370 | Capability: basic |
@@ -1013,8 +1042,9 @@ number is just right, the 'nent' field is adjusted to the number of valid | |||
1013 | entries in the 'entries' array, which is then filled. | 1042 | entries in the 'entries' array, which is then filled. |
1014 | 1043 | ||
1015 | The entries returned are the host cpuid as returned by the cpuid instruction, | 1044 | The entries returned are the host cpuid as returned by the cpuid instruction, |
1016 | with unknown or unsupported features masked out. The fields in each entry | 1045 | with unknown or unsupported features masked out. Some features (for example, |
1017 | are defined as follows: | 1046 | x2apic), may not be present in the host cpu, but are exposed by kvm if it can |
1047 | emulate them efficiently. The fields in each entry are defined as follows: | ||
1018 | 1048 | ||
1019 | function: the eax value used to obtain the entry | 1049 | function: the eax value used to obtain the entry |
1020 | index: the ecx value used to obtain the entry (for entries that are | 1050 | index: the ecx value used to obtain the entry (for entries that are |
@@ -1032,6 +1062,29 @@ are defined as follows: | |||
1032 | eax, ebx, ecx, edx: the values returned by the cpuid instruction for | 1062 | eax, ebx, ecx, edx: the values returned by the cpuid instruction for |
1033 | this function/index combination | 1063 | this function/index combination |
1034 | 1064 | ||
1065 | 4.46 KVM_PPC_GET_PVINFO | ||
1066 | |||
1067 | Capability: KVM_CAP_PPC_GET_PVINFO | ||
1068 | Architectures: ppc | ||
1069 | Type: vm ioctl | ||
1070 | Parameters: struct kvm_ppc_pvinfo (out) | ||
1071 | Returns: 0 on success, !0 on error | ||
1072 | |||
1073 | struct kvm_ppc_pvinfo { | ||
1074 | __u32 flags; | ||
1075 | __u32 hcall[4]; | ||
1076 | __u8 pad[108]; | ||
1077 | }; | ||
1078 | |||
1079 | This ioctl fetches PV specific information that needs to be passed to the guest | ||
1080 | using the device tree or other means from vm context. | ||
1081 | |||
1082 | For now the only implemented piece of information distributed here is an array | ||
1083 | of 4 instructions that make up a hypercall. | ||
1084 | |||
1085 | If any additional field gets added to this structure later on, a bit for that | ||
1086 | additional piece of information will be set in the flags bitmap. | ||
1087 | |||
1035 | 5. The kvm_run structure | 1088 | 5. The kvm_run structure |
1036 | 1089 | ||
1037 | Application code obtains a pointer to the kvm_run structure by | 1090 | Application code obtains a pointer to the kvm_run structure by |
diff --git a/Documentation/kvm/ppc-pv.txt b/Documentation/kvm/ppc-pv.txt new file mode 100644 index 000000000000..a7f2244b3be9 --- /dev/null +++ b/Documentation/kvm/ppc-pv.txt | |||
@@ -0,0 +1,196 @@ | |||
1 | The PPC KVM paravirtual interface | ||
2 | ================================= | ||
3 | |||
4 | The basic execution principle by which KVM on PowerPC works is to run all kernel | ||
5 | space code in PR=1 which is user space. This way we trap all privileged | ||
6 | instructions and can emulate them accordingly. | ||
7 | |||
8 | Unfortunately that is also the downfall. There are quite some privileged | ||
9 | instructions that needlessly return us to the hypervisor even though they | ||
10 | could be handled differently. | ||
11 | |||
12 | This is what the PPC PV interface helps with. It takes privileged instructions | ||
13 | and transforms them into unprivileged ones with some help from the hypervisor. | ||
14 | This cuts down virtualization costs by about 50% on some of my benchmarks. | ||
15 | |||
16 | The code for that interface can be found in arch/powerpc/kernel/kvm* | ||
17 | |||
18 | Querying for existence | ||
19 | ====================== | ||
20 | |||
21 | To find out if we're running on KVM or not, we leverage the device tree. When | ||
22 | Linux is running on KVM, a node /hypervisor exists. That node contains a | ||
23 | compatible property with the value "linux,kvm". | ||
24 | |||
25 | Once you have determined you're running under a PV capable KVM, you can now use | ||
26 | hypercalls as described below. | ||
27 | |||
28 | KVM hypercalls | ||
29 | ============== | ||
30 | |||
31 | Inside the device tree's /hypervisor node there's a property called | ||
32 | 'hypercall-instructions'. This property contains at most 4 opcodes that make | ||
33 | up the hypercall. To call a hypercall, just call these instructions. | ||
34 | |||
35 | The parameters are as follows: | ||
36 | |||
37 | Register IN OUT | ||
38 | |||
39 | r0 - volatile | ||
40 | r3 1st parameter Return code | ||
41 | r4 2nd parameter 1st output value | ||
42 | r5 3rd parameter 2nd output value | ||
43 | r6 4th parameter 3rd output value | ||
44 | r7 5th parameter 4th output value | ||
45 | r8 6th parameter 5th output value | ||
46 | r9 7th parameter 6th output value | ||
47 | r10 8th parameter 7th output value | ||
48 | r11 hypercall number 8th output value | ||
49 | r12 - volatile | ||
50 | |||
51 | Hypercall definitions are shared in generic code, so the same hypercall numbers | ||
52 | apply for x86 and powerpc alike with the exception that each KVM hypercall | ||
53 | also needs to be ORed with the KVM vendor code which is (42 << 16). | ||
54 | |||
55 | Return codes can be as follows: | ||
56 | |||
57 | Code Meaning | ||
58 | |||
59 | 0 Success | ||
60 | 12 Hypercall not implemented | ||
61 | <0 Error | ||
62 | |||
63 | The magic page | ||
64 | ============== | ||
65 | |||
66 | To enable communication between the hypervisor and guest there is a new shared | ||
67 | page that contains parts of supervisor visible register state. The guest can | ||
68 | map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. | ||
69 | |||
70 | With this hypercall issued the guest always gets the magic page mapped at the | ||
71 | desired location in effective and physical address space. For now, we always | ||
72 | map the page to -4096. This way we can access it using absolute load and store | ||
73 | functions. The following instruction reads the first field of the magic page: | ||
74 | |||
75 | ld rX, -4096(0) | ||
76 | |||
77 | The interface is designed to be extensible should there be need later to add | ||
78 | additional registers to the magic page. If you add fields to the magic page, | ||
79 | also define a new hypercall feature to indicate that the host can give you more | ||
80 | registers. Only if the host supports the additional features, make use of them. | ||
81 | |||
82 | The magic page has the following layout as described in | ||
83 | arch/powerpc/include/asm/kvm_para.h: | ||
84 | |||
85 | struct kvm_vcpu_arch_shared { | ||
86 | __u64 scratch1; | ||
87 | __u64 scratch2; | ||
88 | __u64 scratch3; | ||
89 | __u64 critical; /* Guest may not get interrupts if == r1 */ | ||
90 | __u64 sprg0; | ||
91 | __u64 sprg1; | ||
92 | __u64 sprg2; | ||
93 | __u64 sprg3; | ||
94 | __u64 srr0; | ||
95 | __u64 srr1; | ||
96 | __u64 dar; | ||
97 | __u64 msr; | ||
98 | __u32 dsisr; | ||
99 | __u32 int_pending; /* Tells the guest if we have an interrupt */ | ||
100 | }; | ||
101 | |||
102 | Additions to the page must only occur at the end. Struct fields are always 32 | ||
103 | or 64 bit aligned, depending on them being 32 or 64 bit wide respectively. | ||
104 | |||
105 | Magic page features | ||
106 | =================== | ||
107 | |||
108 | When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE, | ||
109 | a second return value is passed to the guest. This second return value contains | ||
110 | a bitmap of available features inside the magic page. | ||
111 | |||
112 | The following enhancements to the magic page are currently available: | ||
113 | |||
114 | KVM_MAGIC_FEAT_SR Maps SR registers r/w in the magic page | ||
115 | |||
116 | For enhanced features in the magic page, please check for the existence of the | ||
117 | feature before using them! | ||
118 | |||
119 | MSR bits | ||
120 | ======== | ||
121 | |||
122 | The MSR contains bits that require hypervisor intervention and bits that do | ||
123 | not require direct hypervisor intervention because they only get interpreted | ||
124 | when entering the guest or don't have any impact on the hypervisor's behavior. | ||
125 | |||
126 | The following bits are safe to be set inside the guest: | ||
127 | |||
128 | MSR_EE | ||
129 | MSR_RI | ||
130 | MSR_CR | ||
131 | MSR_ME | ||
132 | |||
133 | If any other bit changes in the MSR, please still use mtmsr(d). | ||
134 | |||
135 | Patched instructions | ||
136 | ==================== | ||
137 | |||
138 | The "ld" and "std" instructions are transformed to "lwz" and "stw" instructions | ||
139 | respectively on 32 bit systems with an added offset of 4 to accommodate for big | ||
140 | endianness. | ||
141 | |||
142 | The following is a list of mappings the Linux kernel performs when running as | ||
143 | guest. Implementing any of those mappings is optional, as the instruction traps | ||
144 | also act on the shared page. So calling privileged instructions still works as | ||
145 | before. | ||
146 | |||
147 | From To | ||
148 | ==== == | ||
149 | |||
150 | mfmsr rX ld rX, magic_page->msr | ||
151 | mfsprg rX, 0 ld rX, magic_page->sprg0 | ||
152 | mfsprg rX, 1 ld rX, magic_page->sprg1 | ||
153 | mfsprg rX, 2 ld rX, magic_page->sprg2 | ||
154 | mfsprg rX, 3 ld rX, magic_page->sprg3 | ||
155 | mfsrr0 rX ld rX, magic_page->srr0 | ||
156 | mfsrr1 rX ld rX, magic_page->srr1 | ||
157 | mfdar rX ld rX, magic_page->dar | ||
158 | mfdsisr rX lwz rX, magic_page->dsisr | ||
159 | |||
160 | mtmsr rX std rX, magic_page->msr | ||
161 | mtsprg 0, rX std rX, magic_page->sprg0 | ||
162 | mtsprg 1, rX std rX, magic_page->sprg1 | ||
163 | mtsprg 2, rX std rX, magic_page->sprg2 | ||
164 | mtsprg 3, rX std rX, magic_page->sprg3 | ||
165 | mtsrr0 rX std rX, magic_page->srr0 | ||
166 | mtsrr1 rX std rX, magic_page->srr1 | ||
167 | mtdar rX std rX, magic_page->dar | ||
168 | mtdsisr rX stw rX, magic_page->dsisr | ||
169 | |||
170 | tlbsync nop | ||
171 | |||
172 | mtmsrd rX, 0 b <special mtmsr section> | ||
173 | mtmsr rX b <special mtmsr section> | ||
174 | |||
175 | mtmsrd rX, 1 b <special mtmsrd section> | ||
176 | |||
177 | [Book3S only] | ||
178 | mtsrin rX, rY b <special mtsrin section> | ||
179 | |||
180 | [BookE only] | ||
181 | wrteei [0|1] b <special wrteei section> | ||
182 | |||
183 | |||
184 | Some instructions require more logic to determine what's going on than a load | ||
185 | or store instruction can deliver. To enable patching of those, we keep some | ||
186 | RAM around where we can live translate instructions to. What happens is the | ||
187 | following: | ||
188 | |||
189 | 1) copy emulation code to memory | ||
190 | 2) patch that code to fit the emulated instruction | ||
191 | 3) patch that code to return to the original pc + 4 | ||
192 | 4) patch the original instruction to branch to the new code | ||
193 | |||
194 | That way we can inject an arbitrary amount of code as replacement for a single | ||
195 | instruction. This allows us to check for pending interrupts when setting EE=1 | ||
196 | for example. | ||
diff --git a/Documentation/kvm/timekeeping.txt b/Documentation/kvm/timekeeping.txt new file mode 100644 index 000000000000..0c5033a58c9e --- /dev/null +++ b/Documentation/kvm/timekeeping.txt | |||
@@ -0,0 +1,612 @@ | |||
1 | |||
2 | Timekeeping Virtualization for X86-Based Architectures | ||
3 | |||
4 | Zachary Amsden <zamsden@redhat.com> | ||
5 | Copyright (c) 2010, Red Hat. All rights reserved. | ||
6 | |||
7 | 1) Overview | ||
8 | 2) Timing Devices | ||
9 | 3) TSC Hardware | ||
10 | 4) Virtualization Problems | ||
11 | |||
12 | ========================================================================= | ||
13 | |||
14 | 1) Overview | ||
15 | |||
16 | One of the most complicated parts of the X86 platform, and specifically, | ||
17 | the virtualization of this platform is the plethora of timing devices available | ||
18 | and the complexity of emulating those devices. In addition, virtualization of | ||
19 | time introduces a new set of challenges because it introduces a multiplexed | ||
20 | division of time beyond the control of the guest CPU. | ||
21 | |||
22 | First, we will describe the various timekeeping hardware available, then | ||
23 | present some of the problems which arise and solutions available, giving | ||
24 | specific recommendations for certain classes of KVM guests. | ||
25 | |||
26 | The purpose of this document is to collect data and information relevant to | ||
27 | timekeeping which may be difficult to find elsewhere, specifically, | ||
28 | information relevant to KVM and hardware-based virtualization. | ||
29 | |||
30 | ========================================================================= | ||
31 | |||
32 | 2) Timing Devices | ||
33 | |||
34 | First we discuss the basic hardware devices available. TSC and the related | ||
35 | KVM clock are special enough to warrant a full exposition and are described in | ||
36 | the following section. | ||
37 | |||
38 | 2.1) i8254 - PIT | ||
39 | |||
40 | One of the first timer devices available is the programmable interval timer, | ||
41 | or PIT. The PIT has a fixed frequency 1.193182 MHz base clock and three | ||
42 | channels which can be programmed to deliver periodic or one-shot interrupts. | ||
43 | These three channels can be configured in different modes and have individual | ||
44 | counters. Channels 1 and 2 were not available for general use in the original | ||
45 | IBM PC, and historically were connected to control RAM refresh and the PC | ||
46 | speaker. Now the PIT is typically integrated as part of an emulated chipset | ||
47 | and a separate physical PIT is not used. | ||
48 | |||
49 | The PIT uses I/O ports 0x40 - 0x43. Access to the 16-bit counters is done | ||
50 | using single or multiple byte access to the I/O ports. There are 6 modes | ||
51 | available, but not all modes are available to all timers, as only timer 2 | ||
52 | has a connected gate input, required for modes 1 and 5. The gate line is | ||
53 | controlled by port 61h, bit 0, as illustrated in the following diagram. | ||
54 | |||
55 | -------------- ---------------- | ||
56 | | | | | | ||
57 | | 1.1932 MHz |---------->| CLOCK OUT | ---------> IRQ 0 | ||
58 | | Clock | | | | | ||
59 | -------------- | +->| GATE TIMER 0 | | ||
60 | | ---------------- | ||
61 | | | ||
62 | | ---------------- | ||
63 | | | | | ||
64 | |------>| CLOCK OUT | ---------> 66.3 KHZ DRAM | ||
65 | | | | (aka /dev/null) | ||
66 | | +->| GATE TIMER 1 | | ||
67 | | ---------------- | ||
68 | | | ||
69 | | ---------------- | ||
70 | | | | | ||
71 | |------>| CLOCK OUT | ---------> Port 61h, bit 5 | ||
72 | | | | | ||
73 | Port 61h, bit 0 ---------->| GATE TIMER 2 | \_.---- ____ | ||
74 | ---------------- _| )--|LPF|---Speaker | ||
75 | / *---- \___/ | ||
76 | Port 61h, bit 1 -----------------------------------/ | ||
77 | |||
78 | The timer modes are now described. | ||
79 | |||
80 | Mode 0: Single Timeout. This is a one-shot software timeout that counts down | ||
81 | when the gate is high (always true for timers 0 and 1). When the count | ||
82 | reaches zero, the output goes high. | ||
83 | |||
84 | Mode 1: Triggered One-shot. The output is initially set high. When the gate | ||
85 | line is set high, a countdown is initiated (which does not stop if the gate is | ||
86 | lowered), during which the output is set low. When the count reaches zero, | ||
87 | the output goes high. | ||
88 | |||
89 | Mode 2: Rate Generator. The output is initially set high. When the countdown | ||
90 | reaches 1, the output goes low for one count and then returns high. The value | ||
91 | is reloaded and the countdown automatically resumes. If the gate line goes | ||
92 | low, the count is halted. If the output is low when the gate is lowered, the | ||
93 | output automatically goes high (this only affects timer 2). | ||
94 | |||
95 | Mode 3: Square Wave. This generates a high / low square wave. The count | ||
96 | determines the length of the pulse, which alternates between high and low | ||
97 | when zero is reached. The count only proceeds when gate is high and is | ||
98 | automatically reloaded on reaching zero. The count is decremented twice at | ||
99 | each clock to generate a full high / low cycle at the full periodic rate. | ||
100 | If the count is even, the clock remains high for N/2 counts and low for N/2 | ||
101 | counts; if the count is odd, the clock is high for (N+1)/2 counts and low | ||
102 | for (N-1)/2 counts. Only even values are latched by the counter, so odd | ||
103 | values are not observed when reading. This is the intended mode for timer 2, | ||
104 | which generates sine-like tones by low-pass filtering the square wave output. | ||
105 | |||
106 | Mode 4: Software Strobe. After programming this mode and loading the counter, | ||
107 | the output remains high until the counter reaches zero. Then the output | ||
108 | goes low for 1 clock cycle and returns high. The counter is not reloaded. | ||
109 | Counting only occurs when gate is high. | ||
110 | |||
111 | Mode 5: Hardware Strobe. After programming and loading the counter, the | ||
112 | output remains high. When the gate is raised, a countdown is initiated | ||
113 | (which does not stop if the gate is lowered). When the counter reaches zero, | ||
114 | the output goes low for 1 clock cycle and then returns high. The counter is | ||
115 | not reloaded. | ||
116 | |||
117 | In addition to normal binary counting, the PIT supports BCD counting. The | ||
118 | command port, 0x43 is used to set the counter and mode for each of the three | ||
119 | timers. | ||
120 | |||
121 | PIT commands, issued to port 0x43, using the following bit encoding: | ||
122 | |||
123 | Bit 7-4: Command (See table below) | ||
124 | Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined) | ||
125 | Bit 0 : Binary (0) / BCD (1) | ||
126 | |||
127 | Command table: | ||
128 | |||
129 | 0000 - Latch Timer 0 count for port 0x40 | ||
130 | sample and hold the count to be read in port 0x40; | ||
131 | additional commands ignored until counter is read; | ||
132 | mode bits ignored. | ||
133 | |||
134 | 0001 - Set Timer 0 LSB mode for port 0x40 | ||
135 | set timer to read LSB only and force MSB to zero; | ||
136 | mode bits set timer mode | ||
137 | |||
138 | 0010 - Set Timer 0 MSB mode for port 0x40 | ||
139 | set timer to read MSB only and force LSB to zero; | ||
140 | mode bits set timer mode | ||
141 | |||
142 | 0011 - Set Timer 0 16-bit mode for port 0x40 | ||
143 | set timer to read / write LSB first, then MSB; | ||
144 | mode bits set timer mode | ||
145 | |||
146 | 0100 - Latch Timer 1 count for port 0x41 - as described above | ||
147 | 0101 - Set Timer 1 LSB mode for port 0x41 - as described above | ||
148 | 0110 - Set Timer 1 MSB mode for port 0x41 - as described above | ||
149 | 0111 - Set Timer 1 16-bit mode for port 0x41 - as described above | ||
150 | |||
151 | 1000 - Latch Timer 2 count for port 0x42 - as described above | ||
152 | 1001 - Set Timer 2 LSB mode for port 0x42 - as described above | ||
153 | 1010 - Set Timer 2 MSB mode for port 0x42 - as described above | ||
154 | 1011 - Set Timer 2 16-bit mode for port 0x42 - as described above | ||
155 | |||
156 | 1101 - General counter latch | ||
157 | Latch combination of counters into corresponding ports | ||
158 | Bit 3 = Counter 2 | ||
159 | Bit 2 = Counter 1 | ||
160 | Bit 1 = Counter 0 | ||
161 | Bit 0 = Unused | ||
162 | |||
163 | 1110 - Latch timer status | ||
164 | Latch combination of counter mode into corresponding ports | ||
165 | Bit 3 = Counter 2 | ||
166 | Bit 2 = Counter 1 | ||
167 | Bit 1 = Counter 0 | ||
168 | |||
169 | The output of ports 0x40-0x42 following this command will be: | ||
170 | |||
171 | Bit 7 = Output pin | ||
172 | Bit 6 = Count loaded (0 if timer has expired) | ||
173 | Bit 5-4 = Read / Write mode | ||
174 | 01 = LSB only | ||
175 | 10 = MSB only | ||
176 | 11 = LSB / MSB (16-bit) | ||
177 | Bit 3-1 = Mode | ||
178 | Bit 0 = Binary (0) / BCD mode (1) | ||
179 | |||
180 | 2.2) RTC | ||
181 | |||
182 | The second device which was available in the original PC was the MC146818 real | ||
183 | time clock. The original device is now obsolete, and usually emulated by the | ||
184 | system chipset, sometimes by an HPET and some frankenstein IRQ routing. | ||
185 | |||
186 | The RTC is accessed through CMOS variables, which use an index register to | ||
187 | control which bytes are read. Since there is only one index register, read | ||
188 | of the CMOS and read of the RTC require lock protection (in addition, it is | ||
189 | dangerous to allow userspace utilities such as hwclock to have direct RTC | ||
190 | access, as they could corrupt kernel reads and writes of CMOS memory). | ||
191 | |||
192 | The RTC generates an interrupt which is usually routed to IRQ 8. The interrupt | ||
193 | can function as a periodic timer, an additional once a day alarm, and can issue | ||
194 | interrupts after an update of the CMOS registers by the MC146818 is complete. | ||
195 | The type of interrupt is signalled in the RTC status registers. | ||
196 | |||
197 | The RTC will update the current time fields by battery power even while the | ||
198 | system is off. The current time fields should not be read while an update is | ||
199 | in progress, as indicated in the status register. | ||
200 | |||
201 | The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be | ||
202 | programmed to a 32kHz divider if the RTC is to count seconds. | ||
203 | |||
204 | This is the RAM map originally used for the RTC/CMOS: | ||
205 | |||
206 | Location Size Description | ||
207 | ------------------------------------------ | ||
208 | 00h byte Current second (BCD) | ||
209 | 01h byte Seconds alarm (BCD) | ||
210 | 02h byte Current minute (BCD) | ||
211 | 03h byte Minutes alarm (BCD) | ||
212 | 04h byte Current hour (BCD) | ||
213 | 05h byte Hours alarm (BCD) | ||
214 | 06h byte Current day of week (BCD) | ||
215 | 07h byte Current day of month (BCD) | ||
216 | 08h byte Current month (BCD) | ||
217 | 09h byte Current year (BCD) | ||
218 | 0Ah byte Register A | ||
219 | bit 7 = Update in progress | ||
220 | bit 6-4 = Divider for clock | ||
221 | 000 = 4.194 MHz | ||
222 | 001 = 1.049 MHz | ||
223 | 010 = 32 kHz | ||
224 | 10X = test modes | ||
225 | 110 = reset / disable | ||
226 | 111 = reset / disable | ||
227 | bit 3-0 = Rate selection for periodic interrupt | ||
228 | 0000 = periodic timer disabled | ||
229 | 0001 = 3.90625 mS | ||
230 | 0010 = 7.8125 mS | ||
231 | 0011 = .122070 mS | ||
232 | 0100 = .244141 mS | ||
233 | ... | ||
234 | 1101 = 125 mS | ||
235 | 1110 = 250 mS | ||
236 | 1111 = 500 mS | ||
237 | 0Bh byte Register B | ||
238 | bit 7 = Run (0) / Halt (1) | ||
239 | bit 6 = Periodic interrupt enable | ||
240 | bit 5 = Alarm interrupt enable | ||
241 | bit 4 = Update-ended interrupt enable | ||
242 | bit 3 = Square wave interrupt enable | ||
243 | bit 2 = BCD calendar (0) / Binary (1) | ||
244 | bit 1 = 12-hour mode (0) / 24-hour mode (1) | ||
245 | bit 0 = 0 (DST off) / 1 (DST enabled) | ||
246 | 0Ch byte Register C (read only) | ||
247 | bit 7 = interrupt request flag (IRQF) | ||
248 | bit 6 = periodic interrupt flag (PF) | ||
249 | bit 5 = alarm interrupt flag (AF) | ||
250 | bit 4 = update interrupt flag (UF) | ||
251 | bit 3-0 = reserved | ||
252 | 0Dh byte Register D (read only) | ||
253 | bit 7 = RTC has power | ||
254 | bit 6-0 = reserved | ||
255 | 32h byte Current century BCD (*) | ||
256 | (*) location vendor specific and now determined from ACPI global tables | ||
257 | |||
258 | 2.3) APIC | ||
259 | |||
260 | On Pentium and later processors, an on-board timer is available to each CPU | ||
261 | as part of the Advanced Programmable Interrupt Controller. The APIC is | ||
262 | accessed through memory-mapped registers and provides interrupt service to each | ||
263 | CPU, used for IPIs and local timer interrupts. | ||
264 | |||
265 | Although in theory the APIC is a safe and stable source for local interrupts, | ||
266 | in practice, many bugs and glitches have occurred due to the special nature of | ||
267 | the APIC CPU-local memory-mapped hardware. Beware that CPU errata may affect | ||
268 | the use of the APIC and that workarounds may be required. In addition, some of | ||
269 | these workarounds pose unique constraints for virtualization - requiring either | ||
270 | extra overhead incurred from extra reads of memory-mapped I/O or additional | ||
271 | functionality that may be more computationally expensive to implement. | ||
272 | |||
273 | Since the APIC is documented quite well in the Intel and AMD manuals, we will | ||
274 | avoid repetition of the detail here. It should be pointed out that the APIC | ||
275 | timer is programmed through the LVT (local vector table) timer register, is capable | ||
276 | of one-shot or periodic operation, and is based on the bus clock divided down | ||
277 | by the programmable divider register. | ||
278 | |||
279 | 2.4) HPET | ||
280 | |||
281 | HPET is quite complex, and was originally intended to replace the PIT / RTC | ||
282 | support of the X86 PC. It remains to be seen whether that will be the case, as | ||
283 | the de facto standard of PC hardware is to emulate these older devices. Some | ||
284 | systems designated as legacy free may support only the HPET as a hardware timer | ||
285 | device. | ||
286 | |||
287 | The HPET spec is rather loose and vague, requiring at least 3 hardware timers, | ||
288 | but allowing implementation freedom to support many more. It also imposes no | ||
289 | fixed rate on the timer frequency, but does impose some extremal values on | ||
290 | frequency, error and slew. | ||
291 | |||
292 | In general, the HPET is recommended as a high precision (compared to PIT / RTC) | ||
293 | time source which is independent of local variation (as there is only one HPET | ||
294 | in any given system). The HPET is also memory-mapped, and its presence is | ||
295 | indicated through ACPI tables by the BIOS. | ||
296 | |||
297 | Detailed specification of the HPET is beyond the current scope of this | ||
298 | document, as it is also very well documented elsewhere. | ||
299 | |||
300 | 2.5) Offboard Timers | ||
301 | |||
302 | Several cards, both proprietary (watchdog boards) and commonplace (e1000) have | ||
303 | timing chips built into the cards which may have registers which are accessible | ||
304 | to kernel or user drivers. To the author's knowledge, using these to generate | ||
305 | a clocksource for a Linux or other kernel has not yet been attempted and is in | ||
306 | general frowned upon as not playing by the agreed rules of the game. Such a | ||
307 | timer device would require additional support to be virtualized properly and is | ||
308 | not considered important at this time as no known operating system does this. | ||
309 | |||
310 | ========================================================================= | ||
311 | |||
312 | 3) TSC Hardware | ||
313 | |||
314 | The TSC or time stamp counter is relatively simple in theory; it counts | ||
315 | instruction cycles issued by the processor, which can be used as a measure of | ||
316 | time. In practice, due to a number of problems, it is the most complicated | ||
317 | timekeeping device to use. | ||
318 | |||
319 | The TSC is represented internally as a 64-bit MSR which can be read with the | ||
320 | RDMSR, RDTSC, or RDTSCP (when available) instructions. In the past, hardware | ||
321 | limitations made it possible to write the TSC, but generally on old hardware it | ||
322 | was only possible to write the low 32-bits of the 64-bit counter, and the upper | ||
323 | 32-bits of the counter were cleared. Now, however, on Intel processors family | ||
324 | 0Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction | ||
325 | has been lifted and all 64-bits are writable. On AMD systems, the ability to | ||
326 | write the TSC MSR is not an architectural guarantee. | ||
327 | |||
328 | The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by | ||
329 | means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access. | ||
330 | |||
331 | Some vendors have implemented an additional instruction, RDTSCP, which returns | ||
332 | atomically not just the TSC, but an indicator which corresponds to the | ||
333 | processor number. This can be used to index into an array of TSC variables to | ||
334 | determine offset information in SMP systems where TSCs are not synchronized. | ||
335 | The presence of this instruction must be determined by consulting CPUID feature | ||
336 | bits. | ||
337 | |||
338 | Both VMX and SVM provide extension fields in the virtualization hardware which | ||
339 | allow the guest visible TSC to be offset by a constant. Newer implementations | ||
340 | promise to allow the TSC to additionally be scaled, but this hardware is not | ||
341 | yet widely available. | ||
342 | |||
343 | 3.1) TSC synchronization | ||
344 | |||
345 | The TSC is a CPU-local clock in most implementations. This means, on SMP | ||
346 | platforms, the TSCs of different CPUs may start at different times depending | ||
347 | on when the CPUs are powered on. Generally, CPUs on the same die will share | ||
348 | the same clock, however, this is not always the case. | ||
349 | |||
350 | The BIOS may attempt to resynchronize the TSCs during the poweron process and | ||
351 | the operating system or other system software may attempt to do this as well. | ||
352 | Several hardware limitations make the problem worse - if it is not possible to | ||
353 | write the full 64-bits of the TSC, it may be impossible to match the TSC in | ||
354 | newly arriving CPUs to that of the rest of the system, resulting in | ||
355 | unsynchronized TSCs. This may be done by BIOS or system software, but in | ||
356 | practice, getting a perfectly synchronized TSC will not be possible unless all | ||
357 | values are read from the same clock, which generally only is possible on single | ||
358 | socket systems or those with special hardware support. | ||
359 | |||
360 | 3.2) TSC and CPU hotplug | ||
361 | |||
362 | As touched on already, CPUs which arrive later than the boot time of the system | ||
363 | may not have a TSC value that is synchronized with the rest of the system. | ||
364 | Either system software, BIOS, or SMM code may actually try to establish the TSC | ||
365 | to a value matching the rest of the system, but a perfect match is usually not | ||
366 | a guarantee. This can have the effect of bringing a system from a state where | ||
367 | TSC is synchronized back to a state where TSC synchronization flaws, however | ||
368 | small, may be exposed to the OS and any virtualization environment. | ||
369 | |||
370 | 3.3) TSC and multi-socket / NUMA | ||
371 | |||
372 | Multi-socket systems, especially large multi-socket systems are likely to have | ||
373 | individual clocksources rather than a single, universally distributed clock. | ||
374 | Since these clocks are driven by different crystals, they will not have | ||
375 | perfectly matched frequency, and temperature and electrical variations will | ||
376 | cause the CPU clocks, and thus the TSCs to drift over time. Depending on the | ||
377 | exact clock and bus design, the drift may or may not be fixed in absolute | ||
378 | error, and may accumulate over time. | ||
379 | |||
380 | In addition, very large systems may deliberately slew the clocks of individual | ||
381 | cores. This technique, known as spread-spectrum clocking, reduces EMI at the | ||
382 | clock frequency and harmonics of it, which may be required to pass FCC | ||
383 | standards for telecommunications and computer equipment. | ||
384 | |||
385 | It is recommended not to trust the TSCs to remain synchronized on NUMA or | ||
386 | multiple socket systems for these reasons. | ||
387 | |||
388 | 3.4) TSC and C-states | ||
389 | |||
390 | C-states, or idling states of the processor, especially C1E and deeper sleep | ||
391 | states may be problematic for TSC as well. The TSC may stop advancing in such | ||
392 | a state, resulting in a TSC which is behind that of other CPUs when execution | ||
393 | is resumed. Such CPUs must be detected and flagged by the operating system | ||
394 | based on CPU and chipset identifications. | ||
395 | |||
396 | The TSC in such a case may be corrected by catching it up to a known external | ||
397 | clocksource. | ||
398 | |||
399 | 3.5) TSC frequency change / P-states | ||
400 | |||
401 | To make things slightly more interesting, some CPUs may change frequency. They | ||
402 | may or may not run the TSC at the same rate, and because the frequency change | ||
403 | may be staggered or slewed, at some points in time, the TSC rate may not be | ||
404 | known other than falling within a range of values. In this case, the TSC will | ||
405 | not be a stable time source, and must be calibrated against a known, stable, | ||
406 | external clock to be a usable source of time. | ||
407 | |||
408 | Whether the TSC runs at a constant rate or scales with the P-state is model | ||
409 | dependent and must be determined by inspecting CPUID, chipset or vendor | ||
410 | specific MSR fields. | ||
411 | |||
412 | In addition, some vendors have known bugs where the P-state is actually | ||
413 | compensated for properly during normal operation, but when the processor is | ||
414 | inactive, the P-state may be raised temporarily to service cache misses from | ||
415 | other processors. In such cases, the TSC on halted CPUs could advance faster | ||
416 | than that of non-halted processors. AMD Turion processors are known to have | ||
417 | this problem. | ||
418 | |||
419 | 3.6) TSC and STPCLK / T-states | ||
420 | |||
421 | External signals given to the processor may also have the effect of stopping | ||
422 | the TSC. This is typically done for thermal emergency power control to prevent | ||
423 | an overheating condition, and typically, there is no way to detect that this | ||
424 | condition has happened. | ||
425 | |||
426 | 3.7) TSC virtualization - VMX | ||
427 | |||
428 | VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP | ||
429 | instructions, which is enough for full virtualization of TSC in any manner. In | ||
430 | addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET | ||
431 | field specified in the VMCS. Special instructions must be used to read and | ||
432 | write the VMCS field. | ||
433 | |||
434 | 3.8) TSC virtualization - SVM | ||
435 | |||
436 | SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP | ||
437 | instructions, which is enough for full virtualization of TSC in any manner. In | ||
438 | addition, SVM allows passing through the host TSC plus an additional offset | ||
439 | field specified in the SVM control block. | ||
440 | |||
441 | 3.9) TSC feature bits in Linux | ||
442 | |||
443 | In summary, there is no way to guarantee the TSC remains in perfect | ||
444 | synchronization unless it is explicitly guaranteed by the architecture. Even | ||
445 | if so, the TSCs in multi-sockets or NUMA systems may still run independently | ||
446 | despite being locally consistent. | ||
447 | |||
448 | The following feature bits are used by Linux to signal various TSC attributes, | ||
449 | but they can only be taken to be meaningful for UP or single node systems. | ||
450 | |||
451 | X86_FEATURE_TSC : The TSC is available in hardware | ||
452 | X86_FEATURE_RDTSCP : The RDTSCP instruction is available | ||
453 | X86_FEATURE_CONSTANT_TSC : The TSC rate is unchanged with P-states | ||
454 | X86_FEATURE_NONSTOP_TSC : The TSC does not stop in C-states | ||
455 | X86_FEATURE_TSC_RELIABLE : TSC sync checks are skipped (VMware) | ||
456 | |||
457 | 4) Virtualization Problems | ||
458 | |||
459 | Timekeeping is especially problematic for virtualization because a number of | ||
460 | challenges arise. The most obvious problem is that time is now shared between | ||
461 | the host and, potentially, a number of virtual machines. Thus the virtual | ||
462 | operating system does not run with 100% usage of the CPU, despite the fact that | ||
463 | it may very well make that assumption. It may expect it to remain true to very | ||
464 | exacting bounds when interrupt sources are disabled, but in reality only its | ||
465 | virtual interrupt sources are disabled, and the machine may still be preempted | ||
466 | at any time. This causes problems as the passage of real time, the injection | ||
467 | of machine interrupts and the associated clock sources are no longer completely | ||
468 | synchronized with real time. | ||
469 | |||
470 | This same problem can occur on native hardware to a degree, as SMM mode may | ||
471 | steal cycles from the operating system on X86 systems when SMM mode is used by | ||
472 | the BIOS, but not in such an extreme fashion. However, the fact that SMM mode | ||
473 | may cause similar problems to virtualization makes it a good justification for | ||
474 | solving many of these problems on bare metal. | ||
475 | |||
476 | 4.1) Interrupt clocking | ||
477 | |||
478 | One of the most immediate problems that occurs with legacy operating systems | ||
479 | is that the system timekeeping routines are often designed to keep track of | ||
480 | time by counting periodic interrupts. These interrupts may come from the PIT | ||
481 | or the RTC, but the problem is the same: the host virtualization engine may not | ||
482 | be able to deliver the proper number of interrupts per second, and so guest | ||
483 | time may fall behind. This is especially problematic if a high interrupt rate | ||
484 | is selected, such as 1000 HZ, which is unfortunately the default for many Linux | ||
485 | guests. | ||
486 | |||
487 | There are three approaches to solving this problem; first, it may be possible | ||
488 | to simply ignore it. Guests which have a separate time source for tracking | ||
489 | 'wall clock' or 'real time' may not need any adjustment of their interrupts to | ||
490 | maintain proper time. If this is not sufficient, it may be necessary to inject | ||
491 | additional interrupts into the guest in order to increase the effective | ||
492 | interrupt rate. This approach leads to complications in extreme conditions, | ||
493 | where host load or guest lag is too much to compensate for, and thus another | ||
494 | solution to the problem has arisen: the guest may need to become aware of lost | ||
495 | ticks and compensate for them internally. Although promising in theory, the | ||
496 | implementation of this policy in Linux has been extremely error prone, and a | ||
497 | number of buggy variants of lost tick compensation are distributed across | ||
498 | commonly used Linux systems. | ||
499 | |||
500 | Windows uses periodic RTC clocking as a means of keeping time internally, and | ||
501 | thus requires interrupt slewing to keep proper time. It does use a low enough | ||
502 | rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in | ||
503 | practice. | ||
504 | |||
505 | 4.2) TSC sampling and serialization | ||
506 | |||
507 | As the highest precision time source available, the cycle counter of the CPU | ||
508 | has aroused much interest from developers. As explained above, this timer has | ||
509 | many problems unique to its nature as a local, potentially unstable and | ||
510 | potentially unsynchronized source. One issue which is not unique to the TSC, | ||
511 | but is highlighted because of its very precise nature is sampling delay. By | ||
512 | definition, the counter, once read, is already old. However, it is also | ||
513 | possible for the counter to be read ahead of the actual use of the result. | ||
514 | This is a consequence of the superscalar execution of the instruction stream, | ||
515 | which may execute instructions out of order. Such execution is called | ||
516 | non-serialized. Forcing serialized execution is necessary for precise | ||
517 | measurement with the TSC, and requires a serializing instruction, such as CPUID | ||
518 | or an MSR read. | ||
519 | |||
520 | Since CPUID may actually be virtualized by a trap and emulate mechanism, this | ||
521 | serialization can pose a performance issue for hardware virtualization. An | ||
522 | accurate time stamp counter reading may therefore not always be available, and | ||
523 | it may be necessary for an implementation to guard against "backwards" reads of | ||
524 | the TSC as seen from other CPUs, even in an otherwise perfectly synchronized | ||
525 | system. | ||
526 | |||
527 | 4.3) Timespec aliasing | ||
528 | |||
529 | Additionally, this lack of serialization from the TSC poses another challenge | ||
530 | when using results of the TSC when measured against another time source. As | ||
531 | the TSC is much higher precision, many possible values of the TSC may be read | ||
532 | while another clock is still expressing the same value. | ||
533 | |||
534 | That is, you may read (T,T+10) while external clock C maintains the same value. | ||
535 | Due to non-serialized reads, you may actually end up with a range which | ||
536 | fluctuates - from (T-1.. T+10). Thus, any time calculated from a TSC, but | ||
537 | calibrated against an external value may have a range of valid values. | ||
538 | Re-calibrating this computation may actually cause time, as computed after the | ||
539 | calibration, to go backwards, compared with time computed before the | ||
540 | calibration. | ||
541 | |||
542 | This problem is particularly pronounced with an internal time source in Linux, | ||
543 | the kernel time, which is expressed in the theoretically high resolution | ||
544 | timespec - but which advances in much larger granularity intervals, sometimes | ||
545 | at the rate of jiffies, and possibly in catchup modes, at a much larger step. | ||
546 | |||
547 | This aliasing requires care in the computation and recalibration of kvmclock | ||
548 | and any other values derived from TSC computation (such as TSC virtualization | ||
549 | itself). | ||
550 | |||
551 | 4.4) Migration | ||
552 | |||
553 | Migration of a virtual machine raises problems for timekeeping in two ways. | ||
554 | First, the migration itself may take time, during which interrupts cannot be | ||
555 | delivered, and after which, the guest time may need to be caught up. NTP may | ||
556 | be able to help to some degree here, as the clock correction required is | ||
557 | typically small enough to fall in the NTP-correctable window. | ||
558 | |||
559 | An additional concern is that timers based off the TSC (or HPET, if the raw bus | ||
560 | clock is exposed) may now be running at different rates, requiring compensation | ||
561 | in some way in the hypervisor by virtualizing these timers. In addition, | ||
562 | migrating to a faster machine may preclude the use of a passthrough TSC, as a | ||
563 | faster clock cannot be made visible to a guest without the potential of time | ||
564 | advancing faster than usual. A slower clock is less of a problem, as it can | ||
565 | always be caught up to the original rate. KVM clock avoids these problems by | ||
566 | simply storing multipliers and offsets against the TSC for the guest to convert | ||
567 | back into nanosecond resolution values. | ||
568 | |||
569 | 4.5) Scheduling | ||
570 | |||
571 | Since scheduling may be based on precise timing and firing of interrupts, the | ||
572 | scheduling algorithms of an operating system may be adversely affected by | ||
573 | virtualization. In theory, the effect is random and should be uniformly | ||
574 | distributed, but in contrived as well as real scenarios (guest device access, | ||
575 | causes of virtualization exits, possible context switch), this may not always | ||
576 | be the case. The effect of this has not been well studied. | ||
577 | |||
578 | In an attempt to work around this, several implementations have provided a | ||
579 | paravirtualized scheduler clock, which reveals the true amount of CPU time for | ||
580 | which a virtual machine has been running. | ||
581 | |||
582 | 4.6) Watchdogs | ||
583 | |||
584 | Watchdog timers, such as the lockup detector in Linux, may fire accidentally | ||
585 | when running under hardware virtualization due to timer interrupts being | ||
586 | delayed or misinterpretation of the passage of real time. Usually, these | ||
587 | warnings are spurious and can be ignored, but in some circumstances it may be | ||
588 | necessary to disable such detection. | ||
589 | |||
590 | 4.7) Delays and precision timing | ||
591 | |||
592 | Precise timing and delays may not be possible in a virtualized system. This | ||
593 | can happen if the system is controlling physical hardware, or issues delays to | ||
594 | compensate for slower I/O to and from devices. The first issue is not solvable | ||
595 | in general for a virtualized system; hardware control software can't be | ||
596 | adequately virtualized without a full real-time operating system, which would | ||
597 | require an RT aware virtualization platform. | ||
598 | |||
599 | The second issue may cause performance problems, but this is unlikely to be a | ||
600 | significant issue. In many cases these delays may be eliminated through | ||
601 | configuration or paravirtualization. | ||
602 | |||
603 | 4.8) Covert channels and leaks | ||
604 | |||
605 | In addition to the above problems, time information will inevitably leak to the | ||
606 | guest about the host in anything but a perfect implementation of virtualized | ||
607 | time. This may allow the guest to infer the presence of a hypervisor (as in a | ||
608 | red-pill type detection), and it may allow information to leak between guests | ||
609 | by using CPU utilization itself as a signalling channel. Preventing such | ||
610 | problems would require completely isolated virtual time which may not track | ||
611 | real time any longer. This may be useful in certain security or QA contexts, | ||
612 | but in general isn't recommended for real-world deployment scenarios. | ||