aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/crash.c3
-rw-r--r--arch/x86/kernel/kvm.c248
-rw-r--r--arch/x86/kernel/kvmclock.c187
-rw-r--r--arch/x86/kernel/reboot.c13
-rw-r--r--arch/x86/kernel/setup_32.c6
-rw-r--r--arch/x86/kernel/setup_64.c7
7 files changed, 463 insertions, 3 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 90e092d0af0c..fa19c3819540 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -80,6 +80,8 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
80obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 80obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
81 81
82obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o 82obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
83obj-$(CONFIG_KVM_GUEST) += kvm.o
84obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
83obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 85obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
84 86
85ifdef CONFIG_INPUT_PCSPKR 87ifdef CONFIG_INPUT_PCSPKR
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 2251d0ae9570..268553817909 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -25,6 +25,7 @@
25#include <asm/hpet.h> 25#include <asm/hpet.h>
26#include <linux/kdebug.h> 26#include <linux/kdebug.h>
27#include <asm/smp.h> 27#include <asm/smp.h>
28#include <asm/reboot.h>
28 29
29#include <mach_ipi.h> 30#include <mach_ipi.h>
30 31
@@ -117,7 +118,7 @@ static void nmi_shootdown_cpus(void)
117} 118}
118#endif 119#endif
119 120
120void machine_crash_shutdown(struct pt_regs *regs) 121void native_machine_crash_shutdown(struct pt_regs *regs)
121{ 122{
122 /* This function is only called after the system 123 /* This function is only called after the system
123 * has panicked or is otherwise in a critical state. 124 * has panicked or is otherwise in a critical state.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
new file mode 100644
index 000000000000..8b7a3cf37d2b
--- /dev/null
+++ b/arch/x86/kernel/kvm.c
@@ -0,0 +1,248 @@
1/*
2 * KVM paravirt_ops implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 *
18 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
19 * Copyright IBM Corporation, 2007
20 * Authors: Anthony Liguori <aliguori@us.ibm.com>
21 */
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25#include <linux/kvm_para.h>
26#include <linux/cpu.h>
27#include <linux/mm.h>
28#include <linux/highmem.h>
29#include <linux/hardirq.h>
30
31#define MMU_QUEUE_SIZE 1024
32
33struct kvm_para_state {
34 u8 mmu_queue[MMU_QUEUE_SIZE];
35 int mmu_queue_len;
36 enum paravirt_lazy_mode mode;
37};
38
39static DEFINE_PER_CPU(struct kvm_para_state, para_state);
40
41static struct kvm_para_state *kvm_para_state(void)
42{
43 return &per_cpu(para_state, raw_smp_processor_id());
44}
45
/*
 * I/O delay hook for KVM guests: deliberately empty, since no "IO delay"
 * is needed when running on KVM.
 */
static void kvm_io_delay(void)
{
	/* intentionally left blank */
}
52
53static void kvm_mmu_op(void *buffer, unsigned len)
54{
55 int r;
56 unsigned long a1, a2;
57
58 do {
59 a1 = __pa(buffer);
60 a2 = 0; /* on i386 __pa() always returns <4G */
61 r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
62 buffer += r;
63 len -= r;
64 } while (len);
65}
66
67static void mmu_queue_flush(struct kvm_para_state *state)
68{
69 if (state->mmu_queue_len) {
70 kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
71 state->mmu_queue_len = 0;
72 }
73}
74
75static void kvm_deferred_mmu_op(void *buffer, int len)
76{
77 struct kvm_para_state *state = kvm_para_state();
78
79 if (state->mode != PARAVIRT_LAZY_MMU) {
80 kvm_mmu_op(buffer, len);
81 return;
82 }
83 if (state->mmu_queue_len + len > sizeof state->mmu_queue)
84 mmu_queue_flush(state);
85 memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
86 state->mmu_queue_len += len;
87}
88
89static void kvm_mmu_write(void *dest, u64 val)
90{
91 __u64 pte_phys;
92 struct kvm_mmu_op_write_pte wpte;
93
94#ifdef CONFIG_HIGHPTE
95 struct page *page;
96 unsigned long dst = (unsigned long) dest;
97
98 page = kmap_atomic_to_page(dest);
99 pte_phys = page_to_pfn(page);
100 pte_phys <<= PAGE_SHIFT;
101 pte_phys += (dst & ~(PAGE_MASK));
102#else
103 pte_phys = (unsigned long)__pa(dest);
104#endif
105 wpte.header.op = KVM_MMU_OP_WRITE_PTE;
106 wpte.pte_val = val;
107 wpte.pte_phys = pte_phys;
108
109 kvm_deferred_mmu_op(&wpte, sizeof wpte);
110}
111
112/*
113 * We only need to hook operations that are MMU writes. We hook these so that
114 * we can use lazy MMU mode to batch these operations. We could probably
115 * improve the performance of the host code if we used some of the information
116 * here to simplify processing of batched writes.
117 */
118static void kvm_set_pte(pte_t *ptep, pte_t pte)
119{
120 kvm_mmu_write(ptep, pte_val(pte));
121}
122
123static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
124 pte_t *ptep, pte_t pte)
125{
126 kvm_mmu_write(ptep, pte_val(pte));
127}
128
129static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
130{
131 kvm_mmu_write(pmdp, pmd_val(pmd));
132}
133
134#if PAGETABLE_LEVELS >= 3
135#ifdef CONFIG_X86_PAE
136static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
137{
138 kvm_mmu_write(ptep, pte_val(pte));
139}
140
141static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
142 pte_t *ptep, pte_t pte)
143{
144 kvm_mmu_write(ptep, pte_val(pte));
145}
146
147static void kvm_pte_clear(struct mm_struct *mm,
148 unsigned long addr, pte_t *ptep)
149{
150 kvm_mmu_write(ptep, 0);
151}
152
153static void kvm_pmd_clear(pmd_t *pmdp)
154{
155 kvm_mmu_write(pmdp, 0);
156}
157#endif
158
159static void kvm_set_pud(pud_t *pudp, pud_t pud)
160{
161 kvm_mmu_write(pudp, pud_val(pud));
162}
163
164#if PAGETABLE_LEVELS == 4
165static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
166{
167 kvm_mmu_write(pgdp, pgd_val(pgd));
168}
169#endif
170#endif /* PAGETABLE_LEVELS >= 3 */
171
172static void kvm_flush_tlb(void)
173{
174 struct kvm_mmu_op_flush_tlb ftlb = {
175 .header.op = KVM_MMU_OP_FLUSH_TLB,
176 };
177
178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
179}
180
181static void kvm_release_pt(u32 pfn)
182{
183 struct kvm_mmu_op_release_pt rpt = {
184 .header.op = KVM_MMU_OP_RELEASE_PT,
185 .pt_phys = (u64)pfn << PAGE_SHIFT,
186 };
187
188 kvm_mmu_op(&rpt, sizeof rpt);
189}
190
191static void kvm_enter_lazy_mmu(void)
192{
193 struct kvm_para_state *state = kvm_para_state();
194
195 paravirt_enter_lazy_mmu();
196 state->mode = paravirt_get_lazy_mode();
197}
198
199static void kvm_leave_lazy_mmu(void)
200{
201 struct kvm_para_state *state = kvm_para_state();
202
203 mmu_queue_flush(state);
204 paravirt_leave_lazy(paravirt_get_lazy_mode());
205 state->mode = paravirt_get_lazy_mode();
206}
207
208static void paravirt_ops_setup(void)
209{
210 pv_info.name = "KVM";
211 pv_info.paravirt_enabled = 1;
212
213 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
214 pv_cpu_ops.io_delay = kvm_io_delay;
215
216 if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
217 pv_mmu_ops.set_pte = kvm_set_pte;
218 pv_mmu_ops.set_pte_at = kvm_set_pte_at;
219 pv_mmu_ops.set_pmd = kvm_set_pmd;
220#if PAGETABLE_LEVELS >= 3
221#ifdef CONFIG_X86_PAE
222 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
223 pv_mmu_ops.set_pte_present = kvm_set_pte_present;
224 pv_mmu_ops.pte_clear = kvm_pte_clear;
225 pv_mmu_ops.pmd_clear = kvm_pmd_clear;
226#endif
227 pv_mmu_ops.set_pud = kvm_set_pud;
228#if PAGETABLE_LEVELS == 4
229 pv_mmu_ops.set_pgd = kvm_set_pgd;
230#endif
231#endif
232 pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
233 pv_mmu_ops.release_pte = kvm_release_pt;
234 pv_mmu_ops.release_pmd = kvm_release_pt;
235 pv_mmu_ops.release_pud = kvm_release_pt;
236
237 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
238 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
239 }
240}
241
242void __init kvm_guest_init(void)
243{
244 if (!kvm_para_available())
245 return;
246
247 paravirt_ops_setup();
248}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 000000000000..ddee04043aeb
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,187 @@
1/* KVM paravirtual clock driver. A clocksource implementation
2 Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18
19#include <linux/clocksource.h>
20#include <linux/kvm_para.h>
21#include <asm/arch_hooks.h>
22#include <asm/msr.h>
23#include <asm/apic.h>
24#include <linux/percpu.h>
25#include <asm/reboot.h>
26
27#define KVM_SCALE 22
28
29static int kvmclock = 1;
30
31static int parse_no_kvmclock(char *arg)
32{
33 kvmclock = 0;
34 return 0;
35}
36early_param("no-kvmclock", parse_no_kvmclock);
37
38/* The hypervisor will put information about time periodically here */
39static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
40#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
41
42static inline u64 kvm_get_delta(u64 last_tsc)
43{
44 int cpu = smp_processor_id();
45 u64 delta = native_read_tsc() - last_tsc;
46 return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
47}
48
49static struct kvm_wall_clock wall_clock;
50static cycle_t kvm_clock_read(void);
51/*
52 * The wallclock is the time of day when we booted. Since then, some time may
53 * have elapsed since the hypervisor wrote the data. So we try to account for
54 * that with system time
55 */
56unsigned long kvm_get_wallclock(void)
57{
58 u32 wc_sec, wc_nsec;
59 u64 delta;
60 struct timespec ts;
61 int version, nsec;
62 int low, high;
63
64 low = (int)__pa(&wall_clock);
65 high = ((u64)__pa(&wall_clock) >> 32);
66
67 delta = kvm_clock_read();
68
69 native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
70 do {
71 version = wall_clock.wc_version;
72 rmb();
73 wc_sec = wall_clock.wc_sec;
74 wc_nsec = wall_clock.wc_nsec;
75 rmb();
76 } while ((wall_clock.wc_version != version) || (version & 1));
77
78 delta = kvm_clock_read() - delta;
79 delta += wc_nsec;
80 nsec = do_div(delta, NSEC_PER_SEC);
81 set_normalized_timespec(&ts, wc_sec + delta, nsec);
82 /*
83 * Of all mechanisms of time adjustment I've tested, this one
84 * was the champion!
85 */
86 return ts.tv_sec + 1;
87}
88
/*
 * set_wallclock hook: nothing is written anywhere; @now is ignored and
 * 0 is returned unconditionally.
 */
int kvm_set_wallclock(unsigned long now)
{
	(void)now;

	return 0;
}
93
94/*
95 * This is our read_clock function. The host puts a tsc timestamp each time
96 * it updates a new time. Without the tsc adjustment, we can have a situation
97 * in which a vcpu starts to run earlier (smaller system_time), but probes
98 * time later (compared to another vcpu), leading to backwards time
99 */
100static cycle_t kvm_clock_read(void)
101{
102 u64 last_tsc, now;
103 int cpu;
104
105 preempt_disable();
106 cpu = smp_processor_id();
107
108 last_tsc = get_clock(cpu, tsc_timestamp);
109 now = get_clock(cpu, system_time);
110
111 now += kvm_get_delta(last_tsc);
112 preempt_enable();
113
114 return now;
115}
116static struct clocksource kvm_clock = {
117 .name = "kvm-clock",
118 .read = kvm_clock_read,
119 .rating = 400,
120 .mask = CLOCKSOURCE_MASK(64),
121 .mult = 1 << KVM_SCALE,
122 .shift = KVM_SCALE,
123 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
124};
125
126static int kvm_register_clock(void)
127{
128 int cpu = smp_processor_id();
129 int low, high;
130 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
131 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
132
133 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
134}
135
/*
 * setup_secondary_clock hook: register the clock area for the calling
 * CPU, then continue with the native secondary APIC clock setup.
 */
static void kvm_setup_secondary_clock(void)
{
	int err = kvm_register_clock();

	/*
	 * Now that the first cpu already had this clocksource initialized,
	 * we shouldn't fail.
	 */
	WARN_ON(err);

	/* ok, done with our trickery, call native */
	setup_secondary_APIC_clock();
}
146
147/*
148 * After the clock is registered, the host will keep writing to the
149 * registered memory location. If the guest happens to shutdown, this memory
150 * won't be valid. In cases like kexec, in which you install a new kernel, this
151 * means a random memory location will be kept being written. So before any
152 * kind of shutdown from our side, we unregister the clock by writing anything
153 * that does not have the 'enable' bit set in the msr
154 */
155#ifdef CONFIG_KEXEC
156static void kvm_crash_shutdown(struct pt_regs *regs)
157{
158 native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
159 native_machine_crash_shutdown(regs);
160}
161#endif
162
163static void kvm_shutdown(void)
164{
165 native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
166 native_machine_shutdown();
167}
168
169void __init kvmclock_init(void)
170{
171 if (!kvm_para_available())
172 return;
173
174 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
175 if (kvm_register_clock())
176 return;
177 pv_time_ops.get_wallclock = kvm_get_wallclock;
178 pv_time_ops.set_wallclock = kvm_set_wallclock;
179 pv_time_ops.sched_clock = kvm_clock_read;
180 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
181 machine_ops.shutdown = kvm_shutdown;
182#ifdef CONFIG_KEXEC
183 machine_ops.crash_shutdown = kvm_crash_shutdown;
184#endif
185 clocksource_register(&kvm_clock);
186 }
187}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1791a751a772..a4a838306b2c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -399,7 +399,7 @@ static void native_machine_emergency_restart(void)
399 } 399 }
400} 400}
401 401
402static void native_machine_shutdown(void) 402void native_machine_shutdown(void)
403{ 403{
404 /* Stop the cpus and apics */ 404 /* Stop the cpus and apics */
405#ifdef CONFIG_SMP 405#ifdef CONFIG_SMP
@@ -470,7 +470,10 @@ struct machine_ops machine_ops = {
470 .shutdown = native_machine_shutdown, 470 .shutdown = native_machine_shutdown,
471 .emergency_restart = native_machine_emergency_restart, 471 .emergency_restart = native_machine_emergency_restart,
472 .restart = native_machine_restart, 472 .restart = native_machine_restart,
473 .halt = native_machine_halt 473 .halt = native_machine_halt,
474#ifdef CONFIG_KEXEC
475 .crash_shutdown = native_machine_crash_shutdown,
476#endif
474}; 477};
475 478
476void machine_power_off(void) 479void machine_power_off(void)
@@ -498,3 +501,9 @@ void machine_halt(void)
498 machine_ops.halt(); 501 machine_ops.halt();
499} 502}
500 503
504#ifdef CONFIG_KEXEC
505void machine_crash_shutdown(struct pt_regs *regs)
506{
507 machine_ops.crash_shutdown(regs);
508}
509#endif
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 44cc9b933932..2283422af794 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -47,6 +47,7 @@
47#include <linux/pfn.h> 47#include <linux/pfn.h>
48#include <linux/pci.h> 48#include <linux/pci.h>
49#include <linux/init_ohci1394_dma.h> 49#include <linux/init_ohci1394_dma.h>
50#include <linux/kvm_para.h>
50 51
51#include <video/edid.h> 52#include <video/edid.h>
52 53
@@ -820,6 +821,10 @@ void __init setup_arch(char **cmdline_p)
820 821
821 max_low_pfn = setup_memory(); 822 max_low_pfn = setup_memory();
822 823
824#ifdef CONFIG_KVM_CLOCK
825 kvmclock_init();
826#endif
827
823#ifdef CONFIG_VMI 828#ifdef CONFIG_VMI
824 /* 829 /*
825 * Must be after max_low_pfn is determined, and before kernel 830 * Must be after max_low_pfn is determined, and before kernel
@@ -827,6 +832,7 @@ void __init setup_arch(char **cmdline_p)
827 */ 832 */
828 vmi_init(); 833 vmi_init();
829#endif 834#endif
835 kvm_guest_init();
830 836
831 /* 837 /*
832 * NOTE: before this point _nobody_ is allowed to allocate 838 * NOTE: before this point _nobody_ is allowed to allocate
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 60e64c8eee92..a94fb959a87a 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -42,6 +42,7 @@
42#include <linux/ctype.h> 42#include <linux/ctype.h>
43#include <linux/uaccess.h> 43#include <linux/uaccess.h>
44#include <linux/init_ohci1394_dma.h> 44#include <linux/init_ohci1394_dma.h>
45#include <linux/kvm_para.h>
45 46
46#include <asm/mtrr.h> 47#include <asm/mtrr.h>
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
@@ -384,6 +385,10 @@ void __init setup_arch(char **cmdline_p)
384 385
385 io_delay_init(); 386 io_delay_init();
386 387
388#ifdef CONFIG_KVM_CLOCK
389 kvmclock_init();
390#endif
391
387#ifdef CONFIG_SMP 392#ifdef CONFIG_SMP
388 /* setup to use the early static init tables during kernel startup */ 393 /* setup to use the early static init tables during kernel startup */
389 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; 394 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
@@ -488,6 +493,8 @@ void __init setup_arch(char **cmdline_p)
488 init_apic_mappings(); 493 init_apic_mappings();
489 ioapic_init_mappings(); 494 ioapic_init_mappings();
490 495
496 kvm_guest_init();
497
491 /* 498 /*
492 * We trust e820 completely. No explicit ROM probing in memory. 499 * We trust e820 completely. No explicit ROM probing in memory.
493 */ 500 */