aboutsummaryrefslogtreecommitdiffstats
path: root/arch/i386
diff options
context:
space:
mode:
authorZachary Amsden <zach@vmware.com>2007-02-13 07:26:21 -0500
committerAndi Kleen <andi@basil.nowhere.org>2007-02-13 07:26:21 -0500
commit7ce0bcfd1667736f1293cff845139bbee53186de (patch)
tree54e70c1e0731d279dfe51efdc06db58d432575ff /arch/i386
parentae5da273fe3352febd38658d8d34484cbcfb3423 (diff)
[PATCH] i386: vMI backend for paravirt-ops
Fairly straightforward implementation of VMI backend for paravirt-ops. [Adrian Bunk: some cleanups] Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Andi Kleen <ak@suse.de> Cc: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Andrew Morton <akpm@osdl.org>
Diffstat (limited to 'arch/i386')
-rw-r--r--arch/i386/Kconfig9
-rw-r--r--arch/i386/kernel/Makefile2
-rw-r--r--arch/i386/kernel/head.S2
-rw-r--r--arch/i386/kernel/io_apic.c2
-rw-r--r--arch/i386/kernel/setup.c9
-rw-r--r--arch/i386/kernel/smpboot.c4
-rw-r--r--arch/i386/kernel/vmi.c904
-rw-r--r--arch/i386/mm/pgtable.c2
8 files changed, 932 insertions, 2 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 63d5e841caf5..a3b3f6ee3642 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -203,6 +203,15 @@ config PARAVIRT
203 However, when run without a hypervisor the kernel is 203 However, when run without a hypervisor the kernel is
204 theoretically slower. If in doubt, say N. 204 theoretically slower. If in doubt, say N.
205 205
206config VMI
207 bool "VMI Paravirt-ops support"
208 depends on PARAVIRT
209 default y
210 help
211 VMI provides a paravirtualized interface to multiple hypervisors,
212 including VMware ESX Server and Xen, by connecting to a ROM module
213 provided by the hypervisor.
214
206config ACPI_SRAT 215config ACPI_SRAT
207 bool 216 bool
208 default y 217 default y
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 1e8988e558c5..9cfb58911f14 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -40,6 +40,8 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
40obj-$(CONFIG_HPET_TIMER) += hpet.o 40obj-$(CONFIG_HPET_TIMER) += hpet.o
41obj-$(CONFIG_K8_NB) += k8.o 41obj-$(CONFIG_K8_NB) += k8.o
42 42
43obj-$(CONFIG_VMI) += vmi.o
44
43# Make sure this is linked after any other paravirt_ops structs: see head.S 45# Make sure this is linked after any other paravirt_ops structs: see head.S
44obj-$(CONFIG_PARAVIRT) += paravirt.o 46obj-$(CONFIG_PARAVIRT) += paravirt.o
45 47
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 15336c8b5960..6c7f71176977 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -360,7 +360,7 @@ check_x87:
360 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be 360 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
361 * that CPU's GDT and PDA. 361 * that CPU's GDT and PDA.
362 */ 362 */
363setup_pda: 363ENTRY(setup_pda)
364 /* get the PDA pointer */ 364 /* get the PDA pointer */
365 movl start_pda, %eax 365 movl start_pda, %eax
366 366
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 1711f4e1093f..e30ccedad0b9 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -1920,7 +1920,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
1920static void __init setup_ioapic_ids_from_mpc(void) { } 1920static void __init setup_ioapic_ids_from_mpc(void) { }
1921#endif 1921#endif
1922 1922
1923static int no_timer_check __initdata; 1923int no_timer_check __initdata;
1924 1924
1925static int __init notimercheck(char *s) 1925static int __init notimercheck(char *s)
1926{ 1926{
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 4694ac980cd2..bd8c218d94af 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -60,6 +60,7 @@
60#include <asm/io_apic.h> 60#include <asm/io_apic.h>
61#include <asm/ist.h> 61#include <asm/ist.h>
62#include <asm/io.h> 62#include <asm/io.h>
63#include <asm/vmi.h>
63#include <setup_arch.h> 64#include <setup_arch.h>
64#include <bios_ebda.h> 65#include <bios_ebda.h>
65 66
@@ -581,6 +582,14 @@ void __init setup_arch(char **cmdline_p)
581 582
582 max_low_pfn = setup_memory(); 583 max_low_pfn = setup_memory();
583 584
585#ifdef CONFIG_VMI
586 /*
587 * Must be after max_low_pfn is determined, and before kernel
588 * pagetables are setup.
589 */
590 vmi_init();
591#endif
592
584 /* 593 /*
585 * NOTE: before this point _nobody_ is allowed to allocate 594 * NOTE: before this point _nobody_ is allowed to allocate
586 * any memory using the bootmem allocator. Although the 595 * any memory using the bootmem allocator. Although the
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 1908afa265b9..42502d820e4f 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -63,6 +63,7 @@
63#include <mach_apic.h> 63#include <mach_apic.h>
64#include <mach_wakecpu.h> 64#include <mach_wakecpu.h>
65#include <smpboot_hooks.h> 65#include <smpboot_hooks.h>
66#include <asm/vmi.h>
66 67
67/* Set if we find a B stepping CPU */ 68/* Set if we find a B stepping CPU */
68static int __devinitdata smp_b_stepping; 69static int __devinitdata smp_b_stepping;
@@ -545,6 +546,9 @@ static void __cpuinit start_secondary(void *unused)
545 * booting is too fragile that we want to limit the 546 * booting is too fragile that we want to limit the
546 * things done here to the most necessary things. 547 * things done here to the most necessary things.
547 */ 548 */
549#ifdef CONFIG_VMI
550 vmi_bringup();
551#endif
548 secondary_cpu_init(); 552 secondary_cpu_init();
549 preempt_disable(); 553 preempt_disable();
550 smp_callin(); 554 smp_callin();
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
new file mode 100644
index 000000000000..a94d64b10f75
--- /dev/null
+++ b/arch/i386/kernel/vmi.c
@@ -0,0 +1,904 @@
1/*
2 * VMI specific paravirt-ops implementation
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to zach@vmware.com
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/license.h>
27#include <linux/cpu.h>
28#include <linux/bootmem.h>
29#include <linux/mm.h>
30#include <asm/vmi.h>
31#include <asm/io.h>
32#include <asm/fixmap.h>
33#include <asm/apicdef.h>
34#include <asm/apic.h>
35#include <asm/processor.h>
36#include <asm/timer.h>
37
38/* Convenient for calling VMI functions indirectly in the ROM */
39typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
40typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
41
42#define call_vrom_func(rom,func) \
43 (((VROMFUNC *)(rom->func))())
44
45#define call_vrom_long_func(rom,func,arg) \
46 (((VROMLONGFUNC *)(rom->func)) (arg))
47
48static struct vrom_header *vmi_rom;
49static int license_gplok;
50static int disable_nodelay;
51static int disable_pge;
52static int disable_pse;
53static int disable_sep;
54static int disable_tsc;
55static int disable_mtrr;
56
57/* Cached VMI operations */
58struct {
59 void (*cpuid)(void /* non-c */);
60 void (*_set_ldt)(u32 selector);
61 void (*set_tr)(u32 selector);
62 void (*set_kernel_stack)(u32 selector, u32 esp0);
63 void (*allocate_page)(u32, u32, u32, u32, u32);
64 void (*release_page)(u32, u32);
65 void (*set_pte)(pte_t, pte_t *, unsigned);
66 void (*update_pte)(pte_t *, unsigned);
67 void (*set_linear_mapping)(int, u32, u32, u32);
68 void (*flush_tlb)(int);
69 void (*set_initial_ap_state)(int, int);
70} vmi_ops;
71
72/* XXX move this to alternative.h */
73extern struct paravirt_patch __start_parainstructions[],
74 __stop_parainstructions[];
75
76/*
77 * VMI patching routines.
78 */
79#define MNEM_CALL 0xe8
80#define MNEM_JMP 0xe9
81#define MNEM_RET 0xc3
82
83static char irq_save_disable_callout[] = {
84 MNEM_CALL, 0, 0, 0, 0,
85 MNEM_CALL, 0, 0, 0, 0,
86 MNEM_RET
87};
88#define IRQ_PATCH_INT_MASK 0
89#define IRQ_PATCH_DISABLE 5
90
/*
 * Fill in the displacement operand of a 5-byte CALL/JMP located at @eip
 * so that it targets @dest.  The displacement is measured from the end
 * of the instruction (eip + 5) and is stored right after the opcode byte.
 */
static inline void patch_offset(unsigned char *eip, unsigned char *dest)
{
	unsigned long *operand = (unsigned long *)(eip + 1);

	*operand = dest - eip - 5;
}
95
96static unsigned patch_internal(int call, unsigned len, void *insns)
97{
98 u64 reloc;
99 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
100 reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
101 switch(rel->type) {
102 case VMI_RELOCATION_CALL_REL:
103 BUG_ON(len < 5);
104 *(char *)insns = MNEM_CALL;
105 patch_offset(insns, rel->eip);
106 return 5;
107
108 case VMI_RELOCATION_JUMP_REL:
109 BUG_ON(len < 5);
110 *(char *)insns = MNEM_JMP;
111 patch_offset(insns, rel->eip);
112 return 5;
113
114 case VMI_RELOCATION_NOP:
115 /* obliterate the whole thing */
116 return 0;
117
118 case VMI_RELOCATION_NONE:
119 /* leave native code in place */
120 break;
121
122 default:
123 BUG();
124 }
125 return len;
126}
127
128/*
129 * Apply patch if appropriate, return length of new instruction
130 * sequence. The callee does nop padding for us.
131 */
132static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len)
133{
134 switch (type) {
135 case PARAVIRT_IRQ_DISABLE:
136 return patch_internal(VMI_CALL_DisableInterrupts, len, insns);
137 case PARAVIRT_IRQ_ENABLE:
138 return patch_internal(VMI_CALL_EnableInterrupts, len, insns);
139 case PARAVIRT_RESTORE_FLAGS:
140 return patch_internal(VMI_CALL_SetInterruptMask, len, insns);
141 case PARAVIRT_SAVE_FLAGS:
142 return patch_internal(VMI_CALL_GetInterruptMask, len, insns);
143 case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE:
144 if (len >= 10) {
145 patch_internal(VMI_CALL_GetInterruptMask, len, insns);
146 patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5);
147 return 10;
148 } else {
149 /*
150 * You bastards didn't leave enough room to
151 * patch save_flags_irq_disable inline. Patch
152 * to a helper
153 */
154 BUG_ON(len < 5);
155 *(char *)insns = MNEM_CALL;
156 patch_offset(insns, irq_save_disable_callout);
157 return 5;
158 }
159 case PARAVIRT_INTERRUPT_RETURN:
160 return patch_internal(VMI_CALL_IRET, len, insns);
161 case PARAVIRT_STI_SYSEXIT:
162 return patch_internal(VMI_CALL_SYSEXIT, len, insns);
163 default:
164 break;
165 }
166 return len;
167}
168
169/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
170static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
171 unsigned int *ecx, unsigned int *edx)
172{
173 int override = 0;
174 if (*eax == 1)
175 override = 1;
176 asm volatile ("call *%6"
177 : "=a" (*eax),
178 "=b" (*ebx),
179 "=c" (*ecx),
180 "=d" (*edx)
181 : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
182 if (override) {
183 if (disable_pse)
184 *edx &= ~X86_FEATURE_PSE;
185 if (disable_pge)
186 *edx &= ~X86_FEATURE_PGE;
187 if (disable_sep)
188 *edx &= ~X86_FEATURE_SEP;
189 if (disable_tsc)
190 *edx &= ~X86_FEATURE_TSC;
191 if (disable_mtrr)
192 *edx &= ~X86_FEATURE_MTRR;
193 }
194}
195
196static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
197{
198 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
199 write_gdt_entry(gdt, nr, new->a, new->b);
200}
201
202static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
203{
204 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
205 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
206 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
207 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
208}
209
210static void vmi_set_ldt(const void *addr, unsigned entries)
211{
212 unsigned cpu = smp_processor_id();
213 u32 low, high;
214
215 pack_descriptor(&low, &high, (unsigned long)addr,
216 entries * sizeof(struct desc_struct) - 1,
217 DESCTYPE_LDT, 0);
218 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
219 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
220}
221
222static void vmi_set_tr(void)
223{
224 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
225}
226
227static void vmi_load_esp0(struct tss_struct *tss,
228 struct thread_struct *thread)
229{
230 tss->esp0 = thread->esp0;
231
232 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
233 if (unlikely(tss->ss1 != thread->sysenter_cs)) {
234 tss->ss1 = thread->sysenter_cs;
235 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
236 }
237 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0);
238}
239
240static void vmi_flush_tlb_user(void)
241{
242 vmi_ops.flush_tlb(VMI_FLUSH_TLB);
243}
244
245static void vmi_flush_tlb_kernel(void)
246{
247 vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
248}
249
/*
 * Deliberately empty stub.  Installed wherever an operation should do
 * nothing at all: delays and calls the VMI ROM does not implement.
 */
static void vmi_nop(void)
{
}
254
255
256#ifdef CONFIG_DEBUG_PAGE_TYPE
257
258#ifdef CONFIG_X86_PAE
259#define MAX_BOOT_PTS (2048+4+1)
260#else
261#define MAX_BOOT_PTS (1024+1)
262#endif
263
264/*
265 * During boot, mem_map is not yet available in paging_init, so stash
266 * all the boot page allocations here.
267 */
268static struct {
269 u32 pfn;
270 int type;
271} boot_page_allocations[MAX_BOOT_PTS];
272static int num_boot_page_allocations;
273static int boot_allocations_applied;
274
275void vmi_apply_boot_page_allocations(void)
276{
277 int i;
278 BUG_ON(!mem_map);
279 for (i = 0; i < num_boot_page_allocations; i++) {
280 struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
281 page->type = boot_page_allocations[i].type;
282 page->type = boot_page_allocations[i].type &
283 ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
284 }
285 boot_allocations_applied = 1;
286}
287
288static void record_page_type(u32 pfn, int type)
289{
290 BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
291 boot_page_allocations[num_boot_page_allocations].pfn = pfn;
292 boot_page_allocations[num_boot_page_allocations].type = type;
293 num_boot_page_allocations++;
294}
295
296static void check_zeroed_page(u32 pfn, int type, struct page *page)
297{
298 u32 *ptr;
299 int i;
300 int limit = PAGE_SIZE / sizeof(int);
301
302 if (page_address(page))
303 ptr = (u32 *)page_address(page);
304 else
305 ptr = (u32 *)__va(pfn << PAGE_SHIFT);
306 /*
307 * When cloning the root in non-PAE mode, only the userspace
308 * pdes need to be zeroed.
309 */
310 if (type & VMI_PAGE_CLONE)
311 limit = USER_PTRS_PER_PGD;
312 for (i = 0; i < limit; i++)
313 BUG_ON(ptr[i]);
314}
315
316/*
317 * We stash the page type into struct page so we can verify the page
318 * types are used properly.
319 */
320static void vmi_set_page_type(u32 pfn, int type)
321{
322 /* PAE can have multiple roots per page - don't track */
323 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
324 return;
325
326 if (boot_allocations_applied) {
327 struct page *page = pfn_to_page(pfn);
328 if (type != VMI_PAGE_NORMAL)
329 BUG_ON(page->type);
330 else
331 BUG_ON(page->type == VMI_PAGE_NORMAL);
332 page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
333 if (type & VMI_PAGE_ZEROED)
334 check_zeroed_page(pfn, type, page);
335 } else {
336 record_page_type(pfn, type);
337 }
338}
339
340static void vmi_check_page_type(u32 pfn, int type)
341{
342 /* PAE can have multiple roots per page - skip checks */
343 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
344 return;
345
346 type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
347 if (boot_allocations_applied) {
348 struct page *page = pfn_to_page(pfn);
349 BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
350 BUG_ON(type == VMI_PAGE_NORMAL && page->type);
351 BUG_ON((type & page->type) == 0);
352 }
353}
354#else
355#define vmi_set_page_type(p,t) do { } while (0)
356#define vmi_check_page_type(p,t) do { } while (0)
357#endif
358
359static void vmi_allocate_pt(u32 pfn)
360{
361 vmi_set_page_type(pfn, VMI_PAGE_L1);
362 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
363}
364
365static void vmi_allocate_pd(u32 pfn)
366{
367 /*
368 * This call comes in very early, before mem_map is setup.
369 * It is called only for swapper_pg_dir, which already has
370 * data on it.
371 */
372 vmi_set_page_type(pfn, VMI_PAGE_L2);
373 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
374}
375
376static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
377{
378 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
379 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
380 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
381}
382
383static void vmi_release_pt(u32 pfn)
384{
385 vmi_ops.release_page(pfn, VMI_PAGE_L1);
386 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
387}
388
389static void vmi_release_pd(u32 pfn)
390{
391 vmi_ops.release_page(pfn, VMI_PAGE_L2);
392 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
393}
394
395/*
396 * Helper macros for MMU update flags. We can defer updates until a flush
397 * or page invalidation only if the update is to the current address space
398 * (otherwise, there is no flush). We must check against init_mm, since
399 * this could be a kernel update, which usually passes init_mm, although
400 * sometimes this check can be skipped if we know the particular function
401 * is only called on user mode PTEs. We could change the kernel to pass
402 * current->active_mm here, but in particular, I was unsure if changing
403 * mm/highmem.c to do this would still be correct on other architectures.
404 */
405#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
406 (!mustbeuser && (mm) == &init_mm))
407#define vmi_flags_addr(mm, addr, level, user) \
408 ((level) | (is_current_as(mm, user) ? \
409 (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
410#define vmi_flags_addr_defer(mm, addr, level, user) \
411 ((level) | (is_current_as(mm, user) ? \
412 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
413
414static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep)
415{
416 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
417 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
418}
419
420static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
421{
422 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
423 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
424}
425
426static void vmi_set_pte(pte_t *ptep, pte_t pte)
427{
428 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
429 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
430 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
431}
432
433static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
434{
435 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
436 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
437}
438
439static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
440{
441#ifdef CONFIG_X86_PAE
442 const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
443 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
444#else
445 const pte_t pte = { pmdval.pud.pgd.pgd };
446 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
447#endif
448 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
449}
450
451#ifdef CONFIG_X86_PAE
452
453static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
454{
455 /*
456 * XXX This is called from set_pmd_pte, but at both PT
457 * and PD layers so the VMI_PAGE_PT flag is wrong. But
458 * it is only called for large page mapping changes,
459 * the Xen backend, doesn't support large pages, and the
460 * ESX backend doesn't depend on the flag.
461 */
462 set_64bit((unsigned long long *)ptep,pte_val(pteval));
463 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
464}
465
466static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
467{
468 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
469 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
470}
471
472static void vmi_set_pud(pud_t *pudp, pud_t pudval)
473{
474 /* Um, eww */
475 const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
476 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
477 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
478}
479
480static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
481{
482 const pte_t pte = { 0 };
483 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
484 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
485}
486
487void vmi_pmd_clear(pmd_t *pmd)
488{
489 const pte_t pte = { 0 };
490 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
491 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
492}
493#endif
494
495#ifdef CONFIG_SMP
496struct vmi_ap_state ap;
497extern void setup_pda(void);
498
499static void __init /* XXX cpu hotplug */
500vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
501 unsigned long start_esp)
502{
503 /* Default everything to zero. This is fine for most GPRs. */
504 memset(&ap, 0, sizeof(struct vmi_ap_state));
505
506 ap.gdtr_limit = GDT_SIZE - 1;
507 ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
508
509 ap.idtr_limit = IDT_ENTRIES * 8 - 1;
510 ap.idtr_base = (unsigned long) idt_table;
511
512 ap.ldtr = 0;
513
514 ap.cs = __KERNEL_CS;
515 ap.eip = (unsigned long) start_eip;
516 ap.ss = __KERNEL_DS;
517 ap.esp = (unsigned long) start_esp;
518
519 ap.ds = __USER_DS;
520 ap.es = __USER_DS;
521 ap.fs = __KERNEL_PDA;
522 ap.gs = 0;
523
524 ap.eflags = 0;
525
526 setup_pda();
527
528#ifdef CONFIG_X86_PAE
529 /* efer should match BSP efer. */
530 if (cpu_has_nx) {
531 unsigned l, h;
532 rdmsr(MSR_EFER, l, h);
533 ap.efer = (unsigned long long) h << 32 | l;
534 }
535#endif
536
537 ap.cr3 = __pa(swapper_pg_dir);
538 /* Protected mode, paging, AM, WP, NE, MP. */
539 ap.cr0 = 0x80050023;
540 ap.cr4 = mmu_cr4_features;
541 vmi_ops.set_initial_ap_state(__pa(&ap), phys_apicid);
542}
543#endif
544
545static inline int __init check_vmi_rom(struct vrom_header *rom)
546{
547 struct pci_header *pci;
548 struct pnp_header *pnp;
549 const char *manufacturer = "UNKNOWN";
550 const char *product = "UNKNOWN";
551 const char *license = "unspecified";
552
553 if (rom->rom_signature != 0xaa55)
554 return 0;
555 if (rom->vrom_signature != VMI_SIGNATURE)
556 return 0;
557 if (rom->api_version_maj != VMI_API_REV_MAJOR ||
558 rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
559 printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
560 rom->api_version_maj,
561 rom->api_version_min);
562 return 0;
563 }
564
565 /*
566 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
567 * the PCI header and device type to make sure this is really a
568 * VMI device.
569 */
570 if (!rom->pci_header_offs) {
571 printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
572 return 0;
573 }
574
575 pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
576 if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
577 pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
578 /* Allow it to run... anyways, but warn */
579 printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
580 }
581
582 if (rom->pnp_header_offs) {
583 pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
584 if (pnp->manufacturer_offset)
585 manufacturer = (const char *)rom+pnp->manufacturer_offset;
586 if (pnp->product_offset)
587 product = (const char *)rom+pnp->product_offset;
588 }
589
590 if (rom->license_offs)
591 license = (char *)rom+rom->license_offs;
592
593 printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
594 manufacturer, product,
595 rom->api_version_maj, rom->api_version_min,
596 pci->rom_version_maj, pci->rom_version_min);
597
598 license_gplok = license_is_gpl_compatible(license);
599 if (!license_gplok) {
600 printk(KERN_WARNING "VMI: ROM license '%s' taints kernel... "
601 "inlining disabled\n",
602 license);
603 add_taint(TAINT_PROPRIETARY_MODULE);
604 }
605 return 1;
606}
607
608/*
609 * Probe for the VMI option ROM
610 */
611static inline int __init probe_vmi_rom(void)
612{
613 unsigned long base;
614
615 /* VMI ROM is in option ROM area, check signature */
616 for (base = 0xC0000; base < 0xE0000; base += 2048) {
617 struct vrom_header *romstart;
618 romstart = (struct vrom_header *)isa_bus_to_virt(base);
619 if (check_vmi_rom(romstart)) {
620 vmi_rom = romstart;
621 return 1;
622 }
623 }
624 return 0;
625}
626
627/*
628 * VMI setup common to all processors
629 */
630void vmi_bringup(void)
631{
632 /* We must establish the lowmem mapping for MMU ops to work */
633 if (vmi_rom)
634 vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0);
635}
636
637/*
638 * Return a pointer to the VMI function or a NOP stub
639 */
640static void *vmi_get_function(int vmicall)
641{
642 u64 reloc;
643 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
644 reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
645 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
646 if (rel->type == VMI_RELOCATION_CALL_REL)
647 return (void *)rel->eip;
648 else
649 return (void *)vmi_nop;
650}
651
652/*
653 * Helper macro for making the VMI paravirt-ops fill code readable.
654 * For unimplemented operations, fall back to default.
655 */
656#define para_fill(opname, vmicall) \
657do { \
658 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
659 VMI_CALL_##vmicall); \
660 if (rel->type != VMI_RELOCATION_NONE) { \
661 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); \
662 paravirt_ops.opname = (void *)rel->eip; \
663 } \
664} while (0)
665
666/*
667 * Activate the VMI interface and switch into paravirtualized mode
668 */
669static inline int __init activate_vmi(void)
670{
671 short kernel_cs;
672 u64 reloc;
673 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
674
675 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
676 printk(KERN_ERR "VMI ROM failed to initialize!");
677 return 0;
678 }
679 savesegment(cs, kernel_cs);
680
681 paravirt_ops.paravirt_enabled = 1;
682 paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
683
684 paravirt_ops.patch = vmi_patch;
685 paravirt_ops.name = "vmi";
686
687 /*
688 * Many of these operations are ABI compatible with VMI.
689 * This means we can fill in the paravirt-ops with direct
690 * pointers into the VMI ROM. If the calling convention for
691 * these operations changes, this code needs to be updated.
692 *
693 * Exceptions
694 * CPUID paravirt-op uses pointers, not the native ISA
695 * halt has no VMI equivalent; all VMI halts are "safe"
696 * no MSR support yet - just trap and emulate. VMI uses the
697 * same ABI as the native ISA, but Linux wants exceptions
698 * from bogus MSR read / write handled
699 * rdpmc is not yet used in Linux
700 */
701
702 /* CPUID is special, so very special */
703 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_CPUID);
704 if (rel->type != VMI_RELOCATION_NONE) {
705 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
706 vmi_ops.cpuid = (void *)rel->eip;
707 paravirt_ops.cpuid = vmi_cpuid;
708 }
709
710 para_fill(clts, CLTS);
711 para_fill(get_debugreg, GetDR);
712 para_fill(set_debugreg, SetDR);
713 para_fill(read_cr0, GetCR0);
714 para_fill(read_cr2, GetCR2);
715 para_fill(read_cr3, GetCR3);
716 para_fill(read_cr4, GetCR4);
717 para_fill(write_cr0, SetCR0);
718 para_fill(write_cr2, SetCR2);
719 para_fill(write_cr3, SetCR3);
720 para_fill(write_cr4, SetCR4);
721 para_fill(save_fl, GetInterruptMask);
722 para_fill(restore_fl, SetInterruptMask);
723 para_fill(irq_disable, DisableInterrupts);
724 para_fill(irq_enable, EnableInterrupts);
725 /* irq_save_disable !!! sheer pain */
726 patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK],
727 (char *)paravirt_ops.save_fl);
728 patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE],
729 (char *)paravirt_ops.irq_disable);
730 para_fill(safe_halt, Halt);
731 para_fill(wbinvd, WBINVD);
732 /* paravirt_ops.read_msr = vmi_rdmsr */
733 /* paravirt_ops.write_msr = vmi_wrmsr */
734 para_fill(read_tsc, RDTSC);
735 /* paravirt_ops.rdpmc = vmi_rdpmc */
736
737 /* TR interface doesn't pass TR value */
738 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetTR);
739 if (rel->type != VMI_RELOCATION_NONE) {
740 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
741 vmi_ops.set_tr = (void *)rel->eip;
742 paravirt_ops.load_tr_desc = vmi_set_tr;
743 }
744
745 /* LDT is special, too */
746 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetLDT);
747 if (rel->type != VMI_RELOCATION_NONE) {
748 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
749 vmi_ops._set_ldt = (void *)rel->eip;
750 paravirt_ops.set_ldt = vmi_set_ldt;
751 }
752
753 para_fill(load_gdt, SetGDT);
754 para_fill(load_idt, SetIDT);
755 para_fill(store_gdt, GetGDT);
756 para_fill(store_idt, GetIDT);
757 para_fill(store_tr, GetTR);
758 paravirt_ops.load_tls = vmi_load_tls;
759 para_fill(write_ldt_entry, WriteLDTEntry);
760 para_fill(write_gdt_entry, WriteGDTEntry);
761 para_fill(write_idt_entry, WriteIDTEntry);
762 reloc = call_vrom_long_func(vmi_rom, get_reloc,
763 VMI_CALL_UpdateKernelStack);
764 if (rel->type != VMI_RELOCATION_NONE) {
765 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
766 vmi_ops.set_kernel_stack = (void *)rel->eip;
767 paravirt_ops.load_esp0 = vmi_load_esp0;
768 }
769
770 para_fill(set_iopl_mask, SetIOPLMask);
771 paravirt_ops.io_delay = (void *)vmi_nop;
772 if (!disable_nodelay) {
773 paravirt_ops.const_udelay = (void *)vmi_nop;
774 }
775
776 para_fill(set_lazy_mode, SetLazyMode);
777
778 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_FlushTLB);
779 if (rel->type != VMI_RELOCATION_NONE) {
780 vmi_ops.flush_tlb = (void *)rel->eip;
781 paravirt_ops.flush_tlb_user = vmi_flush_tlb_user;
782 paravirt_ops.flush_tlb_kernel = vmi_flush_tlb_kernel;
783 }
784 para_fill(flush_tlb_single, InvalPage);
785
786 /*
787 * Until a standard flag format can be agreed on, we need to
788 * implement these as wrappers in Linux. Get the VMI ROM
789 * function pointers for the two backend calls.
790 */
791#ifdef CONFIG_X86_PAE
792 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
793 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
794#else
795 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
796 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
797#endif
798 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
799 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
800 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
801
802 paravirt_ops.alloc_pt = vmi_allocate_pt;
803 paravirt_ops.alloc_pd = vmi_allocate_pd;
804 paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
805 paravirt_ops.release_pt = vmi_release_pt;
806 paravirt_ops.release_pd = vmi_release_pd;
807 paravirt_ops.set_pte = vmi_set_pte;
808 paravirt_ops.set_pte_at = vmi_set_pte_at;
809 paravirt_ops.set_pmd = vmi_set_pmd;
810 paravirt_ops.pte_update = vmi_update_pte;
811 paravirt_ops.pte_update_defer = vmi_update_pte_defer;
812#ifdef CONFIG_X86_PAE
813 paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
814 paravirt_ops.set_pte_present = vmi_set_pte_present;
815 paravirt_ops.set_pud = vmi_set_pud;
816 paravirt_ops.pte_clear = vmi_pte_clear;
817 paravirt_ops.pmd_clear = vmi_pmd_clear;
818#endif
819 /*
820 * These MUST always be patched. Don't support indirect jumps
821 * through these operations, as the VMI interface may use either
822 * a jump or a call to get to these operations, depending on
823 * the backend. They are performance critical anyway, so requiring
824 * a patch is not a big problem.
825 */
826 paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
827 paravirt_ops.iret = (void *)0xbadbab0;
828
829#ifdef CONFIG_SMP
830 paravirt_ops.startup_ipi_hook = vmi_startup_ipi_hook;
831 vmi_ops.set_initial_ap_state = vmi_get_function(VMI_CALL_SetInitialAPState);
832#endif
833
834#ifdef CONFIG_X86_LOCAL_APIC
835 paravirt_ops.apic_read = vmi_get_function(VMI_CALL_APICRead);
836 paravirt_ops.apic_write = vmi_get_function(VMI_CALL_APICWrite);
837 paravirt_ops.apic_write_atomic = vmi_get_function(VMI_CALL_APICWrite);
838#endif
839
840 /*
841 * Alternative instruction rewriting doesn't happen soon enough
842 * to convert VMI_IRET to a call instead of a jump; so we have
843 * to do this before IRQs get reenabled. Fortunately, it is
844 * idempotent.
845 */
846 apply_paravirt(__start_parainstructions, __stop_parainstructions);
847
848 vmi_bringup();
849
850 return 1;
851}
852
853#undef para_fill
854
855void __init vmi_init(void)
856{
857 unsigned long flags;
858
859 if (!vmi_rom)
860 probe_vmi_rom();
861 else
862 check_vmi_rom(vmi_rom);
863
864 /* In case probing for or validating the ROM failed, basil */
865 if (!vmi_rom)
866 return;
867
868 reserve_top_address(-vmi_rom->virtual_top);
869
870 local_irq_save(flags);
871 activate_vmi();
872#ifdef CONFIG_SMP
873 no_timer_check = 1;
874#endif
875 local_irq_restore(flags & X86_EFLAGS_IF);
876}
877
878static int __init parse_vmi(char *arg)
879{
880 if (!arg)
881 return -EINVAL;
882
883 if (!strcmp(arg, "disable_nodelay"))
884 disable_nodelay = 1;
885 else if (!strcmp(arg, "disable_pge")) {
886 clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
887 disable_pge = 1;
888 } else if (!strcmp(arg, "disable_pse")) {
889 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
890 disable_pse = 1;
891 } else if (!strcmp(arg, "disable_sep")) {
892 clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
893 disable_sep = 1;
894 } else if (!strcmp(arg, "disable_tsc")) {
895 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
896 disable_tsc = 1;
897 } else if (!strcmp(arg, "disable_mtrr")) {
898 clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
899 disable_mtrr = 1;
900 }
901 return 0;
902}
903
904early_param("vmi", parse_vmi);
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index b5f538f52272..fa0cfbd551e1 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -171,6 +171,8 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
171void reserve_top_address(unsigned long reserve) 171void reserve_top_address(unsigned long reserve)
172{ 172{
173 BUG_ON(fixmaps > 0); 173 BUG_ON(fixmaps > 0);
174 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
175 (int)-reserve);
174#ifdef CONFIG_COMPAT_VDSO 176#ifdef CONFIG_COMPAT_VDSO
175 BUG_ON(reserve != 0); 177 BUG_ON(reserve != 0);
176#else 178#else