aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/i386/Kconfig32
-rw-r--r--arch/i386/Makefile3
-rw-r--r--arch/x86/kernel/asm-offsets_32.c1
-rw-r--r--arch/x86/lguest/Kconfig14
-rw-r--r--arch/x86/lguest/Makefile1
-rw-r--r--arch/x86/lguest/boot.c1070
-rw-r--r--arch/x86/lguest/i386_head.S115
-rw-r--r--arch/x86/xen/Kconfig5
8 files changed, 1229 insertions, 12 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index f6e44fc5283c..5bed8be34ba5 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -227,28 +227,40 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER
227 If in doubt, say "Y". 227 If in doubt, say "Y".
228 228
229config PARAVIRT 229config PARAVIRT
230 bool "Paravirtualization support (EXPERIMENTAL)" 230 bool
231 depends on EXPERIMENTAL
232 depends on !(X86_VISWS || X86_VOYAGER) 231 depends on !(X86_VISWS || X86_VOYAGER)
233 help 232 help
234 Paravirtualization is a way of running multiple instances of 233 This changes the kernel so it can modify itself when it is run
235 Linux on the same machine, under a hypervisor. This option 234 under a hypervisor, potentially improving performance significantly
236 changes the kernel so it can modify itself when it is run 235 over full virtualization. However, when run without a hypervisor
237 under a hypervisor, improving performance significantly. 236 the kernel is theoretically slower and slightly larger.
238 However, when run without a hypervisor the kernel is 237
239 theoretically slower. If in doubt, say N. 238menuconfig PARAVIRT_GUEST
239 bool "Paravirtualized guest support"
240 help
241 Say Y here to get to see options related to running Linux under
242 various hypervisors. This option alone does not add any kernel code.
243
244 If you say N, all options in this submenu will be skipped and disabled.
245
246if PARAVIRT_GUEST
240 247
241source "arch/x86/xen/Kconfig" 248source "arch/x86/xen/Kconfig"
242 249
243config VMI 250config VMI
244 bool "VMI Paravirt-ops support" 251 bool "VMI Guest support"
245 depends on PARAVIRT 252 select PARAVIRT
253 depends on !(X86_VISWS || X86_VOYAGER)
246 help 254 help
247 VMI provides a paravirtualized interface to the VMware ESX server 255 VMI provides a paravirtualized interface to the VMware ESX server
248 (it could be used by other hypervisors in theory too, but is not 256 (it could be used by other hypervisors in theory too, but is not
249 at the moment), by linking the kernel to a GPL-ed ROM module 257 at the moment), by linking the kernel to a GPL-ed ROM module
250 provided by the hypervisor. 258 provided by the hypervisor.
251 259
260source "arch/x86/lguest/Kconfig"
261
262endif
263
252config ACPI_SRAT 264config ACPI_SRAT
253 bool 265 bool
254 default y 266 default y
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index b88e47ca3032..b81cb64d48e5 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -99,6 +99,9 @@ core-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/
99# Xen paravirtualization support 99# Xen paravirtualization support
100core-$(CONFIG_XEN) += arch/x86/xen/ 100core-$(CONFIG_XEN) += arch/x86/xen/
101 101
102# lguest paravirtualization support
103core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
104
102# default subarch .h files 105# default subarch .h files
103mflags-y += -Iinclude/asm-x86/mach-default 106mflags-y += -Iinclude/asm-x86/mach-default
104 107
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index f8764716b0c0..0e45981b2dd7 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -136,6 +136,7 @@ void foo(void)
136#ifdef CONFIG_LGUEST_GUEST 136#ifdef CONFIG_LGUEST_GUEST
137 BLANK(); 137 BLANK();
138 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 138 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
139 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
139 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); 140 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
140 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); 141 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
141 OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); 142 OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
new file mode 100644
index 000000000000..c4dffbeea5e1
--- /dev/null
+++ b/arch/x86/lguest/Kconfig
@@ -0,0 +1,14 @@
1config LGUEST_GUEST
2 bool "Lguest guest support"
3 select PARAVIRT
4 depends on !X86_PAE
5 select VIRTIO
6 select VIRTIO_RING
7 select VIRTIO_CONSOLE
8 help
9 Lguest is a tiny in-kernel hypervisor. Selecting this will
10 allow your kernel to boot under lguest. This option will increase
11 your kernel size by about 6k. If in doubt, say N.
12
13 If you say Y here, make sure you say Y (or M) to the virtio block
14 and net drivers which lguest needs.
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
new file mode 100644
index 000000000000..27f0c9ed7f60
--- /dev/null
+++ b/arch/x86/lguest/Makefile
@@ -0,0 +1 @@
obj-y := i386_head.o boot.o
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
new file mode 100644
index 000000000000..d2235db4085f
--- /dev/null
+++ b/arch/x86/lguest/boot.c
@@ -0,0 +1,1070 @@
1/*P:010
2 * A hypervisor allows multiple Operating Systems to run on a single machine.
3 * To quote David Wheeler: "Any problem in computer science can be solved with
4 * another layer of indirection."
5 *
6 * We keep things simple in two ways. First, we start with a normal Linux
7 * kernel and insert a module (lg.ko) which allows us to run other Linux
8 * kernels the same way we'd run processes. We call the first kernel the Host,
9 * and the others the Guests. The program which sets up and configures Guests
10 * (such as the example in Documentation/lguest/lguest.c) is called the
11 * Launcher.
12 *
13 * Secondly, we only run specially modified Guests, not normal kernels. When
14 * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets
15 * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows
16 * how to be a Guest. This means that you can use the same kernel you boot
17 * normally (ie. as a Host) as a Guest.
18 *
19 * These Guests know that they cannot do privileged operations, such as disable
20 * interrupts, and that they have to ask the Host to do such things explicitly.
21 * This file consists of all the replacements for such low-level native
22 * hardware operations: these special Guest versions call the Host.
23 *
24 * So how does the kernel know it's a Guest? The Guest starts at a special
25 * entry point marked with a magic string, which sets up a few things then
26 * calls here. We replace the native functions in various "paravirt" structures
27 * with our Guest versions, then boot like normal. :*/
28
29/*
30 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
31 *
32 * This program is free software; you can redistribute it and/or modify
33 * it under the terms of the GNU General Public License as published by
34 * the Free Software Foundation; either version 2 of the License, or
35 * (at your option) any later version.
36 *
37 * This program is distributed in the hope that it will be useful, but
38 * WITHOUT ANY WARRANTY; without even the implied warranty of
39 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
40 * NON INFRINGEMENT. See the GNU General Public License for more
41 * details.
42 *
43 * You should have received a copy of the GNU General Public License
44 * along with this program; if not, write to the Free Software
45 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
46 */
47#include <linux/kernel.h>
48#include <linux/start_kernel.h>
49#include <linux/string.h>
50#include <linux/console.h>
51#include <linux/screen_info.h>
52#include <linux/irq.h>
53#include <linux/interrupt.h>
54#include <linux/clocksource.h>
55#include <linux/clockchips.h>
56#include <linux/lguest.h>
57#include <linux/lguest_launcher.h>
58#include <linux/virtio_console.h>
59#include <asm/paravirt.h>
60#include <asm/param.h>
61#include <asm/page.h>
62#include <asm/pgtable.h>
63#include <asm/desc.h>
64#include <asm/setup.h>
65#include <asm/e820.h>
66#include <asm/mce.h>
67#include <asm/io.h>
68#include <asm/i387.h>
69
70/*G:010 Welcome to the Guest!
71 *
72 * The Guest in our tale is a simple creature: identical to the Host but
73 * behaving in simplified but equivalent ways. In particular, the Guest is the
74 * same kernel as the Host (or at least, built from the same source code). :*/
75
76/* Declarations for definitions in i386_head.S */
/* Bounds published to the Host through lguest_data.noirq_start/noirq_end. */
extern char lguest_noirq_start[];
extern char lguest_noirq_end[];

/* Start/end markers, one pair per instruction (presumably delimiting the
 * replacement sequences used when patching -- confirm against the assembly). */
extern const char lgstart_cli[];
extern const char lgend_cli[];
extern const char lgstart_sti[];
extern const char lgend_sti[];
extern const char lgstart_popf[];
extern const char lgend_popf[];
extern const char lgstart_pushf[];
extern const char lgend_pushf[];
extern const char lgstart_iret[];
extern const char lgend_iret[];

extern void lguest_iret(void);
84
85struct lguest_data lguest_data = {
86 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
87 .noirq_start = (u32)lguest_noirq_start,
88 .noirq_end = (u32)lguest_noirq_end,
89 .kernel_address = PAGE_OFFSET,
90 .blocked_interrupts = { 1 }, /* Block timer interrupts */
91 .syscall_vec = SYSCALL_VECTOR,
92};
93static cycle_t clock_base;
94
95/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first
96 * real optimization trick!
97 *
98 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
99 * them as a batch when lazy_mode is eventually turned off. Because hypercalls
100 * are reasonably expensive, batching them up makes sense. For example, a
101 * large mmap might update dozens of page table entries: that code calls
102 * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
103 * lguest_leave_lazy_mode().
104 *
105 * So, when we're in lazy mode, we call async_hypercall() to store the call for
106 * future processing. When lazy mode is turned off we issue a hypercall to
107 * flush the stored calls.
108 */
109static void lguest_leave_lazy_mode(void)
110{
111 paravirt_leave_lazy(paravirt_get_lazy_mode());
112 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
113}
114
115static void lazy_hcall(unsigned long call,
116 unsigned long arg1,
117 unsigned long arg2,
118 unsigned long arg3)
119{
120 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
121 hcall(call, arg1, arg2, arg3);
122 else
123 async_hcall(call, arg1, arg2, arg3);
124}
125
126/* async_hcall() is pretty simple: I'm quite proud of it really. We have a
127 * ring buffer of stored hypercalls which the Host will run through next time we
128 * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall
129 * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
130 * and 255 once the Host has finished with it.
131 *
132 * If we come around to a slot which hasn't been finished, then the table is
133 * full and we just make the hypercall directly. This has the nice side
134 * effect of causing the Host to run all the stored calls in the ring buffer
135 * which empties it for next time! */
136void async_hcall(unsigned long call,
137 unsigned long arg1, unsigned long arg2, unsigned long arg3)
138{
139 /* Note: This code assumes we're uniprocessor. */
140 static unsigned int next_call;
141 unsigned long flags;
142
143 /* Disable interrupts if not already disabled: we don't want an
144 * interrupt handler making a hypercall while we're already doing
145 * one! */
146 local_irq_save(flags);
147 if (lguest_data.hcall_status[next_call] != 0xFF) {
148 /* Table full, so do normal hcall which will flush table. */
149 hcall(call, arg1, arg2, arg3);
150 } else {
151 lguest_data.hcalls[next_call].arg0 = call;
152 lguest_data.hcalls[next_call].arg1 = arg1;
153 lguest_data.hcalls[next_call].arg2 = arg2;
154 lguest_data.hcalls[next_call].arg3 = arg3;
155 /* Arguments must all be written before we mark it to go */
156 wmb();
157 lguest_data.hcall_status[next_call] = 0;
158 if (++next_call == LHCALL_RING_SIZE)
159 next_call = 0;
160 }
161 local_irq_restore(flags);
162}
163/*:*/
164
165/*G:033
166 * Here are our first native-instruction replacements: four functions for
167 * interrupt control.
168 *
169 * The simplest way of implementing these would be to have "turn interrupts
170 * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow:
171 * these are by far the most commonly called functions of those we override.
172 *
173 * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
174 * which the Guest can update with a single instruction. The Host knows to
175 * check there when it wants to deliver an interrupt.
176 */
177
178/* save_flags() is expected to return the processor state (ie. "eflags"). The
179 * eflags word contains all kind of stuff, but in practice Linux only cares
180 * about the interrupt flag. Our "save_flags()" just returns that. */
181static unsigned long save_fl(void)
182{
183 return lguest_data.irq_enabled;
184}
185
186/* "restore_flags" just sets the flags back to the value given. */
187static void restore_fl(unsigned long flags)
188{
189 lguest_data.irq_enabled = flags;
190}
191
192/* Interrupts go off... */
193static void irq_disable(void)
194{
195 lguest_data.irq_enabled = 0;
196}
197
198/* Interrupts go on... */
199static void irq_enable(void)
200{
201 lguest_data.irq_enabled = X86_EFLAGS_IF;
202}
203/*:*/
204/*M:003 Note that we don't check for outstanding interrupts when we re-enable
205 * them (or when we unmask an interrupt). This seems to work for the moment,
206 * since interrupts are rare and we'll just get the interrupt on the next timer
207 * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way
208 * would be to put the "irq_enabled" field in a page by itself, and have the
209 * Host write-protect it when an interrupt comes in when irqs are disabled.
210 * There will then be a page fault as soon as interrupts are re-enabled. :*/
211
212/*G:034
213 * The Interrupt Descriptor Table (IDT).
214 *
215 * The IDT tells the processor what to do when an interrupt comes in. Each
216 * entry in the table is a 64-bit descriptor: this holds the privilege level,
217 * address of the handler, and... well, who cares? The Guest just asks the
218 * Host to make the change anyway, because the Host controls the real IDT.
219 */
220static void lguest_write_idt_entry(struct desc_struct *dt,
221 int entrynum, u32 low, u32 high)
222{
223 /* Keep the local copy up to date. */
224 write_dt_entry(dt, entrynum, low, high);
225 /* Tell Host about this new entry. */
226 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
227}
228
229/* Changing to a different IDT is very rare: we keep the IDT up-to-date every
230 * time it is written, so we can simply loop through all entries and tell the
231 * Host about them. */
232static void lguest_load_idt(const struct Xgt_desc_struct *desc)
233{
234 unsigned int i;
235 struct desc_struct *idt = (void *)desc->address;
236
237 for (i = 0; i < (desc->size+1)/8; i++)
238 hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
239}
240
241/*
242 * The Global Descriptor Table.
243 *
244 * The Intel architecture defines another table, called the Global Descriptor
245 * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt"
246 * instruction, and then several other instructions refer to entries in the
247 * table. There are three entries which the Switcher needs, so the Host simply
248 * controls the entire thing and the Guest asks it to make changes using the
249 * LOAD_GDT hypercall.
250 *
251 * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY
252 * hypercall and use that repeatedly to load a new IDT. I don't think it
253 * really matters, but wouldn't it be nice if they were the same?
254 */
255static void lguest_load_gdt(const struct Xgt_desc_struct *desc)
256{
257 BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
258 hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
259}
260
261/* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
262 * then tell the Host to reload the entire thing. This operation is so rare
263 * that this naive implementation is reasonable. */
264static void lguest_write_gdt_entry(struct desc_struct *dt,
265 int entrynum, u32 low, u32 high)
266{
267 write_dt_entry(dt, entrynum, low, high);
268 hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
269}
270
271/* OK, I lied. There are three "thread local storage" GDT entries which change
272 * on every context switch (these three entries are how glibc implements
273 * __thread variables). So we have a hypercall specifically for this case. */
274static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
275{
276 /* There's one problem which normal hardware doesn't have: the Host
277 * can't handle us removing entries we're currently using. So we clear
278 * the GS register here: if it's needed it'll be reloaded anyway. */
279 loadsegment(gs, 0);
280 lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
281}
282
283/*G:038 That's enough excitement for now, back to ploughing through each of
284 * the different pv_ops structures (we're about 1/3 of the way through).
285 *
286 * This is the Local Descriptor Table, another weird Intel thingy. Linux only
287 * uses this for some strange applications like Wine. We don't do anything
288 * here, so they'll get an informative and friendly Segmentation Fault. */
static void lguest_set_ldt(const void *addr, unsigned entries)
{
	/* Intentionally empty: both arguments are ignored. */
}
292
293/* This loads a GDT entry into the "Task Register": that entry points to a
294 * structure called the Task State Segment. Some comments scattered through the
295 * kernel code indicate that this was used for task switching in ages past, along
296 * with blood sacrifice and astrology.
297 *
298 * Now there's nothing interesting in here that we don't get told elsewhere.
299 * But the native version uses the "ltr" instruction, which makes the Host
300 * complain to the Guest about a Segmentation Fault and it'll oops. So we
301 * override the native version with a do-nothing version. */
static void lguest_load_tr_desc(void)
{
	/* Intentionally empty: stands in for the native "ltr" path. */
}
305
306/* The "cpuid" instruction is a way of querying both the CPU identity
307 * (manufacturer, model, etc) and its features. It was introduced before the
308 * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you
309 * might imagine, after a decade and a half of this treatment, it is now a giant
310 * ball of hair. Its entry in the current Intel manual runs to 28 pages.
311 *
312 * This instruction even has its own Wikipedia entry. The Wikipedia entry
313 * has been translated into 4 languages. I am not making this up!
314 *
315 * We could get funky here and identify ourselves as "GenuineLguest", but
316 * instead we just use the real "cpuid" instruction. Then I pretty much turned
317 * off feature bits until the Guest booted. (Don't say that: you'll damage
318 * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
319 * hardly future proof.) No one's listening! They don't like you anyway,
320 * parenthetic weirdo!
321 *
322 * Replacing the cpuid so we can turn features off is great for the kernel, but
323 * anyone (including userspace) can just use the raw "cpuid" instruction and
324 * the Host won't even notice since it isn't privileged. So we try not to get
325 * too worked up about it. */
static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
			 unsigned int *ecx, unsigned int *edx)
{
	/* Remember the requested leaf: native_cpuid() overwrites *eax.
	 * Kept unsigned so that leaf 0x80000000 compares against its case
	 * label without an implementation-defined signed conversion. */
	unsigned int function = *eax;

	native_cpuid(eax, ebx, ecx, edx);
	switch (function) {
	case 1:	/* Basic feature request. */
		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
		*ecx &= 0x00002201;
		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
		*edx &= 0x07808101;
		/* The Host can do a nice optimization if it knows that the
		 * kernel mappings (addresses above 0xC0000000 or whatever
		 * PAGE_OFFSET is set to) haven't changed.  But Linux calls
		 * flush_tlb_user() for both user and kernel mappings unless
		 * the Page Global Enable (PGE) feature bit is set. */
		*edx |= 0x00002000;
		break;
	case 0x80000000:
		/* Futureproof this a little: if they ask how much extended
		 * processor information there is, limit it to known fields. */
		if (*eax > 0x80000008)
			*eax = 0x80000008;
		break;
	default:
		/* Every other leaf is passed through unmodified. */
		break;
	}
}
353
354/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
355 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
356 * it. The Host needs to know when the Guest wants to change them, so we have
357 * a whole series of functions like read_cr0() and write_cr0().
358 *
359 * We start with CR0. CR0 allows you to turn on and off all kinds of basic
360 * features, but Linux only really cares about one: the horrifically-named Task
361 * Switched (TS) bit at bit 3 (ie. 8)
362 *
363 * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if
364 * the floating point unit is used. Which allows us to restore FPU state
365 * lazily after a task switch, and Linux uses that gratefully, but wouldn't a
366 * name like "FPUTRAP bit" be a little less cryptic?
367 *
368 * We store cr0 (and cr3) locally, because the Host never changes it. The
369 * Guest sometimes wants to read it and we'd prefer not to bother the Host
370 * unnecessarily. */
371static unsigned long current_cr0, current_cr3;
372static void lguest_write_cr0(unsigned long val)
373{
374 /* 8 == TS bit. */
375 lazy_hcall(LHCALL_TS, val & 8, 0, 0);
376 current_cr0 = val;
377}
378
379static unsigned long lguest_read_cr0(void)
380{
381 return current_cr0;
382}
383
384/* Intel provided a special instruction to clear the TS bit for people too cool
385 * to use write_cr0() to do it. This "clts" instruction is faster, because all
386 * the vowels have been optimized out. */
387static void lguest_clts(void)
388{
389 lazy_hcall(LHCALL_TS, 0, 0, 0);
390 current_cr0 &= ~8U;
391}
392
393/* CR2 is the virtual address of the last page fault, which the Guest only ever
394 * reads. The Host kindly writes this into our "struct lguest_data", so we
395 * just read it out of there. */
396static unsigned long lguest_read_cr2(void)
397{
398 return lguest_data.cr2;
399}
400
401/* CR3 is the current toplevel pagetable page: the principle is the same as
402 * cr0. Keep a local copy, and tell the Host when it changes. */
403static void lguest_write_cr3(unsigned long cr3)
404{
405 lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
406 current_cr3 = cr3;
407}
408
409static unsigned long lguest_read_cr3(void)
410{
411 return current_cr3;
412}
413
414/* CR4 is used to enable and disable PGE, but we don't care. */
static unsigned long lguest_read_cr4(void)
{
	/* cr4 is not tracked at all: always report zero. */
	return 0UL;
}
419
static void lguest_write_cr4(unsigned long val)
{
	/* Intentionally empty: writes to cr4 are discarded. */
}
423
424/*
425 * Page Table Handling.
426 *
427 * Now would be a good time to take a rest and grab a coffee or similarly
428 * relaxing stimulant. The easy parts are behind us, and the trek gradually
429 * winds uphill from here.
430 *
431 * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU
432 * maps virtual addresses to physical addresses using "page tables". We could
433 * use one huge index of 1 million entries: each address is 4 bytes, so that's
434 * 1024 pages just to hold the page tables. But since most virtual addresses
435 * are unused, we use a two level index which saves space. The CR3 register
436 * contains the physical address of the top level "page directory" page, which
437 * contains physical addresses of up to 1024 second-level pages. Each of these
438 * second level pages contains up to 1024 physical addresses of actual pages,
439 * or Page Table Entries (PTEs).
440 *
441 * Here's a diagram, where arrows indicate physical addresses:
442 *
443 * CR3 ---> +---------+
444 * | --------->+---------+
445 * | | | PADDR1 |
446 * Top-level | | PADDR2 |
447 * (PMD) page | | |
448 * | | Lower-level |
449 * | | (PTE) page |
450 * | | | |
451 * .... ....
452 *
453 * So to convert a virtual address to a physical address, we look up the top
454 * level, which points us to the second level, which gives us the physical
455 * address of that page. If the top level entry was not present, or the second
456 * level entry was not present, then the virtual address is invalid (we
457 * say "the page was not mapped").
458 *
459 * Put another way, a 32-bit virtual address is divided up like so:
460 *
461 * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
462 * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
463 * Index into top Index into second Offset within page
464 * page directory page pagetable page
465 *
466 * The kernel spends a lot of time changing both the top-level page directory
467 * and lower-level pagetable pages. The Guest doesn't know physical addresses,
468 * so while it maintains these page tables exactly like normal, it also needs
469 * to keep the Host informed whenever it makes a change: the Host will create
470 * the real page tables based on the Guests'.
471 */
472
473/* The Guest calls this to set a second-level entry (pte), ie. to map a page
474 * into a process' address space. We set the entry then tell the Host the
475 * toplevel and address this corresponds to. The Guest uses one pagetable per
476 * process, so we need to tell the Host which one we're changing (mm->pgd). */
477static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
478 pte_t *ptep, pte_t pteval)
479{
480 *ptep = pteval;
481 lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
482}
483
484/* The Guest calls this to set a top-level entry. Again, we set the entry then
485 * tell the Host which top-level page we changed, and the index of the entry we
486 * changed. */
487static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
488{
489 *pmdp = pmdval;
490 lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,
491 (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
492}
493
494/* There are a couple of legacy places where the kernel sets a PTE, but we
495 * don't know the top level any more. This is useless for us, since we don't
496 * know which pagetable is changing or what address, so we just tell the Host
497 * to forget all of them. Fortunately, this is very rare.
498 *
499 * ... except in early boot when the kernel sets up the initial pagetables,
500 * which makes booting astonishingly slow. So we don't even tell the Host
501 * anything changed until we've done the first page table switch.
502 */
503static void lguest_set_pte(pte_t *ptep, pte_t pteval)
504{
505 *ptep = pteval;
506 /* Don't bother with hypercall before initial setup. */
507 if (current_cr3)
508 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
509}
510
511/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
512 * native page table operations. On native hardware you can set a new page
513 * table entry whenever you want, but if you want to remove one you have to do
514 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
515 *
516 * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only
517 * called when a valid entry is written, not when it's removed (ie. marked not
518 * present). Instead, this is where we come when the Guest wants to remove a
519 * page table entry: we tell the Host to set that entry to 0 (ie. the present
520 * bit is zero). */
521static void lguest_flush_tlb_single(unsigned long addr)
522{
523 /* Simply set it to zero: if it was not, it will fault back in. */
524 lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);
525}
526
527/* This is what happens after the Guest has removed a large number of entries.
528 * This tells the Host that any of the page table entries for userspace might
529 * have changed, ie. virtual addresses below PAGE_OFFSET. */
530static void lguest_flush_tlb_user(void)
531{
532 lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
533}
534
535/* This is called when the kernel page tables have changed. That's not very
536 * common (unless the Guest is using highmem, which makes the Guest extremely
537 * slow), so it's worth separating this from the user flushing above. */
538static void lguest_flush_tlb_kernel(void)
539{
540 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
541}
542
543/*
544 * The Unadvanced Programmable Interrupt Controller.
545 *
546 * This is an attempt to implement the simplest possible interrupt controller.
547 * I spent some time looking through routines like set_irq_chip_and_handler,
548 * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and
549 * I *think* this is as simple as it gets.
550 *
551 * We can tell the Host which interrupts we want blocked by using the
552 * lguest_data.blocked_interrupts bitmap, so disabling (aka "masking") them is as
553 * simple as setting a bit. We don't actually "ack" interrupts as such, we
554 * just mask and unmask them. I wonder if we should be cleverer?
555 */
556static void disable_lguest_irq(unsigned int irq)
557{
558 set_bit(irq, lguest_data.blocked_interrupts);
559}
560
561static void enable_lguest_irq(unsigned int irq)
562{
563 clear_bit(irq, lguest_data.blocked_interrupts);
564}
565
566/* This structure describes the lguest IRQ controller. */
567static struct irq_chip lguest_irq_controller = {
568 .name = "lguest",
569 .mask = disable_lguest_irq,
570 .mask_ack = disable_lguest_irq,
571 .unmask = enable_lguest_irq,
572};
573
574/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
575 * interrupt (except 128, which is used for system calls), and then tells the
576 * Linux infrastructure that each interrupt is controlled by our level-based
577 * lguest interrupt controller. */
578static void __init lguest_init_IRQ(void)
579{
580 unsigned int i;
581
582 for (i = 0; i < LGUEST_IRQS; i++) {
583 int vector = FIRST_EXTERNAL_VECTOR + i;
584 if (vector != SYSCALL_VECTOR) {
585 set_intr_gate(vector, interrupt[i]);
586 set_irq_chip_and_handler(i, &lguest_irq_controller,
587 handle_level_irq);
588 }
589 }
590 /* This call is required to set up for 4k stacks, where we have
591 * separate stacks for hard and soft interrupts. */
592 irq_ctx_init(smp_processor_id());
593}
594
595/*
596 * Time.
597 *
598 * It would be far better for everyone if the Guest had its own clock, but
599 * until then the Host gives us the time on every interrupt.
600 */
601static unsigned long lguest_get_wallclock(void)
602{
603 return lguest_data.time.tv_sec;
604}
605
606static cycle_t lguest_clock_read(void)
607{
608 unsigned long sec, nsec;
609
610 /* If the Host tells the TSC speed, we can trust that. */
611 if (lguest_data.tsc_khz)
612 return native_read_tsc();
613
614 /* If we can't use the TSC, we read the time value written by the Host.
615 * Since it's in two parts (seconds and nanoseconds), we risk reading
616 * it just as it's changing from 99 & 0.999999999 to 100 and 0, and
617 * getting 99 and 0. As Linux tends to come apart under the stress of
618 * time travel, we must be careful: */
619 do {
620 /* First we read the seconds part. */
621 sec = lguest_data.time.tv_sec;
622 /* This read memory barrier tells the compiler and the CPU that
623 * this can't be reordered: we have to complete the above
624 * before going on. */
625 rmb();
626 /* Now we read the nanoseconds part. */
627 nsec = lguest_data.time.tv_nsec;
628 /* Make sure we've done that. */
629 rmb();
630 /* Now if the seconds part has changed, try again. */
631 } while (unlikely(lguest_data.time.tv_sec != sec));
632
633 /* Our non-TSC clock is in real nanoseconds. */
634 return sec*1000000000ULL + nsec;
635}
636
637/* This is what we tell the kernel is our clocksource. */
638static struct clocksource lguest_clock = {
639 .name = "lguest",
640 .rating = 400,
641 .read = lguest_clock_read,
642 .mask = CLOCKSOURCE_MASK(64),
643 .mult = 1 << 22,
644 .shift = 22,
645 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
646};
647
648/* The "scheduler clock" is just our real clock, adjusted to start at zero */
649static unsigned long long lguest_sched_clock(void)
650{
651 return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base);
652}
653
654/* We also need a "struct clock_event_device": Linux asks us to set it to go
655 * off some time in the future. Actually, James Morris figured all this out, I
656 * just applied the patch. */
657static int lguest_clockevent_set_next_event(unsigned long delta,
658 struct clock_event_device *evt)
659{
660 if (delta < LG_CLOCK_MIN_DELTA) {
661 if (printk_ratelimit())
662 printk(KERN_DEBUG "%s: small delta %lu ns\n",
663 __FUNCTION__, delta);
664 return -ETIME;
665 }
666 hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
667 return 0;
668}
669
670static void lguest_clockevent_set_mode(enum clock_event_mode mode,
671 struct clock_event_device *evt)
672{
673 switch (mode) {
674 case CLOCK_EVT_MODE_UNUSED:
675 case CLOCK_EVT_MODE_SHUTDOWN:
676 /* A 0 argument shuts the clock down. */
677 hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0);
678 break;
679 case CLOCK_EVT_MODE_ONESHOT:
680 /* This is what we expect. */
681 break;
682 case CLOCK_EVT_MODE_PERIODIC:
683 BUG();
684 case CLOCK_EVT_MODE_RESUME:
685 break;
686 }
687}
688
689/* This describes our primitive timer chip. */
690static struct clock_event_device lguest_clockevent = {
691 .name = "lguest",
692 .features = CLOCK_EVT_FEAT_ONESHOT,
693 .set_next_event = lguest_clockevent_set_next_event,
694 .set_mode = lguest_clockevent_set_mode,
695 .rating = INT_MAX,
696 .mult = 1,
697 .shift = 0,
698 .min_delta_ns = LG_CLOCK_MIN_DELTA,
699 .max_delta_ns = LG_CLOCK_MAX_DELTA,
700};
701
702/* This is the Guest timer interrupt handler (hardware interrupt 0). We just
703 * call the clockevent infrastructure and it does whatever needs doing. */
704static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
705{
706 unsigned long flags;
707
708 /* Don't interrupt us while this is running. */
709 local_irq_save(flags);
710 lguest_clockevent.event_handler(&lguest_clockevent);
711 local_irq_restore(flags);
712}
713
714/* At some point in the boot process, we get asked to set up our timing
715 * infrastructure. The kernel doesn't expect timer interrupts before this, but
716 * we cleverly initialized the "blocked_interrupts" field of "struct
717 * lguest_data" so that timer interrupts were blocked until now. */
718static void lguest_time_init(void)
719{
720 /* Set up the timer interrupt (0) to go to our simple timer routine */
721 set_irq_handler(0, lguest_time_irq);
722
723 /* Our clock structure look like arch/i386/kernel/tsc.c if we can use
724 * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either
725 * way, the "rating" is initialized so high that it's always chosen
726 * over any other clocksource. */
727 if (lguest_data.tsc_khz)
728 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
729 lguest_clock.shift);
730 clock_base = lguest_clock_read();
731 clocksource_register(&lguest_clock);
732
733 /* Now we've set up our clock, we can use it as the scheduler clock */
734 pv_time_ops.sched_clock = lguest_sched_clock;
735
736 /* We can't set cpumask in the initializer: damn C limitations! Set it
737 * here and register our timer device. */
738 lguest_clockevent.cpumask = cpumask_of_cpu(0);
739 clockevents_register_device(&lguest_clockevent);
740
741 /* Finally, we unblock the timer interrupt. */
742 enable_lguest_irq(0);
743}
744
745/*
746 * Miscellaneous bits and pieces.
747 *
748 * Here is an oddball collection of functions which the Guest needs for things
749 * to work. They're pretty simple.
750 */
751
752/* The Guest needs to tell the host what stack it expects traps to use. For
753 * native hardware, this is part of the Task State Segment mentioned above in
754 * lguest_load_tr_desc(), but to help hypervisors there's this special call.
755 *
756 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
757 * segment), the privilege level (we're privilege level 1, the Host is 0 and
758 * will not tolerate us trying to use that), the stack pointer, and the number
759 * of pages in the stack. */
760static void lguest_load_esp0(struct tss_struct *tss,
761 struct thread_struct *thread)
762{
763 lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0,
764 THREAD_SIZE/PAGE_SIZE);
765}
766
/* Debug-register writes are silently dropped: let's just say, I wouldn't do
 * debugging under a Guest. */
static void lguest_set_debugreg(int regno, unsigned long value)
{
	/* FIXME: Implement */
}
772
/* Sometimes the kernel wants to be sure that no memory writes are still
 * sitting in the cache (that they have all reached real hardware devices).
 * The Guest only has virtual hardware, so this does not matter to it.
 *
 * On the Pentium 4 and above, cpuid() advertises the Cache Line Flush
 * (clflush) instruction and the kernel uses that; otherwise it uses the older
 * "Write Back and Invalidate Cache" (wbinvd) instruction.  Unlike clflush,
 * wbinvd can only be run at privilege level 0, so clflush can be left alone
 * but wbinvd has to be replaced - with nothing at all. */
static void lguest_wbinvd(void)
{
	/* Intentionally empty. */
}
786
/* If the Guest expects an Advanced Programmable Interrupt Controller we play
 * dumb: writes are ignored and reads return 0.  That leaves it neither
 * Programmable nor Controlling anything, hardly Advanced, and it will never
 * interrupt anything - but it does get us through the Linux boot code. */
#ifdef CONFIG_X86_LOCAL_APIC
static void lguest_apic_write(unsigned long reg, unsigned long v)
{
	/* Deliberately ignored. */
}

static unsigned long lguest_apic_read(unsigned long reg)
{
	/* Every register reads as zero. */
	return 0;
}
#endif
802
803/* STOP! Until an interrupt comes in. */
804static void lguest_safe_halt(void)
805{
806 hcall(LHCALL_HALT, 0, 0, 0);
807}
808
809/* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a
810 * message out when we're crashing as well as elegant termination like powering
811 * off.
812 *
813 * Note that the Host always prefers that the Guest speak in physical addresses
814 * rather than virtual addresses, so we use __pa() here. */
815static void lguest_power_off(void)
816{
817 hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
818}
819
820/*
821 * Panicking.
822 *
823 * Don't. But if you did, this is what happens.
824 */
825static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
826{
827 hcall(LHCALL_CRASH, __pa(p), 0, 0);
828 /* The hcall won't return, but to keep gcc happy, we're "done". */
829 return NOTIFY_DONE;
830}
831
832static struct notifier_block paniced = {
833 .notifier_call = lguest_panic
834};
835
836/* Setting up memory is fairly easy. */
837static __init char *lguest_memory_setup(void)
838{
839 /* We do this here and not earlier because lockcheck barfs if we do it
840 * before start_kernel() */
841 atomic_notifier_chain_register(&panic_notifier_list, &paniced);
842
843 /* The Linux bootloader header contains an "e820" memory map: the
844 * Launcher populated the first entry with our memory limit. */
845 add_memory_region(boot_params.e820_map[0].addr,
846 boot_params.e820_map[0].size,
847 boot_params.e820_map[0].type);
848
849 /* This string is for the boot messages. */
850 return "LGUEST";
851}
852
853/* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to
854 * produce console output. */
855static __init int early_put_chars(u32 vtermno, const char *buf, int count)
856{
857 char scratch[17];
858 unsigned int len = count;
859
860 if (len > sizeof(scratch) - 1)
861 len = sizeof(scratch) - 1;
862 scratch[len] = '\0';
863 memcpy(scratch, buf, len);
864 hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0);
865
866 /* This routine returns the number of bytes actually written. */
867 return len;
868}
869
870/*G:050
871 * Patching (Powerfully Placating Performance Pedants)
872 *
873 * We have already seen that pv_ops structures let us replace simple
874 * native instructions with calls to the appropriate back end all throughout
875 * the kernel. This allows the same kernel to run as a Guest and as a native
876 * kernel, but it's slow because of all the indirect branches.
877 *
878 * Remember that David Wheeler quote about "Any problem in computer science can
879 * be solved with another layer of indirection"? The rest of that quote is
880 * "... But that usually will create another problem." This is the first of
881 * those problems.
882 *
883 * Our current solution is to allow the paravirt back end to optionally patch
884 * over the indirect calls to replace them with something more efficient. We
885 * patch the four most commonly called functions: disable interrupts, enable
886 * interrupts, restore interrupts and save interrupts. We usually have 10
887 * bytes to patch into: the Guest versions of these operations are small enough
888 * that we can fit comfortably.
889 *
890 * First we need assembly templates of each of the patchable Guest operations,
891 * and these are in i386_head.S. */
892
893/*G:060 We construct a table from the assembler templates: */
894static const struct lguest_insns
895{
896 const char *start, *end;
897} lguest_insns[] = {
898 [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
899 [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
900 [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
901 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
902};
903
904/* Now our patch routine is fairly simple (based on the native one in
905 * paravirt.c). If we have a replacement, we copy it in and return how much of
906 * the available space we used. */
907static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
908 unsigned long addr, unsigned len)
909{
910 unsigned int insn_len;
911
912 /* Don't do anything special if we don't have a replacement */
913 if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
914 return paravirt_patch_default(type, clobber, ibuf, addr, len);
915
916 insn_len = lguest_insns[type].end - lguest_insns[type].start;
917
918 /* Similarly if we can't fit replacement (shouldn't happen, but let's
919 * be thorough). */
920 if (len < insn_len)
921 return paravirt_patch_default(type, clobber, ibuf, addr, len);
922
923 /* Copy in our instructions. */
924 memcpy(ibuf, lguest_insns[type].start, insn_len);
925 return insn_len;
926}
927
928/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops
929 * structures in the kernel provide points for (almost) every routine we have
930 * to override to avoid privileged instructions.
 *
 * lguest_init() installs the lguest overrides into those structures, performs
 * the early CPU, console and power-off setup below, and finally calls
 * start_kernel(). */
931__init void lguest_init(void)
932{
933 /* We're under lguest, paravirt is enabled, and we're running at
934 * privilege level 1, not 0 as normal. */
935 pv_info.name = "lguest";
936 pv_info.paravirt_enabled = 1;
937 pv_info.kernel_rpl = 1;
938
939 /* We set up all the lguest overrides for sensitive operations. These
940 * are detailed with the operations themselves. */
941
942 /* interrupt-related operations */
943 pv_irq_ops.init_IRQ = lguest_init_IRQ;
944 pv_irq_ops.save_fl = save_fl;
945 pv_irq_ops.restore_fl = restore_fl;
946 pv_irq_ops.irq_disable = irq_disable;
947 pv_irq_ops.irq_enable = irq_enable;
948 pv_irq_ops.safe_halt = lguest_safe_halt;
949
950 /* init-time operations */
951 pv_init_ops.memory_setup = lguest_memory_setup;
952 pv_init_ops.patch = lguest_patch;
953
954 /* Intercepts of various cpu instructions */
955 pv_cpu_ops.load_gdt = lguest_load_gdt;
956 pv_cpu_ops.cpuid = lguest_cpuid;
957 pv_cpu_ops.load_idt = lguest_load_idt;
958 pv_cpu_ops.iret = lguest_iret;
959 pv_cpu_ops.load_esp0 = lguest_load_esp0;
960 pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
961 pv_cpu_ops.set_ldt = lguest_set_ldt;
962 pv_cpu_ops.load_tls = lguest_load_tls;
963 pv_cpu_ops.set_debugreg = lguest_set_debugreg;
964 pv_cpu_ops.clts = lguest_clts;
965 pv_cpu_ops.read_cr0 = lguest_read_cr0;
966 pv_cpu_ops.write_cr0 = lguest_write_cr0;
967 pv_cpu_ops.read_cr4 = lguest_read_cr4;
968 pv_cpu_ops.write_cr4 = lguest_write_cr4;
969 pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
970 pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
971 pv_cpu_ops.wbinvd = lguest_wbinvd;
972 pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
973 pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
974
975 /* pagetable management */
976 pv_mmu_ops.write_cr3 = lguest_write_cr3;
977 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
978 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
979 pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
980 pv_mmu_ops.set_pte = lguest_set_pte;
981 pv_mmu_ops.set_pte_at = lguest_set_pte_at;
982 pv_mmu_ops.set_pmd = lguest_set_pmd;
983 pv_mmu_ops.read_cr2 = lguest_read_cr2;
984 pv_mmu_ops.read_cr3 = lguest_read_cr3;
985 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
986 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
987
988#ifdef CONFIG_X86_LOCAL_APIC
989 /* apic read/write intercepts */
990 pv_apic_ops.apic_write = lguest_apic_write;
991 pv_apic_ops.apic_write_atomic = lguest_apic_write;
992 pv_apic_ops.apic_read = lguest_apic_read;
993#endif
994
995 /* time operations */
996 pv_time_ops.get_wallclock = lguest_get_wallclock;
997 pv_time_ops.time_init = lguest_time_init;
998
999 /* Now is a good time to look at the implementations of these functions
1000 * before returning to the rest of lguest_init(). */
1001
1002 /*G:070 Now we've seen all the paravirt_ops, we return to
1003 * lguest_init() where the rest of the fairly chaotic boot setup
1004 * occurs. */
1005
1006 /* The native boot code sets up initial page tables immediately after
1007 * the kernel itself, and sets init_pg_tables_end so they're not
1008 * clobbered. The Launcher places our initial pagetables somewhere at
1009 * the top of our physical memory, so we don't need extra space: set
1010 * init_pg_tables_end to the end of the kernel. */
1011 init_pg_tables_end = __pa(pg0);
1012
1013 /* Load the %fs segment register (the per-cpu segment register) with
1014 * the normal data segment to get through booting. */
1015 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
1016
1017 /* The Host uses the top of the Guest's virtual address space for the
1018 * Host<->Guest Switcher, and it tells us how much it needs in
1019 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */
1020 reserve_top_address(lguest_data.reserve_mem);
1021
1022 /* If we don't initialize the lock dependency checker now, it crashes
1023 * paravirt_disable_iospace. */
1024 lockdep_init();
1025
1026 /* The IDE code spends about 3 seconds probing for disks: if we reserve
1027 * all the I/O ports up front it can't get them and so doesn't probe.
1028 * Other device drivers are similar (but less severe). This cuts the
1029 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */
1030 paravirt_disable_iospace();
1031
1032 /* This is messy CPU setup stuff which the native boot code does before
1033 * start_kernel, so we have to do, too: */
1034 cpu_detect(&new_cpu_data);
1035 /* head.S usually sets up the first capability word, so do it here. */
1036 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1037
1038 /* Math is always hard! */
1039 new_cpu_data.hard_math = 1;
1040
 /* Where they are configured in, machine-check handling and ACPI are
 * switched off for the Guest. */
1041#ifdef CONFIG_X86_MCE
1042 mce_disabled = 1;
1043#endif
1044#ifdef CONFIG_ACPI
1045 acpi_disabled = 1;
1046 acpi_ht = 0;
1047#endif
1048
1049 /* We set the preferred console to "hvc". This is the "hypervisor
1050 * virtual console" driver written by the PowerPC people, which we also
1051 * adapted for lguest's use. */
1052 add_preferred_console("hvc", 0, NULL);
1053
1054 /* Register our very early console. */
1055 virtio_cons_early_init(early_put_chars);
1056
1057 /* Last of all, we set the power management poweroff hook to point to
1058 * the Guest routine to power off. */
1059 pm_power_off = lguest_power_off;
1060
1061 /* Now we're set up, call start_kernel() in init/main.c and we proceed
1062 * to boot as normal. It never returns. */
1063 start_kernel();
1064}
1065/*
1066 * This marks the end of stage II of our journey, The Guest.
1067 *
1068 * It is now time for us to explore the nooks and crannies of the three Guest
1069 * devices and complete our understanding of the Guest in "make Drivers".
1070 */
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
new file mode 100644
index 000000000000..ebc6ac733899
--- /dev/null
+++ b/arch/x86/lguest/i386_head.S
@@ -0,0 +1,115 @@
1#include <linux/linkage.h>
2#include <linux/lguest.h>
3#include <asm/lguest_hcall.h>
4#include <asm/asm-offsets.h>
5#include <asm/thread_info.h>
6#include <asm/processor-flags.h>
7
8/*G:020 This is where we begin: head.S notes that the boot header's platform
9 * type field is "1" (lguest), so calls us here. The boot header is in %esi.
10 *
11 * WARNING: be very careful here! We're running at addresses equal to physical
12 * addresses (around 0), not above PAGE_OFFSET as most code expects
13 * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
14 * data through its linked address: every data reference below subtracts
 * __PAGE_OFFSET first.
15 *
16 * The .section line puts this code in .init.text so it will be discarded after
17 * boot. */
18.section .init.text, "ax", @progbits
19ENTRY(lguest_entry)
20 /* Make initial hypercall now, so we can set up the pagetables. The
 * hypercall number goes in %eax and the physical address of
 * lguest_data in %edx before we trap to the Host. */
21 movl $LHCALL_LGUEST_INIT, %eax
22 movl $lguest_data - __PAGE_OFFSET, %edx
23 int $LGUEST_TRAP_ENTRY
24
25 /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
26 * instruction uses %esi implicitly.
 * NOTE(review): this overwrites the boot header pointer that arrived
 * in %esi; presumably it is no longer needed here - confirm. */
27 movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
28
29 /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
30 * This means the first 128M of kernel memory will be mapped at
31 * PAGE_OFFSET where the kernel expects to run. This will get it far
32 * enough through boot to switch to its own pagetables.
 * %ecx is the entry count, %esi the source (start of the directory)
 * and %edi the destination (the slot for __PAGE_OFFSET, 4 bytes per
 * entry). */
33 movl $32, %ecx
34 movl %esi, %edi
35 addl $((__PAGE_OFFSET >> 22) * 4), %edi
36 rep
37 movsl
38
39 /* Set up the initial stack so we can run C code. */
40 movl $(init_thread_union+THREAD_SIZE),%esp
41
42 /* Jumps are relative, and we're running __PAGE_OFFSET too low at the
43 * moment. */
44 jmp lguest_init+__PAGE_OFFSET
45
46/*G:055 We create a macro which puts the assembler code between lgstart_ and
47 * lgend_ markers. These templates are put in the .text section: they can't be
48 * discarded after boot as we may need to patch modules, too.
 *
 * LGUEST_PATCH(name, insns) emits "insns" bracketed by the global labels
 * lgstart_<name> and lgend_<name>. */
49.text
50#define LGUEST_PATCH(name, insns...) \
51 lgstart_##name: insns; lgend_##name:; \
52 .globl lgstart_##name; .globl lgend_##name
53
/* Each template is a single movl on lguest_data+LGUEST_DATA_irq_enabled:
 * cli stores 0, sti stores X86_EFLAGS_IF, popf stores %eax, and pushf loads
 * the value into %eax. */
54LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
55LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
56LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
57LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
58/*:*/
59
60/* These mark the EIP range in which the Host must never deliver interrupts. */
61.global lguest_noirq_start
62.global lguest_noirq_end
63
64/*M:004 When the Host reflects a trap or injects an interrupt into the Guest,
65 * it sets the eflags interrupt bit on the stack based on
66 * lguest_data.irq_enabled, so the Guest iret logic does the right thing when
67 * restoring it. However, when the Host sets the Guest up for direct traps,
68 * such as system calls, the processor is the one to push eflags onto the
69 * stack, and the interrupt bit will be 1 (in reality, interrupts are always
70 * enabled in the Guest).
71 *
72 * This turns out to be harmless: the only trap which should happen under Linux
73 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
74 * regions), which has to be reflected through the Host anyway. If another
75 * trap *does* go off when interrupts are disabled, the Guest will panic, and
76 * we'll never get to this iret! :*/
77
78/*G:045 There is one final paravirt_op that the Guest implements, and glancing
79 * at it you can see why I left it to last. It's *cool*! It's in *assembler*!
80 *
81 * The "iret" instruction is used to return from an interrupt or trap. The
82 * stack looks like this:
83 * old address
84 * old code segment & privilege level
85 * old processor flags ("eflags")
86 *
87 * The "iret" instruction pops those values off the stack and restores them all
88 * at once. The only problem is that eflags includes the Interrupt Flag which
89 * the Guest can't change: the CPU will simply ignore it when we do an "iret".
90 * So we have to copy eflags from the stack to lguest_data.irq_enabled before
91 * we do the "iret".
92 *
93 * There are two problems with this: firstly, we need to use a register to do
94 * the copy and secondly, the whole thing needs to be atomic. The first
95 * problem is easy to solve: push %eax on the stack so we can use it, and then
96 * restore it at the end just before the real "iret".
97 *
98 * The second is harder: copying eflags to lguest_data.irq_enabled will turn
99 * interrupts on before we're finished, so we could be interrupted before we
100 * return to userspace or wherever. Our solution to this is to surround the
101 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the
102 * Host that it is *never* to interrupt us there, even if interrupts seem to be
103 * enabled. */
/* Guest replacement for "iret": copy the eflags value saved on the stack
 * into lguest_data's irq_enabled slot, then execute the real iret. */
104ENTRY(lguest_iret)
 /* Borrow %eax as scratch; it is popped again just before the iret. */
105 pushl %eax
 /* With %eax pushed, the stack holds saved %eax, old address, old code
 * segment and then old eflags, so eflags sits at 12(%esp). */
106 movl 12(%esp), %eax
107lguest_noirq_start:
108 /* Note the %ss: segment prefix here. Normal data accesses use the
109 * "ds" segment, but that will have already been restored for whatever
110 * we're returning to (such as userspace): we can't trust it. The %ss:
111 * prefix makes sure we use the stack segment, which is still valid. */
112 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
113 popl %eax
114 iret
 /* End of the range opened by lguest_noirq_start above. */
115lguest_noirq_end:
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 9df99e1885a4..fbfa55ce0d55 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -3,8 +3,9 @@
3# 3#
4 4
5config XEN 5config XEN
6 bool "Enable support for Xen hypervisor" 6 bool "Xen guest support"
7 depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES 7 select PARAVIRT
8 depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER)
8 help 9 help
9 This is the Linux Xen port. Enabling this will allow the 10 This is the Linux Xen port. Enabling this will allow the
10 kernel to boot in a paravirtualized environment under the 11 kernel to boot in a paravirtualized environment under the