aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorRusty Russell <rusty@rustcorp.com.au>2009-07-30 18:03:45 -0400
committerRusty Russell <rusty@rustcorp.com.au>2009-07-30 02:33:45 -0400
commit2e04ef76916d1e29a077ea9d0f2003c8fd86724d (patch)
tree2ff8d625d6e467be9f9f1b67a3674cb6e125e970 /arch/x86
parente969fed542cae08cb11d666efac4f7c5d624d09f (diff)
lguest: fix comment style
I don't really notice it (except to begrudge the extra vertical space), but Ingo does. And he pointed out that since one excuse for lguest is as a teaching tool, it should set a good example. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Cc: Ingo Molnar <mingo@redhat.com>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/include/asm/lguest.h3
-rw-r--r--arch/x86/include/asm/lguest_hcall.h10
-rw-r--r--arch/x86/lguest/boot.c428
-rw-r--r--arch/x86/lguest/i386_head.S110
4 files changed, 353 insertions, 198 deletions
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 313389cd50d2..5136dad57cbb 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,7 @@
17/* Pages for switcher itself, then two pages per cpu */ 17/* Pages for switcher itself, then two pages per cpu */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) 18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
19 19
20/* We map at -4M (-2M when PAE is activated) for ease of mapping 20/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */
21 * into the guest (one PTE page). */
22#ifdef CONFIG_X86_PAE 21#ifdef CONFIG_X86_PAE
23#define SWITCHER_ADDR 0xFFE00000 22#define SWITCHER_ADDR 0xFFE00000
24#else 23#else
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index 33600a66755f..cceb73e12e50 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -30,7 +30,8 @@
30#include <asm/hw_irq.h> 30#include <asm/hw_irq.h>
31#include <asm/kvm_para.h> 31#include <asm/kvm_para.h>
32 32
33/*G:030 But first, how does our Guest contact the Host to ask for privileged 33/*G:030
34 * But first, how does our Guest contact the Host to ask for privileged
34 * operations? There are two ways: the direct way is to make a "hypercall", 35 * operations? There are two ways: the direct way is to make a "hypercall",
35 * to make requests of the Host Itself. 36 * to make requests of the Host Itself.
36 * 37 *
@@ -41,16 +42,15 @@
41 * 42 *
42 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 43 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
43 * Host, rather than returning failure. This reflects Winston Churchill's 44 * Host, rather than returning failure. This reflects Winston Churchill's
44 * definition of a gentleman: "someone who is only rude intentionally". */ 45 * definition of a gentleman: "someone who is only rude intentionally".
45/*:*/ 46:*/
46 47
47/* Can't use our min() macro here: needs to be a constant */ 48/* Can't use our min() macro here: needs to be a constant */
48#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 49#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
49 50
50#define LHCALL_RING_SIZE 64 51#define LHCALL_RING_SIZE 64
51struct hcall_args { 52struct hcall_args {
52 /* These map directly onto eax, ebx, ecx, edx and esi 53 /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
53 * in struct lguest_regs */
54 unsigned long arg0, arg1, arg2, arg3, arg4; 54 unsigned long arg0, arg1, arg2, arg3, arg4;
55}; 55};
56 56
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index f2bf1f73d468..025c04d18f2b 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -22,7 +22,8 @@
22 * 22 *
23 * So how does the kernel know it's a Guest? We'll see that later, but let's 23 * So how does the kernel know it's a Guest? We'll see that later, but let's
24 * just say that we end up here where we replace the native functions in various 24 * just say that we end up here where we replace the native functions in various
25 * "paravirt" structures with our Guest versions, then boot like normal. :*/ 25 * "paravirt" structures with our Guest versions, then boot like normal.
26:*/
26 27
27/* 28/*
28 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 29 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
@@ -74,7 +75,8 @@
74 * 75 *
75 * The Guest in our tale is a simple creature: identical to the Host but 76 * The Guest in our tale is a simple creature: identical to the Host but
76 * behaving in simplified but equivalent ways. In particular, the Guest is the 77 * behaving in simplified but equivalent ways. In particular, the Guest is the
77 * same kernel as the Host (or at least, built from the same source code). :*/ 78 * same kernel as the Host (or at least, built from the same source code).
79:*/
78 80
79struct lguest_data lguest_data = { 81struct lguest_data lguest_data = {
80 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, 82 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
@@ -85,7 +87,8 @@ struct lguest_data lguest_data = {
85 .syscall_vec = SYSCALL_VECTOR, 87 .syscall_vec = SYSCALL_VECTOR,
86}; 88};
87 89
88/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a 90/*G:037
91 * async_hcall() is pretty simple: I'm quite proud of it really. We have a
89 * ring buffer of stored hypercalls which the Host will run through next time we 92 * ring buffer of stored hypercalls which the Host will run through next time we
90 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall 93 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
91 * arguments, and a "hcall_status" word which is 0 if the call is ready to go, 94 * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
@@ -94,7 +97,8 @@ struct lguest_data lguest_data = {
94 * If we come around to a slot which hasn't been finished, then the table is 97 * If we come around to a slot which hasn't been finished, then the table is
95 * full and we just make the hypercall directly. This has the nice side 98 * full and we just make the hypercall directly. This has the nice side
96 * effect of causing the Host to run all the stored calls in the ring buffer 99 * effect of causing the Host to run all the stored calls in the ring buffer
97 * which empties it for next time! */ 100 * which empties it for next time!
101 */
98static void async_hcall(unsigned long call, unsigned long arg1, 102static void async_hcall(unsigned long call, unsigned long arg1,
99 unsigned long arg2, unsigned long arg3, 103 unsigned long arg2, unsigned long arg3,
100 unsigned long arg4) 104 unsigned long arg4)
@@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1,
103 static unsigned int next_call; 107 static unsigned int next_call;
104 unsigned long flags; 108 unsigned long flags;
105 109
106 /* Disable interrupts if not already disabled: we don't want an 110 /*
111 * Disable interrupts if not already disabled: we don't want an
107 * interrupt handler making a hypercall while we're already doing 112 * interrupt handler making a hypercall while we're already doing
108 * one! */ 113 * one!
114 */
109 local_irq_save(flags); 115 local_irq_save(flags);
110 if (lguest_data.hcall_status[next_call] != 0xFF) { 116 if (lguest_data.hcall_status[next_call] != 0xFF) {
111 /* Table full, so do normal hcall which will flush table. */ 117 /* Table full, so do normal hcall which will flush table. */
@@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1,
125 local_irq_restore(flags); 131 local_irq_restore(flags);
126} 132}
127 133
128/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first 134/*G:035
129 * real optimization trick! 135 * Notice the lazy_hcall() above, rather than hcall(). This is our first real
136 * optimization trick!
130 * 137 *
131 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do 138 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
132 * them as a batch when lazy_mode is eventually turned off. Because hypercalls 139 * them as a batch when lazy_mode is eventually turned off. Because hypercalls
@@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1,
136 * lguest_leave_lazy_mode(). 143 * lguest_leave_lazy_mode().
137 * 144 *
138 * So, when we're in lazy mode, we call async_hcall() to store the call for 145 * So, when we're in lazy mode, we call async_hcall() to store the call for
139 * future processing: */ 146 * future processing:
147 */
140static void lazy_hcall1(unsigned long call, 148static void lazy_hcall1(unsigned long call,
141 unsigned long arg1) 149 unsigned long arg1)
142{ 150{
@@ -208,9 +216,11 @@ static void lguest_end_context_switch(struct task_struct *next)
208 * check there before it tries to deliver an interrupt. 216 * check there before it tries to deliver an interrupt.
209 */ 217 */
210 218
211/* save_flags() is expected to return the processor state (ie. "flags"). The 219/*
220 * save_flags() is expected to return the processor state (ie. "flags"). The
212 * flags word contains all kinds of stuff, but in practice Linux only cares 221 * flags word contains all kinds of stuff, but in practice Linux only cares
213 * about the interrupt flag. Our "save_flags()" just returns that. */ 222 * about the interrupt flag. Our "save_flags()" just returns that.
223 */
214static unsigned long save_fl(void) 224static unsigned long save_fl(void)
215{ 225{
216 return lguest_data.irq_enabled; 226 return lguest_data.irq_enabled;
@@ -222,13 +232,15 @@ static void irq_disable(void)
222 lguest_data.irq_enabled = 0; 232 lguest_data.irq_enabled = 0;
223} 233}
224 234
225/* Let's pause a moment. Remember how I said these are called so often? 235/*
236 * Let's pause a moment. Remember how I said these are called so often?
226 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to 237 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
227 * break some rules. In particular, these functions are assumed to save their 238 * break some rules. In particular, these functions are assumed to save their
228 * own registers if they need to: normal C functions assume they can trash the 239 * own registers if they need to: normal C functions assume they can trash the
229 * eax register. To use normal C functions, we use 240 * eax register. To use normal C functions, we use
230 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the 241 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
231 * C function, then restores it. */ 242 * C function, then restores it.
243 */
232PV_CALLEE_SAVE_REGS_THUNK(save_fl); 244PV_CALLEE_SAVE_REGS_THUNK(save_fl);
233PV_CALLEE_SAVE_REGS_THUNK(irq_disable); 245PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
234/*:*/ 246/*:*/
@@ -237,18 +249,20 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
237extern void lg_irq_enable(void); 249extern void lg_irq_enable(void);
238extern void lg_restore_fl(unsigned long flags); 250extern void lg_restore_fl(unsigned long flags);
239 251
240/*M:003 Note that we don't check for outstanding interrupts when we re-enable 252/*M:003
241 * them (or when we unmask an interrupt). This seems to work for the moment, 253 * Note that we don't check for outstanding interrupts when we re-enable them
242 * since interrupts are rare and we'll just get the interrupt on the next timer 254 * (or when we unmask an interrupt). This seems to work for the moment, since
243 * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way 255 * interrupts are rare and we'll just get the interrupt on the next timer tick,
244 * would be to put the "irq_enabled" field in a page by itself, and have the 256 * but now we can run with CONFIG_NO_HZ, we should revisit this. One way would
245 * Host write-protect it when an interrupt comes in when irqs are disabled. 257 * be to put the "irq_enabled" field in a page by itself, and have the Host
246 * There will then be a page fault as soon as interrupts are re-enabled. 258 * write-protect it when an interrupt comes in when irqs are disabled. There
259 * will then be a page fault as soon as interrupts are re-enabled.
247 * 260 *
248 * A better method is to implement soft interrupt disable generally for x86: 261 * A better method is to implement soft interrupt disable generally for x86:
249 * instead of disabling interrupts, we set a flag. If an interrupt does come 262 * instead of disabling interrupts, we set a flag. If an interrupt does come
250 * in, we then disable them for real. This is uncommon, so we could simply use 263 * in, we then disable them for real. This is uncommon, so we could simply use
251 * a hypercall for interrupt control and not worry about efficiency. :*/ 264 * a hypercall for interrupt control and not worry about efficiency.
265:*/
252 266
253/*G:034 267/*G:034
254 * The Interrupt Descriptor Table (IDT). 268 * The Interrupt Descriptor Table (IDT).
@@ -261,10 +275,12 @@ extern void lg_restore_fl(unsigned long flags);
261static void lguest_write_idt_entry(gate_desc *dt, 275static void lguest_write_idt_entry(gate_desc *dt,
262 int entrynum, const gate_desc *g) 276 int entrynum, const gate_desc *g)
263{ 277{
264 /* The gate_desc structure is 8 bytes long: we hand it to the Host in 278 /*
279 * The gate_desc structure is 8 bytes long: we hand it to the Host in
265 * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors 280 * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors
266 * around like this; typesafety wasn't a big concern in Linux's early 281 * around like this; typesafety wasn't a big concern in Linux's early
267 * years. */ 282 * years.
283 */
268 u32 *desc = (u32 *)g; 284 u32 *desc = (u32 *)g;
269 /* Keep the local copy up to date. */ 285 /* Keep the local copy up to date. */
270 native_write_idt_entry(dt, entrynum, g); 286 native_write_idt_entry(dt, entrynum, g);
@@ -272,9 +288,11 @@ static void lguest_write_idt_entry(gate_desc *dt,
272 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); 288 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
273} 289}
274 290
275/* Changing to a different IDT is very rare: we keep the IDT up-to-date every 291/*
292 * Changing to a different IDT is very rare: we keep the IDT up-to-date every
276 * time it is written, so we can simply loop through all entries and tell the 293 * time it is written, so we can simply loop through all entries and tell the
277 * Host about them. */ 294 * Host about them.
295 */
278static void lguest_load_idt(const struct desc_ptr *desc) 296static void lguest_load_idt(const struct desc_ptr *desc)
279{ 297{
280 unsigned int i; 298 unsigned int i;
@@ -305,9 +323,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc)
305 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); 323 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b);
306} 324}
307 325
308/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, 326/*
327 * For a single GDT entry which changes, we do the lazy thing: alter our GDT,
309 * then tell the Host to reload the entire thing. This operation is so rare 328 * then tell the Host to reload the entire thing. This operation is so rare
310 * that this naive implementation is reasonable. */ 329 * that this naive implementation is reasonable.
330 */
311static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, 331static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
312 const void *desc, int type) 332 const void *desc, int type)
313{ 333{
@@ -317,29 +337,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
317 dt[entrynum].a, dt[entrynum].b); 337 dt[entrynum].a, dt[entrynum].b);
318} 338}
319 339
320/* OK, I lied. There are three "thread local storage" GDT entries which change 340/*
341 * OK, I lied. There are three "thread local storage" GDT entries which change
321 * on every context switch (these three entries are how glibc implements 342 * on every context switch (these three entries are how glibc implements
322 * __thread variables). So we have a hypercall specifically for this case. */ 343 * __thread variables). So we have a hypercall specifically for this case.
344 */
323static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) 345static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
324{ 346{
325 /* There's one problem which normal hardware doesn't have: the Host 347 /*
348 * There's one problem which normal hardware doesn't have: the Host
326 * can't handle us removing entries we're currently using. So we clear 349 * can't handle us removing entries we're currently using. So we clear
327 * the GS register here: if it's needed it'll be reloaded anyway. */ 350 * the GS register here: if it's needed it'll be reloaded anyway.
351 */
328 lazy_load_gs(0); 352 lazy_load_gs(0);
329 lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); 353 lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
330} 354}
331 355
332/*G:038 That's enough excitement for now, back to ploughing through each of 356/*G:038
333 * the different pv_ops structures (we're about 1/3 of the way through). 357 * That's enough excitement for now, back to ploughing through each of the
358 * different pv_ops structures (we're about 1/3 of the way through).
334 * 359 *
335 * This is the Local Descriptor Table, another weird Intel thingy. Linux only 360 * This is the Local Descriptor Table, another weird Intel thingy. Linux only
336 * uses this for some strange applications like Wine. We don't do anything 361 * uses this for some strange applications like Wine. We don't do anything
337 * here, so they'll get an informative and friendly Segmentation Fault. */ 362 * here, so they'll get an informative and friendly Segmentation Fault.
363 */
338static void lguest_set_ldt(const void *addr, unsigned entries) 364static void lguest_set_ldt(const void *addr, unsigned entries)
339{ 365{
340} 366}
341 367
342/* This loads a GDT entry into the "Task Register": that entry points to a 368/*
369 * This loads a GDT entry into the "Task Register": that entry points to a
343 * structure called the Task State Segment. Some comments scattered through the 370 * structure called the Task State Segment. Some comments scattered through the
344 * kernel code indicate that this was used for task switching in ages past, along 371 * kernel code indicate that this was used for task switching in ages past, along
345 * with blood sacrifice and astrology. 372 * with blood sacrifice and astrology.
@@ -347,19 +374,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries)
347 * Now there's nothing interesting in here that we don't get told elsewhere. 374 * Now there's nothing interesting in here that we don't get told elsewhere.
348 * But the native version uses the "ltr" instruction, which makes the Host 375 * But the native version uses the "ltr" instruction, which makes the Host
349 * complain to the Guest about a Segmentation Fault and it'll oops. So we 376 * complain to the Guest about a Segmentation Fault and it'll oops. So we
350 * override the native version with a do-nothing version. */ 377 * override the native version with a do-nothing version.
378 */
351static void lguest_load_tr_desc(void) 379static void lguest_load_tr_desc(void)
352{ 380{
353} 381}
354 382
355/* The "cpuid" instruction is a way of querying both the CPU identity 383/*
384 * The "cpuid" instruction is a way of querying both the CPU identity
356 * (manufacturer, model, etc) and its features. It was introduced before the 385 * (manufacturer, model, etc) and its features. It was introduced before the
357 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. 386 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
358 * As you might imagine, after a decade and a half of this treatment, it is now a 387 * As you might imagine, after a decade and a half of this treatment, it is now a
359 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. 388 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
360 * 389 *
361 * This instruction even has its own Wikipedia entry. The Wikipedia entry 390 * This instruction even has its own Wikipedia entry. The Wikipedia entry
362 * has been translated into 4 languages. I am not making this up! 391 * has been translated into 5 languages. I am not making this up!
363 * 392 *
364 * We could get funky here and identify ourselves as "GenuineLguest", but 393 * We could get funky here and identify ourselves as "GenuineLguest", but
365 * instead we just use the real "cpuid" instruction. Then I pretty much turned 394 * instead we just use the real "cpuid" instruction. Then I pretty much turned
@@ -371,7 +400,8 @@ static void lguest_load_tr_desc(void)
371 * Replacing the cpuid so we can turn features off is great for the kernel, but 400 * Replacing the cpuid so we can turn features off is great for the kernel, but
372 * anyone (including userspace) can just use the raw "cpuid" instruction and 401 * anyone (including userspace) can just use the raw "cpuid" instruction and
373 * the Host won't even notice since it isn't privileged. So we try not to get 402 * the Host won't even notice since it isn't privileged. So we try not to get
374 * too worked up about it. */ 403 * too worked up about it.
404 */
375static void lguest_cpuid(unsigned int *ax, unsigned int *bx, 405static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
376 unsigned int *cx, unsigned int *dx) 406 unsigned int *cx, unsigned int *dx)
377{ 407{
@@ -379,43 +409,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
379 409
380 native_cpuid(ax, bx, cx, dx); 410 native_cpuid(ax, bx, cx, dx);
381 switch (function) { 411 switch (function) {
382 case 0: /* ID and highest CPUID. Futureproof a little by sticking to 412 /*
383 * older ones. */ 413 * CPUID 0 gives the highest legal CPUID number (and the ID string).
414 * We futureproof our code a little by sticking to known CPUID values.
415 */
416 case 0:
384 if (*ax > 5) 417 if (*ax > 5)
385 *ax = 5; 418 *ax = 5;
386 break; 419 break;
387 case 1: /* Basic feature request. */ 420
388 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 421 /*
422 * CPUID 1 is a basic feature request.
423 *
424 * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
425 * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
426 */
427 case 1:
389 *cx &= 0x00002201; 428 *cx &= 0x00002201;
390 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
391 *dx &= 0x07808151; 429 *dx &= 0x07808151;
392 /* The Host can do a nice optimization if it knows that the 430 /*
431 * The Host can do a nice optimization if it knows that the
393 * kernel mappings (addresses above 0xC0000000 or whatever 432 * kernel mappings (addresses above 0xC0000000 or whatever
394 * PAGE_OFFSET is set to) haven't changed. But Linux calls 433 * PAGE_OFFSET is set to) haven't changed. But Linux calls
395 * flush_tlb_user() for both user and kernel mappings unless 434 * flush_tlb_user() for both user and kernel mappings unless
396 * the Page Global Enable (PGE) feature bit is set. */ 435 * the Page Global Enable (PGE) feature bit is set.
436 */
397 *dx |= 0x00002000; 437 *dx |= 0x00002000;
398 /* We also lie, and say we're family id 5. 6 or greater 438 /*
439 * We also lie, and say we're family id 5. 6 or greater
399 * leads to a rdmsr in early_init_intel which we can't handle. 440 * leads to a rdmsr in early_init_intel which we can't handle.
400 * Family ID is returned as bits 8-11 in ax. */ 441 * Family ID is returned as bits 8-11 in ax.
442 */
401 *ax &= 0xFFFFF0FF; 443 *ax &= 0xFFFFF0FF;
402 *ax |= 0x00000500; 444 *ax |= 0x00000500;
403 break; 445 break;
446 /*
447 * 0x80000000 returns the highest Extended Function, so we futureproof
448 * like we do above by limiting it to known fields.
449 */
404 case 0x80000000: 450 case 0x80000000:
405 /* Futureproof this a little: if they ask how much extended
406 * processor information there is, limit it to known fields. */
407 if (*ax > 0x80000008) 451 if (*ax > 0x80000008)
408 *ax = 0x80000008; 452 *ax = 0x80000008;
409 break; 453 break;
454
455 /*
456 * PAE systems can mark pages as non-executable. Linux calls this the
457 * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
458 * Virus Protection). We just switch it off here, since we don't
459 * support it.
460 */
410 case 0x80000001: 461 case 0x80000001:
411 /* Here we should fix nx cap depending on host. */
412 /* For this version of PAE, we just clear NX bit. */
413 *dx &= ~(1 << 20); 462 *dx &= ~(1 << 20);
414 break; 463 break;
415 } 464 }
416} 465}
417 466
418/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. 467/*
468 * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
419 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother 469 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
420 * it. The Host needs to know when the Guest wants to change them, so we have 470 * it. The Host needs to know when the Guest wants to change them, so we have
421 * a whole series of functions like read_cr0() and write_cr0(). 471 * a whole series of functions like read_cr0() and write_cr0().
@@ -430,7 +480,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
430 * name like "FPUTRAP bit" be a little less cryptic? 480 * name like "FPUTRAP bit" be a little less cryptic?
431 * 481 *
432 * We store cr0 locally because the Host never changes it. The Guest sometimes 482 * We store cr0 locally because the Host never changes it. The Guest sometimes
433 * wants to read it and we'd prefer not to bother the Host unnecessarily. */ 483 * wants to read it and we'd prefer not to bother the Host unnecessarily.
484 */
434static unsigned long current_cr0; 485static unsigned long current_cr0;
435static void lguest_write_cr0(unsigned long val) 486static void lguest_write_cr0(unsigned long val)
436{ 487{
@@ -443,18 +494,22 @@ static unsigned long lguest_read_cr0(void)
443 return current_cr0; 494 return current_cr0;
444} 495}
445 496
446/* Intel provided a special instruction to clear the TS bit for people too cool 497/*
498 * Intel provided a special instruction to clear the TS bit for people too cool
447 * to use write_cr0() to do it. This "clts" instruction is faster, because all 499 * to use write_cr0() to do it. This "clts" instruction is faster, because all
448 * the vowels have been optimized out. */ 500 * the vowels have been optimized out.
501 */
449static void lguest_clts(void) 502static void lguest_clts(void)
450{ 503{
451 lazy_hcall1(LHCALL_TS, 0); 504 lazy_hcall1(LHCALL_TS, 0);
452 current_cr0 &= ~X86_CR0_TS; 505 current_cr0 &= ~X86_CR0_TS;
453} 506}
454 507
455/* cr2 is the virtual address of the last page fault, which the Guest only ever 508/*
509 * cr2 is the virtual address of the last page fault, which the Guest only ever
456 * reads. The Host kindly writes this into our "struct lguest_data", so we 510 * reads. The Host kindly writes this into our "struct lguest_data", so we
457 * just read it out of there. */ 511 * just read it out of there.
512 */
458static unsigned long lguest_read_cr2(void) 513static unsigned long lguest_read_cr2(void)
459{ 514{
460 return lguest_data.cr2; 515 return lguest_data.cr2;
@@ -463,10 +518,12 @@ static unsigned long lguest_read_cr2(void)
463/* See lguest_set_pte() below. */ 518/* See lguest_set_pte() below. */
464static bool cr3_changed = false; 519static bool cr3_changed = false;
465 520
466/* cr3 is the current toplevel pagetable page: the principle is the same as 521/*
522 * cr3 is the current toplevel pagetable page: the principle is the same as
467 * cr0. Keep a local copy, and tell the Host when it changes. The only 523 * cr0. Keep a local copy, and tell the Host when it changes. The only
468 * difference is that our local copy is in lguest_data because the Host needs 524 * difference is that our local copy is in lguest_data because the Host needs
469 * to set it upon our initial hypercall. */ 525 * to set it upon our initial hypercall.
526 */
470static void lguest_write_cr3(unsigned long cr3) 527static void lguest_write_cr3(unsigned long cr3)
471{ 528{
472 lguest_data.pgdir = cr3; 529 lguest_data.pgdir = cr3;
@@ -538,10 +595,12 @@ static void lguest_write_cr4(unsigned long val)
538 * the real page tables based on the Guests'. 595 * the real page tables based on the Guests'.
539 */ 596 */
540 597
541/* The Guest calls this to set a second-level entry (pte), ie. to map a page 598/*
599 * The Guest calls this to set a second-level entry (pte), ie. to map a page
542 * into a process' address space. We set the entry then tell the Host the 600 * into a process' address space. We set the entry then tell the Host the
543 * toplevel and address this corresponds to. The Guest uses one pagetable per 601 * toplevel and address this corresponds to. The Guest uses one pagetable per
544 * process, so we need to tell the Host which one we're changing (mm->pgd). */ 602 * process, so we need to tell the Host which one we're changing (mm->pgd).
603 */
545static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 604static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
546 pte_t *ptep) 605 pte_t *ptep)
547{ 606{
@@ -560,10 +619,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
560 lguest_pte_update(mm, addr, ptep); 619 lguest_pte_update(mm, addr, ptep);
561} 620}
562 621
563/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd 622/*
623 * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
564 * to set a middle-level entry when PAE is activated. 624 * to set a middle-level entry when PAE is activated.
625 *
565 * Again, we set the entry then tell the Host which page we changed, 626 * Again, we set the entry then tell the Host which page we changed,
566 * and the index of the entry we changed. */ 627 * and the index of the entry we changed.
628 */
567#ifdef CONFIG_X86_PAE 629#ifdef CONFIG_X86_PAE
568static void lguest_set_pud(pud_t *pudp, pud_t pudval) 630static void lguest_set_pud(pud_t *pudp, pud_t pudval)
569{ 631{
@@ -582,8 +644,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
582} 644}
583#else 645#else
584 646
585/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not 647/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
586 * activated. */
587static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 648static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
588{ 649{
589 native_set_pmd(pmdp, pmdval); 650 native_set_pmd(pmdp, pmdval);
@@ -592,7 +653,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
592} 653}
593#endif 654#endif
594 655
595/* There are a couple of legacy places where the kernel sets a PTE, but we 656/*
657 * There are a couple of legacy places where the kernel sets a PTE, but we
596 * don't know the top level any more. This is useless for us, since we don't 658 * don't know the top level any more. This is useless for us, since we don't
597 * know which pagetable is changing or what address, so we just tell the Host 659 * know which pagetable is changing or what address, so we just tell the Host
598 * to forget all of them. Fortunately, this is very rare. 660 * to forget all of them. Fortunately, this is very rare.
@@ -600,7 +662,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
600 * ... except in early boot when the kernel sets up the initial pagetables, 662 * ... except in early boot when the kernel sets up the initial pagetables,
601 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell 663 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell
602 * the Host anything changed until we've done the first page table switch, 664 * the Host anything changed until we've done the first page table switch,
603 * which brings boot back to 0.25 seconds. */ 665 * which brings boot back to 0.25 seconds.
666 */
604static void lguest_set_pte(pte_t *ptep, pte_t pteval) 667static void lguest_set_pte(pte_t *ptep, pte_t pteval)
605{ 668{
606 native_set_pte(ptep, pteval); 669 native_set_pte(ptep, pteval);
@@ -628,7 +691,8 @@ void lguest_pmd_clear(pmd_t *pmdp)
628} 691}
629#endif 692#endif
630 693
631/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 694/*
695 * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
632 * native page table operations. On native hardware you can set a new page 696 * native page table operations. On native hardware you can set a new page
633 * table entry whenever you want, but if you want to remove one you have to do 697 * table entry whenever you want, but if you want to remove one you have to do
634 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). 698 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
@@ -637,24 +701,29 @@ void lguest_pmd_clear(pmd_t *pmdp)
637 * called when a valid entry is written, not when it's removed (ie. marked not 701 * called when a valid entry is written, not when it's removed (ie. marked not
638 * present). Instead, this is where we come when the Guest wants to remove a 702 * present). Instead, this is where we come when the Guest wants to remove a
639 * page table entry: we tell the Host to set that entry to 0 (ie. the present 703 * page table entry: we tell the Host to set that entry to 0 (ie. the present
640 * bit is zero). */ 704 * bit is zero).
705 */
641static void lguest_flush_tlb_single(unsigned long addr) 706static void lguest_flush_tlb_single(unsigned long addr)
642{ 707{
643 /* Simply set it to zero: if it was not, it will fault back in. */ 708 /* Simply set it to zero: if it was not, it will fault back in. */
644 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); 709 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
645} 710}
646 711
647/* This is what happens after the Guest has removed a large number of entries. 712/*
713 * This is what happens after the Guest has removed a large number of entries.
648 * This tells the Host that any of the page table entries for userspace might 714 * This tells the Host that any of the page table entries for userspace might
649 * have changed, ie. virtual addresses below PAGE_OFFSET. */ 715 * have changed, ie. virtual addresses below PAGE_OFFSET.
716 */
650static void lguest_flush_tlb_user(void) 717static void lguest_flush_tlb_user(void)
651{ 718{
652 lazy_hcall1(LHCALL_FLUSH_TLB, 0); 719 lazy_hcall1(LHCALL_FLUSH_TLB, 0);
653} 720}
654 721
655/* This is called when the kernel page tables have changed. That's not very 722/*
723 * This is called when the kernel page tables have changed. That's not very
656 * common (unless the Guest is using highmem, which makes the Guest extremely 724 * common (unless the Guest is using highmem, which makes the Guest extremely
657 * slow), so it's worth separating this from the user flushing above. */ 725 * slow), so it's worth separating this from the user flushing above.
726 */
658static void lguest_flush_tlb_kernel(void) 727static void lguest_flush_tlb_kernel(void)
659{ 728{
660 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 729 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
@@ -691,23 +760,27 @@ static struct irq_chip lguest_irq_controller = {
691 .unmask = enable_lguest_irq, 760 .unmask = enable_lguest_irq,
692}; 761};
693 762
694/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware 763/*
764 * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
695 * interrupt (except 128, which is used for system calls), and then tells the 765 * interrupt (except 128, which is used for system calls), and then tells the
696 * Linux infrastructure that each interrupt is controlled by our level-based 766 * Linux infrastructure that each interrupt is controlled by our level-based
697 * lguest interrupt controller. */ 767 * lguest interrupt controller.
768 */
698static void __init lguest_init_IRQ(void) 769static void __init lguest_init_IRQ(void)
699{ 770{
700 unsigned int i; 771 unsigned int i;
701 772
702 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 773 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
703 /* Some systems map "vectors" to interrupts weirdly. Lguest has 774 /* Some systems map "vectors" to interrupts weirdly. Not us! */
704 * a straightforward 1 to 1 mapping, so force that here. */
705 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; 775 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
706 if (i != SYSCALL_VECTOR) 776 if (i != SYSCALL_VECTOR)
707 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 777 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
708 } 778 }
709 /* This call is required to set up for 4k stacks, where we have 779
710 * separate stacks for hard and soft interrupts. */ 780 /*
781 * This call is required to set up for 4k stacks, where we have
782 * separate stacks for hard and soft interrupts.
783 */
711 irq_ctx_init(smp_processor_id()); 784 irq_ctx_init(smp_processor_id());
712} 785}
713 786
@@ -729,31 +802,39 @@ static unsigned long lguest_get_wallclock(void)
729 return lguest_data.time.tv_sec; 802 return lguest_data.time.tv_sec;
730} 803}
731 804
732/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us 805/*
806 * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us
733 * what speed it runs at, or 0 if it's unusable as a reliable clock source. 807 * what speed it runs at, or 0 if it's unusable as a reliable clock source.
734 * This matches what we want here: if we return 0 from this function, the x86 808 * This matches what we want here: if we return 0 from this function, the x86
735 * TSC clock will give up and not register itself. */ 809 * TSC clock will give up and not register itself.
810 */
736static unsigned long lguest_tsc_khz(void) 811static unsigned long lguest_tsc_khz(void)
737{ 812{
738 return lguest_data.tsc_khz; 813 return lguest_data.tsc_khz;
739} 814}
740 815
741/* If we can't use the TSC, the kernel falls back to our lower-priority 816/*
742 * "lguest_clock", where we read the time value given to us by the Host. */ 817 * If we can't use the TSC, the kernel falls back to our lower-priority
818 * "lguest_clock", where we read the time value given to us by the Host.
819 */
743static cycle_t lguest_clock_read(struct clocksource *cs) 820static cycle_t lguest_clock_read(struct clocksource *cs)
744{ 821{
745 unsigned long sec, nsec; 822 unsigned long sec, nsec;
746 823
747 /* Since the time is in two parts (seconds and nanoseconds), we risk 824 /*
825 * Since the time is in two parts (seconds and nanoseconds), we risk
748 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, 826 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
749 * and getting 99 and 0. As Linux tends to come apart under the stress 827 * and getting 99 and 0. As Linux tends to come apart under the stress
750 * of time travel, we must be careful: */ 828 * of time travel, we must be careful:
829 */
751 do { 830 do {
752 /* First we read the seconds part. */ 831 /* First we read the seconds part. */
753 sec = lguest_data.time.tv_sec; 832 sec = lguest_data.time.tv_sec;
754 /* This read memory barrier tells the compiler and the CPU that 833 /*
834 * This read memory barrier tells the compiler and the CPU that
755 * this can't be reordered: we have to complete the above 835 * this can't be reordered: we have to complete the above
756 * before going on. */ 836 * before going on.
837 */
757 rmb(); 838 rmb();
758 /* Now we read the nanoseconds part. */ 839 /* Now we read the nanoseconds part. */
759 nsec = lguest_data.time.tv_nsec; 840 nsec = lguest_data.time.tv_nsec;
@@ -777,9 +858,11 @@ static struct clocksource lguest_clock = {
777 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 858 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
778}; 859};
779 860
780/* We also need a "struct clock_event_device": Linux asks us to set it to go 861/*
862 * We also need a "struct clock_event_device": Linux asks us to set it to go
781 * off some time in the future. Actually, James Morris figured all this out, I 863 * off some time in the future. Actually, James Morris figured all this out, I
782 * just applied the patch. */ 864 * just applied the patch.
865 */
783static int lguest_clockevent_set_next_event(unsigned long delta, 866static int lguest_clockevent_set_next_event(unsigned long delta,
784 struct clock_event_device *evt) 867 struct clock_event_device *evt)
785{ 868{
@@ -829,8 +912,10 @@ static struct clock_event_device lguest_clockevent = {
829 .max_delta_ns = LG_CLOCK_MAX_DELTA, 912 .max_delta_ns = LG_CLOCK_MAX_DELTA,
830}; 913};
831 914
832/* This is the Guest timer interrupt handler (hardware interrupt 0). We just 915/*
833 * call the clockevent infrastructure and it does whatever needs doing. */ 916 * This is the Guest timer interrupt handler (hardware interrupt 0). We just
917 * call the clockevent infrastructure and it does whatever needs doing.
918 */
834static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) 919static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
835{ 920{
836 unsigned long flags; 921 unsigned long flags;
@@ -841,10 +926,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
841 local_irq_restore(flags); 926 local_irq_restore(flags);
842} 927}
843 928
844/* At some point in the boot process, we get asked to set up our timing 929/*
930 * At some point in the boot process, we get asked to set up our timing
845 * infrastructure. The kernel doesn't expect timer interrupts before this, but 931 * infrastructure. The kernel doesn't expect timer interrupts before this, but
846 * we cleverly initialized the "blocked_interrupts" field of "struct 932 * we cleverly initialized the "blocked_interrupts" field of "struct
847 * lguest_data" so that timer interrupts were blocked until now. */ 933 * lguest_data" so that timer interrupts were blocked until now.
934 */
848static void lguest_time_init(void) 935static void lguest_time_init(void)
849{ 936{
850 /* Set up the timer interrupt (0) to go to our simple timer routine */ 937 /* Set up the timer interrupt (0) to go to our simple timer routine */
@@ -868,14 +955,16 @@ static void lguest_time_init(void)
868 * to work. They're pretty simple. 955 * to work. They're pretty simple.
869 */ 956 */
870 957
871/* The Guest needs to tell the Host what stack it expects traps to use. For 958/*
959 * The Guest needs to tell the Host what stack it expects traps to use. For
872 * native hardware, this is part of the Task State Segment mentioned above in 960 * native hardware, this is part of the Task State Segment mentioned above in
873 * lguest_load_tr_desc(), but to help hypervisors there's this special call. 961 * lguest_load_tr_desc(), but to help hypervisors there's this special call.
874 * 962 *
875 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data 963 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
876 * segment), the privilege level (we're privilege level 1, the Host is 0 and 964 * segment), the privilege level (we're privilege level 1, the Host is 0 and
877 * will not tolerate us trying to use that), the stack pointer, and the number 965 * will not tolerate us trying to use that), the stack pointer, and the number
878 * of pages in the stack. */ 966 * of pages in the stack.
967 */
879static void lguest_load_sp0(struct tss_struct *tss, 968static void lguest_load_sp0(struct tss_struct *tss,
880 struct thread_struct *thread) 969 struct thread_struct *thread)
881{ 970{
@@ -889,7 +978,8 @@ static void lguest_set_debugreg(int regno, unsigned long value)
889 /* FIXME: Implement */ 978 /* FIXME: Implement */
890} 979}
891 980
892/* There are times when the kernel wants to make sure that no memory writes are 981/*
982 * There are times when the kernel wants to make sure that no memory writes are
893 * caught in the cache (that they've all reached real hardware devices). This 983 * caught in the cache (that they've all reached real hardware devices). This
894 * doesn't matter for the Guest which has virtual hardware. 984 * doesn't matter for the Guest which has virtual hardware.
895 * 985 *
@@ -903,11 +993,13 @@ static void lguest_wbinvd(void)
903{ 993{
904} 994}
905 995
906/* If the Guest expects to have an Advanced Programmable Interrupt Controller, 996/*
997 * If the Guest expects to have an Advanced Programmable Interrupt Controller,
907 * we play dumb by ignoring writes and returning 0 for reads. So it's no 998 * we play dumb by ignoring writes and returning 0 for reads. So it's no
908 * longer Programmable nor Controlling anything, and I don't think 8 lines of 999 * longer Programmable nor Controlling anything, and I don't think 8 lines of
909 * code qualifies for Advanced. It will also never interrupt anything. It 1000 * code qualifies for Advanced. It will also never interrupt anything. It
910 * does, however, allow us to get through the Linux boot code. */ 1001 * does, however, allow us to get through the Linux boot code.
1002 */
911#ifdef CONFIG_X86_LOCAL_APIC 1003#ifdef CONFIG_X86_LOCAL_APIC
912static void lguest_apic_write(u32 reg, u32 v) 1004static void lguest_apic_write(u32 reg, u32 v)
913{ 1005{
@@ -956,11 +1048,13 @@ static void lguest_safe_halt(void)
956 kvm_hypercall0(LHCALL_HALT); 1048 kvm_hypercall0(LHCALL_HALT);
957} 1049}
958 1050
959/* The SHUTDOWN hypercall takes a string to describe what's happening, and 1051/*
1052 * The SHUTDOWN hypercall takes a string to describe what's happening, and
960 * an argument which says whether this is to restart (reboot) the Guest or not. 1053 * an argument which says whether this is to restart (reboot) the Guest or not.
961 * 1054 *
962 * Note that the Host always prefers that the Guest speak in physical addresses 1055 * Note that the Host always prefers that the Guest speak in physical addresses
963 * rather than virtual addresses, so we use __pa() here. */ 1056 * rather than virtual addresses, so we use __pa() here.
1057 */
964static void lguest_power_off(void) 1058static void lguest_power_off(void)
965{ 1059{
966 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), 1060 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
@@ -991,8 +1085,10 @@ static __init char *lguest_memory_setup(void)
991 * nice to move it back to lguest_init. Patch welcome... */ 1085 * nice to move it back to lguest_init. Patch welcome... */
992 atomic_notifier_chain_register(&panic_notifier_list, &paniced); 1086 atomic_notifier_chain_register(&panic_notifier_list, &paniced);
993 1087
994 /* The Linux bootloader header contains an "e820" memory map: the 1088 /*
995 * Launcher populated the first entry with our memory limit. */ 1089 * The Linux bootloader header contains an "e820" memory map: the
1090 * Launcher populated the first entry with our memory limit.
1091 */
996 e820_add_region(boot_params.e820_map[0].addr, 1092 e820_add_region(boot_params.e820_map[0].addr,
997 boot_params.e820_map[0].size, 1093 boot_params.e820_map[0].size,
998 boot_params.e820_map[0].type); 1094 boot_params.e820_map[0].type);
@@ -1001,16 +1097,17 @@ static __init char *lguest_memory_setup(void)
1001 return "LGUEST"; 1097 return "LGUEST";
1002} 1098}
1003 1099
1004/* We will eventually use the virtio console device to produce console output, 1100/*
1101 * We will eventually use the virtio console device to produce console output,
1005 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce 1102 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce
1006 * console output. */ 1103 * console output.
1104 */
1007static __init int early_put_chars(u32 vtermno, const char *buf, int count) 1105static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1008{ 1106{
1009 char scratch[17]; 1107 char scratch[17];
1010 unsigned int len = count; 1108 unsigned int len = count;
1011 1109
1012 /* We use a nul-terminated string, so we have to make a copy. Icky, 1110 /* We use a nul-terminated string, so we make a copy. Icky, huh? */
1013 * huh? */
1014 if (len > sizeof(scratch) - 1) 1111 if (len > sizeof(scratch) - 1)
1015 len = sizeof(scratch) - 1; 1112 len = sizeof(scratch) - 1;
1016 scratch[len] = '\0'; 1113 scratch[len] = '\0';
@@ -1021,8 +1118,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1021 return len; 1118 return len;
1022} 1119}
1023 1120
1024/* Rebooting also tells the Host we're finished, but the RESTART flag tells the 1121/*
1025 * Launcher to reboot us. */ 1122 * Rebooting also tells the Host we're finished, but the RESTART flag tells the
1123 * Launcher to reboot us.
1124 */
1026static void lguest_restart(char *reason) 1125static void lguest_restart(char *reason)
1027{ 1126{
1028 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); 1127 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
@@ -1049,7 +1148,8 @@ static void lguest_restart(char *reason)
1049 * fit comfortably. 1148 * fit comfortably.
1050 * 1149 *
1051 * First we need assembly templates of each of the patchable Guest operations, 1150 * First we need assembly templates of each of the patchable Guest operations,
1052 * and these are in i386_head.S. */ 1151 * and these are in i386_head.S.
1152 */
1053 1153
1054/*G:060 We construct a table from the assembler templates: */ 1154/*G:060 We construct a table from the assembler templates: */
1055static const struct lguest_insns 1155static const struct lguest_insns
@@ -1060,9 +1160,11 @@ static const struct lguest_insns
1060 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, 1160 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
1061}; 1161};
1062 1162
1063/* Now our patch routine is fairly simple (based on the native one in 1163/*
1164 * Now our patch routine is fairly simple (based on the native one in
1064 * paravirt.c). If we have a replacement, we copy it in and return how much of 1165 * paravirt.c). If we have a replacement, we copy it in and return how much of
1065 * the available space we used. */ 1166 * the available space we used.
1167 */
1066static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, 1168static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1067 unsigned long addr, unsigned len) 1169 unsigned long addr, unsigned len)
1068{ 1170{
@@ -1074,8 +1176,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1074 1176
1075 insn_len = lguest_insns[type].end - lguest_insns[type].start; 1177 insn_len = lguest_insns[type].end - lguest_insns[type].start;
1076 1178
1077 /* Similarly if we can't fit replacement (shouldn't happen, but let's 1179 /* Similarly if it can't fit (doesn't happen, but let's be thorough). */
1078 * be thorough). */
1079 if (len < insn_len) 1180 if (len < insn_len)
1080 return paravirt_patch_default(type, clobber, ibuf, addr, len); 1181 return paravirt_patch_default(type, clobber, ibuf, addr, len);
1081 1182
@@ -1084,22 +1185,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1084 return insn_len; 1185 return insn_len;
1085} 1186}
1086 1187
1087/*G:029 Once we get to lguest_init(), we know we're a Guest. The various 1188/*G:029
1189 * Once we get to lguest_init(), we know we're a Guest. The various
1088 * pv_ops structures in the kernel provide points for (almost) every routine we 1190 * pv_ops structures in the kernel provide points for (almost) every routine we
1089 * have to override to avoid privileged instructions. */ 1191 * have to override to avoid privileged instructions.
1192 */
1090__init void lguest_init(void) 1193__init void lguest_init(void)
1091{ 1194{
1092 /* We're under lguest, paravirt is enabled, and we're running at 1195 /* We're under lguest. */
1093 * privilege level 1, not 0 as normal. */
1094 pv_info.name = "lguest"; 1196 pv_info.name = "lguest";
1197 /* Paravirt is enabled. */
1095 pv_info.paravirt_enabled = 1; 1198 pv_info.paravirt_enabled = 1;
1199 /* We're running at privilege level 1, not 0 as normal. */
1096 pv_info.kernel_rpl = 1; 1200 pv_info.kernel_rpl = 1;
1201 /* Everyone except Xen runs with this set. */
1097 pv_info.shared_kernel_pmd = 1; 1202 pv_info.shared_kernel_pmd = 1;
1098 1203
1099 /* We set up all the lguest overrides for sensitive operations. These 1204 /*
1100 * are detailed with the operations themselves. */ 1205 * We set up all the lguest overrides for sensitive operations. These
1206 * are detailed with the operations themselves.
1207 */
1101 1208
1102 /* interrupt-related operations */ 1209 /* Interrupt-related operations */
1103 pv_irq_ops.init_IRQ = lguest_init_IRQ; 1210 pv_irq_ops.init_IRQ = lguest_init_IRQ;
1104 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1211 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
1105 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); 1212 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
@@ -1107,11 +1214,11 @@ __init void lguest_init(void)
1107 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); 1214 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
1108 pv_irq_ops.safe_halt = lguest_safe_halt; 1215 pv_irq_ops.safe_halt = lguest_safe_halt;
1109 1216
1110 /* init-time operations */ 1217 /* Setup operations */
1111 pv_init_ops.memory_setup = lguest_memory_setup; 1218 pv_init_ops.memory_setup = lguest_memory_setup;
1112 pv_init_ops.patch = lguest_patch; 1219 pv_init_ops.patch = lguest_patch;
1113 1220
1114 /* Intercepts of various cpu instructions */ 1221 /* Intercepts of various CPU instructions */
1115 pv_cpu_ops.load_gdt = lguest_load_gdt; 1222 pv_cpu_ops.load_gdt = lguest_load_gdt;
1116 pv_cpu_ops.cpuid = lguest_cpuid; 1223 pv_cpu_ops.cpuid = lguest_cpuid;
1117 pv_cpu_ops.load_idt = lguest_load_idt; 1224 pv_cpu_ops.load_idt = lguest_load_idt;
@@ -1132,7 +1239,7 @@ __init void lguest_init(void)
1132 pv_cpu_ops.start_context_switch = paravirt_start_context_switch; 1239 pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
1133 pv_cpu_ops.end_context_switch = lguest_end_context_switch; 1240 pv_cpu_ops.end_context_switch = lguest_end_context_switch;
1134 1241
1135 /* pagetable management */ 1242 /* Pagetable management */
1136 pv_mmu_ops.write_cr3 = lguest_write_cr3; 1243 pv_mmu_ops.write_cr3 = lguest_write_cr3;
1137 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; 1244 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
1138 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; 1245 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
@@ -1154,54 +1261,71 @@ __init void lguest_init(void)
1154 pv_mmu_ops.pte_update_defer = lguest_pte_update; 1261 pv_mmu_ops.pte_update_defer = lguest_pte_update;
1155 1262
1156#ifdef CONFIG_X86_LOCAL_APIC 1263#ifdef CONFIG_X86_LOCAL_APIC
1157 /* apic read/write intercepts */ 1264 /* APIC read/write intercepts */
1158 set_lguest_basic_apic_ops(); 1265 set_lguest_basic_apic_ops();
1159#endif 1266#endif
1160 1267
1161 /* time operations */ 1268 /* Time operations */
1162 pv_time_ops.get_wallclock = lguest_get_wallclock; 1269 pv_time_ops.get_wallclock = lguest_get_wallclock;
1163 pv_time_ops.time_init = lguest_time_init; 1270 pv_time_ops.time_init = lguest_time_init;
1164 pv_time_ops.get_tsc_khz = lguest_tsc_khz; 1271 pv_time_ops.get_tsc_khz = lguest_tsc_khz;
1165 1272
1166 /* Now is a good time to look at the implementations of these functions 1273 /*
1167 * before returning to the rest of lguest_init(). */ 1274 * Now is a good time to look at the implementations of these functions
1275 * before returning to the rest of lguest_init().
1276 */
1168 1277
1169 /*G:070 Now we've seen all the paravirt_ops, we return to 1278 /*G:070
1279 * Now we've seen all the paravirt_ops, we return to
1170 * lguest_init() where the rest of the fairly chaotic boot setup 1280 * lguest_init() where the rest of the fairly chaotic boot setup
1171 * occurs. */ 1281 * occurs.
1282 */
1172 1283
1173 /* The stack protector is a weird thing where gcc places a canary 1284 /*
1285 * The stack protector is a weird thing where gcc places a canary
1174 * value on the stack and then checks it on return. This file is 1286 * value on the stack and then checks it on return. This file is
1175 * compiled with -fno-stack-protector, so we got this far without 1287 * compiled with -fno-stack-protector, so we got this far without
1176 * problems. The value of the canary is kept at offset 20 from the 1288 * problems. The value of the canary is kept at offset 20 from the
1177 * %gs register, so we need to set that up before calling C functions 1289 * %gs register, so we need to set that up before calling C functions
1178 * in other files. */ 1290 * in other files.
1291 */
1179 setup_stack_canary_segment(0); 1292 setup_stack_canary_segment(0);
1180 /* We could just call load_stack_canary_segment(), but we might as 1293
1181 * call switch_to_new_gdt() which loads the whole table and sets up 1294 /*
1182 * the per-cpu segment descriptor register %fs as well. */ 1295 * We could just call load_stack_canary_segment(), but we might as well
1296 * call switch_to_new_gdt() which loads the whole table and sets up the
1297 * per-cpu segment descriptor register %fs as well.
1298 */
1183 switch_to_new_gdt(0); 1299 switch_to_new_gdt(0);
1184 1300
1185 /* As described in head_32.S, we map the first 128M of memory. */ 1301 /* As described in head_32.S, we map the first 128M of memory. */
1186 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1302 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1187 1303
1188 /* The Host<->Guest Switcher lives at the top of our address space, and 1304 /*
1305 * The Host<->Guest Switcher lives at the top of our address space, and
1189 * the Host told us how big it is when we made the LGUEST_INIT hypercall: 1306 * the Host told us how big it is when we made the LGUEST_INIT hypercall:
1190 * it put the answer in lguest_data.reserve_mem */ 1307 * it put the answer in lguest_data.reserve_mem
1308 */
1191 reserve_top_address(lguest_data.reserve_mem); 1309 reserve_top_address(lguest_data.reserve_mem);
1192 1310
1193 /* If we don't initialize the lock dependency checker now, it crashes 1311 /*
1194 * paravirt_disable_iospace. */ 1312 * If we don't initialize the lock dependency checker now, it crashes
1313 * paravirt_disable_iospace.
1314 */
1195 lockdep_init(); 1315 lockdep_init();
1196 1316
1197 /* The IDE code spends about 3 seconds probing for disks: if we reserve 1317 /*
1318 * The IDE code spends about 3 seconds probing for disks: if we reserve
1198 * all the I/O ports up front it can't get them and so doesn't probe. 1319 * all the I/O ports up front it can't get them and so doesn't probe.
1199 * Other device drivers are similar (but less severe). This cuts the 1320 * Other device drivers are similar (but less severe). This cuts the
1200 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ 1321 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds.
1322 */
1201 paravirt_disable_iospace(); 1323 paravirt_disable_iospace();
1202 1324
1203 /* This is messy CPU setup stuff which the native boot code does before 1325 /*
1204 * start_kernel, so we have to do, too: */ 1326 * This is messy CPU setup stuff which the native boot code does before
1327 * start_kernel, so we have to do, too:
1328 */
1205 cpu_detect(&new_cpu_data); 1329 cpu_detect(&new_cpu_data);
1206 /* head.S usually sets up the first capability word, so do it here. */ 1330 /* head.S usually sets up the first capability word, so do it here. */
1207 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1331 new_cpu_data.x86_capability[0] = cpuid_edx(1);
@@ -1218,22 +1342,28 @@ __init void lguest_init(void)
1218 acpi_ht = 0; 1342 acpi_ht = 0;
1219#endif 1343#endif
1220 1344
1221 /* We set the preferred console to "hvc". This is the "hypervisor 1345 /*
1346 * We set the preferred console to "hvc". This is the "hypervisor
1222 * virtual console" driver written by the PowerPC people, which we also 1347 * virtual console" driver written by the PowerPC people, which we also
1223 * adapted for lguest's use. */ 1348 * adapted for lguest's use.
1349 */
1224 add_preferred_console("hvc", 0, NULL); 1350 add_preferred_console("hvc", 0, NULL);
1225 1351
1226 /* Register our very early console. */ 1352 /* Register our very early console. */
1227 virtio_cons_early_init(early_put_chars); 1353 virtio_cons_early_init(early_put_chars);
1228 1354
1229 /* Last of all, we set the power management poweroff hook to point to 1355 /*
1356 * Last of all, we set the power management poweroff hook to point to
1230 * the Guest routine to power off, and the reboot hook to our restart 1357 * the Guest routine to power off, and the reboot hook to our restart
1231 * routine. */ 1358 * routine.
1359 */
1232 pm_power_off = lguest_power_off; 1360 pm_power_off = lguest_power_off;
1233 machine_ops.restart = lguest_restart; 1361 machine_ops.restart = lguest_restart;
1234 1362
1235 /* Now we're set up, call i386_start_kernel() in head32.c and we proceed 1363 /*
1236 * to boot as normal. It never returns. */ 1364 * Now we're set up, call i386_start_kernel() in head32.c and we proceed
1365 * to boot as normal. It never returns.
1366 */
1237 i386_start_kernel(); 1367 i386_start_kernel();
1238} 1368}
1239/* 1369/*
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index a9c8cfe61cd4..db6aa95eb054 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -5,7 +5,8 @@
5#include <asm/thread_info.h> 5#include <asm/thread_info.h>
6#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7 7
8/*G:020 Our story starts with the kernel booting into startup_32 in 8/*G:020
9 * Our story starts with the kernel booting into startup_32 in
9 * arch/x86/kernel/head_32.S. It expects a boot header, which is created by 10 * arch/x86/kernel/head_32.S. It expects a boot header, which is created by
10 * the bootloader (the Launcher in our case). 11 * the bootloader (the Launcher in our case).
11 * 12 *
@@ -21,11 +22,14 @@
21 * data without remembering to subtract __PAGE_OFFSET! 22 * data without remembering to subtract __PAGE_OFFSET!
22 * 23 *
23 * The .section line puts this code in .init.text so it will be discarded after 24 * The .section line puts this code in .init.text so it will be discarded after
24 * boot. */ 25 * boot.
26 */
25.section .init.text, "ax", @progbits 27.section .init.text, "ax", @progbits
26ENTRY(lguest_entry) 28ENTRY(lguest_entry)
27 /* We make the "initialization" hypercall now to tell the Host about 29 /*
28 * us, and also find out where it put our page tables. */ 30 * We make the "initialization" hypercall now to tell the Host about
31 * us, and also find out where it put our page tables.
32 */
29 movl $LHCALL_LGUEST_INIT, %eax 33 movl $LHCALL_LGUEST_INIT, %eax
30 movl $lguest_data - __PAGE_OFFSET, %ebx 34 movl $lguest_data - __PAGE_OFFSET, %ebx
31 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 35 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
@@ -33,13 +37,14 @@ ENTRY(lguest_entry)
33 /* Set up the initial stack so we can run C code. */ 37 /* Set up the initial stack so we can run C code. */
34 movl $(init_thread_union+THREAD_SIZE),%esp 38 movl $(init_thread_union+THREAD_SIZE),%esp
35 39
36 /* Jumps are relative, and we're running __PAGE_OFFSET too low at the 40 /* Jumps are relative: we're running __PAGE_OFFSET too low. */
37 * moment. */
38 jmp lguest_init+__PAGE_OFFSET 41 jmp lguest_init+__PAGE_OFFSET
39 42
40/*G:055 We create a macro which puts the assembler code between lgstart_ and 43/*G:055
41 * lgend_ markers. These templates are put in the .text section: they can't be 44 * We create a macro which puts the assembler code between lgstart_ and lgend_
42 * discarded after boot as we may need to patch modules, too. */ 45 * markers. These templates are put in the .text section: they can't be
46 * discarded after boot as we may need to patch modules, too.
47 */
43.text 48.text
44#define LGUEST_PATCH(name, insns...) \ 49#define LGUEST_PATCH(name, insns...) \
45 lgstart_##name: insns; lgend_##name:; \ 50 lgstart_##name: insns; lgend_##name:; \
@@ -48,58 +53,74 @@ ENTRY(lguest_entry)
48LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) 53LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
49LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 54LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
50 55
51/*G:033 But using those wrappers is inefficient (we'll see why that doesn't 56/*G:033
52 * matter for save_fl and irq_disable later). If we write our routines 57 * But using those wrappers is inefficient (we'll see why that doesn't matter
53 * carefully in assembler, we can avoid clobbering any registers and avoid 58 * for save_fl and irq_disable later). If we write our routines carefully in
54 * jumping through the wrapper functions. 59 * assembler, we can avoid clobbering any registers and avoid jumping through
60 * the wrapper functions.
55 * 61 *
56 * I skipped over our first piece of assembler, but this one is worth studying 62 * I skipped over our first piece of assembler, but this one is worth studying
57 * in a bit more detail so I'll describe in easy stages. First, the routine 63 * in a bit more detail so I'll describe in easy stages. First, the routine to
58 * to enable interrupts: */ 64 * enable interrupts:
65 */
59ENTRY(lg_irq_enable) 66ENTRY(lg_irq_enable)
60 /* The reverse of irq_disable, this sets lguest_data.irq_enabled to 67 /*
61 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ 68 * The reverse of irq_disable, this sets lguest_data.irq_enabled to
69 * X86_EFLAGS_IF (ie. "Interrupts enabled").
70 */
62 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled 71 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
63 /* But now we need to check if the Host wants to know: there might have 72 /*
73 * But now we need to check if the Host wants to know: there might have
64 * been interrupts waiting to be delivered, in which case it will have 74 * been interrupts waiting to be delivered, in which case it will have
65 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we 75 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
66 * jump to send_interrupts, otherwise we're done. */ 76 * jump to send_interrupts, otherwise we're done.
77 */
67 testl $0, lguest_data+LGUEST_DATA_irq_pending 78 testl $0, lguest_data+LGUEST_DATA_irq_pending
68 jnz send_interrupts 79 jnz send_interrupts
69 /* One cool thing about x86 is that you can do many things without using 80 /*
81 * One cool thing about x86 is that you can do many things without using
70 * a register. In this case, the normal path hasn't needed to save or 82 * a register. In this case, the normal path hasn't needed to save or
71 * restore any registers at all! */ 83 * restore any registers at all!
84 */
72 ret 85 ret
73send_interrupts: 86send_interrupts:
74 /* OK, now we need a register: eax is used for the hypercall number, 87 /*
88 * OK, now we need a register: eax is used for the hypercall number,
75 * which is LHCALL_SEND_INTERRUPTS. 89 * which is LHCALL_SEND_INTERRUPTS.
76 * 90 *
77 * We used not to bother with this pending detection at all, which was 91 * We used not to bother with this pending detection at all, which was
78 * much simpler. Sooner or later the Host would realize it had to 92 * much simpler. Sooner or later the Host would realize it had to
79 * send us an interrupt. But that turns out to make performance 7 93 * send us an interrupt. But that turns out to make performance 7
80 * times worse on a simple tcp benchmark. So now we do this the hard 94 * times worse on a simple tcp benchmark. So now we do this the hard
81 * way. */ 95 * way.
96 */
82 pushl %eax 97 pushl %eax
83 movl $LHCALL_SEND_INTERRUPTS, %eax 98 movl $LHCALL_SEND_INTERRUPTS, %eax
84 /* This is a vmcall instruction (same thing that KVM uses). Older 99 /*
100 * This is a vmcall instruction (same thing that KVM uses). Older
85 * assembler versions might not know the "vmcall" instruction, so we 101 * assembler versions might not know the "vmcall" instruction, so we
86 * create one manually here. */ 102 * create one manually here.
103 */
87 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 104 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
88 popl %eax 105 popl %eax
89 ret 106 ret
90 107
91/* Finally, the "popf" or "restore flags" routine. The %eax register holds the 108/*
109 * Finally, the "popf" or "restore flags" routine. The %eax register holds the
92 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're 110 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
93 * enabling interrupts again, if it's 0 we're leaving them off. */ 111 * enabling interrupts again, if it's 0 we're leaving them off.
112 */
94ENTRY(lg_restore_fl) 113ENTRY(lg_restore_fl)
95 /* This is just "lguest_data.irq_enabled = flags;" */ 114 /* This is just "lguest_data.irq_enabled = flags;" */
96 movl %eax, lguest_data+LGUEST_DATA_irq_enabled 115 movl %eax, lguest_data+LGUEST_DATA_irq_enabled
97 /* Now, if the %eax value has enabled interrupts and 116 /*
117 * Now, if the %eax value has enabled interrupts and
98 * lguest_data.irq_pending is set, we want to tell the Host so it can 118 * lguest_data.irq_pending is set, we want to tell the Host so it can
99 * deliver any outstanding interrupts. Fortunately, both values will 119 * deliver any outstanding interrupts. Fortunately, both values will
100 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" 120 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
101 * instruction will AND them together for us. If both are set, we 121 * instruction will AND them together for us. If both are set, we
102 * jump to send_interrupts. */ 122 * jump to send_interrupts.
123 */
103 testl lguest_data+LGUEST_DATA_irq_pending, %eax 124 testl lguest_data+LGUEST_DATA_irq_pending, %eax
104 jnz send_interrupts 125 jnz send_interrupts
105 /* Again, the normal path has used no extra registers. Clever, huh? */ 126 /* Again, the normal path has used no extra registers. Clever, huh? */
@@ -109,22 +130,24 @@ ENTRY(lg_restore_fl)
109.global lguest_noirq_start 130.global lguest_noirq_start
110.global lguest_noirq_end 131.global lguest_noirq_end
111 132
112/*M:004 When the Host reflects a trap or injects an interrupt into the Guest, 133/*M:004
113 * it sets the eflags interrupt bit on the stack based on 134 * When the Host reflects a trap or injects an interrupt into the Guest, it
114 * lguest_data.irq_enabled, so the Guest iret logic does the right thing when 135 * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
115 * restoring it. However, when the Host sets the Guest up for direct traps, 136 * so the Guest iret logic does the right thing when restoring it. However,
116 * such as system calls, the processor is the one to push eflags onto the 137 * when the Host sets the Guest up for direct traps, such as system calls, the
117 * stack, and the interrupt bit will be 1 (in reality, interrupts are always 138 * processor is the one to push eflags onto the stack, and the interrupt bit
118 * enabled in the Guest). 139 * will be 1 (in reality, interrupts are always enabled in the Guest).
119 * 140 *
120 * This turns out to be harmless: the only trap which should happen under Linux 141 * This turns out to be harmless: the only trap which should happen under Linux
121 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc 142 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
122 * regions), which has to be reflected through the Host anyway. If another 143 * regions), which has to be reflected through the Host anyway. If another
123 * trap *does* go off when interrupts are disabled, the Guest will panic, and 144 * trap *does* go off when interrupts are disabled, the Guest will panic, and
124 * we'll never get to this iret! :*/ 145 * we'll never get to this iret!
146:*/
125 147
126/*G:045 There is one final paravirt_op that the Guest implements, and glancing 148/*G:045
127 * at it you can see why I left it to last. It's *cool*! It's in *assembler*! 149 * There is one final paravirt_op that the Guest implements, and glancing at it
150 * you can see why I left it to last. It's *cool*! It's in *assembler*!
128 * 151 *
129 * The "iret" instruction is used to return from an interrupt or trap. The 152 * The "iret" instruction is used to return from an interrupt or trap. The
130 * stack looks like this: 153 * stack looks like this:
@@ -148,15 +171,18 @@ ENTRY(lg_restore_fl)
148 * return to userspace or wherever. Our solution to this is to surround the 171 * return to userspace or wherever. Our solution to this is to surround the
149 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the 172 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the
150 * Host that it is *never* to interrupt us there, even if interrupts seem to be 173 * Host that it is *never* to interrupt us there, even if interrupts seem to be
151 * enabled. */ 174 * enabled.
175 */
152ENTRY(lguest_iret) 176ENTRY(lguest_iret)
153 pushl %eax 177 pushl %eax
154 movl 12(%esp), %eax 178 movl 12(%esp), %eax
155lguest_noirq_start: 179lguest_noirq_start:
156 /* Note the %ss: segment prefix here. Normal data accesses use the 180 /*
181 * Note the %ss: segment prefix here. Normal data accesses use the
157 * "ds" segment, but that will have already been restored for whatever 182 * "ds" segment, but that will have already been restored for whatever
158 * we're returning to (such as userspace): we can't trust it. The %ss: 183 * we're returning to (such as userspace): we can't trust it. The %ss:
159 * prefix makes sure we use the stack segment, which is still valid. */ 184 * prefix makes sure we use the stack segment, which is still valid.
185 */
160 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled 186 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
161 popl %eax 187 popl %eax
162 iret 188 iret