aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRusty Russell <rusty@rustcorp.com.au>2007-07-26 13:41:02 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-26 14:35:17 -0400
commitb2b47c214f4e85ce3968120d42e8b18eccb4f4e3 (patch)
treef77d6898a769b8e0fcb552207e87f273bdc19f09
parentf938d2c892db0d80d144253d4a7b7083efdbedeb (diff)
lguest: documentation II: Guest
Documentation: The Guest Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--drivers/lguest/lguest.c450
-rw-r--r--drivers/lguest/lguest_asm.S57
-rw-r--r--include/linux/lguest.h47
3 files changed, 508 insertions, 46 deletions
diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c
index e7d128312b23..7e7e9fb3aefd 100644
--- a/drivers/lguest/lguest.c
+++ b/drivers/lguest/lguest.c
@@ -66,6 +66,12 @@
66#include <asm/mce.h> 66#include <asm/mce.h>
67#include <asm/io.h> 67#include <asm/io.h>
68 68
69/*G:010 Welcome to the Guest!
70 *
71 * The Guest in our tale is a simple creature: identical to the Host but
72 * behaving in simplified but equivalent ways. In particular, the Guest is the
73 * same kernel as the Host (or at least, built from the same source code). :*/
74
69/* Declarations for definitions in lguest_guest.S */ 75/* Declarations for definitions in lguest_guest.S */
70extern char lguest_noirq_start[], lguest_noirq_end[]; 76extern char lguest_noirq_start[], lguest_noirq_end[];
71extern const char lgstart_cli[], lgend_cli[]; 77extern const char lgstart_cli[], lgend_cli[];
@@ -84,7 +90,26 @@ struct lguest_data lguest_data = {
84struct lguest_device_desc *lguest_devices; 90struct lguest_device_desc *lguest_devices;
85static cycle_t clock_base; 91static cycle_t clock_base;
86 92
87static enum paravirt_lazy_mode lazy_mode; 93/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first
94 * real optimization trick!
95 *
96 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
97 * them as a batch when lazy_mode is eventually turned off. Because hypercalls
98 * are reasonably expensive, batching them up makes sense. For example, a
99 * large mmap might update dozens of page table entries: that code calls
100 * lguest_lazy_mode(PARAVIRT_LAZY_MMU), does the dozen updates, then calls
101 * lguest_lazy_mode(PARAVIRT_LAZY_NONE).
102 *
103 * So, when we're in lazy mode, we call async_hcall() to store the call for
104 * future processing. When lazy mode is turned off we issue a hypercall to
105 * flush the stored calls.
106 *
107 * There's also a hack where "mode" is set to "PARAVIRT_LAZY_FLUSH" which
108 * indicates we're to flush any outstanding calls immediately. This is used
109 * when an interrupt handler does a kmap_atomic(): the page table changes must
110 * happen immediately even if we're in the middle of a batch. Usually we're
111 * not, though, so there's nothing to do. */
112static enum paravirt_lazy_mode lazy_mode; /* Note: not SMP-safe! */
88static void lguest_lazy_mode(enum paravirt_lazy_mode mode) 113static void lguest_lazy_mode(enum paravirt_lazy_mode mode)
89{ 114{
90 if (mode == PARAVIRT_LAZY_FLUSH) { 115 if (mode == PARAVIRT_LAZY_FLUSH) {
@@ -108,6 +133,16 @@ static void lazy_hcall(unsigned long call,
108 async_hcall(call, arg1, arg2, arg3); 133 async_hcall(call, arg1, arg2, arg3);
109} 134}
110 135
136/* async_hcall() is pretty simple: I'm quite proud of it really. We have a
137 * ring buffer of stored hypercalls which the Host will run through next time we
138 * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall
139 * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
140 * and 255 once the Host has finished with it.
141 *
142 * If we come around to a slot which hasn't been finished, then the table is
143 * full and we just make the hypercall directly. This has the nice side
144 * effect of causing the Host to run all the stored calls in the ring buffer
145 * which empties it for next time! */
111void async_hcall(unsigned long call, 146void async_hcall(unsigned long call,
112 unsigned long arg1, unsigned long arg2, unsigned long arg3) 147 unsigned long arg1, unsigned long arg2, unsigned long arg3)
113{ 148{
@@ -115,6 +150,9 @@ void async_hcall(unsigned long call,
115 static unsigned int next_call; 150 static unsigned int next_call;
116 unsigned long flags; 151 unsigned long flags;
117 152
153 /* Disable interrupts if not already disabled: we don't want an
154 * interrupt handler making a hypercall while we're already doing
155 * one! */
118 local_irq_save(flags); 156 local_irq_save(flags);
119 if (lguest_data.hcall_status[next_call] != 0xFF) { 157 if (lguest_data.hcall_status[next_call] != 0xFF) {
120 /* Table full, so do normal hcall which will flush table. */ 158 /* Table full, so do normal hcall which will flush table. */
@@ -124,7 +162,7 @@ void async_hcall(unsigned long call,
124 lguest_data.hcalls[next_call].edx = arg1; 162 lguest_data.hcalls[next_call].edx = arg1;
125 lguest_data.hcalls[next_call].ebx = arg2; 163 lguest_data.hcalls[next_call].ebx = arg2;
126 lguest_data.hcalls[next_call].ecx = arg3; 164 lguest_data.hcalls[next_call].ecx = arg3;
127 /* Make sure host sees arguments before "valid" flag. */ 165 /* Arguments must all be written before we mark it to go */
128 wmb(); 166 wmb();
129 lguest_data.hcall_status[next_call] = 0; 167 lguest_data.hcall_status[next_call] = 0;
130 if (++next_call == LHCALL_RING_SIZE) 168 if (++next_call == LHCALL_RING_SIZE)
@@ -132,9 +170,14 @@ void async_hcall(unsigned long call,
132 } 170 }
133 local_irq_restore(flags); 171 local_irq_restore(flags);
134} 172}
173/*:*/
135 174
175/* Wrappers for the SEND_DMA and BIND_DMA hypercalls. This is mainly because
176 * Jeff Garzik complained that __pa() should never appear in drivers, and this
177 * helps remove most of them. But also, it wraps some ugliness. */
136void lguest_send_dma(unsigned long key, struct lguest_dma *dma) 178void lguest_send_dma(unsigned long key, struct lguest_dma *dma)
137{ 179{
180 /* The hcall might not write this if something goes wrong */
138 dma->used_len = 0; 181 dma->used_len = 0;
139 hcall(LHCALL_SEND_DMA, key, __pa(dma), 0); 182 hcall(LHCALL_SEND_DMA, key, __pa(dma), 0);
140} 183}
@@ -142,11 +185,16 @@ void lguest_send_dma(unsigned long key, struct lguest_dma *dma)
142int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, 185int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas,
143 unsigned int num, u8 irq) 186 unsigned int num, u8 irq)
144{ 187{
188 /* This is the only hypercall which actually wants 5 arguments, and we
189 * only support 4. Fortunately the interrupt number is always less
190 * than 256, so we can pack it with the number of dmas in the final
191 * argument. */
145 if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq)) 192 if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq))
146 return -ENOMEM; 193 return -ENOMEM;
147 return 0; 194 return 0;
148} 195}
149 196
197/* Unbinding is the same hypercall as binding, but with 0 num & irq. */
150void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas) 198void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas)
151{ 199{
152 hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0); 200 hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0);
@@ -164,35 +212,65 @@ void lguest_unmap(void *addr)
164 iounmap((__force void __iomem *)addr); 212 iounmap((__force void __iomem *)addr);
165} 213}
166 214
215/*G:033
216 * Here are our first native-instruction replacements: four functions for
217 * interrupt control.
218 *
219 * The simplest way of implementing these would be to have "turn interrupts
220 * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow:
221 * these are by far the most commonly called functions of those we override.
222 *
223 * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
224 * which the Guest can update with a single instruction. The Host knows to
225 * check there when it wants to deliver an interrupt.
226 */
227
228/* save_flags() is expected to return the processor state (ie. "eflags"). The
229 * eflags word contains all kinds of stuff, but in practice Linux only cares
230 * about the interrupt flag. Our "save_flags()" just returns that. */
167static unsigned long save_fl(void) 231static unsigned long save_fl(void)
168{ 232{
169 return lguest_data.irq_enabled; 233 return lguest_data.irq_enabled;
170} 234}
171 235
236/* "restore_flags" just sets the flags back to the value given. */
172static void restore_fl(unsigned long flags) 237static void restore_fl(unsigned long flags)
173{ 238{
174 /* FIXME: Check if interrupt pending... */
175 lguest_data.irq_enabled = flags; 239 lguest_data.irq_enabled = flags;
176} 240}
177 241
242/* Interrupts go off... */
178static void irq_disable(void) 243static void irq_disable(void)
179{ 244{
180 lguest_data.irq_enabled = 0; 245 lguest_data.irq_enabled = 0;
181} 246}
182 247
248/* Interrupts go on... */
183static void irq_enable(void) 249static void irq_enable(void)
184{ 250{
185 /* FIXME: Check if interrupt pending... */
186 lguest_data.irq_enabled = X86_EFLAGS_IF; 251 lguest_data.irq_enabled = X86_EFLAGS_IF;
187} 252}
188 253
254/*G:034
255 * The Interrupt Descriptor Table (IDT).
256 *
257 * The IDT tells the processor what to do when an interrupt comes in. Each
258 * entry in the table is a 64-bit descriptor: this holds the privilege level,
259 * address of the handler, and... well, who cares? The Guest just asks the
260 * Host to make the change anyway, because the Host controls the real IDT.
261 */
189static void lguest_write_idt_entry(struct desc_struct *dt, 262static void lguest_write_idt_entry(struct desc_struct *dt,
190 int entrynum, u32 low, u32 high) 263 int entrynum, u32 low, u32 high)
191{ 264{
265 /* Keep the local copy up to date. */
192 write_dt_entry(dt, entrynum, low, high); 266 write_dt_entry(dt, entrynum, low, high);
267 /* Tell Host about this new entry. */
193 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); 268 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
194} 269}
195 270
271/* Changing to a different IDT is very rare: we keep the IDT up-to-date every
272 * time it is written, so we can simply loop through all entries and tell the
273 * Host about them. */
196static void lguest_load_idt(const struct Xgt_desc_struct *desc) 274static void lguest_load_idt(const struct Xgt_desc_struct *desc)
197{ 275{
198 unsigned int i; 276 unsigned int i;
@@ -202,12 +280,29 @@ static void lguest_load_idt(const struct Xgt_desc_struct *desc)
202 hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); 280 hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
203} 281}
204 282
283/*
284 * The Global Descriptor Table.
285 *
286 * The Intel architecture defines another table, called the Global Descriptor
287 * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt"
288 * instruction, and then several other instructions refer to entries in the
289 * table. There are three entries which the Switcher needs, so the Host simply
290 * controls the entire thing and the Guest asks it to make changes using the
291 * LOAD_GDT hypercall.
292 *
293 * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY
294 * hypercall and use that repeatedly to load a new IDT. I don't think it
295 * really matters, but wouldn't it be nice if they were the same?
296 */
205static void lguest_load_gdt(const struct Xgt_desc_struct *desc) 297static void lguest_load_gdt(const struct Xgt_desc_struct *desc)
206{ 298{
207 BUG_ON((desc->size+1)/8 != GDT_ENTRIES); 299 BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
208 hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0); 300 hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
209} 301}
210 302
303/* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
304 * then tell the Host to reload the entire thing. This operation is so rare
305 * that this naive implementation is reasonable. */
211static void lguest_write_gdt_entry(struct desc_struct *dt, 306static void lguest_write_gdt_entry(struct desc_struct *dt,
212 int entrynum, u32 low, u32 high) 307 int entrynum, u32 low, u32 high)
213{ 308{
@@ -215,19 +310,58 @@ static void lguest_write_gdt_entry(struct desc_struct *dt,
215 hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); 310 hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
216} 311}
217 312
313/* OK, I lied. There are three "thread local storage" GDT entries which change
314 * on every context switch (these three entries are how glibc implements
315 * __thread variables). So we have a hypercall specifically for this case. */
218static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) 316static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
219{ 317{
220 lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0); 318 lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
221} 319}
320/*:*/
222 321
322/*G:038 That's enough excitement for now, back to ploughing through each of
323 * the paravirt_ops (we're about 1/3 of the way through).
324 *
325 * This is the Local Descriptor Table, another weird Intel thingy. Linux only
326 * uses this for some strange applications like Wine. We don't do anything
327 * here, so they'll get an informative and friendly Segmentation Fault. */
223static void lguest_set_ldt(const void *addr, unsigned entries) 328static void lguest_set_ldt(const void *addr, unsigned entries)
224{ 329{
225} 330}
226 331
332/* This loads a GDT entry into the "Task Register": that entry points to a
333 * structure called the Task State Segment. Some comments scattered through the
334 * kernel code indicate that this was used for task switching in ages past, along
335 * with blood sacrifice and astrology.
336 *
337 * Now there's nothing interesting in here that we don't get told elsewhere.
338 * But the native version uses the "ltr" instruction, which makes the Host
339 * complain to the Guest about a Segmentation Fault and it'll oops. So we
340 * override the native version with a do-nothing version. */
227static void lguest_load_tr_desc(void) 341static void lguest_load_tr_desc(void)
228{ 342{
229} 343}
230 344
345/* The "cpuid" instruction is a way of querying both the CPU identity
346 * (manufacturer, model, etc) and its features. It was introduced before the
347 * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you
348 * might imagine, after a decade and a half of this treatment, it is now a giant
349 * ball of hair. Its entry in the current Intel manual runs to 28 pages.
350 *
351 * This instruction even has its own Wikipedia entry. The Wikipedia entry
352 * has been translated into 4 languages. I am not making this up!
353 *
354 * We could get funky here and identify ourselves as "GenuineLguest", but
355 * instead we just use the real "cpuid" instruction. Then I pretty much turned
356 * off feature bits until the Guest booted. (Don't say that: you'll damage
357 * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
358 * hardly future proof.) No one's listening! They don't like you anyway,
359 * parenthetic weirdo!
360 *
361 * Replacing the cpuid so we can turn features off is great for the kernel, but
362 * anyone (including userspace) can just use the raw "cpuid" instruction and
363 * the Host won't even notice since it isn't privileged. So we try not to get
364 * too worked up about it. */
231static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, 365static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
232 unsigned int *ecx, unsigned int *edx) 366 unsigned int *ecx, unsigned int *edx)
233{ 367{
@@ -240,21 +374,43 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
240 *ecx &= 0x00002201; 374 *ecx &= 0x00002201;
241 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ 375 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
242 *edx &= 0x07808101; 376 *edx &= 0x07808101;
243 /* Host wants to know when we flush kernel pages: set PGE. */ 377 /* The Host can do a nice optimization if it knows that the
378 * kernel mappings (addresses above 0xC0000000 or whatever
379 * PAGE_OFFSET is set to) haven't changed. But Linux calls
380 * flush_tlb_user() for both user and kernel mappings unless
381 * the Page Global Enable (PGE) feature bit is set. */
244 *edx |= 0x00002000; 382 *edx |= 0x00002000;
245 break; 383 break;
246 case 0x80000000: 384 case 0x80000000:
247 /* Futureproof this a little: if they ask how much extended 385 /* Futureproof this a little: if they ask how much extended
248 * processor information, limit it to known fields. */ 386 * processor information there is, limit it to known fields. */
249 if (*eax > 0x80000008) 387 if (*eax > 0x80000008)
250 *eax = 0x80000008; 388 *eax = 0x80000008;
251 break; 389 break;
252 } 390 }
253} 391}
254 392
393/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
394 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
395 * it. The Host needs to know when the Guest wants to change them, so we have
396 * a whole series of functions like read_cr0() and write_cr0().
397 *
398 * We start with CR0. CR0 allows you to turn on and off all kinds of basic
399 * features, but Linux only really cares about one: the horrifically-named Task
400 * Switched (TS) bit at bit 3 (ie. 8).
401 *
402 * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if
403 * the floating point unit is used. Which allows us to restore FPU state
404 * lazily after a task switch, and Linux uses that gratefully, but wouldn't a
405 * name like "FPUTRAP bit" be a little less cryptic?
406 *
407 * We store cr0 (and cr3) locally, because the Host never changes it. The
408 * Guest sometimes wants to read it and we'd prefer not to bother the Host
409 * unnecessarily. */
255static unsigned long current_cr0, current_cr3; 410static unsigned long current_cr0, current_cr3;
256static void lguest_write_cr0(unsigned long val) 411static void lguest_write_cr0(unsigned long val)
257{ 412{
413 /* 8 == TS bit. */
258 lazy_hcall(LHCALL_TS, val & 8, 0, 0); 414 lazy_hcall(LHCALL_TS, val & 8, 0, 0);
259 current_cr0 = val; 415 current_cr0 = val;
260} 416}
@@ -264,17 +420,25 @@ static unsigned long lguest_read_cr0(void)
264 return current_cr0; 420 return current_cr0;
265} 421}
266 422
423/* Intel provided a special instruction to clear the TS bit for people too cool
424 * to use write_cr0() to do it. This "clts" instruction is faster, because all
425 * the vowels have been optimized out. */
267static void lguest_clts(void) 426static void lguest_clts(void)
268{ 427{
269 lazy_hcall(LHCALL_TS, 0, 0, 0); 428 lazy_hcall(LHCALL_TS, 0, 0, 0);
270 current_cr0 &= ~8U; 429 current_cr0 &= ~8U;
271} 430}
272 431
432/* CR2 is the virtual address of the last page fault, which the Guest only ever
433 * reads. The Host kindly writes this into our "struct lguest_data", so we
434 * just read it out of there. */
273static unsigned long lguest_read_cr2(void) 435static unsigned long lguest_read_cr2(void)
274{ 436{
275 return lguest_data.cr2; 437 return lguest_data.cr2;
276} 438}
277 439
440/* CR3 is the current toplevel pagetable page: the principle is the same as
441 * cr0. Keep a local copy, and tell the Host when it changes. */
278static void lguest_write_cr3(unsigned long cr3) 442static void lguest_write_cr3(unsigned long cr3)
279{ 443{
280 lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0); 444 lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
@@ -286,7 +450,7 @@ static unsigned long lguest_read_cr3(void)
286 return current_cr3; 450 return current_cr3;
287} 451}
288 452
289/* Used to enable/disable PGE, but we don't care. */ 453/* CR4 is used to enable and disable PGE, but we don't care. */
290static unsigned long lguest_read_cr4(void) 454static unsigned long lguest_read_cr4(void)
291{ 455{
292 return 0; 456 return 0;
@@ -296,6 +460,59 @@ static void lguest_write_cr4(unsigned long val)
296{ 460{
297} 461}
298 462
463/*
464 * Page Table Handling.
465 *
466 * Now would be a good time to take a rest and grab a coffee or similarly
467 * relaxing stimulant. The easy parts are behind us, and the trek gradually
468 * winds uphill from here.
469 *
470 * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU
471 * maps virtual addresses to physical addresses using "page tables". We could
472 * use one huge index of 1 million entries: each address is 4 bytes, so that's
473 * 1024 pages just to hold the page tables. But since most virtual addresses
474 * are unused, we use a two level index which saves space. The CR3 register
475 * contains the physical address of the top level "page directory" page, which
476 * contains physical addresses of up to 1024 second-level pages. Each of these
477 * second level pages contains up to 1024 physical addresses of actual pages,
478 * or Page Table Entries (PTEs).
479 *
480 * Here's a diagram, where arrows indicate physical addresses:
481 *
482 * CR3 ---> +---------+
483 *          |       --------->+---------+
484 *          |         |       | PADDR1  |
485 *           Top-level|       | PADDR2  |
486 *          (PMD) page|       |         |
487 *          |         |      Lower-level|
488 *          |         |       (PTE) page|
489 *          |         |       |         |
490 *             ....               ....
491 *
492 * So to convert a virtual address to a physical address, we look up the top
493 * level, which points us to the second level, which gives us the physical
494 * address of that page. If the top level entry was not present, or the second
495 * level entry was not present, then the virtual address is invalid (we
496 * say "the page was not mapped").
497 *
498 * Put another way, a 32-bit virtual address is divided up like so:
499 *
500 *  1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
501 * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
502 *    Index into top     Index into second     Offset within page
503 *  page directory page   pagetable page
504 *
505 * The kernel spends a lot of time changing both the top-level page directory
506 * and lower-level pagetable pages. The Guest doesn't know physical addresses,
507 * so while it maintains these page tables exactly like normal, it also needs
508 * to keep the Host informed whenever it makes a change: the Host will create
509 * the real page tables based on the Guests'.
510 */
511
512/* The Guest calls this to set a second-level entry (pte), ie. to map a page
513 * into a process' address space. We set the entry then tell the Host the
514 * toplevel and address this corresponds to. The Guest uses one pagetable per
515 * process, so we need to tell the Host which one we're changing (mm->pgd). */
299static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 516static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
300 pte_t *ptep, pte_t pteval) 517 pte_t *ptep, pte_t pteval)
301{ 518{
@@ -303,7 +520,9 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
303 lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low); 520 lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
304} 521}
305 522
306/* We only support two-level pagetables at the moment. */ 523/* The Guest calls this to set a top-level entry. Again, we set the entry then
524 * tell the Host which top-level page we changed, and the index of the entry we
525 * changed. */
307static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 526static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
308{ 527{
309 *pmdp = pmdval; 528 *pmdp = pmdval;
@@ -311,7 +530,15 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
311 (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); 530 (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
312} 531}
313 532
314/* FIXME: Eliminate all callers of this. */ 533/* There are a couple of legacy places where the kernel sets a PTE, but we
534 * don't know the top level any more. This is useless for us, since we don't
535 * know which pagetable is changing or what address, so we just tell the Host
536 * to forget all of them. Fortunately, this is very rare.
537 *
538 * ... except in early boot when the kernel sets up the initial pagetables,
539 * which makes booting astonishingly slow. So we don't even tell the Host
540 * anything changed until we've done the first page table switch.
541 */
315static void lguest_set_pte(pte_t *ptep, pte_t pteval) 542static void lguest_set_pte(pte_t *ptep, pte_t pteval)
316{ 543{
317 *ptep = pteval; 544 *ptep = pteval;
@@ -320,22 +547,51 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
320 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); 547 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
321} 548}
322 549
550/* Unfortunately for Lguest, the paravirt_ops for page tables were based on
551 * native page table operations. On native hardware you can set a new page
552 * table entry whenever you want, but if you want to remove one you have to do
553 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
554 *
555 * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only
556 * called when a valid entry is written, not when it's removed (ie. marked not
557 * present). Instead, this is where we come when the Guest wants to remove a
558 * page table entry: we tell the Host to set that entry to 0 (ie. the present
559 * bit is zero). */
323static void lguest_flush_tlb_single(unsigned long addr) 560static void lguest_flush_tlb_single(unsigned long addr)
324{ 561{
325 /* Simply set it to zero, and it will fault back in. */ 562 /* Simply set it to zero: if it was not, it will fault back in. */
326 lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0); 563 lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);
327} 564}
328 565
566/* This is what happens after the Guest has removed a large number of entries.
567 * This tells the Host that any of the page table entries for userspace might
568 * have changed, ie. virtual addresses below PAGE_OFFSET. */
329static void lguest_flush_tlb_user(void) 569static void lguest_flush_tlb_user(void)
330{ 570{
331 lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0); 571 lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
332} 572}
333 573
574/* This is called when the kernel page tables have changed. That's not very
575 * common (unless the Guest is using highmem, which makes the Guest extremely
576 * slow), so it's worth separating this from the user flushing above. */
334static void lguest_flush_tlb_kernel(void) 577static void lguest_flush_tlb_kernel(void)
335{ 578{
336 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); 579 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
337} 580}
338 581
582/*
583 * The Unadvanced Programmable Interrupt Controller.
584 *
585 * This is an attempt to implement the simplest possible interrupt controller.
586 * I spent some time looking through routines like set_irq_chip_and_handler,
587 * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and
588 * I *think* this is as simple as it gets.
589 *
590 * We can tell the Host what interrupts we want blocked using the
591 * lguest_data.blocked_interrupts bitmap, so disabling (aka "masking") them is as
592 * simple as setting a bit. We don't actually "ack" interrupts as such, we
593 * just mask and unmask them. I wonder if we should be cleverer?
594 */
339static void disable_lguest_irq(unsigned int irq) 595static void disable_lguest_irq(unsigned int irq)
340{ 596{
341 set_bit(irq, lguest_data.blocked_interrupts); 597 set_bit(irq, lguest_data.blocked_interrupts);
@@ -344,9 +600,9 @@ static void disable_lguest_irq(unsigned int irq)
344static void enable_lguest_irq(unsigned int irq) 600static void enable_lguest_irq(unsigned int irq)
345{ 601{
346 clear_bit(irq, lguest_data.blocked_interrupts); 602 clear_bit(irq, lguest_data.blocked_interrupts);
347 /* FIXME: If it's pending? */
348} 603}
349 604
605/* This structure describes the lguest IRQ controller. */
350static struct irq_chip lguest_irq_controller = { 606static struct irq_chip lguest_irq_controller = {
351 .name = "lguest", 607 .name = "lguest",
352 .mask = disable_lguest_irq, 608 .mask = disable_lguest_irq,
@@ -354,6 +610,10 @@ static struct irq_chip lguest_irq_controller = {
354 .unmask = enable_lguest_irq, 610 .unmask = enable_lguest_irq,
355}; 611};
356 612
613/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
614 * interrupt (except 128, which is used for system calls), and then tells the
615 * Linux infrastructure that each interrupt is controlled by our level-based
616 * lguest interrupt controller. */
357static void __init lguest_init_IRQ(void) 617static void __init lguest_init_IRQ(void)
358{ 618{
359 unsigned int i; 619 unsigned int i;
@@ -366,14 +626,24 @@ static void __init lguest_init_IRQ(void)
366 handle_level_irq); 626 handle_level_irq);
367 } 627 }
368 } 628 }
629 /* This call is required to set up for 4k stacks, where we have
630 * separate stacks for hard and soft interrupts. */
369 irq_ctx_init(smp_processor_id()); 631 irq_ctx_init(smp_processor_id());
370} 632}
371 633
634/*
635 * Time.
636 *
637 * It would be far better for everyone if the Guest had its own clock, but
638 * until then it must ask the Host for the time.
639 */
372static unsigned long lguest_get_wallclock(void) 640static unsigned long lguest_get_wallclock(void)
373{ 641{
374 return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0); 642 return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);
375} 643}
376 644
645/* If the Host tells us we can trust the TSC, we use that, otherwise we simply
646 * use the imprecise but reliable "jiffies" counter. */
377static cycle_t lguest_clock_read(void) 647static cycle_t lguest_clock_read(void)
378{ 648{
379 if (lguest_data.tsc_khz) 649 if (lguest_data.tsc_khz)
@@ -454,12 +724,19 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
454 local_irq_restore(flags); 724 local_irq_restore(flags);
455} 725}
456 726
727/* At some point in the boot process, we get asked to set up our timing
728 * infrastructure. The kernel doesn't expect timer interrupts before this, but
729 * we cleverly initialized the "blocked_interrupts" field of "struct
730 * lguest_data" so that timer interrupts were blocked until now. */
457static void lguest_time_init(void) 731static void lguest_time_init(void)
458{ 732{
733 /* Set up the timer interrupt (0) to go to our simple timer routine */
459 set_irq_handler(0, lguest_time_irq); 734 set_irq_handler(0, lguest_time_irq);
460 735
461 /* We use the TSC if the Host tells us we can, otherwise a dumb 736 /* Our clock structure looks like arch/i386/kernel/tsc.c if we can use
462 * jiffies-based clock. */ 737 * the TSC, otherwise it looks like kernel/time/jiffies.c. Either way,
738 * the "rating" is initialized so high that it's always chosen over any
739 * other clocksource. */
463 if (lguest_data.tsc_khz) { 740 if (lguest_data.tsc_khz) {
464 lguest_clock.shift = 22; 741 lguest_clock.shift = 22;
465 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, 742 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
@@ -475,13 +752,30 @@ static void lguest_time_init(void)
475 clock_base = lguest_clock_read(); 752 clock_base = lguest_clock_read();
476 clocksource_register(&lguest_clock); 753 clocksource_register(&lguest_clock);
477 754
478 /* We can't set cpumask in the initializer: damn C limitations! */ 755 /* We can't set cpumask in the initializer: damn C limitations! Set it
756 * here and register our timer device. */
479 lguest_clockevent.cpumask = cpumask_of_cpu(0); 757 lguest_clockevent.cpumask = cpumask_of_cpu(0);
480 clockevents_register_device(&lguest_clockevent); 758 clockevents_register_device(&lguest_clockevent);
481 759
760 /* Finally, we unblock the timer interrupt. */
482 enable_lguest_irq(0); 761 enable_lguest_irq(0);
483} 762}
484 763
764/*
765 * Miscellaneous bits and pieces.
766 *
767 * Here is an oddball collection of functions which the Guest needs for things
768 * to work. They're pretty simple.
769 */
770
771/* The Guest needs to tell the Host what stack it expects traps to use. For
772 * native hardware, this is part of the Task State Segment mentioned above in
773 * lguest_load_tr_desc(), but to help hypervisors there's this special call.
774 *
775 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
776 * segment), the privilege level (we're privilege level 1, the Host is 0 and
777 * will not tolerate us trying to use that), the stack pointer, and the number
778 * of pages in the stack. */
485static void lguest_load_esp0(struct tss_struct *tss, 779static void lguest_load_esp0(struct tss_struct *tss,
486 struct thread_struct *thread) 780 struct thread_struct *thread)
487{ 781{
@@ -489,15 +783,31 @@ static void lguest_load_esp0(struct tss_struct *tss,
489 THREAD_SIZE/PAGE_SIZE); 783 THREAD_SIZE/PAGE_SIZE);
490} 784}
491 785
786/* Let's just say, I wouldn't do debugging under a Guest. */
492static void lguest_set_debugreg(int regno, unsigned long value) 787static void lguest_set_debugreg(int regno, unsigned long value)
493{ 788{
494 /* FIXME: Implement */ 789 /* FIXME: Implement */
495} 790}
496 791
792/* There are times when the kernel wants to make sure that no memory writes are
793 * caught in the cache (that they've all reached real hardware devices). This
794 * doesn't matter for the Guest which has virtual hardware.
795 *
796 * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush
797 * (clflush) instruction is available and the kernel uses that. Otherwise, it
798 * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction.
799 * Unlike clflush, wbinvd can only be run at privilege level 0. So we can
800 * ignore clflush, but replace wbinvd.
801 */
497static void lguest_wbinvd(void) 802static void lguest_wbinvd(void)
498{ 803{
499} 804}
500 805
806/* If the Guest expects to have an Advanced Programmable Interrupt Controller,
807 * we play dumb by ignoring writes and returning 0 for reads. So it's no
808 * longer Programmable nor Controlling anything, and I don't think 8 lines of
809 * code qualifies for Advanced. It will also never interrupt anything. It
810 * does, however, allow us to get through the Linux boot code. */
501#ifdef CONFIG_X86_LOCAL_APIC 811#ifdef CONFIG_X86_LOCAL_APIC
502static void lguest_apic_write(unsigned long reg, unsigned long v) 812static void lguest_apic_write(unsigned long reg, unsigned long v)
503{ 813{
@@ -509,19 +819,32 @@ static unsigned long lguest_apic_read(unsigned long reg)
509} 819}
510#endif 820#endif
511 821
822/* STOP! Until an interrupt comes in. */
512static void lguest_safe_halt(void) 823static void lguest_safe_halt(void)
513{ 824{
514 hcall(LHCALL_HALT, 0, 0, 0); 825 hcall(LHCALL_HALT, 0, 0, 0);
515} 826}
516 827
828/* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a
829 * message out when we're crashing as well as elegant termination like powering
830 * off.
831 *
832 * Note that the Host always prefers that the Guest speak in physical addresses
833 * rather than virtual addresses, so we use __pa() here. */
517static void lguest_power_off(void) 834static void lguest_power_off(void)
518{ 835{
519 hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); 836 hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
520} 837}
521 838
839/*
840 * Panicking.
841 *
842 * Don't. But if you did, this is what happens.
843 */
522static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) 844static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
523{ 845{
524 hcall(LHCALL_CRASH, __pa(p), 0, 0); 846 hcall(LHCALL_CRASH, __pa(p), 0, 0);
847 /* The hcall won't return, but to keep gcc happy, we're "done". */
525 return NOTIFY_DONE; 848 return NOTIFY_DONE;
526} 849}
527 850
@@ -529,15 +852,45 @@ static struct notifier_block paniced = {
529 .notifier_call = lguest_panic 852 .notifier_call = lguest_panic
530}; 853};
531 854
855/* Setting up memory is fairly easy. */
532static __init char *lguest_memory_setup(void) 856static __init char *lguest_memory_setup(void)
533{ 857{
534 /* We do this here because lockcheck barfs if before start_kernel */ 858 /* We do this here and not earlier because lockcheck barfs if we do it
859 * before start_kernel() */
535 atomic_notifier_chain_register(&panic_notifier_list, &paniced); 860 atomic_notifier_chain_register(&panic_notifier_list, &paniced);
536 861
862 /* The Linux bootloader header contains an "e820" memory map: the
863 * Launcher populated the first entry with our memory limit. */
537 add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type); 864 add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type);
865
866 /* This string is for the boot messages. */
538 return "LGUEST"; 867 return "LGUEST";
539} 868}
540 869
870/*G:050
871 * Patching (Powerfully Placating Performance Pedants)
872 *
873 * We have already seen that "struct paravirt_ops" lets us replace simple
874 * native instructions with calls to the appropriate back end all throughout
875 * the kernel. This allows the same kernel to run as a Guest and as a native
876 * kernel, but it's slow because of all the indirect branches.
877 *
878 * Remember that David Wheeler quote about "Any problem in computer science can
879 * be solved with another layer of indirection"? The rest of that quote is
880 * "... But that usually will create another problem." This is the first of
881 * those problems.
882 *
883 * Our current solution is to allow the paravirt back end to optionally patch
884 * over the indirect calls to replace them with something more efficient. We
885 * patch the four most commonly called functions: disable interrupts, enable
886 * interrupts, restore interrupts and save interrupts. We usually have 10
887 * bytes to patch into: the Guest versions of these operations are small enough
888 * that we can fit comfortably.
889 *
890 * First we need assembly templates of each of the patchable Guest operations,
891 * and these are in lguest_asm.S. */
892
893/*G:060 We construct a table from the assembler templates: */
541static const struct lguest_insns 894static const struct lguest_insns
542{ 895{
543 const char *start, *end; 896 const char *start, *end;
@@ -547,35 +900,52 @@ static const struct lguest_insns
547 [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf }, 900 [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf },
548 [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf }, 901 [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf },
549}; 902};
903
904/* Now our patch routine is fairly simple (based on the native one in
905 * paravirt.c). If we have a replacement, we copy it in and return how much of
906 * the available space we used. */
550static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len) 907static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
551{ 908{
552 unsigned int insn_len; 909 unsigned int insn_len;
553 910
554 /* Don't touch it if we don't have a replacement */ 911 /* Don't do anything special if we don't have a replacement */
555 if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) 912 if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
556 return paravirt_patch_default(type, clobber, insns, len); 913 return paravirt_patch_default(type, clobber, insns, len);
557 914
558 insn_len = lguest_insns[type].end - lguest_insns[type].start; 915 insn_len = lguest_insns[type].end - lguest_insns[type].start;
559 916
560 /* Similarly if we can't fit replacement. */ 917 /* Similarly if we can't fit replacement (shouldn't happen, but let's
918 * be thorough). */
561 if (len < insn_len) 919 if (len < insn_len)
562 return paravirt_patch_default(type, clobber, insns, len); 920 return paravirt_patch_default(type, clobber, insns, len);
563 921
922 /* Copy in our instructions. */
564 memcpy(insns, lguest_insns[type].start, insn_len); 923 memcpy(insns, lguest_insns[type].start, insn_len);
565 return insn_len; 924 return insn_len;
566} 925}
567 926
927/*G:030 Once we get to lguest_init(), we know we're a Guest. The paravirt_ops
928 * structure in the kernel provides a single point for (almost) every routine
929 * we have to override to avoid privileged instructions. */
568__init void lguest_init(void *boot) 930__init void lguest_init(void *boot)
569{ 931{
570 /* Copy boot parameters first. */ 932 /* Copy boot parameters first: the Launcher put the physical location
933 * in %esi, and head.S converted that to a virtual address and handed
934 * it to us. */
571 memcpy(&boot_params, boot, PARAM_SIZE); 935 memcpy(&boot_params, boot, PARAM_SIZE);
936 /* The boot parameters also tell us where the command-line is: save
937 * that, too. */
572 memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr), 938 memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr),
573 COMMAND_LINE_SIZE); 939 COMMAND_LINE_SIZE);
574 940
941 /* We're under lguest, paravirt is enabled, and we're running at
942 * privilege level 1, not 0 as normal. */
575 paravirt_ops.name = "lguest"; 943 paravirt_ops.name = "lguest";
576 paravirt_ops.paravirt_enabled = 1; 944 paravirt_ops.paravirt_enabled = 1;
577 paravirt_ops.kernel_rpl = 1; 945 paravirt_ops.kernel_rpl = 1;
578 946
947 /* We set up all the lguest overrides for sensitive operations. These
948 * are detailed with the operations themselves. */
579 paravirt_ops.save_fl = save_fl; 949 paravirt_ops.save_fl = save_fl;
580 paravirt_ops.restore_fl = restore_fl; 950 paravirt_ops.restore_fl = restore_fl;
581 paravirt_ops.irq_disable = irq_disable; 951 paravirt_ops.irq_disable = irq_disable;
@@ -619,20 +989,45 @@ __init void lguest_init(void *boot)
619 paravirt_ops.set_lazy_mode = lguest_lazy_mode; 989 paravirt_ops.set_lazy_mode = lguest_lazy_mode;
620 paravirt_ops.wbinvd = lguest_wbinvd; 990 paravirt_ops.wbinvd = lguest_wbinvd;
621 paravirt_ops.sched_clock = lguest_sched_clock; 991 paravirt_ops.sched_clock = lguest_sched_clock;
622 992 /* Now is a good time to look at the implementations of these functions
993 * before returning to the rest of lguest_init(). */
994
995 /*G:070 Now we've seen all the paravirt_ops, we return to
996 * lguest_init() where the rest of the fairly chaotic boot setup
997 * occurs.
998 *
999 * The Host expects our first hypercall to tell it where our "struct
1000 * lguest_data" is, so we do that first. */
623 hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0); 1001 hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
624 1002
625 /* We use top of mem for initial pagetables. */ 1003 /* The native boot code sets up initial page tables immediately after
1004 * the kernel itself, and sets init_pg_tables_end so they're not
1005 * clobbered. The Launcher places our initial pagetables somewhere at
1006 * the top of our physical memory, so we don't need extra space: set
1007 * init_pg_tables_end to the end of the kernel. */
626 init_pg_tables_end = __pa(pg0); 1008 init_pg_tables_end = __pa(pg0);
627 1009
1010 /* Load the %fs segment register (the per-cpu segment register) with
1011 * the normal data segment to get through booting. */
628 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1012 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
629 1013
1014 /* The Host uses the top of the Guest's virtual address space for the
1015 * Host<->Guest Switcher, and it tells us how much it needs in
1016 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */
630 reserve_top_address(lguest_data.reserve_mem); 1017 reserve_top_address(lguest_data.reserve_mem);
631 1018
1019 /* If we don't initialize the lock dependency checker now, it crashes
1020 * paravirt_disable_iospace. */
632 lockdep_init(); 1021 lockdep_init();
633 1022
1023 /* The IDE code spends about 3 seconds probing for disks: if we reserve
1024 * all the I/O ports up front it can't get them and so doesn't probe.
1025 * Other device drivers are similar (but less severe). This cuts the
1026 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */
634 paravirt_disable_iospace(); 1027 paravirt_disable_iospace();
635 1028
1029 /* This is messy CPU setup stuff which the native boot code does before
1030 * start_kernel, so we have to do, too: */
636 cpu_detect(&new_cpu_data); 1031 cpu_detect(&new_cpu_data);
637 /* head.S usually sets up the first capability word, so do it here. */ 1032 /* head.S usually sets up the first capability word, so do it here. */
638 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1033 new_cpu_data.x86_capability[0] = cpuid_edx(1);
@@ -643,14 +1038,27 @@ __init void lguest_init(void *boot)
643#ifdef CONFIG_X86_MCE 1038#ifdef CONFIG_X86_MCE
644 mce_disabled = 1; 1039 mce_disabled = 1;
645#endif 1040#endif
646
647#ifdef CONFIG_ACPI 1041#ifdef CONFIG_ACPI
648 acpi_disabled = 1; 1042 acpi_disabled = 1;
649 acpi_ht = 0; 1043 acpi_ht = 0;
650#endif 1044#endif
651 1045
1046 /* We set the preferred console to "hvc". This is the "hypervisor
1047 * virtual console" driver written by the PowerPC people, which we also
1048 * adapted for lguest's use. */
652 add_preferred_console("hvc", 0, NULL); 1049 add_preferred_console("hvc", 0, NULL);
653 1050
1051 /* Last of all, we set the power management poweroff hook to point to
1052 * the Guest routine to power off. */
654 pm_power_off = lguest_power_off; 1053 pm_power_off = lguest_power_off;
1054
1055 /* Now we're set up, call start_kernel() in init/main.c and we proceed
1056 * to boot as normal. It never returns. */
655 start_kernel(); 1057 start_kernel();
656} 1058}
1059/*
1060 * This marks the end of stage II of our journey, The Guest.
1061 *
1062 * It is now time for us to explore the nooks and crannies of the three Guest
1063 * devices and complete our understanding of the Guest in "make Drivers".
1064 */
diff --git a/drivers/lguest/lguest_asm.S b/drivers/lguest/lguest_asm.S
index a3dbf22ee365..3126ae923cc0 100644
--- a/drivers/lguest/lguest_asm.S
+++ b/drivers/lguest/lguest_asm.S
@@ -4,15 +4,15 @@
4#include <asm/thread_info.h> 4#include <asm/thread_info.h>
5#include <asm/processor-flags.h> 5#include <asm/processor-flags.h>
6 6
7/* 7/*G:020 This is where we begin: we have a magic signature which the launcher
8 * This is where we begin: we have a magic signature which the launcher looks 8 * looks for. The plan is that the Linux boot protocol will be extended with a
9 * for. The plan is that the Linux boot protocol will be extended with a
10 * "platform type" field which will guide us here from the normal entry point, 9 * "platform type" field which will guide us here from the normal entry point,
11 * but for the moment this suffices. We pass the virtual address of the boot 10 * but for the moment this suffices. The normal boot code uses %esi for the
12 * info to lguest_init(). 11 * boot header, so we do too. We convert it to a virtual address by adding
12 * PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax).
13 * 13 *
14 * We put it in .init.text will be discarded after boot. 14 * The .section line puts this code in .init.text so it will be discarded after
15 */ 15 * boot. */
16.section .init.text, "ax", @progbits 16.section .init.text, "ax", @progbits
17.ascii "GenuineLguest" 17.ascii "GenuineLguest"
18 /* Set up initial stack. */ 18 /* Set up initial stack. */
@@ -21,7 +21,9 @@
21 addl $__PAGE_OFFSET, %eax 21 addl $__PAGE_OFFSET, %eax
22 jmp lguest_init 22 jmp lguest_init
23 23
24/* The templates for inline patching. */ 24/*G:055 We create a macro which puts the assembler code between lgstart_ and
25 * lgend_ markers. These templates end up in the .init.text section, so they
26 * are discarded after boot. */
25#define LGUEST_PATCH(name, insns...) \ 27#define LGUEST_PATCH(name, insns...) \
26 lgstart_##name: insns; lgend_##name:; \ 28 lgstart_##name: insns; lgend_##name:; \
27 .globl lgstart_##name; .globl lgend_##name 29 .globl lgstart_##name; .globl lgend_##name
@@ -30,24 +32,47 @@ LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
30LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) 32LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
31LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) 33LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
32LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 34LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
35/*:*/
33 36
34.text 37.text
35/* These demark the EIP range where host should never deliver interrupts. */ 38/* These demark the EIP range where host should never deliver interrupts. */
36.global lguest_noirq_start 39.global lguest_noirq_start
37.global lguest_noirq_end 40.global lguest_noirq_end
38 41
39/* 42/*G:045 There is one final paravirt_op that the Guest implements, and glancing
40 * We move eflags word to lguest_data.irq_enabled to restore interrupt state. 43 * at it you can see why I left it to last. It's *cool*! It's in *assembler*!
41 * For page faults, gpfs and virtual interrupts, the hypervisor has saved 44 *
42 * eflags manually, otherwise it was delivered directly and so eflags reflects 45 * The "iret" instruction is used to return from an interrupt or trap. The
43 * the real machine IF state, ie. interrupts on. Since the kernel always dies 46 * stack looks like this:
44 * if it takes such a trap with interrupts disabled anyway, turning interrupts 47 * old address
45 * back on unconditionally here is OK. 48 * old code segment & privilege level
46 */ 49 * old processor flags ("eflags")
50 *
51 * The "iret" instruction pops those values off the stack and restores them all
52 * at once. The only problem is that eflags includes the Interrupt Flag which
53 * the Guest can't change: the CPU will simply ignore it when we do an "iret".
54 * So we have to copy eflags from the stack to lguest_data.irq_enabled before
55 * we do the "iret".
56 *
57 * There are two problems with this: firstly, we need to use a register to do
58 * the copy and secondly, the whole thing needs to be atomic. The first
59 * problem is easy to solve: push %eax on the stack so we can use it, and then
60 * restore it at the end just before the real "iret".
61 *
62 * The second is harder: copying eflags to lguest_data.irq_enabled will turn
63 * interrupts on before we're finished, so we could be interrupted before we
64 * return to userspace or wherever. Our solution to this is to surround the
65 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the
66 * Host that it is *never* to interrupt us there, even if interrupts seem to be
67 * enabled. */
47ENTRY(lguest_iret) 68ENTRY(lguest_iret)
48 pushl %eax 69 pushl %eax
49 movl 12(%esp), %eax 70 movl 12(%esp), %eax
50lguest_noirq_start: 71lguest_noirq_start:
72 /* Note the %ss: segment prefix here. Normal data accesses use the
73 * "ds" segment, but that will have already been restored for whatever
74 * we're returning to (such as userspace): we can't trust it. The %ss:
75 * prefix makes sure we use the stack segment, which is still valid. */
51 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled 76 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
52 popl %eax 77 popl %eax
53 iret 78 iret
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
index 500aace21ca7..e76c151c7129 100644
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -27,18 +27,38 @@
27#define LG_CLOCK_MIN_DELTA 100UL 27#define LG_CLOCK_MIN_DELTA 100UL
28#define LG_CLOCK_MAX_DELTA ULONG_MAX 28#define LG_CLOCK_MAX_DELTA ULONG_MAX
29 29
30/*G:031 First, how does our Guest contact the Host to ask for privileged
31 * operations? There are two ways: the direct way is to make a "hypercall",
32 * to make requests of the Host Itself.
33 *
34 * Our hypercall mechanism uses the highest unused trap code (traps 32 and
35 * above are used by real hardware interrupts). Seventeen hypercalls are
36 * available: the hypercall number is put in the %eax register, and the
37 * arguments (when required) are placed in %edx, %ebx and %ecx. If a return
38 * value makes sense, it's returned in %eax.
39 *
40 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
41 * Host, rather than returning failure. This reflects Winston Churchill's
42 * definition of a gentleman: "someone who is only rude intentionally". */
30#define LGUEST_TRAP_ENTRY 0x1F 43#define LGUEST_TRAP_ENTRY 0x1F
31 44
32static inline unsigned long 45static inline unsigned long
33hcall(unsigned long call, 46hcall(unsigned long call,
34 unsigned long arg1, unsigned long arg2, unsigned long arg3) 47 unsigned long arg1, unsigned long arg2, unsigned long arg3)
35{ 48{
49 /* "int" is the Intel instruction to trigger a trap. */
36 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) 50 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
51 /* The call is in %eax (aka "a"), and can be replaced */
37 : "=a"(call) 52 : "=a"(call)
53 /* The other arguments are in %eax, %edx, %ebx & %ecx */
38 : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) 54 : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
55 /* "memory" means this might write somewhere in memory.
56 * This isn't true for all calls, but it's safe to tell
57 * gcc that it might happen so it doesn't get clever. */
39 : "memory"); 58 : "memory");
40 return call; 59 return call;
41} 60}
61/*:*/
42 62
43void async_hcall(unsigned long call, 63void async_hcall(unsigned long call,
44 unsigned long arg1, unsigned long arg2, unsigned long arg3); 64 unsigned long arg1, unsigned long arg2, unsigned long arg3);
@@ -52,31 +72,40 @@ struct hcall_ring
52 u32 eax, edx, ebx, ecx; 72 u32 eax, edx, ebx, ecx;
53}; 73};
54 74
55/* All the good stuff happens here: guest registers it with LGUEST_INIT */ 75/*G:032 The second method of communicating with the Host is via "struct
76 * lguest_data". The Guest's very first hypercall is to tell the Host where
77 * this is, and then the Guest and Host both publish information in it. :*/
56struct lguest_data 78struct lguest_data
57{ 79{
58/* Fields which change during running: */ 80 /* 512 == enabled (same as eflags in normal hardware). The Guest
59 /* 512 == enabled (same as eflags) */ 81 * changes interrupts so often that a hypercall is too slow. */
60 unsigned int irq_enabled; 82 unsigned int irq_enabled;
61 /* Interrupts blocked by guest. */ 83 /* Fine-grained interrupt disabling by the Guest */
62 DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS); 84 DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS);
63 85
64 /* Virtual address of page fault. */ 86 /* The Host writes the virtual address of the last page fault here,
87 * which saves the Guest a hypercall. CR2 is the native register where
88 * this address would normally be found. */
65 unsigned long cr2; 89 unsigned long cr2;
66 90
67 /* Async hypercall ring. 0xFF == done, 0 == pending. */ 91 /* Async hypercall ring. Instead of directly making hypercalls, we can
92 * place them in here for processing the next time the Host wants.
93 * This batching can be quite efficient. */
94
95 /* 0xFF == done (set by Host), 0 == pending (set by Guest). */
68 u8 hcall_status[LHCALL_RING_SIZE]; 96 u8 hcall_status[LHCALL_RING_SIZE];
97 /* The actual registers for the hypercalls. */
69 struct hcall_ring hcalls[LHCALL_RING_SIZE]; 98 struct hcall_ring hcalls[LHCALL_RING_SIZE];
70 99
71/* Fields initialized by the hypervisor at boot: */ 100/* Fields initialized by the Host at boot: */
72 /* Memory not to try to access */ 101 /* Memory not to try to access */
73 unsigned long reserve_mem; 102 unsigned long reserve_mem;
74 /* ID of this guest (used by network driver to set ethernet address) */ 103 /* ID of this Guest (used by network driver to set ethernet address) */
75 u16 guestid; 104 u16 guestid;
76 /* KHz for the TSC clock. */ 105 /* KHz for the TSC clock. */
77 u32 tsc_khz; 106 u32 tsc_khz;
78 107
79/* Fields initialized by the guest at boot: */ 108/* Fields initialized by the Guest at boot: */
80 /* Instruction range to suppress interrupts even if enabled */ 109 /* Instruction range to suppress interrupts even if enabled */
81 unsigned long noirq_start, noirq_end; 110 unsigned long noirq_start, noirq_end;
82}; 111};