diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2009-07-30 18:03:45 -0400 |
---|---|---|
committer | Rusty Russell <rusty@rustcorp.com.au> | 2009-07-30 02:33:45 -0400 |
commit | 2e04ef76916d1e29a077ea9d0f2003c8fd86724d (patch) | |
tree | 2ff8d625d6e467be9f9f1b67a3674cb6e125e970 /arch/x86/lguest/boot.c | |
parent | e969fed542cae08cb11d666efac4f7c5d624d09f (diff) |
lguest: fix comment style
I don't really notice it (except to begrudge the extra vertical
space), but Ingo does. And he pointed out that one excuse of lguest
is as a teaching tool, it should set a good example.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@redhat.com>
Diffstat (limited to 'arch/x86/lguest/boot.c')
-rw-r--r-- | arch/x86/lguest/boot.c | 428 |
1 files changed, 279 insertions, 149 deletions
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f2bf1f73d468..025c04d18f2b 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -22,7 +22,8 @@ | |||
22 | * | 22 | * |
23 | * So how does the kernel know it's a Guest? We'll see that later, but let's | 23 | * So how does the kernel know it's a Guest? We'll see that later, but let's |
24 | * just say that we end up here where we replace the native functions various | 24 | * just say that we end up here where we replace the native functions various |
25 | * "paravirt" structures with our Guest versions, then boot like normal. :*/ | 25 | * "paravirt" structures with our Guest versions, then boot like normal. |
26 | :*/ | ||
26 | 27 | ||
27 | /* | 28 | /* |
28 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. | 29 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. |
@@ -74,7 +75,8 @@ | |||
74 | * | 75 | * |
75 | * The Guest in our tale is a simple creature: identical to the Host but | 76 | * The Guest in our tale is a simple creature: identical to the Host but |
76 | * behaving in simplified but equivalent ways. In particular, the Guest is the | 77 | * behaving in simplified but equivalent ways. In particular, the Guest is the |
77 | * same kernel as the Host (or at least, built from the same source code). :*/ | 78 | * same kernel as the Host (or at least, built from the same source code). |
79 | :*/ | ||
78 | 80 | ||
79 | struct lguest_data lguest_data = { | 81 | struct lguest_data lguest_data = { |
80 | .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, | 82 | .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, |
@@ -85,7 +87,8 @@ struct lguest_data lguest_data = { | |||
85 | .syscall_vec = SYSCALL_VECTOR, | 87 | .syscall_vec = SYSCALL_VECTOR, |
86 | }; | 88 | }; |
87 | 89 | ||
88 | /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a | 90 | /*G:037 |
91 | * async_hcall() is pretty simple: I'm quite proud of it really. We have a | ||
89 | * ring buffer of stored hypercalls which the Host will run through next time we | 92 | * ring buffer of stored hypercalls which the Host will run through next time we |
90 | * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall | 93 | * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall |
91 | * arguments, and a "hcall_status" word which is 0 if the call is ready to go, | 94 | * arguments, and a "hcall_status" word which is 0 if the call is ready to go, |
@@ -94,7 +97,8 @@ struct lguest_data lguest_data = { | |||
94 | * If we come around to a slot which hasn't been finished, then the table is | 97 | * If we come around to a slot which hasn't been finished, then the table is |
95 | * full and we just make the hypercall directly. This has the nice side | 98 | * full and we just make the hypercall directly. This has the nice side |
96 | * effect of causing the Host to run all the stored calls in the ring buffer | 99 | * effect of causing the Host to run all the stored calls in the ring buffer |
97 | * which empties it for next time! */ | 100 | * which empties it for next time! |
101 | */ | ||
98 | static void async_hcall(unsigned long call, unsigned long arg1, | 102 | static void async_hcall(unsigned long call, unsigned long arg1, |
99 | unsigned long arg2, unsigned long arg3, | 103 | unsigned long arg2, unsigned long arg3, |
100 | unsigned long arg4) | 104 | unsigned long arg4) |
@@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
103 | static unsigned int next_call; | 107 | static unsigned int next_call; |
104 | unsigned long flags; | 108 | unsigned long flags; |
105 | 109 | ||
106 | /* Disable interrupts if not already disabled: we don't want an | 110 | /* |
111 | * Disable interrupts if not already disabled: we don't want an | ||
107 | * interrupt handler making a hypercall while we're already doing | 112 | * interrupt handler making a hypercall while we're already doing |
108 | * one! */ | 113 | * one! |
114 | */ | ||
109 | local_irq_save(flags); | 115 | local_irq_save(flags); |
110 | if (lguest_data.hcall_status[next_call] != 0xFF) { | 116 | if (lguest_data.hcall_status[next_call] != 0xFF) { |
111 | /* Table full, so do normal hcall which will flush table. */ | 117 | /* Table full, so do normal hcall which will flush table. */ |
@@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
125 | local_irq_restore(flags); | 131 | local_irq_restore(flags); |
126 | } | 132 | } |
127 | 133 | ||
128 | /*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first | 134 | /*G:035 |
129 | * real optimization trick! | 135 | * Notice the lazy_hcall() above, rather than hcall(). This is our first real |
136 | * optimization trick! | ||
130 | * | 137 | * |
131 | * When lazy_mode is set, it means we're allowed to defer all hypercalls and do | 138 | * When lazy_mode is set, it means we're allowed to defer all hypercalls and do |
132 | * them as a batch when lazy_mode is eventually turned off. Because hypercalls | 139 | * them as a batch when lazy_mode is eventually turned off. Because hypercalls |
@@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
136 | * lguest_leave_lazy_mode(). | 143 | * lguest_leave_lazy_mode(). |
137 | * | 144 | * |
138 | * So, when we're in lazy mode, we call async_hcall() to store the call for | 145 | * So, when we're in lazy mode, we call async_hcall() to store the call for |
139 | * future processing: */ | 146 | * future processing: |
147 | */ | ||
140 | static void lazy_hcall1(unsigned long call, | 148 | static void lazy_hcall1(unsigned long call, |
141 | unsigned long arg1) | 149 | unsigned long arg1) |
142 | { | 150 | { |
@@ -208,9 +216,11 @@ static void lguest_end_context_switch(struct task_struct *next) | |||
208 | * check there before it tries to deliver an interrupt. | 216 | * check there before it tries to deliver an interrupt. |
209 | */ | 217 | */ |
210 | 218 | ||
211 | /* save_flags() is expected to return the processor state (ie. "flags"). The | 219 | /* |
220 | * save_flags() is expected to return the processor state (ie. "flags"). The | ||
212 | * flags word contains all kinds of stuff, but in practice Linux only cares | 221 | * flags word contains all kinds of stuff, but in practice Linux only cares |
213 | * about the interrupt flag. Our "save_flags()" just returns that. */ | 222 | * about the interrupt flag. Our "save_flags()" just returns that. |
223 | */ | ||
214 | static unsigned long save_fl(void) | 224 | static unsigned long save_fl(void) |
215 | { | 225 | { |
216 | return lguest_data.irq_enabled; | 226 | return lguest_data.irq_enabled; |
@@ -222,13 +232,15 @@ static void irq_disable(void) | |||
222 | lguest_data.irq_enabled = 0; | 232 | lguest_data.irq_enabled = 0; |
223 | } | 233 | } |
224 | 234 | ||
225 | /* Let's pause a moment. Remember how I said these are called so often? | 235 | /* |
236 | * Let's pause a moment. Remember how I said these are called so often? | ||
226 | * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to | 237 | * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to |
227 | * break some rules. In particular, these functions are assumed to save their | 238 | * break some rules. In particular, these functions are assumed to save their |
228 | * own registers if they need to: normal C functions assume they can trash the | 239 | * own registers if they need to: normal C functions assume they can trash the |
229 | * eax register. To use normal C functions, we use | 240 | * eax register. To use normal C functions, we use |
230 | * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the | 241 | * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the |
231 | * C function, then restores it. */ | 242 | * C function, then restores it. |
243 | */ | ||
232 | PV_CALLEE_SAVE_REGS_THUNK(save_fl); | 244 | PV_CALLEE_SAVE_REGS_THUNK(save_fl); |
233 | PV_CALLEE_SAVE_REGS_THUNK(irq_disable); | 245 | PV_CALLEE_SAVE_REGS_THUNK(irq_disable); |
234 | /*:*/ | 246 | /*:*/ |
@@ -237,18 +249,20 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable); | |||
237 | extern void lg_irq_enable(void); | 249 | extern void lg_irq_enable(void); |
238 | extern void lg_restore_fl(unsigned long flags); | 250 | extern void lg_restore_fl(unsigned long flags); |
239 | 251 | ||
240 | /*M:003 Note that we don't check for outstanding interrupts when we re-enable | 252 | /*M:003 |
241 | * them (or when we unmask an interrupt). This seems to work for the moment, | 253 | * Note that we don't check for outstanding interrupts when we re-enable them |
242 | * since interrupts are rare and we'll just get the interrupt on the next timer | 254 | * (or when we unmask an interrupt). This seems to work for the moment, since |
243 | * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way | 255 | * interrupts are rare and we'll just get the interrupt on the next timer tick, |
244 | * would be to put the "irq_enabled" field in a page by itself, and have the | 256 | * but now we can run with CONFIG_NO_HZ, we should revisit this. One way would |
245 | * Host write-protect it when an interrupt comes in when irqs are disabled. | 257 | * be to put the "irq_enabled" field in a page by itself, and have the Host |
246 | * There will then be a page fault as soon as interrupts are re-enabled. | 258 | * write-protect it when an interrupt comes in when irqs are disabled. There |
259 | * will then be a page fault as soon as interrupts are re-enabled. | ||
247 | * | 260 | * |
248 | * A better method is to implement soft interrupt disable generally for x86: | 261 | * A better method is to implement soft interrupt disable generally for x86: |
249 | * instead of disabling interrupts, we set a flag. If an interrupt does come | 262 | * instead of disabling interrupts, we set a flag. If an interrupt does come |
250 | * in, we then disable them for real. This is uncommon, so we could simply use | 263 | * in, we then disable them for real. This is uncommon, so we could simply use |
251 | * a hypercall for interrupt control and not worry about efficiency. :*/ | 264 | * a hypercall for interrupt control and not worry about efficiency. |
265 | :*/ | ||
252 | 266 | ||
253 | /*G:034 | 267 | /*G:034 |
254 | * The Interrupt Descriptor Table (IDT). | 268 | * The Interrupt Descriptor Table (IDT). |
@@ -261,10 +275,12 @@ extern void lg_restore_fl(unsigned long flags); | |||
261 | static void lguest_write_idt_entry(gate_desc *dt, | 275 | static void lguest_write_idt_entry(gate_desc *dt, |
262 | int entrynum, const gate_desc *g) | 276 | int entrynum, const gate_desc *g) |
263 | { | 277 | { |
264 | /* The gate_desc structure is 8 bytes long: we hand it to the Host in | 278 | /* |
279 | * The gate_desc structure is 8 bytes long: we hand it to the Host in | ||
265 | * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors | 280 | * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors |
266 | * around like this; typesafety wasn't a big concern in Linux's early | 281 | * around like this; typesafety wasn't a big concern in Linux's early |
267 | * years. */ | 282 | * years. |
283 | */ | ||
268 | u32 *desc = (u32 *)g; | 284 | u32 *desc = (u32 *)g; |
269 | /* Keep the local copy up to date. */ | 285 | /* Keep the local copy up to date. */ |
270 | native_write_idt_entry(dt, entrynum, g); | 286 | native_write_idt_entry(dt, entrynum, g); |
@@ -272,9 +288,11 @@ static void lguest_write_idt_entry(gate_desc *dt, | |||
272 | kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); | 288 | kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); |
273 | } | 289 | } |
274 | 290 | ||
275 | /* Changing to a different IDT is very rare: we keep the IDT up-to-date every | 291 | /* |
292 | * Changing to a different IDT is very rare: we keep the IDT up-to-date every | ||
276 | * time it is written, so we can simply loop through all entries and tell the | 293 | * time it is written, so we can simply loop through all entries and tell the |
277 | * Host about them. */ | 294 | * Host about them. |
295 | */ | ||
278 | static void lguest_load_idt(const struct desc_ptr *desc) | 296 | static void lguest_load_idt(const struct desc_ptr *desc) |
279 | { | 297 | { |
280 | unsigned int i; | 298 | unsigned int i; |
@@ -305,9 +323,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc) | |||
305 | kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); | 323 | kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); |
306 | } | 324 | } |
307 | 325 | ||
308 | /* For a single GDT entry which changes, we do the lazy thing: alter our GDT, | 326 | /* |
327 | * For a single GDT entry which changes, we do the lazy thing: alter our GDT, | ||
309 | * then tell the Host to reload the entire thing. This operation is so rare | 328 | * then tell the Host to reload the entire thing. This operation is so rare |
310 | * that this naive implementation is reasonable. */ | 329 | * that this naive implementation is reasonable. |
330 | */ | ||
311 | static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, | 331 | static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, |
312 | const void *desc, int type) | 332 | const void *desc, int type) |
313 | { | 333 | { |
@@ -317,29 +337,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, | |||
317 | dt[entrynum].a, dt[entrynum].b); | 337 | dt[entrynum].a, dt[entrynum].b); |
318 | } | 338 | } |
319 | 339 | ||
320 | /* OK, I lied. There are three "thread local storage" GDT entries which change | 340 | /* |
341 | * OK, I lied. There are three "thread local storage" GDT entries which change | ||
321 | * on every context switch (these three entries are how glibc implements | 342 | * on every context switch (these three entries are how glibc implements |
322 | * __thread variables). So we have a hypercall specifically for this case. */ | 343 | * __thread variables). So we have a hypercall specifically for this case. |
344 | */ | ||
323 | static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) | 345 | static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) |
324 | { | 346 | { |
325 | /* There's one problem which normal hardware doesn't have: the Host | 347 | /* |
348 | * There's one problem which normal hardware doesn't have: the Host | ||
326 | * can't handle us removing entries we're currently using. So we clear | 349 | * can't handle us removing entries we're currently using. So we clear |
327 | * the GS register here: if it's needed it'll be reloaded anyway. */ | 350 | * the GS register here: if it's needed it'll be reloaded anyway. |
351 | */ | ||
328 | lazy_load_gs(0); | 352 | lazy_load_gs(0); |
329 | lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); | 353 | lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); |
330 | } | 354 | } |
331 | 355 | ||
332 | /*G:038 That's enough excitement for now, back to ploughing through each of | 356 | /*G:038 |
333 | * the different pv_ops structures (we're about 1/3 of the way through). | 357 | * That's enough excitement for now, back to ploughing through each of the |
358 | * different pv_ops structures (we're about 1/3 of the way through). | ||
334 | * | 359 | * |
335 | * This is the Local Descriptor Table, another weird Intel thingy. Linux only | 360 | * This is the Local Descriptor Table, another weird Intel thingy. Linux only |
336 | * uses this for some strange applications like Wine. We don't do anything | 361 | * uses this for some strange applications like Wine. We don't do anything |
337 | * here, so they'll get an informative and friendly Segmentation Fault. */ | 362 | * here, so they'll get an informative and friendly Segmentation Fault. |
363 | */ | ||
338 | static void lguest_set_ldt(const void *addr, unsigned entries) | 364 | static void lguest_set_ldt(const void *addr, unsigned entries) |
339 | { | 365 | { |
340 | } | 366 | } |
341 | 367 | ||
342 | /* This loads a GDT entry into the "Task Register": that entry points to a | 368 | /* |
369 | * This loads a GDT entry into the "Task Register": that entry points to a | ||
343 | * structure called the Task State Segment. Some comments scattered through the | 370 | * structure called the Task State Segment. Some comments scattered through the |
344 | * kernel code indicate that this was used for task switching in ages past, along | 371 | * kernel code indicate that this was used for task switching in ages past, along |
345 | * with blood sacrifice and astrology. | 372 | * with blood sacrifice and astrology. |
@@ -347,19 +374,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries) | |||
347 | * Now there's nothing interesting in here that we don't get told elsewhere. | 374 | * Now there's nothing interesting in here that we don't get told elsewhere. |
348 | * But the native version uses the "ltr" instruction, which makes the Host | 375 | * But the native version uses the "ltr" instruction, which makes the Host |
349 | * complain to the Guest about a Segmentation Fault and it'll oops. So we | 376 | * complain to the Guest about a Segmentation Fault and it'll oops. So we |
350 | * override the native version with a do-nothing version. */ | 377 | * override the native version with a do-nothing version. |
378 | */ | ||
351 | static void lguest_load_tr_desc(void) | 379 | static void lguest_load_tr_desc(void) |
352 | { | 380 | { |
353 | } | 381 | } |
354 | 382 | ||
355 | /* The "cpuid" instruction is a way of querying both the CPU identity | 383 | /* |
384 | * The "cpuid" instruction is a way of querying both the CPU identity | ||
356 | * (manufacturer, model, etc) and its features. It was introduced before the | 385 | * (manufacturer, model, etc) and its features. It was introduced before the |
357 | * Pentium in 1993 and keeps getting extended by Intel, AMD and others. | 386 | * Pentium in 1993 and keeps getting extended by Intel, AMD and others. |
358 | * As you might imagine, after a decade and a half of this treatment, it is now a | 387 | * As you might imagine, after a decade and a half of this treatment, it is now a |
359 | * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. | 388 | * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. |
360 | * | 389 | * |
361 | * This instruction even has its own Wikipedia entry. The Wikipedia entry | 390 | * This instruction even has its own Wikipedia entry. The Wikipedia entry |
362 | * has been translated into 4 languages. I am not making this up! | 391 | * has been translated into 5 languages. I am not making this up! |
363 | * | 392 | * |
364 | * We could get funky here and identify ourselves as "GenuineLguest", but | 393 | * We could get funky here and identify ourselves as "GenuineLguest", but |
365 | * instead we just use the real "cpuid" instruction. Then I pretty much turned | 394 | * instead we just use the real "cpuid" instruction. Then I pretty much turned |
@@ -371,7 +400,8 @@ static void lguest_load_tr_desc(void) | |||
371 | * Replacing the cpuid so we can turn features off is great for the kernel, but | 400 | * Replacing the cpuid so we can turn features off is great for the kernel, but |
372 | * anyone (including userspace) can just use the raw "cpuid" instruction and | 401 | * anyone (including userspace) can just use the raw "cpuid" instruction and |
373 | * the Host won't even notice since it isn't privileged. So we try not to get | 402 | * the Host won't even notice since it isn't privileged. So we try not to get |
374 | * too worked up about it. */ | 403 | * too worked up about it. |
404 | */ | ||
375 | static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | 405 | static void lguest_cpuid(unsigned int *ax, unsigned int *bx, |
376 | unsigned int *cx, unsigned int *dx) | 406 | unsigned int *cx, unsigned int *dx) |
377 | { | 407 | { |
@@ -379,43 +409,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
379 | 409 | ||
380 | native_cpuid(ax, bx, cx, dx); | 410 | native_cpuid(ax, bx, cx, dx); |
381 | switch (function) { | 411 | switch (function) { |
382 | case 0: /* ID and highest CPUID. Futureproof a little by sticking to | 412 | /* |
383 | * older ones. */ | 413 | * CPUID 0 gives the highest legal CPUID number (and the ID string). |
414 | * We futureproof our code a little by sticking to known CPUID values. | ||
415 | */ | ||
416 | case 0: | ||
384 | if (*ax > 5) | 417 | if (*ax > 5) |
385 | *ax = 5; | 418 | *ax = 5; |
386 | break; | 419 | break; |
387 | case 1: /* Basic feature request. */ | 420 | |
388 | /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ | 421 | /* |
422 | * CPUID 1 is a basic feature request. | ||
423 | * | ||
424 | * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 | ||
425 | * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. | ||
426 | */ | ||
427 | case 1: | ||
389 | *cx &= 0x00002201; | 428 | *cx &= 0x00002201; |
390 | /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ | ||
391 | *dx &= 0x07808151; | 429 | *dx &= 0x07808151; |
392 | /* The Host can do a nice optimization if it knows that the | 430 | /* |
431 | * The Host can do a nice optimization if it knows that the | ||
393 | * kernel mappings (addresses above 0xC0000000 or whatever | 432 | * kernel mappings (addresses above 0xC0000000 or whatever |
394 | * PAGE_OFFSET is set to) haven't changed. But Linux calls | 433 | * PAGE_OFFSET is set to) haven't changed. But Linux calls |
395 | * flush_tlb_user() for both user and kernel mappings unless | 434 | * flush_tlb_user() for both user and kernel mappings unless |
396 | * the Page Global Enable (PGE) feature bit is set. */ | 435 | * the Page Global Enable (PGE) feature bit is set. |
436 | */ | ||
397 | *dx |= 0x00002000; | 437 | *dx |= 0x00002000; |
398 | /* We also lie, and say we're family id 5. 6 or greater | 438 | /* |
439 | * We also lie, and say we're family id 5. 6 or greater | ||
399 | * leads to a rdmsr in early_init_intel which we can't handle. | 440 | * leads to a rdmsr in early_init_intel which we can't handle. |
400 | * Family ID is returned as bits 8-12 in ax. */ | 441 | * Family ID is returned as bits 8-12 in ax. |
442 | */ | ||
401 | *ax &= 0xFFFFF0FF; | 443 | *ax &= 0xFFFFF0FF; |
402 | *ax |= 0x00000500; | 444 | *ax |= 0x00000500; |
403 | break; | 445 | break; |
446 | /* | ||
447 | * 0x80000000 returns the highest Extended Function, so we futureproof | ||
448 | * like we do above by limiting it to known fields. | ||
449 | */ | ||
404 | case 0x80000000: | 450 | case 0x80000000: |
405 | /* Futureproof this a little: if they ask how much extended | ||
406 | * processor information there is, limit it to known fields. */ | ||
407 | if (*ax > 0x80000008) | 451 | if (*ax > 0x80000008) |
408 | *ax = 0x80000008; | 452 | *ax = 0x80000008; |
409 | break; | 453 | break; |
454 | |||
455 | /* | ||
456 | * PAE systems can mark pages as non-executable. Linux calls this the | ||
457 | * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced | ||
458 | * Virus Protection). We just switch it off here, since we don't | ||
459 | * support it. | ||
460 | */ | ||
410 | case 0x80000001: | 461 | case 0x80000001: |
411 | /* Here we should fix nx cap depending on host. */ | ||
412 | /* For this version of PAE, we just clear NX bit. */ | ||
413 | *dx &= ~(1 << 20); | 462 | *dx &= ~(1 << 20); |
414 | break; | 463 | break; |
415 | } | 464 | } |
416 | } | 465 | } |
417 | 466 | ||
418 | /* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. | 467 | /* |
468 | * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. | ||
419 | * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother | 469 | * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother |
420 | * it. The Host needs to know when the Guest wants to change them, so we have | 470 | * it. The Host needs to know when the Guest wants to change them, so we have |
421 | * a whole series of functions like read_cr0() and write_cr0(). | 471 | * a whole series of functions like read_cr0() and write_cr0(). |
@@ -430,7 +480,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
430 | * name like "FPUTRAP bit" be a little less cryptic? | 480 | * name like "FPUTRAP bit" be a little less cryptic? |
431 | * | 481 | * |
432 | * We store cr0 locally because the Host never changes it. The Guest sometimes | 482 | * We store cr0 locally because the Host never changes it. The Guest sometimes |
433 | * wants to read it and we'd prefer not to bother the Host unnecessarily. */ | 483 | * wants to read it and we'd prefer not to bother the Host unnecessarily. |
484 | */ | ||
434 | static unsigned long current_cr0; | 485 | static unsigned long current_cr0; |
435 | static void lguest_write_cr0(unsigned long val) | 486 | static void lguest_write_cr0(unsigned long val) |
436 | { | 487 | { |
@@ -443,18 +494,22 @@ static unsigned long lguest_read_cr0(void) | |||
443 | return current_cr0; | 494 | return current_cr0; |
444 | } | 495 | } |
445 | 496 | ||
446 | /* Intel provided a special instruction to clear the TS bit for people too cool | 497 | /* |
498 | * Intel provided a special instruction to clear the TS bit for people too cool | ||
447 | * to use write_cr0() to do it. This "clts" instruction is faster, because all | 499 | * to use write_cr0() to do it. This "clts" instruction is faster, because all |
448 | * the vowels have been optimized out. */ | 500 | * the vowels have been optimized out. |
501 | */ | ||
449 | static void lguest_clts(void) | 502 | static void lguest_clts(void) |
450 | { | 503 | { |
451 | lazy_hcall1(LHCALL_TS, 0); | 504 | lazy_hcall1(LHCALL_TS, 0); |
452 | current_cr0 &= ~X86_CR0_TS; | 505 | current_cr0 &= ~X86_CR0_TS; |
453 | } | 506 | } |
454 | 507 | ||
455 | /* cr2 is the virtual address of the last page fault, which the Guest only ever | 508 | /* |
509 | * cr2 is the virtual address of the last page fault, which the Guest only ever | ||
456 | * reads. The Host kindly writes this into our "struct lguest_data", so we | 510 | * reads. The Host kindly writes this into our "struct lguest_data", so we |
457 | * just read it out of there. */ | 511 | * just read it out of there. |
512 | */ | ||
458 | static unsigned long lguest_read_cr2(void) | 513 | static unsigned long lguest_read_cr2(void) |
459 | { | 514 | { |
460 | return lguest_data.cr2; | 515 | return lguest_data.cr2; |
@@ -463,10 +518,12 @@ static unsigned long lguest_read_cr2(void) | |||
463 | /* See lguest_set_pte() below. */ | 518 | /* See lguest_set_pte() below. */ |
464 | static bool cr3_changed = false; | 519 | static bool cr3_changed = false; |
465 | 520 | ||
466 | /* cr3 is the current toplevel pagetable page: the principle is the same as | 521 | /* |
522 | * cr3 is the current toplevel pagetable page: the principle is the same as | ||
467 | * cr0. Keep a local copy, and tell the Host when it changes. The only | 523 | * cr0. Keep a local copy, and tell the Host when it changes. The only |
468 | * difference is that our local copy is in lguest_data because the Host needs | 524 | * difference is that our local copy is in lguest_data because the Host needs |
469 | * to set it upon our initial hypercall. */ | 525 | * to set it upon our initial hypercall. |
526 | */ | ||
470 | static void lguest_write_cr3(unsigned long cr3) | 527 | static void lguest_write_cr3(unsigned long cr3) |
471 | { | 528 | { |
472 | lguest_data.pgdir = cr3; | 529 | lguest_data.pgdir = cr3; |
@@ -538,10 +595,12 @@ static void lguest_write_cr4(unsigned long val) | |||
538 | * the real page tables based on the Guests'. | 595 | * the real page tables based on the Guests'. |
539 | */ | 596 | */ |
540 | 597 | ||
541 | /* The Guest calls this to set a second-level entry (pte), ie. to map a page | 598 | /* |
599 | * The Guest calls this to set a second-level entry (pte), ie. to map a page | ||
542 | * into a process' address space. We set the entry then tell the Host the | 600 | * into a process' address space. We set the entry then tell the Host the |
543 | * toplevel and address this corresponds to. The Guest uses one pagetable per | 601 | * toplevel and address this corresponds to. The Guest uses one pagetable per |
544 | * process, so we need to tell the Host which one we're changing (mm->pgd). */ | 602 | * process, so we need to tell the Host which one we're changing (mm->pgd). |
603 | */ | ||
545 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, | 604 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, |
546 | pte_t *ptep) | 605 | pte_t *ptep) |
547 | { | 606 | { |
@@ -560,10 +619,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | |||
560 | lguest_pte_update(mm, addr, ptep); | 619 | lguest_pte_update(mm, addr, ptep); |
561 | } | 620 | } |
562 | 621 | ||
563 | /* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd | 622 | /* |
623 | * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd | ||
564 | * to set a middle-level entry when PAE is activated. | 624 | * to set a middle-level entry when PAE is activated. |
625 | * | ||
565 | * Again, we set the entry then tell the Host which page we changed, | 626 | * Again, we set the entry then tell the Host which page we changed, |
566 | * and the index of the entry we changed. */ | 627 | * and the index of the entry we changed. |
628 | */ | ||
567 | #ifdef CONFIG_X86_PAE | 629 | #ifdef CONFIG_X86_PAE |
568 | static void lguest_set_pud(pud_t *pudp, pud_t pudval) | 630 | static void lguest_set_pud(pud_t *pudp, pud_t pudval) |
569 | { | 631 | { |
@@ -582,8 +644,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
582 | } | 644 | } |
583 | #else | 645 | #else |
584 | 646 | ||
585 | /* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not | 647 | /* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ |
586 | * activated. */ | ||
587 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | 648 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) |
588 | { | 649 | { |
589 | native_set_pmd(pmdp, pmdval); | 650 | native_set_pmd(pmdp, pmdval); |
@@ -592,7 +653,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
592 | } | 653 | } |
593 | #endif | 654 | #endif |
594 | 655 | ||
595 | /* There are a couple of legacy places where the kernel sets a PTE, but we | 656 | /* |
657 | * There are a couple of legacy places where the kernel sets a PTE, but we | ||
596 | * don't know the top level any more. This is useless for us, since we don't | 658 | * don't know the top level any more. This is useless for us, since we don't |
597 | * know which pagetable is changing or what address, so we just tell the Host | 659 | * know which pagetable is changing or what address, so we just tell the Host |
598 | * to forget all of them. Fortunately, this is very rare. | 660 | * to forget all of them. Fortunately, this is very rare. |
@@ -600,7 +662,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
600 | * ... except in early boot when the kernel sets up the initial pagetables, | 662 | * ... except in early boot when the kernel sets up the initial pagetables, |
601 | * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell | 663 | * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell |
602 | * the Host anything changed until we've done the first page table switch, | 664 | * the Host anything changed until we've done the first page table switch, |
603 | * which brings boot back to 0.25 seconds. */ | 665 | * which brings boot back to 0.25 seconds. |
666 | */ | ||
604 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) | 667 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) |
605 | { | 668 | { |
606 | native_set_pte(ptep, pteval); | 669 | native_set_pte(ptep, pteval); |
@@ -628,7 +691,8 @@ void lguest_pmd_clear(pmd_t *pmdp) | |||
628 | } | 691 | } |
629 | #endif | 692 | #endif |
630 | 693 | ||
631 | /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on | 694 | /* |
695 | * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on | ||
632 | * native page table operations. On native hardware you can set a new page | 696 | * native page table operations. On native hardware you can set a new page |
633 | * table entry whenever you want, but if you want to remove one you have to do | 697 | * table entry whenever you want, but if you want to remove one you have to do |
634 | * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). | 698 | * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). |
@@ -637,24 +701,29 @@ void lguest_pmd_clear(pmd_t *pmdp) | |||
637 | * called when a valid entry is written, not when it's removed (ie. marked not | 701 | * called when a valid entry is written, not when it's removed (ie. marked not |
638 | * present). Instead, this is where we come when the Guest wants to remove a | 702 | * present). Instead, this is where we come when the Guest wants to remove a |
639 | * page table entry: we tell the Host to set that entry to 0 (ie. the present | 703 | * page table entry: we tell the Host to set that entry to 0 (ie. the present |
640 | * bit is zero). */ | 704 | * bit is zero). |
705 | */ | ||
641 | static void lguest_flush_tlb_single(unsigned long addr) | 706 | static void lguest_flush_tlb_single(unsigned long addr) |
642 | { | 707 | { |
643 | /* Simply set it to zero: if it was not, it will fault back in. */ | 708 | /* Simply set it to zero: if it was not, it will fault back in. */ |
644 | lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); | 709 | lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); |
645 | } | 710 | } |
646 | 711 | ||
647 | /* This is what happens after the Guest has removed a large number of entries. | 712 | /* |
713 | * This is what happens after the Guest has removed a large number of entries. | ||
648 | * This tells the Host that any of the page table entries for userspace might | 714 | * This tells the Host that any of the page table entries for userspace might |
649 | * have changed, ie. virtual addresses below PAGE_OFFSET. */ | 715 | * have changed, ie. virtual addresses below PAGE_OFFSET. |
716 | */ | ||
650 | static void lguest_flush_tlb_user(void) | 717 | static void lguest_flush_tlb_user(void) |
651 | { | 718 | { |
652 | lazy_hcall1(LHCALL_FLUSH_TLB, 0); | 719 | lazy_hcall1(LHCALL_FLUSH_TLB, 0); |
653 | } | 720 | } |
654 | 721 | ||
655 | /* This is called when the kernel page tables have changed. That's not very | 722 | /* |
723 | * This is called when the kernel page tables have changed. That's not very | ||
656 | * common (unless the Guest is using highmem, which makes the Guest extremely | 724 | * common (unless the Guest is using highmem, which makes the Guest extremely |
657 | * slow), so it's worth separating this from the user flushing above. */ | 725 | * slow), so it's worth separating this from the user flushing above. |
726 | */ | ||
658 | static void lguest_flush_tlb_kernel(void) | 727 | static void lguest_flush_tlb_kernel(void) |
659 | { | 728 | { |
660 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); | 729 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); |
@@ -691,23 +760,27 @@ static struct irq_chip lguest_irq_controller = { | |||
691 | .unmask = enable_lguest_irq, | 760 | .unmask = enable_lguest_irq, |
692 | }; | 761 | }; |
693 | 762 | ||
694 | /* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware | 763 | /* |
764 | * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware | ||
695 | * interrupt (except 128, which is used for system calls), and then tells the | 765 | * interrupt (except 128, which is used for system calls), and then tells the |
696 | * Linux infrastructure that each interrupt is controlled by our level-based | 766 | * Linux infrastructure that each interrupt is controlled by our level-based |
697 | * lguest interrupt controller. */ | 767 | * lguest interrupt controller. |
768 | */ | ||
698 | static void __init lguest_init_IRQ(void) | 769 | static void __init lguest_init_IRQ(void) |
699 | { | 770 | { |
700 | unsigned int i; | 771 | unsigned int i; |
701 | 772 | ||
702 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { | 773 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { |
703 | /* Some systems map "vectors" to interrupts weirdly. Lguest has | 774 | /* Some systems map "vectors" to interrupts weirdly. Not us! */ |
704 | * a straightforward 1 to 1 mapping, so force that here. */ | ||
705 | __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; | 775 | __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; |
706 | if (i != SYSCALL_VECTOR) | 776 | if (i != SYSCALL_VECTOR) |
707 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); | 777 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); |
708 | } | 778 | } |
709 | /* This call is required to set up for 4k stacks, where we have | 779 | |
710 | * separate stacks for hard and soft interrupts. */ | 780 | /* |
781 | * This call is required to set up for 4k stacks, where we have | ||
782 | * separate stacks for hard and soft interrupts. | ||
783 | */ | ||
711 | irq_ctx_init(smp_processor_id()); | 784 | irq_ctx_init(smp_processor_id()); |
712 | } | 785 | } |
713 | 786 | ||
@@ -729,31 +802,39 @@ static unsigned long lguest_get_wallclock(void) | |||
729 | return lguest_data.time.tv_sec; | 802 | return lguest_data.time.tv_sec; |
730 | } | 803 | } |
731 | 804 | ||
732 | /* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us | 805 | /* |
806 | * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us | ||
733 | * what speed it runs at, or 0 if it's unusable as a reliable clock source. | 807 | * what speed it runs at, or 0 if it's unusable as a reliable clock source. |
734 | * This matches what we want here: if we return 0 from this function, the x86 | 808 | * This matches what we want here: if we return 0 from this function, the x86 |
735 | * TSC clock will give up and not register itself. */ | 809 | * TSC clock will give up and not register itself. |
810 | */ | ||
736 | static unsigned long lguest_tsc_khz(void) | 811 | static unsigned long lguest_tsc_khz(void) |
737 | { | 812 | { |
738 | return lguest_data.tsc_khz; | 813 | return lguest_data.tsc_khz; |
739 | } | 814 | } |
740 | 815 | ||
741 | /* If we can't use the TSC, the kernel falls back to our lower-priority | 816 | /* |
742 | * "lguest_clock", where we read the time value given to us by the Host. */ | 817 | * If we can't use the TSC, the kernel falls back to our lower-priority |
818 | * "lguest_clock", where we read the time value given to us by the Host. | ||
819 | */ | ||
743 | static cycle_t lguest_clock_read(struct clocksource *cs) | 820 | static cycle_t lguest_clock_read(struct clocksource *cs) |
744 | { | 821 | { |
745 | unsigned long sec, nsec; | 822 | unsigned long sec, nsec; |
746 | 823 | ||
747 | /* Since the time is in two parts (seconds and nanoseconds), we risk | 824 | /* |
825 | * Since the time is in two parts (seconds and nanoseconds), we risk | ||
748 | * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, | 826 | * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, |
749 | * and getting 99 and 0. As Linux tends to come apart under the stress | 827 | * and getting 99 and 0. As Linux tends to come apart under the stress |
750 | * of time travel, we must be careful: */ | 828 | * of time travel, we must be careful: |
829 | */ | ||
751 | do { | 830 | do { |
752 | /* First we read the seconds part. */ | 831 | /* First we read the seconds part. */ |
753 | sec = lguest_data.time.tv_sec; | 832 | sec = lguest_data.time.tv_sec; |
754 | /* This read memory barrier tells the compiler and the CPU that | 833 | /* |
834 | * This read memory barrier tells the compiler and the CPU that | ||
755 | * this can't be reordered: we have to complete the above | 835 | * this can't be reordered: we have to complete the above |
756 | * before going on. */ | 836 | * before going on. |
837 | */ | ||
757 | rmb(); | 838 | rmb(); |
758 | /* Now we read the nanoseconds part. */ | 839 | /* Now we read the nanoseconds part. */ |
759 | nsec = lguest_data.time.tv_nsec; | 840 | nsec = lguest_data.time.tv_nsec; |
@@ -777,9 +858,11 @@ static struct clocksource lguest_clock = { | |||
777 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | 858 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
778 | }; | 859 | }; |
779 | 860 | ||
780 | /* We also need a "struct clock_event_device": Linux asks us to set it to go | 861 | /* |
862 | * We also need a "struct clock_event_device": Linux asks us to set it to go | ||
781 | * off some time in the future. Actually, James Morris figured all this out, I | 863 | * off some time in the future. Actually, James Morris figured all this out, I |
782 | * just applied the patch. */ | 864 | * just applied the patch. |
865 | */ | ||
783 | static int lguest_clockevent_set_next_event(unsigned long delta, | 866 | static int lguest_clockevent_set_next_event(unsigned long delta, |
784 | struct clock_event_device *evt) | 867 | struct clock_event_device *evt) |
785 | { | 868 | { |
@@ -829,8 +912,10 @@ static struct clock_event_device lguest_clockevent = { | |||
829 | .max_delta_ns = LG_CLOCK_MAX_DELTA, | 912 | .max_delta_ns = LG_CLOCK_MAX_DELTA, |
830 | }; | 913 | }; |
831 | 914 | ||
832 | /* This is the Guest timer interrupt handler (hardware interrupt 0). We just | 915 | /* |
833 | * call the clockevent infrastructure and it does whatever needs doing. */ | 916 | * This is the Guest timer interrupt handler (hardware interrupt 0). We just |
917 | * call the clockevent infrastructure and it does whatever needs doing. | ||
918 | */ | ||
834 | static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) | 919 | static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) |
835 | { | 920 | { |
836 | unsigned long flags; | 921 | unsigned long flags; |
@@ -841,10 +926,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) | |||
841 | local_irq_restore(flags); | 926 | local_irq_restore(flags); |
842 | } | 927 | } |
843 | 928 | ||
844 | /* At some point in the boot process, we get asked to set up our timing | 929 | /* |
930 | * At some point in the boot process, we get asked to set up our timing | ||
845 | * infrastructure. The kernel doesn't expect timer interrupts before this, but | 931 | * infrastructure. The kernel doesn't expect timer interrupts before this, but |
846 | * we cleverly initialized the "blocked_interrupts" field of "struct | 932 | * we cleverly initialized the "blocked_interrupts" field of "struct |
847 | * lguest_data" so that timer interrupts were blocked until now. */ | 933 | * lguest_data" so that timer interrupts were blocked until now. |
934 | */ | ||
848 | static void lguest_time_init(void) | 935 | static void lguest_time_init(void) |
849 | { | 936 | { |
850 | /* Set up the timer interrupt (0) to go to our simple timer routine */ | 937 | /* Set up the timer interrupt (0) to go to our simple timer routine */ |
@@ -868,14 +955,16 @@ static void lguest_time_init(void) | |||
868 | * to work. They're pretty simple. | 955 | * to work. They're pretty simple. |
869 | */ | 956 | */ |
870 | 957 | ||
871 | /* The Guest needs to tell the Host what stack it expects traps to use. For | 958 | /* |
959 | * The Guest needs to tell the Host what stack it expects traps to use. For | ||
872 | * native hardware, this is part of the Task State Segment mentioned above in | 960 | * native hardware, this is part of the Task State Segment mentioned above in |
873 | * lguest_load_tr_desc(), but to help hypervisors there's this special call. | 961 | * lguest_load_tr_desc(), but to help hypervisors there's this special call. |
874 | * | 962 | * |
875 | * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data | 963 | * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data |
876 | * segment), the privilege level (we're privilege level 1, the Host is 0 and | 964 | * segment), the privilege level (we're privilege level 1, the Host is 0 and |
877 | * will not tolerate us trying to use that), the stack pointer, and the number | 965 | * will not tolerate us trying to use that), the stack pointer, and the number |
878 | * of pages in the stack. */ | 966 | * of pages in the stack. |
967 | */ | ||
879 | static void lguest_load_sp0(struct tss_struct *tss, | 968 | static void lguest_load_sp0(struct tss_struct *tss, |
880 | struct thread_struct *thread) | 969 | struct thread_struct *thread) |
881 | { | 970 | { |
@@ -889,7 +978,8 @@ static void lguest_set_debugreg(int regno, unsigned long value) | |||
889 | /* FIXME: Implement */ | 978 | /* FIXME: Implement */ |
890 | } | 979 | } |
891 | 980 | ||
892 | /* There are times when the kernel wants to make sure that no memory writes are | 981 | /* |
982 | * There are times when the kernel wants to make sure that no memory writes are | ||
893 | * caught in the cache (that they've all reached real hardware devices). This | 983 | * caught in the cache (that they've all reached real hardware devices). This |
894 | * doesn't matter for the Guest which has virtual hardware. | 984 | * doesn't matter for the Guest which has virtual hardware. |
895 | * | 985 | * |
@@ -903,11 +993,13 @@ static void lguest_wbinvd(void) | |||
903 | { | 993 | { |
904 | } | 994 | } |
905 | 995 | ||
906 | /* If the Guest expects to have an Advanced Programmable Interrupt Controller, | 996 | /* |
997 | * If the Guest expects to have an Advanced Programmable Interrupt Controller, | ||
907 | * we play dumb by ignoring writes and returning 0 for reads. So it's no | 998 | * we play dumb by ignoring writes and returning 0 for reads. So it's no |
908 | * longer Programmable nor Controlling anything, and I don't think 8 lines of | 999 | * longer Programmable nor Controlling anything, and I don't think 8 lines of |
909 | * code qualifies for Advanced. It will also never interrupt anything. It | 1000 | * code qualifies for Advanced. It will also never interrupt anything. It |
910 | * does, however, allow us to get through the Linux boot code. */ | 1001 | * does, however, allow us to get through the Linux boot code. |
1002 | */ | ||
911 | #ifdef CONFIG_X86_LOCAL_APIC | 1003 | #ifdef CONFIG_X86_LOCAL_APIC |
912 | static void lguest_apic_write(u32 reg, u32 v) | 1004 | static void lguest_apic_write(u32 reg, u32 v) |
913 | { | 1005 | { |
@@ -956,11 +1048,13 @@ static void lguest_safe_halt(void) | |||
956 | kvm_hypercall0(LHCALL_HALT); | 1048 | kvm_hypercall0(LHCALL_HALT); |
957 | } | 1049 | } |
958 | 1050 | ||
959 | /* The SHUTDOWN hypercall takes a string to describe what's happening, and | 1051 | /* |
1052 | * The SHUTDOWN hypercall takes a string to describe what's happening, and | ||
960 | * an argument which says whether this is to restart (reboot) the Guest or not. | 1053 | * an argument which says whether this is to restart (reboot) the Guest or not. |
961 | * | 1054 | * |
962 | * Note that the Host always prefers that the Guest speak in physical addresses | 1055 | * Note that the Host always prefers that the Guest speak in physical addresses |
963 | * rather than virtual addresses, so we use __pa() here. */ | 1056 | * rather than virtual addresses, so we use __pa() here. |
1057 | */ | ||
964 | static void lguest_power_off(void) | 1058 | static void lguest_power_off(void) |
965 | { | 1059 | { |
966 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), | 1060 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), |
@@ -991,8 +1085,10 @@ static __init char *lguest_memory_setup(void) | |||
991 | * nice to move it back to lguest_init. Patch welcome... */ | 1085 | * nice to move it back to lguest_init. Patch welcome... */ |
992 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); | 1086 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); |
993 | 1087 | ||
994 | /* The Linux bootloader header contains an "e820" memory map: the | 1088 | /* |
995 | * Launcher populated the first entry with our memory limit. */ | 1089 | * The Linux bootloader header contains an "e820" memory map: the |
1090 | * Launcher populated the first entry with our memory limit. | ||
1091 | */ | ||
996 | e820_add_region(boot_params.e820_map[0].addr, | 1092 | e820_add_region(boot_params.e820_map[0].addr, |
997 | boot_params.e820_map[0].size, | 1093 | boot_params.e820_map[0].size, |
998 | boot_params.e820_map[0].type); | 1094 | boot_params.e820_map[0].type); |
@@ -1001,16 +1097,17 @@ static __init char *lguest_memory_setup(void) | |||
1001 | return "LGUEST"; | 1097 | return "LGUEST"; |
1002 | } | 1098 | } |
1003 | 1099 | ||
1004 | /* We will eventually use the virtio console device to produce console output, | 1100 | /* |
1101 | * We will eventually use the virtio console device to produce console output, | ||
1005 | * but before that is set up we use LHCALL_NOTIFY on normal memory to produce | 1102 | * but before that is set up we use LHCALL_NOTIFY on normal memory to produce |
1006 | * console output. */ | 1103 | * console output. |
1104 | */ | ||
1007 | static __init int early_put_chars(u32 vtermno, const char *buf, int count) | 1105 | static __init int early_put_chars(u32 vtermno, const char *buf, int count) |
1008 | { | 1106 | { |
1009 | char scratch[17]; | 1107 | char scratch[17]; |
1010 | unsigned int len = count; | 1108 | unsigned int len = count; |
1011 | 1109 | ||
1012 | /* We use a nul-terminated string, so we have to make a copy. Icky, | 1110 | /* We use a nul-terminated string, so we make a copy. Icky, huh? */ |
1013 | * huh? */ | ||
1014 | if (len > sizeof(scratch) - 1) | 1111 | if (len > sizeof(scratch) - 1) |
1015 | len = sizeof(scratch) - 1; | 1112 | len = sizeof(scratch) - 1; |
1016 | scratch[len] = '\0'; | 1113 | scratch[len] = '\0'; |
@@ -1021,8 +1118,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) | |||
1021 | return len; | 1118 | return len; |
1022 | } | 1119 | } |
1023 | 1120 | ||
1024 | /* Rebooting also tells the Host we're finished, but the RESTART flag tells the | 1121 | /* |
1025 | * Launcher to reboot us. */ | 1122 | * Rebooting also tells the Host we're finished, but the RESTART flag tells the |
1123 | * Launcher to reboot us. | ||
1124 | */ | ||
1026 | static void lguest_restart(char *reason) | 1125 | static void lguest_restart(char *reason) |
1027 | { | 1126 | { |
1028 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); | 1127 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); |
@@ -1049,7 +1148,8 @@ static void lguest_restart(char *reason) | |||
1049 | * fit comfortably. | 1148 | * fit comfortably. |
1050 | * | 1149 | * |
1051 | * First we need assembly templates of each of the patchable Guest operations, | 1150 | * First we need assembly templates of each of the patchable Guest operations, |
1052 | * and these are in i386_head.S. */ | 1151 | * and these are in i386_head.S. |
1152 | */ | ||
1053 | 1153 | ||
1054 | /*G:060 We construct a table from the assembler templates: */ | 1154 | /*G:060 We construct a table from the assembler templates: */ |
1055 | static const struct lguest_insns | 1155 | static const struct lguest_insns |
@@ -1060,9 +1160,11 @@ static const struct lguest_insns | |||
1060 | [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, | 1160 | [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, |
1061 | }; | 1161 | }; |
1062 | 1162 | ||
1063 | /* Now our patch routine is fairly simple (based on the native one in | 1163 | /* |
1164 | * Now our patch routine is fairly simple (based on the native one in | ||
1064 | * paravirt.c). If we have a replacement, we copy it in and return how much of | 1165 | * paravirt.c). If we have a replacement, we copy it in and return how much of |
1065 | * the available space we used. */ | 1166 | * the available space we used. |
1167 | */ | ||
1066 | static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | 1168 | static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, |
1067 | unsigned long addr, unsigned len) | 1169 | unsigned long addr, unsigned len) |
1068 | { | 1170 | { |
@@ -1074,8 +1176,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | |||
1074 | 1176 | ||
1075 | insn_len = lguest_insns[type].end - lguest_insns[type].start; | 1177 | insn_len = lguest_insns[type].end - lguest_insns[type].start; |
1076 | 1178 | ||
1077 | /* Similarly if we can't fit replacement (shouldn't happen, but let's | 1179 | /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ |
1078 | * be thorough). */ | ||
1079 | if (len < insn_len) | 1180 | if (len < insn_len) |
1080 | return paravirt_patch_default(type, clobber, ibuf, addr, len); | 1181 | return paravirt_patch_default(type, clobber, ibuf, addr, len); |
1081 | 1182 | ||
@@ -1084,22 +1185,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | |||
1084 | return insn_len; | 1185 | return insn_len; |
1085 | } | 1186 | } |
1086 | 1187 | ||
1087 | /*G:029 Once we get to lguest_init(), we know we're a Guest. The various | 1188 | /*G:029 |
1189 | * Once we get to lguest_init(), we know we're a Guest. The various | ||
1088 | * pv_ops structures in the kernel provide points for (almost) every routine we | 1190 | * pv_ops structures in the kernel provide points for (almost) every routine we |
1089 | * have to override to avoid privileged instructions. */ | 1191 | * have to override to avoid privileged instructions. |
1192 | */ | ||
1090 | __init void lguest_init(void) | 1193 | __init void lguest_init(void) |
1091 | { | 1194 | { |
1092 | /* We're under lguest, paravirt is enabled, and we're running at | 1195 | /* We're under lguest. */ |
1093 | * privilege level 1, not 0 as normal. */ | ||
1094 | pv_info.name = "lguest"; | 1196 | pv_info.name = "lguest"; |
1197 | /* Paravirt is enabled. */ | ||
1095 | pv_info.paravirt_enabled = 1; | 1198 | pv_info.paravirt_enabled = 1; |
1199 | /* We're running at privilege level 1, not 0 as normal. */ | ||
1096 | pv_info.kernel_rpl = 1; | 1200 | pv_info.kernel_rpl = 1; |
1201 | /* Everyone except Xen runs with this set. */ | ||
1097 | pv_info.shared_kernel_pmd = 1; | 1202 | pv_info.shared_kernel_pmd = 1; |
1098 | 1203 | ||
1099 | /* We set up all the lguest overrides for sensitive operations. These | 1204 | /* |
1100 | * are detailed with the operations themselves. */ | 1205 | * We set up all the lguest overrides for sensitive operations. These |
1206 | * are detailed with the operations themselves. | ||
1207 | */ | ||
1101 | 1208 | ||
1102 | /* interrupt-related operations */ | 1209 | /* Interrupt-related operations */ |
1103 | pv_irq_ops.init_IRQ = lguest_init_IRQ; | 1210 | pv_irq_ops.init_IRQ = lguest_init_IRQ; |
1104 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); | 1211 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); |
1105 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); | 1212 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); |
@@ -1107,11 +1214,11 @@ __init void lguest_init(void) | |||
1107 | pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); | 1214 | pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); |
1108 | pv_irq_ops.safe_halt = lguest_safe_halt; | 1215 | pv_irq_ops.safe_halt = lguest_safe_halt; |
1109 | 1216 | ||
1110 | /* init-time operations */ | 1217 | /* Setup operations */ |
1111 | pv_init_ops.memory_setup = lguest_memory_setup; | 1218 | pv_init_ops.memory_setup = lguest_memory_setup; |
1112 | pv_init_ops.patch = lguest_patch; | 1219 | pv_init_ops.patch = lguest_patch; |
1113 | 1220 | ||
1114 | /* Intercepts of various cpu instructions */ | 1221 | /* Intercepts of various CPU instructions */ |
1115 | pv_cpu_ops.load_gdt = lguest_load_gdt; | 1222 | pv_cpu_ops.load_gdt = lguest_load_gdt; |
1116 | pv_cpu_ops.cpuid = lguest_cpuid; | 1223 | pv_cpu_ops.cpuid = lguest_cpuid; |
1117 | pv_cpu_ops.load_idt = lguest_load_idt; | 1224 | pv_cpu_ops.load_idt = lguest_load_idt; |
@@ -1132,7 +1239,7 @@ __init void lguest_init(void) | |||
1132 | pv_cpu_ops.start_context_switch = paravirt_start_context_switch; | 1239 | pv_cpu_ops.start_context_switch = paravirt_start_context_switch; |
1133 | pv_cpu_ops.end_context_switch = lguest_end_context_switch; | 1240 | pv_cpu_ops.end_context_switch = lguest_end_context_switch; |
1134 | 1241 | ||
1135 | /* pagetable management */ | 1242 | /* Pagetable management */ |
1136 | pv_mmu_ops.write_cr3 = lguest_write_cr3; | 1243 | pv_mmu_ops.write_cr3 = lguest_write_cr3; |
1137 | pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; | 1244 | pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; |
1138 | pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; | 1245 | pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; |
@@ -1154,54 +1261,71 @@ __init void lguest_init(void) | |||
1154 | pv_mmu_ops.pte_update_defer = lguest_pte_update; | 1261 | pv_mmu_ops.pte_update_defer = lguest_pte_update; |
1155 | 1262 | ||
1156 | #ifdef CONFIG_X86_LOCAL_APIC | 1263 | #ifdef CONFIG_X86_LOCAL_APIC |
1157 | /* apic read/write intercepts */ | 1264 | /* APIC read/write intercepts */ |
1158 | set_lguest_basic_apic_ops(); | 1265 | set_lguest_basic_apic_ops(); |
1159 | #endif | 1266 | #endif |
1160 | 1267 | ||
1161 | /* time operations */ | 1268 | /* Time operations */ |
1162 | pv_time_ops.get_wallclock = lguest_get_wallclock; | 1269 | pv_time_ops.get_wallclock = lguest_get_wallclock; |
1163 | pv_time_ops.time_init = lguest_time_init; | 1270 | pv_time_ops.time_init = lguest_time_init; |
1164 | pv_time_ops.get_tsc_khz = lguest_tsc_khz; | 1271 | pv_time_ops.get_tsc_khz = lguest_tsc_khz; |
1165 | 1272 | ||
1166 | /* Now is a good time to look at the implementations of these functions | 1273 | /* |
1167 | * before returning to the rest of lguest_init(). */ | 1274 | * Now is a good time to look at the implementations of these functions |
1275 | * before returning to the rest of lguest_init(). | ||
1276 | */ | ||
1168 | 1277 | ||
1169 | /*G:070 Now we've seen all the paravirt_ops, we return to | 1278 | /*G:070 |
1279 | * Now we've seen all the paravirt_ops, we return to | ||
1170 | * lguest_init() where the rest of the fairly chaotic boot setup | 1280 | * lguest_init() where the rest of the fairly chaotic boot setup |
1171 | * occurs. */ | 1281 | * occurs. |
1282 | */ | ||
1172 | 1283 | ||
1173 | /* The stack protector is a weird thing where gcc places a canary | 1284 | /* |
1285 | * The stack protector is a weird thing where gcc places a canary | ||
1174 | * value on the stack and then checks it on return. This file is | 1286 | * value on the stack and then checks it on return. This file is |
1175 | * compiled with -fno-stack-protector, so we got this far without | 1287 | * compiled with -fno-stack-protector, so we got this far without |
1176 | * problems. The value of the canary is kept at offset 20 from the | 1288 | * problems. The value of the canary is kept at offset 20 from the |
1177 | * %gs register, so we need to set that up before calling C functions | 1289 | * %gs register, so we need to set that up before calling C functions |
1178 | * in other files. */ | 1290 | * in other files. |
1291 | */ | ||
1179 | setup_stack_canary_segment(0); | 1292 | setup_stack_canary_segment(0); |
1180 | /* We could just call load_stack_canary_segment(), but we might as | 1293 | |
1181 | * call switch_to_new_gdt() which loads the whole table and sets up | 1294 | /* |
1182 | * the per-cpu segment descriptor register %fs as well. */ | 1295 | * We could just call load_stack_canary_segment(), but we might as well |
1296 | * call switch_to_new_gdt() which loads the whole table and sets up the | ||
1297 | * per-cpu segment descriptor register %fs as well. | ||
1298 | */ | ||
1183 | switch_to_new_gdt(0); | 1299 | switch_to_new_gdt(0); |
1184 | 1300 | ||
1185 | /* As described in head_32.S, we map the first 128M of memory. */ | 1301 | /* As described in head_32.S, we map the first 128M of memory. */ |
1186 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; | 1302 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; |
1187 | 1303 | ||
1188 | /* The Host<->Guest Switcher lives at the top of our address space, and | 1304 | /* |
1305 | * The Host<->Guest Switcher lives at the top of our address space, and | ||
1189 | * the Host told us how big it is when we made LGUEST_INIT hypercall: | 1306 | * the Host told us how big it is when we made LGUEST_INIT hypercall: |
1190 | * it put the answer in lguest_data.reserve_mem */ | 1307 | * it put the answer in lguest_data.reserve_mem |
1308 | */ | ||
1191 | reserve_top_address(lguest_data.reserve_mem); | 1309 | reserve_top_address(lguest_data.reserve_mem); |
1192 | 1310 | ||
1193 | /* If we don't initialize the lock dependency checker now, it crashes | 1311 | /* |
1194 | * paravirt_disable_iospace. */ | 1312 | * If we don't initialize the lock dependency checker now, it crashes |
1313 | * paravirt_disable_iospace. | ||
1314 | */ | ||
1195 | lockdep_init(); | 1315 | lockdep_init(); |
1196 | 1316 | ||
1197 | /* The IDE code spends about 3 seconds probing for disks: if we reserve | 1317 | /* |
1318 | * The IDE code spends about 3 seconds probing for disks: if we reserve | ||
1198 | * all the I/O ports up front it can't get them and so doesn't probe. | 1319 | * all the I/O ports up front it can't get them and so doesn't probe. |
1199 | * Other device drivers are similar (but less severe). This cuts the | 1320 | * Other device drivers are similar (but less severe). This cuts the |
1200 | * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ | 1321 | * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. |
1322 | */ | ||
1201 | paravirt_disable_iospace(); | 1323 | paravirt_disable_iospace(); |
1202 | 1324 | ||
1203 | /* This is messy CPU setup stuff which the native boot code does before | 1325 | /* |
1204 | * start_kernel, so we have to do, too: */ | 1326 | * This is messy CPU setup stuff which the native boot code does before |
1327 | * start_kernel, so we have to do, too: | ||
1328 | */ | ||
1205 | cpu_detect(&new_cpu_data); | 1329 | cpu_detect(&new_cpu_data); |
1206 | /* head.S usually sets up the first capability word, so do it here. */ | 1330 | /* head.S usually sets up the first capability word, so do it here. */ |
1207 | new_cpu_data.x86_capability[0] = cpuid_edx(1); | 1331 | new_cpu_data.x86_capability[0] = cpuid_edx(1); |
@@ -1218,22 +1342,28 @@ __init void lguest_init(void) | |||
1218 | acpi_ht = 0; | 1342 | acpi_ht = 0; |
1219 | #endif | 1343 | #endif |
1220 | 1344 | ||
1221 | /* We set the preferred console to "hvc". This is the "hypervisor | 1345 | /* |
1346 | * We set the preferred console to "hvc". This is the "hypervisor | ||
1222 | * virtual console" driver written by the PowerPC people, which we also | 1347 | * virtual console" driver written by the PowerPC people, which we also |
1223 | * adapted for lguest's use. */ | 1348 | * adapted for lguest's use. |
1349 | */ | ||
1224 | add_preferred_console("hvc", 0, NULL); | 1350 | add_preferred_console("hvc", 0, NULL); |
1225 | 1351 | ||
1226 | /* Register our very early console. */ | 1352 | /* Register our very early console. */ |
1227 | virtio_cons_early_init(early_put_chars); | 1353 | virtio_cons_early_init(early_put_chars); |
1228 | 1354 | ||
1229 | /* Last of all, we set the power management poweroff hook to point to | 1355 | /* |
1356 | * Last of all, we set the power management poweroff hook to point to | ||
1230 | * the Guest routine to power off, and the reboot hook to our restart | 1357 | * the Guest routine to power off, and the reboot hook to our restart |
1231 | * routine. */ | 1358 | * routine. |
1359 | */ | ||
1232 | pm_power_off = lguest_power_off; | 1360 | pm_power_off = lguest_power_off; |
1233 | machine_ops.restart = lguest_restart; | 1361 | machine_ops.restart = lguest_restart; |
1234 | 1362 | ||
1235 | /* Now we're set up, call i386_start_kernel() in head32.c and we proceed | 1363 | /* |
1236 | * to boot as normal. It never returns. */ | 1364 | * Now we're set up, call i386_start_kernel() in head32.c and we proceed |
1365 | * to boot as normal. It never returns. | ||
1366 | */ | ||
1237 | i386_start_kernel(); | 1367 | i386_start_kernel(); |
1238 | } | 1368 | } |
1239 | /* | 1369 | /* |