diff options
Diffstat (limited to 'arch/x86/lguest')
-rw-r--r-- | arch/x86/lguest/boot.c | 509 | ||||
-rw-r--r-- | arch/x86/lguest/i386_head.S | 112 |
2 files changed, 417 insertions, 204 deletions
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f2bf1f73d468..d677fa9ca650 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -22,7 +22,8 @@ | |||
22 | * | 22 | * |
23 | * So how does the kernel know it's a Guest? We'll see that later, but let's | 23 | * So how does the kernel know it's a Guest? We'll see that later, but let's |
24 | * just say that we end up here where we replace the native functions various | 24 | * just say that we end up here where we replace the native functions various |
25 | * "paravirt" structures with our Guest versions, then boot like normal. :*/ | 25 | * "paravirt" structures with our Guest versions, then boot like normal. |
26 | :*/ | ||
26 | 27 | ||
27 | /* | 28 | /* |
28 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. | 29 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. |
@@ -74,7 +75,8 @@ | |||
74 | * | 75 | * |
75 | * The Guest in our tale is a simple creature: identical to the Host but | 76 | * The Guest in our tale is a simple creature: identical to the Host but |
76 | * behaving in simplified but equivalent ways. In particular, the Guest is the | 77 | * behaving in simplified but equivalent ways. In particular, the Guest is the |
77 | * same kernel as the Host (or at least, built from the same source code). :*/ | 78 | * same kernel as the Host (or at least, built from the same source code). |
79 | :*/ | ||
78 | 80 | ||
79 | struct lguest_data lguest_data = { | 81 | struct lguest_data lguest_data = { |
80 | .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, | 82 | .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, |
@@ -85,7 +87,8 @@ struct lguest_data lguest_data = { | |||
85 | .syscall_vec = SYSCALL_VECTOR, | 87 | .syscall_vec = SYSCALL_VECTOR, |
86 | }; | 88 | }; |
87 | 89 | ||
88 | /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a | 90 | /*G:037 |
91 | * async_hcall() is pretty simple: I'm quite proud of it really. We have a | ||
89 | * ring buffer of stored hypercalls which the Host will run though next time we | 92 | * ring buffer of stored hypercalls which the Host will run though next time we |
90 | * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall | 93 | * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall |
91 | * arguments, and a "hcall_status" word which is 0 if the call is ready to go, | 94 | * arguments, and a "hcall_status" word which is 0 if the call is ready to go, |
@@ -94,7 +97,8 @@ struct lguest_data lguest_data = { | |||
94 | * If we come around to a slot which hasn't been finished, then the table is | 97 | * If we come around to a slot which hasn't been finished, then the table is |
95 | * full and we just make the hypercall directly. This has the nice side | 98 | * full and we just make the hypercall directly. This has the nice side |
96 | * effect of causing the Host to run all the stored calls in the ring buffer | 99 | * effect of causing the Host to run all the stored calls in the ring buffer |
97 | * which empties it for next time! */ | 100 | * which empties it for next time! |
101 | */ | ||
98 | static void async_hcall(unsigned long call, unsigned long arg1, | 102 | static void async_hcall(unsigned long call, unsigned long arg1, |
99 | unsigned long arg2, unsigned long arg3, | 103 | unsigned long arg2, unsigned long arg3, |
100 | unsigned long arg4) | 104 | unsigned long arg4) |
@@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
103 | static unsigned int next_call; | 107 | static unsigned int next_call; |
104 | unsigned long flags; | 108 | unsigned long flags; |
105 | 109 | ||
106 | /* Disable interrupts if not already disabled: we don't want an | 110 | /* |
111 | * Disable interrupts if not already disabled: we don't want an | ||
107 | * interrupt handler making a hypercall while we're already doing | 112 | * interrupt handler making a hypercall while we're already doing |
108 | * one! */ | 113 | * one! |
114 | */ | ||
109 | local_irq_save(flags); | 115 | local_irq_save(flags); |
110 | if (lguest_data.hcall_status[next_call] != 0xFF) { | 116 | if (lguest_data.hcall_status[next_call] != 0xFF) { |
111 | /* Table full, so do normal hcall which will flush table. */ | 117 | /* Table full, so do normal hcall which will flush table. */ |
@@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
125 | local_irq_restore(flags); | 131 | local_irq_restore(flags); |
126 | } | 132 | } |
127 | 133 | ||
128 | /*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first | 134 | /*G:035 |
129 | * real optimization trick! | 135 | * Notice the lazy_hcall() above, rather than hcall(). This is our first real |
136 | * optimization trick! | ||
130 | * | 137 | * |
131 | * When lazy_mode is set, it means we're allowed to defer all hypercalls and do | 138 | * When lazy_mode is set, it means we're allowed to defer all hypercalls and do |
132 | * them as a batch when lazy_mode is eventually turned off. Because hypercalls | 139 | * them as a batch when lazy_mode is eventually turned off. Because hypercalls |
@@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
136 | * lguest_leave_lazy_mode(). | 143 | * lguest_leave_lazy_mode(). |
137 | * | 144 | * |
138 | * So, when we're in lazy mode, we call async_hcall() to store the call for | 145 | * So, when we're in lazy mode, we call async_hcall() to store the call for |
139 | * future processing: */ | 146 | * future processing: |
147 | */ | ||
140 | static void lazy_hcall1(unsigned long call, | 148 | static void lazy_hcall1(unsigned long call, |
141 | unsigned long arg1) | 149 | unsigned long arg1) |
142 | { | 150 | { |
@@ -146,6 +154,7 @@ static void lazy_hcall1(unsigned long call, | |||
146 | async_hcall(call, arg1, 0, 0, 0); | 154 | async_hcall(call, arg1, 0, 0, 0); |
147 | } | 155 | } |
148 | 156 | ||
157 | /* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ | ||
149 | static void lazy_hcall2(unsigned long call, | 158 | static void lazy_hcall2(unsigned long call, |
150 | unsigned long arg1, | 159 | unsigned long arg1, |
151 | unsigned long arg2) | 160 | unsigned long arg2) |
@@ -181,8 +190,10 @@ static void lazy_hcall4(unsigned long call, | |||
181 | } | 190 | } |
182 | #endif | 191 | #endif |
183 | 192 | ||
184 | /* When lazy mode is turned off reset the per-cpu lazy mode variable and then | 193 | /*G:036 |
185 | * issue the do-nothing hypercall to flush any stored calls. */ | 194 | * When lazy mode is turned off reset the per-cpu lazy mode variable and then |
195 | * issue the do-nothing hypercall to flush any stored calls. | ||
196 | :*/ | ||
186 | static void lguest_leave_lazy_mmu_mode(void) | 197 | static void lguest_leave_lazy_mmu_mode(void) |
187 | { | 198 | { |
188 | kvm_hypercall0(LHCALL_FLUSH_ASYNC); | 199 | kvm_hypercall0(LHCALL_FLUSH_ASYNC); |
@@ -208,9 +219,11 @@ static void lguest_end_context_switch(struct task_struct *next) | |||
208 | * check there before it tries to deliver an interrupt. | 219 | * check there before it tries to deliver an interrupt. |
209 | */ | 220 | */ |
210 | 221 | ||
211 | /* save_flags() is expected to return the processor state (ie. "flags"). The | 222 | /* |
223 | * save_flags() is expected to return the processor state (ie. "flags"). The | ||
212 | * flags word contains all kind of stuff, but in practice Linux only cares | 224 | * flags word contains all kind of stuff, but in practice Linux only cares |
213 | * about the interrupt flag. Our "save_flags()" just returns that. */ | 225 | * about the interrupt flag. Our "save_flags()" just returns that. |
226 | */ | ||
214 | static unsigned long save_fl(void) | 227 | static unsigned long save_fl(void) |
215 | { | 228 | { |
216 | return lguest_data.irq_enabled; | 229 | return lguest_data.irq_enabled; |
@@ -222,13 +235,15 @@ static void irq_disable(void) | |||
222 | lguest_data.irq_enabled = 0; | 235 | lguest_data.irq_enabled = 0; |
223 | } | 236 | } |
224 | 237 | ||
225 | /* Let's pause a moment. Remember how I said these are called so often? | 238 | /* |
239 | * Let's pause a moment. Remember how I said these are called so often? | ||
226 | * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to | 240 | * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to |
227 | * break some rules. In particular, these functions are assumed to save their | 241 | * break some rules. In particular, these functions are assumed to save their |
228 | * own registers if they need to: normal C functions assume they can trash the | 242 | * own registers if they need to: normal C functions assume they can trash the |
229 | * eax register. To use normal C functions, we use | 243 | * eax register. To use normal C functions, we use |
230 | * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the | 244 | * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the |
231 | * C function, then restores it. */ | 245 | * C function, then restores it. |
246 | */ | ||
232 | PV_CALLEE_SAVE_REGS_THUNK(save_fl); | 247 | PV_CALLEE_SAVE_REGS_THUNK(save_fl); |
233 | PV_CALLEE_SAVE_REGS_THUNK(irq_disable); | 248 | PV_CALLEE_SAVE_REGS_THUNK(irq_disable); |
234 | /*:*/ | 249 | /*:*/ |
@@ -237,18 +252,18 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable); | |||
237 | extern void lg_irq_enable(void); | 252 | extern void lg_irq_enable(void); |
238 | extern void lg_restore_fl(unsigned long flags); | 253 | extern void lg_restore_fl(unsigned long flags); |
239 | 254 | ||
240 | /*M:003 Note that we don't check for outstanding interrupts when we re-enable | 255 | /*M:003 |
241 | * them (or when we unmask an interrupt). This seems to work for the moment, | 256 | * We could be more efficient in our checking of outstanding interrupts, rather |
242 | * since interrupts are rare and we'll just get the interrupt on the next timer | 257 | * than using a branch. One way would be to put the "irq_enabled" field in a |
243 | * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way | 258 | * page by itself, and have the Host write-protect it when an interrupt comes |
244 | * would be to put the "irq_enabled" field in a page by itself, and have the | 259 | * in when irqs are disabled. There will then be a page fault as soon as |
245 | * Host write-protect it when an interrupt comes in when irqs are disabled. | 260 | * interrupts are re-enabled. |
246 | * There will then be a page fault as soon as interrupts are re-enabled. | ||
247 | * | 261 | * |
248 | * A better method is to implement soft interrupt disable generally for x86: | 262 | * A better method is to implement soft interrupt disable generally for x86: |
249 | * instead of disabling interrupts, we set a flag. If an interrupt does come | 263 | * instead of disabling interrupts, we set a flag. If an interrupt does come |
250 | * in, we then disable them for real. This is uncommon, so we could simply use | 264 | * in, we then disable them for real. This is uncommon, so we could simply use |
251 | * a hypercall for interrupt control and not worry about efficiency. :*/ | 265 | * a hypercall for interrupt control and not worry about efficiency. |
266 | :*/ | ||
252 | 267 | ||
253 | /*G:034 | 268 | /*G:034 |
254 | * The Interrupt Descriptor Table (IDT). | 269 | * The Interrupt Descriptor Table (IDT). |
@@ -261,10 +276,12 @@ extern void lg_restore_fl(unsigned long flags); | |||
261 | static void lguest_write_idt_entry(gate_desc *dt, | 276 | static void lguest_write_idt_entry(gate_desc *dt, |
262 | int entrynum, const gate_desc *g) | 277 | int entrynum, const gate_desc *g) |
263 | { | 278 | { |
264 | /* The gate_desc structure is 8 bytes long: we hand it to the Host in | 279 | /* |
280 | * The gate_desc structure is 8 bytes long: we hand it to the Host in | ||
265 | * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors | 281 | * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors |
266 | * around like this; typesafety wasn't a big concern in Linux's early | 282 | * around like this; typesafety wasn't a big concern in Linux's early |
267 | * years. */ | 283 | * years. |
284 | */ | ||
268 | u32 *desc = (u32 *)g; | 285 | u32 *desc = (u32 *)g; |
269 | /* Keep the local copy up to date. */ | 286 | /* Keep the local copy up to date. */ |
270 | native_write_idt_entry(dt, entrynum, g); | 287 | native_write_idt_entry(dt, entrynum, g); |
@@ -272,9 +289,11 @@ static void lguest_write_idt_entry(gate_desc *dt, | |||
272 | kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); | 289 | kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); |
273 | } | 290 | } |
274 | 291 | ||
275 | /* Changing to a different IDT is very rare: we keep the IDT up-to-date every | 292 | /* |
293 | * Changing to a different IDT is very rare: we keep the IDT up-to-date every | ||
276 | * time it is written, so we can simply loop through all entries and tell the | 294 | * time it is written, so we can simply loop through all entries and tell the |
277 | * Host about them. */ | 295 | * Host about them. |
296 | */ | ||
278 | static void lguest_load_idt(const struct desc_ptr *desc) | 297 | static void lguest_load_idt(const struct desc_ptr *desc) |
279 | { | 298 | { |
280 | unsigned int i; | 299 | unsigned int i; |
@@ -305,9 +324,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc) | |||
305 | kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); | 324 | kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); |
306 | } | 325 | } |
307 | 326 | ||
308 | /* For a single GDT entry which changes, we do the lazy thing: alter our GDT, | 327 | /* |
328 | * For a single GDT entry which changes, we do the lazy thing: alter our GDT, | ||
309 | * then tell the Host to reload the entire thing. This operation is so rare | 329 | * then tell the Host to reload the entire thing. This operation is so rare |
310 | * that this naive implementation is reasonable. */ | 330 | * that this naive implementation is reasonable. |
331 | */ | ||
311 | static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, | 332 | static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, |
312 | const void *desc, int type) | 333 | const void *desc, int type) |
313 | { | 334 | { |
@@ -317,29 +338,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, | |||
317 | dt[entrynum].a, dt[entrynum].b); | 338 | dt[entrynum].a, dt[entrynum].b); |
318 | } | 339 | } |
319 | 340 | ||
320 | /* OK, I lied. There are three "thread local storage" GDT entries which change | 341 | /* |
342 | * OK, I lied. There are three "thread local storage" GDT entries which change | ||
321 | * on every context switch (these three entries are how glibc implements | 343 | * on every context switch (these three entries are how glibc implements |
322 | * __thread variables). So we have a hypercall specifically for this case. */ | 344 | * __thread variables). So we have a hypercall specifically for this case. |
345 | */ | ||
323 | static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) | 346 | static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) |
324 | { | 347 | { |
325 | /* There's one problem which normal hardware doesn't have: the Host | 348 | /* |
349 | * There's one problem which normal hardware doesn't have: the Host | ||
326 | * can't handle us removing entries we're currently using. So we clear | 350 | * can't handle us removing entries we're currently using. So we clear |
327 | * the GS register here: if it's needed it'll be reloaded anyway. */ | 351 | * the GS register here: if it's needed it'll be reloaded anyway. |
352 | */ | ||
328 | lazy_load_gs(0); | 353 | lazy_load_gs(0); |
329 | lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); | 354 | lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); |
330 | } | 355 | } |
331 | 356 | ||
332 | /*G:038 That's enough excitement for now, back to ploughing through each of | 357 | /*G:038 |
333 | * the different pv_ops structures (we're about 1/3 of the way through). | 358 | * That's enough excitement for now, back to ploughing through each of the |
359 | * different pv_ops structures (we're about 1/3 of the way through). | ||
334 | * | 360 | * |
335 | * This is the Local Descriptor Table, another weird Intel thingy. Linux only | 361 | * This is the Local Descriptor Table, another weird Intel thingy. Linux only |
336 | * uses this for some strange applications like Wine. We don't do anything | 362 | * uses this for some strange applications like Wine. We don't do anything |
337 | * here, so they'll get an informative and friendly Segmentation Fault. */ | 363 | * here, so they'll get an informative and friendly Segmentation Fault. |
364 | */ | ||
338 | static void lguest_set_ldt(const void *addr, unsigned entries) | 365 | static void lguest_set_ldt(const void *addr, unsigned entries) |
339 | { | 366 | { |
340 | } | 367 | } |
341 | 368 | ||
342 | /* This loads a GDT entry into the "Task Register": that entry points to a | 369 | /* |
370 | * This loads a GDT entry into the "Task Register": that entry points to a | ||
343 | * structure called the Task State Segment. Some comments scattered though the | 371 | * structure called the Task State Segment. Some comments scattered though the |
344 | * kernel code indicate that this used for task switching in ages past, along | 372 | * kernel code indicate that this used for task switching in ages past, along |
345 | * with blood sacrifice and astrology. | 373 | * with blood sacrifice and astrology. |
@@ -347,19 +375,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries) | |||
347 | * Now there's nothing interesting in here that we don't get told elsewhere. | 375 | * Now there's nothing interesting in here that we don't get told elsewhere. |
348 | * But the native version uses the "ltr" instruction, which makes the Host | 376 | * But the native version uses the "ltr" instruction, which makes the Host |
349 | * complain to the Guest about a Segmentation Fault and it'll oops. So we | 377 | * complain to the Guest about a Segmentation Fault and it'll oops. So we |
350 | * override the native version with a do-nothing version. */ | 378 | * override the native version with a do-nothing version. |
379 | */ | ||
351 | static void lguest_load_tr_desc(void) | 380 | static void lguest_load_tr_desc(void) |
352 | { | 381 | { |
353 | } | 382 | } |
354 | 383 | ||
355 | /* The "cpuid" instruction is a way of querying both the CPU identity | 384 | /* |
385 | * The "cpuid" instruction is a way of querying both the CPU identity | ||
356 | * (manufacturer, model, etc) and its features. It was introduced before the | 386 | * (manufacturer, model, etc) and its features. It was introduced before the |
357 | * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. | 387 | * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. |
358 | * As you might imagine, after a decade and a half this treatment, it is now a | 388 | * As you might imagine, after a decade and a half this treatment, it is now a |
359 | * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. | 389 | * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. |
360 | * | 390 | * |
361 | * This instruction even it has its own Wikipedia entry. The Wikipedia entry | 391 | * This instruction even it has its own Wikipedia entry. The Wikipedia entry |
362 | * has been translated into 4 languages. I am not making this up! | 392 | * has been translated into 5 languages. I am not making this up! |
363 | * | 393 | * |
364 | * We could get funky here and identify ourselves as "GenuineLguest", but | 394 | * We could get funky here and identify ourselves as "GenuineLguest", but |
365 | * instead we just use the real "cpuid" instruction. Then I pretty much turned | 395 | * instead we just use the real "cpuid" instruction. Then I pretty much turned |
@@ -371,7 +401,8 @@ static void lguest_load_tr_desc(void) | |||
371 | * Replacing the cpuid so we can turn features off is great for the kernel, but | 401 | * Replacing the cpuid so we can turn features off is great for the kernel, but |
372 | * anyone (including userspace) can just use the raw "cpuid" instruction and | 402 | * anyone (including userspace) can just use the raw "cpuid" instruction and |
373 | * the Host won't even notice since it isn't privileged. So we try not to get | 403 | * the Host won't even notice since it isn't privileged. So we try not to get |
374 | * too worked up about it. */ | 404 | * too worked up about it. |
405 | */ | ||
375 | static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | 406 | static void lguest_cpuid(unsigned int *ax, unsigned int *bx, |
376 | unsigned int *cx, unsigned int *dx) | 407 | unsigned int *cx, unsigned int *dx) |
377 | { | 408 | { |
@@ -379,43 +410,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
379 | 410 | ||
380 | native_cpuid(ax, bx, cx, dx); | 411 | native_cpuid(ax, bx, cx, dx); |
381 | switch (function) { | 412 | switch (function) { |
382 | case 0: /* ID and highest CPUID. Futureproof a little by sticking to | 413 | /* |
383 | * older ones. */ | 414 | * CPUID 0 gives the highest legal CPUID number (and the ID string). |
415 | * We futureproof our code a little by sticking to known CPUID values. | ||
416 | */ | ||
417 | case 0: | ||
384 | if (*ax > 5) | 418 | if (*ax > 5) |
385 | *ax = 5; | 419 | *ax = 5; |
386 | break; | 420 | break; |
387 | case 1: /* Basic feature request. */ | 421 | |
388 | /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ | 422 | /* |
423 | * CPUID 1 is a basic feature request. | ||
424 | * | ||
425 | * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 | ||
426 | * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. | ||
427 | */ | ||
428 | case 1: | ||
389 | *cx &= 0x00002201; | 429 | *cx &= 0x00002201; |
390 | /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ | ||
391 | *dx &= 0x07808151; | 430 | *dx &= 0x07808151; |
392 | /* The Host can do a nice optimization if it knows that the | 431 | /* |
432 | * The Host can do a nice optimization if it knows that the | ||
393 | * kernel mappings (addresses above 0xC0000000 or whatever | 433 | * kernel mappings (addresses above 0xC0000000 or whatever |
394 | * PAGE_OFFSET is set to) haven't changed. But Linux calls | 434 | * PAGE_OFFSET is set to) haven't changed. But Linux calls |
395 | * flush_tlb_user() for both user and kernel mappings unless | 435 | * flush_tlb_user() for both user and kernel mappings unless |
396 | * the Page Global Enable (PGE) feature bit is set. */ | 436 | * the Page Global Enable (PGE) feature bit is set. |
437 | */ | ||
397 | *dx |= 0x00002000; | 438 | *dx |= 0x00002000; |
398 | /* We also lie, and say we're family id 5. 6 or greater | 439 | /* |
440 | * We also lie, and say we're family id 5. 6 or greater | ||
399 | * leads to a rdmsr in early_init_intel which we can't handle. | 441 | * leads to a rdmsr in early_init_intel which we can't handle. |
400 | * Family ID is returned as bits 8-12 in ax. */ | 442 | * Family ID is returned as bits 8-12 in ax. |
443 | */ | ||
401 | *ax &= 0xFFFFF0FF; | 444 | *ax &= 0xFFFFF0FF; |
402 | *ax |= 0x00000500; | 445 | *ax |= 0x00000500; |
403 | break; | 446 | break; |
447 | /* | ||
448 | * 0x80000000 returns the highest Extended Function, so we futureproof | ||
449 | * like we do above by limiting it to known fields. | ||
450 | */ | ||
404 | case 0x80000000: | 451 | case 0x80000000: |
405 | /* Futureproof this a little: if they ask how much extended | ||
406 | * processor information there is, limit it to known fields. */ | ||
407 | if (*ax > 0x80000008) | 452 | if (*ax > 0x80000008) |
408 | *ax = 0x80000008; | 453 | *ax = 0x80000008; |
409 | break; | 454 | break; |
455 | |||
456 | /* | ||
457 | * PAE systems can mark pages as non-executable. Linux calls this the | ||
458 | * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced | ||
459 | * Virus Protection). We just switch turn if off here, since we don't | ||
460 | * support it. | ||
461 | */ | ||
410 | case 0x80000001: | 462 | case 0x80000001: |
411 | /* Here we should fix nx cap depending on host. */ | ||
412 | /* For this version of PAE, we just clear NX bit. */ | ||
413 | *dx &= ~(1 << 20); | 463 | *dx &= ~(1 << 20); |
414 | break; | 464 | break; |
415 | } | 465 | } |
416 | } | 466 | } |
417 | 467 | ||
418 | /* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. | 468 | /* |
469 | * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. | ||
419 | * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother | 470 | * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother |
420 | * it. The Host needs to know when the Guest wants to change them, so we have | 471 | * it. The Host needs to know when the Guest wants to change them, so we have |
421 | * a whole series of functions like read_cr0() and write_cr0(). | 472 | * a whole series of functions like read_cr0() and write_cr0(). |
@@ -430,7 +481,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
430 | * name like "FPUTRAP bit" be a little less cryptic? | 481 | * name like "FPUTRAP bit" be a little less cryptic? |
431 | * | 482 | * |
432 | * We store cr0 locally because the Host never changes it. The Guest sometimes | 483 | * We store cr0 locally because the Host never changes it. The Guest sometimes |
433 | * wants to read it and we'd prefer not to bother the Host unnecessarily. */ | 484 | * wants to read it and we'd prefer not to bother the Host unnecessarily. |
485 | */ | ||
434 | static unsigned long current_cr0; | 486 | static unsigned long current_cr0; |
435 | static void lguest_write_cr0(unsigned long val) | 487 | static void lguest_write_cr0(unsigned long val) |
436 | { | 488 | { |
@@ -443,18 +495,22 @@ static unsigned long lguest_read_cr0(void) | |||
443 | return current_cr0; | 495 | return current_cr0; |
444 | } | 496 | } |
445 | 497 | ||
446 | /* Intel provided a special instruction to clear the TS bit for people too cool | 498 | /* |
499 | * Intel provided a special instruction to clear the TS bit for people too cool | ||
447 | * to use write_cr0() to do it. This "clts" instruction is faster, because all | 500 | * to use write_cr0() to do it. This "clts" instruction is faster, because all |
448 | * the vowels have been optimized out. */ | 501 | * the vowels have been optimized out. |
502 | */ | ||
449 | static void lguest_clts(void) | 503 | static void lguest_clts(void) |
450 | { | 504 | { |
451 | lazy_hcall1(LHCALL_TS, 0); | 505 | lazy_hcall1(LHCALL_TS, 0); |
452 | current_cr0 &= ~X86_CR0_TS; | 506 | current_cr0 &= ~X86_CR0_TS; |
453 | } | 507 | } |
454 | 508 | ||
455 | /* cr2 is the virtual address of the last page fault, which the Guest only ever | 509 | /* |
510 | * cr2 is the virtual address of the last page fault, which the Guest only ever | ||
456 | * reads. The Host kindly writes this into our "struct lguest_data", so we | 511 | * reads. The Host kindly writes this into our "struct lguest_data", so we |
457 | * just read it out of there. */ | 512 | * just read it out of there. |
513 | */ | ||
458 | static unsigned long lguest_read_cr2(void) | 514 | static unsigned long lguest_read_cr2(void) |
459 | { | 515 | { |
460 | return lguest_data.cr2; | 516 | return lguest_data.cr2; |
@@ -463,10 +519,12 @@ static unsigned long lguest_read_cr2(void) | |||
463 | /* See lguest_set_pte() below. */ | 519 | /* See lguest_set_pte() below. */ |
464 | static bool cr3_changed = false; | 520 | static bool cr3_changed = false; |
465 | 521 | ||
466 | /* cr3 is the current toplevel pagetable page: the principle is the same as | 522 | /* |
523 | * cr3 is the current toplevel pagetable page: the principle is the same as | ||
467 | * cr0. Keep a local copy, and tell the Host when it changes. The only | 524 | * cr0. Keep a local copy, and tell the Host when it changes. The only |
468 | * difference is that our local copy is in lguest_data because the Host needs | 525 | * difference is that our local copy is in lguest_data because the Host needs |
469 | * to set it upon our initial hypercall. */ | 526 | * to set it upon our initial hypercall. |
527 | */ | ||
470 | static void lguest_write_cr3(unsigned long cr3) | 528 | static void lguest_write_cr3(unsigned long cr3) |
471 | { | 529 | { |
472 | lguest_data.pgdir = cr3; | 530 | lguest_data.pgdir = cr3; |
@@ -511,7 +569,7 @@ static void lguest_write_cr4(unsigned long val) | |||
511 | * cr3 ---> +---------+ | 569 | * cr3 ---> +---------+ |
512 | * | --------->+---------+ | 570 | * | --------->+---------+ |
513 | * | | | PADDR1 | | 571 | * | | | PADDR1 | |
514 | * Top-level | | PADDR2 | | 572 | * Mid-level | | PADDR2 | |
515 | * (PMD) page | | | | 573 | * (PMD) page | | | |
516 | * | | Lower-level | | 574 | * | | Lower-level | |
517 | * | | (PTE) page | | 575 | * | | (PTE) page | |
@@ -531,21 +589,62 @@ static void lguest_write_cr4(unsigned long val) | |||
531 | * Index into top Index into second Offset within page | 589 | * Index into top Index into second Offset within page |
532 | * page directory page pagetable page | 590 | * page directory page pagetable page |
533 | * | 591 | * |
534 | * The kernel spends a lot of time changing both the top-level page directory | 592 | * Now, unfortunately, this isn't the whole story: Intel added Physical Address |
535 | * and lower-level pagetable pages. The Guest doesn't know physical addresses, | 593 | * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). |
536 | * so while it maintains these page tables exactly like normal, it also needs | 594 | * These are held in 64-bit page table entries, so we can now only fit 512 |
537 | * to keep the Host informed whenever it makes a change: the Host will create | 595 | * entries in a page, and the neat three-level tree breaks down. |
538 | * the real page tables based on the Guests'. | 596 | * |
597 | * The result is a four level page table: | ||
598 | * | ||
599 | * cr3 --> [ 4 Upper ] | ||
600 | * [ Level ] | ||
601 | * [ Entries ] | ||
602 | * [(PUD Page)]---> +---------+ | ||
603 | * | --------->+---------+ | ||
604 | * | | | PADDR1 | | ||
605 | * Mid-level | | PADDR2 | | ||
606 | * (PMD) page | | | | ||
607 | * | | Lower-level | | ||
608 | * | | (PTE) page | | ||
609 | * | | | | | ||
610 | * .... .... | ||
611 | * | ||
612 | * | ||
613 | * And the virtual address is decoded as: | ||
614 | * | ||
615 | * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | ||
616 | * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| | ||
617 | * Index into Index into mid Index into lower Offset within page | ||
618 | * top entries directory page pagetable page | ||
619 | * | ||
620 | * It's too hard to switch between these two formats at runtime, so Linux only | ||
621 | * supports one or the other depending on whether CONFIG_X86_PAE is set. Many | ||
622 | * distributions turn it on, and not just for people with silly amounts of | ||
623 | * memory: the larger PTE entries allow room for the NX bit, which lets the | ||
624 | * kernel disable execution of pages and increase security. | ||
625 | * | ||
626 | * This was a problem for lguest, which couldn't run on these distributions; | ||
627 | * then Matias Zabaljauregui figured it all out and implemented it, and only a | ||
628 | * handful of puppies were crushed in the process! | ||
629 | * | ||
630 | * Back to our point: the kernel spends a lot of time changing both the | ||
631 | * top-level page directory and lower-level pagetable pages. The Guest doesn't | ||
632 | * know physical addresses, so while it maintains these page tables exactly | ||
633 | * like normal, it also needs to keep the Host informed whenever it makes a | ||
634 | * change: the Host will create the real page tables based on the Guests'. | ||
539 | */ | 635 | */ |
540 | 636 | ||
541 | /* The Guest calls this to set a second-level entry (pte), ie. to map a page | 637 | /* |
542 | * into a process' address space. We set the entry then tell the Host the | 638 | * The Guest calls this after it has set a second-level entry (pte), ie. to map |
543 | * toplevel and address this corresponds to. The Guest uses one pagetable per | 639 | * a page into a process' address space. Wetell the Host the toplevel and |
544 | * process, so we need to tell the Host which one we're changing (mm->pgd). */ | 640 | * address this corresponds to. The Guest uses one pagetable per process, so |
641 | * we need to tell the Host which one we're changing (mm->pgd). | ||
642 | */ | ||
545 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, | 643 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, |
546 | pte_t *ptep) | 644 | pte_t *ptep) |
547 | { | 645 | { |
548 | #ifdef CONFIG_X86_PAE | 646 | #ifdef CONFIG_X86_PAE |
647 | /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ | ||
549 | lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, | 648 | lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, |
550 | ptep->pte_low, ptep->pte_high); | 649 | ptep->pte_low, ptep->pte_high); |
551 | #else | 650 | #else |
@@ -553,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, | |||
553 | #endif | 652 | #endif |
554 | } | 653 | } |
555 | 654 | ||
655 | /* This is the "set and update" combo-meal-deal version. */ | ||
556 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | 656 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, |
557 | pte_t *ptep, pte_t pteval) | 657 | pte_t *ptep, pte_t pteval) |
558 | { | 658 | { |
@@ -560,10 +660,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | |||
560 | lguest_pte_update(mm, addr, ptep); | 660 | lguest_pte_update(mm, addr, ptep); |
561 | } | 661 | } |
562 | 662 | ||
563 | /* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd | 663 | /* |
664 | * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd | ||
564 | * to set a middle-level entry when PAE is activated. | 665 | * to set a middle-level entry when PAE is activated. |
666 | * | ||
565 | * Again, we set the entry then tell the Host which page we changed, | 667 | * Again, we set the entry then tell the Host which page we changed, |
566 | * and the index of the entry we changed. */ | 668 | * and the index of the entry we changed. |
669 | */ | ||
567 | #ifdef CONFIG_X86_PAE | 670 | #ifdef CONFIG_X86_PAE |
568 | static void lguest_set_pud(pud_t *pudp, pud_t pudval) | 671 | static void lguest_set_pud(pud_t *pudp, pud_t pudval) |
569 | { | 672 | { |
@@ -582,8 +685,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
582 | } | 685 | } |
583 | #else | 686 | #else |
584 | 687 | ||
585 | /* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not | 688 | /* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ |
586 | * activated. */ | ||
587 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | 689 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) |
588 | { | 690 | { |
589 | native_set_pmd(pmdp, pmdval); | 691 | native_set_pmd(pmdp, pmdval); |
@@ -592,7 +694,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
592 | } | 694 | } |
593 | #endif | 695 | #endif |
594 | 696 | ||
595 | /* There are a couple of legacy places where the kernel sets a PTE, but we | 697 | /* |
698 | * There are a couple of legacy places where the kernel sets a PTE, but we | ||
596 | * don't know the top level any more. This is useless for us, since we don't | 699 | * don't know the top level any more. This is useless for us, since we don't |
597 | * know which pagetable is changing or what address, so we just tell the Host | 700 | * know which pagetable is changing or what address, so we just tell the Host |
598 | * to forget all of them. Fortunately, this is very rare. | 701 | * to forget all of them. Fortunately, this is very rare. |
@@ -600,7 +703,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
600 | * ... except in early boot when the kernel sets up the initial pagetables, | 703 | * ... except in early boot when the kernel sets up the initial pagetables, |
601 | * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell | 704 | * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell |
602 | * the Host anything changed until we've done the first page table switch, | 705 | * the Host anything changed until we've done the first page table switch, |
603 | * which brings boot back to 0.25 seconds. */ | 706 | * which brings boot back to 0.25 seconds. |
707 | */ | ||
604 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) | 708 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) |
605 | { | 709 | { |
606 | native_set_pte(ptep, pteval); | 710 | native_set_pte(ptep, pteval); |
@@ -609,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) | |||
609 | } | 713 | } |
610 | 714 | ||
611 | #ifdef CONFIG_X86_PAE | 715 | #ifdef CONFIG_X86_PAE |
716 | /* | ||
717 | * With 64-bit PTE values, we need to be careful setting them: if we set 32 | ||
718 | * bits at a time, the hardware could see a weird half-set entry. These | ||
719 | * versions ensure we update all 64 bits at once. | ||
720 | */ | ||
612 | static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) | 721 | static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) |
613 | { | 722 | { |
614 | native_set_pte_atomic(ptep, pte); | 723 | native_set_pte_atomic(ptep, pte); |
@@ -616,19 +725,21 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) | |||
616 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); | 725 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); |
617 | } | 726 | } |
618 | 727 | ||
619 | void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | 728 | static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, |
729 | pte_t *ptep) | ||
620 | { | 730 | { |
621 | native_pte_clear(mm, addr, ptep); | 731 | native_pte_clear(mm, addr, ptep); |
622 | lguest_pte_update(mm, addr, ptep); | 732 | lguest_pte_update(mm, addr, ptep); |
623 | } | 733 | } |
624 | 734 | ||
625 | void lguest_pmd_clear(pmd_t *pmdp) | 735 | static void lguest_pmd_clear(pmd_t *pmdp) |
626 | { | 736 | { |
627 | lguest_set_pmd(pmdp, __pmd(0)); | 737 | lguest_set_pmd(pmdp, __pmd(0)); |
628 | } | 738 | } |
629 | #endif | 739 | #endif |
630 | 740 | ||
631 | /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on | 741 | /* |
742 | * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on | ||
632 | * native page table operations. On native hardware you can set a new page | 743 | * native page table operations. On native hardware you can set a new page |
633 | * table entry whenever you want, but if you want to remove one you have to do | 744 | * table entry whenever you want, but if you want to remove one you have to do |
634 | * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). | 745 | * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). |
@@ -637,24 +748,29 @@ void lguest_pmd_clear(pmd_t *pmdp) | |||
637 | * called when a valid entry is written, not when it's removed (ie. marked not | 748 | * called when a valid entry is written, not when it's removed (ie. marked not |
638 | * present). Instead, this is where we come when the Guest wants to remove a | 749 | * present). Instead, this is where we come when the Guest wants to remove a |
639 | * page table entry: we tell the Host to set that entry to 0 (ie. the present | 750 | * page table entry: we tell the Host to set that entry to 0 (ie. the present |
640 | * bit is zero). */ | 751 | * bit is zero). |
752 | */ | ||
641 | static void lguest_flush_tlb_single(unsigned long addr) | 753 | static void lguest_flush_tlb_single(unsigned long addr) |
642 | { | 754 | { |
643 | /* Simply set it to zero: if it was not, it will fault back in. */ | 755 | /* Simply set it to zero: if it was not, it will fault back in. */ |
644 | lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); | 756 | lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); |
645 | } | 757 | } |
646 | 758 | ||
647 | /* This is what happens after the Guest has removed a large number of entries. | 759 | /* |
760 | * This is what happens after the Guest has removed a large number of entries. | ||
648 | * This tells the Host that any of the page table entries for userspace might | 761 | * This tells the Host that any of the page table entries for userspace might |
649 | * have changed, ie. virtual addresses below PAGE_OFFSET. */ | 762 | * have changed, ie. virtual addresses below PAGE_OFFSET. |
763 | */ | ||
650 | static void lguest_flush_tlb_user(void) | 764 | static void lguest_flush_tlb_user(void) |
651 | { | 765 | { |
652 | lazy_hcall1(LHCALL_FLUSH_TLB, 0); | 766 | lazy_hcall1(LHCALL_FLUSH_TLB, 0); |
653 | } | 767 | } |
654 | 768 | ||
655 | /* This is called when the kernel page tables have changed. That's not very | 769 | /* |
770 | * This is called when the kernel page tables have changed. That's not very | ||
656 | * common (unless the Guest is using highmem, which makes the Guest extremely | 771 | * common (unless the Guest is using highmem, which makes the Guest extremely |
657 | * slow), so it's worth separating this from the user flushing above. */ | 772 | * slow), so it's worth separating this from the user flushing above. |
773 | */ | ||
658 | static void lguest_flush_tlb_kernel(void) | 774 | static void lguest_flush_tlb_kernel(void) |
659 | { | 775 | { |
660 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); | 776 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); |
@@ -691,26 +807,38 @@ static struct irq_chip lguest_irq_controller = { | |||
691 | .unmask = enable_lguest_irq, | 807 | .unmask = enable_lguest_irq, |
692 | }; | 808 | }; |
693 | 809 | ||
694 | /* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware | 810 | /* |
811 | * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware | ||
695 | * interrupt (except 128, which is used for system calls), and then tells the | 812 | * interrupt (except 128, which is used for system calls), and then tells the |
696 | * Linux infrastructure that each interrupt is controlled by our level-based | 813 | * Linux infrastructure that each interrupt is controlled by our level-based |
697 | * lguest interrupt controller. */ | 814 | * lguest interrupt controller. |
815 | */ | ||
698 | static void __init lguest_init_IRQ(void) | 816 | static void __init lguest_init_IRQ(void) |
699 | { | 817 | { |
700 | unsigned int i; | 818 | unsigned int i; |
701 | 819 | ||
702 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { | 820 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { |
703 | /* Some systems map "vectors" to interrupts weirdly. Lguest has | 821 | /* Some systems map "vectors" to interrupts weirdly. Not us! */ |
704 | * a straightforward 1 to 1 mapping, so force that here. */ | ||
705 | __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; | 822 | __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; |
706 | if (i != SYSCALL_VECTOR) | 823 | if (i != SYSCALL_VECTOR) |
707 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); | 824 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); |
708 | } | 825 | } |
709 | /* This call is required to set up for 4k stacks, where we have | 826 | |
710 | * separate stacks for hard and soft interrupts. */ | 827 | /* |
828 | * This call is required to set up for 4k stacks, where we have | ||
829 | * separate stacks for hard and soft interrupts. | ||
830 | */ | ||
711 | irq_ctx_init(smp_processor_id()); | 831 | irq_ctx_init(smp_processor_id()); |
712 | } | 832 | } |
713 | 833 | ||
834 | /* | ||
835 | * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so | ||
836 | * rather than set them in lguest_init_IRQ we are called here every time an | ||
837 | * lguest device needs an interrupt. | ||
838 | * | ||
839 | * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should | ||
840 | * pass that up! | ||
841 | */ | ||
714 | void lguest_setup_irq(unsigned int irq) | 842 | void lguest_setup_irq(unsigned int irq) |
715 | { | 843 | { |
716 | irq_to_desc_alloc_node(irq, 0); | 844 | irq_to_desc_alloc_node(irq, 0); |
@@ -729,31 +857,39 @@ static unsigned long lguest_get_wallclock(void) | |||
729 | return lguest_data.time.tv_sec; | 857 | return lguest_data.time.tv_sec; |
730 | } | 858 | } |
731 | 859 | ||
732 | /* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us | 860 | /* |
861 | * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us | ||
733 | * what speed it runs at, or 0 if it's unusable as a reliable clock source. | 862 | * what speed it runs at, or 0 if it's unusable as a reliable clock source. |
734 | * This matches what we want here: if we return 0 from this function, the x86 | 863 | * This matches what we want here: if we return 0 from this function, the x86 |
735 | * TSC clock will give up and not register itself. */ | 864 | * TSC clock will give up and not register itself. |
865 | */ | ||
736 | static unsigned long lguest_tsc_khz(void) | 866 | static unsigned long lguest_tsc_khz(void) |
737 | { | 867 | { |
738 | return lguest_data.tsc_khz; | 868 | return lguest_data.tsc_khz; |
739 | } | 869 | } |
740 | 870 | ||
741 | /* If we can't use the TSC, the kernel falls back to our lower-priority | 871 | /* |
742 | * "lguest_clock", where we read the time value given to us by the Host. */ | 872 | * If we can't use the TSC, the kernel falls back to our lower-priority |
873 | * "lguest_clock", where we read the time value given to us by the Host. | ||
874 | */ | ||
743 | static cycle_t lguest_clock_read(struct clocksource *cs) | 875 | static cycle_t lguest_clock_read(struct clocksource *cs) |
744 | { | 876 | { |
745 | unsigned long sec, nsec; | 877 | unsigned long sec, nsec; |
746 | 878 | ||
747 | /* Since the time is in two parts (seconds and nanoseconds), we risk | 879 | /* |
880 | * Since the time is in two parts (seconds and nanoseconds), we risk | ||
748 | * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, | 881 | * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, |
749 | * and getting 99 and 0. As Linux tends to come apart under the stress | 882 | * and getting 99 and 0. As Linux tends to come apart under the stress |
750 | * of time travel, we must be careful: */ | 883 | * of time travel, we must be careful: |
884 | */ | ||
751 | do { | 885 | do { |
752 | /* First we read the seconds part. */ | 886 | /* First we read the seconds part. */ |
753 | sec = lguest_data.time.tv_sec; | 887 | sec = lguest_data.time.tv_sec; |
754 | /* This read memory barrier tells the compiler and the CPU that | 888 | /* |
889 | * This read memory barrier tells the compiler and the CPU that | ||
755 | * this can't be reordered: we have to complete the above | 890 | * this can't be reordered: we have to complete the above |
756 | * before going on. */ | 891 | * before going on. |
892 | */ | ||
757 | rmb(); | 893 | rmb(); |
758 | /* Now we read the nanoseconds part. */ | 894 | /* Now we read the nanoseconds part. */ |
759 | nsec = lguest_data.time.tv_nsec; | 895 | nsec = lguest_data.time.tv_nsec; |
@@ -777,9 +913,11 @@ static struct clocksource lguest_clock = { | |||
777 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | 913 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
778 | }; | 914 | }; |
779 | 915 | ||
780 | /* We also need a "struct clock_event_device": Linux asks us to set it to go | 916 | /* |
917 | * We also need a "struct clock_event_device": Linux asks us to set it to go | ||
781 | * off some time in the future. Actually, James Morris figured all this out, I | 918 | * off some time in the future. Actually, James Morris figured all this out, I |
782 | * just applied the patch. */ | 919 | * just applied the patch. |
920 | */ | ||
783 | static int lguest_clockevent_set_next_event(unsigned long delta, | 921 | static int lguest_clockevent_set_next_event(unsigned long delta, |
784 | struct clock_event_device *evt) | 922 | struct clock_event_device *evt) |
785 | { | 923 | { |
@@ -829,8 +967,10 @@ static struct clock_event_device lguest_clockevent = { | |||
829 | .max_delta_ns = LG_CLOCK_MAX_DELTA, | 967 | .max_delta_ns = LG_CLOCK_MAX_DELTA, |
830 | }; | 968 | }; |
831 | 969 | ||
832 | /* This is the Guest timer interrupt handler (hardware interrupt 0). We just | 970 | /* |
833 | * call the clockevent infrastructure and it does whatever needs doing. */ | 971 | * This is the Guest timer interrupt handler (hardware interrupt 0). We just |
972 | * call the clockevent infrastructure and it does whatever needs doing. | ||
973 | */ | ||
834 | static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) | 974 | static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) |
835 | { | 975 | { |
836 | unsigned long flags; | 976 | unsigned long flags; |
@@ -841,10 +981,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) | |||
841 | local_irq_restore(flags); | 981 | local_irq_restore(flags); |
842 | } | 982 | } |
843 | 983 | ||
844 | /* At some point in the boot process, we get asked to set up our timing | 984 | /* |
985 | * At some point in the boot process, we get asked to set up our timing | ||
845 | * infrastructure. The kernel doesn't expect timer interrupts before this, but | 986 | * infrastructure. The kernel doesn't expect timer interrupts before this, but |
846 | * we cleverly initialized the "blocked_interrupts" field of "struct | 987 | * we cleverly initialized the "blocked_interrupts" field of "struct |
847 | * lguest_data" so that timer interrupts were blocked until now. */ | 988 | * lguest_data" so that timer interrupts were blocked until now. |
989 | */ | ||
848 | static void lguest_time_init(void) | 990 | static void lguest_time_init(void) |
849 | { | 991 | { |
850 | /* Set up the timer interrupt (0) to go to our simple timer routine */ | 992 | /* Set up the timer interrupt (0) to go to our simple timer routine */ |
@@ -868,14 +1010,16 @@ static void lguest_time_init(void) | |||
868 | * to work. They're pretty simple. | 1010 | * to work. They're pretty simple. |
869 | */ | 1011 | */ |
870 | 1012 | ||
871 | /* The Guest needs to tell the Host what stack it expects traps to use. For | 1013 | /* |
1014 | * The Guest needs to tell the Host what stack it expects traps to use. For | ||
872 | * native hardware, this is part of the Task State Segment mentioned above in | 1015 | * native hardware, this is part of the Task State Segment mentioned above in |
873 | * lguest_load_tr_desc(), but to help hypervisors there's this special call. | 1016 | * lguest_load_tr_desc(), but to help hypervisors there's this special call. |
874 | * | 1017 | * |
875 | * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data | 1018 | * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data |
876 | * segment), the privilege level (we're privilege level 1, the Host is 0 and | 1019 | * segment), the privilege level (we're privilege level 1, the Host is 0 and |
877 | * will not tolerate us trying to use that), the stack pointer, and the number | 1020 | * will not tolerate us trying to use that), the stack pointer, and the number |
878 | * of pages in the stack. */ | 1021 | * of pages in the stack. |
1022 | */ | ||
879 | static void lguest_load_sp0(struct tss_struct *tss, | 1023 | static void lguest_load_sp0(struct tss_struct *tss, |
880 | struct thread_struct *thread) | 1024 | struct thread_struct *thread) |
881 | { | 1025 | { |
@@ -889,7 +1033,8 @@ static void lguest_set_debugreg(int regno, unsigned long value) | |||
889 | /* FIXME: Implement */ | 1033 | /* FIXME: Implement */ |
890 | } | 1034 | } |
891 | 1035 | ||
892 | /* There are times when the kernel wants to make sure that no memory writes are | 1036 | /* |
1037 | * There are times when the kernel wants to make sure that no memory writes are | ||
893 | * caught in the cache (that they've all reached real hardware devices). This | 1038 | * caught in the cache (that they've all reached real hardware devices). This |
894 | * doesn't matter for the Guest which has virtual hardware. | 1039 | * doesn't matter for the Guest which has virtual hardware. |
895 | * | 1040 | * |
@@ -903,11 +1048,13 @@ static void lguest_wbinvd(void) | |||
903 | { | 1048 | { |
904 | } | 1049 | } |
905 | 1050 | ||
906 | /* If the Guest expects to have an Advanced Programmable Interrupt Controller, | 1051 | /* |
1052 | * If the Guest expects to have an Advanced Programmable Interrupt Controller, | ||
907 | * we play dumb by ignoring writes and returning 0 for reads. So it's no | 1053 | * we play dumb by ignoring writes and returning 0 for reads. So it's no |
908 | * longer Programmable nor Controlling anything, and I don't think 8 lines of | 1054 | * longer Programmable nor Controlling anything, and I don't think 8 lines of |
909 | * code qualifies for Advanced. It will also never interrupt anything. It | 1055 | * code qualifies for Advanced. It will also never interrupt anything. It |
910 | * does, however, allow us to get through the Linux boot code. */ | 1056 | * does, however, allow us to get through the Linux boot code. |
1057 | */ | ||
911 | #ifdef CONFIG_X86_LOCAL_APIC | 1058 | #ifdef CONFIG_X86_LOCAL_APIC |
912 | static void lguest_apic_write(u32 reg, u32 v) | 1059 | static void lguest_apic_write(u32 reg, u32 v) |
913 | { | 1060 | { |
@@ -956,11 +1103,13 @@ static void lguest_safe_halt(void) | |||
956 | kvm_hypercall0(LHCALL_HALT); | 1103 | kvm_hypercall0(LHCALL_HALT); |
957 | } | 1104 | } |
958 | 1105 | ||
959 | /* The SHUTDOWN hypercall takes a string to describe what's happening, and | 1106 | /* |
1107 | * The SHUTDOWN hypercall takes a string to describe what's happening, and | ||
960 | * an argument which says whether this to restart (reboot) the Guest or not. | 1108 | * an argument which says whether this to restart (reboot) the Guest or not. |
961 | * | 1109 | * |
962 | * Note that the Host always prefers that the Guest speak in physical addresses | 1110 | * Note that the Host always prefers that the Guest speak in physical addresses |
963 | * rather than virtual addresses, so we use __pa() here. */ | 1111 | * rather than virtual addresses, so we use __pa() here. |
1112 | */ | ||
964 | static void lguest_power_off(void) | 1113 | static void lguest_power_off(void) |
965 | { | 1114 | { |
966 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), | 1115 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), |
@@ -991,8 +1140,10 @@ static __init char *lguest_memory_setup(void) | |||
991 | * nice to move it back to lguest_init. Patch welcome... */ | 1140 | * nice to move it back to lguest_init. Patch welcome... */ |
992 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); | 1141 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); |
993 | 1142 | ||
994 | /* The Linux bootloader header contains an "e820" memory map: the | 1143 | /* |
995 | * Launcher populated the first entry with our memory limit. */ | 1144 | *The Linux bootloader header contains an "e820" memory map: the |
1145 | * Launcher populated the first entry with our memory limit. | ||
1146 | */ | ||
996 | e820_add_region(boot_params.e820_map[0].addr, | 1147 | e820_add_region(boot_params.e820_map[0].addr, |
997 | boot_params.e820_map[0].size, | 1148 | boot_params.e820_map[0].size, |
998 | boot_params.e820_map[0].type); | 1149 | boot_params.e820_map[0].type); |
@@ -1001,16 +1152,17 @@ static __init char *lguest_memory_setup(void) | |||
1001 | return "LGUEST"; | 1152 | return "LGUEST"; |
1002 | } | 1153 | } |
1003 | 1154 | ||
1004 | /* We will eventually use the virtio console device to produce console output, | 1155 | /* |
1156 | * We will eventually use the virtio console device to produce console output, | ||
1005 | * but before that is set up we use LHCALL_NOTIFY on normal memory to produce | 1157 | * but before that is set up we use LHCALL_NOTIFY on normal memory to produce |
1006 | * console output. */ | 1158 | * console output. |
1159 | */ | ||
1007 | static __init int early_put_chars(u32 vtermno, const char *buf, int count) | 1160 | static __init int early_put_chars(u32 vtermno, const char *buf, int count) |
1008 | { | 1161 | { |
1009 | char scratch[17]; | 1162 | char scratch[17]; |
1010 | unsigned int len = count; | 1163 | unsigned int len = count; |
1011 | 1164 | ||
1012 | /* We use a nul-terminated string, so we have to make a copy. Icky, | 1165 | /* We use a nul-terminated string, so we make a copy. Icky, huh? */ |
1013 | * huh? */ | ||
1014 | if (len > sizeof(scratch) - 1) | 1166 | if (len > sizeof(scratch) - 1) |
1015 | len = sizeof(scratch) - 1; | 1167 | len = sizeof(scratch) - 1; |
1016 | scratch[len] = '\0'; | 1168 | scratch[len] = '\0'; |
@@ -1021,8 +1173,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) | |||
1021 | return len; | 1173 | return len; |
1022 | } | 1174 | } |
1023 | 1175 | ||
1024 | /* Rebooting also tells the Host we're finished, but the RESTART flag tells the | 1176 | /* |
1025 | * Launcher to reboot us. */ | 1177 | * Rebooting also tells the Host we're finished, but the RESTART flag tells the |
1178 | * Launcher to reboot us. | ||
1179 | */ | ||
1026 | static void lguest_restart(char *reason) | 1180 | static void lguest_restart(char *reason) |
1027 | { | 1181 | { |
1028 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); | 1182 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); |
@@ -1049,7 +1203,8 @@ static void lguest_restart(char *reason) | |||
1049 | * fit comfortably. | 1203 | * fit comfortably. |
1050 | * | 1204 | * |
1051 | * First we need assembly templates of each of the patchable Guest operations, | 1205 | * First we need assembly templates of each of the patchable Guest operations, |
1052 | * and these are in i386_head.S. */ | 1206 | * and these are in i386_head.S. |
1207 | */ | ||
1053 | 1208 | ||
1054 | /*G:060 We construct a table from the assembler templates: */ | 1209 | /*G:060 We construct a table from the assembler templates: */ |
1055 | static const struct lguest_insns | 1210 | static const struct lguest_insns |
@@ -1060,9 +1215,11 @@ static const struct lguest_insns | |||
1060 | [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, | 1215 | [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, |
1061 | }; | 1216 | }; |
1062 | 1217 | ||
1063 | /* Now our patch routine is fairly simple (based on the native one in | 1218 | /* |
1219 | * Now our patch routine is fairly simple (based on the native one in | ||
1064 | * paravirt.c). If we have a replacement, we copy it in and return how much of | 1220 | * paravirt.c). If we have a replacement, we copy it in and return how much of |
1065 | * the available space we used. */ | 1221 | * the available space we used. |
1222 | */ | ||
1066 | static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | 1223 | static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, |
1067 | unsigned long addr, unsigned len) | 1224 | unsigned long addr, unsigned len) |
1068 | { | 1225 | { |
@@ -1074,8 +1231,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | |||
1074 | 1231 | ||
1075 | insn_len = lguest_insns[type].end - lguest_insns[type].start; | 1232 | insn_len = lguest_insns[type].end - lguest_insns[type].start; |
1076 | 1233 | ||
1077 | /* Similarly if we can't fit replacement (shouldn't happen, but let's | 1234 | /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ |
1078 | * be thorough). */ | ||
1079 | if (len < insn_len) | 1235 | if (len < insn_len) |
1080 | return paravirt_patch_default(type, clobber, ibuf, addr, len); | 1236 | return paravirt_patch_default(type, clobber, ibuf, addr, len); |
1081 | 1237 | ||
@@ -1084,22 +1240,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | |||
1084 | return insn_len; | 1240 | return insn_len; |
1085 | } | 1241 | } |
1086 | 1242 | ||
1087 | /*G:029 Once we get to lguest_init(), we know we're a Guest. The various | 1243 | /*G:029 |
1244 | * Once we get to lguest_init(), we know we're a Guest. The various | ||
1088 | * pv_ops structures in the kernel provide points for (almost) every routine we | 1245 | * pv_ops structures in the kernel provide points for (almost) every routine we |
1089 | * have to override to avoid privileged instructions. */ | 1246 | * have to override to avoid privileged instructions. |
1247 | */ | ||
1090 | __init void lguest_init(void) | 1248 | __init void lguest_init(void) |
1091 | { | 1249 | { |
1092 | /* We're under lguest, paravirt is enabled, and we're running at | 1250 | /* We're under lguest. */ |
1093 | * privilege level 1, not 0 as normal. */ | ||
1094 | pv_info.name = "lguest"; | 1251 | pv_info.name = "lguest"; |
1252 | /* Paravirt is enabled. */ | ||
1095 | pv_info.paravirt_enabled = 1; | 1253 | pv_info.paravirt_enabled = 1; |
1254 | /* We're running at privilege level 1, not 0 as normal. */ | ||
1096 | pv_info.kernel_rpl = 1; | 1255 | pv_info.kernel_rpl = 1; |
1256 | /* Everyone except Xen runs with this set. */ | ||
1097 | pv_info.shared_kernel_pmd = 1; | 1257 | pv_info.shared_kernel_pmd = 1; |
1098 | 1258 | ||
1099 | /* We set up all the lguest overrides for sensitive operations. These | 1259 | /* |
1100 | * are detailed with the operations themselves. */ | 1260 | * We set up all the lguest overrides for sensitive operations. These |
1261 | * are detailed with the operations themselves. | ||
1262 | */ | ||
1101 | 1263 | ||
1102 | /* interrupt-related operations */ | 1264 | /* Interrupt-related operations */ |
1103 | pv_irq_ops.init_IRQ = lguest_init_IRQ; | 1265 | pv_irq_ops.init_IRQ = lguest_init_IRQ; |
1104 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); | 1266 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); |
1105 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); | 1267 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); |
@@ -1107,11 +1269,11 @@ __init void lguest_init(void) | |||
1107 | pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); | 1269 | pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); |
1108 | pv_irq_ops.safe_halt = lguest_safe_halt; | 1270 | pv_irq_ops.safe_halt = lguest_safe_halt; |
1109 | 1271 | ||
1110 | /* init-time operations */ | 1272 | /* Setup operations */ |
1111 | pv_init_ops.memory_setup = lguest_memory_setup; | 1273 | pv_init_ops.memory_setup = lguest_memory_setup; |
1112 | pv_init_ops.patch = lguest_patch; | 1274 | pv_init_ops.patch = lguest_patch; |
1113 | 1275 | ||
1114 | /* Intercepts of various cpu instructions */ | 1276 | /* Intercepts of various CPU instructions */ |
1115 | pv_cpu_ops.load_gdt = lguest_load_gdt; | 1277 | pv_cpu_ops.load_gdt = lguest_load_gdt; |
1116 | pv_cpu_ops.cpuid = lguest_cpuid; | 1278 | pv_cpu_ops.cpuid = lguest_cpuid; |
1117 | pv_cpu_ops.load_idt = lguest_load_idt; | 1279 | pv_cpu_ops.load_idt = lguest_load_idt; |
@@ -1132,7 +1294,7 @@ __init void lguest_init(void) | |||
1132 | pv_cpu_ops.start_context_switch = paravirt_start_context_switch; | 1294 | pv_cpu_ops.start_context_switch = paravirt_start_context_switch; |
1133 | pv_cpu_ops.end_context_switch = lguest_end_context_switch; | 1295 | pv_cpu_ops.end_context_switch = lguest_end_context_switch; |
1134 | 1296 | ||
1135 | /* pagetable management */ | 1297 | /* Pagetable management */ |
1136 | pv_mmu_ops.write_cr3 = lguest_write_cr3; | 1298 | pv_mmu_ops.write_cr3 = lguest_write_cr3; |
1137 | pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; | 1299 | pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; |
1138 | pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; | 1300 | pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; |
@@ -1154,54 +1316,71 @@ __init void lguest_init(void) | |||
1154 | pv_mmu_ops.pte_update_defer = lguest_pte_update; | 1316 | pv_mmu_ops.pte_update_defer = lguest_pte_update; |
1155 | 1317 | ||
1156 | #ifdef CONFIG_X86_LOCAL_APIC | 1318 | #ifdef CONFIG_X86_LOCAL_APIC |
1157 | /* apic read/write intercepts */ | 1319 | /* APIC read/write intercepts */ |
1158 | set_lguest_basic_apic_ops(); | 1320 | set_lguest_basic_apic_ops(); |
1159 | #endif | 1321 | #endif |
1160 | 1322 | ||
1161 | /* time operations */ | 1323 | /* Time operations */ |
1162 | pv_time_ops.get_wallclock = lguest_get_wallclock; | 1324 | pv_time_ops.get_wallclock = lguest_get_wallclock; |
1163 | pv_time_ops.time_init = lguest_time_init; | 1325 | pv_time_ops.time_init = lguest_time_init; |
1164 | pv_time_ops.get_tsc_khz = lguest_tsc_khz; | 1326 | pv_time_ops.get_tsc_khz = lguest_tsc_khz; |
1165 | 1327 | ||
1166 | /* Now is a good time to look at the implementations of these functions | 1328 | /* |
1167 | * before returning to the rest of lguest_init(). */ | 1329 | * Now is a good time to look at the implementations of these functions |
1330 | * before returning to the rest of lguest_init(). | ||
1331 | */ | ||
1168 | 1332 | ||
1169 | /*G:070 Now we've seen all the paravirt_ops, we return to | 1333 | /*G:070 |
1334 | * Now we've seen all the paravirt_ops, we return to | ||
1170 | * lguest_init() where the rest of the fairly chaotic boot setup | 1335 | * lguest_init() where the rest of the fairly chaotic boot setup |
1171 | * occurs. */ | 1336 | * occurs. |
1337 | */ | ||
1172 | 1338 | ||
1173 | /* The stack protector is a weird thing where gcc places a canary | 1339 | /* |
1340 | * The stack protector is a weird thing where gcc places a canary | ||
1174 | * value on the stack and then checks it on return. This file is | 1341 | * value on the stack and then checks it on return. This file is |
1175 | * compiled with -fno-stack-protector it, so we got this far without | 1342 | * compiled with -fno-stack-protector it, so we got this far without |
1176 | * problems. The value of the canary is kept at offset 20 from the | 1343 | * problems. The value of the canary is kept at offset 20 from the |
1177 | * %gs register, so we need to set that up before calling C functions | 1344 | * %gs register, so we need to set that up before calling C functions |
1178 | * in other files. */ | 1345 | * in other files. |
1346 | */ | ||
1179 | setup_stack_canary_segment(0); | 1347 | setup_stack_canary_segment(0); |
1180 | /* We could just call load_stack_canary_segment(), but we might as | 1348 | |
1181 | * call switch_to_new_gdt() which loads the whole table and sets up | 1349 | /* |
1182 | * the per-cpu segment descriptor register %fs as well. */ | 1350 | * We could just call load_stack_canary_segment(), but we might as well |
1351 | * call switch_to_new_gdt() which loads the whole table and sets up the | ||
1352 | * per-cpu segment descriptor register %fs as well. | ||
1353 | */ | ||
1183 | switch_to_new_gdt(0); | 1354 | switch_to_new_gdt(0); |
1184 | 1355 | ||
1185 | /* As described in head_32.S, we map the first 128M of memory. */ | 1356 | /* We actually boot with all memory mapped, but let's say 128MB. */ |
1186 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; | 1357 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; |
1187 | 1358 | ||
1188 | /* The Host<->Guest Switcher lives at the top of our address space, and | 1359 | /* |
1360 | * The Host<->Guest Switcher lives at the top of our address space, and | ||
1189 | * the Host told us how big it is when we made LGUEST_INIT hypercall: | 1361 | * the Host told us how big it is when we made LGUEST_INIT hypercall: |
1190 | * it put the answer in lguest_data.reserve_mem */ | 1362 | * it put the answer in lguest_data.reserve_mem |
1363 | */ | ||
1191 | reserve_top_address(lguest_data.reserve_mem); | 1364 | reserve_top_address(lguest_data.reserve_mem); |
1192 | 1365 | ||
1193 | /* If we don't initialize the lock dependency checker now, it crashes | 1366 | /* |
1194 | * paravirt_disable_iospace. */ | 1367 | * If we don't initialize the lock dependency checker now, it crashes |
1368 | * paravirt_disable_iospace. | ||
1369 | */ | ||
1195 | lockdep_init(); | 1370 | lockdep_init(); |
1196 | 1371 | ||
1197 | /* The IDE code spends about 3 seconds probing for disks: if we reserve | 1372 | /* |
1373 | * The IDE code spends about 3 seconds probing for disks: if we reserve | ||
1198 | * all the I/O ports up front it can't get them and so doesn't probe. | 1374 | * all the I/O ports up front it can't get them and so doesn't probe. |
1199 | * Other device drivers are similar (but less severe). This cuts the | 1375 | * Other device drivers are similar (but less severe). This cuts the |
1200 | * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ | 1376 | * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. |
1377 | */ | ||
1201 | paravirt_disable_iospace(); | 1378 | paravirt_disable_iospace(); |
1202 | 1379 | ||
1203 | /* This is messy CPU setup stuff which the native boot code does before | 1380 | /* |
1204 | * start_kernel, so we have to do, too: */ | 1381 | * This is messy CPU setup stuff which the native boot code does before |
1382 | * start_kernel, so we have to do, too: | ||
1383 | */ | ||
1205 | cpu_detect(&new_cpu_data); | 1384 | cpu_detect(&new_cpu_data); |
1206 | /* head.S usually sets up the first capability word, so do it here. */ | 1385 | /* head.S usually sets up the first capability word, so do it here. */ |
1207 | new_cpu_data.x86_capability[0] = cpuid_edx(1); | 1386 | new_cpu_data.x86_capability[0] = cpuid_edx(1); |
@@ -1218,22 +1397,28 @@ __init void lguest_init(void) | |||
1218 | acpi_ht = 0; | 1397 | acpi_ht = 0; |
1219 | #endif | 1398 | #endif |
1220 | 1399 | ||
1221 | /* We set the preferred console to "hvc". This is the "hypervisor | 1400 | /* |
1401 | * We set the preferred console to "hvc". This is the "hypervisor | ||
1222 | * virtual console" driver written by the PowerPC people, which we also | 1402 | * virtual console" driver written by the PowerPC people, which we also |
1223 | * adapted for lguest's use. */ | 1403 | * adapted for lguest's use. |
1404 | */ | ||
1224 | add_preferred_console("hvc", 0, NULL); | 1405 | add_preferred_console("hvc", 0, NULL); |
1225 | 1406 | ||
1226 | /* Register our very early console. */ | 1407 | /* Register our very early console. */ |
1227 | virtio_cons_early_init(early_put_chars); | 1408 | virtio_cons_early_init(early_put_chars); |
1228 | 1409 | ||
1229 | /* Last of all, we set the power management poweroff hook to point to | 1410 | /* |
1411 | * Last of all, we set the power management poweroff hook to point to | ||
1230 | * the Guest routine to power off, and the reboot hook to our restart | 1412 | * the Guest routine to power off, and the reboot hook to our restart |
1231 | * routine. */ | 1413 | * routine. |
1414 | */ | ||
1232 | pm_power_off = lguest_power_off; | 1415 | pm_power_off = lguest_power_off; |
1233 | machine_ops.restart = lguest_restart; | 1416 | machine_ops.restart = lguest_restart; |
1234 | 1417 | ||
1235 | /* Now we're set up, call i386_start_kernel() in head32.c and we proceed | 1418 | /* |
1236 | * to boot as normal. It never returns. */ | 1419 | * Now we're set up, call i386_start_kernel() in head32.c and we proceed |
1420 | * to boot as normal. It never returns. | ||
1421 | */ | ||
1237 | i386_start_kernel(); | 1422 | i386_start_kernel(); |
1238 | } | 1423 | } |
1239 | /* | 1424 | /* |
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index a9c8cfe61cd4..27eac0faee48 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S | |||
@@ -5,7 +5,8 @@ | |||
5 | #include <asm/thread_info.h> | 5 | #include <asm/thread_info.h> |
6 | #include <asm/processor-flags.h> | 6 | #include <asm/processor-flags.h> |
7 | 7 | ||
8 | /*G:020 Our story starts with the kernel booting into startup_32 in | 8 | /*G:020 |
9 | * Our story starts with the kernel booting into startup_32 in | ||
9 | * arch/x86/kernel/head_32.S. It expects a boot header, which is created by | 10 | * arch/x86/kernel/head_32.S. It expects a boot header, which is created by |
10 | * the bootloader (the Launcher in our case). | 11 | * the bootloader (the Launcher in our case). |
11 | * | 12 | * |
@@ -21,11 +22,14 @@ | |||
21 | * data without remembering to subtract __PAGE_OFFSET! | 22 | * data without remembering to subtract __PAGE_OFFSET! |
22 | * | 23 | * |
23 | * The .section line puts this code in .init.text so it will be discarded after | 24 | * The .section line puts this code in .init.text so it will be discarded after |
24 | * boot. */ | 25 | * boot. |
26 | */ | ||
25 | .section .init.text, "ax", @progbits | 27 | .section .init.text, "ax", @progbits |
26 | ENTRY(lguest_entry) | 28 | ENTRY(lguest_entry) |
27 | /* We make the "initialization" hypercall now to tell the Host about | 29 | /* |
28 | * us, and also find out where it put our page tables. */ | 30 | * We make the "initialization" hypercall now to tell the Host about |
31 | * us, and also find out where it put our page tables. | ||
32 | */ | ||
29 | movl $LHCALL_LGUEST_INIT, %eax | 33 | movl $LHCALL_LGUEST_INIT, %eax |
30 | movl $lguest_data - __PAGE_OFFSET, %ebx | 34 | movl $lguest_data - __PAGE_OFFSET, %ebx |
31 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ | 35 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ |
@@ -33,13 +37,14 @@ ENTRY(lguest_entry) | |||
33 | /* Set up the initial stack so we can run C code. */ | 37 | /* Set up the initial stack so we can run C code. */ |
34 | movl $(init_thread_union+THREAD_SIZE),%esp | 38 | movl $(init_thread_union+THREAD_SIZE),%esp |
35 | 39 | ||
36 | /* Jumps are relative, and we're running __PAGE_OFFSET too low at the | 40 | /* Jumps are relative: we're running __PAGE_OFFSET too low. */ |
37 | * moment. */ | ||
38 | jmp lguest_init+__PAGE_OFFSET | 41 | jmp lguest_init+__PAGE_OFFSET |
39 | 42 | ||
40 | /*G:055 We create a macro which puts the assembler code between lgstart_ and | 43 | /*G:055 |
41 | * lgend_ markers. These templates are put in the .text section: they can't be | 44 | * We create a macro which puts the assembler code between lgstart_ and lgend_ |
42 | * discarded after boot as we may need to patch modules, too. */ | 45 | * markers. These templates are put in the .text section: they can't be |
46 | * discarded after boot as we may need to patch modules, too. | ||
47 | */ | ||
43 | .text | 48 | .text |
44 | #define LGUEST_PATCH(name, insns...) \ | 49 | #define LGUEST_PATCH(name, insns...) \ |
45 | lgstart_##name: insns; lgend_##name:; \ | 50 | lgstart_##name: insns; lgend_##name:; \ |
@@ -48,83 +53,103 @@ ENTRY(lguest_entry) | |||
48 | LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) | 53 | LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) |
49 | LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) | 54 | LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) |
50 | 55 | ||
51 | /*G:033 But using those wrappers is inefficient (we'll see why that doesn't | 56 | /*G:033 |
52 | * matter for save_fl and irq_disable later). If we write our routines | 57 | * But using those wrappers is inefficient (we'll see why that doesn't matter |
53 | * carefully in assembler, we can avoid clobbering any registers and avoid | 58 | * for save_fl and irq_disable later). If we write our routines carefully in |
54 | * jumping through the wrapper functions. | 59 | * assembler, we can avoid clobbering any registers and avoid jumping through |
60 | * the wrapper functions. | ||
55 | * | 61 | * |
56 | * I skipped over our first piece of assembler, but this one is worth studying | 62 | * I skipped over our first piece of assembler, but this one is worth studying |
57 | * in a bit more detail so I'll describe in easy stages. First, the routine | 63 | * in a bit more detail so I'll describe in easy stages. First, the routine to |
58 | * to enable interrupts: */ | 64 | * enable interrupts: |
65 | */ | ||
59 | ENTRY(lg_irq_enable) | 66 | ENTRY(lg_irq_enable) |
60 | /* The reverse of irq_disable, this sets lguest_data.irq_enabled to | 67 | /* |
61 | * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ | 68 | * The reverse of irq_disable, this sets lguest_data.irq_enabled to |
69 | * X86_EFLAGS_IF (ie. "Interrupts enabled"). | ||
70 | */ | ||
62 | movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled | 71 | movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled |
63 | /* But now we need to check if the Host wants to know: there might have | 72 | /* |
73 | * But now we need to check if the Host wants to know: there might have | ||
64 | * been interrupts waiting to be delivered, in which case it will have | 74 | * been interrupts waiting to be delivered, in which case it will have |
65 | * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we | 75 | * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we |
66 | * jump to send_interrupts, otherwise we're done. */ | 76 | * jump to send_interrupts, otherwise we're done. |
77 | */ | ||
67 | testl $0, lguest_data+LGUEST_DATA_irq_pending | 78 | testl $0, lguest_data+LGUEST_DATA_irq_pending |
68 | jnz send_interrupts | 79 | jnz send_interrupts |
69 | /* One cool thing about x86 is that you can do many things without using | 80 | /* |
81 | * One cool thing about x86 is that you can do many things without using | ||
70 | * a register. In this case, the normal path hasn't needed to save or | 82 | * a register. In this case, the normal path hasn't needed to save or |
71 | * restore any registers at all! */ | 83 | * restore any registers at all! |
84 | */ | ||
72 | ret | 85 | ret |
73 | send_interrupts: | 86 | send_interrupts: |
74 | /* OK, now we need a register: eax is used for the hypercall number, | 87 | /* |
88 | * OK, now we need a register: eax is used for the hypercall number, | ||
75 | * which is LHCALL_SEND_INTERRUPTS. | 89 | * which is LHCALL_SEND_INTERRUPTS. |
76 | * | 90 | * |
77 | * We used not to bother with this pending detection at all, which was | 91 | * We used not to bother with this pending detection at all, which was |
78 | * much simpler. Sooner or later the Host would realize it had to | 92 | * much simpler. Sooner or later the Host would realize it had to |
79 | * send us an interrupt. But that turns out to make performance 7 | 93 | * send us an interrupt. But that turns out to make performance 7 |
80 | * times worse on a simple tcp benchmark. So now we do this the hard | 94 | * times worse on a simple tcp benchmark. So now we do this the hard |
81 | * way. */ | 95 | * way. |
96 | */ | ||
82 | pushl %eax | 97 | pushl %eax |
83 | movl $LHCALL_SEND_INTERRUPTS, %eax | 98 | movl $LHCALL_SEND_INTERRUPTS, %eax |
84 | /* This is a vmcall instruction (same thing that KVM uses). Older | 99 | /* |
100 | * This is a vmcall instruction (same thing that KVM uses). Older | ||
85 | * assembler versions might not know the "vmcall" instruction, so we | 101 | * assembler versions might not know the "vmcall" instruction, so we |
86 | * create one manually here. */ | 102 | * create one manually here. |
103 | */ | ||
87 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ | 104 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ |
105 | /* Put eax back the way we found it. */ | ||
88 | popl %eax | 106 | popl %eax |
89 | ret | 107 | ret |
90 | 108 | ||
91 | /* Finally, the "popf" or "restore flags" routine. The %eax register holds the | 109 | /* |
110 | * Finally, the "popf" or "restore flags" routine. The %eax register holds the | ||
92 | * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're | 111 | * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're |
93 | * enabling interrupts again, if it's 0 we're leaving them off. */ | 112 | * enabling interrupts again, if it's 0 we're leaving them off. |
113 | */ | ||
94 | ENTRY(lg_restore_fl) | 114 | ENTRY(lg_restore_fl) |
95 | /* This is just "lguest_data.irq_enabled = flags;" */ | 115 | /* This is just "lguest_data.irq_enabled = flags;" */ |
96 | movl %eax, lguest_data+LGUEST_DATA_irq_enabled | 116 | movl %eax, lguest_data+LGUEST_DATA_irq_enabled |
97 | /* Now, if the %eax value has enabled interrupts and | 117 | /* |
118 | * Now, if the %eax value has enabled interrupts and | ||
98 | * lguest_data.irq_pending is set, we want to tell the Host so it can | 119 | * lguest_data.irq_pending is set, we want to tell the Host so it can |
99 | * deliver any outstanding interrupts. Fortunately, both values will | 120 | * deliver any outstanding interrupts. Fortunately, both values will |
100 | * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" | 121 | * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" |
101 | * instruction will AND them together for us. If both are set, we | 122 | * instruction will AND them together for us. If both are set, we |
102 | * jump to send_interrupts. */ | 123 | * jump to send_interrupts. |
124 | */ | ||
103 | testl lguest_data+LGUEST_DATA_irq_pending, %eax | 125 | testl lguest_data+LGUEST_DATA_irq_pending, %eax |
104 | jnz send_interrupts | 126 | jnz send_interrupts |
105 | /* Again, the normal path has used no extra registers. Clever, huh? */ | 127 | /* Again, the normal path has used no extra registers. Clever, huh? */ |
106 | ret | 128 | ret |
129 | /*:*/ | ||
107 | 130 | ||
108 | /* These demark the EIP range where host should never deliver interrupts. */ | 131 | /* These demark the EIP range where host should never deliver interrupts. */ |
109 | .global lguest_noirq_start | 132 | .global lguest_noirq_start |
110 | .global lguest_noirq_end | 133 | .global lguest_noirq_end |
111 | 134 | ||
112 | /*M:004 When the Host reflects a trap or injects an interrupt into the Guest, | 135 | /*M:004 |
113 | * it sets the eflags interrupt bit on the stack based on | 136 | * When the Host reflects a trap or injects an interrupt into the Guest, it |
114 | * lguest_data.irq_enabled, so the Guest iret logic does the right thing when | 137 | * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, |
115 | * restoring it. However, when the Host sets the Guest up for direct traps, | 138 | * so the Guest iret logic does the right thing when restoring it. However, |
116 | * such as system calls, the processor is the one to push eflags onto the | 139 | * when the Host sets the Guest up for direct traps, such as system calls, the |
117 | * stack, and the interrupt bit will be 1 (in reality, interrupts are always | 140 | * processor is the one to push eflags onto the stack, and the interrupt bit |
118 | * enabled in the Guest). | 141 | * will be 1 (in reality, interrupts are always enabled in the Guest). |
119 | * | 142 | * |
120 | * This turns out to be harmless: the only trap which should happen under Linux | 143 | * This turns out to be harmless: the only trap which should happen under Linux |
121 | * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc | 144 | * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc |
122 | * regions), which has to be reflected through the Host anyway. If another | 145 | * regions), which has to be reflected through the Host anyway. If another |
123 | * trap *does* go off when interrupts are disabled, the Guest will panic, and | 146 | * trap *does* go off when interrupts are disabled, the Guest will panic, and |
124 | * we'll never get to this iret! :*/ | 147 | * we'll never get to this iret! |
148 | :*/ | ||
125 | 149 | ||
126 | /*G:045 There is one final paravirt_op that the Guest implements, and glancing | 150 | /*G:045 |
127 | * at it you can see why I left it to last. It's *cool*! It's in *assembler*! | 151 | * There is one final paravirt_op that the Guest implements, and glancing at it |
152 | * you can see why I left it to last. It's *cool*! It's in *assembler*! | ||
128 | * | 153 | * |
129 | * The "iret" instruction is used to return from an interrupt or trap. The | 154 | * The "iret" instruction is used to return from an interrupt or trap. The |
130 | * stack looks like this: | 155 | * stack looks like this: |
@@ -148,15 +173,18 @@ ENTRY(lg_restore_fl) | |||
148 | * return to userspace or wherever. Our solution to this is to surround the | 173 | * return to userspace or wherever. Our solution to this is to surround the |
149 | * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the | 174 | * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the |
150 | * Host that it is *never* to interrupt us there, even if interrupts seem to be | 175 | * Host that it is *never* to interrupt us there, even if interrupts seem to be |
151 | * enabled. */ | 176 | * enabled. |
177 | */ | ||
152 | ENTRY(lguest_iret) | 178 | ENTRY(lguest_iret) |
153 | pushl %eax | 179 | pushl %eax |
154 | movl 12(%esp), %eax | 180 | movl 12(%esp), %eax |
155 | lguest_noirq_start: | 181 | lguest_noirq_start: |
156 | /* Note the %ss: segment prefix here. Normal data accesses use the | 182 | /* |
183 | * Note the %ss: segment prefix here. Normal data accesses use the | ||
157 | * "ds" segment, but that will have already been restored for whatever | 184 | * "ds" segment, but that will have already been restored for whatever |
158 | * we're returning to (such as userspace): we can't trust it. The %ss: | 185 | * we're returning to (such as userspace): we can't trust it. The %ss: |
159 | * prefix makes sure we use the stack segment, which is still valid. */ | 186 | * prefix makes sure we use the stack segment, which is still valid. |
187 | */ | ||
160 | movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled | 188 | movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled |
161 | popl %eax | 189 | popl %eax |
162 | iret | 190 | iret |