aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Rusty Russell <rusty@rustcorp.com.au> 2007-10-21 21:03:36 -0400
committer: Rusty Russell <rusty@rustcorp.com.au> 2007-10-23 01:49:54 -0400
commit: 47436aa4ad054c1c7c8231618e86ebd9305308dc (patch)
tree: a9ba6e0521f9116442144a86e781a3164ec86094
parent: c18acd73ffc209def08003a1927473096f66c5ad (diff)
Boot with virtual == physical to get closer to native Linux.
1) This allows us to get a lot closer to booting bzImages. 2) It means we don't have to know page_offset. 3) The Guest needs to modify the boot pagetables to create the PAGE_OFFSET mapping before jumping to C code. 4) guest_pa() walks the page tables rather than using page_offset. 5) We don't use page_offset to figure out whether to emulate: it was always kinda questionable, and won't work for instructions done before remapping (bzImage unpacking in particular). 6) We still want the kernel address for tlb flushing: have the initial hypercall give us that, too. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
-rw-r--r-- Documentation/lguest/lguest.c 134
-rw-r--r-- arch/x86/kernel/asm-offsets_32.c 1
-rw-r--r-- arch/x86/lguest/boot.c 7
-rw-r--r-- arch/x86/lguest/i386_head.S 41
-rw-r--r-- drivers/lguest/hypercalls.c 8
-rw-r--r-- drivers/lguest/interrupts_and_traps.c 13
-rw-r--r-- drivers/lguest/lg.h 8
-rw-r--r-- drivers/lguest/lguest_user.c 11
-rw-r--r-- drivers/lguest/page_tables.c 47
-rw-r--r-- drivers/lguest/x86/core.c 7
-rw-r--r-- include/asm-x86/lguest_hcall.h 7
-rw-r--r-- include/linux/lguest.h 5
12 files changed, 141 insertions, 148 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 4950b03514e6..32c2eaf94c4d 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -178,19 +178,16 @@ static void *get_pages(unsigned int num)
178/* To find out where to start we look for the magic Guest string, which marks 178/* To find out where to start we look for the magic Guest string, which marks
179 * the code we see in lguest_asm.S. This is a hack which we are currently 179 * the code we see in lguest_asm.S. This is a hack which we are currently
180 * plotting to replace with the normal Linux entry point. */ 180 * plotting to replace with the normal Linux entry point. */
181static unsigned long entry_point(const void *start, const void *end, 181static unsigned long entry_point(const void *start, const void *end)
182 unsigned long page_offset)
183{ 182{
184 const void *p; 183 const void *p;
185 184
186 /* The scan gives us the physical starting address. We want the 185 /* The scan gives us the physical starting address. We boot with
187 * virtual address in this case, and fortunately, we already figured 186 * pagetables set up with virtual and physical the same, so that's
188 * out the physical-virtual difference and passed it here in 187 * OK. */
189 * "page_offset". */
190 for (p = start; p < end; p++) 188 for (p = start; p < end; p++)
191 if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0) 189 if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
192 return to_guest_phys(p + strlen("GenuineLguest")) 190 return to_guest_phys(p + strlen("GenuineLguest"));
193 + page_offset;
194 191
195 errx(1, "Is this image a genuine lguest?"); 192 errx(1, "Is this image a genuine lguest?");
196} 193}
@@ -224,14 +221,11 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
224 * by all modern binaries on Linux including the kernel. 221 * by all modern binaries on Linux including the kernel.
225 * 222 *
226 * The ELF headers give *two* addresses: a physical address, and a virtual 223 * The ELF headers give *two* addresses: a physical address, and a virtual
227 * address. The Guest kernel expects to be placed in memory at the physical 224 * address. We use the physical address; the Guest will map itself to the
228 * address, and the page tables set up so it will correspond to that virtual 225 * virtual address.
229 * address. We return the difference between the virtual and physical
230 * addresses in the "page_offset" pointer.
231 * 226 *
232 * We return the starting address. */ 227 * We return the starting address. */
233static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, 228static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
234 unsigned long *page_offset)
235{ 229{
236 void *start = (void *)-1, *end = NULL; 230 void *start = (void *)-1, *end = NULL;
237 Elf32_Phdr phdr[ehdr->e_phnum]; 231 Elf32_Phdr phdr[ehdr->e_phnum];
@@ -255,9 +249,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
255 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) 249 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
256 err(1, "Reading program headers"); 250 err(1, "Reading program headers");
257 251
258 /* We don't know page_offset yet. */
259 *page_offset = 0;
260
261 /* Try all the headers: there are usually only three. A read-only one, 252 /* Try all the headers: there are usually only three. A read-only one,
262 * a read-write one, and a "note" section which isn't loadable. */ 253 * a read-write one, and a "note" section which isn't loadable. */
263 for (i = 0; i < ehdr->e_phnum; i++) { 254 for (i = 0; i < ehdr->e_phnum; i++) {
@@ -268,14 +259,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
268 verbose("Section %i: size %i addr %p\n", 259 verbose("Section %i: size %i addr %p\n",
269 i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); 260 i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
270 261
271 /* We expect a simple linear address space: every segment must
272 * have the same difference between virtual (p_vaddr) and
273 * physical (p_paddr) address. */
274 if (!*page_offset)
275 *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
276 else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
277 errx(1, "Page offset of section %i different", i);
278
279 /* We track the first and last address we mapped, so we can 262 /* We track the first and last address we mapped, so we can
280 * tell entry_point() where to scan. */ 263 * tell entry_point() where to scan. */
281 if (from_guest_phys(phdr[i].p_paddr) < start) 264 if (from_guest_phys(phdr[i].p_paddr) < start)
@@ -288,50 +271,13 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
288 phdr[i].p_offset, phdr[i].p_filesz); 271 phdr[i].p_offset, phdr[i].p_filesz);
289 } 272 }
290 273
291 return entry_point(start, end, *page_offset); 274 return entry_point(start, end);
292}
293
294/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated.
295 *
296 * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects
297 * to be. We don't know what that option was, but we can figure it out
298 * approximately by looking at the addresses in the code. I chose the common
299 * case of reading a memory location into the %eax register:
300 *
301 * movl <some-address>, %eax
302 *
303 * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example,
304 * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
305 *
306 * In this example can guess that the kernel was compiled with
307 * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the
308 * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
309 * kernel isn't that bloated yet.
310 *
311 * Unfortunately, x86 has variable-length instructions, so finding this
312 * particular instruction properly involves writing a disassembler. Instead,
313 * we rely on statistics. We look for "0xA1" and tally the different bytes
314 * which occur 4 bytes later (the "0xC0" in our example above). When one of
315 * those bytes appears three times, we can be reasonably confident that it
316 * forms the start of CONFIG_PAGE_OFFSET.
317 *
318 * This is amazingly reliable. */
319static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
320{
321 unsigned int i, possibilities[256] = { 0 };
322
323 for (i = 0; i + 4 < len; i++) {
324 /* mov 0xXXXXXXXX,%eax */
325 if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
326 return (unsigned long)img[i+4] << 24;
327 }
328 errx(1, "could not determine page offset");
329} 275}
330 276
331/*L:160 Unfortunately the entire ELF image isn't compressed: the segments 277/*L:160 Unfortunately the entire ELF image isn't compressed: the segments
332 * which need loading are extracted and compressed raw. This denies us the 278 * which need loading are extracted and compressed raw. This denies us the
333 * information we need to make a fully-general loader. */ 279 * information we need to make a fully-general loader. */
334static unsigned long unpack_bzimage(int fd, unsigned long *page_offset) 280static unsigned long unpack_bzimage(int fd)
335{ 281{
336 gzFile f; 282 gzFile f;
337 int ret, len = 0; 283 int ret, len = 0;
@@ -352,12 +298,7 @@ static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
352 298
353 verbose("Unpacked size %i addr %p\n", len, img); 299 verbose("Unpacked size %i addr %p\n", len, img);
354 300
355 /* Without the ELF header, we can't tell virtual-physical gap. This is 301 return entry_point(img, img + len);
356 * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately,
357 * I have a clever way of figuring it out from the code itself. */
358 *page_offset = intuit_page_offset(img, len);
359
360 return entry_point(img, img + len, *page_offset);
361} 302}
362 303
363/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're 304/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
@@ -368,7 +309,7 @@ static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
368 * The bzImage is formed by putting the decompressing code in front of the 309 * The bzImage is formed by putting the decompressing code in front of the
369 * compressed kernel code. So we can simple scan through it looking for the 310 * compressed kernel code. So we can simple scan through it looking for the
370 * first "gzip" header, and start decompressing from there. */ 311 * first "gzip" header, and start decompressing from there. */
371static unsigned long load_bzimage(int fd, unsigned long *page_offset) 312static unsigned long load_bzimage(int fd)
372{ 313{
373 unsigned char c; 314 unsigned char c;
374 int state = 0; 315 int state = 0;
@@ -396,7 +337,7 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset)
396 if (c != 0x03) 337 if (c != 0x03)
397 state = -1; 338 state = -1;
398 else 339 else
399 return unpack_bzimage(fd, page_offset); 340 return unpack_bzimage(fd);
400 } 341 }
401 } 342 }
402 errx(1, "Could not find kernel in bzImage"); 343 errx(1, "Could not find kernel in bzImage");
@@ -405,7 +346,7 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset)
405/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels 346/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
406 * come wrapped up in the self-decompressing "bzImage" format. With some funky 347 * come wrapped up in the self-decompressing "bzImage" format. With some funky
407 * coding, we can load those, too. */ 348 * coding, we can load those, too. */
408static unsigned long load_kernel(int fd, unsigned long *page_offset) 349static unsigned long load_kernel(int fd)
409{ 350{
410 Elf32_Ehdr hdr; 351 Elf32_Ehdr hdr;
411 352
@@ -415,10 +356,10 @@ static unsigned long load_kernel(int fd, unsigned long *page_offset)
415 356
416 /* If it's an ELF file, it starts with "\177ELF" */ 357 /* If it's an ELF file, it starts with "\177ELF" */
417 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) 358 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
418 return map_elf(fd, &hdr, page_offset); 359 return map_elf(fd, &hdr);
419 360
420 /* Otherwise we assume it's a bzImage, and try to unpack it */ 361 /* Otherwise we assume it's a bzImage, and try to unpack it */
421 return load_bzimage(fd, page_offset); 362 return load_bzimage(fd);
422} 363}
423 364
424/* This is a trivial little helper to align pages. Andi Kleen hated it because 365/* This is a trivial little helper to align pages. Andi Kleen hated it because
@@ -463,27 +404,20 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
463 return len; 404 return len;
464} 405}
465 406
466/* Once we know the address the Guest kernel expects, we can construct simple 407/* Once we know how much memory we have, we can construct simple linear page
467 * linear page tables for all of memory which will get the Guest far enough 408 * tables which set virtual == physical which will get the Guest far enough
468 * into the boot to create its own. 409 * into the boot to create its own.
469 * 410 *
470 * We lay them out of the way, just below the initrd (which is why we need to 411 * We lay them out of the way, just below the initrd (which is why we need to
471 * know its size). */ 412 * know its size). */
472static unsigned long setup_pagetables(unsigned long mem, 413static unsigned long setup_pagetables(unsigned long mem,
473 unsigned long initrd_size, 414 unsigned long initrd_size)
474 unsigned long page_offset)
475{ 415{
476 unsigned long *pgdir, *linear; 416 unsigned long *pgdir, *linear;
477 unsigned int mapped_pages, i, linear_pages; 417 unsigned int mapped_pages, i, linear_pages;
478 unsigned int ptes_per_page = getpagesize()/sizeof(void *); 418 unsigned int ptes_per_page = getpagesize()/sizeof(void *);
479 419
480 /* Ideally we map all physical memory starting at page_offset. 420 mapped_pages = mem/getpagesize();
481 * However, if page_offset is 0xC0000000 we can only map 1G of physical
482 * (0xC0000000 + 1G overflows). */
483 if (mem <= -page_offset)
484 mapped_pages = mem/getpagesize();
485 else
486 mapped_pages = -page_offset/getpagesize();
487 421
488 /* Each PTE page can map ptes_per_page pages: how many do we need? */ 422 /* Each PTE page can map ptes_per_page pages: how many do we need? */
489 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; 423 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
@@ -500,11 +434,9 @@ static unsigned long setup_pagetables(unsigned long mem,
500 for (i = 0; i < mapped_pages; i++) 434 for (i = 0; i < mapped_pages; i++)
501 linear[i] = ((i * getpagesize()) | PAGE_PRESENT); 435 linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
502 436
503 /* The top level points to the linear page table pages above. The 437 /* The top level points to the linear page table pages above. */
504 * entry representing page_offset points to the first one, and they
505 * continue from there. */
506 for (i = 0; i < mapped_pages; i += ptes_per_page) { 438 for (i = 0; i < mapped_pages; i += ptes_per_page) {
507 pgdir[(i + page_offset/getpagesize())/ptes_per_page] 439 pgdir[i/ptes_per_page]
508 = ((to_guest_phys(linear) + i*sizeof(void *)) 440 = ((to_guest_phys(linear) + i*sizeof(void *))
509 | PAGE_PRESENT); 441 | PAGE_PRESENT);
510 } 442 }
@@ -535,15 +467,12 @@ static void concat(char *dst, char *args[])
535/* This is where we actually tell the kernel to initialize the Guest. We saw 467/* This is where we actually tell the kernel to initialize the Guest. We saw
536 * the arguments it expects when we looked at initialize() in lguest_user.c: 468 * the arguments it expects when we looked at initialize() in lguest_user.c:
537 * the base of guest "physical" memory, the top physical page to allow, the 469 * the base of guest "physical" memory, the top physical page to allow, the
538 * top level pagetable, the entry point and the page_offset constant for the 470 * top level pagetable and the entry point for the Guest. */
539 * Guest. */ 471static int tell_kernel(unsigned long pgdir, unsigned long start)
540static int tell_kernel(unsigned long pgdir, unsigned long start,
541 unsigned long page_offset)
542{ 472{
543 unsigned long args[] = { LHREQ_INITIALIZE, 473 unsigned long args[] = { LHREQ_INITIALIZE,
544 (unsigned long)guest_base, 474 (unsigned long)guest_base,
545 guest_limit / getpagesize(), 475 guest_limit / getpagesize(), pgdir, start };
546 pgdir, start, page_offset };
547 int fd; 476 int fd;
548 477
549 verbose("Guest: %p - %p (%#lx)\n", 478 verbose("Guest: %p - %p (%#lx)\n",
@@ -1424,9 +1353,9 @@ static void usage(void)
1424/*L:105 The main routine is where the real work begins: */ 1353/*L:105 The main routine is where the real work begins: */
1425int main(int argc, char *argv[]) 1354int main(int argc, char *argv[])
1426{ 1355{
1427 /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size 1356 /* Memory, top-level pagetable, code startpoint and size of the
1428 * of the (optional) initrd. */ 1357 * (optional) initrd. */
1429 unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0; 1358 unsigned long mem = 0, pgdir, start, initrd_size = 0;
1430 /* A temporary and the /dev/lguest file descriptor. */ 1359 /* A temporary and the /dev/lguest file descriptor. */
1431 int i, c, lguest_fd; 1360 int i, c, lguest_fd;
1432 /* The list of Guest devices, based on command line arguments. */ 1361 /* The list of Guest devices, based on command line arguments. */
@@ -1500,8 +1429,7 @@ int main(int argc, char *argv[])
1500 setup_console(&device_list); 1429 setup_console(&device_list);
1501 1430
1502 /* Now we load the kernel */ 1431 /* Now we load the kernel */
1503 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), 1432 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
1504 &page_offset);
1505 1433
1506 /* Boot information is stashed at physical address 0 */ 1434 /* Boot information is stashed at physical address 0 */
1507 boot = from_guest_phys(0); 1435 boot = from_guest_phys(0);
@@ -1518,7 +1446,7 @@ int main(int argc, char *argv[])
1518 } 1446 }
1519 1447
1520 /* Set up the initial linear pagetables, starting below the initrd. */ 1448 /* Set up the initial linear pagetables, starting below the initrd. */
1521 pgdir = setup_pagetables(mem, initrd_size, page_offset); 1449 pgdir = setup_pagetables(mem, initrd_size);
1522 1450
1523 /* The Linux boot header contains an "E820" memory map: ours is a 1451 /* The Linux boot header contains an "E820" memory map: ours is a
1524 * simple, single region. */ 1452 * simple, single region. */
@@ -1535,7 +1463,7 @@ int main(int argc, char *argv[])
1535 1463
1536 /* We tell the kernel to initialize the Guest: this returns the open 1464 /* We tell the kernel to initialize the Guest: this returns the open
1537 * /dev/lguest file descriptor. */ 1465 * /dev/lguest file descriptor. */
1538 lguest_fd = tell_kernel(pgdir, start, page_offset); 1466 lguest_fd = tell_kernel(pgdir, start);
1539 1467
1540 /* We fork off a child process, which wakes the Launcher whenever one 1468 /* We fork off a child process, which wakes the Launcher whenever one
1541 * of the input file descriptors needs attention. Otherwise we would 1469 * of the input file descriptors needs attention. Otherwise we would
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index f8764716b0c0..0e45981b2dd7 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -136,6 +136,7 @@ void foo(void)
136#ifdef CONFIG_LGUEST_GUEST 136#ifdef CONFIG_LGUEST_GUEST
137 BLANK(); 137 BLANK();
138 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 138 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
139 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
139 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); 140 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
140 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); 141 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
141 OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); 142 OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 3a06b51c98ad..090f30cbf24c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -86,6 +86,7 @@ struct lguest_data lguest_data = {
86 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, 86 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
87 .noirq_start = (u32)lguest_noirq_start, 87 .noirq_start = (u32)lguest_noirq_start,
88 .noirq_end = (u32)lguest_noirq_end, 88 .noirq_end = (u32)lguest_noirq_end,
89 .kernel_address = PAGE_OFFSET,
89 .blocked_interrupts = { 1 }, /* Block timer interrupts */ 90 .blocked_interrupts = { 1 }, /* Block timer interrupts */
90 .syscall_vec = SYSCALL_VECTOR, 91 .syscall_vec = SYSCALL_VECTOR,
91}; 92};
@@ -1033,11 +1034,7 @@ __init void lguest_init(void *boot)
1033 1034
1034 /*G:070 Now we've seen all the paravirt_ops, we return to 1035 /*G:070 Now we've seen all the paravirt_ops, we return to
1035 * lguest_init() where the rest of the fairly chaotic boot setup 1036 * lguest_init() where the rest of the fairly chaotic boot setup
1036 * occurs. 1037 * occurs. */
1037 *
1038 * The Host expects our first hypercall to tell it where our "struct
1039 * lguest_data" is, so we do that first. */
1040 hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
1041 1038
1042 /* The native boot code sets up initial page tables immediately after 1039 /* The native boot code sets up initial page tables immediately after
1043 * the kernel itself, and sets init_pg_tables_end so they're not 1040 * the kernel itself, and sets init_pg_tables_end so they're not
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 6d7a74f07c41..ba4282eba5bf 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -1,5 +1,6 @@
1#include <linux/linkage.h> 1#include <linux/linkage.h>
2#include <linux/lguest.h> 2#include <linux/lguest.h>
3#include <asm/lguest_hcall.h>
3#include <asm/asm-offsets.h> 4#include <asm/asm-offsets.h>
4#include <asm/thread_info.h> 5#include <asm/thread_info.h>
5#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
@@ -8,18 +9,48 @@
8 * looks for. The plan is that the Linux boot protocol will be extended with a 9 * looks for. The plan is that the Linux boot protocol will be extended with a
9 * "platform type" field which will guide us here from the normal entry point, 10 * "platform type" field which will guide us here from the normal entry point,
10 * but for the moment this suffices. The normal boot code uses %esi for the 11 * but for the moment this suffices. The normal boot code uses %esi for the
11 * boot header, so we do too. We convert it to a virtual address by adding 12 * boot header, so we do too.
12 * PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax). 13 *
14 * WARNING: be very careful here! We're running at addresses equal to physical
15 * addesses (around 0), not above PAGE_OFFSET as most code expectes
16 * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
17 * data.
13 * 18 *
14 * The .section line puts this code in .init.text so it will be discarded after 19 * The .section line puts this code in .init.text so it will be discarded after
15 * boot. */ 20 * boot. */
16.section .init.text, "ax", @progbits 21.section .init.text, "ax", @progbits
17.ascii "GenuineLguest" 22.ascii "GenuineLguest"
18 /* Set up initial stack. */ 23 /* Make initial hypercall now, so we can set up the pagetables. */
19 movl $(init_thread_union+THREAD_SIZE),%esp 24 movl $LHCALL_LGUEST_INIT, %eax
25 movl $lguest_data - __PAGE_OFFSET, %edx
26 int $LGUEST_TRAP_ENTRY
27
28 /* Set up boot information pointer to hand to lguest_init(): it wants
29 * a virtual address. */
20 movl %esi, %eax 30 movl %esi, %eax
21 addl $__PAGE_OFFSET, %eax 31 addl $__PAGE_OFFSET, %eax
22 jmp lguest_init 32
33 /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
34 * instruction uses %esi, so we needed to save it above. */
35 movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
36
37 /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
38 * This means the first 128M of kernel memory will be mapped at
39 * PAGE_OFFSET where the kernel expects to run. This will get it far
40 * enough through boot to switch to its own pagetables. */
41 movl $32, %ecx
42 movl %esi, %edi
43 addl $((__PAGE_OFFSET >> 22) * 4), %edi
44 rep
45 movsl
46
47 /* Set up the initial stack so we can run C code. */
48 movl $(init_thread_union+THREAD_SIZE),%esp
49
50
51 /* Jumps are relative, and we're running __PAGE_OFFSET too low at the
52 * moment. */
53 jmp lguest_init+__PAGE_OFFSET
23 54
24/*G:055 We create a macro which puts the assembler code between lgstart_ and 55/*G:055 We create a macro which puts the assembler code between lgstart_ and
25 * lgend_ markers. These templates are put in the .text section: they can't be 56 * lgend_ markers. These templates are put in the .text section: they can't be
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 02d0ae268267..13b5f2f813de 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -181,15 +181,15 @@ static void initialize(struct lguest *lg)
181 /* The Guest tells us where we're not to deliver interrupts by putting 181 /* The Guest tells us where we're not to deliver interrupts by putting
182 * the range of addresses into "struct lguest_data". */ 182 * the range of addresses into "struct lguest_data". */
183 if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) 183 if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
184 || get_user(lg->noirq_end, &lg->lguest_data->noirq_end) 184 || get_user(lg->noirq_end, &lg->lguest_data->noirq_end))
185 /* We tell the Guest that it can't use the top 4MB of virtual
186 * addresses used by the Switcher. */
187 || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem))
188 kill_guest(lg, "bad guest page %p", lg->lguest_data); 185 kill_guest(lg, "bad guest page %p", lg->lguest_data);
189 186
190 /* We write the current time into the Guest's data page once now. */ 187 /* We write the current time into the Guest's data page once now. */
191 write_timestamp(lg); 188 write_timestamp(lg);
192 189
190 /* page_tables.c will also do some setup. */
191 page_table_guest_data_init(lg);
192
193 /* This is the one case where the above accesses might have been the 193 /* This is the one case where the above accesses might have been the
194 * first write to a Guest page. This may have caused a copy-on-write 194 * first write to a Guest page. This may have caused a copy-on-write
195 * fault, but the Guest might be referring to the old (read-only) 195 * fault, but the Guest might be referring to the old (read-only)
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index a57d757eab6e..3271c0031a1b 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -62,8 +62,9 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
62 * it). */ 62 * it). */
63static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) 63static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
64{ 64{
65 unsigned long gstack; 65 unsigned long gstack, origstack;
66 u32 eflags, ss, irq_enable; 66 u32 eflags, ss, irq_enable;
67 unsigned long virtstack;
67 68
68 /* There are two cases for interrupts: one where the Guest is already 69 /* There are two cases for interrupts: one where the Guest is already
69 * in the kernel, and a more complex one where the Guest is in 70 * in the kernel, and a more complex one where the Guest is in
@@ -71,8 +72,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
71 if ((lg->regs->ss&0x3) != GUEST_PL) { 72 if ((lg->regs->ss&0x3) != GUEST_PL) {
72 /* The Guest told us their kernel stack with the SET_STACK 73 /* The Guest told us their kernel stack with the SET_STACK
73 * hypercall: both the virtual address and the segment */ 74 * hypercall: both the virtual address and the segment */
74 gstack = guest_pa(lg, lg->esp1); 75 virtstack = lg->esp1;
75 ss = lg->ss1; 76 ss = lg->ss1;
77
78 origstack = gstack = guest_pa(lg, virtstack);
76 /* We push the old stack segment and pointer onto the new 79 /* We push the old stack segment and pointer onto the new
77 * stack: when the Guest does an "iret" back from the interrupt 80 * stack: when the Guest does an "iret" back from the interrupt
78 * handler the CPU will notice they're dropping privilege 81 * handler the CPU will notice they're dropping privilege
@@ -81,8 +84,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
81 push_guest_stack(lg, &gstack, lg->regs->esp); 84 push_guest_stack(lg, &gstack, lg->regs->esp);
82 } else { 85 } else {
83 /* We're staying on the same Guest (kernel) stack. */ 86 /* We're staying on the same Guest (kernel) stack. */
84 gstack = guest_pa(lg, lg->regs->esp); 87 virtstack = lg->regs->esp;
85 ss = lg->regs->ss; 88 ss = lg->regs->ss;
89
90 origstack = gstack = guest_pa(lg, virtstack);
86 } 91 }
87 92
88 /* Remember that we never let the Guest actually disable interrupts, so 93 /* Remember that we never let the Guest actually disable interrupts, so
@@ -108,7 +113,7 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
108 /* Now we've pushed all the old state, we change the stack, the code 113 /* Now we've pushed all the old state, we change the stack, the code
109 * segment and the address to execute. */ 114 * segment and the address to execute. */
110 lg->regs->ss = ss; 115 lg->regs->ss = ss;
111 lg->regs->esp = gstack + lg->page_offset; 116 lg->regs->esp = virtstack + (gstack - origstack);
112 lg->regs->cs = (__KERNEL_CS|GUEST_PL); 117 lg->regs->cs = (__KERNEL_CS|GUEST_PL);
113 lg->regs->eip = idt_address(lo, hi); 118 lg->regs->eip = idt_address(lo, hi);
114 119
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 7408cebe995e..e4845d7f0688 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -63,7 +63,7 @@ struct lguest
63 /* This provides the offset to the base of guest-physical 63 /* This provides the offset to the base of guest-physical
64 * memory in the Launcher. */ 64 * memory in the Launcher. */
65 void __user *mem_base; 65 void __user *mem_base;
66 u32 page_offset; 66 unsigned long kernel_address;
67 u32 cr2; 67 u32 cr2;
68 int halted; 68 int halted;
69 int ts; 69 int ts;
@@ -165,6 +165,8 @@ void guest_set_pte(struct lguest *lg, unsigned long gpgdir,
165void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); 165void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
166int demand_page(struct lguest *info, unsigned long cr2, int errcode); 166int demand_page(struct lguest *info, unsigned long cr2, int errcode);
167void pin_page(struct lguest *lg, unsigned long vaddr); 167void pin_page(struct lguest *lg, unsigned long vaddr);
168unsigned long guest_pa(struct lguest *lg, unsigned long vaddr);
169void page_table_guest_data_init(struct lguest *lg);
168 170
169/* <arch>/core.c: */ 171/* <arch>/core.c: */
170void lguest_arch_host_init(void); 172void lguest_arch_host_init(void);
@@ -229,9 +231,5 @@ do { \
229} while(0) 231} while(0)
230/* (End of aside) :*/ 232/* (End of aside) :*/
231 233
232static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
233{
234 return vaddr - lg->page_offset;
235}
236#endif /* __ASSEMBLY__ */ 234#endif /* __ASSEMBLY__ */
237#endif /* _LGUEST_H */ 235#endif /* _LGUEST_H */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index b184652e45d7..61b177e1e649 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -111,7 +111,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
111 return run_guest(lg, (unsigned long __user *)user); 111 return run_guest(lg, (unsigned long __user *)user);
112} 112}
113 113
114/*L:020 The initialization write supplies 5 pointer sized (32 or 64 bit) 114/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit)
115 * values (in addition to the LHREQ_INITIALIZE value). These are: 115 * values (in addition to the LHREQ_INITIALIZE value). These are:
116 * 116 *
117 * base: The start of the Guest-physical memory inside the Launcher memory. 117 * base: The start of the Guest-physical memory inside the Launcher memory.
@@ -124,12 +124,6 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
124 * pagetables (which are set up by the Launcher). 124 * pagetables (which are set up by the Launcher).
125 * 125 *
126 * start: The first instruction to execute ("eip" in x86-speak). 126 * start: The first instruction to execute ("eip" in x86-speak).
127 *
128 * page_offset: The PAGE_OFFSET constant in the Guest kernel. We should
129 * probably wean the code off this, but it's a very useful constant! Any
130 * address above this is within the Guest kernel, and any kernel address can be
131 * quickly converted from physical to virtual by adding PAGE_OFFSET. It's
132 * 0xC0000000 (3G) by default, but it's configurable at kernel build time.
133 */ 127 */
134static int initialize(struct file *file, const unsigned long __user *input) 128static int initialize(struct file *file, const unsigned long __user *input)
135{ 129{
@@ -137,7 +131,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
137 * Guest. */ 131 * Guest. */
138 struct lguest *lg; 132 struct lguest *lg;
139 int err; 133 int err;
140 unsigned long args[5]; 134 unsigned long args[4];
141 135
142 /* We grab the Big Lguest lock, which protects against multiple 136 /* We grab the Big Lguest lock, which protects against multiple
143 * simultaneous initializations. */ 137 * simultaneous initializations. */
@@ -162,7 +156,6 @@ static int initialize(struct file *file, const unsigned long __user *input)
162 /* Populate the easy fields of our "struct lguest" */ 156 /* Populate the easy fields of our "struct lguest" */
163 lg->mem_base = (void __user *)(long)args[0]; 157 lg->mem_base = (void __user *)(long)args[0];
164 lg->pfn_limit = args[1]; 158 lg->pfn_limit = args[1];
165 lg->page_offset = args[4];
166 159
167 /* We need a complete page for the Guest registers: they are accessible 160 /* We need a complete page for the Guest registers: they are accessible
168 * to the Guest and we can only grant it access to whole pages. */ 161 * to the Guest and we can only grant it access to whole pages. */
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index bfe3650b28d6..fe3c7575647b 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -13,6 +13,7 @@
13#include <linux/random.h> 13#include <linux/random.h>
14#include <linux/percpu.h> 14#include <linux/percpu.h>
15#include <asm/tlbflush.h> 15#include <asm/tlbflush.h>
16#include <asm/uaccess.h>
16#include "lg.h" 17#include "lg.h"
17 18
18/*M:008 We hold reference to pages, which prevents them from being swapped. 19/*M:008 We hold reference to pages, which prevents them from being swapped.
@@ -345,7 +346,7 @@ static void flush_user_mappings(struct lguest *lg, int idx)
345{ 346{
346 unsigned int i; 347 unsigned int i;
347 /* Release every pgd entry up to the kernel's address. */ 348 /* Release every pgd entry up to the kernel's address. */
348 for (i = 0; i < pgd_index(lg->page_offset); i++) 349 for (i = 0; i < pgd_index(lg->kernel_address); i++)
349 release_pgd(lg, lg->pgdirs[idx].pgdir + i); 350 release_pgd(lg, lg->pgdirs[idx].pgdir + i);
350} 351}
351 352
@@ -358,6 +359,25 @@ void guest_pagetable_flush_user(struct lguest *lg)
358} 359}
359/*:*/ 360/*:*/
360 361
362/* We walk down the guest page tables to get a guest-physical address */
363unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
364{
365 pgd_t gpgd;
366 pte_t gpte;
367
368 /* First step: get the top-level Guest page table entry. */
369 gpgd = __pgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
370 /* Toplevel not present? We can't map it in. */
371 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
372 kill_guest(lg, "Bad address %#lx", vaddr);
373
374 gpte = __pte(lgread_u32(lg, gpte_addr(lg, gpgd, vaddr)));
375 if (!(pte_flags(gpte) & _PAGE_PRESENT))
376 kill_guest(lg, "Bad address %#lx", vaddr);
377
378 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
379}
380
361/* We keep several page tables. This is a simple routine to find the page 381/* We keep several page tables. This is a simple routine to find the page
362 * table (if any) corresponding to this top-level address the Guest has given 382 * table (if any) corresponding to this top-level address the Guest has given
363 * us. */ 383 * us. */
@@ -500,7 +520,7 @@ void guest_set_pte(struct lguest *lg,
500{ 520{
501 /* Kernel mappings must be changed on all top levels. Slow, but 521 /* Kernel mappings must be changed on all top levels. Slow, but
502 * doesn't happen often. */ 522 * doesn't happen often. */
503 if (vaddr >= lg->page_offset) { 523 if (vaddr >= lg->kernel_address) {
504 unsigned int i; 524 unsigned int i;
505 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 525 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
506 if (lg->pgdirs[i].pgdir) 526 if (lg->pgdirs[i].pgdir)
@@ -550,11 +570,6 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
550 * its first page table is. We set some things up here: */ 570 * its first page table is. We set some things up here: */
551int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) 571int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
552{ 572{
553 /* In flush_user_mappings() we loop from 0 to
554 * "pgd_index(lg->page_offset)". This assumes it won't hit
555 * the Switcher mappings, so check that now. */
556 if (pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
557 return -EINVAL;
558 /* We start on the first shadow page table, and give it a blank PGD 573 /* We start on the first shadow page table, and give it a blank PGD
559 * page. */ 574 * page. */
560 lg->pgdidx = 0; 575 lg->pgdidx = 0;
@@ -565,6 +580,24 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
565 return 0; 580 return 0;
566} 581}
567 582
583/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
584void page_table_guest_data_init(struct lguest *lg)
585{
586 /* We get the kernel address: above this is all kernel memory. */
587 if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address)
588 /* We tell the Guest that it can't use the top 4MB of virtual
589 * addresses used by the Switcher. */
590 || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
591 || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir))
592 kill_guest(lg, "bad guest page %p", lg->lguest_data);
593
594 /* In flush_user_mappings() we loop from 0 to
595 * "pgd_index(lg->kernel_address)". This assumes it won't hit the
596 * Switcher mappings, so check that now. */
597 if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX)
598 kill_guest(lg, "bad kernel address %#lx", lg->kernel_address);
599}
600
568/* When a Guest dies, our cleanup is fairly simple. */ 601/* When a Guest dies, our cleanup is fairly simple. */
569void free_guest_pagetable(struct lguest *lg) 602void free_guest_pagetable(struct lguest *lg)
570{ 603{
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index a125109446dc..39f64c95de18 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -216,9 +216,10 @@ static int emulate_insn(struct lguest *lg)
216 * guest_pa just subtracts the Guest's page_offset. */ 216 * guest_pa just subtracts the Guest's page_offset. */
217 unsigned long physaddr = guest_pa(lg, lg->regs->eip); 217 unsigned long physaddr = guest_pa(lg, lg->regs->eip);
218 218
219 /* The guest_pa() function only works for Guest kernel addresses, but 219 /* This must be the Guest kernel trying to do something, not userspace!
220 * that's all we're trying to do anyway. */ 220 * The bottom two bits of the CS segment register are the privilege
221 if (lg->regs->eip < lg->page_offset) 221 * level. */
222 if ((lg->regs->cs & 3) != GUEST_PL)
222 return 0; 223 return 0;
223 224
224 /* Decoding x86 instructions is icky. */ 225 /* Decoding x86 instructions is icky. */
diff --git a/include/asm-x86/lguest_hcall.h b/include/asm-x86/lguest_hcall.h
index 8f2a1edc4fe2..0c553ef36240 100644
--- a/include/asm-x86/lguest_hcall.h
+++ b/include/asm-x86/lguest_hcall.h
@@ -2,8 +2,6 @@
2#ifndef _X86_LGUEST_HCALL_H 2#ifndef _X86_LGUEST_HCALL_H
3#define _X86_LGUEST_HCALL_H 3#define _X86_LGUEST_HCALL_H
4 4
5#include <asm/hw_irq.h>
6
7#define LHCALL_FLUSH_ASYNC 0 5#define LHCALL_FLUSH_ASYNC 0
8#define LHCALL_LGUEST_INIT 1 6#define LHCALL_LGUEST_INIT 1
9#define LHCALL_CRASH 2 7#define LHCALL_CRASH 2
@@ -36,6 +34,9 @@
36 * definition of a gentleman: "someone who is only rude intentionally". */ 34 * definition of a gentleman: "someone who is only rude intentionally". */
37#define LGUEST_TRAP_ENTRY 0x1F 35#define LGUEST_TRAP_ENTRY 0x1F
38 36
37#ifndef __ASSEMBLY__
38#include <asm/hw_irq.h>
39
39static inline unsigned long 40static inline unsigned long
40hcall(unsigned long call, 41hcall(unsigned long call,
41 unsigned long arg1, unsigned long arg2, unsigned long arg3) 42 unsigned long arg1, unsigned long arg2, unsigned long arg3)
@@ -66,4 +67,6 @@ struct hcall_args
66 /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ 67 /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
67 unsigned long arg0, arg2, arg3, arg1; 68 unsigned long arg0, arg2, arg3, arg1;
68}; 69};
70
71#endif /* !__ASSEMBLY__ */
69#endif /* _X86_LGUEST_HCALL_H */ 72#endif /* _X86_LGUEST_HCALL_H */
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
index 083052236db9..8beb29134626 100644
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -44,11 +44,14 @@ struct lguest_data
44 unsigned long reserve_mem; 44 unsigned long reserve_mem;
45 /* KHz for the TSC clock. */ 45 /* KHz for the TSC clock. */
46 u32 tsc_khz; 46 u32 tsc_khz;
47 /* Page where the top-level pagetable is */
48 unsigned long pgdir;
47 49
48/* Fields initialized by the Guest at boot: */ 50/* Fields initialized by the Guest at boot: */
49 /* Instruction range to suppress interrupts even if enabled */ 51 /* Instruction range to suppress interrupts even if enabled */
50 unsigned long noirq_start, noirq_end; 52 unsigned long noirq_start, noirq_end;
51 53 /* Address above which page tables are all identical. */
54 unsigned long kernel_address;
52 /* The vector to try to use for system calls (0x40 or 0x80). */ 55 /* The vector to try to use for system calls (0x40 or 0x80). */
53 unsigned int syscall_vec; 56 unsigned int syscall_vec;
54}; 57};