diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2007-10-21 21:03:36 -0400 |
---|---|---|
committer | Rusty Russell <rusty@rustcorp.com.au> | 2007-10-23 01:49:54 -0400 |
commit | 47436aa4ad054c1c7c8231618e86ebd9305308dc (patch) | |
tree | a9ba6e0521f9116442144a86e781a3164ec86094 /Documentation | |
parent | c18acd73ffc209def08003a1927473096f66c5ad (diff) |
Boot with virtual == physical to get closer to native Linux.
1) This allows us to get a lot closer to booting bzImages.
2) It means we don't have to know page_offset.
3) The Guest needs to modify the boot pagetables to create the
PAGE_OFFSET mapping before jumping to C code.
4) guest_pa() walks the page tables rather than using page_offset.
5) We don't use page_offset to figure out whether to emulate: it was
always kinda questionable, and won't work for instructions done
before remapping (bzImage unpacking in particular).
6) We still want the kernel address for tlb flushing: have the initial
hypercall give us that, too.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/lguest/lguest.c | 134 |
1 files changed, 31 insertions, 103 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 4950b03514e6..32c2eaf94c4d 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c | |||
@@ -178,19 +178,16 @@ static void *get_pages(unsigned int num) | |||
178 | /* To find out where to start we look for the magic Guest string, which marks | 178 | /* To find out where to start we look for the magic Guest string, which marks |
179 | * the code we see in lguest_asm.S. This is a hack which we are currently | 179 | * the code we see in lguest_asm.S. This is a hack which we are currently |
180 | * plotting to replace with the normal Linux entry point. */ | 180 | * plotting to replace with the normal Linux entry point. */ |
181 | static unsigned long entry_point(const void *start, const void *end, | 181 | static unsigned long entry_point(const void *start, const void *end) |
182 | unsigned long page_offset) | ||
183 | { | 182 | { |
184 | const void *p; | 183 | const void *p; |
185 | 184 | ||
186 | /* The scan gives us the physical starting address. We want the | 185 | /* The scan gives us the physical starting address. We boot with |
187 | * virtual address in this case, and fortunately, we already figured | 186 | * pagetables set up with virtual and physical the same, so that's |
188 | * out the physical-virtual difference and passed it here in | 187 | * OK. */ |
189 | * "page_offset". */ | ||
190 | for (p = start; p < end; p++) | 188 | for (p = start; p < end; p++) |
191 | if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0) | 189 | if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0) |
192 | return to_guest_phys(p + strlen("GenuineLguest")) | 190 | return to_guest_phys(p + strlen("GenuineLguest")); |
193 | + page_offset; | ||
194 | 191 | ||
195 | errx(1, "Is this image a genuine lguest?"); | 192 | errx(1, "Is this image a genuine lguest?"); |
196 | } | 193 | } |
@@ -224,14 +221,11 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) | |||
224 | * by all modern binaries on Linux including the kernel. | 221 | * by all modern binaries on Linux including the kernel. |
225 | * | 222 | * |
226 | * The ELF headers give *two* addresses: a physical address, and a virtual | 223 | * The ELF headers give *two* addresses: a physical address, and a virtual |
227 | * address. The Guest kernel expects to be placed in memory at the physical | 224 | * address. We use the physical address; the Guest will map itself to the |
228 | * address, and the page tables set up so it will correspond to that virtual | 225 | * virtual address. |
229 | * address. We return the difference between the virtual and physical | ||
230 | * addresses in the "page_offset" pointer. | ||
231 | * | 226 | * |
232 | * We return the starting address. */ | 227 | * We return the starting address. */ |
233 | static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, | 228 | static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) |
234 | unsigned long *page_offset) | ||
235 | { | 229 | { |
236 | void *start = (void *)-1, *end = NULL; | 230 | void *start = (void *)-1, *end = NULL; |
237 | Elf32_Phdr phdr[ehdr->e_phnum]; | 231 | Elf32_Phdr phdr[ehdr->e_phnum]; |
@@ -255,9 +249,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, | |||
255 | if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) | 249 | if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) |
256 | err(1, "Reading program headers"); | 250 | err(1, "Reading program headers"); |
257 | 251 | ||
258 | /* We don't know page_offset yet. */ | ||
259 | *page_offset = 0; | ||
260 | |||
261 | /* Try all the headers: there are usually only three. A read-only one, | 252 | /* Try all the headers: there are usually only three. A read-only one, |
262 | * a read-write one, and a "note" section which isn't loadable. */ | 253 | * a read-write one, and a "note" section which isn't loadable. */ |
263 | for (i = 0; i < ehdr->e_phnum; i++) { | 254 | for (i = 0; i < ehdr->e_phnum; i++) { |
@@ -268,14 +259,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, | |||
268 | verbose("Section %i: size %i addr %p\n", | 259 | verbose("Section %i: size %i addr %p\n", |
269 | i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); | 260 | i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); |
270 | 261 | ||
271 | /* We expect a simple linear address space: every segment must | ||
272 | * have the same difference between virtual (p_vaddr) and | ||
273 | * physical (p_paddr) address. */ | ||
274 | if (!*page_offset) | ||
275 | *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr; | ||
276 | else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr) | ||
277 | errx(1, "Page offset of section %i different", i); | ||
278 | |||
279 | /* We track the first and last address we mapped, so we can | 262 | /* We track the first and last address we mapped, so we can |
280 | * tell entry_point() where to scan. */ | 263 | * tell entry_point() where to scan. */ |
281 | if (from_guest_phys(phdr[i].p_paddr) < start) | 264 | if (from_guest_phys(phdr[i].p_paddr) < start) |
@@ -288,50 +271,13 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, | |||
288 | phdr[i].p_offset, phdr[i].p_filesz); | 271 | phdr[i].p_offset, phdr[i].p_filesz); |
289 | } | 272 | } |
290 | 273 | ||
291 | return entry_point(start, end, *page_offset); | 274 | return entry_point(start, end); |
292 | } | ||
293 | |||
294 | /*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated. | ||
295 | * | ||
296 | * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects | ||
297 | * to be. We don't know what that option was, but we can figure it out | ||
298 | * approximately by looking at the addresses in the code. I chose the common | ||
299 | * case of reading a memory location into the %eax register: | ||
300 | * | ||
301 | * movl <some-address>, %eax | ||
302 | * | ||
303 | * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example, | ||
304 | * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax. | ||
305 | * | ||
306 | * In this example can guess that the kernel was compiled with | ||
307 | * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the | ||
308 | * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our | ||
309 | * kernel isn't that bloated yet. | ||
310 | * | ||
311 | * Unfortunately, x86 has variable-length instructions, so finding this | ||
312 | * particular instruction properly involves writing a disassembler. Instead, | ||
313 | * we rely on statistics. We look for "0xA1" and tally the different bytes | ||
314 | * which occur 4 bytes later (the "0xC0" in our example above). When one of | ||
315 | * those bytes appears three times, we can be reasonably confident that it | ||
316 | * forms the start of CONFIG_PAGE_OFFSET. | ||
317 | * | ||
318 | * This is amazingly reliable. */ | ||
319 | static unsigned long intuit_page_offset(unsigned char *img, unsigned long len) | ||
320 | { | ||
321 | unsigned int i, possibilities[256] = { 0 }; | ||
322 | |||
323 | for (i = 0; i + 4 < len; i++) { | ||
324 | /* mov 0xXXXXXXXX,%eax */ | ||
325 | if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3) | ||
326 | return (unsigned long)img[i+4] << 24; | ||
327 | } | ||
328 | errx(1, "could not determine page offset"); | ||
329 | } | 275 | } |
330 | 276 | ||
331 | /*L:160 Unfortunately the entire ELF image isn't compressed: the segments | 277 | /*L:160 Unfortunately the entire ELF image isn't compressed: the segments |
332 | * which need loading are extracted and compressed raw. This denies us the | 278 | * which need loading are extracted and compressed raw. This denies us the |
333 | * information we need to make a fully-general loader. */ | 279 | * information we need to make a fully-general loader. */ |
334 | static unsigned long unpack_bzimage(int fd, unsigned long *page_offset) | 280 | static unsigned long unpack_bzimage(int fd) |
335 | { | 281 | { |
336 | gzFile f; | 282 | gzFile f; |
337 | int ret, len = 0; | 283 | int ret, len = 0; |
@@ -352,12 +298,7 @@ static unsigned long unpack_bzimage(int fd, unsigned long *page_offset) | |||
352 | 298 | ||
353 | verbose("Unpacked size %i addr %p\n", len, img); | 299 | verbose("Unpacked size %i addr %p\n", len, img); |
354 | 300 | ||
355 | /* Without the ELF header, we can't tell virtual-physical gap. This is | 301 | return entry_point(img, img + len); |
356 | * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately, | ||
357 | * I have a clever way of figuring it out from the code itself. */ | ||
358 | *page_offset = intuit_page_offset(img, len); | ||
359 | |||
360 | return entry_point(img, img + len, *page_offset); | ||
361 | } | 302 | } |
362 | 303 | ||
363 | /*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're | 304 | /*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're |
@@ -368,7 +309,7 @@ static unsigned long unpack_bzimage(int fd, unsigned long *page_offset) | |||
368 | * The bzImage is formed by putting the decompressing code in front of the | 309 | * The bzImage is formed by putting the decompressing code in front of the |
369 | * compressed kernel code. So we can simple scan through it looking for the | 310 | * compressed kernel code. So we can simple scan through it looking for the |
370 | * first "gzip" header, and start decompressing from there. */ | 311 | * first "gzip" header, and start decompressing from there. */ |
371 | static unsigned long load_bzimage(int fd, unsigned long *page_offset) | 312 | static unsigned long load_bzimage(int fd) |
372 | { | 313 | { |
373 | unsigned char c; | 314 | unsigned char c; |
374 | int state = 0; | 315 | int state = 0; |
@@ -396,7 +337,7 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset) | |||
396 | if (c != 0x03) | 337 | if (c != 0x03) |
397 | state = -1; | 338 | state = -1; |
398 | else | 339 | else |
399 | return unpack_bzimage(fd, page_offset); | 340 | return unpack_bzimage(fd); |
400 | } | 341 | } |
401 | } | 342 | } |
402 | errx(1, "Could not find kernel in bzImage"); | 343 | errx(1, "Could not find kernel in bzImage"); |
@@ -405,7 +346,7 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset) | |||
405 | /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels | 346 | /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels |
406 | * come wrapped up in the self-decompressing "bzImage" format. With some funky | 347 | * come wrapped up in the self-decompressing "bzImage" format. With some funky |
407 | * coding, we can load those, too. */ | 348 | * coding, we can load those, too. */ |
408 | static unsigned long load_kernel(int fd, unsigned long *page_offset) | 349 | static unsigned long load_kernel(int fd) |
409 | { | 350 | { |
410 | Elf32_Ehdr hdr; | 351 | Elf32_Ehdr hdr; |
411 | 352 | ||
@@ -415,10 +356,10 @@ static unsigned long load_kernel(int fd, unsigned long *page_offset) | |||
415 | 356 | ||
416 | /* If it's an ELF file, it starts with "\177ELF" */ | 357 | /* If it's an ELF file, it starts with "\177ELF" */ |
417 | if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) | 358 | if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) |
418 | return map_elf(fd, &hdr, page_offset); | 359 | return map_elf(fd, &hdr); |
419 | 360 | ||
420 | /* Otherwise we assume it's a bzImage, and try to unpack it */ | 361 | /* Otherwise we assume it's a bzImage, and try to unpack it */ |
421 | return load_bzimage(fd, page_offset); | 362 | return load_bzimage(fd); |
422 | } | 363 | } |
423 | 364 | ||
424 | /* This is a trivial little helper to align pages. Andi Kleen hated it because | 365 | /* This is a trivial little helper to align pages. Andi Kleen hated it because |
@@ -463,27 +404,20 @@ static unsigned long load_initrd(const char *name, unsigned long mem) | |||
463 | return len; | 404 | return len; |
464 | } | 405 | } |
465 | 406 | ||
466 | /* Once we know the address the Guest kernel expects, we can construct simple | 407 | /* Once we know how much memory we have, we can construct simple linear page |
467 | * linear page tables for all of memory which will get the Guest far enough | 408 | * tables which set virtual == physical which will get the Guest far enough |
468 | * into the boot to create its own. | 409 | * into the boot to create its own. |
469 | * | 410 | * |
470 | * We lay them out of the way, just below the initrd (which is why we need to | 411 | * We lay them out of the way, just below the initrd (which is why we need to |
471 | * know its size). */ | 412 | * know its size). */ |
472 | static unsigned long setup_pagetables(unsigned long mem, | 413 | static unsigned long setup_pagetables(unsigned long mem, |
473 | unsigned long initrd_size, | 414 | unsigned long initrd_size) |
474 | unsigned long page_offset) | ||
475 | { | 415 | { |
476 | unsigned long *pgdir, *linear; | 416 | unsigned long *pgdir, *linear; |
477 | unsigned int mapped_pages, i, linear_pages; | 417 | unsigned int mapped_pages, i, linear_pages; |
478 | unsigned int ptes_per_page = getpagesize()/sizeof(void *); | 418 | unsigned int ptes_per_page = getpagesize()/sizeof(void *); |
479 | 419 | ||
480 | /* Ideally we map all physical memory starting at page_offset. | 420 | mapped_pages = mem/getpagesize(); |
481 | * However, if page_offset is 0xC0000000 we can only map 1G of physical | ||
482 | * (0xC0000000 + 1G overflows). */ | ||
483 | if (mem <= -page_offset) | ||
484 | mapped_pages = mem/getpagesize(); | ||
485 | else | ||
486 | mapped_pages = -page_offset/getpagesize(); | ||
487 | 421 | ||
488 | /* Each PTE page can map ptes_per_page pages: how many do we need? */ | 422 | /* Each PTE page can map ptes_per_page pages: how many do we need? */ |
489 | linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; | 423 | linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; |
@@ -500,11 +434,9 @@ static unsigned long setup_pagetables(unsigned long mem, | |||
500 | for (i = 0; i < mapped_pages; i++) | 434 | for (i = 0; i < mapped_pages; i++) |
501 | linear[i] = ((i * getpagesize()) | PAGE_PRESENT); | 435 | linear[i] = ((i * getpagesize()) | PAGE_PRESENT); |
502 | 436 | ||
503 | /* The top level points to the linear page table pages above. The | 437 | /* The top level points to the linear page table pages above. */ |
504 | * entry representing page_offset points to the first one, and they | ||
505 | * continue from there. */ | ||
506 | for (i = 0; i < mapped_pages; i += ptes_per_page) { | 438 | for (i = 0; i < mapped_pages; i += ptes_per_page) { |
507 | pgdir[(i + page_offset/getpagesize())/ptes_per_page] | 439 | pgdir[i/ptes_per_page] |
508 | = ((to_guest_phys(linear) + i*sizeof(void *)) | 440 | = ((to_guest_phys(linear) + i*sizeof(void *)) |
509 | | PAGE_PRESENT); | 441 | | PAGE_PRESENT); |
510 | } | 442 | } |
@@ -535,15 +467,12 @@ static void concat(char *dst, char *args[]) | |||
535 | /* This is where we actually tell the kernel to initialize the Guest. We saw | 467 | /* This is where we actually tell the kernel to initialize the Guest. We saw |
536 | * the arguments it expects when we looked at initialize() in lguest_user.c: | 468 | * the arguments it expects when we looked at initialize() in lguest_user.c: |
537 | * the base of guest "physical" memory, the top physical page to allow, the | 469 | * the base of guest "physical" memory, the top physical page to allow, the |
538 | * top level pagetable, the entry point and the page_offset constant for the | 470 | * top level pagetable and the entry point for the Guest. */ |
539 | * Guest. */ | 471 | static int tell_kernel(unsigned long pgdir, unsigned long start) |
540 | static int tell_kernel(unsigned long pgdir, unsigned long start, | ||
541 | unsigned long page_offset) | ||
542 | { | 472 | { |
543 | unsigned long args[] = { LHREQ_INITIALIZE, | 473 | unsigned long args[] = { LHREQ_INITIALIZE, |
544 | (unsigned long)guest_base, | 474 | (unsigned long)guest_base, |
545 | guest_limit / getpagesize(), | 475 | guest_limit / getpagesize(), pgdir, start }; |
546 | pgdir, start, page_offset }; | ||
547 | int fd; | 476 | int fd; |
548 | 477 | ||
549 | verbose("Guest: %p - %p (%#lx)\n", | 478 | verbose("Guest: %p - %p (%#lx)\n", |
@@ -1424,9 +1353,9 @@ static void usage(void) | |||
1424 | /*L:105 The main routine is where the real work begins: */ | 1353 | /*L:105 The main routine is where the real work begins: */ |
1425 | int main(int argc, char *argv[]) | 1354 | int main(int argc, char *argv[]) |
1426 | { | 1355 | { |
1427 | /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size | 1356 | /* Memory, top-level pagetable, code startpoint and size of the |
1428 | * of the (optional) initrd. */ | 1357 | * (optional) initrd. */ |
1429 | unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0; | 1358 | unsigned long mem = 0, pgdir, start, initrd_size = 0; |
1430 | /* A temporary and the /dev/lguest file descriptor. */ | 1359 | /* A temporary and the /dev/lguest file descriptor. */ |
1431 | int i, c, lguest_fd; | 1360 | int i, c, lguest_fd; |
1432 | /* The list of Guest devices, based on command line arguments. */ | 1361 | /* The list of Guest devices, based on command line arguments. */ |
@@ -1500,8 +1429,7 @@ int main(int argc, char *argv[]) | |||
1500 | setup_console(&device_list); | 1429 | setup_console(&device_list); |
1501 | 1430 | ||
1502 | /* Now we load the kernel */ | 1431 | /* Now we load the kernel */ |
1503 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), | 1432 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); |
1504 | &page_offset); | ||
1505 | 1433 | ||
1506 | /* Boot information is stashed at physical address 0 */ | 1434 | /* Boot information is stashed at physical address 0 */ |
1507 | boot = from_guest_phys(0); | 1435 | boot = from_guest_phys(0); |
@@ -1518,7 +1446,7 @@ int main(int argc, char *argv[]) | |||
1518 | } | 1446 | } |
1519 | 1447 | ||
1520 | /* Set up the initial linear pagetables, starting below the initrd. */ | 1448 | /* Set up the initial linear pagetables, starting below the initrd. */ |
1521 | pgdir = setup_pagetables(mem, initrd_size, page_offset); | 1449 | pgdir = setup_pagetables(mem, initrd_size); |
1522 | 1450 | ||
1523 | /* The Linux boot header contains an "E820" memory map: ours is a | 1451 | /* The Linux boot header contains an "E820" memory map: ours is a |
1524 | * simple, single region. */ | 1452 | * simple, single region. */ |
@@ -1535,7 +1463,7 @@ int main(int argc, char *argv[]) | |||
1535 | 1463 | ||
1536 | /* We tell the kernel to initialize the Guest: this returns the open | 1464 | /* We tell the kernel to initialize the Guest: this returns the open |
1537 | * /dev/lguest file descriptor. */ | 1465 | * /dev/lguest file descriptor. */ |
1538 | lguest_fd = tell_kernel(pgdir, start, page_offset); | 1466 | lguest_fd = tell_kernel(pgdir, start); |
1539 | 1467 | ||
1540 | /* We fork off a child process, which wakes the Launcher whenever one | 1468 | /* We fork off a child process, which wakes the Launcher whenever one |
1541 | * of the input file descriptors needs attention. Otherwise we would | 1469 | * of the input file descriptors needs attention. Otherwise we would |