aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/lguest
diff options
context:
space:
mode:
authorRusty Russell <rusty@rustcorp.com.au>2007-10-21 21:03:36 -0400
committerRusty Russell <rusty@rustcorp.com.au>2007-10-23 01:49:54 -0400
commit47436aa4ad054c1c7c8231618e86ebd9305308dc (patch)
treea9ba6e0521f9116442144a86e781a3164ec86094 /Documentation/lguest
parentc18acd73ffc209def08003a1927473096f66c5ad (diff)
Boot with virtual == physical to get closer to native Linux.
1) This allows us to get a lot closer to booting bzImages. 2) It means we don't have to know page_offset. 3) The Guest needs to modify the boot pagetables to create the PAGE_OFFSET mapping before jumping to C code. 4) guest_pa() walks the page tables rather than using page_offset. 5) We don't use page_offset to figure out whether to emulate: it was always kinda questionable, and won't work for instructions done before remapping (bzImage unpacking in particular). 6) We still want the kernel address for tlb flushing: have the initial hypercall give us that, too. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Diffstat (limited to 'Documentation/lguest')
-rw-r--r--Documentation/lguest/lguest.c134
1 files changed, 31 insertions, 103 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 4950b03514e6..32c2eaf94c4d 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -178,19 +178,16 @@ static void *get_pages(unsigned int num)
178/* To find out where to start we look for the magic Guest string, which marks 178/* To find out where to start we look for the magic Guest string, which marks
179 * the code we see in lguest_asm.S. This is a hack which we are currently 179 * the code we see in lguest_asm.S. This is a hack which we are currently
180 * plotting to replace with the normal Linux entry point. */ 180 * plotting to replace with the normal Linux entry point. */
181static unsigned long entry_point(const void *start, const void *end, 181static unsigned long entry_point(const void *start, const void *end)
182 unsigned long page_offset)
183{ 182{
184 const void *p; 183 const void *p;
185 184
186 /* The scan gives us the physical starting address. We want the 185 /* The scan gives us the physical starting address. We boot with
187 * virtual address in this case, and fortunately, we already figured 186 * pagetables set up with virtual and physical the same, so that's
188 * out the physical-virtual difference and passed it here in 187 * OK. */
189 * "page_offset". */
190 for (p = start; p < end; p++) 188 for (p = start; p < end; p++)
191 if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0) 189 if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
192 return to_guest_phys(p + strlen("GenuineLguest")) 190 return to_guest_phys(p + strlen("GenuineLguest"));
193 + page_offset;
194 191
195 errx(1, "Is this image a genuine lguest?"); 192 errx(1, "Is this image a genuine lguest?");
196} 193}
@@ -224,14 +221,11 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
224 * by all modern binaries on Linux including the kernel. 221 * by all modern binaries on Linux including the kernel.
225 * 222 *
226 * The ELF headers give *two* addresses: a physical address, and a virtual 223 * The ELF headers give *two* addresses: a physical address, and a virtual
227 * address. The Guest kernel expects to be placed in memory at the physical 224 * address. We use the physical address; the Guest will map itself to the
228 * address, and the page tables set up so it will correspond to that virtual 225 * virtual address.
229 * address. We return the difference between the virtual and physical
230 * addresses in the "page_offset" pointer.
231 * 226 *
232 * We return the starting address. */ 227 * We return the starting address. */
233static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, 228static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
234 unsigned long *page_offset)
235{ 229{
236 void *start = (void *)-1, *end = NULL; 230 void *start = (void *)-1, *end = NULL;
237 Elf32_Phdr phdr[ehdr->e_phnum]; 231 Elf32_Phdr phdr[ehdr->e_phnum];
@@ -255,9 +249,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
255 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) 249 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
256 err(1, "Reading program headers"); 250 err(1, "Reading program headers");
257 251
258 /* We don't know page_offset yet. */
259 *page_offset = 0;
260
261 /* Try all the headers: there are usually only three. A read-only one, 252 /* Try all the headers: there are usually only three. A read-only one,
262 * a read-write one, and a "note" section which isn't loadable. */ 253 * a read-write one, and a "note" section which isn't loadable. */
263 for (i = 0; i < ehdr->e_phnum; i++) { 254 for (i = 0; i < ehdr->e_phnum; i++) {
@@ -268,14 +259,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
268 verbose("Section %i: size %i addr %p\n", 259 verbose("Section %i: size %i addr %p\n",
269 i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); 260 i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
270 261
271 /* We expect a simple linear address space: every segment must
272 * have the same difference between virtual (p_vaddr) and
273 * physical (p_paddr) address. */
274 if (!*page_offset)
275 *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
276 else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
277 errx(1, "Page offset of section %i different", i);
278
279 /* We track the first and last address we mapped, so we can 262 /* We track the first and last address we mapped, so we can
280 * tell entry_point() where to scan. */ 263 * tell entry_point() where to scan. */
281 if (from_guest_phys(phdr[i].p_paddr) < start) 264 if (from_guest_phys(phdr[i].p_paddr) < start)
@@ -288,50 +271,13 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
288 phdr[i].p_offset, phdr[i].p_filesz); 271 phdr[i].p_offset, phdr[i].p_filesz);
289 } 272 }
290 273
291 return entry_point(start, end, *page_offset); 274 return entry_point(start, end);
292}
293
294/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated.
295 *
296 * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects
297 * to be. We don't know what that option was, but we can figure it out
298 * approximately by looking at the addresses in the code. I chose the common
299 * case of reading a memory location into the %eax register:
300 *
301 * movl <some-address>, %eax
302 *
303 * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example,
304 * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
305 *
306 * In this example can guess that the kernel was compiled with
307 * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the
308 * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
309 * kernel isn't that bloated yet.
310 *
311 * Unfortunately, x86 has variable-length instructions, so finding this
312 * particular instruction properly involves writing a disassembler. Instead,
313 * we rely on statistics. We look for "0xA1" and tally the different bytes
314 * which occur 4 bytes later (the "0xC0" in our example above). When one of
315 * those bytes appears three times, we can be reasonably confident that it
316 * forms the start of CONFIG_PAGE_OFFSET.
317 *
318 * This is amazingly reliable. */
319static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
320{
321 unsigned int i, possibilities[256] = { 0 };
322
323 for (i = 0; i + 4 < len; i++) {
324 /* mov 0xXXXXXXXX,%eax */
325 if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
326 return (unsigned long)img[i+4] << 24;
327 }
328 errx(1, "could not determine page offset");
329} 275}
330 276
331/*L:160 Unfortunately the entire ELF image isn't compressed: the segments 277/*L:160 Unfortunately the entire ELF image isn't compressed: the segments
332 * which need loading are extracted and compressed raw. This denies us the 278 * which need loading are extracted and compressed raw. This denies us the
333 * information we need to make a fully-general loader. */ 279 * information we need to make a fully-general loader. */
334static unsigned long unpack_bzimage(int fd, unsigned long *page_offset) 280static unsigned long unpack_bzimage(int fd)
335{ 281{
336 gzFile f; 282 gzFile f;
337 int ret, len = 0; 283 int ret, len = 0;
@@ -352,12 +298,7 @@ static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
352 298
353 verbose("Unpacked size %i addr %p\n", len, img); 299 verbose("Unpacked size %i addr %p\n", len, img);
354 300
355 /* Without the ELF header, we can't tell virtual-physical gap. This is 301 return entry_point(img, img + len);
356 * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately,
357 * I have a clever way of figuring it out from the code itself. */
358 *page_offset = intuit_page_offset(img, len);
359
360 return entry_point(img, img + len, *page_offset);
361} 302}
362 303
363/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're 304/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
@@ -368,7 +309,7 @@ static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
368 * The bzImage is formed by putting the decompressing code in front of the 309 * The bzImage is formed by putting the decompressing code in front of the
369 * compressed kernel code. So we can simple scan through it looking for the 310 * compressed kernel code. So we can simple scan through it looking for the
370 * first "gzip" header, and start decompressing from there. */ 311 * first "gzip" header, and start decompressing from there. */
371static unsigned long load_bzimage(int fd, unsigned long *page_offset) 312static unsigned long load_bzimage(int fd)
372{ 313{
373 unsigned char c; 314 unsigned char c;
374 int state = 0; 315 int state = 0;
@@ -396,7 +337,7 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset)
396 if (c != 0x03) 337 if (c != 0x03)
397 state = -1; 338 state = -1;
398 else 339 else
399 return unpack_bzimage(fd, page_offset); 340 return unpack_bzimage(fd);
400 } 341 }
401 } 342 }
402 errx(1, "Could not find kernel in bzImage"); 343 errx(1, "Could not find kernel in bzImage");
@@ -405,7 +346,7 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset)
405/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels 346/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
406 * come wrapped up in the self-decompressing "bzImage" format. With some funky 347 * come wrapped up in the self-decompressing "bzImage" format. With some funky
407 * coding, we can load those, too. */ 348 * coding, we can load those, too. */
408static unsigned long load_kernel(int fd, unsigned long *page_offset) 349static unsigned long load_kernel(int fd)
409{ 350{
410 Elf32_Ehdr hdr; 351 Elf32_Ehdr hdr;
411 352
@@ -415,10 +356,10 @@ static unsigned long load_kernel(int fd, unsigned long *page_offset)
415 356
416 /* If it's an ELF file, it starts with "\177ELF" */ 357 /* If it's an ELF file, it starts with "\177ELF" */
417 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) 358 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
418 return map_elf(fd, &hdr, page_offset); 359 return map_elf(fd, &hdr);
419 360
420 /* Otherwise we assume it's a bzImage, and try to unpack it */ 361 /* Otherwise we assume it's a bzImage, and try to unpack it */
421 return load_bzimage(fd, page_offset); 362 return load_bzimage(fd);
422} 363}
423 364
424/* This is a trivial little helper to align pages. Andi Kleen hated it because 365/* This is a trivial little helper to align pages. Andi Kleen hated it because
@@ -463,27 +404,20 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
463 return len; 404 return len;
464} 405}
465 406
466/* Once we know the address the Guest kernel expects, we can construct simple 407/* Once we know how much memory we have, we can construct simple linear page
467 * linear page tables for all of memory which will get the Guest far enough 408 * tables which set virtual == physical which will get the Guest far enough
468 * into the boot to create its own. 409 * into the boot to create its own.
469 * 410 *
470 * We lay them out of the way, just below the initrd (which is why we need to 411 * We lay them out of the way, just below the initrd (which is why we need to
471 * know its size). */ 412 * know its size). */
472static unsigned long setup_pagetables(unsigned long mem, 413static unsigned long setup_pagetables(unsigned long mem,
473 unsigned long initrd_size, 414 unsigned long initrd_size)
474 unsigned long page_offset)
475{ 415{
476 unsigned long *pgdir, *linear; 416 unsigned long *pgdir, *linear;
477 unsigned int mapped_pages, i, linear_pages; 417 unsigned int mapped_pages, i, linear_pages;
478 unsigned int ptes_per_page = getpagesize()/sizeof(void *); 418 unsigned int ptes_per_page = getpagesize()/sizeof(void *);
479 419
480 /* Ideally we map all physical memory starting at page_offset. 420 mapped_pages = mem/getpagesize();
481 * However, if page_offset is 0xC0000000 we can only map 1G of physical
482 * (0xC0000000 + 1G overflows). */
483 if (mem <= -page_offset)
484 mapped_pages = mem/getpagesize();
485 else
486 mapped_pages = -page_offset/getpagesize();
487 421
488 /* Each PTE page can map ptes_per_page pages: how many do we need? */ 422 /* Each PTE page can map ptes_per_page pages: how many do we need? */
489 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; 423 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
@@ -500,11 +434,9 @@ static unsigned long setup_pagetables(unsigned long mem,
500 for (i = 0; i < mapped_pages; i++) 434 for (i = 0; i < mapped_pages; i++)
501 linear[i] = ((i * getpagesize()) | PAGE_PRESENT); 435 linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
502 436
503 /* The top level points to the linear page table pages above. The 437 /* The top level points to the linear page table pages above. */
504 * entry representing page_offset points to the first one, and they
505 * continue from there. */
506 for (i = 0; i < mapped_pages; i += ptes_per_page) { 438 for (i = 0; i < mapped_pages; i += ptes_per_page) {
507 pgdir[(i + page_offset/getpagesize())/ptes_per_page] 439 pgdir[i/ptes_per_page]
508 = ((to_guest_phys(linear) + i*sizeof(void *)) 440 = ((to_guest_phys(linear) + i*sizeof(void *))
509 | PAGE_PRESENT); 441 | PAGE_PRESENT);
510 } 442 }
@@ -535,15 +467,12 @@ static void concat(char *dst, char *args[])
535/* This is where we actually tell the kernel to initialize the Guest. We saw 467/* This is where we actually tell the kernel to initialize the Guest. We saw
536 * the arguments it expects when we looked at initialize() in lguest_user.c: 468 * the arguments it expects when we looked at initialize() in lguest_user.c:
537 * the base of guest "physical" memory, the top physical page to allow, the 469 * the base of guest "physical" memory, the top physical page to allow, the
538 * top level pagetable, the entry point and the page_offset constant for the 470 * top level pagetable and the entry point for the Guest. */
539 * Guest. */ 471static int tell_kernel(unsigned long pgdir, unsigned long start)
540static int tell_kernel(unsigned long pgdir, unsigned long start,
541 unsigned long page_offset)
542{ 472{
543 unsigned long args[] = { LHREQ_INITIALIZE, 473 unsigned long args[] = { LHREQ_INITIALIZE,
544 (unsigned long)guest_base, 474 (unsigned long)guest_base,
545 guest_limit / getpagesize(), 475 guest_limit / getpagesize(), pgdir, start };
546 pgdir, start, page_offset };
547 int fd; 476 int fd;
548 477
549 verbose("Guest: %p - %p (%#lx)\n", 478 verbose("Guest: %p - %p (%#lx)\n",
@@ -1424,9 +1353,9 @@ static void usage(void)
1424/*L:105 The main routine is where the real work begins: */ 1353/*L:105 The main routine is where the real work begins: */
1425int main(int argc, char *argv[]) 1354int main(int argc, char *argv[])
1426{ 1355{
1427 /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size 1356 /* Memory, top-level pagetable, code startpoint and size of the
1428 * of the (optional) initrd. */ 1357 * (optional) initrd. */
1429 unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0; 1358 unsigned long mem = 0, pgdir, start, initrd_size = 0;
1430 /* A temporary and the /dev/lguest file descriptor. */ 1359 /* A temporary and the /dev/lguest file descriptor. */
1431 int i, c, lguest_fd; 1360 int i, c, lguest_fd;
1432 /* The list of Guest devices, based on command line arguments. */ 1361 /* The list of Guest devices, based on command line arguments. */
@@ -1500,8 +1429,7 @@ int main(int argc, char *argv[])
1500 setup_console(&device_list); 1429 setup_console(&device_list);
1501 1430
1502 /* Now we load the kernel */ 1431 /* Now we load the kernel */
1503 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), 1432 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
1504 &page_offset);
1505 1433
1506 /* Boot information is stashed at physical address 0 */ 1434 /* Boot information is stashed at physical address 0 */
1507 boot = from_guest_phys(0); 1435 boot = from_guest_phys(0);
@@ -1518,7 +1446,7 @@ int main(int argc, char *argv[])
1518 } 1446 }
1519 1447
1520 /* Set up the initial linear pagetables, starting below the initrd. */ 1448 /* Set up the initial linear pagetables, starting below the initrd. */
1521 pgdir = setup_pagetables(mem, initrd_size, page_offset); 1449 pgdir = setup_pagetables(mem, initrd_size);
1522 1450
1523 /* The Linux boot header contains an "E820" memory map: ours is a 1451 /* The Linux boot header contains an "E820" memory map: ours is a
1524 * simple, single region. */ 1452 * simple, single region. */
@@ -1535,7 +1463,7 @@ int main(int argc, char *argv[])
1535 1463
1536 /* We tell the kernel to initialize the Guest: this returns the open 1464 /* We tell the kernel to initialize the Guest: this returns the open
1537 * /dev/lguest file descriptor. */ 1465 * /dev/lguest file descriptor. */
1538 lguest_fd = tell_kernel(pgdir, start, page_offset); 1466 lguest_fd = tell_kernel(pgdir, start);
1539 1467
1540 /* We fork off a child process, which wakes the Launcher whenever one 1468 /* We fork off a child process, which wakes the Launcher whenever one
1541 * of the input file descriptors needs attention. Otherwise we would 1469 * of the input file descriptors needs attention. Otherwise we would