about summary refs log tree commit diff stats
path: root/Documentation/lguest/lguest.c
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/lguest/lguest.c')
-rw-r--r-- Documentation/lguest/lguest.c 189
1 files changed, 110 insertions, 79 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 401d26b464ff..140bd98a8417 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1,10 +1,7 @@
1/*P:100 This is the Launcher code, a simple program which lays out the 1/*P:100 This is the Launcher code, a simple program which lays out the
2 * "physical" memory for the new Guest by mapping the kernel image and the 2 * "physical" memory for the new Guest by mapping the kernel image and the
3 * virtual devices, then reads repeatedly from /dev/lguest to run the Guest. 3 * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
4 * 4:*/
5 * The only trick: the Makefile links it at a high address so it will be clear
6 * of the guest memory region. It means that each Guest cannot have more than
7 * about 2.5G of memory on a normally configured Host. :*/
8#define _LARGEFILE64_SOURCE 5#define _LARGEFILE64_SOURCE
9#define _GNU_SOURCE 6#define _GNU_SOURCE
10#include <stdio.h> 7#include <stdio.h>
@@ -56,6 +53,8 @@ typedef uint8_t u8;
56#ifndef SIOCBRADDIF 53#ifndef SIOCBRADDIF
57#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ 54#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
58#endif 55#endif
56/* We can have up to 256 pages for devices. */
57#define DEVICE_PAGES 256
59 58
60/*L:120 verbose is both a global flag and a macro. The C preprocessor allows 59/*L:120 verbose is both a global flag and a macro. The C preprocessor allows
61 * this, and although I wouldn't recommend it, it works quite nicely here. */ 60 * this, and although I wouldn't recommend it, it works quite nicely here. */
@@ -66,8 +65,10 @@ static bool verbose;
66 65
67/* The pipe to send commands to the waker process */ 66/* The pipe to send commands to the waker process */
68static int waker_fd; 67static int waker_fd;
69/* The top of guest physical memory. */ 68/* The pointer to the start of guest memory. */
70static u32 top; 69static void *guest_base;
70/* The maximum guest physical address allowed, and maximum possible. */
71static unsigned long guest_limit, guest_max;
71 72
72/* This is our list of devices. */ 73/* This is our list of devices. */
73struct device_list 74struct device_list
@@ -111,6 +112,29 @@ struct device
111 void *priv; 112 void *priv;
112}; 113};
113 114
115/*L:100 The Launcher code itself takes us out into userspace, that scary place
116 * where pointers run wild and free! Unfortunately, like most userspace
117 * programs, it's quite boring (which is why everyone likes to hack on the
118 * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
119 * will get you through this section. Or, maybe not.
120 *
121 * The Launcher sets up a big chunk of memory to be the Guest's "physical"
122 * memory and stores it in "guest_base". In other words, Guest physical ==
123 * Launcher virtual with an offset.
124 *
125 * This can be tough to get your head around, but usually it just means that we
126 * use these trivial conversion functions when the Guest gives us its
127 * "physical" addresses: */
128static void *from_guest_phys(unsigned long addr)
129{
130 return guest_base + addr;
131}
132
133static unsigned long to_guest_phys(const void *addr)
134{
135 return (addr - guest_base);
136}
137
114/*L:130 138/*L:130
115 * Loading the Kernel. 139 * Loading the Kernel.
116 * 140 *
@@ -124,33 +148,40 @@ static int open_or_die(const char *name, int flags)
124 return fd; 148 return fd;
125} 149}
126 150
127/* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */ 151/* map_zeroed_pages() takes a number of pages. */
128static void *map_zeroed_pages(unsigned long addr, unsigned int num) 152static void *map_zeroed_pages(unsigned int num)
129{ 153{
130 /* We cache the /dev/zero file-descriptor so we only open it once. */ 154 int fd = open_or_die("/dev/zero", O_RDONLY);
131 static int fd = -1; 155 void *addr;
132
133 if (fd == -1)
134 fd = open_or_die("/dev/zero", O_RDONLY);
135 156
136 /* We use a private mapping (ie. if we write to the page, it will be 157 /* We use a private mapping (ie. if we write to the page, it will be
137 * copied), and obviously we insist that it be mapped where we ask. */ 158 * copied). */
138 if (mmap((void *)addr, getpagesize() * num, 159 addr = mmap(NULL, getpagesize() * num,
139 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0) 160 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
140 != (void *)addr) 161 if (addr == MAP_FAILED)
141 err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr); 162 err(1, "Mmaping %u pages of /dev/zero", num);
142 163
143 /* Returning the address is just a courtesy: can simplify callers. */ 164 return addr;
144 return (void *)addr; 165}
166
167/* Get some more pages for a device. */
168static void *get_pages(unsigned int num)
169{
170 void *addr = from_guest_phys(guest_limit);
171
172 guest_limit += num * getpagesize();
173 if (guest_limit > guest_max)
174 errx(1, "Not enough memory for devices");
175 return addr;
145} 176}
146 177
147/* To find out where to start we look for the magic Guest string, which marks 178/* To find out where to start we look for the magic Guest string, which marks
148 * the code we see in lguest_asm.S. This is a hack which we are currently 179 * the code we see in lguest_asm.S. This is a hack which we are currently
149 * plotting to replace with the normal Linux entry point. */ 180 * plotting to replace with the normal Linux entry point. */
150static unsigned long entry_point(void *start, void *end, 181static unsigned long entry_point(const void *start, const void *end,
151 unsigned long page_offset) 182 unsigned long page_offset)
152{ 183{
153 void *p; 184 const void *p;
154 185
155 /* The scan gives us the physical starting address. We want the 186 /* The scan gives us the physical starting address. We want the
156 * virtual address in this case, and fortunately, we already figured 187 * virtual address in this case, and fortunately, we already figured
@@ -158,7 +189,8 @@ static unsigned long entry_point(void *start, void *end,
158 * "page_offset". */ 189 * "page_offset". */
159 for (p = start; p < end; p++) 190 for (p = start; p < end; p++)
160 if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0) 191 if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
161 return (long)p + strlen("GenuineLguest") + page_offset; 192 return to_guest_phys(p + strlen("GenuineLguest"))
193 + page_offset;
162 194
163 errx(1, "Is this image a genuine lguest?"); 195 errx(1, "Is this image a genuine lguest?");
164} 196}
@@ -201,9 +233,9 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
201static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, 233static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
202 unsigned long *page_offset) 234 unsigned long *page_offset)
203{ 235{
236 void *start = (void *)-1, *end = NULL;
204 Elf32_Phdr phdr[ehdr->e_phnum]; 237 Elf32_Phdr phdr[ehdr->e_phnum];
205 unsigned int i; 238 unsigned int i;
206 unsigned long start = -1UL, end = 0;
207 239
208 /* Sanity checks on the main ELF header: an x86 executable with a 240 /* Sanity checks on the main ELF header: an x86 executable with a
209 * reasonable number of correctly-sized program headers. */ 241 * reasonable number of correctly-sized program headers. */
@@ -246,17 +278,17 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
246 278
247 /* We track the first and last address we mapped, so we can 279 /* We track the first and last address we mapped, so we can
248 * tell entry_point() where to scan. */ 280 * tell entry_point() where to scan. */
249 if (phdr[i].p_paddr < start) 281 if (from_guest_phys(phdr[i].p_paddr) < start)
250 start = phdr[i].p_paddr; 282 start = from_guest_phys(phdr[i].p_paddr);
251 if (phdr[i].p_paddr + phdr[i].p_filesz > end) 283 if (from_guest_phys(phdr[i].p_paddr) + phdr[i].p_filesz > end)
252 end = phdr[i].p_paddr + phdr[i].p_filesz; 284 end=from_guest_phys(phdr[i].p_paddr)+phdr[i].p_filesz;
253 285
254 /* We map this section of the file at its physical address. */ 286 /* We map this section of the file at its physical address. */
255 map_at(elf_fd, (void *)phdr[i].p_paddr, 287 map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
256 phdr[i].p_offset, phdr[i].p_filesz); 288 phdr[i].p_offset, phdr[i].p_filesz);
257 } 289 }
258 290
259 return entry_point((void *)start, (void *)end, *page_offset); 291 return entry_point(start, end, *page_offset);
260} 292}
261 293
262/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated. 294/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated.
@@ -307,7 +339,7 @@ static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
307 * actually configurable as CONFIG_PHYSICAL_START, but as the comment 339 * actually configurable as CONFIG_PHYSICAL_START, but as the comment
308 * there says, "Don't change this unless you know what you are doing". 340 * there says, "Don't change this unless you know what you are doing".
309 * Indeed. */ 341 * Indeed. */
310 void *img = (void *)0x100000; 342 void *img = from_guest_phys(0x100000);
311 343
312 /* gzdopen takes our file descriptor (carefully placed at the start of 344 /* gzdopen takes our file descriptor (carefully placed at the start of
313 * the GZIP header we found) and returns a gzFile. */ 345 * the GZIP header we found) and returns a gzFile. */
@@ -421,7 +453,7 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
421 /* We map the initrd at the top of memory, but mmap wants it to be 453 /* We map the initrd at the top of memory, but mmap wants it to be
422 * page-aligned, so we round the size up for that. */ 454 * page-aligned, so we round the size up for that. */
423 len = page_align(st.st_size); 455 len = page_align(st.st_size);
424 map_at(ifd, (void *)mem - len, 0, st.st_size); 456 map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
425 /* Once a file is mapped, you can close the file descriptor. It's a 457 /* Once a file is mapped, you can close the file descriptor. It's a
426 * little odd, but quite useful. */ 458 * little odd, but quite useful. */
427 close(ifd); 459 close(ifd);
@@ -431,9 +463,9 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
431 return len; 463 return len;
432} 464}
433 465
434/* Once we know how much memory we have, and the address the Guest kernel 466/* Once we know the address the Guest kernel expects, we can construct simple
435 * expects, we can construct simple linear page tables which will get the Guest 467 * linear page tables for all of memory which will get the Guest far enough
436 * far enough into the boot to create its own. 468 * into the boot to create its own.
437 * 469 *
438 * We lay them out of the way, just below the initrd (which is why we need to 470 * We lay them out of the way, just below the initrd (which is why we need to
439 * know its size). */ 471 * know its size). */
@@ -457,7 +489,7 @@ static unsigned long setup_pagetables(unsigned long mem,
457 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; 489 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
458 490
459 /* We put the toplevel page directory page at the top of memory. */ 491 /* We put the toplevel page directory page at the top of memory. */
460 pgdir = (void *)mem - initrd_size - getpagesize(); 492 pgdir = from_guest_phys(mem) - initrd_size - getpagesize();
461 493
462 /* Now we use the next linear_pages pages as pte pages */ 494 /* Now we use the next linear_pages pages as pte pages */
463 linear = (void *)pgdir - linear_pages*getpagesize(); 495 linear = (void *)pgdir - linear_pages*getpagesize();
@@ -473,15 +505,16 @@ static unsigned long setup_pagetables(unsigned long mem,
473 * continue from there. */ 505 * continue from there. */
474 for (i = 0; i < mapped_pages; i += ptes_per_page) { 506 for (i = 0; i < mapped_pages; i += ptes_per_page) {
475 pgdir[(i + page_offset/getpagesize())/ptes_per_page] 507 pgdir[(i + page_offset/getpagesize())/ptes_per_page]
476 = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT); 508 = ((to_guest_phys(linear) + i*sizeof(u32))
509 | PAGE_PRESENT);
477 } 510 }
478 511
479 verbose("Linear mapping of %u pages in %u pte pages at %p\n", 512 verbose("Linear mapping of %u pages in %u pte pages at %#lx\n",
480 mapped_pages, linear_pages, linear); 513 mapped_pages, linear_pages, to_guest_phys(linear));
481 514
482 /* We return the top level (guest-physical) address: the kernel needs 515 /* We return the top level (guest-physical) address: the kernel needs
483 * to know where it is. */ 516 * to know where it is. */
484 return (unsigned long)pgdir; 517 return to_guest_phys(pgdir);
485} 518}
486 519
487/* Simple routine to roll all the commandline arguments together with spaces 520/* Simple routine to roll all the commandline arguments together with spaces
@@ -501,14 +534,19 @@ static void concat(char *dst, char *args[])
501 534
502/* This is where we actually tell the kernel to initialize the Guest. We saw 535/* This is where we actually tell the kernel to initialize the Guest. We saw
503 * the arguments it expects when we looked at initialize() in lguest_user.c: 536 * the arguments it expects when we looked at initialize() in lguest_user.c:
504 * the top physical page to allow, the top level pagetable, the entry point and 537 * the base of guest "physical" memory, the top physical page to allow, the
505 * the page_offset constant for the Guest. */ 538 * top level pagetable, the entry point and the page_offset constant for the
539 * Guest. */
506static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) 540static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
507{ 541{
508 u32 args[] = { LHREQ_INITIALIZE, 542 u32 args[] = { LHREQ_INITIALIZE,
509 top/getpagesize(), pgdir, start, page_offset }; 543 (unsigned long)guest_base,
544 guest_limit / getpagesize(),
545 pgdir, start, page_offset };
510 int fd; 546 int fd;
511 547
548 verbose("Guest: %p - %p (%#lx)\n",
549 guest_base, guest_base + guest_limit, guest_limit);
512 fd = open_or_die("/dev/lguest", O_RDWR); 550 fd = open_or_die("/dev/lguest", O_RDWR);
513 if (write(fd, args, sizeof(args)) < 0) 551 if (write(fd, args, sizeof(args)) < 0)
514 err(1, "Writing to /dev/lguest"); 552 err(1, "Writing to /dev/lguest");
@@ -605,11 +643,11 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
605{ 643{
606 /* We have to separately check addr and addr+size, because size could 644 /* We have to separately check addr and addr+size, because size could
607 * be huge and addr + size might wrap around. */ 645 * be huge and addr + size might wrap around. */
608 if (addr >= top || addr + size >= top) 646 if (addr >= guest_limit || addr + size >= guest_limit)
609 errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr); 647 errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
610 /* We return a pointer for the caller's convenience, now we know it's 648 /* We return a pointer for the caller's convenience, now we know it's
611 * safe to use. */ 649 * safe to use. */
612 return (void *)addr; 650 return from_guest_phys(addr);
613} 651}
614/* A macro which transparently hands the line number to the real function. */ 652/* A macro which transparently hands the line number to the real function. */
615#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 653#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
@@ -646,7 +684,7 @@ static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
646static u32 *get_dma_buffer(int fd, void *key, 684static u32 *get_dma_buffer(int fd, void *key,
647 struct iovec iov[], unsigned int *num, u32 *irq) 685 struct iovec iov[], unsigned int *num, u32 *irq)
648{ 686{
649 u32 buf[] = { LHREQ_GETDMA, (u32)key }; 687 u32 buf[] = { LHREQ_GETDMA, to_guest_phys(key) };
650 unsigned long udma; 688 unsigned long udma;
651 u32 *res; 689 u32 *res;
652 690
@@ -998,11 +1036,11 @@ new_dev_desc(struct lguest_device_desc *descs,
998 descs[i].features = features; 1036 descs[i].features = features;
999 descs[i].num_pages = num_pages; 1037 descs[i].num_pages = num_pages;
1000 /* If they said the device needs memory, we allocate 1038 /* If they said the device needs memory, we allocate
1001 * that now, bumping up the top of Guest memory. */ 1039 * that now. */
1002 if (num_pages) { 1040 if (num_pages) {
1003 map_zeroed_pages(top, num_pages); 1041 unsigned long pa;
1004 descs[i].pfn = top/getpagesize(); 1042 pa = to_guest_phys(get_pages(num_pages));
1005 top += num_pages*getpagesize(); 1043 descs[i].pfn = pa / getpagesize();
1006 } 1044 }
1007 return &descs[i]; 1045 return &descs[i];
1008 } 1046 }
@@ -1040,9 +1078,9 @@ static struct device *new_device(struct device_list *devices,
1040 if (handle_input) 1078 if (handle_input)
1041 set_fd(dev->fd, devices); 1079 set_fd(dev->fd, devices);
1042 dev->desc = new_dev_desc(devices->descs, type, features, num_pages); 1080 dev->desc = new_dev_desc(devices->descs, type, features, num_pages);
1043 dev->mem = (void *)(dev->desc->pfn * getpagesize()); 1081 dev->mem = from_guest_phys(dev->desc->pfn * getpagesize());
1044 dev->handle_input = handle_input; 1082 dev->handle_input = handle_input;
1045 dev->watch_key = (unsigned long)dev->mem + watch_off; 1083 dev->watch_key = to_guest_phys(dev->mem) + watch_off;
1046 dev->handle_output = handle_output; 1084 dev->handle_output = handle_output;
1047 return dev; 1085 return dev;
1048} 1086}
@@ -1382,21 +1420,7 @@ static void usage(void)
1382 "<mem-in-mb> vmlinux [args...]"); 1420 "<mem-in-mb> vmlinux [args...]");
1383} 1421}
1384 1422
1385/*L:100 The Launcher code itself takes us out into userspace, that scary place 1423/*L:105 The main routine is where the real work begins: */
1386 * where pointers run wild and free! Unfortunately, like most userspace
1387 * programs, it's quite boring (which is why everyone like to hack on the
1388 * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
1389 * will get you through this section. Or, maybe not.
1390 *
1391 * The Launcher binary sits up high, usually starting at address 0xB8000000.
1392 * Everything below this is the "physical" memory for the Guest. For example,
1393 * if the Guest were to write a "1" at physical address 0, we would see a "1"
1394 * in the Launcher at "(int *)0". Guest physical == Launcher virtual.
1395 *
1396 * This can be tough to get your head around, but usually it just means that we
1397 * don't need to do any conversion when the Guest gives us it's "physical"
1398 * addresses.
1399 */
1400int main(int argc, char *argv[]) 1424int main(int argc, char *argv[])
1401{ 1425{
1402 /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size 1426 /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size
@@ -1406,8 +1430,8 @@ int main(int argc, char *argv[])
1406 int i, c, lguest_fd; 1430 int i, c, lguest_fd;
1407 /* The list of Guest devices, based on command line arguments. */ 1431 /* The list of Guest devices, based on command line arguments. */
1408 struct device_list device_list; 1432 struct device_list device_list;
1409 /* The boot information for the Guest: at guest-physical address 0. */ 1433 /* The boot information for the Guest. */
1410 void *boot = (void *)0; 1434 void *boot;
1411 /* If they specify an initrd file to load. */ 1435 /* If they specify an initrd file to load. */
1412 const char *initrd_name = NULL; 1436 const char *initrd_name = NULL;
1413 1437
@@ -1427,9 +1451,16 @@ int main(int argc, char *argv[])
1427 * of memory now. */ 1451 * of memory now. */
1428 for (i = 1; i < argc; i++) { 1452 for (i = 1; i < argc; i++) {
1429 if (argv[i][0] != '-') { 1453 if (argv[i][0] != '-') {
1430 mem = top = atoi(argv[i]) * 1024 * 1024; 1454 mem = atoi(argv[i]) * 1024 * 1024;
1431 device_list.descs = map_zeroed_pages(top, 1); 1455 /* We start by mapping anonymous pages over all of
1432 top += getpagesize(); 1456 * guest-physical memory range. This fills it with 0,
1457 * and ensures that the Guest won't be killed when it
1458 * tries to access it. */
1459 guest_base = map_zeroed_pages(mem / getpagesize()
1460 + DEVICE_PAGES);
1461 guest_limit = mem;
1462 guest_max = mem + DEVICE_PAGES*getpagesize();
1463 device_list.descs = get_pages(1);
1433 break; 1464 break;
1434 } 1465 }
1435 } 1466 }
@@ -1462,18 +1493,18 @@ int main(int argc, char *argv[])
1462 if (optind + 2 > argc) 1493 if (optind + 2 > argc)
1463 usage(); 1494 usage();
1464 1495
1496 verbose("Guest base is at %p\n", guest_base);
1497
1465 /* We always have a console device */ 1498 /* We always have a console device */
1466 setup_console(&device_list); 1499 setup_console(&device_list);
1467 1500
1468 /* We start by mapping anonymous pages over all of guest-physical
1469 * memory range. This fills it with 0, and ensures that the Guest
1470 * won't be killed when it tries to access it. */
1471 map_zeroed_pages(0, mem / getpagesize());
1472
1473 /* Now we load the kernel */ 1501 /* Now we load the kernel */
1474 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), 1502 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
1475 &page_offset); 1503 &page_offset);
1476 1504
1505 /* Boot information is stashed at physical address 0 */
1506 boot = from_guest_phys(0);
1507
1477 /* Map the initrd image if requested (at top of physical memory) */ 1508 /* Map the initrd image if requested (at top of physical memory) */
1478 if (initrd_name) { 1509 if (initrd_name) {
1479 initrd_size = load_initrd(initrd_name, mem); 1510 initrd_size = load_initrd(initrd_name, mem);
@@ -1495,7 +1526,7 @@ int main(int argc, char *argv[])
1495 = ((struct e820entry) { 0, mem, E820_RAM }); 1526 = ((struct e820entry) { 0, mem, E820_RAM });
1496 /* The boot header contains a command line pointer: we put the command 1527 /* The boot header contains a command line pointer: we put the command
1497 * line after the boot header (at address 4096) */ 1528 * line after the boot header (at address 4096) */
1498 *(void **)(boot + 0x228) = boot + 4096; 1529 *(u32 *)(boot + 0x228) = 4096;
1499 concat(boot + 4096, argv+optind+2); 1530 concat(boot + 4096, argv+optind+2);
1500 1531
1501 /* The guest type value of "1" tells the Guest it's under lguest. */ 1532 /* The guest type value of "1" tells the Guest it's under lguest. */