about summary refs log tree commit diff stats
path: root/Documentation/lguest/lguest.c
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/lguest/lguest.c')
-rw-r--r-- Documentation/lguest/lguest.c 540
1 files changed, 349 insertions, 191 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 45d7d6dcae7a..aa66a52b73e9 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1,7 +1,9 @@
1/*P:100 This is the Launcher code, a simple program which lays out the 1/*P:100
2 * "physical" memory for the new Guest by mapping the kernel image and 2 * This is the Launcher code, a simple program which lays out the "physical"
3 * the virtual devices, then opens /dev/lguest to tell the kernel 3 * memory for the new Guest by mapping the kernel image and the virtual
4 * about the Guest and control it. :*/ 4 * devices, then opens /dev/lguest to tell the kernel about the Guest and
5 * control it.
6:*/
5#define _LARGEFILE64_SOURCE 7#define _LARGEFILE64_SOURCE
6#define _GNU_SOURCE 8#define _GNU_SOURCE
7#include <stdio.h> 9#include <stdio.h>
@@ -46,13 +48,15 @@
46#include "linux/virtio_rng.h" 48#include "linux/virtio_rng.h"
47#include "linux/virtio_ring.h" 49#include "linux/virtio_ring.h"
48#include "asm/bootparam.h" 50#include "asm/bootparam.h"
49/*L:110 We can ignore the 39 include files we need for this program, but I do 51/*L:110
50 * want to draw attention to the use of kernel-style types. 52 * We can ignore the 39 include files we need for this program, but I do want
53 * to draw attention to the use of kernel-style types.
51 * 54 *
52 * As Linus said, "C is a Spartan language, and so should your naming be." I 55 * As Linus said, "C is a Spartan language, and so should your naming be." I
53 * like these abbreviations, so we define them here. Note that u64 is always 56 * like these abbreviations, so we define them here. Note that u64 is always
54 * unsigned long long, which works on all Linux systems: this means that we can 57 * unsigned long long, which works on all Linux systems: this means that we can
55 * use %llu in printf for any u64. */ 58 * use %llu in printf for any u64.
59 */
56typedef unsigned long long u64; 60typedef unsigned long long u64;
57typedef uint32_t u32; 61typedef uint32_t u32;
58typedef uint16_t u16; 62typedef uint16_t u16;
@@ -69,8 +73,10 @@ typedef uint8_t u8;
69/* This will occupy 3 pages: it must be a power of 2. */ 73/* This will occupy 3 pages: it must be a power of 2. */
70#define VIRTQUEUE_NUM 256 74#define VIRTQUEUE_NUM 256
71 75
72/*L:120 verbose is both a global flag and a macro. The C preprocessor allows 76/*L:120
73 * this, and although I wouldn't recommend it, it works quite nicely here. */ 77 * verbose is both a global flag and a macro. The C preprocessor allows
78 * this, and although I wouldn't recommend it, it works quite nicely here.
79 */
74static bool verbose; 80static bool verbose;
75#define verbose(args...) \ 81#define verbose(args...) \
76 do { if (verbose) printf(args); } while(0) 82 do { if (verbose) printf(args); } while(0)
@@ -100,8 +106,7 @@ struct device_list
100 106
101 /* A single linked list of devices. */ 107 /* A single linked list of devices. */
102 struct device *dev; 108 struct device *dev;
103 /* And a pointer to the last device for easy append and also for 109 /* And a pointer to the last device for easy append. */
104 * configuration appending. */
105 struct device *lastdev; 110 struct device *lastdev;
106}; 111};
107 112
@@ -168,20 +173,24 @@ static char **main_args;
168/* The original tty settings to restore on exit. */ 173/* The original tty settings to restore on exit. */
169static struct termios orig_term; 174static struct termios orig_term;
170 175
171/* We have to be careful with barriers: our devices are all run in separate 176/*
177 * We have to be careful with barriers: our devices are all run in separate
172 * threads and so we need to make sure that changes visible to the Guest happen 178 * threads and so we need to make sure that changes visible to the Guest happen
173 * in precise order. */ 179 * in precise order.
180 */
174#define wmb() __asm__ __volatile__("" : : : "memory") 181#define wmb() __asm__ __volatile__("" : : : "memory")
175#define mb() __asm__ __volatile__("" : : : "memory") 182#define mb() __asm__ __volatile__("" : : : "memory")
176 183
177/* Convert an iovec element to the given type. 184/*
185 * Convert an iovec element to the given type.
178 * 186 *
179 * This is a fairly ugly trick: we need to know the size of the type and 187 * This is a fairly ugly trick: we need to know the size of the type and
180 * alignment requirement to check the pointer is kosher. It's also nice to 188 * alignment requirement to check the pointer is kosher. It's also nice to
181 * have the name of the type in case we report failure. 189 * have the name of the type in case we report failure.
182 * 190 *
183 * Typing those three things all the time is cumbersome and error prone, so we 191 * Typing those three things all the time is cumbersome and error prone, so we
184 * have a macro which sets them all up and passes to the real function. */ 192 * have a macro which sets them all up and passes to the real function.
193 */
185#define convert(iov, type) \ 194#define convert(iov, type) \
186 ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) 195 ((type *)_convert((iov), sizeof(type), __alignof__(type), #type))
187 196
@@ -198,8 +207,10 @@ static void *_convert(struct iovec *iov, size_t size, size_t align,
198/* Wrapper for the last available index. Makes it easier to change. */ 207/* Wrapper for the last available index. Makes it easier to change. */
199#define lg_last_avail(vq) ((vq)->last_avail_idx) 208#define lg_last_avail(vq) ((vq)->last_avail_idx)
200 209
201/* The virtio configuration space is defined to be little-endian. x86 is 210/*
202 * little-endian too, but it's nice to be explicit so we have these helpers. */ 211 * The virtio configuration space is defined to be little-endian. x86 is
212 * little-endian too, but it's nice to be explicit so we have these helpers.
213 */
203#define cpu_to_le16(v16) (v16) 214#define cpu_to_le16(v16) (v16)
204#define cpu_to_le32(v32) (v32) 215#define cpu_to_le32(v32) (v32)
205#define cpu_to_le64(v64) (v64) 216#define cpu_to_le64(v64) (v64)
@@ -241,11 +252,12 @@ static u8 *get_feature_bits(struct device *dev)
241 + dev->num_vq * sizeof(struct lguest_vqconfig); 252 + dev->num_vq * sizeof(struct lguest_vqconfig);
242} 253}
243 254
244/*L:100 The Launcher code itself takes us out into userspace, that scary place 255/*L:100
245 * where pointers run wild and free! Unfortunately, like most userspace 256 * The Launcher code itself takes us out into userspace, that scary place where
246 * programs, it's quite boring (which is why everyone likes to hack on the 257 * pointers run wild and free! Unfortunately, like most userspace programs,
247 * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it 258 * it's quite boring (which is why everyone likes to hack on the kernel!).
248 * will get you through this section. Or, maybe not. 259 * Perhaps if you make up an Lguest Drinking Game at this point, it will get
260 * you through this section. Or, maybe not.
249 * 261 *
250 * The Launcher sets up a big chunk of memory to be the Guest's "physical" 262 * The Launcher sets up a big chunk of memory to be the Guest's "physical"
251 * memory and stores it in "guest_base". In other words, Guest physical == 263 * memory and stores it in "guest_base". In other words, Guest physical ==
@@ -253,7 +265,8 @@ static u8 *get_feature_bits(struct device *dev)
253 * 265 *
254 * This can be tough to get your head around, but usually it just means that we 266 * This can be tough to get your head around, but usually it just means that we
255 * use these trivial conversion functions when the Guest gives us its 267 * use these trivial conversion functions when the Guest gives us its
256 * "physical" addresses: */ 268 * "physical" addresses:
269 */
257static void *from_guest_phys(unsigned long addr) 270static void *from_guest_phys(unsigned long addr)
258{ 271{
259 return guest_base + addr; 272 return guest_base + addr;
@@ -268,7 +281,8 @@ static unsigned long to_guest_phys(const void *addr)
268 * Loading the Kernel. 281 * Loading the Kernel.
269 * 282 *
270 * We start with a couple of simple helper routines. open_or_die() avoids 283 * We start with a couple of simple helper routines. open_or_die() avoids
271 * error-checking code cluttering the callers: */ 284 * error-checking code cluttering the callers:
285 */
272static int open_or_die(const char *name, int flags) 286static int open_or_die(const char *name, int flags)
273{ 287{
274 int fd = open(name, flags); 288 int fd = open(name, flags);
@@ -283,8 +297,10 @@ static void *map_zeroed_pages(unsigned int num)
283 int fd = open_or_die("/dev/zero", O_RDONLY); 297 int fd = open_or_die("/dev/zero", O_RDONLY);
284 void *addr; 298 void *addr;
285 299
286 /* We use a private mapping (ie. if we write to the page, it will be 300 /*
287 * copied). */ 301 * We use a private mapping (ie. if we write to the page, it will be
302 * copied).
303 */
288 addr = mmap(NULL, getpagesize() * num, 304 addr = mmap(NULL, getpagesize() * num,
289 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); 305 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
290 if (addr == MAP_FAILED) 306 if (addr == MAP_FAILED)
@@ -305,20 +321,24 @@ static void *get_pages(unsigned int num)
305 return addr; 321 return addr;
306} 322}
307 323
308/* This routine is used to load the kernel or initrd. It tries mmap, but if 324/*
325 * This routine is used to load the kernel or initrd. It tries mmap, but if
309 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), 326 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
310 * it falls back to reading the memory in. */ 327 * it falls back to reading the memory in.
328 */
311static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) 329static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
312{ 330{
313 ssize_t r; 331 ssize_t r;
314 332
315 /* We map writable even though some segments are marked read-only. 333 /*
334 * We map writable even though some segments are marked read-only.
316 * The kernel really wants to be writable: it patches its own 335 * The kernel really wants to be writable: it patches its own
317 * instructions. 336 * instructions.
318 * 337 *
319 * MAP_PRIVATE means that the page won't be copied until a write is 338 * MAP_PRIVATE means that the page won't be copied until a write is
320 * done to it. This allows us to share untouched memory between 339 * done to it. This allows us to share untouched memory between
321 * Guests. */ 340 * Guests.
341 */
322 if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC, 342 if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
323 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) 343 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
324 return; 344 return;
@@ -329,7 +349,8 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
329 err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); 349 err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
330} 350}
331 351
332/* This routine takes an open vmlinux image, which is in ELF, and maps it into 352/*
353 * This routine takes an open vmlinux image, which is in ELF, and maps it into
333 * the Guest memory. ELF = Executable and Linkable Format, which is the format used 354 * the Guest memory. ELF = Executable and Linkable Format, which is the format used
334 * by all modern binaries on Linux including the kernel. 355 * by all modern binaries on Linux including the kernel.
335 * 356 *
@@ -337,23 +358,28 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
337 * address. We use the physical address; the Guest will map itself to the 358 * address. We use the physical address; the Guest will map itself to the
338 * virtual address. 359 * virtual address.
339 * 360 *
340 * We return the starting address. */ 361 * We return the starting address.
362 */
341static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) 363static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
342{ 364{
343 Elf32_Phdr phdr[ehdr->e_phnum]; 365 Elf32_Phdr phdr[ehdr->e_phnum];
344 unsigned int i; 366 unsigned int i;
345 367
346 /* Sanity checks on the main ELF header: an x86 executable with a 368 /*
347 * reasonable number of correctly-sized program headers. */ 369 * Sanity checks on the main ELF header: an x86 executable with a
370 * reasonable number of correctly-sized program headers.
371 */
348 if (ehdr->e_type != ET_EXEC 372 if (ehdr->e_type != ET_EXEC
349 || ehdr->e_machine != EM_386 373 || ehdr->e_machine != EM_386
350 || ehdr->e_phentsize != sizeof(Elf32_Phdr) 374 || ehdr->e_phentsize != sizeof(Elf32_Phdr)
351 || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) 375 || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
352 errx(1, "Malformed elf header"); 376 errx(1, "Malformed elf header");
353 377
354 /* An ELF executable contains an ELF header and a number of "program" 378 /*
379 * An ELF executable contains an ELF header and a number of "program"
355 * headers which indicate which parts ("segments") of the program to 380 * headers which indicate which parts ("segments") of the program to
356 * load where. */ 381 * load where.
382 */
357 383
358 /* We read in all the program headers at once: */ 384 /* We read in all the program headers at once: */
359 if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) 385 if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
@@ -361,8 +387,10 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
361 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) 387 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
362 err(1, "Reading program headers"); 388 err(1, "Reading program headers");
363 389
364 /* Try all the headers: there are usually only three. A read-only one, 390 /*
365 * a read-write one, and a "note" section which we don't load. */ 391 * Try all the headers: there are usually only three. A read-only one,
392 * a read-write one, and a "note" section which we don't load.
393 */
366 for (i = 0; i < ehdr->e_phnum; i++) { 394 for (i = 0; i < ehdr->e_phnum; i++) {
367 /* If this isn't a loadable segment, we ignore it */ 395 /* If this isn't a loadable segment, we ignore it */
368 if (phdr[i].p_type != PT_LOAD) 396 if (phdr[i].p_type != PT_LOAD)
@@ -380,13 +408,15 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
380 return ehdr->e_entry; 408 return ehdr->e_entry;
381} 409}
382 410
383/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're 411/*L:150
384 * supposed to jump into it and it will unpack itself. We used to have to 412 * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed
385 * perform some hairy magic because the unpacking code scared me. 413 * to jump into it and it will unpack itself. We used to have to perform some
414 * hairy magic because the unpacking code scared me.
386 * 415 *
387 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote 416 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
388 * a small patch to jump over the tricky bits in the Guest, so now we just read 417 * a small patch to jump over the tricky bits in the Guest, so now we just read
389 * the funky header so we know where in the file to load, and away we go! */ 418 * the funky header so we know where in the file to load, and away we go!
419 */
390static unsigned long load_bzimage(int fd) 420static unsigned long load_bzimage(int fd)
391{ 421{
392 struct boot_params boot; 422 struct boot_params boot;
@@ -394,8 +424,10 @@ static unsigned long load_bzimage(int fd)
394 /* Modern bzImages get loaded at 1M. */ 424 /* Modern bzImages get loaded at 1M. */
395 void *p = from_guest_phys(0x100000); 425 void *p = from_guest_phys(0x100000);
396 426
397 /* Go back to the start of the file and read the header. It should be 427 /*
398 * a Linux boot header (see Documentation/x86/i386/boot.txt) */ 428 * Go back to the start of the file and read the header. It should be
429 * a Linux boot header (see Documentation/x86/i386/boot.txt)
430 */
399 lseek(fd, 0, SEEK_SET); 431 lseek(fd, 0, SEEK_SET);
400 read(fd, &boot, sizeof(boot)); 432 read(fd, &boot, sizeof(boot));
401 433
@@ -414,9 +446,11 @@ static unsigned long load_bzimage(int fd)
414 return boot.hdr.code32_start; 446 return boot.hdr.code32_start;
415} 447}
416 448
417/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels 449/*L:140
450 * Loading the kernel is easy when it's a "vmlinux", but most kernels
418 * come wrapped up in the self-decompressing "bzImage" format. With a little 451 * come wrapped up in the self-decompressing "bzImage" format. With a little
419 * work, we can load those, too. */ 452 * work, we can load those, too.
453 */
420static unsigned long load_kernel(int fd) 454static unsigned long load_kernel(int fd)
421{ 455{
422 Elf32_Ehdr hdr; 456 Elf32_Ehdr hdr;
@@ -433,24 +467,28 @@ static unsigned long load_kernel(int fd)
433 return load_bzimage(fd); 467 return load_bzimage(fd);
434} 468}
435 469
436/* This is a trivial little helper to align pages. Andi Kleen hated it because 470/*
471 * This is a trivial little helper to align pages. Andi Kleen hated it because
437 * it calls getpagesize() twice: "it's dumb code." 472 * it calls getpagesize() twice: "it's dumb code."
438 * 473 *
439 * Kernel guys get really het up about optimization, even when it's not 474 * Kernel guys get really het up about optimization, even when it's not
440 * necessary. I leave this code as a reaction against that. */ 475 * necessary. I leave this code as a reaction against that.
476 */
441static inline unsigned long page_align(unsigned long addr) 477static inline unsigned long page_align(unsigned long addr)
442{ 478{
443 /* Add upwards and truncate downwards. */ 479 /* Add upwards and truncate downwards. */
444 return ((addr + getpagesize()-1) & ~(getpagesize()-1)); 480 return ((addr + getpagesize()-1) & ~(getpagesize()-1));
445} 481}
446 482
447/*L:180 An "initial ram disk" is a disk image loaded into memory along with 483/*L:180
448 * the kernel which the kernel can use to boot from without needing any 484 * An "initial ram disk" is a disk image loaded into memory along with the
449 * drivers. Most distributions now use this as standard: the initrd contains 485 * kernel which the kernel can use to boot from without needing any drivers.
450 * the code to load the appropriate driver modules for the current machine. 486 * Most distributions now use this as standard: the initrd contains the code to
487 * load the appropriate driver modules for the current machine.
451 * 488 *
452 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its 489 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
453 * kernels. He sent me this (and tells me when I break it). */ 490 * kernels. He sent me this (and tells me when I break it).
491 */
454static unsigned long load_initrd(const char *name, unsigned long mem) 492static unsigned long load_initrd(const char *name, unsigned long mem)
455{ 493{
456 int ifd; 494 int ifd;
@@ -462,12 +500,16 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
462 if (fstat(ifd, &st) < 0) 500 if (fstat(ifd, &st) < 0)
463 err(1, "fstat() on initrd '%s'", name); 501 err(1, "fstat() on initrd '%s'", name);
464 502
465 /* We map the initrd at the top of memory, but mmap wants it to be 503 /*
466 * page-aligned, so we round the size up for that. */ 504 * We map the initrd at the top of memory, but mmap wants it to be
505 * page-aligned, so we round the size up for that.
506 */
467 len = page_align(st.st_size); 507 len = page_align(st.st_size);
468 map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); 508 map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
469 /* Once a file is mapped, you can close the file descriptor. It's a 509 /*
470 * little odd, but quite useful. */ 510 * Once a file is mapped, you can close the file descriptor. It's a
511 * little odd, but quite useful.
512 */
471 close(ifd); 513 close(ifd);
472 verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); 514 verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
473 515
@@ -476,8 +518,10 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
476} 518}
477/*:*/ 519/*:*/
478 520
479/* Simple routine to roll all the commandline arguments together with spaces 521/*
480 * between them. */ 522 * Simple routine to roll all the commandline arguments together with spaces
523 * between them.
524 */
481static void concat(char *dst, char *args[]) 525static void concat(char *dst, char *args[])
482{ 526{
483 unsigned int i, len = 0; 527 unsigned int i, len = 0;
@@ -494,10 +538,12 @@ static void concat(char *dst, char *args[])
494 dst[len] = '\0'; 538 dst[len] = '\0';
495} 539}
496 540
497/*L:185 This is where we actually tell the kernel to initialize the Guest. We 541/*L:185
542 * This is where we actually tell the kernel to initialize the Guest. We
498 * saw the arguments it expects when we looked at initialize() in lguest_user.c: 543 * saw the arguments it expects when we looked at initialize() in lguest_user.c:
499 * the base of Guest "physical" memory, the top physical page to allow and the 544 * the base of Guest "physical" memory, the top physical page to allow and the
500 * entry point for the Guest. */ 545 * entry point for the Guest.
546 */
501static void tell_kernel(unsigned long start) 547static void tell_kernel(unsigned long start)
502{ 548{
503 unsigned long args[] = { LHREQ_INITIALIZE, 549 unsigned long args[] = { LHREQ_INITIALIZE,
@@ -522,20 +568,26 @@ static void tell_kernel(unsigned long start)
522static void *_check_pointer(unsigned long addr, unsigned int size, 568static void *_check_pointer(unsigned long addr, unsigned int size,
523 unsigned int line) 569 unsigned int line)
524{ 570{
525 /* We have to separately check addr and addr+size, because size could 571 /*
526 * be huge and addr + size might wrap around. */ 572 * We have to separately check addr and addr+size, because size could
573 * be huge and addr + size might wrap around.
574 */
527 if (addr >= guest_limit || addr + size >= guest_limit) 575 if (addr >= guest_limit || addr + size >= guest_limit)
528 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); 576 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
529 /* We return a pointer for the caller's convenience, now we know it's 577 /*
530 * safe to use. */ 578 * We return a pointer for the caller's convenience, now we know it's
579 * safe to use.
580 */
531 return from_guest_phys(addr); 581 return from_guest_phys(addr);
532} 582}
533/* A macro which transparently hands the line number to the real function. */ 583/* A macro which transparently hands the line number to the real function. */
534#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 584#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
535 585
536/* Each buffer in the virtqueues is actually a chain of descriptors. This 586/*
587 * Each buffer in the virtqueues is actually a chain of descriptors. This
537 * function returns the next descriptor in the chain, or vq->vring.num if we're 588 * function returns the next descriptor in the chain, or vq->vring.num if we're
538 * at the end. */ 589 * at the end.
590 */
539static unsigned next_desc(struct vring_desc *desc, 591static unsigned next_desc(struct vring_desc *desc,
540 unsigned int i, unsigned int max) 592 unsigned int i, unsigned int max)
541{ 593{
@@ -576,12 +628,14 @@ static void trigger_irq(struct virtqueue *vq)
576 err(1, "Triggering irq %i", vq->config.irq); 628 err(1, "Triggering irq %i", vq->config.irq);
577} 629}
578 630
579/* This looks in the virtqueue for the first available buffer, and converts 631/*
632 * This looks in the virtqueue for the first available buffer, and converts
580 * it to an iovec for convenient access. Since descriptors consist of some 633 * it to an iovec for convenient access. Since descriptors consist of some
581 * number of output then some number of input descriptors, it's actually two 634 * number of output then some number of input descriptors, it's actually two
582 * iovecs, but we pack them into one and note how many of each there were. 635 * iovecs, but we pack them into one and note how many of each there were.
583 * 636 *
584 * This function returns the descriptor number found. */ 637 * This function returns the descriptor number found.
638 */
585static unsigned wait_for_vq_desc(struct virtqueue *vq, 639static unsigned wait_for_vq_desc(struct virtqueue *vq,
586 struct iovec iov[], 640 struct iovec iov[],
587 unsigned int *out_num, unsigned int *in_num) 641 unsigned int *out_num, unsigned int *in_num)
@@ -599,8 +653,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
599 /* OK, now we need to know about added descriptors. */ 653 /* OK, now we need to know about added descriptors. */
600 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; 654 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
601 655
602 /* They could have slipped one in as we were doing that: make 656 /*
603 * sure it's written, then check again. */ 657 * They could have slipped one in as we were doing that: make
658 * sure it's written, then check again.
659 */
604 mb(); 660 mb();
605 if (last_avail != vq->vring.avail->idx) { 661 if (last_avail != vq->vring.avail->idx) {
606 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; 662 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
@@ -620,8 +676,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
620 errx(1, "Guest moved used index from %u to %u", 676 errx(1, "Guest moved used index from %u to %u",
621 last_avail, vq->vring.avail->idx); 677 last_avail, vq->vring.avail->idx);
622 678
623 /* Grab the next descriptor number they're advertising, and increment 679 /*
624 * the index we've seen. */ 680 * Grab the next descriptor number they're advertising, and increment
681 * the index we've seen.
682 */
625 head = vq->vring.avail->ring[last_avail % vq->vring.num]; 683 head = vq->vring.avail->ring[last_avail % vq->vring.num];
626 lg_last_avail(vq)++; 684 lg_last_avail(vq)++;
627 685
@@ -636,8 +694,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
636 desc = vq->vring.desc; 694 desc = vq->vring.desc;
637 i = head; 695 i = head;
638 696
639 /* If this is an indirect entry, then this buffer contains a descriptor 697 /*
640 * table which we handle as if it's any normal descriptor chain. */ 698 * If this is an indirect entry, then this buffer contains a descriptor
699 * table which we handle as if it's any normal descriptor chain.
700 */
641 if (desc[i].flags & VRING_DESC_F_INDIRECT) { 701 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
642 if (desc[i].len % sizeof(struct vring_desc)) 702 if (desc[i].len % sizeof(struct vring_desc))
643 errx(1, "Invalid size for indirect buffer table"); 703 errx(1, "Invalid size for indirect buffer table");
@@ -656,8 +716,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
656 if (desc[i].flags & VRING_DESC_F_WRITE) 716 if (desc[i].flags & VRING_DESC_F_WRITE)
657 (*in_num)++; 717 (*in_num)++;
658 else { 718 else {
659 /* If it's an output descriptor, they're all supposed 719 /*
660 * to come before any input descriptors. */ 720 * If it's an output descriptor, they're all supposed
721 * to come before any input descriptors.
722 */
661 if (*in_num) 723 if (*in_num)
662 errx(1, "Descriptor has out after in"); 724 errx(1, "Descriptor has out after in");
663 (*out_num)++; 725 (*out_num)++;
@@ -671,14 +733,18 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
671 return head; 733 return head;
672} 734}
673 735
674/* After we've used one of their buffers, we tell them about it. We'll then 736/*
675 * want to send them an interrupt, using trigger_irq(). */ 737 * After we've used one of their buffers, we tell them about it. We'll then
738 * want to send them an interrupt, using trigger_irq().
739 */
676static void add_used(struct virtqueue *vq, unsigned int head, int len) 740static void add_used(struct virtqueue *vq, unsigned int head, int len)
677{ 741{
678 struct vring_used_elem *used; 742 struct vring_used_elem *used;
679 743
680 /* The virtqueue contains a ring of used buffers. Get a pointer to the 744 /*
681 * next entry in that used ring. */ 745 * The virtqueue contains a ring of used buffers. Get a pointer to the
746 * next entry in that used ring.
747 */
682 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; 748 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
683 used->id = head; 749 used->id = head;
684 used->len = len; 750 used->len = len;
@@ -698,7 +764,8 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
698/* 764/*
699 * The Console 765 * The Console
700 * 766 *
701 * We associate some data with the console for our exit hack. */ 767 * We associate some data with the console for our exit hack.
768 */
702struct console_abort 769struct console_abort
703{ 770{
704 /* How many times have they hit ^C? */ 771 /* How many times have they hit ^C? */
@@ -725,20 +792,24 @@ static void console_input(struct virtqueue *vq)
725 if (len <= 0) { 792 if (len <= 0) {
726 /* Ran out of input? */ 793 /* Ran out of input? */
727 warnx("Failed to get console input, ignoring console."); 794 warnx("Failed to get console input, ignoring console.");
728 /* For simplicity, dying threads kill the whole Launcher. So 795 /*
729 * just nap here. */ 796 * For simplicity, dying threads kill the whole Launcher. So
797 * just nap here.
798 */
730 for (;;) 799 for (;;)
731 pause(); 800 pause();
732 } 801 }
733 802
734 add_used_and_trigger(vq, head, len); 803 add_used_and_trigger(vq, head, len);
735 804
736 /* Three ^C within one second? Exit. 805 /*
806 * Three ^C within one second? Exit.
737 * 807 *
738 * This is such a hack, but works surprisingly well. Each ^C has to 808 * This is such a hack, but works surprisingly well. Each ^C has to
739 * be in a buffer by itself, so they can't be too fast. But we check 809 * be in a buffer by itself, so they can't be too fast. But we check
740 * that we get three within about a second, so they can't be too 810 * that we get three within about a second, so they can't be too
741 * slow. */ 811 * slow.
812 */
742 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { 813 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
743 abort->count = 0; 814 abort->count = 0;
744 return; 815 return;
@@ -809,8 +880,7 @@ static bool will_block(int fd)
809 return select(fd+1, &fdset, NULL, NULL, &zero) != 1; 880 return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
810} 881}
811 882
812/* This is where we handle packets coming in from the tun device to our 883/* This handles packets coming in from the tun device to our Guest. */
813 * Guest. */
814static void net_input(struct virtqueue *vq) 884static void net_input(struct virtqueue *vq)
815{ 885{
816 int len; 886 int len;
@@ -842,8 +912,10 @@ static int do_thread(void *_vq)
842 return 0; 912 return 0;
843} 913}
844 914
845/* When a child dies, we kill our entire process group with SIGTERM. This 915/*
846 * also has the side effect that the shell restores the console for us! */ 916 * When a child dies, we kill our entire process group with SIGTERM. This
917 * also has the side effect that the shell restores the console for us!
918 */
847static void kill_launcher(int signal) 919static void kill_launcher(int signal)
848{ 920{
849 kill(0, SIGTERM); 921 kill(0, SIGTERM);
@@ -880,9 +952,10 @@ static void reset_device(struct device *dev)
880 952
881static void create_thread(struct virtqueue *vq) 953static void create_thread(struct virtqueue *vq)
882{ 954{
883 /* Create stack for thread and run it. Since stack grows 955 /*
884 * upwards, we point the stack pointer to the end of this 956 * Create stack for thread and run it. Since the stack grows downwards,
885 * region. */ 957 * we point the stack pointer to the end of this region.
958 */
886 char *stack = malloc(32768); 959 char *stack = malloc(32768);
887 unsigned long args[] = { LHREQ_EVENTFD, 960 unsigned long args[] = { LHREQ_EVENTFD,
888 vq->config.pfn*getpagesize(), 0 }; 961 vq->config.pfn*getpagesize(), 0 };
@@ -981,8 +1054,11 @@ static void handle_output(unsigned long addr)
981 } 1054 }
982 } 1055 }
983 1056
984 /* Early console write is done using notify on a nul-terminated string 1057 /*
985 * in Guest memory. */ 1058 * Early console write is done using notify on a nul-terminated string
1059 * in Guest memory. It's also great for hacking debugging messages
1060 * into a Guest.
1061 */
986 if (addr >= guest_limit) 1062 if (addr >= guest_limit)
987 errx(1, "Bad NOTIFY %#lx", addr); 1063 errx(1, "Bad NOTIFY %#lx", addr);
988 1064
@@ -998,10 +1074,12 @@ static void handle_output(unsigned long addr)
998 * routines to allocate and manage them. 1074 * routines to allocate and manage them.
999 */ 1075 */
1000 1076
1001/* The layout of the device page is a "struct lguest_device_desc" followed by a 1077/*
1078 * The layout of the device page is a "struct lguest_device_desc" followed by a
1002 * number of virtqueue descriptors, then two sets of feature bits, then an 1079 * number of virtqueue descriptors, then two sets of feature bits, then an
1003 * array of configuration bytes. This routine returns the configuration 1080 * array of configuration bytes. This routine returns the configuration
1004 * pointer. */ 1081 * pointer.
1082 */
1005static u8 *device_config(const struct device *dev) 1083static u8 *device_config(const struct device *dev)
1006{ 1084{
1007 return (void *)(dev->desc + 1) 1085 return (void *)(dev->desc + 1)
@@ -1009,9 +1087,11 @@ static u8 *device_config(const struct device *dev)
1009 + dev->feature_len * 2; 1087 + dev->feature_len * 2;
1010} 1088}
1011 1089
1012/* This routine allocates a new "struct lguest_device_desc" from descriptor 1090/*
1091 * This routine allocates a new "struct lguest_device_desc" from descriptor
1013 * table page just above the Guest's normal memory. It returns a pointer to 1092 * table page just above the Guest's normal memory. It returns a pointer to
1014 * that descriptor. */ 1093 * that descriptor.
1094 */
1015static struct lguest_device_desc *new_dev_desc(u16 type) 1095static struct lguest_device_desc *new_dev_desc(u16 type)
1016{ 1096{
1017 struct lguest_device_desc d = { .type = type }; 1097 struct lguest_device_desc d = { .type = type };
@@ -1032,8 +1112,10 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
1032 return memcpy(p, &d, sizeof(d)); 1112 return memcpy(p, &d, sizeof(d));
1033} 1113}
1034 1114
1035/* Each device descriptor is followed by the description of its virtqueues. We 1115/*
1036 * specify how many descriptors the virtqueue is to have. */ 1116 * Each device descriptor is followed by the description of its virtqueues. We
1117 * specify how many descriptors the virtqueue is to have.
1118 */
1037static void add_virtqueue(struct device *dev, unsigned int num_descs, 1119static void add_virtqueue(struct device *dev, unsigned int num_descs,
1038 void (*service)(struct virtqueue *)) 1120 void (*service)(struct virtqueue *))
1039{ 1121{
@@ -1061,10 +1143,12 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1061 /* Initialize the vring. */ 1143 /* Initialize the vring. */
1062 vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); 1144 vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN);
1063 1145
1064 /* Append virtqueue to this device's descriptor. We use 1146 /*
1147 * Append virtqueue to this device's descriptor. We use
1065 * device_config() to get the end of the device's current virtqueues; 1148 * device_config() to get the end of the device's current virtqueues;
1066 * we check that we haven't added any config or feature information 1149 * we check that we haven't added any config or feature information
1067 * yet, otherwise we'd be overwriting them. */ 1150 * yet, otherwise we'd be overwriting them.
1151 */
1068 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); 1152 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
1069 memcpy(device_config(dev), &vq->config, sizeof(vq->config)); 1153 memcpy(device_config(dev), &vq->config, sizeof(vq->config));
1070 dev->num_vq++; 1154 dev->num_vq++;
@@ -1072,14 +1156,18 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1072 1156
1073 verbose("Virtqueue page %#lx\n", to_guest_phys(p)); 1157 verbose("Virtqueue page %#lx\n", to_guest_phys(p));
1074 1158
1075 /* Add to tail of list, so dev->vq is first vq, dev->vq->next is 1159 /*
1076 * second. */ 1160 * Add to tail of list, so dev->vq is first vq, dev->vq->next is
1161 * second.
1162 */
1077 for (i = &dev->vq; *i; i = &(*i)->next); 1163 for (i = &dev->vq; *i; i = &(*i)->next);
1078 *i = vq; 1164 *i = vq;
1079} 1165}
1080 1166
1081/* The first half of the feature bitmask is for us to advertise features. The 1167/*
1082 * second half is for the Guest to accept features. */ 1168 * The first half of the feature bitmask is for us to advertise features. The
1169 * second half is for the Guest to accept features.
1170 */
1083static void add_feature(struct device *dev, unsigned bit) 1171static void add_feature(struct device *dev, unsigned bit)
1084{ 1172{
1085 u8 *features = get_feature_bits(dev); 1173 u8 *features = get_feature_bits(dev);
@@ -1093,9 +1181,11 @@ static void add_feature(struct device *dev, unsigned bit)
1093 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); 1181 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
1094} 1182}
1095 1183
1096/* This routine sets the configuration fields for an existing device's 1184/*
1185 * This routine sets the configuration fields for an existing device's
1097 * descriptor. It only works for the last device, but that's OK because that's 1186 * descriptor. It only works for the last device, but that's OK because that's
1098 * how we use it. */ 1187 * how we use it.
1188 */
1099static void set_config(struct device *dev, unsigned len, const void *conf) 1189static void set_config(struct device *dev, unsigned len, const void *conf)
1100{ 1190{
1101 /* Check we haven't overflowed our single page. */ 1191 /* Check we haven't overflowed our single page. */
@@ -1110,10 +1200,12 @@ static void set_config(struct device *dev, unsigned len, const void *conf)
1110 assert(dev->desc->config_len == len); 1200 assert(dev->desc->config_len == len);
1111} 1201}
1112 1202
1113/* This routine does all the creation and setup of a new device, including 1203/*
1204 * This routine does all the creation and setup of a new device, including
1114 * calling new_dev_desc() to allocate the descriptor and device memory. 1205 * calling new_dev_desc() to allocate the descriptor and device memory.
1115 * 1206 *
1116 * See what I mean about userspace being boring? */ 1207 * See what I mean about userspace being boring?
1208 */
1117static struct device *new_device(const char *name, u16 type) 1209static struct device *new_device(const char *name, u16 type)
1118{ 1210{
1119 struct device *dev = malloc(sizeof(*dev)); 1211 struct device *dev = malloc(sizeof(*dev));
@@ -1126,10 +1218,12 @@ static struct device *new_device(const char *name, u16 type)
1126 dev->num_vq = 0; 1218 dev->num_vq = 0;
1127 dev->running = false; 1219 dev->running = false;
1128 1220
1129 /* Append to device list. Prepending to a single-linked list is 1221 /*
1222 * Append to device list. Prepending to a single-linked list is
1130 * easier, but the user expects the devices to be arranged on the bus 1223 * easier, but the user expects the devices to be arranged on the bus
1131 * in command-line order. The first network device on the command line 1224 * in command-line order. The first network device on the command line
1132 * is eth0, the first block device /dev/vda, etc. */ 1225 * is eth0, the first block device /dev/vda, etc.
1226 */
1133 if (devices.lastdev) 1227 if (devices.lastdev)
1134 devices.lastdev->next = dev; 1228 devices.lastdev->next = dev;
1135 else 1229 else
@@ -1139,8 +1233,10 @@ static struct device *new_device(const char *name, u16 type)
1139 return dev; 1233 return dev;
1140} 1234}
1141 1235
1142/* Our first setup routine is the console. It's a fairly simple device, but 1236/*
1143 * UNIX tty handling makes it uglier than it could be. */ 1237 * Our first setup routine is the console. It's a fairly simple device, but
1238 * UNIX tty handling makes it uglier than it could be.
1239 */
1144static void setup_console(void) 1240static void setup_console(void)
1145{ 1241{
1146 struct device *dev; 1242 struct device *dev;
@@ -1148,8 +1244,10 @@ static void setup_console(void)
1148 /* If we can save the initial standard input settings... */ 1244 /* If we can save the initial standard input settings... */
1149 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { 1245 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
1150 struct termios term = orig_term; 1246 struct termios term = orig_term;
1151 /* Then we turn off echo, line buffering and ^C etc. We want a 1247 /*
1152 * raw input stream to the Guest. */ 1248 * Then we turn off echo, line buffering and ^C etc: We want a
1249 * raw input stream to the Guest.
1250 */
1153 term.c_lflag &= ~(ISIG|ICANON|ECHO); 1251 term.c_lflag &= ~(ISIG|ICANON|ECHO);
1154 tcsetattr(STDIN_FILENO, TCSANOW, &term); 1252 tcsetattr(STDIN_FILENO, TCSANOW, &term);
1155 } 1253 }
@@ -1160,10 +1258,12 @@ static void setup_console(void)
1160 dev->priv = malloc(sizeof(struct console_abort)); 1258 dev->priv = malloc(sizeof(struct console_abort));
1161 ((struct console_abort *)dev->priv)->count = 0; 1259 ((struct console_abort *)dev->priv)->count = 0;
1162 1260
1163 /* The console needs two virtqueues: the input then the output. When 1261 /*
1262 * The console needs two virtqueues: the input then the output. When
1164 * they put something in the input queue, we make sure we're listening to 1263 * they put something in the input queue, we make sure we're listening to
1165 * stdin. When they put something in the output queue, we write it to 1264 * stdin. When they put something in the output queue, we write it to
1166 * stdout. */ 1265 * stdout.
1266 */
1167 add_virtqueue(dev, VIRTQUEUE_NUM, console_input); 1267 add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
1168 add_virtqueue(dev, VIRTQUEUE_NUM, console_output); 1268 add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
1169 1269
@@ -1171,7 +1271,8 @@ static void setup_console(void)
1171} 1271}
1172/*:*/ 1272/*:*/
1173 1273
1174/*M:010 Inter-guest networking is an interesting area. Simplest is to have a 1274/*M:010
1275 * Inter-guest networking is an interesting area. Simplest is to have a
1175 * --sharenet=<name> option which opens or creates a named pipe. This can be 1276 * --sharenet=<name> option which opens or creates a named pipe. This can be
1176 * used to send packets to another guest in a 1:1 manner. 1277 * used to send packets to another guest in a 1:1 manner.
1177 * 1278 *
@@ -1185,7 +1286,8 @@ static void setup_console(void)
1185 * multiple inter-guest channels behind one interface, although it would 1286 * multiple inter-guest channels behind one interface, although it would
1186 * require some manner of hotplugging new virtio channels. 1287 * require some manner of hotplugging new virtio channels.
1187 * 1288 *
1188 * Finally, we could implement a virtio network switch in the kernel. :*/ 1289 * Finally, we could implement a virtio network switch in the kernel.
1290:*/
1189 1291
1190static u32 str2ip(const char *ipaddr) 1292static u32 str2ip(const char *ipaddr)
1191{ 1293{
@@ -1210,11 +1312,13 @@ static void str2mac(const char *macaddr, unsigned char mac[6])
1210 mac[5] = m[5]; 1312 mac[5] = m[5];
1211} 1313}
1212 1314
1213/* This code is "adapted" from libbridge: it attaches the Host end of the 1315/*
1316 * This code is "adapted" from libbridge: it attaches the Host end of the
1214 * network device to the bridge device specified by the command line. 1317 * network device to the bridge device specified by the command line.
1215 * 1318 *
1216 * This is yet another James Morris contribution (I'm an IP-level guy, so I 1319 * This is yet another James Morris contribution (I'm an IP-level guy, so I
1217 * dislike bridging), and I just try not to break it. */ 1320 * dislike bridging), and I just try not to break it.
1321 */
1218static void add_to_bridge(int fd, const char *if_name, const char *br_name) 1322static void add_to_bridge(int fd, const char *if_name, const char *br_name)
1219{ 1323{
1220 int ifidx; 1324 int ifidx;
@@ -1234,9 +1338,11 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name)
1234 err(1, "can't add %s to bridge %s", if_name, br_name); 1338 err(1, "can't add %s to bridge %s", if_name, br_name);
1235} 1339}
1236 1340
1237/* This sets up the Host end of the network device with an IP address, brings 1341/*
1342 * This sets up the Host end of the network device with an IP address, brings
1238 * it up so packets will flow, then copies the MAC address into the hwaddr 1343 * it up so packets will flow, then copies the MAC address into the hwaddr
1239 * pointer. */ 1344 * pointer.
1345 */
1240static void configure_device(int fd, const char *tapif, u32 ipaddr) 1346static void configure_device(int fd, const char *tapif, u32 ipaddr)
1241{ 1347{
1242 struct ifreq ifr; 1348 struct ifreq ifr;
@@ -1263,10 +1369,12 @@ static int get_tun_device(char tapif[IFNAMSIZ])
1263 /* Start with this zeroed. Messy but sure. */ 1369 /* Start with this zeroed. Messy but sure. */
1264 memset(&ifr, 0, sizeof(ifr)); 1370 memset(&ifr, 0, sizeof(ifr));
1265 1371
1266 /* We open the /dev/net/tun device and tell it we want a tap device. A 1372 /*
1373 * We open the /dev/net/tun device and tell it we want a tap device. A
1267 * tap device is like a tun device, only somehow different. To tell 1374 * tap device is like a tun device, only somehow different. To tell
1268 * the truth, I completely blundered my way through this code, but it 1375 * the truth, I completely blundered my way through this code, but it
1269 * works now! */ 1376 * works now!
1377 */
1270 netfd = open_or_die("/dev/net/tun", O_RDWR); 1378 netfd = open_or_die("/dev/net/tun", O_RDWR);
1271 ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; 1379 ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
1272 strcpy(ifr.ifr_name, "tap%d"); 1380 strcpy(ifr.ifr_name, "tap%d");
@@ -1277,18 +1385,22 @@ static int get_tun_device(char tapif[IFNAMSIZ])
1277 TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) 1385 TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
1278 err(1, "Could not set features for tun device"); 1386 err(1, "Could not set features for tun device");
1279 1387
1280 /* We don't need checksums calculated for packets coming in this 1388 /*
1281 * device: trust us! */ 1389 * We don't need checksums calculated for packets coming in this
1390 * device: trust us!
1391 */
1282 ioctl(netfd, TUNSETNOCSUM, 1); 1392 ioctl(netfd, TUNSETNOCSUM, 1);
1283 1393
1284 memcpy(tapif, ifr.ifr_name, IFNAMSIZ); 1394 memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
1285 return netfd; 1395 return netfd;
1286} 1396}
1287 1397
1288/*L:195 Our network is a Host<->Guest network. This can either use bridging or 1398/*L:195
1399 * Our network is a Host<->Guest network. This can either use bridging or
1289 * routing, but the principle is the same: it uses the "tun" device to inject 1400 * routing, but the principle is the same: it uses the "tun" device to inject
1290 * packets into the Host as if they came in from a normal network card. We 1401 * packets into the Host as if they came in from a normal network card. We
1291 * just shunt packets between the Guest and the tun device. */ 1402 * just shunt packets between the Guest and the tun device.
1403 */
1292static void setup_tun_net(char *arg) 1404static void setup_tun_net(char *arg)
1293{ 1405{
1294 struct device *dev; 1406 struct device *dev;
@@ -1305,13 +1417,14 @@ static void setup_tun_net(char *arg)
1305 dev = new_device("net", VIRTIO_ID_NET); 1417 dev = new_device("net", VIRTIO_ID_NET);
1306 dev->priv = net_info; 1418 dev->priv = net_info;
1307 1419
1308 /* Network devices need a receive and a send queue, just like 1420 /* Network devices need a recv and a send queue, just like console. */
1309 * console. */
1310 add_virtqueue(dev, VIRTQUEUE_NUM, net_input); 1421 add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
1311 add_virtqueue(dev, VIRTQUEUE_NUM, net_output); 1422 add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
1312 1423
1313 /* We need a socket to perform the magic network ioctls to bring up the 1424 /*
1314 * tap interface, connect to the bridge etc. Any socket will do! */ 1425 * We need a socket to perform the magic network ioctls to bring up the
1426 * tap interface, connect to the bridge etc. Any socket will do!
1427 */
1315 ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 1428 ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
1316 if (ipfd < 0) 1429 if (ipfd < 0)
1317 err(1, "opening IP socket"); 1430 err(1, "opening IP socket");
@@ -1366,7 +1479,8 @@ static void setup_tun_net(char *arg)
1366 devices.device_num, tapif, arg); 1479 devices.device_num, tapif, arg);
1367} 1480}
1368 1481
1369/* Our block (disk) device should be really simple: the Guest asks for a block 1482/*
1483 * Our block (disk) device should be really simple: the Guest asks for a block
1370 * number and we read or write that position in the file. Unfortunately, that 1484 * number and we read or write that position in the file. Unfortunately, that
1371 * was amazingly slow: the Guest waits until the read is finished before 1485 * was amazingly slow: the Guest waits until the read is finished before
1372 * running anything else, even if it could have been doing useful work. 1486 * running anything else, even if it could have been doing useful work.
@@ -1374,7 +1488,9 @@ static void setup_tun_net(char *arg)
1374 * We could use async I/O, except it's reputed to suck so hard that characters 1488 * We could use async I/O, except it's reputed to suck so hard that characters
1375 * actually go missing from your code when you try to use it. 1489 * actually go missing from your code when you try to use it.
1376 * 1490 *
1377 * So we farm the I/O out to thread, and communicate with it via a pipe. */ 1491 * So this was one reason why lguest now does all virtqueue servicing in
1492 * separate threads: it's more efficient and more like a real device.
1493 */
1378 1494
1379/* This hangs off device->priv. */ 1495/* This hangs off device->priv. */
1380struct vblk_info 1496struct vblk_info
@@ -1412,9 +1528,11 @@ static void blk_request(struct virtqueue *vq)
1412 /* Get the next request. */ 1528 /* Get the next request. */
1413 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 1529 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1414 1530
1415 /* Every block request should contain at least one output buffer 1531 /*
1532 * Every block request should contain at least one output buffer
1416 * (detailing the location on disk and the type of request) and one 1533 * (detailing the location on disk and the type of request) and one
1417 * input buffer (to hold the result). */ 1534 * input buffer (to hold the result).
1535 */
1418 if (out_num == 0 || in_num == 0) 1536 if (out_num == 0 || in_num == 0)
1419 errx(1, "Bad virtblk cmd %u out=%u in=%u", 1537 errx(1, "Bad virtblk cmd %u out=%u in=%u",
1420 head, out_num, in_num); 1538 head, out_num, in_num);
@@ -1423,33 +1541,41 @@ static void blk_request(struct virtqueue *vq)
1423 in = convert(&iov[out_num+in_num-1], u8); 1541 in = convert(&iov[out_num+in_num-1], u8);
1424 off = out->sector * 512; 1542 off = out->sector * 512;
1425 1543
1426 /* The block device implements "barriers", where the Guest indicates 1544 /*
1545 * The block device implements "barriers", where the Guest indicates
1427 * that it wants all previous writes to occur before this write. We 1546 * that it wants all previous writes to occur before this write. We
1428 * don't have a way of asking our kernel to do a barrier, so we just 1547 * don't have a way of asking our kernel to do a barrier, so we just
1429 * synchronize all the data in the file. Pretty poor, no? */ 1548 * synchronize all the data in the file. Pretty poor, no?
1549 */
1430 if (out->type & VIRTIO_BLK_T_BARRIER) 1550 if (out->type & VIRTIO_BLK_T_BARRIER)
1431 fdatasync(vblk->fd); 1551 fdatasync(vblk->fd);
1432 1552
1433 /* In general the virtio block driver is allowed to try SCSI commands. 1553 /*
1434 * It'd be nice if we supported eject, for example, but we don't. */ 1554 * In general the virtio block driver is allowed to try SCSI commands.
1555 * It'd be nice if we supported eject, for example, but we don't.
1556 */
1435 if (out->type & VIRTIO_BLK_T_SCSI_CMD) { 1557 if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
1436 fprintf(stderr, "Scsi commands unsupported\n"); 1558 fprintf(stderr, "Scsi commands unsupported\n");
1437 *in = VIRTIO_BLK_S_UNSUPP; 1559 *in = VIRTIO_BLK_S_UNSUPP;
1438 wlen = sizeof(*in); 1560 wlen = sizeof(*in);
1439 } else if (out->type & VIRTIO_BLK_T_OUT) { 1561 } else if (out->type & VIRTIO_BLK_T_OUT) {
1440 /* Write */ 1562 /*
1441 1563 * Write
1442 /* Move to the right location in the block file. This can fail 1564 *
1443 * if they try to write past end. */ 1565 * Move to the right location in the block file. This can fail
1566 * if they try to write past end.
1567 */
1444 if (lseek64(vblk->fd, off, SEEK_SET) != off) 1568 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1445 err(1, "Bad seek to sector %llu", out->sector); 1569 err(1, "Bad seek to sector %llu", out->sector);
1446 1570
1447 ret = writev(vblk->fd, iov+1, out_num-1); 1571 ret = writev(vblk->fd, iov+1, out_num-1);
1448 verbose("WRITE to sector %llu: %i\n", out->sector, ret); 1572 verbose("WRITE to sector %llu: %i\n", out->sector, ret);
1449 1573
1450 /* Grr... Now we know how long the descriptor they sent was, we 1574 /*
1575 * Grr... Now we know how long the descriptor they sent was, we
1451 * make sure they didn't try to write over the end of the block 1576 * make sure they didn't try to write over the end of the block
1452 * file (possibly extending it). */ 1577 * file (possibly extending it).
1578 */
1453 if (ret > 0 && off + ret > vblk->len) { 1579 if (ret > 0 && off + ret > vblk->len) {
1454 /* Trim it back to the correct length */ 1580 /* Trim it back to the correct length */
1455 ftruncate64(vblk->fd, vblk->len); 1581 ftruncate64(vblk->fd, vblk->len);
@@ -1459,10 +1585,12 @@ static void blk_request(struct virtqueue *vq)
1459 wlen = sizeof(*in); 1585 wlen = sizeof(*in);
1460 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); 1586 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
1461 } else { 1587 } else {
1462 /* Read */ 1588 /*
1463 1589 * Read
1464 /* Move to the right location in the block file. This can fail 1590 *
1465 * if they try to read past end. */ 1591 * Move to the right location in the block file. This can fail
1592 * if they try to read past end.
1593 */
1466 if (lseek64(vblk->fd, off, SEEK_SET) != off) 1594 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1467 err(1, "Bad seek to sector %llu", out->sector); 1595 err(1, "Bad seek to sector %llu", out->sector);
1468 1596
@@ -1477,10 +1605,12 @@ static void blk_request(struct virtqueue *vq)
1477 } 1605 }
1478 } 1606 }
1479 1607
1480 /* OK, so we noted that it was pretty poor to use an fdatasync as a 1608 /*
1609 * OK, so we noted that it was pretty poor to use an fdatasync as a
1481 * barrier. But Christoph Hellwig points out that we need a sync 1610 * barrier. But Christoph Hellwig points out that we need a sync
1482 * *afterwards* as well: "Barriers specify no reordering to the front 1611 * *afterwards* as well: "Barriers specify no reordering to the front
1483 * or the back." And Jens Axboe confirmed it, so here we are: */ 1612 * or the back." And Jens Axboe confirmed it, so here we are:
1613 */
1484 if (out->type & VIRTIO_BLK_T_BARRIER) 1614 if (out->type & VIRTIO_BLK_T_BARRIER)
1485 fdatasync(vblk->fd); 1615 fdatasync(vblk->fd);
1486 1616
@@ -1494,7 +1624,7 @@ static void setup_block_file(const char *filename)
1494 struct vblk_info *vblk; 1624 struct vblk_info *vblk;
1495 struct virtio_blk_config conf; 1625 struct virtio_blk_config conf;
1496 1626
1497 /* The device responds to return from I/O thread. */ 1627 /* Create the device. */
1498 dev = new_device("block", VIRTIO_ID_BLOCK); 1628 dev = new_device("block", VIRTIO_ID_BLOCK);
1499 1629
1500 /* The device has one virtqueue, where the Guest places requests. */ 1630 /* The device has one virtqueue, where the Guest places requests. */
@@ -1513,8 +1643,10 @@ static void setup_block_file(const char *filename)
1513 /* Tell Guest how many sectors this device has. */ 1643 /* Tell Guest how many sectors this device has. */
1514 conf.capacity = cpu_to_le64(vblk->len / 512); 1644 conf.capacity = cpu_to_le64(vblk->len / 512);
1515 1645
1516 /* Tell Guest not to put in too many descriptors at once: two are used 1646 /*
1517 * for the in and out elements. */ 1647 * Tell Guest not to put in too many descriptors at once: two are used
1648 * for the in and out elements.
1649 */
1518 add_feature(dev, VIRTIO_BLK_F_SEG_MAX); 1650 add_feature(dev, VIRTIO_BLK_F_SEG_MAX);
1519 conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); 1651 conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
1520 1652
@@ -1525,16 +1657,18 @@ static void setup_block_file(const char *filename)
1525 ++devices.device_num, le64_to_cpu(conf.capacity)); 1657 ++devices.device_num, le64_to_cpu(conf.capacity));
1526} 1658}
1527 1659
1528struct rng_info { 1660/*L:211
1529 int rfd; 1661 * Our random number generator device reads from /dev/random into the Guest's
1530};
1531
1532/* Our random number generator device reads from /dev/random into the Guest's
1533 * input buffers. The usual case is that the Guest doesn't want random numbers 1662 * input buffers. The usual case is that the Guest doesn't want random numbers
1534 * and so has no buffers although /dev/random is still readable, whereas 1663 * and so has no buffers although /dev/random is still readable, whereas
1535 * console is the reverse. 1664 * console is the reverse.
1536 * 1665 *
1537 * The same logic applies, however. */ 1666 * The same logic applies, however.
1667 */
1668struct rng_info {
1669 int rfd;
1670};
1671
1538static void rng_input(struct virtqueue *vq) 1672static void rng_input(struct virtqueue *vq)
1539{ 1673{
1540 int len; 1674 int len;
@@ -1547,9 +1681,11 @@ static void rng_input(struct virtqueue *vq)
1547 if (out_num) 1681 if (out_num)
1548 errx(1, "Output buffers in rng?"); 1682 errx(1, "Output buffers in rng?");
1549 1683
1550 /* This is why we convert to iovecs: the readv() call uses them, and so 1684 /*
1685 * This is why we convert to iovecs: the readv() call uses them, and so
1551 * it reads straight into the Guest's buffer. We loop to make sure we 1686 * it reads straight into the Guest's buffer. We loop to make sure we
1552 * fill it. */ 1687 * fill it.
1688 */
1553 while (!iov_empty(iov, in_num)) { 1689 while (!iov_empty(iov, in_num)) {
1554 len = readv(rng_info->rfd, iov, in_num); 1690 len = readv(rng_info->rfd, iov, in_num);
1555 if (len <= 0) 1691 if (len <= 0)
@@ -1562,15 +1698,18 @@ static void rng_input(struct virtqueue *vq)
1562 add_used(vq, head, totlen); 1698 add_used(vq, head, totlen);
1563} 1699}
1564 1700
1565/* And this creates a "hardware" random number device for the Guest. */ 1701/*L:199
1702 * This creates a "hardware" random number device for the Guest.
1703 */
1566static void setup_rng(void) 1704static void setup_rng(void)
1567{ 1705{
1568 struct device *dev; 1706 struct device *dev;
1569 struct rng_info *rng_info = malloc(sizeof(*rng_info)); 1707 struct rng_info *rng_info = malloc(sizeof(*rng_info));
1570 1708
1709 /* Our device's private info simply contains the /dev/random fd. */
1571 rng_info->rfd = open_or_die("/dev/random", O_RDONLY); 1710 rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
1572 1711
1573 /* The device responds to return from I/O thread. */ 1712 /* Create the new device. */
1574 dev = new_device("rng", VIRTIO_ID_RNG); 1713 dev = new_device("rng", VIRTIO_ID_RNG);
1575 dev->priv = rng_info; 1714 dev->priv = rng_info;
1576 1715
@@ -1586,8 +1725,10 @@ static void __attribute__((noreturn)) restart_guest(void)
1586{ 1725{
1587 unsigned int i; 1726 unsigned int i;
1588 1727
1589 /* Since we don't track all open fds, we simply close everything beyond 1728 /*
1590 * stderr. */ 1729 * Since we don't track all open fds, we simply close everything beyond
1730 * stderr.
1731 */
1591 for (i = 3; i < FD_SETSIZE; i++) 1732 for (i = 3; i < FD_SETSIZE; i++)
1592 close(i); 1733 close(i);
1593 1734
@@ -1598,8 +1739,10 @@ static void __attribute__((noreturn)) restart_guest(void)
1598 err(1, "Could not exec %s", main_args[0]); 1739 err(1, "Could not exec %s", main_args[0]);
1599} 1740}
1600 1741
1601/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves 1742/*L:220
1602 * its input and output, and finally, lays it to rest. */ 1743 * Finally we reach the core of the Launcher which runs the Guest, serves
1744 * its input and output, and finally, lays it to rest.
1745 */
1603static void __attribute__((noreturn)) run_guest(void) 1746static void __attribute__((noreturn)) run_guest(void)
1604{ 1747{
1605 for (;;) { 1748 for (;;) {
@@ -1634,7 +1777,7 @@ static void __attribute__((noreturn)) run_guest(void)
1634 * 1777 *
1635 * Are you ready? Take a deep breath and join me in the core of the Host, in 1778 * Are you ready? Take a deep breath and join me in the core of the Host, in
1636 * "make Host". 1779 * "make Host".
1637 :*/ 1780:*/
1638 1781
1639static struct option opts[] = { 1782static struct option opts[] = {
1640 { "verbose", 0, NULL, 'v' }, 1783 { "verbose", 0, NULL, 'v' },
@@ -1655,8 +1798,7 @@ static void usage(void)
1655/*L:105 The main routine is where the real work begins: */ 1798/*L:105 The main routine is where the real work begins: */
1656int main(int argc, char *argv[]) 1799int main(int argc, char *argv[])
1657{ 1800{
1658 /* Memory, top-level pagetable, code startpoint and size of the 1801 /* Memory, code startpoint and size of the (optional) initrd. */
1659 * (optional) initrd. */
1660 unsigned long mem = 0, start, initrd_size = 0; 1802 unsigned long mem = 0, start, initrd_size = 0;
1661 /* Two temporaries. */ 1803 /* Two temporaries. */
1662 int i, c; 1804 int i, c;
@@ -1668,24 +1810,30 @@ int main(int argc, char *argv[])
1668 /* Save the args: we "reboot" by execing ourselves again. */ 1810 /* Save the args: we "reboot" by execing ourselves again. */
1669 main_args = argv; 1811 main_args = argv;
1670 1812
1671 /* First we initialize the device list. We keep a pointer to the last 1813 /*
1814 * First we initialize the device list. We keep a pointer to the last
1672 * device, and the next interrupt number to use for devices (1: 1815 * device, and the next interrupt number to use for devices (1:
1673 * remember that 0 is used by the timer). */ 1816 * remember that 0 is used by the timer).
1817 */
1674 devices.lastdev = NULL; 1818 devices.lastdev = NULL;
1675 devices.next_irq = 1; 1819 devices.next_irq = 1;
1676 1820
1677 cpu_id = 0; 1821 cpu_id = 0;
1678 /* We need to know how much memory so we can set up the device 1822 /*
1823 * We need to know how much memory so we can set up the device
1679 * descriptor and memory pages for the devices as we parse the command 1824 * descriptor and memory pages for the devices as we parse the command
1680 * line. So we quickly look through the arguments to find the amount 1825 * line. So we quickly look through the arguments to find the amount
1681 * of memory now. */ 1826 * of memory now.
1827 */
1682 for (i = 1; i < argc; i++) { 1828 for (i = 1; i < argc; i++) {
1683 if (argv[i][0] != '-') { 1829 if (argv[i][0] != '-') {
1684 mem = atoi(argv[i]) * 1024 * 1024; 1830 mem = atoi(argv[i]) * 1024 * 1024;
1685 /* We start by mapping anonymous pages over all of 1831 /*
1832 * We start by mapping anonymous pages over all of
1686 * guest-physical memory range. This fills it with 0, 1833 * guest-physical memory range. This fills it with 0,
1687 * and ensures that the Guest won't be killed when it 1834 * and ensures that the Guest won't be killed when it
1688 * tries to access it. */ 1835 * tries to access it.
1836 */
1689 guest_base = map_zeroed_pages(mem / getpagesize() 1837 guest_base = map_zeroed_pages(mem / getpagesize()
1690 + DEVICE_PAGES); 1838 + DEVICE_PAGES);
1691 guest_limit = mem; 1839 guest_limit = mem;
@@ -1718,8 +1866,10 @@ int main(int argc, char *argv[])
1718 usage(); 1866 usage();
1719 } 1867 }
1720 } 1868 }
1721 /* After the other arguments we expect memory and kernel image name, 1869 /*
1722 * followed by command line arguments for the kernel. */ 1870 * After the other arguments we expect memory and kernel image name,
1871 * followed by command line arguments for the kernel.
1872 */
1723 if (optind + 2 > argc) 1873 if (optind + 2 > argc)
1724 usage(); 1874 usage();
1725 1875
@@ -1737,20 +1887,26 @@ int main(int argc, char *argv[])
1737 /* Map the initrd image if requested (at top of physical memory) */ 1887 /* Map the initrd image if requested (at top of physical memory) */
1738 if (initrd_name) { 1888 if (initrd_name) {
1739 initrd_size = load_initrd(initrd_name, mem); 1889 initrd_size = load_initrd(initrd_name, mem);
1740 /* These are the locations in the Linux boot header where the 1890 /*
1741 * start and size of the initrd are expected to be found. */ 1891 * These are the locations in the Linux boot header where the
1892 * start and size of the initrd are expected to be found.
1893 */
1742 boot->hdr.ramdisk_image = mem - initrd_size; 1894 boot->hdr.ramdisk_image = mem - initrd_size;
1743 boot->hdr.ramdisk_size = initrd_size; 1895 boot->hdr.ramdisk_size = initrd_size;
1744 /* The bootloader type 0xFF means "unknown"; that's OK. */ 1896 /* The bootloader type 0xFF means "unknown"; that's OK. */
1745 boot->hdr.type_of_loader = 0xFF; 1897 boot->hdr.type_of_loader = 0xFF;
1746 } 1898 }
1747 1899
1748 /* The Linux boot header contains an "E820" memory map: ours is a 1900 /*
1749 * simple, single region. */ 1901 * The Linux boot header contains an "E820" memory map: ours is a
1902 * simple, single region.
1903 */
1750 boot->e820_entries = 1; 1904 boot->e820_entries = 1;
1751 boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); 1905 boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
1752 /* The boot header contains a command line pointer: we put the command 1906 /*
1753 * line after the boot header. */ 1907 * The boot header contains a command line pointer: we put the command
1908 * line after the boot header.
1909 */
1754 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); 1910 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
1755 /* We use a simple helper to copy the arguments separated by spaces. */ 1911 /* We use a simple helper to copy the arguments separated by spaces. */
1756 concat((char *)(boot + 1), argv+optind+2); 1912 concat((char *)(boot + 1), argv+optind+2);
@@ -1764,8 +1920,10 @@ int main(int argc, char *argv[])
1764 /* Tell the entry path not to try to reload segment registers. */ 1920 /* Tell the entry path not to try to reload segment registers. */
1765 boot->hdr.loadflags |= KEEP_SEGMENTS; 1921 boot->hdr.loadflags |= KEEP_SEGMENTS;
1766 1922
1767 /* We tell the kernel to initialize the Guest: this returns the open 1923 /*
1768 * /dev/lguest file descriptor. */ 1924 * We tell the kernel to initialize the Guest: this returns the open
1925 * /dev/lguest file descriptor.
1926 */
1769 tell_kernel(start); 1927 tell_kernel(start);
1770 1928
1771 /* Ensure that we terminate if a child dies. */ 1929 /* Ensure that we terminate if a child dies. */