aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/virtual/lguest
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/virtual/lguest')
-rw-r--r--Documentation/virtual/lguest/.gitignore1
-rw-r--r--Documentation/virtual/lguest/Makefile8
-rw-r--r--Documentation/virtual/lguest/extract58
-rw-r--r--Documentation/virtual/lguest/lguest.c2077
-rw-r--r--Documentation/virtual/lguest/lguest.txt129
5 files changed, 2273 insertions, 0 deletions
diff --git a/Documentation/virtual/lguest/.gitignore b/Documentation/virtual/lguest/.gitignore
new file mode 100644
index 000000000000..115587fd5f65
--- /dev/null
+++ b/Documentation/virtual/lguest/.gitignore
@@ -0,0 +1 @@
lguest
diff --git a/Documentation/virtual/lguest/Makefile b/Documentation/virtual/lguest/Makefile
new file mode 100644
index 000000000000..0ac34206f7a7
--- /dev/null
+++ b/Documentation/virtual/lguest/Makefile
@@ -0,0 +1,8 @@
# This creates the demonstration utility "lguest" which runs a Linux guest.
# Missing headers?  Add "-I../../../include -I../../../arch/x86/include"
CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE

# "all" and "clean" name actions, not files: keep them working even if a file
# with one of those names ever appears in this directory.
.PHONY: all clean

all: lguest

clean:
	rm -f lguest
diff --git a/Documentation/virtual/lguest/extract b/Documentation/virtual/lguest/extract
new file mode 100644
index 000000000000..7730bb6e4b94
--- /dev/null
+++ b/Documentation/virtual/lguest/extract
@@ -0,0 +1,58 @@
#! /bin/sh
# Pull the tagged documentation comments ("<PREFIX>:<NUM>") out of the given
# source files and print them grouped by tag number, each group of files
# introduced by a "[ file ]" heading.
#
# Usage: extract PREFIX FILE...

set -e

PREFIX=$1
shift

# Create the scratch directory before installing the exit trap, so the trap
# never runs "rm -r" on an empty path when mktemp fails.
TMPDIR=`mktemp -d`
trap 'rm -r "$TMPDIR"' 0

# fd 3 is the current output sink; it points at /dev/null while we are
# outside a tagged comment.
exec 3>/dev/null
for f; do
	while IFS="
" read -r LINE; do
		case "$LINE" in
		    # Tag and comment terminator on the same line.
		    *$PREFIX:[0-9]*:\**)
			NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
			if [ -f "$TMPDIR/$NUM" ]; then
			    echo "$TMPDIR/$NUM already exists prior to $f"
			    exit 1
			fi
			exec 3>>"$TMPDIR/$NUM"
			# Remember which source file this tag came from.
			echo "$f" | sed 's,\.\./,,g' > "$TMPDIR/.$NUM"
			/bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3
			;;
		    # Tag that opens a multi-line comment.
		    *$PREFIX:[0-9]*)
			NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
			if [ -f "$TMPDIR/$NUM" ]; then
			    echo "$TMPDIR/$NUM already exists prior to $f"
			    exit 1
			fi
			exec 3>>"$TMPDIR/$NUM"
			echo "$f" | sed 's,\.\./,,g' > "$TMPDIR/.$NUM"
			/bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3
			;;
		    # Terminator of a tagged comment: emit it, then stop capturing.
		    *:\**)
			/bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3
			echo >&3
			exec 3>/dev/null
			;;
		    *)
			/bin/echo "$LINE" >&3
			;;
		esac
	done < "$f"
	echo >&3
	exec 3>/dev/null
done

# Print the collected pieces in glob order of their tag numbers, announcing
# the source file whenever it changes.
LASTFILE=""
for f in "$TMPDIR"/*; do
	if [ "$LASTFILE" != "$(cat "$TMPDIR/.$(basename "$f")")" ]; then
		LASTFILE=$(cat "$TMPDIR/.$(basename "$f")")
		echo "[ $LASTFILE ]"
	fi
	cat "$f"
done
diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c
new file mode 100644
index 000000000000..cd9d6af61d07
--- /dev/null
+++ b/Documentation/virtual/lguest/lguest.c
@@ -0,0 +1,2077 @@
1/*P:100
2 * This is the Launcher code, a simple program which lays out the "physical"
3 * memory for the new Guest by mapping the kernel image and the virtual
4 * devices, then opens /dev/lguest to tell the kernel about the Guest and
5 * control it.
6:*/
7#define _LARGEFILE64_SOURCE
8#define _GNU_SOURCE
9#include <stdio.h>
10#include <string.h>
11#include <unistd.h>
12#include <err.h>
13#include <stdint.h>
14#include <stdlib.h>
15#include <elf.h>
16#include <sys/mman.h>
17#include <sys/param.h>
18#include <sys/types.h>
19#include <sys/stat.h>
20#include <sys/wait.h>
21#include <sys/eventfd.h>
22#include <fcntl.h>
23#include <stdbool.h>
24#include <errno.h>
25#include <ctype.h>
26#include <sys/socket.h>
27#include <sys/ioctl.h>
28#include <sys/time.h>
29#include <time.h>
30#include <netinet/in.h>
31#include <net/if.h>
32#include <linux/sockios.h>
33#include <linux/if_tun.h>
34#include <sys/uio.h>
35#include <termios.h>
36#include <getopt.h>
37#include <assert.h>
38#include <sched.h>
39#include <limits.h>
40#include <stddef.h>
41#include <signal.h>
42#include <pwd.h>
43#include <grp.h>
44
45#include <linux/virtio_config.h>
46#include <linux/virtio_net.h>
47#include <linux/virtio_blk.h>
48#include <linux/virtio_console.h>
49#include <linux/virtio_rng.h>
50#include <linux/virtio_ring.h>
51#include <asm/bootparam.h>
52#include "../../../include/linux/lguest_launcher.h"
53/*L:110
54 * We can ignore the 42 include files we need for this program, but I do want
55 * to draw attention to the use of kernel-style types.
56 *
57 * As Linus said, "C is a Spartan language, and so should your naming be." I
58 * like these abbreviations, so we define them here. Note that u64 is always
59 * unsigned long long, which works on all Linux systems: this means that we can
60 * use %llu in printf for any u64.
61 */
62typedef unsigned long long u64;
63typedef uint32_t u32;
64typedef uint16_t u16;
65typedef uint8_t u8;
66/*:*/
67
68#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
69#define BRIDGE_PFX "bridge:"
70#ifndef SIOCBRADDIF
71#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
72#endif
73/* We can have up to 256 pages for devices. */
74#define DEVICE_PAGES 256
75/* This will occupy 3 pages: it must be a power of 2. */
76#define VIRTQUEUE_NUM 256
77
78/*L:120
79 * verbose is both a global flag and a macro. The C preprocessor allows
80 * this, and although I wouldn't recommend it, it works quite nicely here.
81 */
82static bool verbose;
83#define verbose(args...) \
84 do { if (verbose) printf(args); } while(0)
85/*:*/
86
87/* The pointer to the start of guest memory. */
88static void *guest_base;
89/* The maximum guest physical address allowed, and maximum possible. */
90static unsigned long guest_limit, guest_max;
91/* The /dev/lguest file descriptor. */
92static int lguest_fd;
93
94/* a per-cpu variable indicating whose vcpu is currently running */
95static unsigned int __thread cpu_id;
96
97/* This is our list of devices. */
98struct device_list {
99 /* Counter to assign interrupt numbers. */
100 unsigned int next_irq;
101
102 /* Counter to print out convenient device numbers. */
103 unsigned int device_num;
104
105 /* The descriptor page for the devices. */
106 u8 *descpage;
107
108 /* A single linked list of devices. */
109 struct device *dev;
110 /* And a pointer to the last device for easy append. */
111 struct device *lastdev;
112};
113
114/* The list of Guest devices, based on command line arguments. */
115static struct device_list devices;
116
/* One Guest device. */
struct device {
	/* Next device in the device list. */
	struct device *next;

	/* This device's descriptor, as mapped into the Guest. */
	struct lguest_device_desc *desc;

	/*
	 * Private copies of the descriptor values: desc itself cannot be
	 * trusted once the Guest has booted.
	 */
	unsigned int feature_len;
	unsigned int num_vq;

	/* Device name, printed by --verbose. */
	const char *name;

	/* Queues attached to this device. */
	struct virtqueue *vq;

	/* Whether the device is operational. */
	bool running;

	/* Device-specific data. */
	void *priv;
};
141
142/* The virtqueue structure describes a queue attached to a device. */
143struct virtqueue {
144 struct virtqueue *next;
145
146 /* Which device owns me. */
147 struct device *dev;
148
149 /* The configuration for this queue. */
150 struct lguest_vqconfig config;
151
152 /* The actual ring of buffers. */
153 struct vring vring;
154
155 /* Last available index we saw. */
156 u16 last_avail_idx;
157
158 /* How many are used since we sent last irq? */
159 unsigned int pending_used;
160
161 /* Eventfd where Guest notifications arrive. */
162 int eventfd;
163
164 /* Function for the thread which is servicing this virtqueue. */
165 void (*service)(struct virtqueue *vq);
166 pid_t thread;
167};
168
169/* Remember the arguments to the program so we can "reboot" */
170static char **main_args;
171
172/* The original tty settings to restore on exit. */
173static struct termios orig_term;
174
175/*
176 * We have to be careful with barriers: our devices are all run in separate
177 * threads and so we need to make sure that changes visible to the Guest happen
178 * in precise order.
179 */
180#define wmb() __asm__ __volatile__("" : : : "memory")
181#define mb() __asm__ __volatile__("" : : : "memory")
182
183/*
184 * Convert an iovec element to the given type.
185 *
186 * This is a fairly ugly trick: we need to know the size of the type and
187 * alignment requirement to check the pointer is kosher. It's also nice to
188 * have the name of the type in case we report failure.
189 *
190 * Typing those three things all the time is cumbersome and error prone, so we
191 * have a macro which sets them all up and passes to the real function.
192 */
#define convert(iov, type) \
	((type *)_convert((iov), sizeof(type), __alignof__(type), #type))

/*
 * Backend of convert(): returns iov->iov_base after checking that the element
 * is exactly "size" bytes long and aligned to "align".  Exits with a message
 * mentioning "name" otherwise.
 */
static void *_convert(struct iovec *iov, size_t size, size_t align,
		      const char *name)
{
	void *base = iov->iov_base;
	size_t len = iov->iov_len;

	if (len != size)
		errx(1, "Bad iovec size %zu for %s", len, name);

	if ((unsigned long)base % align)
		errx(1, "Bad alignment %p for %s", base, name);

	return base;
}
205
206/* Wrapper for the last available index. Makes it easier to change. */
207#define lg_last_avail(vq) ((vq)->last_avail_idx)
208
209/*
210 * The virtio configuration space is defined to be little-endian. x86 is
211 * little-endian too, but it's nice to be explicit so we have these helpers.
212 */
213#define cpu_to_le16(v16) (v16)
214#define cpu_to_le32(v32) (v32)
215#define cpu_to_le64(v64) (v64)
216#define le16_to_cpu(v16) (v16)
217#define le32_to_cpu(v32) (v32)
218#define le64_to_cpu(v64) (v64)
219
/* True when none of the first num_iov elements has any bytes left. */
static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
{
	const struct iovec *cur = iov;
	const struct iovec *end = iov + num_iov;

	while (cur != end) {
		if (cur->iov_len != 0)
			return false;
		cur++;
	}
	return true;
}
230
/*
 * Strip len bytes off the front of the iovec, advancing each element's base
 * and shrinking its length.  Asserts that the iovec held at least len bytes.
 */
static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len)
{
	struct iovec *cur;

	for (cur = iov; cur != iov + num_iov; cur++) {
		unsigned int take = len;

		/* Never take more than this element still holds. */
		if (cur->iov_len < take)
			take = cur->iov_len;

		cur->iov_base = (char *)cur->iov_base + take;
		cur->iov_len -= take;
		len -= take;
	}
	assert(len == 0);
}
246
247/* The device virtqueue descriptors are followed by feature bitmasks. */
248static u8 *get_feature_bits(struct device *dev)
249{
250 return (u8 *)(dev->desc + 1)
251 + dev->num_vq * sizeof(struct lguest_vqconfig);
252}
253
254/*L:100
255 * The Launcher code itself takes us out into userspace, that scary place where
256 * pointers run wild and free! Unfortunately, like most userspace programs,
257 * it's quite boring (which is why everyone likes to hack on the kernel!).
258 * Perhaps if you make up an Lguest Drinking Game at this point, it will get
259 * you through this section. Or, maybe not.
260 *
261 * The Launcher sets up a big chunk of memory to be the Guest's "physical"
262 * memory and stores it in "guest_base". In other words, Guest physical ==
263 * Launcher virtual with an offset.
264 *
265 * This can be tough to get your head around, but usually it just means that we
266 * use these trivial conversion functions when the Guest gives us its
267 * "physical" addresses:
268 */
269static void *from_guest_phys(unsigned long addr)
270{
271 return guest_base + addr;
272}
273
274static unsigned long to_guest_phys(const void *addr)
275{
276 return (addr - guest_base);
277}
278
279/*L:130
280 * Loading the Kernel.
281 *
282 * We start with a couple of simple helper routines. open_or_die() avoids
283 * error-checking code cluttering the callers:
284 */
static int open_or_die(const char *name, int flags)
{
	/* Returns the open descriptor; exits via err() if open() fails. */
	int fd;

	fd = open(name, flags);
	if (fd < 0)
		err(1, "Failed to open %s", name);

	return fd;
}
292
/*
 * map_zeroed_pages() maps num zero-filled, read-write pages and returns a
 * pointer to the first of them.
 */
static void *map_zeroed_pages(unsigned int num)
{
	int zero_fd = open_or_die("/dev/zero", O_RDONLY);
	char *region;

	/*
	 * The mapping is private (a written page gets copied) and two pages
	 * larger than asked for: the extra page on each side stays PROT_NONE
	 * as a guard against accesses that run off the allocated space.
	 */
	region = mmap(NULL, getpagesize() * (num+2),
		      PROT_NONE, MAP_PRIVATE, zero_fd, 0);
	if (region == MAP_FAILED)
		err(1, "Mmapping %u pages of /dev/zero", num);

	/* Open up everything between the two guard pages. */
	if (mprotect(region + getpagesize(), getpagesize() * num,
		     PROT_READ|PROT_WRITE) == -1)
		err(1, "mprotect rw %u pages failed", num);

	/* The mapping survives closing the descriptor it came from. */
	close(zero_fd);

	/* Skip the leading guard page. */
	return region + getpagesize();
}
323
324/* Get some more pages for a device. */
325static void *get_pages(unsigned int num)
326{
327 void *addr = from_guest_phys(guest_limit);
328
329 guest_limit += num * getpagesize();
330 if (guest_limit > guest_max)
331 errx(1, "Not enough memory for devices");
332 return addr;
333}
334
/*
 * Used to load the kernel or initrd: put len bytes from file offset "offset"
 * at addr.  mmap is tried first; if it fails (Plan 9's kernel file isn't
 * nicely aligned on page boundaries), the bytes are read in instead.
 */
static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
{
	void *mapped;
	ssize_t r;

	/*
	 * We map writable even though some segments are marked read-only.
	 * The kernel really wants to be writable: it patches its own
	 * instructions.
	 *
	 * MAP_PRIVATE means that the page won't be copied until a write is
	 * done to it.  This allows us to share untouched memory between
	 * Guests.
	 */
	mapped = mmap(addr, len, PROT_READ|PROT_WRITE,
		      MAP_FIXED|MAP_PRIVATE, fd, offset);
	if (mapped == MAP_FAILED) {
		/* pread seeks and reads in one call. */
		r = pread(fd, addr, len, offset);
		if (r != len)
			err(1, "Reading offset %lu len %lu gave %zi",
			    offset, len, r);
	}
}
362
/*
 * This routine takes an open vmlinux image, which is in ELF, and maps it into
 * the Guest memory.  ELF = Executable and Linkable Format, which is the format
 * used by all modern binaries on Linux including the kernel.
 *
 * The ELF headers give *two* addresses: a physical address, and a virtual
 * address.  We use the physical address; the Guest will map itself to the
 * virtual address.
 *
 * elf_fd is the open image and ehdr its already-read ELF header.  We return
 * the entry point given in that header, and exit on a malformed header or a
 * failed read.
 */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
{
	unsigned int i;

	/*
	 * Sanity checks on the main ELF header: an x86 executable with a
	 * reasonable number of correctly-sized program headers.
	 */
	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	/*
	 * Only size the on-stack header array once e_phnum is known to be
	 * sane: a zero-length variable-length array is undefined, and an
	 * unchecked count could ask for far more stack than we want.
	 */
	Elf32_Phdr phdr[ehdr->e_phnum];

	/*
	 * An ELF executable contains an ELF header and a number of "program"
	 * headers which indicate which parts ("segments") of the program to
	 * load where.
	 */

	/* We read in all the program headers at once: */
	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
		err(1, "Reading program headers");

	/*
	 * Try all the headers: there are usually only three.  A read-only one,
	 * a read-write one, and a "note" section which we don't load.
	 */
	for (i = 0; i < ehdr->e_phnum; i++) {
		/* If this isn't a loadable segment, we ignore it */
		if (phdr[i].p_type != PT_LOAD)
			continue;

		verbose("Section %i: size %i addr %p\n",
			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);

		/* We map this section of the file at its physical address. */
		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
		       phdr[i].p_offset, phdr[i].p_filesz);
	}

	/* The entry point is given in the ELF header. */
	return ehdr->e_entry;
}
421
422/*L:150
423 * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed
424 * to jump into it and it will unpack itself. We used to have to perform some
425 * hairy magic because the unpacking code scared me.
426 *
427 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
428 * a small patch to jump over the tricky bits in the Guest, so now we just read
429 * the funky header so we know where in the file to load, and away we go!
430 */
431static unsigned long load_bzimage(int fd)
432{
433 struct boot_params boot;
434 int r;
435 /* Modern bzImages get loaded at 1M. */
436 void *p = from_guest_phys(0x100000);
437
438 /*
439 * Go back to the start of the file and read the header. It should be
440 * a Linux boot header (see Documentation/x86/i386/boot.txt)
441 */
442 lseek(fd, 0, SEEK_SET);
443 read(fd, &boot, sizeof(boot));
444
445 /* Inside the setup_hdr, we expect the magic "HdrS" */
446 if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
447 errx(1, "This doesn't look like a bzImage to me");
448
449 /* Skip over the extra sectors of the header. */
450 lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
451
452 /* Now read everything into memory. in nice big chunks. */
453 while ((r = read(fd, p, 65536)) > 0)
454 p += r;
455
456 /* Finally, code32_start tells us where to enter the kernel. */
457 return boot.hdr.code32_start;
458}
459
460/*L:140
461 * Loading the kernel is easy when it's a "vmlinux", but most kernels
462 * come wrapped up in the self-decompressing "bzImage" format. With a little
463 * work, we can load those, too.
464 */
static unsigned long load_kernel(int fd)
{
	/*
	 * Peeks at the start of the image on fd and hands it to the matching
	 * loader; returns whatever entry point that loader reports.
	 */
	Elf32_Ehdr hdr;
	ssize_t got;
	bool is_elf;

	/* An ELF header's worth of bytes is enough to tell the formats apart. */
	got = read(fd, &hdr, sizeof(hdr));
	if (got != sizeof(hdr))
		err(1, "Reading kernel");

	/* ELF files start with "\177ELF"; anything else is taken as a bzImage. */
	is_elf = (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0);

	return is_elf ? map_elf(fd, &hdr) : load_bzimage(fd);
}
480
481/*
482 * This is a trivial little helper to align pages. Andi Kleen hated it because
483 * it calls getpagesize() twice: "it's dumb code."
484 *
485 * Kernel guys get really het up about optimization, even when it's not
486 * necessary. I leave this code as a reaction against that.
487 */
static inline unsigned long page_align(unsigned long addr)
{
	/* Push addr up to just short of the next page... */
	unsigned long bumped = addr + getpagesize()-1;

	/* ...then clear the offset-within-page bits. */
	return bumped & ~(getpagesize()-1);
}
493
494/*L:180
495 * An "initial ram disk" is a disk image loaded into memory along with the
496 * kernel which the kernel can use to boot from without needing any drivers.
497 * Most distributions now use this as standard: the initrd contains the code to
498 * load the appropriate driver modules for the current machine.
499 *
500 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
501 * kernels. He sent me this (and tells me when I break it).
502 */
static unsigned long load_initrd(const char *name, unsigned long mem)
{
	/*
	 * Places the file "name" just below Guest physical address mem and
	 * returns its size rounded up to whole pages.
	 */
	struct stat st;
	unsigned long len, start;
	int ifd = open_or_die(name, O_RDONLY);

	/* fstat() tells us how big the file is. */
	if (fstat(ifd, &st) < 0)
		err(1, "fstat() on initrd '%s'", name);

	/*
	 * The initrd goes at the top of memory; mmap wants page alignment, so
	 * the size is rounded up before computing where it starts.
	 */
	len = page_align(st.st_size);
	start = mem - len;
	map_at(ifd, from_guest_phys(start), 0, st.st_size);

	/* The descriptor is no longer needed once the file is in place. */
	close(ifd);
	verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void *)start);

	/* We return the initrd size. */
	return len;
}
530/*:*/
531
/*
 * Join the NULL-terminated array args into dst, separated by single spaces.
 * dst always ends up NUL-terminated, even when args is empty; the caller
 * must supply a buffer large enough for the result.
 */
static void concat(char *dst, char *args[])
{
	char *out = dst;
	char **arg;

	for (arg = args; *arg; arg++) {
		size_t n = strlen(*arg);

		/* A separator goes before every argument but the first. */
		if (arg != args)
			*out++ = ' ';
		memcpy(out, *arg, n);
		out += n;
	}
	/* Terminate here: covers the empty case too. */
	*out = '\0';
}
551
552/*L:185
553 * This is where we actually tell the kernel to initialize the Guest. We
554 * saw the arguments it expects when we looked at initialize() in lguest_user.c:
555 * the base of Guest "physical" memory, the top physical page to allow and the
556 * entry point for the Guest.
557 */
558static void tell_kernel(unsigned long start)
559{
560 unsigned long args[] = { LHREQ_INITIALIZE,
561 (unsigned long)guest_base,
562 guest_limit / getpagesize(), start };
563 verbose("Guest: %p - %p (%#lx)\n",
564 guest_base, guest_base + guest_limit, guest_limit);
565 lguest_fd = open_or_die("/dev/lguest", O_RDWR);
566 if (write(lguest_fd, args, sizeof(args)) < 0)
567 err(1, "Writing to /dev/lguest");
568}
569/*:*/
570
571/*L:200
572 * Device Handling.
573 *
574 * When the Guest gives us a buffer, it sends an array of addresses and sizes.
575 * We need to make sure it's not trying to reach into the Launcher itself, so
576 * we have a convenient routine which checks it and exits with an error message
577 * if something funny is going on:
578 */
579static void *_check_pointer(unsigned long addr, unsigned int size,
580 unsigned int line)
581{
582 /*
583 * Check if the requested address and size exceeds the allocated memory,
584 * or addr + size wraps around.
585 */
586 if ((addr + size) > guest_limit || (addr + size) < addr)
587 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
588 /*
589 * We return a pointer for the caller's convenience, now we know it's
590 * safe to use.
591 */
592 return from_guest_phys(addr);
593}
594/* A macro which transparently hands the line number to the real function. */
595#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
596
597/*
598 * Each buffer in the virtqueues is actually a chain of descriptors. This
599 * function returns the next descriptor in the chain, or vq->vring.num if we're
600 * at the end.
601 */
602static unsigned next_desc(struct vring_desc *desc,
603 unsigned int i, unsigned int max)
604{
605 unsigned int next;
606
607 /* If this descriptor says it doesn't chain, we're done. */
608 if (!(desc[i].flags & VRING_DESC_F_NEXT))
609 return max;
610
611 /* Check they're not leading us off end of descriptors. */
612 next = desc[i].next;
613 /* Make sure compiler knows to grab that: we don't want it changing! */
614 wmb();
615
616 if (next >= max)
617 errx(1, "Desc next is %u", next);
618
619 return next;
620}
621
622/*
623 * This actually sends the interrupt for this virtqueue, if we've used a
624 * buffer.
625 */
626static void trigger_irq(struct virtqueue *vq)
627{
628 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
629
630 /* Don't inform them if nothing used. */
631 if (!vq->pending_used)
632 return;
633 vq->pending_used = 0;
634
635 /* If they don't want an interrupt, don't send one... */
636 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
637 return;
638 }
639
640 /* Send the Guest an interrupt tell them we used something up. */
641 if (write(lguest_fd, buf, sizeof(buf)) != 0)
642 err(1, "Triggering irq %i", vq->config.irq);
643}
644
645/*
646 * This looks in the virtqueue for the first available buffer, and converts
647 * it to an iovec for convenient access. Since descriptors consist of some
648 * number of output then some number of input descriptors, it's actually two
649 * iovecs, but we pack them into one and note how many of each there were.
650 *
651 * This function waits if necessary, and returns the descriptor number found.
652 */
653static unsigned wait_for_vq_desc(struct virtqueue *vq,
654 struct iovec iov[],
655 unsigned int *out_num, unsigned int *in_num)
656{
657 unsigned int i, head, max;
658 struct vring_desc *desc;
659 u16 last_avail = lg_last_avail(vq);
660
661 /* There's nothing available? */
662 while (last_avail == vq->vring.avail->idx) {
663 u64 event;
664
665 /*
666 * Since we're about to sleep, now is a good time to tell the
667 * Guest about what we've used up to now.
668 */
669 trigger_irq(vq);
670
671 /* OK, now we need to know about added descriptors. */
672 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
673
674 /*
675 * They could have slipped one in as we were doing that: make
676 * sure it's written, then check again.
677 */
678 mb();
679 if (last_avail != vq->vring.avail->idx) {
680 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
681 break;
682 }
683
684 /* Nothing new? Wait for eventfd to tell us they refilled. */
685 if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
686 errx(1, "Event read failed?");
687
688 /* We don't need to be notified again. */
689 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
690 }
691
692 /* Check it isn't doing very strange things with descriptor numbers. */
693 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
694 errx(1, "Guest moved used index from %u to %u",
695 last_avail, vq->vring.avail->idx);
696
697 /*
698 * Grab the next descriptor number they're advertising, and increment
699 * the index we've seen.
700 */
701 head = vq->vring.avail->ring[last_avail % vq->vring.num];
702 lg_last_avail(vq)++;
703
704 /* If their number is silly, that's a fatal mistake. */
705 if (head >= vq->vring.num)
706 errx(1, "Guest says index %u is available", head);
707
708 /* When we start there are none of either input nor output. */
709 *out_num = *in_num = 0;
710
711 max = vq->vring.num;
712 desc = vq->vring.desc;
713 i = head;
714
715 /*
716 * If this is an indirect entry, then this buffer contains a descriptor
717 * table which we handle as if it's any normal descriptor chain.
718 */
719 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
720 if (desc[i].len % sizeof(struct vring_desc))
721 errx(1, "Invalid size for indirect buffer table");
722
723 max = desc[i].len / sizeof(struct vring_desc);
724 desc = check_pointer(desc[i].addr, desc[i].len);
725 i = 0;
726 }
727
728 do {
729 /* Grab the first descriptor, and check it's OK. */
730 iov[*out_num + *in_num].iov_len = desc[i].len;
731 iov[*out_num + *in_num].iov_base
732 = check_pointer(desc[i].addr, desc[i].len);
733 /* If this is an input descriptor, increment that count. */
734 if (desc[i].flags & VRING_DESC_F_WRITE)
735 (*in_num)++;
736 else {
737 /*
738 * If it's an output descriptor, they're all supposed
739 * to come before any input descriptors.
740 */
741 if (*in_num)
742 errx(1, "Descriptor has out after in");
743 (*out_num)++;
744 }
745
746 /* If we've got too many, that implies a descriptor loop. */
747 if (*out_num + *in_num > max)
748 errx(1, "Looped descriptor");
749 } while ((i = next_desc(desc, i, max)) != max);
750
751 return head;
752}
753
754/*
755 * After we've used one of their buffers, we tell the Guest about it. Sometime
756 * later we'll want to send them an interrupt using trigger_irq(); note that
757 * wait_for_vq_desc() does that for us if it has to wait.
758 */
759static void add_used(struct virtqueue *vq, unsigned int head, int len)
760{
761 struct vring_used_elem *used;
762
763 /*
764 * The virtqueue contains a ring of used buffers. Get a pointer to the
765 * next entry in that used ring.
766 */
767 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
768 used->id = head;
769 used->len = len;
770 /* Make sure buffer is written before we update index. */
771 wmb();
772 vq->vring.used->idx++;
773 vq->pending_used++;
774}
775
/* Convenience wrapper: mark the buffer used, then interrupt the Guest. */
static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
{
	/* Publish the used entry first... */
	add_used(vq, head, len);

	/* ...then let the Guest know about it. */
	trigger_irq(vq);
}
782
783/*
784 * The Console
785 *
786 * We associate some data with the console for our exit hack.
787 */
struct console_abort {
	/* Number of consecutive ^C presses seen so far. */
	int count;
	/* Time of the first press in the current run. */
	struct timeval start;
};
794
795/* This is the routine which handles console input (ie. stdin). */
796static void console_input(struct virtqueue *vq)
797{
798 int len;
799 unsigned int head, in_num, out_num;
800 struct console_abort *abort = vq->dev->priv;
801 struct iovec iov[vq->vring.num];
802
803 /* Make sure there's a descriptor available. */
804 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
805 if (out_num)
806 errx(1, "Output buffers in console in queue?");
807
808 /* Read into it. This is where we usually wait. */
809 len = readv(STDIN_FILENO, iov, in_num);
810 if (len <= 0) {
811 /* Ran out of input? */
812 warnx("Failed to get console input, ignoring console.");
813 /*
814 * For simplicity, dying threads kill the whole Launcher. So
815 * just nap here.
816 */
817 for (;;)
818 pause();
819 }
820
821 /* Tell the Guest we used a buffer. */
822 add_used_and_trigger(vq, head, len);
823
824 /*
825 * Three ^C within one second? Exit.
826 *
827 * This is such a hack, but works surprisingly well. Each ^C has to
828 * be in a buffer by itself, so they can't be too fast. But we check
829 * that we get three within about a second, so they can't be too
830 * slow.
831 */
832 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
833 abort->count = 0;
834 return;
835 }
836
837 abort->count++;
838 if (abort->count == 1)
839 gettimeofday(&abort->start, NULL);
840 else if (abort->count == 3) {
841 struct timeval now;
842 gettimeofday(&now, NULL);
843 /* Kill all Launcher processes with SIGINT, like normal ^C */
844 if (now.tv_sec <= abort->start.tv_sec+1)
845 kill(0, SIGINT);
846 abort->count = 0;
847 }
848}
849
850/* This is the routine which handles console output (ie. stdout). */
851static void console_output(struct virtqueue *vq)
852{
853 unsigned int head, out, in;
854 struct iovec iov[vq->vring.num];
855
856 /* We usually wait in here, for the Guest to give us something. */
857 head = wait_for_vq_desc(vq, iov, &out, &in);
858 if (in)
859 errx(1, "Input buffers in console output queue?");
860
861 /* writev can return a partial write, so we loop here. */
862 while (!iov_empty(iov, out)) {
863 int len = writev(STDOUT_FILENO, iov, out);
864 if (len <= 0)
865 err(1, "Write to stdout gave %i", len);
866 iov_consume(iov, out, len);
867 }
868
869 /*
870 * We're finished with that buffer: if we're going to sleep,
871 * wait_for_vq_desc() will prod the Guest with an interrupt.
872 */
873 add_used(vq, head, 0);
874}
875
876/*
877 * The Network
878 *
879 * Handling output for network is also simple: we get all the output buffers
880 * and write them to /dev/net/tun.
881 */
882struct net_info {
883 int tunfd;
884};
885
886static void net_output(struct virtqueue *vq)
887{
888 struct net_info *net_info = vq->dev->priv;
889 unsigned int head, out, in;
890 struct iovec iov[vq->vring.num];
891
892 /* We usually wait in here for the Guest to give us a packet. */
893 head = wait_for_vq_desc(vq, iov, &out, &in);
894 if (in)
895 errx(1, "Input buffers in net output queue?");
896 /*
897 * Send the whole thing through to /dev/net/tun. It expects the exact
898 * same format: what a coincidence!
899 */
900 if (writev(net_info->tunfd, iov, out) < 0)
901 errx(1, "Write to tun failed?");
902
903 /*
904 * Done with that one; wait_for_vq_desc() will send the interrupt if
905 * all packets are processed.
906 */
907 add_used(vq, head, 0);
908}
909
910/*
911 * Handling network input is a bit trickier, because I've tried to optimize it.
912 *
913 * First we have a helper routine which tells us if reading from this file
914 * descriptor (ie. the /dev/net/tun device) will block:
915 */
static bool will_block(int fd)
{
	/* A zero timeout turns select() into a poll for readability. */
	struct timeval no_wait = { .tv_sec = 0, .tv_usec = 0 };
	fd_set readable;
	int ready;

	FD_ZERO(&readable);
	FD_SET(fd, &readable);
	ready = select(fd+1, &readable, NULL, NULL, &no_wait);

	/* Anything other than "exactly our fd is ready" counts as blocking. */
	return ready != 1;
}
924
925/*
926 * This handles packets coming in from the tun device to our Guest. Like all
927 * service routines, it gets called again as soon as it returns, so you don't
928 * see a while(1) loop here.
929 */
930static void net_input(struct virtqueue *vq)
931{
932 int len;
933 unsigned int head, out, in;
934 struct iovec iov[vq->vring.num];
935 struct net_info *net_info = vq->dev->priv;
936
937 /*
938 * Get a descriptor to write an incoming packet into. This will also
939 * send an interrupt if they're out of descriptors.
940 */
941 head = wait_for_vq_desc(vq, iov, &out, &in);
942 if (out)
943 errx(1, "Output buffers in net input queue?");
944
945 /*
946 * If it looks like we'll block reading from the tun device, send them
947 * an interrupt.
948 */
949 if (vq->pending_used && will_block(net_info->tunfd))
950 trigger_irq(vq);
951
952 /*
953 * Read in the packet. This is where we normally wait (when there's no
954 * incoming network traffic).
955 */
956 len = readv(net_info->tunfd, iov, in);
957 if (len <= 0)
958 err(1, "Failed to read from tun.");
959
960 /*
961 * Mark that packet buffer as used, but don't interrupt here. We want
962 * to wait until we've done as much work as we can.
963 */
964 add_used(vq, head, len);
965}
966/*:*/
967
968/* This is the helper to create threads: run the service routine in a loop. */
969static int do_thread(void *_vq)
970{
971 struct virtqueue *vq = _vq;
972
973 for (;;)
974 vq->service(vq);
975 return 0;
976}
977
978/*
979 * When a child dies, we kill our entire process group with SIGTERM. This
980 * also has the side effect that the shell restores the console for us!
981 */
static void kill_launcher(int signal)
{
	/* We don't care which signal got us here. */
	(void)signal;

	/* A pid of 0 targets every process in our own process group. */
	kill(0, SIGTERM);
}
986
987static void reset_device(struct device *dev)
988{
989 struct virtqueue *vq;
990
991 verbose("Resetting device %s\n", dev->name);
992
993 /* Clear any features they've acked. */
994 memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);
995
996 /* We're going to be explicitly killing threads, so ignore them. */
997 signal(SIGCHLD, SIG_IGN);
998
999 /* Zero out the virtqueues, get rid of their threads */
1000 for (vq = dev->vq; vq; vq = vq->next) {
1001 if (vq->thread != (pid_t)-1) {
1002 kill(vq->thread, SIGTERM);
1003 waitpid(vq->thread, NULL, 0);
1004 vq->thread = (pid_t)-1;
1005 }
1006 memset(vq->vring.desc, 0,
1007 vring_size(vq->config.num, LGUEST_VRING_ALIGN));
1008 lg_last_avail(vq) = 0;
1009 }
1010 dev->running = false;
1011
1012 /* Now we care if threads die. */
1013 signal(SIGCHLD, (void *)kill_launcher);
1014}
1015
1016/*L:216
1017 * This actually creates the thread which services the virtqueue for a device.
1018 */
1019static void create_thread(struct virtqueue *vq)
1020{
1021 /*
1022 * Create stack for thread. Since the stack grows upwards, we point
1023 * the stack pointer to the end of this region.
1024 */
1025 char *stack = malloc(32768);
1026 unsigned long args[] = { LHREQ_EVENTFD,
1027 vq->config.pfn*getpagesize(), 0 };
1028
1029 /* Create a zero-initialized eventfd. */
1030 vq->eventfd = eventfd(0, 0);
1031 if (vq->eventfd < 0)
1032 err(1, "Creating eventfd");
1033 args[2] = vq->eventfd;
1034
1035 /*
1036 * Attach an eventfd to this virtqueue: it will go off when the Guest
1037 * does an LHCALL_NOTIFY for this vq.
1038 */
1039 if (write(lguest_fd, &args, sizeof(args)) != 0)
1040 err(1, "Attaching eventfd");
1041
1042 /*
1043 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
1044 * we get a signal if it dies.
1045 */
1046 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
1047 if (vq->thread == (pid_t)-1)
1048 err(1, "Creating clone");
1049
1050 /* We close our local copy now the child has it. */
1051 close(vq->eventfd);
1052}
1053
1054static void start_device(struct device *dev)
1055{
1056 unsigned int i;
1057 struct virtqueue *vq;
1058
1059 verbose("Device %s OK: offered", dev->name);
1060 for (i = 0; i < dev->feature_len; i++)
1061 verbose(" %02x", get_feature_bits(dev)[i]);
1062 verbose(", accepted");
1063 for (i = 0; i < dev->feature_len; i++)
1064 verbose(" %02x", get_feature_bits(dev)
1065 [dev->feature_len+i]);
1066
1067 for (vq = dev->vq; vq; vq = vq->next) {
1068 if (vq->service)
1069 create_thread(vq);
1070 }
1071 dev->running = true;
1072}
1073
1074static void cleanup_devices(void)
1075{
1076 struct device *dev;
1077
1078 for (dev = devices.dev; dev; dev = dev->next)
1079 reset_device(dev);
1080
1081 /* If we saved off the original terminal settings, restore them now. */
1082 if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
1083 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
1084}
1085
1086/* When the Guest tells us they updated the status field, we handle it. */
1087static void update_device_status(struct device *dev)
1088{
1089 /* A zero status is a reset, otherwise it's a set of flags. */
1090 if (dev->desc->status == 0)
1091 reset_device(dev);
1092 else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
1093 warnx("Device %s configuration FAILED", dev->name);
1094 if (dev->running)
1095 reset_device(dev);
1096 } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
1097 if (!dev->running)
1098 start_device(dev);
1099 }
1100}
1101
1102/*L:215
1103 * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In
1104 * particular, it's used to notify us of device status changes during boot.
1105 */
1106static void handle_output(unsigned long addr)
1107{
1108 struct device *i;
1109
1110 /* Check each device. */
1111 for (i = devices.dev; i; i = i->next) {
1112 struct virtqueue *vq;
1113
1114 /*
1115 * Notifications to device descriptors mean they updated the
1116 * device status.
1117 */
1118 if (from_guest_phys(addr) == i->desc) {
1119 update_device_status(i);
1120 return;
1121 }
1122
1123 /*
1124 * Devices *can* be used before status is set to DRIVER_OK.
1125 * The original plan was that they would never do this: they
1126 * would always finish setting up their status bits before
1127 * actually touching the virtqueues. In practice, we allowed
1128 * them to, and they do (eg. the disk probes for partition
1129 * tables as part of initialization).
1130 *
1131 * If we see this, we start the device: once it's running, we
1132 * expect the device to catch all the notifications.
1133 */
1134 for (vq = i->vq; vq; vq = vq->next) {
1135 if (addr != vq->config.pfn*getpagesize())
1136 continue;
1137 if (i->running)
1138 errx(1, "Notification on running %s", i->name);
1139 /* This just calls create_thread() for each virtqueue */
1140 start_device(i);
1141 return;
1142 }
1143 }
1144
1145 /*
1146 * Early console write is done using notify on a nul-terminated string
1147 * in Guest memory. It's also great for hacking debugging messages
1148 * into a Guest.
1149 */
1150 if (addr >= guest_limit)
1151 errx(1, "Bad NOTIFY %#lx", addr);
1152
1153 write(STDOUT_FILENO, from_guest_phys(addr),
1154 strnlen(from_guest_phys(addr), guest_limit - addr));
1155}
1156
1157/*L:190
1158 * Device Setup
1159 *
1160 * All devices need a descriptor so the Guest knows it exists, and a "struct
1161 * device" so the Launcher can keep track of it. We have common helper
1162 * routines to allocate and manage them.
1163 */
1164
1165/*
1166 * The layout of the device page is a "struct lguest_device_desc" followed by a
1167 * number of virtqueue descriptors, then two sets of feature bits, then an
1168 * array of configuration bytes. This routine returns the configuration
1169 * pointer.
1170 */
1171static u8 *device_config(const struct device *dev)
1172{
1173 return (void *)(dev->desc + 1)
1174 + dev->num_vq * sizeof(struct lguest_vqconfig)
1175 + dev->feature_len * 2;
1176}
1177
1178/*
1179 * This routine allocates a new "struct lguest_device_desc" from descriptor
1180 * table page just above the Guest's normal memory. It returns a pointer to
1181 * that descriptor.
1182 */
1183static struct lguest_device_desc *new_dev_desc(u16 type)
1184{
1185 struct lguest_device_desc d = { .type = type };
1186 void *p;
1187
1188 /* Figure out where the next device config is, based on the last one. */
1189 if (devices.lastdev)
1190 p = device_config(devices.lastdev)
1191 + devices.lastdev->desc->config_len;
1192 else
1193 p = devices.descpage;
1194
1195 /* We only have one page for all the descriptors. */
1196 if (p + sizeof(d) > (void *)devices.descpage + getpagesize())
1197 errx(1, "Too many devices");
1198
1199 /* p might not be aligned, so we memcpy in. */
1200 return memcpy(p, &d, sizeof(d));
1201}
1202
1203/*
1204 * Each device descriptor is followed by the description of its virtqueues. We
1205 * specify how many descriptors the virtqueue is to have.
1206 */
1207static void add_virtqueue(struct device *dev, unsigned int num_descs,
1208 void (*service)(struct virtqueue *))
1209{
1210 unsigned int pages;
1211 struct virtqueue **i, *vq = malloc(sizeof(*vq));
1212 void *p;
1213
1214 /* First we need some memory for this virtqueue. */
1215 pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1)
1216 / getpagesize();
1217 p = get_pages(pages);
1218
1219 /* Initialize the virtqueue */
1220 vq->next = NULL;
1221 vq->last_avail_idx = 0;
1222 vq->dev = dev;
1223
1224 /*
1225 * This is the routine the service thread will run, and its Process ID
1226 * once it's running.
1227 */
1228 vq->service = service;
1229 vq->thread = (pid_t)-1;
1230
1231 /* Initialize the configuration. */
1232 vq->config.num = num_descs;
1233 vq->config.irq = devices.next_irq++;
1234 vq->config.pfn = to_guest_phys(p) / getpagesize();
1235
1236 /* Initialize the vring. */
1237 vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN);
1238
1239 /*
1240 * Append virtqueue to this device's descriptor. We use
1241 * device_config() to get the end of the device's current virtqueues;
1242 * we check that we haven't added any config or feature information
1243 * yet, otherwise we'd be overwriting them.
1244 */
1245 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
1246 memcpy(device_config(dev), &vq->config, sizeof(vq->config));
1247 dev->num_vq++;
1248 dev->desc->num_vq++;
1249
1250 verbose("Virtqueue page %#lx\n", to_guest_phys(p));
1251
1252 /*
1253 * Add to tail of list, so dev->vq is first vq, dev->vq->next is
1254 * second.
1255 */
1256 for (i = &dev->vq; *i; i = &(*i)->next);
1257 *i = vq;
1258}
1259
1260/*
1261 * The first half of the feature bitmask is for us to advertise features. The
1262 * second half is for the Guest to accept features.
1263 */
1264static void add_feature(struct device *dev, unsigned bit)
1265{
1266 u8 *features = get_feature_bits(dev);
1267
1268 /* We can't extend the feature bits once we've added config bytes */
1269 if (dev->desc->feature_len <= bit / CHAR_BIT) {
1270 assert(dev->desc->config_len == 0);
1271 dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1;
1272 }
1273
1274 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
1275}
1276
1277/*
1278 * This routine sets the configuration fields for an existing device's
1279 * descriptor. It only works for the last device, but that's OK because that's
1280 * how we use it.
1281 */
1282static void set_config(struct device *dev, unsigned len, const void *conf)
1283{
1284 /* Check we haven't overflowed our single page. */
1285 if (device_config(dev) + len > devices.descpage + getpagesize())
1286 errx(1, "Too many devices");
1287
1288 /* Copy in the config information, and store the length. */
1289 memcpy(device_config(dev), conf, len);
1290 dev->desc->config_len = len;
1291
1292 /* Size must fit in config_len field (8 bits)! */
1293 assert(dev->desc->config_len == len);
1294}
1295
1296/*
1297 * This routine does all the creation and setup of a new device, including
1298 * calling new_dev_desc() to allocate the descriptor and device memory. We
1299 * don't actually start the service threads until later.
1300 *
1301 * See what I mean about userspace being boring?
1302 */
1303static struct device *new_device(const char *name, u16 type)
1304{
1305 struct device *dev = malloc(sizeof(*dev));
1306
1307 /* Now we populate the fields one at a time. */
1308 dev->desc = new_dev_desc(type);
1309 dev->name = name;
1310 dev->vq = NULL;
1311 dev->feature_len = 0;
1312 dev->num_vq = 0;
1313 dev->running = false;
1314
1315 /*
1316 * Append to device list. Prepending to a single-linked list is
1317 * easier, but the user expects the devices to be arranged on the bus
1318 * in command-line order. The first network device on the command line
1319 * is eth0, the first block device /dev/vda, etc.
1320 */
1321 if (devices.lastdev)
1322 devices.lastdev->next = dev;
1323 else
1324 devices.dev = dev;
1325 devices.lastdev = dev;
1326
1327 return dev;
1328}
1329
1330/*
1331 * Our first setup routine is the console. It's a fairly simple device, but
1332 * UNIX tty handling makes it uglier than it could be.
1333 */
1334static void setup_console(void)
1335{
1336 struct device *dev;
1337
1338 /* If we can save the initial standard input settings... */
1339 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
1340 struct termios term = orig_term;
1341 /*
1342 * Then we turn off echo, line buffering and ^C etc: We want a
1343 * raw input stream to the Guest.
1344 */
1345 term.c_lflag &= ~(ISIG|ICANON|ECHO);
1346 tcsetattr(STDIN_FILENO, TCSANOW, &term);
1347 }
1348
1349 dev = new_device("console", VIRTIO_ID_CONSOLE);
1350
1351 /* We store the console state in dev->priv, and initialize it. */
1352 dev->priv = malloc(sizeof(struct console_abort));
1353 ((struct console_abort *)dev->priv)->count = 0;
1354
1355 /*
1356 * The console needs two virtqueues: the input then the output. When
1357 * they put something the input queue, we make sure we're listening to
1358 * stdin. When they put something in the output queue, we write it to
1359 * stdout.
1360 */
1361 add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
1362 add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
1363
1364 verbose("device %u: console\n", ++devices.device_num);
1365}
1366/*:*/
1367
1368/*M:010
1369 * Inter-guest networking is an interesting area. Simplest is to have a
1370 * --sharenet=<name> option which opens or creates a named pipe. This can be
1371 * used to send packets to another guest in a 1:1 manner.
1372 *
 * More sophisticated is to use one of the tools developed for projects like UML
1374 * to do networking.
1375 *
1376 * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
1377 * completely generic ("here's my vring, attach to your vring") and would work
1378 * for any traffic. Of course, namespace and permissions issues need to be
1379 * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide
1380 * multiple inter-guest channels behind one interface, although it would
1381 * require some manner of hotplugging new virtio channels.
1382 *
1383 * Finally, we could implement a virtio network switch in the kernel.
1384:*/
1385
1386static u32 str2ip(const char *ipaddr)
1387{
1388 unsigned int b[4];
1389
1390 if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
1391 errx(1, "Failed to parse IP address '%s'", ipaddr);
1392 return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
1393}
1394
static void str2mac(const char *macaddr, unsigned char mac[6])
{
	unsigned int parsed[6];
	int i;

	/* We want six colon-separated hex bytes; anything less is an error. */
	if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
		   &parsed[0], &parsed[1], &parsed[2],
		   &parsed[3], &parsed[4], &parsed[5]) != 6)
		errx(1, "Failed to parse mac address '%s'", macaddr);

	/* sscanf needed full-sized ints: narrow each one into the result. */
	for (i = 0; i < 6; i++)
		mac[i] = parsed[i];
}
1408
1409/*
1410 * This code is "adapted" from libbridge: it attaches the Host end of the
1411 * network device to the bridge device specified by the command line.
1412 *
1413 * This is yet another James Morris contribution (I'm an IP-level guy, so I
1414 * dislike bridging), and I just try not to break it.
1415 */
1416static void add_to_bridge(int fd, const char *if_name, const char *br_name)
1417{
1418 int ifidx;
1419 struct ifreq ifr;
1420
1421 if (!*br_name)
1422 errx(1, "must specify bridge name");
1423
1424 ifidx = if_nametoindex(if_name);
1425 if (!ifidx)
1426 errx(1, "interface %s does not exist!", if_name);
1427
1428 strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
1429 ifr.ifr_name[IFNAMSIZ-1] = '\0';
1430 ifr.ifr_ifindex = ifidx;
1431 if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
1432 err(1, "can't add %s to bridge %s", if_name, br_name);
1433}
1434
1435/*
1436 * This sets up the Host end of the network device with an IP address, brings
1437 * it up so packets will flow, the copies the MAC address into the hwaddr
1438 * pointer.
1439 */
1440static void configure_device(int fd, const char *tapif, u32 ipaddr)
1441{
1442 struct ifreq ifr;
1443 struct sockaddr_in sin;
1444
1445 memset(&ifr, 0, sizeof(ifr));
1446 strcpy(ifr.ifr_name, tapif);
1447
1448 /* Don't read these incantations. Just cut & paste them like I did! */
1449 sin.sin_family = AF_INET;
1450 sin.sin_addr.s_addr = htonl(ipaddr);
1451 memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
1452 if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
1453 err(1, "Setting %s interface address", tapif);
1454 ifr.ifr_flags = IFF_UP;
1455 if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
1456 err(1, "Bringing interface %s up", tapif);
1457}
1458
1459static int get_tun_device(char tapif[IFNAMSIZ])
1460{
1461 struct ifreq ifr;
1462 int netfd;
1463
1464 /* Start with this zeroed. Messy but sure. */
1465 memset(&ifr, 0, sizeof(ifr));
1466
1467 /*
1468 * We open the /dev/net/tun device and tell it we want a tap device. A
1469 * tap device is like a tun device, only somehow different. To tell
1470 * the truth, I completely blundered my way through this code, but it
1471 * works now!
1472 */
1473 netfd = open_or_die("/dev/net/tun", O_RDWR);
1474 ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
1475 strcpy(ifr.ifr_name, "tap%d");
1476 if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
1477 err(1, "configuring /dev/net/tun");
1478
1479 if (ioctl(netfd, TUNSETOFFLOAD,
1480 TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
1481 err(1, "Could not set features for tun device");
1482
1483 /*
1484 * We don't need checksums calculated for packets coming in this
1485 * device: trust us!
1486 */
1487 ioctl(netfd, TUNSETNOCSUM, 1);
1488
1489 memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
1490 return netfd;
1491}
1492
1493/*L:195
1494 * Our network is a Host<->Guest network. This can either use bridging or
1495 * routing, but the principle is the same: it uses the "tun" device to inject
1496 * packets into the Host as if they came in from a normal network card. We
1497 * just shunt packets between the Guest and the tun device.
1498 */
1499static void setup_tun_net(char *arg)
1500{
1501 struct device *dev;
1502 struct net_info *net_info = malloc(sizeof(*net_info));
1503 int ipfd;
1504 u32 ip = INADDR_ANY;
1505 bool bridging = false;
1506 char tapif[IFNAMSIZ], *p;
1507 struct virtio_net_config conf;
1508
1509 net_info->tunfd = get_tun_device(tapif);
1510
1511 /* First we create a new network device. */
1512 dev = new_device("net", VIRTIO_ID_NET);
1513 dev->priv = net_info;
1514
1515 /* Network devices need a recv and a send queue, just like console. */
1516 add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
1517 add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
1518
1519 /*
1520 * We need a socket to perform the magic network ioctls to bring up the
1521 * tap interface, connect to the bridge etc. Any socket will do!
1522 */
1523 ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
1524 if (ipfd < 0)
1525 err(1, "opening IP socket");
1526
1527 /* If the command line was --tunnet=bridge:<name> do bridging. */
1528 if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
1529 arg += strlen(BRIDGE_PFX);
1530 bridging = true;
1531 }
1532
1533 /* A mac address may follow the bridge name or IP address */
1534 p = strchr(arg, ':');
1535 if (p) {
1536 str2mac(p+1, conf.mac);
1537 add_feature(dev, VIRTIO_NET_F_MAC);
1538 *p = '\0';
1539 }
1540
1541 /* arg is now either an IP address or a bridge name */
1542 if (bridging)
1543 add_to_bridge(ipfd, tapif, arg);
1544 else
1545 ip = str2ip(arg);
1546
1547 /* Set up the tun device. */
1548 configure_device(ipfd, tapif, ip);
1549
1550 /* Expect Guest to handle everything except UFO */
1551 add_feature(dev, VIRTIO_NET_F_CSUM);
1552 add_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
1553 add_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
1554 add_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
1555 add_feature(dev, VIRTIO_NET_F_GUEST_ECN);
1556 add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
1557 add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
1558 add_feature(dev, VIRTIO_NET_F_HOST_ECN);
1559 /* We handle indirect ring entries */
1560 add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
1561 set_config(dev, sizeof(conf), &conf);
1562
1563 /* We don't need the socket any more; setup is done. */
1564 close(ipfd);
1565
1566 devices.device_num++;
1567
1568 if (bridging)
1569 verbose("device %u: tun %s attached to bridge: %s\n",
1570 devices.device_num, tapif, arg);
1571 else
1572 verbose("device %u: tun %s: %s\n",
1573 devices.device_num, tapif, arg);
1574}
1575/*:*/
1576
1577/* This hangs off device->priv. */
1578struct vblk_info {
1579 /* The size of the file. */
1580 off64_t len;
1581
1582 /* The file descriptor for the file. */
1583 int fd;
1584
1585};
1586
1587/*L:210
1588 * The Disk
1589 *
1590 * The disk only has one virtqueue, so it only has one thread. It is really
1591 * simple: the Guest asks for a block number and we read or write that position
1592 * in the file.
1593 *
1594 * Before we serviced each virtqueue in a separate thread, that was unacceptably
1595 * slow: the Guest waits until the read is finished before running anything
1596 * else, even if it could have been doing useful work.
1597 *
1598 * We could have used async I/O, except it's reputed to suck so hard that
1599 * characters actually go missing from your code when you try to use it.
1600 */
1601static void blk_request(struct virtqueue *vq)
1602{
1603 struct vblk_info *vblk = vq->dev->priv;
1604 unsigned int head, out_num, in_num, wlen;
1605 int ret;
1606 u8 *in;
1607 struct virtio_blk_outhdr *out;
1608 struct iovec iov[vq->vring.num];
1609 off64_t off;
1610
1611 /*
1612 * Get the next request, where we normally wait. It triggers the
1613 * interrupt to acknowledge previously serviced requests (if any).
1614 */
1615 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1616
1617 /*
1618 * Every block request should contain at least one output buffer
1619 * (detailing the location on disk and the type of request) and one
1620 * input buffer (to hold the result).
1621 */
1622 if (out_num == 0 || in_num == 0)
1623 errx(1, "Bad virtblk cmd %u out=%u in=%u",
1624 head, out_num, in_num);
1625
1626 out = convert(&iov[0], struct virtio_blk_outhdr);
1627 in = convert(&iov[out_num+in_num-1], u8);
1628 /*
1629 * For historical reasons, block operations are expressed in 512 byte
1630 * "sectors".
1631 */
1632 off = out->sector * 512;
1633
1634 /*
1635 * In general the virtio block driver is allowed to try SCSI commands.
1636 * It'd be nice if we supported eject, for example, but we don't.
1637 */
1638 if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
1639 fprintf(stderr, "Scsi commands unsupported\n");
1640 *in = VIRTIO_BLK_S_UNSUPP;
1641 wlen = sizeof(*in);
1642 } else if (out->type & VIRTIO_BLK_T_OUT) {
1643 /*
1644 * Write
1645 *
1646 * Move to the right location in the block file. This can fail
1647 * if they try to write past end.
1648 */
1649 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1650 err(1, "Bad seek to sector %llu", out->sector);
1651
1652 ret = writev(vblk->fd, iov+1, out_num-1);
1653 verbose("WRITE to sector %llu: %i\n", out->sector, ret);
1654
1655 /*
1656 * Grr... Now we know how long the descriptor they sent was, we
1657 * make sure they didn't try to write over the end of the block
1658 * file (possibly extending it).
1659 */
1660 if (ret > 0 && off + ret > vblk->len) {
1661 /* Trim it back to the correct length */
1662 ftruncate64(vblk->fd, vblk->len);
1663 /* Die, bad Guest, die. */
1664 errx(1, "Write past end %llu+%u", off, ret);
1665 }
1666
1667 wlen = sizeof(*in);
1668 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
1669 } else if (out->type & VIRTIO_BLK_T_FLUSH) {
1670 /* Flush */
1671 ret = fdatasync(vblk->fd);
1672 verbose("FLUSH fdatasync: %i\n", ret);
1673 wlen = sizeof(*in);
1674 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
1675 } else {
1676 /*
1677 * Read
1678 *
1679 * Move to the right location in the block file. This can fail
1680 * if they try to read past end.
1681 */
1682 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1683 err(1, "Bad seek to sector %llu", out->sector);
1684
1685 ret = readv(vblk->fd, iov+1, in_num-1);
1686 verbose("READ from sector %llu: %i\n", out->sector, ret);
1687 if (ret >= 0) {
1688 wlen = sizeof(*in) + ret;
1689 *in = VIRTIO_BLK_S_OK;
1690 } else {
1691 wlen = sizeof(*in);
1692 *in = VIRTIO_BLK_S_IOERR;
1693 }
1694 }
1695
1696 /* Finished that request. */
1697 add_used(vq, head, wlen);
1698}
1699
1700/*L:198 This actually sets up a virtual block device. */
1701static void setup_block_file(const char *filename)
1702{
1703 struct device *dev;
1704 struct vblk_info *vblk;
1705 struct virtio_blk_config conf;
1706
1707 /* Creat the device. */
1708 dev = new_device("block", VIRTIO_ID_BLOCK);
1709
1710 /* The device has one virtqueue, where the Guest places requests. */
1711 add_virtqueue(dev, VIRTQUEUE_NUM, blk_request);
1712
1713 /* Allocate the room for our own bookkeeping */
1714 vblk = dev->priv = malloc(sizeof(*vblk));
1715
1716 /* First we open the file and store the length. */
1717 vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
1718 vblk->len = lseek64(vblk->fd, 0, SEEK_END);
1719
1720 /* We support FLUSH. */
1721 add_feature(dev, VIRTIO_BLK_F_FLUSH);
1722
1723 /* Tell Guest how many sectors this device has. */
1724 conf.capacity = cpu_to_le64(vblk->len / 512);
1725
1726 /*
1727 * Tell Guest not to put in too many descriptors at once: two are used
1728 * for the in and out elements.
1729 */
1730 add_feature(dev, VIRTIO_BLK_F_SEG_MAX);
1731 conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
1732
1733 /* Don't try to put whole struct: we have 8 bit limit. */
1734 set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf);
1735
1736 verbose("device %u: virtblock %llu sectors\n",
1737 ++devices.device_num, le64_to_cpu(conf.capacity));
1738}
1739
1740/*L:211
1741 * Our random number generator device reads from /dev/random into the Guest's
1742 * input buffers. The usual case is that the Guest doesn't want random numbers
1743 * and so has no buffers although /dev/random is still readable, whereas
1744 * console is the reverse.
1745 *
1746 * The same logic applies, however.
1747 */
struct rng_info {
	/* File descriptor we read the random bytes from. */
	int rfd;
};
1751
1752static void rng_input(struct virtqueue *vq)
1753{
1754 int len;
1755 unsigned int head, in_num, out_num, totlen = 0;
1756 struct rng_info *rng_info = vq->dev->priv;
1757 struct iovec iov[vq->vring.num];
1758
1759 /* First we need a buffer from the Guests's virtqueue. */
1760 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1761 if (out_num)
1762 errx(1, "Output buffers in rng?");
1763
1764 /*
1765 * Just like the console write, we loop to cover the whole iovec.
1766 * In this case, short reads actually happen quite a bit.
1767 */
1768 while (!iov_empty(iov, in_num)) {
1769 len = readv(rng_info->rfd, iov, in_num);
1770 if (len <= 0)
1771 err(1, "Read from /dev/random gave %i", len);
1772 iov_consume(iov, in_num, len);
1773 totlen += len;
1774 }
1775
1776 /* Tell the Guest about the new input. */
1777 add_used(vq, head, totlen);
1778}
1779
1780/*L:199
1781 * This creates a "hardware" random number device for the Guest.
1782 */
1783static void setup_rng(void)
1784{
1785 struct device *dev;
1786 struct rng_info *rng_info = malloc(sizeof(*rng_info));
1787
1788 /* Our device's privat info simply contains the /dev/random fd. */
1789 rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
1790
1791 /* Create the new device. */
1792 dev = new_device("rng", VIRTIO_ID_RNG);
1793 dev->priv = rng_info;
1794
1795 /* The device has one virtqueue, where the Guest places inbufs. */
1796 add_virtqueue(dev, VIRTQUEUE_NUM, rng_input);
1797
1798 verbose("device %u: rng\n", devices.device_num++);
1799}
1800/* That's the end of device setup. */
1801
1802/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
1803static void __attribute__((noreturn)) restart_guest(void)
1804{
1805 unsigned int i;
1806
1807 /*
1808 * Since we don't track all open fds, we simply close everything beyond
1809 * stderr.
1810 */
1811 for (i = 3; i < FD_SETSIZE; i++)
1812 close(i);
1813
1814 /* Reset all the devices (kills all threads). */
1815 cleanup_devices();
1816
1817 execv(main_args[0], main_args);
1818 err(1, "Could not exec %s", main_args[0]);
1819}
1820
1821/*L:220
1822 * Finally we reach the core of the Launcher which runs the Guest, serves
1823 * its input and output, and finally, lays it to rest.
1824 */
1825static void __attribute__((noreturn)) run_guest(void)
1826{
1827 for (;;) {
1828 unsigned long notify_addr;
1829 int readval;
1830
1831 /* We read from the /dev/lguest device to run the Guest. */
1832 readval = pread(lguest_fd, &notify_addr,
1833 sizeof(notify_addr), cpu_id);
1834
1835 /* One unsigned long means the Guest did HCALL_NOTIFY */
1836 if (readval == sizeof(notify_addr)) {
1837 verbose("Notify on address %#lx\n", notify_addr);
1838 handle_output(notify_addr);
1839 /* ENOENT means the Guest died. Reading tells us why. */
1840 } else if (errno == ENOENT) {
1841 char reason[1024] = { 0 };
1842 pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
1843 errx(1, "%s", reason);
1844 /* ERESTART means that we need to reboot the guest */
1845 } else if (errno == ERESTART) {
1846 restart_guest();
1847 /* Anything else means a bug or incompatible change. */
1848 } else
1849 err(1, "Running guest failed");
1850 }
1851}
1852/*L:240
1853 * This is the end of the Launcher. The good news: we are over halfway
1854 * through! The bad news: the most fiendish part of the code still lies ahead
1855 * of us.
1856 *
1857 * Are you ready? Take a deep breath and join me in the core of the Host, in
1858 * "make Host".
1859:*/
1860
/* Long options for getopt_long(): name, takes-an-argument, flag, value. */
static struct option opts[] = {
	{ "verbose",	no_argument,		NULL, 'v' },
	{ "tunnet",	required_argument,	NULL, 't' },
	{ "block",	required_argument,	NULL, 'b' },
	{ "rng",	no_argument,		NULL, 'r' },
	{ "initrd",	required_argument,	NULL, 'i' },
	{ "username",	required_argument,	NULL, 'u' },
	{ "chroot",	required_argument,	NULL, 'c' },
	/* An all-zero entry marks the end of the table. */
	{ NULL, 0, NULL, 0 },
};
/*
 * Print the command line synopsis and exit with status 1.  Every long option
 * in opts[] is listed here.
 */
static void usage(void)
{
	errx(1, "Usage: lguest [--verbose] "
	     "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
	     "|--block=<filename>|--initrd=<filename>|--rng\n"
	     "|--username=<username>|--chroot=<path>]...\n"
	     "<mem-in-mb> vmlinux [args...]");
}
1878
1879/*L:105 The main routine is where the real work begins: */
1880int main(int argc, char *argv[])
1881{
1882 /* Memory, code startpoint and size of the (optional) initrd. */
1883 unsigned long mem = 0, start, initrd_size = 0;
1884 /* Two temporaries. */
1885 int i, c;
1886 /* The boot information for the Guest. */
1887 struct boot_params *boot;
1888 /* If they specify an initrd file to load. */
1889 const char *initrd_name = NULL;
1890
1891 /* Password structure for initgroups/setres[gu]id */
1892 struct passwd *user_details = NULL;
1893
1894 /* Directory to chroot to */
1895 char *chroot_path = NULL;
1896
1897 /* Save the args: we "reboot" by execing ourselves again. */
1898 main_args = argv;
1899
1900 /*
1901 * First we initialize the device list. We keep a pointer to the last
1902 * device, and the next interrupt number to use for devices (1:
1903 * remember that 0 is used by the timer).
1904 */
1905 devices.lastdev = NULL;
1906 devices.next_irq = 1;
1907
1908 /* We're CPU 0. In fact, that's the only CPU possible right now. */
1909 cpu_id = 0;
1910
1911 /*
1912 * We need to know how much memory so we can set up the device
1913 * descriptor and memory pages for the devices as we parse the command
1914 * line. So we quickly look through the arguments to find the amount
1915 * of memory now.
1916 */
1917 for (i = 1; i < argc; i++) {
1918 if (argv[i][0] != '-') {
1919 mem = atoi(argv[i]) * 1024 * 1024;
1920 /*
1921 * We start by mapping anonymous pages over all of
1922 * guest-physical memory range. This fills it with 0,
1923 * and ensures that the Guest won't be killed when it
1924 * tries to access it.
1925 */
1926 guest_base = map_zeroed_pages(mem / getpagesize()
1927 + DEVICE_PAGES);
1928 guest_limit = mem;
1929 guest_max = mem + DEVICE_PAGES*getpagesize();
1930 devices.descpage = get_pages(1);
1931 break;
1932 }
1933 }
1934
1935 /* The options are fairly straight-forward */
1936 while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
1937 switch (c) {
1938 case 'v':
1939 verbose = true;
1940 break;
1941 case 't':
1942 setup_tun_net(optarg);
1943 break;
1944 case 'b':
1945 setup_block_file(optarg);
1946 break;
1947 case 'r':
1948 setup_rng();
1949 break;
1950 case 'i':
1951 initrd_name = optarg;
1952 break;
1953 case 'u':
1954 user_details = getpwnam(optarg);
1955 if (!user_details)
1956 err(1, "getpwnam failed, incorrect username?");
1957 break;
1958 case 'c':
1959 chroot_path = optarg;
1960 break;
1961 default:
1962 warnx("Unknown argument %s", argv[optind]);
1963 usage();
1964 }
1965 }
1966 /*
1967 * After the other arguments we expect memory and kernel image name,
1968 * followed by command line arguments for the kernel.
1969 */
1970 if (optind + 2 > argc)
1971 usage();
1972
1973 verbose("Guest base is at %p\n", guest_base);
1974
1975 /* We always have a console device */
1976 setup_console();
1977
1978 /* Now we load the kernel */
1979 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
1980
1981 /* Boot information is stashed at physical address 0 */
1982 boot = from_guest_phys(0);
1983
1984 /* Map the initrd image if requested (at top of physical memory) */
1985 if (initrd_name) {
1986 initrd_size = load_initrd(initrd_name, mem);
1987 /*
1988 * These are the location in the Linux boot header where the
1989 * start and size of the initrd are expected to be found.
1990 */
1991 boot->hdr.ramdisk_image = mem - initrd_size;
1992 boot->hdr.ramdisk_size = initrd_size;
1993 /* The bootloader type 0xFF means "unknown"; that's OK. */
1994 boot->hdr.type_of_loader = 0xFF;
1995 }
1996
1997 /*
1998 * The Linux boot header contains an "E820" memory map: ours is a
1999 * simple, single region.
2000 */
2001 boot->e820_entries = 1;
2002 boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
2003 /*
2004 * The boot header contains a command line pointer: we put the command
2005 * line after the boot header.
2006 */
2007 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
2008 /* We use a simple helper to copy the arguments separated by spaces. */
2009 concat((char *)(boot + 1), argv+optind+2);
2010
2011 /* Boot protocol version: 2.07 supports the fields for lguest. */
2012 boot->hdr.version = 0x207;
2013
2014 /* The hardware_subarch value of "1" tells the Guest it's an lguest. */
2015 boot->hdr.hardware_subarch = 1;
2016
2017 /* Tell the entry path not to try to reload segment registers. */
2018 boot->hdr.loadflags |= KEEP_SEGMENTS;
2019
2020 /*
2021 * We tell the kernel to initialize the Guest: this returns the open
2022 * /dev/lguest file descriptor.
2023 */
2024 tell_kernel(start);
2025
2026 /* Ensure that we terminate if a device-servicing child dies. */
2027 signal(SIGCHLD, kill_launcher);
2028
2029 /* If we exit via err(), this kills all the threads, restores tty. */
2030 atexit(cleanup_devices);
2031
2032 /* If requested, chroot to a directory */
2033 if (chroot_path) {
2034 if (chroot(chroot_path) != 0)
2035 err(1, "chroot(\"%s\") failed", chroot_path);
2036
2037 if (chdir("/") != 0)
2038 err(1, "chdir(\"/\") failed");
2039
2040 verbose("chroot done\n");
2041 }
2042
2043 /* If requested, drop privileges */
2044 if (user_details) {
2045 uid_t u;
2046 gid_t g;
2047
2048 u = user_details->pw_uid;
2049 g = user_details->pw_gid;
2050
2051 if (initgroups(user_details->pw_name, g) != 0)
2052 err(1, "initgroups failed");
2053
2054 if (setresgid(g, g, g) != 0)
2055 err(1, "setresgid failed");
2056
2057 if (setresuid(u, u, u) != 0)
2058 err(1, "setresuid failed");
2059
2060 verbose("Dropping privileges completed\n");
2061 }
2062
2063 /* Finally, run the Guest. This doesn't return. */
2064 run_guest();
2065}
2066/*:*/
2067
2068/*M:999
2069 * Mastery is done: you now know everything I do.
2070 *
2071 * But surely you have seen code, features and bugs in your wanderings which
2072 * you now yearn to attack? That is the real game, and I look forward to you
2073 * patching and forking lguest into the Your-Name-Here-visor.
2074 *
2075 * Farewell, and good coding!
2076 * Rusty Russell.
2077 */
diff --git a/Documentation/virtual/lguest/lguest.txt b/Documentation/virtual/lguest/lguest.txt
new file mode 100644
index 000000000000..bff0c554485d
--- /dev/null
+++ b/Documentation/virtual/lguest/lguest.txt
@@ -0,0 +1,129 @@
1 __
2 (___()'`; Rusty's Remarkably Unreliable Guide to Lguest
3 /, /` - or, A Young Coder's Illustrated Hypervisor
4 \\"--\\ http://lguest.ozlabs.org
5
6Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel,
7for Linux developers and users to experiment with virtualization with the
8minimum of complexity. Nonetheless, it should have sufficient features to
9make it useful for specific tasks, and, of course, you are encouraged to fork
10and enhance it (see drivers/lguest/README).
11
12Features:
13
14- Kernel module which runs in a normal kernel.
15- Simple I/O model for communication.
16- Simple program to create new guests.
17- Logo contains cute puppies: http://lguest.ozlabs.org
18
19Developer features:
20
21- Fun to hack on.
22- No ABI: being tied to a specific kernel anyway, you can change anything.
23- Many opportunities for improvement or feature implementation.
24
25Running Lguest:
26
27- The easiest way to run lguest is to use the same kernel as guest and host.
28 You can configure them differently, but usually it's easiest not to.
29
30 You will need to configure your kernel with the following options:
31
32 "General setup":
33 "Prompt for development and/or incomplete code/drivers" = Y
34 (CONFIG_EXPERIMENTAL=y)
35
36 "Processor type and features":
37 "Paravirtualized guest support" = Y
38 "Lguest guest support" = Y
39 "High Memory Support" = off/4GB
40 "Alignment value to which kernel should be aligned" = 0x100000
41 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
42 CONFIG_PHYSICAL_ALIGN=0x100000)
43
44 "Device Drivers":
45 "Block devices"
46 "Virtio block driver (EXPERIMENTAL)" = M/Y
47 "Network device support"
48 "Universal TUN/TAP device driver support" = M/Y
49 "Virtio network driver (EXPERIMENTAL)" = M/Y
50 (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m)
51
52 "Virtualization"
53 "Linux hypervisor example code" = M/Y
54 (CONFIG_LGUEST=m)
55
56- A tool called "lguest" is available in this directory: type "make"
57 to build it. If you didn't build your kernel in-tree, use "make
58 O=<builddir>".
59
60- Create or find a root disk image. There are several useful ones
61 around, such as the xm-test tiny root image at
62 http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
63
64 For more serious work, I usually use a distribution ISO image and
65 install it under qemu, then make multiple copies:
66
67 dd if=/dev/zero of=rootfile bs=1M count=2048
68 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
69
70 Make sure that you install a getty on /dev/hvc0 if you want to log in on the
71 console!
72
73- "modprobe lg" if you built it as a module.
74
75- Run an lguest as root:
76
77 Documentation/virtual/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
78 --block=rootfile root=/dev/vda
79
80 Explanation:
81 64: the amount of memory to use, in MB.
82
83 vmlinux: the kernel image found in the top of your build directory. You
84 can also use a standard bzImage.
85
86 --tunnet=192.168.19.1: configures a "tap" device for networking with this
87 IP address.
88
89 --block=rootfile: a file or block device which becomes /dev/vda
90 inside the guest.
91
92 root=/dev/vda: this (and anything else on the command line) are
93 kernel boot parameters.
94
95- Configuring networking. I usually have the host masquerade, using
96 "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 >
97 /proc/sys/net/ipv4/ip_forward". In this example, I would configure
98 eth0 inside the guest at 192.168.19.2.
99
100 Another method is to bridge the tap device to an external interface
101 using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
102 to obtain an IP address. The bridge needs to be configured first:
103 this option simply adds the tap interface to it.
104
105 A simple example on my system:
106
107 ifconfig eth0 0.0.0.0
108 brctl addbr lg0
109 ifconfig lg0 up
110 brctl addif lg0 eth0
111 dhclient lg0
112
113 Then use --tunnet=bridge:lg0 when launching the guest.
114
115 See:
116
117 http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge
118
119 for general information on how to get bridging to work.
120
121- Random number generation. Using the --rng option will provide a
122 /dev/hwrng in the guest that will read from the host's /dev/random.
123 Use this option in conjunction with rng-tools (see ../hw_random.txt)
124 to provide entropy to the guest kernel's /dev/random.
125
126There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
127
128Good luck!
129Rusty Russell rusty@rustcorp.com.au.