aboutsummaryrefslogtreecommitdiffstats
path: root/tools/lguest
diff options
context:
space:
mode:
authorJonathan Herman <hermanjl@cs.unc.edu>2013-01-17 16:15:55 -0500
committerJonathan Herman <hermanjl@cs.unc.edu>2013-01-17 16:15:55 -0500
commit8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
treea8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /tools/lguest
parent406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'tools/lguest')
-rw-r--r--tools/lguest/Makefile8
-rw-r--r--tools/lguest/extract58
-rw-r--r--tools/lguest/lguest.c2052
-rw-r--r--tools/lguest/lguest.txt129
4 files changed, 0 insertions, 2247 deletions
diff --git a/tools/lguest/Makefile b/tools/lguest/Makefile
deleted file mode 100644
index 0ac34206f7a..00000000000
--- a/tools/lguest/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
# This creates the demonstration utility "lguest" which runs a Linux guest.
# Missing headers?  Add "-I../../../include -I../../../arch/x86/include"
CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE

# "all" and "clean" name actions, not files: declare them phony so a stray
# file called "all" or "clean" cannot stop the rule from running.
.PHONY: all clean

all: lguest

clean:
	rm -f lguest
diff --git a/tools/lguest/extract b/tools/lguest/extract
deleted file mode 100644
index 7730bb6e4b9..00000000000
--- a/tools/lguest/extract
+++ /dev/null
@@ -1,58 +0,0 @@
#! /bin/sh
#
# Usage: extract PREFIX file...
#
# Scans each file for comment sections opened by "PREFIX:NUM" and closed by
# ":*", collects every section into a scratch file named NUM, then prints the
# sections in glob order, with a "[ file ]" heading whenever the originating
# file changes.

set -e

PREFIX=$1
shift

# Create the scratch directory before arming the cleanup trap, so the trap
# never runs "rm -r" with an empty argument if mktemp fails.
TMPDIR=`mktemp -d`
trap 'rm -r "$TMPDIR"' 0

# fd 3 is "where the current line goes": /dev/null while outside a section.
exec 3>/dev/null
for f; do
	# IFS is a lone newline so leading whitespace in LINE is preserved.
	while IFS="
" read -r LINE; do
		case "$LINE" in
		# Section start which also contains the ":*" marker.
		*$PREFIX:[0-9]*:\**)
			NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
			if [ -f "$TMPDIR/$NUM" ]; then
				echo "$TMPDIR/$NUM already exists prior to $f"
				exit 1
			fi
			exec 3>>"$TMPDIR/$NUM"
			# Remember which file this section came from.
			echo "$f" | sed 's,\.\./,,g' > "$TMPDIR/.$NUM"
			/bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3
			;;
		# Plain section start.
		*$PREFIX:[0-9]*)
			NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
			if [ -f "$TMPDIR/$NUM" ]; then
				echo "$TMPDIR/$NUM already exists prior to $f"
				exit 1
			fi
			exec 3>>"$TMPDIR/$NUM"
			echo "$f" | sed 's,\.\./,,g' > "$TMPDIR/.$NUM"
			/bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3
			;;
		# Section end: emit the line, then stop collecting.
		*:\**)
			/bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3
			echo >&3
			exec 3>/dev/null
			;;
		*)
			/bin/echo "$LINE" >&3
			;;
		esac
	done < "$f"
	echo >&3
	exec 3>/dev/null
done

# The glob skips the ".NUM" origin files; they are only read via basename.
LASTFILE=""
for f in "$TMPDIR"/*; do
	if [ "$LASTFILE" != "$(cat "$TMPDIR/.$(basename "$f")")" ]; then
		LASTFILE=$(cat "$TMPDIR/.$(basename "$f")")
		echo "[ $LASTFILE ]"
	fi
	cat "$f"
done
58
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
deleted file mode 100644
index 07a03452c22..00000000000
--- a/tools/lguest/lguest.c
+++ /dev/null
@@ -1,2052 +0,0 @@
1/*P:100
2 * This is the Launcher code, a simple program which lays out the "physical"
3 * memory for the new Guest by mapping the kernel image and the virtual
4 * devices, then opens /dev/lguest to tell the kernel about the Guest and
5 * control it.
6:*/
7#define _LARGEFILE64_SOURCE
8#define _GNU_SOURCE
9#include <stdio.h>
10#include <string.h>
11#include <unistd.h>
12#include <err.h>
13#include <stdint.h>
14#include <stdlib.h>
15#include <elf.h>
16#include <sys/mman.h>
17#include <sys/param.h>
18#include <sys/types.h>
19#include <sys/stat.h>
20#include <sys/wait.h>
21#include <sys/eventfd.h>
22#include <fcntl.h>
23#include <stdbool.h>
24#include <errno.h>
25#include <ctype.h>
26#include <sys/socket.h>
27#include <sys/ioctl.h>
28#include <sys/time.h>
29#include <time.h>
30#include <netinet/in.h>
31#include <net/if.h>
32#include <linux/sockios.h>
33#include <linux/if_tun.h>
34#include <sys/uio.h>
35#include <termios.h>
36#include <getopt.h>
37#include <assert.h>
38#include <sched.h>
39#include <limits.h>
40#include <stddef.h>
41#include <signal.h>
42#include <pwd.h>
43#include <grp.h>
44
45#include <linux/virtio_config.h>
46#include <linux/virtio_net.h>
47#include <linux/virtio_blk.h>
48#include <linux/virtio_console.h>
49#include <linux/virtio_rng.h>
50#include <linux/virtio_ring.h>
51#include <asm/bootparam.h>
52#include "../../include/linux/lguest_launcher.h"
53/*L:110
54 * We can ignore the 43 include files we need for this program, but I do want
55 * to draw attention to the use of kernel-style types.
56 *
57 * As Linus said, "C is a Spartan language, and so should your naming be." I
58 * like these abbreviations, so we define them here. Note that u64 is always
59 * unsigned long long, which works on all Linux systems: this means that we can
60 * use %llu in printf for any u64.
61 */
62typedef unsigned long long u64;
63typedef uint32_t u32;
64typedef uint16_t u16;
65typedef uint8_t u8;
66/*:*/
67
68#define BRIDGE_PFX "bridge:"
69#ifndef SIOCBRADDIF
70#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
71#endif
72/* We can have up to 256 pages for devices. */
73#define DEVICE_PAGES 256
74/* This will occupy 3 pages: it must be a power of 2. */
75#define VIRTQUEUE_NUM 256
76
77/*L:120
78 * verbose is both a global flag and a macro. The C preprocessor allows
79 * this, and although I wouldn't recommend it, it works quite nicely here.
80 */
81static bool verbose;
82#define verbose(args...) \
83 do { if (verbose) printf(args); } while(0)
84/*:*/
85
86/* The pointer to the start of guest memory. */
87static void *guest_base;
88/* The maximum guest physical address allowed, and maximum possible. */
89static unsigned long guest_limit, guest_max;
90/* The /dev/lguest file descriptor. */
91static int lguest_fd;
92
93/* a per-cpu variable indicating whose vcpu is currently running */
94static unsigned int __thread cpu_id;
95
96/* This is our list of devices. */
97struct device_list {
98 /* Counter to assign interrupt numbers. */
99 unsigned int next_irq;
100
101 /* Counter to print out convenient device numbers. */
102 unsigned int device_num;
103
104 /* The descriptor page for the devices. */
105 u8 *descpage;
106
107 /* A single linked list of devices. */
108 struct device *dev;
109 /* And a pointer to the last device for easy append. */
110 struct device *lastdev;
111};
112
113/* The list of Guest devices, based on command line arguments. */
114static struct device_list devices;
115
/* One Guest device: an entry in the device_list chain. */
struct device {
	/* Next device in the list. */
	struct device *next;

	/* This device's descriptor, as mapped into the Guest. */
	struct lguest_device_desc *desc;

	/*
	 * Our own copies of the sizes: the desc values can't be trusted once
	 * the Guest has booted.
	 */
	unsigned int feature_len;
	unsigned int num_vq;

	/* Device name, printed by --verbose. */
	const char *name;

	/* List of virtqueues attached to this device. */
	struct virtqueue *vq;

	/* Is the device operational? */
	bool running;

	/* Opaque per-device-type data. */
	void *priv;
};
140
141/* The virtqueue structure describes a queue attached to a device. */
142struct virtqueue {
143 struct virtqueue *next;
144
145 /* Which device owns me. */
146 struct device *dev;
147
148 /* The configuration for this queue. */
149 struct lguest_vqconfig config;
150
151 /* The actual ring of buffers. */
152 struct vring vring;
153
154 /* Last available index we saw. */
155 u16 last_avail_idx;
156
157 /* How many are used since we sent last irq? */
158 unsigned int pending_used;
159
160 /* Eventfd where Guest notifications arrive. */
161 int eventfd;
162
163 /* Function for the thread which is servicing this virtqueue. */
164 void (*service)(struct virtqueue *vq);
165 pid_t thread;
166};
167
168/* Remember the arguments to the program so we can "reboot" */
169static char **main_args;
170
171/* The original tty settings to restore on exit. */
172static struct termios orig_term;
173
/*
 * We have to be careful with barriers: our devices are all run in separate
 * threads and so we need to make sure that changes visible to the Guest happen
 * in precise order.
 *
 * wmb() only has to stop the compiler reordering stores: x86 does not reorder
 * one store past another.  mb() must also order an earlier store against a
 * later load (see wait_for_vq_desc()), which x86 CPUs *can* reorder, so an
 * empty asm is not enough there: __sync_synchronize() emits a real full
 * memory barrier.
 */
#define wmb() __asm__ __volatile__("" : : : "memory")
#define mb() __sync_synchronize()
181
182/* Wrapper for the last available index. Makes it easier to change. */
183#define lg_last_avail(vq) ((vq)->last_avail_idx)
184
185/*
186 * The virtio configuration space is defined to be little-endian. x86 is
187 * little-endian too, but it's nice to be explicit so we have these helpers.
188 */
189#define cpu_to_le16(v16) (v16)
190#define cpu_to_le32(v32) (v32)
191#define cpu_to_le64(v64) (v64)
192#define le16_to_cpu(v16) (v16)
193#define le32_to_cpu(v32) (v32)
194#define le64_to_cpu(v64) (v64)
195
/* Returns true when every one of the num_iov elements has zero length. */
static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
{
	const struct iovec *cur = iov;
	const struct iovec *end = iov + num_iov;

	while (cur != end) {
		if (cur->iov_len != 0)
			return false;
		cur++;
	}
	return true;
}
206
/*
 * Strip len bytes off the front of the iovec, advancing each element's base
 * and shrinking its length.  The bytes are copied to dest unless it is NULL.
 * Exits with an error if the iovec holds fewer than len bytes.
 */
static void iov_consume(struct iovec iov[], unsigned num_iov,
			void *dest, unsigned len)
{
	struct iovec *seg;

	for (seg = iov; seg < iov + num_iov; seg++) {
		/* Take what is still wanted, capped by what this element has. */
		unsigned int take = len;

		if (seg->iov_len < take)
			take = seg->iov_len;

		if (dest) {
			memcpy(dest, seg->iov_base, take);
			dest += take;
		}
		seg->iov_base += take;
		seg->iov_len -= take;
		len -= take;
	}

	if (len)
		errx(1, "iovec too short!");
}
228
229/* The device virtqueue descriptors are followed by feature bitmasks. */
230static u8 *get_feature_bits(struct device *dev)
231{
232 return (u8 *)(dev->desc + 1)
233 + dev->num_vq * sizeof(struct lguest_vqconfig);
234}
235
236/*L:100
237 * The Launcher code itself takes us out into userspace, that scary place where
238 * pointers run wild and free! Unfortunately, like most userspace programs,
239 * it's quite boring (which is why everyone likes to hack on the kernel!).
240 * Perhaps if you make up an Lguest Drinking Game at this point, it will get
241 * you through this section. Or, maybe not.
242 *
243 * The Launcher sets up a big chunk of memory to be the Guest's "physical"
244 * memory and stores it in "guest_base". In other words, Guest physical ==
245 * Launcher virtual with an offset.
246 *
247 * This can be tough to get your head around, but usually it just means that we
248 * use these trivial conversion functions when the Guest gives us its
249 * "physical" addresses:
250 */
251static void *from_guest_phys(unsigned long addr)
252{
253 return guest_base + addr;
254}
255
256static unsigned long to_guest_phys(const void *addr)
257{
258 return (addr - guest_base);
259}
260
261/*L:130
262 * Loading the Kernel.
263 *
264 * We start with couple of simple helper routines. open_or_die() avoids
265 * error-checking code cluttering the callers:
266 */
static int open_or_die(const char *name, int flags)
{
	int fd = open(name, flags);

	if (fd >= 0)
		return fd;

	/* err() reports the failure and exits with status 1. */
	err(1, "Failed to open %s", name);
}
274
275/* map_zeroed_pages() takes a number of pages. */
276static void *map_zeroed_pages(unsigned int num)
277{
278 int fd = open_or_die("/dev/zero", O_RDONLY);
279 void *addr;
280
281 /*
282 * We use a private mapping (ie. if we write to the page, it will be
283 * copied). We allocate an extra two pages PROT_NONE to act as guard
284 * pages against read/write attempts that exceed allocated space.
285 */
286 addr = mmap(NULL, getpagesize() * (num+2),
287 PROT_NONE, MAP_PRIVATE, fd, 0);
288
289 if (addr == MAP_FAILED)
290 err(1, "Mmapping %u pages of /dev/zero", num);
291
292 if (mprotect(addr + getpagesize(), getpagesize() * num,
293 PROT_READ|PROT_WRITE) == -1)
294 err(1, "mprotect rw %u pages failed", num);
295
296 /*
297 * One neat mmap feature is that you can close the fd, and it
298 * stays mapped.
299 */
300 close(fd);
301
302 /* Return address after PROT_NONE page */
303 return addr + getpagesize();
304}
305
306/* Get some more pages for a device. */
307static void *get_pages(unsigned int num)
308{
309 void *addr = from_guest_phys(guest_limit);
310
311 guest_limit += num * getpagesize();
312 if (guest_limit > guest_max)
313 errx(1, "Not enough memory for devices");
314 return addr;
315}
316
/*
 * Place len bytes of the file fd, starting at offset, at address addr.  Used
 * to load the kernel or initrd.  mmap is tried first; if that fails (Plan 9's
 * kernel file isn't nicely aligned on page boundaries) the bytes are read in
 * instead.  Exits if the read comes up short.
 */
static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
{
	void *mapped;
	ssize_t got;

	/*
	 * Map writable even where a segment is marked read-only: the kernel
	 * patches its own instructions.  With MAP_PRIVATE a page is only
	 * copied once it is written, so untouched memory can be shared
	 * between Guests.
	 */
	mapped = mmap(addr, len, PROT_READ|PROT_WRITE,
		      MAP_FIXED|MAP_PRIVATE, fd, offset);
	if (mapped == MAP_FAILED) {
		/* Fallback: pread is a seek and a read in a single call. */
		got = pread(fd, addr, len, offset);
		if (got != len)
			err(1, "Reading offset %lu len %lu gave %zi",
			    offset, len, got);
	}
}
344
/*
 * This routine takes an open vmlinux image, which is in ELF, and maps it into
 * the Guest memory.  ELF = Executable and Linkable Format, which is the format
 * used by all modern binaries on Linux including the kernel.
 *
 * The ELF headers give *two* addresses: a physical address, and a virtual
 * address.  We use the physical address; the Guest will map itself to the
 * virtual address.
 *
 * elf_fd is the open image and ehdr its ELF header, already read in by the
 * caller.  We return the starting address (the ELF entry point), and exit
 * with a message on a malformed header or a failed seek/read/allocation.
 */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
{
	Elf32_Phdr *phdr;
	size_t phdr_size;
	unsigned int i;

	/*
	 * Sanity checks on the main ELF header: an x86 executable with a
	 * reasonable number of correctly-sized program headers.  This has to
	 * happen before e_phnum is used to size anything.
	 */
	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	/*
	 * An ELF executable contains an ELF header and a number of "program"
	 * headers which indicate which parts ("segments") of the program to
	 * load where.  e_phnum is validated by now, so this is at most 64k.
	 */
	phdr_size = ehdr->e_phnum * sizeof(*phdr);
	phdr = malloc(phdr_size);
	if (!phdr)
		err(1, "Allocating %zu bytes for program headers", phdr_size);

	/* We read in all the program headers at once: */
	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, phdr_size) != (ssize_t)phdr_size)
		err(1, "Reading program headers");

	/*
	 * Try all the headers: there are usually only three.  A read-only one,
	 * a read-write one, and a "note" section which we don't load.
	 */
	for (i = 0; i < ehdr->e_phnum; i++) {
		/* If this isn't a loadable segment, we ignore it */
		if (phdr[i].p_type != PT_LOAD)
			continue;

		verbose("Section %i: size %i addr %p\n",
			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);

		/*
		 * We map this section of the file at its physical address.
		 * NOTE(review): p_paddr/p_filesz are not checked against the
		 * size of Guest memory here - confirm the caller's mapping
		 * makes that safe.
		 */
		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
		       phdr[i].p_offset, phdr[i].p_filesz);
	}

	free(phdr);

	/* The entry point is given in the ELF header. */
	return ehdr->e_entry;
}
403
404/*L:150
405 * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed
406 * to jump into it and it will unpack itself. We used to have to perform some
407 * hairy magic because the unpacking code scared me.
408 *
409 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
410 * a small patch to jump over the tricky bits in the Guest, so now we just read
411 * the funky header so we know where in the file to load, and away we go!
412 */
413static unsigned long load_bzimage(int fd)
414{
415 struct boot_params boot;
416 int r;
417 /* Modern bzImages get loaded at 1M. */
418 void *p = from_guest_phys(0x100000);
419
420 /*
421 * Go back to the start of the file and read the header. It should be
422 * a Linux boot header (see Documentation/x86/boot.txt)
423 */
424 lseek(fd, 0, SEEK_SET);
425 read(fd, &boot, sizeof(boot));
426
427 /* Inside the setup_hdr, we expect the magic "HdrS" */
428 if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
429 errx(1, "This doesn't look like a bzImage to me");
430
431 /* Skip over the extra sectors of the header. */
432 lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
433
434 /* Now read everything into memory. in nice big chunks. */
435 while ((r = read(fd, p, 65536)) > 0)
436 p += r;
437
438 /* Finally, code32_start tells us where to enter the kernel. */
439 return boot.hdr.code32_start;
440}
441
442/*L:140
443 * Loading the kernel is easy when it's a "vmlinux", but most kernels
444 * come wrapped up in the self-decompressing "bzImage" format. With a little
445 * work, we can load those, too.
446 */
static unsigned long load_kernel(int fd)
{
	Elf32_Ehdr hdr;
	bool is_elf;

	/* Pull in enough of the file to cover an ELF header. */
	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		err(1, "Reading kernel");

	/* ELF files announce themselves with the magic "\177ELF". */
	is_elf = (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0);

	/* Anything which isn't ELF is assumed to be a bzImage. */
	if (!is_elf)
		return load_bzimage(fd);

	return map_elf(fd, &hdr);
}
462
463/*
464 * This is a trivial little helper to align pages. Andi Kleen hated it because
465 * it calls getpagesize() twice: "it's dumb code."
466 *
467 * Kernel guys get really het up about optimization, even when it's not
468 * necessary. I leave this code as a reaction against that.
469 */
static inline unsigned long page_align(unsigned long addr)
{
	/* Step up to (almost) the next page... */
	unsigned long bumped = addr + getpagesize()-1;

	/* ...then drop the bits below the page boundary. */
	return bumped & ~(getpagesize()-1);
}
475
476/*L:180
477 * An "initial ram disk" is a disk image loaded into memory along with the
478 * kernel which the kernel can use to boot from without needing any drivers.
479 * Most distributions now use this as standard: the initrd contains the code to
480 * load the appropriate driver modules for the current machine.
481 *
482 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
483 * kernels. He sent me this (and tells me when I break it).
484 */
static unsigned long load_initrd(const char *name, unsigned long mem)
{
	struct stat info;
	unsigned long aligned_len;
	int initrd_fd = open_or_die(name, O_RDONLY);

	/* The file size comes from fstat(). */
	if (fstat(initrd_fd, &info) < 0)
		err(1, "fstat() on initrd '%s'", name);

	/*
	 * The initrd goes at the top of memory.  mmap wants a page-aligned
	 * address, so the size is rounded up to place it.
	 */
	aligned_len = page_align(info.st_size);
	map_at(initrd_fd, from_guest_phys(mem - aligned_len), 0, info.st_size);

	/* The descriptor is no longer needed once the file is in place. */
	close(initrd_fd);
	verbose("mapped initrd %s size=%lu @ %p\n",
		name, aligned_len, (void*)mem-aligned_len);

	/* The caller gets the (rounded-up) initrd size. */
	return aligned_len;
}
512/*:*/
513
/*
 * Join the NULL-terminated args array into dst, one space between adjacent
 * arguments.  dst always ends up NUL-terminated, even when args is empty.
 * The caller must supply a dst large enough for the result.
 */
static void concat(char *dst, char *args[])
{
	char *out = dst;
	char **arg;

	for (arg = args; *arg; arg++) {
		/* Every argument but the first is preceded by a space. */
		if (arg != args)
			*out++ = ' ';
		strcpy(out, *arg);
		out += strlen(*arg);
	}
	/* Covers the no-arguments case, where nothing was copied. */
	*out = '\0';
}
533
534/*L:185
535 * This is where we actually tell the kernel to initialize the Guest. We
536 * saw the arguments it expects when we looked at initialize() in lguest_user.c:
537 * the base of Guest "physical" memory, the top physical page to allow and the
538 * entry point for the Guest.
539 */
540static void tell_kernel(unsigned long start)
541{
542 unsigned long args[] = { LHREQ_INITIALIZE,
543 (unsigned long)guest_base,
544 guest_limit / getpagesize(), start };
545 verbose("Guest: %p - %p (%#lx)\n",
546 guest_base, guest_base + guest_limit, guest_limit);
547 lguest_fd = open_or_die("/dev/lguest", O_RDWR);
548 if (write(lguest_fd, args, sizeof(args)) < 0)
549 err(1, "Writing to /dev/lguest");
550}
551/*:*/
552
553/*L:200
554 * Device Handling.
555 *
556 * When the Guest gives us a buffer, it sends an array of addresses and sizes.
557 * We need to make sure it's not trying to reach into the Launcher itself, so
558 * we have a convenient routine which checks it and exits with an error message
559 * if something funny is going on:
560 */
561static void *_check_pointer(unsigned long addr, unsigned int size,
562 unsigned int line)
563{
564 /*
565 * Check if the requested address and size exceeds the allocated memory,
566 * or addr + size wraps around.
567 */
568 if ((addr + size) > guest_limit || (addr + size) < addr)
569 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
570 /*
571 * We return a pointer for the caller's convenience, now we know it's
572 * safe to use.
573 */
574 return from_guest_phys(addr);
575}
576/* A macro which transparently hands the line number to the real function. */
577#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
578
579/*
580 * Each buffer in the virtqueues is actually a chain of descriptors. This
581 * function returns the next descriptor in the chain, or vq->vring.num if we're
582 * at the end.
583 */
584static unsigned next_desc(struct vring_desc *desc,
585 unsigned int i, unsigned int max)
586{
587 unsigned int next;
588
589 /* If this descriptor says it doesn't chain, we're done. */
590 if (!(desc[i].flags & VRING_DESC_F_NEXT))
591 return max;
592
593 /* Check they're not leading us off end of descriptors. */
594 next = desc[i].next;
595 /* Make sure compiler knows to grab that: we don't want it changing! */
596 wmb();
597
598 if (next >= max)
599 errx(1, "Desc next is %u", next);
600
601 return next;
602}
603
604/*
605 * This actually sends the interrupt for this virtqueue, if we've used a
606 * buffer.
607 */
608static void trigger_irq(struct virtqueue *vq)
609{
610 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
611
612 /* Don't inform them if nothing used. */
613 if (!vq->pending_used)
614 return;
615 vq->pending_used = 0;
616
617 /* If they don't want an interrupt, don't send one... */
618 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
619 return;
620 }
621
622 /* Send the Guest an interrupt tell them we used something up. */
623 if (write(lguest_fd, buf, sizeof(buf)) != 0)
624 err(1, "Triggering irq %i", vq->config.irq);
625}
626
627/*
628 * This looks in the virtqueue for the first available buffer, and converts
629 * it to an iovec for convenient access. Since descriptors consist of some
630 * number of output then some number of input descriptors, it's actually two
631 * iovecs, but we pack them into one and note how many of each there were.
632 *
633 * This function waits if necessary, and returns the descriptor number found.
634 */
635static unsigned wait_for_vq_desc(struct virtqueue *vq,
636 struct iovec iov[],
637 unsigned int *out_num, unsigned int *in_num)
638{
639 unsigned int i, head, max;
640 struct vring_desc *desc;
641 u16 last_avail = lg_last_avail(vq);
642
643 /* There's nothing available? */
644 while (last_avail == vq->vring.avail->idx) {
645 u64 event;
646
647 /*
648 * Since we're about to sleep, now is a good time to tell the
649 * Guest about what we've used up to now.
650 */
651 trigger_irq(vq);
652
653 /* OK, now we need to know about added descriptors. */
654 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
655
656 /*
657 * They could have slipped one in as we were doing that: make
658 * sure it's written, then check again.
659 */
660 mb();
661 if (last_avail != vq->vring.avail->idx) {
662 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
663 break;
664 }
665
666 /* Nothing new? Wait for eventfd to tell us they refilled. */
667 if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
668 errx(1, "Event read failed?");
669
670 /* We don't need to be notified again. */
671 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
672 }
673
674 /* Check it isn't doing very strange things with descriptor numbers. */
675 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
676 errx(1, "Guest moved used index from %u to %u",
677 last_avail, vq->vring.avail->idx);
678
679 /*
680 * Grab the next descriptor number they're advertising, and increment
681 * the index we've seen.
682 */
683 head = vq->vring.avail->ring[last_avail % vq->vring.num];
684 lg_last_avail(vq)++;
685
686 /* If their number is silly, that's a fatal mistake. */
687 if (head >= vq->vring.num)
688 errx(1, "Guest says index %u is available", head);
689
690 /* When we start there are none of either input nor output. */
691 *out_num = *in_num = 0;
692
693 max = vq->vring.num;
694 desc = vq->vring.desc;
695 i = head;
696
697 /*
698 * If this is an indirect entry, then this buffer contains a descriptor
699 * table which we handle as if it's any normal descriptor chain.
700 */
701 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
702 if (desc[i].len % sizeof(struct vring_desc))
703 errx(1, "Invalid size for indirect buffer table");
704
705 max = desc[i].len / sizeof(struct vring_desc);
706 desc = check_pointer(desc[i].addr, desc[i].len);
707 i = 0;
708 }
709
710 do {
711 /* Grab the first descriptor, and check it's OK. */
712 iov[*out_num + *in_num].iov_len = desc[i].len;
713 iov[*out_num + *in_num].iov_base
714 = check_pointer(desc[i].addr, desc[i].len);
715 /* If this is an input descriptor, increment that count. */
716 if (desc[i].flags & VRING_DESC_F_WRITE)
717 (*in_num)++;
718 else {
719 /*
720 * If it's an output descriptor, they're all supposed
721 * to come before any input descriptors.
722 */
723 if (*in_num)
724 errx(1, "Descriptor has out after in");
725 (*out_num)++;
726 }
727
728 /* If we've got too many, that implies a descriptor loop. */
729 if (*out_num + *in_num > max)
730 errx(1, "Looped descriptor");
731 } while ((i = next_desc(desc, i, max)) != max);
732
733 return head;
734}
735
736/*
737 * After we've used one of their buffers, we tell the Guest about it. Sometime
738 * later we'll want to send them an interrupt using trigger_irq(); note that
739 * wait_for_vq_desc() does that for us if it has to wait.
740 */
741static void add_used(struct virtqueue *vq, unsigned int head, int len)
742{
743 struct vring_used_elem *used;
744
745 /*
746 * The virtqueue contains a ring of used buffers. Get a pointer to the
747 * next entry in that used ring.
748 */
749 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
750 used->id = head;
751 used->len = len;
752 /* Make sure buffer is written before we update index. */
753 wmb();
754 vq->vring.used->idx++;
755 vq->pending_used++;
756}
757
/* Both steps in one go: mark the buffer used, then interrupt the Guest. */
static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
{
	add_used(vq, head, len);

	/* trigger_irq() sees the pending buffer which add_used() just counted. */
	trigger_irq(vq);
}
764
765/*
766 * The Console
767 *
768 * We associate some data with the console for our exit hack.
769 */
struct console_abort {
	/* Number of consecutive ^C keypresses seen so far. */
	int count;
	/* Time at which the first of them arrived. */
	struct timeval start;
};
776
777/* This is the routine which handles console input (ie. stdin). */
778static void console_input(struct virtqueue *vq)
779{
780 int len;
781 unsigned int head, in_num, out_num;
782 struct console_abort *abort = vq->dev->priv;
783 struct iovec iov[vq->vring.num];
784
785 /* Make sure there's a descriptor available. */
786 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
787 if (out_num)
788 errx(1, "Output buffers in console in queue?");
789
790 /* Read into it. This is where we usually wait. */
791 len = readv(STDIN_FILENO, iov, in_num);
792 if (len <= 0) {
793 /* Ran out of input? */
794 warnx("Failed to get console input, ignoring console.");
795 /*
796 * For simplicity, dying threads kill the whole Launcher. So
797 * just nap here.
798 */
799 for (;;)
800 pause();
801 }
802
803 /* Tell the Guest we used a buffer. */
804 add_used_and_trigger(vq, head, len);
805
806 /*
807 * Three ^C within one second? Exit.
808 *
809 * This is such a hack, but works surprisingly well. Each ^C has to
810 * be in a buffer by itself, so they can't be too fast. But we check
811 * that we get three within about a second, so they can't be too
812 * slow.
813 */
814 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
815 abort->count = 0;
816 return;
817 }
818
819 abort->count++;
820 if (abort->count == 1)
821 gettimeofday(&abort->start, NULL);
822 else if (abort->count == 3) {
823 struct timeval now;
824 gettimeofday(&now, NULL);
825 /* Kill all Launcher processes with SIGINT, like normal ^C */
826 if (now.tv_sec <= abort->start.tv_sec+1)
827 kill(0, SIGINT);
828 abort->count = 0;
829 }
830}
831
832/* This is the routine which handles console output (ie. stdout). */
833static void console_output(struct virtqueue *vq)
834{
835 unsigned int head, out, in;
836 struct iovec iov[vq->vring.num];
837
838 /* We usually wait in here, for the Guest to give us something. */
839 head = wait_for_vq_desc(vq, iov, &out, &in);
840 if (in)
841 errx(1, "Input buffers in console output queue?");
842
843 /* writev can return a partial write, so we loop here. */
844 while (!iov_empty(iov, out)) {
845 int len = writev(STDOUT_FILENO, iov, out);
846 if (len <= 0) {
847 warn("Write to stdout gave %i (%d)", len, errno);
848 break;
849 }
850 iov_consume(iov, out, NULL, len);
851 }
852
853 /*
854 * We're finished with that buffer: if we're going to sleep,
855 * wait_for_vq_desc() will prod the Guest with an interrupt.
856 */
857 add_used(vq, head, 0);
858}
859
860/*
861 * The Network
862 *
863 * Handling output for network is also simple: we get all the output buffers
864 * and write them to /dev/net/tun.
865 */
struct net_info {
	/* File descriptor of the tun device used by net_input/net_output. */
	int tunfd;
};
869
870static void net_output(struct virtqueue *vq)
871{
872 struct net_info *net_info = vq->dev->priv;
873 unsigned int head, out, in;
874 struct iovec iov[vq->vring.num];
875
876 /* We usually wait in here for the Guest to give us a packet. */
877 head = wait_for_vq_desc(vq, iov, &out, &in);
878 if (in)
879 errx(1, "Input buffers in net output queue?");
880 /*
881 * Send the whole thing through to /dev/net/tun. It expects the exact
882 * same format: what a coincidence!
883 */
884 if (writev(net_info->tunfd, iov, out) < 0)
885 warnx("Write to tun failed (%d)?", errno);
886
887 /*
888 * Done with that one; wait_for_vq_desc() will send the interrupt if
889 * all packets are processed.
890 */
891 add_used(vq, head, 0);
892}
893
/*
 * Handling network input is a bit trickier, because I've tried to optimize it.
 *
 * First we have a helper routine which tells us if reading from this file
 * descriptor (ie. the /dev/net/tun device) will block:
 */
static bool will_block(int fd)
{
	struct timeval no_wait = { 0, 0 };
	fd_set readable;
	int ready;

	FD_ZERO(&readable);
	FD_SET(fd, &readable);

	/* A zero timeout makes select() a poll: exactly one fd can be ready. */
	ready = select(fd+1, &readable, NULL, NULL, &no_wait);
	return ready != 1;
}
908
909/*
910 * This handles packets coming in from the tun device to our Guest. Like all
911 * service routines, it gets called again as soon as it returns, so you don't
912 * see a while(1) loop here.
913 */
914static void net_input(struct virtqueue *vq)
915{
916 int len;
917 unsigned int head, out, in;
918 struct iovec iov[vq->vring.num];
919 struct net_info *net_info = vq->dev->priv;
920
921 /*
922 * Get a descriptor to write an incoming packet into. This will also
923 * send an interrupt if they're out of descriptors.
924 */
925 head = wait_for_vq_desc(vq, iov, &out, &in);
926 if (out)
927 errx(1, "Output buffers in net input queue?");
928
929 /*
930 * If it looks like we'll block reading from the tun device, send them
931 * an interrupt.
932 */
933 if (vq->pending_used && will_block(net_info->tunfd))
934 trigger_irq(vq);
935
936 /*
937 * Read in the packet. This is where we normally wait (when there's no
938 * incoming network traffic).
939 */
940 len = readv(net_info->tunfd, iov, in);
941 if (len <= 0)
942 warn("Failed to read from tun (%d).", errno);
943
944 /*
945 * Mark that packet buffer as used, but don't interrupt here. We want
946 * to wait until we've done as much work as we can.
947 */
948 add_used(vq, head, len);
949}
950/*:*/
951
952/* This is the helper to create threads: run the service routine in a loop. */
953static int do_thread(void *_vq)
954{
955 struct virtqueue *vq = _vq;
956
957 for (;;)
958 vq->service(vq);
959 return 0;
960}
961
/*
 * SIGCHLD handler: when a child dies, we kill our entire process group with
 * SIGTERM.  This also has the side effect that the shell restores the console
 * for us!
 */
static void kill_launcher(int signal)
{
	/* The signal number is irrelevant; pid 0 means "our process group". */
	(void)signal;
	kill(0, SIGTERM);
}
970
971static void reset_device(struct device *dev)
972{
973 struct virtqueue *vq;
974
975 verbose("Resetting device %s\n", dev->name);
976
977 /* Clear any features they've acked. */
978 memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);
979
980 /* We're going to be explicitly killing threads, so ignore them. */
981 signal(SIGCHLD, SIG_IGN);
982
983 /* Zero out the virtqueues, get rid of their threads */
984 for (vq = dev->vq; vq; vq = vq->next) {
985 if (vq->thread != (pid_t)-1) {
986 kill(vq->thread, SIGTERM);
987 waitpid(vq->thread, NULL, 0);
988 vq->thread = (pid_t)-1;
989 }
990 memset(vq->vring.desc, 0,
991 vring_size(vq->config.num, LGUEST_VRING_ALIGN));
992 lg_last_avail(vq) = 0;
993 }
994 dev->running = false;
995
996 /* Now we care if threads die. */
997 signal(SIGCHLD, (void *)kill_launcher);
998}
999
1000/*L:216
1001 * This actually creates the thread which services the virtqueue for a device.
1002 */
1003static void create_thread(struct virtqueue *vq)
1004{
1005 /*
1006 * Create stack for thread. Since the stack grows upwards, we point
1007 * the stack pointer to the end of this region.
1008 */
1009 char *stack = malloc(32768);
1010 unsigned long args[] = { LHREQ_EVENTFD,
1011 vq->config.pfn*getpagesize(), 0 };
1012
1013 /* Create a zero-initialized eventfd. */
1014 vq->eventfd = eventfd(0, 0);
1015 if (vq->eventfd < 0)
1016 err(1, "Creating eventfd");
1017 args[2] = vq->eventfd;
1018
1019 /*
1020 * Attach an eventfd to this virtqueue: it will go off when the Guest
1021 * does an LHCALL_NOTIFY for this vq.
1022 */
1023 if (write(lguest_fd, &args, sizeof(args)) != 0)
1024 err(1, "Attaching eventfd");
1025
1026 /*
1027 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
1028 * we get a signal if it dies.
1029 */
1030 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
1031 if (vq->thread == (pid_t)-1)
1032 err(1, "Creating clone");
1033
1034 /* We close our local copy now the child has it. */
1035 close(vq->eventfd);
1036}
1037
1038static void start_device(struct device *dev)
1039{
1040 unsigned int i;
1041 struct virtqueue *vq;
1042
1043 verbose("Device %s OK: offered", dev->name);
1044 for (i = 0; i < dev->feature_len; i++)
1045 verbose(" %02x", get_feature_bits(dev)[i]);
1046 verbose(", accepted");
1047 for (i = 0; i < dev->feature_len; i++)
1048 verbose(" %02x", get_feature_bits(dev)
1049 [dev->feature_len+i]);
1050
1051 for (vq = dev->vq; vq; vq = vq->next) {
1052 if (vq->service)
1053 create_thread(vq);
1054 }
1055 dev->running = true;
1056}
1057
1058static void cleanup_devices(void)
1059{
1060 struct device *dev;
1061
1062 for (dev = devices.dev; dev; dev = dev->next)
1063 reset_device(dev);
1064
1065 /* If we saved off the original terminal settings, restore them now. */
1066 if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
1067 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
1068}
1069
1070/* When the Guest tells us they updated the status field, we handle it. */
1071static void update_device_status(struct device *dev)
1072{
1073 /* A zero status is a reset, otherwise it's a set of flags. */
1074 if (dev->desc->status == 0)
1075 reset_device(dev);
1076 else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
1077 warnx("Device %s configuration FAILED", dev->name);
1078 if (dev->running)
1079 reset_device(dev);
1080 } else {
1081 if (dev->running)
1082 err(1, "Device %s features finalized twice", dev->name);
1083 start_device(dev);
1084 }
1085}
1086
1087/*L:215
1088 * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In
1089 * particular, it's used to notify us of device status changes during boot.
1090 */
1091static void handle_output(unsigned long addr)
1092{
1093 struct device *i;
1094
1095 /* Check each device. */
1096 for (i = devices.dev; i; i = i->next) {
1097 struct virtqueue *vq;
1098
1099 /*
1100 * Notifications to device descriptors mean they updated the
1101 * device status.
1102 */
1103 if (from_guest_phys(addr) == i->desc) {
1104 update_device_status(i);
1105 return;
1106 }
1107
1108 /* Devices should not be used before features are finalized. */
1109 for (vq = i->vq; vq; vq = vq->next) {
1110 if (addr != vq->config.pfn*getpagesize())
1111 continue;
1112 errx(1, "Notification on %s before setup!", i->name);
1113 }
1114 }
1115
1116 /*
1117 * Early console write is done using notify on a nul-terminated string
1118 * in Guest memory. It's also great for hacking debugging messages
1119 * into a Guest.
1120 */
1121 if (addr >= guest_limit)
1122 errx(1, "Bad NOTIFY %#lx", addr);
1123
1124 write(STDOUT_FILENO, from_guest_phys(addr),
1125 strnlen(from_guest_phys(addr), guest_limit - addr));
1126}
1127
1128/*L:190
1129 * Device Setup
1130 *
1131 * All devices need a descriptor so the Guest knows it exists, and a "struct
1132 * device" so the Launcher can keep track of it. We have common helper
1133 * routines to allocate and manage them.
1134 */
1135
1136/*
1137 * The layout of the device page is a "struct lguest_device_desc" followed by a
1138 * number of virtqueue descriptors, then two sets of feature bits, then an
1139 * array of configuration bytes. This routine returns the configuration
1140 * pointer.
1141 */
1142static u8 *device_config(const struct device *dev)
1143{
1144 return (void *)(dev->desc + 1)
1145 + dev->num_vq * sizeof(struct lguest_vqconfig)
1146 + dev->feature_len * 2;
1147}
1148
1149/*
1150 * This routine allocates a new "struct lguest_device_desc" from descriptor
1151 * table page just above the Guest's normal memory. It returns a pointer to
1152 * that descriptor.
1153 */
1154static struct lguest_device_desc *new_dev_desc(u16 type)
1155{
1156 struct lguest_device_desc d = { .type = type };
1157 void *p;
1158
1159 /* Figure out where the next device config is, based on the last one. */
1160 if (devices.lastdev)
1161 p = device_config(devices.lastdev)
1162 + devices.lastdev->desc->config_len;
1163 else
1164 p = devices.descpage;
1165
1166 /* We only have one page for all the descriptors. */
1167 if (p + sizeof(d) > (void *)devices.descpage + getpagesize())
1168 errx(1, "Too many devices");
1169
1170 /* p might not be aligned, so we memcpy in. */
1171 return memcpy(p, &d, sizeof(d));
1172}
1173
1174/*
1175 * Each device descriptor is followed by the description of its virtqueues. We
1176 * specify how many descriptors the virtqueue is to have.
1177 */
1178static void add_virtqueue(struct device *dev, unsigned int num_descs,
1179 void (*service)(struct virtqueue *))
1180{
1181 unsigned int pages;
1182 struct virtqueue **i, *vq = malloc(sizeof(*vq));
1183 void *p;
1184
1185 /* First we need some memory for this virtqueue. */
1186 pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1)
1187 / getpagesize();
1188 p = get_pages(pages);
1189
1190 /* Initialize the virtqueue */
1191 vq->next = NULL;
1192 vq->last_avail_idx = 0;
1193 vq->dev = dev;
1194
1195 /*
1196 * This is the routine the service thread will run, and its Process ID
1197 * once it's running.
1198 */
1199 vq->service = service;
1200 vq->thread = (pid_t)-1;
1201
1202 /* Initialize the configuration. */
1203 vq->config.num = num_descs;
1204 vq->config.irq = devices.next_irq++;
1205 vq->config.pfn = to_guest_phys(p) / getpagesize();
1206
1207 /* Initialize the vring. */
1208 vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN);
1209
1210 /*
1211 * Append virtqueue to this device's descriptor. We use
1212 * device_config() to get the end of the device's current virtqueues;
1213 * we check that we haven't added any config or feature information
1214 * yet, otherwise we'd be overwriting them.
1215 */
1216 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
1217 memcpy(device_config(dev), &vq->config, sizeof(vq->config));
1218 dev->num_vq++;
1219 dev->desc->num_vq++;
1220
1221 verbose("Virtqueue page %#lx\n", to_guest_phys(p));
1222
1223 /*
1224 * Add to tail of list, so dev->vq is first vq, dev->vq->next is
1225 * second.
1226 */
1227 for (i = &dev->vq; *i; i = &(*i)->next);
1228 *i = vq;
1229}
1230
1231/*
1232 * The first half of the feature bitmask is for us to advertise features. The
1233 * second half is for the Guest to accept features.
1234 */
1235static void add_feature(struct device *dev, unsigned bit)
1236{
1237 u8 *features = get_feature_bits(dev);
1238
1239 /* We can't extend the feature bits once we've added config bytes */
1240 if (dev->desc->feature_len <= bit / CHAR_BIT) {
1241 assert(dev->desc->config_len == 0);
1242 dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1;
1243 }
1244
1245 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
1246}
1247
1248/*
1249 * This routine sets the configuration fields for an existing device's
1250 * descriptor. It only works for the last device, but that's OK because that's
1251 * how we use it.
1252 */
1253static void set_config(struct device *dev, unsigned len, const void *conf)
1254{
1255 /* Check we haven't overflowed our single page. */
1256 if (device_config(dev) + len > devices.descpage + getpagesize())
1257 errx(1, "Too many devices");
1258
1259 /* Copy in the config information, and store the length. */
1260 memcpy(device_config(dev), conf, len);
1261 dev->desc->config_len = len;
1262
1263 /* Size must fit in config_len field (8 bits)! */
1264 assert(dev->desc->config_len == len);
1265}
1266
1267/*
1268 * This routine does all the creation and setup of a new device, including
1269 * calling new_dev_desc() to allocate the descriptor and device memory. We
1270 * don't actually start the service threads until later.
1271 *
1272 * See what I mean about userspace being boring?
1273 */
1274static struct device *new_device(const char *name, u16 type)
1275{
1276 struct device *dev = malloc(sizeof(*dev));
1277
1278 /* Now we populate the fields one at a time. */
1279 dev->desc = new_dev_desc(type);
1280 dev->name = name;
1281 dev->vq = NULL;
1282 dev->feature_len = 0;
1283 dev->num_vq = 0;
1284 dev->running = false;
1285 dev->next = NULL;
1286
1287 /*
1288 * Append to device list. Prepending to a single-linked list is
1289 * easier, but the user expects the devices to be arranged on the bus
1290 * in command-line order. The first network device on the command line
1291 * is eth0, the first block device /dev/vda, etc.
1292 */
1293 if (devices.lastdev)
1294 devices.lastdev->next = dev;
1295 else
1296 devices.dev = dev;
1297 devices.lastdev = dev;
1298
1299 return dev;
1300}
1301
1302/*
1303 * Our first setup routine is the console. It's a fairly simple device, but
1304 * UNIX tty handling makes it uglier than it could be.
1305 */
1306static void setup_console(void)
1307{
1308 struct device *dev;
1309
1310 /* If we can save the initial standard input settings... */
1311 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
1312 struct termios term = orig_term;
1313 /*
1314 * Then we turn off echo, line buffering and ^C etc: We want a
1315 * raw input stream to the Guest.
1316 */
1317 term.c_lflag &= ~(ISIG|ICANON|ECHO);
1318 tcsetattr(STDIN_FILENO, TCSANOW, &term);
1319 }
1320
1321 dev = new_device("console", VIRTIO_ID_CONSOLE);
1322
1323 /* We store the console state in dev->priv, and initialize it. */
1324 dev->priv = malloc(sizeof(struct console_abort));
1325 ((struct console_abort *)dev->priv)->count = 0;
1326
1327 /*
1328 * The console needs two virtqueues: the input then the output. When
1329 * they put something the input queue, we make sure we're listening to
1330 * stdin. When they put something in the output queue, we write it to
1331 * stdout.
1332 */
1333 add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
1334 add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
1335
1336 verbose("device %u: console\n", ++devices.device_num);
1337}
1338/*:*/
1339
1340/*M:010
1341 * Inter-guest networking is an interesting area. Simplest is to have a
1342 * --sharenet=<name> option which opens or creates a named pipe. This can be
1343 * used to send packets to another guest in a 1:1 manner.
1344 *
1345 * More sophisticated is to use one of the tools developed for projects like UML
1346 * to do networking.
1347 *
1348 * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
1349 * completely generic ("here's my vring, attach to your vring") and would work
1350 * for any traffic. Of course, namespace and permissions issues need to be
1351 * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide
1352 * multiple inter-guest channels behind one interface, although it would
1353 * require some manner of hotplugging new virtio channels.
1354 *
1355 * Finally, we could use a virtio network switch in the kernel, ie. vhost.
1356:*/
1357
1358static u32 str2ip(const char *ipaddr)
1359{
1360 unsigned int b[4];
1361
1362 if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
1363 errx(1, "Failed to parse IP address '%s'", ipaddr);
1364 return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
1365}
1366
/*
 * Parse a colon-separated MAC address ("xx:xx:xx:xx:xx:xx", hex) into the six
 * bytes of mac[].  Exits if fewer than six fields can be read.
 */
static void str2mac(const char *macaddr, unsigned char mac[6])
{
	unsigned int octet[6];
	int parsed, k;

	parsed = sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
			octet, octet + 1, octet + 2,
			octet + 3, octet + 4, octet + 5);
	if (parsed != 6)
		errx(1, "Failed to parse mac address '%s'", macaddr);

	/* Each field was limited to two hex digits, so it fits in a byte. */
	for (k = 0; k < 6; k++)
		mac[k] = octet[k];
}
1380
1381/*
1382 * This code is "adapted" from libbridge: it attaches the Host end of the
1383 * network device to the bridge device specified by the command line.
1384 *
1385 * This is yet another James Morris contribution (I'm an IP-level guy, so I
1386 * dislike bridging), and I just try not to break it.
1387 */
1388static void add_to_bridge(int fd, const char *if_name, const char *br_name)
1389{
1390 int ifidx;
1391 struct ifreq ifr;
1392
1393 if (!*br_name)
1394 errx(1, "must specify bridge name");
1395
1396 ifidx = if_nametoindex(if_name);
1397 if (!ifidx)
1398 errx(1, "interface %s does not exist!", if_name);
1399
1400 strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
1401 ifr.ifr_name[IFNAMSIZ-1] = '\0';
1402 ifr.ifr_ifindex = ifidx;
1403 if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
1404 err(1, "can't add %s to bridge %s", if_name, br_name);
1405}
1406
1407/*
1408 * This sets up the Host end of the network device with an IP address, brings
1409 * it up so packets will flow, the copies the MAC address into the hwaddr
1410 * pointer.
1411 */
1412static void configure_device(int fd, const char *tapif, u32 ipaddr)
1413{
1414 struct ifreq ifr;
1415 struct sockaddr_in sin;
1416
1417 memset(&ifr, 0, sizeof(ifr));
1418 strcpy(ifr.ifr_name, tapif);
1419
1420 /* Don't read these incantations. Just cut & paste them like I did! */
1421 sin.sin_family = AF_INET;
1422 sin.sin_addr.s_addr = htonl(ipaddr);
1423 memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
1424 if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
1425 err(1, "Setting %s interface address", tapif);
1426 ifr.ifr_flags = IFF_UP;
1427 if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
1428 err(1, "Bringing interface %s up", tapif);
1429}
1430
1431static int get_tun_device(char tapif[IFNAMSIZ])
1432{
1433 struct ifreq ifr;
1434 int netfd;
1435
1436 /* Start with this zeroed. Messy but sure. */
1437 memset(&ifr, 0, sizeof(ifr));
1438
1439 /*
1440 * We open the /dev/net/tun device and tell it we want a tap device. A
1441 * tap device is like a tun device, only somehow different. To tell
1442 * the truth, I completely blundered my way through this code, but it
1443 * works now!
1444 */
1445 netfd = open_or_die("/dev/net/tun", O_RDWR);
1446 ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
1447 strcpy(ifr.ifr_name, "tap%d");
1448 if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
1449 err(1, "configuring /dev/net/tun");
1450
1451 if (ioctl(netfd, TUNSETOFFLOAD,
1452 TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
1453 err(1, "Could not set features for tun device");
1454
1455 /*
1456 * We don't need checksums calculated for packets coming in this
1457 * device: trust us!
1458 */
1459 ioctl(netfd, TUNSETNOCSUM, 1);
1460
1461 memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
1462 return netfd;
1463}
1464
1465/*L:195
1466 * Our network is a Host<->Guest network. This can either use bridging or
1467 * routing, but the principle is the same: it uses the "tun" device to inject
1468 * packets into the Host as if they came in from a normal network card. We
1469 * just shunt packets between the Guest and the tun device.
1470 */
1471static void setup_tun_net(char *arg)
1472{
1473 struct device *dev;
1474 struct net_info *net_info = malloc(sizeof(*net_info));
1475 int ipfd;
1476 u32 ip = INADDR_ANY;
1477 bool bridging = false;
1478 char tapif[IFNAMSIZ], *p;
1479 struct virtio_net_config conf;
1480
1481 net_info->tunfd = get_tun_device(tapif);
1482
1483 /* First we create a new network device. */
1484 dev = new_device("net", VIRTIO_ID_NET);
1485 dev->priv = net_info;
1486
1487 /* Network devices need a recv and a send queue, just like console. */
1488 add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
1489 add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
1490
1491 /*
1492 * We need a socket to perform the magic network ioctls to bring up the
1493 * tap interface, connect to the bridge etc. Any socket will do!
1494 */
1495 ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
1496 if (ipfd < 0)
1497 err(1, "opening IP socket");
1498
1499 /* If the command line was --tunnet=bridge:<name> do bridging. */
1500 if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
1501 arg += strlen(BRIDGE_PFX);
1502 bridging = true;
1503 }
1504
1505 /* A mac address may follow the bridge name or IP address */
1506 p = strchr(arg, ':');
1507 if (p) {
1508 str2mac(p+1, conf.mac);
1509 add_feature(dev, VIRTIO_NET_F_MAC);
1510 *p = '\0';
1511 }
1512
1513 /* arg is now either an IP address or a bridge name */
1514 if (bridging)
1515 add_to_bridge(ipfd, tapif, arg);
1516 else
1517 ip = str2ip(arg);
1518
1519 /* Set up the tun device. */
1520 configure_device(ipfd, tapif, ip);
1521
1522 /* Expect Guest to handle everything except UFO */
1523 add_feature(dev, VIRTIO_NET_F_CSUM);
1524 add_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
1525 add_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
1526 add_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
1527 add_feature(dev, VIRTIO_NET_F_GUEST_ECN);
1528 add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
1529 add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
1530 add_feature(dev, VIRTIO_NET_F_HOST_ECN);
1531 /* We handle indirect ring entries */
1532 add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
1533 set_config(dev, sizeof(conf), &conf);
1534
1535 /* We don't need the socket any more; setup is done. */
1536 close(ipfd);
1537
1538 devices.device_num++;
1539
1540 if (bridging)
1541 verbose("device %u: tun %s attached to bridge: %s\n",
1542 devices.device_num, tapif, arg);
1543 else
1544 verbose("device %u: tun %s: %s\n",
1545 devices.device_num, tapif, arg);
1546}
1547/*:*/
1548
1549/* This hangs off device->priv. */
1550struct vblk_info {
1551 /* The size of the file. */
1552 off64_t len;
1553
1554 /* The file descriptor for the file. */
1555 int fd;
1556
1557};
1558
1559/*L:210
1560 * The Disk
1561 *
1562 * The disk only has one virtqueue, so it only has one thread. It is really
1563 * simple: the Guest asks for a block number and we read or write that position
1564 * in the file.
1565 *
1566 * Before we serviced each virtqueue in a separate thread, that was unacceptably
1567 * slow: the Guest waits until the read is finished before running anything
1568 * else, even if it could have been doing useful work.
1569 *
1570 * We could have used async I/O, except it's reputed to suck so hard that
1571 * characters actually go missing from your code when you try to use it.
1572 */
1573static void blk_request(struct virtqueue *vq)
1574{
1575 struct vblk_info *vblk = vq->dev->priv;
1576 unsigned int head, out_num, in_num, wlen;
1577 int ret, i;
1578 u8 *in;
1579 struct virtio_blk_outhdr out;
1580 struct iovec iov[vq->vring.num];
1581 off64_t off;
1582
1583 /*
1584 * Get the next request, where we normally wait. It triggers the
1585 * interrupt to acknowledge previously serviced requests (if any).
1586 */
1587 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1588
1589 /* Copy the output header from the front of the iov (adjusts iov) */
1590 iov_consume(iov, out_num, &out, sizeof(out));
1591
1592 /* Find and trim end of iov input array, for our status byte. */
1593 in = NULL;
1594 for (i = out_num + in_num - 1; i >= out_num; i--) {
1595 if (iov[i].iov_len > 0) {
1596 in = iov[i].iov_base + iov[i].iov_len - 1;
1597 iov[i].iov_len--;
1598 break;
1599 }
1600 }
1601 if (!in)
1602 errx(1, "Bad virtblk cmd with no room for status");
1603
1604 /*
1605 * For historical reasons, block operations are expressed in 512 byte
1606 * "sectors".
1607 */
1608 off = out.sector * 512;
1609
1610 /*
1611 * In general the virtio block driver is allowed to try SCSI commands.
1612 * It'd be nice if we supported eject, for example, but we don't.
1613 */
1614 if (out.type & VIRTIO_BLK_T_SCSI_CMD) {
1615 fprintf(stderr, "Scsi commands unsupported\n");
1616 *in = VIRTIO_BLK_S_UNSUPP;
1617 wlen = sizeof(*in);
1618 } else if (out.type & VIRTIO_BLK_T_OUT) {
1619 /*
1620 * Write
1621 *
1622 * Move to the right location in the block file. This can fail
1623 * if they try to write past end.
1624 */
1625 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1626 err(1, "Bad seek to sector %llu", out.sector);
1627
1628 ret = writev(vblk->fd, iov, out_num);
1629 verbose("WRITE to sector %llu: %i\n", out.sector, ret);
1630
1631 /*
1632 * Grr... Now we know how long the descriptor they sent was, we
1633 * make sure they didn't try to write over the end of the block
1634 * file (possibly extending it).
1635 */
1636 if (ret > 0 && off + ret > vblk->len) {
1637 /* Trim it back to the correct length */
1638 ftruncate64(vblk->fd, vblk->len);
1639 /* Die, bad Guest, die. */
1640 errx(1, "Write past end %llu+%u", off, ret);
1641 }
1642
1643 wlen = sizeof(*in);
1644 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
1645 } else if (out.type & VIRTIO_BLK_T_FLUSH) {
1646 /* Flush */
1647 ret = fdatasync(vblk->fd);
1648 verbose("FLUSH fdatasync: %i\n", ret);
1649 wlen = sizeof(*in);
1650 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
1651 } else {
1652 /*
1653 * Read
1654 *
1655 * Move to the right location in the block file. This can fail
1656 * if they try to read past end.
1657 */
1658 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1659 err(1, "Bad seek to sector %llu", out.sector);
1660
1661 ret = readv(vblk->fd, iov + out_num, in_num);
1662 if (ret >= 0) {
1663 wlen = sizeof(*in) + ret;
1664 *in = VIRTIO_BLK_S_OK;
1665 } else {
1666 wlen = sizeof(*in);
1667 *in = VIRTIO_BLK_S_IOERR;
1668 }
1669 }
1670
1671 /* Finished that request. */
1672 add_used(vq, head, wlen);
1673}
1674
1675/*L:198 This actually sets up a virtual block device. */
1676static void setup_block_file(const char *filename)
1677{
1678 struct device *dev;
1679 struct vblk_info *vblk;
1680 struct virtio_blk_config conf;
1681
1682 /* Creat the device. */
1683 dev = new_device("block", VIRTIO_ID_BLOCK);
1684
1685 /* The device has one virtqueue, where the Guest places requests. */
1686 add_virtqueue(dev, VIRTQUEUE_NUM, blk_request);
1687
1688 /* Allocate the room for our own bookkeeping */
1689 vblk = dev->priv = malloc(sizeof(*vblk));
1690
1691 /* First we open the file and store the length. */
1692 vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
1693 vblk->len = lseek64(vblk->fd, 0, SEEK_END);
1694
1695 /* We support FLUSH. */
1696 add_feature(dev, VIRTIO_BLK_F_FLUSH);
1697
1698 /* Tell Guest how many sectors this device has. */
1699 conf.capacity = cpu_to_le64(vblk->len / 512);
1700
1701 /*
1702 * Tell Guest not to put in too many descriptors at once: two are used
1703 * for the in and out elements.
1704 */
1705 add_feature(dev, VIRTIO_BLK_F_SEG_MAX);
1706 conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
1707
1708 /* Don't try to put whole struct: we have 8 bit limit. */
1709 set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf);
1710
1711 verbose("device %u: virtblock %llu sectors\n",
1712 ++devices.device_num, le64_to_cpu(conf.capacity));
1713}
1714
/*L:211
 * Our random number generator device reads from /dev/random into the Guest's
 * input buffers.  The usual case is that the Guest doesn't want random numbers
 * and so has no buffers although /dev/random is still readable, whereas
 * console is the reverse.
 *
 * The same logic applies, however.
 */
struct rng_info {
	/* File descriptor the random bytes are read from. */
	int rfd;
};
1726
1727static void rng_input(struct virtqueue *vq)
1728{
1729 int len;
1730 unsigned int head, in_num, out_num, totlen = 0;
1731 struct rng_info *rng_info = vq->dev->priv;
1732 struct iovec iov[vq->vring.num];
1733
1734 /* First we need a buffer from the Guests's virtqueue. */
1735 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1736 if (out_num)
1737 errx(1, "Output buffers in rng?");
1738
1739 /*
1740 * Just like the console write, we loop to cover the whole iovec.
1741 * In this case, short reads actually happen quite a bit.
1742 */
1743 while (!iov_empty(iov, in_num)) {
1744 len = readv(rng_info->rfd, iov, in_num);
1745 if (len <= 0)
1746 err(1, "Read from /dev/random gave %i", len);
1747 iov_consume(iov, in_num, NULL, len);
1748 totlen += len;
1749 }
1750
1751 /* Tell the Guest about the new input. */
1752 add_used(vq, head, totlen);
1753}
1754
1755/*L:199
1756 * This creates a "hardware" random number device for the Guest.
1757 */
1758static void setup_rng(void)
1759{
1760 struct device *dev;
1761 struct rng_info *rng_info = malloc(sizeof(*rng_info));
1762
1763 /* Our device's privat info simply contains the /dev/random fd. */
1764 rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
1765
1766 /* Create the new device. */
1767 dev = new_device("rng", VIRTIO_ID_RNG);
1768 dev->priv = rng_info;
1769
1770 /* The device has one virtqueue, where the Guest places inbufs. */
1771 add_virtqueue(dev, VIRTQUEUE_NUM, rng_input);
1772
1773 verbose("device %u: rng\n", devices.device_num++);
1774}
1775/* That's the end of device setup. */
1776
1777/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
1778static void __attribute__((noreturn)) restart_guest(void)
1779{
1780 unsigned int i;
1781
1782 /*
1783 * Since we don't track all open fds, we simply close everything beyond
1784 * stderr.
1785 */
1786 for (i = 3; i < FD_SETSIZE; i++)
1787 close(i);
1788
1789 /* Reset all the devices (kills all threads). */
1790 cleanup_devices();
1791
1792 execv(main_args[0], main_args);
1793 err(1, "Could not exec %s", main_args[0]);
1794}
1795
1796/*L:220
1797 * Finally we reach the core of the Launcher which runs the Guest, serves
1798 * its input and output, and finally, lays it to rest.
1799 */
1800static void __attribute__((noreturn)) run_guest(void)
1801{
1802 for (;;) {
1803 unsigned long notify_addr;
1804 int readval;
1805
1806 /* We read from the /dev/lguest device to run the Guest. */
1807 readval = pread(lguest_fd, &notify_addr,
1808 sizeof(notify_addr), cpu_id);
1809
1810 /* One unsigned long means the Guest did HCALL_NOTIFY */
1811 if (readval == sizeof(notify_addr)) {
1812 verbose("Notify on address %#lx\n", notify_addr);
1813 handle_output(notify_addr);
1814 /* ENOENT means the Guest died. Reading tells us why. */
1815 } else if (errno == ENOENT) {
1816 char reason[1024] = { 0 };
1817 pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
1818 errx(1, "%s", reason);
1819 /* ERESTART means that we need to reboot the guest */
1820 } else if (errno == ERESTART) {
1821 restart_guest();
1822 /* Anything else means a bug or incompatible change. */
1823 } else
1824 err(1, "Running guest failed");
1825 }
1826}
1827/*L:240
1828 * This is the end of the Launcher. The good news: we are over halfway
1829 * through! The bad news: the most fiendish part of the code still lies ahead
1830 * of us.
1831 *
1832 * Are you ready? Take a deep breath and join me in the core of the Host, in
1833 * "make Host".
1834:*/
1835
/*
 * Long options understood by the Launcher.  Each maps onto the single
 * character that main()'s switch statement handles; the all-zero entry ends
 * the table.
 */
static struct option opts[] = {
	{ .name = "verbose",  .has_arg = no_argument,       .flag = NULL, .val = 'v' },
	{ .name = "tunnet",   .has_arg = required_argument, .flag = NULL, .val = 't' },
	{ .name = "block",    .has_arg = required_argument, .flag = NULL, .val = 'b' },
	{ .name = "rng",      .has_arg = no_argument,       .flag = NULL, .val = 'r' },
	{ .name = "initrd",   .has_arg = required_argument, .flag = NULL, .val = 'i' },
	{ .name = "username", .has_arg = required_argument, .flag = NULL, .val = 'u' },
	{ .name = "chroot",   .has_arg = required_argument, .flag = NULL, .val = 'c' },
	{ .name = NULL },
};
/*
 * Print the command-line synopsis and exit with status 1.  Every long option
 * in opts[] is listed here.
 */
static void usage(void)
{
	errx(1, "Usage: lguest [--verbose] "
	     "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
	     "|--block=<filename>|--initrd=<filename>|--rng\n"
	     "|--username=<username>|--chroot=<directory>]...\n"
	     "<mem-in-mb> vmlinux [args...]");
}
1853
1854/*L:105 The main routine is where the real work begins: */
1855int main(int argc, char *argv[])
1856{
1857 /* Memory, code startpoint and size of the (optional) initrd. */
1858 unsigned long mem = 0, start, initrd_size = 0;
1859 /* Two temporaries. */
1860 int i, c;
1861 /* The boot information for the Guest. */
1862 struct boot_params *boot;
1863 /* If they specify an initrd file to load. */
1864 const char *initrd_name = NULL;
1865
1866 /* Password structure for initgroups/setres[gu]id */
1867 struct passwd *user_details = NULL;
1868
1869 /* Directory to chroot to */
1870 char *chroot_path = NULL;
1871
1872 /* Save the args: we "reboot" by execing ourselves again. */
1873 main_args = argv;
1874
1875 /*
1876 * First we initialize the device list. We keep a pointer to the last
1877 * device, and the next interrupt number to use for devices (1:
1878 * remember that 0 is used by the timer).
1879 */
1880 devices.lastdev = NULL;
1881 devices.next_irq = 1;
1882
1883 /* We're CPU 0. In fact, that's the only CPU possible right now. */
1884 cpu_id = 0;
1885
1886 /*
1887 * We need to know how much memory so we can set up the device
1888 * descriptor and memory pages for the devices as we parse the command
1889 * line. So we quickly look through the arguments to find the amount
1890 * of memory now.
1891 */
1892 for (i = 1; i < argc; i++) {
1893 if (argv[i][0] != '-') {
1894 mem = atoi(argv[i]) * 1024 * 1024;
1895 /*
1896 * We start by mapping anonymous pages over all of
1897 * guest-physical memory range. This fills it with 0,
1898 * and ensures that the Guest won't be killed when it
1899 * tries to access it.
1900 */
1901 guest_base = map_zeroed_pages(mem / getpagesize()
1902 + DEVICE_PAGES);
1903 guest_limit = mem;
1904 guest_max = mem + DEVICE_PAGES*getpagesize();
1905 devices.descpage = get_pages(1);
1906 break;
1907 }
1908 }
1909
1910 /* The options are fairly straight-forward */
1911 while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
1912 switch (c) {
1913 case 'v':
1914 verbose = true;
1915 break;
1916 case 't':
1917 setup_tun_net(optarg);
1918 break;
1919 case 'b':
1920 setup_block_file(optarg);
1921 break;
1922 case 'r':
1923 setup_rng();
1924 break;
1925 case 'i':
1926 initrd_name = optarg;
1927 break;
1928 case 'u':
1929 user_details = getpwnam(optarg);
1930 if (!user_details)
1931 err(1, "getpwnam failed, incorrect username?");
1932 break;
1933 case 'c':
1934 chroot_path = optarg;
1935 break;
1936 default:
1937 warnx("Unknown argument %s", argv[optind]);
1938 usage();
1939 }
1940 }
1941 /*
1942 * After the other arguments we expect memory and kernel image name,
1943 * followed by command line arguments for the kernel.
1944 */
1945 if (optind + 2 > argc)
1946 usage();
1947
1948 verbose("Guest base is at %p\n", guest_base);
1949
1950 /* We always have a console device */
1951 setup_console();
1952
1953 /* Now we load the kernel */
1954 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
1955
1956 /* Boot information is stashed at physical address 0 */
1957 boot = from_guest_phys(0);
1958
1959 /* Map the initrd image if requested (at top of physical memory) */
1960 if (initrd_name) {
1961 initrd_size = load_initrd(initrd_name, mem);
1962 /*
1963 * These are the location in the Linux boot header where the
1964 * start and size of the initrd are expected to be found.
1965 */
1966 boot->hdr.ramdisk_image = mem - initrd_size;
1967 boot->hdr.ramdisk_size = initrd_size;
1968 /* The bootloader type 0xFF means "unknown"; that's OK. */
1969 boot->hdr.type_of_loader = 0xFF;
1970 }
1971
1972 /*
1973 * The Linux boot header contains an "E820" memory map: ours is a
1974 * simple, single region.
1975 */
1976 boot->e820_entries = 1;
1977 boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
1978 /*
1979 * The boot header contains a command line pointer: we put the command
1980 * line after the boot header.
1981 */
1982 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
1983 /* We use a simple helper to copy the arguments separated by spaces. */
1984 concat((char *)(boot + 1), argv+optind+2);
1985
1986 /* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */
1987 boot->hdr.kernel_alignment = 0x1000000;
1988
1989 /* Boot protocol version: 2.07 supports the fields for lguest. */
1990 boot->hdr.version = 0x207;
1991
1992 /* The hardware_subarch value of "1" tells the Guest it's an lguest. */
1993 boot->hdr.hardware_subarch = 1;
1994
1995 /* Tell the entry path not to try to reload segment registers. */
1996 boot->hdr.loadflags |= KEEP_SEGMENTS;
1997
1998 /* We tell the kernel to initialize the Guest. */
1999 tell_kernel(start);
2000
2001 /* Ensure that we terminate if a device-servicing child dies. */
2002 signal(SIGCHLD, kill_launcher);
2003
2004 /* If we exit via err(), this kills all the threads, restores tty. */
2005 atexit(cleanup_devices);
2006
2007 /* If requested, chroot to a directory */
2008 if (chroot_path) {
2009 if (chroot(chroot_path) != 0)
2010 err(1, "chroot(\"%s\") failed", chroot_path);
2011
2012 if (chdir("/") != 0)
2013 err(1, "chdir(\"/\") failed");
2014
2015 verbose("chroot done\n");
2016 }
2017
2018 /* If requested, drop privileges */
2019 if (user_details) {
2020 uid_t u;
2021 gid_t g;
2022
2023 u = user_details->pw_uid;
2024 g = user_details->pw_gid;
2025
2026 if (initgroups(user_details->pw_name, g) != 0)
2027 err(1, "initgroups failed");
2028
2029 if (setresgid(g, g, g) != 0)
2030 err(1, "setresgid failed");
2031
2032 if (setresuid(u, u, u) != 0)
2033 err(1, "setresuid failed");
2034
2035 verbose("Dropping privileges completed\n");
2036 }
2037
2038 /* Finally, run the Guest. This doesn't return. */
2039 run_guest();
2040}
2041/*:*/
2042
2043/*M:999
2044 * Mastery is done: you now know everything I do.
2045 *
2046 * But surely you have seen code, features and bugs in your wanderings which
2047 * you now yearn to attack? That is the real game, and I look forward to you
2048 * patching and forking lguest into the Your-Name-Here-visor.
2049 *
2050 * Farewell, and good coding!
2051 * Rusty Russell.
2052 */
diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt
deleted file mode 100644
index bff0c554485..00000000000
--- a/tools/lguest/lguest.txt
+++ /dev/null
@@ -1,129 +0,0 @@
1 __
2 (___()'`; Rusty's Remarkably Unreliable Guide to Lguest
3 /, /` - or, A Young Coder's Illustrated Hypervisor
4 \\"--\\ http://lguest.ozlabs.org
5
6Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel,
7for Linux developers and users to experiment with virtualization with the
8minimum of complexity. Nonetheless, it should have sufficient features to
9make it useful for specific tasks, and, of course, you are encouraged to fork
10and enhance it (see drivers/lguest/README).
11
12Features:
13
14- Kernel module which runs in a normal kernel.
15- Simple I/O model for communication.
16- Simple program to create new guests.
17- Logo contains cute puppies: http://lguest.ozlabs.org
18
19Developer features:
20
21- Fun to hack on.
22- No ABI: being tied to a specific kernel anyway, you can change anything.
23- Many opportunities for improvement or feature implementation.
24
25Running Lguest:
26
27- The easiest way to run lguest is to use the same kernel as guest and host.
28 You can configure them differently, but usually it's easiest not to.
29
30 You will need to configure your kernel with the following options:
31
32 "General setup":
33 "Prompt for development and/or incomplete code/drivers" = Y
34 (CONFIG_EXPERIMENTAL=y)
35
36 "Processor type and features":
37 "Paravirtualized guest support" = Y
38 "Lguest guest support" = Y
39 "High Memory Support" = off/4GB
40 "Alignment value to which kernel should be aligned" = 0x100000
41 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
42 CONFIG_PHYSICAL_ALIGN=0x100000)
43
44 "Device Drivers":
45 "Block devices"
46 "Virtio block driver (EXPERIMENTAL)" = M/Y
47 "Network device support"
48 "Universal TUN/TAP device driver support" = M/Y
49 "Virtio network driver (EXPERIMENTAL)" = M/Y
50 (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m)
51
52 "Virtualization"
53 "Linux hypervisor example code" = M/Y
54 (CONFIG_LGUEST=m)
55
56- A tool called "lguest" is available in this directory: type "make"
57 to build it. If you didn't build your kernel in-tree, use "make
58 O=<builddir>".
59
60- Create or find a root disk image. There are several useful ones
61 around, such as the xm-test tiny root image at
62 http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
63
64 For more serious work, I usually use a distribution ISO image and
65 install it under qemu, then make multiple copies:
66
67 dd if=/dev/zero of=rootfile bs=1M count=2048
68 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
69
70 Make sure that you install a getty on /dev/hvc0 if you want to log in on the
71 console!
72
73- "modprobe lg" if you built it as a module.
74
75- Run an lguest as root:
76
77 tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
78 --block=rootfile root=/dev/vda
79
80 Explanation:
81 64: the amount of memory to use, in MB.
82
83 vmlinux: the kernel image found in the top of your build directory. You
84 can also use a standard bzImage.
85
86 --tunnet=192.168.19.1: configures a "tap" device for networking with this
87 IP address.
88
89 --block=rootfile: a file or block device which becomes /dev/vda
90 inside the guest.
91
92 root=/dev/vda: this (and anything else on the command line) are
93 kernel boot parameters.
94
95- Configuring networking. I usually have the host masquerade, using
96 "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 >
97 /proc/sys/net/ipv4/ip_forward". In this example, I would configure
98 eth0 inside the guest at 192.168.19.2.
99
100 Another method is to bridge the tap device to an external interface
101 using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
102 to obtain an IP address. The bridge needs to be configured first:
103 this option simply adds the tap interface to it.
104
105 A simple example on my system:
106
107 ifconfig eth0 0.0.0.0
108 brctl addbr lg0
109 ifconfig lg0 up
110 brctl addif lg0 eth0
111 dhclient lg0
112
113 Then use --tunnet=bridge:lg0 when launching the guest.
114
115 See:
116
117 http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge
118
119 for general information on how to get bridging to work.
120
121- Random number generation. Using the --rng option will provide a
122 /dev/hwrng in the guest that will read from the host's /dev/random.
123 Use this option in conjunction with rng-tools (see Documentation/hw_random.txt)
124 to provide entropy to the guest kernel's /dev/random.
125
126There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
127
128Good luck!
129Rusty Russell rusty@rustcorp.com.au.