aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/lguest
diff options
context:
space:
mode:
authorRob Landley <rlandley@parallels.com>2011-05-06 12:22:02 -0400
committerRandy Dunlap <randy.dunlap@oracle.com>2011-05-06 12:22:02 -0400
commited16648eb5b86917f0b90bdcdbc857202da72f90 (patch)
treea8198415a6c2f1909f02340b05d36e1d53b82320 /Documentation/lguest
parentbfd412db9e7b0d8f7b9c09d12d07aa2ac785f1d0 (diff)
Move kvm, uml, and lguest subdirectories under a common "virtual" directory, I.E:
cd Documentation mkdir virtual git mv kvm uml lguest virtual Signed-off-by: Rob Landley <rlandley@parallels.com> Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Diffstat (limited to 'Documentation/lguest')
-rw-r--r--Documentation/lguest/.gitignore1
-rw-r--r--Documentation/lguest/Makefile8
-rw-r--r--Documentation/lguest/extract58
-rw-r--r--Documentation/lguest/lguest.c2095
-rw-r--r--Documentation/lguest/lguest.txt128
5 files changed, 0 insertions, 2290 deletions
diff --git a/Documentation/lguest/.gitignore b/Documentation/lguest/.gitignore
deleted file mode 100644
index 115587fd5f65..000000000000
--- a/Documentation/lguest/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
1lguest
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
deleted file mode 100644
index bebac6b4f332..000000000000
--- a/Documentation/lguest/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
# This creates the demonstration utility "lguest" which runs a Linux guest.
# Missing headers?  Add "-I../../include -I../../arch/x86/include"
CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE

# "all" and "clean" are commands, not files: keep make from being fooled by
# a stray file with either name.
.PHONY: all clean

all: lguest

clean:
	rm -f lguest
diff --git a/Documentation/lguest/extract b/Documentation/lguest/extract
deleted file mode 100644
index 7730bb6e4b94..000000000000
--- a/Documentation/lguest/extract
+++ /dev/null
@@ -1,58 +0,0 @@
#! /bin/sh
# Pull the tagged documentation comments ("PREFIX:NNN" ... ":*") out of the
# given source files, one scratch file per number, then print them in glob
# order with a "[ file ]" banner each time the originating file changes.
#
# Usage: extract PREFIX file...

set -e

PREFIX=$1
shift

# Create the scratch directory *before* arming the cleanup trap: if mktemp
# fails, the trap must not "rm -r" whatever TMPDIR held in the environment.
TMPDIR=`mktemp -d`
trap 'rm -r "$TMPDIR"' 0

# fd 3 is the current output sink; /dev/null while outside a tagged comment.
exec 3>/dev/null
for f; do
	while IFS="
" read -r LINE; do
		case "$LINE" in
		    # Tag and ":*" on the same line.
		    *$PREFIX:[0-9]*:\**)
			NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
			if [ -f "$TMPDIR/$NUM" ]; then
				echo "$TMPDIR/$NUM already exists prior to $f"
				exit 1
			fi
			exec 3>>"$TMPDIR/$NUM"
			# Remember which file this number came from.
			echo "$f" | sed 's,\.\./,,g' > "$TMPDIR/.$NUM"
			/bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3
			;;
		    # Tag only: start collecting into this number's file.
		    *$PREFIX:[0-9]*)
			NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
			if [ -f "$TMPDIR/$NUM" ]; then
				echo "$TMPDIR/$NUM already exists prior to $f"
				exit 1
			fi
			exec 3>>"$TMPDIR/$NUM"
			echo "$f" | sed 's,\.\./,,g' > "$TMPDIR/.$NUM"
			/bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3
			;;
		    # ":*" marker: emit the line, then stop collecting.
		    *:\**)
			/bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3
			echo >&3
			exec 3>/dev/null
			;;
		    *)
			/bin/echo "$LINE" >&3
			;;
		esac
	done < "$f"
	echo >&3
	exec 3>/dev/null
done

LASTFILE=""
for f in "$TMPDIR"/*; do
	# Nothing collected: the unmatched glob is left as a literal pattern.
	[ -e "$f" ] || continue
	THISFILE=$(cat "$TMPDIR/.$(basename "$f")")
	if [ "$LASTFILE" != "$THISFILE" ]; then
		LASTFILE=$THISFILE
		echo "[ $LASTFILE ]"
	fi
	cat "$f"
done
58
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
deleted file mode 100644
index d9da7e148538..000000000000
--- a/Documentation/lguest/lguest.c
+++ /dev/null
@@ -1,2095 +0,0 @@
1/*P:100
2 * This is the Launcher code, a simple program which lays out the "physical"
3 * memory for the new Guest by mapping the kernel image and the virtual
4 * devices, then opens /dev/lguest to tell the kernel about the Guest and
5 * control it.
6:*/
7#define _LARGEFILE64_SOURCE
8#define _GNU_SOURCE
9#include <stdio.h>
10#include <string.h>
11#include <unistd.h>
12#include <err.h>
13#include <stdint.h>
14#include <stdlib.h>
15#include <elf.h>
16#include <sys/mman.h>
17#include <sys/param.h>
18#include <sys/types.h>
19#include <sys/stat.h>
20#include <sys/wait.h>
21#include <sys/eventfd.h>
22#include <fcntl.h>
23#include <stdbool.h>
24#include <errno.h>
25#include <ctype.h>
26#include <sys/socket.h>
27#include <sys/ioctl.h>
28#include <sys/time.h>
29#include <time.h>
30#include <netinet/in.h>
31#include <net/if.h>
32#include <linux/sockios.h>
33#include <linux/if_tun.h>
34#include <sys/uio.h>
35#include <termios.h>
36#include <getopt.h>
37#include <assert.h>
38#include <sched.h>
39#include <limits.h>
40#include <stddef.h>
41#include <signal.h>
42#include <pwd.h>
43#include <grp.h>
44
45#include <linux/virtio_config.h>
46#include <linux/virtio_net.h>
47#include <linux/virtio_blk.h>
48#include <linux/virtio_console.h>
49#include <linux/virtio_rng.h>
50#include <linux/virtio_ring.h>
51#include <asm/bootparam.h>
52#include "../../include/linux/lguest_launcher.h"
53/*L:110
54 * We can ignore the 42 include files we need for this program, but I do want
55 * to draw attention to the use of kernel-style types.
56 *
57 * As Linus said, "C is a Spartan language, and so should your naming be." I
58 * like these abbreviations, so we define them here. Note that u64 is always
59 * unsigned long long, which works on all Linux systems: this means that we can
60 * use %llu in printf for any u64.
61 */
62typedef unsigned long long u64;
63typedef uint32_t u32;
64typedef uint16_t u16;
65typedef uint8_t u8;
66/*:*/
67
68#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
69#define BRIDGE_PFX "bridge:"
70#ifndef SIOCBRADDIF
71#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
72#endif
73/* We can have up to 256 pages for devices. */
74#define DEVICE_PAGES 256
75/* This will occupy 3 pages: it must be a power of 2. */
76#define VIRTQUEUE_NUM 256
77
78/*L:120
79 * verbose is both a global flag and a macro. The C preprocessor allows
80 * this, and although I wouldn't recommend it, it works quite nicely here.
81 */
82static bool verbose;
83#define verbose(args...) \
84 do { if (verbose) printf(args); } while(0)
85/*:*/
86
87/* The pointer to the start of guest memory. */
88static void *guest_base;
89/* The maximum guest physical address allowed, and maximum possible. */
90static unsigned long guest_limit, guest_max;
91/* The /dev/lguest file descriptor. */
92static int lguest_fd;
93
94/* a per-cpu variable indicating whose vcpu is currently running */
95static unsigned int __thread cpu_id;
96
97/* This is our list of devices. */
98struct device_list {
99 /* Counter to assign interrupt numbers. */
100 unsigned int next_irq;
101
102 /* Counter to print out convenient device numbers. */
103 unsigned int device_num;
104
105 /* The descriptor page for the devices. */
106 u8 *descpage;
107
108 /* A single linked list of devices. */
109 struct device *dev;
110 /* And a pointer to the last device for easy append. */
111 struct device *lastdev;
112};
113
114/* The list of Guest devices, based on command line arguments. */
115static struct device_list devices;
116
117/* The device structure describes a single device. */
118struct device {
119 /* The linked-list pointer. */
120 struct device *next;
121
122 /* The device's descriptor, as mapped into the Guest. */
123 struct lguest_device_desc *desc;
124
125 /* We can't trust desc values once Guest has booted: we use these. */
126 unsigned int feature_len;
127 unsigned int num_vq;
128
129 /* The name of this device, for --verbose. */
130 const char *name;
131
132 /* Any queues attached to this device */
133 struct virtqueue *vq;
134
135 /* Is it operational */
136 bool running;
137
138 /* Does Guest want an interrupt on empty? */
139 bool irq_on_empty;
140
141 /* Device-specific data. */
142 void *priv;
143};
144
145/* The virtqueue structure describes a queue attached to a device. */
146struct virtqueue {
147 struct virtqueue *next;
148
149 /* Which device owns me. */
150 struct device *dev;
151
152 /* The configuration for this queue. */
153 struct lguest_vqconfig config;
154
155 /* The actual ring of buffers. */
156 struct vring vring;
157
158 /* Last available index we saw. */
159 u16 last_avail_idx;
160
161 /* How many are used since we sent last irq? */
162 unsigned int pending_used;
163
164 /* Eventfd where Guest notifications arrive. */
165 int eventfd;
166
167 /* Function for the thread which is servicing this virtqueue. */
168 void (*service)(struct virtqueue *vq);
169 pid_t thread;
170};
171
172/* Remember the arguments to the program so we can "reboot" */
173static char **main_args;
174
175/* The original tty settings to restore on exit. */
176static struct termios orig_term;
177
178/*
179 * We have to be careful with barriers: our devices are all run in separate
180 * threads and so we need to make sure that changes visible to the Guest happen
181 * in precise order.
182 */
183#define wmb() __asm__ __volatile__("" : : : "memory")
184#define mb() __asm__ __volatile__("" : : : "memory")
185
186/*
187 * Convert an iovec element to the given type.
188 *
189 * This is a fairly ugly trick: we need to know the size of the type and
190 * alignment requirement to check the pointer is kosher. It's also nice to
191 * have the name of the type in case we report failure.
192 *
193 * Typing those three things all the time is cumbersome and error prone, so we
194 * have a macro which sets them all up and passes to the real function.
195 */
196#define convert(iov, type) \
197 ((type *)_convert((iov), sizeof(type), __alignof__(type), #type))
198
/*
 * Backend of the convert() macro: insist that the iovec element is exactly
 * "size" bytes long and that its base is a multiple of "align", then return
 * the base pointer.  Either failure is fatal and names the type in "name".
 */
static void *_convert(struct iovec *iov, size_t size, size_t align,
		      const char *name)
{
	void *base = iov->iov_base;
	unsigned long misalignment;

	/* Length first, so a wrong-sized buffer is reported as such. */
	if (iov->iov_len != size)
		errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);

	misalignment = (unsigned long)base % align;
	if (misalignment != 0)
		errx(1, "Bad alignment %p for %s", base, name);

	return base;
}
208
209/* Wrapper for the last available index. Makes it easier to change. */
210#define lg_last_avail(vq) ((vq)->last_avail_idx)
211
212/*
213 * The virtio configuration space is defined to be little-endian. x86 is
214 * little-endian too, but it's nice to be explicit so we have these helpers.
215 */
216#define cpu_to_le16(v16) (v16)
217#define cpu_to_le32(v32) (v32)
218#define cpu_to_le64(v64) (v64)
219#define le16_to_cpu(v16) (v16)
220#define le32_to_cpu(v32) (v32)
221#define le64_to_cpu(v64) (v64)
222
223/* Is this iovec empty? */
/* True when none of the first num_iov elements has any bytes left. */
static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
{
	const struct iovec *cur = iov;
	const struct iovec *end = iov + num_iov;

	while (cur != end) {
		if (cur->iov_len != 0)
			return false;
		cur++;
	}
	return true;
}
233
234/* Take len bytes from the front of this iovec. */
/*
 * Advance the iovec past its first len bytes: each element in turn gives up
 * as much as it holds until len is used up.  The assert fires if the iovec
 * held fewer than len bytes in total.
 */
static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len)
{
	struct iovec *seg;

	for (seg = iov; seg < iov + num_iov; seg++) {
		unsigned int take = len;

		/* Can't take more than this element has. */
		if (seg->iov_len < take)
			take = seg->iov_len;

		seg->iov_base = (char *)seg->iov_base + take;
		seg->iov_len -= take;
		len -= take;
	}
	assert(len == 0);
}
249
250/* The device virtqueue descriptors are followed by feature bitmasks. */
251static u8 *get_feature_bits(struct device *dev)
252{
253 return (u8 *)(dev->desc + 1)
254 + dev->num_vq * sizeof(struct lguest_vqconfig);
255}
256
257/*L:100
258 * The Launcher code itself takes us out into userspace, that scary place where
259 * pointers run wild and free! Unfortunately, like most userspace programs,
260 * it's quite boring (which is why everyone likes to hack on the kernel!).
261 * Perhaps if you make up an Lguest Drinking Game at this point, it will get
262 * you through this section. Or, maybe not.
263 *
264 * The Launcher sets up a big chunk of memory to be the Guest's "physical"
265 * memory and stores it in "guest_base". In other words, Guest physical ==
266 * Launcher virtual with an offset.
267 *
268 * This can be tough to get your head around, but usually it just means that we
269 * use these trivial conversion functions when the Guest gives us its
270 * "physical" addresses:
271 */
272static void *from_guest_phys(unsigned long addr)
273{
274 return guest_base + addr;
275}
276
277static unsigned long to_guest_phys(const void *addr)
278{
279 return (addr - guest_base);
280}
281
282/*L:130
283 * Loading the Kernel.
284 *
285 * We start with a couple of simple helper routines. open_or_die() avoids
286 * error-checking code cluttering the callers:
287 */
/* open(2) that never returns failure: on error we exit with the reason. */
static int open_or_die(const char *name, int flags)
{
	int fd;

	fd = open(name, flags);
	if (fd >= 0)
		return fd;

	/* err() appends strerror(errno) and does not return. */
	err(1, "Failed to open %s", name);
}
295
296/* map_zeroed_pages() takes a number of pages. */
/*
 * Map num readable/writable pages of /dev/zero, sandwiched between two
 * inaccessible guard pages, and return the address of the first usable page.
 * Any failure is fatal.
 */
static void *map_zeroed_pages(unsigned int num)
{
	int zero_fd = open_or_die("/dev/zero", O_RDONLY);
	char *region;
	char *usable;

	/*
	 * Reserve everything PROT_NONE first (num pages plus one guard page
	 * on each side).  The mapping is private, so written pages are
	 * copied.
	 */
	region = mmap(NULL, getpagesize() * (num+2),
		      PROT_NONE, MAP_PRIVATE, zero_fd, 0);
	if (region == MAP_FAILED)
		err(1, "Mmapping %u pages of /dev/zero", num);

	/* Now open up just the middle: the guard pages stay untouchable. */
	usable = region + getpagesize();
	if (mprotect(usable, getpagesize() * num,
		     PROT_READ|PROT_WRITE) == -1)
		err(1, "mprotect rw %u pages failed", num);

	/* The mapping outlives the file descriptor. */
	close(zero_fd);

	return usable;
}
326
327/* Get some more pages for a device. */
328static void *get_pages(unsigned int num)
329{
330 void *addr = from_guest_phys(guest_limit);
331
332 guest_limit += num * getpagesize();
333 if (guest_limit > guest_max)
334 errx(1, "Not enough memory for devices");
335 return addr;
336}
337
338/*
339 * This routine is used to load the kernel or initrd. It tries mmap, but if
340 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
341 * it falls back to reading the memory in.
342 */
/*
 * Make len bytes of fd, starting at offset, appear at addr.
 *
 * We try a private, fixed mmap() first; if that is refused we fall back to
 * copying the bytes in with pread().  Any failure to get all len bytes is
 * fatal.
 */
static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
{
	unsigned long done = 0;

	/*
	 * We map writable even though some segments are marked read-only.
	 * The kernel really wants to be writable: it patches its own
	 * instructions.
	 *
	 * MAP_PRIVATE means that the page won't be copied until a write is
	 * done to it.  This allows us to share untouched memory between
	 * Guests.
	 */
	if (mmap(addr, len, PROT_READ|PROT_WRITE,
		 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
		return;

	/*
	 * pread does a seek and a read in one shot.  It may legitimately
	 * return fewer bytes than asked for, so keep going until we have the
	 * lot; only a real error or a premature end of file is fatal.
	 */
	while (done < len) {
		ssize_t r = pread(fd, (char *)addr + done, len - done,
				  offset + done);

		/* errno is only meaningful for an actual failure. */
		if (r < 0)
			err(1, "Reading offset %lu len %lu gave %zi",
			    offset, len, r);
		if (r == 0)
			errx(1, "Reading offset %lu len %lu: file ended after %lu bytes",
			     offset, len, done);
		done += r;
	}
}
365
366/*
367 * This routine takes an open vmlinux image, which is in ELF, and maps it into
368 * the Guest memory. ELF = Executable and Linkable Format, which is the format used
369 * by all modern binaries on Linux including the kernel.
370 *
371 * The ELF headers give *two* addresses: a physical address, and a virtual
372 * address. We use the physical address; the Guest will map itself to the
373 * virtual address.
374 *
375 * We return the starting address.
376 */
/*
 * Map every PT_LOAD segment of the ELF image on elf_fd at its physical
 * address inside Guest memory.  ehdr is the already-read ELF header.
 * Returns the entry point from the header; a malformed header or an I/O
 * failure is fatal.
 */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
{
	unsigned int i;

	/*
	 * Sanity checks on the main ELF header: an x86 executable with a
	 * reasonable number of correctly-sized program headers.
	 *
	 * This must happen before e_phnum is used to size anything: it comes
	 * straight from the file.
	 */
	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	/*
	 * An ELF executable contains an ELF header and a number of "program"
	 * headers which indicate which parts ("segments") of the program to
	 * load where.
	 */

	/* Only now is it safe to allocate room for the program headers. */
	Elf32_Phdr phdr[ehdr->e_phnum];

	/* We read in all the program headers at once: */
	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
		err(1, "Reading program headers");

	/*
	 * Try all the headers: there are usually only three.  A read-only one,
	 * a read-write one, and a "note" section which we don't load.
	 */
	for (i = 0; i < ehdr->e_phnum; i++) {
		/* If this isn't a loadable segment, we ignore it */
		if (phdr[i].p_type != PT_LOAD)
			continue;

		verbose("Section %i: size %i addr %p\n",
			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);

		/* We map this section of the file at its physical address. */
		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
		       phdr[i].p_offset, phdr[i].p_filesz);
	}

	/* The entry point is given in the ELF header. */
	return ehdr->e_entry;
}
424
425/*L:150
426 * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed
427 * to jump into it and it will unpack itself. We used to have to perform some
428 * hairy magic because the unpacking code scared me.
429 *
430 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
431 * a small patch to jump over the tricky bits in the Guest, so now we just read
432 * the funky header so we know where in the file to load, and away we go!
433 */
434static unsigned long load_bzimage(int fd)
435{
436 struct boot_params boot;
437 int r;
438 /* Modern bzImages get loaded at 1M. */
439 void *p = from_guest_phys(0x100000);
440
441 /*
442 * Go back to the start of the file and read the header. It should be
443 * a Linux boot header (see Documentation/x86/i386/boot.txt)
444 */
445 lseek(fd, 0, SEEK_SET);
446 read(fd, &boot, sizeof(boot));
447
448 /* Inside the setup_hdr, we expect the magic "HdrS" */
449 if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
450 errx(1, "This doesn't look like a bzImage to me");
451
452 /* Skip over the extra sectors of the header. */
453 lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
454
455 /* Now read everything into memory. in nice big chunks. */
456 while ((r = read(fd, p, 65536)) > 0)
457 p += r;
458
459 /* Finally, code32_start tells us where to enter the kernel. */
460 return boot.hdr.code32_start;
461}
462
463/*L:140
464 * Loading the kernel is easy when it's a "vmlinux", but most kernels
465 * come wrapped up in the self-decompressing "bzImage" format. With a little
466 * work, we can load those, too.
467 */
/*
 * Load the kernel image open on fd and return its entry point.  The first
 * bytes decide the loader: the ELF magic selects map_elf(), anything else
 * is handed to load_bzimage().
 */
static unsigned long load_kernel(int fd)
{
	Elf32_Ehdr hdr;
	bool is_elf;

	/* An ELF header's worth of bytes is enough to tell the two apart. */
	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		err(1, "Reading kernel");

	/* ELF files start with "\177ELF". */
	is_elf = (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0);

	return is_elf ? map_elf(fd, &hdr) : load_bzimage(fd);
}
483
484/*
485 * This is a trivial little helper to align pages. Andi Kleen hated it because
486 * it calls getpagesize() twice: "it's dumb code."
487 *
488 * Kernel guys get really het up about optimization, even when it's not
489 * necessary. I leave this code as a reaction against that.
490 */
/* Round addr up to the next multiple of the page size. */
static inline unsigned long page_align(unsigned long addr)
{
	/* Step past the boundary unless we're already on one... */
	unsigned long bumped = addr + getpagesize() - 1;

	/* ...then drop the offset-within-page bits. */
	return bumped & ~(getpagesize() - 1);
}
496
497/*L:180
498 * An "initial ram disk" is a disk image loaded into memory along with the
499 * kernel which the kernel can use to boot from without needing any drivers.
500 * Most distributions now use this as standard: the initrd contains the code to
501 * load the appropriate driver modules for the current machine.
502 *
503 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
504 * kernels. He sent me this (and tells me when I break it).
505 */
506static unsigned long load_initrd(const char *name, unsigned long mem)
507{
508 int ifd;
509 struct stat st;
510 unsigned long len;
511
512 ifd = open_or_die(name, O_RDONLY);
513 /* fstat() is needed to get the file size. */
514 if (fstat(ifd, &st) < 0)
515 err(1, "fstat() on initrd '%s'", name);
516
517 /*
518 * We map the initrd at the top of memory, but mmap wants it to be
519 * page-aligned, so we round the size up for that.
520 */
521 len = page_align(st.st_size);
522 map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
523 /*
524 * Once a file is mapped, you can close the file descriptor. It's a
525 * little odd, but quite useful.
526 */
527 close(ifd);
528 verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
529
530 /* We return the initrd size. */
531 return len;
532}
533/*:*/
534
535/*
536 * Simple routine to roll all the commandline arguments together with spaces
537 * between them.
538 */
/*
 * Join the NULL-terminated args[] into dst, separated by single spaces.
 * dst always ends up NUL-terminated, even when args[] is empty.  The caller
 * must supply a dst big enough for the result.
 */
static void concat(char *dst, char *args[])
{
	char *out = dst;
	char **arg;

	for (arg = args; *arg; arg++) {
		size_t n = strlen(*arg);

		/* Every argument but the first gets a separator in front. */
		if (arg != args)
			*out++ = ' ';
		memcpy(out, *arg, n);
		out += n;
	}
	*out = '\0';
}
554
555/*L:185
556 * This is where we actually tell the kernel to initialize the Guest. We
557 * saw the arguments it expects when we looked at initialize() in lguest_user.c:
558 * the base of Guest "physical" memory, the top physical page to allow and the
559 * entry point for the Guest.
560 */
561static void tell_kernel(unsigned long start)
562{
563 unsigned long args[] = { LHREQ_INITIALIZE,
564 (unsigned long)guest_base,
565 guest_limit / getpagesize(), start };
566 verbose("Guest: %p - %p (%#lx)\n",
567 guest_base, guest_base + guest_limit, guest_limit);
568 lguest_fd = open_or_die("/dev/lguest", O_RDWR);
569 if (write(lguest_fd, args, sizeof(args)) < 0)
570 err(1, "Writing to /dev/lguest");
571}
572/*:*/
573
574/*L:200
575 * Device Handling.
576 *
577 * When the Guest gives us a buffer, it sends an array of addresses and sizes.
578 * We need to make sure it's not trying to reach into the Launcher itself, so
579 * we have a convenient routine which checks it and exits with an error message
580 * if something funny is going on:
581 */
582static void *_check_pointer(unsigned long addr, unsigned int size,
583 unsigned int line)
584{
585 /*
586 * Check if the requested address and size exceeds the allocated memory,
587 * or addr + size wraps around.
588 */
589 if ((addr + size) > guest_limit || (addr + size) < addr)
590 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
591 /*
592 * We return a pointer for the caller's convenience, now we know it's
593 * safe to use.
594 */
595 return from_guest_phys(addr);
596}
597/* A macro which transparently hands the line number to the real function. */
598#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
599
600/*
601 * Each buffer in the virtqueues is actually a chain of descriptors. This
602 * function returns the next descriptor in the chain, or vq->vring.num if we're
603 * at the end.
604 */
605static unsigned next_desc(struct vring_desc *desc,
606 unsigned int i, unsigned int max)
607{
608 unsigned int next;
609
610 /* If this descriptor says it doesn't chain, we're done. */
611 if (!(desc[i].flags & VRING_DESC_F_NEXT))
612 return max;
613
614 /* Check they're not leading us off end of descriptors. */
615 next = desc[i].next;
616 /* Make sure compiler knows to grab that: we don't want it changing! */
617 wmb();
618
619 if (next >= max)
620 errx(1, "Desc next is %u", next);
621
622 return next;
623}
624
625/*
626 * This actually sends the interrupt for this virtqueue, if we've used a
627 * buffer.
628 */
629static void trigger_irq(struct virtqueue *vq)
630{
631 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
632
633 /* Don't inform them if nothing used. */
634 if (!vq->pending_used)
635 return;
636 vq->pending_used = 0;
637
638 /* If they don't want an interrupt, don't send one... */
639 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
640 /* ... unless they've asked us to force one on empty. */
641 if (!vq->dev->irq_on_empty
642 || lg_last_avail(vq) != vq->vring.avail->idx)
643 return;
644 }
645
646 /* Send the Guest an interrupt tell them we used something up. */
647 if (write(lguest_fd, buf, sizeof(buf)) != 0)
648 err(1, "Triggering irq %i", vq->config.irq);
649}
650
651/*
652 * This looks in the virtqueue for the first available buffer, and converts
653 * it to an iovec for convenient access. Since descriptors consist of some
654 * number of output then some number of input descriptors, it's actually two
655 * iovecs, but we pack them into one and note how many of each there were.
656 *
657 * This function waits if necessary, and returns the descriptor number found.
658 */
659static unsigned wait_for_vq_desc(struct virtqueue *vq,
660 struct iovec iov[],
661 unsigned int *out_num, unsigned int *in_num)
662{
663 unsigned int i, head, max;
664 struct vring_desc *desc;
665 u16 last_avail = lg_last_avail(vq);
666
667 /* There's nothing available? */
668 while (last_avail == vq->vring.avail->idx) {
669 u64 event;
670
671 /*
672 * Since we're about to sleep, now is a good time to tell the
673 * Guest about what we've used up to now.
674 */
675 trigger_irq(vq);
676
677 /* OK, now we need to know about added descriptors. */
678 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
679
680 /*
681 * They could have slipped one in as we were doing that: make
682 * sure it's written, then check again.
683 */
684 mb();
685 if (last_avail != vq->vring.avail->idx) {
686 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
687 break;
688 }
689
690 /* Nothing new? Wait for eventfd to tell us they refilled. */
691 if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
692 errx(1, "Event read failed?");
693
694 /* We don't need to be notified again. */
695 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
696 }
697
698 /* Check it isn't doing very strange things with descriptor numbers. */
699 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
700 errx(1, "Guest moved used index from %u to %u",
701 last_avail, vq->vring.avail->idx);
702
703 /*
704 * Grab the next descriptor number they're advertising, and increment
705 * the index we've seen.
706 */
707 head = vq->vring.avail->ring[last_avail % vq->vring.num];
708 lg_last_avail(vq)++;
709
710 /* If their number is silly, that's a fatal mistake. */
711 if (head >= vq->vring.num)
712 errx(1, "Guest says index %u is available", head);
713
714 /* When we start there are none of either input nor output. */
715 *out_num = *in_num = 0;
716
717 max = vq->vring.num;
718 desc = vq->vring.desc;
719 i = head;
720
721 /*
722 * If this is an indirect entry, then this buffer contains a descriptor
723 * table which we handle as if it's any normal descriptor chain.
724 */
725 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
726 if (desc[i].len % sizeof(struct vring_desc))
727 errx(1, "Invalid size for indirect buffer table");
728
729 max = desc[i].len / sizeof(struct vring_desc);
730 desc = check_pointer(desc[i].addr, desc[i].len);
731 i = 0;
732 }
733
734 do {
735 /* Grab the first descriptor, and check it's OK. */
736 iov[*out_num + *in_num].iov_len = desc[i].len;
737 iov[*out_num + *in_num].iov_base
738 = check_pointer(desc[i].addr, desc[i].len);
739 /* If this is an input descriptor, increment that count. */
740 if (desc[i].flags & VRING_DESC_F_WRITE)
741 (*in_num)++;
742 else {
743 /*
744 * If it's an output descriptor, they're all supposed
745 * to come before any input descriptors.
746 */
747 if (*in_num)
748 errx(1, "Descriptor has out after in");
749 (*out_num)++;
750 }
751
752 /* If we've got too many, that implies a descriptor loop. */
753 if (*out_num + *in_num > max)
754 errx(1, "Looped descriptor");
755 } while ((i = next_desc(desc, i, max)) != max);
756
757 return head;
758}
759
760/*
761 * After we've used one of their buffers, we tell the Guest about it. Sometime
762 * later we'll want to send them an interrupt using trigger_irq(); note that
763 * wait_for_vq_desc() does that for us if it has to wait.
764 */
765static void add_used(struct virtqueue *vq, unsigned int head, int len)
766{
767 struct vring_used_elem *used;
768
769 /*
770 * The virtqueue contains a ring of used buffers. Get a pointer to the
771 * next entry in that used ring.
772 */
773 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
774 used->id = head;
775 used->len = len;
776 /* Make sure buffer is written before we update index. */
777 wmb();
778 vq->vring.used->idx++;
779 vq->pending_used++;
780}
781
782/* And here's the combo meal deal. Supersize me! */
/* Return a buffer with add_used(), then let trigger_irq() decide on the irq. */
static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
{
	/* Order matters: trigger_irq() only acts on a non-zero pending_used. */
	add_used(vq, head, len);

	trigger_irq(vq);
}
788
789/*
790 * The Console
791 *
792 * We associate some data with the console for our exit hack.
793 */
794struct console_abort {
795 /* How many times have they hit ^C? */
796 int count;
797 /* When did they start? */
798 struct timeval start;
799};
800
801/* This is the routine which handles console input (ie. stdin). */
802static void console_input(struct virtqueue *vq)
803{
804 int len;
805 unsigned int head, in_num, out_num;
806 struct console_abort *abort = vq->dev->priv;
807 struct iovec iov[vq->vring.num];
808
809 /* Make sure there's a descriptor available. */
810 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
811 if (out_num)
812 errx(1, "Output buffers in console in queue?");
813
814 /* Read into it. This is where we usually wait. */
815 len = readv(STDIN_FILENO, iov, in_num);
816 if (len <= 0) {
817 /* Ran out of input? */
818 warnx("Failed to get console input, ignoring console.");
819 /*
820 * For simplicity, dying threads kill the whole Launcher. So
821 * just nap here.
822 */
823 for (;;)
824 pause();
825 }
826
827 /* Tell the Guest we used a buffer. */
828 add_used_and_trigger(vq, head, len);
829
830 /*
831 * Three ^C within one second? Exit.
832 *
833 * This is such a hack, but works surprisingly well. Each ^C has to
834 * be in a buffer by itself, so they can't be too fast. But we check
835 * that we get three within about a second, so they can't be too
836 * slow.
837 */
838 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
839 abort->count = 0;
840 return;
841 }
842
843 abort->count++;
844 if (abort->count == 1)
845 gettimeofday(&abort->start, NULL);
846 else if (abort->count == 3) {
847 struct timeval now;
848 gettimeofday(&now, NULL);
849 /* Kill all Launcher processes with SIGINT, like normal ^C */
850 if (now.tv_sec <= abort->start.tv_sec+1)
851 kill(0, SIGINT);
852 abort->count = 0;
853 }
854}
855
856/* This is the routine which handles console output (ie. stdout). */
857static void console_output(struct virtqueue *vq)
858{
859 unsigned int head, out, in;
860 struct iovec iov[vq->vring.num];
861
862 /* We usually wait in here, for the Guest to give us something. */
863 head = wait_for_vq_desc(vq, iov, &out, &in);
864 if (in)
865 errx(1, "Input buffers in console output queue?");
866
867 /* writev can return a partial write, so we loop here. */
868 while (!iov_empty(iov, out)) {
869 int len = writev(STDOUT_FILENO, iov, out);
870 if (len <= 0)
871 err(1, "Write to stdout gave %i", len);
872 iov_consume(iov, out, len);
873 }
874
875 /*
876 * We're finished with that buffer: if we're going to sleep,
877 * wait_for_vq_desc() will prod the Guest with an interrupt.
878 */
879 add_used(vq, head, 0);
880}
881
882/*
883 * The Network
884 *
885 * Handling output for network is also simple: we get all the output buffers
886 * and write them to /dev/net/tun.
887 */
888struct net_info {
889 int tunfd;
890};
891
892static void net_output(struct virtqueue *vq)
893{
894 struct net_info *net_info = vq->dev->priv;
895 unsigned int head, out, in;
896 struct iovec iov[vq->vring.num];
897
898 /* We usually wait in here for the Guest to give us a packet. */
899 head = wait_for_vq_desc(vq, iov, &out, &in);
900 if (in)
901 errx(1, "Input buffers in net output queue?");
902 /*
903 * Send the whole thing through to /dev/net/tun. It expects the exact
904 * same format: what a coincidence!
905 */
906 if (writev(net_info->tunfd, iov, out) < 0)
907 errx(1, "Write to tun failed?");
908
909 /*
910 * Done with that one; wait_for_vq_desc() will send the interrupt if
911 * all packets are processed.
912 */
913 add_used(vq, head, 0);
914}
915
916/*
917 * Handling network input is a bit trickier, because I've tried to optimize it.
918 *
919 * First we have a helper routine which tells us if reading from this file
920 * descriptor (ie. the /dev/net/tun device) will block:
921 */
static bool will_block(int fd)
{
	/* A zero timeout turns select() into a pure poll. */
	struct timeval no_wait = { .tv_sec = 0, .tv_usec = 0 };
	fd_set readable;
	int ready;

	FD_ZERO(&readable);
	FD_SET(fd, &readable);

	/* Anything other than "exactly our fd is readable" counts as blocking. */
	ready = select(fd + 1, &readable, NULL, NULL, &no_wait);
	return ready != 1;
}
930
931/*
932 * This handles packets coming in from the tun device to our Guest. Like all
933 * service routines, it gets called again as soon as it returns, so you don't
934 * see a while(1) loop here.
935 */
936static void net_input(struct virtqueue *vq)
937{
938 int len;
939 unsigned int head, out, in;
940 struct iovec iov[vq->vring.num];
941 struct net_info *net_info = vq->dev->priv;
942
943 /*
944 * Get a descriptor to write an incoming packet into. This will also
945 * send an interrupt if they're out of descriptors.
946 */
947 head = wait_for_vq_desc(vq, iov, &out, &in);
948 if (out)
949 errx(1, "Output buffers in net input queue?");
950
951 /*
952 * If it looks like we'll block reading from the tun device, send them
953 * an interrupt.
954 */
955 if (vq->pending_used && will_block(net_info->tunfd))
956 trigger_irq(vq);
957
958 /*
959 * Read in the packet. This is where we normally wait (when there's no
960 * incoming network traffic).
961 */
962 len = readv(net_info->tunfd, iov, in);
963 if (len <= 0)
964 err(1, "Failed to read from tun.");
965
966 /*
967 * Mark that packet buffer as used, but don't interrupt here. We want
968 * to wait until we've done as much work as we can.
969 */
970 add_used(vq, head, len);
971}
972/*:*/
973
974/* This is the helper to create threads: run the service routine in a loop. */
975static int do_thread(void *_vq)
976{
977 struct virtqueue *vq = _vq;
978
979 for (;;)
980 vq->service(vq);
981 return 0;
982}
983
984/*
985 * When a child dies, we kill our entire process group with SIGTERM. This
986 * also has the side effect that the shell restores the console for us!
987 */
static void kill_launcher(int signum)
{
	/* The signal number itself is irrelevant: any call means shut down. */
	(void)signum;

	/* Process ID 0 addresses our whole process group. */
	kill(0, SIGTERM);
}
992
993static void reset_device(struct device *dev)
994{
995 struct virtqueue *vq;
996
997 verbose("Resetting device %s\n", dev->name);
998
999 /* Clear any features they've acked. */
1000 memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);
1001
1002 /* We're going to be explicitly killing threads, so ignore them. */
1003 signal(SIGCHLD, SIG_IGN);
1004
1005 /* Zero out the virtqueues, get rid of their threads */
1006 for (vq = dev->vq; vq; vq = vq->next) {
1007 if (vq->thread != (pid_t)-1) {
1008 kill(vq->thread, SIGTERM);
1009 waitpid(vq->thread, NULL, 0);
1010 vq->thread = (pid_t)-1;
1011 }
1012 memset(vq->vring.desc, 0,
1013 vring_size(vq->config.num, LGUEST_VRING_ALIGN));
1014 lg_last_avail(vq) = 0;
1015 }
1016 dev->running = false;
1017
1018 /* Now we care if threads die. */
1019 signal(SIGCHLD, (void *)kill_launcher);
1020}
1021
1022/*L:216
1023 * This actually creates the thread which services the virtqueue for a device.
1024 */
1025static void create_thread(struct virtqueue *vq)
1026{
1027 /*
1028 * Create stack for thread. Since the stack grows upwards, we point
1029 * the stack pointer to the end of this region.
1030 */
1031 char *stack = malloc(32768);
1032 unsigned long args[] = { LHREQ_EVENTFD,
1033 vq->config.pfn*getpagesize(), 0 };
1034
1035 /* Create a zero-initialized eventfd. */
1036 vq->eventfd = eventfd(0, 0);
1037 if (vq->eventfd < 0)
1038 err(1, "Creating eventfd");
1039 args[2] = vq->eventfd;
1040
1041 /*
1042 * Attach an eventfd to this virtqueue: it will go off when the Guest
1043 * does an LHCALL_NOTIFY for this vq.
1044 */
1045 if (write(lguest_fd, &args, sizeof(args)) != 0)
1046 err(1, "Attaching eventfd");
1047
1048 /*
1049 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
1050 * we get a signal if it dies.
1051 */
1052 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
1053 if (vq->thread == (pid_t)-1)
1054 err(1, "Creating clone");
1055
1056 /* We close our local copy now the child has it. */
1057 close(vq->eventfd);
1058}
1059
1060static bool accepted_feature(struct device *dev, unsigned int bit)
1061{
1062 const u8 *features = get_feature_bits(dev) + dev->feature_len;
1063
1064 if (dev->feature_len < bit / CHAR_BIT)
1065 return false;
1066 return features[bit / CHAR_BIT] & (1 << (bit % CHAR_BIT));
1067}
1068
1069static void start_device(struct device *dev)
1070{
1071 unsigned int i;
1072 struct virtqueue *vq;
1073
1074 verbose("Device %s OK: offered", dev->name);
1075 for (i = 0; i < dev->feature_len; i++)
1076 verbose(" %02x", get_feature_bits(dev)[i]);
1077 verbose(", accepted");
1078 for (i = 0; i < dev->feature_len; i++)
1079 verbose(" %02x", get_feature_bits(dev)
1080 [dev->feature_len+i]);
1081
1082 dev->irq_on_empty = accepted_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY);
1083
1084 for (vq = dev->vq; vq; vq = vq->next) {
1085 if (vq->service)
1086 create_thread(vq);
1087 }
1088 dev->running = true;
1089}
1090
1091static void cleanup_devices(void)
1092{
1093 struct device *dev;
1094
1095 for (dev = devices.dev; dev; dev = dev->next)
1096 reset_device(dev);
1097
1098 /* If we saved off the original terminal settings, restore them now. */
1099 if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
1100 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
1101}
1102
1103/* When the Guest tells us they updated the status field, we handle it. */
1104static void update_device_status(struct device *dev)
1105{
1106 /* A zero status is a reset, otherwise it's a set of flags. */
1107 if (dev->desc->status == 0)
1108 reset_device(dev);
1109 else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
1110 warnx("Device %s configuration FAILED", dev->name);
1111 if (dev->running)
1112 reset_device(dev);
1113 } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
1114 if (!dev->running)
1115 start_device(dev);
1116 }
1117}
1118
1119/*L:215
1120 * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In
1121 * particular, it's used to notify us of device status changes during boot.
1122 */
1123static void handle_output(unsigned long addr)
1124{
1125 struct device *i;
1126
1127 /* Check each device. */
1128 for (i = devices.dev; i; i = i->next) {
1129 struct virtqueue *vq;
1130
1131 /*
1132 * Notifications to device descriptors mean they updated the
1133 * device status.
1134 */
1135 if (from_guest_phys(addr) == i->desc) {
1136 update_device_status(i);
1137 return;
1138 }
1139
1140 /*
1141 * Devices *can* be used before status is set to DRIVER_OK.
1142 * The original plan was that they would never do this: they
1143 * would always finish setting up their status bits before
1144 * actually touching the virtqueues. In practice, we allowed
1145 * them to, and they do (eg. the disk probes for partition
1146 * tables as part of initialization).
1147 *
1148 * If we see this, we start the device: once it's running, we
1149 * expect the device to catch all the notifications.
1150 */
1151 for (vq = i->vq; vq; vq = vq->next) {
1152 if (addr != vq->config.pfn*getpagesize())
1153 continue;
1154 if (i->running)
1155 errx(1, "Notification on running %s", i->name);
1156 /* This just calls create_thread() for each virtqueue */
1157 start_device(i);
1158 return;
1159 }
1160 }
1161
1162 /*
1163 * Early console write is done using notify on a nul-terminated string
1164 * in Guest memory. It's also great for hacking debugging messages
1165 * into a Guest.
1166 */
1167 if (addr >= guest_limit)
1168 errx(1, "Bad NOTIFY %#lx", addr);
1169
1170 write(STDOUT_FILENO, from_guest_phys(addr),
1171 strnlen(from_guest_phys(addr), guest_limit - addr));
1172}
1173
1174/*L:190
1175 * Device Setup
1176 *
1177 * All devices need a descriptor so the Guest knows it exists, and a "struct
1178 * device" so the Launcher can keep track of it. We have common helper
1179 * routines to allocate and manage them.
1180 */
1181
1182/*
1183 * The layout of the device page is a "struct lguest_device_desc" followed by a
1184 * number of virtqueue descriptors, then two sets of feature bits, then an
1185 * array of configuration bytes. This routine returns the configuration
1186 * pointer.
1187 */
1188static u8 *device_config(const struct device *dev)
1189{
1190 return (void *)(dev->desc + 1)
1191 + dev->num_vq * sizeof(struct lguest_vqconfig)
1192 + dev->feature_len * 2;
1193}
1194
1195/*
1196 * This routine allocates a new "struct lguest_device_desc" from descriptor
1197 * table page just above the Guest's normal memory. It returns a pointer to
1198 * that descriptor.
1199 */
1200static struct lguest_device_desc *new_dev_desc(u16 type)
1201{
1202 struct lguest_device_desc d = { .type = type };
1203 void *p;
1204
1205 /* Figure out where the next device config is, based on the last one. */
1206 if (devices.lastdev)
1207 p = device_config(devices.lastdev)
1208 + devices.lastdev->desc->config_len;
1209 else
1210 p = devices.descpage;
1211
1212 /* We only have one page for all the descriptors. */
1213 if (p + sizeof(d) > (void *)devices.descpage + getpagesize())
1214 errx(1, "Too many devices");
1215
1216 /* p might not be aligned, so we memcpy in. */
1217 return memcpy(p, &d, sizeof(d));
1218}
1219
1220/*
1221 * Each device descriptor is followed by the description of its virtqueues. We
1222 * specify how many descriptors the virtqueue is to have.
1223 */
1224static void add_virtqueue(struct device *dev, unsigned int num_descs,
1225 void (*service)(struct virtqueue *))
1226{
1227 unsigned int pages;
1228 struct virtqueue **i, *vq = malloc(sizeof(*vq));
1229 void *p;
1230
1231 /* First we need some memory for this virtqueue. */
1232 pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1)
1233 / getpagesize();
1234 p = get_pages(pages);
1235
1236 /* Initialize the virtqueue */
1237 vq->next = NULL;
1238 vq->last_avail_idx = 0;
1239 vq->dev = dev;
1240
1241 /*
1242 * This is the routine the service thread will run, and its Process ID
1243 * once it's running.
1244 */
1245 vq->service = service;
1246 vq->thread = (pid_t)-1;
1247
1248 /* Initialize the configuration. */
1249 vq->config.num = num_descs;
1250 vq->config.irq = devices.next_irq++;
1251 vq->config.pfn = to_guest_phys(p) / getpagesize();
1252
1253 /* Initialize the vring. */
1254 vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN);
1255
1256 /*
1257 * Append virtqueue to this device's descriptor. We use
1258 * device_config() to get the end of the device's current virtqueues;
1259 * we check that we haven't added any config or feature information
1260 * yet, otherwise we'd be overwriting them.
1261 */
1262 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
1263 memcpy(device_config(dev), &vq->config, sizeof(vq->config));
1264 dev->num_vq++;
1265 dev->desc->num_vq++;
1266
1267 verbose("Virtqueue page %#lx\n", to_guest_phys(p));
1268
1269 /*
1270 * Add to tail of list, so dev->vq is first vq, dev->vq->next is
1271 * second.
1272 */
1273 for (i = &dev->vq; *i; i = &(*i)->next);
1274 *i = vq;
1275}
1276
1277/*
1278 * The first half of the feature bitmask is for us to advertise features. The
1279 * second half is for the Guest to accept features.
1280 */
1281static void add_feature(struct device *dev, unsigned bit)
1282{
1283 u8 *features = get_feature_bits(dev);
1284
1285 /* We can't extend the feature bits once we've added config bytes */
1286 if (dev->desc->feature_len <= bit / CHAR_BIT) {
1287 assert(dev->desc->config_len == 0);
1288 dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1;
1289 }
1290
1291 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
1292}
1293
1294/*
1295 * This routine sets the configuration fields for an existing device's
1296 * descriptor. It only works for the last device, but that's OK because that's
1297 * how we use it.
1298 */
1299static void set_config(struct device *dev, unsigned len, const void *conf)
1300{
1301 /* Check we haven't overflowed our single page. */
1302 if (device_config(dev) + len > devices.descpage + getpagesize())
1303 errx(1, "Too many devices");
1304
1305 /* Copy in the config information, and store the length. */
1306 memcpy(device_config(dev), conf, len);
1307 dev->desc->config_len = len;
1308
1309 /* Size must fit in config_len field (8 bits)! */
1310 assert(dev->desc->config_len == len);
1311}
1312
1313/*
1314 * This routine does all the creation and setup of a new device, including
1315 * calling new_dev_desc() to allocate the descriptor and device memory. We
1316 * don't actually start the service threads until later.
1317 *
1318 * See what I mean about userspace being boring?
1319 */
1320static struct device *new_device(const char *name, u16 type)
1321{
1322 struct device *dev = malloc(sizeof(*dev));
1323
1324 /* Now we populate the fields one at a time. */
1325 dev->desc = new_dev_desc(type);
1326 dev->name = name;
1327 dev->vq = NULL;
1328 dev->feature_len = 0;
1329 dev->num_vq = 0;
1330 dev->running = false;
1331
1332 /*
1333 * Append to device list. Prepending to a single-linked list is
1334 * easier, but the user expects the devices to be arranged on the bus
1335 * in command-line order. The first network device on the command line
1336 * is eth0, the first block device /dev/vda, etc.
1337 */
1338 if (devices.lastdev)
1339 devices.lastdev->next = dev;
1340 else
1341 devices.dev = dev;
1342 devices.lastdev = dev;
1343
1344 return dev;
1345}
1346
1347/*
1348 * Our first setup routine is the console. It's a fairly simple device, but
1349 * UNIX tty handling makes it uglier than it could be.
1350 */
1351static void setup_console(void)
1352{
1353 struct device *dev;
1354
1355 /* If we can save the initial standard input settings... */
1356 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
1357 struct termios term = orig_term;
1358 /*
1359 * Then we turn off echo, line buffering and ^C etc: We want a
1360 * raw input stream to the Guest.
1361 */
1362 term.c_lflag &= ~(ISIG|ICANON|ECHO);
1363 tcsetattr(STDIN_FILENO, TCSANOW, &term);
1364 }
1365
1366 dev = new_device("console", VIRTIO_ID_CONSOLE);
1367
1368 /* We store the console state in dev->priv, and initialize it. */
1369 dev->priv = malloc(sizeof(struct console_abort));
1370 ((struct console_abort *)dev->priv)->count = 0;
1371
1372 /*
1373 * The console needs two virtqueues: the input then the output. When
1374 * they put something the input queue, we make sure we're listening to
1375 * stdin. When they put something in the output queue, we write it to
1376 * stdout.
1377 */
1378 add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
1379 add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
1380
1381 verbose("device %u: console\n", ++devices.device_num);
1382}
1383/*:*/
1384
1385/*M:010
1386 * Inter-guest networking is an interesting area. Simplest is to have a
1387 * --sharenet=<name> option which opens or creates a named pipe. This can be
1388 * used to send packets to another guest in a 1:1 manner.
1389 *
1390 * More sophisticated is to use one of the tools developed for projects like UML
1391 * to do networking.
1392 *
1393 * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
1394 * completely generic ("here's my vring, attach to your vring") and would work
1395 * for any traffic. Of course, namespace and permissions issues need to be
1396 * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide
1397 * multiple inter-guest channels behind one interface, although it would
1398 * require some manner of hotplugging new virtio channels.
1399 *
1400 * Finally, we could implement a virtio network switch in the kernel.
1401:*/
1402
1403static u32 str2ip(const char *ipaddr)
1404{
1405 unsigned int b[4];
1406
1407 if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
1408 errx(1, "Failed to parse IP address '%s'", ipaddr);
1409 return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
1410}
1411
static void str2mac(const char *macaddr, unsigned char mac[6])
{
	unsigned int octet[6];
	int parsed;
	int i;

	/* Six colon-separated hex bytes, at most two digits each. */
	parsed = sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
			&octet[0], &octet[1], &octet[2],
			&octet[3], &octet[4], &octet[5]);
	if (parsed != 6)
		errx(1, "Failed to parse mac address '%s'", macaddr);

	/* Narrow each parsed value into the caller's byte array. */
	for (i = 0; i < 6; i++)
		mac[i] = octet[i];
}
1425
1426/*
1427 * This code is "adapted" from libbridge: it attaches the Host end of the
1428 * network device to the bridge device specified by the command line.
1429 *
1430 * This is yet another James Morris contribution (I'm an IP-level guy, so I
1431 * dislike bridging), and I just try not to break it.
1432 */
1433static void add_to_bridge(int fd, const char *if_name, const char *br_name)
1434{
1435 int ifidx;
1436 struct ifreq ifr;
1437
1438 if (!*br_name)
1439 errx(1, "must specify bridge name");
1440
1441 ifidx = if_nametoindex(if_name);
1442 if (!ifidx)
1443 errx(1, "interface %s does not exist!", if_name);
1444
1445 strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
1446 ifr.ifr_name[IFNAMSIZ-1] = '\0';
1447 ifr.ifr_ifindex = ifidx;
1448 if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
1449 err(1, "can't add %s to bridge %s", if_name, br_name);
1450}
1451
1452/*
1453 * This sets up the Host end of the network device with an IP address, brings
1454 * it up so packets will flow, the copies the MAC address into the hwaddr
1455 * pointer.
1456 */
1457static void configure_device(int fd, const char *tapif, u32 ipaddr)
1458{
1459 struct ifreq ifr;
1460 struct sockaddr_in sin;
1461
1462 memset(&ifr, 0, sizeof(ifr));
1463 strcpy(ifr.ifr_name, tapif);
1464
1465 /* Don't read these incantations. Just cut & paste them like I did! */
1466 sin.sin_family = AF_INET;
1467 sin.sin_addr.s_addr = htonl(ipaddr);
1468 memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
1469 if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
1470 err(1, "Setting %s interface address", tapif);
1471 ifr.ifr_flags = IFF_UP;
1472 if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
1473 err(1, "Bringing interface %s up", tapif);
1474}
1475
1476static int get_tun_device(char tapif[IFNAMSIZ])
1477{
1478 struct ifreq ifr;
1479 int netfd;
1480
1481 /* Start with this zeroed. Messy but sure. */
1482 memset(&ifr, 0, sizeof(ifr));
1483
1484 /*
1485 * We open the /dev/net/tun device and tell it we want a tap device. A
1486 * tap device is like a tun device, only somehow different. To tell
1487 * the truth, I completely blundered my way through this code, but it
1488 * works now!
1489 */
1490 netfd = open_or_die("/dev/net/tun", O_RDWR);
1491 ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
1492 strcpy(ifr.ifr_name, "tap%d");
1493 if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
1494 err(1, "configuring /dev/net/tun");
1495
1496 if (ioctl(netfd, TUNSETOFFLOAD,
1497 TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
1498 err(1, "Could not set features for tun device");
1499
1500 /*
1501 * We don't need checksums calculated for packets coming in this
1502 * device: trust us!
1503 */
1504 ioctl(netfd, TUNSETNOCSUM, 1);
1505
1506 memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
1507 return netfd;
1508}
1509
1510/*L:195
1511 * Our network is a Host<->Guest network. This can either use bridging or
1512 * routing, but the principle is the same: it uses the "tun" device to inject
1513 * packets into the Host as if they came in from a normal network card. We
1514 * just shunt packets between the Guest and the tun device.
1515 */
1516static void setup_tun_net(char *arg)
1517{
1518 struct device *dev;
1519 struct net_info *net_info = malloc(sizeof(*net_info));
1520 int ipfd;
1521 u32 ip = INADDR_ANY;
1522 bool bridging = false;
1523 char tapif[IFNAMSIZ], *p;
1524 struct virtio_net_config conf;
1525
1526 net_info->tunfd = get_tun_device(tapif);
1527
1528 /* First we create a new network device. */
1529 dev = new_device("net", VIRTIO_ID_NET);
1530 dev->priv = net_info;
1531
1532 /* Network devices need a recv and a send queue, just like console. */
1533 add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
1534 add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
1535
1536 /*
1537 * We need a socket to perform the magic network ioctls to bring up the
1538 * tap interface, connect to the bridge etc. Any socket will do!
1539 */
1540 ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
1541 if (ipfd < 0)
1542 err(1, "opening IP socket");
1543
1544 /* If the command line was --tunnet=bridge:<name> do bridging. */
1545 if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
1546 arg += strlen(BRIDGE_PFX);
1547 bridging = true;
1548 }
1549
1550 /* A mac address may follow the bridge name or IP address */
1551 p = strchr(arg, ':');
1552 if (p) {
1553 str2mac(p+1, conf.mac);
1554 add_feature(dev, VIRTIO_NET_F_MAC);
1555 *p = '\0';
1556 }
1557
1558 /* arg is now either an IP address or a bridge name */
1559 if (bridging)
1560 add_to_bridge(ipfd, tapif, arg);
1561 else
1562 ip = str2ip(arg);
1563
1564 /* Set up the tun device. */
1565 configure_device(ipfd, tapif, ip);
1566
1567 add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY);
1568 /* Expect Guest to handle everything except UFO */
1569 add_feature(dev, VIRTIO_NET_F_CSUM);
1570 add_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
1571 add_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
1572 add_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
1573 add_feature(dev, VIRTIO_NET_F_GUEST_ECN);
1574 add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
1575 add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
1576 add_feature(dev, VIRTIO_NET_F_HOST_ECN);
1577 /* We handle indirect ring entries */
1578 add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
1579 set_config(dev, sizeof(conf), &conf);
1580
1581 /* We don't need the socket any more; setup is done. */
1582 close(ipfd);
1583
1584 devices.device_num++;
1585
1586 if (bridging)
1587 verbose("device %u: tun %s attached to bridge: %s\n",
1588 devices.device_num, tapif, arg);
1589 else
1590 verbose("device %u: tun %s: %s\n",
1591 devices.device_num, tapif, arg);
1592}
1593/*:*/
1594
1595/* This hangs off device->priv. */
1596struct vblk_info {
1597 /* The size of the file. */
1598 off64_t len;
1599
1600 /* The file descriptor for the file. */
1601 int fd;
1602
1603};
1604
1605/*L:210
1606 * The Disk
1607 *
1608 * The disk only has one virtqueue, so it only has one thread. It is really
1609 * simple: the Guest asks for a block number and we read or write that position
1610 * in the file.
1611 *
1612 * Before we serviced each virtqueue in a separate thread, that was unacceptably
1613 * slow: the Guest waits until the read is finished before running anything
1614 * else, even if it could have been doing useful work.
1615 *
1616 * We could have used async I/O, except it's reputed to suck so hard that
1617 * characters actually go missing from your code when you try to use it.
1618 */
1619static void blk_request(struct virtqueue *vq)
1620{
1621 struct vblk_info *vblk = vq->dev->priv;
1622 unsigned int head, out_num, in_num, wlen;
1623 int ret;
1624 u8 *in;
1625 struct virtio_blk_outhdr *out;
1626 struct iovec iov[vq->vring.num];
1627 off64_t off;
1628
1629 /*
1630 * Get the next request, where we normally wait. It triggers the
1631 * interrupt to acknowledge previously serviced requests (if any).
1632 */
1633 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1634
1635 /*
1636 * Every block request should contain at least one output buffer
1637 * (detailing the location on disk and the type of request) and one
1638 * input buffer (to hold the result).
1639 */
1640 if (out_num == 0 || in_num == 0)
1641 errx(1, "Bad virtblk cmd %u out=%u in=%u",
1642 head, out_num, in_num);
1643
1644 out = convert(&iov[0], struct virtio_blk_outhdr);
1645 in = convert(&iov[out_num+in_num-1], u8);
1646 /*
1647 * For historical reasons, block operations are expressed in 512 byte
1648 * "sectors".
1649 */
1650 off = out->sector * 512;
1651
1652 /*
1653 * In general the virtio block driver is allowed to try SCSI commands.
1654 * It'd be nice if we supported eject, for example, but we don't.
1655 */
1656 if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
1657 fprintf(stderr, "Scsi commands unsupported\n");
1658 *in = VIRTIO_BLK_S_UNSUPP;
1659 wlen = sizeof(*in);
1660 } else if (out->type & VIRTIO_BLK_T_OUT) {
1661 /*
1662 * Write
1663 *
1664 * Move to the right location in the block file. This can fail
1665 * if they try to write past end.
1666 */
1667 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1668 err(1, "Bad seek to sector %llu", out->sector);
1669
1670 ret = writev(vblk->fd, iov+1, out_num-1);
1671 verbose("WRITE to sector %llu: %i\n", out->sector, ret);
1672
1673 /*
1674 * Grr... Now we know how long the descriptor they sent was, we
1675 * make sure they didn't try to write over the end of the block
1676 * file (possibly extending it).
1677 */
1678 if (ret > 0 && off + ret > vblk->len) {
1679 /* Trim it back to the correct length */
1680 ftruncate64(vblk->fd, vblk->len);
1681 /* Die, bad Guest, die. */
1682 errx(1, "Write past end %llu+%u", off, ret);
1683 }
1684
1685 wlen = sizeof(*in);
1686 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
1687 } else if (out->type & VIRTIO_BLK_T_FLUSH) {
1688 /* Flush */
1689 ret = fdatasync(vblk->fd);
1690 verbose("FLUSH fdatasync: %i\n", ret);
1691 wlen = sizeof(*in);
1692 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
1693 } else {
1694 /*
1695 * Read
1696 *
1697 * Move to the right location in the block file. This can fail
1698 * if they try to read past end.
1699 */
1700 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1701 err(1, "Bad seek to sector %llu", out->sector);
1702
1703 ret = readv(vblk->fd, iov+1, in_num-1);
1704 verbose("READ from sector %llu: %i\n", out->sector, ret);
1705 if (ret >= 0) {
1706 wlen = sizeof(*in) + ret;
1707 *in = VIRTIO_BLK_S_OK;
1708 } else {
1709 wlen = sizeof(*in);
1710 *in = VIRTIO_BLK_S_IOERR;
1711 }
1712 }
1713
1714 /* Finished that request. */
1715 add_used(vq, head, wlen);
1716}
1717
1718/*L:198 This actually sets up a virtual block device. */
1719static void setup_block_file(const char *filename)
1720{
1721 struct device *dev;
1722 struct vblk_info *vblk;
1723 struct virtio_blk_config conf;
1724
1725 /* Creat the device. */
1726 dev = new_device("block", VIRTIO_ID_BLOCK);
1727
1728 /* The device has one virtqueue, where the Guest places requests. */
1729 add_virtqueue(dev, VIRTQUEUE_NUM, blk_request);
1730
1731 /* Allocate the room for our own bookkeeping */
1732 vblk = dev->priv = malloc(sizeof(*vblk));
1733
1734 /* First we open the file and store the length. */
1735 vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
1736 vblk->len = lseek64(vblk->fd, 0, SEEK_END);
1737
1738 /* We support FLUSH. */
1739 add_feature(dev, VIRTIO_BLK_F_FLUSH);
1740
1741 /* Tell Guest how many sectors this device has. */
1742 conf.capacity = cpu_to_le64(vblk->len / 512);
1743
1744 /*
1745 * Tell Guest not to put in too many descriptors at once: two are used
1746 * for the in and out elements.
1747 */
1748 add_feature(dev, VIRTIO_BLK_F_SEG_MAX);
1749 conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
1750
1751 /* Don't try to put whole struct: we have 8 bit limit. */
1752 set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf);
1753
1754 verbose("device %u: virtblock %llu sectors\n",
1755 ++devices.device_num, le64_to_cpu(conf.capacity));
1756}
1757
1758/*L:211
1759 * Our random number generator device reads from /dev/random into the Guest's
1760 * input buffers. The usual case is that the Guest doesn't want random numbers
1761 * and so has no buffers although /dev/random is still readable, whereas
1762 * console is the reverse.
1763 *
1764 * The same logic applies, however.
1765 */
struct rng_info {
	/* Descriptor we read the random bytes from (/dev/random). */
	int rfd;
};
1769
1770static void rng_input(struct virtqueue *vq)
1771{
1772 int len;
1773 unsigned int head, in_num, out_num, totlen = 0;
1774 struct rng_info *rng_info = vq->dev->priv;
1775 struct iovec iov[vq->vring.num];
1776
1777 /* First we need a buffer from the Guests's virtqueue. */
1778 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1779 if (out_num)
1780 errx(1, "Output buffers in rng?");
1781
1782 /*
1783 * Just like the console write, we loop to cover the whole iovec.
1784 * In this case, short reads actually happen quite a bit.
1785 */
1786 while (!iov_empty(iov, in_num)) {
1787 len = readv(rng_info->rfd, iov, in_num);
1788 if (len <= 0)
1789 err(1, "Read from /dev/random gave %i", len);
1790 iov_consume(iov, in_num, len);
1791 totlen += len;
1792 }
1793
1794 /* Tell the Guest about the new input. */
1795 add_used(vq, head, totlen);
1796}
1797
1798/*L:199
1799 * This creates a "hardware" random number device for the Guest.
1800 */
1801static void setup_rng(void)
1802{
1803 struct device *dev;
1804 struct rng_info *rng_info = malloc(sizeof(*rng_info));
1805
1806 /* Our device's privat info simply contains the /dev/random fd. */
1807 rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
1808
1809 /* Create the new device. */
1810 dev = new_device("rng", VIRTIO_ID_RNG);
1811 dev->priv = rng_info;
1812
1813 /* The device has one virtqueue, where the Guest places inbufs. */
1814 add_virtqueue(dev, VIRTQUEUE_NUM, rng_input);
1815
1816 verbose("device %u: rng\n", devices.device_num++);
1817}
1818/* That's the end of device setup. */
1819
1820/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
1821static void __attribute__((noreturn)) restart_guest(void)
1822{
1823 unsigned int i;
1824
1825 /*
1826 * Since we don't track all open fds, we simply close everything beyond
1827 * stderr.
1828 */
1829 for (i = 3; i < FD_SETSIZE; i++)
1830 close(i);
1831
1832 /* Reset all the devices (kills all threads). */
1833 cleanup_devices();
1834
1835 execv(main_args[0], main_args);
1836 err(1, "Could not exec %s", main_args[0]);
1837}
1838
1839/*L:220
1840 * Finally we reach the core of the Launcher which runs the Guest, serves
1841 * its input and output, and finally, lays it to rest.
1842 */
1843static void __attribute__((noreturn)) run_guest(void)
1844{
1845 for (;;) {
1846 unsigned long notify_addr;
1847 int readval;
1848
1849 /* We read from the /dev/lguest device to run the Guest. */
1850 readval = pread(lguest_fd, &notify_addr,
1851 sizeof(notify_addr), cpu_id);
1852
1853 /* One unsigned long means the Guest did HCALL_NOTIFY */
1854 if (readval == sizeof(notify_addr)) {
1855 verbose("Notify on address %#lx\n", notify_addr);
1856 handle_output(notify_addr);
1857 /* ENOENT means the Guest died. Reading tells us why. */
1858 } else if (errno == ENOENT) {
1859 char reason[1024] = { 0 };
1860 pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
1861 errx(1, "%s", reason);
1862 /* ERESTART means that we need to reboot the guest */
1863 } else if (errno == ERESTART) {
1864 restart_guest();
1865 /* Anything else means a bug or incompatible change. */
1866 } else
1867 err(1, "Running guest failed");
1868 }
1869}
1870/*L:240
1871 * This is the end of the Launcher. The good news: we are over halfway
1872 * through! The bad news: the most fiendish part of the code still lies ahead
1873 * of us.
1874 *
1875 * Are you ready? Take a deep breath and join me in the core of the Host, in
1876 * "make Host".
1877:*/
1878
/* Long options for getopt_long(): each maps onto a single-letter code. */
static struct option opts[] = {
	{ "verbose",  no_argument,       NULL, 'v' },
	{ "tunnet",   required_argument, NULL, 't' },
	{ "block",    required_argument, NULL, 'b' },
	{ "rng",      no_argument,       NULL, 'r' },
	{ "initrd",   required_argument, NULL, 'i' },
	{ "username", required_argument, NULL, 'u' },
	{ "chroot",   required_argument, NULL, 'c' },
	/* An all-zero entry terminates the table. */
	{ NULL, 0, NULL, 0 },
};
/* Print the command-line synopsis and exit with status 1. */
static void usage(void)
{
	static const char synopsis[] =
		"Usage: lguest [--verbose] "
		"[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
		"|--block=<filename>|--initrd=<filename>]...\n"
		"<mem-in-mb> vmlinux [args...]";

	errx(1, "%s", synopsis);
}
1896
1897/*L:105 The main routine is where the real work begins: */
1898int main(int argc, char *argv[])
1899{
1900 /* Memory, code startpoint and size of the (optional) initrd. */
1901 unsigned long mem = 0, start, initrd_size = 0;
1902 /* Two temporaries. */
1903 int i, c;
1904 /* The boot information for the Guest. */
1905 struct boot_params *boot;
1906 /* If they specify an initrd file to load. */
1907 const char *initrd_name = NULL;
1908
1909 /* Password structure for initgroups/setres[gu]id */
1910 struct passwd *user_details = NULL;
1911
1912 /* Directory to chroot to */
1913 char *chroot_path = NULL;
1914
1915 /* Save the args: we "reboot" by execing ourselves again. */
1916 main_args = argv;
1917
1918 /*
1919 * First we initialize the device list. We keep a pointer to the last
1920 * device, and the next interrupt number to use for devices (1:
1921 * remember that 0 is used by the timer).
1922 */
1923 devices.lastdev = NULL;
1924 devices.next_irq = 1;
1925
1926 /* We're CPU 0. In fact, that's the only CPU possible right now. */
1927 cpu_id = 0;
1928
1929 /*
1930 * We need to know how much memory so we can set up the device
1931 * descriptor and memory pages for the devices as we parse the command
1932 * line. So we quickly look through the arguments to find the amount
1933 * of memory now.
1934 */
1935 for (i = 1; i < argc; i++) {
1936 if (argv[i][0] != '-') {
1937 mem = atoi(argv[i]) * 1024 * 1024;
1938 /*
1939 * We start by mapping anonymous pages over all of
1940 * guest-physical memory range. This fills it with 0,
1941 * and ensures that the Guest won't be killed when it
1942 * tries to access it.
1943 */
1944 guest_base = map_zeroed_pages(mem / getpagesize()
1945 + DEVICE_PAGES);
1946 guest_limit = mem;
1947 guest_max = mem + DEVICE_PAGES*getpagesize();
1948 devices.descpage = get_pages(1);
1949 break;
1950 }
1951 }
1952
1953 /* The options are fairly straight-forward */
1954 while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
1955 switch (c) {
1956 case 'v':
1957 verbose = true;
1958 break;
1959 case 't':
1960 setup_tun_net(optarg);
1961 break;
1962 case 'b':
1963 setup_block_file(optarg);
1964 break;
1965 case 'r':
1966 setup_rng();
1967 break;
1968 case 'i':
1969 initrd_name = optarg;
1970 break;
1971 case 'u':
1972 user_details = getpwnam(optarg);
1973 if (!user_details)
1974 err(1, "getpwnam failed, incorrect username?");
1975 break;
1976 case 'c':
1977 chroot_path = optarg;
1978 break;
1979 default:
1980 warnx("Unknown argument %s", argv[optind]);
1981 usage();
1982 }
1983 }
1984 /*
1985 * After the other arguments we expect memory and kernel image name,
1986 * followed by command line arguments for the kernel.
1987 */
1988 if (optind + 2 > argc)
1989 usage();
1990
1991 verbose("Guest base is at %p\n", guest_base);
1992
1993 /* We always have a console device */
1994 setup_console();
1995
1996 /* Now we load the kernel */
1997 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
1998
1999 /* Boot information is stashed at physical address 0 */
2000 boot = from_guest_phys(0);
2001
2002 /* Map the initrd image if requested (at top of physical memory) */
2003 if (initrd_name) {
2004 initrd_size = load_initrd(initrd_name, mem);
2005 /*
2006 * These are the location in the Linux boot header where the
2007 * start and size of the initrd are expected to be found.
2008 */
2009 boot->hdr.ramdisk_image = mem - initrd_size;
2010 boot->hdr.ramdisk_size = initrd_size;
2011 /* The bootloader type 0xFF means "unknown"; that's OK. */
2012 boot->hdr.type_of_loader = 0xFF;
2013 }
2014
2015 /*
2016 * The Linux boot header contains an "E820" memory map: ours is a
2017 * simple, single region.
2018 */
2019 boot->e820_entries = 1;
2020 boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
2021 /*
2022 * The boot header contains a command line pointer: we put the command
2023 * line after the boot header.
2024 */
2025 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
2026 /* We use a simple helper to copy the arguments separated by spaces. */
2027 concat((char *)(boot + 1), argv+optind+2);
2028
2029 /* Boot protocol version: 2.07 supports the fields for lguest. */
2030 boot->hdr.version = 0x207;
2031
2032 /* The hardware_subarch value of "1" tells the Guest it's an lguest. */
2033 boot->hdr.hardware_subarch = 1;
2034
2035 /* Tell the entry path not to try to reload segment registers. */
2036 boot->hdr.loadflags |= KEEP_SEGMENTS;
2037
2038 /*
2039 * We tell the kernel to initialize the Guest: this returns the open
2040 * /dev/lguest file descriptor.
2041 */
2042 tell_kernel(start);
2043
2044 /* Ensure that we terminate if a device-servicing child dies. */
2045 signal(SIGCHLD, kill_launcher);
2046
2047 /* If we exit via err(), this kills all the threads, restores tty. */
2048 atexit(cleanup_devices);
2049
2050 /* If requested, chroot to a directory */
2051 if (chroot_path) {
2052 if (chroot(chroot_path) != 0)
2053 err(1, "chroot(\"%s\") failed", chroot_path);
2054
2055 if (chdir("/") != 0)
2056 err(1, "chdir(\"/\") failed");
2057
2058 verbose("chroot done\n");
2059 }
2060
2061 /* If requested, drop privileges */
2062 if (user_details) {
2063 uid_t u;
2064 gid_t g;
2065
2066 u = user_details->pw_uid;
2067 g = user_details->pw_gid;
2068
2069 if (initgroups(user_details->pw_name, g) != 0)
2070 err(1, "initgroups failed");
2071
2072 if (setresgid(g, g, g) != 0)
2073 err(1, "setresgid failed");
2074
2075 if (setresuid(u, u, u) != 0)
2076 err(1, "setresuid failed");
2077
2078 verbose("Dropping privileges completed\n");
2079 }
2080
2081 /* Finally, run the Guest. This doesn't return. */
2082 run_guest();
2083}
2084/*:*/
2085
2086/*M:999
2087 * Mastery is done: you now know everything I do.
2088 *
2089 * But surely you have seen code, features and bugs in your wanderings which
2090 * you now yearn to attack? That is the real game, and I look forward to you
2091 * patching and forking lguest into the Your-Name-Here-visor.
2092 *
2093 * Farewell, and good coding!
2094 * Rusty Russell.
2095 */
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
deleted file mode 100644
index dad99978a6a8..000000000000
--- a/Documentation/lguest/lguest.txt
+++ /dev/null
@@ -1,128 +0,0 @@
1 __
2 (___()'`; Rusty's Remarkably Unreliable Guide to Lguest
3 /, /` - or, A Young Coder's Illustrated Hypervisor
4 \\"--\\ http://lguest.ozlabs.org
5
6Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel,
7for Linux developers and users to experiment with virtualization with the
8minimum of complexity. Nonetheless, it should have sufficient features to
9make it useful for specific tasks, and, of course, you are encouraged to fork
10and enhance it (see drivers/lguest/README).
11
12Features:
13
14- Kernel module which runs in a normal kernel.
15- Simple I/O model for communication.
16- Simple program to create new guests.
17- Logo contains cute puppies: http://lguest.ozlabs.org
18
19Developer features:
20
21- Fun to hack on.
22- No ABI: being tied to a specific kernel anyway, you can change anything.
23- Many opportunities for improvement or feature implementation.
24
25Running Lguest:
26
27- The easiest way to run lguest is to use the same kernel as guest and host.
28 You can configure them differently, but usually it's easiest not to.
29
30 You will need to configure your kernel with the following options:
31
32 "General setup":
33 "Prompt for development and/or incomplete code/drivers" = Y
34 (CONFIG_EXPERIMENTAL=y)
35
36 "Processor type and features":
37 "Paravirtualized guest support" = Y
38 "Lguest guest support" = Y
39 "High Memory Support" = off/4GB
40 "Alignment value to which kernel should be aligned" = 0x100000
41 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
42 CONFIG_PHYSICAL_ALIGN=0x100000)
43
44 "Device Drivers":
45 "Block devices"
46 "Virtio block driver (EXPERIMENTAL)" = M/Y
47 "Network device support"
48 "Universal TUN/TAP device driver support" = M/Y
49 "Virtio network driver (EXPERIMENTAL)" = M/Y
50 (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m)
51
52 "Virtualization"
53 "Linux hypervisor example code" = M/Y
54 (CONFIG_LGUEST=m)
55
56- A tool called "lguest" is available in this directory: type "make"
57 to build it. If you didn't build your kernel in-tree, use "make
58 O=<builddir>".
59
60- Create or find a root disk image. There are several useful ones
61 around, such as the xm-test tiny root image at
62 http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
63
64 For more serious work, I usually use a distribution ISO image and
65 install it under qemu, then make multiple copies:
66
67 dd if=/dev/zero of=rootfile bs=1M count=2048
68 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
69
70 Make sure that you install a getty on /dev/hvc0 if you want to log in on the
71 console!
72
73- "modprobe lg" if you built it as a module.
74
75- Run an lguest as root:
76
77 Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda
78
79 Explanation:
80 64: the amount of memory to use, in MB.
81
82 vmlinux: the kernel image found in the top of your build directory. You
83 can also use a standard bzImage.
84
85 --tunnet=192.168.19.1: configures a "tap" device for networking with this
86 IP address.
87
88 --block=rootfile: a file or block device which becomes /dev/vda
89 inside the guest.
90
91 root=/dev/vda: this (and anything else on the command line) are
92 kernel boot parameters.
93
94- Configuring networking. I usually have the host masquerade, using
95 "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 >
96 /proc/sys/net/ipv4/ip_forward". In this example, I would configure
97 eth0 inside the guest at 192.168.19.2.
98
99 Another method is to bridge the tap device to an external interface
100 using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
101 to obtain an IP address. The bridge needs to be configured first:
102 this option simply adds the tap interface to it.
103
104 A simple example on my system:
105
106 ifconfig eth0 0.0.0.0
107 brctl addbr lg0
108 ifconfig lg0 up
109 brctl addif lg0 eth0
110 dhclient lg0
111
112 Then use --tunnet=bridge:lg0 when launching the guest.
113
114 See:
115
116 http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge
117
118 for general information on how to get bridging to work.
119
120- Random number generation. Using the --rng option will provide a
121 /dev/hwrng in the guest that will read from the host's /dev/random.
122 Use this option in conjunction with rng-tools (see ../hw_random.txt)
123 to provide entropy to the guest kernel's /dev/random.
124
125There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
126
127Good luck!
128Rusty Russell rusty@rustcorp.com.au.