aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/lguest
diff options
context:
space:
mode:
authorRusty Russell <rusty@rustcorp.com.au>2007-07-19 04:49:29 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-19 13:04:53 -0400
commit8ca47e00690914a9e5e6c734baa37c829a2f2fa1 (patch)
tree928350cf70a685428cc5a9779044aa88c5771af9 /Documentation/lguest
parentb754416bfe9adac6468e45fba244d77f52048aeb (diff)
lguest: the documentation, example launcher
A brief document describing how to use lguest. Because lguest doesn't have an ABI we also include an example launcher in the Documentation directory. [jmorris@namei.org: Fix up nat example in documentation] Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Cc: Andi Kleen <ak@suse.de> Signed-off-by: James Morris <jmorris@namei.org> Cc: Matias Zabaljauregui <matias.zabaljauregui@cern.ch> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'Documentation/lguest')
-rw-r--r--Documentation/lguest/Makefile27
-rw-r--r--Documentation/lguest/lguest.c1012
-rw-r--r--Documentation/lguest/lguest.txt129
3 files changed, 1168 insertions, 0 deletions
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
new file mode 100644
index 00000000000..b9b9427376e
--- /dev/null
+++ b/Documentation/lguest/Makefile
@@ -0,0 +1,27 @@
# This creates the demonstration utility "lguest" which runs a Linux guest.

# For those people that have a separate object dir, look there for .config
KBUILD_OUTPUT := ../..
ifdef O
  ifeq ("$(origin O)", "command line")
    KBUILD_OUTPUT := $(O)
  endif
endif
# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
include $(KBUILD_OUTPUT)/.config
LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)

CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \
	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
LDLIBS:=-lz

# "all" and "clean" name actions, not files: keep them working even if a
# file with either name ever appears in this directory.
.PHONY: all clean

all: lguest.lds lguest

# The linker script on x86 is so complex the only way of creating one
# which will link our binary in the right place is to mangle the
# default one.
lguest.lds:
	$(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@

clean:
	rm -f lguest.lds lguest
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
new file mode 100644
index 00000000000..1432b502a2d
--- /dev/null
+++ b/Documentation/lguest/lguest.c
@@ -0,0 +1,1012 @@
1/* Simple program to layout "physical" memory for new lguest guest.
2 * Linked high to avoid likely physical memory. */
3#define _LARGEFILE64_SOURCE
4#define _GNU_SOURCE
5#include <stdio.h>
6#include <string.h>
7#include <unistd.h>
8#include <err.h>
9#include <stdint.h>
10#include <stdlib.h>
11#include <elf.h>
12#include <sys/mman.h>
13#include <sys/types.h>
14#include <sys/stat.h>
15#include <sys/wait.h>
16#include <fcntl.h>
17#include <stdbool.h>
18#include <errno.h>
19#include <ctype.h>
20#include <sys/socket.h>
21#include <sys/ioctl.h>
22#include <sys/time.h>
23#include <time.h>
24#include <netinet/in.h>
25#include <net/if.h>
26#include <linux/sockios.h>
27#include <linux/if_tun.h>
28#include <sys/uio.h>
29#include <termios.h>
30#include <getopt.h>
31#include <zlib.h>
32typedef unsigned long long u64;
33typedef uint32_t u32;
34typedef uint16_t u16;
35typedef uint8_t u8;
36#include "../../include/linux/lguest_launcher.h"
37#include "../../include/asm-i386/e820.h"
38
/* Flag bits OR'd into every page-table entry built by setup_pagetables(). */
#define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
/* NOTE(review): on i386 bit 2 of a PTE is normally the User bit, not an
 * execute bit -- confirm which meaning is intended. */

/* Peer number used for the Guest's side of a tun network device. */
#define NET_PEERNUM 1
/* Prefix of a --tunnet argument that names a bridge instead of an address. */
#define BRIDGE_PFX "bridge:"
#ifndef SIOCBRADDIF
#define SIOCBRADDIF	0x89a2		/* add interface to bridge      */
#endif

/* Set by --verbose; the macro of the same name prints only when it is on. */
static bool verbose;
#define verbose(...) \
	do { if (verbose) printf(__VA_ARGS__); } while (0)

/* Write end of the pipe to the waker process (see setup_waker()). */
static int waker_fd;
50
/* Every device the launcher knows about, plus the fds to watch for input. */
struct device_list {
	/* Descriptors with an input handler, and the highest one among them
	 * (both maintained by set_fd()). */
	fd_set infds;
	int max_infd;

	/* Singly linked list of devices; lastdev points at the "next" slot of
	 * the tail so new_device() can append without walking the list. */
	struct device *dev;
	struct device **lastdev;
};
59
60struct device
61{
62 struct device *next;
63 struct lguest_device_desc *desc;
64 void *mem;
65
66 /* Watch this fd if handle_input non-NULL. */
67 int fd;
68 bool (*handle_input)(int fd, struct device *me);
69
70 /* Watch DMA to this key if handle_input non-NULL. */
71 unsigned long watch_key;
72 u32 (*handle_output)(int fd, const struct iovec *iov,
73 unsigned int num, struct device *me);
74
75 /* Device-specific data. */
76 void *priv;
77};
78
/* Open @name with @flags and return the descriptor.  If open() fails, print
 * the reason and exit with status 1. */
static int open_or_die(const char *name, int flags)
{
	int fd;

	fd = open(name, flags);
	if (fd >= 0)
		return fd;
	err(1, "Failed to open %s", name);
}
86
/* Map @num private, zero-filled pages (read/write/exec) at exactly @addr,
 * replacing whatever was mapped there.  Exits on failure; returns @addr. */
static void *map_zeroed_pages(unsigned long addr, unsigned int num)
{
	/* /dev/zero is opened on first use and reused by every later call. */
	static int zero_fd = -1;
	void *want = (void *)addr;
	void *got;

	if (zero_fd == -1)
		zero_fd = open_or_die("/dev/zero", O_RDONLY);

	got = mmap(want, getpagesize() * num,
		   PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE,
		   zero_fd, 0);
	if (got != want)
		err(1, "Mmaping %u pages of /dev/zero @%p", num, want);
	return want;
}
100
/* Find the magic string marking the entry point in [@start, @end).
 *
 * Returns the address just past the marker, plus @page_offset.  Exits with
 * status 1 if the marker is not present. */
static unsigned long entry_point(void *start, void *end,
				 unsigned long page_offset)
{
	static const char magic[] = "GenuineLguest";
	const size_t magic_len = sizeof(magic) - 1;
	const char *p = start;
	const char *limit = end;

	/* Only compare where a whole marker still fits before @end, so
	 * memcmp() never reads beyond the region we were given. */
	for (; p < limit && (size_t)(limit - p) >= magic_len; p++)
		if (memcmp(p, magic, magic_len) == 0)
			return (unsigned long)p + magic_len + page_offset;

	/* No system call failed here, so don't append an errno string. */
	errx(1, "Is this image a genuine lguest?");
}
113
/* Map the PT_LOAD segments of an ELF kernel at their physical addresses.
 *
 * @elf_fd: the open kernel image; @ehdr: its already-read ELF header.
 * @page_offset: set to p_vaddr - p_paddr, which must agree across all
 * loadable segments.
 * Returns the entry point found by entry_point(); exits on any error. */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
			     unsigned long *page_offset)
{
	unsigned int i;
	unsigned long start = -1UL, end = 0;

	/* Sanity checks.  These run before e_phnum is used to size the
	 * program header array below. */
	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	Elf32_Phdr phdr[ehdr->e_phnum];

	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
		err(1, "Reading program headers");

	*page_offset = 0;
	/* We map the loadable segments at virtual addresses corresponding
	 * to their physical addresses (our virtual == guest physical). */
	for (i = 0; i < ehdr->e_phnum; i++) {
		void *want, *addr;

		if (phdr[i].p_type != PT_LOAD)
			continue;

		want = (void *)(unsigned long)phdr[i].p_paddr;
		verbose("Section %i: size %i addr %p\n",
			i, phdr[i].p_memsz, want);

		/* We expect linear address space. */
		if (!*page_offset)
			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
			errx(1, "Page offset of section %i different", i);

		/* Track the lowest and highest file-backed addresses: that
		 * is the range later searched for the entry marker. */
		if (phdr[i].p_paddr < start)
			start = phdr[i].p_paddr;
		if (phdr[i].p_paddr + phdr[i].p_filesz > end)
			end = phdr[i].p_paddr + phdr[i].p_filesz;

		/* We map everything private, writable. */
		addr = mmap(want,
			    phdr[i].p_filesz,
			    PROT_READ|PROT_WRITE|PROT_EXEC,
			    MAP_FIXED|MAP_PRIVATE,
			    elf_fd, phdr[i].p_offset);
		if (addr != want)
			err(1, "Mmaping vmlinux seg %i gave %p not %p",
			    i, addr, want);
	}

	return entry_point((void *)start, (void *)end, *page_offset);
}
169
/* Guess the page offset of an unpacked kernel image.
 *
 * Each 0xA1 byte is taken as a "mov 0xXXXXXXXX,%eax" opcode and the byte
 * four positions later as the top byte of its address.  The first top byte
 * seen more than three times wins and is returned shifted into bits 24-31.
 * Exits if no byte gets that far.  This is amazingly reliable. */
static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
{
	unsigned int votes[256] = { 0 };
	unsigned int pos = 0;

	while (pos + 4 < len) {
		/* mov 0xXXXXXXXX,%eax */
		if (img[pos] == 0xA1) {
			unsigned char top = img[pos + 4];

			votes[top]++;
			if (votes[top] > 3)
				return (unsigned long)top << 24;
		}
		pos++;
	}
	errx(1, "could not determine page offset");
}
182
183static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
184{
185 gzFile f;
186 int ret, len = 0;
187 void *img = (void *)0x100000;
188
189 f = gzdopen(fd, "rb");
190 while ((ret = gzread(f, img + len, 65536)) > 0)
191 len += ret;
192 if (ret < 0)
193 err(1, "reading image from bzImage");
194
195 verbose("Unpacked size %i addr %p\n", len, img);
196 *page_offset = intuit_page_offset(img, len);
197
198 return entry_point(img, img + len, *page_offset);
199}
200
/* Scan @fd byte by byte for a gzip header (magic 0x1F 0x8B whose tenth byte,
 * the OS field, is 0x03) and unpack the stream that starts there.
 *
 * @page_offset is filled in by unpack_bzimage().  Returns the entry point;
 * exits if no suitable header is found. */
static unsigned long load_bzimage(int fd, unsigned long *page_offset)
{
	unsigned char c;
	int state = 0;	/* number of header bytes matched so far */

	/* Ugly brute force search for gzip header. */
	while (read(fd, &c, 1) == 1) {
		switch (state) {
		case 0:
			if (c == 0x1F)
				state++;
			break;
		case 1:
			if (c == 0x8B)
				state++;
			else if (c != 0x1F)
				/* A repeated 0x1F may itself start the magic,
				 * so only other bytes reset the match. */
				state = 0;
			break;
		case 9:
			if (c == 0x03) { /* Compressed under UNIX. */
				/* Rewind to the start of the header. */
				if (lseek(fd, -10, SEEK_CUR) < 0)
					err(1, "Seeking back to gzip header");
				return unpack_bzimage(fd, page_offset);
			}
			/* Not the stream we want: carry on from the byte
			 * after the 0x1F that began this candidate. */
			if (lseek(fd, -9, SEEK_CUR) < 0)
				err(1, "Seeking past false gzip header");
			state = 0;
			break;
		default:
			/* States 2..8: header bytes we do not inspect. */
			state++;
			break;
		}
	}
	errx(1, "Could not find kernel in bzImage");
}
232
/* Load the kernel image open on @fd: an ELF file goes through map_elf(),
 * anything else is treated as a bzImage.
 *
 * @page_offset is filled in by the loader used.  Returns the entry point. */
static unsigned long load_kernel(int fd, unsigned long *page_offset)
{
	Elf32_Ehdr hdr;
	ssize_t got = read(fd, &hdr, sizeof(hdr));

	if (got != sizeof(hdr))
		err(1, "Reading kernel");

	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) != 0)
		return load_bzimage(fd, page_offset);

	return map_elf(fd, &hdr, page_offset);
}
245
/* Round @addr up to the next multiple of the system page size. */
static inline unsigned long page_align(unsigned long addr)
{
	unsigned long mask = getpagesize() - 1;

	return (addr + mask) & ~mask;
}
250
/* Map the initrd file @name so that it ends at @mem, the top of guest
 * memory.  Returns its length rounded up to whole pages; exits on error. */
static unsigned long load_initrd(const char *name, unsigned long mem)
{
	struct stat st;
	unsigned long len;
	void *want, *iaddr;
	int ifd = open_or_die(name, O_RDONLY);

	if (fstat(ifd, &st) < 0)
		err(1, "fstat() on initrd '%s'", name);

	len = page_align(st.st_size);
	want = (void *)(mem - len);

	/* Private mapping: the file itself is never modified. */
	iaddr = mmap(want, st.st_size,
		     PROT_READ|PROT_EXEC|PROT_WRITE,
		     MAP_FIXED|MAP_PRIVATE, ifd, 0);
	if (iaddr != want)
		err(1, "Mmaping initrd '%s' returned %p not %p",
		    name, iaddr, want);
	close(ifd);
	verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
	return len;
}
274
275static unsigned long setup_pagetables(unsigned long mem,
276 unsigned long initrd_size,
277 unsigned long page_offset)
278{
279 u32 *pgdir, *linear;
280 unsigned int mapped_pages, i, linear_pages;
281 unsigned int ptes_per_page = getpagesize()/sizeof(u32);
282
283 /* If we can map all of memory above page_offset, we do so. */
284 if (mem <= -page_offset)
285 mapped_pages = mem/getpagesize();
286 else
287 mapped_pages = -page_offset/getpagesize();
288
289 /* Each linear PTE page can map ptes_per_page pages. */
290 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
291
292 /* We lay out top-level then linear mapping immediately below initrd */
293 pgdir = (void *)mem - initrd_size - getpagesize();
294 linear = (void *)pgdir - linear_pages*getpagesize();
295
296 for (i = 0; i < mapped_pages; i++)
297 linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
298
299 /* Now set up pgd so that this memory is at page_offset */
300 for (i = 0; i < mapped_pages; i += ptes_per_page) {
301 pgdir[(i + page_offset/getpagesize())/ptes_per_page]
302 = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
303 }
304
305 verbose("Linear mapping of %u pages in %u pte pages at %p\n",
306 mapped_pages, linear_pages, linear);
307
308 return (unsigned long)pgdir;
309}
310
/* Join the NULL-terminated string array @args into @dst, writing a single
 * space after every element (including the last).  An empty array leaves
 * @dst as the empty string.  @dst must be large enough; no bound is checked. */
static void concat(char *dst, char *args[])
{
	char *out = dst;
	char **arg;

	for (arg = args; *arg; arg++) {
		size_t n = strlen(*arg);

		memcpy(out, *arg, n);
		out[n] = ' ';
		out += n + 1;
	}
	/* In case it's empty. */
	*out = '\0';
}
323
324static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
325{
326 u32 args[] = { LHREQ_INITIALIZE,
327 LGUEST_GUEST_TOP/getpagesize(), /* Just below us */
328 pgdir, start, page_offset };
329 int fd;
330
331 fd = open_or_die("/dev/lguest", O_RDWR);
332 if (write(fd, args, sizeof(args)) < 0)
333 err(1, "Writing to /dev/lguest");
334 return fd;
335}
336
337static void set_fd(int fd, struct device_list *devices)
338{
339 FD_SET(fd, &devices->infds);
340 if (fd > devices->max_infd)
341 devices->max_infd = fd;
342}
343
344/* When input arrives, we tell the kernel to kick lguest out with -EAGAIN. */
345static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices)
346{
347 set_fd(pipefd, devices);
348
349 for (;;) {
350 fd_set rfds = devices->infds;
351 u32 args[] = { LHREQ_BREAK, 1 };
352
353 select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
354 if (FD_ISSET(pipefd, &rfds)) {
355 int ignorefd;
356 if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0)
357 exit(0);
358 FD_CLR(ignorefd, &devices->infds);
359 } else
360 write(lguest_fd, args, sizeof(args));
361 }
362}
363
/* Fork the waker process, which runs wake_parent() and never returns.
 *
 * Returns, in the parent, the write end of the pipe used to talk to the
 * waker.  Exits if the pipe or the fork cannot be created. */
static int setup_waker(int lguest_fd, struct device_list *device_list)
{
	int pipefd[2], child;

	if (pipe(pipefd) != 0)
		err(1, "creating waker pipe");

	child = fork();
	if (child == -1)
		err(1, "forking");

	if (child == 0) {
		/* The waker only reads from the pipe. */
		close(pipefd[1]);
		wake_parent(pipefd[0], lguest_fd, device_list);
	}
	/* The parent only writes to it. */
	close(pipefd[0]);

	return pipefd[1];
}
381
/* Validate a Guest-supplied buffer of @size bytes at @addr.
 *
 * The whole buffer must lie below LGUEST_GUEST_TOP; otherwise the launcher
 * exits, reporting @line (the caller's source line).  Returns @addr as a
 * pointer. */
static void *_check_pointer(unsigned long addr, unsigned int size,
			    unsigned int line)
{
	/* Compare size against the room left below the top instead of
	 * computing addr + size, which could wrap around and look valid. */
	if (addr >= LGUEST_GUEST_TOP || size >= LGUEST_GUEST_TOP - addr)
		errx(1, "%s:%u: Invalid address %#lx", __FILE__, line, addr);
	return (void *)addr;
}
#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
390
391/* Returns pointer to dma->used_len */
392static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
393{
394 unsigned int i;
395 struct lguest_dma *udma;
396
397 udma = check_pointer(dma, sizeof(*udma));
398 for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
399 if (!udma->len[i])
400 break;
401
402 iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]);
403 iov[i].iov_len = udma->len[i];
404 }
405 *num = i;
406 return &udma->used_len;
407}
408
409static u32 *get_dma_buffer(int fd, void *key,
410 struct iovec iov[], unsigned int *num, u32 *irq)
411{
412 u32 buf[] = { LHREQ_GETDMA, (u32)key };
413 unsigned long udma;
414 u32 *res;
415
416 udma = write(fd, buf, sizeof(buf));
417 if (udma == (unsigned long)-1)
418 return NULL;
419
420 /* Kernel stashes irq in ->used_len. */
421 res = dma2iov(udma, iov, num);
422 *irq = *res;
423 return res;
424}
425
426static void trigger_irq(int fd, u32 irq)
427{
428 u32 buf[] = { LHREQ_IRQ, irq };
429 if (write(fd, buf, sizeof(buf)) != 0)
430 err(1, "Triggering irq %i", irq);
431}
432
/* Point @iov at a single 1024-byte scratch buffer and set @num to 1, so
 * input can still be read (and thrown away) when there is no DMA buffer. */
static void discard_iovec(struct iovec *iov, unsigned int *num)
{
	static char discard_buf[1024];

	iov[0] = (struct iovec){ .iov_base = discard_buf,
				 .iov_len = sizeof(discard_buf) };
	*num = 1;
}
440
/* Terminal settings of stdin as saved by setup_console(). */
static struct termios orig_term;

/* Put stdin's terminal settings back to the saved ones, immediately. */
static void restore_term(void)
{
	struct termios *saved = &orig_term;

	tcsetattr(STDIN_FILENO, TCSANOW, saved);
}
446
/* State for the console's "three ^C in a row" exit sequence. */
struct console_abort {
	/* Consecutive ^C characters seen so far. */
	int count;
	/* Time at which the first of them arrived. */
	struct timeval start;
};
452
453/* We DMA input to buffer bound at start of console page. */
454static bool handle_console_input(int fd, struct device *dev)
455{
456 u32 irq = 0, *lenp;
457 int len;
458 unsigned int num;
459 struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
460 struct console_abort *abort = dev->priv;
461
462 lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
463 if (!lenp) {
464 warn("console: no dma buffer!");
465 discard_iovec(iov, &num);
466 }
467
468 len = readv(dev->fd, iov, num);
469 if (len <= 0) {
470 warnx("Failed to get console input, ignoring console.");
471 len = 0;
472 }
473
474 if (lenp) {
475 *lenp = len;
476 trigger_irq(fd, irq);
477 }
478
479 /* Three ^C within one second? Exit. */
480 if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) {
481 if (!abort->count++)
482 gettimeofday(&abort->start, NULL);
483 else if (abort->count == 3) {
484 struct timeval now;
485 gettimeofday(&now, NULL);
486 if (now.tv_sec <= abort->start.tv_sec+1) {
487 /* Make sure waker is not blocked in BREAK */
488 u32 args[] = { LHREQ_BREAK, 0 };
489 close(waker_fd);
490 write(fd, args, sizeof(args));
491 exit(2);
492 }
493 abort->count = 0;
494 }
495 } else
496 abort->count = 0;
497
498 if (!len) {
499 restore_term();
500 return false;
501 }
502 return true;
503}
504
505static u32 handle_console_output(int fd, const struct iovec *iov,
506 unsigned num, struct device*dev)
507{
508 return writev(STDOUT_FILENO, iov, num);
509}
510
511static u32 handle_tun_output(int fd, const struct iovec *iov,
512 unsigned num, struct device *dev)
513{
514 /* Now we've seen output, we should warn if we can't get buffers. */
515 *(bool *)dev->priv = true;
516 return writev(dev->fd, iov, num);
517}
518
/* Byte offset associated with peer number @peernum: four bytes per peer. */
static unsigned long peer_offset(unsigned int peernum)
{
	unsigned int bytes = peernum << 2;

	return bytes;
}
523
524static bool handle_tun_input(int fd, struct device *dev)
525{
526 u32 irq = 0, *lenp;
527 int len;
528 unsigned num;
529 struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
530
531 lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
532 &irq);
533 if (!lenp) {
534 if (*(bool *)dev->priv)
535 warn("network: no dma buffer!");
536 discard_iovec(iov, &num);
537 }
538
539 len = readv(dev->fd, iov, num);
540 if (len <= 0)
541 err(1, "reading network");
542 if (lenp) {
543 *lenp = len;
544 trigger_irq(fd, irq);
545 }
546 verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
547 ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
548 lenp ? "sent" : "discarded");
549 return true;
550}
551
552static u32 handle_block_output(int fd, const struct iovec *iov,
553 unsigned num, struct device *dev)
554{
555 struct lguest_block_page *p = dev->mem;
556 u32 irq, *lenp;
557 unsigned int len, reply_num;
558 struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
559 off64_t device_len, off = (off64_t)p->sector * 512;
560
561 device_len = *(off64_t *)dev->priv;
562
563 if (off >= device_len)
564 err(1, "Bad offset %llu vs %llu", off, device_len);
565 if (lseek64(dev->fd, off, SEEK_SET) != off)
566 err(1, "Bad seek to sector %i", p->sector);
567
568 verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
569
570 lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
571 if (!lenp)
572 err(1, "Block request didn't give us a dma buffer");
573
574 if (p->type) {
575 len = writev(dev->fd, iov, num);
576 if (off + len > device_len) {
577 ftruncate(dev->fd, device_len);
578 errx(1, "Write past end %llu+%u", off, len);
579 }
580 *lenp = 0;
581 } else {
582 len = readv(dev->fd, reply, reply_num);
583 *lenp = len;
584 }
585
586 p->result = 1 + (p->bytes != len);
587 trigger_irq(fd, irq);
588 return 0;
589}
590
591static void handle_output(int fd, unsigned long dma, unsigned long key,
592 struct device_list *devices)
593{
594 struct device *i;
595 u32 *lenp;
596 struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
597 unsigned num = 0;
598
599 lenp = dma2iov(dma, iov, &num);
600 for (i = devices->dev; i; i = i->next) {
601 if (i->handle_output && key == i->watch_key) {
602 *lenp = i->handle_output(fd, iov, num, i);
603 return;
604 }
605 }
606 warnx("Pending dma %p, key %p", (void *)dma, (void *)key);
607}
608
609static void handle_input(int fd, struct device_list *devices)
610{
611 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
612
613 for (;;) {
614 struct device *i;
615 fd_set fds = devices->infds;
616
617 if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0)
618 break;
619
620 for (i = devices->dev; i; i = i->next) {
621 if (i->handle_input && FD_ISSET(i->fd, &fds)) {
622 if (!i->handle_input(fd, i)) {
623 FD_CLR(i->fd, &devices->infds);
624 /* Tell waker to ignore it too... */
625 write(waker_fd, &i->fd, sizeof(i->fd));
626 }
627 }
628 }
629 }
630}
631
632static struct lguest_device_desc *new_dev_desc(u16 type, u16 features,
633 u16 num_pages)
634{
635 static unsigned long top = LGUEST_GUEST_TOP;
636 struct lguest_device_desc *desc;
637
638 desc = malloc(sizeof(*desc));
639 desc->type = type;
640 desc->num_pages = num_pages;
641 desc->features = features;
642 desc->status = 0;
643 if (num_pages) {
644 top -= num_pages*getpagesize();
645 map_zeroed_pages(top, num_pages);
646 desc->pfn = top / getpagesize();
647 } else
648 desc->pfn = 0;
649 return desc;
650}
651
652static struct device *new_device(struct device_list *devices,
653 u16 type, u16 num_pages, u16 features,
654 int fd,
655 bool (*handle_input)(int, struct device *),
656 unsigned long watch_off,
657 u32 (*handle_output)(int,
658 const struct iovec *,
659 unsigned,
660 struct device *))
661{
662 struct device *dev = malloc(sizeof(*dev));
663
664 /* Append to device list. */
665 *devices->lastdev = dev;
666 dev->next = NULL;
667 devices->lastdev = &dev->next;
668
669 dev->fd = fd;
670 if (handle_input)
671 set_fd(dev->fd, devices);
672 dev->desc = new_dev_desc(type, features, num_pages);
673 dev->mem = (void *)(dev->desc->pfn * getpagesize());
674 dev->handle_input = handle_input;
675 dev->watch_key = (unsigned long)dev->mem + watch_off;
676 dev->handle_output = handle_output;
677 return dev;
678}
679
680static void setup_console(struct device_list *devices)
681{
682 struct device *dev;
683
684 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
685 struct termios term = orig_term;
686 term.c_lflag &= ~(ISIG|ICANON|ECHO);
687 tcsetattr(STDIN_FILENO, TCSANOW, &term);
688 atexit(restore_term);
689 }
690
691 /* We don't currently require a page for the console. */
692 dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
693 STDIN_FILENO, handle_console_input,
694 LGUEST_CONSOLE_DMA_KEY, handle_console_output);
695 dev->priv = malloc(sizeof(struct console_abort));
696 ((struct console_abort *)dev->priv)->count = 0;
697 verbose("device %p: console\n",
698 (void *)(dev->desc->pfn * getpagesize()));
699}
700
701static void setup_block_file(const char *filename, struct device_list *devices)
702{
703 int fd;
704 struct device *dev;
705 off64_t *device_len;
706 struct lguest_block_page *p;
707
708 fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
709 dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
710 LGUEST_DEVICE_F_RANDOMNESS,
711 fd, NULL, 0, handle_block_output);
712 device_len = dev->priv = malloc(sizeof(*device_len));
713 *device_len = lseek64(fd, 0, SEEK_END);
714 p = dev->mem;
715
716 p->num_sectors = *device_len/512;
717 verbose("device %p: block %i sectors\n",
718 (void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
719}
720
721/* We use fnctl locks to reserve network slots (autocleanup!) */
722static unsigned int find_slot(int netfd, const char *filename)
723{
724 struct flock fl;
725
726 fl.l_type = F_WRLCK;
727 fl.l_whence = SEEK_SET;
728 fl.l_len = 1;
729 for (fl.l_start = 0;
730 fl.l_start < getpagesize()/sizeof(struct lguest_net);
731 fl.l_start++) {
732 if (fcntl(netfd, F_SETLK, &fl) == 0)
733 return fl.l_start;
734 }
735 errx(1, "No free slots in network file %s", filename);
736}
737
738static void setup_net_file(const char *filename,
739 struct device_list *devices)
740{
741 int netfd;
742 struct device *dev;
743
744 netfd = open(filename, O_RDWR, 0);
745 if (netfd < 0) {
746 if (errno == ENOENT) {
747 netfd = open(filename, O_RDWR|O_CREAT, 0600);
748 if (netfd >= 0) {
749 char page[getpagesize()];
750 memset(page, 0, sizeof(page));
751 write(netfd, page, sizeof(page));
752 }
753 }
754 if (netfd < 0)
755 err(1, "cannot open net file '%s'", filename);
756 }
757
758 dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
759 find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
760 -1, NULL, 0, NULL);
761
762 /* We overwrite the /dev/zero mapping with the actual file. */
763 if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
764 MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
765 err(1, "could not mmap '%s'", filename);
766 verbose("device %p: shared net %s, peer %i\n",
767 (void *)(dev->desc->pfn * getpagesize()), filename,
768 dev->desc->features & ~LGUEST_NET_F_NOCSUM);
769}
770
/* Parse a dotted-quad IPv4 address such as "192.168.19.1".
 *
 * Returns the address in host byte order with the first number in the top
 * byte.  Exits if @ipaddr does not start with four dot-separated numbers in
 * the range 0-255. */
static u32 str2ip(const char *ipaddr)
{
	unsigned int byte[4];

	/* Without all four fields some of byte[] would be used uninitialized. */
	if (sscanf(ipaddr, "%u.%u.%u.%u",
		   &byte[0], &byte[1], &byte[2], &byte[3]) != 4
	    || byte[0] > 255 || byte[1] > 255
	    || byte[2] > 255 || byte[3] > 255)
		errx(1, "Bad IP address '%s'", ipaddr);

	return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
}
778
779/* adapted from libbridge */
780static void add_to_bridge(int fd, const char *if_name, const char *br_name)
781{
782 int ifidx;
783 struct ifreq ifr;
784
785 if (!*br_name)
786 errx(1, "must specify bridge name");
787
788 ifidx = if_nametoindex(if_name);
789 if (!ifidx)
790 errx(1, "interface %s does not exist!", if_name);
791
792 strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
793 ifr.ifr_ifindex = ifidx;
794 if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
795 err(1, "can't add %s to bridge %s", if_name, br_name);
796}
797
798static void configure_device(int fd, const char *devname, u32 ipaddr,
799 unsigned char hwaddr[6])
800{
801 struct ifreq ifr;
802 struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
803
804 memset(&ifr, 0, sizeof(ifr));
805 strcpy(ifr.ifr_name, devname);
806 sin->sin_family = AF_INET;
807 sin->sin_addr.s_addr = htonl(ipaddr);
808 if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
809 err(1, "Setting %s interface address", devname);
810 ifr.ifr_flags = IFF_UP;
811 if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
812 err(1, "Bringing interface %s up", devname);
813
814 if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
815 err(1, "getting hw address for %s", devname);
816
817 memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
818}
819
820static void setup_tun_net(const char *arg, struct device_list *devices)
821{
822 struct device *dev;
823 struct ifreq ifr;
824 int netfd, ipfd;
825 u32 ip;
826 const char *br_name = NULL;
827
828 netfd = open_or_die("/dev/net/tun", O_RDWR);
829 memset(&ifr, 0, sizeof(ifr));
830 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
831 strcpy(ifr.ifr_name, "tap%d");
832 if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
833 err(1, "configuring /dev/net/tun");
834 ioctl(netfd, TUNSETNOCSUM, 1);
835
836 /* You will be peer 1: we should create enough jitter to randomize */
837 dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
838 NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
839 handle_tun_input, peer_offset(0), handle_tun_output);
840 dev->priv = malloc(sizeof(bool));
841 *(bool *)dev->priv = false;
842
843 ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
844 if (ipfd < 0)
845 err(1, "opening IP socket");
846
847 if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
848 ip = INADDR_ANY;
849 br_name = arg + strlen(BRIDGE_PFX);
850 add_to_bridge(ipfd, ifr.ifr_name, br_name);
851 } else
852 ip = str2ip(arg);
853
854 /* We are peer 0, ie. first slot. */
855 configure_device(ipfd, ifr.ifr_name, ip, dev->mem);
856
857 /* Set "promisc" bit: we want every single packet. */
858 *((u8 *)dev->mem) |= 0x1;
859
860 close(ipfd);
861
862 verbose("device %p: tun net %u.%u.%u.%u\n",
863 (void *)(dev->desc->pfn * getpagesize()),
864 (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip);
865 if (br_name)
866 verbose("attached to bridge: %s\n", br_name);
867}
868
869/* Now we know how much memory we have, we copy in device descriptors */
870static void map_device_descriptors(struct device_list *devs, unsigned long mem)
871{
872 struct device *i;
873 unsigned int num;
874 struct lguest_device_desc *descs;
875
876 /* Device descriptor array sits just above top of normal memory */
877 descs = map_zeroed_pages(mem, 1);
878
879 for (i = devs->dev, num = 0; i; i = i->next, num++) {
880 if (num == LGUEST_MAX_DEVICES)
881 errx(1, "too many devices");
882 verbose("Device %i: %s\n", num,
883 i->desc->type == LGUEST_DEVICE_T_NET ? "net"
884 : i->desc->type == LGUEST_DEVICE_T_CONSOLE ? "console"
885 : i->desc->type == LGUEST_DEVICE_T_BLOCK ? "block"
886 : "unknown");
887 descs[num] = *i->desc;
888 free(i->desc);
889 i->desc = &descs[num];
890 }
891}
892
893static void __attribute__((noreturn))
894run_guest(int lguest_fd, struct device_list *device_list)
895{
896 for (;;) {
897 u32 args[] = { LHREQ_BREAK, 0 };
898 unsigned long arr[2];
899 int readval;
900
901 /* We read from the /dev/lguest device to run the Guest. */
902 readval = read(lguest_fd, arr, sizeof(arr));
903
904 if (readval == sizeof(arr)) {
905 handle_output(lguest_fd, arr[0], arr[1], device_list);
906 continue;
907 } else if (errno == ENOENT) {
908 char reason[1024] = { 0 };
909 read(lguest_fd, reason, sizeof(reason)-1);
910 errx(1, "%s", reason);
911 } else if (errno != EAGAIN)
912 err(1, "Running guest failed");
913 handle_input(lguest_fd, device_list);
914 if (write(lguest_fd, args, sizeof(args)) < 0)
915 err(1, "Resetting break");
916 }
917}
918
/* Long options accepted by the launcher; each maps to the short code that
 * main() switches on.  Only --verbose takes no argument. */
static struct option opts[] = {
	{ .name = "verbose",  .has_arg = no_argument,       .flag = NULL, .val = 'v' },
	{ .name = "sharenet", .has_arg = required_argument, .flag = NULL, .val = 's' },
	{ .name = "tunnet",   .has_arg = required_argument, .flag = NULL, .val = 't' },
	{ .name = "block",    .has_arg = required_argument, .flag = NULL, .val = 'b' },
	{ .name = "initrd",   .has_arg = required_argument, .flag = NULL, .val = 'i' },
	{ .name = NULL },
};
/* Print the command-line synopsis and exit with status 1. */
static void usage(void)
{
	static const char synopsis[] =
		"Usage: lguest [--verbose] "
		"[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
		"|--block=<filename>|--initrd=<filename>]...\n"
		"<mem-in-mb> vmlinux [args...]";

	errx(1, "%s", synopsis);
}
934
935int main(int argc, char *argv[])
936{
937 unsigned long mem, pgdir, start, page_offset, initrd_size = 0;
938 int c, lguest_fd;
939 struct device_list device_list;
940 void *boot = (void *)0;
941 const char *initrd_name = NULL;
942
943 device_list.max_infd = -1;
944 device_list.dev = NULL;
945 device_list.lastdev = &device_list.dev;
946 FD_ZERO(&device_list.infds);
947
948 while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
949 switch (c) {
950 case 'v':
951 verbose = true;
952 break;
953 case 's':
954 setup_net_file(optarg, &device_list);
955 break;
956 case 't':
957 setup_tun_net(optarg, &device_list);
958 break;
959 case 'b':
960 setup_block_file(optarg, &device_list);
961 break;
962 case 'i':
963 initrd_name = optarg;
964 break;
965 default:
966 warnx("Unknown argument %s", argv[optind]);
967 usage();
968 }
969 }
970 if (optind + 2 > argc)
971 usage();
972
973 /* We need a console device */
974 setup_console(&device_list);
975
976 /* First we map /dev/zero over all of guest-physical memory. */
977 mem = atoi(argv[optind]) * 1024 * 1024;
978 map_zeroed_pages(0, mem / getpagesize());
979
980 /* Now we load the kernel */
981 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
982 &page_offset);
983
984 /* Write the device descriptors into memory. */
985 map_device_descriptors(&device_list, mem);
986
987 /* Map the initrd image if requested */
988 if (initrd_name) {
989 initrd_size = load_initrd(initrd_name, mem);
990 *(unsigned long *)(boot+0x218) = mem - initrd_size;
991 *(unsigned long *)(boot+0x21c) = initrd_size;
992 *(unsigned char *)(boot+0x210) = 0xFF;
993 }
994
995 /* Set up the initial linar pagetables. */
996 pgdir = setup_pagetables(mem, initrd_size, page_offset);
997
998 /* E820 memory map: ours is a simple, single region. */
999 *(char*)(boot+E820NR) = 1;
1000 *((struct e820entry *)(boot+E820MAP))
1001 = ((struct e820entry) { 0, mem, E820_RAM });
1002 /* Command line pointer and command line (at 4096) */
1003 *(void **)(boot + 0x228) = boot + 4096;
1004 concat(boot + 4096, argv+optind+2);
1005 /* Paravirt type: 1 == lguest */
1006 *(int *)(boot + 0x23c) = 1;
1007
1008 lguest_fd = tell_kernel(pgdir, start, page_offset);
1009 waker_fd = setup_waker(lguest_fd, &device_list);
1010
1011 run_guest(lguest_fd, &device_list);
1012}
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
new file mode 100644
index 00000000000..821617bd6c0
--- /dev/null
+++ b/Documentation/lguest/lguest.txt
@@ -0,0 +1,129 @@
1Rusty's Remarkably Unreliable Guide to Lguest
2 - or, A Young Coder's Illustrated Hypervisor
3http://lguest.ozlabs.org
4
5Lguest is designed to be a minimal hypervisor for the Linux kernel, for
6Linux developers and users to experiment with virtualization with the
7minimum of complexity. Nonetheless, it should have sufficient
8features to make it useful for specific tasks, and, of course, you are
9encouraged to fork and enhance it.
10
11Features:
12
13- Kernel module which runs in a normal kernel.
14- Simple I/O model for communication.
15- Simple program to create new guests.
16- Logo contains cute puppies: http://lguest.ozlabs.org
17
18Developer features:
19
20- Fun to hack on.
21- No ABI: being tied to a specific kernel anyway, you can change anything.
22- Many opportunities for improvement or feature implementation.
23
24Running Lguest:
25
26- Lguest runs the same kernel as guest and host. You can configure
27 them differently, but usually it's easiest not to.
28
29 You will need to configure your kernel with the following options:
30
31 CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1]
32 CONFIG_TUN=y/m ("Universal TUN/TAP device driver support")
33 CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers")
34 CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)")
35 CONFIG_LGUEST=y/m ("Linux hypervisor example code")
36
37 and I recommend:
38 CONFIG_HZ=100 ("Timer frequency")[2]
39
40- A tool called "lguest" is available in this directory: type "make"
41 to build it. If you didn't build your kernel in-tree, use "make
42 O=<builddir>".
43
44- Create or find a root disk image. There are several useful ones
45 around, such as the xm-test tiny root image at
46 http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
47
48 For more serious work, I usually use a distribution ISO image and
49 install it under qemu, then make multiple copies:
50
51 dd if=/dev/zero of=rootfile bs=1M count=2048
52 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
53
54- "modprobe lg" if you built it as a module.
55
56- Run an lguest as root:
57
58 Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba
59
60 Explanation:
61 64m: the amount of memory to use.
62
63 vmlinux: the kernel image found in the top of your build directory. You
64 can also use a standard bzImage.
65
66 --tunnet=192.168.19.1: configures a "tap" device for networking with this
67 IP address.
68
69 --block=rootfile: a file or block device which becomes /dev/lgba
70 inside the guest.
71
72 root=/dev/lgba: this (and anything else on the command line) are
73 kernel boot parameters.
74
75- Configuring networking. I usually have the host masquerade, using
76 "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 >
77 /proc/sys/net/ipv4/ip_forward". In this example, I would configure
78 eth0 inside the guest at 192.168.19.2.
79
80 Another method is to bridge the tap device to an external interface
81 using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
82 to obtain an IP address. The bridge needs to be configured first:
83 this option simply adds the tap interface to it.
84
85 A simple example on my system:
86
87 ifconfig eth0 0.0.0.0
88 brctl addbr lg0
89 ifconfig lg0 up
90 brctl addif lg0 eth0
91 dhclient lg0
92
93 Then use --tunnet=bridge:lg0 when launching the guest.
94
95 See http://linux-net.osdl.org/index.php/Bridge for general information
96 on how to get bridging working.
97
98- You can also create an inter-guest network using
99 "--sharenet=<filename>": any two guests using the same file are on
100 the same network. This file is created if it does not exist.
101
102Lguest I/O model:
103
104Lguest uses a simplified DMA model plus shared memory for I/O. Guests
105can communicate with each other if they share underlying memory
106(usually by the lguest program mmaping the same file), but they can
107use any non-shared memory to communicate with the lguest process.
108
109Guests can register DMA buffers at any key (must be a valid physical
110address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq)
111hypercall. "dmabufs" is the physical address of an array of "num"
112"struct lguest_dma": each contains a used_len, and an array of
113physical addresses and lengths. When a transfer occurs, the
114"used_len" field of one of the buffers which has used_len 0 will be
115set to the length transferred and the irq will fire.
116
117Using an irq value of 0 unbinds the dma buffers.
118
119To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used,
120and the number of bytes used is written to the used_len field. This can be 0 if
121no one else has bound a DMA buffer to that key, or if some other error occurred.
122DMA buffers bound by the same guest are ignored.
123
124Cheers!
125Rusty Russell rusty@rustcorp.com.au.
126
127[1] These are in various places on the TODO list, waiting for you to
128 get annoyed enough at the limitation to fix it.
129[2] Lguest is not yet tickless when idle. See [1].