diff options
-rw-r--r-- | Documentation/lguest/Makefile | 27 | ||||
-rw-r--r-- | Documentation/lguest/lguest.c | 1012 | ||||
-rw-r--r-- | Documentation/lguest/lguest.txt | 129 |
3 files changed, 1168 insertions, 0 deletions
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile new file mode 100644 index 00000000000..b9b9427376e --- /dev/null +++ b/Documentation/lguest/Makefile | |||
@@ -0,0 +1,27 @@ | |||
# This creates the demonstration utility "lguest" which runs a Linux guest.

# For those people that have a separate object dir, look there for .config
KBUILD_OUTPUT := ../..
ifdef O
  ifeq ("$(origin O)", "command line")
    KBUILD_OUTPUT := $(O)
  endif
endif
# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
include $(KBUILD_OUTPUT)/.config
LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)

CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \
	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
LDLIBS:=-lz

# Neither of these names a file.
.PHONY: all clean

all: lguest.lds lguest

# CFLAGS hands lguest.lds to the linker, so the script must exist before
# lguest is linked (this matters for parallel builds).  The prerequisite is
# order-only so the script is not also passed to the compiler as an input.
lguest: | lguest.lds

# The linker script on x86 is so complex the only way of creating one
# which will link our binary in the right place is to mangle the
# default one.
lguest.lds:
	$(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@

clean:
	rm -f lguest.lds lguest
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c new file mode 100644 index 00000000000..1432b502a2d --- /dev/null +++ b/Documentation/lguest/lguest.c | |||
@@ -0,0 +1,1012 @@ | |||
1 | /* Simple program to layout "physical" memory for new lguest guest. | ||
2 | * Linked high to avoid likely physical memory. */ | ||
3 | #define _LARGEFILE64_SOURCE | ||
4 | #define _GNU_SOURCE | ||
5 | #include <stdio.h> | ||
6 | #include <string.h> | ||
7 | #include <unistd.h> | ||
8 | #include <err.h> | ||
9 | #include <stdint.h> | ||
10 | #include <stdlib.h> | ||
11 | #include <elf.h> | ||
12 | #include <sys/mman.h> | ||
13 | #include <sys/types.h> | ||
14 | #include <sys/stat.h> | ||
15 | #include <sys/wait.h> | ||
16 | #include <fcntl.h> | ||
17 | #include <stdbool.h> | ||
18 | #include <errno.h> | ||
19 | #include <ctype.h> | ||
20 | #include <sys/socket.h> | ||
21 | #include <sys/ioctl.h> | ||
22 | #include <sys/time.h> | ||
23 | #include <time.h> | ||
24 | #include <netinet/in.h> | ||
25 | #include <net/if.h> | ||
26 | #include <linux/sockios.h> | ||
27 | #include <linux/if_tun.h> | ||
28 | #include <sys/uio.h> | ||
29 | #include <termios.h> | ||
30 | #include <getopt.h> | ||
31 | #include <zlib.h> | ||
32 | typedef unsigned long long u64; | ||
33 | typedef uint32_t u32; | ||
34 | typedef uint16_t u16; | ||
35 | typedef uint8_t u8; | ||
36 | #include "../../include/linux/lguest_launcher.h" | ||
37 | #include "../../include/asm-i386/e820.h" | ||
38 | |||
/* Flag bits or'ed into the page table entries built by setup_pagetables(). */
#define PAGE_PRESENT 0x7	/* Present, RW, Execute */
/* Peer number given to the Guest on a tun network device. */
#define NET_PEERNUM 1
/* A tun argument starting with this names a bridge instead of an IP. */
#define BRIDGE_PFX "bridge:"
#ifndef SIOCBRADDIF
#define SIOCBRADDIF 0x89a2	/* add interface to bridge */
#endif

/* verbose(...) is printf(...) which only prints while this flag is true. */
static bool verbose;
#define verbose(args...)			\
	do {					\
		if (verbose)			\
			printf(args);		\
	} while (0)

/* Descriptor handle_input() writes fds to so the waker stops watching them. */
static int waker_fd;
50 | |||
/* Every device we have created, plus the fds we select() on for input. */
struct device_list
{
	/* Set of input fds, and the highest fd number in it (see set_fd()). */
	fd_set infds;
	int max_infd;

	/* Head of the singly linked device list. */
	struct device *dev;
	/* Where to store the next device: the tail's "next" pointer. */
	struct device **lastdev;
};
59 | |||
60 | struct device | ||
61 | { | ||
62 | struct device *next; | ||
63 | struct lguest_device_desc *desc; | ||
64 | void *mem; | ||
65 | |||
66 | /* Watch this fd if handle_input non-NULL. */ | ||
67 | int fd; | ||
68 | bool (*handle_input)(int fd, struct device *me); | ||
69 | |||
70 | /* Watch DMA to this key if handle_input non-NULL. */ | ||
71 | unsigned long watch_key; | ||
72 | u32 (*handle_output)(int fd, const struct iovec *iov, | ||
73 | unsigned int num, struct device *me); | ||
74 | |||
75 | /* Device-specific data. */ | ||
76 | void *priv; | ||
77 | }; | ||
78 | |||
/* open() @name with @flags; exits with a diagnostic instead of failing. */
static int open_or_die(const char *name, int flags)
{
	const int fd = open(name, flags);

	if (fd >= 0)
		return fd;
	err(1, "Failed to open %s", name);
}
86 | |||
/* Map @num private, zero-filled pages at exactly @addr; exits on failure.
 * Returns @addr as a pointer. */
static void *map_zeroed_pages(unsigned long addr, unsigned int num)
{
	/* /dev/zero is opened on first use and kept for later calls. */
	static int fd = -1;
	void *want = (void *)addr;
	void *got;

	if (fd == -1)
		fd = open_or_die("/dev/zero", O_RDONLY);

	got = mmap(want, getpagesize() * num,
		   PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE,
		   fd, 0);
	if (got != want)
		err(1, "Mmaping %u pages of /dev/zero @%p", num, want);
	return want;
}
100 | |||
/* Find magic string marking entry point, return entry point.
 *
 * Searches [@start, @end) for "GenuineLguest"; the entry point is the
 * address just past the string, plus @page_offset.  Exits if the string
 * is not there.  Only positions where the whole string fits before @end
 * are compared, so nothing beyond @end is ever read. */
static unsigned long entry_point(void *start, void *end,
				 unsigned long page_offset)
{
	static const char magic[] = "GenuineLguest";
	const size_t magic_len = sizeof(magic) - 1;
	const char *limit = end;
	const char *p;

	for (p = start; p < limit && (size_t)(limit - p) >= magic_len; p++)
		if (memcmp(p, magic, magic_len) == 0)
			return (long)p + magic_len + page_offset;

	/* Not a system call failure: errno has nothing useful to add. */
	errx(1, "Is this image a genuine lguest?");
}
113 | |||
/* Map the PT_LOAD segments of the ELF kernel open on @elf_fd at their
 * physical addresses.  @ehdr is the ELF header, already read by the caller.
 * Sets *@page_offset to the (common) difference between the segments'
 * virtual and physical addresses.  Returns the entry point; exits on any
 * error. */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
			     unsigned long *page_offset)
{
	void *addr;
	unsigned int i;
	unsigned long start = -1UL, end = 0;

	/* Sanity checks.  These come first: e_phnum sizes the array below. */
	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	Elf32_Phdr phdr[ehdr->e_phnum];

	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
		err(1, "Reading program headers");

	*page_offset = 0;
	/* We map the loadable segments at virtual addresses corresponding
	 * to their physical addresses (our virtual == guest physical). */
	for (i = 0; i < ehdr->e_phnum; i++) {
		if (phdr[i].p_type != PT_LOAD)
			continue;

		verbose("Section %u: size %u addr %p\n",
			i, (unsigned int)phdr[i].p_memsz,
			(void *)phdr[i].p_paddr);

		/* We expect linear address space. */
		if (!*page_offset)
			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
			errx(1, "Page offset of section %i different", i);

		/* Track the span of file-backed bytes: that is the region
		 * searched for the entry point marker. */
		if (phdr[i].p_paddr < start)
			start = phdr[i].p_paddr;
		if (phdr[i].p_paddr + phdr[i].p_filesz > end)
			end = phdr[i].p_paddr + phdr[i].p_filesz;

		/* We map everything private, writable. */
		addr = mmap((void *)phdr[i].p_paddr,
			    phdr[i].p_filesz,
			    PROT_READ|PROT_WRITE|PROT_EXEC,
			    MAP_FIXED|MAP_PRIVATE,
			    elf_fd, phdr[i].p_offset);
		if (addr != (void *)phdr[i].p_paddr)
			err(1, "Mmaping vmlinux seg %i gave %p not %p",
			    i, addr, (void *)phdr[i].p_paddr);
	}

	return entry_point((void *)start, (void *)end, *page_offset);
}
169 | |||
/* This is amazingly reliable.
 *
 * Guess the page offset of the @len byte image at @img: every 0xA1 byte
 * (mov 0xXXXXXXXX,%eax) votes for the byte four positions after it, and
 * the first byte value to collect four votes becomes the top byte of the
 * answer.  Exits if nothing gets that far. */
static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
{
	unsigned int votes[256] = { 0 };
	unsigned int i;

	for (i = 0; i + 4 < len; i++) {
		unsigned char top;

		if (img[i] != 0xA1)
			continue;

		top = img[i+4];
		votes[top]++;
		if (votes[top] > 3)
			return (unsigned long)top << 24;
	}
	errx(1, "could not determine page offset");
}
182 | |||
183 | static unsigned long unpack_bzimage(int fd, unsigned long *page_offset) | ||
184 | { | ||
185 | gzFile f; | ||
186 | int ret, len = 0; | ||
187 | void *img = (void *)0x100000; | ||
188 | |||
189 | f = gzdopen(fd, "rb"); | ||
190 | while ((ret = gzread(f, img + len, 65536)) > 0) | ||
191 | len += ret; | ||
192 | if (ret < 0) | ||
193 | err(1, "reading image from bzImage"); | ||
194 | |||
195 | verbose("Unpacked size %i addr %p\n", len, img); | ||
196 | *page_offset = intuit_page_offset(img, len); | ||
197 | |||
198 | return entry_point(img, img + len, *page_offset); | ||
199 | } | ||
200 | |||
/* Scan forward from the current position of @fd for a gzip header (magic
 * bytes 0x1F 0x8B, with 0x03 in the tenth byte), then unpack the stream
 * which starts there.  Sets *@page_offset and returns the entry point;
 * exits if no such header is found. */
static unsigned long load_bzimage(int fd, unsigned long *page_offset)
{
	unsigned char c;
	int state = 0;

	/* Ugly brute force search for gzip header.  "state" is the number
	 * of header bytes matched so far. */
	while (read(fd, &c, 1) == 1) {
		switch (state) {
		case 0:
			if (c == 0x1F)
				state++;
			break;
		case 1:
			if (c == 0x8B)
				state++;
			else if (c != 0x1F)
				/* A repeated 0x1F may still start a header. */
				state = 0;
			break;
		case 2 ... 8:
			state++;
			break;
		case 9:
			if (c == 0x03) { /* Compressed under UNIX. */
				/* Rewind to the start of the header. */
				if (lseek(fd, -10, SEEK_CUR) < 0)
					err(1, "Seeking in bzImage");
				return unpack_bzimage(fd, page_offset);
			}
			/* False alarm: resume the search at the byte after
			 * the 0x1F which fooled us, so a real header
			 * overlapping these ten bytes is not skipped. */
			if (lseek(fd, -9, SEEK_CUR) < 0)
				err(1, "Seeking in bzImage");
			state = 0;
			break;
		default:
			state = 0;
			break;
		}
	}
	errx(1, "Could not find kernel in bzImage");
}
232 | |||
/* Load the kernel open on @fd: an ELF image if it starts with the ELF
 * magic, otherwise it is treated as a bzImage.  Sets *@page_offset and
 * returns the entry point. */
static unsigned long load_kernel(int fd, unsigned long *page_offset)
{
	Elf32_Ehdr hdr;
	bool is_elf;

	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		err(1, "Reading kernel");

	is_elf = (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0);
	if (!is_elf)
		return load_bzimage(fd, page_offset);

	return map_elf(fd, &hdr, page_offset);
}
245 | |||
/* Round @addr up to the next multiple of the page size. */
static inline unsigned long page_align(unsigned long addr)
{
	const unsigned long mask = getpagesize() - 1;

	return (addr + mask) & ~mask;
}
250 | |||
/* initrd gets loaded at top of memory: return length.
 *
 * The file @name is mapped so that it ends at @mem; the length returned
 * is the file size rounded up to whole pages.  Exits on failure. */
static unsigned long load_initrd(const char *name, unsigned long mem)
{
	struct stat st;
	unsigned long len;
	void *want, *iaddr;
	int ifd = open_or_die(name, O_RDONLY);

	if (fstat(ifd, &st) < 0)
		err(1, "fstat() on initrd '%s'", name);

	len = page_align(st.st_size);
	want = (void *)(mem - len);
	iaddr = mmap(want, st.st_size,
		     PROT_READ|PROT_EXEC|PROT_WRITE,
		     MAP_FIXED|MAP_PRIVATE, ifd, 0);
	if (iaddr != want)
		err(1, "Mmaping initrd '%s' returned %p not %p",
		    name, iaddr, want);
	/* The mapping stays after the descriptor is closed. */
	close(ifd);
	verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
	return len;
}
274 | |||
275 | static unsigned long setup_pagetables(unsigned long mem, | ||
276 | unsigned long initrd_size, | ||
277 | unsigned long page_offset) | ||
278 | { | ||
279 | u32 *pgdir, *linear; | ||
280 | unsigned int mapped_pages, i, linear_pages; | ||
281 | unsigned int ptes_per_page = getpagesize()/sizeof(u32); | ||
282 | |||
283 | /* If we can map all of memory above page_offset, we do so. */ | ||
284 | if (mem <= -page_offset) | ||
285 | mapped_pages = mem/getpagesize(); | ||
286 | else | ||
287 | mapped_pages = -page_offset/getpagesize(); | ||
288 | |||
289 | /* Each linear PTE page can map ptes_per_page pages. */ | ||
290 | linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; | ||
291 | |||
292 | /* We lay out top-level then linear mapping immediately below initrd */ | ||
293 | pgdir = (void *)mem - initrd_size - getpagesize(); | ||
294 | linear = (void *)pgdir - linear_pages*getpagesize(); | ||
295 | |||
296 | for (i = 0; i < mapped_pages; i++) | ||
297 | linear[i] = ((i * getpagesize()) | PAGE_PRESENT); | ||
298 | |||
299 | /* Now set up pgd so that this memory is at page_offset */ | ||
300 | for (i = 0; i < mapped_pages; i += ptes_per_page) { | ||
301 | pgdir[(i + page_offset/getpagesize())/ptes_per_page] | ||
302 | = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT); | ||
303 | } | ||
304 | |||
305 | verbose("Linear mapping of %u pages in %u pte pages at %p\n", | ||
306 | mapped_pages, linear_pages, linear); | ||
307 | |||
308 | return (unsigned long)pgdir; | ||
309 | } | ||
310 | |||
/* Join the NULL-terminated array @args into @dst, writing one space after
 * every element (the last included).  @dst must be big enough: nothing
 * here checks its size. */
static void concat(char *dst, char *args[])
{
	char *out = dst;
	char **arg;

	for (arg = args; *arg; arg++) {
		size_t n = strlen(*arg);

		memcpy(out, *arg, n);
		out[n] = ' ';
		out += n + 1;
	}
	/* In case it's empty. */
	*out = '\0';
}
323 | |||
324 | static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) | ||
325 | { | ||
326 | u32 args[] = { LHREQ_INITIALIZE, | ||
327 | LGUEST_GUEST_TOP/getpagesize(), /* Just below us */ | ||
328 | pgdir, start, page_offset }; | ||
329 | int fd; | ||
330 | |||
331 | fd = open_or_die("/dev/lguest", O_RDWR); | ||
332 | if (write(fd, args, sizeof(args)) < 0) | ||
333 | err(1, "Writing to /dev/lguest"); | ||
334 | return fd; | ||
335 | } | ||
336 | |||
337 | static void set_fd(int fd, struct device_list *devices) | ||
338 | { | ||
339 | FD_SET(fd, &devices->infds); | ||
340 | if (fd > devices->max_infd) | ||
341 | devices->max_infd = fd; | ||
342 | } | ||
343 | |||
344 | /* When input arrives, we tell the kernel to kick lguest out with -EAGAIN. */ | ||
345 | static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices) | ||
346 | { | ||
347 | set_fd(pipefd, devices); | ||
348 | |||
349 | for (;;) { | ||
350 | fd_set rfds = devices->infds; | ||
351 | u32 args[] = { LHREQ_BREAK, 1 }; | ||
352 | |||
353 | select(devices->max_infd+1, &rfds, NULL, NULL, NULL); | ||
354 | if (FD_ISSET(pipefd, &rfds)) { | ||
355 | int ignorefd; | ||
356 | if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0) | ||
357 | exit(0); | ||
358 | FD_CLR(ignorefd, &devices->infds); | ||
359 | } else | ||
360 | write(lguest_fd, args, sizeof(args)); | ||
361 | } | ||
362 | } | ||
363 | |||
/* Fork the waker: a child which runs wake_parent() on the read end of a
 * new pipe.  Returns the write end, which the parent keeps; exits if the
 * pipe or the child cannot be created. */
static int setup_waker(int lguest_fd, struct device_list *device_list)
{
	int pipefd[2], child;

	/* Without this check a failure leaves pipefd[] uninitialized. */
	if (pipe(pipefd) != 0)
		err(1, "creating pipe for waker");

	child = fork();
	if (child == -1)
		err(1, "forking");

	if (child == 0) {
		/* The child only reads; wake_parent() never returns. */
		close(pipefd[1]);
		wake_parent(pipefd[0], lguest_fd, device_list);
	}
	close(pipefd[0]);

	return pipefd[1];
}
381 | |||
382 | static void *_check_pointer(unsigned long addr, unsigned int size, | ||
383 | unsigned int line) | ||
384 | { | ||
385 | if (addr >= LGUEST_GUEST_TOP || addr + size >= LGUEST_GUEST_TOP) | ||
386 | errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr); | ||
387 | return (void *)addr; | ||
388 | } | ||
389 | #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) | ||
390 | |||
391 | /* Returns pointer to dma->used_len */ | ||
392 | static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num) | ||
393 | { | ||
394 | unsigned int i; | ||
395 | struct lguest_dma *udma; | ||
396 | |||
397 | udma = check_pointer(dma, sizeof(*udma)); | ||
398 | for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { | ||
399 | if (!udma->len[i]) | ||
400 | break; | ||
401 | |||
402 | iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]); | ||
403 | iov[i].iov_len = udma->len[i]; | ||
404 | } | ||
405 | *num = i; | ||
406 | return &udma->used_len; | ||
407 | } | ||
408 | |||
409 | static u32 *get_dma_buffer(int fd, void *key, | ||
410 | struct iovec iov[], unsigned int *num, u32 *irq) | ||
411 | { | ||
412 | u32 buf[] = { LHREQ_GETDMA, (u32)key }; | ||
413 | unsigned long udma; | ||
414 | u32 *res; | ||
415 | |||
416 | udma = write(fd, buf, sizeof(buf)); | ||
417 | if (udma == (unsigned long)-1) | ||
418 | return NULL; | ||
419 | |||
420 | /* Kernel stashes irq in ->used_len. */ | ||
421 | res = dma2iov(udma, iov, num); | ||
422 | *irq = *res; | ||
423 | return res; | ||
424 | } | ||
425 | |||
426 | static void trigger_irq(int fd, u32 irq) | ||
427 | { | ||
428 | u32 buf[] = { LHREQ_IRQ, irq }; | ||
429 | if (write(fd, buf, sizeof(buf)) != 0) | ||
430 | err(1, "Triggering irq %i", irq); | ||
431 | } | ||
432 | |||
/* Point @iov at a single static scratch buffer and set *@num to 1: used
 * to swallow input when there is no real buffer to put it in. */
static void discard_iovec(struct iovec *iov, unsigned int *num)
{
	static char discard_buf[1024];

	iov[0].iov_base = discard_buf;
	iov[0].iov_len = sizeof(discard_buf);
	*num = 1;
}
440 | |||
/* Terminal settings saved by setup_console() before it changes them. */
static struct termios orig_term;

/* Put stdin's terminal settings back to the saved ones. */
static void restore_term(void)
{
	const struct termios *saved = &orig_term;

	tcsetattr(STDIN_FILENO, TCSANOW, saved);
}
446 | |||
/* State for spotting three ^C in a row on the console. */
struct console_abort
{
	/* Consecutive ^C characters seen so far. */
	int count;
	/* When the first of them arrived. */
	struct timeval start;
};
452 | |||
453 | /* We DMA input to buffer bound at start of console page. */ | ||
454 | static bool handle_console_input(int fd, struct device *dev) | ||
455 | { | ||
456 | u32 irq = 0, *lenp; | ||
457 | int len; | ||
458 | unsigned int num; | ||
459 | struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; | ||
460 | struct console_abort *abort = dev->priv; | ||
461 | |||
462 | lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq); | ||
463 | if (!lenp) { | ||
464 | warn("console: no dma buffer!"); | ||
465 | discard_iovec(iov, &num); | ||
466 | } | ||
467 | |||
468 | len = readv(dev->fd, iov, num); | ||
469 | if (len <= 0) { | ||
470 | warnx("Failed to get console input, ignoring console."); | ||
471 | len = 0; | ||
472 | } | ||
473 | |||
474 | if (lenp) { | ||
475 | *lenp = len; | ||
476 | trigger_irq(fd, irq); | ||
477 | } | ||
478 | |||
479 | /* Three ^C within one second? Exit. */ | ||
480 | if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { | ||
481 | if (!abort->count++) | ||
482 | gettimeofday(&abort->start, NULL); | ||
483 | else if (abort->count == 3) { | ||
484 | struct timeval now; | ||
485 | gettimeofday(&now, NULL); | ||
486 | if (now.tv_sec <= abort->start.tv_sec+1) { | ||
487 | /* Make sure waker is not blocked in BREAK */ | ||
488 | u32 args[] = { LHREQ_BREAK, 0 }; | ||
489 | close(waker_fd); | ||
490 | write(fd, args, sizeof(args)); | ||
491 | exit(2); | ||
492 | } | ||
493 | abort->count = 0; | ||
494 | } | ||
495 | } else | ||
496 | abort->count = 0; | ||
497 | |||
498 | if (!len) { | ||
499 | restore_term(); | ||
500 | return false; | ||
501 | } | ||
502 | return true; | ||
503 | } | ||
504 | |||
505 | static u32 handle_console_output(int fd, const struct iovec *iov, | ||
506 | unsigned num, struct device*dev) | ||
507 | { | ||
508 | return writev(STDOUT_FILENO, iov, num); | ||
509 | } | ||
510 | |||
511 | static u32 handle_tun_output(int fd, const struct iovec *iov, | ||
512 | unsigned num, struct device *dev) | ||
513 | { | ||
514 | /* Now we've seen output, we should warn if we can't get buffers. */ | ||
515 | *(bool *)dev->priv = true; | ||
516 | return writev(dev->fd, iov, num); | ||
517 | } | ||
518 | |||
/* Offset of peer number @peernum: four bytes per peer. */
static unsigned long peer_offset(unsigned int peernum)
{
	const unsigned int bytes_per_peer = 4;

	return bytes_per_peer * peernum;
}
523 | |||
524 | static bool handle_tun_input(int fd, struct device *dev) | ||
525 | { | ||
526 | u32 irq = 0, *lenp; | ||
527 | int len; | ||
528 | unsigned num; | ||
529 | struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; | ||
530 | |||
531 | lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num, | ||
532 | &irq); | ||
533 | if (!lenp) { | ||
534 | if (*(bool *)dev->priv) | ||
535 | warn("network: no dma buffer!"); | ||
536 | discard_iovec(iov, &num); | ||
537 | } | ||
538 | |||
539 | len = readv(dev->fd, iov, num); | ||
540 | if (len <= 0) | ||
541 | err(1, "reading network"); | ||
542 | if (lenp) { | ||
543 | *lenp = len; | ||
544 | trigger_irq(fd, irq); | ||
545 | } | ||
546 | verbose("tun input packet len %i [%02x %02x] (%s)\n", len, | ||
547 | ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1], | ||
548 | lenp ? "sent" : "discarded"); | ||
549 | return true; | ||
550 | } | ||
551 | |||
552 | static u32 handle_block_output(int fd, const struct iovec *iov, | ||
553 | unsigned num, struct device *dev) | ||
554 | { | ||
555 | struct lguest_block_page *p = dev->mem; | ||
556 | u32 irq, *lenp; | ||
557 | unsigned int len, reply_num; | ||
558 | struct iovec reply[LGUEST_MAX_DMA_SECTIONS]; | ||
559 | off64_t device_len, off = (off64_t)p->sector * 512; | ||
560 | |||
561 | device_len = *(off64_t *)dev->priv; | ||
562 | |||
563 | if (off >= device_len) | ||
564 | err(1, "Bad offset %llu vs %llu", off, device_len); | ||
565 | if (lseek64(dev->fd, off, SEEK_SET) != off) | ||
566 | err(1, "Bad seek to sector %i", p->sector); | ||
567 | |||
568 | verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off); | ||
569 | |||
570 | lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq); | ||
571 | if (!lenp) | ||
572 | err(1, "Block request didn't give us a dma buffer"); | ||
573 | |||
574 | if (p->type) { | ||
575 | len = writev(dev->fd, iov, num); | ||
576 | if (off + len > device_len) { | ||
577 | ftruncate(dev->fd, device_len); | ||
578 | errx(1, "Write past end %llu+%u", off, len); | ||
579 | } | ||
580 | *lenp = 0; | ||
581 | } else { | ||
582 | len = readv(dev->fd, reply, reply_num); | ||
583 | *lenp = len; | ||
584 | } | ||
585 | |||
586 | p->result = 1 + (p->bytes != len); | ||
587 | trigger_irq(fd, irq); | ||
588 | return 0; | ||
589 | } | ||
590 | |||
591 | static void handle_output(int fd, unsigned long dma, unsigned long key, | ||
592 | struct device_list *devices) | ||
593 | { | ||
594 | struct device *i; | ||
595 | u32 *lenp; | ||
596 | struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; | ||
597 | unsigned num = 0; | ||
598 | |||
599 | lenp = dma2iov(dma, iov, &num); | ||
600 | for (i = devices->dev; i; i = i->next) { | ||
601 | if (i->handle_output && key == i->watch_key) { | ||
602 | *lenp = i->handle_output(fd, iov, num, i); | ||
603 | return; | ||
604 | } | ||
605 | } | ||
606 | warnx("Pending dma %p, key %p", (void *)dma, (void *)key); | ||
607 | } | ||
608 | |||
609 | static void handle_input(int fd, struct device_list *devices) | ||
610 | { | ||
611 | struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; | ||
612 | |||
613 | for (;;) { | ||
614 | struct device *i; | ||
615 | fd_set fds = devices->infds; | ||
616 | |||
617 | if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0) | ||
618 | break; | ||
619 | |||
620 | for (i = devices->dev; i; i = i->next) { | ||
621 | if (i->handle_input && FD_ISSET(i->fd, &fds)) { | ||
622 | if (!i->handle_input(fd, i)) { | ||
623 | FD_CLR(i->fd, &devices->infds); | ||
624 | /* Tell waker to ignore it too... */ | ||
625 | write(waker_fd, &i->fd, sizeof(i->fd)); | ||
626 | } | ||
627 | } | ||
628 | } | ||
629 | } | ||
630 | } | ||
631 | |||
632 | static struct lguest_device_desc *new_dev_desc(u16 type, u16 features, | ||
633 | u16 num_pages) | ||
634 | { | ||
635 | static unsigned long top = LGUEST_GUEST_TOP; | ||
636 | struct lguest_device_desc *desc; | ||
637 | |||
638 | desc = malloc(sizeof(*desc)); | ||
639 | desc->type = type; | ||
640 | desc->num_pages = num_pages; | ||
641 | desc->features = features; | ||
642 | desc->status = 0; | ||
643 | if (num_pages) { | ||
644 | top -= num_pages*getpagesize(); | ||
645 | map_zeroed_pages(top, num_pages); | ||
646 | desc->pfn = top / getpagesize(); | ||
647 | } else | ||
648 | desc->pfn = 0; | ||
649 | return desc; | ||
650 | } | ||
651 | |||
652 | static struct device *new_device(struct device_list *devices, | ||
653 | u16 type, u16 num_pages, u16 features, | ||
654 | int fd, | ||
655 | bool (*handle_input)(int, struct device *), | ||
656 | unsigned long watch_off, | ||
657 | u32 (*handle_output)(int, | ||
658 | const struct iovec *, | ||
659 | unsigned, | ||
660 | struct device *)) | ||
661 | { | ||
662 | struct device *dev = malloc(sizeof(*dev)); | ||
663 | |||
664 | /* Append to device list. */ | ||
665 | *devices->lastdev = dev; | ||
666 | dev->next = NULL; | ||
667 | devices->lastdev = &dev->next; | ||
668 | |||
669 | dev->fd = fd; | ||
670 | if (handle_input) | ||
671 | set_fd(dev->fd, devices); | ||
672 | dev->desc = new_dev_desc(type, features, num_pages); | ||
673 | dev->mem = (void *)(dev->desc->pfn * getpagesize()); | ||
674 | dev->handle_input = handle_input; | ||
675 | dev->watch_key = (unsigned long)dev->mem + watch_off; | ||
676 | dev->handle_output = handle_output; | ||
677 | return dev; | ||
678 | } | ||
679 | |||
680 | static void setup_console(struct device_list *devices) | ||
681 | { | ||
682 | struct device *dev; | ||
683 | |||
684 | if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { | ||
685 | struct termios term = orig_term; | ||
686 | term.c_lflag &= ~(ISIG|ICANON|ECHO); | ||
687 | tcsetattr(STDIN_FILENO, TCSANOW, &term); | ||
688 | atexit(restore_term); | ||
689 | } | ||
690 | |||
691 | /* We don't currently require a page for the console. */ | ||
692 | dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0, | ||
693 | STDIN_FILENO, handle_console_input, | ||
694 | LGUEST_CONSOLE_DMA_KEY, handle_console_output); | ||
695 | dev->priv = malloc(sizeof(struct console_abort)); | ||
696 | ((struct console_abort *)dev->priv)->count = 0; | ||
697 | verbose("device %p: console\n", | ||
698 | (void *)(dev->desc->pfn * getpagesize())); | ||
699 | } | ||
700 | |||
701 | static void setup_block_file(const char *filename, struct device_list *devices) | ||
702 | { | ||
703 | int fd; | ||
704 | struct device *dev; | ||
705 | off64_t *device_len; | ||
706 | struct lguest_block_page *p; | ||
707 | |||
708 | fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT); | ||
709 | dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1, | ||
710 | LGUEST_DEVICE_F_RANDOMNESS, | ||
711 | fd, NULL, 0, handle_block_output); | ||
712 | device_len = dev->priv = malloc(sizeof(*device_len)); | ||
713 | *device_len = lseek64(fd, 0, SEEK_END); | ||
714 | p = dev->mem; | ||
715 | |||
716 | p->num_sectors = *device_len/512; | ||
717 | verbose("device %p: block %i sectors\n", | ||
718 | (void *)(dev->desc->pfn * getpagesize()), p->num_sectors); | ||
719 | } | ||
720 | |||
721 | /* We use fnctl locks to reserve network slots (autocleanup!) */ | ||
722 | static unsigned int find_slot(int netfd, const char *filename) | ||
723 | { | ||
724 | struct flock fl; | ||
725 | |||
726 | fl.l_type = F_WRLCK; | ||
727 | fl.l_whence = SEEK_SET; | ||
728 | fl.l_len = 1; | ||
729 | for (fl.l_start = 0; | ||
730 | fl.l_start < getpagesize()/sizeof(struct lguest_net); | ||
731 | fl.l_start++) { | ||
732 | if (fcntl(netfd, F_SETLK, &fl) == 0) | ||
733 | return fl.l_start; | ||
734 | } | ||
735 | errx(1, "No free slots in network file %s", filename); | ||
736 | } | ||
737 | |||
738 | static void setup_net_file(const char *filename, | ||
739 | struct device_list *devices) | ||
740 | { | ||
741 | int netfd; | ||
742 | struct device *dev; | ||
743 | |||
744 | netfd = open(filename, O_RDWR, 0); | ||
745 | if (netfd < 0) { | ||
746 | if (errno == ENOENT) { | ||
747 | netfd = open(filename, O_RDWR|O_CREAT, 0600); | ||
748 | if (netfd >= 0) { | ||
749 | char page[getpagesize()]; | ||
750 | memset(page, 0, sizeof(page)); | ||
751 | write(netfd, page, sizeof(page)); | ||
752 | } | ||
753 | } | ||
754 | if (netfd < 0) | ||
755 | err(1, "cannot open net file '%s'", filename); | ||
756 | } | ||
757 | |||
758 | dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, | ||
759 | find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM, | ||
760 | -1, NULL, 0, NULL); | ||
761 | |||
762 | /* We overwrite the /dev/zero mapping with the actual file. */ | ||
763 | if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE, | ||
764 | MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem) | ||
765 | err(1, "could not mmap '%s'", filename); | ||
766 | verbose("device %p: shared net %s, peer %i\n", | ||
767 | (void *)(dev->desc->pfn * getpagesize()), filename, | ||
768 | dev->desc->features & ~LGUEST_NET_F_NOCSUM); | ||
769 | } | ||
770 | |||
771 | static u32 str2ip(const char *ipaddr) | ||
772 | { | ||
773 | unsigned int byte[4]; | ||
774 | |||
775 | sscanf(ipaddr, "%u.%u.%u.%u", &byte[0], &byte[1], &byte[2], &byte[3]); | ||
776 | return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3]; | ||
777 | } | ||
778 | |||
779 | /* adapted from libbridge */ | ||
780 | static void add_to_bridge(int fd, const char *if_name, const char *br_name) | ||
781 | { | ||
782 | int ifidx; | ||
783 | struct ifreq ifr; | ||
784 | |||
785 | if (!*br_name) | ||
786 | errx(1, "must specify bridge name"); | ||
787 | |||
788 | ifidx = if_nametoindex(if_name); | ||
789 | if (!ifidx) | ||
790 | errx(1, "interface %s does not exist!", if_name); | ||
791 | |||
792 | strncpy(ifr.ifr_name, br_name, IFNAMSIZ); | ||
793 | ifr.ifr_ifindex = ifidx; | ||
794 | if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) | ||
795 | err(1, "can't add %s to bridge %s", if_name, br_name); | ||
796 | } | ||
797 | |||
798 | static void configure_device(int fd, const char *devname, u32 ipaddr, | ||
799 | unsigned char hwaddr[6]) | ||
800 | { | ||
801 | struct ifreq ifr; | ||
802 | struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; | ||
803 | |||
804 | memset(&ifr, 0, sizeof(ifr)); | ||
805 | strcpy(ifr.ifr_name, devname); | ||
806 | sin->sin_family = AF_INET; | ||
807 | sin->sin_addr.s_addr = htonl(ipaddr); | ||
808 | if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) | ||
809 | err(1, "Setting %s interface address", devname); | ||
810 | ifr.ifr_flags = IFF_UP; | ||
811 | if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) | ||
812 | err(1, "Bringing interface %s up", devname); | ||
813 | |||
814 | if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) | ||
815 | err(1, "getting hw address for %s", devname); | ||
816 | |||
817 | memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); | ||
818 | } | ||
819 | |||
820 | static void setup_tun_net(const char *arg, struct device_list *devices) | ||
821 | { | ||
822 | struct device *dev; | ||
823 | struct ifreq ifr; | ||
824 | int netfd, ipfd; | ||
825 | u32 ip; | ||
826 | const char *br_name = NULL; | ||
827 | |||
828 | netfd = open_or_die("/dev/net/tun", O_RDWR); | ||
829 | memset(&ifr, 0, sizeof(ifr)); | ||
830 | ifr.ifr_flags = IFF_TAP | IFF_NO_PI; | ||
831 | strcpy(ifr.ifr_name, "tap%d"); | ||
832 | if (ioctl(netfd, TUNSETIFF, &ifr) != 0) | ||
833 | err(1, "configuring /dev/net/tun"); | ||
834 | ioctl(netfd, TUNSETNOCSUM, 1); | ||
835 | |||
836 | /* You will be peer 1: we should create enough jitter to randomize */ | ||
837 | dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, | ||
838 | NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd, | ||
839 | handle_tun_input, peer_offset(0), handle_tun_output); | ||
840 | dev->priv = malloc(sizeof(bool)); | ||
841 | *(bool *)dev->priv = false; | ||
842 | |||
843 | ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); | ||
844 | if (ipfd < 0) | ||
845 | err(1, "opening IP socket"); | ||
846 | |||
847 | if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { | ||
848 | ip = INADDR_ANY; | ||
849 | br_name = arg + strlen(BRIDGE_PFX); | ||
850 | add_to_bridge(ipfd, ifr.ifr_name, br_name); | ||
851 | } else | ||
852 | ip = str2ip(arg); | ||
853 | |||
854 | /* We are peer 0, ie. first slot. */ | ||
855 | configure_device(ipfd, ifr.ifr_name, ip, dev->mem); | ||
856 | |||
857 | /* Set "promisc" bit: we want every single packet. */ | ||
858 | *((u8 *)dev->mem) |= 0x1; | ||
859 | |||
860 | close(ipfd); | ||
861 | |||
862 | verbose("device %p: tun net %u.%u.%u.%u\n", | ||
863 | (void *)(dev->desc->pfn * getpagesize()), | ||
864 | (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip); | ||
865 | if (br_name) | ||
866 | verbose("attached to bridge: %s\n", br_name); | ||
867 | } | ||
868 | |||
869 | /* Now we know how much memory we have, we copy in device descriptors */ | ||
870 | static void map_device_descriptors(struct device_list *devs, unsigned long mem) | ||
871 | { | ||
872 | struct device *i; | ||
873 | unsigned int num; | ||
874 | struct lguest_device_desc *descs; | ||
875 | |||
876 | /* Device descriptor array sits just above top of normal memory */ | ||
877 | descs = map_zeroed_pages(mem, 1); | ||
878 | |||
879 | for (i = devs->dev, num = 0; i; i = i->next, num++) { | ||
880 | if (num == LGUEST_MAX_DEVICES) | ||
881 | errx(1, "too many devices"); | ||
882 | verbose("Device %i: %s\n", num, | ||
883 | i->desc->type == LGUEST_DEVICE_T_NET ? "net" | ||
884 | : i->desc->type == LGUEST_DEVICE_T_CONSOLE ? "console" | ||
885 | : i->desc->type == LGUEST_DEVICE_T_BLOCK ? "block" | ||
886 | : "unknown"); | ||
887 | descs[num] = *i->desc; | ||
888 | free(i->desc); | ||
889 | i->desc = &descs[num]; | ||
890 | } | ||
891 | } | ||
892 | |||
893 | static void __attribute__((noreturn)) | ||
894 | run_guest(int lguest_fd, struct device_list *device_list) | ||
895 | { | ||
896 | for (;;) { | ||
897 | u32 args[] = { LHREQ_BREAK, 0 }; | ||
898 | unsigned long arr[2]; | ||
899 | int readval; | ||
900 | |||
901 | /* We read from the /dev/lguest device to run the Guest. */ | ||
902 | readval = read(lguest_fd, arr, sizeof(arr)); | ||
903 | |||
904 | if (readval == sizeof(arr)) { | ||
905 | handle_output(lguest_fd, arr[0], arr[1], device_list); | ||
906 | continue; | ||
907 | } else if (errno == ENOENT) { | ||
908 | char reason[1024] = { 0 }; | ||
909 | read(lguest_fd, reason, sizeof(reason)-1); | ||
910 | errx(1, "%s", reason); | ||
911 | } else if (errno != EAGAIN) | ||
912 | err(1, "Running guest failed"); | ||
913 | handle_input(lguest_fd, device_list); | ||
914 | if (write(lguest_fd, args, sizeof(args)) < 0) | ||
915 | err(1, "Resetting break"); | ||
916 | } | ||
917 | } | ||
918 | |||
/* Long options accepted by the launcher.  Each maps to the single character
 * that getopt_long() returns for it; all but --verbose take an argument. */
static struct option opts[] = {
	{ "verbose",  no_argument,       NULL, 'v' },
	{ "sharenet", required_argument, NULL, 's' },
	{ "tunnet",   required_argument, NULL, 't' },
	{ "block",    required_argument, NULL, 'b' },
	{ "initrd",   required_argument, NULL, 'i' },
	/* All-zero entry terminates the table. */
	{ NULL, 0, NULL, 0 },
};
/* Print the command-line synopsis on stderr and exit with status 1. */
static void usage(void)
{
	static const char synopsis[] =
		"Usage: lguest [--verbose] "
		"[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
		"|--block=<filename>|--initrd=<filename>]...\n"
		"<mem-in-mb> vmlinux [args...]";

	errx(1, "%s", synopsis);
}
934 | |||
935 | int main(int argc, char *argv[]) | ||
936 | { | ||
937 | unsigned long mem, pgdir, start, page_offset, initrd_size = 0; | ||
938 | int c, lguest_fd; | ||
939 | struct device_list device_list; | ||
940 | void *boot = (void *)0; | ||
941 | const char *initrd_name = NULL; | ||
942 | |||
943 | device_list.max_infd = -1; | ||
944 | device_list.dev = NULL; | ||
945 | device_list.lastdev = &device_list.dev; | ||
946 | FD_ZERO(&device_list.infds); | ||
947 | |||
948 | while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { | ||
949 | switch (c) { | ||
950 | case 'v': | ||
951 | verbose = true; | ||
952 | break; | ||
953 | case 's': | ||
954 | setup_net_file(optarg, &device_list); | ||
955 | break; | ||
956 | case 't': | ||
957 | setup_tun_net(optarg, &device_list); | ||
958 | break; | ||
959 | case 'b': | ||
960 | setup_block_file(optarg, &device_list); | ||
961 | break; | ||
962 | case 'i': | ||
963 | initrd_name = optarg; | ||
964 | break; | ||
965 | default: | ||
966 | warnx("Unknown argument %s", argv[optind]); | ||
967 | usage(); | ||
968 | } | ||
969 | } | ||
970 | if (optind + 2 > argc) | ||
971 | usage(); | ||
972 | |||
973 | /* We need a console device */ | ||
974 | setup_console(&device_list); | ||
975 | |||
976 | /* First we map /dev/zero over all of guest-physical memory. */ | ||
977 | mem = atoi(argv[optind]) * 1024 * 1024; | ||
978 | map_zeroed_pages(0, mem / getpagesize()); | ||
979 | |||
980 | /* Now we load the kernel */ | ||
981 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), | ||
982 | &page_offset); | ||
983 | |||
984 | /* Write the device descriptors into memory. */ | ||
985 | map_device_descriptors(&device_list, mem); | ||
986 | |||
987 | /* Map the initrd image if requested */ | ||
988 | if (initrd_name) { | ||
989 | initrd_size = load_initrd(initrd_name, mem); | ||
990 | *(unsigned long *)(boot+0x218) = mem - initrd_size; | ||
991 | *(unsigned long *)(boot+0x21c) = initrd_size; | ||
992 | *(unsigned char *)(boot+0x210) = 0xFF; | ||
993 | } | ||
994 | |||
995 | /* Set up the initial linar pagetables. */ | ||
996 | pgdir = setup_pagetables(mem, initrd_size, page_offset); | ||
997 | |||
998 | /* E820 memory map: ours is a simple, single region. */ | ||
999 | *(char*)(boot+E820NR) = 1; | ||
1000 | *((struct e820entry *)(boot+E820MAP)) | ||
1001 | = ((struct e820entry) { 0, mem, E820_RAM }); | ||
1002 | /* Command line pointer and command line (at 4096) */ | ||
1003 | *(void **)(boot + 0x228) = boot + 4096; | ||
1004 | concat(boot + 4096, argv+optind+2); | ||
1005 | /* Paravirt type: 1 == lguest */ | ||
1006 | *(int *)(boot + 0x23c) = 1; | ||
1007 | |||
1008 | lguest_fd = tell_kernel(pgdir, start, page_offset); | ||
1009 | waker_fd = setup_waker(lguest_fd, &device_list); | ||
1010 | |||
1011 | run_guest(lguest_fd, &device_list); | ||
1012 | } | ||
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt new file mode 100644 index 00000000000..821617bd6c0 --- /dev/null +++ b/Documentation/lguest/lguest.txt | |||
@@ -0,0 +1,129 @@ | |||
1 | Rusty's Remarkably Unreliable Guide to Lguest | ||
2 | - or, A Young Coder's Illustrated Hypervisor | ||
3 | http://lguest.ozlabs.org | ||
4 | |||
5 | Lguest is designed to be a minimal hypervisor for the Linux kernel, for | ||
6 | Linux developers and users to experiment with virtualization with the | ||
7 | minimum of complexity. Nonetheless, it should have sufficient | ||
8 | features to make it useful for specific tasks, and, of course, you are | ||
9 | encouraged to fork and enhance it. | ||
10 | |||
11 | Features: | ||
12 | |||
13 | - Kernel module which runs in a normal kernel. | ||
14 | - Simple I/O model for communication. | ||
15 | - Simple program to create new guests. | ||
16 | - Logo contains cute puppies: http://lguest.ozlabs.org | ||
17 | |||
18 | Developer features: | ||
19 | |||
20 | - Fun to hack on. | ||
21 | - No ABI: being tied to a specific kernel anyway, you can change anything. | ||
22 | - Many opportunities for improvement or feature implementation. | ||
23 | |||
24 | Running Lguest: | ||
25 | |||
26 | - Lguest runs the same kernel as guest and host. You can configure | ||
27 | them differently, but usually it's easiest not to. | ||
28 | |||
29 | You will need to configure your kernel with the following options: | ||
30 | |||
31 | CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1] | ||
32 | CONFIG_TUN=y/m ("Universal TUN/TAP device driver support") | ||
33 | CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers") | ||
34 | CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)") | ||
35 | CONFIG_LGUEST=y/m ("Linux hypervisor example code") | ||
36 | |||
37 | and I recommend: | ||
38 | CONFIG_HZ=100 ("Timer frequency")[2] | ||
39 | |||
40 | - A tool called "lguest" is available in this directory: type "make" | ||
41 | to build it. If you didn't build your kernel in-tree, use "make | ||
42 | O=<builddir>". | ||
43 | |||
44 | - Create or find a root disk image. There are several useful ones | ||
45 | around, such as the xm-test tiny root image at | ||
46 | http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img | ||
47 | |||
48 | For more serious work, I usually use a distribution ISO image and | ||
49 | install it under qemu, then make multiple copies: | ||
50 | |||
51 | dd if=/dev/zero of=rootfile bs=1M count=2048 | ||
52 | qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d | ||
53 | |||
54 | - "modprobe lg" if you built it as a module. | ||
55 | |||
56 | - Run an lguest as root: | ||
57 | |||
58 | Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba | ||
59 | |||
60 | Explanation: | ||
61 | 64m: the amount of memory to use. | ||
62 | |||
63 | vmlinux: the kernel image found in the top of your build directory. You | ||
64 | can also use a standard bzImage. | ||
65 | |||
66 | --tunnet=192.168.19.1: configures a "tap" device for networking with this | ||
67 | IP address. | ||
68 | |||
69 | --block=rootfile: a file or block device which becomes /dev/lgba | ||
70 | inside the guest. | ||
71 | |||
72 | root=/dev/lgba: this (and anything else on the command line) are | ||
73 | kernel boot parameters. | ||
74 | |||
75 | - Configuring networking. I usually have the host masquerade, using | ||
76 | "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 > | ||
77 | /proc/sys/net/ipv4/ip_forward". In this example, I would configure | ||
78 | eth0 inside the guest at 192.168.19.2. | ||
79 | |||
80 | Another method is to bridge the tap device to an external interface | ||
81 | using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest | ||
82 | to obtain an IP address. The bridge needs to be configured first: | ||
83 | this option simply adds the tap interface to it. | ||
84 | |||
85 | A simple example on my system: | ||
86 | |||
87 | ifconfig eth0 0.0.0.0 | ||
88 | brctl addbr lg0 | ||
89 | ifconfig lg0 up | ||
90 | brctl addif lg0 eth0 | ||
91 | dhclient lg0 | ||
92 | |||
93 | Then use --tunnet=bridge:lg0 when launching the guest. | ||
94 | |||
95 | See http://linux-net.osdl.org/index.php/Bridge for general information | ||
96 | on how to get bridging working. | ||
97 | |||
98 | - You can also create an inter-guest network using | ||
99 | "--sharenet=<filename>": any two guests using the same file are on | ||
100 | the same network. This file is created if it does not exist. | ||
101 | |||
102 | Lguest I/O model: | ||
103 | |||
104 | Lguest uses a simplified DMA model plus shared memory for I/O. Guests | ||
105 | can communicate with each other if they share underlying memory | ||
106 | (usually by the lguest program mmaping the same file), but they can | ||
107 | use any non-shared memory to communicate with the lguest process. | ||
108 | |||
109 | Guests can register DMA buffers at any key (must be a valid physical | ||
110 | address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq) | ||
111 | hypercall. "dmabufs" is the physical address of an array of "num" | ||
112 | "struct lguest_dma": each contains a used_len, and an array of | ||
113 | physical addresses and lengths. When a transfer occurs, the | ||
114 | "used_len" field of one of the buffers which has used_len 0 will be | ||
115 | set to the length transferred and the irq will fire. | ||
116 | |||
117 | Using an irq value of 0 unbinds the dma buffers. | ||
118 | |||
119 | To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used, | ||
120 | and the number of bytes used is written to the used_len field. This can | ||
121 | be 0 if no one else has bound a DMA buffer to that key, or if some other | ||
122 | error occurred. DMA buffers bound by the same guest are ignored. | ||
123 | |||
124 | Cheers! | ||
125 | Rusty Russell rusty@rustcorp.com.au. | ||
126 | |||
127 | [1] These are on various places on the TODO list, waiting for you to | ||
128 | get annoyed enough at the limitation to fix it. | ||
129 | [2] Lguest is not yet tickless when idle. See [1]. | ||