aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/filesystems/9p.txt8
-rw-r--r--Documentation/lguest/Makefile26
-rw-r--r--Documentation/lguest/lguest.c1629
-rw-r--r--Documentation/lguest/lguest.txt72
4 files changed, 921 insertions, 814 deletions
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
index b90f537af35c..bf8080640eba 100644
--- a/Documentation/filesystems/9p.txt
+++ b/Documentation/filesystems/9p.txt
@@ -42,10 +42,12 @@ OPTIONS
42 42
43 trans=name select an alternative transport. Valid options are 43 trans=name select an alternative transport. Valid options are
44 currently: 44 currently:
45 unix - specifying a named pipe mount point 45 unix - specifying a named pipe mount point
46 tcp - specifying a normal TCP/IP connection 46 tcp - specifying a normal TCP/IP connection
47 fd - use passed file descriptors for connection 47 fd - use passed file descriptors for connection
48 (see rfdno and wfdno) 48 (see rfdno and wfdno)
49 virtio - connect to the next virtio channel available
50 (from lguest or KVM with trans_virtio module)
49 51
50 uname=name user name to attempt mount as on the remote server. The 52 uname=name user name to attempt mount as on the remote server. The
51 server may override or ignore this value. Certain user 53 server may override or ignore this value. Certain user
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index c0b7a4556390..bac037eb1cda 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -1,28 +1,8 @@
1# This creates the demonstration utility "lguest" which runs a Linux guest. 1# This creates the demonstration utility "lguest" which runs a Linux guest.
2 2CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include
3# For those people that have a separate object dir, look there for .config
4KBUILD_OUTPUT := ../..
5ifdef O
6 ifeq ("$(origin O)", "command line")
7 KBUILD_OUTPUT := $(O)
8 endif
9endif
10# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
11include $(KBUILD_OUTPUT)/.config
12LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
13
14CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
15LDLIBS:=-lz 3LDLIBS:=-lz
16# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and
17# not others (eg. FC7).
18LDFLAGS+=-static
19all: lguest.lds lguest
20 4
21# The linker script on x86 is so complex the only way of creating one 5all: lguest
22# which will link our binary in the right place is to mangle the
23# default one.
24lguest.lds:
25 $(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
26 6
27clean: 7clean:
28 rm -f lguest.lds lguest 8 rm -f lguest
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 103e346c8b6a..5bdc37f81842 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1,10 +1,7 @@
1/*P:100 This is the Launcher code, a simple program which lays out the 1/*P:100 This is the Launcher code, a simple program which lays out the
2 * "physical" memory for the new Guest by mapping the kernel image and the 2 * "physical" memory for the new Guest by mapping the kernel image and the
3 * virtual devices, then reads repeatedly from /dev/lguest to run the Guest. 3 * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
4 * 4:*/
5 * The only trick: the Makefile links it at a high address so it will be clear
6 * of the guest memory region. It means that each Guest cannot have more than
7 * about 2.5G of memory on a normally configured Host. :*/
8#define _LARGEFILE64_SOURCE 5#define _LARGEFILE64_SOURCE
9#define _GNU_SOURCE 6#define _GNU_SOURCE
10#include <stdio.h> 7#include <stdio.h>
@@ -15,6 +12,7 @@
15#include <stdlib.h> 12#include <stdlib.h>
16#include <elf.h> 13#include <elf.h>
17#include <sys/mman.h> 14#include <sys/mman.h>
15#include <sys/param.h>
18#include <sys/types.h> 16#include <sys/types.h>
19#include <sys/stat.h> 17#include <sys/stat.h>
20#include <sys/wait.h> 18#include <sys/wait.h>
@@ -34,7 +32,9 @@
34#include <termios.h> 32#include <termios.h>
35#include <getopt.h> 33#include <getopt.h>
36#include <zlib.h> 34#include <zlib.h>
37/*L:110 We can ignore the 28 include files we need for this program, but I do 35#include <assert.h>
36#include <sched.h>
37/*L:110 We can ignore the 30 include files we need for this program, but I do
38 * want to draw attention to the use of kernel-style types. 38 * want to draw attention to the use of kernel-style types.
39 * 39 *
40 * As Linus said, "C is a Spartan language, and so should your naming be." I 40 * As Linus said, "C is a Spartan language, and so should your naming be." I
@@ -45,8 +45,14 @@ typedef unsigned long long u64;
45typedef uint32_t u32; 45typedef uint32_t u32;
46typedef uint16_t u16; 46typedef uint16_t u16;
47typedef uint8_t u8; 47typedef uint8_t u8;
48#include "../../include/linux/lguest_launcher.h" 48#include "linux/lguest_launcher.h"
49#include "../../include/asm-x86/e820_32.h" 49#include "linux/pci_ids.h"
50#include "linux/virtio_config.h"
51#include "linux/virtio_net.h"
52#include "linux/virtio_blk.h"
53#include "linux/virtio_console.h"
54#include "linux/virtio_ring.h"
55#include "asm-x86/bootparam.h"
50/*:*/ 56/*:*/
51 57
52#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ 58#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
@@ -55,6 +61,10 @@ typedef uint8_t u8;
55#ifndef SIOCBRADDIF 61#ifndef SIOCBRADDIF
56#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ 62#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
57#endif 63#endif
64/* We can have up to 256 pages for devices. */
65#define DEVICE_PAGES 256
66/* This fits nicely in a single 4096-byte page. */
67#define VIRTQUEUE_NUM 127
58 68
59/*L:120 verbose is both a global flag and a macro. The C preprocessor allows 69/*L:120 verbose is both a global flag and a macro. The C preprocessor allows
60 * this, and although I wouldn't recommend it, it works quite nicely here. */ 70 * this, and although I wouldn't recommend it, it works quite nicely here. */
@@ -65,8 +75,10 @@ static bool verbose;
65 75
66/* The pipe to send commands to the waker process */ 76/* The pipe to send commands to the waker process */
67static int waker_fd; 77static int waker_fd;
68/* The top of guest physical memory. */ 78/* The pointer to the start of guest memory. */
69static u32 top; 79static void *guest_base;
80/* The maximum guest physical address allowed, and maximum possible. */
81static unsigned long guest_limit, guest_max;
70 82
71/* This is our list of devices. */ 83/* This is our list of devices. */
72struct device_list 84struct device_list
@@ -76,8 +88,17 @@ struct device_list
76 fd_set infds; 88 fd_set infds;
77 int max_infd; 89 int max_infd;
78 90
91 /* Counter to assign interrupt numbers. */
92 unsigned int next_irq;
93
94 /* Counter to print out convenient device numbers. */
95 unsigned int device_num;
96
79 /* The descriptor page for the devices. */ 97 /* The descriptor page for the devices. */
80 struct lguest_device_desc *descs; 98 u8 *descpage;
99
100 /* The tail of the last descriptor. */
101 unsigned int desc_used;
81 102
82 /* A single linked list of devices. */ 103 /* A single linked list of devices. */
83 struct device *dev; 104 struct device *dev;
@@ -85,31 +106,111 @@ struct device_list
85 struct device **lastdev; 106 struct device **lastdev;
86}; 107};
87 108
109/* The list of Guest devices, based on command line arguments. */
110static struct device_list devices;
111
88/* The device structure describes a single device. */ 112/* The device structure describes a single device. */
89struct device 113struct device
90{ 114{
91 /* The linked-list pointer. */ 115 /* The linked-list pointer. */
92 struct device *next; 116 struct device *next;
93 /* The descriptor for this device, as mapped into the Guest. */ 117
118 /* This device's descriptor, as mapped into the Guest. */
94 struct lguest_device_desc *desc; 119 struct lguest_device_desc *desc;
95 /* The memory page(s) of this device, if any. Also mapped in Guest. */ 120
96 void *mem; 121 /* The name of this device, for --verbose. */
122 const char *name;
97 123
98 /* If handle_input is set, it wants to be called when this file 124 /* If handle_input is set, it wants to be called when this file
99 * descriptor is ready. */ 125 * descriptor is ready. */
100 int fd; 126 int fd;
101 bool (*handle_input)(int fd, struct device *me); 127 bool (*handle_input)(int fd, struct device *me);
102 128
103 /* If handle_output is set, it wants to be called when the Guest sends 129 /* Any queues attached to this device */
104 * DMA to this key. */ 130 struct virtqueue *vq;
105 unsigned long watch_key;
106 u32 (*handle_output)(int fd, const struct iovec *iov,
107 unsigned int num, struct device *me);
108 131
109 /* Device-specific data. */ 132 /* Device-specific data. */
110 void *priv; 133 void *priv;
111}; 134};
112 135
136/* The virtqueue structure describes a queue attached to a device. */
137struct virtqueue
138{
139 struct virtqueue *next;
140
141 /* Which device owns me. */
142 struct device *dev;
143
144 /* The configuration for this queue. */
145 struct lguest_vqconfig config;
146
147 /* The actual ring of buffers. */
148 struct vring vring;
149
150 /* Last available index we saw. */
151 u16 last_avail_idx;
152
153 /* The routine to call when the Guest pings us. */
154 void (*handle_output)(int fd, struct virtqueue *me);
155};
156
157/* Since guest is UP and we don't run at the same time, we don't need barriers.
158 * But I include them in the code in case others copy it. */
159#define wmb()
160
161/* Convert an iovec element to the given type.
162 *
163 * This is a fairly ugly trick: we need to know the size of the type and
164 * alignment requirement to check the pointer is kosher. It's also nice to
165 * have the name of the type in case we report failure.
166 *
167 * Typing those three things all the time is cumbersome and error prone, so we
168 * have a macro which sets them all up and passes to the real function. */
169#define convert(iov, type) \
170 ((type *)_convert((iov), sizeof(type), __alignof__(type), #type))
171
172static void *_convert(struct iovec *iov, size_t size, size_t align,
173 const char *name)
174{
175 if (iov->iov_len != size)
176 errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);
177 if ((unsigned long)iov->iov_base % align != 0)
178 errx(1, "Bad alignment %p for %s", iov->iov_base, name);
179 return iov->iov_base;
180}
181
/* The virtio configuration space is defined to be little-endian.  x86 is
 * little-endian too, so every helper below expands to its argument
 * unchanged; they exist only to make each byte-order conversion explicit
 * at the point of use.
 *
 * Each macro takes a single value and evaluates it exactly once. */
#define cpu_to_le16(v16) (v16)
#define cpu_to_le32(v32) (v32)
#define cpu_to_le64(v64) (v64)
#define le16_to_cpu(v16) (v16)
#define le32_to_cpu(v32) (v32)
/* The parameter name must match the name used in the expansion: if they
 * differ, the argument is dropped and the macro expands to whatever
 * identifier "v64" happens to be in scope at the call site. */
#define le64_to_cpu(v64) (v64)
190
191/*L:100 The Launcher code itself takes us out into userspace, that scary place
192 * where pointers run wild and free! Unfortunately, like most userspace
193 * programs, it's quite boring (which is why everyone likes to hack on the
194 * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
195 * will get you through this section. Or, maybe not.
196 *
197 * The Launcher sets up a big chunk of memory to be the Guest's "physical"
198 * memory and stores it in "guest_base". In other words, Guest physical ==
199 * Launcher virtual with an offset.
200 *
201 * This can be tough to get your head around, but usually it just means that we
202 * use these trivial conversion functions when the Guest gives us its
203 * "physical" addresses: */
204static void *from_guest_phys(unsigned long addr)
205{
206 return guest_base + addr;
207}
208
209static unsigned long to_guest_phys(const void *addr)
210{
211 return (addr - guest_base);
212}
213
113/*L:130 214/*L:130
114 * Loading the Kernel. 215 * Loading the Kernel.
115 * 216 *
@@ -123,43 +224,55 @@ static int open_or_die(const char *name, int flags)
123 return fd; 224 return fd;
124} 225}
125 226
126/* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */ 227/* map_zeroed_pages() takes a number of pages. */
127static void *map_zeroed_pages(unsigned long addr, unsigned int num) 228static void *map_zeroed_pages(unsigned int num)
128{ 229{
129 /* We cache the /dev/zero file-descriptor so we only open it once. */ 230 int fd = open_or_die("/dev/zero", O_RDONLY);
130 static int fd = -1; 231 void *addr;
131
132 if (fd == -1)
133 fd = open_or_die("/dev/zero", O_RDONLY);
134 232
135 /* We use a private mapping (ie. if we write to the page, it will be 233 /* We use a private mapping (ie. if we write to the page, it will be
136 * copied), and obviously we insist that it be mapped where we ask. */ 234 * copied). */
137 if (mmap((void *)addr, getpagesize() * num, 235 addr = mmap(NULL, getpagesize() * num,
138 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0) 236 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
139 != (void *)addr) 237 if (addr == MAP_FAILED)
140 err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr); 238 err(1, "Mmaping %u pages of /dev/zero", num);
141 239
142 /* Returning the address is just a courtesy: can simplify callers. */ 240 return addr;
143 return (void *)addr;
144} 241}
145 242
146/* To find out where to start we look for the magic Guest string, which marks 243/* Get some more pages for a device. */
147 * the code we see in lguest_asm.S. This is a hack which we are currently 244static void *get_pages(unsigned int num)
148 * plotting to replace with the normal Linux entry point. */
149static unsigned long entry_point(void *start, void *end,
150 unsigned long page_offset)
151{ 245{
152 void *p; 246 void *addr = from_guest_phys(guest_limit);
153 247
154 /* The scan gives us the physical starting address. We want the 248 guest_limit += num * getpagesize();
155 * virtual address in this case, and fortunately, we already figured 249 if (guest_limit > guest_max)
156 * out the physical-virtual difference and passed it here in 250 errx(1, "Not enough memory for devices");
157 * "page_offset". */ 251 return addr;
158 for (p = start; p < end; p++) 252}
159 if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
160 return (long)p + strlen("GenuineLguest") + page_offset;
161 253
162 err(1, "Is this image a genuine lguest?"); 254/* This routine is used to load the kernel or initrd. It tries mmap, but if
255 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
256 * it falls back to reading the memory in. */
257static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
258{
259 ssize_t r;
260
261 /* We map writable even though some segments are marked read-only.
262 * The kernel really wants to be writable: it patches its own
263 * instructions.
264 *
265 * MAP_PRIVATE means that the page won't be copied until a write is
266 * done to it. This allows us to share untouched memory between
267 * Guests. */
268 if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
269 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
270 return;
271
272 /* pread does a seek and a read in one shot: saves a few lines. */
273 r = pread(fd, addr, len, offset);
274 if (r != len)
275 err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
163} 276}
164 277
165/* This routine takes an open vmlinux image, which is in ELF, and maps it into 278/* This routine takes an open vmlinux image, which is in ELF, and maps it into
@@ -167,19 +280,14 @@ static unsigned long entry_point(void *start, void *end,
167 * by all modern binaries on Linux including the kernel. 280 * by all modern binaries on Linux including the kernel.
168 * 281 *
169 * The ELF headers give *two* addresses: a physical address, and a virtual 282 * The ELF headers give *two* addresses: a physical address, and a virtual
170 * address. The Guest kernel expects to be placed in memory at the physical 283 * address. We use the physical address; the Guest will map itself to the
171 * address, and the page tables set up so it will correspond to that virtual 284 * virtual address.
172 * address. We return the difference between the virtual and physical
173 * addresses in the "page_offset" pointer.
174 * 285 *
175 * We return the starting address. */ 286 * We return the starting address. */
176static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, 287static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
177 unsigned long *page_offset)
178{ 288{
179 void *addr;
180 Elf32_Phdr phdr[ehdr->e_phnum]; 289 Elf32_Phdr phdr[ehdr->e_phnum];
181 unsigned int i; 290 unsigned int i;
182 unsigned long start = -1UL, end = 0;
183 291
184 /* Sanity checks on the main ELF header: an x86 executable with a 292 /* Sanity checks on the main ELF header: an x86 executable with a
185 * reasonable number of correctly-sized program headers. */ 293 * reasonable number of correctly-sized program headers. */
@@ -199,9 +307,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
199 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) 307 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
200 err(1, "Reading program headers"); 308 err(1, "Reading program headers");
201 309
202 /* We don't know page_offset yet. */
203 *page_offset = 0;
204
205 /* Try all the headers: there are usually only three. A read-only one, 310 /* Try all the headers: there are usually only three. A read-only one,
206 * a read-write one, and a "note" section which isn't loadable. */ 311 * a read-write one, and a "note" section which isn't loadable. */
207 for (i = 0; i < ehdr->e_phnum; i++) { 312 for (i = 0; i < ehdr->e_phnum; i++) {
@@ -212,158 +317,53 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
212 verbose("Section %i: size %i addr %p\n", 317 verbose("Section %i: size %i addr %p\n",
213 i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); 318 i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
214 319
215 /* We expect a simple linear address space: every segment must 320 /* We map this section of the file at its physical address. */
216 * have the same difference between virtual (p_vaddr) and 321 map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
217 * physical (p_paddr) address. */ 322 phdr[i].p_offset, phdr[i].p_filesz);
218 if (!*page_offset)
219 *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
220 else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
221 errx(1, "Page offset of section %i different", i);
222
223 /* We track the first and last address we mapped, so we can
224 * tell entry_point() where to scan. */
225 if (phdr[i].p_paddr < start)
226 start = phdr[i].p_paddr;
227 if (phdr[i].p_paddr + phdr[i].p_filesz > end)
228 end = phdr[i].p_paddr + phdr[i].p_filesz;
229
230 /* We map this section of the file at its physical address. We
231 * map it read & write even if the header says this segment is
232 * read-only. The kernel really wants to be writable: it
233 * patches its own instructions which would normally be
234 * read-only.
235 *
236 * MAP_PRIVATE means that the page won't be copied until a
237 * write is done to it. This allows us to share much of the
238 * kernel memory between Guests. */
239 addr = mmap((void *)phdr[i].p_paddr,
240 phdr[i].p_filesz,
241 PROT_READ|PROT_WRITE|PROT_EXEC,
242 MAP_FIXED|MAP_PRIVATE,
243 elf_fd, phdr[i].p_offset);
244 if (addr != (void *)phdr[i].p_paddr)
245 err(1, "Mmaping vmlinux seg %i gave %p not %p",
246 i, addr, (void *)phdr[i].p_paddr);
247 } 323 }
248 324
249 return entry_point((void *)start, (void *)end, *page_offset); 325 /* The entry point is given in the ELF header. */
326 return ehdr->e_entry;
250} 327}
251 328
252/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated. 329/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
253 * 330 * supposed to jump into it and it will unpack itself. We used to have to
254 * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects 331 * perform some hairy magic because the unpacking code scared me.
255 * to be. We don't know what that option was, but we can figure it out
256 * approximately by looking at the addresses in the code. I chose the common
257 * case of reading a memory location into the %eax register:
258 *
259 * movl <some-address>, %eax
260 *
261 * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example,
262 * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
263 *
264 * In this example can guess that the kernel was compiled with
265 * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the
266 * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
267 * kernel isn't that bloated yet.
268 *
269 * Unfortunately, x86 has variable-length instructions, so finding this
270 * particular instruction properly involves writing a disassembler. Instead,
271 * we rely on statistics. We look for "0xA1" and tally the different bytes
272 * which occur 4 bytes later (the "0xC0" in our example above). When one of
273 * those bytes appears three times, we can be reasonably confident that it
274 * forms the start of CONFIG_PAGE_OFFSET.
275 * 332 *
276 * This is amazingly reliable. */ 333 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
277static unsigned long intuit_page_offset(unsigned char *img, unsigned long len) 334 * a small patch to jump over the tricky bits in the Guest, so now we just read
335 * the funky header so we know where in the file to load, and away we go! */
336static unsigned long load_bzimage(int fd)
278{ 337{
279 unsigned int i, possibilities[256] = { 0 }; 338 struct boot_params boot;
339 int r;
340 /* Modern bzImages get loaded at 1M. */
341 void *p = from_guest_phys(0x100000);
280 342
281 for (i = 0; i + 4 < len; i++) { 343 /* Go back to the start of the file and read the header. It should be
282 /* mov 0xXXXXXXXX,%eax */ 344 * a Linux boot header (see Documentation/i386/boot.txt) */
283 if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3) 345 lseek(fd, 0, SEEK_SET);
284 return (unsigned long)img[i+4] << 24; 346 read(fd, &boot, sizeof(boot));
285 }
286 errx(1, "could not determine page offset");
287}
288 347
289/*L:160 Unfortunately the entire ELF image isn't compressed: the segments 348 /* Inside the setup_hdr, we expect the magic "HdrS" */
290 * which need loading are extracted and compressed raw. This denies us the 349 if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
291 * information we need to make a fully-general loader. */ 350 errx(1, "This doesn't look like a bzImage to me");
292static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
293{
294 gzFile f;
295 int ret, len = 0;
296 /* A bzImage always gets loaded at physical address 1M. This is
297 * actually configurable as CONFIG_PHYSICAL_START, but as the comment
298 * there says, "Don't change this unless you know what you are doing".
299 * Indeed. */
300 void *img = (void *)0x100000;
301
302 /* gzdopen takes our file descriptor (carefully placed at the start of
303 * the GZIP header we found) and returns a gzFile. */
304 f = gzdopen(fd, "rb");
305 /* We read it into memory in 64k chunks until we hit the end. */
306 while ((ret = gzread(f, img + len, 65536)) > 0)
307 len += ret;
308 if (ret < 0)
309 err(1, "reading image from bzImage");
310
311 verbose("Unpacked size %i addr %p\n", len, img);
312
313 /* Without the ELF header, we can't tell virtual-physical gap. This is
314 * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately,
315 * I have a clever way of figuring it out from the code itself. */
316 *page_offset = intuit_page_offset(img, len);
317
318 return entry_point(img, img + len, *page_offset);
319}
320 351
321/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're 352 /* Skip over the extra sectors of the header. */
322 * supposed to jump into it and it will unpack itself. We can't do that 353 lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
323 * because the Guest can't run the unpacking code, and adding features to 354
324 * lguest kills puppies, so we don't want to. 355 /* Now read everything into memory, in nice big chunks. */
325 * 356 while ((r = read(fd, p, 65536)) > 0)
326 * The bzImage is formed by putting the decompressing code in front of the 357 p += r;
327 * compressed kernel code. So we can simple scan through it looking for the 358
328 * first "gzip" header, and start decompressing from there. */ 359 /* Finally, code32_start tells us where to enter the kernel. */
329static unsigned long load_bzimage(int fd, unsigned long *page_offset) 360 return boot.hdr.code32_start;
330{
331 unsigned char c;
332 int state = 0;
333
334 /* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */
335 while (read(fd, &c, 1) == 1) {
336 switch (state) {
337 case 0:
338 if (c == 0x1F)
339 state++;
340 break;
341 case 1:
342 if (c == 0x8B)
343 state++;
344 else
345 state = 0;
346 break;
347 case 2 ... 8:
348 state++;
349 break;
350 case 9:
351 /* Seek back to the start of the gzip header. */
352 lseek(fd, -10, SEEK_CUR);
353 /* One final check: "compressed under UNIX". */
354 if (c != 0x03)
355 state = -1;
356 else
357 return unpack_bzimage(fd, page_offset);
358 }
359 }
360 errx(1, "Could not find kernel in bzImage");
361} 361}
362 362
363/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels 363/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
364 * come wrapped up in the self-decompressing "bzImage" format. With some funky 364 * come wrapped up in the self-decompressing "bzImage" format. With some funky
365 * coding, we can load those, too. */ 365 * coding, we can load those, too. */
366static unsigned long load_kernel(int fd, unsigned long *page_offset) 366static unsigned long load_kernel(int fd)
367{ 367{
368 Elf32_Ehdr hdr; 368 Elf32_Ehdr hdr;
369 369
@@ -373,10 +373,10 @@ static unsigned long load_kernel(int fd, unsigned long *page_offset)
373 373
374 /* If it's an ELF file, it starts with "\177ELF" */ 374 /* If it's an ELF file, it starts with "\177ELF" */
375 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) 375 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
376 return map_elf(fd, &hdr, page_offset); 376 return map_elf(fd, &hdr);
377 377
378 /* Otherwise we assume it's a bzImage, and try to unpack it */ 378 /* Otherwise we assume it's a bzImage, and try to unpack it */
379 return load_bzimage(fd, page_offset); 379 return load_bzimage(fd);
380} 380}
381 381
382/* This is a trivial little helper to align pages. Andi Kleen hated it because 382/* This is a trivial little helper to align pages. Andi Kleen hated it because
@@ -402,59 +402,45 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
402 int ifd; 402 int ifd;
403 struct stat st; 403 struct stat st;
404 unsigned long len; 404 unsigned long len;
405 void *iaddr;
406 405
407 ifd = open_or_die(name, O_RDONLY); 406 ifd = open_or_die(name, O_RDONLY);
408 /* fstat() is needed to get the file size. */ 407 /* fstat() is needed to get the file size. */
409 if (fstat(ifd, &st) < 0) 408 if (fstat(ifd, &st) < 0)
410 err(1, "fstat() on initrd '%s'", name); 409 err(1, "fstat() on initrd '%s'", name);
411 410
412 /* The length needs to be rounded up to a page size: mmap needs the 411 /* We map the initrd at the top of memory, but mmap wants it to be
413 * address to be page aligned. */ 412 * page-aligned, so we round the size up for that. */
414 len = page_align(st.st_size); 413 len = page_align(st.st_size);
415 /* We map the initrd at the top of memory. */ 414 map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
416 iaddr = mmap((void *)mem - len, st.st_size,
417 PROT_READ|PROT_EXEC|PROT_WRITE,
418 MAP_FIXED|MAP_PRIVATE, ifd, 0);
419 if (iaddr != (void *)mem - len)
420 err(1, "Mmaping initrd '%s' returned %p not %p",
421 name, iaddr, (void *)mem - len);
422 /* Once a file is mapped, you can close the file descriptor. It's a 415 /* Once a file is mapped, you can close the file descriptor. It's a
423 * little odd, but quite useful. */ 416 * little odd, but quite useful. */
424 close(ifd); 417 close(ifd);
425 verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr); 418 verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
426 419
427 /* We return the initrd size. */ 420 /* We return the initrd size. */
428 return len; 421 return len;
429} 422}
430 423
431/* Once we know how much memory we have, and the address the Guest kernel 424/* Once we know how much memory we have, we can construct simple linear page
432 * expects, we can construct simple linear page tables which will get the Guest 425 * tables which set virtual == physical which will get the Guest far enough
433 * far enough into the boot to create its own. 426 * into the boot to create its own.
434 * 427 *
435 * We lay them out of the way, just below the initrd (which is why we need to 428 * We lay them out of the way, just below the initrd (which is why we need to
436 * know its size). */ 429 * know its size). */
437static unsigned long setup_pagetables(unsigned long mem, 430static unsigned long setup_pagetables(unsigned long mem,
438 unsigned long initrd_size, 431 unsigned long initrd_size)
439 unsigned long page_offset)
440{ 432{
441 u32 *pgdir, *linear; 433 unsigned long *pgdir, *linear;
442 unsigned int mapped_pages, i, linear_pages; 434 unsigned int mapped_pages, i, linear_pages;
443 unsigned int ptes_per_page = getpagesize()/sizeof(u32); 435 unsigned int ptes_per_page = getpagesize()/sizeof(void *);
444 436
445 /* Ideally we map all physical memory starting at page_offset. 437 mapped_pages = mem/getpagesize();
446 * However, if page_offset is 0xC0000000 we can only map 1G of physical
447 * (0xC0000000 + 1G overflows). */
448 if (mem <= -page_offset)
449 mapped_pages = mem/getpagesize();
450 else
451 mapped_pages = -page_offset/getpagesize();
452 438
453 /* Each PTE page can map ptes_per_page pages: how many do we need? */ 439 /* Each PTE page can map ptes_per_page pages: how many do we need? */
454 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; 440 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
455 441
456 /* We put the toplevel page directory page at the top of memory. */ 442 /* We put the toplevel page directory page at the top of memory. */
457 pgdir = (void *)mem - initrd_size - getpagesize(); 443 pgdir = from_guest_phys(mem) - initrd_size - getpagesize();
458 444
459 /* Now we use the next linear_pages pages as pte pages */ 445 /* Now we use the next linear_pages pages as pte pages */
460 linear = (void *)pgdir - linear_pages*getpagesize(); 446 linear = (void *)pgdir - linear_pages*getpagesize();
@@ -465,20 +451,19 @@ static unsigned long setup_pagetables(unsigned long mem,
465 for (i = 0; i < mapped_pages; i++) 451 for (i = 0; i < mapped_pages; i++)
466 linear[i] = ((i * getpagesize()) | PAGE_PRESENT); 452 linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
467 453
468 /* The top level points to the linear page table pages above. The 454 /* The top level points to the linear page table pages above. */
469 * entry representing page_offset points to the first one, and they
470 * continue from there. */
471 for (i = 0; i < mapped_pages; i += ptes_per_page) { 455 for (i = 0; i < mapped_pages; i += ptes_per_page) {
472 pgdir[(i + page_offset/getpagesize())/ptes_per_page] 456 pgdir[i/ptes_per_page]
473 = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT); 457 = ((to_guest_phys(linear) + i*sizeof(void *))
458 | PAGE_PRESENT);
474 } 459 }
475 460
476 verbose("Linear mapping of %u pages in %u pte pages at %p\n", 461 verbose("Linear mapping of %u pages in %u pte pages at %#lx\n",
477 mapped_pages, linear_pages, linear); 462 mapped_pages, linear_pages, to_guest_phys(linear));
478 463
479 /* We return the top level (guest-physical) address: the kernel needs 464 /* We return the top level (guest-physical) address: the kernel needs
480 * to know where it is. */ 465 * to know where it is. */
481 return (unsigned long)pgdir; 466 return to_guest_phys(pgdir);
482} 467}
483 468
484/* Simple routine to roll all the commandline arguments together with spaces 469/* Simple routine to roll all the commandline arguments together with spaces
@@ -498,14 +483,17 @@ static void concat(char *dst, char *args[])
498 483
499/* This is where we actually tell the kernel to initialize the Guest. We saw 484/* This is where we actually tell the kernel to initialize the Guest. We saw
500 * the arguments it expects when we looked at initialize() in lguest_user.c: 485 * the arguments it expects when we looked at initialize() in lguest_user.c:
501 * the top physical page to allow, the top level pagetable, the entry point and 486 * the base of guest "physical" memory, the top physical page to allow, the
502 * the page_offset constant for the Guest. */ 487 * top level pagetable and the entry point for the Guest. */
503static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) 488static int tell_kernel(unsigned long pgdir, unsigned long start)
504{ 489{
505 u32 args[] = { LHREQ_INITIALIZE, 490 unsigned long args[] = { LHREQ_INITIALIZE,
506 top/getpagesize(), pgdir, start, page_offset }; 491 (unsigned long)guest_base,
492 guest_limit / getpagesize(), pgdir, start };
507 int fd; 493 int fd;
508 494
495 verbose("Guest: %p - %p (%#lx)\n",
496 guest_base, guest_base + guest_limit, guest_limit);
509 fd = open_or_die("/dev/lguest", O_RDWR); 497 fd = open_or_die("/dev/lguest", O_RDWR);
510 if (write(fd, args, sizeof(args)) < 0) 498 if (write(fd, args, sizeof(args)) < 0)
511 err(1, "Writing to /dev/lguest"); 499 err(1, "Writing to /dev/lguest");
@@ -515,11 +503,11 @@ static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
515} 503}
516/*:*/ 504/*:*/
517 505
518static void set_fd(int fd, struct device_list *devices) 506static void add_device_fd(int fd)
519{ 507{
520 FD_SET(fd, &devices->infds); 508 FD_SET(fd, &devices.infds);
521 if (fd > devices->max_infd) 509 if (fd > devices.max_infd)
522 devices->max_infd = fd; 510 devices.max_infd = fd;
523} 511}
524 512
525/*L:200 513/*L:200
@@ -537,36 +525,38 @@ static void set_fd(int fd, struct device_list *devices)
537 * 525 *
538 * This, of course, is merely a different *kind* of icky. 526 * This, of course, is merely a different *kind* of icky.
539 */ 527 */
540static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices) 528static void wake_parent(int pipefd, int lguest_fd)
541{ 529{
542 /* Add the pipe from the Launcher to the fdset in the device_list, so 530 /* Add the pipe from the Launcher to the fdset in the device_list, so
543 * we watch it, too. */ 531 * we watch it, too. */
544 set_fd(pipefd, devices); 532 add_device_fd(pipefd);
545 533
546 for (;;) { 534 for (;;) {
547 fd_set rfds = devices->infds; 535 fd_set rfds = devices.infds;
548 u32 args[] = { LHREQ_BREAK, 1 }; 536 unsigned long args[] = { LHREQ_BREAK, 1 };
549 537
550 /* Wait until input is ready from one of the devices. */ 538 /* Wait until input is ready from one of the devices. */
551 select(devices->max_infd+1, &rfds, NULL, NULL, NULL); 539 select(devices.max_infd+1, &rfds, NULL, NULL, NULL);
552 /* Is it a message from the Launcher? */ 540 /* Is it a message from the Launcher? */
553 if (FD_ISSET(pipefd, &rfds)) { 541 if (FD_ISSET(pipefd, &rfds)) {
554 int ignorefd; 542 int fd;
555 /* If read() returns 0, it means the Launcher has 543 /* If read() returns 0, it means the Launcher has
556 * exited. We silently follow. */ 544 * exited. We silently follow. */
557 if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0) 545 if (read(pipefd, &fd, sizeof(fd)) == 0)
558 exit(0); 546 exit(0);
559 /* Otherwise it's telling us there's a problem with one 547 /* Otherwise it's telling us to change what file
560 * of the devices, and we should ignore that file 548 * descriptors we're to listen to. */
561 * descriptor from now on. */ 549 if (fd >= 0)
562 FD_CLR(ignorefd, &devices->infds); 550 FD_SET(fd, &devices.infds);
551 else
552 FD_CLR(-fd - 1, &devices.infds);
563 } else /* Send LHREQ_BREAK command. */ 553 } else /* Send LHREQ_BREAK command. */
564 write(lguest_fd, args, sizeof(args)); 554 write(lguest_fd, args, sizeof(args));
565 } 555 }
566} 556}
567 557
568/* This routine just sets up a pipe to the Waker process. */ 558/* This routine just sets up a pipe to the Waker process. */
569static int setup_waker(int lguest_fd, struct device_list *device_list) 559static int setup_waker(int lguest_fd)
570{ 560{
571 int pipefd[2], child; 561 int pipefd[2], child;
572 562
@@ -580,7 +570,7 @@ static int setup_waker(int lguest_fd, struct device_list *device_list)
580 if (child == 0) { 570 if (child == 0) {
581 /* Close the "writing" end of our copy of the pipe */ 571 /* Close the "writing" end of our copy of the pipe */
582 close(pipefd[1]); 572 close(pipefd[1]);
583 wake_parent(pipefd[0], lguest_fd, device_list); 573 wake_parent(pipefd[0], lguest_fd);
584 } 574 }
585 /* Close the reading end of our copy of the pipe. */ 575 /* Close the reading end of our copy of the pipe. */
586 close(pipefd[0]); 576 close(pipefd[0]);
@@ -602,83 +592,128 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
602{ 592{
603 /* We have to separately check addr and addr+size, because size could 593 /* We have to separately check addr and addr+size, because size could
604 * be huge and addr + size might wrap around. */ 594 * be huge and addr + size might wrap around. */
605 if (addr >= top || addr + size >= top) 595 if (addr >= guest_limit || addr + size >= guest_limit)
606 errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr); 596 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
607 /* We return a pointer for the caller's convenience, now we know it's 597 /* We return a pointer for the caller's convenience, now we know it's
608 * safe to use. */ 598 * safe to use. */
609 return (void *)addr; 599 return from_guest_phys(addr);
610} 600}
611/* A macro which transparently hands the line number to the real function. */ 601/* A macro which transparently hands the line number to the real function. */
612#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 602#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
613 603
614/* The Guest has given us the address of a "struct lguest_dma". We check it's 604/* This function returns the next descriptor in the chain, or vq->vring.num. */
615 * OK and convert it to an iovec (which is a simple array of ptr/size 605static unsigned next_desc(struct virtqueue *vq, unsigned int i)
616 * pairs). */
617static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
618{ 606{
619 unsigned int i; 607 unsigned int next;
620 struct lguest_dma *udma;
621
622 /* First we make sure that the array memory itself is valid. */
623 udma = check_pointer(dma, sizeof(*udma));
624 /* Now we check each element */
625 for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
626 /* A zero length ends the array. */
627 if (!udma->len[i])
628 break;
629 608
630 iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]); 609 /* If this descriptor says it doesn't chain, we're done. */
631 iov[i].iov_len = udma->len[i]; 610 if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
632 } 611 return vq->vring.num;
633 *num = i; 612
613 /* Check they're not leading us off end of descriptors. */
614 next = vq->vring.desc[i].next;
615 /* Make sure compiler knows to grab that: we don't want it changing! */
616 wmb();
634 617
635 /* We return the pointer to where the caller should write the amount of 618 if (next >= vq->vring.num)
636 * the buffer used. */ 619 errx(1, "Desc next is %u", next);
637 return &udma->used_len; 620
621 return next;
622}
623
624 /* This looks in the virtqueue for the first available buffer, and converts
625 * it to an iovec for convenient access. Since descriptors consist of some
626 * number of output then some number of input descriptors, it's actually two
627 * iovecs, but we pack them into one and note how many of each there were.
628 *
629 * This function returns the descriptor number found, or vq->vring.num (which
630 * is never a valid descriptor number) if none was found. */
631static unsigned get_vq_desc(struct virtqueue *vq,
632 struct iovec iov[],
633 unsigned int *out_num, unsigned int *in_num)
634{
635 unsigned int i, head;
636
637 /* Check it isn't doing very strange things with descriptor numbers. */
638 if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num)
639 errx(1, "Guest moved used index from %u to %u",
640 vq->last_avail_idx, vq->vring.avail->idx);
641
642 /* If there's nothing new since last we looked, return invalid. */
643 if (vq->vring.avail->idx == vq->last_avail_idx)
644 return vq->vring.num;
645
646 /* Grab the next descriptor number they're advertising, and increment
647 * the index we've seen. */
648 head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num];
649
650 /* If their number is silly, that's a fatal mistake. */
651 if (head >= vq->vring.num)
652 errx(1, "Guest says index %u is available", head);
653
654 /* When we start there are no input or output descriptors. */
655 *out_num = *in_num = 0;
656
657 i = head;
658 do {
659 /* Grab the first descriptor, and check it's OK. */
660 iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len;
661 iov[*out_num + *in_num].iov_base
662 = check_pointer(vq->vring.desc[i].addr,
663 vq->vring.desc[i].len);
664 /* If this is an input descriptor, increment that count. */
665 if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
666 (*in_num)++;
667 else {
668 /* If it's an output descriptor, they're all supposed
669 * to come before any input descriptors. */
670 if (*in_num)
671 errx(1, "Descriptor has out after in");
672 (*out_num)++;
673 }
674
675 /* If we've got too many, that implies a descriptor loop. */
676 if (*out_num + *in_num > vq->vring.num)
677 errx(1, "Looped descriptor");
678 } while ((i = next_desc(vq, i)) != vq->vring.num);
679
680 return head;
638} 681}
639 682
640/* This routine gets a DMA buffer from the Guest for a given key, and converts 683/* Once we've used one of their buffers, we tell them about it. We'll then
641 * it to an iovec array. It returns the interrupt the Guest wants when we're 684 * want to send them an interrupt, using trigger_irq(). */
642 * finished, and a pointer to the "used_len" field to fill in. */ 685static void add_used(struct virtqueue *vq, unsigned int head, int len)
643static u32 *get_dma_buffer(int fd, void *key,
644 struct iovec iov[], unsigned int *num, u32 *irq)
645{ 686{
646 u32 buf[] = { LHREQ_GETDMA, (u32)key }; 687 struct vring_used_elem *used;
647 unsigned long udma; 688
648 u32 *res; 689 /* Get a pointer to the next entry in the used ring. */
649 690 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
650 /* Ask the kernel for a DMA buffer corresponding to this key. */ 691 used->id = head;
651 udma = write(fd, buf, sizeof(buf)); 692 used->len = len;
652 /* They haven't registered any, or they're all used? */ 693 /* Make sure buffer is written before we update index. */
653 if (udma == (unsigned long)-1) 694 wmb();
654 return NULL; 695 vq->vring.used->idx++;
655
656 /* Convert it into our iovec array */
657 res = dma2iov(udma, iov, num);
658 /* The kernel stashes irq in ->used_len to get it out to us. */
659 *irq = *res;
660 /* Return a pointer to ((struct lguest_dma *)udma)->used_len. */
661 return res;
662} 696}
663 697
664/* This is a convenient routine to send the Guest an interrupt. */ 698/* This actually sends the interrupt for this virtqueue */
665static void trigger_irq(int fd, u32 irq) 699static void trigger_irq(int fd, struct virtqueue *vq)
666{ 700{
667 u32 buf[] = { LHREQ_IRQ, irq }; 701 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
702
703 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
704 return;
705
706 /* Send the Guest an interrupt to tell them we used something up. */
668 if (write(fd, buf, sizeof(buf)) != 0) 707 if (write(fd, buf, sizeof(buf)) != 0)
669 err(1, "Triggering irq %i", irq); 708 err(1, "Triggering irq %i", vq->config.irq);
670} 709}
671 710
672/* This simply sets up an iovec array where we can put data to be discarded. 711/* And here's the combo meal deal. Supersize me! */
673 * This happens when the Guest doesn't want or can't handle the input: we have 712static void add_used_and_trigger(int fd, struct virtqueue *vq,
674 * to get rid of it somewhere, and if we bury it in the ceiling space it will 713 unsigned int head, int len)
675 * start to smell after a week. */
676static void discard_iovec(struct iovec *iov, unsigned int *num)
677{ 714{
678 static char discard_buf[1024]; 715 add_used(vq, head, len);
679 *num = 1; 716 trigger_irq(fd, vq);
680 iov->iov_base = discard_buf;
681 iov->iov_len = sizeof(discard_buf);
682} 717}
683 718
684/* Here is the input terminal setting we save, and the routine to restore them 719/* Here is the input terminal setting we save, and the routine to restore them
@@ -701,38 +736,39 @@ struct console_abort
701/* This is the routine which handles console input (ie. stdin). */ 736/* This is the routine which handles console input (ie. stdin). */
702static bool handle_console_input(int fd, struct device *dev) 737static bool handle_console_input(int fd, struct device *dev)
703{ 738{
704 u32 irq = 0, *lenp;
705 int len; 739 int len;
706 unsigned int num; 740 unsigned int head, in_num, out_num;
707 struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; 741 struct iovec iov[dev->vq->vring.num];
708 struct console_abort *abort = dev->priv; 742 struct console_abort *abort = dev->priv;
709 743
710 /* First we get the console buffer from the Guest. The key is dev->mem 744 /* First we need a console buffer from the Guest's input virtqueue. */
711 * which was set to 0 in setup_console(). */ 745 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
712 lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq); 746
713 if (!lenp) { 747 /* If they're not ready for input, stop listening to this file
714 /* If it's not ready for input, warn and set up to discard. */ 748 * descriptor. We'll start again once they add an input buffer. */
715 warn("console: no dma buffer!"); 749 if (head == dev->vq->vring.num)
716 discard_iovec(iov, &num); 750 return false;
717 } 751
752 if (out_num)
753 errx(1, "Output buffers in console in queue?");
718 754
719 /* This is why we convert to iovecs: the readv() call uses them, and so 755 /* This is why we convert to iovecs: the readv() call uses them, and so
720 * it reads straight into the Guest's buffer. */ 756 * it reads straight into the Guest's buffer. */
721 len = readv(dev->fd, iov, num); 757 len = readv(dev->fd, iov, in_num);
722 if (len <= 0) { 758 if (len <= 0) {
723 /* This implies that the console is closed, is /dev/null, or 759 /* This implies that the console is closed, is /dev/null, or
724 * something went terribly wrong. We still go through the rest 760 * something went terribly wrong. */
725 * of the logic, though, especially the exit handling below. */
726 warnx("Failed to get console input, ignoring console."); 761 warnx("Failed to get console input, ignoring console.");
727 len = 0; 762 /* Put the input terminal back. */
763 restore_term();
764 /* Remove callback from input vq, so it doesn't restart us. */
765 dev->vq->handle_output = NULL;
766 /* Stop listening to this fd: don't call us again. */
767 return false;
728 } 768 }
729 769
730 /* If we read the data into the Guest, fill in the length and send the 770 /* Tell the Guest about the new input. */
731 * interrupt. */ 771 add_used_and_trigger(fd, dev->vq, head, len);
732 if (lenp) {
733 *lenp = len;
734 trigger_irq(fd, irq);
735 }
736 772
737 /* Three ^C within one second? Exit. 773 /* Three ^C within one second? Exit.
738 * 774 *
@@ -746,7 +782,7 @@ static bool handle_console_input(int fd, struct device *dev)
746 struct timeval now; 782 struct timeval now;
747 gettimeofday(&now, NULL); 783 gettimeofday(&now, NULL);
748 if (now.tv_sec <= abort->start.tv_sec+1) { 784 if (now.tv_sec <= abort->start.tv_sec+1) {
749 u32 args[] = { LHREQ_BREAK, 0 }; 785 unsigned long args[] = { LHREQ_BREAK, 0 };
750 /* Close the fd so Waker will know it has to 786 /* Close the fd so Waker will know it has to
751 * exit. */ 787 * exit. */
752 close(waker_fd); 788 close(waker_fd);
@@ -761,214 +797,163 @@ static bool handle_console_input(int fd, struct device *dev)
761 /* Any other key resets the abort counter. */ 797 /* Any other key resets the abort counter. */
762 abort->count = 0; 798 abort->count = 0;
763 799
764 /* Now, if we didn't read anything, put the input terminal back and
765 * return failure (meaning, don't call us again). */
766 if (!len) {
767 restore_term();
768 return false;
769 }
770 /* Everything went OK! */ 800 /* Everything went OK! */
771 return true; 801 return true;
772} 802}
773 803
774/* Handling console output is much simpler than input. */ 804/* Handling output for console is simple: we just get all the output buffers
775static u32 handle_console_output(int fd, const struct iovec *iov, 805 * and write them to stdout. */
776 unsigned num, struct device*dev) 806static void handle_console_output(int fd, struct virtqueue *vq)
777{ 807{
778 /* Whatever the Guest sends, write it to standard output. Return the 808 unsigned int head, out, in;
779 * number of bytes written. */ 809 int len;
780 return writev(STDOUT_FILENO, iov, num); 810 struct iovec iov[vq->vring.num];
781} 811
782 812 /* Keep getting output buffers from the Guest until we run out. */
783/* Guest->Host network output is also pretty easy. */ 813 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
784static u32 handle_tun_output(int fd, const struct iovec *iov, 814 if (in)
785 unsigned num, struct device *dev) 815 errx(1, "Input buffers in output queue?");
786{ 816 len = writev(STDOUT_FILENO, iov, out);
787 /* We put a flag in the "priv" pointer of the network device, and set 817 add_used_and_trigger(fd, vq, head, len);
788 * it as soon as we see output. We'll see why in handle_tun_input() */ 818 }
789 *(bool *)dev->priv = true;
790 /* Whatever packet the Guest sent us, write it out to the tun
791 * device. */
792 return writev(dev->fd, iov, num);
793} 819}
794 820
795/* This matches the peer_key() in lguest_net.c. The key for any given slot 821/* Handling output for network is also simple: we get all the output buffers
796 * is the address of the network device's page plus 4 * the slot number. */ 822 * and write them (ignoring the first element) to this device's file descriptor
797 static unsigned long peer_offset(unsigned int peernum) 823 * (the tun device). */
824static void handle_net_output(int fd, struct virtqueue *vq)
798{ 825{
799 return 4 * peernum; 826 unsigned int head, out, in;
827 int len;
828 struct iovec iov[vq->vring.num];
829
830 /* Keep getting output buffers from the Guest until we run out. */
831 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
832 if (in)
833 errx(1, "Input buffers in output queue?");
834 /* Check header, but otherwise ignore it (we said we supported
835 * no features). */
836 (void)convert(&iov[0], struct virtio_net_hdr);
837 len = writev(vq->dev->fd, iov+1, out-1);
838 add_used_and_trigger(fd, vq, head, len);
839 }
800} 840}
801 841
802/* This is where we handle a packet coming in from the tun device */ 842/* This is where we handle a packet coming in from the tun device to our
843 * Guest. */
803static bool handle_tun_input(int fd, struct device *dev) 844static bool handle_tun_input(int fd, struct device *dev)
804{ 845{
805 u32 irq = 0, *lenp; 846 unsigned int head, in_num, out_num;
806 int len; 847 int len;
807 unsigned num; 848 struct iovec iov[dev->vq->vring.num];
808 struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; 849 struct virtio_net_hdr *hdr;
809 850
810 /* First we get a buffer the Guest has bound to its key. */ 851 /* First we need a network buffer from the Guest's recv virtqueue. */
811 lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num, 852 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
812 &irq); 853 if (head == dev->vq->vring.num) {
813 if (!lenp) {
814 /* Now, it's expected that if we try to send a packet too 854 /* Now, it's expected that if we try to send a packet too
815 * early, the Guest won't be ready yet. This is why we set a 855 * early, the Guest won't be ready yet. Wait until the device
816 * flag when the Guest sends its first packet. If it's sent a 856 * status says it's ready. */
817 * packet we assume it should be ready to receive them. 857 /* FIXME: Actually want DRIVER_ACTIVE here. */
818 * 858 if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK)
819 * Actually, this is what the status bits in the descriptor are
820 * for: we should *use* them. FIXME! */
821 if (*(bool *)dev->priv)
822 warn("network: no dma buffer!"); 859 warn("network: no dma buffer!");
823 discard_iovec(iov, &num); 860 /* We'll turn this back on if input buffers are registered. */
824 } 861 return false;
862 } else if (out_num)
863 errx(1, "Output buffers in network recv queue?");
864
865 /* First element is the header: we set it to 0 (no features). */
866 hdr = convert(&iov[0], struct virtio_net_hdr);
867 hdr->flags = 0;
868 hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
825 869
826 /* Read the packet from the device directly into the Guest's buffer. */ 870 /* Read the packet from the device directly into the Guest's buffer. */
827 len = readv(dev->fd, iov, num); 871 len = readv(dev->fd, iov+1, in_num-1);
828 if (len <= 0) 872 if (len <= 0)
829 err(1, "reading network"); 873 err(1, "reading network");
830 874
831 /* Write the used_len, and trigger the interrupt for the Guest */ 875 /* Tell the Guest about the new packet. */
832 if (lenp) { 876 add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len);
833 *lenp = len; 877
834 trigger_irq(fd, irq);
835 }
836 verbose("tun input packet len %i [%02x %02x] (%s)\n", len, 878 verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
837 ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1], 879 ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
838 lenp ? "sent" : "discarded"); 880 head != dev->vq->vring.num ? "sent" : "discarded");
881
839 /* All good. */ 882 /* All good. */
840 return true; 883 return true;
841} 884}
842 885
843/* The last device handling routine is block output: the Guest has sent a DMA 886/* This callback ensures we try again, in case we stopped console or net
844 * to the block device. It will have placed the command it wants in the 887 * delivery because Guest didn't have any buffers. */
845 * "struct lguest_block_page". */ 888static void enable_fd(int fd, struct virtqueue *vq)
846static u32 handle_block_output(int fd, const struct iovec *iov,
847 unsigned num, struct device *dev)
848{ 889{
849 struct lguest_block_page *p = dev->mem; 890 add_device_fd(vq->dev->fd);
850 u32 irq, *lenp; 891 /* Tell waker to listen to it again */
851 unsigned int len, reply_num; 892 write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
852 struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
853 off64_t device_len, off = (off64_t)p->sector * 512;
854
855 /* First we extract the device length from the dev->priv pointer. */
856 device_len = *(off64_t *)dev->priv;
857
858 /* We first check that the read or write is within the length of the
859 * block file. */
860 if (off >= device_len)
861 err(1, "Bad offset %llu vs %llu", off, device_len);
862 /* Move to the right location in the block file. This shouldn't fail,
863 * but best to check. */
864 if (lseek64(dev->fd, off, SEEK_SET) != off)
865 err(1, "Bad seek to sector %i", p->sector);
866
867 verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
868
869 /* They were supposed to bind a reply buffer at key equal to the start
870 * of the block device memory. We need this to tell them when the
871 * request is finished. */
872 lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
873 if (!lenp)
874 err(1, "Block request didn't give us a dma buffer");
875
876 if (p->type) {
877 /* A write request. The DMA they sent contained the data, so
878 * write it out. */
879 len = writev(dev->fd, iov, num);
880 /* Grr... Now we know how long the "struct lguest_dma" they
881 * sent was, we make sure they didn't try to write over the end
882 * of the block file (possibly extending it). */
883 if (off + len > device_len) {
884 /* Trim it back to the correct length */
885 ftruncate64(dev->fd, device_len);
886 /* Die, bad Guest, die. */
887 errx(1, "Write past end %llu+%u", off, len);
888 }
889 /* The reply length is 0: we just send back an empty DMA to
890 * interrupt them and tell them the write is finished. */
891 *lenp = 0;
892 } else {
893 /* A read request. They sent an empty DMA to start the
894 * request, and we put the read contents into the reply
895 * buffer. */
896 len = readv(dev->fd, reply, reply_num);
897 *lenp = len;
898 }
899
900 /* The result is 1 (done), 2 if there was an error (short read or
901 * write). */
902 p->result = 1 + (p->bytes != len);
903 /* Now tell them we've used their reply buffer. */
904 trigger_irq(fd, irq);
905
906 /* We're supposed to return the number of bytes of the output buffer we
907 * used. But the block device uses the "result" field instead, so we
908 * don't bother. */
909 return 0;
910} 893}
911 894
912/* This is the generic routine we call when the Guest sends some DMA out. */ 895/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
913static void handle_output(int fd, unsigned long dma, unsigned long key, 896static void handle_output(int fd, unsigned long addr)
914 struct device_list *devices)
915{ 897{
916 struct device *i; 898 struct device *i;
917 u32 *lenp; 899 struct virtqueue *vq;
918 struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; 900
919 unsigned num = 0; 901 /* Check each virtqueue. */
920 902 for (i = devices.dev; i; i = i->next) {
921 /* Convert the "struct lguest_dma" they're sending to a "struct 903 for (vq = i->vq; vq; vq = vq->next) {
922 * iovec". */ 904 if (vq->config.pfn == addr/getpagesize()
923 lenp = dma2iov(dma, iov, &num); 905 && vq->handle_output) {
924 906 verbose("Output to %s\n", vq->dev->name);
925 /* Check each device: if they expect output to this key, tell them to 907 vq->handle_output(fd, vq);
926 * handle it. */ 908 return;
927 for (i = devices->dev; i; i = i->next) { 909 }
928 if (i->handle_output && key == i->watch_key) {
929 /* We write the result straight into the used_len field
930 * for them. */
931 *lenp = i->handle_output(fd, iov, num, i);
932 return;
933 } 910 }
934 } 911 }
935 912
936 /* This can happen: the kernel sends any SEND_DMA which doesn't match 913 /* Early console write is done using notify on a nul-terminated string
937 * another Guest to us. It could be that another Guest just left a 914 * in Guest memory. */
938 * network, for example. But it's unusual. */ 915 if (addr >= guest_limit)
939 warnx("Pending dma %p, key %p", (void *)dma, (void *)key); 916 errx(1, "Bad NOTIFY %#lx", addr);
917
918 write(STDOUT_FILENO, from_guest_phys(addr),
919 strnlen(from_guest_phys(addr), guest_limit - addr));
940} 920}
941 921
942/* This is called when the waker wakes us up: check for incoming file 922/* This is called when the waker wakes us up: check for incoming file
943 * descriptors. */ 923 * descriptors. */
944static void handle_input(int fd, struct device_list *devices) 924static void handle_input(int fd)
945{ 925{
946 /* select() wants a zeroed timeval to mean "don't wait". */ 926 /* select() wants a zeroed timeval to mean "don't wait". */
947 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; 927 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
948 928
949 for (;;) { 929 for (;;) {
950 struct device *i; 930 struct device *i;
951 fd_set fds = devices->infds; 931 fd_set fds = devices.infds;
952 932
953 /* If nothing is ready, we're done. */ 933 /* If nothing is ready, we're done. */
954 if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0) 934 if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
955 break; 935 break;
956 936
957 /* Otherwise, call the device(s) which have readable 937 /* Otherwise, call the device(s) which have readable
958 * file descriptors and a method of handling them. */ 938 * file descriptors and a method of handling them. */
959 for (i = devices->dev; i; i = i->next) { 939 for (i = devices.dev; i; i = i->next) {
960 if (i->handle_input && FD_ISSET(i->fd, &fds)) { 940 if (i->handle_input && FD_ISSET(i->fd, &fds)) {
941 int dev_fd;
942 if (i->handle_input(fd, i))
943 continue;
944
961 /* If handle_input() returns false, it means we 945 /* If handle_input() returns false, it means we
962 * should no longer service it. 946 * should no longer service it. Networking and
963 * handle_console_input() does this. */ 947 * console do this when there's no input
964 if (!i->handle_input(fd, i)) { 948 * buffers to deliver into. Console also uses
965 /* Clear it from the set of input file 949 * it when it discovers that stdin is
966 * descriptors kept at the head of the 950 * closed. */
967 * device list. */ 951 FD_CLR(i->fd, &devices.infds);
968 FD_CLR(i->fd, &devices->infds); 952 /* Tell waker to ignore it too, by sending a
969 /* Tell waker to ignore it too... */ 953 * negative fd number (-1, since 0 is a valid
970 write(waker_fd, &i->fd, sizeof(i->fd)); 954 * FD number). */
971 } 955 dev_fd = -i->fd - 1;
956 write(waker_fd, &dev_fd, sizeof(dev_fd));
972 } 957 }
973 } 958 }
974 } 959 }
@@ -982,43 +967,93 @@ static void handle_input(int fd, struct device_list *devices)
982 * routines to allocate them. 967 * routines to allocate them.
983 * 968 *
984 * This routine allocates a new "struct lguest_device_desc" from descriptor 969 * This routine allocates a new "struct lguest_device_desc" from descriptor
985 * table in the devices array just above the Guest's normal memory. */ 970 * table just above the Guest's normal memory. It returns a pointer to that
986static struct lguest_device_desc * 971 * descriptor. */
987new_dev_desc(struct lguest_device_desc *descs, 972static struct lguest_device_desc *new_dev_desc(u16 type)
988 u16 type, u16 features, u16 num_pages)
989{ 973{
990 unsigned int i; 974 struct lguest_device_desc *d;
991 975
992 for (i = 0; i < LGUEST_MAX_DEVICES; i++) { 976 /* We only have one page for all the descriptors. */
993 if (!descs[i].type) { 977 if (devices.desc_used + sizeof(*d) > getpagesize())
994 descs[i].type = type; 978 errx(1, "Too many devices");
995 descs[i].features = features; 979
996 descs[i].num_pages = num_pages; 980 /* We don't need to set config_len or status: page is 0 already. */
997 /* If they said the device needs memory, we allocate 981 d = (void *)devices.descpage + devices.desc_used;
998 * that now, bumping up the top of Guest memory. */ 982 d->type = type;
999 if (num_pages) { 983 devices.desc_used += sizeof(*d);
1000 map_zeroed_pages(top, num_pages); 984
1001 descs[i].pfn = top/getpagesize(); 985 return d;
1002 top += num_pages*getpagesize();
1003 }
1004 return &descs[i];
1005 }
1006 }
1007 errx(1, "too many devices");
1008} 986}
1009 987
1010/* This monster routine does all the creation and setup of a new device, 988/* Each device descriptor is followed by some configuration information.
1011 * including caling new_dev_desc() to allocate the descriptor and device 989 * The first byte is a "status" byte for the Guest to report what's happening.
1012 * memory. */ 990 * After that are fields: u8 type, u8 len, [... len bytes...].
1013static struct device *new_device(struct device_list *devices, 991 *
1014 u16 type, u16 num_pages, u16 features, 992 * This routine adds a new field to an existing device's descriptor. It only
1015 int fd, 993 * works for the last device, but that's OK because that's how we use it. */
1016 bool (*handle_input)(int, struct device *), 994static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c)
1017 unsigned long watch_off, 995{
1018 u32 (*handle_output)(int, 996 /* This is the last descriptor, right? */
1019 const struct iovec *, 997 assert(devices.descpage + devices.desc_used
1020 unsigned, 998 == (u8 *)(dev->desc + 1) + dev->desc->config_len);
1021 struct device *)) 999
1000 /* We only have one page of device descriptions. */
1001 if (devices.desc_used + 2 + len > getpagesize())
1002 errx(1, "Too many devices");
1003
1004 /* Copy in the new config header: type then length. */
1005 devices.descpage[devices.desc_used++] = type;
1006 devices.descpage[devices.desc_used++] = len;
1007 memcpy(devices.descpage + devices.desc_used, c, len);
1008 devices.desc_used += len;
1009
1010 /* Update the device descriptor length: two byte head then data. */
1011 dev->desc->config_len += 2 + len;
1012}
1013
1014/* This routine adds a virtqueue to a device. We specify how many descriptors
1015 * the virtqueue is to have. */
1016static void add_virtqueue(struct device *dev, unsigned int num_descs,
1017 void (*handle_output)(int fd, struct virtqueue *me))
1018{
1019 unsigned int pages;
1020 struct virtqueue **i, *vq = malloc(sizeof(*vq));
1021 void *p;
1022
1023 /* First we need some pages for this virtqueue. */
1024 pages = (vring_size(num_descs) + getpagesize() - 1) / getpagesize();
1025 p = get_pages(pages);
1026
1027 /* Initialize the configuration. */
1028 vq->config.num = num_descs;
1029 vq->config.irq = devices.next_irq++;
1030 vq->config.pfn = to_guest_phys(p) / getpagesize();
1031
1032 /* Initialize the vring. */
1033 vring_init(&vq->vring, num_descs, p);
1034
1035 /* Add the configuration information to this device's descriptor. */
1036 add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE,
1037 sizeof(vq->config), &vq->config);
1038
1039 /* Add to tail of list, so dev->vq is first vq, dev->vq->next is
1040 * second. */
1041 for (i = &dev->vq; *i; i = &(*i)->next);
1042 *i = vq;
1043
1044 /* Link virtqueue back to device. */
1045 vq->dev = dev;
1046
1047 /* Set up handler. */
1048 vq->handle_output = handle_output;
1049 if (!handle_output)
1050 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
1051}
1052
1053/* This routine does all the creation and setup of a new device, including
1054 * caling new_dev_desc() to allocate the descriptor and device memory. */
1055static struct device *new_device(const char *name, u16 type, int fd,
1056 bool (*handle_input)(int, struct device *))
1022{ 1057{
1023 struct device *dev = malloc(sizeof(*dev)); 1058 struct device *dev = malloc(sizeof(*dev));
1024 1059
@@ -1026,27 +1061,25 @@ static struct device *new_device(struct device_list *devices,
1026 * easier, but the user expects the devices to be arranged on the bus 1061 * easier, but the user expects the devices to be arranged on the bus
1027 * in command-line order. The first network device on the command line 1062 * in command-line order. The first network device on the command line
1028 * is eth0, the first block device /dev/lgba, etc. */ 1063 * is eth0, the first block device /dev/lgba, etc. */
1029 *devices->lastdev = dev; 1064 *devices.lastdev = dev;
1030 dev->next = NULL; 1065 dev->next = NULL;
1031 devices->lastdev = &dev->next; 1066 devices.lastdev = &dev->next;
1032 1067
1033 /* Now we populate the fields one at a time. */ 1068 /* Now we populate the fields one at a time. */
1034 dev->fd = fd; 1069 dev->fd = fd;
1035 /* If we have an input handler for this file descriptor, then we add it 1070 /* If we have an input handler for this file descriptor, then we add it
1036 * to the device_list's fdset and maxfd. */ 1071 * to the device_list's fdset and maxfd. */
1037 if (handle_input) 1072 if (handle_input)
1038 set_fd(dev->fd, devices); 1073 add_device_fd(dev->fd);
1039 dev->desc = new_dev_desc(devices->descs, type, features, num_pages); 1074 dev->desc = new_dev_desc(type);
1040 dev->mem = (void *)(dev->desc->pfn * getpagesize());
1041 dev->handle_input = handle_input; 1075 dev->handle_input = handle_input;
1042 dev->watch_key = (unsigned long)dev->mem + watch_off; 1076 dev->name = name;
1043 dev->handle_output = handle_output;
1044 return dev; 1077 return dev;
1045} 1078}
1046 1079
1047/* Our first setup routine is the console. It's a fairly simple device, but 1080/* Our first setup routine is the console. It's a fairly simple device, but
1048 * UNIX tty handling makes it uglier than it could be. */ 1081 * UNIX tty handling makes it uglier than it could be. */
1049static void setup_console(struct device_list *devices) 1082static void setup_console(void)
1050{ 1083{
1051 struct device *dev; 1084 struct device *dev;
1052 1085
@@ -1062,127 +1095,38 @@ static void setup_console(struct device_list *devices)
1062 atexit(restore_term); 1095 atexit(restore_term);
1063 } 1096 }
1064 1097
1065 /* We don't currently require any memory for the console, so we ask for 1098 dev = new_device("console", VIRTIO_ID_CONSOLE,
1066 * 0 pages. */ 1099 STDIN_FILENO, handle_console_input);
1067 dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
1068 STDIN_FILENO, handle_console_input,
1069 LGUEST_CONSOLE_DMA_KEY, handle_console_output);
1070 /* We store the console state in dev->priv, and initialize it. */ 1100 /* We store the console state in dev->priv, and initialize it. */
1071 dev->priv = malloc(sizeof(struct console_abort)); 1101 dev->priv = malloc(sizeof(struct console_abort));
1072 ((struct console_abort *)dev->priv)->count = 0; 1102 ((struct console_abort *)dev->priv)->count = 0;
1073 verbose("device %p: console\n",
1074 (void *)(dev->desc->pfn * getpagesize()));
1075}
1076 1103
1077/* Setting up a block file is also fairly straightforward. */ 1104 /* The console needs two virtqueues: the input then the output. When
1078static void setup_block_file(const char *filename, struct device_list *devices) 1105 * they put something the input queue, we make sure we're listening to
1079{ 1106 * stdin. When they put something in the output queue, we write it to
1080 int fd; 1107 * stdout. */
1081 struct device *dev; 1108 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
1082 off64_t *device_len; 1109 add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
1083 struct lguest_block_page *p; 1110
1084 1111 verbose("device %u: console\n", devices.device_num++);
1085 /* We open with O_LARGEFILE because otherwise we get stuck at 2G. We
1086 * open with O_DIRECT because otherwise our benchmarks go much too
1087 * fast. */
1088 fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
1089
1090 /* We want one page, and have no input handler (the block file never
1091 * has anything interesting to say to us). Our timing will be quite
1092 * random, so it should be a reasonable randomness source. */
1093 dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
1094 LGUEST_DEVICE_F_RANDOMNESS,
1095 fd, NULL, 0, handle_block_output);
1096
1097 /* We store the device size in the private area */
1098 device_len = dev->priv = malloc(sizeof(*device_len));
1099 /* This is the safe way of establishing the size of our device: it
1100 * might be a normal file or an actual block device like /dev/hdb. */
1101 *device_len = lseek64(fd, 0, SEEK_END);
1102
1103 /* The device memory is a "struct lguest_block_page". It's zeroed
1104 * already, we just need to put in the device size. Block devices
1105 * think in sectors (ie. 512 byte chunks), so we translate here. */
1106 p = dev->mem;
1107 p->num_sectors = *device_len/512;
1108 verbose("device %p: block %i sectors\n",
1109 (void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
1110} 1112}
1113/*:*/
1111 1114
1112/* 1115/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
1113 * Network Devices. 1116 * --sharenet=<name> option which opens or creates a named pipe. This can be
1117 * used to send packets to another guest in a 1:1 manner.
1114 * 1118 *
1115 * Setting up network devices is quite a pain, because we have three types. 1119 * More sopisticated is to use one of the tools developed for project like UML
1116 * First, we have the inter-Guest network. This is a file which is mapped into 1120 * to do networking.
1117 * the address space of the Guests who are on the network. Because it is a
1118 * shared mapping, the same page underlies all the devices, and they can send
1119 * DMA to each other.
1120 * 1121 *
1121 * Remember from our network driver, the Guest is told what slot in the page it 1122 * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
1122 * is to use. We use exclusive fnctl locks to reserve a slot. If another 1123 * completely generic ("here's my vring, attach to your vring") and would work
1123 * Guest is using a slot, the lock will fail and we try another. Because fnctl 1124 * for any traffic. Of course, namespace and permissions issues need to be
1124 * locks are cleaned up automatically when we die, this cleverly means that our 1125 * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide
1125 * reservation on the slot will vanish if we crash. */ 1126 * multiple inter-guest channels behind one interface, although it would
1126static unsigned int find_slot(int netfd, const char *filename) 1127 * require some manner of hotplugging new virtio channels.
1127{ 1128 *
1128 struct flock fl; 1129 * Finally, we could implement a virtio network switch in the kernel. :*/
1129
1130 fl.l_type = F_WRLCK;
1131 fl.l_whence = SEEK_SET;
1132 fl.l_len = 1;
1133 /* Try a 1 byte lock in each possible position number */
1134 for (fl.l_start = 0;
1135 fl.l_start < getpagesize()/sizeof(struct lguest_net);
1136 fl.l_start++) {
1137 /* If we succeed, return the slot number. */
1138 if (fcntl(netfd, F_SETLK, &fl) == 0)
1139 return fl.l_start;
1140 }
1141 errx(1, "No free slots in network file %s", filename);
1142}
1143
1144/* This function sets up the network file */
1145static void setup_net_file(const char *filename,
1146 struct device_list *devices)
1147{
1148 int netfd;
1149 struct device *dev;
1150
1151 /* We don't use open_or_die() here: for friendliness we create the file
1152 * if it doesn't already exist. */
1153 netfd = open(filename, O_RDWR, 0);
1154 if (netfd < 0) {
1155 if (errno == ENOENT) {
1156 netfd = open(filename, O_RDWR|O_CREAT, 0600);
1157 if (netfd >= 0) {
1158 /* If we succeeded, initialize the file with a
1159 * blank page. */
1160 char page[getpagesize()];
1161 memset(page, 0, sizeof(page));
1162 write(netfd, page, sizeof(page));
1163 }
1164 }
1165 if (netfd < 0)
1166 err(1, "cannot open net file '%s'", filename);
1167 }
1168
1169 /* We need 1 page, and the features indicate the slot to use and that
1170 * no checksum is needed. We never touch this device again; it's
1171 * between the Guests on the network, so we don't register input or
1172 * output handlers. */
1173 dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
1174 find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
1175 -1, NULL, 0, NULL);
1176
1177 /* Map the shared file. */
1178 if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
1179 MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
1180 err(1, "could not mmap '%s'", filename);
1181 verbose("device %p: shared net %s, peer %i\n",
1182 (void *)(dev->desc->pfn * getpagesize()), filename,
1183 dev->desc->features & ~LGUEST_NET_F_NOCSUM);
1184}
1185/*:*/
1186 1130
1187static u32 str2ip(const char *ipaddr) 1131static u32 str2ip(const char *ipaddr)
1188{ 1132{
@@ -1217,7 +1161,7 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name)
1217 1161
1218/* This sets up the Host end of the network device with an IP address, brings 1162/* This sets up the Host end of the network device with an IP address, brings
1219 * it up so packets will flow, the copies the MAC address into the hwaddr 1163 * it up so packets will flow, the copies the MAC address into the hwaddr
1220 * pointer (in practice, the Host's slot in the network device's memory). */ 1164 * pointer. */
1221static void configure_device(int fd, const char *devname, u32 ipaddr, 1165static void configure_device(int fd, const char *devname, u32 ipaddr,
1222 unsigned char hwaddr[6]) 1166 unsigned char hwaddr[6])
1223{ 1167{
@@ -1243,18 +1187,18 @@ static void configure_device(int fd, const char *devname, u32 ipaddr,
1243 memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); 1187 memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
1244} 1188}
1245 1189
1246/*L:195 The other kind of network is a Host<->Guest network. This can either 1190/*L:195 Our network is a Host<->Guest network. This can either use bridging or
1247 * use briding or routing, but the principle is the same: it uses the "tun" 1191 * routing, but the principle is the same: it uses the "tun" device to inject
1248 * device to inject packets into the Host as if they came in from a normal 1192 * packets into the Host as if they came in from a normal network card. We
1249 * network card. We just shunt packets between the Guest and the tun 1193 * just shunt packets between the Guest and the tun device. */
1250 * device. */ 1194static void setup_tun_net(const char *arg)
1251static void setup_tun_net(const char *arg, struct device_list *devices)
1252{ 1195{
1253 struct device *dev; 1196 struct device *dev;
1254 struct ifreq ifr; 1197 struct ifreq ifr;
1255 int netfd, ipfd; 1198 int netfd, ipfd;
1256 u32 ip; 1199 u32 ip;
1257 const char *br_name = NULL; 1200 const char *br_name = NULL;
1201 u8 hwaddr[6];
1258 1202
1259 /* We open the /dev/net/tun device and tell it we want a tap device. A 1203 /* We open the /dev/net/tun device and tell it we want a tap device. A
1260 * tap device is like a tun device, only somehow different. To tell 1204 * tap device is like a tun device, only somehow different. To tell
@@ -1270,21 +1214,13 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
1270 * device: trust us! */ 1214 * device: trust us! */
1271 ioctl(netfd, TUNSETNOCSUM, 1); 1215 ioctl(netfd, TUNSETNOCSUM, 1);
1272 1216
1273 /* We create the net device with 1 page, using the features field of 1217 /* First we create a new network device. */
1274 * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and 1218 dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
1275 * that the device has fairly random timing. We do *not* specify
1276 * LGUEST_NET_F_NOCSUM: these packets can reach the real world.
1277 *
1278 * We will put our MAC address is slot 0 for the Guest to see, so
1279 * it will send packets to us using the key "peer_offset(0)": */
1280 dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
1281 NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
1282 handle_tun_input, peer_offset(0), handle_tun_output);
1283 1219
1284 /* We keep a flag which says whether we've seen packets come out from 1220 /* Network devices need a receive and a send queue, just like
1285 * this network device. */ 1221 * console. */
1286 dev->priv = malloc(sizeof(bool)); 1222 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
1287 *(bool *)dev->priv = false; 1223 add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
1288 1224
1289 /* We need a socket to perform the magic network ioctls to bring up the 1225 /* We need a socket to perform the magic network ioctls to bring up the
1290 * tap interface, connect to the bridge etc. Any socket will do! */ 1226 * tap interface, connect to the bridge etc. Any socket will do! */
@@ -1300,44 +1236,251 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
1300 } else /* It is an IP address to set up the device with */ 1236 } else /* It is an IP address to set up the device with */
1301 ip = str2ip(arg); 1237 ip = str2ip(arg);
1302 1238
1303 /* We are peer 0, ie. first slot, so we hand dev->mem to this routine 1239 /* Set up the tun device, and get the mac address for the interface. */
1304 * to write the MAC address at the start of the device memory. */ 1240 configure_device(ipfd, ifr.ifr_name, ip, hwaddr);
1305 configure_device(ipfd, ifr.ifr_name, ip, dev->mem);
1306 1241
1307 /* Set "promisc" bit: we want every single packet if we're going to 1242 /* Tell Guest what MAC address to use. */
1308 * bridge to other machines (and otherwise it doesn't matter). */ 1243 add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr);
1309 *((u8 *)dev->mem) |= 0x1;
1310 1244
1245 /* We don't seed the socket any more; setup is done. */
1311 close(ipfd); 1246 close(ipfd);
1312 1247
1313 verbose("device %p: tun net %u.%u.%u.%u\n", 1248 verbose("device %u: tun net %u.%u.%u.%u\n",
1314 (void *)(dev->desc->pfn * getpagesize()), 1249 devices.device_num++,
1315 (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip); 1250 (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip);
1316 if (br_name) 1251 if (br_name)
1317 verbose("attached to bridge: %s\n", br_name); 1252 verbose("attached to bridge: %s\n", br_name);
1318} 1253}
1254
1255
1256/*
1257 * Block device.
1258 *
1259 * Serving a block device is really easy: the Guest asks for a block number and
1260 * we read or write that position in the file.
1261 *
1262 * Unfortunately, this is amazingly slow: the Guest waits until the read is
1263 * finished before running anything else, even if it could be doing useful
1264 * work. We could use async I/O, except it's reputed to suck so hard that
1265 * characters actually go missing from your code when you try to use it.
1266 *
1267 * So we farm the I/O out to a thread, and communicate with it via a pipe. */
1268
1269/* This hangs off device->priv, with the data. */
1270struct vblk_info
1271{
1272 /* The size of the file. */
1273 off64_t len;
1274
1275 /* The file descriptor for the file. */
1276 int fd;
1277
1278 /* IO thread listens on this file descriptor [0]. */
1279 int workpipe[2];
1280
1281 /* IO thread writes to this file descriptor to mark it done, then
1282 * Launcher triggers interrupt to Guest. */
1283 int done_fd;
1284};
1285
1286/* This is the core of the I/O thread. It returns true if it did something. */
1287static bool service_io(struct device *dev)
1288{
1289 struct vblk_info *vblk = dev->priv;
1290 unsigned int head, out_num, in_num, wlen;
1291 int ret;
1292 struct virtio_blk_inhdr *in;
1293 struct virtio_blk_outhdr *out;
1294 struct iovec iov[dev->vq->vring.num];
1295 off64_t off;
1296
1297 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
1298 if (head == dev->vq->vring.num)
1299 return false;
1300
1301 if (out_num == 0 || in_num == 0)
1302 errx(1, "Bad virtblk cmd %u out=%u in=%u",
1303 head, out_num, in_num);
1304
1305 out = convert(&iov[0], struct virtio_blk_outhdr);
1306 in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
1307 off = out->sector * 512;
1308
1309 /* This is how we implement barriers. Pretty poor, no? */
1310 if (out->type & VIRTIO_BLK_T_BARRIER)
1311 fdatasync(vblk->fd);
1312
1313 if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
1314 fprintf(stderr, "Scsi commands unsupported\n");
1315 in->status = VIRTIO_BLK_S_UNSUPP;
1316 wlen = sizeof(in);
1317 } else if (out->type & VIRTIO_BLK_T_OUT) {
1318 /* Write */
1319
1320 /* Move to the right location in the block file. This can fail
1321 * if they try to write past end. */
1322 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1323 err(1, "Bad seek to sector %llu", out->sector);
1324
1325 ret = writev(vblk->fd, iov+1, out_num-1);
1326 verbose("WRITE to sector %llu: %i\n", out->sector, ret);
1327
1328 /* Grr... Now we know how long the descriptor they sent was, we
1329 * make sure they didn't try to write over the end of the block
1330 * file (possibly extending it). */
1331 if (ret > 0 && off + ret > vblk->len) {
1332 /* Trim it back to the correct length */
1333 ftruncate64(vblk->fd, vblk->len);
1334 /* Die, bad Guest, die. */
1335 errx(1, "Write past end %llu+%u", off, ret);
1336 }
1337 wlen = sizeof(in);
1338 in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
1339 } else {
1340 /* Read */
1341
1342 /* Move to the right location in the block file. This can fail
1343 * if they try to read past end. */
1344 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1345 err(1, "Bad seek to sector %llu", out->sector);
1346
1347 ret = readv(vblk->fd, iov+1, in_num-1);
1348 verbose("READ from sector %llu: %i\n", out->sector, ret);
1349 if (ret >= 0) {
1350 wlen = sizeof(in) + ret;
1351 in->status = VIRTIO_BLK_S_OK;
1352 } else {
1353 wlen = sizeof(in);
1354 in->status = VIRTIO_BLK_S_IOERR;
1355 }
1356 }
1357
1358 /* We can't trigger an IRQ, because we're not the Launcher. It does
1359 * that when we tell it we're done. */
1360 add_used(dev->vq, head, wlen);
1361 return true;
1362}
1363
1364/* This is the thread which actually services the I/O. */
1365static int io_thread(void *_dev)
1366{
1367 struct device *dev = _dev;
1368 struct vblk_info *vblk = dev->priv;
1369 char c;
1370
1371 /* Close other side of workpipe so we get 0 read when main dies. */
1372 close(vblk->workpipe[1]);
1373 /* Close the other side of the done_fd pipe. */
1374 close(dev->fd);
1375
1376 /* When this read fails, it means Launcher died, so we follow. */
1377 while (read(vblk->workpipe[0], &c, 1) == 1) {
1378 /* We acknowledge each request immediately, to reduce latency,
1379 * rather than waiting until we've done them all. I haven't
1380 * measured to see if it makes any difference. */
1381 while (service_io(dev))
1382 write(vblk->done_fd, &c, 1);
1383 }
1384 return 0;
1385}
1386
1387/* When the thread says some I/O is done, we interrupt the Guest. */
1388static bool handle_io_finish(int fd, struct device *dev)
1389{
1390 char c;
1391
1392 /* If child died, presumably it printed message. */
1393 if (read(dev->fd, &c, 1) != 1)
1394 exit(1);
1395
1396 /* It did some work, so trigger the irq. */
1397 trigger_irq(fd, dev->vq);
1398 return true;
1399}
1400
1401/* When the Guest submits some I/O, we wake the I/O thread. */
1402static void handle_virtblk_output(int fd, struct virtqueue *vq)
1403{
1404 struct vblk_info *vblk = vq->dev->priv;
1405 char c = 0;
1406
1407 /* Wake up I/O thread and tell it to go to work! */
1408 if (write(vblk->workpipe[1], &c, 1) != 1)
1409 /* Presumably it indicated why it died. */
1410 exit(1);
1411}
1412
1413/* This creates a virtual block device. */
1414static void setup_block_file(const char *filename)
1415{
1416 int p[2];
1417 struct device *dev;
1418 struct vblk_info *vblk;
1419 void *stack;
1420 u64 cap;
1421 unsigned int val;
1422
1423 /* This is the pipe the I/O thread will use to tell us I/O is done. */
1424 pipe(p);
1425
1426 /* The device responds to return from I/O thread. */
1427 dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);
1428
1429 /* The device has a virtqueue. */
1430 add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);
1431
1432 /* Allocate the room for our own bookkeeping */
1433 vblk = dev->priv = malloc(sizeof(*vblk));
1434
1435 /* First we open the file and store the length. */
1436 vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
1437 vblk->len = lseek64(vblk->fd, 0, SEEK_END);
1438
1439 /* Tell Guest how many sectors this device has. */
1440 cap = cpu_to_le64(vblk->len / 512);
1441 add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap);
1442
1443 /* Tell Guest not to put in too many descriptors at once: two are used
1444 * for the in and out elements. */
1445 val = cpu_to_le32(VIRTQUEUE_NUM - 2);
1446 add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val);
1447
1448 /* The I/O thread writes to this end of the pipe when done. */
1449 vblk->done_fd = p[1];
1450
1451 /* This is how we tell the I/O thread about more work. */
1452 pipe(vblk->workpipe);
1453
1454 /* Create stack for thread and run it */
1455 stack = malloc(32768);
1456 if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1)
1457 err(1, "Creating clone");
1458
1459 /* We don't need to keep the I/O thread's end of the pipes open. */
1460 close(vblk->done_fd);
1461 close(vblk->workpipe[0]);
1462
1463 verbose("device %u: virtblock %llu sectors\n",
1464 devices.device_num, cap);
1465}
1319/* That's the end of device setup. */ 1466/* That's the end of device setup. */
1320 1467
1321/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves 1468/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
1322 * its input and output, and finally, lays it to rest. */ 1469 * its input and output, and finally, lays it to rest. */
1323static void __attribute__((noreturn)) 1470static void __attribute__((noreturn)) run_guest(int lguest_fd)
1324run_guest(int lguest_fd, struct device_list *device_list)
1325{ 1471{
1326 for (;;) { 1472 for (;;) {
1327 u32 args[] = { LHREQ_BREAK, 0 }; 1473 unsigned long args[] = { LHREQ_BREAK, 0 };
1328 unsigned long arr[2]; 1474 unsigned long notify_addr;
1329 int readval; 1475 int readval;
1330 1476
1331 /* We read from the /dev/lguest device to run the Guest. */ 1477 /* We read from the /dev/lguest device to run the Guest. */
1332 readval = read(lguest_fd, arr, sizeof(arr)); 1478 readval = read(lguest_fd, &notify_addr, sizeof(notify_addr));
1333
1334 /* The read can only really return sizeof(arr) (the Guest did a
1335 * SEND_DMA to us), or an error. */
1336 1479
1337 /* For a successful read, arr[0] is the address of the "struct 1480 /* One unsigned long means the Guest did HCALL_NOTIFY */
1338 * lguest_dma", and arr[1] is the key the Guest sent to. */ 1481 if (readval == sizeof(notify_addr)) {
1339 if (readval == sizeof(arr)) { 1482 verbose("Notify on address %#lx\n", notify_addr);
1340 handle_output(lguest_fd, arr[0], arr[1], device_list); 1483 handle_output(lguest_fd, notify_addr);
1341 continue; 1484 continue;
1342 /* ENOENT means the Guest died. Reading tells us why. */ 1485 /* ENOENT means the Guest died. Reading tells us why. */
1343 } else if (errno == ENOENT) { 1486 } else if (errno == ENOENT) {
@@ -1351,7 +1494,7 @@ run_guest(int lguest_fd, struct device_list *device_list)
1351 1494
1352 /* Service input, then unset the BREAK which releases 1495 /* Service input, then unset the BREAK which releases
1353 * the Waker. */ 1496 * the Waker. */
1354 handle_input(lguest_fd, device_list); 1497 handle_input(lguest_fd);
1355 if (write(lguest_fd, args, sizeof(args)) < 0) 1498 if (write(lguest_fd, args, sizeof(args)) < 0)
1356 err(1, "Resetting break"); 1499 err(1, "Resetting break");
1357 } 1500 }
@@ -1365,7 +1508,6 @@ run_guest(int lguest_fd, struct device_list *device_list)
1365 1508
1366static struct option opts[] = { 1509static struct option opts[] = {
1367 { "verbose", 0, NULL, 'v' }, 1510 { "verbose", 0, NULL, 'v' },
1368 { "sharenet", 1, NULL, 's' },
1369 { "tunnet", 1, NULL, 't' }, 1511 { "tunnet", 1, NULL, 't' },
1370 { "block", 1, NULL, 'b' }, 1512 { "block", 1, NULL, 'b' },
1371 { "initrd", 1, NULL, 'i' }, 1513 { "initrd", 1, NULL, 'i' },
@@ -1374,37 +1516,21 @@ static struct option opts[] = {
1374static void usage(void) 1516static void usage(void)
1375{ 1517{
1376 errx(1, "Usage: lguest [--verbose] " 1518 errx(1, "Usage: lguest [--verbose] "
1377 "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n" 1519 "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
1378 "|--block=<filename>|--initrd=<filename>]...\n" 1520 "|--block=<filename>|--initrd=<filename>]...\n"
1379 "<mem-in-mb> vmlinux [args...]"); 1521 "<mem-in-mb> vmlinux [args...]");
1380} 1522}
1381 1523
1382/*L:100 The Launcher code itself takes us out into userspace, that scary place 1524/*L:105 The main routine is where the real work begins: */
1383 * where pointers run wild and free! Unfortunately, like most userspace
1384 * programs, it's quite boring (which is why everyone like to hack on the
1385 * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
1386 * will get you through this section. Or, maybe not.
1387 *
1388 * The Launcher binary sits up high, usually starting at address 0xB8000000.
1389 * Everything below this is the "physical" memory for the Guest. For example,
1390 * if the Guest were to write a "1" at physical address 0, we would see a "1"
1391 * in the Launcher at "(int *)0". Guest physical == Launcher virtual.
1392 *
1393 * This can be tough to get your head around, but usually it just means that we
1394 * don't need to do any conversion when the Guest gives us it's "physical"
1395 * addresses.
1396 */
1397int main(int argc, char *argv[]) 1525int main(int argc, char *argv[])
1398{ 1526{
1399 /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size 1527 /* Memory, top-level pagetable, code startpoint and size of the
1400 * of the (optional) initrd. */ 1528 * (optional) initrd. */
1401 unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0; 1529 unsigned long mem = 0, pgdir, start, initrd_size = 0;
1402 /* A temporary and the /dev/lguest file descriptor. */ 1530 /* A temporary and the /dev/lguest file descriptor. */
1403 int i, c, lguest_fd; 1531 int i, c, lguest_fd;
1404 /* The list of Guest devices, based on command line arguments. */ 1532 /* The boot information for the Guest. */
1405 struct device_list device_list; 1533 struct boot_params *boot;
1406 /* The boot information for the Guest: at guest-physical address 0. */
1407 void *boot = (void *)0;
1408 /* If they specify an initrd file to load. */ 1534 /* If they specify an initrd file to load. */
1409 const char *initrd_name = NULL; 1535 const char *initrd_name = NULL;
1410 1536
@@ -1412,11 +1538,12 @@ int main(int argc, char *argv[])
1412 * device receive input from a file descriptor, we keep an fdset 1538 * device receive input from a file descriptor, we keep an fdset
1413 * (infds) and the maximum fd number (max_infd) with the head of the 1539 * (infds) and the maximum fd number (max_infd) with the head of the
1414 * list. We also keep a pointer to the last device, for easy appending 1540 * list. We also keep a pointer to the last device, for easy appending
1415 * to the list. */ 1541 * to the list. Finally, we keep the next interrupt number to hand out
1416 device_list.max_infd = -1; 1542 * (1: remember that 0 is used by the timer). */
1417 device_list.dev = NULL; 1543 FD_ZERO(&devices.infds);
1418 device_list.lastdev = &device_list.dev; 1544 devices.max_infd = -1;
1419 FD_ZERO(&device_list.infds); 1545 devices.lastdev = &devices.dev;
1546 devices.next_irq = 1;
1420 1547
1421 /* We need to know how much memory so we can set up the device 1548 /* We need to know how much memory so we can set up the device
1422 * descriptor and memory pages for the devices as we parse the command 1549 * descriptor and memory pages for the devices as we parse the command
@@ -1424,9 +1551,16 @@ int main(int argc, char *argv[])
1424 * of memory now. */ 1551 * of memory now. */
1425 for (i = 1; i < argc; i++) { 1552 for (i = 1; i < argc; i++) {
1426 if (argv[i][0] != '-') { 1553 if (argv[i][0] != '-') {
1427 mem = top = atoi(argv[i]) * 1024 * 1024; 1554 mem = atoi(argv[i]) * 1024 * 1024;
1428 device_list.descs = map_zeroed_pages(top, 1); 1555 /* We start by mapping anonymous pages over all of
1429 top += getpagesize(); 1556 * guest-physical memory range. This fills it with 0,
1557 * and ensures that the Guest won't be killed when it
1558 * tries to access it. */
1559 guest_base = map_zeroed_pages(mem / getpagesize()
1560 + DEVICE_PAGES);
1561 guest_limit = mem;
1562 guest_max = mem + DEVICE_PAGES*getpagesize();
1563 devices.descpage = get_pages(1);
1430 break; 1564 break;
1431 } 1565 }
1432 } 1566 }
@@ -1437,14 +1571,11 @@ int main(int argc, char *argv[])
1437 case 'v': 1571 case 'v':
1438 verbose = true; 1572 verbose = true;
1439 break; 1573 break;
1440 case 's':
1441 setup_net_file(optarg, &device_list);
1442 break;
1443 case 't': 1574 case 't':
1444 setup_tun_net(optarg, &device_list); 1575 setup_tun_net(optarg);
1445 break; 1576 break;
1446 case 'b': 1577 case 'b':
1447 setup_block_file(optarg, &device_list); 1578 setup_block_file(optarg);
1448 break; 1579 break;
1449 case 'i': 1580 case 'i':
1450 initrd_name = optarg; 1581 initrd_name = optarg;
@@ -1459,56 +1590,60 @@ int main(int argc, char *argv[])
1459 if (optind + 2 > argc) 1590 if (optind + 2 > argc)
1460 usage(); 1591 usage();
1461 1592
1462 /* We always have a console device */ 1593 verbose("Guest base is at %p\n", guest_base);
1463 setup_console(&device_list);
1464 1594
1465 /* We start by mapping anonymous pages over all of guest-physical 1595 /* We always have a console device */
1466 * memory range. This fills it with 0, and ensures that the Guest 1596 setup_console();
1467 * won't be killed when it tries to access it. */
1468 map_zeroed_pages(0, mem / getpagesize());
1469 1597
1470 /* Now we load the kernel */ 1598 /* Now we load the kernel */
1471 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), 1599 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
1472 &page_offset); 1600
1601 /* Boot information is stashed at physical address 0 */
1602 boot = from_guest_phys(0);
1473 1603
1474 /* Map the initrd image if requested (at top of physical memory) */ 1604 /* Map the initrd image if requested (at top of physical memory) */
1475 if (initrd_name) { 1605 if (initrd_name) {
1476 initrd_size = load_initrd(initrd_name, mem); 1606 initrd_size = load_initrd(initrd_name, mem);
1477 /* These are the locations in the Linux boot header where the 1607 /* These are the locations in the Linux boot header where the
1478 * start and size of the initrd are expected to be found. */ 1608 * start and size of the initrd are expected to be found. */
1479 *(unsigned long *)(boot+0x218) = mem - initrd_size; 1609 boot->hdr.ramdisk_image = mem - initrd_size;
1480 *(unsigned long *)(boot+0x21c) = initrd_size; 1610 boot->hdr.ramdisk_size = initrd_size;
1481 /* The bootloader type 0xFF means "unknown"; that's OK. */ 1611 /* The bootloader type 0xFF means "unknown"; that's OK. */
1482 *(unsigned char *)(boot+0x210) = 0xFF; 1612 boot->hdr.type_of_loader = 0xFF;
1483 } 1613 }
1484 1614
1485 /* Set up the initial linear pagetables, starting below the initrd. */ 1615 /* Set up the initial linear pagetables, starting below the initrd. */
1486 pgdir = setup_pagetables(mem, initrd_size, page_offset); 1616 pgdir = setup_pagetables(mem, initrd_size);
1487 1617
1488 /* The Linux boot header contains an "E820" memory map: ours is a 1618 /* The Linux boot header contains an "E820" memory map: ours is a
1489 * simple, single region. */ 1619 * simple, single region. */
1490 *(char*)(boot+E820NR) = 1; 1620 boot->e820_entries = 1;
1491 *((struct e820entry *)(boot+E820MAP)) 1621 boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
1492 = ((struct e820entry) { 0, mem, E820_RAM });
1493 /* The boot header contains a command line pointer: we put the command 1622 /* The boot header contains a command line pointer: we put the command
1494 * line after the boot header (at address 4096) */ 1623 * line after the boot header. */
1495 *(void **)(boot + 0x228) = boot + 4096; 1624 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
1496 concat(boot + 4096, argv+optind+2); 1625 concat((char *)(boot + 1), argv+optind+2);
1626
1627 /* Boot protocol version: 2.07 supports the fields for lguest. */
1628 boot->hdr.version = 0x207;
1629
1630 /* The hardware_subarch value of "1" tells the Guest it's an lguest. */
1631 boot->hdr.hardware_subarch = 1;
1497 1632
1498 /* The guest type value of "1" tells the Guest it's under lguest. */ 1633 /* Tell the entry path not to try to reload segment registers. */
1499 *(int *)(boot + 0x23c) = 1; 1634 boot->hdr.loadflags |= KEEP_SEGMENTS;
1500 1635
1501 /* We tell the kernel to initialize the Guest: this returns the open 1636 /* We tell the kernel to initialize the Guest: this returns the open
1502 * /dev/lguest file descriptor. */ 1637 * /dev/lguest file descriptor. */
1503 lguest_fd = tell_kernel(pgdir, start, page_offset); 1638 lguest_fd = tell_kernel(pgdir, start);
1504 1639
1505 /* We fork off a child process, which wakes the Launcher whenever one 1640 /* We fork off a child process, which wakes the Launcher whenever one
1506 * of the input file descriptors needs attention. Otherwise we would 1641 * of the input file descriptors needs attention. Otherwise we would
1507 * run the Guest until it tries to output something. */ 1642 * run the Guest until it tries to output something. */
1508 waker_fd = setup_waker(lguest_fd, &device_list); 1643 waker_fd = setup_waker(lguest_fd);
1509 1644
1510 /* Finally, run the Guest. This doesn't return. */ 1645 /* Finally, run the Guest. This doesn't return. */
1511 run_guest(lguest_fd, &device_list); 1646 run_guest(lguest_fd);
1512} 1647}
1513/*:*/ 1648/*:*/
1514 1649
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 821617bd6c04..7885ab2d5f53 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -6,7 +6,7 @@ Lguest is designed to be a minimal hypervisor for the Linux kernel, for
6Linux developers and users to experiment with virtualization with the 6Linux developers and users to experiment with virtualization with the
7minimum of complexity. Nonetheless, it should have sufficient 7minimum of complexity. Nonetheless, it should have sufficient
8features to make it useful for specific tasks, and, of course, you are 8features to make it useful for specific tasks, and, of course, you are
9encouraged to fork and enhance it. 9encouraged to fork and enhance it (see drivers/lguest/README).
10 10
11Features: 11Features:
12 12
@@ -23,19 +23,30 @@ Developer features:
23 23
24Running Lguest: 24Running Lguest:
25 25
26- Lguest runs the same kernel as guest and host. You can configure 26- The easiest way to run lguest is to use the same kernel as guest and host.
27 them differently, but usually it's easiest not to. 27 You can configure them differently, but usually it's easiest not to.
28 28
29 You will need to configure your kernel with the following options: 29 You will need to configure your kernel with the following options:
30 30
31 CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1] 31 "General setup":
32 CONFIG_TUN=y/m ("Universal TUN/TAP device driver support") 32 "Prompt for development and/or incomplete code/drivers" = Y
33 CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers") 33 (CONFIG_EXPERIMENTAL=y)
34 CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)") 34
35 CONFIG_LGUEST=y/m ("Linux hypervisor example code") 35 "Processor type and features":
36 36 "Paravirtualized guest support" = Y
37 and I recommend: 37 "Lguest guest support" = Y
38 CONFIG_HZ=100 ("Timer frequency")[2] 38 "High Memory Support" = off/4GB
39 "Alignment value to which kernel should be aligned" = 0x100000
40 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
41 CONFIG_PHYSICAL_ALIGN=0x100000)
42
43 "Device Drivers":
44 "Network device support"
45 "Universal TUN/TAP device driver support" = M/Y
46 (CONFIG_TUN=m)
47 "Virtualization"
48 "Linux hypervisor example code" = M/Y
49 (CONFIG_LGUEST=m)
39 50
40- A tool called "lguest" is available in this directory: type "make" 51- A tool called "lguest" is available in this directory: type "make"
41 to build it. If you didn't build your kernel in-tree, use "make 52 to build it. If you didn't build your kernel in-tree, use "make
@@ -51,14 +62,17 @@ Running Lguest:
51 dd if=/dev/zero of=rootfile bs=1M count=2048 62 dd if=/dev/zero of=rootfile bs=1M count=2048
52 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d 63 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
53 64
65 Make sure that you install a getty on /dev/hvc0 if you want to log in on the
66 console!
67
54- "modprobe lg" if you built it as a module. 68- "modprobe lg" if you built it as a module.
55 69
56- Run an lguest as root: 70- Run an lguest as root:
57 71
58 Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba 72 Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda
59 73
60 Explanation: 74 Explanation:
61 64m: the amount of memory to use. 75 64: the amount of memory to use, in MB.
62 76
63 vmlinux: the kernel image found in the top of your build directory. You 77 vmlinux: the kernel image found in the top of your build directory. You
64 can also use a standard bzImage. 78 can also use a standard bzImage.
@@ -66,10 +80,10 @@ Running Lguest:
66 --tunnet=192.168.19.1: configures a "tap" device for networking with this 80 --tunnet=192.168.19.1: configures a "tap" device for networking with this
67 IP address. 81 IP address.
68 82
69 --block=rootfile: a file or block device which becomes /dev/lgba 83 --block=rootfile: a file or block device which becomes /dev/vda
70 inside the guest. 84 inside the guest.
71 85
72 root=/dev/lgba: this (and anything else on the command line) are 86 root=/dev/vda: this (and anything else on the command line) are
73 kernel boot parameters. 87 kernel boot parameters.
74 88
75- Configuring networking. I usually have the host masquerade, using 89- Configuring networking. I usually have the host masquerade, using
@@ -99,31 +113,7 @@ Running Lguest:
99 "--sharenet=<filename>": any two guests using the same file are on 113 "--sharenet=<filename>": any two guests using the same file are on
100 the same network. This file is created if it does not exist. 114 the same network. This file is created if it does not exist.
101 115
102Lguest I/O model: 116There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
103
104Lguest uses a simplified DMA model plus shared memory for I/O. Guests
105can communicate with each other if they share underlying memory
106(usually by the lguest program mmaping the same file), but they can
107use any non-shared memory to communicate with the lguest process.
108
109Guests can register DMA buffers at any key (must be a valid physical
110address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq)
111hypercall. "dmabufs" is the physical address of an array of "num"
112"struct lguest_dma": each contains a used_len, and an array of
113physical addresses and lengths. When a transfer occurs, the
114"used_len" field of one of the buffers which has used_len 0 will be
115set to the length transferred and the irq will fire.
116 117
117Using an irq value of 0 unbinds the dma buffers. 118Good luck!
118
119To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used,
120and the bytes used is written to the used_len field. This can be 0 if
121no one else has bound a DMA buffer to that key or some other error.
122DMA buffers bound by the same guest are ignored.
123
124Cheers!
125Rusty Russell rusty@rustcorp.com.au. 119Rusty Russell rusty@rustcorp.com.au.
126
127[1] These are on various places on the TODO list, waiting for you to
128 get annoyed enough at the limitation to fix it.
129[2] Lguest is not yet tickless when idle. See [1].