aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/lguest/Makefile26
-rw-r--r--Documentation/lguest/lguest.c1629
-rw-r--r--Documentation/lguest/lguest.txt72
-rw-r--r--arch/i386/Kconfig32
-rw-r--r--arch/i386/Makefile3
-rw-r--r--arch/x86/kernel/asm-offsets_32.c1
-rw-r--r--arch/x86/lguest/Kconfig14
-rw-r--r--arch/x86/lguest/Makefile1
-rw-r--r--arch/x86/lguest/boot.c (renamed from drivers/lguest/lguest.c)102
-rw-r--r--arch/x86/lguest/i386_head.S (renamed from drivers/lguest/lguest_asm.S)46
-rw-r--r--arch/x86/xen/Kconfig5
-rw-r--r--drivers/Kconfig2
-rw-r--r--drivers/Makefile1
-rw-r--r--drivers/block/Kconfig6
-rw-r--r--drivers/block/Makefile2
-rw-r--r--drivers/block/lguest_blk.c421
-rw-r--r--drivers/block/virtio_blk.c308
-rw-r--r--drivers/char/Kconfig4
-rw-r--r--drivers/char/Makefile2
-rw-r--r--drivers/char/hvc_lguest.c177
-rw-r--r--drivers/char/virtio_console.c225
-rw-r--r--drivers/kvm/Kconfig4
-rw-r--r--drivers/lguest/Kconfig13
-rw-r--r--drivers/lguest/Makefile10
-rw-r--r--drivers/lguest/core.c568
-rw-r--r--drivers/lguest/hypercalls.c177
-rw-r--r--drivers/lguest/interrupts_and_traps.c125
-rw-r--r--drivers/lguest/io.c626
-rw-r--r--drivers/lguest/lg.h189
-rw-r--r--drivers/lguest/lguest_bus.c218
-rw-r--r--drivers/lguest/lguest_device.c373
-rw-r--r--drivers/lguest/lguest_user.c138
-rw-r--r--drivers/lguest/page_tables.c250
-rw-r--r--drivers/lguest/segments.c28
-rw-r--r--drivers/lguest/x86/core.c577
-rw-r--r--drivers/lguest/x86/switcher_32.S (renamed from drivers/lguest/switcher.S)7
-rw-r--r--drivers/net/Kconfig6
-rw-r--r--drivers/net/Makefile2
-rw-r--r--drivers/net/lguest_net.c555
-rw-r--r--drivers/net/virtio_net.c435
-rw-r--r--drivers/virtio/Kconfig8
-rw-r--r--drivers/virtio/Makefile2
-rw-r--r--drivers/virtio/config.c13
-rw-r--r--drivers/virtio/virtio.c189
-rw-r--r--drivers/virtio/virtio_ring.c313
-rw-r--r--include/asm-x86/Kbuild3
-rw-r--r--include/asm-x86/bootparam.h108
-rw-r--r--include/asm-x86/e820.h28
-rw-r--r--include/asm-x86/e820_32.h21
-rw-r--r--include/asm-x86/e820_64.h20
-rw-r--r--include/asm-x86/ist.h12
-rw-r--r--include/asm-x86/lguest.h86
-rw-r--r--include/asm-x86/lguest_hcall.h71
-rw-r--r--include/linux/Kbuild5
-rw-r--r--include/linux/apm_bios.h30
-rw-r--r--include/linux/edd.h137
-rw-r--r--include/linux/lguest.h80
-rw-r--r--include/linux/lguest_bus.h51
-rw-r--r--include/linux/lguest_launcher.h112
-rw-r--r--include/linux/mod_devicetable.h6
-rw-r--r--include/linux/screen_info.h81
-rw-r--r--include/linux/virtio.h110
-rw-r--r--include/linux/virtio_blk.h51
-rw-r--r--include/linux/virtio_config.h111
-rw-r--r--include/linux/virtio_console.h12
-rw-r--r--include/linux/virtio_net.h36
-rw-r--r--include/linux/virtio_ring.h119
-rw-r--r--include/video/Kbuild1
-rw-r--r--include/video/edid.h9
-rw-r--r--scripts/mod/file2alias.c18
70 files changed, 4822 insertions, 4401 deletions
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index c0b7a455639..bac037eb1cd 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -1,28 +1,8 @@
1# This creates the demonstration utility "lguest" which runs a Linux guest. 1# This creates the demonstration utility "lguest" which runs a Linux guest.
2 2CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include
3# For those people that have a separate object dir, look there for .config
4KBUILD_OUTPUT := ../..
5ifdef O
6 ifeq ("$(origin O)", "command line")
7 KBUILD_OUTPUT := $(O)
8 endif
9endif
10# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
11include $(KBUILD_OUTPUT)/.config
12LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
13
14CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
15LDLIBS:=-lz 3LDLIBS:=-lz
16# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and
17# not others (eg. FC7).
18LDFLAGS+=-static
19all: lguest.lds lguest
20 4
21# The linker script on x86 is so complex the only way of creating one 5all: lguest
22# which will link our binary in the right place is to mangle the
23# default one.
24lguest.lds:
25 $(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
26 6
27clean: 7clean:
28 rm -f lguest.lds lguest 8 rm -f lguest
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 103e346c8b6..5bdc37f8184 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1,10 +1,7 @@
1/*P:100 This is the Launcher code, a simple program which lays out the 1/*P:100 This is the Launcher code, a simple program which lays out the
2 * "physical" memory for the new Guest by mapping the kernel image and the 2 * "physical" memory for the new Guest by mapping the kernel image and the
3 * virtual devices, then reads repeatedly from /dev/lguest to run the Guest. 3 * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
4 * 4:*/
5 * The only trick: the Makefile links it at a high address so it will be clear
6 * of the guest memory region. It means that each Guest cannot have more than
7 * about 2.5G of memory on a normally configured Host. :*/
8#define _LARGEFILE64_SOURCE 5#define _LARGEFILE64_SOURCE
9#define _GNU_SOURCE 6#define _GNU_SOURCE
10#include <stdio.h> 7#include <stdio.h>
@@ -15,6 +12,7 @@
15#include <stdlib.h> 12#include <stdlib.h>
16#include <elf.h> 13#include <elf.h>
17#include <sys/mman.h> 14#include <sys/mman.h>
15#include <sys/param.h>
18#include <sys/types.h> 16#include <sys/types.h>
19#include <sys/stat.h> 17#include <sys/stat.h>
20#include <sys/wait.h> 18#include <sys/wait.h>
@@ -34,7 +32,9 @@
34#include <termios.h> 32#include <termios.h>
35#include <getopt.h> 33#include <getopt.h>
36#include <zlib.h> 34#include <zlib.h>
37/*L:110 We can ignore the 28 include files we need for this program, but I do 35#include <assert.h>
36#include <sched.h>
37/*L:110 We can ignore the 30 include files we need for this program, but I do
38 * want to draw attention to the use of kernel-style types. 38 * want to draw attention to the use of kernel-style types.
39 * 39 *
40 * As Linus said, "C is a Spartan language, and so should your naming be." I 40 * As Linus said, "C is a Spartan language, and so should your naming be." I
@@ -45,8 +45,14 @@ typedef unsigned long long u64;
45typedef uint32_t u32; 45typedef uint32_t u32;
46typedef uint16_t u16; 46typedef uint16_t u16;
47typedef uint8_t u8; 47typedef uint8_t u8;
48#include "../../include/linux/lguest_launcher.h" 48#include "linux/lguest_launcher.h"
49#include "../../include/asm-x86/e820_32.h" 49#include "linux/pci_ids.h"
50#include "linux/virtio_config.h"
51#include "linux/virtio_net.h"
52#include "linux/virtio_blk.h"
53#include "linux/virtio_console.h"
54#include "linux/virtio_ring.h"
55#include "asm-x86/bootparam.h"
50/*:*/ 56/*:*/
51 57
52#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ 58#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
@@ -55,6 +61,10 @@ typedef uint8_t u8;
55#ifndef SIOCBRADDIF 61#ifndef SIOCBRADDIF
56#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ 62#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
57#endif 63#endif
64/* We can have up to 256 pages for devices. */
65#define DEVICE_PAGES 256
66/* This fits nicely in a single 4096-byte page. */
67#define VIRTQUEUE_NUM 127
58 68
59/*L:120 verbose is both a global flag and a macro. The C preprocessor allows 69/*L:120 verbose is both a global flag and a macro. The C preprocessor allows
60 * this, and although I wouldn't recommend it, it works quite nicely here. */ 70 * this, and although I wouldn't recommend it, it works quite nicely here. */
@@ -65,8 +75,10 @@ static bool verbose;
65 75
66/* The pipe to send commands to the waker process */ 76/* The pipe to send commands to the waker process */
67static int waker_fd; 77static int waker_fd;
68/* The top of guest physical memory. */ 78/* The pointer to the start of guest memory. */
69static u32 top; 79static void *guest_base;
80/* The maximum guest physical address allowed, and maximum possible. */
81static unsigned long guest_limit, guest_max;
70 82
71/* This is our list of devices. */ 83/* This is our list of devices. */
72struct device_list 84struct device_list
@@ -76,8 +88,17 @@ struct device_list
76 fd_set infds; 88 fd_set infds;
77 int max_infd; 89 int max_infd;
78 90
91 /* Counter to assign interrupt numbers. */
92 unsigned int next_irq;
93
94 /* Counter to print out convenient device numbers. */
95 unsigned int device_num;
96
79 /* The descriptor page for the devices. */ 97 /* The descriptor page for the devices. */
80 struct lguest_device_desc *descs; 98 u8 *descpage;
99
100 /* The tail of the last descriptor. */
101 unsigned int desc_used;
81 102
82 /* A single linked list of devices. */ 103 /* A single linked list of devices. */
83 struct device *dev; 104 struct device *dev;
@@ -85,31 +106,111 @@ struct device_list
85 struct device **lastdev; 106 struct device **lastdev;
86}; 107};
87 108
109/* The list of Guest devices, based on command line arguments. */
110static struct device_list devices;
111
88/* The device structure describes a single device. */ 112/* The device structure describes a single device. */
89struct device 113struct device
90{ 114{
91 /* The linked-list pointer. */ 115 /* The linked-list pointer. */
92 struct device *next; 116 struct device *next;
93 /* The descriptor for this device, as mapped into the Guest. */ 117
118 /* This device's descriptor, as mapped into the Guest. */
94 struct lguest_device_desc *desc; 119 struct lguest_device_desc *desc;
95 /* The memory page(s) of this device, if any. Also mapped in Guest. */ 120
96 void *mem; 121 /* The name of this device, for --verbose. */
122 const char *name;
97 123
98 /* If handle_input is set, it wants to be called when this file 124 /* If handle_input is set, it wants to be called when this file
99 * descriptor is ready. */ 125 * descriptor is ready. */
100 int fd; 126 int fd;
101 bool (*handle_input)(int fd, struct device *me); 127 bool (*handle_input)(int fd, struct device *me);
102 128
103 /* If handle_output is set, it wants to be called when the Guest sends 129 /* Any queues attached to this device */
104 * DMA to this key. */ 130 struct virtqueue *vq;
105 unsigned long watch_key;
106 u32 (*handle_output)(int fd, const struct iovec *iov,
107 unsigned int num, struct device *me);
108 131
109 /* Device-specific data. */ 132 /* Device-specific data. */
110 void *priv; 133 void *priv;
111}; 134};
112 135
136/* The virtqueue structure describes a queue attached to a device. */
137struct virtqueue
138{
139 struct virtqueue *next;
140
141 /* Which device owns me. */
142 struct device *dev;
143
144 /* The configuration for this queue. */
145 struct lguest_vqconfig config;
146
147 /* The actual ring of buffers. */
148 struct vring vring;
149
150 /* Last available index we saw. */
151 u16 last_avail_idx;
152
153 /* The routine to call when the Guest pings us. */
154 void (*handle_output)(int fd, struct virtqueue *me);
155};
156
157/* Since guest is UP and we don't run at the same time, we don't need barriers.
158 * But I include them in the code in case others copy it. */
159#define wmb()
160
161/* Convert an iovec element to the given type.
162 *
163 * This is a fairly ugly trick: we need to know the size of the type and
164 * alignment requirement to check the pointer is kosher. It's also nice to
165 * have the name of the type in case we report failure.
166 *
167 * Typing those three things all the time is cumbersome and error prone, so we
168 * have a macro which sets them all up and passes to the real function. */
169#define convert(iov, type) \
170 ((type *)_convert((iov), sizeof(type), __alignof__(type), #type))
171
172static void *_convert(struct iovec *iov, size_t size, size_t align,
173 const char *name)
174{
175 if (iov->iov_len != size)
176 errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);
177 if ((unsigned long)iov->iov_base % align != 0)
178 errx(1, "Bad alignment %p for %s", iov->iov_base, name);
179 return iov->iov_base;
180}
181
182/* The virtio configuration space is defined to be little-endian. x86 is
183 * little-endian too, but it's nice to be explicit so we have these helpers. */
184#define cpu_to_le16(v16) (v16)
185#define cpu_to_le32(v32) (v32)
186#define cpu_to_le64(v64) (v64)
187#define le16_to_cpu(v16) (v16)
188#define le32_to_cpu(v32) (v32)
189#define le64_to_cpu(v32) (v64)
190
191/*L:100 The Launcher code itself takes us out into userspace, that scary place
192 * where pointers run wild and free! Unfortunately, like most userspace
193 * programs, it's quite boring (which is why everyone likes to hack on the
194 * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
195 * will get you through this section. Or, maybe not.
196 *
197 * The Launcher sets up a big chunk of memory to be the Guest's "physical"
198 * memory and stores it in "guest_base". In other words, Guest physical ==
199 * Launcher virtual with an offset.
200 *
201 * This can be tough to get your head around, but usually it just means that we
202 * use these trivial conversion functions when the Guest gives us its
203 * "physical" addresses: */
204static void *from_guest_phys(unsigned long addr)
205{
206 return guest_base + addr;
207}
208
209static unsigned long to_guest_phys(const void *addr)
210{
211 return (addr - guest_base);
212}
213
113/*L:130 214/*L:130
114 * Loading the Kernel. 215 * Loading the Kernel.
115 * 216 *
@@ -123,43 +224,55 @@ static int open_or_die(const char *name, int flags)
123 return fd; 224 return fd;
124} 225}
125 226
126/* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */ 227/* map_zeroed_pages() takes a number of pages. */
127static void *map_zeroed_pages(unsigned long addr, unsigned int num) 228static void *map_zeroed_pages(unsigned int num)
128{ 229{
129 /* We cache the /dev/zero file-descriptor so we only open it once. */ 230 int fd = open_or_die("/dev/zero", O_RDONLY);
130 static int fd = -1; 231 void *addr;
131
132 if (fd == -1)
133 fd = open_or_die("/dev/zero", O_RDONLY);
134 232
135 /* We use a private mapping (ie. if we write to the page, it will be 233 /* We use a private mapping (ie. if we write to the page, it will be
136 * copied), and obviously we insist that it be mapped where we ask. */ 234 * copied). */
137 if (mmap((void *)addr, getpagesize() * num, 235 addr = mmap(NULL, getpagesize() * num,
138 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0) 236 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
139 != (void *)addr) 237 if (addr == MAP_FAILED)
140 err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr); 238 err(1, "Mmaping %u pages of /dev/zero", num);
141 239
142 /* Returning the address is just a courtesy: can simplify callers. */ 240 return addr;
143 return (void *)addr;
144} 241}
145 242
146/* To find out where to start we look for the magic Guest string, which marks 243/* Get some more pages for a device. */
147 * the code we see in lguest_asm.S. This is a hack which we are currently 244static void *get_pages(unsigned int num)
148 * plotting to replace with the normal Linux entry point. */
149static unsigned long entry_point(void *start, void *end,
150 unsigned long page_offset)
151{ 245{
152 void *p; 246 void *addr = from_guest_phys(guest_limit);
153 247
154 /* The scan gives us the physical starting address. We want the 248 guest_limit += num * getpagesize();
155 * virtual address in this case, and fortunately, we already figured 249 if (guest_limit > guest_max)
156 * out the physical-virtual difference and passed it here in 250 errx(1, "Not enough memory for devices");
157 * "page_offset". */ 251 return addr;
158 for (p = start; p < end; p++) 252}
159 if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
160 return (long)p + strlen("GenuineLguest") + page_offset;
161 253
162 err(1, "Is this image a genuine lguest?"); 254/* This routine is used to load the kernel or initrd. It tries mmap, but if
255 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
256 * it falls back to reading the memory in. */
257static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
258{
259 ssize_t r;
260
261 /* We map writable even though some segments are marked read-only.
262 * The kernel really wants to be writable: it patches its own
263 * instructions.
264 *
265 * MAP_PRIVATE means that the page won't be copied until a write is
266 * done to it. This allows us to share untouched memory between
267 * Guests. */
268 if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
269 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
270 return;
271
272 /* pread does a seek and a read in one shot: saves a few lines. */
273 r = pread(fd, addr, len, offset);
274 if (r != len)
275 err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
163} 276}
164 277
165/* This routine takes an open vmlinux image, which is in ELF, and maps it into 278/* This routine takes an open vmlinux image, which is in ELF, and maps it into
@@ -167,19 +280,14 @@ static unsigned long entry_point(void *start, void *end,
167 * by all modern binaries on Linux including the kernel. 280 * by all modern binaries on Linux including the kernel.
168 * 281 *
169 * The ELF headers give *two* addresses: a physical address, and a virtual 282 * The ELF headers give *two* addresses: a physical address, and a virtual
170 * address. The Guest kernel expects to be placed in memory at the physical 283 * address. We use the physical address; the Guest will map itself to the
171 * address, and the page tables set up so it will correspond to that virtual 284 * virtual address.
172 * address. We return the difference between the virtual and physical
173 * addresses in the "page_offset" pointer.
174 * 285 *
175 * We return the starting address. */ 286 * We return the starting address. */
176static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, 287static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
177 unsigned long *page_offset)
178{ 288{
179 void *addr;
180 Elf32_Phdr phdr[ehdr->e_phnum]; 289 Elf32_Phdr phdr[ehdr->e_phnum];
181 unsigned int i; 290 unsigned int i;
182 unsigned long start = -1UL, end = 0;
183 291
184 /* Sanity checks on the main ELF header: an x86 executable with a 292 /* Sanity checks on the main ELF header: an x86 executable with a
185 * reasonable number of correctly-sized program headers. */ 293 * reasonable number of correctly-sized program headers. */
@@ -199,9 +307,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
199 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) 307 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
200 err(1, "Reading program headers"); 308 err(1, "Reading program headers");
201 309
202 /* We don't know page_offset yet. */
203 *page_offset = 0;
204
205 /* Try all the headers: there are usually only three. A read-only one, 310 /* Try all the headers: there are usually only three. A read-only one,
206 * a read-write one, and a "note" section which isn't loadable. */ 311 * a read-write one, and a "note" section which isn't loadable. */
207 for (i = 0; i < ehdr->e_phnum; i++) { 312 for (i = 0; i < ehdr->e_phnum; i++) {
@@ -212,158 +317,53 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
212 verbose("Section %i: size %i addr %p\n", 317 verbose("Section %i: size %i addr %p\n",
213 i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); 318 i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
214 319
215 /* We expect a simple linear address space: every segment must 320 /* We map this section of the file at its physical address. */
216 * have the same difference between virtual (p_vaddr) and 321 map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
217 * physical (p_paddr) address. */ 322 phdr[i].p_offset, phdr[i].p_filesz);
218 if (!*page_offset)
219 *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
220 else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
221 errx(1, "Page offset of section %i different", i);
222
223 /* We track the first and last address we mapped, so we can
224 * tell entry_point() where to scan. */
225 if (phdr[i].p_paddr < start)
226 start = phdr[i].p_paddr;
227 if (phdr[i].p_paddr + phdr[i].p_filesz > end)
228 end = phdr[i].p_paddr + phdr[i].p_filesz;
229
230 /* We map this section of the file at its physical address. We
231 * map it read & write even if the header says this segment is
232 * read-only. The kernel really wants to be writable: it
233 * patches its own instructions which would normally be
234 * read-only.
235 *
236 * MAP_PRIVATE means that the page won't be copied until a
237 * write is done to it. This allows us to share much of the
238 * kernel memory between Guests. */
239 addr = mmap((void *)phdr[i].p_paddr,
240 phdr[i].p_filesz,
241 PROT_READ|PROT_WRITE|PROT_EXEC,
242 MAP_FIXED|MAP_PRIVATE,
243 elf_fd, phdr[i].p_offset);
244 if (addr != (void *)phdr[i].p_paddr)
245 err(1, "Mmaping vmlinux seg %i gave %p not %p",
246 i, addr, (void *)phdr[i].p_paddr);
247 } 323 }
248 324
249 return entry_point((void *)start, (void *)end, *page_offset); 325 /* The entry point is given in the ELF header. */
326 return ehdr->e_entry;
250} 327}
251 328
252/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated. 329/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
253 * 330 * supposed to jump into it and it will unpack itself. We used to have to
254 * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects 331 * perform some hairy magic because the unpacking code scared me.
255 * to be. We don't know what that option was, but we can figure it out
256 * approximately by looking at the addresses in the code. I chose the common
257 * case of reading a memory location into the %eax register:
258 *
259 * movl <some-address>, %eax
260 *
261 * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example,
262 * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
263 *
264 * In this example can guess that the kernel was compiled with
265 * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the
266 * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
267 * kernel isn't that bloated yet.
268 *
269 * Unfortunately, x86 has variable-length instructions, so finding this
270 * particular instruction properly involves writing a disassembler. Instead,
271 * we rely on statistics. We look for "0xA1" and tally the different bytes
272 * which occur 4 bytes later (the "0xC0" in our example above). When one of
273 * those bytes appears three times, we can be reasonably confident that it
274 * forms the start of CONFIG_PAGE_OFFSET.
275 * 332 *
276 * This is amazingly reliable. */ 333 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
277static unsigned long intuit_page_offset(unsigned char *img, unsigned long len) 334 * a small patch to jump over the tricky bits in the Guest, so now we just read
335 * the funky header so we know where in the file to load, and away we go! */
336static unsigned long load_bzimage(int fd)
278{ 337{
279 unsigned int i, possibilities[256] = { 0 }; 338 struct boot_params boot;
339 int r;
340 /* Modern bzImages get loaded at 1M. */
341 void *p = from_guest_phys(0x100000);
280 342
281 for (i = 0; i + 4 < len; i++) { 343 /* Go back to the start of the file and read the header. It should be
282 /* mov 0xXXXXXXXX,%eax */ 344 * a Linux boot header (see Documentation/i386/boot.txt) */
283 if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3) 345 lseek(fd, 0, SEEK_SET);
284 return (unsigned long)img[i+4] << 24; 346 read(fd, &boot, sizeof(boot));
285 }
286 errx(1, "could not determine page offset");
287}
288 347
289/*L:160 Unfortunately the entire ELF image isn't compressed: the segments 348 /* Inside the setup_hdr, we expect the magic "HdrS" */
290 * which need loading are extracted and compressed raw. This denies us the 349 if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
291 * information we need to make a fully-general loader. */ 350 errx(1, "This doesn't look like a bzImage to me");
292static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
293{
294 gzFile f;
295 int ret, len = 0;
296 /* A bzImage always gets loaded at physical address 1M. This is
297 * actually configurable as CONFIG_PHYSICAL_START, but as the comment
298 * there says, "Don't change this unless you know what you are doing".
299 * Indeed. */
300 void *img = (void *)0x100000;
301
302 /* gzdopen takes our file descriptor (carefully placed at the start of
303 * the GZIP header we found) and returns a gzFile. */
304 f = gzdopen(fd, "rb");
305 /* We read it into memory in 64k chunks until we hit the end. */
306 while ((ret = gzread(f, img + len, 65536)) > 0)
307 len += ret;
308 if (ret < 0)
309 err(1, "reading image from bzImage");
310
311 verbose("Unpacked size %i addr %p\n", len, img);
312
313 /* Without the ELF header, we can't tell virtual-physical gap. This is
314 * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately,
315 * I have a clever way of figuring it out from the code itself. */
316 *page_offset = intuit_page_offset(img, len);
317
318 return entry_point(img, img + len, *page_offset);
319}
320 351
321/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're 352 /* Skip over the extra sectors of the header. */
322 * supposed to jump into it and it will unpack itself. We can't do that 353 lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
323 * because the Guest can't run the unpacking code, and adding features to 354
324 * lguest kills puppies, so we don't want to. 355 /* Now read everything into memory, in nice big chunks. */
325 * 356 while ((r = read(fd, p, 65536)) > 0)
326 * The bzImage is formed by putting the decompressing code in front of the 357 p += r;
327 * compressed kernel code. So we can simple scan through it looking for the 358
328 * first "gzip" header, and start decompressing from there. */ 359 /* Finally, code32_start tells us where to enter the kernel. */
329static unsigned long load_bzimage(int fd, unsigned long *page_offset) 360 return boot.hdr.code32_start;
330{
331 unsigned char c;
332 int state = 0;
333
334 /* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */
335 while (read(fd, &c, 1) == 1) {
336 switch (state) {
337 case 0:
338 if (c == 0x1F)
339 state++;
340 break;
341 case 1:
342 if (c == 0x8B)
343 state++;
344 else
345 state = 0;
346 break;
347 case 2 ... 8:
348 state++;
349 break;
350 case 9:
351 /* Seek back to the start of the gzip header. */
352 lseek(fd, -10, SEEK_CUR);
353 /* One final check: "compressed under UNIX". */
354 if (c != 0x03)
355 state = -1;
356 else
357 return unpack_bzimage(fd, page_offset);
358 }
359 }
360 errx(1, "Could not find kernel in bzImage");
361} 361}
362 362
363/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels 363/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
364 * come wrapped up in the self-decompressing "bzImage" format. With some funky 364 * come wrapped up in the self-decompressing "bzImage" format. With some funky
365 * coding, we can load those, too. */ 365 * coding, we can load those, too. */
366static unsigned long load_kernel(int fd, unsigned long *page_offset) 366static unsigned long load_kernel(int fd)
367{ 367{
368 Elf32_Ehdr hdr; 368 Elf32_Ehdr hdr;
369 369
@@ -373,10 +373,10 @@ static unsigned long load_kernel(int fd, unsigned long *page_offset)
373 373
374 /* If it's an ELF file, it starts with "\177ELF" */ 374 /* If it's an ELF file, it starts with "\177ELF" */
375 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) 375 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
376 return map_elf(fd, &hdr, page_offset); 376 return map_elf(fd, &hdr);
377 377
378 /* Otherwise we assume it's a bzImage, and try to unpack it */ 378 /* Otherwise we assume it's a bzImage, and try to unpack it */
379 return load_bzimage(fd, page_offset); 379 return load_bzimage(fd);
380} 380}
381 381
382/* This is a trivial little helper to align pages. Andi Kleen hated it because 382/* This is a trivial little helper to align pages. Andi Kleen hated it because
@@ -402,59 +402,45 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
402 int ifd; 402 int ifd;
403 struct stat st; 403 struct stat st;
404 unsigned long len; 404 unsigned long len;
405 void *iaddr;
406 405
407 ifd = open_or_die(name, O_RDONLY); 406 ifd = open_or_die(name, O_RDONLY);
408 /* fstat() is needed to get the file size. */ 407 /* fstat() is needed to get the file size. */
409 if (fstat(ifd, &st) < 0) 408 if (fstat(ifd, &st) < 0)
410 err(1, "fstat() on initrd '%s'", name); 409 err(1, "fstat() on initrd '%s'", name);
411 410
412 /* The length needs to be rounded up to a page size: mmap needs the 411 /* We map the initrd at the top of memory, but mmap wants it to be
413 * address to be page aligned. */ 412 * page-aligned, so we round the size up for that. */
414 len = page_align(st.st_size); 413 len = page_align(st.st_size);
415 /* We map the initrd at the top of memory. */ 414 map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
416 iaddr = mmap((void *)mem - len, st.st_size,
417 PROT_READ|PROT_EXEC|PROT_WRITE,
418 MAP_FIXED|MAP_PRIVATE, ifd, 0);
419 if (iaddr != (void *)mem - len)
420 err(1, "Mmaping initrd '%s' returned %p not %p",
421 name, iaddr, (void *)mem - len);
422 /* Once a file is mapped, you can close the file descriptor. It's a 415 /* Once a file is mapped, you can close the file descriptor. It's a
423 * little odd, but quite useful. */ 416 * little odd, but quite useful. */
424 close(ifd); 417 close(ifd);
425 verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr); 418 verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
426 419
427 /* We return the initrd size. */ 420 /* We return the initrd size. */
428 return len; 421 return len;
429} 422}
430 423
431/* Once we know how much memory we have, and the address the Guest kernel 424/* Once we know how much memory we have, we can construct simple linear page
432 * expects, we can construct simple linear page tables which will get the Guest 425 * tables which set virtual == physical which will get the Guest far enough
433 * far enough into the boot to create its own. 426 * into the boot to create its own.
434 * 427 *
435 * We lay them out of the way, just below the initrd (which is why we need to 428 * We lay them out of the way, just below the initrd (which is why we need to
436 * know its size). */ 429 * know its size). */
437static unsigned long setup_pagetables(unsigned long mem, 430static unsigned long setup_pagetables(unsigned long mem,
438 unsigned long initrd_size, 431 unsigned long initrd_size)
439 unsigned long page_offset)
440{ 432{
441 u32 *pgdir, *linear; 433 unsigned long *pgdir, *linear;
442 unsigned int mapped_pages, i, linear_pages; 434 unsigned int mapped_pages, i, linear_pages;
443 unsigned int ptes_per_page = getpagesize()/sizeof(u32); 435 unsigned int ptes_per_page = getpagesize()/sizeof(void *);
444 436
445 /* Ideally we map all physical memory starting at page_offset. 437 mapped_pages = mem/getpagesize();
446 * However, if page_offset is 0xC0000000 we can only map 1G of physical
447 * (0xC0000000 + 1G overflows). */
448 if (mem <= -page_offset)
449 mapped_pages = mem/getpagesize();
450 else
451 mapped_pages = -page_offset/getpagesize();
452 438
453 /* Each PTE page can map ptes_per_page pages: how many do we need? */ 439 /* Each PTE page can map ptes_per_page pages: how many do we need? */
454 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; 440 linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
455 441
456 /* We put the toplevel page directory page at the top of memory. */ 442 /* We put the toplevel page directory page at the top of memory. */
457 pgdir = (void *)mem - initrd_size - getpagesize(); 443 pgdir = from_guest_phys(mem) - initrd_size - getpagesize();
458 444
459 /* Now we use the next linear_pages pages as pte pages */ 445 /* Now we use the next linear_pages pages as pte pages */
460 linear = (void *)pgdir - linear_pages*getpagesize(); 446 linear = (void *)pgdir - linear_pages*getpagesize();
@@ -465,20 +451,19 @@ static unsigned long setup_pagetables(unsigned long mem,
465 for (i = 0; i < mapped_pages; i++) 451 for (i = 0; i < mapped_pages; i++)
466 linear[i] = ((i * getpagesize()) | PAGE_PRESENT); 452 linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
467 453
468 /* The top level points to the linear page table pages above. The 454 /* The top level points to the linear page table pages above. */
469 * entry representing page_offset points to the first one, and they
470 * continue from there. */
471 for (i = 0; i < mapped_pages; i += ptes_per_page) { 455 for (i = 0; i < mapped_pages; i += ptes_per_page) {
472 pgdir[(i + page_offset/getpagesize())/ptes_per_page] 456 pgdir[i/ptes_per_page]
473 = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT); 457 = ((to_guest_phys(linear) + i*sizeof(void *))
458 | PAGE_PRESENT);
474 } 459 }
475 460
476 verbose("Linear mapping of %u pages in %u pte pages at %p\n", 461 verbose("Linear mapping of %u pages in %u pte pages at %#lx\n",
477 mapped_pages, linear_pages, linear); 462 mapped_pages, linear_pages, to_guest_phys(linear));
478 463
479 /* We return the top level (guest-physical) address: the kernel needs 464 /* We return the top level (guest-physical) address: the kernel needs
480 * to know where it is. */ 465 * to know where it is. */
481 return (unsigned long)pgdir; 466 return to_guest_phys(pgdir);
482} 467}
483 468
484/* Simple routine to roll all the commandline arguments together with spaces 469/* Simple routine to roll all the commandline arguments together with spaces
@@ -498,14 +483,17 @@ static void concat(char *dst, char *args[])
498 483
499/* This is where we actually tell the kernel to initialize the Guest. We saw 484/* This is where we actually tell the kernel to initialize the Guest. We saw
500 * the arguments it expects when we looked at initialize() in lguest_user.c: 485 * the arguments it expects when we looked at initialize() in lguest_user.c:
501 * the top physical page to allow, the top level pagetable, the entry point and 486 * the base of guest "physical" memory, the top physical page to allow, the
502 * the page_offset constant for the Guest. */ 487 * top level pagetable and the entry point for the Guest. */
503static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) 488static int tell_kernel(unsigned long pgdir, unsigned long start)
504{ 489{
505 u32 args[] = { LHREQ_INITIALIZE, 490 unsigned long args[] = { LHREQ_INITIALIZE,
506 top/getpagesize(), pgdir, start, page_offset }; 491 (unsigned long)guest_base,
492 guest_limit / getpagesize(), pgdir, start };
507 int fd; 493 int fd;
508 494
495 verbose("Guest: %p - %p (%#lx)\n",
496 guest_base, guest_base + guest_limit, guest_limit);
509 fd = open_or_die("/dev/lguest", O_RDWR); 497 fd = open_or_die("/dev/lguest", O_RDWR);
510 if (write(fd, args, sizeof(args)) < 0) 498 if (write(fd, args, sizeof(args)) < 0)
511 err(1, "Writing to /dev/lguest"); 499 err(1, "Writing to /dev/lguest");
@@ -515,11 +503,11 @@ static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
515} 503}
516/*:*/ 504/*:*/
517 505
518static void set_fd(int fd, struct device_list *devices) 506static void add_device_fd(int fd)
519{ 507{
520 FD_SET(fd, &devices->infds); 508 FD_SET(fd, &devices.infds);
521 if (fd > devices->max_infd) 509 if (fd > devices.max_infd)
522 devices->max_infd = fd; 510 devices.max_infd = fd;
523} 511}
524 512
525/*L:200 513/*L:200
@@ -537,36 +525,38 @@ static void set_fd(int fd, struct device_list *devices)
537 * 525 *
538 * This, of course, is merely a different *kind* of icky. 526 * This, of course, is merely a different *kind* of icky.
539 */ 527 */
540static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices) 528static void wake_parent(int pipefd, int lguest_fd)
541{ 529{
542 /* Add the pipe from the Launcher to the fdset in the device_list, so 530 /* Add the pipe from the Launcher to the fdset in the device_list, so
543 * we watch it, too. */ 531 * we watch it, too. */
544 set_fd(pipefd, devices); 532 add_device_fd(pipefd);
545 533
546 for (;;) { 534 for (;;) {
547 fd_set rfds = devices->infds; 535 fd_set rfds = devices.infds;
548 u32 args[] = { LHREQ_BREAK, 1 }; 536 unsigned long args[] = { LHREQ_BREAK, 1 };
549 537
550 /* Wait until input is ready from one of the devices. */ 538 /* Wait until input is ready from one of the devices. */
551 select(devices->max_infd+1, &rfds, NULL, NULL, NULL); 539 select(devices.max_infd+1, &rfds, NULL, NULL, NULL);
552 /* Is it a message from the Launcher? */ 540 /* Is it a message from the Launcher? */
553 if (FD_ISSET(pipefd, &rfds)) { 541 if (FD_ISSET(pipefd, &rfds)) {
554 int ignorefd; 542 int fd;
555 /* If read() returns 0, it means the Launcher has 543 /* If read() returns 0, it means the Launcher has
556 * exited. We silently follow. */ 544 * exited. We silently follow. */
557 if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0) 545 if (read(pipefd, &fd, sizeof(fd)) == 0)
558 exit(0); 546 exit(0);
559 /* Otherwise it's telling us there's a problem with one 547 /* Otherwise it's telling us to change what file
560 * of the devices, and we should ignore that file 548 * descriptors we're to listen to. */
561 * descriptor from now on. */ 549 if (fd >= 0)
562 FD_CLR(ignorefd, &devices->infds); 550 FD_SET(fd, &devices.infds);
551 else
552 FD_CLR(-fd - 1, &devices.infds);
563 } else /* Send LHREQ_BREAK command. */ 553 } else /* Send LHREQ_BREAK command. */
564 write(lguest_fd, args, sizeof(args)); 554 write(lguest_fd, args, sizeof(args));
565 } 555 }
566} 556}
567 557
568/* This routine just sets up a pipe to the Waker process. */ 558/* This routine just sets up a pipe to the Waker process. */
569static int setup_waker(int lguest_fd, struct device_list *device_list) 559static int setup_waker(int lguest_fd)
570{ 560{
571 int pipefd[2], child; 561 int pipefd[2], child;
572 562
@@ -580,7 +570,7 @@ static int setup_waker(int lguest_fd, struct device_list *device_list)
580 if (child == 0) { 570 if (child == 0) {
581 /* Close the "writing" end of our copy of the pipe */ 571 /* Close the "writing" end of our copy of the pipe */
582 close(pipefd[1]); 572 close(pipefd[1]);
583 wake_parent(pipefd[0], lguest_fd, device_list); 573 wake_parent(pipefd[0], lguest_fd);
584 } 574 }
585 /* Close the reading end of our copy of the pipe. */ 575 /* Close the reading end of our copy of the pipe. */
586 close(pipefd[0]); 576 close(pipefd[0]);
@@ -602,83 +592,128 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
602{ 592{
603 /* We have to separately check addr and addr+size, because size could 593 /* We have to separately check addr and addr+size, because size could
604 * be huge and addr + size might wrap around. */ 594 * be huge and addr + size might wrap around. */
605 if (addr >= top || addr + size >= top) 595 if (addr >= guest_limit || addr + size >= guest_limit)
606 errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr); 596 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
607 /* We return a pointer for the caller's convenience, now we know it's 597 /* We return a pointer for the caller's convenience, now we know it's
608 * safe to use. */ 598 * safe to use. */
609 return (void *)addr; 599 return from_guest_phys(addr);
610} 600}
611/* A macro which transparently hands the line number to the real function. */ 601/* A macro which transparently hands the line number to the real function. */
612#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 602#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
613 603
614/* The Guest has given us the address of a "struct lguest_dma". We check it's 604/* This function returns the next descriptor in the chain, or vq->vring.num. */
615 * OK and convert it to an iovec (which is a simple array of ptr/size 605static unsigned next_desc(struct virtqueue *vq, unsigned int i)
616 * pairs). */
617static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
618{ 606{
619 unsigned int i; 607 unsigned int next;
620 struct lguest_dma *udma;
621
622 /* First we make sure that the array memory itself is valid. */
623 udma = check_pointer(dma, sizeof(*udma));
624 /* Now we check each element */
625 for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
626 /* A zero length ends the array. */
627 if (!udma->len[i])
628 break;
629 608
630 iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]); 609 /* If this descriptor says it doesn't chain, we're done. */
631 iov[i].iov_len = udma->len[i]; 610 if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
632 } 611 return vq->vring.num;
633 *num = i; 612
613 /* Check they're not leading us off end of descriptors. */
614 next = vq->vring.desc[i].next;
615 /* Make sure compiler knows to grab that: we don't want it changing! */
616 wmb();
634 617
635 /* We return the pointer to where the caller should write the amount of 618 if (next >= vq->vring.num)
636 * the buffer used. */ 619 errx(1, "Desc next is %u", next);
637 return &udma->used_len; 620
621 return next;
622}
623
624/* This looks in the virtqueue and for the first available buffer, and converts
625 * it to an iovec for convenient access. Since descriptors consist of some
626 * number of output then some number of input descriptors, it's actually two
627 * iovecs, but we pack them into one and note how many of each there were.
628 *
629 * This function returns the descriptor number found, or vq->vring.num (which
630 * is never a valid descriptor number) if none was found. */
631static unsigned get_vq_desc(struct virtqueue *vq,
632 struct iovec iov[],
633 unsigned int *out_num, unsigned int *in_num)
634{
635 unsigned int i, head;
636
637 /* Check it isn't doing very strange things with descriptor numbers. */
638 if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num)
639 errx(1, "Guest moved used index from %u to %u",
640 vq->last_avail_idx, vq->vring.avail->idx);
641
642 /* If there's nothing new since last we looked, return invalid. */
643 if (vq->vring.avail->idx == vq->last_avail_idx)
644 return vq->vring.num;
645
646 /* Grab the next descriptor number they're advertising, and increment
647 * the index we've seen. */
648 head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num];
649
650 /* If their number is silly, that's a fatal mistake. */
651 if (head >= vq->vring.num)
652 errx(1, "Guest says index %u is available", head);
653
654 /* When we start there are none of either input nor output. */
655 *out_num = *in_num = 0;
656
657 i = head;
658 do {
659 /* Grab the first descriptor, and check it's OK. */
660 iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len;
661 iov[*out_num + *in_num].iov_base
662 = check_pointer(vq->vring.desc[i].addr,
663 vq->vring.desc[i].len);
664 /* If this is an input descriptor, increment that count. */
665 if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
666 (*in_num)++;
667 else {
668 /* If it's an output descriptor, they're all supposed
669 * to come before any input descriptors. */
670 if (*in_num)
671 errx(1, "Descriptor has out after in");
672 (*out_num)++;
673 }
674
675 /* If we've got too many, that implies a descriptor loop. */
676 if (*out_num + *in_num > vq->vring.num)
677 errx(1, "Looped descriptor");
678 } while ((i = next_desc(vq, i)) != vq->vring.num);
679
680 return head;
638} 681}
639 682
640/* This routine gets a DMA buffer from the Guest for a given key, and converts 683/* Once we've used one of their buffers, we tell them about it. We'll then
641 * it to an iovec array. It returns the interrupt the Guest wants when we're 684 * want to send them an interrupt, using trigger_irq(). */
642 * finished, and a pointer to the "used_len" field to fill in. */ 685static void add_used(struct virtqueue *vq, unsigned int head, int len)
643static u32 *get_dma_buffer(int fd, void *key,
644 struct iovec iov[], unsigned int *num, u32 *irq)
645{ 686{
646 u32 buf[] = { LHREQ_GETDMA, (u32)key }; 687 struct vring_used_elem *used;
647 unsigned long udma; 688
648 u32 *res; 689 /* Get a pointer to the next entry in the used ring. */
649 690 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
650 /* Ask the kernel for a DMA buffer corresponding to this key. */ 691 used->id = head;
651 udma = write(fd, buf, sizeof(buf)); 692 used->len = len;
652 /* They haven't registered any, or they're all used? */ 693 /* Make sure buffer is written before we update index. */
653 if (udma == (unsigned long)-1) 694 wmb();
654 return NULL; 695 vq->vring.used->idx++;
655
656 /* Convert it into our iovec array */
657 res = dma2iov(udma, iov, num);
658 /* The kernel stashes irq in ->used_len to get it out to us. */
659 *irq = *res;
660 /* Return a pointer to ((struct lguest_dma *)udma)->used_len. */
661 return res;
662} 696}
663 697
664/* This is a convenient routine to send the Guest an interrupt. */ 698/* This actually sends the interrupt for this virtqueue */
665static void trigger_irq(int fd, u32 irq) 699static void trigger_irq(int fd, struct virtqueue *vq)
666{ 700{
667 u32 buf[] = { LHREQ_IRQ, irq }; 701 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
702
703 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
704 return;
705
706 /* Send the Guest an interrupt tell them we used something up. */
668 if (write(fd, buf, sizeof(buf)) != 0) 707 if (write(fd, buf, sizeof(buf)) != 0)
669 err(1, "Triggering irq %i", irq); 708 err(1, "Triggering irq %i", vq->config.irq);
670} 709}
671 710
672/* This simply sets up an iovec array where we can put data to be discarded. 711/* And here's the combo meal deal. Supersize me! */
673 * This happens when the Guest doesn't want or can't handle the input: we have 712static void add_used_and_trigger(int fd, struct virtqueue *vq,
674 * to get rid of it somewhere, and if we bury it in the ceiling space it will 713 unsigned int head, int len)
675 * start to smell after a week. */
676static void discard_iovec(struct iovec *iov, unsigned int *num)
677{ 714{
678 static char discard_buf[1024]; 715 add_used(vq, head, len);
679 *num = 1; 716 trigger_irq(fd, vq);
680 iov->iov_base = discard_buf;
681 iov->iov_len = sizeof(discard_buf);
682} 717}
683 718
684/* Here is the input terminal setting we save, and the routine to restore them 719/* Here is the input terminal setting we save, and the routine to restore them
@@ -701,38 +736,39 @@ struct console_abort
701/* This is the routine which handles console input (ie. stdin). */ 736/* This is the routine which handles console input (ie. stdin). */
702static bool handle_console_input(int fd, struct device *dev) 737static bool handle_console_input(int fd, struct device *dev)
703{ 738{
704 u32 irq = 0, *lenp;
705 int len; 739 int len;
706 unsigned int num; 740 unsigned int head, in_num, out_num;
707 struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; 741 struct iovec iov[dev->vq->vring.num];
708 struct console_abort *abort = dev->priv; 742 struct console_abort *abort = dev->priv;
709 743
710 /* First we get the console buffer from the Guest. The key is dev->mem 744 /* First we need a console buffer from the Guests's input virtqueue. */
711 * which was set to 0 in setup_console(). */ 745 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
712 lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq); 746
713 if (!lenp) { 747 /* If they're not ready for input, stop listening to this file
714 /* If it's not ready for input, warn and set up to discard. */ 748 * descriptor. We'll start again once they add an input buffer. */
715 warn("console: no dma buffer!"); 749 if (head == dev->vq->vring.num)
716 discard_iovec(iov, &num); 750 return false;
717 } 751
752 if (out_num)
753 errx(1, "Output buffers in console in queue?");
718 754
719 /* This is why we convert to iovecs: the readv() call uses them, and so 755 /* This is why we convert to iovecs: the readv() call uses them, and so
720 * it reads straight into the Guest's buffer. */ 756 * it reads straight into the Guest's buffer. */
721 len = readv(dev->fd, iov, num); 757 len = readv(dev->fd, iov, in_num);
722 if (len <= 0) { 758 if (len <= 0) {
723 /* This implies that the console is closed, is /dev/null, or 759 /* This implies that the console is closed, is /dev/null, or
724 * something went terribly wrong. We still go through the rest 760 * something went terribly wrong. */
725 * of the logic, though, especially the exit handling below. */
726 warnx("Failed to get console input, ignoring console."); 761 warnx("Failed to get console input, ignoring console.");
727 len = 0; 762 /* Put the input terminal back. */
763 restore_term();
764 /* Remove callback from input vq, so it doesn't restart us. */
765 dev->vq->handle_output = NULL;
766 /* Stop listening to this fd: don't call us again. */
767 return false;
728 } 768 }
729 769
730 /* If we read the data into the Guest, fill in the length and send the 770 /* Tell the Guest about the new input. */
731 * interrupt. */ 771 add_used_and_trigger(fd, dev->vq, head, len);
732 if (lenp) {
733 *lenp = len;
734 trigger_irq(fd, irq);
735 }
736 772
737 /* Three ^C within one second? Exit. 773 /* Three ^C within one second? Exit.
738 * 774 *
@@ -746,7 +782,7 @@ static bool handle_console_input(int fd, struct device *dev)
746 struct timeval now; 782 struct timeval now;
747 gettimeofday(&now, NULL); 783 gettimeofday(&now, NULL);
748 if (now.tv_sec <= abort->start.tv_sec+1) { 784 if (now.tv_sec <= abort->start.tv_sec+1) {
749 u32 args[] = { LHREQ_BREAK, 0 }; 785 unsigned long args[] = { LHREQ_BREAK, 0 };
750 /* Close the fd so Waker will know it has to 786 /* Close the fd so Waker will know it has to
751 * exit. */ 787 * exit. */
752 close(waker_fd); 788 close(waker_fd);
@@ -761,214 +797,163 @@ static bool handle_console_input(int fd, struct device *dev)
761 /* Any other key resets the abort counter. */ 797 /* Any other key resets the abort counter. */
762 abort->count = 0; 798 abort->count = 0;
763 799
764 /* Now, if we didn't read anything, put the input terminal back and
765 * return failure (meaning, don't call us again). */
766 if (!len) {
767 restore_term();
768 return false;
769 }
770 /* Everything went OK! */ 800 /* Everything went OK! */
771 return true; 801 return true;
772} 802}
773 803
774/* Handling console output is much simpler than input. */ 804/* Handling output for console is simple: we just get all the output buffers
775static u32 handle_console_output(int fd, const struct iovec *iov, 805 * and write them to stdout. */
776 unsigned num, struct device*dev) 806static void handle_console_output(int fd, struct virtqueue *vq)
777{ 807{
778 /* Whatever the Guest sends, write it to standard output. Return the 808 unsigned int head, out, in;
779 * number of bytes written. */ 809 int len;
780 return writev(STDOUT_FILENO, iov, num); 810 struct iovec iov[vq->vring.num];
781} 811
782 812 /* Keep getting output buffers from the Guest until we run out. */
783/* Guest->Host network output is also pretty easy. */ 813 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
784static u32 handle_tun_output(int fd, const struct iovec *iov, 814 if (in)
785 unsigned num, struct device *dev) 815 errx(1, "Input buffers in output queue?");
786{ 816 len = writev(STDOUT_FILENO, iov, out);
787 /* We put a flag in the "priv" pointer of the network device, and set 817 add_used_and_trigger(fd, vq, head, len);
788 * it as soon as we see output. We'll see why in handle_tun_input() */ 818 }
789 *(bool *)dev->priv = true;
790 /* Whatever packet the Guest sent us, write it out to the tun
791 * device. */
792 return writev(dev->fd, iov, num);
793} 819}
794 820
795/* This matches the peer_key() in lguest_net.c. The key for any given slot 821/* Handling output for network is also simple: we get all the output buffers
796 * is the address of the network device's page plus 4 * the slot number. */ 822 * and write them (ignoring the first element) to this device's file descriptor
797static unsigned long peer_offset(unsigned int peernum) 823 * (stdout). */
824static void handle_net_output(int fd, struct virtqueue *vq)
798{ 825{
799 return 4 * peernum; 826 unsigned int head, out, in;
827 int len;
828 struct iovec iov[vq->vring.num];
829
830 /* Keep getting output buffers from the Guest until we run out. */
831 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
832 if (in)
833 errx(1, "Input buffers in output queue?");
834 /* Check header, but otherwise ignore it (we said we supported
835 * no features). */
836 (void)convert(&iov[0], struct virtio_net_hdr);
837 len = writev(vq->dev->fd, iov+1, out-1);
838 add_used_and_trigger(fd, vq, head, len);
839 }
800} 840}
801 841
802/* This is where we handle a packet coming in from the tun device */ 842/* This is where we handle a packet coming in from the tun device to our
843 * Guest. */
803static bool handle_tun_input(int fd, struct device *dev) 844static bool handle_tun_input(int fd, struct device *dev)
804{ 845{
805 u32 irq = 0, *lenp; 846 unsigned int head, in_num, out_num;
806 int len; 847 int len;
807 unsigned num; 848 struct iovec iov[dev->vq->vring.num];
808 struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; 849 struct virtio_net_hdr *hdr;
809 850
810 /* First we get a buffer the Guest has bound to its key. */ 851 /* First we need a network buffer from the Guests's recv virtqueue. */
811 lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num, 852 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
812 &irq); 853 if (head == dev->vq->vring.num) {
813 if (!lenp) {
814 /* Now, it's expected that if we try to send a packet too 854 /* Now, it's expected that if we try to send a packet too
815 * early, the Guest won't be ready yet. This is why we set a 855 * early, the Guest won't be ready yet. Wait until the device
816 * flag when the Guest sends its first packet. If it's sent a 856 * status says it's ready. */
817 * packet we assume it should be ready to receive them. 857 /* FIXME: Actually want DRIVER_ACTIVE here. */
818 * 858 if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK)
819 * Actually, this is what the status bits in the descriptor are
820 * for: we should *use* them. FIXME! */
821 if (*(bool *)dev->priv)
822 warn("network: no dma buffer!"); 859 warn("network: no dma buffer!");
823 discard_iovec(iov, &num); 860 /* We'll turn this back on if input buffers are registered. */
824 } 861 return false;
862 } else if (out_num)
863 errx(1, "Output buffers in network recv queue?");
864
865 /* First element is the header: we set it to 0 (no features). */
866 hdr = convert(&iov[0], struct virtio_net_hdr);
867 hdr->flags = 0;
868 hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
825 869
826 /* Read the packet from the device directly into the Guest's buffer. */ 870 /* Read the packet from the device directly into the Guest's buffer. */
827 len = readv(dev->fd, iov, num); 871 len = readv(dev->fd, iov+1, in_num-1);
828 if (len <= 0) 872 if (len <= 0)
829 err(1, "reading network"); 873 err(1, "reading network");
830 874
831 /* Write the used_len, and trigger the interrupt for the Guest */ 875 /* Tell the Guest about the new packet. */
832 if (lenp) { 876 add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len);
833 *lenp = len; 877
834 trigger_irq(fd, irq);
835 }
836 verbose("tun input packet len %i [%02x %02x] (%s)\n", len, 878 verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
837 ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1], 879 ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
838 lenp ? "sent" : "discarded"); 880 head != dev->vq->vring.num ? "sent" : "discarded");
881
839 /* All good. */ 882 /* All good. */
840 return true; 883 return true;
841} 884}
842 885
843/* The last device handling routine is block output: the Guest has sent a DMA 886/* This callback ensures we try again, in case we stopped console or net
844 * to the block device. It will have placed the command it wants in the 887 * delivery because Guest didn't have any buffers. */
845 * "struct lguest_block_page". */ 888static void enable_fd(int fd, struct virtqueue *vq)
846static u32 handle_block_output(int fd, const struct iovec *iov,
847 unsigned num, struct device *dev)
848{ 889{
849 struct lguest_block_page *p = dev->mem; 890 add_device_fd(vq->dev->fd);
850 u32 irq, *lenp; 891 /* Tell waker to listen to it again */
851 unsigned int len, reply_num; 892 write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
852 struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
853 off64_t device_len, off = (off64_t)p->sector * 512;
854
855 /* First we extract the device length from the dev->priv pointer. */
856 device_len = *(off64_t *)dev->priv;
857
858 /* We first check that the read or write is within the length of the
859 * block file. */
860 if (off >= device_len)
861 err(1, "Bad offset %llu vs %llu", off, device_len);
862 /* Move to the right location in the block file. This shouldn't fail,
863 * but best to check. */
864 if (lseek64(dev->fd, off, SEEK_SET) != off)
865 err(1, "Bad seek to sector %i", p->sector);
866
867 verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
868
869 /* They were supposed to bind a reply buffer at key equal to the start
870 * of the block device memory. We need this to tell them when the
871 * request is finished. */
872 lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
873 if (!lenp)
874 err(1, "Block request didn't give us a dma buffer");
875
876 if (p->type) {
877 /* A write request. The DMA they sent contained the data, so
878 * write it out. */
879 len = writev(dev->fd, iov, num);
880 /* Grr... Now we know how long the "struct lguest_dma" they
881 * sent was, we make sure they didn't try to write over the end
882 * of the block file (possibly extending it). */
883 if (off + len > device_len) {
884 /* Trim it back to the correct length */
885 ftruncate64(dev->fd, device_len);
886 /* Die, bad Guest, die. */
887 errx(1, "Write past end %llu+%u", off, len);
888 }
889 /* The reply length is 0: we just send back an empty DMA to
890 * interrupt them and tell them the write is finished. */
891 *lenp = 0;
892 } else {
893 /* A read request. They sent an empty DMA to start the
894 * request, and we put the read contents into the reply
895 * buffer. */
896 len = readv(dev->fd, reply, reply_num);
897 *lenp = len;
898 }
899
900 /* The result is 1 (done), 2 if there was an error (short read or
901 * write). */
902 p->result = 1 + (p->bytes != len);
903 /* Now tell them we've used their reply buffer. */
904 trigger_irq(fd, irq);
905
906 /* We're supposed to return the number of bytes of the output buffer we
907 * used. But the block device uses the "result" field instead, so we
908 * don't bother. */
909 return 0;
910} 893}
911 894
912/* This is the generic routine we call when the Guest sends some DMA out. */ 895/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
913static void handle_output(int fd, unsigned long dma, unsigned long key, 896static void handle_output(int fd, unsigned long addr)
914 struct device_list *devices)
915{ 897{
916 struct device *i; 898 struct device *i;
917 u32 *lenp; 899 struct virtqueue *vq;
918 struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; 900
919 unsigned num = 0; 901 /* Check each virtqueue. */
920 902 for (i = devices.dev; i; i = i->next) {
921 /* Convert the "struct lguest_dma" they're sending to a "struct 903 for (vq = i->vq; vq; vq = vq->next) {
922 * iovec". */ 904 if (vq->config.pfn == addr/getpagesize()
923 lenp = dma2iov(dma, iov, &num); 905 && vq->handle_output) {
924 906 verbose("Output to %s\n", vq->dev->name);
925 /* Check each device: if they expect output to this key, tell them to 907 vq->handle_output(fd, vq);
926 * handle it. */ 908 return;
927 for (i = devices->dev; i; i = i->next) { 909 }
928 if (i->handle_output && key == i->watch_key) {
929 /* We write the result straight into the used_len field
930 * for them. */
931 *lenp = i->handle_output(fd, iov, num, i);
932 return;
933 } 910 }
934 } 911 }
935 912
936 /* This can happen: the kernel sends any SEND_DMA which doesn't match 913 /* Early console write is done using notify on a nul-terminated string
937 * another Guest to us. It could be that another Guest just left a 914 * in Guest memory. */
938 * network, for example. But it's unusual. */ 915 if (addr >= guest_limit)
939 warnx("Pending dma %p, key %p", (void *)dma, (void *)key); 916 errx(1, "Bad NOTIFY %#lx", addr);
917
918 write(STDOUT_FILENO, from_guest_phys(addr),
919 strnlen(from_guest_phys(addr), guest_limit - addr));
940} 920}
941 921
942/* This is called when the waker wakes us up: check for incoming file 922/* This is called when the waker wakes us up: check for incoming file
943 * descriptors. */ 923 * descriptors. */
944static void handle_input(int fd, struct device_list *devices) 924static void handle_input(int fd)
945{ 925{
946 /* select() wants a zeroed timeval to mean "don't wait". */ 926 /* select() wants a zeroed timeval to mean "don't wait". */
947 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; 927 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
948 928
949 for (;;) { 929 for (;;) {
950 struct device *i; 930 struct device *i;
951 fd_set fds = devices->infds; 931 fd_set fds = devices.infds;
952 932
953 /* If nothing is ready, we're done. */ 933 /* If nothing is ready, we're done. */
954 if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0) 934 if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
955 break; 935 break;
956 936
957 /* Otherwise, call the device(s) which have readable 937 /* Otherwise, call the device(s) which have readable
958 * file descriptors and a method of handling them. */ 938 * file descriptors and a method of handling them. */
959 for (i = devices->dev; i; i = i->next) { 939 for (i = devices.dev; i; i = i->next) {
960 if (i->handle_input && FD_ISSET(i->fd, &fds)) { 940 if (i->handle_input && FD_ISSET(i->fd, &fds)) {
941 int dev_fd;
942 if (i->handle_input(fd, i))
943 continue;
944
961 /* If handle_input() returns false, it means we 945 /* If handle_input() returns false, it means we
962 * should no longer service it. 946 * should no longer service it. Networking and
963 * handle_console_input() does this. */ 947 * console do this when there's no input
964 if (!i->handle_input(fd, i)) { 948 * buffers to deliver into. Console also uses
965 /* Clear it from the set of input file 949 * it when it discovers that stdin is
966 * descriptors kept at the head of the 950 * closed. */
967 * device list. */ 951 FD_CLR(i->fd, &devices.infds);
968 FD_CLR(i->fd, &devices->infds); 952 /* Tell waker to ignore it too, by sending a
969 /* Tell waker to ignore it too... */ 953 * negative fd number (-1, since 0 is a valid
970 write(waker_fd, &i->fd, sizeof(i->fd)); 954 * FD number). */
971 } 955 dev_fd = -i->fd - 1;
956 write(waker_fd, &dev_fd, sizeof(dev_fd));
972 } 957 }
973 } 958 }
974 } 959 }
@@ -982,43 +967,93 @@ static void handle_input(int fd, struct device_list *devices)
982 * routines to allocate them. 967 * routines to allocate them.
983 * 968 *
984 * This routine allocates a new "struct lguest_device_desc" from descriptor 969 * This routine allocates a new "struct lguest_device_desc" from descriptor
985 * table in the devices array just above the Guest's normal memory. */ 970 * table just above the Guest's normal memory. It returns a pointer to that
986static struct lguest_device_desc * 971 * descriptor. */
987new_dev_desc(struct lguest_device_desc *descs, 972static struct lguest_device_desc *new_dev_desc(u16 type)
988 u16 type, u16 features, u16 num_pages)
989{ 973{
990 unsigned int i; 974 struct lguest_device_desc *d;
991 975
992 for (i = 0; i < LGUEST_MAX_DEVICES; i++) { 976 /* We only have one page for all the descriptors. */
993 if (!descs[i].type) { 977 if (devices.desc_used + sizeof(*d) > getpagesize())
994 descs[i].type = type; 978 errx(1, "Too many devices");
995 descs[i].features = features; 979
996 descs[i].num_pages = num_pages; 980 /* We don't need to set config_len or status: page is 0 already. */
997 /* If they said the device needs memory, we allocate 981 d = (void *)devices.descpage + devices.desc_used;
998 * that now, bumping up the top of Guest memory. */ 982 d->type = type;
999 if (num_pages) { 983 devices.desc_used += sizeof(*d);
1000 map_zeroed_pages(top, num_pages); 984
1001 descs[i].pfn = top/getpagesize(); 985 return d;
1002 top += num_pages*getpagesize();
1003 }
1004 return &descs[i];
1005 }
1006 }
1007 errx(1, "too many devices");
1008} 986}
1009 987
1010/* This monster routine does all the creation and setup of a new device, 988/* Each device descriptor is followed by some configuration information.
1011 * including caling new_dev_desc() to allocate the descriptor and device 989 * The first byte is a "status" byte for the Guest to report what's happening.
1012 * memory. */ 990 * After that are fields: u8 type, u8 len, [... len bytes...].
1013static struct device *new_device(struct device_list *devices, 991 *
1014 u16 type, u16 num_pages, u16 features, 992 * This routine adds a new field to an existing device's descriptor. It only
1015 int fd, 993 * works for the last device, but that's OK because that's how we use it. */
1016 bool (*handle_input)(int, struct device *), 994static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c)
1017 unsigned long watch_off, 995{
1018 u32 (*handle_output)(int, 996 /* This is the last descriptor, right? */
1019 const struct iovec *, 997 assert(devices.descpage + devices.desc_used
1020 unsigned, 998 == (u8 *)(dev->desc + 1) + dev->desc->config_len);
1021 struct device *)) 999
1000 /* We only have one page of device descriptions. */
1001 if (devices.desc_used + 2 + len > getpagesize())
1002 errx(1, "Too many devices");
1003
1004 /* Copy in the new config header: type then length. */
1005 devices.descpage[devices.desc_used++] = type;
1006 devices.descpage[devices.desc_used++] = len;
1007 memcpy(devices.descpage + devices.desc_used, c, len);
1008 devices.desc_used += len;
1009
1010 /* Update the device descriptor length: two byte head then data. */
1011 dev->desc->config_len += 2 + len;
1012}
1013
1014/* This routine adds a virtqueue to a device. We specify how many descriptors
1015 * the virtqueue is to have. */
1016static void add_virtqueue(struct device *dev, unsigned int num_descs,
1017 void (*handle_output)(int fd, struct virtqueue *me))
1018{
1019 unsigned int pages;
1020 struct virtqueue **i, *vq = malloc(sizeof(*vq));
1021 void *p;
1022
1023 /* First we need some pages for this virtqueue. */
1024 pages = (vring_size(num_descs) + getpagesize() - 1) / getpagesize();
1025 p = get_pages(pages);
1026
1027 /* Initialize the configuration. */
1028 vq->config.num = num_descs;
1029 vq->config.irq = devices.next_irq++;
1030 vq->config.pfn = to_guest_phys(p) / getpagesize();
1031
1032 /* Initialize the vring. */
1033 vring_init(&vq->vring, num_descs, p);
1034
1035 /* Add the configuration information to this device's descriptor. */
1036 add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE,
1037 sizeof(vq->config), &vq->config);
1038
1039 /* Add to tail of list, so dev->vq is first vq, dev->vq->next is
1040 * second. */
1041 for (i = &dev->vq; *i; i = &(*i)->next);
1042 *i = vq;
1043
1044 /* Link virtqueue back to device. */
1045 vq->dev = dev;
1046
1047 /* Set up handler. */
1048 vq->handle_output = handle_output;
1049 if (!handle_output)
1050 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
1051}
1052
1053/* This routine does all the creation and setup of a new device, including
1054 * calling new_dev_desc() to allocate the descriptor and device memory. */
1055static struct device *new_device(const char *name, u16 type, int fd,
1056 bool (*handle_input)(int, struct device *))
1022{ 1057{
1023 struct device *dev = malloc(sizeof(*dev)); 1058 struct device *dev = malloc(sizeof(*dev));
1024 1059
@@ -1026,27 +1061,25 @@ static struct device *new_device(struct device_list *devices,
1026 * easier, but the user expects the devices to be arranged on the bus 1061 * easier, but the user expects the devices to be arranged on the bus
1027 * in command-line order. The first network device on the command line 1062 * in command-line order. The first network device on the command line
1028 * is eth0, the first block device /dev/lgba, etc. */ 1063 * is eth0, the first block device /dev/lgba, etc. */
1029 *devices->lastdev = dev; 1064 *devices.lastdev = dev;
1030 dev->next = NULL; 1065 dev->next = NULL;
1031 devices->lastdev = &dev->next; 1066 devices.lastdev = &dev->next;
1032 1067
1033 /* Now we populate the fields one at a time. */ 1068 /* Now we populate the fields one at a time. */
1034 dev->fd = fd; 1069 dev->fd = fd;
1035 /* If we have an input handler for this file descriptor, then we add it 1070 /* If we have an input handler for this file descriptor, then we add it
1036 * to the device_list's fdset and maxfd. */ 1071 * to the device_list's fdset and maxfd. */
1037 if (handle_input) 1072 if (handle_input)
1038 set_fd(dev->fd, devices); 1073 add_device_fd(dev->fd);
1039 dev->desc = new_dev_desc(devices->descs, type, features, num_pages); 1074 dev->desc = new_dev_desc(type);
1040 dev->mem = (void *)(dev->desc->pfn * getpagesize());
1041 dev->handle_input = handle_input; 1075 dev->handle_input = handle_input;
1042 dev->watch_key = (unsigned long)dev->mem + watch_off; 1076 dev->name = name;
1043 dev->handle_output = handle_output;
1044 return dev; 1077 return dev;
1045} 1078}
1046 1079
1047/* Our first setup routine is the console. It's a fairly simple device, but 1080/* Our first setup routine is the console. It's a fairly simple device, but
1048 * UNIX tty handling makes it uglier than it could be. */ 1081 * UNIX tty handling makes it uglier than it could be. */
1049static void setup_console(struct device_list *devices) 1082static void setup_console(void)
1050{ 1083{
1051 struct device *dev; 1084 struct device *dev;
1052 1085
@@ -1062,127 +1095,38 @@ static void setup_console(struct device_list *devices)
1062 atexit(restore_term); 1095 atexit(restore_term);
1063 } 1096 }
1064 1097
1065 /* We don't currently require any memory for the console, so we ask for 1098 dev = new_device("console", VIRTIO_ID_CONSOLE,
1066 * 0 pages. */ 1099 STDIN_FILENO, handle_console_input);
1067 dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
1068 STDIN_FILENO, handle_console_input,
1069 LGUEST_CONSOLE_DMA_KEY, handle_console_output);
1070 /* We store the console state in dev->priv, and initialize it. */ 1100 /* We store the console state in dev->priv, and initialize it. */
1071 dev->priv = malloc(sizeof(struct console_abort)); 1101 dev->priv = malloc(sizeof(struct console_abort));
1072 ((struct console_abort *)dev->priv)->count = 0; 1102 ((struct console_abort *)dev->priv)->count = 0;
1073 verbose("device %p: console\n",
1074 (void *)(dev->desc->pfn * getpagesize()));
1075}
1076 1103
1077/* Setting up a block file is also fairly straightforward. */ 1104 /* The console needs two virtqueues: the input then the output. When
1078static void setup_block_file(const char *filename, struct device_list *devices) 1105 * they put something in the input queue, we make sure we're listening to
1079{ 1106 * stdin. When they put something in the output queue, we write it to
1080 int fd; 1107 * stdout. */
1081 struct device *dev; 1108 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
1082 off64_t *device_len; 1109 add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
1083 struct lguest_block_page *p; 1110
1084 1111 verbose("device %u: console\n", devices.device_num++);
1085 /* We open with O_LARGEFILE because otherwise we get stuck at 2G. We
1086 * open with O_DIRECT because otherwise our benchmarks go much too
1087 * fast. */
1088 fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
1089
1090 /* We want one page, and have no input handler (the block file never
1091 * has anything interesting to say to us). Our timing will be quite
1092 * random, so it should be a reasonable randomness source. */
1093 dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
1094 LGUEST_DEVICE_F_RANDOMNESS,
1095 fd, NULL, 0, handle_block_output);
1096
1097 /* We store the device size in the private area */
1098 device_len = dev->priv = malloc(sizeof(*device_len));
1099 /* This is the safe way of establishing the size of our device: it
1100 * might be a normal file or an actual block device like /dev/hdb. */
1101 *device_len = lseek64(fd, 0, SEEK_END);
1102
1103 /* The device memory is a "struct lguest_block_page". It's zeroed
1104 * already, we just need to put in the device size. Block devices
1105 * think in sectors (ie. 512 byte chunks), so we translate here. */
1106 p = dev->mem;
1107 p->num_sectors = *device_len/512;
1108 verbose("device %p: block %i sectors\n",
1109 (void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
1110} 1112}
1113/*:*/
1111 1114
1112/* 1115/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
1113 * Network Devices. 1116 * --sharenet=<name> option which opens or creates a named pipe. This can be
1117 * used to send packets to another guest in a 1:1 manner.
1114 * 1118 *
1115 * Setting up network devices is quite a pain, because we have three types. 1119 * More sophisticated is to use one of the tools developed for projects like UML
1116 * First, we have the inter-Guest network. This is a file which is mapped into 1120 * to do networking.
1117 * the address space of the Guests who are on the network. Because it is a
1118 * shared mapping, the same page underlies all the devices, and they can send
1119 * DMA to each other.
1120 * 1121 *
1121 * Remember from our network driver, the Guest is told what slot in the page it 1122 * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
1122 * is to use. We use exclusive fnctl locks to reserve a slot. If another 1123 * completely generic ("here's my vring, attach to your vring") and would work
1123 * Guest is using a slot, the lock will fail and we try another. Because fnctl 1124 * for any traffic. Of course, namespace and permissions issues need to be
1124 * locks are cleaned up automatically when we die, this cleverly means that our 1125 * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide
1125 * reservation on the slot will vanish if we crash. */ 1126 * multiple inter-guest channels behind one interface, although it would
1126static unsigned int find_slot(int netfd, const char *filename) 1127 * require some manner of hotplugging new virtio channels.
1127{ 1128 *
1128 struct flock fl; 1129 * Finally, we could implement a virtio network switch in the kernel. :*/
1129
1130 fl.l_type = F_WRLCK;
1131 fl.l_whence = SEEK_SET;
1132 fl.l_len = 1;
1133 /* Try a 1 byte lock in each possible position number */
1134 for (fl.l_start = 0;
1135 fl.l_start < getpagesize()/sizeof(struct lguest_net);
1136 fl.l_start++) {
1137 /* If we succeed, return the slot number. */
1138 if (fcntl(netfd, F_SETLK, &fl) == 0)
1139 return fl.l_start;
1140 }
1141 errx(1, "No free slots in network file %s", filename);
1142}
1143
1144/* This function sets up the network file */
1145static void setup_net_file(const char *filename,
1146 struct device_list *devices)
1147{
1148 int netfd;
1149 struct device *dev;
1150
1151 /* We don't use open_or_die() here: for friendliness we create the file
1152 * if it doesn't already exist. */
1153 netfd = open(filename, O_RDWR, 0);
1154 if (netfd < 0) {
1155 if (errno == ENOENT) {
1156 netfd = open(filename, O_RDWR|O_CREAT, 0600);
1157 if (netfd >= 0) {
1158 /* If we succeeded, initialize the file with a
1159 * blank page. */
1160 char page[getpagesize()];
1161 memset(page, 0, sizeof(page));
1162 write(netfd, page, sizeof(page));
1163 }
1164 }
1165 if (netfd < 0)
1166 err(1, "cannot open net file '%s'", filename);
1167 }
1168
1169 /* We need 1 page, and the features indicate the slot to use and that
1170 * no checksum is needed. We never touch this device again; it's
1171 * between the Guests on the network, so we don't register input or
1172 * output handlers. */
1173 dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
1174 find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
1175 -1, NULL, 0, NULL);
1176
1177 /* Map the shared file. */
1178 if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
1179 MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
1180 err(1, "could not mmap '%s'", filename);
1181 verbose("device %p: shared net %s, peer %i\n",
1182 (void *)(dev->desc->pfn * getpagesize()), filename,
1183 dev->desc->features & ~LGUEST_NET_F_NOCSUM);
1184}
1185/*:*/
1186 1130
1187static u32 str2ip(const char *ipaddr) 1131static u32 str2ip(const char *ipaddr)
1188{ 1132{
@@ -1217,7 +1161,7 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name)
1217 1161
1218/* This sets up the Host end of the network device with an IP address, brings 1162/* This sets up the Host end of the network device with an IP address, brings
1219 * it up so packets will flow, then copies the MAC address into the hwaddr 1163 * it up so packets will flow, then copies the MAC address into the hwaddr
1220 * pointer (in practice, the Host's slot in the network device's memory). */ 1164 * pointer. */
1221static void configure_device(int fd, const char *devname, u32 ipaddr, 1165static void configure_device(int fd, const char *devname, u32 ipaddr,
1222 unsigned char hwaddr[6]) 1166 unsigned char hwaddr[6])
1223{ 1167{
@@ -1243,18 +1187,18 @@ static void configure_device(int fd, const char *devname, u32 ipaddr,
1243 memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); 1187 memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
1244} 1188}
1245 1189
1246/*L:195 The other kind of network is a Host<->Guest network. This can either 1190/*L:195 Our network is a Host<->Guest network. This can either use bridging or
1247 * use briding or routing, but the principle is the same: it uses the "tun" 1191 * routing, but the principle is the same: it uses the "tun" device to inject
1248 * device to inject packets into the Host as if they came in from a normal 1192 * packets into the Host as if they came in from a normal network card. We
1249 * network card. We just shunt packets between the Guest and the tun 1193 * just shunt packets between the Guest and the tun device. */
1250 * device. */ 1194static void setup_tun_net(const char *arg)
1251static void setup_tun_net(const char *arg, struct device_list *devices)
1252{ 1195{
1253 struct device *dev; 1196 struct device *dev;
1254 struct ifreq ifr; 1197 struct ifreq ifr;
1255 int netfd, ipfd; 1198 int netfd, ipfd;
1256 u32 ip; 1199 u32 ip;
1257 const char *br_name = NULL; 1200 const char *br_name = NULL;
1201 u8 hwaddr[6];
1258 1202
1259 /* We open the /dev/net/tun device and tell it we want a tap device. A 1203 /* We open the /dev/net/tun device and tell it we want a tap device. A
1260 * tap device is like a tun device, only somehow different. To tell 1204 * tap device is like a tun device, only somehow different. To tell
@@ -1270,21 +1214,13 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
1270 * device: trust us! */ 1214 * device: trust us! */
1271 ioctl(netfd, TUNSETNOCSUM, 1); 1215 ioctl(netfd, TUNSETNOCSUM, 1);
1272 1216
1273 /* We create the net device with 1 page, using the features field of 1217 /* First we create a new network device. */
1274 * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and 1218 dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
1275 * that the device has fairly random timing. We do *not* specify
1276 * LGUEST_NET_F_NOCSUM: these packets can reach the real world.
1277 *
1278 * We will put our MAC address is slot 0 for the Guest to see, so
1279 * it will send packets to us using the key "peer_offset(0)": */
1280 dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
1281 NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
1282 handle_tun_input, peer_offset(0), handle_tun_output);
1283 1219
1284 /* We keep a flag which says whether we've seen packets come out from 1220 /* Network devices need a receive and a send queue, just like
1285 * this network device. */ 1221 * console. */
1286 dev->priv = malloc(sizeof(bool)); 1222 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
1287 *(bool *)dev->priv = false; 1223 add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
1288 1224
1289 /* We need a socket to perform the magic network ioctls to bring up the 1225 /* We need a socket to perform the magic network ioctls to bring up the
1290 * tap interface, connect to the bridge etc. Any socket will do! */ 1226 * tap interface, connect to the bridge etc. Any socket will do! */
@@ -1300,44 +1236,251 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
1300 } else /* It is an IP address to set up the device with */ 1236 } else /* It is an IP address to set up the device with */
1301 ip = str2ip(arg); 1237 ip = str2ip(arg);
1302 1238
1303 /* We are peer 0, ie. first slot, so we hand dev->mem to this routine 1239 /* Set up the tun device, and get the mac address for the interface. */
1304 * to write the MAC address at the start of the device memory. */ 1240 configure_device(ipfd, ifr.ifr_name, ip, hwaddr);
1305 configure_device(ipfd, ifr.ifr_name, ip, dev->mem);
1306 1241
1307 /* Set "promisc" bit: we want every single packet if we're going to 1242 /* Tell Guest what MAC address to use. */
1308 * bridge to other machines (and otherwise it doesn't matter). */ 1243 add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr);
1309 *((u8 *)dev->mem) |= 0x1;
1310 1244
1245 /* We don't need the socket any more; setup is done. */
1311 close(ipfd); 1246 close(ipfd);
1312 1247
1313 verbose("device %p: tun net %u.%u.%u.%u\n", 1248 verbose("device %u: tun net %u.%u.%u.%u\n",
1314 (void *)(dev->desc->pfn * getpagesize()), 1249 devices.device_num++,
1315 (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip); 1250 (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip);
1316 if (br_name) 1251 if (br_name)
1317 verbose("attached to bridge: %s\n", br_name); 1252 verbose("attached to bridge: %s\n", br_name);
1318} 1253}
1254
1255
1256/*
1257 * Block device.
1258 *
1259 * Serving a block device is really easy: the Guest asks for a block number and
1260 * we read or write that position in the file.
1261 *
1262 * Unfortunately, this is amazingly slow: the Guest waits until the read is
1263 * finished before running anything else, even if it could be doing useful
1264 * work. We could use async I/O, except it's reputed to suck so hard that
1265 * characters actually go missing from your code when you try to use it.
1266 *
1267 * So we farm the I/O out to a thread, and communicate with it via pipes. */
1268
/* Per-block-device bookkeeping; the device's priv pointer refers to one of
 * these. */
struct vblk_info
{
	/* Length of the backing file, as reported by lseek64(SEEK_END). */
	off64_t len;

	/* Descriptor of the open backing file. */
	int fd;

	/* Work pipe: the I/O thread reads from [0]; the Launcher writes a
	 * byte to [1] whenever there is work to do. */
	int workpipe[2];

	/* The I/O thread writes here to announce finished work, after which
	 * the Launcher triggers an interrupt to the Guest. */
	int done_fd;
};
1285
1286/* This is the core of the I/O thread. It returns true if it did something. */
1287static bool service_io(struct device *dev)
1288{
1289 struct vblk_info *vblk = dev->priv;
1290 unsigned int head, out_num, in_num, wlen;
1291 int ret;
1292 struct virtio_blk_inhdr *in;
1293 struct virtio_blk_outhdr *out;
1294 struct iovec iov[dev->vq->vring.num];
1295 off64_t off;
1296
1297 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
1298 if (head == dev->vq->vring.num)
1299 return false;
1300
1301 if (out_num == 0 || in_num == 0)
1302 errx(1, "Bad virtblk cmd %u out=%u in=%u",
1303 head, out_num, in_num);
1304
1305 out = convert(&iov[0], struct virtio_blk_outhdr);
1306 in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
1307 off = out->sector * 512;
1308
1309 /* This is how we implement barriers. Pretty poor, no? */
1310 if (out->type & VIRTIO_BLK_T_BARRIER)
1311 fdatasync(vblk->fd);
1312
1313 if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
1314 fprintf(stderr, "Scsi commands unsupported\n");
1315 in->status = VIRTIO_BLK_S_UNSUPP;
1316 wlen = sizeof(in);
1317 } else if (out->type & VIRTIO_BLK_T_OUT) {
1318 /* Write */
1319
1320 /* Move to the right location in the block file. This can fail
1321 * if they try to write past end. */
1322 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1323 err(1, "Bad seek to sector %llu", out->sector);
1324
1325 ret = writev(vblk->fd, iov+1, out_num-1);
1326 verbose("WRITE to sector %llu: %i\n", out->sector, ret);
1327
1328 /* Grr... Now we know how long the descriptor they sent was, we
1329 * make sure they didn't try to write over the end of the block
1330 * file (possibly extending it). */
1331 if (ret > 0 && off + ret > vblk->len) {
1332 /* Trim it back to the correct length */
1333 ftruncate64(vblk->fd, vblk->len);
1334 /* Die, bad Guest, die. */
1335 errx(1, "Write past end %llu+%u", off, ret);
1336 }
1337 wlen = sizeof(in);
1338 in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
1339 } else {
1340 /* Read */
1341
1342 /* Move to the right location in the block file. This can fail
1343 * if they try to read past end. */
1344 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1345 err(1, "Bad seek to sector %llu", out->sector);
1346
1347 ret = readv(vblk->fd, iov+1, in_num-1);
1348 verbose("READ from sector %llu: %i\n", out->sector, ret);
1349 if (ret >= 0) {
1350 wlen = sizeof(in) + ret;
1351 in->status = VIRTIO_BLK_S_OK;
1352 } else {
1353 wlen = sizeof(in);
1354 in->status = VIRTIO_BLK_S_IOERR;
1355 }
1356 }
1357
1358 /* We can't trigger an IRQ, because we're not the Launcher. It does
1359 * that when we tell it we're done. */
1360 add_used(dev->vq, head, wlen);
1361 return true;
1362}
1363
1364/* This is the thread which actually services the I/O. */
1365static int io_thread(void *_dev)
1366{
1367 struct device *dev = _dev;
1368 struct vblk_info *vblk = dev->priv;
1369 char c;
1370
1371 /* Close other side of workpipe so we get 0 read when main dies. */
1372 close(vblk->workpipe[1]);
1373 /* Close the other side of the done_fd pipe. */
1374 close(dev->fd);
1375
1376 /* When this read fails, it means Launcher died, so we follow. */
1377 while (read(vblk->workpipe[0], &c, 1) == 1) {
1378 /* We acknowledge each request immediately, to reduce latency,
1379 * rather than waiting until we've done them all. I haven't
1380 * measured to see if it makes any difference. */
1381 while (service_io(dev))
1382 write(vblk->done_fd, &c, 1);
1383 }
1384 return 0;
1385}
1386
1387/* When the thread says some I/O is done, we interrupt the Guest. */
1388static bool handle_io_finish(int fd, struct device *dev)
1389{
1390 char c;
1391
1392 /* If child died, presumably it printed message. */
1393 if (read(dev->fd, &c, 1) != 1)
1394 exit(1);
1395
1396 /* It did some work, so trigger the irq. */
1397 trigger_irq(fd, dev->vq);
1398 return true;
1399}
1400
1401/* When the Guest submits some I/O, we wake the I/O thread. */
1402static void handle_virtblk_output(int fd, struct virtqueue *vq)
1403{
1404 struct vblk_info *vblk = vq->dev->priv;
1405 char c = 0;
1406
1407 /* Wake up I/O thread and tell it to go to work! */
1408 if (write(vblk->workpipe[1], &c, 1) != 1)
1409 /* Presumably it indicated why it died. */
1410 exit(1);
1411}
1412
1413/* This creates a virtual block device. */
1414static void setup_block_file(const char *filename)
1415{
1416 int p[2];
1417 struct device *dev;
1418 struct vblk_info *vblk;
1419 void *stack;
1420 u64 cap;
1421 unsigned int val;
1422
1423 /* This is the pipe the I/O thread will use to tell us I/O is done. */
1424 pipe(p);
1425
1426 /* The device responds to return from I/O thread. */
1427 dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);
1428
1429 /* The device has a virtqueue. */
1430 add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);
1431
1432 /* Allocate the room for our own bookkeeping */
1433 vblk = dev->priv = malloc(sizeof(*vblk));
1434
1435 /* First we open the file and store the length. */
1436 vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
1437 vblk->len = lseek64(vblk->fd, 0, SEEK_END);
1438
1439 /* Tell Guest how many sectors this device has. */
1440 cap = cpu_to_le64(vblk->len / 512);
1441 add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap);
1442
1443 /* Tell Guest not to put in too many descriptors at once: two are used
1444 * for the in and out elements. */
1445 val = cpu_to_le32(VIRTQUEUE_NUM - 2);
1446 add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val);
1447
1448 /* The I/O thread writes to this end of the pipe when done. */
1449 vblk->done_fd = p[1];
1450
1451 /* This is how we tell the I/O thread about more work. */
1452 pipe(vblk->workpipe);
1453
1454 /* Create stack for thread and run it */
1455 stack = malloc(32768);
1456 if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1)
1457 err(1, "Creating clone");
1458
1459 /* We don't need to keep the I/O thread's end of the pipes open. */
1460 close(vblk->done_fd);
1461 close(vblk->workpipe[0]);
1462
1463 verbose("device %u: virtblock %llu sectors\n",
1464 devices.device_num, cap);
1465}
1319/* That's the end of device setup. */ 1466/* That's the end of device setup. */
1320 1467
1321/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves 1468/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
1322 * its input and output, and finally, lays it to rest. */ 1469 * its input and output, and finally, lays it to rest. */
1323static void __attribute__((noreturn)) 1470static void __attribute__((noreturn)) run_guest(int lguest_fd)
1324run_guest(int lguest_fd, struct device_list *device_list)
1325{ 1471{
1326 for (;;) { 1472 for (;;) {
1327 u32 args[] = { LHREQ_BREAK, 0 }; 1473 unsigned long args[] = { LHREQ_BREAK, 0 };
1328 unsigned long arr[2]; 1474 unsigned long notify_addr;
1329 int readval; 1475 int readval;
1330 1476
1331 /* We read from the /dev/lguest device to run the Guest. */ 1477 /* We read from the /dev/lguest device to run the Guest. */
1332 readval = read(lguest_fd, arr, sizeof(arr)); 1478 readval = read(lguest_fd, &notify_addr, sizeof(notify_addr));
1333
1334 /* The read can only really return sizeof(arr) (the Guest did a
1335 * SEND_DMA to us), or an error. */
1336 1479
1337 /* For a successful read, arr[0] is the address of the "struct 1480 /* One unsigned long means the Guest did HCALL_NOTIFY */
1338 * lguest_dma", and arr[1] is the key the Guest sent to. */ 1481 if (readval == sizeof(notify_addr)) {
1339 if (readval == sizeof(arr)) { 1482 verbose("Notify on address %#lx\n", notify_addr);
1340 handle_output(lguest_fd, arr[0], arr[1], device_list); 1483 handle_output(lguest_fd, notify_addr);
1341 continue; 1484 continue;
1342 /* ENOENT means the Guest died. Reading tells us why. */ 1485 /* ENOENT means the Guest died. Reading tells us why. */
1343 } else if (errno == ENOENT) { 1486 } else if (errno == ENOENT) {
@@ -1351,7 +1494,7 @@ run_guest(int lguest_fd, struct device_list *device_list)
1351 1494
1352 /* Service input, then unset the BREAK which releases 1495 /* Service input, then unset the BREAK which releases
1353 * the Waker. */ 1496 * the Waker. */
1354 handle_input(lguest_fd, device_list); 1497 handle_input(lguest_fd);
1355 if (write(lguest_fd, args, sizeof(args)) < 0) 1498 if (write(lguest_fd, args, sizeof(args)) < 0)
1356 err(1, "Resetting break"); 1499 err(1, "Resetting break");
1357 } 1500 }
@@ -1365,7 +1508,6 @@ run_guest(int lguest_fd, struct device_list *device_list)
1365 1508
1366static struct option opts[] = { 1509static struct option opts[] = {
1367 { "verbose", 0, NULL, 'v' }, 1510 { "verbose", 0, NULL, 'v' },
1368 { "sharenet", 1, NULL, 's' },
1369 { "tunnet", 1, NULL, 't' }, 1511 { "tunnet", 1, NULL, 't' },
1370 { "block", 1, NULL, 'b' }, 1512 { "block", 1, NULL, 'b' },
1371 { "initrd", 1, NULL, 'i' }, 1513 { "initrd", 1, NULL, 'i' },
@@ -1374,37 +1516,21 @@ static struct option opts[] = {
1374static void usage(void) 1516static void usage(void)
1375{ 1517{
1376 errx(1, "Usage: lguest [--verbose] " 1518 errx(1, "Usage: lguest [--verbose] "
1377 "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n" 1519 "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
1378 "|--block=<filename>|--initrd=<filename>]...\n" 1520 "|--block=<filename>|--initrd=<filename>]...\n"
1379 "<mem-in-mb> vmlinux [args...]"); 1521 "<mem-in-mb> vmlinux [args...]");
1380} 1522}
1381 1523
1382/*L:100 The Launcher code itself takes us out into userspace, that scary place 1524/*L:105 The main routine is where the real work begins: */
1383 * where pointers run wild and free! Unfortunately, like most userspace
1384 * programs, it's quite boring (which is why everyone like to hack on the
1385 * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
1386 * will get you through this section. Or, maybe not.
1387 *
1388 * The Launcher binary sits up high, usually starting at address 0xB8000000.
1389 * Everything below this is the "physical" memory for the Guest. For example,
1390 * if the Guest were to write a "1" at physical address 0, we would see a "1"
1391 * in the Launcher at "(int *)0". Guest physical == Launcher virtual.
1392 *
1393 * This can be tough to get your head around, but usually it just means that we
1394 * don't need to do any conversion when the Guest gives us it's "physical"
1395 * addresses.
1396 */
1397int main(int argc, char *argv[]) 1525int main(int argc, char *argv[])
1398{ 1526{
1399 /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size 1527 /* Memory, top-level pagetable, code startpoint and size of the
1400 * of the (optional) initrd. */ 1528 * (optional) initrd. */
1401 unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0; 1529 unsigned long mem = 0, pgdir, start, initrd_size = 0;
1402 /* A temporary and the /dev/lguest file descriptor. */ 1530 /* A temporary and the /dev/lguest file descriptor. */
1403 int i, c, lguest_fd; 1531 int i, c, lguest_fd;
1404 /* The list of Guest devices, based on command line arguments. */ 1532 /* The boot information for the Guest. */
1405 struct device_list device_list; 1533 struct boot_params *boot;
1406 /* The boot information for the Guest: at guest-physical address 0. */
1407 void *boot = (void *)0;
1408 /* If they specify an initrd file to load. */ 1534 /* If they specify an initrd file to load. */
1409 const char *initrd_name = NULL; 1535 const char *initrd_name = NULL;
1410 1536
@@ -1412,11 +1538,12 @@ int main(int argc, char *argv[])
1412 * device receive input from a file descriptor, we keep an fdset 1538 * device receive input from a file descriptor, we keep an fdset
1413 * (infds) and the maximum fd number (max_infd) with the head of the 1539 * (infds) and the maximum fd number (max_infd) with the head of the
1414 * list. We also keep a pointer to the last device, for easy appending 1540 * list. We also keep a pointer to the last device, for easy appending
1415 * to the list. */ 1541 * to the list. Finally, we keep the next interrupt number to hand out
1416 device_list.max_infd = -1; 1542 * (1: remember that 0 is used by the timer). */
1417 device_list.dev = NULL; 1543 FD_ZERO(&devices.infds);
1418 device_list.lastdev = &device_list.dev; 1544 devices.max_infd = -1;
1419 FD_ZERO(&device_list.infds); 1545 devices.lastdev = &devices.dev;
1546 devices.next_irq = 1;
1420 1547
1421 /* We need to know how much memory so we can set up the device 1548 /* We need to know how much memory so we can set up the device
1422 * descriptor and memory pages for the devices as we parse the command 1549 * descriptor and memory pages for the devices as we parse the command
@@ -1424,9 +1551,16 @@ int main(int argc, char *argv[])
1424 * of memory now. */ 1551 * of memory now. */
1425 for (i = 1; i < argc; i++) { 1552 for (i = 1; i < argc; i++) {
1426 if (argv[i][0] != '-') { 1553 if (argv[i][0] != '-') {
1427 mem = top = atoi(argv[i]) * 1024 * 1024; 1554 mem = atoi(argv[i]) * 1024 * 1024;
1428 device_list.descs = map_zeroed_pages(top, 1); 1555 /* We start by mapping anonymous pages over all of
1429 top += getpagesize(); 1556 * guest-physical memory range. This fills it with 0,
1557 * and ensures that the Guest won't be killed when it
1558 * tries to access it. */
1559 guest_base = map_zeroed_pages(mem / getpagesize()
1560 + DEVICE_PAGES);
1561 guest_limit = mem;
1562 guest_max = mem + DEVICE_PAGES*getpagesize();
1563 devices.descpage = get_pages(1);
1430 break; 1564 break;
1431 } 1565 }
1432 } 1566 }
@@ -1437,14 +1571,11 @@ int main(int argc, char *argv[])
1437 case 'v': 1571 case 'v':
1438 verbose = true; 1572 verbose = true;
1439 break; 1573 break;
1440 case 's':
1441 setup_net_file(optarg, &device_list);
1442 break;
1443 case 't': 1574 case 't':
1444 setup_tun_net(optarg, &device_list); 1575 setup_tun_net(optarg);
1445 break; 1576 break;
1446 case 'b': 1577 case 'b':
1447 setup_block_file(optarg, &device_list); 1578 setup_block_file(optarg);
1448 break; 1579 break;
1449 case 'i': 1580 case 'i':
1450 initrd_name = optarg; 1581 initrd_name = optarg;
@@ -1459,56 +1590,60 @@ int main(int argc, char *argv[])
1459 if (optind + 2 > argc) 1590 if (optind + 2 > argc)
1460 usage(); 1591 usage();
1461 1592
1462 /* We always have a console device */ 1593 verbose("Guest base is at %p\n", guest_base);
1463 setup_console(&device_list);
1464 1594
1465 /* We start by mapping anonymous pages over all of guest-physical 1595 /* We always have a console device */
1466 * memory range. This fills it with 0, and ensures that the Guest 1596 setup_console();
1467 * won't be killed when it tries to access it. */
1468 map_zeroed_pages(0, mem / getpagesize());
1469 1597
1470 /* Now we load the kernel */ 1598 /* Now we load the kernel */
1471 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), 1599 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
1472 &page_offset); 1600
1601 /* Boot information is stashed at physical address 0 */
1602 boot = from_guest_phys(0);
1473 1603
1474 /* Map the initrd image if requested (at top of physical memory) */ 1604 /* Map the initrd image if requested (at top of physical memory) */
1475 if (initrd_name) { 1605 if (initrd_name) {
1476 initrd_size = load_initrd(initrd_name, mem); 1606 initrd_size = load_initrd(initrd_name, mem);
1477 /* These are the location in the Linux boot header where the 1607 /* These are the location in the Linux boot header where the
1478 * start and size of the initrd are expected to be found. */ 1608 * start and size of the initrd are expected to be found. */
1479 *(unsigned long *)(boot+0x218) = mem - initrd_size; 1609 boot->hdr.ramdisk_image = mem - initrd_size;
1480 *(unsigned long *)(boot+0x21c) = initrd_size; 1610 boot->hdr.ramdisk_size = initrd_size;
1481 /* The bootloader type 0xFF means "unknown"; that's OK. */ 1611 /* The bootloader type 0xFF means "unknown"; that's OK. */
1482 *(unsigned char *)(boot+0x210) = 0xFF; 1612 boot->hdr.type_of_loader = 0xFF;
1483 } 1613 }
1484 1614
1485 /* Set up the initial linear pagetables, starting below the initrd. */ 1615 /* Set up the initial linear pagetables, starting below the initrd. */
1486 pgdir = setup_pagetables(mem, initrd_size, page_offset); 1616 pgdir = setup_pagetables(mem, initrd_size);
1487 1617
1488 /* The Linux boot header contains an "E820" memory map: ours is a 1618 /* The Linux boot header contains an "E820" memory map: ours is a
1489 * simple, single region. */ 1619 * simple, single region. */
1490 *(char*)(boot+E820NR) = 1; 1620 boot->e820_entries = 1;
1491 *((struct e820entry *)(boot+E820MAP)) 1621 boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
1492 = ((struct e820entry) { 0, mem, E820_RAM });
1493 /* The boot header contains a command line pointer: we put the command 1622 /* The boot header contains a command line pointer: we put the command
1494 * line after the boot header (at address 4096) */ 1623 * line after the boot header. */
1495 *(void **)(boot + 0x228) = boot + 4096; 1624 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
1496 concat(boot + 4096, argv+optind+2); 1625 concat((char *)(boot + 1), argv+optind+2);
1626
1627 /* Boot protocol version: 2.07 supports the fields for lguest. */
1628 boot->hdr.version = 0x207;
1629
1630 /* The hardware_subarch value of "1" tells the Guest it's an lguest. */
1631 boot->hdr.hardware_subarch = 1;
1497 1632
1498 /* The guest type value of "1" tells the Guest it's under lguest. */ 1633 /* Tell the entry path not to try to reload segment registers. */
1499 *(int *)(boot + 0x23c) = 1; 1634 boot->hdr.loadflags |= KEEP_SEGMENTS;
1500 1635
1501 /* We tell the kernel to initialize the Guest: this returns the open 1636 /* We tell the kernel to initialize the Guest: this returns the open
1502 * /dev/lguest file descriptor. */ 1637 * /dev/lguest file descriptor. */
1503 lguest_fd = tell_kernel(pgdir, start, page_offset); 1638 lguest_fd = tell_kernel(pgdir, start);
1504 1639
1505 /* We fork off a child process, which wakes the Launcher whenever one 1640 /* We fork off a child process, which wakes the Launcher whenever one
1506 * of the input file descriptors needs attention. Otherwise we would 1641 * of the input file descriptors needs attention. Otherwise we would
1507 * run the Guest until it tries to output something. */ 1642 * run the Guest until it tries to output something. */
1508 waker_fd = setup_waker(lguest_fd, &device_list); 1643 waker_fd = setup_waker(lguest_fd);
1509 1644
1510 /* Finally, run the Guest. This doesn't return. */ 1645 /* Finally, run the Guest. This doesn't return. */
1511 run_guest(lguest_fd, &device_list); 1646 run_guest(lguest_fd);
1512} 1647}
1513/*:*/ 1648/*:*/
1514 1649
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 821617bd6c0..7885ab2d5f5 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -6,7 +6,7 @@ Lguest is designed to be a minimal hypervisor for the Linux kernel, for
6Linux developers and users to experiment with virtualization with the 6Linux developers and users to experiment with virtualization with the
7minimum of complexity. Nonetheless, it should have sufficient 7minimum of complexity. Nonetheless, it should have sufficient
8features to make it useful for specific tasks, and, of course, you are 8features to make it useful for specific tasks, and, of course, you are
9encouraged to fork and enhance it. 9encouraged to fork and enhance it (see drivers/lguest/README).
10 10
11Features: 11Features:
12 12
@@ -23,19 +23,30 @@ Developer features:
23 23
24Running Lguest: 24Running Lguest:
25 25
26- Lguest runs the same kernel as guest and host. You can configure 26- The easiest way to run lguest is to use the same kernel as guest and host.
27 them differently, but usually it's easiest not to. 27 You can configure them differently, but usually it's easiest not to.
28 28
29 You will need to configure your kernel with the following options: 29 You will need to configure your kernel with the following options:
30 30
31 CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1] 31 "General setup":
32 CONFIG_TUN=y/m ("Universal TUN/TAP device driver support") 32 "Prompt for development and/or incomplete code/drivers" = Y
33 CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers") 33 (CONFIG_EXPERIMENTAL=y)
34 CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)") 34
35 CONFIG_LGUEST=y/m ("Linux hypervisor example code") 35 "Processor type and features":
36 36 "Paravirtualized guest support" = Y
37 and I recommend: 37 "Lguest guest support" = Y
38 CONFIG_HZ=100 ("Timer frequency")[2] 38 "High Memory Support" = off/4GB
39 "Alignment value to which kernel should be aligned" = 0x100000
40 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
41 CONFIG_PHYSICAL_ALIGN=0x100000)
42
43 "Device Drivers":
44 "Network device support"
45 "Universal TUN/TAP device driver support" = M/Y
46 (CONFIG_TUN=m)
47 "Virtualization"
48 "Linux hypervisor example code" = M/Y
49 (CONFIG_LGUEST=m)
39 50
40- A tool called "lguest" is available in this directory: type "make" 51- A tool called "lguest" is available in this directory: type "make"
41 to build it. If you didn't build your kernel in-tree, use "make 52 to build it. If you didn't build your kernel in-tree, use "make
@@ -51,14 +62,17 @@ Running Lguest:
51 dd if=/dev/zero of=rootfile bs=1M count=2048 62 dd if=/dev/zero of=rootfile bs=1M count=2048
52 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d 63 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
53 64
65 Make sure that you install a getty on /dev/hvc0 if you want to log in on the
66 console!
67
54- "modprobe lg" if you built it as a module. 68- "modprobe lg" if you built it as a module.
55 69
56- Run an lguest as root: 70- Run an lguest as root:
57 71
58 Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba 72 Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda
59 73
60 Explanation: 74 Explanation:
61 64m: the amount of memory to use. 75 64: the amount of memory to use, in MB.
62 76
63 vmlinux: the kernel image found in the top of your build directory. You 77 vmlinux: the kernel image found in the top of your build directory. You
64 can also use a standard bzImage. 78 can also use a standard bzImage.
@@ -66,10 +80,10 @@ Running Lguest:
66 --tunnet=192.168.19.1: configures a "tap" device for networking with this 80 --tunnet=192.168.19.1: configures a "tap" device for networking with this
67 IP address. 81 IP address.
68 82
69 --block=rootfile: a file or block device which becomes /dev/lgba 83 --block=rootfile: a file or block device which becomes /dev/vda
70 inside the guest. 84 inside the guest.
71 85
72 root=/dev/lgba: this (and anything else on the command line) are 86 root=/dev/vda: this (and anything else on the command line) are
73 kernel boot parameters. 87 kernel boot parameters.
74 88
75- Configuring networking. I usually have the host masquerade, using 89- Configuring networking. I usually have the host masquerade, using
@@ -99,31 +113,7 @@ Running Lguest:
99 "--sharenet=<filename>": any two guests using the same file are on 113 "--sharenet=<filename>": any two guests using the same file are on
100 the same network. This file is created if it does not exist. 114 the same network. This file is created if it does not exist.
101 115
102Lguest I/O model: 116There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
103
104Lguest uses a simplified DMA model plus shared memory for I/O. Guests
105can communicate with each other if they share underlying memory
106(usually by the lguest program mmaping the same file), but they can
107use any non-shared memory to communicate with the lguest process.
108
109Guests can register DMA buffers at any key (must be a valid physical
110address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq)
111hypercall. "dmabufs" is the physical address of an array of "num"
112"struct lguest_dma": each contains a used_len, and an array of
113physical addresses and lengths. When a transfer occurs, the
114"used_len" field of one of the buffers which has used_len 0 will be
115set to the length transferred and the irq will fire.
116 117
117Using an irq value of 0 unbinds the dma buffers. 118Good luck!
118
119To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used,
120and the bytes used is written to the used_len field. This can be 0 if
121no one else has bound a DMA buffer to that key or some other error.
122DMA buffers bound by the same guest are ignored.
123
124Cheers!
125Rusty Russell rusty@rustcorp.com.au. 119Rusty Russell rusty@rustcorp.com.au.
126
127[1] These are on various places on the TODO list, waiting for you to
128 get annoyed enough at the limitation to fix it.
129[2] Lguest is not yet tickless when idle. See [1].
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index f6e44fc5283..5bed8be34ba 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -227,28 +227,40 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER
227 If in doubt, say "Y". 227 If in doubt, say "Y".
228 228
229config PARAVIRT 229config PARAVIRT
230 bool "Paravirtualization support (EXPERIMENTAL)" 230 bool
231 depends on EXPERIMENTAL
232 depends on !(X86_VISWS || X86_VOYAGER) 231 depends on !(X86_VISWS || X86_VOYAGER)
233 help 232 help
234 Paravirtualization is a way of running multiple instances of 233 This changes the kernel so it can modify itself when it is run
235 Linux on the same machine, under a hypervisor. This option 234 under a hypervisor, potentially improving performance significantly
236 changes the kernel so it can modify itself when it is run 235 over full virtualization. However, when run without a hypervisor
237 under a hypervisor, improving performance significantly. 236 the kernel is theoretically slower and slightly larger.
238 However, when run without a hypervisor the kernel is 237
239 theoretically slower. If in doubt, say N. 238menuconfig PARAVIRT_GUEST
239 bool "Paravirtualized guest support"
240 help
241 Say Y here to get to see options related to running Linux under
242 various hypervisors. This option alone does not add any kernel code.
243
244 If you say N, all options in this submenu will be skipped and disabled.
245
246if PARAVIRT_GUEST
240 247
241source "arch/x86/xen/Kconfig" 248source "arch/x86/xen/Kconfig"
242 249
243config VMI 250config VMI
244 bool "VMI Paravirt-ops support" 251 bool "VMI Guest support"
245 depends on PARAVIRT 252 select PARAVIRT
253 depends on !(X86_VISWS || X86_VOYAGER)
246 help 254 help
247 VMI provides a paravirtualized interface to the VMware ESX server 255 VMI provides a paravirtualized interface to the VMware ESX server
248 (it could be used by other hypervisors in theory too, but is not 256 (it could be used by other hypervisors in theory too, but is not
249 at the moment), by linking the kernel to a GPL-ed ROM module 257 at the moment), by linking the kernel to a GPL-ed ROM module
250 provided by the hypervisor. 258 provided by the hypervisor.
251 259
260source "arch/x86/lguest/Kconfig"
261
262endif
263
252config ACPI_SRAT 264config ACPI_SRAT
253 bool 265 bool
254 default y 266 default y
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index b88e47ca303..b81cb64d48e 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -99,6 +99,9 @@ core-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/
99# Xen paravirtualization support 99# Xen paravirtualization support
100core-$(CONFIG_XEN) += arch/x86/xen/ 100core-$(CONFIG_XEN) += arch/x86/xen/
101 101
102# lguest paravirtualization support
103core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
104
102# default subarch .h files 105# default subarch .h files
103mflags-y += -Iinclude/asm-x86/mach-default 106mflags-y += -Iinclude/asm-x86/mach-default
104 107
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index f8764716b0c..0e45981b2dd 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -136,6 +136,7 @@ void foo(void)
136#ifdef CONFIG_LGUEST_GUEST 136#ifdef CONFIG_LGUEST_GUEST
137 BLANK(); 137 BLANK();
138 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 138 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
139 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
139 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); 140 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
140 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); 141 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
141 OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); 142 OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
new file mode 100644
index 00000000000..c4dffbeea5e
--- /dev/null
+++ b/arch/x86/lguest/Kconfig
@@ -0,0 +1,14 @@
1config LGUEST_GUEST
2 bool "Lguest guest support"
3 select PARAVIRT
4 depends on !X86_PAE
5 select VIRTIO
6 select VIRTIO_RING
7 select VIRTIO_CONSOLE
8 help
9 Lguest is a tiny in-kernel hypervisor. Selecting this will
10 allow your kernel to boot under lguest. This option will increase
11 your kernel size by about 6k. If in doubt, say N.
12
13 If you say Y here, make sure you say Y (or M) to the virtio block
14 and net drivers which lguest needs.
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
new file mode 100644
index 00000000000..27f0c9ed7f6
--- /dev/null
+++ b/arch/x86/lguest/Makefile
@@ -0,0 +1 @@
obj-y := i386_head.o boot.o
diff --git a/drivers/lguest/lguest.c b/arch/x86/lguest/boot.c
index 3ba337dde85..d2235db4085 100644
--- a/drivers/lguest/lguest.c
+++ b/arch/x86/lguest/boot.c
@@ -55,7 +55,7 @@
55#include <linux/clockchips.h> 55#include <linux/clockchips.h>
56#include <linux/lguest.h> 56#include <linux/lguest.h>
57#include <linux/lguest_launcher.h> 57#include <linux/lguest_launcher.h>
58#include <linux/lguest_bus.h> 58#include <linux/virtio_console.h>
59#include <asm/paravirt.h> 59#include <asm/paravirt.h>
60#include <asm/param.h> 60#include <asm/param.h>
61#include <asm/page.h> 61#include <asm/page.h>
@@ -65,6 +65,7 @@
65#include <asm/e820.h> 65#include <asm/e820.h>
66#include <asm/mce.h> 66#include <asm/mce.h>
67#include <asm/io.h> 67#include <asm/io.h>
68#include <asm/i387.h>
68 69
69/*G:010 Welcome to the Guest! 70/*G:010 Welcome to the Guest!
70 * 71 *
@@ -85,9 +86,10 @@ struct lguest_data lguest_data = {
85 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, 86 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
86 .noirq_start = (u32)lguest_noirq_start, 87 .noirq_start = (u32)lguest_noirq_start,
87 .noirq_end = (u32)lguest_noirq_end, 88 .noirq_end = (u32)lguest_noirq_end,
89 .kernel_address = PAGE_OFFSET,
88 .blocked_interrupts = { 1 }, /* Block timer interrupts */ 90 .blocked_interrupts = { 1 }, /* Block timer interrupts */
91 .syscall_vec = SYSCALL_VECTOR,
89}; 92};
90struct lguest_device_desc *lguest_devices;
91static cycle_t clock_base; 93static cycle_t clock_base;
92 94
93/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first 95/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first
@@ -146,10 +148,10 @@ void async_hcall(unsigned long call,
146 /* Table full, so do normal hcall which will flush table. */ 148 /* Table full, so do normal hcall which will flush table. */
147 hcall(call, arg1, arg2, arg3); 149 hcall(call, arg1, arg2, arg3);
148 } else { 150 } else {
149 lguest_data.hcalls[next_call].eax = call; 151 lguest_data.hcalls[next_call].arg0 = call;
150 lguest_data.hcalls[next_call].edx = arg1; 152 lguest_data.hcalls[next_call].arg1 = arg1;
151 lguest_data.hcalls[next_call].ebx = arg2; 153 lguest_data.hcalls[next_call].arg2 = arg2;
152 lguest_data.hcalls[next_call].ecx = arg3; 154 lguest_data.hcalls[next_call].arg3 = arg3;
153 /* Arguments must all be written before we mark it to go */ 155 /* Arguments must all be written before we mark it to go */
154 wmb(); 156 wmb();
155 lguest_data.hcall_status[next_call] = 0; 157 lguest_data.hcall_status[next_call] = 0;
@@ -160,46 +162,6 @@ void async_hcall(unsigned long call,
160} 162}
161/*:*/ 163/*:*/
162 164
163/* Wrappers for the SEND_DMA and BIND_DMA hypercalls. This is mainly because
164 * Jeff Garzik complained that __pa() should never appear in drivers, and this
165 * helps remove most of them. But also, it wraps some ugliness. */
166void lguest_send_dma(unsigned long key, struct lguest_dma *dma)
167{
168 /* The hcall might not write this if something goes wrong */
169 dma->used_len = 0;
170 hcall(LHCALL_SEND_DMA, key, __pa(dma), 0);
171}
172
173int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas,
174 unsigned int num, u8 irq)
175{
176 /* This is the only hypercall which actually wants 5 arguments, and we
177 * only support 4. Fortunately the interrupt number is always less
178 * than 256, so we can pack it with the number of dmas in the final
179 * argument. */
180 if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq))
181 return -ENOMEM;
182 return 0;
183}
184
185/* Unbinding is the same hypercall as binding, but with 0 num & irq. */
186void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas)
187{
188 hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0);
189}
190
191/* For guests, device memory can be used as normal memory, so we cast away the
192 * __iomem to quieten sparse. */
193void *lguest_map(unsigned long phys_addr, unsigned long pages)
194{
195 return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages);
196}
197
198void lguest_unmap(void *addr)
199{
200 iounmap((__force void __iomem *)addr);
201}
202
203/*G:033 165/*G:033
204 * Here are our first native-instruction replacements: four functions for 166 * Here are our first native-instruction replacements: four functions for
205 * interrupt control. 167 * interrupt control.
@@ -680,6 +642,7 @@ static struct clocksource lguest_clock = {
680 .mask = CLOCKSOURCE_MASK(64), 642 .mask = CLOCKSOURCE_MASK(64),
681 .mult = 1 << 22, 643 .mult = 1 << 22,
682 .shift = 22, 644 .shift = 22,
645 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
683}; 646};
684 647
685/* The "scheduler clock" is just our real clock, adjusted to start at zero */ 648/* The "scheduler clock" is just our real clock, adjusted to start at zero */
@@ -761,11 +724,9 @@ static void lguest_time_init(void)
761 * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either 724 * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either
762 * way, the "rating" is initialized so high that it's always chosen 725 * way, the "rating" is initialized so high that it's always chosen
763 * over any other clocksource. */ 726 * over any other clocksource. */
764 if (lguest_data.tsc_khz) { 727 if (lguest_data.tsc_khz)
765 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, 728 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
766 lguest_clock.shift); 729 lguest_clock.shift);
767 lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS;
768 }
769 clock_base = lguest_clock_read(); 730 clock_base = lguest_clock_read();
770 clocksource_register(&lguest_clock); 731 clocksource_register(&lguest_clock);
771 732
@@ -889,6 +850,23 @@ static __init char *lguest_memory_setup(void)
889 return "LGUEST"; 850 return "LGUEST";
890} 851}
891 852
853/* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to
854 * produce console output. */
855static __init int early_put_chars(u32 vtermno, const char *buf, int count)
856{
857 char scratch[17];
858 unsigned int len = count;
859
860 if (len > sizeof(scratch) - 1)
861 len = sizeof(scratch) - 1;
862 scratch[len] = '\0';
863 memcpy(scratch, buf, len);
864 hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0);
865
866 /* This routine returns the number of bytes actually written. */
867 return len;
868}
869
892/*G:050 870/*G:050
893 * Patching (Powerfully Placating Performance Pedants) 871 * Patching (Powerfully Placating Performance Pedants)
894 * 872 *
@@ -950,18 +928,8 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
950/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops 928/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops
951 * structures in the kernel provide points for (almost) every routine we have 929 * structures in the kernel provide points for (almost) every routine we have
952 * to override to avoid privileged instructions. */ 930 * to override to avoid privileged instructions. */
953__init void lguest_init(void *boot) 931__init void lguest_init(void)
954{ 932{
955 /* Copy boot parameters first: the Launcher put the physical location
956 * in %esi, and head.S converted that to a virtual address and handed
957 * it to us. We use "__memcpy" because "memcpy" sometimes tries to do
958 * tricky things to go faster, and we're not ready for that. */
959 __memcpy(&boot_params, boot, PARAM_SIZE);
960 /* The boot parameters also tell us where the command-line is: save
961 * that, too. */
962 __memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr),
963 COMMAND_LINE_SIZE);
964
965 /* We're under lguest, paravirt is enabled, and we're running at 933 /* We're under lguest, paravirt is enabled, and we're running at
966 * privilege level 1, not 0 as normal. */ 934 * privilege level 1, not 0 as normal. */
967 pv_info.name = "lguest"; 935 pv_info.name = "lguest";
@@ -1033,11 +1001,7 @@ __init void lguest_init(void *boot)
1033 1001
1034 /*G:070 Now we've seen all the paravirt_ops, we return to 1002 /*G:070 Now we've seen all the paravirt_ops, we return to
1035 * lguest_init() where the rest of the fairly chaotic boot setup 1003 * lguest_init() where the rest of the fairly chaotic boot setup
1036 * occurs. 1004 * occurs. */
1037 *
1038 * The Host expects our first hypercall to tell it where our "struct
1039 * lguest_data" is, so we do that first. */
1040 hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
1041 1005
1042 /* The native boot code sets up initial page tables immediately after 1006 /* The native boot code sets up initial page tables immediately after
1043 * the kernel itself, and sets init_pg_tables_end so they're not 1007 * the kernel itself, and sets init_pg_tables_end so they're not
@@ -1050,11 +1014,6 @@ __init void lguest_init(void *boot)
1050 * the normal data segment to get through booting. */ 1014 * the normal data segment to get through booting. */
1051 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1015 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
1052 1016
1053 /* Clear the part of the kernel data which is expected to be zero.
1054 * Normally it will be anyway, but if we're loading from a bzImage with
1055 * CONFIG_RELOCATABLE=y, the relocations will be sitting here. */
1056 memset(__bss_start, 0, __bss_stop - __bss_start);
1057
1058 /* The Host uses the top of the Guest's virtual address space for the 1017 /* The Host uses the top of the Guest's virtual address space for the
1059 * Host<->Guest Switcher, and it tells us how much it needs in 1018 * Host<->Guest Switcher, and it tells us how much it needs in
1060 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ 1019 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */
@@ -1092,6 +1051,9 @@ __init void lguest_init(void *boot)
1092 * adapted for lguest's use. */ 1051 * adapted for lguest's use. */
1093 add_preferred_console("hvc", 0, NULL); 1052 add_preferred_console("hvc", 0, NULL);
1094 1053
1054 /* Register our very early console. */
1055 virtio_cons_early_init(early_put_chars);
1056
1095 /* Last of all, we set the power management poweroff hook to point to 1057 /* Last of all, we set the power management poweroff hook to point to
1096 * the Guest routine to power off. */ 1058 * the Guest routine to power off. */
1097 pm_power_off = lguest_power_off; 1059 pm_power_off = lguest_power_off;
diff --git a/drivers/lguest/lguest_asm.S b/arch/x86/lguest/i386_head.S
index 1ddcd5cd20f..ebc6ac73389 100644
--- a/drivers/lguest/lguest_asm.S
+++ b/arch/x86/lguest/i386_head.S
@@ -1,25 +1,47 @@
1#include <linux/linkage.h> 1#include <linux/linkage.h>
2#include <linux/lguest.h> 2#include <linux/lguest.h>
3#include <asm/lguest_hcall.h>
3#include <asm/asm-offsets.h> 4#include <asm/asm-offsets.h>
4#include <asm/thread_info.h> 5#include <asm/thread_info.h>
5#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
6 7
7/*G:020 This is where we begin: we have a magic signature which the launcher 8/*G:020 This is where we begin: head.S notes that the boot header's platform
8 * looks for. The plan is that the Linux boot protocol will be extended with a 9 * type field is "1" (lguest), so calls us here. The boot header is in %esi.
9 * "platform type" field which will guide us here from the normal entry point, 10 *
10 * but for the moment this suffices. The normal boot code uses %esi for the 11 * WARNING: be very careful here! We're running at addresses equal to physical
11 * boot header, so we do too. We convert it to a virtual address by adding 12 * addresses (around 0), not above PAGE_OFFSET as most code expects
12 * PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax). 13 * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
14 * data.
13 * 15 *
14 * The .section line puts this code in .init.text so it will be discarded after 16 * The .section line puts this code in .init.text so it will be discarded after
15 * boot. */ 17 * boot. */
16.section .init.text, "ax", @progbits 18.section .init.text, "ax", @progbits
17.ascii "GenuineLguest" 19ENTRY(lguest_entry)
18 /* Set up initial stack. */ 20 /* Make initial hypercall now, so we can set up the pagetables. */
19 movl $(init_thread_union+THREAD_SIZE),%esp 21 movl $LHCALL_LGUEST_INIT, %eax
20 movl %esi, %eax 22 movl $lguest_data - __PAGE_OFFSET, %edx
21 addl $__PAGE_OFFSET, %eax 23 int $LGUEST_TRAP_ENTRY
22 jmp lguest_init 24
25 /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
26 * instruction uses %esi implicitly. */
27 movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
28
29 /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
30 * This means the first 128M of kernel memory will be mapped at
31 * PAGE_OFFSET where the kernel expects to run. This will get it far
32 * enough through boot to switch to its own pagetables. */
33 movl $32, %ecx
34 movl %esi, %edi
35 addl $((__PAGE_OFFSET >> 22) * 4), %edi
36 rep
37 movsl
38
39 /* Set up the initial stack so we can run C code. */
40 movl $(init_thread_union+THREAD_SIZE),%esp
41
42 /* Jumps are relative, and we're running __PAGE_OFFSET too low at the
43 * moment. */
44 jmp lguest_init+__PAGE_OFFSET
23 45
24/*G:055 We create a macro which puts the assembler code between lgstart_ and 46/*G:055 We create a macro which puts the assembler code between lgstart_ and
25 * lgend_ markers. These templates are put in the .text section: they can't be 47 * lgend_ markers. These templates are put in the .text section: they can't be
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 9df99e1885a..fbfa55ce0d5 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -3,8 +3,9 @@
3# 3#
4 4
5config XEN 5config XEN
6 bool "Enable support for Xen hypervisor" 6 bool "Xen guest support"
7 depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES 7 select PARAVIRT
8 depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER)
8 help 9 help
9 This is the Linux Xen port. Enabling this will allow the 10 This is the Linux Xen port. Enabling this will allow the
10 kernel to boot in a paravirtualized environment under the 11 kernel to boot in a paravirtualized environment under the
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 34f40ea0ba6..f4076d9e990 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -94,5 +94,5 @@ source "drivers/kvm/Kconfig"
94 94
95source "drivers/uio/Kconfig" 95source "drivers/uio/Kconfig"
96 96
97source "drivers/lguest/Kconfig" 97source "drivers/virtio/Kconfig"
98endmenu 98endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index cfe38ffff28..560496b4330 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -91,3 +91,4 @@ obj-$(CONFIG_HID) += hid/
91obj-$(CONFIG_PPC_PS3) += ps3/ 91obj-$(CONFIG_PPC_PS3) += ps3/
92obj-$(CONFIG_OF) += of/ 92obj-$(CONFIG_OF) += of/
93obj-$(CONFIG_SSB) += ssb/ 93obj-$(CONFIG_SSB) += ssb/
94obj-$(CONFIG_VIRTIO) += virtio/
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index ce4b1e484e6..4d0119ea9e3 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -425,4 +425,10 @@ config XEN_BLKDEV_FRONTEND
425 block device driver. It communicates with a back-end driver 425 block device driver. It communicates with a back-end driver
426 in another domain which drives the actual block device. 426 in another domain which drives the actual block device.
427 427
428config VIRTIO_BLK
429 tristate "Virtio block driver (EXPERIMENTAL)"
430 depends on EXPERIMENTAL && VIRTIO
431 ---help---
432 This is the virtual block driver for lguest. Say Y or M.
433
428endif # BLK_DEV 434endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 014e72121b5..7691505a2e1 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -25,10 +25,10 @@ obj-$(CONFIG_SUNVDC) += sunvdc.o
25obj-$(CONFIG_BLK_DEV_UMEM) += umem.o 25obj-$(CONFIG_BLK_DEV_UMEM) += umem.o
26obj-$(CONFIG_BLK_DEV_NBD) += nbd.o 26obj-$(CONFIG_BLK_DEV_NBD) += nbd.o
27obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o 27obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
28obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o
28 29
29obj-$(CONFIG_VIODASD) += viodasd.o 30obj-$(CONFIG_VIODASD) += viodasd.o
30obj-$(CONFIG_BLK_DEV_SX8) += sx8.o 31obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
31obj-$(CONFIG_BLK_DEV_UB) += ub.o 32obj-$(CONFIG_BLK_DEV_UB) += ub.o
32 33
33obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o 34obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
34obj-$(CONFIG_LGUEST_BLOCK) += lguest_blk.o
diff --git a/drivers/block/lguest_blk.c b/drivers/block/lguest_blk.c
deleted file mode 100644
index fa8e42341b8..00000000000
--- a/drivers/block/lguest_blk.c
+++ /dev/null
@@ -1,421 +0,0 @@
1/*D:400
2 * The Guest block driver
3 *
4 * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
5 * The mechanism is simple: we place the information about the request in the
6 * device page, then use SEND_DMA (containing the data for a write, or an empty
7 * "ping" DMA for a read).
8 :*/
9/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25//#define DEBUG
26#include <linux/init.h>
27#include <linux/types.h>
28#include <linux/blkdev.h>
29#include <linux/interrupt.h>
30#include <linux/lguest_bus.h>
31
32static char next_block_index = 'a';
33
34/*D:420 Here is the structure which holds all the information we need about
35 * each Guest block device.
36 *
37 * I'm sure at this stage, you're wondering "hey, where was the adventure I was
38 * promised?" and thinking "Rusty sucks, I shall say nasty things about him on
39 * my blog". I think Real adventures have boring bits, too, and you're in the
40 * middle of one. But it gets better. Just not quite yet. */
41struct blockdev
42{
43 /* The block queue infrastructure wants a spinlock: it is held while it
44 * calls our block request function. We grab it in our interrupt
45 * handler so the responses don't mess with new requests. */
46 spinlock_t lock;
47
48 /* The disk structure registered with kernel. */
49 struct gendisk *disk;
50
51 /* The major device number for this disk, and the interrupt. We only
52 * really keep them here for completeness; we'd need them if we
53 * supported device unplugging. */
54 int major;
55 int irq;
56
57 /* The physical address of this device's memory page */
58 unsigned long phys_addr;
59 /* The mapped memory page for convenient access. */
60 struct lguest_block_page *lb_page;
61
62 /* We only have a single request outstanding at a time: this is it. */
63 struct lguest_dma dma;
64 struct request *req;
65};
66
67/*D:495 We originally used end_request() throughout the driver, but it turns
68 * out that end_request() is deprecated, and doesn't actually end the request
69 * (which seems like a good reason to deprecate it!). It simply ends the first
70 * bio. So if we had 3 bios in a "struct request" we would do all 3,
71 * end_request(), do 2, end_request(), do 1 and end_request(): twice as much
72 * work as we needed to do.
73 *
74 * This reinforced to me that I do not understand the block layer.
75 *
76 * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
77 * request. This improved disk speed by 130%. */
78static void end_entire_request(struct request *req, int uptodate)
79{
80 if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
81 BUG();
82 add_disk_randomness(req->rq_disk);
83 blkdev_dequeue_request(req);
84 end_that_request_last(req, uptodate);
85}
86
87/* I'm told there are only two stories in the world worth telling: love and
88 * hate. So there used to be a love scene here like this:
89 *
90 * Launcher: We could make beautiful I/O together, you and I.
91 * Guest: My, that's a big disk!
92 *
93 * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
94
95/*D:490 This is the interrupt handler, called when a block read or write has
96 * been completed for us. */
97static irqreturn_t lgb_irq(int irq, void *_bd)
98{
99 /* We handed our "struct blockdev" as the argument to request_irq(), so
100 * it is passed through to us here. This tells us which device we're
101 * dealing with in case we have more than one. */
102 struct blockdev *bd = _bd;
103 unsigned long flags;
104
105 /* We weren't doing anything? Strange, but could happen if we shared
106 * interrupts (we don't!). */
107 if (!bd->req) {
108 pr_debug("No work!\n");
109 return IRQ_NONE;
110 }
111
112 /* Not done yet? That's equally strange. */
113 if (!bd->lb_page->result) {
114 pr_debug("No result!\n");
115 return IRQ_NONE;
116 }
117
118 /* We have to grab the lock before ending the request. */
119 spin_lock_irqsave(&bd->lock, flags);
120 /* "result" is 1 for success, 2 for failure: end_entire_request() wants
121 * to know whether this succeeded or not. */
122 end_entire_request(bd->req, bd->lb_page->result == 1);
123 /* Clear out request, it's done. */
124 bd->req = NULL;
125 /* Reset incoming DMA for next time. */
126 bd->dma.used_len = 0;
127 /* Ready for more reads or writes */
128 blk_start_queue(bd->disk->queue);
129 spin_unlock_irqrestore(&bd->lock, flags);
130
131 /* The interrupt was for us, we dealt with it. */
132 return IRQ_HANDLED;
133}
134
135/*D:480 The block layer's "struct request" contains a number of "struct bio"s,
136 * each of which contains "struct bio_vec"s, each of which contains a page, an
137 * offset and a length.
138 *
139 * Fortunately there are iterators to help us walk through the "struct
140 * request". Even more fortunately, there were plenty of places to steal the
141 * code from. We pack the "struct request" into our "struct lguest_dma" and
142 * return the total length. */
143static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
144{
145 unsigned int i = 0, len = 0;
146 struct req_iterator iter;
147 struct bio_vec *bvec;
148
149 rq_for_each_segment(bvec, req, iter) {
150 /* We told the block layer not to give us too many. */
151 BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
152 /* If we had a zero-length segment, it would look like
153 * the end of the data referred to by the "struct
154 * lguest_dma", so make sure that doesn't happen. */
155 BUG_ON(!bvec->bv_len);
156 /* Convert page & offset to a physical address */
157 dma->addr[i] = page_to_phys(bvec->bv_page)
158 + bvec->bv_offset;
159 dma->len[i] = bvec->bv_len;
160 len += bvec->bv_len;
161 i++;
162 }
163 /* If the array isn't full, we mark the end with a 0 length */
164 if (i < LGUEST_MAX_DMA_SECTIONS)
165 dma->len[i] = 0;
166 return len;
167}
168
169/* This creates an empty DMA, useful for prodding the Host without sending data
170 * (ie. when we want to do a read) */
171static void empty_dma(struct lguest_dma *dma)
172{
173 dma->len[0] = 0;
174}
175
176/*D:470 Setting up a request is fairly easy: */
177static void setup_req(struct blockdev *bd,
178 int type, struct request *req, struct lguest_dma *dma)
179{
180 /* The type is 1 (write) or 0 (read). */
181 bd->lb_page->type = type;
182 /* The sector on disk where the read or write starts. */
183 bd->lb_page->sector = req->sector;
184 /* The result is initialized to 0 (unfinished). */
185 bd->lb_page->result = 0;
186 /* The current request (so we can end it in the interrupt handler). */
187 bd->req = req;
188 /* The number of bytes: returned as a side-effect of req_to_dma(),
189 * which packs the block layer's "struct request" into our "struct
190 * lguest_dma" */
191 bd->lb_page->bytes = req_to_dma(req, dma);
192}
193
194/*D:450 Write is pretty straightforward: we pack the request into a "struct
195 * lguest_dma", then use SEND_DMA to send the request. */
196static void do_write(struct blockdev *bd, struct request *req)
197{
198 struct lguest_dma send;
199
200 pr_debug("lgb: WRITE sector %li\n", (long)req->sector);
201 setup_req(bd, 1, req, &send);
202
203 lguest_send_dma(bd->phys_addr, &send);
204}
205
206/* Read is similar to write, except we pack the request into our receive
207 * "struct lguest_dma" and send through an empty DMA just to tell the Host that
208 * there's a request pending. */
209static void do_read(struct blockdev *bd, struct request *req)
210{
211 struct lguest_dma ping;
212
213 pr_debug("lgb: READ sector %li\n", (long)req->sector);
214 setup_req(bd, 0, req, &bd->dma);
215
216 empty_dma(&ping);
217 lguest_send_dma(bd->phys_addr, &ping);
218}
219
220/*D:440 This is where requests come in: we get handed the request queue and are
221 * expected to pull a "struct request" off it until we've finished them or
222 * we're waiting for a reply: */
223static void do_lgb_request(struct request_queue *q)
224{
225 struct blockdev *bd;
226 struct request *req;
227
228again:
229 /* This sometimes returns NULL even on the very first time around. I
230 * wonder if it's something to do with letting elves handle the request
231 * queue... */
232 req = elv_next_request(q);
233 if (!req)
234 return;
235
236 /* We attached the struct blockdev to the disk: get it back */
237 bd = req->rq_disk->private_data;
238 /* Sometimes we get repeated requests after blk_stop_queue(), but we
239 * can only handle one at a time. */
240 if (bd->req)
241 return;
242
243 /* We only do reads and writes: no tricky business! */
244 if (!blk_fs_request(req)) {
245 pr_debug("Got non-command 0x%08x\n", req->cmd_type);
246 req->errors++;
247 end_entire_request(req, 0);
248 goto again;
249 }
250
251 if (rq_data_dir(req) == WRITE)
252 do_write(bd, req);
253 else
254 do_read(bd, req);
255
256 /* We've put out the request, so stop any more coming in until we get
257 * an interrupt, which takes us to lgb_irq() to re-enable the queue. */
258 blk_stop_queue(q);
259}
260
261/*D:430 This is the "struct block_device_operations" we attach to the disk at
262 * the end of lguestblk_probe(). It doesn't seem to want much. */
263static struct block_device_operations lguestblk_fops = {
264 .owner = THIS_MODULE,
265};
266
267/*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure
268 * quite why. I do know that the IDE code sent two or three of the maintainers
269 * insane, perhaps this is the fringe of the same disease?
270 *
271 * As in the console code, the probe function gets handed the generic
272 * lguest_device from lguest_bus.c: */
273static int lguestblk_probe(struct lguest_device *lgdev)
274{
275 struct blockdev *bd;
276 int err;
277 int irqflags = IRQF_SHARED;
278
279 /* First we allocate our own "struct blockdev" and initialize the easy
280 * fields. */
281 bd = kmalloc(sizeof(*bd), GFP_KERNEL);
282 if (!bd)
283 return -ENOMEM;
284
285 spin_lock_init(&bd->lock);
286 bd->irq = lgdev_irq(lgdev);
287 bd->req = NULL;
288 bd->dma.used_len = 0;
289 bd->dma.len[0] = 0;
290 /* The descriptor in the lguest_devices array provided by the Host
291 * gives the Guest the physical page number of the device's page. */
292 bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
293
294 /* We use lguest_map() to get a pointer to the device page */
295 bd->lb_page = lguest_map(bd->phys_addr, 1);
296 if (!bd->lb_page) {
297 err = -ENOMEM;
298 goto out_free_bd;
299 }
300
301 /* We need a major device number: 0 means "assign one dynamically". */
302 bd->major = register_blkdev(0, "lguestblk");
303 if (bd->major < 0) {
304 err = bd->major;
305 goto out_unmap;
306 }
307
308 /* This allocates a "struct gendisk" where we pack all the information
309 * about the disk which the rest of Linux sees. The argument is the
310 * number of minor devices desired: we need one minor for the main
311 * disk, and one for each partition. Of course, we can't possibly know
312 * how many partitions are on the disk (add_disk does that).
313 */
314 bd->disk = alloc_disk(16);
315 if (!bd->disk) {
316 err = -ENOMEM;
317 goto out_unregister_blkdev;
318 }
319
320 /* Every disk needs a queue for requests to come in: we set up the
321 * queue with a callback function (the core of our driver) and the lock
322 * to use. */
323 bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
324 if (!bd->disk->queue) {
325 err = -ENOMEM;
326 goto out_put_disk;
327 }
328
329 /* We can only handle a certain number of pointers in our SEND_DMA
330 * call, so we set that with blk_queue_max_hw_segments(). This is not
331 * to be confused with blk_queue_max_phys_segments() of course! I
332 * know, who could possibly confuse the two?
333 *
334 * Well, it's simple to tell them apart: this one seems to work and the
335 * other one didn't. */
336 blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);
337
338 /* Due to technical limitations of our Host (and simple coding) we
339 * can't have a single buffer which crosses a page boundary. Tell it
340 * here. This means that our maximum request size is 16
341 * (LGUEST_MAX_DMA_SECTIONS) pages. */
342 blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);
343
344 /* We name our disk: this becomes the device name when udev does its
345 * magic thing and creates the device node, such as /dev/lgba.
346 * next_block_index is a global which starts at 'a'. Unfortunately
347 * this simple increment logic means that the 27th disk will be called
348 * "/dev/lgb{". In that case, I recommend having at least 29 disks, so
349 * your /dev directory will be balanced. */
350 sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);
351
352 /* We look to the device descriptor again to see if this device's
353 * interrupts are expected to be random. If they are, we tell the irq
354 * subsystem. At the moment this bit is always set. */
355 if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
356 irqflags |= IRQF_SAMPLE_RANDOM;
357
358 /* Now we have the name and irqflags, we can request the interrupt; we
359 * give it the "struct blockdev" we have set up to pass to lgb_irq()
360 * when there is an interrupt. */
361 err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
362 if (err)
363 goto out_cleanup_queue;
364
365 /* We bind our one-entry DMA pool to the key for this block device so
366 * the Host can reply to our requests. The key is equal to the
367 * physical address of the device's page, which is conveniently
368 * unique. */
369 err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
370 if (err)
371 goto out_free_irq;
372
373 /* We finish our disk initialization and add the disk to the system. */
374 bd->disk->major = bd->major;
375 bd->disk->first_minor = 0;
376 bd->disk->private_data = bd;
377 bd->disk->fops = &lguestblk_fops;
378 /* This is initialized to the disk size by the Launcher. */
379 set_capacity(bd->disk, bd->lb_page->num_sectors);
380 add_disk(bd->disk);
381
382 printk(KERN_INFO "%s: device %i at major %d\n",
383 bd->disk->disk_name, lgdev->index, bd->major);
384
385 /* We don't need to keep the "struct blockdev" around, but if we ever
386 * implemented device removal, we'd need this. */
387 lgdev->private = bd;
388 return 0;
389
390out_free_irq:
391 free_irq(bd->irq, bd);
392out_cleanup_queue:
393 blk_cleanup_queue(bd->disk->queue);
394out_put_disk:
395 put_disk(bd->disk);
396out_unregister_blkdev:
397 unregister_blkdev(bd->major, "lguestblk");
398out_unmap:
399 lguest_unmap(bd->lb_page);
400out_free_bd:
401 kfree(bd);
402 return err;
403}
404
405/*D:410 The boilerplate code for registering the lguest block driver is just
406 * like the console: */
407static struct lguest_driver lguestblk_drv = {
408 .name = "lguestblk",
409 .owner = THIS_MODULE,
410 .device_type = LGUEST_DEVICE_T_BLOCK,
411 .probe = lguestblk_probe,
412};
413
414static __init int lguestblk_init(void)
415{
416 return register_lguest_driver(&lguestblk_drv);
417}
418module_init(lguestblk_init);
419
420MODULE_DESCRIPTION("Lguest block driver");
421MODULE_LICENSE("GPL");
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
new file mode 100644
index 00000000000..a901eee64ba
--- /dev/null
+++ b/drivers/block/virtio_blk.c
@@ -0,0 +1,308 @@
1//#define DEBUG
2#include <linux/spinlock.h>
3#include <linux/blkdev.h>
4#include <linux/hdreg.h>
5#include <linux/virtio.h>
6#include <linux/virtio_blk.h>
7#include <linux/virtio_blk.h>
8
9static unsigned char virtblk_index = 'a';
10struct virtio_blk
11{
12 spinlock_t lock;
13
14 struct virtio_device *vdev;
15 struct virtqueue *vq;
16
17 /* The disk structure for the kernel. */
18 struct gendisk *disk;
19
20 /* Request tracking. */
21 struct list_head reqs;
22
23 mempool_t *pool;
24
25 /* Scatterlist: can be too big for stack. */
26 struct scatterlist sg[3+MAX_PHYS_SEGMENTS];
27};
28
29struct virtblk_req
30{
31 struct list_head list;
32 struct request *req;
33 struct virtio_blk_outhdr out_hdr;
34 struct virtio_blk_inhdr in_hdr;
35};
36
37static bool blk_done(struct virtqueue *vq)
38{
39 struct virtio_blk *vblk = vq->vdev->priv;
40 struct virtblk_req *vbr;
41 unsigned int len;
42 unsigned long flags;
43
44 spin_lock_irqsave(&vblk->lock, flags);
45 while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) {
46 int uptodate;
47 switch (vbr->in_hdr.status) {
48 case VIRTIO_BLK_S_OK:
49 uptodate = 1;
50 break;
51 case VIRTIO_BLK_S_UNSUPP:
52 uptodate = -ENOTTY;
53 break;
54 default:
55 uptodate = 0;
56 break;
57 }
58
59 end_dequeued_request(vbr->req, uptodate);
60 list_del(&vbr->list);
61 mempool_free(vbr, vblk->pool);
62 }
63 /* In case queue is stopped waiting for more buffers. */
64 blk_start_queue(vblk->disk->queue);
65 spin_unlock_irqrestore(&vblk->lock, flags);
66 return true;
67}
68
69static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
70 struct request *req)
71{
72 unsigned long num, out, in;
73 struct virtblk_req *vbr;
74
75 vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
76 if (!vbr)
77 /* When another request finishes we'll try again. */
78 return false;
79
80 vbr->req = req;
81 if (blk_fs_request(vbr->req)) {
82 vbr->out_hdr.type = 0;
83 vbr->out_hdr.sector = vbr->req->sector;
84 vbr->out_hdr.ioprio = vbr->req->ioprio;
85 } else if (blk_pc_request(vbr->req)) {
86 vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
87 vbr->out_hdr.sector = 0;
88 vbr->out_hdr.ioprio = vbr->req->ioprio;
89 } else {
90 /* We don't put anything else in the queue. */
91 BUG();
92 }
93
94 if (blk_barrier_rq(vbr->req))
95 vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
96
97 /* We have to zero this, otherwise blk_rq_map_sg gets upset. */
98 memset(vblk->sg, 0, sizeof(vblk->sg));
99 sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr));
100 num = blk_rq_map_sg(q, vbr->req, vblk->sg+1);
101 sg_set_buf(&vblk->sg[num+1], &vbr->in_hdr, sizeof(vbr->in_hdr));
102
103 if (rq_data_dir(vbr->req) == WRITE) {
104 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
105 out = 1 + num;
106 in = 1;
107 } else {
108 vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
109 out = 1;
110 in = 1 + num;
111 }
112
113 if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) {
114 mempool_free(vbr, vblk->pool);
115 return false;
116 }
117
118 list_add_tail(&vbr->list, &vblk->reqs);
119 return true;
120}
121
122static void do_virtblk_request(struct request_queue *q)
123{
124 struct virtio_blk *vblk = NULL;
125 struct request *req;
126 unsigned int issued = 0;
127
128 while ((req = elv_next_request(q)) != NULL) {
129 vblk = req->rq_disk->private_data;
130 BUG_ON(req->nr_phys_segments > ARRAY_SIZE(vblk->sg));
131
132 /* If this request fails, stop queue and wait for something to
133 finish to restart it. */
134 if (!do_req(q, vblk, req)) {
135 blk_stop_queue(q);
136 break;
137 }
138 blkdev_dequeue_request(req);
139 issued++;
140 }
141
142 if (issued)
143 vblk->vq->vq_ops->kick(vblk->vq);
144}
145
146static int virtblk_ioctl(struct inode *inode, struct file *filp,
147 unsigned cmd, unsigned long data)
148{
149 return scsi_cmd_ioctl(filp, inode->i_bdev->bd_disk->queue,
150 inode->i_bdev->bd_disk, cmd,
151 (void __user *)data);
152}
153
154static struct block_device_operations virtblk_fops = {
155 .ioctl = virtblk_ioctl,
156 .owner = THIS_MODULE,
157};
158
159static int virtblk_probe(struct virtio_device *vdev)
160{
161 struct virtio_blk *vblk;
162 int err, major;
163 void *token;
164 unsigned int len;
165 u64 cap;
166 u32 v;
167
168 vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
169 if (!vblk) {
170 err = -ENOMEM;
171 goto out;
172 }
173
174 INIT_LIST_HEAD(&vblk->reqs);
175 spin_lock_init(&vblk->lock);
176 vblk->vdev = vdev;
177
178 /* We expect one virtqueue, for output. */
179 vblk->vq = vdev->config->find_vq(vdev, blk_done);
180 if (IS_ERR(vblk->vq)) {
181 err = PTR_ERR(vblk->vq);
182 goto out_free_vblk;
183 }
184
185 vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
186 if (!vblk->pool) {
187 err = -ENOMEM;
188 goto out_free_vq;
189 }
190
191 major = register_blkdev(0, "virtblk");
192 if (major < 0) {
193 err = major;
194 goto out_mempool;
195 }
196
197 /* FIXME: How many partitions? How long is a piece of string? */
198 vblk->disk = alloc_disk(1 << 4);
199 if (!vblk->disk) {
200 err = -ENOMEM;
201 goto out_unregister_blkdev;
202 }
203
204 vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
205 if (!vblk->disk->queue) {
206 err = -ENOMEM;
207 goto out_put_disk;
208 }
209
210 sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++);
211 vblk->disk->major = major;
212 vblk->disk->first_minor = 0;
213 vblk->disk->private_data = vblk;
214 vblk->disk->fops = &virtblk_fops;
215
216 /* If barriers are supported, tell block layer that queue is ordered */
217 token = vdev->config->find(vdev, VIRTIO_CONFIG_BLK_F, &len);
218 if (virtio_use_bit(vdev, token, len, VIRTIO_BLK_F_BARRIER))
219 blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL);
220
221 err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, &cap);
222 if (err) {
223 dev_err(&vdev->dev, "Bad/missing capacity in config\n");
224 goto out_put_disk;
225 }
226
227 /* If capacity is too big, truncate with warning. */
228 if ((sector_t)cap != cap) {
229 dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
230 (unsigned long long)cap);
231 cap = (sector_t)-1;
232 }
233 set_capacity(vblk->disk, cap);
234
235 err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SIZE_MAX, &v);
236 if (!err)
237 blk_queue_max_segment_size(vblk->disk->queue, v);
238 else if (err != -ENOENT) {
239 dev_err(&vdev->dev, "Bad SIZE_MAX in config\n");
240 goto out_put_disk;
241 }
242
243 err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SEG_MAX, &v);
244 if (!err)
245 blk_queue_max_hw_segments(vblk->disk->queue, v);
246 else if (err != -ENOENT) {
247 dev_err(&vdev->dev, "Bad SEG_MAX in config\n");
248 goto out_put_disk;
249 }
250
251 add_disk(vblk->disk);
252 return 0;
253
254out_put_disk:
255 put_disk(vblk->disk);
256out_unregister_blkdev:
257 unregister_blkdev(major, "virtblk");
258out_mempool:
259 mempool_destroy(vblk->pool);
260out_free_vq:
261 vdev->config->del_vq(vblk->vq);
262out_free_vblk:
263 kfree(vblk);
264out:
265 return err;
266}
267
268static void virtblk_remove(struct virtio_device *vdev)
269{
270 struct virtio_blk *vblk = vdev->priv;
271 int major = vblk->disk->major;
272
273 BUG_ON(!list_empty(&vblk->reqs));
274 blk_cleanup_queue(vblk->disk->queue);
275 put_disk(vblk->disk);
276 unregister_blkdev(major, "virtblk");
277 mempool_destroy(vblk->pool);
278 kfree(vblk);
279}
280
281static struct virtio_device_id id_table[] = {
282 { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
283 { 0 },
284};
285
286static struct virtio_driver virtio_blk = {
287 .driver.name = KBUILD_MODNAME,
288 .driver.owner = THIS_MODULE,
289 .id_table = id_table,
290 .probe = virtblk_probe,
291 .remove = __devexit_p(virtblk_remove),
292};
293
294static int __init init(void)
295{
296 return register_virtio_driver(&virtio_blk);
297}
298
299static void __exit fini(void)
300{
301 unregister_virtio_driver(&virtio_blk);
302}
303module_init(init);
304module_exit(fini);
305
306MODULE_DEVICE_TABLE(virtio, id_table);
307MODULE_DESCRIPTION("Virtio block driver");
308MODULE_LICENSE("GPL");
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 65491103e0f..bf18d757b87 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -613,6 +613,10 @@ config HVC_XEN
613 help 613 help
614 Xen virtual console device driver 614 Xen virtual console device driver
615 615
616config VIRTIO_CONSOLE
617 bool
618 select HVC_DRIVER
619
616config HVCS 620config HVCS
617 tristate "IBM Hypervisor Virtual Console Server support" 621 tristate "IBM Hypervisor Virtual Console Server support"
618 depends on PPC_PSERIES 622 depends on PPC_PSERIES
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index c78ff26647e..07304d50e0c 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_SYNCLINK_GT) += synclink_gt.o
42obj-$(CONFIG_N_HDLC) += n_hdlc.o 42obj-$(CONFIG_N_HDLC) += n_hdlc.o
43obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o 43obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o
44obj-$(CONFIG_SX) += sx.o generic_serial.o 44obj-$(CONFIG_SX) += sx.o generic_serial.o
45obj-$(CONFIG_LGUEST_GUEST) += hvc_lguest.o
46obj-$(CONFIG_RIO) += rio/ generic_serial.o 45obj-$(CONFIG_RIO) += rio/ generic_serial.o
47obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi.o 46obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi.o
48obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o 47obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o
@@ -50,6 +49,7 @@ obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o
50obj-$(CONFIG_HVC_BEAT) += hvc_beat.o 49obj-$(CONFIG_HVC_BEAT) += hvc_beat.o
51obj-$(CONFIG_HVC_DRIVER) += hvc_console.o 50obj-$(CONFIG_HVC_DRIVER) += hvc_console.o
52obj-$(CONFIG_HVC_XEN) += hvc_xen.o 51obj-$(CONFIG_HVC_XEN) += hvc_xen.o
52obj-$(CONFIG_VIRTIO_CONSOLE) += virtio_console.o
53obj-$(CONFIG_RAW_DRIVER) += raw.o 53obj-$(CONFIG_RAW_DRIVER) += raw.o
54obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o 54obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o
55obj-$(CONFIG_MSPEC) += mspec.o 55obj-$(CONFIG_MSPEC) += mspec.o
diff --git a/drivers/char/hvc_lguest.c b/drivers/char/hvc_lguest.c
deleted file mode 100644
index efccb215583..00000000000
--- a/drivers/char/hvc_lguest.c
+++ /dev/null
@@ -1,177 +0,0 @@
1/*D:300
2 * The Guest console driver
3 *
4 * This is a trivial console driver: we use lguest's DMA mechanism to send
5 * bytes out, and register a DMA buffer to receive bytes in. It is assumed to
6 * be present and available from the very beginning of boot.
7 *
8 * Writing console drivers is one of the few remaining Dark Arts in Linux.
9 * Fortunately for us, the path of virtual consoles has been well-trodden by
10 * the PowerPC folks, who wrote "hvc_console.c" to generically support any
11 * virtual console. We use that infrastructure which only requires us to write
12 * the basic put_chars and get_chars functions and call the right register
13 * functions.
14 :*/
15
16/*M:002 The console can be flooded: while the Guest is processing input the
17 * Host can send more. Buffering in the Host could alleviate this, but it is a
18 * difficult problem in general. :*/
19/* Copyright (C) 2006 Rusty Russell, IBM Corporation
20 *
21 * This program is free software; you can redistribute it and/or modify
22 * it under the terms of the GNU General Public License as published by
23 * the Free Software Foundation; either version 2 of the License, or
24 * (at your option) any later version.
25 *
26 * This program is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with this program; if not, write to the Free Software
33 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
34 */
35#include <linux/err.h>
36#include <linux/init.h>
37#include <linux/lguest_bus.h>
38#include <asm/paravirt.h>
39#include "hvc_console.h"
40
41/*D:340 This is our single console input buffer, with associated "struct
42 * lguest_dma" referring to it. Note the 0-terminated length array, and the
43 * use of physical address for the buffer itself. */
44static char inbuf[256];
45static struct lguest_dma cons_input = { .used_len = 0,
46 .addr[0] = __pa(inbuf),
47 .len[0] = sizeof(inbuf),
48 .len[1] = 0 };
49
50/*D:310 The put_chars() callback is pretty straightforward.
51 *
52 * First we put the pointer and length in a "struct lguest_dma": we only have
53 * one pointer, so we set the second length to 0. Then we use SEND_DMA to send
54 * the data to (Host) buffers attached to the console key. Usually a device's
55 * key is a physical address within the device's memory, but because the
56 * console device doesn't have any associated physical memory, we use the
57 * LGUEST_CONSOLE_DMA_KEY constant (aka 0). */
58static int put_chars(u32 vtermno, const char *buf, int count)
59{
60 struct lguest_dma dma;
61
62 /* FIXME: DMA buffers in a "struct lguest_dma" are not allowed
63 * to go over page boundaries. This never seems to happen,
64 * but if it did we'd need to fix this code. */
65 dma.len[0] = count;
66 dma.len[1] = 0;
67 dma.addr[0] = __pa(buf);
68
69 lguest_send_dma(LGUEST_CONSOLE_DMA_KEY, &dma);
70 /* We're expected to return the amount of data we wrote: all of it. */
71 return count;
72}
73
74/*D:350 get_chars() is the callback from the hvc_console infrastructure when
75 * an interrupt is received.
76 *
77 * Firstly we see if our buffer has been filled: if not, we return. The rest
78 * of the code deals with the fact that the hvc_console() infrastructure only
79 * asks us for 16 bytes at a time. We keep a "cons_offset" variable for
80 * partially-read buffers. */
81static int get_chars(u32 vtermno, char *buf, int count)
82{
83 static int cons_offset;
84
85 /* Nothing left to see here... */
86 if (!cons_input.used_len)
87 return 0;
88
89 /* You want more than we have to give? Well, try wanting less! */
90 if (cons_input.used_len - cons_offset < count)
91 count = cons_input.used_len - cons_offset;
92
93 /* Copy across to their buffer and increment offset. */
94 memcpy(buf, inbuf + cons_offset, count);
95 cons_offset += count;
96
97 /* Finished? Zero offset, and reset cons_input so Host will use it
98 * again. */
99 if (cons_offset == cons_input.used_len) {
100 cons_offset = 0;
101 cons_input.used_len = 0;
102 }
103 return count;
104}
105/*:*/
106
107static struct hv_ops lguest_cons = {
108 .get_chars = get_chars,
109 .put_chars = put_chars,
110};
111
112/*D:320 Console drivers are initialized very early so boot messages can go
113 * out. At this stage, the console is output-only. Our driver checks we're a
114 * Guest, and if so hands hvc_instantiate() the console number (0), priority
115 * (0), and the struct hv_ops containing the put_chars() function. */
116static int __init cons_init(void)
117{
118 if (strcmp(pv_info.name, "lguest") != 0)
119 return 0;
120
121 return hvc_instantiate(0, 0, &lguest_cons);
122}
123console_initcall(cons_init);
124
125/*D:370 To set up and manage our virtual console, we call hvc_alloc() and
126 * stash the result in the private pointer of the "struct lguest_device".
127 * Since we never remove the console device we never need this pointer again,
128 * but using ->private is considered good form, and you never know who's going
129 * to copy your driver.
130 *
131 * Once the console is set up, we bind our input buffer ready for input. */
132static int lguestcons_probe(struct lguest_device *lgdev)
133{
134 int err;
135
136 /* The first argument of hvc_alloc() is the virtual console number, so
137 * we use zero. The second argument is the interrupt number.
138 *
139 * The third argument is a "struct hv_ops" containing the put_chars()
140 * and get_chars() pointers. The final argument is the output buffer
141 * size: we use 256 and expect the Host to have room for us to send
142 * that much. */
143 lgdev->private = hvc_alloc(0, lgdev_irq(lgdev), &lguest_cons, 256);
144 if (IS_ERR(lgdev->private))
145 return PTR_ERR(lgdev->private);
146
147 /* We bind a single DMA buffer at key LGUEST_CONSOLE_DMA_KEY.
148 * "cons_input" is that statically-initialized global DMA buffer we saw
149 * above, and we also give the interrupt we want. */
150 err = lguest_bind_dma(LGUEST_CONSOLE_DMA_KEY, &cons_input, 1,
151 lgdev_irq(lgdev));
152 if (err)
153 printk("lguest console: failed to bind buffer.\n");
154 return err;
155}
156/* Note the use of lgdev_irq() for the interrupt number. We tell hvc_alloc()
157 * to expect input when this interrupt is triggered, and then tell
158 * lguest_bind_dma() that is the interrupt to send us when input comes in. */
159
160/*D:360 From now on the console driver follows standard Guest driver form:
161 * register_lguest_driver() registers the device type and probe function, and
162 * the probe function sets up the device.
163 *
164 * The standard "struct lguest_driver": */
165static struct lguest_driver lguestcons_drv = {
166 .name = "lguestcons",
167 .owner = THIS_MODULE,
168 .device_type = LGUEST_DEVICE_T_CONSOLE,
169 .probe = lguestcons_probe,
170};
171
172/* The standard init function */
173static int __init hvc_lguest_init(void)
174{
175 return register_lguest_driver(&lguestcons_drv);
176}
177module_init(hvc_lguest_init);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
new file mode 100644
index 00000000000..100e8a201e3
--- /dev/null
+++ b/drivers/char/virtio_console.c
@@ -0,0 +1,225 @@
1/*D:300
2 * The Guest console driver
3 *
4 * Writing console drivers is one of the few remaining Dark Arts in Linux.
5 * Fortunately for us, the path of virtual consoles has been well-trodden by
6 * the PowerPC folks, who wrote "hvc_console.c" to generically support any
7 * virtual console. We use that infrastructure which only requires us to write
8 * the basic put_chars and get_chars functions and call the right register
9 * functions.
10 :*/
11
12/*M:002 The console can be flooded: while the Guest is processing input the
13 * Host can send more. Buffering in the Host could alleviate this, but it is a
14 * difficult problem in general. :*/
15/* Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation
16 *
17 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License as published by
19 * the Free Software Foundation; either version 2 of the License, or
20 * (at your option) any later version.
21 *
22 * This program is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details.
26 *
27 * You should have received a copy of the GNU General Public License
28 * along with this program; if not, write to the Free Software
29 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 */
31#include <linux/err.h>
32#include <linux/init.h>
33#include <linux/virtio.h>
34#include <linux/virtio_console.h>
35#include "hvc_console.h"
36
37/*D:340 These represent our input and output console queues, and the virtio
38 * device they belong to. */
39static struct virtqueue *in_vq, *out_vq;
40static struct virtio_device *vdev;
41
42/* This is our input buffer, and how much data is left in it. */
43static unsigned int in_len;
44static char *in, *inbuf;
45
46/* The operations for our console. */
47static struct hv_ops virtio_cons;
48
49/*D:310 The put_chars() callback is pretty straightforward.
50 *
51 * We turn the characters into a scatter-gather list, add it to the output
52 * queue and then kick the Host. Then we sit here waiting for it to finish:
53 * inefficient in theory, but in practice implementations will do it
54 * immediately (lguest's Launcher does). */
55static int put_chars(u32 vtermno, const char *buf, int count)
56{
57 struct scatterlist sg[1];
58 unsigned int len;
59
60 /* This is a convenient routine to initialize a single-elem sg list */
61 sg_init_one(sg, buf, count);
62
63 /* add_buf wants a token to identify this buffer: we hand it any
64 * non-NULL pointer, since there's only ever one buffer. */
65 if (out_vq->vq_ops->add_buf(out_vq, sg, 1, 0, (void *)1) == 0) {
66 /* Tell Host to go! */
67 out_vq->vq_ops->kick(out_vq);
68 /* Chill out until it's done with the buffer. */
69 while (!out_vq->vq_ops->get_buf(out_vq, &len))
70 cpu_relax();
71 }
72
73 /* We're expected to return the amount of data we wrote: all of it. */
74 return count;
75}
76
77/* Create a scatter-gather list representing our input buffer and put it in the
78 * queue. */
79static void add_inbuf(void)
80{
81 struct scatterlist sg[1];
82 sg_init_one(sg, inbuf, PAGE_SIZE);
83
84 /* We should always be able to add one buffer to an empty queue. */
85 if (in_vq->vq_ops->add_buf(in_vq, sg, 0, 1, inbuf) != 0)
86 BUG();
87 in_vq->vq_ops->kick(in_vq);
88}
89
90/*D:350 get_chars() is the callback from the hvc_console infrastructure when
91 * an interrupt is received.
92 *
93 * Most of the code deals with the fact that the hvc_console() infrastructure
94 * only asks us for 16 bytes at a time. We keep the "in" pointer and "in_len"
95 * count to track what is left of a partially-read buffer. */
96static int get_chars(u32 vtermno, char *buf, int count)
97{
98 /* If we don't have an input queue yet, we can't get input. */
99 BUG_ON(!in_vq);
100
101 /* No buffer? Try to get one. */
102 if (!in_len) {
103 in = in_vq->vq_ops->get_buf(in_vq, &in_len);
104 if (!in)
105 return 0;
106 }
107
108 /* You want more than we have to give? Well, try wanting less! */
109 if (in_len < count)
110 count = in_len;
111
112 /* Copy across to their buffer and increment offset. */
113 memcpy(buf, in, count);
114 in += count;
115 in_len -= count;
116
117 /* Finished? Re-register buffer so Host will use it again. */
118 if (in_len == 0)
119 add_inbuf();
120
121 return count;
122}
123/*:*/
124
125/*D:320 Console drivers are initialized very early so boot messages can go out,
126 * so we do things slightly differently from the generic virtio initialization
127 * of the net and block drivers.
128 *
129 * At this stage, the console is output-only. It's too early to set up a
130 * virtqueue, so we let the drivers do some boutique early-output thing. */
131int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
132{
133 virtio_cons.put_chars = put_chars;
134 return hvc_instantiate(0, 0, &virtio_cons);
135}
136
137/*D:370 Once we're further in boot, we get probed like any other virtio device.
138 * At this stage we set up the input and output virtqueues.
139 *
140 * To set up and manage our virtual console, we call hvc_alloc(). Since we
141 * never remove the console device we never need this pointer again.
142 *
143 * Finally we put our input buffer in the input queue, ready to receive. */
144static int virtcons_probe(struct virtio_device *dev)
145{
146 int err;
147 struct hvc_struct *hvc;
148
149 vdev = dev;
150
151 /* This is the scratch page we use to receive console input */
152 inbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
153 if (!inbuf) {
154 err = -ENOMEM;
155 goto fail;
156 }
157
158 /* Find the input queue. */
159 /* FIXME: This is why we want to wean off hvc: we do nothing
160 * when input comes in. */
161 in_vq = vdev->config->find_vq(vdev, NULL);
162 if (IS_ERR(in_vq)) {
163 err = PTR_ERR(in_vq);
164 goto free;
165 }
166
167 out_vq = vdev->config->find_vq(vdev, NULL);
168 if (IS_ERR(out_vq)) {
169 err = PTR_ERR(out_vq);
170 goto free_in_vq;
171 }
172
173 /* Start using the new console output. */
174 virtio_cons.get_chars = get_chars;
175 virtio_cons.put_chars = put_chars;
176
177 /* The first argument of hvc_alloc() is the virtual console number, so
178 * we use zero. The second argument is the interrupt number; we
179 * currently leave this as zero: it would be better not to use the
180 * hvc mechanism and fix this (FIXME!).
181 *
182 * The third argument is a "struct hv_ops" containing the put_chars()
183 * and get_chars() pointers. The final argument is the output buffer
184 * size: we can do any size, so we put PAGE_SIZE here. */
185 hvc = hvc_alloc(0, 0, &virtio_cons, PAGE_SIZE);
186 if (IS_ERR(hvc)) {
187 err = PTR_ERR(hvc);
188 goto free_out_vq;
189 }
190
191 /* Register the input buffer the first time. */
192 add_inbuf();
193 return 0;
194
195free_out_vq:
196 vdev->config->del_vq(out_vq);
197free_in_vq:
198 vdev->config->del_vq(in_vq);
199free:
200 kfree(inbuf);
201fail:
202 return err;
203}
204
205static struct virtio_device_id id_table[] = {
206 { VIRTIO_ID_CONSOLE, VIRTIO_DEV_ANY_ID },
207 { 0 },
208};
209
210static struct virtio_driver virtio_console = {
211 .driver.name = KBUILD_MODNAME,
212 .driver.owner = THIS_MODULE,
213 .id_table = id_table,
214 .probe = virtcons_probe,
215};
216
217static int __init init(void)
218{
219 return register_virtio_driver(&virtio_console);
220}
221module_init(init);
222
223MODULE_DEVICE_TABLE(virtio, id_table);
224MODULE_DESCRIPTION("Virtio console driver");
225MODULE_LICENSE("GPL");
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
index 8749fa4ffce..656920636cb 100644
--- a/drivers/kvm/Kconfig
+++ b/drivers/kvm/Kconfig
@@ -47,4 +47,8 @@ config KVM_AMD
47 Provides support for KVM on AMD processors equipped with the AMD-V 47 Provides support for KVM on AMD processors equipped with the AMD-V
48 (SVM) extensions. 48 (SVM) extensions.
49 49
50# OK, it's a little counter-intuitive to do this, but it puts it neatly under
51# the virtualization menu.
52source drivers/lguest/Kconfig
53
50endif # VIRTUALIZATION 54endif # VIRTUALIZATION
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 41e2250613a..7eb9ecff8f4 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,7 +1,6 @@
1config LGUEST 1config LGUEST
2 tristate "Linux hypervisor example code" 2 tristate "Linux hypervisor example code"
3 depends on X86 && PARAVIRT && EXPERIMENTAL && !X86_PAE && FUTEX 3 depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX && !(X86_VISWS || X86_VOYAGER)
4 select LGUEST_GUEST
5 select HVC_DRIVER 4 select HVC_DRIVER
6 ---help--- 5 ---help---
7 This is a very simple module which allows you to run 6 This is a very simple module which allows you to run
@@ -18,13 +17,3 @@ config LGUEST_GUEST
18 The guest needs code built-in, even if the host has lguest 17 The guest needs code built-in, even if the host has lguest
19 support as a module. The drivers are tiny, so we build them 18 support as a module. The drivers are tiny, so we build them
20 in too. 19 in too.
21
22config LGUEST_NET
23 tristate
24 default y
25 depends on LGUEST_GUEST && NET
26
27config LGUEST_BLOCK
28 tristate
29 default y
30 depends on LGUEST_GUEST && BLOCK
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile
index e5047471c33..5e8272d296d 100644
--- a/drivers/lguest/Makefile
+++ b/drivers/lguest/Makefile
@@ -1,10 +1,12 @@
1# Guest requires the paravirt_ops replacement and the bus driver. 1# Guest requires the device configuration and probing code.
2obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o 2obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o
3 3
4# Host requires the other files, which can be a module. 4# Host requires the other files, which can be a module.
5obj-$(CONFIG_LGUEST) += lg.o 5obj-$(CONFIG_LGUEST) += lg.o
6lg-y := core.o hypercalls.o page_tables.o interrupts_and_traps.o \ 6lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
7 segments.o io.o lguest_user.o switcher.o 7 segments.o lguest_user.o
8
9lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o
8 10
9Preparation Preparation!: PREFIX=P 11Preparation Preparation!: PREFIX=P
10Guest: PREFIX=G 12Guest: PREFIX=G
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index a0788c12b39..35d19ae58de 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -11,58 +11,20 @@
11#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/freezer.h> 13#include <linux/freezer.h>
14#include <linux/highmem.h>
14#include <asm/paravirt.h> 15#include <asm/paravirt.h>
15#include <asm/desc.h>
16#include <asm/pgtable.h> 16#include <asm/pgtable.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include <asm/poll.h> 18#include <asm/poll.h>
19#include <asm/highmem.h>
20#include <asm/asm-offsets.h> 19#include <asm/asm-offsets.h>
21#include <asm/i387.h>
22#include "lg.h" 20#include "lg.h"
23 21
24/* Found in switcher.S */
25extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
26extern unsigned long default_idt_entries[];
27
28/* Every guest maps the core switcher code. */
29#define SHARED_SWITCHER_PAGES \
30 DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
31/* Pages for switcher itself, then two pages per cpu */
32#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
33
34/* We map at -4M for ease of mapping into the guest (one PTE page). */
35#define SWITCHER_ADDR 0xFFC00000
36 22
37static struct vm_struct *switcher_vma; 23static struct vm_struct *switcher_vma;
38static struct page **switcher_page; 24static struct page **switcher_page;
39 25
40static int cpu_had_pge;
41static struct {
42 unsigned long offset;
43 unsigned short segment;
44} lguest_entry;
45
46/* This One Big lock protects all inter-guest data structures. */ 26/* This One Big lock protects all inter-guest data structures. */
47DEFINE_MUTEX(lguest_lock); 27DEFINE_MUTEX(lguest_lock);
48static DEFINE_PER_CPU(struct lguest *, last_guest);
49
50/* FIXME: Make dynamic. */
51#define MAX_LGUEST_GUESTS 16
52struct lguest lguests[MAX_LGUEST_GUESTS];
53
54/* Offset from where switcher.S was compiled to where we've copied it */
55static unsigned long switcher_offset(void)
56{
57 return SWITCHER_ADDR - (unsigned long)start_switcher_text;
58}
59
60/* This cpu's struct lguest_pages. */
61static struct lguest_pages *lguest_pages(unsigned int cpu)
62{
63 return &(((struct lguest_pages *)
64 (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
65}
66 28
67/*H:010 We need to set up the Switcher at a high virtual address. Remember the 29/*H:010 We need to set up the Switcher at a high virtual address. Remember the
68 * Switcher is a few hundred bytes of assembler code which actually changes the 30 * Switcher is a few hundred bytes of assembler code which actually changes the
@@ -73,9 +35,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu)
73 * Host since it will be running as the switchover occurs. 35 * Host since it will be running as the switchover occurs.
74 * 36 *
75 * Trying to map memory at a particular address is an unusual thing to do, so 37 * Trying to map memory at a particular address is an unusual thing to do, so
76 * it's not a simple one-liner. We also set up the per-cpu parts of the 38 * it's not a simple one-liner. */
77 * Switcher here.
78 */
79static __init int map_switcher(void) 39static __init int map_switcher(void)
80{ 40{
81 int i, err; 41 int i, err;
@@ -132,90 +92,11 @@ static __init int map_switcher(void)
132 goto free_vma; 92 goto free_vma;
133 } 93 }
134 94
135 /* Now the switcher is mapped at the right address, we can't fail! 95 /* Now the Switcher is mapped at the right address, we can't fail!
136 * Copy in the compiled-in Switcher code (from switcher.S). */ 96 * Copy in the compiled-in Switcher code (from <arch>_switcher.S). */
137 memcpy(switcher_vma->addr, start_switcher_text, 97 memcpy(switcher_vma->addr, start_switcher_text,
138 end_switcher_text - start_switcher_text); 98 end_switcher_text - start_switcher_text);
139 99
140 /* Most of the switcher.S doesn't care that it's been moved; on Intel,
141 * jumps are relative, and it doesn't access any references to external
142 * code or data.
143 *
144 * The only exception is the interrupt handlers in switcher.S: their
145 * addresses are placed in a table (default_idt_entries), so we need to
146 * update the table with the new addresses. switcher_offset() is a
147 * convenience function which returns the distance between the builtin
148 * switcher code and the high-mapped copy we just made. */
149 for (i = 0; i < IDT_ENTRIES; i++)
150 default_idt_entries[i] += switcher_offset();
151
152 /*
153 * Set up the Switcher's per-cpu areas.
154 *
155 * Each CPU gets two pages of its own within the high-mapped region
156 * (aka. "struct lguest_pages"). Much of this can be initialized now,
157 * but some depends on what Guest we are running (which is set up in
158 * copy_in_guest_info()).
159 */
160 for_each_possible_cpu(i) {
161 /* lguest_pages() returns this CPU's two pages. */
162 struct lguest_pages *pages = lguest_pages(i);
163 /* This is a convenience pointer to make the code fit one
164 * statement to a line. */
165 struct lguest_ro_state *state = &pages->state;
166
167 /* The Global Descriptor Table: the Host has a different one
168 * for each CPU. We keep a descriptor for the GDT which says
169 * where it is and how big it is (the size is actually the last
170 * byte, not the size, hence the "-1"). */
171 state->host_gdt_desc.size = GDT_SIZE-1;
172 state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
173
174 /* All CPUs on the Host use the same Interrupt Descriptor
175 * Table, so we just use store_idt(), which gets this CPU's IDT
176 * descriptor. */
177 store_idt(&state->host_idt_desc);
178
179 /* The descriptors for the Guest's GDT and IDT can be filled
180 * out now, too. We copy the GDT & IDT into ->guest_gdt and
181 * ->guest_idt before actually running the Guest. */
182 state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
183 state->guest_idt_desc.address = (long)&state->guest_idt;
184 state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
185 state->guest_gdt_desc.address = (long)&state->guest_gdt;
186
187 /* We know where we want the stack to be when the Guest enters
188 * the switcher: in pages->regs. The stack grows upwards, so
189 * we start it at the end of that structure. */
190 state->guest_tss.esp0 = (long)(&pages->regs + 1);
191 /* And this is the GDT entry to use for the stack: we keep a
192 * couple of special LGUEST entries. */
193 state->guest_tss.ss0 = LGUEST_DS;
194
195 /* x86 can have a finegrained bitmap which indicates what I/O
196 * ports the process can use. We set it to the end of our
197 * structure, meaning "none". */
198 state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
199
200 /* Some GDT entries are the same across all Guests, so we can
201 * set them up now. */
202 setup_default_gdt_entries(state);
203 /* Most IDT entries are the same for all Guests, too.*/
204 setup_default_idt_entries(state, default_idt_entries);
205
206 /* The Host needs to be able to use the LGUEST segments on this
207 * CPU, too, so put them in the Host GDT. */
208 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
209 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
210 }
211
212 /* In the Switcher, we want the %cs segment register to use the
213 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
214 * it will be undisturbed when we switch. To change %cs and jump we
215 * need this structure to feed to Intel's "lcall" instruction. */
216 lguest_entry.offset = (long)switch_to_guest + switcher_offset();
217 lguest_entry.segment = LGUEST_CS;
218
219 printk(KERN_INFO "lguest: mapped switcher at %p\n", 100 printk(KERN_INFO "lguest: mapped switcher at %p\n",
220 switcher_vma->addr); 101 switcher_vma->addr);
221 /* And we succeeded... */ 102 /* And we succeeded... */
@@ -247,86 +128,12 @@ static void unmap_switcher(void)
247 __free_pages(switcher_page[i], 0); 128 __free_pages(switcher_page[i], 0);
248} 129}
249 130
250/*H:130 Our Guest is usually so well behaved; it never tries to do things it
251 * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't
252 * quite complete, because it doesn't contain replacements for the Intel I/O
253 * instructions. As a result, the Guest sometimes fumbles across one during
254 * the boot process as it probes for various things which are usually attached
255 * to a PC.
256 *
257 * When the Guest uses one of these instructions, we get trap #13 (General
258 * Protection Fault) and come here. We see if it's one of those troublesome
259 * instructions and skip over it. We return true if we did. */
260static int emulate_insn(struct lguest *lg)
261{
262 u8 insn;
263 unsigned int insnlen = 0, in = 0, shift = 0;
264 /* The eip contains the *virtual* address of the Guest's instruction:
265 * guest_pa just subtracts the Guest's page_offset. */
266 unsigned long physaddr = guest_pa(lg, lg->regs->eip);
267
268 /* The guest_pa() function only works for Guest kernel addresses, but
269 * that's all we're trying to do anyway. */
270 if (lg->regs->eip < lg->page_offset)
271 return 0;
272
273 /* Decoding x86 instructions is icky. */
274 lgread(lg, &insn, physaddr, 1);
275
276 /* 0x66 is an "operand prefix". It means it's using the upper 16 bits
277 of the eax register. */
278 if (insn == 0x66) {
279 shift = 16;
280 /* The instruction is 1 byte so far, read the next byte. */
281 insnlen = 1;
282 lgread(lg, &insn, physaddr + insnlen, 1);
283 }
284
285 /* We can ignore the lower bit for the moment and decode the 4 opcodes
286 * we need to emulate. */
287 switch (insn & 0xFE) {
288 case 0xE4: /* in <next byte>,%al */
289 insnlen += 2;
290 in = 1;
291 break;
292 case 0xEC: /* in (%dx),%al */
293 insnlen += 1;
294 in = 1;
295 break;
296 case 0xE6: /* out %al,<next byte> */
297 insnlen += 2;
298 break;
299 case 0xEE: /* out %al,(%dx) */
300 insnlen += 1;
301 break;
302 default:
303 /* OK, we don't know what this is, can't emulate. */
304 return 0;
305 }
306
307 /* If it was an "IN" instruction, they expect the result to be read
308 * into %eax, so we change %eax. We always return all-ones, which
309 * traditionally means "there's nothing there". */
310 if (in) {
311 /* Lower bit tells is whether it's a 16 or 32 bit access */
312 if (insn & 0x1)
313 lg->regs->eax = 0xFFFFFFFF;
314 else
315 lg->regs->eax |= (0xFFFF << shift);
316 }
317 /* Finally, we've "done" the instruction, so move past it. */
318 lg->regs->eip += insnlen;
319 /* Success! */
320 return 1;
321}
322/*:*/
323
324/*L:305 131/*L:305
325 * Dealing With Guest Memory. 132 * Dealing With Guest Memory.
326 * 133 *
327 * When the Guest gives us (what it thinks is) a physical address, we can use 134 * When the Guest gives us (what it thinks is) a physical address, we can use
328 * the normal copy_from_user() & copy_to_user() on that address: remember, 135 * the normal copy_from_user() & copy_to_user() on the corresponding place in
329 * Guest physical == Launcher virtual. 136 * the memory region allocated by the Launcher.
330 * 137 *
331 * But we can't trust the Guest: it might be trying to access the Launcher 138 * But we can't trust the Guest: it might be trying to access the Launcher
332 * code. We have to check that the range is below the pfn_limit the Launcher 139 * code. We have to check that the range is below the pfn_limit the Launcher
@@ -338,148 +145,27 @@ int lguest_address_ok(const struct lguest *lg,
338 return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); 145 return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr);
339} 146}
340 147
341/* This is a convenient routine to get a 32-bit value from the Guest (a very 148/* This routine copies memory from the Guest. Here we can see how useful the
342 * common operation). Here we can see how useful the kill_lguest() routine we 149 * kill_lguest() routine we met in the Launcher can be: we return a random
343 * met in the Launcher can be: we return a random value (0) instead of needing 150 * value (all zeroes) instead of needing to return an error. */
344 * to return an error. */ 151void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
345u32 lgread_u32(struct lguest *lg, unsigned long addr)
346{
347 u32 val = 0;
348
349 /* Don't let them access lguest binary. */
350 if (!lguest_address_ok(lg, addr, sizeof(val))
351 || get_user(val, (u32 __user *)addr) != 0)
352 kill_guest(lg, "bad read address %#lx", addr);
353 return val;
354}
355
356/* Same thing for writing a value. */
357void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val)
358{
359 if (!lguest_address_ok(lg, addr, sizeof(val))
360 || put_user(val, (u32 __user *)addr) != 0)
361 kill_guest(lg, "bad write address %#lx", addr);
362}
363
364/* This routine is more generic, and copies a range of Guest bytes into a
365 * buffer. If the copy_from_user() fails, we fill the buffer with zeroes, so
366 * the caller doesn't end up using uninitialized kernel memory. */
367void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
368{ 152{
369 if (!lguest_address_ok(lg, addr, bytes) 153 if (!lguest_address_ok(lg, addr, bytes)
370 || copy_from_user(b, (void __user *)addr, bytes) != 0) { 154 || copy_from_user(b, lg->mem_base + addr, bytes) != 0) {
371 /* copy_from_user should do this, but as we rely on it... */ 155 /* copy_from_user should do this, but as we rely on it... */
372 memset(b, 0, bytes); 156 memset(b, 0, bytes);
373 kill_guest(lg, "bad read address %#lx len %u", addr, bytes); 157 kill_guest(lg, "bad read address %#lx len %u", addr, bytes);
374 } 158 }
375} 159}
376 160
377/* Similarly, our generic routine to copy into a range of Guest bytes. */ 161/* This is the write (copy into guest) version. */
378void lgwrite(struct lguest *lg, unsigned long addr, const void *b, 162void __lgwrite(struct lguest *lg, unsigned long addr, const void *b,
379 unsigned bytes) 163 unsigned bytes)
380{ 164{
381 if (!lguest_address_ok(lg, addr, bytes) 165 if (!lguest_address_ok(lg, addr, bytes)
382 || copy_to_user((void __user *)addr, b, bytes) != 0) 166 || copy_to_user(lg->mem_base + addr, b, bytes) != 0)
383 kill_guest(lg, "bad write address %#lx len %u", addr, bytes); 167 kill_guest(lg, "bad write address %#lx len %u", addr, bytes);
384} 168}
385/* (end of memory access helper routines) :*/
386
387static void set_ts(void)
388{
389 u32 cr0;
390
391 cr0 = read_cr0();
392 if (!(cr0 & 8))
393 write_cr0(cr0|8);
394}
395
396/*S:010
397 * We are getting close to the Switcher.
398 *
399 * Remember that each CPU has two pages which are visible to the Guest when it
400 * runs on that CPU. This has to contain the state for that Guest: we copy the
401 * state in just before we run the Guest.
402 *
403 * Each Guest has "changed" flags which indicate what has changed in the Guest
404 * since it last ran. We saw this set in interrupts_and_traps.c and
405 * segments.c.
406 */
407static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
408{
409 /* Copying all this data can be quite expensive. We usually run the
410 * same Guest we ran last time (and that Guest hasn't run anywhere else
411 * meanwhile). If that's not the case, we pretend everything in the
412 * Guest has changed. */
413 if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
414 __get_cpu_var(last_guest) = lg;
415 lg->last_pages = pages;
416 lg->changed = CHANGED_ALL;
417 }
418
419 /* These copies are pretty cheap, so we do them unconditionally: */
420 /* Save the current Host top-level page directory. */
421 pages->state.host_cr3 = __pa(current->mm->pgd);
422 /* Set up the Guest's page tables to see this CPU's pages (and no
423 * other CPU's pages). */
424 map_switcher_in_guest(lg, pages);
425 /* Set up the two "TSS" members which tell the CPU what stack to use
426 * for traps which go directly into the Guest (ie. traps at privilege
427 * level 1). */
428 pages->state.guest_tss.esp1 = lg->esp1;
429 pages->state.guest_tss.ss1 = lg->ss1;
430
431 /* Copy direct-to-Guest trap entries. */
432 if (lg->changed & CHANGED_IDT)
433 copy_traps(lg, pages->state.guest_idt, default_idt_entries);
434
435 /* Copy all GDT entries which the Guest can change. */
436 if (lg->changed & CHANGED_GDT)
437 copy_gdt(lg, pages->state.guest_gdt);
438 /* If only the TLS entries have changed, copy them. */
439 else if (lg->changed & CHANGED_GDT_TLS)
440 copy_gdt_tls(lg, pages->state.guest_gdt);
441
442 /* Mark the Guest as unchanged for next time. */
443 lg->changed = 0;
444}
445
446/* Finally: the code to actually call into the Switcher to run the Guest. */
447static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
448{
449 /* This is a dummy value we need for GCC's sake. */
450 unsigned int clobber;
451
452 /* Copy the guest-specific information into this CPU's "struct
453 * lguest_pages". */
454 copy_in_guest_info(lg, pages);
455
456 /* Set the trap number to 256 (impossible value). If we fault while
457 * switching to the Guest (bad segment registers or bug), this will
458 * cause us to abort the Guest. */
459 lg->regs->trapnum = 256;
460
461 /* Now: we push the "eflags" register on the stack, then do an "lcall".
462 * This is how we change from using the kernel code segment to using
463 * the dedicated lguest code segment, as well as jumping into the
464 * Switcher.
465 *
466 * The lcall also pushes the old code segment (KERNEL_CS) onto the
467 * stack, then the address of this call. This stack layout happens to
468 * exactly match the stack of an interrupt... */
469 asm volatile("pushf; lcall *lguest_entry"
470 /* This is how we tell GCC that %eax ("a") and %ebx ("b")
471 * are changed by this routine. The "=" means output. */
472 : "=a"(clobber), "=b"(clobber)
473 /* %eax contains the pages pointer. ("0" refers to the
474 * 0-th argument above, ie "a"). %ebx contains the
475 * physical address of the Guest's top-level page
476 * directory. */
477 : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
478 /* We tell gcc that all these registers could change,
479 * which means we don't have to save and restore them in
480 * the Switcher. */
481 : "memory", "%edx", "%ecx", "%edi", "%esi");
482}
483/*:*/ 169/*:*/
484 170
485/*H:030 Let's jump straight to the main loop which runs the Guest. 171/*H:030 Let's jump straight to the main loop which runs the Guest.
@@ -489,22 +175,16 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
489{ 175{
490 /* We stop running once the Guest is dead. */ 176 /* We stop running once the Guest is dead. */
491 while (!lg->dead) { 177 while (!lg->dead) {
492 /* We need to initialize this, otherwise gcc complains. It's 178 /* First we run any hypercalls the Guest wants done. */
493 * not (yet) clever enough to see that it's initialized when we 179 if (lg->hcall)
494 * need it. */ 180 do_hypercalls(lg);
495 unsigned int cr2 = 0; /* Damn gcc */ 181
496 182 /* It's possible the Guest did a NOTIFY hypercall to the
497 /* First we run any hypercalls the Guest wants done: either in
498 * the hypercall ring in "struct lguest_data", or directly by
499 * using int 31 (LGUEST_TRAP_ENTRY). */
500 do_hypercalls(lg);
501 /* It's possible the Guest did a SEND_DMA hypercall to the
502 * Launcher, in which case we return from the read() now. */ 183 * Launcher, in which case we return from the read() now. */
503 if (lg->dma_is_pending) { 184 if (lg->pending_notify) {
504 if (put_user(lg->pending_dma, user) || 185 if (put_user(lg->pending_notify, user))
505 put_user(lg->pending_key, user+1))
506 return -EFAULT; 186 return -EFAULT;
507 return sizeof(unsigned long)*2; 187 return sizeof(lg->pending_notify);
508 } 188 }
509 189
510 /* Check for signals */ 190 /* Check for signals */
@@ -542,144 +222,20 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
542 * the "Do Not Disturb" sign: */ 222 * the "Do Not Disturb" sign: */
543 local_irq_disable(); 223 local_irq_disable();
544 224
545 /* Remember the awfully-named TS bit? If the Guest has asked 225 /* Actually run the Guest until something happens. */
546 * to set it we set it now, so we can trap and pass that trap 226 lguest_arch_run_guest(lg);
547 * to the Guest if it uses the FPU. */
548 if (lg->ts)
549 set_ts();
550
551 /* SYSENTER is an optimized way of doing system calls. We
552 * can't allow it because it always jumps to privilege level 0.
553 * A normal Guest won't try it because we don't advertise it in
554 * CPUID, but a malicious Guest (or malicious Guest userspace
555 * program) could, so we tell the CPU to disable it before
556 * running the Guest. */
557 if (boot_cpu_has(X86_FEATURE_SEP))
558 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
559
560 /* Now we actually run the Guest. It will pop back out when
561 * something interesting happens, and we can examine its
562 * registers to see what it was doing. */
563 run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
564
565 /* The "regs" pointer contains two extra entries which are not
566 * really registers: a trap number which says what interrupt or
567 * trap made the switcher code come back, and an error code
568 * which some traps set. */
569
570 /* If the Guest page faulted, then the cr2 register will tell
571 * us the bad virtual address. We have to grab this now,
572 * because once we re-enable interrupts an interrupt could
573 * fault and thus overwrite cr2, or we could even move off to a
574 * different CPU. */
575 if (lg->regs->trapnum == 14)
576 cr2 = read_cr2();
577 /* Similarly, if we took a trap because the Guest used the FPU,
578 * we have to restore the FPU it expects to see. */
579 else if (lg->regs->trapnum == 7)
580 math_state_restore();
581
582 /* Restore SYSENTER if it's supposed to be on. */
583 if (boot_cpu_has(X86_FEATURE_SEP))
584 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
585 227
586 /* Now we're ready to be interrupted or moved to other CPUs */ 228 /* Now we're ready to be interrupted or moved to other CPUs */
587 local_irq_enable(); 229 local_irq_enable();
588 230
589 /* OK, so what happened? */ 231 /* Now we deal with whatever happened to the Guest. */
590 switch (lg->regs->trapnum) { 232 lguest_arch_handle_trap(lg);
591 case 13: /* We've intercepted a GPF. */
592 /* Check if this was one of those annoying IN or OUT
593 * instructions which we need to emulate. If so, we
594 * just go back into the Guest after we've done it. */
595 if (lg->regs->errcode == 0) {
596 if (emulate_insn(lg))
597 continue;
598 }
599 break;
600 case 14: /* We've intercepted a page fault. */
601 /* The Guest accessed a virtual address that wasn't
602 * mapped. This happens a lot: we don't actually set
603 * up most of the page tables for the Guest at all when
604 * we start: as it runs it asks for more and more, and
605 * we set them up as required. In this case, we don't
606 * even tell the Guest that the fault happened.
607 *
608 * The errcode tells whether this was a read or a
609 * write, and whether kernel or userspace code. */
610 if (demand_page(lg, cr2, lg->regs->errcode))
611 continue;
612
613 /* OK, it's really not there (or not OK): the Guest
614 * needs to know. We write out the cr2 value so it
615 * knows where the fault occurred.
616 *
617 * Note that if the Guest were really messed up, this
618 * could happen before it's done the INITIALIZE
619 * hypercall, so lg->lguest_data will be NULL, so
620 * &lg->lguest_data->cr2 will be address 8. Writing
621 * into that address won't hurt the Host at all,
622 * though. */
623 if (put_user(cr2, &lg->lguest_data->cr2))
624 kill_guest(lg, "Writing cr2");
625 break;
626 case 7: /* We've intercepted a Device Not Available fault. */
627 /* If the Guest doesn't want to know, we already
628 * restored the Floating Point Unit, so we just
629 * continue without telling it. */
630 if (!lg->ts)
631 continue;
632 break;
633 case 32 ... 255:
634 /* These values mean a real interrupt occurred, in
635 * which case the Host handler has already been run.
636 * We just do a friendly check if another process
637 * should now be run, then fall through to loop
638 * around: */
639 cond_resched();
640 case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
641 continue;
642 }
643
644 /* If we get here, it's a trap the Guest wants to know
645 * about. */
646 if (deliver_trap(lg, lg->regs->trapnum))
647 continue;
648
649 /* If the Guest doesn't have a handler (either it hasn't
650 * registered any yet, or it's one of the faults we don't let
651 * it handle), it dies with a cryptic error message. */
652 kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
653 lg->regs->trapnum, lg->regs->eip,
654 lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode);
655 } 233 }
234
656 /* The Guest is dead => "No such file or directory" */ 235 /* The Guest is dead => "No such file or directory" */
657 return -ENOENT; 236 return -ENOENT;
658} 237}
659 238
660/* Now we can look at each of the routines this calls, in increasing order of
661 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
662 * deliver_trap() and demand_page(). After all those, we'll be ready to
663 * examine the Switcher, and our philosophical understanding of the Host/Guest
664 * duality will be complete. :*/
665
666int find_free_guest(void)
667{
668 unsigned int i;
669 for (i = 0; i < MAX_LGUEST_GUESTS; i++)
670 if (!lguests[i].tsk)
671 return i;
672 return -1;
673}
674
675static void adjust_pge(void *on)
676{
677 if (on)
678 write_cr4(read_cr4() | X86_CR4_PGE);
679 else
680 write_cr4(read_cr4() & ~X86_CR4_PGE);
681}
682
683/*H:000 239/*H:000
684 * Welcome to the Host! 240 * Welcome to the Host!
685 * 241 *
@@ -701,72 +257,50 @@ static int __init init(void)
701 /* First we put the Switcher up in very high virtual memory. */ 257 /* First we put the Switcher up in very high virtual memory. */
702 err = map_switcher(); 258 err = map_switcher();
703 if (err) 259 if (err)
704 return err; 260 goto out;
705 261
706 /* Now we set up the pagetable implementation for the Guests. */ 262 /* Now we set up the pagetable implementation for the Guests. */
707 err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); 263 err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
708 if (err) { 264 if (err)
709 unmap_switcher(); 265 goto unmap;
710 return err;
711 }
712 266
713 /* The I/O subsystem needs some things initialized. */ 267 /* We might need to reserve an interrupt vector. */
714 lguest_io_init(); 268 err = init_interrupts();
269 if (err)
270 goto free_pgtables;
715 271
716 /* /dev/lguest needs to be registered. */ 272 /* /dev/lguest needs to be registered. */
717 err = lguest_device_init(); 273 err = lguest_device_init();
718 if (err) { 274 if (err)
719 free_pagetables(); 275 goto free_interrupts;
720 unmap_switcher();
721 return err;
722 }
723 276
724 /* Finally, we need to turn off "Page Global Enable". PGE is an 277 /* Finally we do some architecture-specific setup. */
725 * optimization where page table entries are specially marked to show 278 lguest_arch_host_init();
726 * they never change. The Host kernel marks all the kernel pages this
727 * way because it's always present, even when userspace is running.
728 *
729 * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
730 * switch to the Guest kernel. If you don't disable this on all CPUs,
731 * you'll get really weird bugs that you'll chase for two days.
732 *
733 * I used to turn PGE off every time we switched to the Guest and back
734 * on when we return, but that slowed the Switcher down noticeably. */
735
736 /* We don't need the complexity of CPUs coming and going while we're
737 * doing this. */
738 lock_cpu_hotplug();
739 if (cpu_has_pge) { /* We have a broader idea of "global". */
740 /* Remember that this was originally set (for cleanup). */
741 cpu_had_pge = 1;
742 /* adjust_pge is a helper function which sets or unsets the PGE
743 * bit on its CPU, depending on the argument (0 == unset). */
744 on_each_cpu(adjust_pge, (void *)0, 0, 1);
745 /* Turn off the feature in the global feature set. */
746 clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
747 }
748 unlock_cpu_hotplug();
749 279
750 /* All good! */ 280 /* All good! */
751 return 0; 281 return 0;
282
283free_interrupts:
284 free_interrupts();
285free_pgtables:
286 free_pagetables();
287unmap:
288 unmap_switcher();
289out:
290 return err;
752} 291}
753 292
754/* Cleaning up is just the same code, backwards. With a little French. */ 293/* Cleaning up is just the same code, backwards. With a little French. */
755static void __exit fini(void) 294static void __exit fini(void)
756{ 295{
757 lguest_device_remove(); 296 lguest_device_remove();
297 free_interrupts();
758 free_pagetables(); 298 free_pagetables();
759 unmap_switcher(); 299 unmap_switcher();
760 300
761 /* If we had PGE before we started, turn it back on now. */ 301 lguest_arch_host_fini();
762 lock_cpu_hotplug();
763 if (cpu_had_pge) {
764 set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
765 /* adjust_pge's argument "1" means set PGE. */
766 on_each_cpu(adjust_pge, (void *)1, 0, 1);
767 }
768 unlock_cpu_hotplug();
769} 302}
303/*:*/
770 304
771/* The Host side of lguest can be a module. This is a nice way for people to 305/* The Host side of lguest can be a module. This is a nice way for people to
772 * play with it. */ 306 * play with it. */
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index db6caace3b9..9d5184c7c14 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -25,17 +25,13 @@
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <asm/page.h> 26#include <asm/page.h>
27#include <asm/pgtable.h> 27#include <asm/pgtable.h>
28#include <irq_vectors.h>
29#include "lg.h" 28#include "lg.h"
30 29
31/*H:120 This is the core hypercall routine: where the Guest gets what it 30/*H:120 This is the core hypercall routine: where the Guest gets what it wants.
32 * wants. Or gets killed. Or, in the case of LHCALL_CRASH, both. 31 * Or gets killed. Or, in the case of LHCALL_CRASH, both. */
33 * 32static void do_hcall(struct lguest *lg, struct hcall_args *args)
34 * Remember from the Guest: %eax == which call to make, and the arguments are
35 * packed into %edx, %ebx and %ecx if needed. */
36static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
37{ 33{
38 switch (regs->eax) { 34 switch (args->arg0) {
39 case LHCALL_FLUSH_ASYNC: 35 case LHCALL_FLUSH_ASYNC:
40 /* This call does nothing, except by breaking out of the Guest 36 /* This call does nothing, except by breaking out of the Guest
41 * it makes us process all the asynchronous hypercalls. */ 37 * it makes us process all the asynchronous hypercalls. */
@@ -51,7 +47,7 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
51 char msg[128]; 47 char msg[128];
52 /* If the lgread fails, it will call kill_guest() itself; the 48 /* If the lgread fails, it will call kill_guest() itself; the
53 * kill_guest() with the message will be ignored. */ 49 * kill_guest() with the message will be ignored. */
54 lgread(lg, msg, regs->edx, sizeof(msg)); 50 __lgread(lg, msg, args->arg1, sizeof(msg));
55 msg[sizeof(msg)-1] = '\0'; 51 msg[sizeof(msg)-1] = '\0';
56 kill_guest(lg, "CRASH: %s", msg); 52 kill_guest(lg, "CRASH: %s", msg);
57 break; 53 break;
@@ -59,67 +55,49 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
59 case LHCALL_FLUSH_TLB: 55 case LHCALL_FLUSH_TLB:
60 /* FLUSH_TLB comes in two flavors, depending on the 56 /* FLUSH_TLB comes in two flavors, depending on the
61 * argument: */ 57 * argument: */
62 if (regs->edx) 58 if (args->arg1)
63 guest_pagetable_clear_all(lg); 59 guest_pagetable_clear_all(lg);
64 else 60 else
65 guest_pagetable_flush_user(lg); 61 guest_pagetable_flush_user(lg);
66 break; 62 break;
67 case LHCALL_BIND_DMA:
68 /* BIND_DMA really wants four arguments, but it's the only call
69 * which does. So the Guest packs the number of buffers and
70 * the interrupt number into the final argument, and we decode
71 * it here. This can legitimately fail, since we currently
72 * place a limit on the number of DMA pools a Guest can have.
73 * So we return true or false from this call. */
74 regs->eax = bind_dma(lg, regs->edx, regs->ebx,
75 regs->ecx >> 8, regs->ecx & 0xFF);
76 break;
77 63
78 /* All these calls simply pass the arguments through to the right 64 /* All these calls simply pass the arguments through to the right
79 * routines. */ 65 * routines. */
80 case LHCALL_SEND_DMA:
81 send_dma(lg, regs->edx, regs->ebx);
82 break;
83 case LHCALL_LOAD_GDT:
84 load_guest_gdt(lg, regs->edx, regs->ebx);
85 break;
86 case LHCALL_LOAD_IDT_ENTRY:
87 load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx);
88 break;
89 case LHCALL_NEW_PGTABLE: 66 case LHCALL_NEW_PGTABLE:
90 guest_new_pagetable(lg, regs->edx); 67 guest_new_pagetable(lg, args->arg1);
91 break; 68 break;
92 case LHCALL_SET_STACK: 69 case LHCALL_SET_STACK:
93 guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx); 70 guest_set_stack(lg, args->arg1, args->arg2, args->arg3);
94 break; 71 break;
95 case LHCALL_SET_PTE: 72 case LHCALL_SET_PTE:
96 guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx)); 73 guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3));
97 break; 74 break;
98 case LHCALL_SET_PMD: 75 case LHCALL_SET_PMD:
99 guest_set_pmd(lg, regs->edx, regs->ebx); 76 guest_set_pmd(lg, args->arg1, args->arg2);
100 break;
101 case LHCALL_LOAD_TLS:
102 guest_load_tls(lg, regs->edx);
103 break; 77 break;
104 case LHCALL_SET_CLOCKEVENT: 78 case LHCALL_SET_CLOCKEVENT:
105 guest_set_clockevent(lg, regs->edx); 79 guest_set_clockevent(lg, args->arg1);
106 break; 80 break;
107
108 case LHCALL_TS: 81 case LHCALL_TS:
109 /* This sets the TS flag, as we saw used in run_guest(). */ 82 /* This sets the TS flag, as we saw used in run_guest(). */
110 lg->ts = regs->edx; 83 lg->ts = args->arg1;
111 break; 84 break;
112 case LHCALL_HALT: 85 case LHCALL_HALT:
113 /* Similarly, this sets the halted flag for run_guest(). */ 86 /* Similarly, this sets the halted flag for run_guest(). */
114 lg->halted = 1; 87 lg->halted = 1;
115 break; 88 break;
89 case LHCALL_NOTIFY:
90 lg->pending_notify = args->arg1;
91 break;
116 default: 92 default:
117 kill_guest(lg, "Bad hypercall %li\n", regs->eax); 93 if (lguest_arch_do_hcall(lg, args))
94 kill_guest(lg, "Bad hypercall %li\n", args->arg0);
118 } 95 }
119} 96}
97/*:*/
120 98
121/* Asynchronous hypercalls are easy: we just look in the array in the Guest's 99/*H:124 Asynchronous hypercalls are easy: we just look in the array in the
122 * "struct lguest_data" and see if there are any new ones marked "ready". 100 * Guest's "struct lguest_data" to see if any new ones are marked "ready".
123 * 101 *
124 * We are careful to do these in order: obviously we respect the order the 102 * We are careful to do these in order: obviously we respect the order the
125 * Guest put them in the ring, but we also promise the Guest that they will 103 * Guest put them in the ring, but we also promise the Guest that they will
@@ -134,10 +112,9 @@ static void do_async_hcalls(struct lguest *lg)
134 if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) 112 if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
135 return; 113 return;
136 114
137
138 /* We process "struct lguest_data"s hcalls[] ring once. */ 115 /* We process "struct lguest_data"s hcalls[] ring once. */
139 for (i = 0; i < ARRAY_SIZE(st); i++) { 116 for (i = 0; i < ARRAY_SIZE(st); i++) {
140 struct lguest_regs regs; 117 struct hcall_args args;
141 /* We remember where we were up to from last time. This makes 118 /* We remember where we were up to from last time. This makes
142 * sure that the hypercalls are done in the order the Guest 119 * sure that the hypercalls are done in the order the Guest
143 * places them in the ring. */ 120 * places them in the ring. */
@@ -152,18 +129,16 @@ static void do_async_hcalls(struct lguest *lg)
152 if (++lg->next_hcall == LHCALL_RING_SIZE) 129 if (++lg->next_hcall == LHCALL_RING_SIZE)
153 lg->next_hcall = 0; 130 lg->next_hcall = 0;
154 131
155 /* We copy the hypercall arguments into a fake register 132 /* Copy the hypercall arguments into a local copy of
156 * structure. This makes life simple for do_hcall(). */ 133 * the hcall_args struct. */
157 if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax) 134 if (copy_from_user(&args, &lg->lguest_data->hcalls[n],
158 || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx) 135 sizeof(struct hcall_args))) {
159 || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx)
160 || get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx)) {
161 kill_guest(lg, "Fetching async hypercalls"); 136 kill_guest(lg, "Fetching async hypercalls");
162 break; 137 break;
163 } 138 }
164 139
165 /* Do the hypercall, same as a normal one. */ 140 /* Do the hypercall, same as a normal one. */
166 do_hcall(lg, &regs); 141 do_hcall(lg, &args);
167 142
168 /* Mark the hypercall done. */ 143 /* Mark the hypercall done. */
169 if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { 144 if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
@@ -171,9 +146,9 @@ static void do_async_hcalls(struct lguest *lg)
171 break; 146 break;
172 } 147 }
173 148
174 /* Stop doing hypercalls if we've just done a DMA to the 149 /* Stop doing hypercalls if they want to notify the Launcher:
175 * Launcher: it needs to service this first. */ 150 * it needs to service this first. */
176 if (lg->dma_is_pending) 151 if (lg->pending_notify)
177 break; 152 break;
178 } 153 }
179} 154}
@@ -182,76 +157,35 @@ static void do_async_hcalls(struct lguest *lg)
182 * Guest makes a hypercall, we end up here to set things up: */ 157 * Guest makes a hypercall, we end up here to set things up: */
183static void initialize(struct lguest *lg) 158static void initialize(struct lguest *lg)
184{ 159{
185 u32 tsc_speed;
186 160
187 /* You can't do anything until you're initialized. The Guest knows the 161 /* You can't do anything until you're initialized. The Guest knows the
188 * rules, so we're unforgiving here. */ 162 * rules, so we're unforgiving here. */
189 if (lg->regs->eax != LHCALL_LGUEST_INIT) { 163 if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) {
190 kill_guest(lg, "hypercall %li before LGUEST_INIT", 164 kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0);
191 lg->regs->eax);
192 return; 165 return;
193 } 166 }
194 167
195 /* We insist that the Time Stamp Counter exist and doesn't change with 168 if (lguest_arch_init_hypercalls(lg))
196 * cpu frequency. Some devious chip manufacturers decided that TSC
197 * changes could be handled in software. I decided that time going
198 * backwards might be good for benchmarks, but it's bad for users.
199 *
200 * We also insist that the TSC be stable: the kernel detects unreliable
201 * TSCs for its own purposes, and we use that here. */
202 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
203 tsc_speed = tsc_khz;
204 else
205 tsc_speed = 0;
206
207 /* The pointer to the Guest's "struct lguest_data" is the only
208 * argument. */
209 lg->lguest_data = (struct lguest_data __user *)lg->regs->edx;
210 /* If we check the address they gave is OK now, we can simply
211 * copy_to_user/from_user from now on rather than using lgread/lgwrite.
212 * I put this in to show that I'm not immune to writing stupid
213 * optimizations. */
214 if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) {
215 kill_guest(lg, "bad guest page %p", lg->lguest_data); 169 kill_guest(lg, "bad guest page %p", lg->lguest_data);
216 return; 170
217 }
218 /* The Guest tells us where we're not to deliver interrupts by putting 171 /* The Guest tells us where we're not to deliver interrupts by putting
219 * the range of addresses into "struct lguest_data". */ 172 * the range of addresses into "struct lguest_data". */
220 if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) 173 if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
221 || get_user(lg->noirq_end, &lg->lguest_data->noirq_end) 174 || get_user(lg->noirq_end, &lg->lguest_data->noirq_end))
222 /* We tell the Guest that it can't use the top 4MB of virtual
223 * addresses used by the Switcher. */
224 || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
225 || put_user(tsc_speed, &lg->lguest_data->tsc_khz)
226 /* We also give the Guest a unique id, as used in lguest_net.c. */
227 || put_user(lg->guestid, &lg->lguest_data->guestid))
228 kill_guest(lg, "bad guest page %p", lg->lguest_data); 175 kill_guest(lg, "bad guest page %p", lg->lguest_data);
229 176
230 /* We write the current time into the Guest's data page once now. */ 177 /* We write the current time into the Guest's data page once now. */
231 write_timestamp(lg); 178 write_timestamp(lg);
232 179
180 /* page_tables.c will also do some setup. */
181 page_table_guest_data_init(lg);
182
233 /* This is the one case where the above accesses might have been the 183 /* This is the one case where the above accesses might have been the
234 * first write to a Guest page. This may have caused a copy-on-write 184 * first write to a Guest page. This may have caused a copy-on-write
235 * fault, but the Guest might be referring to the old (read-only) 185 * fault, but the Guest might be referring to the old (read-only)
236 * page. */ 186 * page. */
237 guest_pagetable_clear_all(lg); 187 guest_pagetable_clear_all(lg);
238} 188}
239/* Now we've examined the hypercall code; our Guest can make requests. There
240 * is one other way we can do things for the Guest, as we see in
241 * emulate_insn(). */
242
243/*H:110 Tricky point: we mark the hypercall as "done" once we've done it.
244 * Normally we don't need to do this: the Guest will run again and update the
245 * trap number before we come back around the run_guest() loop to
246 * do_hypercalls().
247 *
248 * However, if we are signalled or the Guest sends DMA to the Launcher, that
249 * loop will exit without running the Guest. When it comes back it would try
250 * to re-run the hypercall. */
251static void clear_hcall(struct lguest *lg)
252{
253 lg->regs->trapnum = 255;
254}
255 189
256/*H:100 190/*H:100
257 * Hypercalls 191 * Hypercalls
@@ -261,16 +195,12 @@ static void clear_hcall(struct lguest *lg)
261 */ 195 */
262void do_hypercalls(struct lguest *lg) 196void do_hypercalls(struct lguest *lg)
263{ 197{
264 /* Not initialized yet? */ 198 /* Not initialized yet? This hypercall must do it. */
265 if (unlikely(!lg->lguest_data)) { 199 if (unlikely(!lg->lguest_data)) {
266 /* Did the Guest make a hypercall? We might have come back for 200 /* Set up the "struct lguest_data" */
267 * some other reason (an interrupt, a different trap). */ 201 initialize(lg);
268 if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) { 202 /* Hcall is done. */
269 /* Set up the "struct lguest_data" */ 203 lg->hcall = NULL;
270 initialize(lg);
271 /* The hypercall is done. */
272 clear_hcall(lg);
273 }
274 return; 204 return;
275 } 205 }
276 206
@@ -280,12 +210,21 @@ void do_hypercalls(struct lguest *lg)
280 do_async_hcalls(lg); 210 do_async_hcalls(lg);
281 211
282 /* If we stopped reading the hypercall ring because the Guest did a 212 /* If we stopped reading the hypercall ring because the Guest did a
283 * SEND_DMA to the Launcher, we want to return now. Otherwise if the 213 * NOTIFY to the Launcher, we want to return now. Otherwise we do
284 * Guest asked us to do a hypercall, we do it. */ 214 * the hypercall. */
285 if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) { 215 if (!lg->pending_notify) {
286 do_hcall(lg, lg->regs); 216 do_hcall(lg, lg->hcall);
287 /* The hypercall is done. */ 217 /* Tricky point: we reset the hcall pointer to mark the
288 clear_hcall(lg); 218 * hypercall as "done". We use the hcall pointer rather than
219 * the trap number to indicate a hypercall is pending.
220 * Normally it doesn't matter: the Guest will run again and
221 * update the trap number before we come back here.
222 *
223 * However, if we are signalled or the Guest sends a NOTIFY to the
224 * Launcher, the run_guest() loop will exit without running the
225 * Guest. When it comes back it would try to re-run the
226 * hypercall. */
227 lg->hcall = NULL;
289 } 228 }
290} 229}
291 230
@@ -295,6 +234,6 @@ void write_timestamp(struct lguest *lg)
295{ 234{
296 struct timespec now; 235 struct timespec now;
297 ktime_get_real_ts(&now); 236 ktime_get_real_ts(&now);
298 if (put_user(now, &lg->lguest_data->time)) 237 if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec)))
299 kill_guest(lg, "Writing timestamp"); 238 kill_guest(lg, "Writing timestamp");
300} 239}
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 39731232d82..82966982cb3 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -12,8 +12,14 @@
12 * them first, so we also have a way of "reflecting" them into the Guest as if 12 * them first, so we also have a way of "reflecting" them into the Guest as if
13 * they had been delivered to it directly. :*/ 13 * they had been delivered to it directly. :*/
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/interrupt.h>
16#include <linux/module.h>
15#include "lg.h" 17#include "lg.h"
16 18
19/* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */
20static unsigned int syscall_vector = SYSCALL_VECTOR;
21module_param(syscall_vector, uint, 0444);
22
17/* The address of the interrupt handler is split into two bits: */ 23/* The address of the interrupt handler is split into two bits: */
18static unsigned long idt_address(u32 lo, u32 hi) 24static unsigned long idt_address(u32 lo, u32 hi)
19{ 25{
@@ -39,7 +45,7 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
39{ 45{
40 /* Stack grows downwards: move stack then write value. */ 46 /* Stack grows downwards: move stack then write value. */
41 *gstack -= 4; 47 *gstack -= 4;
42 lgwrite_u32(lg, *gstack, val); 48 lgwrite(lg, *gstack, u32, val);
43} 49}
44 50
45/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or 51/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or
@@ -56,8 +62,9 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
56 * it). */ 62 * it). */
57static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) 63static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
58{ 64{
59 unsigned long gstack; 65 unsigned long gstack, origstack;
60 u32 eflags, ss, irq_enable; 66 u32 eflags, ss, irq_enable;
67 unsigned long virtstack;
61 68
62 /* There are two cases for interrupts: one where the Guest is already 69 /* There are two cases for interrupts: one where the Guest is already
63 * in the kernel, and a more complex one where the Guest is in 70 * in the kernel, and a more complex one where the Guest is in
@@ -65,8 +72,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
65 if ((lg->regs->ss&0x3) != GUEST_PL) { 72 if ((lg->regs->ss&0x3) != GUEST_PL) {
66 /* The Guest told us their kernel stack with the SET_STACK 73 /* The Guest told us their kernel stack with the SET_STACK
67 * hypercall: both the virtual address and the segment */ 74 * hypercall: both the virtual address and the segment */
68 gstack = guest_pa(lg, lg->esp1); 75 virtstack = lg->esp1;
69 ss = lg->ss1; 76 ss = lg->ss1;
77
78 origstack = gstack = guest_pa(lg, virtstack);
70 /* We push the old stack segment and pointer onto the new 79 /* We push the old stack segment and pointer onto the new
71 * stack: when the Guest does an "iret" back from the interrupt 80 * stack: when the Guest does an "iret" back from the interrupt
72 * handler the CPU will notice they're dropping privilege 81 * handler the CPU will notice they're dropping privilege
@@ -75,8 +84,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
75 push_guest_stack(lg, &gstack, lg->regs->esp); 84 push_guest_stack(lg, &gstack, lg->regs->esp);
76 } else { 85 } else {
77 /* We're staying on the same Guest (kernel) stack. */ 86 /* We're staying on the same Guest (kernel) stack. */
78 gstack = guest_pa(lg, lg->regs->esp); 87 virtstack = lg->regs->esp;
79 ss = lg->regs->ss; 88 ss = lg->regs->ss;
89
90 origstack = gstack = guest_pa(lg, virtstack);
80 } 91 }
81 92
82 /* Remember that we never let the Guest actually disable interrupts, so 93 /* Remember that we never let the Guest actually disable interrupts, so
@@ -102,7 +113,7 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
102 /* Now we've pushed all the old state, we change the stack, the code 113 /* Now we've pushed all the old state, we change the stack, the code
103 * segment and the address to execute. */ 114 * segment and the address to execute. */
104 lg->regs->ss = ss; 115 lg->regs->ss = ss;
105 lg->regs->esp = gstack + lg->page_offset; 116 lg->regs->esp = virtstack + (gstack - origstack);
106 lg->regs->cs = (__KERNEL_CS|GUEST_PL); 117 lg->regs->cs = (__KERNEL_CS|GUEST_PL);
107 lg->regs->eip = idt_address(lo, hi); 118 lg->regs->eip = idt_address(lo, hi);
108 119
@@ -165,7 +176,7 @@ void maybe_do_interrupt(struct lguest *lg)
165 /* Look at the IDT entry the Guest gave us for this interrupt. The 176 /* Look at the IDT entry the Guest gave us for this interrupt. The
166 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip 177 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
167 * over them. */ 178 * over them. */
168 idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq]; 179 idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
169 /* If they don't have a handler (yet?), we just ignore it */ 180 /* If they don't have a handler (yet?), we just ignore it */
170 if (idt_present(idt->a, idt->b)) { 181 if (idt_present(idt->a, idt->b)) {
171 /* OK, mark it no longer pending and deliver it. */ 182 /* OK, mark it no longer pending and deliver it. */
@@ -183,6 +194,47 @@ void maybe_do_interrupt(struct lguest *lg)
183 * timer interrupt. */ 194 * timer interrupt. */
184 write_timestamp(lg); 195 write_timestamp(lg);
185} 196}
197/*:*/
198
199/* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent
200 * me a patch, so we support that too. It'd be a big step for lguest if half
201 * the Plan 9 user base were to start using it.
202 *
203 * Actually now I think of it, it's possible that Ron *is* half the Plan 9
204 * userbase. Oh well. */
205static bool could_be_syscall(unsigned int num)
206{
207 /* Normal Linux SYSCALL_VECTOR or reserved vector? */
208 return num == SYSCALL_VECTOR || num == syscall_vector;
209}
210
211/* The syscall vector it wants must be unused by Host. */
212bool check_syscall_vector(struct lguest *lg)
213{
214 u32 vector;
215
216 if (get_user(vector, &lg->lguest_data->syscall_vec))
217 return false;
218
219 return could_be_syscall(vector);
220}
221
222int init_interrupts(void)
223{
224 /* If they want some strange system call vector, reserve it now */
225 if (syscall_vector != SYSCALL_VECTOR
226 && test_and_set_bit(syscall_vector, used_vectors)) {
227 printk("lg: couldn't reserve syscall %u\n", syscall_vector);
228 return -EBUSY;
229 }
230 return 0;
231}
232
233void free_interrupts(void)
234{
235 if (syscall_vector != SYSCALL_VECTOR)
236 clear_bit(syscall_vector, used_vectors);
237}
186 238
187/*H:220 Now we've got the routines to deliver interrupts, delivering traps 239/*H:220 Now we've got the routines to deliver interrupts, delivering traps
188 * like page fault is easy. The only trick is that Intel decided that some 240 * like page fault is easy. The only trick is that Intel decided that some
@@ -197,14 +249,14 @@ int deliver_trap(struct lguest *lg, unsigned int num)
197{ 249{
198 /* Trap numbers are always 8 bit, but we set an impossible trap number 250 /* Trap numbers are always 8 bit, but we set an impossible trap number
199 * for traps inside the Switcher, so check that here. */ 251 * for traps inside the Switcher, so check that here. */
200 if (num >= ARRAY_SIZE(lg->idt)) 252 if (num >= ARRAY_SIZE(lg->arch.idt))
201 return 0; 253 return 0;
202 254
203 /* Early on the Guest hasn't set the IDT entries (or maybe it put a 255 /* Early on the Guest hasn't set the IDT entries (or maybe it put a
204 * bogus one in): if we fail here, the Guest will be killed. */ 256 * bogus one in): if we fail here, the Guest will be killed. */
205 if (!idt_present(lg->idt[num].a, lg->idt[num].b)) 257 if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b))
206 return 0; 258 return 0;
207 set_guest_interrupt(lg, lg->idt[num].a, lg->idt[num].b, has_err(num)); 259 set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, has_err(num));
208 return 1; 260 return 1;
209} 261}
210 262
@@ -218,28 +270,20 @@ int deliver_trap(struct lguest *lg, unsigned int num)
218 * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all 270 * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all
219 * the other hypervisors would tease it. 271 * the other hypervisors would tease it.
220 * 272 *
221 * This routine determines if a trap can be delivered directly. */ 273 * This routine indicates if a particular trap number could be delivered
222static int direct_trap(const struct lguest *lg, 274 * directly. */
223 const struct desc_struct *trap, 275static int direct_trap(unsigned int num)
224 unsigned int num)
225{ 276{
226 /* Hardware interrupts don't go to the Guest at all (except system 277 /* Hardware interrupts don't go to the Guest at all (except system
227 * call). */ 278 * call). */
228 if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR) 279 if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num))
229 return 0; 280 return 0;
230 281
231 /* The Host needs to see page faults (for shadow paging and to save the 282 /* The Host needs to see page faults (for shadow paging and to save the
232 * fault address), general protection faults (in/out emulation) and 283 * fault address), general protection faults (in/out emulation) and
233 * device not available (TS handling), and of course, the hypercall 284 * device not available (TS handling), and of course, the hypercall
234 * trap. */ 285 * trap. */
235 if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY) 286 return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
236 return 0;
237
238 /* Only trap gates (type 15) can go direct to the Guest. Interrupt
239 * gates (type 14) disable interrupts as they are entered, which we
240 * never let the Guest do. Not present entries (type 0x0) also can't
241 * go direct, of course 8) */
242 return idt_type(trap->a, trap->b) == 0xF;
243} 287}
244/*:*/ 288/*:*/
245 289
@@ -348,15 +392,11 @@ void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
348 * to copy this again. */ 392 * to copy this again. */
349 lg->changed |= CHANGED_IDT; 393 lg->changed |= CHANGED_IDT;
350 394
351 /* The IDT which we keep in "struct lguest" only contains 32 entries 395 /* Check that the Guest doesn't try to step outside the bounds. */
352 * for the traps and LGUEST_IRQS (32) entries for interrupts. We 396 if (num >= ARRAY_SIZE(lg->arch.idt))
353 * ignore attempts to set handlers for higher interrupt numbers, except 397 kill_guest(lg, "Setting idt entry %u", num);
354 * for the system call "interrupt" at 128: we have a special IDT entry 398 else
355 * for that. */ 399 set_trap(lg, &lg->arch.idt[num], num, lo, hi);
356 if (num < ARRAY_SIZE(lg->idt))
357 set_trap(lg, &lg->idt[num], num, lo, hi);
358 else if (num == SYSCALL_VECTOR)
359 set_trap(lg, &lg->syscall_idt, num, lo, hi);
360} 400}
361 401
362/* The default entry for each interrupt points into the Switcher routines which 402/* The default entry for each interrupt points into the Switcher routines which
@@ -399,20 +439,21 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
399 439
400 /* We can simply copy the direct traps, otherwise we use the default 440 /* We can simply copy the direct traps, otherwise we use the default
401 * ones in the Switcher: they will return to the Host. */ 441 * ones in the Switcher: they will return to the Host. */
402 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) { 442 for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) {
403 if (direct_trap(lg, &lg->idt[i], i)) 443 /* If no Guest can ever override this trap, leave it alone. */
404 idt[i] = lg->idt[i]; 444 if (!direct_trap(i))
445 continue;
446
447 /* Only trap gates (type 15) can go direct to the Guest.
448 * Interrupt gates (type 14) disable interrupts as they are
449 * entered, which we never let the Guest do. Not present
450 * entries (type 0x0) also can't go direct, of course. */
451 if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF)
452 idt[i] = lg->arch.idt[i];
405 else 453 else
454 /* Reset it to the default. */
406 default_idt_entry(&idt[i], i, def[i]); 455 default_idt_entry(&idt[i], i, def[i]);
407 } 456 }
408
409 /* Don't forget the system call trap! The IDT entries for other
410 * interrupts never change, so no need to copy them. */
411 i = SYSCALL_VECTOR;
412 if (direct_trap(lg, &lg->syscall_idt, i))
413 idt[i] = lg->syscall_idt;
414 else
415 default_idt_entry(&idt[i], i, def[i]);
416} 457}
417 458
418void guest_set_clockevent(struct lguest *lg, unsigned long delta) 459void guest_set_clockevent(struct lguest *lg, unsigned long delta)
diff --git a/drivers/lguest/io.c b/drivers/lguest/io.c
deleted file mode 100644
index ea68613b43f..00000000000
--- a/drivers/lguest/io.c
+++ /dev/null
@@ -1,626 +0,0 @@
1/*P:300 The I/O mechanism in lguest is simple yet flexible, allowing the Guest
2 * to talk to the Launcher or directly to another Guest. It uses familiar
3 * concepts of DMA and interrupts, plus some neat code stolen from
4 * futexes... :*/
5
6/* Copyright (C) 2006 Rusty Russell IBM Corporation
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22#include <linux/types.h>
23#include <linux/futex.h>
24#include <linux/jhash.h>
25#include <linux/mm.h>
26#include <linux/highmem.h>
27#include <linux/uaccess.h>
28#include "lg.h"
29
30/*L:300
31 * I/O
32 *
33 * Getting data in and out of the Guest is quite an art. There are numerous
34 * ways to do it, and they all suck differently. We try to keep things fairly
35 * close to "real" hardware so our Guest's drivers don't look like an alien
36 * visitation in the middle of the Linux code, and yet make sure that Guests
37 * can talk directly to other Guests, not just the Launcher.
38 *
39 * To do this, the Guest gives us a key when it binds or sends DMA buffers.
40 * The key corresponds to a "physical" address inside the Guest (ie. a virtual
41 * address inside the Launcher process). We don't, however, use this key
42 * directly.
43 *
44 * We want Guests which share memory to be able to DMA to each other: two
45 * Launchers can mmap memory from the same file, then the Guests can communicate.
46 * Fortunately, the futex code provides us with a way to get a "union
47 * futex_key" corresponding to the memory lying at a virtual address: if the
48 * two processes share memory, the "union futex_key" for that memory will match
49 * even if the memory is mapped at different addresses in each. So we always
50 * convert the keys to "union futex_key"s to compare them.
51 *
52 * Before we dive into this though, we need to look at another set of helper
53 * routines used throughout the Host kernel code to access Guest memory.
54 :*/
55static struct list_head dma_hash[61];
56
57/* An unfortunate side effect of the Linux double-linked list implementation is
58 * that there's no good way to statically initialize an array of linked
59 * lists. */
60void lguest_io_init(void)
61{
62 unsigned int i;
63
64 for (i = 0; i < ARRAY_SIZE(dma_hash); i++)
65 INIT_LIST_HEAD(&dma_hash[i]);
66}
67
68/* FIXME: allow multi-page lengths. */
69static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma)
70{
71 unsigned int i;
72
73 for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
74 if (!dma->len[i])
75 return 1;
76 if (!lguest_address_ok(lg, dma->addr[i], dma->len[i]))
77 goto kill;
78 if (dma->len[i] > PAGE_SIZE)
79 goto kill;
80 /* We could do over a page, but is it worth it? */
81 if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE)
82 goto kill;
83 }
84 return 1;
85
86kill:
87 kill_guest(lg, "bad DMA entry: %u@%#lx", dma->len[i], dma->addr[i]);
88 return 0;
89}
90
91/*L:330 This is our hash function, using the wonderful Jenkins hash.
92 *
93 * The futex key is a union with three parts: an unsigned long word, a pointer,
94 * and an int "offset". We could use jhash_2words() which takes three u32s.
95 * (Ok, the hash functions are great: the naming sucks though).
96 *
97 * It's nice to be portable to 64-bit platforms, so we use the more generic
98 * jhash2(), which takes an array of u32, the number of u32s, and an initial
99 * u32 to roll in. This is uglier, but breaks down to almost the same code on
100 * 32-bit platforms like this one.
101 *
102 * We want a position in the array, so we modulo ARRAY_SIZE(dma_hash) (ie. 61).
103 */
104static unsigned int hash(const union futex_key *key)
105{
106 return jhash2((u32*)&key->both.word,
107 (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
108 key->both.offset)
109 % ARRAY_SIZE(dma_hash);
110}
111
112/* This is a convenience routine to compare two keys. It's a much bemoaned C
113 * weakness that it doesn't allow '==' on structures or unions, so we have to
114 * open-code it like this. */
115static inline int key_eq(const union futex_key *a, const union futex_key *b)
116{
117 return (a->both.word == b->both.word
118 && a->both.ptr == b->both.ptr
119 && a->both.offset == b->both.offset);
120}
121
122/*L:360 OK, when we need to actually free up a Guest's DMA array we do several
123 * things, so we have a convenient function to do it.
124 *
125 * The caller must hold a read lock on dmainfo owner's current->mm->mmap_sem
126 * for the drop_futex_key_refs(). */
127static void unlink_dma(struct lguest_dma_info *dmainfo)
128{
129 /* You locked this too, right? */
130 BUG_ON(!mutex_is_locked(&lguest_lock));
131 /* This is how we know that the entry is free. */
132 dmainfo->interrupt = 0;
133 /* Remove it from the hash table. */
134 list_del(&dmainfo->list);
135 /* Drop the references we were holding (to the inode or mm). */
136 drop_futex_key_refs(&dmainfo->key);
137}
138
139/*L:350 This is the routine which we call when the Guest asks to unregister a
140 * DMA array attached to a given key. Returns true if the array was found. */
141static int unbind_dma(struct lguest *lg,
142 const union futex_key *key,
143 unsigned long dmas)
144{
145 int i, ret = 0;
146
147 /* We don't bother with the hash table, just look through all this
148 * Guest's DMA arrays. */
149 for (i = 0; i < LGUEST_MAX_DMA; i++) {
150 /* In theory it could have more than one array on the same key,
151 * or one array on multiple keys, so we check both */
152 if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) {
153 unlink_dma(&lg->dma[i]);
154 ret = 1;
155 break;
156 }
157 }
158 return ret;
159}
160
161/*L:340 BIND_DMA: this is the hypercall which sets up an array of "struct
162 * lguest_dma" for receiving I/O.
163 *
164 * The Guest wants to bind an array of "struct lguest_dma"s to a particular key
165 * to receive input. This only happens when the Guest is setting up a new
166 * device, so it doesn't have to be very fast.
167 *
168 * It returns 1 on a successful registration (it can fail if we hit the limit
169 * of registrations for this Guest).
170 */
171int bind_dma(struct lguest *lg,
172 unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt)
173{
174 unsigned int i;
175 int ret = 0;
176 union futex_key key;
177 /* Futex code needs the mmap_sem. */
178 struct rw_semaphore *fshared = &current->mm->mmap_sem;
179
180 /* Invalid interrupt? (We could kill the guest here). */
181 if (interrupt >= LGUEST_IRQS)
182 return 0;
183
184 /* We need to grab the Big Lguest Lock, because other Guests may be
185 * trying to look through this Guest's DMAs to send something while
186 * we're doing this. */
187 mutex_lock(&lguest_lock);
188 down_read(fshared);
189 if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
190 kill_guest(lg, "bad dma key %#lx", ukey);
191 goto unlock;
192 }
193
194 /* We want to keep this key valid once we drop mmap_sem, so we have to
195 * hold a reference. */
196 get_futex_key_refs(&key);
197
198 /* If the Guest specified an interrupt of 0, that means they want to
199 * unregister this array of "struct lguest_dma"s. */
200 if (interrupt == 0)
201 ret = unbind_dma(lg, &key, dmas);
202 else {
203 /* Look through this Guest's dma array for an unused entry. */
204 for (i = 0; i < LGUEST_MAX_DMA; i++) {
205 /* If the interrupt is non-zero, the entry is already
206 * used. */
207 if (lg->dma[i].interrupt)
208 continue;
209
210 /* OK, a free one! Fill in our details. */
211 lg->dma[i].dmas = dmas;
212 lg->dma[i].num_dmas = numdmas;
213 lg->dma[i].next_dma = 0;
214 lg->dma[i].key = key;
215 lg->dma[i].guestid = lg->guestid;
216 lg->dma[i].interrupt = interrupt;
217
218 /* Now we add it to the hash table: the position
219 * depends on the futex key that we got. */
220 list_add(&lg->dma[i].list, &dma_hash[hash(&key)]);
221 /* Success! */
222 ret = 1;
223 goto unlock;
224 }
225 }
226 /* If we didn't find a slot to put the key in, drop the reference
227 * again. */
228 drop_futex_key_refs(&key);
229unlock:
230 /* Unlock and out. */
231 up_read(fshared);
232 mutex_unlock(&lguest_lock);
233 return ret;
234}
235
236/*L:385 Note that our routines to access a different Guest's memory are called
237 * lgread_other() and lgwrite_other(): these names emphasize that they are only
238 * used when the Guest is *not* the current Guest.
239 *
240 * The interface for copying from another process's memory is called
241 * access_process_vm(), with a final argument of 0 for a read, and 1 for a
242 * write.
243 *
244 * We need lgread_other() to read the destination Guest's "struct lguest_dma"
245 * array. */
246static int lgread_other(struct lguest *lg,
247 void *buf, u32 addr, unsigned bytes)
248{
249 if (!lguest_address_ok(lg, addr, bytes)
250 || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) {
251 memset(buf, 0, bytes);
252 kill_guest(lg, "bad address in registered DMA struct");
253 return 0;
254 }
255 return 1;
256}
257
258/* "lgwrite()" to another Guest: used to update the destination "used_len" once
259 * we've transferred data into the buffer. */
260static int lgwrite_other(struct lguest *lg, u32 addr,
261 const void *buf, unsigned bytes)
262{
263 if (!lguest_address_ok(lg, addr, bytes)
264 || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1)
265 != bytes)) {
266 kill_guest(lg, "bad address writing to registered DMA");
267 return 0;
268 }
269 return 1;
270}
271
272/*L:400 This is the generic engine which copies from a source "struct
273 * lguest_dma" from this Guest into another Guest's "struct lguest_dma". The
274 * destination Guest's pages have already been mapped, as contained in the
275 * pages array.
276 *
277 * If you're wondering if there's a nice "copy from one process to another"
278 * routine, so was I. But Linux isn't really set up to copy between two
279 * unrelated processes, so we have to write it ourselves.
280 */
281static u32 copy_data(struct lguest *srclg,
282 const struct lguest_dma *src,
283 const struct lguest_dma *dst,
284 struct page *pages[])
285{
286 unsigned int totlen, si, di, srcoff, dstoff;
287 void *maddr = NULL;
288
289 /* We return the total length transferred. */
290 totlen = 0;
291
292 /* We keep indexes into the source and destination "struct lguest_dma",
293 * and an offset within each region. */
294 si = di = 0;
295 srcoff = dstoff = 0;
296
297 /* We loop until the source or destination is exhausted. */
298 while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si]
299 && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) {
300 /* We can only transfer the rest of the src buffer, or as much
301 * as will fit into the destination buffer. */
302 u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff);
303
304 /* For systems using "highmem" we need to use kmap() to access
305 * the page we want. We often use the same page over and over,
306 * so rather than kmap() it on every loop, we set the maddr
307 * pointer to NULL when we need to move to the next
308 * destination page. */
309 if (!maddr)
310 maddr = kmap(pages[di]);
311
312 /* Copy directly from (this Guest's) source address to the
313 * destination Guest's kmap()ed buffer. Note that maddr points
314 * to the start of the page: we need to add the offset of the
315 * destination address and offset within the buffer. */
316
317 /* FIXME: This is not completely portable. I looked at
318 * copy_to_user_page(), and some arch's seem to need special
319 * flushes. x86 is fine. */
320 if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE,
321 (void __user *)src->addr[si], len) != 0) {
322 /* If a copy failed, it's the source's fault. */
323 kill_guest(srclg, "bad address in sending DMA");
324 totlen = 0;
325 break;
326 }
327
328 /* Increment the total and src & dst offsets */
329 totlen += len;
330 srcoff += len;
331 dstoff += len;
332
333 /* Presumably we reached the end of the src or dest buffers: */
334 if (srcoff == src->len[si]) {
335 /* Move to the next buffer at offset 0 */
336 si++;
337 srcoff = 0;
338 }
339 if (dstoff == dst->len[di]) {
340 /* We need to unmap that destination page and reset
341 * maddr ready for the next one. */
342 kunmap(pages[di]);
343 maddr = NULL;
344 di++;
345 dstoff = 0;
346 }
347 }
348
349 /* If we still had a page mapped at the end, unmap now. */
350 if (maddr)
351 kunmap(pages[di]);
352
353 return totlen;
354}
355
356/*L:390 This is how we transfer a "struct lguest_dma" from the source Guest
357 * (the current Guest which called SEND_DMA) to another Guest. */
358static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src,
359 struct lguest *dstlg, const struct lguest_dma *dst)
360{
361 int i;
362 u32 ret;
363 struct page *pages[LGUEST_MAX_DMA_SECTIONS];
364
365 /* We check that both source and destination "struct lguest_dma"s are
366 * within the bounds of the source and destination Guests */
367 if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src))
368 return 0;
369
370 /* We need to map the pages which correspond to each part of the
371 * destination buffer. */
372 for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
373 if (dst->len[i] == 0)
374 break;
375 /* get_user_pages() is a complicated function, especially since
376 * we only want a single page. But it works, and returns the
377 * number of pages. Note that we're holding the destination's
378 * mmap_sem, as get_user_pages() requires. */
379 if (get_user_pages(dstlg->tsk, dstlg->mm,
380 dst->addr[i], 1, 1, 1, pages+i, NULL)
381 != 1) {
382 /* This means the destination gave us a bogus buffer */
383 kill_guest(dstlg, "Error mapping DMA pages");
384 ret = 0;
385 goto drop_pages;
386 }
387 }
388
389 /* Now copy the data until we run out of src or dst. */
390 ret = copy_data(srclg, src, dst, pages);
391
392drop_pages:
393 while (--i >= 0)
394 put_page(pages[i]);
395 return ret;
396}
397
398/*L:380 Transferring data from one Guest to another is not as simple as I'd
399 * like. We've found the "struct lguest_dma_info" bound to the same address as
400 * the send, we need to copy into it.
401 *
402 * This function returns true if the destination array was empty. */
403static int dma_transfer(struct lguest *srclg,
404 unsigned long udma,
405 struct lguest_dma_info *dst)
406{
407 struct lguest_dma dst_dma, src_dma;
408 struct lguest *dstlg;
409 u32 i, dma = 0;
410
411 /* From the "struct lguest_dma_info" we found in the hash, grab the
412 * Guest. */
413 dstlg = &lguests[dst->guestid];
414 /* Read in the source "struct lguest_dma" handed to SEND_DMA. */
415 lgread(srclg, &src_dma, udma, sizeof(src_dma));
416
417 /* We need the destination's mmap_sem, and we already hold the source's
418 * mmap_sem for the futex key lookup. Normally this would suggest that
419 * we could deadlock if the destination Guest was trying to send to
420 * this source Guest at the same time, which is another reason that all
421 * I/O is done under the big lguest_lock. */
422 down_read(&dstlg->mm->mmap_sem);
423
424 /* Look through the destination DMA array for an available buffer. */
425 for (i = 0; i < dst->num_dmas; i++) {
426 /* We keep a "next_dma" pointer which often helps us avoid
427 * looking at lots of previously-filled entries. */
428 dma = (dst->next_dma + i) % dst->num_dmas;
429 if (!lgread_other(dstlg, &dst_dma,
430 dst->dmas + dma * sizeof(struct lguest_dma),
431 sizeof(dst_dma))) {
432 goto fail;
433 }
434 if (!dst_dma.used_len)
435 break;
436 }
437
438 /* If we found a buffer, we do the actual data copy. */
439 if (i != dst->num_dmas) {
440 unsigned long used_lenp;
441 unsigned int ret;
442
443 ret = do_dma(srclg, &src_dma, dstlg, &dst_dma);
444 /* Put used length in the source "struct lguest_dma"'s used_len
445 * field. It's a little tricky to figure out where that is,
446 * though. */
447 lgwrite_u32(srclg,
448 udma+offsetof(struct lguest_dma, used_len), ret);
449 /* Transferring 0 bytes is OK if the source buffer was empty. */
450 if (ret == 0 && src_dma.len[0] != 0)
451 goto fail;
452
453 /* The destination Guest might be running on a different CPU:
454 * we have to make sure that it will see the "used_len" field
455 * change to non-zero *after* it sees the data we copied into
456 * the buffer. Hence a write memory barrier. */
457 wmb();
458 /* Figuring out where the destination's used_len field for this
459 * "struct lguest_dma" in the array is also a little ugly. */
460 used_lenp = dst->dmas
461 + dma * sizeof(struct lguest_dma)
462 + offsetof(struct lguest_dma, used_len);
463 lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret));
464 /* Move the cursor for next time. */
465 dst->next_dma++;
466 }
467 up_read(&dstlg->mm->mmap_sem);
468
469 /* We trigger the destination interrupt, even if the destination was
470 * empty and we didn't transfer anything: this gives them a chance to
471 * wake up and refill. */
472 set_bit(dst->interrupt, dstlg->irqs_pending);
473 /* Wake up the destination process. */
474 wake_up_process(dstlg->tsk);
475 /* If we passed the last "struct lguest_dma", the receive had no
476 * buffers left. */
477 return i == dst->num_dmas;
478
479fail:
480 up_read(&dstlg->mm->mmap_sem);
481 return 0;
482}
483
484/*L:370 This is the counter-side to the BIND_DMA hypercall; the SEND_DMA
485 * hypercall. We find out who's listening, and send to them. */
486void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma)
487{
488 union futex_key key;
489 int empty = 0;
490 struct rw_semaphore *fshared = &current->mm->mmap_sem;
491
492again:
493 mutex_lock(&lguest_lock);
494 down_read(fshared);
495 /* Get the futex key for the key the Guest gave us */
496 if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
497 kill_guest(lg, "bad sending DMA key");
498 goto unlock;
499 }
500 /* Since the key must be a multiple of 4, the futex key uses the lower
501 * bit of the "offset" field (which would always be 0) to indicate a
502 * mapping which is shared with other processes (ie. Guests). */
503 if (key.shared.offset & 1) {
504 struct lguest_dma_info *i;
505 /* Look through the hash for other Guests. */
506 list_for_each_entry(i, &dma_hash[hash(&key)], list) {
507 /* Don't send to ourselves. */
508 if (i->guestid == lg->guestid)
509 continue;
510 if (!key_eq(&key, &i->key))
511 continue;
512
513 /* If dma_transfer() tells us the destination has no
514 * available buffers, we increment "empty". */
515 empty += dma_transfer(lg, udma, i);
516 break;
517 }
518 /* If the destination is empty, we release our locks and
519 * give the destination Guest a brief chance to restock. */
520 if (empty == 1) {
521 /* Give any recipients one chance to restock. */
522 up_read(&current->mm->mmap_sem);
523 mutex_unlock(&lguest_lock);
524 /* Next time, we won't try again. */
525 empty++;
526 goto again;
527 }
528 } else {
529 /* Private mapping: Guest is sending to its Launcher. We set
530 * the "dma_is_pending" flag so that the main loop will exit
531 * and the Launcher's read() from /dev/lguest will return. */
532 lg->dma_is_pending = 1;
533 lg->pending_dma = udma;
534 lg->pending_key = ukey;
535 }
536unlock:
537 up_read(fshared);
538 mutex_unlock(&lguest_lock);
539}
540/*:*/
541
542void release_all_dma(struct lguest *lg)
543{
544 unsigned int i;
545
546 BUG_ON(!mutex_is_locked(&lguest_lock));
547
548 down_read(&lg->mm->mmap_sem);
549 for (i = 0; i < LGUEST_MAX_DMA; i++) {
550 if (lg->dma[i].interrupt)
551 unlink_dma(&lg->dma[i]);
552 }
553 up_read(&lg->mm->mmap_sem);
554}
555
556/*M:007 We only return a single DMA buffer to the Launcher, but it would be
557 * more efficient to return a pointer to the entire array of DMA buffers, which
558 * it can cache and choose one whenever it wants.
559 *
560 * Currently the Launcher uses a write to /dev/lguest, and the return value is
561 * the address of the DMA structure with the interrupt number placed in
562 * dma->used_len. If we wanted to return the entire array, we need to return
563 * the address, array size and interrupt number: this seems to require an
564 * ioctl(). :*/
565
566/*L:320 This routine looks for a DMA buffer registered by the Guest on the
567 * given key (using the BIND_DMA hypercall). */
568unsigned long get_dma_buffer(struct lguest *lg,
569 unsigned long ukey, unsigned long *interrupt)
570{
571 unsigned long ret = 0;
572 union futex_key key;
573 struct lguest_dma_info *i;
574 struct rw_semaphore *fshared = &current->mm->mmap_sem;
575
576 /* Take the Big Lguest Lock to stop other Guests sending this Guest DMA
577 * at the same time. */
578 mutex_lock(&lguest_lock);
579 /* To match between Guests sharing the same underlying memory we steal
580 * code from the futex infrastructure. This requires that we hold the
581 * "mmap_sem" for our process (the Launcher), and pass it to the futex
582 * code. */
583 down_read(fshared);
584
585 /* This can fail if it's not a valid address, or if the address is not
586 * divisible by 4 (the futex code needs that, we don't really). */
587 if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
588 kill_guest(lg, "bad registered DMA buffer");
589 goto unlock;
590 }
591 /* Search the hash table for matching entries (the Launcher can only
592 * send to its own Guest for the moment, so the entry must be for this
593 * Guest) */
594 list_for_each_entry(i, &dma_hash[hash(&key)], list) {
595 if (key_eq(&key, &i->key) && i->guestid == lg->guestid) {
596 unsigned int j;
597 /* Look through the registered DMA array for an
598 * available buffer. */
599 for (j = 0; j < i->num_dmas; j++) {
600 struct lguest_dma dma;
601
602 ret = i->dmas + j * sizeof(struct lguest_dma);
603 lgread(lg, &dma, ret, sizeof(dma));
604 if (dma.used_len == 0)
605 break;
606 }
607 /* Store the interrupt the Guest wants when the buffer
608 * is used. */
609 *interrupt = i->interrupt;
610 break;
611 }
612 }
613unlock:
614 up_read(fshared);
615 mutex_unlock(&lguest_lock);
616 return ret;
617}
618/*:*/
619
620/*L:410 This really has completed the Launcher. Not only have we now finished
621 * the longest chapter in our journey, but this also means we are over halfway
622 * through!
623 *
624 * Enough prevaricating around the bush: it is time for us to dive into the
625 * core of the Host, in "make Host".
626 */
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 64f0abed317..d9144beca82 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -1,119 +1,25 @@
1#ifndef _LGUEST_H 1#ifndef _LGUEST_H
2#define _LGUEST_H 2#define _LGUEST_H
3 3
4#include <asm/desc.h>
5
6#define GDT_ENTRY_LGUEST_CS 10
7#define GDT_ENTRY_LGUEST_DS 11
8#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
9#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
10
11#ifndef __ASSEMBLY__ 4#ifndef __ASSEMBLY__
12#include <linux/types.h> 5#include <linux/types.h>
13#include <linux/init.h> 6#include <linux/init.h>
14#include <linux/stringify.h> 7#include <linux/stringify.h>
15#include <linux/binfmts.h>
16#include <linux/futex.h>
17#include <linux/lguest.h> 8#include <linux/lguest.h>
18#include <linux/lguest_launcher.h> 9#include <linux/lguest_launcher.h>
19#include <linux/wait.h> 10#include <linux/wait.h>
20#include <linux/err.h> 11#include <linux/err.h>
21#include <asm/semaphore.h> 12#include <asm/semaphore.h>
22#include "irq_vectors.h"
23
24#define GUEST_PL 1
25 13
26struct lguest_regs 14#include <asm/lguest.h>
27{
28 /* Manually saved part. */
29 unsigned long ebx, ecx, edx;
30 unsigned long esi, edi, ebp;
31 unsigned long gs;
32 unsigned long eax;
33 unsigned long fs, ds, es;
34 unsigned long trapnum, errcode;
35 /* Trap pushed part */
36 unsigned long eip;
37 unsigned long cs;
38 unsigned long eflags;
39 unsigned long esp;
40 unsigned long ss;
41};
42 15
43void free_pagetables(void); 16void free_pagetables(void);
44int init_pagetables(struct page **switcher_page, unsigned int pages); 17int init_pagetables(struct page **switcher_page, unsigned int pages);
45 18
46/* Full 4G segment descriptors, suitable for CS and DS. */
47#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
48#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})
49
50struct lguest_dma_info
51{
52 struct list_head list;
53 union futex_key key;
54 unsigned long dmas;
55 u16 next_dma;
56 u16 num_dmas;
57 u16 guestid;
58 u8 interrupt; /* 0 when not registered */
59};
60
61/*H:310 The page-table code owes a great debt of gratitude to Andi Kleen. He
62 * reviewed the original code which used "u32" for all page table entries, and
63 * insisted that it would be far clearer with explicit typing. I thought it
64 * was overkill, but he was right: it is much clearer than it was before.
65 *
66 * We have separate types for the Guest's ptes & pgds and the shadow ptes &
67 * pgds. There's already a Linux type for these (pte_t and pgd_t) but they
68 * change depending on kernel config options (PAE). */
69
70/* Each entry is identical: lower 12 bits of flags and upper 20 bits for the
71 * "page frame number" (0 == first physical page, etc). They are different
72 * types so the compiler will warn us if we mix them improperly. */
73typedef union {
74 struct { unsigned flags:12, pfn:20; };
75 struct { unsigned long val; } raw;
76} spgd_t;
77typedef union {
78 struct { unsigned flags:12, pfn:20; };
79 struct { unsigned long val; } raw;
80} spte_t;
81typedef union {
82 struct { unsigned flags:12, pfn:20; };
83 struct { unsigned long val; } raw;
84} gpgd_t;
85typedef union {
86 struct { unsigned flags:12, pfn:20; };
87 struct { unsigned long val; } raw;
88} gpte_t;
89
90/* We have two convenient macros to convert a "raw" value as handed to us by
91 * the Guest into the correct Guest PGD or PTE type. */
92#define mkgpte(_val) ((gpte_t){.raw.val = _val})
93#define mkgpgd(_val) ((gpgd_t){.raw.val = _val})
94/*:*/
95
96struct pgdir 19struct pgdir
97{ 20{
98 unsigned long cr3; 21 unsigned long gpgdir;
99 spgd_t *pgdir; 22 pgd_t *pgdir;
100};
101
102/* This is a guest-specific page (mapped ro) into the guest. */
103struct lguest_ro_state
104{
105 /* Host information we need to restore when we switch back. */
106 u32 host_cr3;
107 struct Xgt_desc_struct host_idt_desc;
108 struct Xgt_desc_struct host_gdt_desc;
109 u32 host_sp;
110
111 /* Fields which are used when guest is running. */
112 struct Xgt_desc_struct guest_idt_desc;
113 struct Xgt_desc_struct guest_gdt_desc;
114 struct i386_hw_tss guest_tss;
115 struct desc_struct guest_idt[IDT_ENTRIES];
116 struct desc_struct guest_gdt[GDT_ENTRIES];
117}; 23};
118 24
119/* We have two pages shared with guests, per cpu. */ 25/* We have two pages shared with guests, per cpu. */
@@ -141,9 +47,11 @@ struct lguest
141 struct lguest_data __user *lguest_data; 47 struct lguest_data __user *lguest_data;
142 struct task_struct *tsk; 48 struct task_struct *tsk;
143 struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ 49 struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
144 u16 guestid;
145 u32 pfn_limit; 50 u32 pfn_limit;
146 u32 page_offset; 51 /* This provides the offset to the base of guest-physical
52 * memory in the Launcher. */
53 void __user *mem_base;
54 unsigned long kernel_address;
147 u32 cr2; 55 u32 cr2;
148 int halted; 56 int halted;
149 int ts; 57 int ts;
@@ -151,6 +59,9 @@ struct lguest
151 u32 esp1; 59 u32 esp1;
152 u8 ss1; 60 u8 ss1;
153 61
62 /* If a hypercall was asked for, this points to the arguments. */
63 struct hcall_args *hcall;
64
154 /* Do we need to stop what we're doing and return to userspace? */ 65 /* Do we need to stop what we're doing and return to userspace? */
155 int break_out; 66 int break_out;
156 wait_queue_head_t break_wq; 67 wait_queue_head_t break_wq;
@@ -167,24 +78,15 @@ struct lguest
167 struct task_struct *wake; 78 struct task_struct *wake;
168 79
169 unsigned long noirq_start, noirq_end; 80 unsigned long noirq_start, noirq_end;
170 int dma_is_pending; 81 unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
171 unsigned long pending_dma; /* struct lguest_dma */
172 unsigned long pending_key; /* address they're sending to */
173 82
174 unsigned int stack_pages; 83 unsigned int stack_pages;
175 u32 tsc_khz; 84 u32 tsc_khz;
176 85
177 struct lguest_dma_info dma[LGUEST_MAX_DMA];
178
179 /* Dead? */ 86 /* Dead? */
180 const char *dead; 87 const char *dead;
181 88
182 /* The GDT entries copied into lguest_ro_state when running. */ 89 struct lguest_arch arch;
183 struct desc_struct gdt[GDT_ENTRIES];
184
185 /* The IDT entries: some copied into lguest_ro_state when running. */
186 struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS];
187 struct desc_struct syscall_idt;
188 90
189 /* Virtual clock device */ 91 /* Virtual clock device */
190 struct hrtimer hrt; 92 struct hrtimer hrt;
@@ -193,19 +95,38 @@ struct lguest
193 DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); 95 DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
194}; 96};
195 97
196extern struct lguest lguests[];
197extern struct mutex lguest_lock; 98extern struct mutex lguest_lock;
198 99
199/* core.c: */ 100/* core.c: */
200u32 lgread_u32(struct lguest *lg, unsigned long addr);
201void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val);
202void lgread(struct lguest *lg, void *buf, unsigned long addr, unsigned len);
203void lgwrite(struct lguest *lg, unsigned long, const void *buf, unsigned len);
204int find_free_guest(void);
205int lguest_address_ok(const struct lguest *lg, 101int lguest_address_ok(const struct lguest *lg,
206 unsigned long addr, unsigned long len); 102 unsigned long addr, unsigned long len);
103void __lgread(struct lguest *, void *, unsigned long, unsigned);
104void __lgwrite(struct lguest *, unsigned long, const void *, unsigned);
105
106/*L:306 Using memory-copy operations like that is usually inconvenient, so we
107 * have the following helper macros which read and write a specific type (often
108 * an unsigned long).
109 *
110 * This reads into a variable of the given type then returns that. */
111#define lgread(lg, addr, type) \
112 ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; })
113
114/* This checks that the variable is of the given type, then writes it out. */
115#define lgwrite(lg, addr, type, val) \
116 do { \
117 typecheck(type, val); \
118 __lgwrite((lg), (addr), &(val), sizeof(val)); \
119 } while(0)
120/* (end of memory access helper routines) :*/
121
207int run_guest(struct lguest *lg, unsigned long __user *user); 122int run_guest(struct lguest *lg, unsigned long __user *user);
208 123
124/* Helper macros to obtain the first 12 or the last 20 bits, this is only the
125 * first step in the migration to the kernel types. pte_pfn is already defined
126 * in the kernel. */
127#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK)
128#define pte_flags(x) (pte_val(x) & ~PAGE_MASK)
129#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
209 130
210/* interrupts_and_traps.c: */ 131/* interrupts_and_traps.c: */
211void maybe_do_interrupt(struct lguest *lg); 132void maybe_do_interrupt(struct lguest *lg);
@@ -219,6 +140,9 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
219 const unsigned long *def); 140 const unsigned long *def);
220void guest_set_clockevent(struct lguest *lg, unsigned long delta); 141void guest_set_clockevent(struct lguest *lg, unsigned long delta);
221void init_clockdev(struct lguest *lg); 142void init_clockdev(struct lguest *lg);
143bool check_syscall_vector(struct lguest *lg);
144int init_interrupts(void);
145void free_interrupts(void);
222 146
223/* segments.c: */ 147/* segments.c: */
224void setup_default_gdt_entries(struct lguest_ro_state *state); 148void setup_default_gdt_entries(struct lguest_ro_state *state);
@@ -232,28 +156,33 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt);
232int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); 156int init_guest_pagetable(struct lguest *lg, unsigned long pgtable);
233void free_guest_pagetable(struct lguest *lg); 157void free_guest_pagetable(struct lguest *lg);
234void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); 158void guest_new_pagetable(struct lguest *lg, unsigned long pgtable);
235void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i); 159void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
236void guest_pagetable_clear_all(struct lguest *lg); 160void guest_pagetable_clear_all(struct lguest *lg);
237void guest_pagetable_flush_user(struct lguest *lg); 161void guest_pagetable_flush_user(struct lguest *lg);
238void guest_set_pte(struct lguest *lg, unsigned long cr3, 162void guest_set_pte(struct lguest *lg, unsigned long gpgdir,
239 unsigned long vaddr, gpte_t val); 163 unsigned long vaddr, pte_t val);
240void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); 164void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
241int demand_page(struct lguest *info, unsigned long cr2, int errcode); 165int demand_page(struct lguest *info, unsigned long cr2, int errcode);
242void pin_page(struct lguest *lg, unsigned long vaddr); 166void pin_page(struct lguest *lg, unsigned long vaddr);
167unsigned long guest_pa(struct lguest *lg, unsigned long vaddr);
168void page_table_guest_data_init(struct lguest *lg);
169
170/* <arch>/core.c: */
171void lguest_arch_host_init(void);
172void lguest_arch_host_fini(void);
173void lguest_arch_run_guest(struct lguest *lg);
174void lguest_arch_handle_trap(struct lguest *lg);
175int lguest_arch_init_hypercalls(struct lguest *lg);
176int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args);
177void lguest_arch_setup_regs(struct lguest *lg, unsigned long start);
178
179/* <arch>/switcher.S: */
180extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
243 181
244/* lguest_user.c: */ 182/* lguest_user.c: */
245int lguest_device_init(void); 183int lguest_device_init(void);
246void lguest_device_remove(void); 184void lguest_device_remove(void);
247 185
248/* io.c: */
249void lguest_io_init(void);
250int bind_dma(struct lguest *lg,
251 unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt);
252void send_dma(struct lguest *info, unsigned long key, unsigned long udma);
253void release_all_dma(struct lguest *lg);
254unsigned long get_dma_buffer(struct lguest *lg, unsigned long key,
255 unsigned long *interrupt);
256
257/* hypercalls.c: */ 186/* hypercalls.c: */
258void do_hypercalls(struct lguest *lg); 187void do_hypercalls(struct lguest *lg);
259void write_timestamp(struct lguest *lg); 188void write_timestamp(struct lguest *lg);
@@ -292,9 +221,5 @@ do { \
292} while(0) 221} while(0)
293/* (End of aside) :*/ 222/* (End of aside) :*/
294 223
295static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
296{
297 return vaddr - lg->page_offset;
298}
299#endif /* __ASSEMBLY__ */ 224#endif /* __ASSEMBLY__ */
300#endif /* _LGUEST_H */ 225#endif /* _LGUEST_H */
diff --git a/drivers/lguest/lguest_bus.c b/drivers/lguest/lguest_bus.c
deleted file mode 100644
index 57329788f8a..00000000000
--- a/drivers/lguest/lguest_bus.c
+++ /dev/null
@@ -1,218 +0,0 @@
1/*P:050 Lguest guests use a very simple bus for devices. It's a simple array
2 * of device descriptors contained just above the top of normal memory. The
3 * lguest bus is 80% tedious boilerplate code. :*/
4#include <linux/init.h>
5#include <linux/bootmem.h>
6#include <linux/lguest_bus.h>
7#include <asm/io.h>
8#include <asm/paravirt.h>
9
10static ssize_t type_show(struct device *_dev,
11 struct device_attribute *attr, char *buf)
12{
13 struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
14 return sprintf(buf, "%hu", lguest_devices[dev->index].type);
15}
16static ssize_t features_show(struct device *_dev,
17 struct device_attribute *attr, char *buf)
18{
19 struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
20 return sprintf(buf, "%hx", lguest_devices[dev->index].features);
21}
22static ssize_t pfn_show(struct device *_dev,
23 struct device_attribute *attr, char *buf)
24{
25 struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
26 return sprintf(buf, "%u", lguest_devices[dev->index].pfn);
27}
28static ssize_t status_show(struct device *_dev,
29 struct device_attribute *attr, char *buf)
30{
31 struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
32 return sprintf(buf, "%hx", lguest_devices[dev->index].status);
33}
34static ssize_t status_store(struct device *_dev, struct device_attribute *attr,
35 const char *buf, size_t count)
36{
37 struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
38 if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1)
39 return -EINVAL;
40 return count;
41}
/* Per-device sysfs attributes, installed through lguest_bus.bus.dev_attrs.
 * type, features and pfn are read-only (backed by the *_show functions
 * above); status is mode 0644 and has both a show and a store handler.
 * __ATTR_NULL terminates the table. */
42static struct device_attribute lguest_dev_attrs[] = {
43 __ATTR_RO(type),
44 __ATTR_RO(features),
45 __ATTR_RO(pfn),
46 __ATTR(status, 0644, status_show, status_store),
47 __ATTR_NULL
48};
49
50/*D:130 The generic bus infrastructure requires a function which says whether a
51 * device matches a driver. For us, it is simple: "struct lguest_driver"
52 * contains a "device_type" field which indicates what type of device it can
53 * handle, so we just cast the args and compare: */
54static int lguest_dev_match(struct device *_dev, struct device_driver *_drv)
55{
56 struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
57 struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv);
58
59 return (drv->device_type == lguest_devices[dev->index].type);
60}
61/*:*/
62
/* Bundles the two objects that get registered for the lguest bus: the
 * bus_type itself and a device that individual lguest devices use as their
 * parent (see add_lguest_device()). */
63struct lguest_bus {
64 struct bus_type bus;
65 struct device dev;
66};
67
/* The single lguest bus instance.  Both halves are registered from
 * lguest_bus_init(). */
68static struct lguest_bus lguest_bus = {
69 .bus = {
70 .name = "lguest",
 /* Decides whether a driver can handle a device (by device type). */
71 .match = lguest_dev_match,
 /* sysfs attributes every device on this bus gets. */
72 .dev_attrs = lguest_dev_attrs,
73 },
 /* Top-level device with no parent; each lguest device hangs below it. */
74 .dev = {
75 .parent = NULL,
76 .bus_id = "lguest",
77 }
78};
79
80/*D:140 This is the callback which occurs once the bus infrastructure matches
81 * up a device and driver, ie. in response to add_lguest_device() calling
82 * device_register(), or register_lguest_driver() calling driver_register().
83 *
84 * At the moment it's always the latter: the devices are added first, since
85 * scan_devices() is called from a "core_initcall", and the drivers themselves
86 * called later as a normal "initcall". But it would work the other way too.
87 *
88 * So now we have the happy couple, we add the status bit to indicate that we
89 * found a driver. If the driver truly loves the device, it will return
90 * happiness from its probe function (ok, perhaps this wasn't my greatest
91 * analogy), and we set the final "driver ok" bit so the Host sees it's all
92 * green. */
93static int lguest_dev_probe(struct device *_dev)
94{
95 int ret;
96 struct lguest_device*dev = container_of(_dev,struct lguest_device,dev);
97 struct lguest_driver*drv = container_of(dev->dev.driver,
98 struct lguest_driver, drv);
99
100 lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER;
101 ret = drv->probe(dev);
102 if (ret == 0)
103 lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK;
104 return ret;
105}
106
107/* The last part of the bus infrastructure is the function lguest drivers use
108 * to register themselves. Firstly, we do nothing if there's no lguest bus
109 * (ie. this is not a Guest), otherwise we fill in the embedded generic "struct
110 * driver" fields and call the generic driver_register(). */
111int register_lguest_driver(struct lguest_driver *drv)
112{
113 if (!lguest_devices)
114 return 0;
115
116 drv->drv.bus = &lguest_bus.bus;
117 drv->drv.name = drv->name;
118 drv->drv.owner = drv->owner;
119 drv->drv.probe = lguest_dev_probe;
120
121 return driver_register(&drv->drv);
122}
123
124/* At the moment we build all the drivers into the kernel because they're so
125 * simple: 8144 bytes for all three of them as I type this. And as the console
126 * really needs to be built in, it's actually only 3527 bytes for the network
127 * and block drivers.
128 *
129 * If they get complex it will make sense for them to be modularized, so we
130 * need to explicitly export the symbol.
131 *
132 * I don't think non-GPL modules make sense, so it's a GPL-only export.
133 */
134EXPORT_SYMBOL_GPL(register_lguest_driver);
135
136/*D:120 This is the core of the lguest bus: actually adding a new device.
137 * It's a separate function because it's neater that way, and because an
138 * earlier version of the code supported hotplug and unplug. They were removed
139 * early on because they were never used.
140 *
141 * As Andrew Tridgell says, "Untested code is buggy code".
142 *
143 * It's worth reading this carefully: we start with an index into the array of
144 * "struct lguest_device_desc"s indicating the device which is new: */
145static void add_lguest_device(unsigned int index)
146{
147 struct lguest_device *new;
148
149 /* Each "struct lguest_device_desc" has a "status" field, which the
150 * Guest updates as the device is probed. In the worst case, the Host
151 * can look at these bits to tell what part of device setup failed,
152 * even if the console isn't available. */
153 lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE;
154 new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL);
155 if (!new) {
156 printk(KERN_EMERG "Cannot allocate lguest device %u\n", index);
157 lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
158 return;
159 }
160
161 /* The "struct lguest_device" setup is pretty straight-forward example
162 * code. */
163 new->index = index;
164 new->private = NULL;
165 memset(&new->dev, 0, sizeof(new->dev));
166 new->dev.parent = &lguest_bus.dev;
167 new->dev.bus = &lguest_bus.bus;
168 sprintf(new->dev.bus_id, "%u", index);
169
170 /* device_register() causes the bus infrastructure to look for a
171 * matching driver. */
172 if (device_register(&new->dev) != 0) {
173 printk(KERN_EMERG "Cannot register lguest device %u\n", index);
174 lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
175 kfree(new);
176 }
177}
178
179/*D:110 scan_devices() simply iterates through the device array. The type 0
180 * is reserved to mean "no device", and anything else means we have found a
181 * device: add it. */
182static void scan_devices(void)
183{
184 unsigned int i;
185
186 for (i = 0; i < LGUEST_MAX_DEVICES; i++)
187 if (lguest_devices[i].type)
188 add_lguest_device(i);
189}
190
191/*D:100 Fairly early in boot, lguest_bus_init() is called to set up the lguest
192 * bus. We check that we are a Guest by checking paravirt_ops.name: there are
193 * other ways of checking, but this seems most obvious to me.
194 *
195 * So we can access the array of "struct lguest_device_desc"s easily, we map
196 * that memory and store the pointer in the global "lguest_devices". Then we
197 * register the bus with the core. Doing two registrations seems clunky to me,
198 * but it seems to be the correct sysfs incantation.
199 *
200 * Finally we call scan_devices() which adds all the devices found in the
201 * "struct lguest_device_desc" array. */
202static int __init lguest_bus_init(void)
203{
204 if (strcmp(pv_info.name, "lguest") != 0)
205 return 0;
206
207 /* Devices are in a single page above top of "normal" mem */
208 lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);
209
210 if (bus_register(&lguest_bus.bus) != 0
211 || device_register(&lguest_bus.dev) != 0)
212 panic("lguest bus registration failed");
213
214 scan_devices();
215 return 0;
216}
217/* Do this after core stuff, before devices. */
218postcore_initcall(lguest_bus_init);
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
new file mode 100644
index 00000000000..71c64837b43
--- /dev/null
+++ b/drivers/lguest/lguest_device.c
@@ -0,0 +1,373 @@
1/*P:050 Lguest guests use a very simple method to describe devices. It's a
2 * series of device descriptors contained just above the top of normal
3 * memory.
4 *
5 * We use the standard "virtio" device infrastructure, which provides us with a
6 * console, a network and a block driver. Each one expects some configuration
7 * information and a "virtqueue" mechanism to send and receive data. :*/
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/lguest_launcher.h>
11#include <linux/virtio.h>
12#include <linux/virtio_config.h>
13#include <linux/interrupt.h>
14#include <linux/virtio_ring.h>
15#include <linux/err.h>
16#include <asm/io.h>
17#include <asm/paravirt.h>
18#include <asm/lguest_hcall.h>
19
20/* The pointer to our (page) of device descriptions. */
21static void *lguest_devices;
22
23/* Unique numbering for lguest devices. */
24static unsigned int dev_index;
25
26/* For Guests, device memory can be used as normal memory, so we cast away the
27 * __iomem to quieten sparse. */
28static inline void *lguest_map(unsigned long phys_addr, unsigned long pages)
29{
30 return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages);
31}
32
33static inline void lguest_unmap(void *addr)
34{
35 iounmap((__force void __iomem *)addr);
36}
37
38/*D:100 Each lguest device is just a virtio device plus a pointer to its entry
39 * in the lguest_devices page. */
40struct lguest_device {
 /* Embedded generic virtio device; to_lgdev() uses container_of() on this
  * member to get back to the enclosing struct lguest_device. */
41 struct virtio_device vdev;
42
43 /* The entry in the lguest_devices page for this device. */
44 struct lguest_device_desc *desc;
45};
46
47/* Since the virtio infrastructure hands us a pointer to the virtio_device all
48 * the time, it helps to have a curt macro to get a pointer to the struct
49 * lguest_device it's enclosed in. */
50#define to_lgdev(vdev) container_of(vdev, struct lguest_device, vdev)
51
52/*D:130
53 * Device configurations
54 *
55 * The configuration information for a device consists of a series of fields.
56 * The device will look for these fields during setup.
57 *
58 * For us these fields come immediately after that device's descriptor in the
59 * lguest_devices page.
60 *
61 * Each field starts with a "type" byte, a "length" byte, then that number of
62 * bytes of configuration information. The device descriptor tells us the
63 * total configuration length so we know when we've reached the last field. */
64
65/* type + length bytes */
66#define FHDR_LEN 2
67
68/* This finds the first field of a given type for a device's configuration. */
69static void *lg_find(struct virtio_device *vdev, u8 type, unsigned int *len)
70{
71 struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
72 int i;
73
74 for (i = 0; i < desc->config_len; i += FHDR_LEN + desc->config[i+1]) {
75 if (desc->config[i] == type) {
76 /* Mark it used, so Host can know we looked at it, and
77 * also so we won't find the same one twice. */
78 desc->config[i] |= 0x80;
79 /* Remember, the second byte is the length. */
80 *len = desc->config[i+1];
81 /* We return a pointer to the field header. */
82 return desc->config + i;
83 }
84 }
85
86 /* Not found: return NULL for failure. */
87 return NULL;
88}
89
90/* Once they've found a field, getting a copy of it is easy. */
91static void lg_get(struct virtio_device *vdev, void *token,
92 void *buf, unsigned len)
93{
94 /* Check they didn't ask for more than the length of the field! */
95 BUG_ON(len > ((u8 *)token)[1]);
96 memcpy(buf, token + FHDR_LEN, len);
97}
98
99/* Setting the contents is also trivial. */
100static void lg_set(struct virtio_device *vdev, void *token,
101 const void *buf, unsigned len)
102{
103 BUG_ON(len > ((u8 *)token)[1]);
104 memcpy(token + FHDR_LEN, buf, len);
105}
106
107/* The operations to get and set the status word just access the status field
108 * of the device descriptor. */
109static u8 lg_get_status(struct virtio_device *vdev)
110{
111 return to_lgdev(vdev)->desc->status;
112}
113
114static void lg_set_status(struct virtio_device *vdev, u8 status)
115{
116 to_lgdev(vdev)->desc->status = status;
117}
118
119/*
120 * Virtqueues
121 *
122 * The other piece of infrastructure virtio needs is a "virtqueue": a way of
123 * the Guest device registering buffers for the other side to read from or
124 * write into (ie. send and receive buffers). Each device can have multiple
125 * virtqueues: for example the console has one queue for sending and one for
126 * receiving.
127 *
128 * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue
129 * already exists in virtio_ring.c. We just need to connect it up.
130 *
131 * We start with the information we need to keep about each virtqueue.
132 */
133
134/*D:140 This is the information we remember about each virtqueue. */
/* One of these hangs off each virtqueue's "priv" pointer (lg_notify() reads
 * it back from there). */
135struct lguest_vq_info
136{
137 /* A copy of the information contained in the device config. */
 /* lg_notify() passes config.pfn (shifted by PAGE_SHIFT) to the Host to say
  * which virtqueue it means. */
138 struct lguest_vqconfig config;
139
140 /* The address where we mapped the virtio ring, so we can unmap it. */
141 void *pages;
142};
143
144/* When the virtio_ring code wants to prod the Host, it calls us here and we
145 * make a hypercall. We hand the page number of the virtqueue so the Host
146 * knows which virtqueue we're talking about. */
147static void lg_notify(struct virtqueue *vq)
148{
149 /* We store our virtqueue information in the "priv" pointer of the
150 * virtqueue structure. */
151 struct lguest_vq_info *lvq = vq->priv;
152
153 hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0);
154}
155
156/* This routine finds the first virtqueue described in the configuration of
157 * this device and sets it up.
158 *
159 * This is kind of an ugly duckling. It'd be nicer to have a standard
160 * representation of a virtqueue in the configuration space, but it seems that
161 * everyone wants to do it differently. The KVM guys want the Guest to
162 * allocate its own pages and tell the Host where they are, but for lguest it's
163 * simpler for the Host to simply tell us where the pages are.
164 *
165 * So we provide devices with a "find virtqueue and set it up" function. */
166static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
167 bool (*callback)(struct virtqueue *vq))
168{
169 struct lguest_vq_info *lvq;
170 struct virtqueue *vq;
171 unsigned int len;
172 void *token;
173 int err;
174
175 /* Look for a field of the correct type to mark a virtqueue. Note that
176 * if this succeeds, then the type will be changed so it won't be found
177 * again, and future lg_find_vq() calls will find the next
178 * virtqueue (if any). */
179 token = vdev->config->find(vdev, VIRTIO_CONFIG_F_VIRTQUEUE, &len);
180 if (!token)
181 return ERR_PTR(-ENOENT);
182
183 lvq = kmalloc(sizeof(*lvq), GFP_KERNEL);
184 if (!lvq)
185 return ERR_PTR(-ENOMEM);
186
187 /* Note: we could use a configuration space inside here, just like we
188 * do for the device. This would allow expansion in future, because
189 * our configuration system is designed to be expansible. But this is
190 * way easier. */
191 if (len != sizeof(lvq->config)) {
192 dev_err(&vdev->dev, "Unexpected virtio config len %u\n", len);
193 err = -EIO;
194 goto free_lvq;
195 }
196 /* Make a copy of the "struct lguest_vqconfig" field. We need a copy
197 * because the config space might not be aligned correctly. */
198 vdev->config->get(vdev, token, &lvq->config, sizeof(lvq->config));
199
200 /* Figure out how many pages the ring will take, and map that memory */
201 lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT,
202 DIV_ROUND_UP(vring_size(lvq->config.num),
203 PAGE_SIZE));
204 if (!lvq->pages) {
205 err = -ENOMEM;
206 goto free_lvq;
207 }
208
209 /* OK, tell virtio_ring.c to set up a virtqueue now we know its size
210 * and we've got a pointer to its pages. */
211 vq = vring_new_virtqueue(lvq->config.num, vdev, lvq->pages,
212 lg_notify, callback);
213 if (!vq) {
214 err = -ENOMEM;
215 goto unmap;
216 }
217
218 /* Tell the interrupt for this virtqueue to go to the virtio_ring
219 * interrupt handler. */
220 /* FIXME: We used to have a flag for the Host to tell us we could use
221 * the interrupt as a source of randomness: it'd be nice to have that
222 * back.. */
223 err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
224 vdev->dev.bus_id, vq);
225 if (err)
226 goto destroy_vring;
227
228 /* Last of all we hook up our 'struct lguest_vq_info" to the
229 * virtqueue's priv pointer. */
230 vq->priv = lvq;
231 return vq;
232
233destroy_vring:
234 vring_del_virtqueue(vq);
235unmap:
236 lguest_unmap(lvq->pages);
237free_lvq:
238 kfree(lvq);
239 return ERR_PTR(err);
240}
241/*:*/
242
243/* Cleaning up a virtqueue is easy */
244static void lg_del_vq(struct virtqueue *vq)
245{
246 struct lguest_vq_info *lvq = vq->priv;
247
248 /* Tell virtio_ring.c to free the virtqueue. */
249 vring_del_virtqueue(vq);
250 /* Unmap the pages containing the ring. */
251 lguest_unmap(lvq->pages);
252 /* Free our own queue information. */
253 kfree(lvq);
254}
255
256/* The ops structure which hooks everything together. */
257static struct virtio_config_ops lguest_config_ops = {
258 .find = lg_find,
259 .get = lg_get,
260 .set = lg_set,
261 .get_status = lg_get_status,
262 .set_status = lg_set_status,
263 .find_vq = lg_find_vq,
264 .del_vq = lg_del_vq,
265};
266
267/* The root device for the lguest virtio devices. This makes them appear as
268 * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */
269static struct device lguest_root = {
270 .parent = NULL,
271 .bus_id = "lguest",
272};
273
274/*D:120 This is the core of the lguest bus: actually adding a new device.
275 * It's a separate function because it's neater that way, and because an
276 * earlier version of the code supported hotplug and unplug. They were removed
277 * early on because they were never used.
278 *
279 * As Andrew Tridgell says, "Untested code is buggy code".
280 *
281 * It's worth reading this carefully: we start with a pointer to the new device
282 * descriptor in the "lguest_devices" page. */
283static void add_lguest_device(struct lguest_device_desc *d)
284{
285 struct lguest_device *ldev;
286
287 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
288 if (!ldev) {
289 printk(KERN_EMERG "Cannot allocate lguest dev %u\n",
290 dev_index++);
291 return;
292 }
293
294 /* This devices' parent is the lguest/ dir. */
295 ldev->vdev.dev.parent = &lguest_root;
296 /* We have a unique device index thanks to the dev_index counter. */
297 ldev->vdev.index = dev_index++;
298 /* The device type comes straight from the descriptor. There's also a
299 * device vendor field in the virtio_device struct, which we leave as
300 * 0. */
301 ldev->vdev.id.device = d->type;
302 /* We have a simple set of routines for querying the device's
303 * configuration information and setting its status. */
304 ldev->vdev.config = &lguest_config_ops;
305 /* And we remember the device's descriptor for lguest_config_ops. */
306 ldev->desc = d;
307
308 /* register_virtio_device() sets up the generic fields for the struct
309 * virtio_device and calls device_register(). This makes the bus
310 * infrastructure look for a matching driver. */
311 if (register_virtio_device(&ldev->vdev) != 0) {
312 printk(KERN_ERR "Failed to register lguest device %u\n",
313 ldev->vdev.index);
314 kfree(ldev);
315 }
316}
317
318/*D:110 scan_devices() simply iterates through the device page. The type 0 is
319 * reserved to mean "end of devices". */
320static void scan_devices(void)
321{
322 unsigned int i;
323 struct lguest_device_desc *d;
324
325 /* We start at the page beginning, and skip over each entry. */
326 for (i = 0; i < PAGE_SIZE; i += sizeof(*d) + d->config_len) {
327 d = lguest_devices + i;
328
329 /* Once we hit a zero, stop. */
330 if (d->type == 0)
331 break;
332
333 add_lguest_device(d);
334 }
335}
336
337/*D:105 Fairly early in boot, lguest_devices_init() is called to set up the
338 * lguest device infrastructure. We check that we are a Guest by checking
339 * pv_info.name: there are other ways of checking, but this seems most
340 * obvious to me.
341 *
342 * So we can access the "struct lguest_device_desc"s easily, we map that memory
343 * and store the pointer in the global "lguest_devices". Then we register a
344 * root device from which all our devices will hang (this seems to be the
345 * correct sysfs incantation).
346 *
347 * Finally we call scan_devices() which adds all the devices found in the
348 * lguest_devices page. */
349static int __init lguest_devices_init(void)
350{
351 if (strcmp(pv_info.name, "lguest") != 0)
352 return 0;
353
354 if (device_register(&lguest_root) != 0)
355 panic("Could not register lguest root");
356
357 /* Devices are in a single page above top of "normal" mem */
358 lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);
359
360 scan_devices();
361 return 0;
362}
363/* We do this after core stuff, but before the drivers. */
364postcore_initcall(lguest_devices_init);
365
366/*D:150 At this point in the journey we used to now wade through the lguest
367 * devices themselves: net, block and console. Since they're all now virtio
368 * devices rather than lguest-specific, I've decided to ignore them. Mostly,
369 * they're kind of boring. But this does mean you'll never experience the
370 * thrill of reading the forbidden love scene buried deep in the block driver.
371 *
372 * "make Launcher" beckons, where we answer questions like "Where do Guests
373 * come from?", and "What do you do when someone asks for optimization?". */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 80d1b58c769..ee405b38383 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -1,73 +1,17 @@
1/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher 1/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher
2 * controls and communicates with the Guest. For example, the first write will 2 * controls and communicates with the Guest. For example, the first write will
3 * tell us the memory size, pagetable, entry point and kernel address offset. 3 * tell us the Guest's memory layout, pagetable, entry point and kernel address
4 * A read will run the Guest until a signal is pending (-EINTR), or the Guest 4 * offset. A read will run the Guest until something happens, such as a signal
5 * does a DMA out to the Launcher. Writes are also used to get a DMA buffer 5 * or the Guest doing a NOTIFY out to the Launcher. :*/
6 * registered by the Guest and to send the Guest an interrupt. :*/
7#include <linux/uaccess.h> 6#include <linux/uaccess.h>
8#include <linux/miscdevice.h> 7#include <linux/miscdevice.h>
9#include <linux/fs.h> 8#include <linux/fs.h>
10#include "lg.h" 9#include "lg.h"
11 10
12/*L:030 setup_regs() doesn't really belong in this file, but it gives us an
13 * early glimpse deeper into the Host so it's worth having here.
14 *
15 * Most of the Guest's registers are left alone: we used get_zeroed_page() to
16 * allocate the structure, so they will be 0. */
17static void setup_regs(struct lguest_regs *regs, unsigned long start)
18{
19 /* There are four "segment" registers which the Guest needs to boot:
20 * The "code segment" register (cs) refers to the kernel code segment
21 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
22 * refer to the kernel data segment __KERNEL_DS.
23 *
24 * The privilege level is packed into the lower bits. The Guest runs
25 * at privilege level 1 (GUEST_PL).*/
26 regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
27 regs->cs = __KERNEL_CS|GUEST_PL;
28
29 /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002)
30 * is supposed to always be "1". Bit 9 (0x200) controls whether
31 * interrupts are enabled. We always leave interrupts enabled while
32 * running the Guest. */
33 regs->eflags = 0x202;
34
35 /* The "Extended Instruction Pointer" register says where the Guest is
36 * running. */
37 regs->eip = start;
38
39 /* %esi points to our boot information, at physical address 0, so don't
40 * touch it. */
41}
42
43/*L:310 To send DMA into the Guest, the Launcher needs to be able to ask for a
44 * DMA buffer. This is done by writing LHREQ_GETDMA and the key to
45 * /dev/lguest. */
46static long user_get_dma(struct lguest *lg, const u32 __user *input)
47{
48 unsigned long key, udma, irq;
49
50 /* Fetch the key they wrote to us. */
51 if (get_user(key, input) != 0)
52 return -EFAULT;
53 /* Look for a free Guest DMA buffer bound to that key. */
54 udma = get_dma_buffer(lg, key, &irq);
55 if (!udma)
56 return -ENOENT;
57
58 /* We need to tell the Launcher what interrupt the Guest expects after
59 * the buffer is filled. We stash it in udma->used_len. */
60 lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
61
62 /* The (guest-physical) address of the DMA buffer is returned from
63 * the write(). */
64 return udma;
65}
66
67/*L:315 To force the Guest to stop running and return to the Launcher, the 11/*L:315 To force the Guest to stop running and return to the Launcher, the
68 * Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest. The 12 * Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest. The
69 * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ 13 * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */
70static int break_guest_out(struct lguest *lg, const u32 __user *input) 14static int break_guest_out(struct lguest *lg, const unsigned long __user *input)
71{ 15{
72 unsigned long on; 16 unsigned long on;
73 17
@@ -90,9 +34,9 @@ static int break_guest_out(struct lguest *lg, const u32 __user *input)
90 34
91/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 35/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
92 * number to /dev/lguest. */ 36 * number to /dev/lguest. */
93static int user_send_irq(struct lguest *lg, const u32 __user *input) 37static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
94{ 38{
95 u32 irq; 39 unsigned long irq;
96 40
97 if (get_user(irq, input) != 0) 41 if (get_user(irq, input) != 0)
98 return -EFAULT; 42 return -EFAULT;
@@ -133,17 +77,19 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
133 return len; 77 return len;
134 } 78 }
135 79
136 /* If we returned from read() last time because the Guest sent DMA, 80 /* If we returned from read() last time because the Guest notified,
137 * clear the flag. */ 81 * clear the flag. */
138 if (lg->dma_is_pending) 82 if (lg->pending_notify)
139 lg->dma_is_pending = 0; 83 lg->pending_notify = 0;
140 84
141 /* Run the Guest until something interesting happens. */ 85 /* Run the Guest until something interesting happens. */
142 return run_guest(lg, (unsigned long __user *)user); 86 return run_guest(lg, (unsigned long __user *)user);
143} 87}
144 88
145/*L:020 The initialization write supplies 4 32-bit values (in addition to the 89/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit)
146 * 32-bit LHREQ_INITIALIZE value). These are: 90 * values (in addition to the LHREQ_INITIALIZE value). These are:
91 *
92 * base: The start of the Guest-physical memory inside the Launcher memory.
147 * 93 *
148 * pfnlimit: The highest (Guest-physical) page number the Guest should be 94 * pfnlimit: The highest (Guest-physical) page number the Guest should be
149 * allowed to access. The Launcher has to live in Guest memory, so it sets 95 * allowed to access. The Launcher has to live in Guest memory, so it sets
@@ -153,23 +99,17 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
153 * pagetables (which are set up by the Launcher). 99 * pagetables (which are set up by the Launcher).
154 * 100 *
155 * start: The first instruction to execute ("eip" in x86-speak). 101 * start: The first instruction to execute ("eip" in x86-speak).
156 *
157 * page_offset: The PAGE_OFFSET constant in the Guest kernel. We should
158 * probably wean the code off this, but it's a very useful constant! Any
159 * address above this is within the Guest kernel, and any kernel address can
160 * quickly converted from physical to virtual by adding PAGE_OFFSET. It's
161 * 0xC0000000 (3G) by default, but it's configurable at kernel build time.
162 */ 102 */
163static int initialize(struct file *file, const u32 __user *input) 103static int initialize(struct file *file, const unsigned long __user *input)
164{ 104{
165 /* "struct lguest" contains everything we (the Host) know about a 105 /* "struct lguest" contains everything we (the Host) know about a
166 * Guest. */ 106 * Guest. */
167 struct lguest *lg; 107 struct lguest *lg;
168 int err, i; 108 int err;
169 u32 args[4]; 109 unsigned long args[4];
170 110
171 /* We grab the Big Lguest lock, which protects the global array 111 /* We grab the Big Lguest lock, which protects against multiple
172 * "lguests" and multiple simultaneous initializations. */ 112 * simultaneous initializations. */
173 mutex_lock(&lguest_lock); 113 mutex_lock(&lguest_lock);
174 /* You can't initialize twice! Close the device and start again... */ 114 /* You can't initialize twice! Close the device and start again... */
175 if (file->private_data) { 115 if (file->private_data) {
@@ -182,20 +122,15 @@ static int initialize(struct file *file, const u32 __user *input)
182 goto unlock; 122 goto unlock;
183 } 123 }
184 124
185 /* Find an unused guest. */ 125 lg = kzalloc(sizeof(*lg), GFP_KERNEL);
186 i = find_free_guest(); 126 if (!lg) {
187 if (i < 0) { 127 err = -ENOMEM;
188 err = -ENOSPC;
189 goto unlock; 128 goto unlock;
190 } 129 }
191 /* OK, we have an index into the "lguest" array: "lg" is a convenient
192 * pointer. */
193 lg = &lguests[i];
194 130
195 /* Populate the easy fields of our "struct lguest" */ 131 /* Populate the easy fields of our "struct lguest" */
196 lg->guestid = i; 132 lg->mem_base = (void __user *)(long)args[0];
197 lg->pfn_limit = args[0]; 133 lg->pfn_limit = args[1];
198 lg->page_offset = args[3];
199 134
200 /* We need a complete page for the Guest registers: they are accessible 135 /* We need a complete page for the Guest registers: they are accessible
201 * to the Guest and we can only grant it access to whole pages. */ 136 * to the Guest and we can only grant it access to whole pages. */
@@ -210,17 +145,13 @@ static int initialize(struct file *file, const u32 __user *input)
210 /* Initialize the Guest's shadow page tables, using the toplevel 145 /* Initialize the Guest's shadow page tables, using the toplevel
211 * address the Launcher gave us. This allocates memory, so can 146 * address the Launcher gave us. This allocates memory, so can
212 * fail. */ 147 * fail. */
213 err = init_guest_pagetable(lg, args[1]); 148 err = init_guest_pagetable(lg, args[2]);
214 if (err) 149 if (err)
215 goto free_regs; 150 goto free_regs;
216 151
217 /* Now we initialize the Guest's registers, handing it the start 152 /* Now we initialize the Guest's registers, handing it the start
218 * address. */ 153 * address. */
219 setup_regs(lg->regs, args[2]); 154 lguest_arch_setup_regs(lg, args[3]);
220
221 /* There are a couple of GDT entries the Guest expects when first
222 * booting. */
223 setup_guest_gdt(lg);
224 155
225 /* The timer for lguest's clock needs initialization. */ 156 /* The timer for lguest's clock needs initialization. */
226 init_clockdev(lg); 157 init_clockdev(lg);
@@ -260,18 +191,19 @@ unlock:
260/*L:010 The first operation the Launcher does must be a write. All writes 191/*L:010 The first operation the Launcher does must be a write. All writes
261 * start with a 32 bit number: for the first write this must be 192 * start with a 32 bit number: for the first write this must be
262 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 193 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use
263 * writes of other values to get DMA buffers and send interrupts. */ 194 * writes of other values to send interrupts. */
264static ssize_t write(struct file *file, const char __user *input, 195static ssize_t write(struct file *file, const char __user *in,
265 size_t size, loff_t *off) 196 size_t size, loff_t *off)
266{ 197{
267 /* Once the guest is initialized, we hold the "struct lguest" in the 198 /* Once the guest is initialized, we hold the "struct lguest" in the
268 * file private data. */ 199 * file private data. */
269 struct lguest *lg = file->private_data; 200 struct lguest *lg = file->private_data;
270 u32 req; 201 const unsigned long __user *input = (const unsigned long __user *)in;
202 unsigned long req;
271 203
272 if (get_user(req, input) != 0) 204 if (get_user(req, input) != 0)
273 return -EFAULT; 205 return -EFAULT;
274 input += sizeof(req); 206 input++;
275 207
276 /* If you haven't initialized, you must do that first. */ 208 /* If you haven't initialized, you must do that first. */
277 if (req != LHREQ_INITIALIZE && !lg) 209 if (req != LHREQ_INITIALIZE && !lg)
@@ -287,13 +219,11 @@ static ssize_t write(struct file *file, const char __user *input,
287 219
288 switch (req) { 220 switch (req) {
289 case LHREQ_INITIALIZE: 221 case LHREQ_INITIALIZE:
290 return initialize(file, (const u32 __user *)input); 222 return initialize(file, input);
291 case LHREQ_GETDMA:
292 return user_get_dma(lg, (const u32 __user *)input);
293 case LHREQ_IRQ: 223 case LHREQ_IRQ:
294 return user_send_irq(lg, (const u32 __user *)input); 224 return user_send_irq(lg, input);
295 case LHREQ_BREAK: 225 case LHREQ_BREAK:
296 return break_guest_out(lg, (const u32 __user *)input); 226 return break_guest_out(lg, input);
297 default: 227 default:
298 return -EINVAL; 228 return -EINVAL;
299 } 229 }
@@ -319,8 +249,6 @@ static int close(struct inode *inode, struct file *file)
319 mutex_lock(&lguest_lock); 249 mutex_lock(&lguest_lock);
320 /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ 250 /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
321 hrtimer_cancel(&lg->hrt); 251 hrtimer_cancel(&lg->hrt);
322 /* Free any DMA buffers the Guest had bound. */
323 release_all_dma(lg);
324 /* Free up the shadow page tables for the Guest. */ 252 /* Free up the shadow page tables for the Guest. */
325 free_guest_pagetable(lg); 253 free_guest_pagetable(lg);
326 /* Now all the memory cleanups are done, it's safe to release the 254 /* Now all the memory cleanups are done, it's safe to release the
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index b7a924ace68..2a45f0691c9 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -13,6 +13,7 @@
13#include <linux/random.h> 13#include <linux/random.h>
14#include <linux/percpu.h> 14#include <linux/percpu.h>
15#include <asm/tlbflush.h> 15#include <asm/tlbflush.h>
16#include <asm/uaccess.h>
16#include "lg.h" 17#include "lg.h"
17 18
18/*M:008 We hold reference to pages, which prevents them from being swapped. 19/*M:008 We hold reference to pages, which prevents them from being swapped.
@@ -44,44 +45,32 @@
44 * (vii) Setting up the page tables initially. 45 * (vii) Setting up the page tables initially.
45 :*/ 46 :*/
46 47
47/* Pages a 4k long, and each page table entry is 4 bytes long, giving us 1024
48 * (or 2^10) entries per page. */
49#define PTES_PER_PAGE_SHIFT 10
50#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
51 48
52/* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is 49/* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is
53 * conveniently placed at the top 4MB, so it uses a separate, complete PTE 50 * conveniently placed at the top 4MB, so it uses a separate, complete PTE
54 * page. */ 51 * page. */
55#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1) 52#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
56 53
57/* We actually need a separate PTE page for each CPU. Remember that after the 54/* We actually need a separate PTE page for each CPU. Remember that after the
58 * Switcher code itself comes two pages for each CPU, and we don't want this 55 * Switcher code itself comes two pages for each CPU, and we don't want this
59 * CPU's guest to see the pages of any other CPU. */ 56 * CPU's guest to see the pages of any other CPU. */
60static DEFINE_PER_CPU(spte_t *, switcher_pte_pages); 57static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
61#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) 58#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
62 59
63/*H:320 With our shadow and Guest types established, we need to deal with 60/*H:320 With our shadow and Guest types established, we need to deal with
64 * them: the page table code is curly enough to need helper functions to keep 61 * them: the page table code is curly enough to need helper functions to keep
65 * it clear and clean. 62 * it clear and clean.
66 * 63 *
67 * The first helper takes a virtual address, and says which entry in the top 64 * There are two functions which return pointers to the shadow (aka "real")
68 * level page table deals with that address. Since each top level entry deals
69 * with 4M, this effectively divides by 4M. */
70static unsigned vaddr_to_pgd_index(unsigned long vaddr)
71{
72 return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
73}
74
75/* There are two functions which return pointers to the shadow (aka "real")
76 * page tables. 65 * page tables.
77 * 66 *
78 * spgd_addr() takes the virtual address and returns a pointer to the top-level 67 * spgd_addr() takes the virtual address and returns a pointer to the top-level
79 * page directory entry for that address. Since we keep track of several page 68 * page directory entry for that address. Since we keep track of several page
80 * tables, the "i" argument tells us which one we're interested in (it's 69 * tables, the "i" argument tells us which one we're interested in (it's
81 * usually the current one). */ 70 * usually the current one). */
82static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) 71static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
83{ 72{
84 unsigned int index = vaddr_to_pgd_index(vaddr); 73 unsigned int index = pgd_index(vaddr);
85 74
86 /* We kill any Guest trying to touch the Switcher addresses. */ 75 /* We kill any Guest trying to touch the Switcher addresses. */
87 if (index >= SWITCHER_PGD_INDEX) { 76 if (index >= SWITCHER_PGD_INDEX) {
@@ -95,28 +84,28 @@ static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
95/* This routine then takes the PGD entry given above, which contains the 84/* This routine then takes the PGD entry given above, which contains the
96 * address of the PTE page. It then returns a pointer to the PTE entry for the 85 * address of the PTE page. It then returns a pointer to the PTE entry for the
97 * given address. */ 86 * given address. */
98static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr) 87static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
99{ 88{
100 spte_t *page = __va(spgd.pfn << PAGE_SHIFT); 89 pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
101 /* You should never call this if the PGD entry wasn't valid */ 90 /* You should never call this if the PGD entry wasn't valid */
102 BUG_ON(!(spgd.flags & _PAGE_PRESENT)); 91 BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
103 return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; 92 return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE];
104} 93}
105 94
106/* These two functions just like the above two, except they access the Guest 95/* These two functions just like the above two, except they access the Guest
107 * page tables. Hence they return a Guest address. */ 96 * page tables. Hence they return a Guest address. */
108static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) 97static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
109{ 98{
110 unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); 99 unsigned int index = vaddr >> (PGDIR_SHIFT);
111 return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t); 100 return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t);
112} 101}
113 102
114static unsigned long gpte_addr(struct lguest *lg, 103static unsigned long gpte_addr(struct lguest *lg,
115 gpgd_t gpgd, unsigned long vaddr) 104 pgd_t gpgd, unsigned long vaddr)
116{ 105{
117 unsigned long gpage = gpgd.pfn << PAGE_SHIFT; 106 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
118 BUG_ON(!(gpgd.flags & _PAGE_PRESENT)); 107 BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
119 return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); 108 return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t);
120} 109}
121 110
122/*H:350 This routine takes a page number given by the Guest and converts it to 111/*H:350 This routine takes a page number given by the Guest and converts it to
@@ -149,53 +138,55 @@ static unsigned long get_pfn(unsigned long virtpfn, int write)
149 * entry can be a little tricky. The flags are (almost) the same, but the 138 * entry can be a little tricky. The flags are (almost) the same, but the
150 * Guest PTE contains a virtual page number: the CPU needs the real page 139 * Guest PTE contains a virtual page number: the CPU needs the real page
151 * number. */ 140 * number. */
152static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) 141static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
153{ 142{
154 spte_t spte; 143 unsigned long pfn, base, flags;
155 unsigned long pfn;
156 144
157 /* The Guest sets the global flag, because it thinks that it is using 145 /* The Guest sets the global flag, because it thinks that it is using
158 * PGE. We only told it to use PGE so it would tell us whether it was 146 * PGE. We only told it to use PGE so it would tell us whether it was
159 * flushing a kernel mapping or a userspace mapping. We don't actually 147 * flushing a kernel mapping or a userspace mapping. We don't actually
160 * use the global bit, so throw it away. */ 148 * use the global bit, so throw it away. */
161 spte.flags = (gpte.flags & ~_PAGE_GLOBAL); 149 flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
150
151 /* The Guest's pages are offset inside the Launcher. */
152 base = (unsigned long)lg->mem_base / PAGE_SIZE;
162 153
163 /* We need a temporary "unsigned long" variable to hold the answer from 154 /* We need a temporary "unsigned long" variable to hold the answer from
164 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't 155 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
165 * fit in spte.pfn. get_pfn() finds the real physical number of the 156 * fit in spte.pfn. get_pfn() finds the real physical number of the
166 * page, given the virtual number. */ 157 * page, given the virtual number. */
167 pfn = get_pfn(gpte.pfn, write); 158 pfn = get_pfn(base + pte_pfn(gpte), write);
168 if (pfn == -1UL) { 159 if (pfn == -1UL) {
169 kill_guest(lg, "failed to get page %u", gpte.pfn); 160 kill_guest(lg, "failed to get page %lu", pte_pfn(gpte));
170 /* When we destroy the Guest, we'll go through the shadow page 161 /* When we destroy the Guest, we'll go through the shadow page
171 * tables and release_pte() them. Make sure we don't think 162 * tables and release_pte() them. Make sure we don't think
172 * this one is valid! */ 163 * this one is valid! */
173 spte.flags = 0; 164 flags = 0;
174 } 165 }
175 /* Now we assign the page number, and our shadow PTE is complete. */ 166 /* Now we assemble our shadow PTE from the page number and flags. */
176 spte.pfn = pfn; 167 return pfn_pte(pfn, __pgprot(flags));
177 return spte;
178} 168}
179 169
180/*H:460 And to complete the chain, release_pte() looks like this: */ 170/*H:460 And to complete the chain, release_pte() looks like this: */
181static void release_pte(spte_t pte) 171static void release_pte(pte_t pte)
182{ 172{
183 /* Remember that get_user_pages() took a reference to the page, in 173 /* Remember that get_user_pages() took a reference to the page, in
184 * get_pfn()? We have to put it back now. */ 174 * get_pfn()? We have to put it back now. */
185 if (pte.flags & _PAGE_PRESENT) 175 if (pte_flags(pte) & _PAGE_PRESENT)
186 put_page(pfn_to_page(pte.pfn)); 176 put_page(pfn_to_page(pte_pfn(pte)));
187} 177}
188/*:*/ 178/*:*/
189 179
190static void check_gpte(struct lguest *lg, gpte_t gpte) 180static void check_gpte(struct lguest *lg, pte_t gpte)
191{ 181{
192 if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit) 182 if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE))
183 || pte_pfn(gpte) >= lg->pfn_limit)
193 kill_guest(lg, "bad page table entry"); 184 kill_guest(lg, "bad page table entry");
194} 185}
195 186
196static void check_gpgd(struct lguest *lg, gpgd_t gpgd) 187static void check_gpgd(struct lguest *lg, pgd_t gpgd)
197{ 188{
198 if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit) 189 if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit)
199 kill_guest(lg, "bad page directory entry"); 190 kill_guest(lg, "bad page directory entry");
200} 191}
201 192
@@ -211,21 +202,21 @@ static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
211 * true. */ 202 * true. */
212int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) 203int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
213{ 204{
214 gpgd_t gpgd; 205 pgd_t gpgd;
215 spgd_t *spgd; 206 pgd_t *spgd;
216 unsigned long gpte_ptr; 207 unsigned long gpte_ptr;
217 gpte_t gpte; 208 pte_t gpte;
218 spte_t *spte; 209 pte_t *spte;
219 210
220 /* First step: get the top-level Guest page table entry. */ 211 /* First step: get the top-level Guest page table entry. */
221 gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); 212 gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
222 /* Toplevel not present? We can't map it in. */ 213 /* Toplevel not present? We can't map it in. */
223 if (!(gpgd.flags & _PAGE_PRESENT)) 214 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
224 return 0; 215 return 0;
225 216
226 /* Now look at the matching shadow entry. */ 217 /* Now look at the matching shadow entry. */
227 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 218 spgd = spgd_addr(lg, lg->pgdidx, vaddr);
228 if (!(spgd->flags & _PAGE_PRESENT)) { 219 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
229 /* No shadow entry: allocate a new shadow PTE page. */ 220 /* No shadow entry: allocate a new shadow PTE page. */
230 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 221 unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
231 /* This is not really the Guest's fault, but killing it is 222 /* This is not really the Guest's fault, but killing it is
@@ -238,34 +229,35 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
238 check_gpgd(lg, gpgd); 229 check_gpgd(lg, gpgd);
239 /* And we copy the flags to the shadow PGD entry. The page 230 /* And we copy the flags to the shadow PGD entry. The page
240 * number in the shadow PGD is the page we just allocated. */ 231 * number in the shadow PGD is the page we just allocated. */
241 spgd->raw.val = (__pa(ptepage) | gpgd.flags); 232 *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
242 } 233 }
243 234
244 /* OK, now we look at the lower level in the Guest page table: keep its 235 /* OK, now we look at the lower level in the Guest page table: keep its
245 * address, because we might update it later. */ 236 * address, because we might update it later. */
246 gpte_ptr = gpte_addr(lg, gpgd, vaddr); 237 gpte_ptr = gpte_addr(lg, gpgd, vaddr);
247 gpte = mkgpte(lgread_u32(lg, gpte_ptr)); 238 gpte = lgread(lg, gpte_ptr, pte_t);
248 239
249 /* If this page isn't in the Guest page tables, we can't page it in. */ 240 /* If this page isn't in the Guest page tables, we can't page it in. */
250 if (!(gpte.flags & _PAGE_PRESENT)) 241 if (!(pte_flags(gpte) & _PAGE_PRESENT))
251 return 0; 242 return 0;
252 243
253 /* Check they're not trying to write to a page the Guest wants 244 /* Check they're not trying to write to a page the Guest wants
254 * read-only (bit 2 of errcode == write). */ 245 * read-only (bit 2 of errcode == write). */
255 if ((errcode & 2) && !(gpte.flags & _PAGE_RW)) 246 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
256 return 0; 247 return 0;
257 248
258 /* User access to a kernel page? (bit 3 == user access) */ 249 /* User access to a kernel page? (bit 3 == user access) */
259 if ((errcode & 4) && !(gpte.flags & _PAGE_USER)) 250 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
260 return 0; 251 return 0;
261 252
262 /* Check that the Guest PTE flags are OK, and the page number is below 253 /* Check that the Guest PTE flags are OK, and the page number is below
263 * the pfn_limit (ie. not mapping the Launcher binary). */ 254 * the pfn_limit (ie. not mapping the Launcher binary). */
264 check_gpte(lg, gpte); 255 check_gpte(lg, gpte);
265 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 256 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
266 gpte.flags |= _PAGE_ACCESSED; 257 gpte = pte_mkyoung(gpte);
258
267 if (errcode & 2) 259 if (errcode & 2)
268 gpte.flags |= _PAGE_DIRTY; 260 gpte = pte_mkdirty(gpte);
269 261
270 /* Get the pointer to the shadow PTE entry we're going to set. */ 262 /* Get the pointer to the shadow PTE entry we're going to set. */
271 spte = spte_addr(lg, *spgd, vaddr); 263 spte = spte_addr(lg, *spgd, vaddr);
@@ -275,21 +267,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
275 267
276 /* If this is a write, we insist that the Guest page is writable (the 268 /* If this is a write, we insist that the Guest page is writable (the
277 * final arg to gpte_to_spte()). */ 269 * final arg to gpte_to_spte()). */
278 if (gpte.flags & _PAGE_DIRTY) 270 if (pte_dirty(gpte))
279 *spte = gpte_to_spte(lg, gpte, 1); 271 *spte = gpte_to_spte(lg, gpte, 1);
280 else { 272 else
281 /* If this is a read, don't set the "writable" bit in the page 273 /* If this is a read, don't set the "writable" bit in the page
282 * table entry, even if the Guest says it's writable. That way 274 * table entry, even if the Guest says it's writable. That way
283 * we come back here when a write does actually ocur, so we can 275 * we come back here when a write does actually ocur, so we can
284 * update the Guest's _PAGE_DIRTY flag. */ 276 * update the Guest's _PAGE_DIRTY flag. */
285 gpte_t ro_gpte = gpte; 277 *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);
286 ro_gpte.flags &= ~_PAGE_RW;
287 *spte = gpte_to_spte(lg, ro_gpte, 0);
288 }
289 278
290 /* Finally, we write the Guest PTE entry back: we've set the 279 /* Finally, we write the Guest PTE entry back: we've set the
291 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 280 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
292 lgwrite_u32(lg, gpte_ptr, gpte.raw.val); 281 lgwrite(lg, gpte_ptr, pte_t, gpte);
293 282
294 /* We succeeded in mapping the page! */ 283 /* We succeeded in mapping the page! */
295 return 1; 284 return 1;
@@ -305,17 +294,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
305 * mapped by the shadow page tables, and is it writable? */ 294 * mapped by the shadow page tables, and is it writable? */
306static int page_writable(struct lguest *lg, unsigned long vaddr) 295static int page_writable(struct lguest *lg, unsigned long vaddr)
307{ 296{
308 spgd_t *spgd; 297 pgd_t *spgd;
309 unsigned long flags; 298 unsigned long flags;
310 299
311 /* Look at the top level entry: is it present? */ 300 /* Look at the top level entry: is it present? */
312 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 301 spgd = spgd_addr(lg, lg->pgdidx, vaddr);
313 if (!(spgd->flags & _PAGE_PRESENT)) 302 if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
314 return 0; 303 return 0;
315 304
316 /* Check the flags on the pte entry itself: it must be present and 305 /* Check the flags on the pte entry itself: it must be present and
317 * writable. */ 306 * writable. */
318 flags = spte_addr(lg, *spgd, vaddr)->flags; 307 flags = pte_flags(*(spte_addr(lg, *spgd, vaddr)));
308
319 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 309 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
320} 310}
321 311
@@ -329,22 +319,22 @@ void pin_page(struct lguest *lg, unsigned long vaddr)
329} 319}
330 320
331/*H:450 If we chase down the release_pgd() code, it looks like this: */ 321/*H:450 If we chase down the release_pgd() code, it looks like this: */
332static void release_pgd(struct lguest *lg, spgd_t *spgd) 322static void release_pgd(struct lguest *lg, pgd_t *spgd)
333{ 323{
334 /* If the entry's not present, there's nothing to release. */ 324 /* If the entry's not present, there's nothing to release. */
335 if (spgd->flags & _PAGE_PRESENT) { 325 if (pgd_flags(*spgd) & _PAGE_PRESENT) {
336 unsigned int i; 326 unsigned int i;
337 /* Converting the pfn to find the actual PTE page is easy: turn 327 /* Converting the pfn to find the actual PTE page is easy: turn
338 * the page number into a physical address, then convert to a 328 * the page number into a physical address, then convert to a
339 * virtual address (easy for kernel pages like this one). */ 329 * virtual address (easy for kernel pages like this one). */
340 spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT); 330 pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
341 /* For each entry in the page, we might need to release it. */ 331 /* For each entry in the page, we might need to release it. */
342 for (i = 0; i < PTES_PER_PAGE; i++) 332 for (i = 0; i < PTRS_PER_PTE; i++)
343 release_pte(ptepage[i]); 333 release_pte(ptepage[i]);
344 /* Now we can free the page of PTEs */ 334 /* Now we can free the page of PTEs */
345 free_page((long)ptepage); 335 free_page((long)ptepage);
346 /* And zero out the PGD entry we we never release it twice. */ 336 /* And zero out the PGD entry we we never release it twice. */
347 spgd->raw.val = 0; 337 *spgd = __pgd(0);
348 } 338 }
349} 339}
350 340
@@ -356,7 +346,7 @@ static void flush_user_mappings(struct lguest *lg, int idx)
356{ 346{
357 unsigned int i; 347 unsigned int i;
358 /* Release every pgd entry up to the kernel's address. */ 348 /* Release every pgd entry up to the kernel's address. */
359 for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++) 349 for (i = 0; i < pgd_index(lg->kernel_address); i++)
360 release_pgd(lg, lg->pgdirs[idx].pgdir + i); 350 release_pgd(lg, lg->pgdirs[idx].pgdir + i);
361} 351}
362 352
@@ -369,6 +359,25 @@ void guest_pagetable_flush_user(struct lguest *lg)
369} 359}
370/*:*/ 360/*:*/
371 361
362/* We walk down the guest page tables to get a guest-physical address */
363unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
364{
365 pgd_t gpgd;
366 pte_t gpte;
367
368 /* First step: get the top-level Guest page table entry. */
369 gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
370 /* Toplevel not present? We can't map it in. */
371 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
372 kill_guest(lg, "Bad address %#lx", vaddr);
373
374 gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t);
375 if (!(pte_flags(gpte) & _PAGE_PRESENT))
376 kill_guest(lg, "Bad address %#lx", vaddr);
377
378 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
379}
380
372/* We keep several page tables. This is a simple routine to find the page 381/* We keep several page tables. This is a simple routine to find the page
373 * table (if any) corresponding to this top-level address the Guest has given 382 * table (if any) corresponding to this top-level address the Guest has given
374 * us. */ 383 * us. */
@@ -376,7 +385,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
376{ 385{
377 unsigned int i; 386 unsigned int i;
378 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 387 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
379 if (lg->pgdirs[i].cr3 == pgtable) 388 if (lg->pgdirs[i].gpgdir == pgtable)
380 break; 389 break;
381 return i; 390 return i;
382} 391}
@@ -385,7 +394,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
385 * allocate a new one (and so the kernel parts are not there), we set 394 * allocate a new one (and so the kernel parts are not there), we set
386 * blank_pgdir. */ 395 * blank_pgdir. */
387static unsigned int new_pgdir(struct lguest *lg, 396static unsigned int new_pgdir(struct lguest *lg,
388 unsigned long cr3, 397 unsigned long gpgdir,
389 int *blank_pgdir) 398 int *blank_pgdir)
390{ 399{
391 unsigned int next; 400 unsigned int next;
@@ -395,7 +404,7 @@ static unsigned int new_pgdir(struct lguest *lg,
395 next = random32() % ARRAY_SIZE(lg->pgdirs); 404 next = random32() % ARRAY_SIZE(lg->pgdirs);
396 /* If it's never been allocated at all before, try now. */ 405 /* If it's never been allocated at all before, try now. */
397 if (!lg->pgdirs[next].pgdir) { 406 if (!lg->pgdirs[next].pgdir) {
398 lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL); 407 lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
399 /* If the allocation fails, just keep using the one we have */ 408 /* If the allocation fails, just keep using the one we have */
400 if (!lg->pgdirs[next].pgdir) 409 if (!lg->pgdirs[next].pgdir)
401 next = lg->pgdidx; 410 next = lg->pgdidx;
@@ -405,7 +414,7 @@ static unsigned int new_pgdir(struct lguest *lg,
405 *blank_pgdir = 1; 414 *blank_pgdir = 1;
406 } 415 }
407 /* Record which Guest toplevel this shadows. */ 416 /* Record which Guest toplevel this shadows. */
408 lg->pgdirs[next].cr3 = cr3; 417 lg->pgdirs[next].gpgdir = gpgdir;
409 /* Release all the non-kernel mappings. */ 418 /* Release all the non-kernel mappings. */
410 flush_user_mappings(lg, next); 419 flush_user_mappings(lg, next);
411 420
@@ -472,26 +481,27 @@ void guest_pagetable_clear_all(struct lguest *lg)
472 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. 481 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
473 */ 482 */
474static void do_set_pte(struct lguest *lg, int idx, 483static void do_set_pte(struct lguest *lg, int idx,
475 unsigned long vaddr, gpte_t gpte) 484 unsigned long vaddr, pte_t gpte)
476{ 485{
477 /* Look up the matching shadow page directot entry. */ 486 /* Look up the matching shadow page directot entry. */
478 spgd_t *spgd = spgd_addr(lg, idx, vaddr); 487 pgd_t *spgd = spgd_addr(lg, idx, vaddr);
479 488
480 /* If the top level isn't present, there's no entry to update. */ 489 /* If the top level isn't present, there's no entry to update. */
481 if (spgd->flags & _PAGE_PRESENT) { 490 if (pgd_flags(*spgd) & _PAGE_PRESENT) {
482 /* Otherwise, we start by releasing the existing entry. */ 491 /* Otherwise, we start by releasing the existing entry. */
483 spte_t *spte = spte_addr(lg, *spgd, vaddr); 492 pte_t *spte = spte_addr(lg, *spgd, vaddr);
484 release_pte(*spte); 493 release_pte(*spte);
485 494
486 /* If they're setting this entry as dirty or accessed, we might 495 /* If they're setting this entry as dirty or accessed, we might
487 * as well put that entry they've given us in now. This shaves 496 * as well put that entry they've given us in now. This shaves
488 * 10% off a copy-on-write micro-benchmark. */ 497 * 10% off a copy-on-write micro-benchmark. */
489 if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 498 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
490 check_gpte(lg, gpte); 499 check_gpte(lg, gpte);
491 *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY); 500 *spte = gpte_to_spte(lg, gpte,
501 pte_flags(gpte) & _PAGE_DIRTY);
492 } else 502 } else
493 /* Otherwise we can demand_page() it in later. */ 503 /* Otherwise we can demand_page() it in later. */
494 spte->raw.val = 0; 504 *spte = __pte(0);
495 } 505 }
496} 506}
497 507
@@ -506,18 +516,18 @@ static void do_set_pte(struct lguest *lg, int idx,
506 * The benefit is that when we have to track a new page table, we can copy keep 516 * The benefit is that when we have to track a new page table, we can copy keep
507 * all the kernel mappings. This speeds up context switch immensely. */ 517 * all the kernel mappings. This speeds up context switch immensely. */
508void guest_set_pte(struct lguest *lg, 518void guest_set_pte(struct lguest *lg,
509 unsigned long cr3, unsigned long vaddr, gpte_t gpte) 519 unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
510{ 520{
511 /* Kernel mappings must be changed on all top levels. Slow, but 521 /* Kernel mappings must be changed on all top levels. Slow, but
512 * doesn't happen often. */ 522 * doesn't happen often. */
513 if (vaddr >= lg->page_offset) { 523 if (vaddr >= lg->kernel_address) {
514 unsigned int i; 524 unsigned int i;
515 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 525 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
516 if (lg->pgdirs[i].pgdir) 526 if (lg->pgdirs[i].pgdir)
517 do_set_pte(lg, i, vaddr, gpte); 527 do_set_pte(lg, i, vaddr, gpte);
518 } else { 528 } else {
519 /* Is this page table one we have a shadow for? */ 529 /* Is this page table one we have a shadow for? */
520 int pgdir = find_pgdir(lg, cr3); 530 int pgdir = find_pgdir(lg, gpgdir);
521 if (pgdir != ARRAY_SIZE(lg->pgdirs)) 531 if (pgdir != ARRAY_SIZE(lg->pgdirs))
522 /* If so, do the update. */ 532 /* If so, do the update. */
523 do_set_pte(lg, pgdir, vaddr, gpte); 533 do_set_pte(lg, pgdir, vaddr, gpte);
@@ -538,7 +548,7 @@ void guest_set_pte(struct lguest *lg,
538 * 548 *
539 * So with that in mind here's our code to to update a (top-level) PGD entry: 549 * So with that in mind here's our code to to update a (top-level) PGD entry:
540 */ 550 */
541void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) 551void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
542{ 552{
543 int pgdir; 553 int pgdir;
544 554
@@ -548,7 +558,7 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
548 return; 558 return;
549 559
550 /* If they're talking about a page table we have a shadow for... */ 560 /* If they're talking about a page table we have a shadow for... */
551 pgdir = find_pgdir(lg, cr3); 561 pgdir = find_pgdir(lg, gpgdir);
552 if (pgdir < ARRAY_SIZE(lg->pgdirs)) 562 if (pgdir < ARRAY_SIZE(lg->pgdirs))
553 /* ... throw it away. */ 563 /* ... throw it away. */
554 release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); 564 release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
@@ -560,21 +570,34 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
560 * its first page table is. We set some things up here: */ 570 * its first page table is. We set some things up here: */
561int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) 571int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
562{ 572{
563 /* In flush_user_mappings() we loop from 0 to
564 * "vaddr_to_pgd_index(lg->page_offset)". This assumes it won't hit
565 * the Switcher mappings, so check that now. */
566 if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
567 return -EINVAL;
568 /* We start on the first shadow page table, and give it a blank PGD 573 /* We start on the first shadow page table, and give it a blank PGD
569 * page. */ 574 * page. */
570 lg->pgdidx = 0; 575 lg->pgdidx = 0;
571 lg->pgdirs[lg->pgdidx].cr3 = pgtable; 576 lg->pgdirs[lg->pgdidx].gpgdir = pgtable;
572 lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL); 577 lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL);
573 if (!lg->pgdirs[lg->pgdidx].pgdir) 578 if (!lg->pgdirs[lg->pgdidx].pgdir)
574 return -ENOMEM; 579 return -ENOMEM;
575 return 0; 580 return 0;
576} 581}
577 582
583/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
584void page_table_guest_data_init(struct lguest *lg)
585{
586 /* We get the kernel address: above this is all kernel memory. */
587 if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address)
588 /* We tell the Guest that it can't use the top 4MB of virtual
589 * addresses used by the Switcher. */
590 || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
591 || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir))
592 kill_guest(lg, "bad guest page %p", lg->lguest_data);
593
594 /* In flush_user_mappings() we loop from 0 to
595 * "pgd_index(lg->kernel_address)". This assumes it won't hit the
596 * Switcher mappings, so check that now. */
597 if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX)
598 kill_guest(lg, "bad kernel address %#lx", lg->kernel_address);
599}
600
578/* When a Guest dies, our cleanup is fairly simple. */ 601/* When a Guest dies, our cleanup is fairly simple. */
579void free_guest_pagetable(struct lguest *lg) 602void free_guest_pagetable(struct lguest *lg)
580{ 603{
@@ -594,14 +617,14 @@ void free_guest_pagetable(struct lguest *lg)
594 * for each CPU already set up, we just need to hook them in. */ 617 * for each CPU already set up, we just need to hook them in. */
595void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) 618void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
596{ 619{
597 spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 620 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
598 spgd_t switcher_pgd; 621 pgd_t switcher_pgd;
599 spte_t regs_pte; 622 pte_t regs_pte;
600 623
601 /* Make the last PGD entry for this Guest point to the Switcher's PTE 624 /* Make the last PGD entry for this Guest point to the Switcher's PTE
602 * page for this CPU (with appropriate flags). */ 625 * page for this CPU (with appropriate flags). */
603 switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; 626 switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL);
604 switcher_pgd.flags = _PAGE_KERNEL; 627
605 lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; 628 lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
606 629
607 /* We also change the Switcher PTE page. When we're running the Guest, 630 /* We also change the Switcher PTE page. When we're running the Guest,
@@ -611,10 +634,8 @@ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
611 * CPU's "struct lguest_pages": if we make sure the Guest's register 634 * CPU's "struct lguest_pages": if we make sure the Guest's register
612 * page is already mapped there, we don't have to copy them out 635 * page is already mapped there, we don't have to copy them out
613 * again. */ 636 * again. */
614 regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; 637 regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL));
615 regs_pte.flags = _PAGE_KERNEL; 638 switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
616 switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE]
617 = regs_pte;
618} 639}
619/*:*/ 640/*:*/
620 641
@@ -635,24 +656,25 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
635 unsigned int pages) 656 unsigned int pages)
636{ 657{
637 unsigned int i; 658 unsigned int i;
638 spte_t *pte = switcher_pte_page(cpu); 659 pte_t *pte = switcher_pte_page(cpu);
639 660
640 /* The first entries are easy: they map the Switcher code. */ 661 /* The first entries are easy: they map the Switcher code. */
641 for (i = 0; i < pages; i++) { 662 for (i = 0; i < pages; i++) {
642 pte[i].pfn = page_to_pfn(switcher_page[i]); 663 pte[i] = mk_pte(switcher_page[i],
643 pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; 664 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
644 } 665 }
645 666
646 /* The only other thing we map is this CPU's pair of pages. */ 667 /* The only other thing we map is this CPU's pair of pages. */
647 i = pages + cpu*2; 668 i = pages + cpu*2;
648 669
649 /* First page (Guest registers) is writable from the Guest */ 670 /* First page (Guest registers) is writable from the Guest */
650 pte[i].pfn = page_to_pfn(switcher_page[i]); 671 pte[i] = pfn_pte(page_to_pfn(switcher_page[i]),
651 pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; 672 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW));
673
652 /* The second page contains the "struct lguest_ro_state", and is 674 /* The second page contains the "struct lguest_ro_state", and is
653 * read-only. */ 675 * read-only. */
654 pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); 676 pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]),
655 pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; 677 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
656} 678}
657 679
658/*H:510 At boot or module load time, init_pagetables() allocates and populates 680/*H:510 At boot or module load time, init_pagetables() allocates and populates
@@ -662,7 +684,7 @@ __init int init_pagetables(struct page **switcher_page, unsigned int pages)
662 unsigned int i; 684 unsigned int i;
663 685
664 for_each_possible_cpu(i) { 686 for_each_possible_cpu(i) {
665 switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL); 687 switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
666 if (!switcher_pte_page(i)) { 688 if (!switcher_pte_page(i)) {
667 free_switcher_pte_pages(); 689 free_switcher_pte_pages();
668 return -ENOMEM; 690 return -ENOMEM;
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index 9b81119f46e..c2434ec99f7 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -73,14 +73,14 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
73 /* Segment descriptors contain a privilege level: the Guest is 73 /* Segment descriptors contain a privilege level: the Guest is
74 * sometimes careless and leaves this as 0, even though it's 74 * sometimes careless and leaves this as 0, even though it's
75 * running at privilege level 1. If so, we fix it here. */ 75 * running at privilege level 1. If so, we fix it here. */
76 if ((lg->gdt[i].b & 0x00006000) == 0) 76 if ((lg->arch.gdt[i].b & 0x00006000) == 0)
77 lg->gdt[i].b |= (GUEST_PL << 13); 77 lg->arch.gdt[i].b |= (GUEST_PL << 13);
78 78
79 /* Each descriptor has an "accessed" bit. If we don't set it 79 /* Each descriptor has an "accessed" bit. If we don't set it
80 * now, the CPU will try to set it when the Guest first loads 80 * now, the CPU will try to set it when the Guest first loads
81 * that entry into a segment register. But the GDT isn't 81 * that entry into a segment register. But the GDT isn't
82 * writable by the Guest, so bad things can happen. */ 82 * writable by the Guest, so bad things can happen. */
83 lg->gdt[i].b |= 0x00000100; 83 lg->arch.gdt[i].b |= 0x00000100;
84 } 84 }
85} 85}
86 86
@@ -106,12 +106,12 @@ void setup_default_gdt_entries(struct lguest_ro_state *state)
106void setup_guest_gdt(struct lguest *lg) 106void setup_guest_gdt(struct lguest *lg)
107{ 107{
108 /* Start with full 0-4G segments... */ 108 /* Start with full 0-4G segments... */
109 lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; 109 lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
110 lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; 110 lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
111 /* ...except the Guest is allowed to use them, so set the privilege 111 /* ...except the Guest is allowed to use them, so set the privilege
112 * level appropriately in the flags. */ 112 * level appropriately in the flags. */
113 lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); 113 lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
114 lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 114 lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
115} 115}
116 116
117/* Like the IDT, we never simply use the GDT the Guest gives us. We set up the 117/* Like the IDT, we never simply use the GDT the Guest gives us. We set up the
@@ -126,7 +126,7 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
126 unsigned int i; 126 unsigned int i;
127 127
128 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) 128 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
129 gdt[i] = lg->gdt[i]; 129 gdt[i] = lg->arch.gdt[i];
130} 130}
131 131
132/* This is the full version */ 132/* This is the full version */
@@ -138,7 +138,7 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
138 * replaced. See ignored_gdt() above. */ 138 * replaced. See ignored_gdt() above. */
139 for (i = 0; i < GDT_ENTRIES; i++) 139 for (i = 0; i < GDT_ENTRIES; i++)
140 if (!ignored_gdt(i)) 140 if (!ignored_gdt(i))
141 gdt[i] = lg->gdt[i]; 141 gdt[i] = lg->arch.gdt[i];
142} 142}
143 143
144/* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */ 144/* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */
@@ -146,12 +146,12 @@ void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
146{ 146{
147 /* We assume the Guest has the same number of GDT entries as the 147 /* We assume the Guest has the same number of GDT entries as the
148 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ 148 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */
149 if (num > ARRAY_SIZE(lg->gdt)) 149 if (num > ARRAY_SIZE(lg->arch.gdt))
150 kill_guest(lg, "too many gdt entries %i", num); 150 kill_guest(lg, "too many gdt entries %i", num);
151 151
152 /* We read the whole thing in, then fix it up. */ 152 /* We read the whole thing in, then fix it up. */
153 lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0])); 153 __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0]));
154 fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt)); 154 fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt));
155 /* Mark that the GDT changed so the core knows it has to copy it again, 155 /* Mark that the GDT changed so the core knows it has to copy it again,
156 * even if the Guest is run on the same CPU. */ 156 * even if the Guest is run on the same CPU. */
157 lg->changed |= CHANGED_GDT; 157 lg->changed |= CHANGED_GDT;
@@ -159,9 +159,9 @@ void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
159 159
160void guest_load_tls(struct lguest *lg, unsigned long gtls) 160void guest_load_tls(struct lguest *lg, unsigned long gtls)
161{ 161{
162 struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN]; 162 struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN];
163 163
164 lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); 164 __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
165 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); 165 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
166 lg->changed |= CHANGED_GDT_TLS; 166 lg->changed |= CHANGED_GDT_TLS;
167} 167}
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
new file mode 100644
index 00000000000..9eed12d5a39
--- /dev/null
+++ b/drivers/lguest/x86/core.c
@@ -0,0 +1,577 @@
1/*
2 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
3 * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
13 * NON INFRINGEMENT. See the GNU General Public License for more
14 * details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20#include <linux/kernel.h>
21#include <linux/start_kernel.h>
22#include <linux/string.h>
23#include <linux/console.h>
24#include <linux/screen_info.h>
25#include <linux/irq.h>
26#include <linux/interrupt.h>
27#include <linux/clocksource.h>
28#include <linux/clockchips.h>
29#include <linux/cpu.h>
30#include <linux/lguest.h>
31#include <linux/lguest_launcher.h>
32#include <asm/paravirt.h>
33#include <asm/param.h>
34#include <asm/page.h>
35#include <asm/pgtable.h>
36#include <asm/desc.h>
37#include <asm/setup.h>
38#include <asm/lguest.h>
39#include <asm/uaccess.h>
40#include <asm/i387.h>
41#include "../lg.h"
42
43static int cpu_had_pge;
44
45static struct {
46 unsigned long offset;
47 unsigned short segment;
48} lguest_entry;
49
50/* Offset from where switcher.S was compiled to where we've copied it */
51static unsigned long switcher_offset(void)
52{
53 return SWITCHER_ADDR - (unsigned long)start_switcher_text;
54}
55
56/* This cpu's struct lguest_pages. */
57static struct lguest_pages *lguest_pages(unsigned int cpu)
58{
59 return &(((struct lguest_pages *)
60 (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
61}
62
63static DEFINE_PER_CPU(struct lguest *, last_guest);
64
65/*S:010
66 * We are getting close to the Switcher.
67 *
68 * Remember that each CPU has two pages which are visible to the Guest when it
69 * runs on that CPU. This has to contain the state for that Guest: we copy the
70 * state in just before we run the Guest.
71 *
72 * Each Guest has "changed" flags which indicate what has changed in the Guest
73 * since it last ran. We saw this set in interrupts_and_traps.c and
74 * segments.c.
75 */
76static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
77{
78 /* Copying all this data can be quite expensive. We usually run the
79 * same Guest we ran last time (and that Guest hasn't run anywhere else
80 * meanwhile). If that's not the case, we pretend everything in the
81 * Guest has changed. */
82 if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
83 __get_cpu_var(last_guest) = lg;
84 lg->last_pages = pages;
85 lg->changed = CHANGED_ALL;
86 }
87
88 /* These copies are pretty cheap, so we do them unconditionally: */
89 /* Save the current Host top-level page directory. */
90 pages->state.host_cr3 = __pa(current->mm->pgd);
91 /* Set up the Guest's page tables to see this CPU's pages (and no
92 * other CPU's pages). */
93 map_switcher_in_guest(lg, pages);
94 /* Set up the two "TSS" members which tell the CPU what stack to use
95 * for traps which go directly into the Guest (ie. traps at privilege
96 * level 1). */
97 pages->state.guest_tss.esp1 = lg->esp1;
98 pages->state.guest_tss.ss1 = lg->ss1;
99
100 /* Copy direct-to-Guest trap entries. */
101 if (lg->changed & CHANGED_IDT)
102 copy_traps(lg, pages->state.guest_idt, default_idt_entries);
103
104 /* Copy all GDT entries which the Guest can change. */
105 if (lg->changed & CHANGED_GDT)
106 copy_gdt(lg, pages->state.guest_gdt);
107 /* If only the TLS entries have changed, copy them. */
108 else if (lg->changed & CHANGED_GDT_TLS)
109 copy_gdt_tls(lg, pages->state.guest_gdt);
110
111 /* Mark the Guest as unchanged for next time. */
112 lg->changed = 0;
113}
114
115/* Finally: the code to actually call into the Switcher to run the Guest. */
116static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
117{
118 /* This is a dummy value we need for GCC's sake. */
119 unsigned int clobber;
120
121 /* Copy the guest-specific information into this CPU's "struct
122 * lguest_pages". */
123 copy_in_guest_info(lg, pages);
124
125 /* Set the trap number to 256 (impossible value). If we fault while
126 * switching to the Guest (bad segment registers or bug), this will
127 * cause us to abort the Guest. */
128 lg->regs->trapnum = 256;
129
130 /* Now: we push the "eflags" register on the stack, then do an "lcall".
131 * This is how we change from using the kernel code segment to using
132 * the dedicated lguest code segment, as well as jumping into the
133 * Switcher.
134 *
135 * The lcall also pushes the old code segment (KERNEL_CS) onto the
136 * stack, then the address of this call. This stack layout happens to
137 * exactly match the stack of an interrupt... */
138 asm volatile("pushf; lcall *lguest_entry"
139 /* This is how we tell GCC that %eax ("a") and %ebx ("b")
140 * are changed by this routine. The "=" means output. */
141 : "=a"(clobber), "=b"(clobber)
142 /* %eax contains the pages pointer. ("0" refers to the
143 * 0-th argument above, ie "a"). %ebx contains the
144 * physical address of the Guest's top-level page
145 * directory. */
146 : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
147 /* We tell gcc that all these registers could change,
148 * which means we don't have to save and restore them in
149 * the Switcher. */
150 : "memory", "%edx", "%ecx", "%edi", "%esi");
151}
152/*:*/
153
154/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts
155 * are disabled: we own the CPU. */
156void lguest_arch_run_guest(struct lguest *lg)
157{
158 /* Remember the awfully-named TS bit? If the Guest has asked
159 * to set it we set it now, so we can trap and pass that trap
160 * to the Guest if it uses the FPU. */
161 if (lg->ts)
162 lguest_set_ts();
163
164 /* SYSENTER is an optimized way of doing system calls. We
165 * can't allow it because it always jumps to privilege level 0.
166 * A normal Guest won't try it because we don't advertise it in
167 * CPUID, but a malicious Guest (or malicious Guest userspace
168 * program) could, so we tell the CPU to disable it before
169 * running the Guest. */
170 if (boot_cpu_has(X86_FEATURE_SEP))
171 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
172
173 /* Now we actually run the Guest. It will pop back out when
174 * something interesting happens, and we can examine its
175 * registers to see what it was doing. */
176 run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
177
178 /* The "regs" pointer contains two extra entries which are not
179 * really registers: a trap number which says what interrupt or
180 * trap made the switcher code come back, and an error code
181 * which some traps set. */
182
183 /* If the Guest page faulted, then the cr2 register will tell
184 * us the bad virtual address. We have to grab this now,
185 * because once we re-enable interrupts an interrupt could
186 * fault and thus overwrite cr2, or we could even move off to a
187 * different CPU. */
188 if (lg->regs->trapnum == 14)
189 lg->arch.last_pagefault = read_cr2();
190 /* Similarly, if we took a trap because the Guest used the FPU,
191 * we have to restore the FPU it expects to see. */
192 else if (lg->regs->trapnum == 7)
193 math_state_restore();
194
195 /* Restore SYSENTER if it's supposed to be on. */
196 if (boot_cpu_has(X86_FEATURE_SEP))
197 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
198}
199
200/*H:130 Our Guest is usually so well behaved; it never tries to do things it
201 * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't
202 * quite complete, because it doesn't contain replacements for the Intel I/O
203 * instructions. As a result, the Guest sometimes fumbles across one during
204 * the boot process as it probes for various things which are usually attached
205 * to a PC.
206 *
207 * When the Guest uses one of these instructions, we get trap #13 (General
208 * Protection Fault) and come here. We see if it's one of those troublesome
209 * instructions and skip over it. We return true if we did. */
210static int emulate_insn(struct lguest *lg)
211{
212 u8 insn;
213 unsigned int insnlen = 0, in = 0, shift = 0;
214 /* The eip contains the *virtual* address of the Guest's instruction:
215 * guest_pa just subtracts the Guest's page_offset. */
216 unsigned long physaddr = guest_pa(lg, lg->regs->eip);
217
218 /* This must be the Guest kernel trying to do something, not userspace!
219 * The bottom two bits of the CS segment register are the privilege
220 * level. */
221 if ((lg->regs->cs & 3) != GUEST_PL)
222 return 0;
223
224 /* Decoding x86 instructions is icky. */
225 insn = lgread(lg, physaddr, u8);
226
227 /* 0x66 is the operand-size override prefix: it selects a 16-bit operand
228 (%ax), not the upper 16 bits of %eax, so the shift below is suspect. */
229 if (insn == 0x66) {
230 shift = 16;
231 /* The instruction is 1 byte so far, read the next byte. */
232 insnlen = 1;
233 insn = lgread(lg, physaddr + insnlen, u8);
234 }
235
236 /* We can ignore the lower bit for the moment and decode the 4 opcodes
237 * we need to emulate. */
238 switch (insn & 0xFE) {
239 case 0xE4: /* in <next byte>,%al */
240 insnlen += 2;
241 in = 1;
242 break;
243 case 0xEC: /* in (%dx),%al */
244 insnlen += 1;
245 in = 1;
246 break;
247 case 0xE6: /* out %al,<next byte> */
248 insnlen += 2;
249 break;
250 case 0xEE: /* out %al,(%dx) */
251 insnlen += 1;
252 break;
253 default:
254 /* OK, we don't know what this is, can't emulate. */
255 return 0;
256 }
257
258 /* If it was an "IN" instruction, they expect the result to be read
259 * into %eax, so we change %eax. We always return all-ones, which
260 * traditionally means "there's nothing there". */
261 if (in) {
262 /* Lower bit tells us whether it's an 8-bit (0) or 16/32-bit (1) access */
263 if (insn & 0x1)
264 lg->regs->eax = 0xFFFFFFFF;
265 else
266 lg->regs->eax |= (0xFFFF << shift);
267 }
268 /* Finally, we've "done" the instruction, so move past it. */
269 lg->regs->eip += insnlen;
270 /* Success! */
271 return 1;
272}
273
274/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
275void lguest_arch_handle_trap(struct lguest *lg)
276{
277 switch (lg->regs->trapnum) {
278 case 13: /* We've intercepted a GPF. */
279 /* Check if this was one of those annoying IN or OUT
280 * instructions which we need to emulate. If so, we
281 * just go back into the Guest after we've done it. */
282 if (lg->regs->errcode == 0) {
283 if (emulate_insn(lg))
284 return;
285 }
286 break;
287 case 14: /* We've intercepted a page fault. */
288 /* The Guest accessed a virtual address that wasn't
289 * mapped. This happens a lot: we don't actually set
290 * up most of the page tables for the Guest at all when
291 * we start: as it runs it asks for more and more, and
292 * we set them up as required. In this case, we don't
293 * even tell the Guest that the fault happened.
294 *
295 * The errcode tells whether this was a read or a
296 * write, and whether kernel or userspace code. */
297 if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))
298 return;
299
300 /* OK, it's really not there (or not OK): the Guest
301 * needs to know. We write out the cr2 value so it
302 * knows where the fault occurred.
303 *
304 * Note that if the Guest were really messed up, this
305 * could happen before it's done the INITIALIZE
306 * hypercall, so lg->lguest_data will be NULL */
307 if (lg->lguest_data &&
308 put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))
309 kill_guest(lg, "Writing cr2");
310 break;
311 case 7: /* We've intercepted a Device Not Available fault. */
312 /* If the Guest doesn't want to know, we already
313 * restored the Floating Point Unit, so we just
314 * continue without telling it. */
315 if (!lg->ts)
316 return;
317 break;
318 case 32 ... 255:
319 /* These values mean a real interrupt occurred, in which case
320 * the Host handler has already been run. We just do a
321 * friendly check if another process should now be run, then
322 * return to run the Guest again */
323 cond_resched();
324 return;
325 case LGUEST_TRAP_ENTRY:
326 /* Our 'struct hcall_args' maps directly over our regs: we set
327 * up the pointer now to indicate a hypercall is pending. */
328 lg->hcall = (struct hcall_args *)lg->regs;
329 return;
330 }
331
332 /* We didn't handle the trap, so it needs to go to the Guest. */
333 if (!deliver_trap(lg, lg->regs->trapnum))
334 /* If the Guest doesn't have a handler (either it hasn't
335 * registered any yet, or it's one of the faults we don't let
336 * it handle), it dies with a cryptic error message. */
337 kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
338 lg->regs->trapnum, lg->regs->eip,
339 lg->regs->trapnum == 14 ? lg->arch.last_pagefault
340 : lg->regs->errcode);
341}
342
343/* Now we can look at each of the routines this calls, in increasing order of
344 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
345 * deliver_trap() and demand_page(). After all those, we'll be ready to
346 * examine the Switcher, and our philosophical understanding of the Host/Guest
347 * duality will be complete. :*/
348static void adjust_pge(void *on)
349{
350 if (on)
351 write_cr4(read_cr4() | X86_CR4_PGE);
352 else
353 write_cr4(read_cr4() & ~X86_CR4_PGE);
354}
355
356/*H:020 Now the Switcher is mapped and everything else is ready, we need to do
357 * some more i386-specific initialization. */
358void __init lguest_arch_host_init(void)
359{
360 int i;
361
362 /* Most of the i386/switcher.S doesn't care that it's been moved; on
363 * Intel, jumps are relative, and it doesn't access any references to
364 * external code or data.
365 *
366 * The only exception is the interrupt handlers in switcher.S: their
367 * addresses are placed in a table (default_idt_entries), so we need to
368 * update the table with the new addresses. switcher_offset() is a
369 * convenience function which returns the distance between the builtin
370 * switcher code and the high-mapped copy we just made. */
371 for (i = 0; i < IDT_ENTRIES; i++)
372 default_idt_entries[i] += switcher_offset();
373
374 /*
375 * Set up the Switcher's per-cpu areas.
376 *
377 * Each CPU gets two pages of its own within the high-mapped region
378 * (aka. "struct lguest_pages"). Much of this can be initialized now,
379 * but some depends on what Guest we are running (which is set up in
380 * copy_in_guest_info()).
381 */
382 for_each_possible_cpu(i) {
383 /* lguest_pages() returns this CPU's two pages. */
384 struct lguest_pages *pages = lguest_pages(i);
385 /* This is a convenience pointer to make the code fit one
386 * statement to a line. */
387 struct lguest_ro_state *state = &pages->state;
388
389 /* The Global Descriptor Table: the Host has a different one
390 * for each CPU. We keep a descriptor for the GDT which says
391 * where it is and how big it is (the size is actually the last
392 * byte, not the size, hence the "-1"). */
393 state->host_gdt_desc.size = GDT_SIZE-1;
394 state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
395
396 /* All CPUs on the Host use the same Interrupt Descriptor
397 * Table, so we just use store_idt(), which gets this CPU's IDT
398 * descriptor. */
399 store_idt(&state->host_idt_desc);
400
401 /* The descriptors for the Guest's GDT and IDT can be filled
402 * out now, too. We copy the GDT & IDT into ->guest_gdt and
403 * ->guest_idt before actually running the Guest. */
404 state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
405 state->guest_idt_desc.address = (long)&state->guest_idt;
406 state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
407 state->guest_gdt_desc.address = (long)&state->guest_gdt;
408
409 /* We know where we want the stack to be when the Guest enters
410 * the switcher: in pages->regs. The stack grows downwards, so
411 * we start it at the end of that structure. */
412 state->guest_tss.esp0 = (long)(&pages->regs + 1);
413 /* And this is the GDT entry to use for the stack: we keep a
414 * couple of special LGUEST entries. */
415 state->guest_tss.ss0 = LGUEST_DS;
416
417 /* x86 can have a finegrained bitmap which indicates what I/O
418 * ports the process can use. We set it to the end of our
419 * structure, meaning "none". */
420 state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
421
422 /* Some GDT entries are the same across all Guests, so we can
423 * set them up now. */
424 setup_default_gdt_entries(state);
425 /* Most IDT entries are the same for all Guests, too.*/
426 setup_default_idt_entries(state, default_idt_entries);
427
428 /* The Host needs to be able to use the LGUEST segments on this
429 * CPU, too, so put them in the Host GDT. */
430 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
431 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
432 }
433
434 /* In the Switcher, we want the %cs segment register to use the
435 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
436 * it will be undisturbed when we switch. To change %cs and jump we
437 * need this structure to feed to Intel's "lcall" instruction. */
438 lguest_entry.offset = (long)switch_to_guest + switcher_offset();
439 lguest_entry.segment = LGUEST_CS;
440
441 /* Finally, we need to turn off "Page Global Enable". PGE is an
442 * optimization where page table entries are specially marked to show
443 * they never change. The Host kernel marks all the kernel pages this
444 * way because it's always present, even when userspace is running.
445 *
446 * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
447 * switch to the Guest kernel. If you don't disable this on all CPUs,
448 * you'll get really weird bugs that you'll chase for two days.
449 *
450 * I used to turn PGE off every time we switched to the Guest and back
451 * on when we return, but that slowed the Switcher down noticeably. */
452
453 /* We don't need the complexity of CPUs coming and going while we're
454 * doing this. */
455 lock_cpu_hotplug();
456 if (cpu_has_pge) { /* We have a broader idea of "global". */
457 /* Remember that this was originally set (for cleanup). */
458 cpu_had_pge = 1;
459 /* adjust_pge is a helper function which sets or unsets the PGE
460 * bit on its CPU, depending on the argument (0 == unset). */
461 on_each_cpu(adjust_pge, (void *)0, 0, 1);
462 /* Turn off the feature in the global feature set. */
463 clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
464 }
465 unlock_cpu_hotplug();
466};
467/*:*/
468
469void __exit lguest_arch_host_fini(void)
470{
471 /* If we had PGE before we started, turn it back on now. */
472 lock_cpu_hotplug();
473 if (cpu_had_pge) {
474 set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
475 /* adjust_pge's argument "1" means set PGE. */
476 on_each_cpu(adjust_pge, (void *)1, 0, 1);
477 }
478 unlock_cpu_hotplug();
479}
480
481
482/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
483int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args)
484{
485 switch (args->arg0) {
486 case LHCALL_LOAD_GDT:
487 load_guest_gdt(lg, args->arg1, args->arg2);
488 break;
489 case LHCALL_LOAD_IDT_ENTRY:
490 load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3);
491 break;
492 case LHCALL_LOAD_TLS:
493 guest_load_tls(lg, args->arg1);
494 break;
495 default:
496 /* Bad Guest. Bad! */
497 return -EIO;
498 }
499 return 0;
500}
501
502/*H:126 i386-specific hypercall initialization: */
503int lguest_arch_init_hypercalls(struct lguest *lg)
504{
505 u32 tsc_speed;
506
507 /* The pointer to the Guest's "struct lguest_data" is the only
508 * argument. We check that address now. */
509 if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data)))
510 return -EFAULT;
511
512 /* Having checked it, we simply set lg->lguest_data to point straight
513 * into the Launcher's memory at the right place and then use
514 * copy_to_user/from_user from now on, instead of lgread/write. I put
515 * this in to show that I'm not immune to writing stupid
516 * optimizations. */
517 lg->lguest_data = lg->mem_base + lg->hcall->arg1;
518
519 /* We insist that the Time Stamp Counter exist and doesn't change with
520 * cpu frequency. Some devious chip manufacturers decided that TSC
521 * changes could be handled in software. I decided that time going
522 * backwards might be good for benchmarks, but it's bad for users.
523 *
524 * We also insist that the TSC be stable: the kernel detects unreliable
525 * TSCs for its own purposes, and we use that here. */
526 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
527 tsc_speed = tsc_khz;
528 else
529 tsc_speed = 0;
530 if (put_user(tsc_speed, &lg->lguest_data->tsc_khz))
531 return -EFAULT;
532
533 /* The interrupt code might not like the system call vector. */
534 if (!check_syscall_vector(lg))
535 kill_guest(lg, "bad syscall vector");
536
537 return 0;
538}
539/* Now we've examined the hypercall code; our Guest can make requests. There
540 * is one other way we can do things for the Guest, as we see in
541 * emulate_insn(). :*/
542
543/*L:030 lguest_arch_setup_regs()
544 *
545 * Most of the Guest's registers are left alone: we used get_zeroed_page() to
546 * allocate the structure, so they will be 0. */
547void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
548{
549 struct lguest_regs *regs = lg->regs;
550
551 /* There are four "segment" registers which the Guest needs to boot:
552 * The "code segment" register (cs) refers to the kernel code segment
553 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
554 * refer to the kernel data segment __KERNEL_DS.
555 *
556 * The privilege level is packed into the lower bits. The Guest runs
557 * at privilege level 1 (GUEST_PL).*/
558 regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
559 regs->cs = __KERNEL_CS|GUEST_PL;
560
561 /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002)
562 * is supposed to always be "1". Bit 9 (0x200) controls whether
563 * interrupts are enabled. We always leave interrupts enabled while
564 * running the Guest. */
565 regs->eflags = 0x202;
566
567 /* The "Extended Instruction Pointer" register says where the Guest is
568 * running. */
569 regs->eip = start;
570
571 /* %esi points to our boot information, at physical address 0, so don't
572 * touch it. */
573 /* There are a couple of GDT entries the Guest expects when first
574 * booting. */
575
576 setup_guest_gdt(lg);
577}
diff --git a/drivers/lguest/switcher.S b/drivers/lguest/x86/switcher_32.S
index 7c9c230cc84..1010b90b11f 100644
--- a/drivers/lguest/switcher.S
+++ b/drivers/lguest/x86/switcher_32.S
@@ -48,7 +48,8 @@
48#include <linux/linkage.h> 48#include <linux/linkage.h>
49#include <asm/asm-offsets.h> 49#include <asm/asm-offsets.h>
50#include <asm/page.h> 50#include <asm/page.h>
51#include "lg.h" 51#include <asm/segment.h>
52#include <asm/lguest.h>
52 53
53// We mark the start of the code to copy 54// We mark the start of the code to copy
54// It's placed in .text tho it's never run here 55// It's placed in .text tho it's never run here
@@ -132,6 +133,7 @@ ENTRY(switch_to_guest)
132 // The Guest's register page has been mapped 133 // The Guest's register page has been mapped
133 // Writable onto our %esp (stack) -- 134 // Writable onto our %esp (stack) --
134 // We can simply pop off all Guest regs. 135 // We can simply pop off all Guest regs.
136 popl %eax
135 popl %ebx 137 popl %ebx
136 popl %ecx 138 popl %ecx
137 popl %edx 139 popl %edx
@@ -139,7 +141,6 @@ ENTRY(switch_to_guest)
139 popl %edi 141 popl %edi
140 popl %ebp 142 popl %ebp
141 popl %gs 143 popl %gs
142 popl %eax
143 popl %fs 144 popl %fs
144 popl %ds 145 popl %ds
145 popl %es 146 popl %es
@@ -167,7 +168,6 @@ ENTRY(switch_to_guest)
167 pushl %es; \ 168 pushl %es; \
168 pushl %ds; \ 169 pushl %ds; \
169 pushl %fs; \ 170 pushl %fs; \
170 pushl %eax; \
171 pushl %gs; \ 171 pushl %gs; \
172 pushl %ebp; \ 172 pushl %ebp; \
173 pushl %edi; \ 173 pushl %edi; \
@@ -175,6 +175,7 @@ ENTRY(switch_to_guest)
175 pushl %edx; \ 175 pushl %edx; \
176 pushl %ecx; \ 176 pushl %ecx; \
177 pushl %ebx; \ 177 pushl %ebx; \
178 pushl %eax; \
178 /* Our stack and our code are using segments \ 179 /* Our stack and our code are using segments \
179 * Set in the TSS and IDT \ 180 * Set in the TSS and IDT \
180 * Yet if we were to touch data we'd use \ 181 * Yet if we were to touch data we'd use \
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index ce34b539bf3..2538816817a 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -3100,4 +3100,10 @@ config NETPOLL_TRAP
3100config NET_POLL_CONTROLLER 3100config NET_POLL_CONTROLLER
3101 def_bool NETPOLL 3101 def_bool NETPOLL
3102 3102
3103config VIRTIO_NET
3104 tristate "Virtio network driver (EXPERIMENTAL)"
3105 depends on EXPERIMENTAL && VIRTIO
3106 ---help---
3107 This is the virtual network driver for lguest. Say Y or M.
3108
3103endif # NETDEVICES 3109endif # NETDEVICES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 22f78cbd126..593262065c9 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -183,7 +183,6 @@ obj-$(CONFIG_ZORRO8390) += zorro8390.o
183obj-$(CONFIG_HPLANCE) += hplance.o 7990.o 183obj-$(CONFIG_HPLANCE) += hplance.o 7990.o
184obj-$(CONFIG_MVME147_NET) += mvme147.o 7990.o 184obj-$(CONFIG_MVME147_NET) += mvme147.o 7990.o
185obj-$(CONFIG_EQUALIZER) += eql.o 185obj-$(CONFIG_EQUALIZER) += eql.o
186obj-$(CONFIG_LGUEST_NET) += lguest_net.o
187obj-$(CONFIG_MIPS_JAZZ_SONIC) += jazzsonic.o 186obj-$(CONFIG_MIPS_JAZZ_SONIC) += jazzsonic.o
188obj-$(CONFIG_MIPS_AU1X00_ENET) += au1000_eth.o 187obj-$(CONFIG_MIPS_AU1X00_ENET) += au1000_eth.o
189obj-$(CONFIG_MIPS_SIM_NET) += mipsnet.o 188obj-$(CONFIG_MIPS_SIM_NET) += mipsnet.o
@@ -243,3 +242,4 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
243 242
244obj-$(CONFIG_NETXEN_NIC) += netxen/ 243obj-$(CONFIG_NETXEN_NIC) += netxen/
245obj-$(CONFIG_NIU) += niu.o 244obj-$(CONFIG_NIU) += niu.o
245obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
diff --git a/drivers/net/lguest_net.c b/drivers/net/lguest_net.c
deleted file mode 100644
index abce2ee8430..00000000000
--- a/drivers/net/lguest_net.c
+++ /dev/null
@@ -1,555 +0,0 @@
1/*D:500
2 * The Guest network driver.
3 *
4 * This is a very simple virtual network driver, and our last Guest driver.
5 * The only trick is that it can talk directly to multiple other recipients
6 * (ie. other Guests on the same network). It can also be used with only the
7 * Host on the network.
8 :*/
9
10/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 */
26//#define DEBUG
27#include <linux/netdevice.h>
28#include <linux/etherdevice.h>
29#include <linux/module.h>
30#include <linux/mm_types.h>
31#include <linux/io.h>
32#include <linux/lguest_bus.h>
33
34#define SHARED_SIZE PAGE_SIZE
35#define MAX_LANS 4
36#define NUM_SKBS 8
37
38/*M:011 Network code master Jeff Garzik points out numerous shortcomings in
39 * this driver if it aspires to greatness.
40 *
41 * Firstly, it doesn't use "NAPI": the networking's New API, and is poorer for
42 * it. As he says "NAPI means system-wide load leveling, across multiple
43 * network interfaces. Lack of NAPI can mean competition at higher loads."
44 *
45 * He also points out that we don't implement set_mac_address, so users cannot
46 * change the devices hardware address. When I asked why one would want to:
47 * "Bonding, and situations where you /do/ want the MAC address to "leak" out
48 * of the host onto the wider net."
49 *
50 * Finally, he would like module unloading: "It is not unrealistic to think of
51 * [un|re|]loading the net support module in an lguest guest. And, adding
52 * module support makes the programmer more responsible, because they now have
53 * to learn to clean up after themselves. Any driver that cannot clean up
54 * after itself is an incomplete driver in my book."
55 :*/
56
57/*D:530 The "struct lguestnet_info" contains all the information we need to
58 * know about the network device. */
59struct lguestnet_info
60{
61 /* The mapped device page(s) (an array of "struct lguest_net"). */
62 struct lguest_net *peer;
63 /* The physical address of the device page(s) */
64 unsigned long peer_phys;
65 /* The size of the device page(s). */
66 unsigned long mapsize;
67
68 /* The lguest_device I come from */
69 struct lguest_device *lgdev;
70
71 /* My peerid (ie. my slot in the array). */
72 unsigned int me;
73
74 /* Receive queue: the network packets waiting to be filled. */
75 struct sk_buff *skb[NUM_SKBS];
76 struct lguest_dma dma[NUM_SKBS];
77};
78/*:*/
79
80/* How many bytes left in this page. */
81static unsigned int rest_of_page(void *data)
82{
83 return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE);
84}
85
86/*D:570 Each peer (ie. Guest or Host) on the network binds their receive
87 * buffers to a different key: we simply use the physical address of the
88 * device's memory page plus the peer number. The Host insists that all keys
89 * be a multiple of 4, so we multiply the peer number by 4. */
90static unsigned long peer_key(struct lguestnet_info *info, unsigned peernum)
91{
92 return info->peer_phys + 4 * peernum;
93}
94
95/* This is the routine which sets up a "struct lguest_dma" to point to a
96 * network packet, similar to req_to_dma() in lguest_blk.c. The structure of a
97 * "struct sk_buff" has grown complex over the years: it consists of a "head"
98 * linear section pointed to by "skb->data", and possibly an array of
99 * "fragments" in the case of a non-linear packet.
100 *
101 * Our receive buffers don't use fragments at all but outgoing skbs might, so
102 * we handle it. */
103static void skb_to_dma(const struct sk_buff *skb, unsigned int headlen,
104 struct lguest_dma *dma)
105{
106 unsigned int i, seg;
107
108 /* First, we put the linear region into the "struct lguest_dma". Each
109 * entry can't go over a page boundary, so even though all our packets
110 * are 1514 bytes or less, we might need to use two entries here: */
111 for (i = seg = 0; i < headlen; seg++, i += rest_of_page(skb->data+i)) {
112 dma->addr[seg] = virt_to_phys(skb->data + i);
113 dma->len[seg] = min((unsigned)(headlen - i),
114 rest_of_page(skb->data + i));
115 }
116
117 /* Now we handle the fragments: at least they're guaranteed not to go
118 * over a page. skb_shinfo(skb) returns a pointer to the structure
119 * which tells us about the number of fragments and the fragment
120 * array. */
121 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, seg++) {
122 const skb_frag_t *f = &skb_shinfo(skb)->frags[i];
123 /* Should not happen with MTU less than 64k - 2 * PAGE_SIZE. */
124 if (seg == LGUEST_MAX_DMA_SECTIONS) {
125 /* We will end up sending a truncated packet should
126 * this ever happen. Plus, a cool log message! */
127 printk("Woah dude! Megapacket!\n");
128 break;
129 }
130 dma->addr[seg] = page_to_phys(f->page) + f->page_offset;
131 dma->len[seg] = f->size;
132 }
133
134 /* If after all that we didn't use the entire "struct lguest_dma"
135 * array, we terminate it with a 0 length. */
136 if (seg < LGUEST_MAX_DMA_SECTIONS)
137 dma->len[seg] = 0;
138}
139
140/*
141 * Packet transmission.
142 *
143 * Our packet transmission is a little unusual. A real network card would just
144 * send out the packet and leave the receivers to decide if they're interested.
145 * Instead, we look through the network device memory page and see if any of
146 * the ethernet addresses match the packet destination, and if so we send it to
147 * that Guest.
148 *
149 * This is made a little more complicated in two cases. The first case is
150 * broadcast packets: for that we send the packet to all Guests on the network,
151 * one at a time. The second case is "promiscuous" mode, where a Guest wants
152 * to see all the packets on the network. We need a way for the Guest to tell
153 * us it wants to see all packets, so it sets the "multicast" bit on its
154 * published MAC address, which is never valid in a real ethernet address.
155 */
156#define PROMISC_BIT 0x01
157
158/* This is the callback which is summoned whenever the network device's
159 * multicast or promiscuous state changes. If the card is in promiscuous mode,
160 * we advertise that in our ethernet address in the device's memory. We do the
161 * same if Linux wants any or all multicast traffic. */
162static void lguestnet_set_multicast(struct net_device *dev)
163{
164 struct lguestnet_info *info = netdev_priv(dev);
165
166 if ((dev->flags & (IFF_PROMISC|IFF_ALLMULTI)) || dev->mc_count)
167 info->peer[info->me].mac[0] |= PROMISC_BIT;
168 else
169 info->peer[info->me].mac[0] &= ~PROMISC_BIT;
170}
171
172/* A simple test function to see if a peer wants to see all packets.*/
173static int promisc(struct lguestnet_info *info, unsigned int peer)
174{
175 return info->peer[peer].mac[0] & PROMISC_BIT;
176}
177
178/* Another simple function to see if a peer's advertised ethernet address
179 * matches a packet's destination ethernet address. */
180static int mac_eq(const unsigned char mac[ETH_ALEN],
181 struct lguestnet_info *info, unsigned int peer)
182{
183 /* Ignore multicast bit, which peer turns on to mean promisc. */
184 if ((info->peer[peer].mac[0] & (~PROMISC_BIT)) != mac[0])
185 return 0;
186 return memcmp(mac+1, info->peer[peer].mac+1, ETH_ALEN-1) == 0;
187}
188
189/* This is the function which actually sends a packet once we've decided a
190 * peer wants it: */
191static void transfer_packet(struct net_device *dev,
192 struct sk_buff *skb,
193 unsigned int peernum)
194{
195 struct lguestnet_info *info = netdev_priv(dev);
196 struct lguest_dma dma;
197
198 /* We use our handy "struct lguest_dma" packing function to prepare
199 * the skb for sending. */
200 skb_to_dma(skb, skb_headlen(skb), &dma);
201 pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len);
202
203 /* This is the actual send call which copies the packet. */
204 lguest_send_dma(peer_key(info, peernum), &dma);
205
206 /* Check that the entire packet was transmitted. If not, it could mean
207 * that the other Guest registered a short receive buffer, but this
208 * driver should never do that. More likely, the peer is dead. */
209 if (dma.used_len != skb->len) {
210 dev->stats.tx_carrier_errors++;
211 pr_debug("Bad xfer to peer %i: %i of %i (dma %p/%i)\n",
212 peernum, dma.used_len, skb->len,
213 (void *)dma.addr[0], dma.len[0]);
214 } else {
215 /* On success we update the stats. */
216 dev->stats.tx_bytes += skb->len;
217 dev->stats.tx_packets++;
218 }
219}
220
221/* Another helper function to tell is if a slot in the device memory is unused.
222 * Since we always set the Local Assignment bit in the ethernet address, the
223 * first byte can never be 0. */
224static int unused_peer(const struct lguest_net peer[], unsigned int num)
225{
226 return peer[num].mac[0] == 0;
227}
228
229/* Finally, here is the routine which handles an outgoing packet. It's called
230 * "start_xmit" for traditional reasons. */
231static int lguestnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
232{
233 unsigned int i;
234 int broadcast;
235 struct lguestnet_info *info = netdev_priv(dev);
236 /* Extract the destination ethernet address from the packet. */
237 const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
238 DECLARE_MAC_BUF(mac);
239
240 pr_debug("%s: xmit %s\n", dev->name, print_mac(mac, dest));
241
242 /* If it's a multicast packet, we broadcast to everyone. That's not
243 * very efficient, but there are very few applications which actually
244 * use multicast, which is a shame really.
245 *
246 * As etherdevice.h points out: "By definition the broadcast address is
247 * also a multicast address." So we don't have to test for broadcast
248 * packets separately. */
249 broadcast = is_multicast_ether_addr(dest);
250
251 /* Look through all the published ethernet addresses to see if we
252 * should send this packet. */
253 for (i = 0; i < info->mapsize/sizeof(struct lguest_net); i++) {
254 /* We don't send to ourselves (we actually can't SEND_DMA to
255 * ourselves anyway), and don't send to unused slots.*/
256 if (i == info->me || unused_peer(info->peer, i))
257 continue;
258
259 /* If it's broadcast we send it. If they want every packet we
260 * send it. If the destination matches their address we send
261 * it. Otherwise we go to the next peer. */
262 if (!broadcast && !promisc(info, i) && !mac_eq(dest, info, i))
263 continue;
264
265 pr_debug("lguestnet %s: sending from %i to %i\n",
266 dev->name, info->me, i);
267 /* Our routine which actually does the transfer. */
268 transfer_packet(dev, skb, i);
269 }
270
271 /* An xmit routine is expected to dispose of the packet, so we do. */
272 dev_kfree_skb(skb);
273
274 /* As per kernel convention, 0 means success. This is why I love
275 * networking: even if we never sent to anyone, that's still
276 * success! */
277 return 0;
278}
279
280/*D:560
281 * Packet receiving.
282 *
283 * First, here's a helper routine which fills one of our array of receive
284 * buffers: */
285static int fill_slot(struct net_device *dev, unsigned int slot)
286{
287 struct lguestnet_info *info = netdev_priv(dev);
288
289 /* We can receive ETH_DATA_LEN (1500) byte packets, plus a standard
290 * ethernet header of ETH_HLEN (14) bytes. */
291 info->skb[slot] = netdev_alloc_skb(dev, ETH_HLEN + ETH_DATA_LEN);
292 if (!info->skb[slot]) {
293 printk("%s: could not fill slot %i\n", dev->name, slot);
294 return -ENOMEM;
295 }
296
297 /* skb_to_dma() is a helper which sets up the "struct lguest_dma" to
298 * point to the data in the skb: we also use it for sending out a
299 * packet. */
300 skb_to_dma(info->skb[slot], ETH_HLEN + ETH_DATA_LEN, &info->dma[slot]);
301
302 /* This is a Write Memory Barrier: it ensures that the entry in the
303 * receive buffer array is written *before* we set the "used_len" entry
304 * to 0. If the Host were looking at the receive buffer array from a
305 * different CPU, it could potentially see "used_len = 0" and not see
306 * the updated receive buffer information. This would be a horribly
307 * nasty bug, so make sure the compiler and CPU know this has to happen
308 * first. */
309 wmb();
310 /* Writing 0 to "used_len" tells the Host it can use this receive
311 * buffer now. */
312 info->dma[slot].used_len = 0;
313 return 0;
314}
315
316/* This is the actual receive routine. When we receive an interrupt from the
317 * Host to tell us a packet has been delivered, we arrive here: */
318static irqreturn_t lguestnet_rcv(int irq, void *dev_id)
319{
320 struct net_device *dev = dev_id;
321 struct lguestnet_info *info = netdev_priv(dev);
322 unsigned int i, done = 0;
323
324 /* Look through our entire receive array for an entry which has data
325 * in it. */
326 for (i = 0; i < ARRAY_SIZE(info->dma); i++) {
327 unsigned int length;
328 struct sk_buff *skb;
329
330 length = info->dma[i].used_len;
331 if (length == 0)
332 continue;
333
334 /* We've found one! Remember the skb (we grabbed the length
335 * above), and immediately refill the slot we've taken it
336 * from. */
337 done++;
338 skb = info->skb[i];
339 fill_slot(dev, i);
340
341 /* This shouldn't happen: micropackets could be sent by a
342 * badly-behaved Guest on the network, but the Host will never
343 * stuff more data in the buffer than the buffer length. */
344 if (length < ETH_HLEN || length > ETH_HLEN + ETH_DATA_LEN) {
345 pr_debug(KERN_WARNING "%s: unbelievable skb len: %i\n",
346 dev->name, length);
347 dev_kfree_skb(skb);
348 continue;
349 }
350
351 /* skb_put(), what a great function! I've ranted about this
352 * function before (http://lkml.org/lkml/1999/9/26/24). You
353 * call it after you've added data to the end of an skb (in
354 * this case, it was the Host which wrote the data). */
355 skb_put(skb, length);
356
357 /* The ethernet header contains a protocol field: we use the
358 * standard helper to extract it, and place the result in
359 * skb->protocol. The helper also sets up skb->pkt_type and
360 * eats up the ethernet header from the front of the packet. */
361 skb->protocol = eth_type_trans(skb, dev);
362
363 /* If this device doesn't need checksums for sending, we also
364 * don't need to check the packets when they come in. */
365 if (dev->features & NETIF_F_NO_CSUM)
366 skb->ip_summed = CHECKSUM_UNNECESSARY;
367
368 /* As a last resort for debugging the driver or the lguest I/O
369 * subsystem, you can uncomment the "#define DEBUG" at the top
370 * of this file, which turns all the pr_debug() into printk()
371 * and floods the logs. */
372 pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
373 ntohs(skb->protocol), skb->len, skb->pkt_type);
374
375 /* Update the packet and byte counts (visible from ifconfig,
376 * and good for debugging). */
377 dev->stats.rx_bytes += skb->len;
378 dev->stats.rx_packets++;
379
380 /* Hand our fresh network packet into the stack's "network
381 * interface receive" routine. That will free the packet
382 * itself when it's finished. */
383 netif_rx(skb);
384 }
385
386 /* If we found any packets, we assume the interrupt was for us. */
387 return done ? IRQ_HANDLED : IRQ_NONE;
388}
389
390/*D:550 This is where we start: when the device is brought up by dhcpd or
391 * ifconfig. At this point we advertise our MAC address to the rest of the
392 * network, and register receive buffers ready for incoming packets. */
393static int lguestnet_open(struct net_device *dev)
394{
395 int i;
396 struct lguestnet_info *info = netdev_priv(dev);
397
398 /* Copy our MAC address into the device page, so others on the network
399 * can find us. */
400 memcpy(info->peer[info->me].mac, dev->dev_addr, ETH_ALEN);
401
402 /* We might already be in promisc mode (dev->flags & IFF_PROMISC). Our
403 * set_multicast callback handles this already, so we call it now. */
404 lguestnet_set_multicast(dev);
405
406 /* Allocate packets and put them into our "struct lguest_dma" array.
407 * If we fail to allocate all the packets we could still limp along,
408 * but it's a sign of real stress so we should probably give up now. */
409 for (i = 0; i < ARRAY_SIZE(info->dma); i++) {
410 if (fill_slot(dev, i) != 0)
411 goto cleanup;
412 }
413
414 /* Finally we tell the Host where our array of "struct lguest_dma"
415 * receive buffers is, binding it to the key corresponding to the
416 * device's physical memory plus our peerid. */
417 if (lguest_bind_dma(peer_key(info,info->me), info->dma,
418 NUM_SKBS, lgdev_irq(info->lgdev)) != 0)
419 goto cleanup;
420 return 0;
421
422cleanup:
423 while (--i >= 0)
424 dev_kfree_skb(info->skb[i]);
425 return -ENOMEM;
426}
427/*:*/
428
429/* The close routine is called when the device is no longer in use: we clean up
430 * elegantly. */
431static int lguestnet_close(struct net_device *dev)
432{
433 unsigned int i;
434 struct lguestnet_info *info = netdev_priv(dev);
435
436 /* Clear all trace of our existence out of the device memory by setting
437 * the slot which held our MAC address to 0 (unused). */
438 memset(&info->peer[info->me], 0, sizeof(info->peer[info->me]));
439
440 /* Unregister our array of receive buffers */
441 lguest_unbind_dma(peer_key(info, info->me), info->dma);
442 for (i = 0; i < ARRAY_SIZE(info->dma); i++)
443 dev_kfree_skb(info->skb[i]);
444 return 0;
445}
446
447/*D:510 The network device probe function is basically a standard ethernet
448 * device setup. It reads the "struct lguest_device_desc" and sets the "struct
449 * net_device". Oh, the line-by-line excitement! Let's skip over it. :*/
450static int lguestnet_probe(struct lguest_device *lgdev)
451{
452 int err, irqf = IRQF_SHARED;
453 struct net_device *dev;
454 struct lguestnet_info *info;
455 struct lguest_device_desc *desc = &lguest_devices[lgdev->index];
456
457 pr_debug("lguest_net: probing for device %i\n", lgdev->index);
458
459 dev = alloc_etherdev(sizeof(struct lguestnet_info));
460 if (!dev)
461 return -ENOMEM;
462
463 /* Ethernet defaults with some changes */
464 ether_setup(dev);
465 dev->set_mac_address = NULL;
466
467 dev->dev_addr[0] = 0x02; /* set local assignment bit (IEEE802) */
468 dev->dev_addr[1] = 0x00;
469 memcpy(&dev->dev_addr[2], &lguest_data.guestid, 2);
470 dev->dev_addr[4] = 0x00;
471 dev->dev_addr[5] = 0x00;
472
473 dev->open = lguestnet_open;
474 dev->stop = lguestnet_close;
475 dev->hard_start_xmit = lguestnet_start_xmit;
476
477 /* We don't actually support multicast yet, but turning on/off
478 * promisc also calls dev->set_multicast_list. */
479 dev->set_multicast_list = lguestnet_set_multicast;
480 SET_NETDEV_DEV(dev, &lgdev->dev);
481
482 /* The network code complains if you have "scatter-gather" capability
483 * if you don't also handle checksums (it seem that would be
484 * "illogical"). So we use a lie of omission and don't tell it that we
485 * can handle scattered packets unless we also don't want checksums,
486 * even though to us they're completely independent. */
487 if (desc->features & LGUEST_NET_F_NOCSUM)
488 dev->features = NETIF_F_SG|NETIF_F_NO_CSUM;
489
490 info = netdev_priv(dev);
491 info->mapsize = PAGE_SIZE * desc->num_pages;
492 info->peer_phys = ((unsigned long)desc->pfn << PAGE_SHIFT);
493 info->lgdev = lgdev;
494 info->peer = lguest_map(info->peer_phys, desc->num_pages);
495 if (!info->peer) {
496 err = -ENOMEM;
497 goto free;
498 }
499
500 /* This stores our peerid (upper bits reserved for future). */
501 info->me = (desc->features & (info->mapsize-1));
502
503 err = register_netdev(dev);
504 if (err) {
505 pr_debug("lguestnet: registering device failed\n");
506 goto unmap;
507 }
508
509 if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
510 irqf |= IRQF_SAMPLE_RANDOM;
511 if (request_irq(lgdev_irq(lgdev), lguestnet_rcv, irqf, "lguestnet",
512 dev) != 0) {
513 pr_debug("lguestnet: cannot get irq %i\n", lgdev_irq(lgdev));
514 goto unregister;
515 }
516
517 pr_debug("lguestnet: registered device %s\n", dev->name);
518 /* Finally, we put the "struct net_device" in the generic "struct
519 * lguest_device"s private pointer. Again, it's not necessary, but
520 * makes sure the cool kernel kids don't tease us. */
521 lgdev->private = dev;
522 return 0;
523
524unregister:
525 unregister_netdev(dev);
526unmap:
527 lguest_unmap(info->peer);
528free:
529 free_netdev(dev);
530 return err;
531}
532
533static struct lguest_driver lguestnet_drv = {
534 .name = "lguestnet",
535 .owner = THIS_MODULE,
536 .device_type = LGUEST_DEVICE_T_NET,
537 .probe = lguestnet_probe,
538};
539
540static __init int lguestnet_init(void)
541{
542 return register_lguest_driver(&lguestnet_drv);
543}
544module_init(lguestnet_init);
545
546MODULE_DESCRIPTION("Lguest network driver");
547MODULE_LICENSE("GPL");
548
549/*D:580
550 * This is the last of the Drivers, and with this we have covered the many and
551 * wondrous and fine (and boring) details of the Guest.
552 *
553 * "make Launcher" beckons, where we answer questions like "Where do Guests
554 * come from?", and "What do you do when someone asks for optimization?"
555 */
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
new file mode 100644
index 00000000000..e396c9d2af8
--- /dev/null
+++ b/drivers/net/virtio_net.c
@@ -0,0 +1,435 @@
1/* A simple network driver using virtio.
2 *
3 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19//#define DEBUG
20#include <linux/netdevice.h>
21#include <linux/etherdevice.h>
22#include <linux/module.h>
23#include <linux/virtio.h>
24#include <linux/virtio_net.h>
25#include <linux/scatterlist.h>
26
27/* FIXME: MTU in config. */
28#define MAX_PACKET_LEN (ETH_HLEN+ETH_DATA_LEN)
29
30struct virtnet_info
31{
32 struct virtio_device *vdev;
33 struct virtqueue *rvq, *svq;
34 struct net_device *dev;
35 struct napi_struct napi;
36
37 /* Number of input buffers, and max we've ever had. */
38 unsigned int num, max;
39
40 /* Receive & send queues. */
41 struct sk_buff_head recv;
42 struct sk_buff_head send;
43};
44
45static inline struct virtio_net_hdr *skb_vnet_hdr(struct sk_buff *skb)
46{
47 return (struct virtio_net_hdr *)skb->cb;
48}
49
50static inline void vnet_hdr_to_sg(struct scatterlist *sg, struct sk_buff *skb)
51{
52 sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr));
53}
54
55static bool skb_xmit_done(struct virtqueue *rvq)
56{
57 struct virtnet_info *vi = rvq->vdev->priv;
58
59 /* In case we were waiting for output buffers. */
60 netif_wake_queue(vi->dev);
61 return true;
62}
63
64static void receive_skb(struct net_device *dev, struct sk_buff *skb,
65 unsigned len)
66{
67 struct virtio_net_hdr *hdr = skb_vnet_hdr(skb);
68
69 if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
70 pr_debug("%s: short packet %i\n", dev->name, len);
71 dev->stats.rx_length_errors++;
72 goto drop;
73 }
74 len -= sizeof(struct virtio_net_hdr);
75 BUG_ON(len > MAX_PACKET_LEN);
76
77 skb_trim(skb, len);
78 skb->protocol = eth_type_trans(skb, dev);
79 pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
80 ntohs(skb->protocol), skb->len, skb->pkt_type);
81 dev->stats.rx_bytes += skb->len;
82 dev->stats.rx_packets++;
83
84 if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
85 pr_debug("Needs csum!\n");
86 skb->ip_summed = CHECKSUM_PARTIAL;
87 skb->csum_start = hdr->csum_start;
88 skb->csum_offset = hdr->csum_offset;
89 if (skb->csum_start > skb->len - 2
90 || skb->csum_offset > skb->len - 2) {
91 if (net_ratelimit())
92 printk(KERN_WARNING "%s: csum=%u/%u len=%u\n",
93 dev->name, skb->csum_start,
94 skb->csum_offset, skb->len);
95 goto frame_err;
96 }
97 }
98
99 if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
100 pr_debug("GSO!\n");
101 switch (hdr->gso_type) {
102 case VIRTIO_NET_HDR_GSO_TCPV4:
103 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
104 break;
105 case VIRTIO_NET_HDR_GSO_TCPV4_ECN:
106 skb_shinfo(skb)->gso_type = SKB_GSO_TCP_ECN;
107 break;
108 case VIRTIO_NET_HDR_GSO_UDP:
109 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
110 break;
111 case VIRTIO_NET_HDR_GSO_TCPV6:
112 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
113 break;
114 default:
115 if (net_ratelimit())
116 printk(KERN_WARNING "%s: bad gso type %u.\n",
117 dev->name, hdr->gso_type);
118 goto frame_err;
119 }
120
121 skb_shinfo(skb)->gso_size = hdr->gso_size;
122 if (skb_shinfo(skb)->gso_size == 0) {
123 if (net_ratelimit())
124 printk(KERN_WARNING "%s: zero gso size.\n",
125 dev->name);
126 goto frame_err;
127 }
128
129 /* Header must be checked, and gso_segs computed. */
130 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
131 skb_shinfo(skb)->gso_segs = 0;
132 }
133
134 netif_receive_skb(skb);
135 return;
136
137frame_err:
138 dev->stats.rx_frame_errors++;
139drop:
140 dev_kfree_skb(skb);
141}
142
143static void try_fill_recv(struct virtnet_info *vi)
144{
145 struct sk_buff *skb;
146 struct scatterlist sg[1+MAX_SKB_FRAGS];
147 int num, err;
148
149 for (;;) {
150 skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN);
151 if (unlikely(!skb))
152 break;
153
154 skb_put(skb, MAX_PACKET_LEN);
155 vnet_hdr_to_sg(sg, skb);
156 num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
157 skb_queue_head(&vi->recv, skb);
158
159 err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb);
160 if (err) {
161 skb_unlink(skb, &vi->recv);
162 kfree_skb(skb);
163 break;
164 }
165 vi->num++;
166 }
167 if (unlikely(vi->num > vi->max))
168 vi->max = vi->num;
169 vi->rvq->vq_ops->kick(vi->rvq);
170}
171
172static bool skb_recv_done(struct virtqueue *rvq)
173{
174 struct virtnet_info *vi = rvq->vdev->priv;
175 netif_rx_schedule(vi->dev, &vi->napi);
176 /* Suppress further interrupts. */
177 return false;
178}
179
180static int virtnet_poll(struct napi_struct *napi, int budget)
181{
182 struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
183 struct sk_buff *skb = NULL;
184 unsigned int len, received = 0;
185
186again:
187 while (received < budget &&
188 (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
189 __skb_unlink(skb, &vi->recv);
190 receive_skb(vi->dev, skb, len);
191 vi->num--;
192 received++;
193 }
194
195 /* FIXME: If we oom and completely run out of inbufs, we need
196 * to start a timer trying to fill more. */
197 if (vi->num < vi->max / 2)
198 try_fill_recv(vi);
199
200 /* All done? */
201 if (!skb) {
202 netif_rx_complete(vi->dev, napi);
203 if (unlikely(!vi->rvq->vq_ops->restart(vi->rvq))
204 && netif_rx_reschedule(vi->dev, napi))
205 goto again;
206 }
207
208 return received;
209}
210
211static void free_old_xmit_skbs(struct virtnet_info *vi)
212{
213 struct sk_buff *skb;
214 unsigned int len;
215
216 while ((skb = vi->svq->vq_ops->get_buf(vi->svq, &len)) != NULL) {
217 pr_debug("Sent skb %p\n", skb);
218 __skb_unlink(skb, &vi->send);
219 vi->dev->stats.tx_bytes += len;
220 vi->dev->stats.tx_packets++;
221 kfree_skb(skb);
222 }
223}
224
225static int start_xmit(struct sk_buff *skb, struct net_device *dev)
226{
227 struct virtnet_info *vi = netdev_priv(dev);
228 int num, err;
229 struct scatterlist sg[1+MAX_SKB_FRAGS];
230 struct virtio_net_hdr *hdr;
231 const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
232 DECLARE_MAC_BUF(mac);
233
234 pr_debug("%s: xmit %p %s\n", dev->name, skb, print_mac(mac, dest));
235
236 free_old_xmit_skbs(vi);
237
238 /* Encode metadata header at front. */
239 hdr = skb_vnet_hdr(skb);
240 if (skb->ip_summed == CHECKSUM_PARTIAL) {
241 hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
242 hdr->csum_start = skb->csum_start - skb_headroom(skb);
243 hdr->csum_offset = skb->csum_offset;
244 } else {
245 hdr->flags = 0;
246 hdr->csum_offset = hdr->csum_start = 0;
247 }
248
249 if (skb_is_gso(skb)) {
250 hdr->gso_size = skb_shinfo(skb)->gso_size;
251 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN)
252 hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4_ECN;
253 else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
254 hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
255 else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
256 hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
257 else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
258 hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
259 else
260 BUG();
261 } else {
262 hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
263 hdr->gso_size = 0;
264 }
265
266 vnet_hdr_to_sg(sg, skb);
267 num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
268 __skb_queue_head(&vi->send, skb);
269 err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb);
270 if (err) {
271 pr_debug("%s: virtio not prepared to send\n", dev->name);
272 skb_unlink(skb, &vi->send);
273 netif_stop_queue(dev);
274 return NETDEV_TX_BUSY;
275 }
276 vi->svq->vq_ops->kick(vi->svq);
277
278 return 0;
279}
280
281static int virtnet_open(struct net_device *dev)
282{
283 struct virtnet_info *vi = netdev_priv(dev);
284
285 try_fill_recv(vi);
286
287 /* If we didn't even get one input buffer, we're useless. */
288 if (vi->num == 0)
289 return -ENOMEM;
290
291 napi_enable(&vi->napi);
292 return 0;
293}
294
295static int virtnet_close(struct net_device *dev)
296{
297 struct virtnet_info *vi = netdev_priv(dev);
298 struct sk_buff *skb;
299
300 napi_disable(&vi->napi);
301
302 /* networking core has neutered skb_xmit_done/skb_recv_done, so don't
303 * worry about races vs. get(). */
304 vi->rvq->vq_ops->shutdown(vi->rvq);
305 while ((skb = __skb_dequeue(&vi->recv)) != NULL) {
306 kfree_skb(skb);
307 vi->num--;
308 }
309 vi->svq->vq_ops->shutdown(vi->svq);
310 while ((skb = __skb_dequeue(&vi->send)) != NULL)
311 kfree_skb(skb);
312
313 BUG_ON(vi->num != 0);
314 return 0;
315}
316
317static int virtnet_probe(struct virtio_device *vdev)
318{
319 int err;
320 unsigned int len;
321 struct net_device *dev;
322 struct virtnet_info *vi;
323 void *token;
324
325 /* Allocate ourselves a network device with room for our info */
326 dev = alloc_etherdev(sizeof(struct virtnet_info));
327 if (!dev)
328 return -ENOMEM;
329
330 /* Set up network device as normal. */
331 ether_setup(dev);
332 dev->open = virtnet_open;
333 dev->stop = virtnet_close;
334 dev->hard_start_xmit = start_xmit;
335 dev->features = NETIF_F_HIGHDMA;
336 SET_NETDEV_DEV(dev, &vdev->dev);
337
338 /* Do we support "hardware" checksums? */
339 token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_F, &len);
340 if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_NO_CSUM)) {
341 /* This opens up the world of extra features. */
342 dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
343 if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4))
344 dev->features |= NETIF_F_TSO;
345 if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_UFO))
346 dev->features |= NETIF_F_UFO;
347 if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4_ECN))
348 dev->features |= NETIF_F_TSO_ECN;
349 if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO6))
350 dev->features |= NETIF_F_TSO6;
351 }
352
353 /* Configuration may specify what MAC to use. Otherwise random. */
354 token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_MAC_F, &len);
355 if (token) {
356 dev->addr_len = len;
357 vdev->config->get(vdev, token, dev->dev_addr, len);
358 } else
359 random_ether_addr(dev->dev_addr);
360
361 /* Set up our device-specific information */
362 vi = netdev_priv(dev);
363 netif_napi_add(dev, &vi->napi, virtnet_poll, 16);
364 vi->dev = dev;
365 vi->vdev = vdev;
366
367 /* We expect two virtqueues, receive then send. */
368 vi->rvq = vdev->config->find_vq(vdev, skb_recv_done);
369 if (IS_ERR(vi->rvq)) {
370 err = PTR_ERR(vi->rvq);
371 goto free;
372 }
373
374 vi->svq = vdev->config->find_vq(vdev, skb_xmit_done);
375 if (IS_ERR(vi->svq)) {
376 err = PTR_ERR(vi->svq);
377 goto free_recv;
378 }
379
380 /* Initialize our empty receive and send queues. */
381 skb_queue_head_init(&vi->recv);
382 skb_queue_head_init(&vi->send);
383
384 err = register_netdev(dev);
385 if (err) {
386 pr_debug("virtio_net: registering device failed\n");
387 goto free_send;
388 }
389 pr_debug("virtnet: registered device %s\n", dev->name);
390 vdev->priv = vi;
391 return 0;
392
393free_send:
394 vdev->config->del_vq(vi->svq);
395free_recv:
396 vdev->config->del_vq(vi->rvq);
397free:
398 free_netdev(dev);
399 return err;
400}
401
402static void virtnet_remove(struct virtio_device *vdev)
403{
404 unregister_netdev(vdev->priv);
405 free_netdev(vdev->priv);
406}
407
408static struct virtio_device_id id_table[] = {
409 { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
410 { 0 },
411};
412
413static struct virtio_driver virtio_net = {
414 .driver.name = KBUILD_MODNAME,
415 .driver.owner = THIS_MODULE,
416 .id_table = id_table,
417 .probe = virtnet_probe,
418 .remove = __devexit_p(virtnet_remove),
419};
420
421static int __init init(void)
422{
423 return register_virtio_driver(&virtio_net);
424}
425
426static void __exit fini(void)
427{
428 unregister_virtio_driver(&virtio_net);
429}
430module_init(init);
431module_exit(fini);
432
433MODULE_DEVICE_TABLE(virtio, id_table);
434MODULE_DESCRIPTION("Virtio network driver");
435MODULE_LICENSE("GPL");
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
new file mode 100644
index 00000000000..9e33fc4da87
--- /dev/null
+++ b/drivers/virtio/Kconfig
@@ -0,0 +1,8 @@
1# Virtio always gets selected by whoever wants it.
2config VIRTIO
3 bool
4
5# Similarly the virtio ring implementation.
6config VIRTIO_RING
7 bool
8 depends on VIRTIO
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
new file mode 100644
index 00000000000..f70e40971dd
--- /dev/null
+++ b/drivers/virtio/Makefile
@@ -0,0 +1,2 @@
1obj-$(CONFIG_VIRTIO) += virtio.o
2obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
diff --git a/drivers/virtio/config.c b/drivers/virtio/config.c
new file mode 100644
index 00000000000..983d482fba4
--- /dev/null
+++ b/drivers/virtio/config.c
@@ -0,0 +1,13 @@
1/* Configuration space parsing helpers for virtio.
2 *
3 * The configuration is [type][len][... len bytes ...] fields.
4 *
5 * Copyright 2007 Rusty Russell, IBM Corporation.
6 * GPL v2 or later.
7 */
8#include <linux/err.h>
9#include <linux/virtio.h>
10#include <linux/virtio_config.h>
11#include <linux/bug.h>
12#include <asm/system.h>
13
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
new file mode 100644
index 00000000000..15d7787dea8
--- /dev/null
+++ b/drivers/virtio/virtio.c
@@ -0,0 +1,189 @@
1#include <linux/virtio.h>
2#include <linux/spinlock.h>
3#include <linux/virtio_config.h>
4
5static ssize_t device_show(struct device *_d,
6 struct device_attribute *attr, char *buf)
7{
8 struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
9 return sprintf(buf, "%hu", dev->id.device);
10}
11static ssize_t vendor_show(struct device *_d,
12 struct device_attribute *attr, char *buf)
13{
14 struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
15 return sprintf(buf, "%hu", dev->id.vendor);
16}
17static ssize_t status_show(struct device *_d,
18 struct device_attribute *attr, char *buf)
19{
20 struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
21 return sprintf(buf, "0x%08x", dev->config->get_status(dev));
22}
23static ssize_t modalias_show(struct device *_d,
24 struct device_attribute *attr, char *buf)
25{
26 struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
27
28 return sprintf(buf, "virtio:d%08Xv%08X\n",
29 dev->id.device, dev->id.vendor);
30}
31static struct device_attribute virtio_dev_attrs[] = {
32 __ATTR_RO(device),
33 __ATTR_RO(vendor),
34 __ATTR_RO(status),
35 __ATTR_RO(modalias),
36 __ATTR_NULL
37};
38
39static inline int virtio_id_match(const struct virtio_device *dev,
40 const struct virtio_device_id *id)
41{
42 if (id->device != dev->id.device)
43 return 0;
44
45 return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor != dev->id.vendor;
46}
47
48/* This looks through all the IDs a driver claims to support. If any of them
49 * match, we return 1 and the kernel will call virtio_dev_probe(). */
50static int virtio_dev_match(struct device *_dv, struct device_driver *_dr)
51{
52 unsigned int i;
53 struct virtio_device *dev = container_of(_dv,struct virtio_device,dev);
54 const struct virtio_device_id *ids;
55
56 ids = container_of(_dr, struct virtio_driver, driver)->id_table;
57 for (i = 0; ids[i].device; i++)
58 if (virtio_id_match(dev, &ids[i]))
59 return 1;
60 return 0;
61}
62
63static int virtio_uevent(struct device *_dv, struct kobj_uevent_env *env)
64{
65 struct virtio_device *dev = container_of(_dv,struct virtio_device,dev);
66
67 return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X",
68 dev->id.device, dev->id.vendor);
69}
70
71static struct bus_type virtio_bus = {
72 .name = "virtio",
73 .match = virtio_dev_match,
74 .dev_attrs = virtio_dev_attrs,
75 .uevent = virtio_uevent,
76};
77
78static void add_status(struct virtio_device *dev, unsigned status)
79{
80 dev->config->set_status(dev, dev->config->get_status(dev) | status);
81}
82
83static int virtio_dev_probe(struct device *_d)
84{
85 int err;
86 struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
87 struct virtio_driver *drv = container_of(dev->dev.driver,
88 struct virtio_driver, driver);
89
90 add_status(dev, VIRTIO_CONFIG_S_DRIVER);
91 err = drv->probe(dev);
92 if (err)
93 add_status(dev, VIRTIO_CONFIG_S_FAILED);
94 else
95 add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
96 return err;
97}
98
99int register_virtio_driver(struct virtio_driver *driver)
100{
101 driver->driver.bus = &virtio_bus;
102 driver->driver.probe = virtio_dev_probe;
103 return driver_register(&driver->driver);
104}
105EXPORT_SYMBOL_GPL(register_virtio_driver);
106
107void unregister_virtio_driver(struct virtio_driver *driver)
108{
109 driver_unregister(&driver->driver);
110}
111EXPORT_SYMBOL_GPL(unregister_virtio_driver);
112
113int register_virtio_device(struct virtio_device *dev)
114{
115 int err;
116
117 dev->dev.bus = &virtio_bus;
118 sprintf(dev->dev.bus_id, "%u", dev->index);
119
120 /* Acknowledge that we've seen the device. */
121 add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
122
123 /* device_register() causes the bus infrastructure to look for a
124 * matching driver. */
125 err = device_register(&dev->dev);
126 if (err)
127 add_status(dev, VIRTIO_CONFIG_S_FAILED);
128 return err;
129}
130EXPORT_SYMBOL_GPL(register_virtio_device);
131
132void unregister_virtio_device(struct virtio_device *dev)
133{
134 device_unregister(&dev->dev);
135}
136EXPORT_SYMBOL_GPL(unregister_virtio_device);
137
138int __virtio_config_val(struct virtio_device *vdev,
139 u8 type, void *val, size_t size)
140{
141 void *token;
142 unsigned int len;
143
144 token = vdev->config->find(vdev, type, &len);
145 if (!token)
146 return -ENOENT;
147
148 if (len != size)
149 return -EIO;
150
151 vdev->config->get(vdev, token, val, size);
152 return 0;
153}
154EXPORT_SYMBOL_GPL(__virtio_config_val);
155
156int virtio_use_bit(struct virtio_device *vdev,
157 void *token, unsigned int len, unsigned int bitnum)
158{
159 unsigned long bits[16];
160
161 /* This makes it convenient to pass-through find() results. */
162 if (!token)
163 return 0;
164
165 /* bit not in range of this bitfield? */
166 if (bitnum * 8 >= len / 2)
167 return 0;
168
169 /* Giant feature bitfields are silly. */
170 BUG_ON(len > sizeof(bits));
171 vdev->config->get(vdev, token, bits, len);
172
173 if (!test_bit(bitnum, bits))
174 return 0;
175
176 /* Set acknowledge bit, and write it back. */
177 set_bit(bitnum + len * 8 / 2, bits);
178 vdev->config->set(vdev, token, bits, len);
179 return 1;
180}
181EXPORT_SYMBOL_GPL(virtio_use_bit);
182
183static int virtio_init(void)
184{
185 if (bus_register(&virtio_bus) != 0)
186 panic("virtio bus registration failed");
187 return 0;
188}
189core_initcall(virtio_init);
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
new file mode 100644
index 00000000000..0e4baca21b8
--- /dev/null
+++ b/drivers/virtio/virtio_ring.c
@@ -0,0 +1,313 @@
1/* Virtio ring implementation.
2 *
3 * Copyright 2007 Rusty Russell IBM Corporation
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include <linux/virtio.h>
20#include <linux/virtio_ring.h>
21#include <linux/device.h>
22
/* Ring-consistency helpers.
 *
 * The macro parameter is deliberately named _vq, not vq: the expansions
 * dereference a member called "vq", and a parameter of the same name would
 * be substituted into the member access too, so the macros would only work
 * when invoked with a variable literally named "vq". */
#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(_vq, fmt...)					\
	do { dev_err(&(_vq)->vq.vdev->dev, fmt); BUG(); } while(0)
/* Catch unlocked concurrent use: in_use records the line that entered. */
#define START_USE(_vq)						\
	do {							\
		if ((_vq)->in_use)				\
			panic("in_use = %i\n", (_vq)->in_use);	\
		(_vq)->in_use = __LINE__;			\
		mb();						\
	} while(0)
#define END_USE(_vq)						\
	do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; mb(); } while(0)
#else
/* In production, log the problem and mark the queue broken instead. */
#define BAD_RING(_vq, fmt...)					\
	do { dev_err(&(_vq)->vq.vdev->dev, fmt); (_vq)->broken = true; } while(0)
#define START_USE(_vq)
#define END_USE(_vq)
#endif
37
38struct vring_virtqueue
39{
40 struct virtqueue vq;
41
42 /* Actual memory layout for this queue */
43 struct vring vring;
44
45 /* Other side has made a mess, don't try any more. */
46 bool broken;
47
48 /* Number of free buffers */
49 unsigned int num_free;
50 /* Head of free buffer list. */
51 unsigned int free_head;
52 /* Number we've added since last sync. */
53 unsigned int num_added;
54
55 /* Last used index we've seen. */
56 unsigned int last_used_idx;
57
58 /* How to notify other side. FIXME: commonalize hcalls! */
59 void (*notify)(struct virtqueue *vq);
60
61#ifdef DEBUG
62 /* They're supposed to lock for us. */
63 unsigned int in_use;
64#endif
65
66 /* Tokens for callbacks. */
67 void *data[];
68};
69
70#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
71
72static int vring_add_buf(struct virtqueue *_vq,
73 struct scatterlist sg[],
74 unsigned int out,
75 unsigned int in,
76 void *data)
77{
78 struct vring_virtqueue *vq = to_vvq(_vq);
79 unsigned int i, avail, head, uninitialized_var(prev);
80
81 BUG_ON(data == NULL);
82 BUG_ON(out + in > vq->vring.num);
83 BUG_ON(out + in == 0);
84
85 START_USE(vq);
86
87 if (vq->num_free < out + in) {
88 pr_debug("Can't add buf len %i - avail = %i\n",
89 out + in, vq->num_free);
90 END_USE(vq);
91 return -ENOSPC;
92 }
93
94 /* We're about to use some buffers from the free list. */
95 vq->num_free -= out + in;
96
97 head = vq->free_head;
98 for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) {
99 vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
100 vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT)
101 + sg->offset;
102 vq->vring.desc[i].len = sg->length;
103 prev = i;
104 sg++;
105 }
106 for (; in; i = vq->vring.desc[i].next, in--) {
107 vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
108 vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT)
109 + sg->offset;
110 vq->vring.desc[i].len = sg->length;
111 prev = i;
112 sg++;
113 }
114 /* Last one doesn't continue. */
115 vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
116
117 /* Update free pointer */
118 vq->free_head = i;
119
120 /* Set token. */
121 vq->data[head] = data;
122
123 /* Put entry in available array (but don't update avail->idx until they
124 * do sync). FIXME: avoid modulus here? */
125 avail = (vq->vring.avail->idx + vq->num_added++) % vq->vring.num;
126 vq->vring.avail->ring[avail] = head;
127
128 pr_debug("Added buffer head %i to %p\n", head, vq);
129 END_USE(vq);
130 return 0;
131}
132
133static void vring_kick(struct virtqueue *_vq)
134{
135 struct vring_virtqueue *vq = to_vvq(_vq);
136 START_USE(vq);
137 /* Descriptors and available array need to be set before we expose the
138 * new available array entries. */
139 wmb();
140
141 vq->vring.avail->idx += vq->num_added;
142 vq->num_added = 0;
143
144 /* Need to update avail index before checking if we should notify */
145 mb();
146
147 if (!(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY))
148 /* Prod other side to tell it about changes. */
149 vq->notify(&vq->vq);
150
151 END_USE(vq);
152}
153
154static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
155{
156 unsigned int i;
157
158 /* Clear data ptr. */
159 vq->data[head] = NULL;
160
161 /* Put back on free list: find end */
162 i = head;
163 while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
164 i = vq->vring.desc[i].next;
165 vq->num_free++;
166 }
167
168 vq->vring.desc[i].next = vq->free_head;
169 vq->free_head = head;
170 /* Plus final descriptor */
171 vq->num_free++;
172}
173
174/* FIXME: We need to tell other side about removal, to synchronize. */
175static void vring_shutdown(struct virtqueue *_vq)
176{
177 struct vring_virtqueue *vq = to_vvq(_vq);
178 unsigned int i;
179
180 for (i = 0; i < vq->vring.num; i++)
181 detach_buf(vq, i);
182}
183
184static inline bool more_used(const struct vring_virtqueue *vq)
185{
186 return vq->last_used_idx != vq->vring.used->idx;
187}
188
189static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len)
190{
191 struct vring_virtqueue *vq = to_vvq(_vq);
192 void *ret;
193 unsigned int i;
194
195 START_USE(vq);
196
197 if (!more_used(vq)) {
198 pr_debug("No more buffers in queue\n");
199 END_USE(vq);
200 return NULL;
201 }
202
203 i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
204 *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;
205
206 if (unlikely(i >= vq->vring.num)) {
207 BAD_RING(vq, "id %u out of range\n", i);
208 return NULL;
209 }
210 if (unlikely(!vq->data[i])) {
211 BAD_RING(vq, "id %u is not a head!\n", i);
212 return NULL;
213 }
214
215 /* detach_buf clears data, so grab it now. */
216 ret = vq->data[i];
217 detach_buf(vq, i);
218 vq->last_used_idx++;
219 END_USE(vq);
220 return ret;
221}
222
223static bool vring_restart(struct virtqueue *_vq)
224{
225 struct vring_virtqueue *vq = to_vvq(_vq);
226
227 START_USE(vq);
228 BUG_ON(!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT));
229
230 /* We optimistically turn back on interrupts, then check if there was
231 * more to do. */
232 vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
233 mb();
234 if (unlikely(more_used(vq))) {
235 vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
236 END_USE(vq);
237 return false;
238 }
239
240 END_USE(vq);
241 return true;
242}
243
244irqreturn_t vring_interrupt(int irq, void *_vq)
245{
246 struct vring_virtqueue *vq = to_vvq(_vq);
247
248 if (!more_used(vq)) {
249 pr_debug("virtqueue interrupt with no work for %p\n", vq);
250 return IRQ_NONE;
251 }
252
253 if (unlikely(vq->broken))
254 return IRQ_HANDLED;
255
256 pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
257 if (vq->vq.callback && !vq->vq.callback(&vq->vq))
258 vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
259
260 return IRQ_HANDLED;
261}
262
263static struct virtqueue_ops vring_vq_ops = {
264 .add_buf = vring_add_buf,
265 .get_buf = vring_get_buf,
266 .kick = vring_kick,
267 .restart = vring_restart,
268 .shutdown = vring_shutdown,
269};
270
271struct virtqueue *vring_new_virtqueue(unsigned int num,
272 struct virtio_device *vdev,
273 void *pages,
274 void (*notify)(struct virtqueue *),
275 bool (*callback)(struct virtqueue *))
276{
277 struct vring_virtqueue *vq;
278 unsigned int i;
279
280 vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
281 if (!vq)
282 return NULL;
283
284 vring_init(&vq->vring, num, pages);
285 vq->vq.callback = callback;
286 vq->vq.vdev = vdev;
287 vq->vq.vq_ops = &vring_vq_ops;
288 vq->notify = notify;
289 vq->broken = false;
290 vq->last_used_idx = 0;
291 vq->num_added = 0;
292#ifdef DEBUG
293 vq->in_use = false;
294#endif
295
296 /* No callback? Tell other side not to bother us. */
297 if (!callback)
298 vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
299
300 /* Put everything in free lists. */
301 vq->num_free = num;
302 vq->free_head = 0;
303 for (i = 0; i < num-1; i++)
304 vq->vring.desc[i].next = i+1;
305
306 return &vq->vq;
307}
308
/* Free a virtqueue created by vring_new_virtqueue(). */
void vring_del_virtqueue(struct virtqueue *vq)
{
	struct vring_virtqueue *vvq = to_vvq(vq);

	kfree(vvq);
}
313
diff --git a/include/asm-x86/Kbuild b/include/asm-x86/Kbuild
index 559830ece75..5e3539c129b 100644
--- a/include/asm-x86/Kbuild
+++ b/include/asm-x86/Kbuild
@@ -1,6 +1,7 @@
1include include/asm-generic/Kbuild.asm 1include include/asm-generic/Kbuild.asm
2 2
3header-y += boot.h 3header-y += boot.h
4header-y += bootparam.h
4header-y += debugreg.h 5header-y += debugreg.h
5header-y += ldt.h 6header-y += ldt.h
6header-y += msr-index.h 7header-y += msr-index.h
@@ -14,8 +15,10 @@ unifdef-y += a.out_32.h
14unifdef-y += a.out_64.h 15unifdef-y += a.out_64.h
15unifdef-y += byteorder_32.h 16unifdef-y += byteorder_32.h
16unifdef-y += byteorder_64.h 17unifdef-y += byteorder_64.h
18unifdef-y += e820.h
17unifdef-y += elf_32.h 19unifdef-y += elf_32.h
18unifdef-y += elf_64.h 20unifdef-y += elf_64.h
21unifdef-y += ist.h
19unifdef-y += mce.h 22unifdef-y += mce.h
20unifdef-y += msgbuf_32.h 23unifdef-y += msgbuf_32.h
21unifdef-y += msgbuf_64.h 24unifdef-y += msgbuf_64.h
diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h
index dc031cf4463..19f3ddf2df4 100644
--- a/include/asm-x86/bootparam.h
+++ b/include/asm-x86/bootparam.h
@@ -10,85 +10,85 @@
10#include <video/edid.h> 10#include <video/edid.h>
11 11
12struct setup_header { 12struct setup_header {
13 u8 setup_sects; 13 __u8 setup_sects;
14 u16 root_flags; 14 __u16 root_flags;
15 u32 syssize; 15 __u32 syssize;
16 u16 ram_size; 16 __u16 ram_size;
17#define RAMDISK_IMAGE_START_MASK 0x07FF 17#define RAMDISK_IMAGE_START_MASK 0x07FF
18#define RAMDISK_PROMPT_FLAG 0x8000 18#define RAMDISK_PROMPT_FLAG 0x8000
19#define RAMDISK_LOAD_FLAG 0x4000 19#define RAMDISK_LOAD_FLAG 0x4000
20 u16 vid_mode; 20 __u16 vid_mode;
21 u16 root_dev; 21 __u16 root_dev;
22 u16 boot_flag; 22 __u16 boot_flag;
23 u16 jump; 23 __u16 jump;
24 u32 header; 24 __u32 header;
25 u16 version; 25 __u16 version;
26 u32 realmode_swtch; 26 __u32 realmode_swtch;
27 u16 start_sys; 27 __u16 start_sys;
28 u16 kernel_version; 28 __u16 kernel_version;
29 u8 type_of_loader; 29 __u8 type_of_loader;
30 u8 loadflags; 30 __u8 loadflags;
31#define LOADED_HIGH (1<<0) 31#define LOADED_HIGH (1<<0)
32#define KEEP_SEGMENTS (1<<6) 32#define KEEP_SEGMENTS (1<<6)
33#define CAN_USE_HEAP (1<<7) 33#define CAN_USE_HEAP (1<<7)
34 u16 setup_move_size; 34 __u16 setup_move_size;
35 u32 code32_start; 35 __u32 code32_start;
36 u32 ramdisk_image; 36 __u32 ramdisk_image;
37 u32 ramdisk_size; 37 __u32 ramdisk_size;
38 u32 bootsect_kludge; 38 __u32 bootsect_kludge;
39 u16 heap_end_ptr; 39 __u16 heap_end_ptr;
40 u16 _pad1; 40 __u16 _pad1;
41 u32 cmd_line_ptr; 41 __u32 cmd_line_ptr;
42 u32 initrd_addr_max; 42 __u32 initrd_addr_max;
43 u32 kernel_alignment; 43 __u32 kernel_alignment;
44 u8 relocatable_kernel; 44 __u8 relocatable_kernel;
45 u8 _pad2[3]; 45 __u8 _pad2[3];
46 u32 cmdline_size; 46 __u32 cmdline_size;
47 u32 hardware_subarch; 47 __u32 hardware_subarch;
48 u64 hardware_subarch_data; 48 __u64 hardware_subarch_data;
49} __attribute__((packed)); 49} __attribute__((packed));
50 50
51struct sys_desc_table { 51struct sys_desc_table {
52 u16 length; 52 __u16 length;
53 u8 table[14]; 53 __u8 table[14];
54}; 54};
55 55
56struct efi_info { 56struct efi_info {
57 u32 _pad1; 57 __u32 _pad1;
58 u32 efi_systab; 58 __u32 efi_systab;
59 u32 efi_memdesc_size; 59 __u32 efi_memdesc_size;
60 u32 efi_memdesc_version; 60 __u32 efi_memdesc_version;
61 u32 efi_memmap; 61 __u32 efi_memmap;
62 u32 efi_memmap_size; 62 __u32 efi_memmap_size;
63 u32 _pad2[2]; 63 __u32 _pad2[2];
64}; 64};
65 65
66/* The so-called "zeropage" */ 66/* The so-called "zeropage" */
67struct boot_params { 67struct boot_params {
68 struct screen_info screen_info; /* 0x000 */ 68 struct screen_info screen_info; /* 0x000 */
69 struct apm_bios_info apm_bios_info; /* 0x040 */ 69 struct apm_bios_info apm_bios_info; /* 0x040 */
70 u8 _pad2[12]; /* 0x054 */ 70 __u8 _pad2[12]; /* 0x054 */
71 struct ist_info ist_info; /* 0x060 */ 71 struct ist_info ist_info; /* 0x060 */
72 u8 _pad3[16]; /* 0x070 */ 72 __u8 _pad3[16]; /* 0x070 */
73 u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ 73 __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
74 u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ 74 __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
75 struct sys_desc_table sys_desc_table; /* 0x0a0 */ 75 struct sys_desc_table sys_desc_table; /* 0x0a0 */
76 u8 _pad4[144]; /* 0x0b0 */ 76 __u8 _pad4[144]; /* 0x0b0 */
77 struct edid_info edid_info; /* 0x140 */ 77 struct edid_info edid_info; /* 0x140 */
78 struct efi_info efi_info; /* 0x1c0 */ 78 struct efi_info efi_info; /* 0x1c0 */
79 u32 alt_mem_k; /* 0x1e0 */ 79 __u32 alt_mem_k; /* 0x1e0 */
80 u32 scratch; /* Scratch field! */ /* 0x1e4 */ 80 __u32 scratch; /* Scratch field! */ /* 0x1e4 */
81 u8 e820_entries; /* 0x1e8 */ 81 __u8 e820_entries; /* 0x1e8 */
82 u8 eddbuf_entries; /* 0x1e9 */ 82 __u8 eddbuf_entries; /* 0x1e9 */
83 u8 edd_mbr_sig_buf_entries; /* 0x1ea */ 83 __u8 edd_mbr_sig_buf_entries; /* 0x1ea */
84 u8 _pad6[6]; /* 0x1eb */ 84 __u8 _pad6[6]; /* 0x1eb */
85 struct setup_header hdr; /* setup header */ /* 0x1f1 */ 85 struct setup_header hdr; /* setup header */ /* 0x1f1 */
86 u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)]; 86 __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)];
87 u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */ 87 __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */
88 struct e820entry e820_map[E820MAX]; /* 0x2d0 */ 88 struct e820entry e820_map[E820MAX]; /* 0x2d0 */
89 u8 _pad8[48]; /* 0xcd0 */ 89 __u8 _pad8[48]; /* 0xcd0 */
90 struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */ 90 struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */
91 u8 _pad9[276]; /* 0xeec */ 91 __u8 _pad9[276]; /* 0xeec */
92} __attribute__((packed)); 92} __attribute__((packed));
93 93
94#endif /* _ASM_BOOTPARAM_H */ 94#endif /* _ASM_BOOTPARAM_H */
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
index 5d4d2183e5d..3e214f39fad 100644
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -1,5 +1,33 @@
1#ifndef __ASM_E820_H
2#define __ASM_E820_H
3#define E820MAP 0x2d0 /* our map */
4#define E820MAX 128 /* number of entries in E820MAP */
5#define E820NR 0x1e8 /* # entries in E820MAP */
6
7#define E820_RAM 1
8#define E820_RESERVED 2
9#define E820_ACPI 3
10#define E820_NVS 4
11
12#ifndef __ASSEMBLY__
13struct e820entry {
14 __u64 addr; /* start of memory segment */
15 __u64 size; /* size of memory segment */
16 __u32 type; /* type of memory segment */
17} __attribute__((packed));
18
19struct e820map {
20 __u32 nr_map;
21 struct e820entry map[E820MAX];
22};
23#endif /* __ASSEMBLY__ */
24
25#ifdef __KERNEL__
1#ifdef CONFIG_X86_32 26#ifdef CONFIG_X86_32
2# include "e820_32.h" 27# include "e820_32.h"
3#else 28#else
4# include "e820_64.h" 29# include "e820_64.h"
5#endif 30#endif
31#endif /* __KERNEL__ */
32
33#endif /* __ASM_E820_H */
diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h
index cf67dbb1db7..03f60c690c8 100644
--- a/include/asm-x86/e820_32.h
+++ b/include/asm-x86/e820_32.h
@@ -12,30 +12,10 @@
12#ifndef __E820_HEADER 12#ifndef __E820_HEADER
13#define __E820_HEADER 13#define __E820_HEADER
14 14
15#define E820MAP 0x2d0 /* our map */
16#define E820MAX 128 /* number of entries in E820MAP */
17#define E820NR 0x1e8 /* # entries in E820MAP */
18
19#define E820_RAM 1
20#define E820_RESERVED 2
21#define E820_ACPI 3
22#define E820_NVS 4
23
24#define HIGH_MEMORY (1024*1024) 15#define HIGH_MEMORY (1024*1024)
25 16
26#ifndef __ASSEMBLY__ 17#ifndef __ASSEMBLY__
27 18
28struct e820entry {
29 u64 addr; /* start of memory segment */
30 u64 size; /* size of memory segment */
31 u32 type; /* type of memory segment */
32} __attribute__((packed));
33
34struct e820map {
35 u32 nr_map;
36 struct e820entry map[E820MAX];
37};
38
39extern struct e820map e820; 19extern struct e820map e820;
40 20
41extern int e820_all_mapped(unsigned long start, unsigned long end, 21extern int e820_all_mapped(unsigned long start, unsigned long end,
@@ -56,5 +36,4 @@ static inline void e820_mark_nosave_regions(void)
56#endif 36#endif
57 37
58#endif/*!__ASSEMBLY__*/ 38#endif/*!__ASSEMBLY__*/
59
60#endif/*__E820_HEADER*/ 39#endif/*__E820_HEADER*/
diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h
index 3486e701bd8..0bd4787a5d5 100644
--- a/include/asm-x86/e820_64.h
+++ b/include/asm-x86/e820_64.h
@@ -11,27 +11,7 @@
11#ifndef __E820_HEADER 11#ifndef __E820_HEADER
12#define __E820_HEADER 12#define __E820_HEADER
13 13
14#define E820MAP 0x2d0 /* our map */
15#define E820MAX 128 /* number of entries in E820MAP */
16#define E820NR 0x1e8 /* # entries in E820MAP */
17
18#define E820_RAM 1
19#define E820_RESERVED 2
20#define E820_ACPI 3
21#define E820_NVS 4
22
23#ifndef __ASSEMBLY__ 14#ifndef __ASSEMBLY__
24struct e820entry {
25 u64 addr; /* start of memory segment */
26 u64 size; /* size of memory segment */
27 u32 type; /* type of memory segment */
28} __attribute__((packed));
29
30struct e820map {
31 u32 nr_map;
32 struct e820entry map[E820MAX];
33};
34
35extern unsigned long find_e820_area(unsigned long start, unsigned long end, 15extern unsigned long find_e820_area(unsigned long start, unsigned long end,
36 unsigned size); 16 unsigned size);
37extern void add_memory_region(unsigned long start, unsigned long size, 17extern void add_memory_region(unsigned long start, unsigned long size,
diff --git a/include/asm-x86/ist.h b/include/asm-x86/ist.h
index ef2003ebc6f..6ec6ceed95a 100644
--- a/include/asm-x86/ist.h
+++ b/include/asm-x86/ist.h
@@ -17,17 +17,17 @@
17 */ 17 */
18 18
19 19
20#ifdef __KERNEL__
21
22#include <linux/types.h> 20#include <linux/types.h>
23 21
24struct ist_info { 22struct ist_info {
25 u32 signature; 23 __u32 signature;
26 u32 command; 24 __u32 command;
27 u32 event; 25 __u32 event;
28 u32 perf_level; 26 __u32 perf_level;
29}; 27};
30 28
29#ifdef __KERNEL__
30
31extern struct ist_info ist_info; 31extern struct ist_info ist_info;
32 32
33#endif /* __KERNEL__ */ 33#endif /* __KERNEL__ */
diff --git a/include/asm-x86/lguest.h b/include/asm-x86/lguest.h
new file mode 100644
index 00000000000..ccd33846081
--- /dev/null
+++ b/include/asm-x86/lguest.h
@@ -0,0 +1,86 @@
1#ifndef _X86_LGUEST_H
2#define _X86_LGUEST_H
3
4#define GDT_ENTRY_LGUEST_CS 10
5#define GDT_ENTRY_LGUEST_DS 11
6#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
7#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
8
9#ifndef __ASSEMBLY__
10#include <asm/desc.h>
11
12#define GUEST_PL 1
13
14/* Every guest maps the core switcher code. */
15#define SHARED_SWITCHER_PAGES \
16 DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
17/* Pages for switcher itself, then two pages per cpu */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
19
20/* We map at -4M for ease of mapping into the guest (one PTE page). */
21#define SWITCHER_ADDR 0xFFC00000
22
23/* Found in switcher_32.S */
24extern unsigned long default_idt_entries[];
25
26struct lguest_regs
27{
28 /* Manually saved part. */
29 unsigned long eax, ebx, ecx, edx;
30 unsigned long esi, edi, ebp;
31 unsigned long gs;
32 unsigned long fs, ds, es;
33 unsigned long trapnum, errcode;
34 /* Trap pushed part */
35 unsigned long eip;
36 unsigned long cs;
37 unsigned long eflags;
38 unsigned long esp;
39 unsigned long ss;
40};
41
42/* This is a guest-specific page (mapped ro) into the guest. */
43struct lguest_ro_state
44{
45 /* Host information we need to restore when we switch back. */
46 u32 host_cr3;
47 struct Xgt_desc_struct host_idt_desc;
48 struct Xgt_desc_struct host_gdt_desc;
49 u32 host_sp;
50
51 /* Fields which are used when guest is running. */
52 struct Xgt_desc_struct guest_idt_desc;
53 struct Xgt_desc_struct guest_gdt_desc;
54 struct i386_hw_tss guest_tss;
55 struct desc_struct guest_idt[IDT_ENTRIES];
56 struct desc_struct guest_gdt[GDT_ENTRIES];
57};
58
59struct lguest_arch
60{
61 /* The GDT entries copied into lguest_ro_state when running. */
62 struct desc_struct gdt[GDT_ENTRIES];
63
64 /* The IDT entries: some copied into lguest_ro_state when running. */
65 struct desc_struct idt[IDT_ENTRIES];
66
67 /* The address of the last guest-visible pagefault (ie. cr2). */
68 unsigned long last_pagefault;
69};
70
71static inline void lguest_set_ts(void)
72{
73 u32 cr0;
74
75 cr0 = read_cr0();
76 if (!(cr0 & 8))
77 write_cr0(cr0|8);
78}
79
80/* Full 4G segment descriptors, suitable for CS and DS. */
81#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
82#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})
83
84#endif /* __ASSEMBLY__ */
85
86#endif
diff --git a/include/asm-x86/lguest_hcall.h b/include/asm-x86/lguest_hcall.h
new file mode 100644
index 00000000000..f948491eb56
--- /dev/null
+++ b/include/asm-x86/lguest_hcall.h
@@ -0,0 +1,71 @@
1/* Architecture specific portion of the lguest hypercalls */
2#ifndef _X86_LGUEST_HCALL_H
3#define _X86_LGUEST_HCALL_H
4
5#define LHCALL_FLUSH_ASYNC 0
6#define LHCALL_LGUEST_INIT 1
7#define LHCALL_CRASH 2
8#define LHCALL_LOAD_GDT 3
9#define LHCALL_NEW_PGTABLE 4
10#define LHCALL_FLUSH_TLB 5
11#define LHCALL_LOAD_IDT_ENTRY 6
12#define LHCALL_SET_STACK 7
13#define LHCALL_TS 8
14#define LHCALL_SET_CLOCKEVENT 9
15#define LHCALL_HALT 10
16#define LHCALL_SET_PTE 14
17#define LHCALL_SET_PMD 15
18#define LHCALL_LOAD_TLS 16
19#define LHCALL_NOTIFY 17
20
21/*G:031 First, how does our Guest contact the Host to ask for privileged
22 * operations? There are two ways: the direct way is to make a "hypercall",
23 * to make requests of the Host Itself.
24 *
25 * Our hypercall mechanism uses the highest unused trap code (traps 32 and
26 * above are used by real hardware interrupts). Seventeen hypercalls are
27 * available: the hypercall number is put in the %eax register, and the
28 * arguments (when required) are placed in %edx, %ebx and %ecx. If a return
29 * value makes sense, it's returned in %eax.
30 *
31 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
32 * Host, rather than returning failure. This reflects Winston Churchill's
33 * definition of a gentleman: "someone who is only rude intentionally". */
34#define LGUEST_TRAP_ENTRY 0x1F
35
36#ifndef __ASSEMBLY__
37#include <asm/hw_irq.h>
38
39static inline unsigned long
40hcall(unsigned long call,
41 unsigned long arg1, unsigned long arg2, unsigned long arg3)
42{
43 /* "int" is the Intel instruction to trigger a trap. */
44 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
45 /* The call is in %eax (aka "a"), and can be replaced */
46 : "=a"(call)
47 /* The other arguments are in %eax, %edx, %ebx & %ecx */
48 : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
49 /* "memory" means this might write somewhere in memory.
50 * This isn't true for all calls, but it's safe to tell
51 * gcc that it might happen so it doesn't get clever. */
52 : "memory");
53 return call;
54}
55/*:*/
56
57void async_hcall(unsigned long call,
58 unsigned long arg1, unsigned long arg2, unsigned long arg3);
59
60/* Can't use our min() macro here: needs to be a constant */
61#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
62
63#define LHCALL_RING_SIZE 64
64struct hcall_args
65{
66 /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
67 unsigned long arg0, arg2, arg3, arg1;
68};
69
70#endif /* !__ASSEMBLY__ */
71#endif /* _X86_LGUEST_HCALL_H */
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index e3ffd14a3f0..6a65231bc78 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -186,6 +186,7 @@ unifdef-y += cyclades.h
186unifdef-y += dccp.h 186unifdef-y += dccp.h
187unifdef-y += dirent.h 187unifdef-y += dirent.h
188unifdef-y += dlm.h 188unifdef-y += dlm.h
189unifdef-y += edd.h
189unifdef-y += elfcore.h 190unifdef-y += elfcore.h
190unifdef-y += errno.h 191unifdef-y += errno.h
191unifdef-y += errqueue.h 192unifdef-y += errqueue.h
@@ -306,6 +307,7 @@ unifdef-y += rtc.h
306unifdef-y += rtnetlink.h 307unifdef-y += rtnetlink.h
307unifdef-y += scc.h 308unifdef-y += scc.h
308unifdef-y += sched.h 309unifdef-y += sched.h
310unifdef-y += screen_info.h
309unifdef-y += sdla.h 311unifdef-y += sdla.h
310unifdef-y += selinux_netlink.h 312unifdef-y += selinux_netlink.h
311unifdef-y += sem.h 313unifdef-y += sem.h
@@ -341,6 +343,9 @@ unifdef-y += user.h
341unifdef-y += utsname.h 343unifdef-y += utsname.h
342unifdef-y += videodev2.h 344unifdef-y += videodev2.h
343unifdef-y += videodev.h 345unifdef-y += videodev.h
346unifdef-y += virtio_config.h
347unifdef-y += virtio_blk.h
348unifdef-y += virtio_net.h
344unifdef-y += wait.h 349unifdef-y += wait.h
345unifdef-y += wanrouter.h 350unifdef-y += wanrouter.h
346unifdef-y += watchdog.h 351unifdef-y += watchdog.h
diff --git a/include/linux/apm_bios.h b/include/linux/apm_bios.h
index 5f921c84827..9754baa1492 100644
--- a/include/linux/apm_bios.h
+++ b/include/linux/apm_bios.h
@@ -16,29 +16,29 @@
16 * General Public License for more details. 16 * General Public License for more details.
17 */ 17 */
18 18
19typedef unsigned short apm_event_t; 19#include <linux/types.h>
20typedef unsigned short apm_eventinfo_t; 20
21struct apm_bios_info {
22 __u16 version;
23 __u16 cseg;
24 __u32 offset;
25 __u16 cseg_16;
26 __u16 dseg;
27 __u16 flags;
28 __u16 cseg_len;
29 __u16 cseg_16_len;
30 __u16 dseg_len;
31};
21 32
22#ifdef __KERNEL__ 33#ifdef __KERNEL__
23 34
24#include <linux/types.h> 35typedef unsigned short apm_event_t;
36typedef unsigned short apm_eventinfo_t;
25 37
26#define APM_CS (GDT_ENTRY_APMBIOS_BASE * 8) 38#define APM_CS (GDT_ENTRY_APMBIOS_BASE * 8)
27#define APM_CS_16 (APM_CS + 8) 39#define APM_CS_16 (APM_CS + 8)
28#define APM_DS (APM_CS_16 + 8) 40#define APM_DS (APM_CS_16 + 8)
29 41
30struct apm_bios_info {
31 u16 version;
32 u16 cseg;
33 u32 offset;
34 u16 cseg_16;
35 u16 dseg;
36 u16 flags;
37 u16 cseg_len;
38 u16 cseg_16_len;
39 u16 dseg_len;
40};
41
42/* Results of APM Installation Check */ 42/* Results of APM Installation Check */
43#define APM_16_BIT_SUPPORT 0x0001 43#define APM_16_BIT_SUPPORT 0x0001
44#define APM_32_BIT_SUPPORT 0x0002 44#define APM_32_BIT_SUPPORT 0x0002
diff --git a/include/linux/edd.h b/include/linux/edd.h
index 7b647822d6d..5d747c5cd0f 100644
--- a/include/linux/edd.h
+++ b/include/linux/edd.h
@@ -67,113 +67,113 @@
67#define EDD_INFO_USE_INT13_FN50 (1 << 7) 67#define EDD_INFO_USE_INT13_FN50 (1 << 7)
68 68
69struct edd_device_params { 69struct edd_device_params {
70 u16 length; 70 __u16 length;
71 u16 info_flags; 71 __u16 info_flags;
72 u32 num_default_cylinders; 72 __u32 num_default_cylinders;
73 u32 num_default_heads; 73 __u32 num_default_heads;
74 u32 sectors_per_track; 74 __u32 sectors_per_track;
75 u64 number_of_sectors; 75 __u64 number_of_sectors;
76 u16 bytes_per_sector; 76 __u16 bytes_per_sector;
77 u32 dpte_ptr; /* 0xFFFFFFFF for our purposes */ 77 __u32 dpte_ptr; /* 0xFFFFFFFF for our purposes */
78 u16 key; /* = 0xBEDD */ 78 __u16 key; /* = 0xBEDD */
79 u8 device_path_info_length; /* = 44 */ 79 __u8 device_path_info_length; /* = 44 */
80 u8 reserved2; 80 __u8 reserved2;
81 u16 reserved3; 81 __u16 reserved3;
82 u8 host_bus_type[4]; 82 __u8 host_bus_type[4];
83 u8 interface_type[8]; 83 __u8 interface_type[8];
84 union { 84 union {
85 struct { 85 struct {
86 u16 base_address; 86 __u16 base_address;
87 u16 reserved1; 87 __u16 reserved1;
88 u32 reserved2; 88 __u32 reserved2;
89 } __attribute__ ((packed)) isa; 89 } __attribute__ ((packed)) isa;
90 struct { 90 struct {
91 u8 bus; 91 __u8 bus;
92 u8 slot; 92 __u8 slot;
93 u8 function; 93 __u8 function;
94 u8 channel; 94 __u8 channel;
95 u32 reserved; 95 __u32 reserved;
96 } __attribute__ ((packed)) pci; 96 } __attribute__ ((packed)) pci;
97 /* pcix is same as pci */ 97 /* pcix is same as pci */
98 struct { 98 struct {
99 u64 reserved; 99 __u64 reserved;
100 } __attribute__ ((packed)) ibnd; 100 } __attribute__ ((packed)) ibnd;
101 struct { 101 struct {
102 u64 reserved; 102 __u64 reserved;
103 } __attribute__ ((packed)) xprs; 103 } __attribute__ ((packed)) xprs;
104 struct { 104 struct {
105 u64 reserved; 105 __u64 reserved;
106 } __attribute__ ((packed)) htpt; 106 } __attribute__ ((packed)) htpt;
107 struct { 107 struct {
108 u64 reserved; 108 __u64 reserved;
109 } __attribute__ ((packed)) unknown; 109 } __attribute__ ((packed)) unknown;
110 } interface_path; 110 } interface_path;
111 union { 111 union {
112 struct { 112 struct {
113 u8 device; 113 __u8 device;
114 u8 reserved1; 114 __u8 reserved1;
115 u16 reserved2; 115 __u16 reserved2;
116 u32 reserved3; 116 __u32 reserved3;
117 u64 reserved4; 117 __u64 reserved4;
118 } __attribute__ ((packed)) ata; 118 } __attribute__ ((packed)) ata;
119 struct { 119 struct {
120 u8 device; 120 __u8 device;
121 u8 lun; 121 __u8 lun;
122 u8 reserved1; 122 __u8 reserved1;
123 u8 reserved2; 123 __u8 reserved2;
124 u32 reserved3; 124 __u32 reserved3;
125 u64 reserved4; 125 __u64 reserved4;
126 } __attribute__ ((packed)) atapi; 126 } __attribute__ ((packed)) atapi;
127 struct { 127 struct {
128 u16 id; 128 __u16 id;
129 u64 lun; 129 __u64 lun;
130 u16 reserved1; 130 __u16 reserved1;
131 u32 reserved2; 131 __u32 reserved2;
132 } __attribute__ ((packed)) scsi; 132 } __attribute__ ((packed)) scsi;
133 struct { 133 struct {
134 u64 serial_number; 134 __u64 serial_number;
135 u64 reserved; 135 __u64 reserved;
136 } __attribute__ ((packed)) usb; 136 } __attribute__ ((packed)) usb;
137 struct { 137 struct {
138 u64 eui; 138 __u64 eui;
139 u64 reserved; 139 __u64 reserved;
140 } __attribute__ ((packed)) i1394; 140 } __attribute__ ((packed)) i1394;
141 struct { 141 struct {
142 u64 wwid; 142 __u64 wwid;
143 u64 lun; 143 __u64 lun;
144 } __attribute__ ((packed)) fibre; 144 } __attribute__ ((packed)) fibre;
145 struct { 145 struct {
146 u64 identity_tag; 146 __u64 identity_tag;
147 u64 reserved; 147 __u64 reserved;
148 } __attribute__ ((packed)) i2o; 148 } __attribute__ ((packed)) i2o;
149 struct { 149 struct {
150 u32 array_number; 150 __u32 array_number;
151 u32 reserved1; 151 __u32 reserved1;
152 u64 reserved2; 152 __u64 reserved2;
153 } __attribute__ ((packed)) raid; 153 } __attribute__ ((packed)) raid;
154 struct { 154 struct {
155 u8 device; 155 __u8 device;
156 u8 reserved1; 156 __u8 reserved1;
157 u16 reserved2; 157 __u16 reserved2;
158 u32 reserved3; 158 __u32 reserved3;
159 u64 reserved4; 159 __u64 reserved4;
160 } __attribute__ ((packed)) sata; 160 } __attribute__ ((packed)) sata;
161 struct { 161 struct {
162 u64 reserved1; 162 __u64 reserved1;
163 u64 reserved2; 163 __u64 reserved2;
164 } __attribute__ ((packed)) unknown; 164 } __attribute__ ((packed)) unknown;
165 } device_path; 165 } device_path;
166 u8 reserved4; 166 __u8 reserved4;
167 u8 checksum; 167 __u8 checksum;
168} __attribute__ ((packed)); 168} __attribute__ ((packed));
169 169
170struct edd_info { 170struct edd_info {
171 u8 device; 171 __u8 device;
172 u8 version; 172 __u8 version;
173 u16 interface_support; 173 __u16 interface_support;
174 u16 legacy_max_cylinder; 174 __u16 legacy_max_cylinder;
175 u8 legacy_max_head; 175 __u8 legacy_max_head;
176 u8 legacy_sectors_per_track; 176 __u8 legacy_sectors_per_track;
177 struct edd_device_params params; 177 struct edd_device_params params;
178} __attribute__ ((packed)); 178} __attribute__ ((packed));
179 179
@@ -184,8 +184,9 @@ struct edd {
184 unsigned char edd_info_nr; 184 unsigned char edd_info_nr;
185}; 185};
186 186
187#ifdef __KERNEL__
187extern struct edd edd; 188extern struct edd edd;
188 189#endif /* __KERNEL__ */
189#endif /*!__ASSEMBLY__ */ 190#endif /*!__ASSEMBLY__ */
190 191
191#endif /* _LINUX_EDD_H */ 192#endif /* _LINUX_EDD_H */
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
index 157ad64aa7c..8beb2913462 100644
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -1,76 +1,16 @@
1/* Things the lguest guest needs to know. Note: like all lguest interfaces, 1/* Things the lguest guest needs to know. Note: like all lguest interfaces,
2 * this is subject to wild and random change between versions. */ 2 * this is subject to wild and random change between versions. */
3#ifndef _ASM_LGUEST_H 3#ifndef _LINUX_LGUEST_H
4#define _ASM_LGUEST_H 4#define _LINUX_LGUEST_H
5 5
6#ifndef __ASSEMBLY__ 6#ifndef __ASSEMBLY__
7#include <linux/time.h>
7#include <asm/irq.h> 8#include <asm/irq.h>
8 9#include <asm/lguest_hcall.h>
9#define LHCALL_FLUSH_ASYNC 0
10#define LHCALL_LGUEST_INIT 1
11#define LHCALL_CRASH 2
12#define LHCALL_LOAD_GDT 3
13#define LHCALL_NEW_PGTABLE 4
14#define LHCALL_FLUSH_TLB 5
15#define LHCALL_LOAD_IDT_ENTRY 6
16#define LHCALL_SET_STACK 7
17#define LHCALL_TS 8
18#define LHCALL_SET_CLOCKEVENT 9
19#define LHCALL_HALT 10
20#define LHCALL_BIND_DMA 12
21#define LHCALL_SEND_DMA 13
22#define LHCALL_SET_PTE 14
23#define LHCALL_SET_PMD 15
24#define LHCALL_LOAD_TLS 16
25 10
26#define LG_CLOCK_MIN_DELTA 100UL 11#define LG_CLOCK_MIN_DELTA 100UL
27#define LG_CLOCK_MAX_DELTA ULONG_MAX 12#define LG_CLOCK_MAX_DELTA ULONG_MAX
28 13
29/*G:031 First, how does our Guest contact the Host to ask for privileged
30 * operations? There are two ways: the direct way is to make a "hypercall",
31 * to make requests of the Host Itself.
32 *
33 * Our hypercall mechanism uses the highest unused trap code (traps 32 and
34 * above are used by real hardware interrupts). Seventeen hypercalls are
35 * available: the hypercall number is put in the %eax register, and the
36 * arguments (when required) are placed in %edx, %ebx and %ecx. If a return
37 * value makes sense, it's returned in %eax.
38 *
39 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
40 * Host, rather than returning failure. This reflects Winston Churchill's
41 * definition of a gentleman: "someone who is only rude intentionally". */
42#define LGUEST_TRAP_ENTRY 0x1F
43
44static inline unsigned long
45hcall(unsigned long call,
46 unsigned long arg1, unsigned long arg2, unsigned long arg3)
47{
48 /* "int" is the Intel instruction to trigger a trap. */
49 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
50 /* The call is in %eax (aka "a"), and can be replaced */
51 : "=a"(call)
52 /* The other arguments are in %eax, %edx, %ebx & %ecx */
53 : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
54 /* "memory" means this might write somewhere in memory.
55 * This isn't true for all calls, but it's safe to tell
56 * gcc that it might happen so it doesn't get clever. */
57 : "memory");
58 return call;
59}
60/*:*/
61
62void async_hcall(unsigned long call,
63 unsigned long arg1, unsigned long arg2, unsigned long arg3);
64
65/* Can't use our min() macro here: needs to be a constant */
66#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
67
68#define LHCALL_RING_SIZE 64
69struct hcall_ring
70{
71 u32 eax, edx, ebx, ecx;
72};
73
74/*G:032 The second method of communicating with the Host is via "struct 14/*G:032 The second method of communicating with the Host is via "struct
75 * lguest_data". The Guest's very first hypercall is to tell the Host where 15 * lguest_data". The Guest's very first hypercall is to tell the Host where
76 * this is, and then the Guest and Host both publish information in it. :*/ 16 * this is, and then the Guest and Host both publish information in it. :*/
@@ -97,20 +37,24 @@ struct lguest_data
97 /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ 37 /* 0xFF == done (set by Host), 0 == pending (set by Guest). */
98 u8 hcall_status[LHCALL_RING_SIZE]; 38 u8 hcall_status[LHCALL_RING_SIZE];
99 /* The actual registers for the hypercalls. */ 39 /* The actual registers for the hypercalls. */
100 struct hcall_ring hcalls[LHCALL_RING_SIZE]; 40 struct hcall_args hcalls[LHCALL_RING_SIZE];
101 41
102/* Fields initialized by the Host at boot: */ 42/* Fields initialized by the Host at boot: */
103 /* Memory not to try to access */ 43 /* Memory not to try to access */
104 unsigned long reserve_mem; 44 unsigned long reserve_mem;
105 /* ID of this Guest (used by network driver to set ethernet address) */
106 u16 guestid;
107 /* KHz for the TSC clock. */ 45 /* KHz for the TSC clock. */
108 u32 tsc_khz; 46 u32 tsc_khz;
47 /* Page where the top-level pagetable is */
48 unsigned long pgdir;
109 49
110/* Fields initialized by the Guest at boot: */ 50/* Fields initialized by the Guest at boot: */
111 /* Instruction range to suppress interrupts even if enabled */ 51 /* Instruction range to suppress interrupts even if enabled */
112 unsigned long noirq_start, noirq_end; 52 unsigned long noirq_start, noirq_end;
53 /* Address above which page tables are all identical. */
54 unsigned long kernel_address;
55 /* The vector to try to use for system calls (0x40 or 0x80). */
56 unsigned int syscall_vec;
113}; 57};
114extern struct lguest_data lguest_data; 58extern struct lguest_data lguest_data;
115#endif /* __ASSEMBLY__ */ 59#endif /* __ASSEMBLY__ */
116#endif /* _ASM_LGUEST_H */ 60#endif /* _LINUX_LGUEST_H */
diff --git a/include/linux/lguest_bus.h b/include/linux/lguest_bus.h
deleted file mode 100644
index d27853ddc64..00000000000
--- a/include/linux/lguest_bus.h
+++ /dev/null
@@ -1,51 +0,0 @@
1#ifndef _ASM_LGUEST_DEVICE_H
2#define _ASM_LGUEST_DEVICE_H
3/* Everything you need to know about lguest devices. */
4#include <linux/device.h>
5#include <linux/lguest.h>
6#include <linux/lguest_launcher.h>
7
8struct lguest_device {
9 /* Unique busid, and index into lguest_page->devices[] */
10 unsigned int index;
11
12 struct device dev;
13
14 /* Driver can hang data off here. */
15 void *private;
16};
17
18/*D:380 Since interrupt numbers are arbitrary, we use a convention: each device
19 * can use the interrupt number corresponding to its index. The +1 is because
20 * interrupt 0 is not usable (it's actually the timer interrupt). */
21static inline int lgdev_irq(const struct lguest_device *dev)
22{
23 return dev->index + 1;
24}
25/*:*/
26
27/* dma args must not be vmalloced! */
28void lguest_send_dma(unsigned long key, struct lguest_dma *dma);
29int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas,
30 unsigned int num, u8 irq);
31void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas);
32
33/* Map the virtual device space */
34void *lguest_map(unsigned long phys_addr, unsigned long pages);
35void lguest_unmap(void *);
36
37struct lguest_driver {
38 const char *name;
39 struct module *owner;
40 u16 device_type;
41 int (*probe)(struct lguest_device *dev);
42 void (*remove)(struct lguest_device *dev);
43
44 struct device_driver drv;
45};
46
47extern int register_lguest_driver(struct lguest_driver *drv);
48extern void unregister_lguest_driver(struct lguest_driver *drv);
49
50extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */
51#endif /* _ASM_LGUEST_DEVICE_H */
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index 64167057944..61e1e3e6b1c 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_LGUEST_USER 1#ifndef _ASM_LGUEST_USER
2#define _ASM_LGUEST_USER 2#define _ASM_LGUEST_USER
3/* Everything the "lguest" userspace program needs to know. */ 3/* Everything the "lguest" userspace program needs to know. */
4#include <linux/types.h>
4/* They can register up to 32 arrays of lguest_dma. */ 5/* They can register up to 32 arrays of lguest_dma. */
5#define LGUEST_MAX_DMA 32 6#define LGUEST_MAX_DMA 32
6/* At most we can dma 16 lguest_dma in one op. */ 7/* At most we can dma 16 lguest_dma in one op. */
@@ -9,66 +10,6 @@
9/* How many devices? Assume each one wants up to two dma arrays per device. */ 10/* How many devices? Assume each one wants up to two dma arrays per device. */
10#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2) 11#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2)
11 12
12/*D:200
13 * Lguest I/O
14 *
15 * The lguest I/O mechanism is the only way Guests can talk to devices. There
16 * are two hypercalls involved: SEND_DMA for output and BIND_DMA for input. In
17 * each case, "struct lguest_dma" describes the buffer: this contains 16
18 * addr/len pairs, and if there are fewer buffer elements the len array is
19 * terminated with a 0.
20 *
21 * I/O is organized by keys: BIND_DMA attaches buffers to a particular key, and
22 * SEND_DMA transfers to buffers bound to particular key. By convention, keys
23 * correspond to a physical address within the device's page. This means that
24 * devices will never accidentally end up with the same keys, and allows the
25 * Host use The Futex Trick (as we'll see later in our journey).
26 *
27 * SEND_DMA simply indicates a key to send to, and the physical address of the
28 * "struct lguest_dma" to send. The Host will write the number of bytes
29 * transferred into the "struct lguest_dma"'s used_len member.
30 *
31 * BIND_DMA indicates a key to bind to, a pointer to an array of "struct
32 * lguest_dma"s ready for receiving, the size of that array, and an interrupt
33 * to trigger when data is received. The Host will only allow transfers into
34 * buffers with a used_len of zero: it then sets used_len to the number of
35 * bytes transferred and triggers the interrupt for the Guest to process the
36 * new input. */
37struct lguest_dma
38{
39 /* 0 if free to be used, filled by the Host. */
40 u32 used_len;
41 unsigned long addr[LGUEST_MAX_DMA_SECTIONS];
42 u16 len[LGUEST_MAX_DMA_SECTIONS];
43};
44/*:*/
45
46/*D:460 This is the layout of a block device memory page. The Launcher sets up
47 * the num_sectors initially to tell the Guest the size of the disk. The Guest
48 * puts the type, sector and length of the request in the first three fields,
49 * then DMAs to the Host. The Host processes the request, sets up the result,
50 * then DMAs back to the Guest. */
51struct lguest_block_page
52{
53 /* 0 is a read, 1 is a write. */
54 int type;
55 u32 sector; /* Offset in device = sector * 512. */
56 u32 bytes; /* Length expected to be read/written in bytes */
57 /* 0 = pending, 1 = done, 2 = done, error */
58 int result;
59 u32 num_sectors; /* Disk length = num_sectors * 512 */
60};
61
62/*D:520 The network device is basically a memory page where all the Guests on
63 * the network publish their MAC (ethernet) addresses: it's an array of "struct
64 * lguest_net": */
65struct lguest_net
66{
67 /* Simply the mac address (with multicast bit meaning promisc). */
68 unsigned char mac[6];
69};
70/*:*/
71
72/* Where the Host expects the Guest to SEND_DMA console output to. */ 13/* Where the Host expects the Guest to SEND_DMA console output to. */
73#define LGUEST_CONSOLE_DMA_KEY 0 14#define LGUEST_CONSOLE_DMA_KEY 0
74 15
@@ -81,38 +22,29 @@ struct lguest_net
81 * complex burden for the Host and suboptimal for the Guest, so we have our own 22 * complex burden for the Host and suboptimal for the Guest, so we have our own
82 * "lguest" bus and simple drivers. 23 * "lguest" bus and simple drivers.
83 * 24 *
84 * Devices are described by an array of LGUEST_MAX_DEVICES of these structs, 25 * Devices are described by a simplified ID, a status byte, and some "config"
85 * placed by the Launcher just above the top of physical memory: 26 * bytes which describe this device's configuration. This is placed by the
27 * Launcher just above the top of physical memory:
86 */ 28 */
87struct lguest_device_desc { 29struct lguest_device_desc {
88 /* The device type: console, network, disk etc. */ 30 /* The device type: console, network, disk etc. Type 0 terminates. */
89 u16 type; 31 __u8 type;
90#define LGUEST_DEVICE_T_CONSOLE 1 32 /* The number of bytes of the config array. */
91#define LGUEST_DEVICE_T_NET 2 33 __u8 config_len;
92#define LGUEST_DEVICE_T_BLOCK 3 34 /* A status byte, written by the Guest. */
93 35 __u8 status;
94 /* The specific features of this device: these depends on device type 36 __u8 config[0];
95 * except for LGUEST_DEVICE_F_RANDOMNESS. */ 37};
96 u16 features;
97#define LGUEST_NET_F_NOCSUM 0x4000 /* Don't bother checksumming */
98#define LGUEST_DEVICE_F_RANDOMNESS 0x8000 /* IRQ is fairly random */
99
100 /* This is how the Guest reports status of the device: the Host can set
101 * LGUEST_DEVICE_S_REMOVED to indicate removal, but the rest are only
102 * ever manipulated by the Guest, and only ever set. */
103 u16 status;
104/* 256 and above are device specific. */
105#define LGUEST_DEVICE_S_ACKNOWLEDGE 1 /* We have seen device. */
106#define LGUEST_DEVICE_S_DRIVER 2 /* We have found a driver */
107#define LGUEST_DEVICE_S_DRIVER_OK 4 /* Driver says OK! */
108#define LGUEST_DEVICE_S_REMOVED 8 /* Device has gone away. */
109#define LGUEST_DEVICE_S_REMOVED_ACK 16 /* Driver has been told. */
110#define LGUEST_DEVICE_S_FAILED 128 /* Something actually failed */
111 38
112 /* Each device exists somewhere in Guest physical memory, over some 39/*D:135 This is how we expect the device configuration field for a virtqueue
113 * number of pages. */ 40 * (type VIRTIO_CONFIG_F_VIRTQUEUE) to be laid out: */
114 u16 num_pages; 41struct lguest_vqconfig {
115 u32 pfn; 42 /* The number of entries in the virtio_ring */
43 __u16 num;
44 /* The interrupt we get when something happens. */
45 __u16 irq;
46 /* The page number of the virtio ring for this device. */
47 __u32 pfn;
116}; 48};
117/*:*/ 49/*:*/
118 50
@@ -120,7 +52,7 @@ struct lguest_device_desc {
120enum lguest_req 52enum lguest_req
121{ 53{
122 LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */ 54 LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */
123 LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */ 55 LHREQ_GETDMA, /* No longer used */
124 LHREQ_IRQ, /* + irq */ 56 LHREQ_IRQ, /* + irq */
125 LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ 57 LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
126}; 58};
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 522b0dd836c..e9fddb42f26 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -361,4 +361,10 @@ struct ssb_device_id {
361#define SSB_ANY_ID 0xFFFF 361#define SSB_ANY_ID 0xFFFF
362#define SSB_ANY_REV 0xFF 362#define SSB_ANY_REV 0xFF
363 363
364struct virtio_device_id {
365 __u32 device;
366 __u32 vendor;
367};
368#define VIRTIO_DEV_ANY_ID 0xffffffff
369
364#endif /* LINUX_MOD_DEVICETABLE_H */ 370#endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h
index ba81ffe9958..827b85bbf38 100644
--- a/include/linux/screen_info.h
+++ b/include/linux/screen_info.h
@@ -8,45 +8,43 @@
8 */ 8 */
9 9
10struct screen_info { 10struct screen_info {
11 u8 orig_x; /* 0x00 */ 11 __u8 orig_x; /* 0x00 */
12 u8 orig_y; /* 0x01 */ 12 __u8 orig_y; /* 0x01 */
13 u16 ext_mem_k; /* 0x02 */ 13 __u16 ext_mem_k; /* 0x02 */
14 u16 orig_video_page; /* 0x04 */ 14 __u16 orig_video_page; /* 0x04 */
15 u8 orig_video_mode; /* 0x06 */ 15 __u8 orig_video_mode; /* 0x06 */
16 u8 orig_video_cols; /* 0x07 */ 16 __u8 orig_video_cols; /* 0x07 */
17 u16 unused2; /* 0x08 */ 17 __u16 unused2; /* 0x08 */
18 u16 orig_video_ega_bx; /* 0x0a */ 18 __u16 orig_video_ega_bx;/* 0x0a */
19 u16 unused3; /* 0x0c */ 19 __u16 unused3; /* 0x0c */
20 u8 orig_video_lines; /* 0x0e */ 20 __u8 orig_video_lines; /* 0x0e */
21 u8 orig_video_isVGA; /* 0x0f */ 21 __u8 orig_video_isVGA; /* 0x0f */
22 u16 orig_video_points; /* 0x10 */ 22 __u16 orig_video_points;/* 0x10 */
23 23
24 /* VESA graphic mode -- linear frame buffer */ 24 /* VESA graphic mode -- linear frame buffer */
25 u16 lfb_width; /* 0x12 */ 25 __u16 lfb_width; /* 0x12 */
26 u16 lfb_height; /* 0x14 */ 26 __u16 lfb_height; /* 0x14 */
27 u16 lfb_depth; /* 0x16 */ 27 __u16 lfb_depth; /* 0x16 */
28 u32 lfb_base; /* 0x18 */ 28 __u32 lfb_base; /* 0x18 */
29 u32 lfb_size; /* 0x1c */ 29 __u32 lfb_size; /* 0x1c */
30 u16 cl_magic, cl_offset; /* 0x20 */ 30 __u16 cl_magic, cl_offset; /* 0x20 */
31 u16 lfb_linelength; /* 0x24 */ 31 __u16 lfb_linelength; /* 0x24 */
32 u8 red_size; /* 0x26 */ 32 __u8 red_size; /* 0x26 */
33 u8 red_pos; /* 0x27 */ 33 __u8 red_pos; /* 0x27 */
34 u8 green_size; /* 0x28 */ 34 __u8 green_size; /* 0x28 */
35 u8 green_pos; /* 0x29 */ 35 __u8 green_pos; /* 0x29 */
36 u8 blue_size; /* 0x2a */ 36 __u8 blue_size; /* 0x2a */
37 u8 blue_pos; /* 0x2b */ 37 __u8 blue_pos; /* 0x2b */
38 u8 rsvd_size; /* 0x2c */ 38 __u8 rsvd_size; /* 0x2c */
39 u8 rsvd_pos; /* 0x2d */ 39 __u8 rsvd_pos; /* 0x2d */
40 u16 vesapm_seg; /* 0x2e */ 40 __u16 vesapm_seg; /* 0x2e */
41 u16 vesapm_off; /* 0x30 */ 41 __u16 vesapm_off; /* 0x30 */
42 u16 pages; /* 0x32 */ 42 __u16 pages; /* 0x32 */
43 u16 vesa_attributes; /* 0x34 */ 43 __u16 vesa_attributes; /* 0x34 */
44 u32 capabilities; /* 0x36 */ 44 __u32 capabilities; /* 0x36 */
45 u8 _reserved[6]; /* 0x3a */ 45 __u8 _reserved[6]; /* 0x3a */
46} __attribute__((packed)); 46} __attribute__((packed));
47 47
48extern struct screen_info screen_info;
49
50#define VIDEO_TYPE_MDA 0x10 /* Monochrome Text Display */ 48#define VIDEO_TYPE_MDA 0x10 /* Monochrome Text Display */
51#define VIDEO_TYPE_CGA 0x11 /* CGA Display */ 49#define VIDEO_TYPE_CGA 0x11 /* CGA Display */
52#define VIDEO_TYPE_EGAM 0x20 /* EGA/VGA in Monochrome Mode */ 50#define VIDEO_TYPE_EGAM 0x20 /* EGA/VGA in Monochrome Mode */
@@ -65,4 +63,17 @@ extern struct screen_info screen_info;
65 63
66#define VIDEO_TYPE_PMAC 0x60 /* PowerMacintosh frame buffer. */ 64#define VIDEO_TYPE_PMAC 0x60 /* PowerMacintosh frame buffer. */
67 65
66#ifdef __KERNEL__
67extern struct screen_info screen_info;
68
69#define ORIG_X (screen_info.orig_x)
70#define ORIG_Y (screen_info.orig_y)
71#define ORIG_VIDEO_MODE (screen_info.orig_video_mode)
72#define ORIG_VIDEO_COLS (screen_info.orig_video_cols)
73#define ORIG_VIDEO_EGA_BX (screen_info.orig_video_ega_bx)
74#define ORIG_VIDEO_LINES (screen_info.orig_video_lines)
75#define ORIG_VIDEO_ISVGA (screen_info.orig_video_isVGA)
76#define ORIG_VIDEO_POINTS (screen_info.orig_video_points)
77#endif /* __KERNEL__ */
78
68#endif /* _SCREEN_INFO_H */ 79#endif /* _SCREEN_INFO_H */
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
new file mode 100644
index 00000000000..14e1379876d
--- /dev/null
+++ b/include/linux/virtio.h
@@ -0,0 +1,110 @@
1#ifndef _LINUX_VIRTIO_H
2#define _LINUX_VIRTIO_H
3/* Everything a virtio driver needs to work with any particular virtio
4 * implementation. */
5#include <linux/types.h>
6#include <linux/scatterlist.h>
7#include <linux/spinlock.h>
8#include <linux/device.h>
9#include <linux/mod_devicetable.h>
10
11/**
12 * virtqueue - a queue to register buffers for sending or receiving.
13 * @callback: the function to call when buffers are consumed (can be NULL).
14 * If this returns false, callbacks are suppressed until vq_ops->restart
15 * is called.
16 * @vdev: the virtio device this queue was created for.
17 * @vq_ops: the operations for this virtqueue (see below).
18 * @priv: a pointer for the virtqueue implementation to use.
19 */
20struct virtqueue
21{
22 bool (*callback)(struct virtqueue *vq);
23 struct virtio_device *vdev;
24 struct virtqueue_ops *vq_ops;
25 void *priv;
26};
27
28/**
29 * virtqueue_ops - operations for virtqueue abstraction layer
30 * @add_buf: expose buffer to other end
31 * vq: the struct virtqueue we're talking about.
32 * sg: the description of the buffer(s).
33 * out_num: the number of sg readable by other side
34 * in_num: the number of sg which are writable (after readable ones)
35 * data: the token identifying the buffer.
36 * Returns 0 or an error.
37 * @kick: update after add_buf
38 * vq: the struct virtqueue
39 * After one or more add_buf calls, invoke this to kick the other side.
40 * @get_buf: get the next used buffer
41 * vq: the struct virtqueue we're talking about.
42 * len: the length written into the buffer
43 * Returns NULL or the "data" token handed to add_buf.
44 * @restart: restart callbacks after callback returned false.
45 * vq: the struct virtqueue we're talking about.
46 * This returns "false" (and doesn't re-enable) if there are pending
47 * buffers in the queue, to avoid a race.
48 * @shutdown: "unadd" all buffers.
49 * vq: the struct virtqueue we're talking about.
50 * Remove everything from the queue.
51 *
52 * Locking rules are straightforward: the driver is responsible for
53 * locking. No two operations may be invoked simultaneously.
54 *
55 * All operations can be called in any context.
56 */
57struct virtqueue_ops {
58 int (*add_buf)(struct virtqueue *vq,
59 struct scatterlist sg[],
60 unsigned int out_num,
61 unsigned int in_num,
62 void *data);
63
64 void (*kick)(struct virtqueue *vq);
65
66 void *(*get_buf)(struct virtqueue *vq, unsigned int *len);
67
68 bool (*restart)(struct virtqueue *vq);
69
70 void (*shutdown)(struct virtqueue *vq);
71};
72
73/**
74 * virtio_device - representation of a device using virtio
75 * @index: unique position on the virtio bus
76 * @dev: underlying device.
77 * @id: the device type identification (used to match it with a driver).
78 * @config: the configuration ops for this device.
79 * @priv: private pointer for the driver's use.
80 */
81struct virtio_device
82{
83 int index;
84 struct device dev;
85 struct virtio_device_id id;
86 struct virtio_config_ops *config;
87 void *priv;
88};
89
90int register_virtio_device(struct virtio_device *dev);
91void unregister_virtio_device(struct virtio_device *dev);
92
93/**
94 * virtio_driver - operations for a virtio I/O driver
95 * @driver: underlying device driver (populate name and owner).
96 * @id_table: the ids serviced by this driver.
97 * @probe: the function to call when a device is found. Returns 0 or
98 * an error.
99 * @remove: the function when a device is removed.
100 */
101struct virtio_driver {
102 struct device_driver driver;
103 const struct virtio_device_id *id_table;
104 int (*probe)(struct virtio_device *dev);
105 void (*remove)(struct virtio_device *dev);
106};
107
108int register_virtio_driver(struct virtio_driver *drv);
109void unregister_virtio_driver(struct virtio_driver *drv);
110#endif /* _LINUX_VIRTIO_H */
diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h
new file mode 100644
index 00000000000..7bd2bce0cfd
--- /dev/null
+++ b/include/linux/virtio_blk.h
@@ -0,0 +1,51 @@
1#ifndef _LINUX_VIRTIO_BLK_H
2#define _LINUX_VIRTIO_BLK_H
3#include <linux/virtio_config.h>
4
5/* The ID for virtio_block */
6#define VIRTIO_ID_BLOCK 2
7
8/* Feature bits */
9#define VIRTIO_CONFIG_BLK_F 0x40
10#define VIRTIO_BLK_F_BARRIER 1 /* Does host support barriers? */
11
12/* The capacity (in 512-byte sectors). */
13#define VIRTIO_CONFIG_BLK_F_CAPACITY 0x41
14/* The maximum segment size. */
15#define VIRTIO_CONFIG_BLK_F_SIZE_MAX 0x42
16/* The maximum number of segments. */
17#define VIRTIO_CONFIG_BLK_F_SEG_MAX 0x43
18
19/* These two define direction. */
20#define VIRTIO_BLK_T_IN 0
21#define VIRTIO_BLK_T_OUT 1
22
23/* This bit says it's a scsi command, not an actual read or write. */
24#define VIRTIO_BLK_T_SCSI_CMD 2
25
26/* Barrier before this op. */
27#define VIRTIO_BLK_T_BARRIER 0x80000000
28
29/* This is the first element of the read scatter-gather list. */
30struct virtio_blk_outhdr
31{
32 /* VIRTIO_BLK_T* */
33 __u32 type;
34 /* io priority. */
35 __u32 ioprio;
36 /* Sector (ie. 512 byte offset) */
37 __u64 sector;
38 /* Where to put reply. */
39 __u64 id;
40};
41
42#define VIRTIO_BLK_S_OK 0
43#define VIRTIO_BLK_S_IOERR 1
44#define VIRTIO_BLK_S_UNSUPP 2
45
46/* This is the first element of the write scatter-gather list */
47struct virtio_blk_inhdr
48{
49 unsigned char status;
50};
51#endif /* _LINUX_VIRTIO_BLK_H */
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
new file mode 100644
index 00000000000..bcc01888df7
--- /dev/null
+++ b/include/linux/virtio_config.h
@@ -0,0 +1,111 @@
1#ifndef _LINUX_VIRTIO_CONFIG_H
2#define _LINUX_VIRTIO_CONFIG_H
3/* Virtio devices use a standardized configuration space to define their
4 * features and pass configuration information, but each implementation can
5 * store and access that space differently. */
6#include <linux/types.h>
7
8/* Status byte for guest to report progress, and synchronize config. */
9/* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */
10#define VIRTIO_CONFIG_S_ACKNOWLEDGE 1
11/* We have found a driver for the device. */
12#define VIRTIO_CONFIG_S_DRIVER 2
13/* Driver has used its parts of the config, and is happy */
14#define VIRTIO_CONFIG_S_DRIVER_OK 4
15/* We've given up on this device. */
16#define VIRTIO_CONFIG_S_FAILED 0x80
17
18/* Feature byte (actually 7 bits available): */
19/* Requirements/features of the virtio implementation. */
20#define VIRTIO_CONFIG_F_VIRTIO 1
21/* Requirements/features of the virtqueue (may have more than one). */
22#define VIRTIO_CONFIG_F_VIRTQUEUE 2
23
24#ifdef __KERNEL__
25struct virtio_device;
26
27/**
28 * virtio_config_ops - operations for configuring a virtio device
29 * @find: search for the next configuration field of the given type.
30 * vdev: the virtio_device
31 * type: the feature type
32 * len: the (returned) length of the field if found.
33 * Returns a token if found, or NULL. Never returns the same field twice
34 * (ie. it's used up).
35 * @get: read the value of a configuration field after find().
36 * vdev: the virtio_device
37 * token: the token returned from find().
38 * buf: the buffer to write the field value into.
39 * len: the length of the buffer (given by find()).
40 * Note that contents are conventionally little-endian.
41 * @set: write the value of a configuration field after find().
42 * vdev: the virtio_device
43 * token: the token returned from find().
44 * buf: the buffer to read the field value from.
45 * len: the length of the buffer (given by find()).
46 * Note that contents are conventionally little-endian.
47 * @get_status: read the status byte
48 * vdev: the virtio_device
49 * Returns the status byte
50 * @set_status: write the status byte
51 * vdev: the virtio_device
52 * status: the new status byte
53 * @find_vq: find the first VIRTIO_CONFIG_F_VIRTQUEUE and create a virtqueue.
54 * vdev: the virtio_device
55 * callback: the virtqueue callback
56 * Returns the new virtqueue or ERR_PTR().
57 * @del_vq: free a virtqueue found by find_vq().
58 */
59struct virtio_config_ops
60{
61 void *(*find)(struct virtio_device *vdev, u8 type, unsigned *len);
62 void (*get)(struct virtio_device *vdev, void *token,
63 void *buf, unsigned len);
64 void (*set)(struct virtio_device *vdev, void *token,
65 const void *buf, unsigned len);
66 u8 (*get_status)(struct virtio_device *vdev);
67 void (*set_status)(struct virtio_device *vdev, u8 status);
68 struct virtqueue *(*find_vq)(struct virtio_device *vdev,
69 bool (*callback)(struct virtqueue *));
70 void (*del_vq)(struct virtqueue *vq);
71};
72
/**
 * virtio_config_val - get a single virtio config and mark it used.
 * @vdev: the virtio device
 * @type: the type to search for.
 * @v: a pointer to the value to fill in.
 *
 * Once used, the config type is marked with VIRTIO_CONFIG_F_USED so it can't
 * be found again.  This version does endian conversion: when
 * __virtio_config_val() returns 0, a 2, 4 or 8 byte value is converted in
 * place from little-endian to CPU byte order (1 byte values are left alone).
 *
 * Evaluates to the value returned by __virtio_config_val().  The pointed-to
 * type must be 1, 2, 4 or 8 bytes wide; anything else fails the build.
 *
 * Note: @v is expanded more than once, so it must not have side effects. */
#define virtio_config_val(vdev, type, v) ({				\
	int _err = __virtio_config_val((vdev),(type),(v),sizeof(*(v)));	\
									\
	BUILD_BUG_ON(sizeof(*(v)) != 1 && sizeof(*(v)) != 2		\
		     && sizeof(*(v)) != 4 && sizeof(*(v)) != 8);	\
	if (!_err) {							\
		switch (sizeof(*(v))) {					\
		/* (v) is parenthesized so the cast binds to the whole	\
		 * argument expression, not just its first operand. */	\
		case 2: le16_to_cpus((__u16 *)(v)); break;		\
		case 4: le32_to_cpus((__u32 *)(v)); break;		\
		case 8: le64_to_cpus((__u64 *)(v)); break;		\
		}							\
	}								\
	_err;								\
})
95
96int __virtio_config_val(struct virtio_device *dev,
97 u8 type, void *val, size_t size);
98
99/**
100 * virtio_use_bit - helper to use a feature bit in a bitfield value.
101 * @vdev: the virtio device
102 * @token: the token as returned from vdev->config->find().
103 * @len: the length of the field.
104 * @bitnum: the bit to test.
105 *
106 * If handed a NULL token, it returns false, otherwise returns bit status.
107 * If it's one, it sets the mirroring acknowledgement bit. */
108int virtio_use_bit(struct virtio_device *vdev,
109 void *token, unsigned int len, unsigned int bitnum);
110#endif /* __KERNEL__ */
111#endif /* _LINUX_VIRTIO_CONFIG_H */
diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h
new file mode 100644
index 00000000000..ed2d4ead7eb
--- /dev/null
+++ b/include/linux/virtio_console.h
@@ -0,0 +1,12 @@
1#ifndef _LINUX_VIRTIO_CONSOLE_H
2#define _LINUX_VIRTIO_CONSOLE_H
3#include <linux/virtio_config.h>
4
5/* The ID for virtio console */
6#define VIRTIO_ID_CONSOLE 3
7
8#ifdef __KERNEL__
9int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int));
10#endif /* __KERNEL__ */
11
12#endif /* _LINUX_VIRTIO_CONSOLE_H */
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
new file mode 100644
index 00000000000..ae469ae55d3
--- /dev/null
+++ b/include/linux/virtio_net.h
@@ -0,0 +1,36 @@
1#ifndef _LINUX_VIRTIO_NET_H
2#define _LINUX_VIRTIO_NET_H
3#include <linux/virtio_config.h>
4
5/* The ID for virtio_net */
6#define VIRTIO_ID_NET 1
7
8/* The bitmap of config for virtio net */
9#define VIRTIO_CONFIG_NET_F 0x40
10#define VIRTIO_NET_F_NO_CSUM 0
11#define VIRTIO_NET_F_TSO4 1
12#define VIRTIO_NET_F_UFO 2
13#define VIRTIO_NET_F_TSO4_ECN 3
14#define VIRTIO_NET_F_TSO6 4
15
16/* The config defining mac address. */
17#define VIRTIO_CONFIG_NET_MAC_F 0x41
18
19/* This is the first element of the scatter-gather list. If you don't
20 * specify GSO or CSUM features, you can simply ignore the header. */
21struct virtio_net_hdr
22{
23#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset
24 __u8 flags;
25#define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame
26#define VIRTIO_NET_HDR_GSO_TCPV4 1 // GSO frame, IPv4 TCP (TSO)
27/* FIXME: Do we need this? If they said they can handle ECN, do they care? */
28#define VIRTIO_NET_HDR_GSO_TCPV4_ECN 2 // GSO frame, IPv4 TCP w/ ECN
29#define VIRTIO_NET_HDR_GSO_UDP 3 // GSO frame, IPv4 UDP (UFO)
30#define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP
31 __u8 gso_type;
32 __u16 gso_size;
33 __u16 csum_start;
34 __u16 csum_offset;
35};
36#endif /* _LINUX_VIRTIO_NET_H */
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
new file mode 100644
index 00000000000..ac69e7bb5a1
--- /dev/null
+++ b/include/linux/virtio_ring.h
@@ -0,0 +1,119 @@
1#ifndef _LINUX_VIRTIO_RING_H
2#define _LINUX_VIRTIO_RING_H
3/* An interface for efficient virtio implementation, currently for use by KVM
4 * and lguest, but hopefully others soon. Do NOT change this since it will
5 * break existing servers and clients.
6 *
7 * This header is BSD licensed so anyone can use the definitions to implement
8 * compatible drivers/servers.
9 *
10 * Copyright Rusty Russell IBM Corporation 2007. */
11#include <linux/types.h>
12
13/* This marks a buffer as continuing via the next field. */
14#define VRING_DESC_F_NEXT 1
15/* This marks a buffer as write-only (otherwise read-only). */
16#define VRING_DESC_F_WRITE 2
17
18/* This means don't notify other side when buffer added. */
19#define VRING_USED_F_NO_NOTIFY 1
20/* This means don't interrupt guest when buffer consumed. */
21#define VRING_AVAIL_F_NO_INTERRUPT 1
22
23/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
struct vring_desc
{
	/* Address (guest-physical). */
	__u64 addr;
	/* Length. */
	__u32 len;
	/* The VRING_DESC_F_* flags. */
	__u16 flags;
	/* We chain unused descriptors via this, too */
	__u16 next;
};

/* Ring of available descriptor heads; ring[] holds num entries. */
struct vring_avail
{
	__u16 flags;
	__u16 idx;
	__u16 ring[];
};

/* u32 is used here for ids for padding reasons. */
struct vring_used_elem
{
	/* Index of start of used descriptor chain. */
	__u32 id;
	/* Total length of the descriptor chain which was used (written to) */
	__u32 len;
};

/* Ring of used descriptor chains; ring[] holds num entries. */
struct vring_used
{
	__u16 flags;
	__u16 idx;
	struct vring_used_elem ring[];
};

/* Bookkeeping for one ring: the entry count plus pointers to its three parts.
 * This is NOT part of the shared memory layout, and its size depends on the
 * pointer width, so it must never be used to compute offsets into the ring. */
struct vring {
	unsigned int num;

	struct vring_desc *desc;

	struct vring_avail *avail;

	struct vring_used *used;
};

/* The standard layout for the ring is a continuous chunk of memory which looks
 * like this.  The used fields start at offset
 * (num + 1) * (sizeof(struct vring_desc) + sizeof(__u16)).
 *
 * struct vring
 * {
 *	// The actual descriptors (16 bytes each)
 *	struct vring_desc desc[num];
 *
 *	// A ring of available descriptor heads with free-running index.
 *	__u16 avail_flags;
 *	__u16 avail_idx;
 *	__u16 available[num];
 *
 *	// Padding so a correctly-chosen num value will cache-align used_idx.
 *	char pad[sizeof(struct vring_desc) - sizeof(avail_flags)];
 *
 *	// A ring of used descriptor heads with free-running index.
 *	__u16 used_flags;
 *	__u16 used_idx;
 *	struct vring_used_elem used[num];
 * };
 */

/**
 * vring_init - point a struct vring at the parts of a ring laid out in memory
 * @vr: the vring to fill in
 * @num: number of descriptors in the ring
 * @p: start of the memory chunk, at least vring_size(@num) bytes long
 *
 * Offsets are computed from sizeof(struct vring_desc), matching the layout
 * above and vring_size().  (Using sizeof(struct vring) here only happens to
 * give the same answer where pointers are 4 bytes wide.)
 */
static inline void vring_init(struct vring *vr, unsigned int num, void *p)
{
	char *base = p;

	vr->num = num;
	vr->desc = p;
	/* The available ring follows the num descriptors. */
	vr->avail = (void *)(base + num * sizeof(struct vring_desc));
	/* desc[num] + avail header/ring + pad == (num+1) * (16 + 2) bytes. */
	vr->used = (void *)(base + (num + 1)
			    * (sizeof(struct vring_desc) + sizeof(__u16)));
}

/**
 * vring_size - bytes of memory needed for a ring of @num descriptors
 * @num: number of descriptors in the ring
 *
 * Everything up to the used ring, then the used flags+idx (a __u32's worth),
 * then @num used elements.
 */
static inline unsigned vring_size(unsigned int num)
{
	return (num + 1) * (sizeof(struct vring_desc) + sizeof(__u16))
		+ sizeof(__u32) + num * sizeof(struct vring_used_elem);
}
104
105#ifdef __KERNEL__
106#include <linux/irqreturn.h>
107struct virtio_device;
108struct virtqueue;
109
110struct virtqueue *vring_new_virtqueue(unsigned int num,
111 struct virtio_device *vdev,
112 void *pages,
113 void (*notify)(struct virtqueue *vq),
114 bool (*callback)(struct virtqueue *vq));
115void vring_del_virtqueue(struct virtqueue *vq);
116
117irqreturn_t vring_interrupt(int irq, void *_vq);
118#endif /* __KERNEL__ */
119#endif /* _LINUX_VIRTIO_RING_H */
diff --git a/include/video/Kbuild b/include/video/Kbuild
index 53a6c7310e6..0e406f730c2 100644
--- a/include/video/Kbuild
+++ b/include/video/Kbuild
@@ -1 +1,2 @@
1unifdef-y += sisfb.h uvesafb.h 1unifdef-y += sisfb.h uvesafb.h
2unifdef-y += edid.h
diff --git a/include/video/edid.h b/include/video/edid.h
index f6a42d6c2e2..928c342b33d 100644
--- a/include/video/edid.h
+++ b/include/video/edid.h
@@ -1,17 +1,16 @@
1#ifndef __linux_video_edid_h__ 1#ifndef __linux_video_edid_h__
2#define __linux_video_edid_h__ 2#define __linux_video_edid_h__
3 3
4#ifdef __KERNEL__ 4#if !defined(__KERNEL__) || defined(CONFIG_X86)
5
6 5
7#ifdef CONFIG_X86
8struct edid_info { 6struct edid_info {
9 unsigned char dummy[128]; 7 unsigned char dummy[128];
10}; 8};
11 9
10#ifdef __KERNEL__
12extern struct edid_info edid_info; 11extern struct edid_info edid_info;
13#endif /* CONFIG_X86 */
14
15#endif /* __KERNEL__ */ 12#endif /* __KERNEL__ */
16 13
14#endif
15
17#endif /* __linux_video_edid_h__ */ 16#endif /* __linux_video_edid_h__ */
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 91c15da2680..d802b5afae8 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -525,6 +525,20 @@ static int do_ssb_entry(const char *filename,
525 return 1; 525 return 1;
526} 526}
527 527
528/* Looks like: virtio:dNvN
 *
 * Writes the module alias for one virtio_device_id table entry into @alias.
 * @filename is not used here.  Always returns 1.
 * NOTE(review): ADD() and TO_NATIVE() are defined elsewhere in this file;
 * the per-line notes below describe their presumed effect -- confirm there. */
529static int do_virtio_entry(const char *filename, struct virtio_device_id *id,
530 char *alias)
531{
 /* Convert both ID fields in place (presumably target -> host byte order). */
532 id->device = TO_NATIVE(id->device);
533 id->vendor = TO_NATIVE(id->vendor);
534
 /* Start the alias with the bus prefix; the fields are appended after it. */
535 strcpy(alias, "virtio:");
 /* Condition 1: the "d" (device) field is always requested. */
536 ADD(alias, "d", 1, id->device);
 /* The "v" (vendor) field is conditional on not being VIRTIO_DEV_ANY_ID. */
537 ADD(alias, "v", id->vendor != VIRTIO_DEV_ANY_ID, id->vendor);
538
539 return 1;
540}
541
528/* Ignore any prefix, eg. v850 prepends _ */ 542/* Ignore any prefix, eg. v850 prepends _ */
529static inline int sym_is(const char *symbol, const char *name) 543static inline int sym_is(const char *symbol, const char *name)
530{ 544{
@@ -651,6 +665,10 @@ void handle_moddevtable(struct module *mod, struct elf_info *info,
651 do_table(symval, sym->st_size, 665 do_table(symval, sym->st_size,
652 sizeof(struct ssb_device_id), "ssb", 666 sizeof(struct ssb_device_id), "ssb",
653 do_ssb_entry, mod); 667 do_ssb_entry, mod);
668 else if (sym_is(symname, "__mod_virtio_device_table"))
669 do_table(symval, sym->st_size,
670 sizeof(struct virtio_device_id), "virtio",
671 do_virtio_entry, mod);
654 free(zeros); 672 free(zeros);
655} 673}
656 674