Diffstat (limited to 'drivers/lguest')
-rw-r--r--  drivers/lguest/Kconfig                 |  10
-rw-r--r--  drivers/lguest/Makefile                |  12
-rw-r--r--  drivers/lguest/README                  |  47
-rw-r--r--  drivers/lguest/core.c                  | 357
-rw-r--r--  drivers/lguest/hypercalls.c            | 144
-rw-r--r--  drivers/lguest/interrupts_and_traps.c  | 212
-rw-r--r--  drivers/lguest/io.c                    | 265
-rw-r--r--  drivers/lguest/lg.h                    |  47
-rw-r--r--  drivers/lguest/lguest.c                | 535
-rw-r--r--  drivers/lguest/lguest_asm.S            |  71
-rw-r--r--  drivers/lguest/lguest_bus.c            |  75
-rw-r--r--  drivers/lguest/lguest_user.c           | 166
-rw-r--r--  drivers/lguest/page_tables.c           | 329
-rw-r--r--  drivers/lguest/segments.c              | 126
-rw-r--r--  drivers/lguest/switcher.S              | 284
15 files changed, 2442 insertions, 238 deletions
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 43d901fdc77f..888205c3f76b 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
 config LGUEST
 	tristate "Linux hypervisor example code"
-	depends on X86 && PARAVIRT && NET && EXPERIMENTAL && !X86_PAE
+	depends on X86 && PARAVIRT && EXPERIMENTAL && !X86_PAE
 	select LGUEST_GUEST
 	select HVC_DRIVER
 	---help---
@@ -18,3 +18,11 @@ config LGUEST_GUEST
 	  The guest needs code built-in, even if the host has lguest
 	  support as a module. The drivers are tiny, so we build them
 	  in too.
+
+config LGUEST_NET
+	tristate
+	depends on LGUEST_GUEST && NET
+
+config LGUEST_BLOCK
+	tristate
+	depends on LGUEST_GUEST && BLOCK
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile
index 55382c7d799c..e5047471c334 100644
--- a/drivers/lguest/Makefile
+++ b/drivers/lguest/Makefile
@@ -5,3 +5,15 @@ obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o
 obj-$(CONFIG_LGUEST)	+= lg.o
 lg-y := core.o hypercalls.o page_tables.o interrupts_and_traps.o \
 	segments.o io.o lguest_user.o switcher.o
+
+Preparation Preparation!: PREFIX=P
+Guest: PREFIX=G
+Drivers: PREFIX=D
+Launcher: PREFIX=L
+Host: PREFIX=H
+Switcher: PREFIX=S
+Mastery: PREFIX=M
+Beer:
+	@for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}"
+Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery:
+	@sh ../../Documentation/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'`
diff --git a/drivers/lguest/README b/drivers/lguest/README
new file mode 100644
index 000000000000..b7db39a64c66
--- /dev/null
+++ b/drivers/lguest/README
@@ -0,0 +1,47 @@
+Welcome, friend reader, to lguest.
+
+Lguest is an adventure, with you, the reader, as Hero. I can't think of many
+5000-line projects which offer both such capability and glimpses of future
+potential; it is an exciting time to be delving into the source!
+
+But be warned; this is an arduous journey of several hours or more! And as we
+know, all true Heroes are driven by a Noble Goal. Thus I offer a Beer (or
+equivalent) to anyone I meet who has completed this documentation.
+
+So get comfortable and keep your wits about you (both quick and humorous).
+Along your way to the Noble Goal, you will also gain masterly insight into
+lguest, and hypervisors and x86 virtualization in general.
+
+Our Quest is in seven parts: (best read with C highlighting turned on)
+
+I) Preparation
+	- In which our potential hero is flown quickly over the landscape for a
+	  taste of its scope. Suitable for the armchair coders and other such
+	  persons of faint constitution.
+
+II) Guest
+	- Where we encounter the first tantalising wisps of code, and come to
+	  understand the details of the life of a Guest kernel.
+
+III) Drivers
+	- Whereby the Guest finds its voice and becomes useful, and our
+	  understanding of the Guest is completed.
+
+IV) Launcher
+	- Where we trace back to the creation of the Guest, and thus begin our
+	  understanding of the Host.
+
+V) Host
+	- Where we master the Host code, through a long and tortuous journey.
+	  Indeed, it is here that our hero is tested in the Bit of Despair.
+
+VI) Switcher
+	- Where our understanding of the intertwined nature of Guests and Hosts
+	  is completed.
+
+VII) Mastery
+	- Where our fully fledged hero grapples with the Great Question:
+	  "What next?"
+
+make Preparation!
+Rusty Russell.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index ce909ec57499..0a46e8837d9a 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -1,5 +1,8 @@
-/* World's simplest hypervisor, to test paravirt_ops and show
- * unbelievers that virtualization is the future. Plus, it's fun! */
+/*P:400 This contains run_guest() which actually calls into the Host<->Guest
+ * Switcher and analyzes the return, such as determining if the Guest wants the
+ * Host to do something. This file also contains useful helper routines, and a
+ * couple of non-obvious setup and teardown pieces which were implemented after
+ * days of debugging pain. :*/
 #include <linux/module.h>
 #include <linux/stringify.h>
 #include <linux/stddef.h>
@@ -61,11 +64,33 @@ static struct lguest_pages *lguest_pages(unsigned int cpu)
 		  (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
 }
 
+/*H:010 We need to set up the Switcher at a high virtual address. Remember the
+ * Switcher is a few hundred bytes of assembler code which actually changes the
+ * CPU to run the Guest, and then changes back to the Host when a trap or
+ * interrupt happens.
+ *
+ * The Switcher code must be at the same virtual address in the Guest as the
+ * Host since it will be running as the switchover occurs.
+ *
+ * Trying to map memory at a particular address is an unusual thing to do, so
+ * it's not a simple one-liner. We also set up the per-cpu parts of the
+ * Switcher here.
+ */
 static __init int map_switcher(void)
 {
 	int i, err;
 	struct page **pagep;
 
+	/*
+	 * Map the Switcher in to high memory.
+	 *
+	 * It turns out that if we choose the address 0xFFC00000 (4MB under the
+	 * top virtual address), it makes setting up the page tables really
+	 * easy.
+	 */
+
+	/* We allocate an array of "struct page"s. map_vm_area() wants the
+	 * pages in this form, rather than just an array of pointers. */
 	switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
 				GFP_KERNEL);
 	if (!switcher_page) {
@@ -73,6 +98,8 @@ static __init int map_switcher(void)
 		goto out;
 	}
 
+	/* Now we actually allocate the pages. The Guest will see these pages,
+	 * so we make sure they're zeroed. */
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
 		unsigned long addr = get_zeroed_page(GFP_KERNEL);
 		if (!addr) {
@@ -82,6 +109,9 @@ static __init int map_switcher(void)
 		switcher_page[i] = virt_to_page(addr);
 	}
 
+	/* Now we reserve the "virtual memory area" we want: 0xFFC00000
+	 * (SWITCHER_ADDR). We might not get it in theory, but in practice
+	 * it's worked so far. */
 	switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
 				     VM_ALLOC, SWITCHER_ADDR, VMALLOC_END);
 	if (!switcher_vma) {
@@ -90,49 +120,105 @@ static __init int map_switcher(void)
 		goto free_pages;
 	}
 
+	/* This code actually sets up the pages we've allocated to appear at
+	 * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the
+	 * kind of pages we're mapping (kernel pages), and a pointer to our
+	 * array of struct pages. It increments that pointer, but we don't
+	 * care. */
 	pagep = switcher_page;
 	err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep);
 	if (err) {
 		printk("lguest: map_vm_area failed: %i\n", err);
 		goto free_vma;
 	}
+
+	/* Now the switcher is mapped at the right address, we can't fail!
+	 * Copy in the compiled-in Switcher code (from switcher.S). */
 	memcpy(switcher_vma->addr, start_switcher_text,
 	       end_switcher_text - start_switcher_text);
 
-	/* Fix up IDT entries to point into copied text. */
+	/* Most of the switcher.S doesn't care that it's been moved; on Intel,
+	 * jumps are relative, and it doesn't access any references to external
+	 * code or data.
+	 *
+	 * The only exception is the interrupt handlers in switcher.S: their
+	 * addresses are placed in a table (default_idt_entries), so we need to
+	 * update the table with the new addresses. switcher_offset() is a
+	 * convenience function which returns the distance between the builtin
+	 * switcher code and the high-mapped copy we just made. */
 	for (i = 0; i < IDT_ENTRIES; i++)
 		default_idt_entries[i] += switcher_offset();
 
+	/*
+	 * Set up the Switcher's per-cpu areas.
+	 *
+	 * Each CPU gets two pages of its own within the high-mapped region
+	 * (aka. "struct lguest_pages"). Much of this can be initialized now,
+	 * but some depends on what Guest we are running (which is set up in
+	 * copy_in_guest_info()).
+	 */
 	for_each_possible_cpu(i) {
+		/* lguest_pages() returns this CPU's two pages. */
 		struct lguest_pages *pages = lguest_pages(i);
+		/* This is a convenience pointer to make the code fit one
+		 * statement to a line. */
 		struct lguest_ro_state *state = &pages->state;
 
-		/* These fields are static: rest done in copy_in_guest_info */
+		/* The Global Descriptor Table: the Host has a different one
+		 * for each CPU. We keep a descriptor for the GDT which says
+		 * where it is and how big it is (the size is actually the last
+		 * byte, not the size, hence the "-1"). */
 		state->host_gdt_desc.size = GDT_SIZE-1;
 		state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
+
+		/* All CPUs on the Host use the same Interrupt Descriptor
+		 * Table, so we just use store_idt(), which gets this CPU's IDT
+		 * descriptor. */
 		store_idt(&state->host_idt_desc);
+
+		/* The descriptors for the Guest's GDT and IDT can be filled
+		 * out now, too. We copy the GDT & IDT into ->guest_gdt and
+		 * ->guest_idt before actually running the Guest. */
 		state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
 		state->guest_idt_desc.address = (long)&state->guest_idt;
 		state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
 		state->guest_gdt_desc.address = (long)&state->guest_gdt;
+
+		/* We know where we want the stack to be when the Guest enters
+		 * the switcher: in pages->regs. The stack grows downwards, so
+		 * we start it at the end of that structure. */
 		state->guest_tss.esp0 = (long)(&pages->regs + 1);
+		/* And this is the GDT entry to use for the stack: we keep a
+		 * couple of special LGUEST entries. */
 		state->guest_tss.ss0 = LGUEST_DS;
-		/* No I/O for you! */
+
+		/* x86 can have a fine-grained bitmap which indicates what I/O
+		 * ports the process can use. We set it to the end of our
+		 * structure, meaning "none". */
 		state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
+
+		/* Some GDT entries are the same across all Guests, so we can
+		 * set them up now. */
 		setup_default_gdt_entries(state);
+		/* Most IDT entries are the same for all Guests, too. */
 		setup_default_idt_entries(state, default_idt_entries);
 
-		/* Setup LGUEST segments on all cpus */
+		/* The Host needs to be able to use the LGUEST segments on this
+		 * CPU, too, so put them in the Host GDT. */
 		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
 		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
 	}
 
-	/* Initialize entry point into switcher. */
+	/* In the Switcher, we want the %cs segment register to use the
+	 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
+	 * it will be undisturbed when we switch. To change %cs and jump we
+	 * need this structure to feed to Intel's "lcall" instruction. */
 	lguest_entry.offset = (long)switch_to_guest + switcher_offset();
 	lguest_entry.segment = LGUEST_CS;
 
 	printk(KERN_INFO "lguest: mapped switcher at %p\n",
 	       switcher_vma->addr);
+	/* And we succeeded... */
 	return 0;
 
 free_vma:
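
The lguest_entry lines above feed a tiny far-pointer structure whose
declaration this hunk doesn't show. A minimal sketch of its likely shape,
assuming the layout Intel's "lcall" expects (a 32-bit offset followed by a
16-bit segment selector); the field names here are illustrative:

/* Sketch only: a far pointer for "lcall *lguest_entry". The CPU reads
 * the offset first, then the segment, so the field order matters. */
static struct {
	unsigned long offset;	/* switch_to_guest, adjusted by switcher_offset() */
	unsigned short segment;	/* LGUEST_CS: present in both Host and Guest GDTs */
} lguest_entry;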
@@ -146,35 +232,58 @@ free_some_pages:
 out:
 	return err;
 }
+/*:*/
 
+/* Cleaning up the mapping when the module is unloaded is almost...
+ * too easy. */
 static void unmap_switcher(void)
 {
 	unsigned int i;
 
+	/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
 	vunmap(switcher_vma->addr);
+	/* Now we just need to free the pages we copied the switcher into */
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
 		__free_pages(switcher_page[i], 0);
 }
 
-/* IN/OUT insns: enough to get us past boot-time probing. */
+/*H:130 Our Guest is usually so well behaved; it never tries to do things it
+ * isn't allowed to. Unfortunately, "struct paravirt_ops" isn't quite
+ * complete, because it doesn't contain replacements for the Intel I/O
+ * instructions. As a result, the Guest sometimes fumbles across one during
+ * the boot process as it probes for various things which are usually attached
+ * to a PC.
+ *
+ * When the Guest uses one of these instructions, we get trap #13 (General
+ * Protection Fault) and come here. We see if it's one of those troublesome
+ * instructions and skip over it. We return true if we did. */
 static int emulate_insn(struct lguest *lg)
 {
 	u8 insn;
 	unsigned int insnlen = 0, in = 0, shift = 0;
+	/* The eip contains the *virtual* address of the Guest's instruction:
+	 * guest_pa just subtracts the Guest's page_offset. */
 	unsigned long physaddr = guest_pa(lg, lg->regs->eip);
 
-	/* This only works for addresses in linear mapping... */
+	/* The guest_pa() function only works for Guest kernel addresses, but
+	 * that's all we're trying to do anyway. */
 	if (lg->regs->eip < lg->page_offset)
 		return 0;
+
+	/* Decoding x86 instructions is icky. */
 	lgread(lg, &insn, physaddr, 1);
 
-	/* Operand size prefix means it's actually for ax. */
+	/* 0x66 is an "operand prefix". It means it's using the upper 16 bits
+	   of the eax register. */
 	if (insn == 0x66) {
 		shift = 16;
+		/* The instruction is 1 byte so far, read the next byte. */
 		insnlen = 1;
 		lgread(lg, &insn, physaddr + insnlen, 1);
 	}
 
+	/* We can ignore the lower bit for the moment and decode the 4 opcodes
+	 * we need to emulate. */
 	switch (insn & 0xFE) {
 	case 0xE4: /* in     <next byte>,%al */
 		insnlen += 2;
@@ -191,9 +300,13 @@ static int emulate_insn(struct lguest *lg)
 		insnlen += 1;
 		break;
 	default:
+		/* OK, we don't know what this is, can't emulate. */
 		return 0;
 	}
 
+	/* If it was an "IN" instruction, they expect the result to be read
+	 * into %eax, so we change %eax. We always return all-ones, which
+	 * traditionally means "there's nothing there". */
 	if (in) {
 		/* Lower bit tells is whether it's a 16 or 32 bit access */
 		if (insn & 0x1)
@@ -201,28 +314,46 @@ static int emulate_insn(struct lguest *lg)
 		else
 			lg->regs->eax |= (0xFFFF << shift);
 	}
+	/* Finally, we've "done" the instruction, so move past it. */
 	lg->regs->eip += insnlen;
+	/* Success! */
 	return 1;
 }
-
+/*:*/
+
+/*L:305
+ * Dealing With Guest Memory.
+ *
+ * When the Guest gives us (what it thinks is) a physical address, we can use
+ * the normal copy_from_user() & copy_to_user() on that address: remember,
+ * Guest physical == Launcher virtual.
+ *
+ * But we can't trust the Guest: it might be trying to access the Launcher
+ * code. We have to check that the range is below the pfn_limit the Launcher
+ * gave us. We have to make sure that addr + len doesn't give us a false
+ * positive by overflowing, too. */
 int lguest_address_ok(const struct lguest *lg,
 		      unsigned long addr, unsigned long len)
 {
 	return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr);
 }
 
-/* Just like get_user, but don't let guest access lguest binary. */
+/* This is a convenient routine to get a 32-bit value from the Guest (a very
+ * common operation). Here we can see how useful the kill_lguest() routine we
+ * met in the Launcher can be: we return a random value (0) instead of needing
+ * to return an error. */
 u32 lgread_u32(struct lguest *lg, unsigned long addr)
 {
 	u32 val = 0;
 
-	/* Don't let them access lguest binary */
+	/* Don't let them access lguest binary. */
 	if (!lguest_address_ok(lg, addr, sizeof(val))
 	    || get_user(val, (u32 __user *)addr) != 0)
 		kill_guest(lg, "bad read address %#lx", addr);
 	return val;
 }
 
+/* Same thing for writing a value. */
 void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val)
 {
 	if (!lguest_address_ok(lg, addr, sizeof(val))
@@ -230,6 +361,9 @@ void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val)
 		kill_guest(lg, "bad write address %#lx", addr);
 }
 
+/* This routine is more generic, and copies a range of Guest bytes into a
+ * buffer. If the copy_from_user() fails, we fill the buffer with zeroes, so
+ * the caller doesn't end up using uninitialized kernel memory. */
 void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
 {
 	if (!lguest_address_ok(lg, addr, bytes)
@@ -240,6 +374,7 @@ void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
 	}
 }
 
+/* Similarly, our generic routine to copy into a range of Guest bytes. */
 void lgwrite(struct lguest *lg, unsigned long addr, const void *b,
 	     unsigned bytes)
 {
@@ -247,6 +382,7 @@ void lgwrite(struct lguest *lg, unsigned long addr, const void *b,
 	    || copy_to_user((void __user *)addr, b, bytes) != 0)
 		kill_guest(lg, "bad write address %#lx len %u", addr, bytes);
 }
+/* (end of memory access helper routines) :*/
 
 static void set_ts(void)
 {
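
It's worth pausing on the overflow clause in lguest_address_ok(). A
standalone sketch (userspace C, with uint32_t standing in for the Guest's
32-bit unsigned long and 4096-byte pages and made-up numbers assumed)
showing the false positive it prevents:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the kernel check: a wrapped addr+len lands at a tiny page
 * number, so the first clause alone would wrongly accept the range. */
static int address_ok(uint32_t addr, uint32_t len, uint32_t pfn_limit)
{
	return (addr + len) / 4096 < pfn_limit && (addr + len >= addr);
}

int main(void)
{
	/* 0xFFFFF000 + 0x2000 wraps to 0x1000: rejected by clause two. */
	printf("%d\n", address_ok(0xFFFFF000u, 0x2000u, 0x10000u)); /* 0 */
	printf("%d\n", address_ok(0x00100000u, 0x2000u, 0x10000u)); /* 1 */
	return 0;
}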
@@ -257,54 +393,108 @@ static void set_ts(void)
 	write_cr0(cr0|8);
 }
 
+/*S:010
+ * We are getting close to the Switcher.
+ *
+ * Remember that each CPU has two pages which are visible to the Guest when it
+ * runs on that CPU. This has to contain the state for that Guest: we copy the
+ * state in just before we run the Guest.
+ *
+ * Each Guest has "changed" flags which indicate what has changed in the Guest
+ * since it last ran. We saw this set in interrupts_and_traps.c and
+ * segments.c.
+ */
 static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
 {
+	/* Copying all this data can be quite expensive. We usually run the
+	 * same Guest we ran last time (and that Guest hasn't run anywhere else
+	 * meanwhile). If that's not the case, we pretend everything in the
+	 * Guest has changed. */
 	if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
 		__get_cpu_var(last_guest) = lg;
 		lg->last_pages = pages;
 		lg->changed = CHANGED_ALL;
 	}
 
-	/* These are pretty cheap, so we do them unconditionally. */
+	/* These copies are pretty cheap, so we do them unconditionally: */
+	/* Save the current Host top-level page directory. */
 	pages->state.host_cr3 = __pa(current->mm->pgd);
+	/* Set up the Guest's page tables to see this CPU's pages (and no
+	 * other CPU's pages). */
 	map_switcher_in_guest(lg, pages);
+	/* Set up the two "TSS" members which tell the CPU what stack to use
+	 * for traps which go directly into the Guest (ie. traps at privilege
+	 * level 1). */
 	pages->state.guest_tss.esp1 = lg->esp1;
 	pages->state.guest_tss.ss1 = lg->ss1;
 
-	/* Copy direct trap entries. */
+	/* Copy direct-to-Guest trap entries. */
 	if (lg->changed & CHANGED_IDT)
 		copy_traps(lg, pages->state.guest_idt, default_idt_entries);
 
-	/* Copy all GDT entries but the TSS. */
+	/* Copy all GDT entries which the Guest can change. */
 	if (lg->changed & CHANGED_GDT)
 		copy_gdt(lg, pages->state.guest_gdt);
 	/* If only the TLS entries have changed, copy them. */
 	else if (lg->changed & CHANGED_GDT_TLS)
 		copy_gdt_tls(lg, pages->state.guest_gdt);
 
+	/* Mark the Guest as unchanged for next time. */
 	lg->changed = 0;
 }
 
+/* Finally: the code to actually call into the Switcher to run the Guest. */
 static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
 {
+	/* This is a dummy value we need for GCC's sake. */
 	unsigned int clobber;
 
+	/* Copy the guest-specific information into this CPU's "struct
+	 * lguest_pages". */
 	copy_in_guest_info(lg, pages);
 
-	/* Put eflags on stack, lcall does rest: suitable for iret return. */
+	/* Now: we push the "eflags" register on the stack, then do an "lcall".
+	 * This is how we change from using the kernel code segment to using
+	 * the dedicated lguest code segment, as well as jumping into the
+	 * Switcher.
+	 *
+	 * The lcall also pushes the old code segment (KERNEL_CS) onto the
+	 * stack, then the address of this call. This stack layout happens to
+	 * exactly match the stack of an interrupt... */
 	asm volatile("pushf; lcall *lguest_entry"
+		     /* This is how we tell GCC that %eax ("a") and %ebx ("b")
+		      * are changed by this routine. The "=" means output. */
 		     : "=a"(clobber), "=b"(clobber)
+		     /* %eax contains the pages pointer. ("0" refers to the
+		      * 0-th argument above, ie "a"). %ebx contains the
+		      * physical address of the Guest's top-level page
+		      * directory. */
 		     : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
+		     /* We tell gcc that all these registers could change,
+		      * which means we don't have to save and restore them in
+		      * the Switcher. */
 		     : "memory", "%edx", "%ecx", "%edi", "%esi");
 }
+/*:*/
 
+/*H:030 Let's jump straight to the main loop which runs the Guest.
+ * Remember, this is called by the Launcher reading /dev/lguest, and we keep
+ * going around and around until something interesting happens. */
 int run_guest(struct lguest *lg, unsigned long __user *user)
 {
+	/* We stop running once the Guest is dead. */
 	while (!lg->dead) {
+		/* We need to initialize this, otherwise gcc complains. It's
+		 * not (yet) clever enough to see that it's initialized when we
+		 * need it. */
 		unsigned int cr2 = 0; /* Damn gcc */
 
-		/* Hypercalls first: we might have been out to userspace */
+		/* First we run any hypercalls the Guest wants done: either in
+		 * the hypercall ring in "struct lguest_data", or directly by
+		 * using int 31 (LGUEST_TRAP_ENTRY). */
 		do_hypercalls(lg);
+		/* It's possible the Guest did a SEND_DMA hypercall to the
+		 * Launcher, in which case we return from the read() now. */
 		if (lg->dma_is_pending) {
 			if (put_user(lg->pending_dma, user) ||
 			    put_user(lg->pending_key, user+1))
@@ -312,6 +502,7 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
 			return sizeof(unsigned long)*2;
 		}
 
+		/* Check for signals */
 		if (signal_pending(current))
 			return -ERESTARTSYS;
 
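
The "stack layout happens to exactly match the stack of an interrupt"
remark in run_guest_once() deserves a picture. A sketch of the three words
"pushf; lcall" leaves behind, assuming no privilege change (so no ss/esp
words); this struct is an illustration, not a declaration from the source:

#include <stdint.h>

/* Illustrative frame, lowest address first: "iret" pops eip, then cs,
 * then eflags, which is exactly how the Switcher eventually returns. */
struct fake_interrupt_frame {
	uint32_t eip;		/* pushed last, by lcall: the return address */
	uint32_t cs;		/* pushed by lcall: the old KERNEL_CS */
	uint32_t eflags;	/* pushed first, by pushf */
};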
@@ -319,77 +510,154 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
 		if (lg->break_out)
 			return -EAGAIN;
 
+		/* Check if there are any interrupts which can be delivered
+		 * now: if so, this sets up the handler to be executed when we
+		 * next run the Guest. */
 		maybe_do_interrupt(lg);
 
+		/* All long-lived kernel loops need to check with this horrible
+		 * thing called the freezer. If the Host is trying to suspend,
+		 * it stops us. */
 		try_to_freeze();
 
+		/* Just make absolutely sure the Guest is still alive. One of
+		 * those hypercalls could have been fatal, for example. */
 		if (lg->dead)
 			break;
 
+		/* If the Guest asked to be stopped, we sleep. The Guest's
+		 * clock timer or LHCALL_BREAK from the Waker will wake us. */
 		if (lg->halted) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule();
 			continue;
 		}
 
+		/* OK, now we're ready to jump into the Guest. First we put up
+		 * the "Do Not Disturb" sign: */
 		local_irq_disable();
 
-		/* Even if *we* don't want FPU trap, guest might... */
+		/* Remember the awfully-named TS bit? If the Guest has asked
+		 * to set it we set it now, so we can trap and pass that trap
+		 * to the Guest if it uses the FPU. */
 		if (lg->ts)
 			set_ts();
 
-		/* Don't let Guest do SYSENTER: we can't handle it. */
+		/* SYSENTER is an optimized way of doing system calls. We
+		 * can't allow it because it always jumps to privilege level 0.
+		 * A normal Guest won't try it because we don't advertise it in
+		 * CPUID, but a malicious Guest (or malicious Guest userspace
+		 * program) could, so we tell the CPU to disable it before
+		 * running the Guest. */
 		if (boot_cpu_has(X86_FEATURE_SEP))
 			wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
 
+		/* Now we actually run the Guest. It will pop back out when
+		 * something interesting happens, and we can examine its
+		 * registers to see what it was doing. */
 		run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
 
-		/* Save cr2 now if we page-faulted. */
+		/* The "regs" pointer contains two extra entries which are not
+		 * really registers: a trap number which says what interrupt or
+		 * trap made the switcher code come back, and an error code
+		 * which some traps set. */
+
+		/* If the Guest page faulted, then the cr2 register will tell
+		 * us the bad virtual address. We have to grab this now,
+		 * because once we re-enable interrupts an interrupt could
+		 * fault and thus overwrite cr2, or we could even move off to a
+		 * different CPU. */
 		if (lg->regs->trapnum == 14)
 			cr2 = read_cr2();
+		/* Similarly, if we took a trap because the Guest used the FPU,
+		 * we have to restore the FPU it expects to see. */
 		else if (lg->regs->trapnum == 7)
 			math_state_restore();
 
+		/* Restore SYSENTER if it's supposed to be on. */
 		if (boot_cpu_has(X86_FEATURE_SEP))
 			wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+
+		/* Now we're ready to be interrupted or moved to other CPUs */
 		local_irq_enable();
 
+		/* OK, so what happened? */
 		switch (lg->regs->trapnum) {
 		case 13: /* We've intercepted a GPF. */
+			/* Check if this was one of those annoying IN or OUT
+			 * instructions which we need to emulate. If so, we
+			 * just go back into the Guest after we've done it. */
 			if (lg->regs->errcode == 0) {
 				if (emulate_insn(lg))
 					continue;
 			}
 			break;
 		case 14: /* We've intercepted a page fault. */
+			/* The Guest accessed a virtual address that wasn't
+			 * mapped. This happens a lot: we don't actually set
+			 * up most of the page tables for the Guest at all when
+			 * we start: as it runs it asks for more and more, and
+			 * we set them up as required. In this case, we don't
+			 * even tell the Guest that the fault happened.
+			 *
+			 * The errcode tells whether this was a read or a
+			 * write, and whether kernel or userspace code. */
 			if (demand_page(lg, cr2, lg->regs->errcode))
 				continue;
 
-			/* If lguest_data is NULL, this won't hurt. */
+			/* OK, it's really not there (or not OK): the Guest
+			 * needs to know. We write out the cr2 value so it
+			 * knows where the fault occurred.
+			 *
+			 * Note that if the Guest were really messed up, this
+			 * could happen before it's done the INITIALIZE
+			 * hypercall, so lg->lguest_data will be NULL, so
+			 * &lg->lguest_data->cr2 will be address 8. Writing
+			 * into that address won't hurt the Host at all,
+			 * though. */
 			if (put_user(cr2, &lg->lguest_data->cr2))
 				kill_guest(lg, "Writing cr2");
 			break;
 		case 7: /* We've intercepted a Device Not Available fault. */
-			/* If they don't want to know, just absorb it. */
+			/* If the Guest doesn't want to know, we already
+			 * restored the Floating Point Unit, so we just
+			 * continue without telling it. */
 			if (!lg->ts)
 				continue;
 			break;
-		case 32 ... 255: /* Real interrupt, fall thru */
+		case 32 ... 255:
+			/* These values mean a real interrupt occurred, in
+			 * which case the Host handler has already been run.
+			 * We just do a friendly check if another process
+			 * should now be run, then fall through to loop
+			 * around: */
 			cond_resched();
 		case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
 			continue;
 		}
 
+		/* If we get here, it's a trap the Guest wants to know
+		 * about. */
 		if (deliver_trap(lg, lg->regs->trapnum))
 			continue;
 
+		/* If the Guest doesn't have a handler (either it hasn't
+		 * registered any yet, or it's one of the faults we don't let
+		 * it handle), it dies with a cryptic error message. */
 		kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
 			   lg->regs->trapnum, lg->regs->eip,
 			   lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode);
 	}
+	/* The Guest is dead => "No such file or directory" */
 	return -ENOENT;
 }
 
+/* Now we can look at each of the routines this calls, in increasing order of
+ * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
+ * deliver_trap() and demand_page(). After all those, we'll be ready to
+ * examine the Switcher, and our philosophical understanding of the Host/Guest
+ * duality will be complete. :*/
+
 int find_free_guest(void)
 {
 	unsigned int i;
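
Since run_guest() implements read() on /dev/lguest, the Launcher's main
loop is its mirror image. A hypothetical userspace sketch (the fd handling
and names are invented here, not taken from the real Launcher): a two-word
read is a SEND_DMA notification, EAGAIN is the Waker's LHCALL_BREAK, and
ENOENT means the Guest died.

#include <errno.h>
#include <unistd.h>

static void drive_guest(int lguest_fd)
{
	unsigned long notify[2]; /* { pending_dma, pending_key } */

	for (;;) {
		ssize_t r = read(lguest_fd, notify, sizeof(notify));
		if (r == sizeof(notify))
			continue; /* service the DMA the Guest sent, then loop */
		if (r < 0 && errno == EAGAIN)
			continue; /* the Waker broke us out; go around again */
		break; /* ENOENT or a real error: the Guest is gone */
	}
}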
@@ -407,55 +675,96 @@ static void adjust_pge(void *on)
 	write_cr4(read_cr4() & ~X86_CR4_PGE);
 }
 
+/*H:000
+ * Welcome to the Host!
+ *
+ * By this point your brain has been tickled by the Guest code and numbed by
+ * the Launcher code; prepare for it to be stretched by the Host code. This is
+ * the heart. Let's begin at the initialization routine for the Host's lg
+ * module.
+ */
 static int __init init(void)
 {
 	int err;
 
+	/* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
 	if (paravirt_enabled()) {
 		printk("lguest is afraid of %s\n", paravirt_ops.name);
 		return -EPERM;
 	}
 
+	/* First we put the Switcher up in very high virtual memory. */
 	err = map_switcher();
 	if (err)
 		return err;
 
+	/* Now we set up the pagetable implementation for the Guests. */
 	err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
 	if (err) {
 		unmap_switcher();
 		return err;
 	}
 
+	/* The I/O subsystem needs some things initialized. */
 	lguest_io_init();
 
+	/* /dev/lguest needs to be registered. */
 	err = lguest_device_init();
 	if (err) {
 		free_pagetables();
 		unmap_switcher();
 		return err;
 	}
+
+	/* Finally, we need to turn off "Page Global Enable". PGE is an
+	 * optimization where page table entries are specially marked to show
+	 * they never change. The Host kernel marks all the kernel pages this
+	 * way because it's always present, even when userspace is running.
+	 *
+	 * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
+	 * switch to the Guest kernel. If you don't disable this on all CPUs,
+	 * you'll get really weird bugs that you'll chase for two days.
+	 *
+	 * I used to turn PGE off every time we switched to the Guest and back
+	 * on when we return, but that slowed the Switcher down noticeably. */
+
+	/* We don't need the complexity of CPUs coming and going while we're
+	 * doing this. */
 	lock_cpu_hotplug();
 	if (cpu_has_pge) { /* We have a broader idea of "global". */
+		/* Remember that this was originally set (for cleanup). */
 		cpu_had_pge = 1;
+		/* adjust_pge is a helper function which sets or unsets the PGE
+		 * bit on its CPU, depending on the argument (0 == unset). */
 		on_each_cpu(adjust_pge, (void *)0, 0, 1);
+		/* Turn off the feature in the global feature set. */
 		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
 	}
 	unlock_cpu_hotplug();
+
+	/* All good! */
 	return 0;
 }
 
+/* Cleaning up is just the same code, backwards. With a little French. */
 static void __exit fini(void)
 {
 	lguest_device_remove();
 	free_pagetables();
 	unmap_switcher();
 
+	/* If we had PGE before we started, turn it back on now. */
 	lock_cpu_hotplug();
 	if (cpu_had_pge) {
 		set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+		/* adjust_pge's argument "1" means set PGE. */
 		on_each_cpu(adjust_pge, (void *)1, 0, 1);
 	}
 	unlock_cpu_hotplug();
 }
 
+/* The Host side of lguest can be a module. This is a nice way for people to
+ * play with it. */
 module_init(init);
 module_exit(fini);
 MODULE_LICENSE("GPL");
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index ea52ca451f74..db6caace3b9c 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -1,5 +1,10 @@
-/* Actual hypercalls, which allow guests to actually do something.
-    Copyright (C) 2006 Rusty Russell IBM Corporation
+/*P:500 Just as userspace programs request kernel operations through a system
+ * call, the Guest requests Host operations through a "hypercall". You might
+ * notice this nomenclature doesn't really follow any logic, but the name has
+ * been around for long enough that we're stuck with it. As you'd expect, this
+ * code is basically one big switch statement. :*/
+
+/* Copyright (C) 2006 Rusty Russell IBM Corporation
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -23,37 +28,55 @@
 #include <irq_vectors.h>
 #include "lg.h"
 
+/*H:120 This is the core hypercall routine: where the Guest gets what it
+ * wants. Or gets killed. Or, in the case of LHCALL_CRASH, both.
+ *
+ * Remember from the Guest: %eax == which call to make, and the arguments are
+ * packed into %edx, %ebx and %ecx if needed. */
 static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
 {
 	switch (regs->eax) {
 	case LHCALL_FLUSH_ASYNC:
+		/* This call does nothing, except by breaking out of the Guest
+		 * it makes us process all the asynchronous hypercalls. */
 		break;
 	case LHCALL_LGUEST_INIT:
+		/* You can't get here unless you're already initialized. Don't
+		 * do that. */
 		kill_guest(lg, "already have lguest_data");
 		break;
 	case LHCALL_CRASH: {
+		/* Crash is such a trivial hypercall that we do it in four
+		 * lines right here. */
 		char msg[128];
+		/* If the lgread fails, it will call kill_guest() itself; the
+		 * kill_guest() with the message will be ignored. */
 		lgread(lg, msg, regs->edx, sizeof(msg));
 		msg[sizeof(msg)-1] = '\0';
 		kill_guest(lg, "CRASH: %s", msg);
 		break;
 	}
 	case LHCALL_FLUSH_TLB:
+		/* FLUSH_TLB comes in two flavors, depending on the
+		 * argument: */
 		if (regs->edx)
 			guest_pagetable_clear_all(lg);
 		else
 			guest_pagetable_flush_user(lg);
 		break;
-	case LHCALL_GET_WALLCLOCK: {
-		struct timespec ts;
-		ktime_get_real_ts(&ts);
-		regs->eax = ts.tv_sec;
-		break;
-	}
 	case LHCALL_BIND_DMA:
+		/* BIND_DMA really wants four arguments, but it's the only call
+		 * which does. So the Guest packs the number of buffers and
+		 * the interrupt number into the final argument, and we decode
+		 * it here. This can legitimately fail, since we currently
+		 * place a limit on the number of DMA pools a Guest can have.
+		 * So we return true or false from this call. */
 		regs->eax = bind_dma(lg, regs->edx, regs->ebx,
 				     regs->ecx >> 8, regs->ecx & 0xFF);
 		break;
+
+	/* All these calls simply pass the arguments through to the right
+	 * routines. */
 	case LHCALL_SEND_DMA:
 		send_dma(lg, regs->edx, regs->ebx);
 		break;
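
The BIND_DMA packing is easier to see in isolation. A plain C sketch with
illustrative names, mirroring the ">> 8" and "& 0xFF" unpacking above:

#include <stdio.h>

/* Sketch of the Guest-side packing: buffer count in the high bits,
 * interrupt number in the low byte of a single register argument. */
static unsigned long pack_dma_arg(unsigned int num_dmas, unsigned int irq)
{
	return (num_dmas << 8) | (irq & 0xFF);
}

int main(void)
{
	unsigned long ecx = pack_dma_arg(16, 9);
	printf("num=%lu irq=%lu\n", ecx >> 8, ecx & 0xFF); /* num=16 irq=9 */
	return 0;
}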
@@ -81,10 +104,13 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
 	case LHCALL_SET_CLOCKEVENT:
 		guest_set_clockevent(lg, regs->edx);
 		break;
+
 	case LHCALL_TS:
+		/* This sets the TS flag, as we saw used in run_guest(). */
 		lg->ts = regs->edx;
 		break;
 	case LHCALL_HALT:
+		/* Similarly, this sets the halted flag for run_guest(). */
 		lg->halted = 1;
 		break;
 	default:
@@ -92,25 +118,42 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
 	}
 }
 
-/* We always do queued calls before actual hypercall. */
+/* Asynchronous hypercalls are easy: we just look in the array in the Guest's
+ * "struct lguest_data" and see if there are any new ones marked "ready".
+ *
+ * We are careful to do these in order: obviously we respect the order the
+ * Guest put them in the ring, but we also promise the Guest that they will
+ * happen before any normal hypercall (which is why we check this before
+ * checking for a normal hcall). */
 static void do_async_hcalls(struct lguest *lg)
 {
 	unsigned int i;
 	u8 st[LHCALL_RING_SIZE];
 
+	/* For simplicity, we copy the entire call status array in at once. */
 	if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
 		return;
 
+
+	/* We process "struct lguest_data"s hcalls[] ring once. */
 	for (i = 0; i < ARRAY_SIZE(st); i++) {
 		struct lguest_regs regs;
+		/* We remember where we were up to from last time. This makes
+		 * sure that the hypercalls are done in the order the Guest
+		 * places them in the ring. */
 		unsigned int n = lg->next_hcall;
 
+		/* 0xFF means there's no call here (yet). */
 		if (st[n] == 0xFF)
 			break;
 
+		/* OK, we have a hypercall. Increment the "next_hcall" cursor,
+		 * and wrap back to 0 if we reach the end. */
 		if (++lg->next_hcall == LHCALL_RING_SIZE)
 			lg->next_hcall = 0;
 
+		/* We copy the hypercall arguments into a fake register
+		 * structure. This makes life simple for do_hcall(). */
 		if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax)
 		    || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx)
 		    || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx)
@@ -119,74 +162,139 @@ static void do_async_hcalls(struct lguest *lg)
 			break;
 		}
 
+		/* Do the hypercall, same as a normal one. */
 		do_hcall(lg, &regs);
+
+		/* Mark the hypercall done. */
 		if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
 			kill_guest(lg, "Writing result for async hypercall");
 			break;
 		}
 
+		/* Stop doing hypercalls if we've just done a DMA to the
+		 * Launcher: it needs to service this first. */
 		if (lg->dma_is_pending)
 			break;
 	}
 }
 
+/* Last of all, we look at what happens first of all. The very first time the
+ * Guest makes a hypercall, we end up here to set things up: */
 static void initialize(struct lguest *lg)
 {
 	u32 tsc_speed;
 
+	/* You can't do anything until you're initialized. The Guest knows the
+	 * rules, so we're unforgiving here. */
 	if (lg->regs->eax != LHCALL_LGUEST_INIT) {
 		kill_guest(lg, "hypercall %li before LGUEST_INIT",
 			   lg->regs->eax);
 		return;
 	}
 
-	/* We only tell the guest to use the TSC if it's reliable. */
+	/* We insist that the Time Stamp Counter exist and doesn't change with
+	 * cpu frequency. Some devious chip manufacturers decided that TSC
+	 * changes could be handled in software. I decided that time going
+	 * backwards might be good for benchmarks, but it's bad for users.
+	 *
+	 * We also insist that the TSC be stable: the kernel detects unreliable
+	 * TSCs for its own purposes, and we use that here. */
 	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
 		tsc_speed = tsc_khz;
 	else
 		tsc_speed = 0;
 
+	/* The pointer to the Guest's "struct lguest_data" is the only
+	 * argument. */
 	lg->lguest_data = (struct lguest_data __user *)lg->regs->edx;
-	/* We check here so we can simply copy_to_user/from_user */
+	/* If we check the address they gave is OK now, we can simply
+	 * copy_to_user/from_user from now on rather than using lgread/lgwrite.
+	 * I put this in to show that I'm not immune to writing stupid
+	 * optimizations. */
 	if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) {
 		kill_guest(lg, "bad guest page %p", lg->lguest_data);
 		return;
 	}
+	/* The Guest tells us where we're not to deliver interrupts by putting
+	 * the range of addresses into "struct lguest_data". */
 	if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
 	    || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)
-	    /* We reserve the top pgd entry. */
+	    /* We tell the Guest that it can't use the top 4MB of virtual
+	     * addresses used by the Switcher. */
 	    || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
 	    || put_user(tsc_speed, &lg->lguest_data->tsc_khz)
+	    /* We also give the Guest a unique id, as used in lguest_net.c. */
 	    || put_user(lg->guestid, &lg->lguest_data->guestid))
 		kill_guest(lg, "bad guest page %p", lg->lguest_data);
 
-	/* This is the one case where the above accesses might have
-	 * been the first write to a Guest page. This may have caused
-	 * a copy-on-write fault, but the Guest might be referring to
-	 * the old (read-only) page. */
+	/* We write the current time into the Guest's data page once now. */
+	write_timestamp(lg);
+
+	/* This is the one case where the above accesses might have been the
+	 * first write to a Guest page. This may have caused a copy-on-write
+	 * fault, but the Guest might be referring to the old (read-only)
+	 * page. */
 	guest_pagetable_clear_all(lg);
 }
+/* Now we've examined the hypercall code; our Guest can make requests. There
+ * is one other way we can do things for the Guest, as we see in
+ * emulate_insn(). */
 
-/* Even if we go out to userspace and come back, we don't want to do
- * the hypercall again. */
+/*H:110 Tricky point: we mark the hypercall as "done" once we've done it.
+ * Normally we don't need to do this: the Guest will run again and update the
+ * trap number before we come back around the run_guest() loop to
+ * do_hypercalls().
+ *
+ * However, if we are signalled or the Guest sends DMA to the Launcher, that
+ * loop will exit without running the Guest. When it comes back it would try
+ * to re-run the hypercall. */
 static void clear_hcall(struct lguest *lg)
 {
 	lg->regs->trapnum = 255;
 }
 
+/*H:100
+ * Hypercalls
+ *
+ * Remember from the Guest, hypercalls come in two flavors: normal and
+ * asynchronous. This file handles both types.
+ */
 void do_hypercalls(struct lguest *lg)
 {
+	/* Not initialized yet? */
 	if (unlikely(!lg->lguest_data)) {
+		/* Did the Guest make a hypercall? We might have come back for
+		 * some other reason (an interrupt, a different trap). */
 		if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
+			/* Set up the "struct lguest_data" */
 			initialize(lg);
+			/* The hypercall is done. */
 			clear_hcall(lg);
 		}
 		return;
 	}
 
+	/* The Guest has initialized.
+	 *
+	 * Look in the hypercall ring for the async hypercalls: */
 	do_async_hcalls(lg);
+
+	/* If we stopped reading the hypercall ring because the Guest did a
+	 * SEND_DMA to the Launcher, we want to return now. Otherwise if the
+	 * Guest asked us to do a hypercall, we do it. */
 	if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
 		do_hcall(lg, lg->regs);
+		/* The hypercall is done. */
 		clear_hcall(lg);
 	}
 }
+
+/* This routine supplies the Guest with time: it's used for wallclock time at
+ * initial boot and as a rough time source if the TSC isn't available. */
+void write_timestamp(struct lguest *lg)
+{
+	struct timespec now;
+	ktime_get_real_ts(&now);
+	if (put_user(now, &lg->lguest_data->time))
+		kill_guest(lg, "Writing timestamp");
+}
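
The ring discipline used by do_async_hcalls() (0xFF means "slot free", the
cursor wraps at the end) is small enough to restate standalone. A sketch
with made-up names and ring size:

#include <stdint.h>

#define RING_SIZE 64

static unsigned int next_call; /* persistent cursor, like lg->next_hcall */

/* Returns the next queued slot, or -1 if the ring is empty. The caller
 * processes slot n, then writes 0xFF back to hand the slot to the Guest. */
static int pop_ready_slot(const uint8_t status[RING_SIZE])
{
	unsigned int n = next_call;

	if (status[n] == 0xFF)
		return -1; /* nothing queued (yet) */
	if (++next_call == RING_SIZE)
		next_call = 0; /* wrap, preserving queue order */
	return n;
}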
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index bee029bb2c7b..49787e964a0d 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -1,100 +1,160 @@
1/*P:800 Interrupts (traps) are complicated enough to earn their own file.
2 * There are three classes of interrupts:
3 *
4 * 1) Real hardware interrupts which occur while we're running the Guest,
5 * 2) Interrupts for virtual devices attached to the Guest, and
6 * 3) Traps and faults from the Guest.
7 *
8 * Real hardware interrupts must be delivered to the Host, not the Guest.
9 * Virtual interrupts must be delivered to the Guest, but we make them look
10 * just like real hardware would deliver them. Traps from the Guest can be set
11 * up to go directly back into the Guest, but sometimes the Host wants to see
12 * them first, so we also have a way of "reflecting" them into the Guest as if
13 * they had been delivered to it directly. :*/
1#include <linux/uaccess.h> 14#include <linux/uaccess.h>
2#include "lg.h" 15#include "lg.h"
3 16
17/* The address of the interrupt handler is split into two bits: */
4static unsigned long idt_address(u32 lo, u32 hi) 18static unsigned long idt_address(u32 lo, u32 hi)
5{ 19{
6 return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); 20 return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
7} 21}
8 22
23/* The "type" of the interrupt handler is a 4 bit field: we only support a
24 * couple of types. */
9static int idt_type(u32 lo, u32 hi) 25static int idt_type(u32 lo, u32 hi)
10{ 26{
11 return (hi >> 8) & 0xF; 27 return (hi >> 8) & 0xF;
12} 28}
13 29
30/* An IDT entry can't be used unless the "present" bit is set. */
14static int idt_present(u32 lo, u32 hi) 31static int idt_present(u32 lo, u32 hi)
15{ 32{
16 return (hi & 0x8000); 33 return (hi & 0x8000);
17} 34}
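
An editorial aside, not part of the patch: here is a worked example of the encoding those three helpers decode. The handler address and selector are made up for illustration:

        /* A hypothetical present trap gate for a handler at 0xc01234ab,
         * using code segment selector 0x0010: */
        u32 lo = (0x0010 << 16) | (0xc01234ab & 0x0000FFFF); /* 0x001034ab */
        u32 hi = (0xc01234ab & 0xFFFF0000) | 0x8F00;         /* 0xc0128f00 */

        /* idt_address(lo, hi) == 0xc01234ab  (the two halves recombined)
         * idt_type(lo, hi)    == 0xF         (a trap gate)
         * idt_present(lo, hi) != 0           (bit 15 of "hi" is set) */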
18 35
36/* We need a helper to "push" a value onto the Guest's stack, since that's a
37 * big part of what delivering an interrupt does. */
19static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) 38static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
20{ 39{
40 /* Stack grows downwards: move the stack pointer down, then write the value. */
21 *gstack -= 4; 41 *gstack -= 4;
22 lgwrite_u32(lg, *gstack, val); 42 lgwrite_u32(lg, *gstack, val);
23} 43}
24 44
45/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or
46 * trap. The mechanics of delivering traps and interrupts to the Guest are the
47 * same, except some traps have an "error code" which gets pushed onto the
48 * stack as well: the caller tells us if this is one.
49 *
50 * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this
51 * interrupt or trap. It's split into two parts for traditional reasons: gcc
52 * on i386 used to be frightened by 64 bit numbers.
53 *
54 * We set up the stack just like the CPU does for a real interrupt, so it's
55 * identical for the Guest (and the standard "iret" instruction will undo
56 * it). */
25static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) 57static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
26{ 58{
27 unsigned long gstack; 59 unsigned long gstack;
28 u32 eflags, ss, irq_enable; 60 u32 eflags, ss, irq_enable;
29 61
30 /* If they want a ring change, we use new stack and push old ss/esp */ 62 /* There are two cases for interrupts: one where the Guest is already
63 * in the kernel, and a more complex one where the Guest is in
64 * userspace. We check the privilege level to find out. */
31 if ((lg->regs->ss&0x3) != GUEST_PL) { 65 if ((lg->regs->ss&0x3) != GUEST_PL) {
66 /* The Guest told us their kernel stack with the SET_STACK
67 * hypercall: both the virtual address and the segment */
32 gstack = guest_pa(lg, lg->esp1); 68 gstack = guest_pa(lg, lg->esp1);
33 ss = lg->ss1; 69 ss = lg->ss1;
70 /* We push the old stack segment and pointer onto the new
71 * stack: when the Guest does an "iret" back from the interrupt
72 * handler the CPU will notice they're dropping privilege
73 * levels and expect these here. */
34 push_guest_stack(lg, &gstack, lg->regs->ss); 74 push_guest_stack(lg, &gstack, lg->regs->ss);
35 push_guest_stack(lg, &gstack, lg->regs->esp); 75 push_guest_stack(lg, &gstack, lg->regs->esp);
36 } else { 76 } else {
77 /* We're staying on the same Guest (kernel) stack. */
37 gstack = guest_pa(lg, lg->regs->esp); 78 gstack = guest_pa(lg, lg->regs->esp);
38 ss = lg->regs->ss; 79 ss = lg->regs->ss;
39 } 80 }
40 81
41 /* We use IF bit in eflags to indicate whether irqs were enabled 82 /* Remember that we never let the Guest actually disable interrupts, so
42 (it's always 1, since irqs are enabled when guest is running). */ 83 * the "Interrupt Flag" bit is always set. We copy that bit from the
84 * Guest's "irq_enabled" field into the eflags word: the Guest copies
85 * it back in "lguest_iret". */
43 eflags = lg->regs->eflags; 86 eflags = lg->regs->eflags;
44 if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 87 if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0
45 && !(irq_enable & X86_EFLAGS_IF)) 88 && !(irq_enable & X86_EFLAGS_IF))
46 eflags &= ~X86_EFLAGS_IF; 89 eflags &= ~X86_EFLAGS_IF;
47 90
91 /* An interrupt is expected to push three things on the stack: the old
92 * "eflags" word, the old code segment, and the old instruction
93 * pointer. */
48 push_guest_stack(lg, &gstack, eflags); 94 push_guest_stack(lg, &gstack, eflags);
49 push_guest_stack(lg, &gstack, lg->regs->cs); 95 push_guest_stack(lg, &gstack, lg->regs->cs);
50 push_guest_stack(lg, &gstack, lg->regs->eip); 96 push_guest_stack(lg, &gstack, lg->regs->eip);
51 97
98 /* For the six traps which supply an error code, we push that, too. */
52 if (has_err) 99 if (has_err)
53 push_guest_stack(lg, &gstack, lg->regs->errcode); 100 push_guest_stack(lg, &gstack, lg->regs->errcode);
54 101
55 /* Change the real stack so switcher returns to trap handler */ 102 /* Now we've pushed all the old state, we change the stack, the code
103 * segment and the address to execute. */
56 lg->regs->ss = ss; 104 lg->regs->ss = ss;
57 lg->regs->esp = gstack + lg->page_offset; 105 lg->regs->esp = gstack + lg->page_offset;
58 lg->regs->cs = (__KERNEL_CS|GUEST_PL); 106 lg->regs->cs = (__KERNEL_CS|GUEST_PL);
59 lg->regs->eip = idt_address(lo, hi); 107 lg->regs->eip = idt_address(lo, hi);
60 108
61 /* Disable interrupts for an interrupt gate. */ 109 /* There are two kinds of interrupt handlers: 0xE is an "interrupt
110 * gate" which expects interrupts to be disabled on entry. */
62 if (idt_type(lo, hi) == 0xE) 111 if (idt_type(lo, hi) == 0xE)
63 if (put_user(0, &lg->lguest_data->irq_enabled)) 112 if (put_user(0, &lg->lguest_data->irq_enabled))
64 kill_guest(lg, "Disabling interrupts"); 113 kill_guest(lg, "Disabling interrupts");
65} 114}
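
To make that layout concrete, here is a sketch (editorial, not in the patch) of the frame set_guest_interrupt() leaves on the Guest stack, written as a struct with the lowest address first, which is where the handler's esp ends up pointing:

        struct delivered_frame {
                u32 errcode;          /* only for the six error-code traps */
                u32 eip, cs, eflags;  /* always pushed, like real hardware */
                u32 esp, ss;          /* only when we switched away from the
                                       * Guest's userspace stack */
        };

The conditional members may be absent, so this is purely illustrative; the point is that a standard "iret" unwinds it exactly as it would a hardware-delivered interrupt.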
66 115
116/*H:200
117 * Virtual Interrupts.
118 *
119 * maybe_do_interrupt() gets called before every entry to the Guest, to see if
120 * we should divert the Guest to running an interrupt handler. */
67void maybe_do_interrupt(struct lguest *lg) 121void maybe_do_interrupt(struct lguest *lg)
68{ 122{
69 unsigned int irq; 123 unsigned int irq;
70 DECLARE_BITMAP(blk, LGUEST_IRQS); 124 DECLARE_BITMAP(blk, LGUEST_IRQS);
71 struct desc_struct *idt; 125 struct desc_struct *idt;
72 126
127 /* If the Guest hasn't even initialized yet, we can do nothing. */
73 if (!lg->lguest_data) 128 if (!lg->lguest_data)
74 return; 129 return;
75 130
76 /* Mask out any interrupts they have blocked. */ 131 /* Take our "irqs_pending" array and remove any interrupts the Guest
132 * wants blocked: the result ends up in "blk". */
77 if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts, 133 if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts,
78 sizeof(blk))) 134 sizeof(blk)))
79 return; 135 return;
80 136
81 bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); 137 bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS);
82 138
139 /* Find the first interrupt. */
83 irq = find_first_bit(blk, LGUEST_IRQS); 140 irq = find_first_bit(blk, LGUEST_IRQS);
141 /* None? Nothing to do */
84 if (irq >= LGUEST_IRQS) 142 if (irq >= LGUEST_IRQS)
85 return; 143 return;
86 144
145 /* They may be in the middle of an iret, where they asked us never to
146 * deliver interrupts. */
87 if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) 147 if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end)
88 return; 148 return;
89 149
90 /* If they're halted, we re-enable interrupts. */ 150 /* If they're halted, interrupts restart them. */
91 if (lg->halted) { 151 if (lg->halted) {
92 /* Re-enable interrupts. */ 152 /* Re-enable interrupts. */
93 if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled)) 153 if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled))
94 kill_guest(lg, "Re-enabling interrupts"); 154 kill_guest(lg, "Re-enabling interrupts");
95 lg->halted = 0; 155 lg->halted = 0;
96 } else { 156 } else {
97 /* Maybe they have interrupts disabled? */ 157 /* Otherwise we check if they have interrupts disabled. */
98 u32 irq_enabled; 158 u32 irq_enabled;
99 if (get_user(irq_enabled, &lg->lguest_data->irq_enabled)) 159 if (get_user(irq_enabled, &lg->lguest_data->irq_enabled))
100 irq_enabled = 0; 160 irq_enabled = 0;
@@ -102,112 +162,218 @@ void maybe_do_interrupt(struct lguest *lg)
102 return; 162 return;
103 } 163 }
104 164
165 /* Look at the IDT entry the Guest gave us for this interrupt. The
166 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
167 * over them. */
105 idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq]; 168 idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq];
169 /* If they don't have a handler (yet?), we just ignore it */
106 if (idt_present(idt->a, idt->b)) { 170 if (idt_present(idt->a, idt->b)) {
171 /* OK, mark it no longer pending and deliver it. */
107 clear_bit(irq, lg->irqs_pending); 172 clear_bit(irq, lg->irqs_pending);
173 /* set_guest_interrupt() takes the interrupt descriptor and a
174 * flag to say whether this interrupt pushes an error code onto
175 * the stack as well: virtual interrupts never do. */
108 set_guest_interrupt(lg, idt->a, idt->b, 0); 176 set_guest_interrupt(lg, idt->a, idt->b, 0);
109 } 177 }
178
179 /* Every time we deliver an interrupt, we update the timestamp in the
180 * Guest's lguest_data struct. It would be better for the Guest if we
181 * did this more often, but it can actually be quite slow: doing it
182 * here is a compromise which means at least it gets updated every
183 * timer interrupt. */
184 write_timestamp(lg);
110} 185}
111 186
187/*H:220 Now we've got the routines to deliver interrupts, delivering traps
188 * like page fault is easy. The only trick is that Intel decided that some
189 * traps should have error codes: */
112static int has_err(unsigned int trap) 190static int has_err(unsigned int trap)
113{ 191{
114 return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); 192 return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
115} 193}
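
For reference (an editorial aside), those magic numbers are the Intel-defined exceptions which push an error code; an equivalent table-driven sketch:

        /* 8 == #DF, 10 == #TS, 11 == #NP, 12 == #SS, 13 == #GP,
         * 14 == #PF, 17 == #AC */
        static const unsigned int err_traps[] = { 8, 10, 11, 12, 13, 14, 17 };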
116 194
195/* deliver_trap() returns true if it could deliver the trap. */
117int deliver_trap(struct lguest *lg, unsigned int num) 196int deliver_trap(struct lguest *lg, unsigned int num)
118{ 197{
119 u32 lo = lg->idt[num].a, hi = lg->idt[num].b; 198 u32 lo = lg->idt[num].a, hi = lg->idt[num].b;
120 199
200 /* Early on the Guest hasn't set the IDT entries (or maybe it put a
201 * bogus one in): if we fail here, the Guest will be killed. */
121 if (!idt_present(lo, hi)) 202 if (!idt_present(lo, hi))
122 return 0; 203 return 0;
123 set_guest_interrupt(lg, lo, hi, has_err(num)); 204 set_guest_interrupt(lg, lo, hi, has_err(num));
124 return 1; 205 return 1;
125} 206}
126 207
208/*H:250 Here's the hard part: returning to the Host every time a trap happens
209 * and then calling deliver_trap() and re-entering the Guest is slow.
210 * Particularly because Guest userspace system calls are traps (trap 128).
211 *
212 * So we'd like to set up the IDT to tell the CPU to deliver traps directly
213 * into the Guest. This is possible, but the complexities cause the size of
214 * this file to double! However, 150 lines of code is worth writing for taking
215 * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all
216 * the other hypervisors would tease it.
217 *
218 * This routine determines if a trap can be delivered directly. */
127static int direct_trap(const struct lguest *lg, 219static int direct_trap(const struct lguest *lg,
128 const struct desc_struct *trap, 220 const struct desc_struct *trap,
129 unsigned int num) 221 unsigned int num)
130{ 222{
131 /* Hardware interrupts don't go to guest (except syscall). */ 223 /* Hardware interrupts don't go to the Guest at all (except system
224 * call). */
132 if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR) 225 if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR)
133 return 0; 226 return 0;
134 227
135 /* We intercept page fault (demand shadow paging & cr2 saving) 228 /* The Host needs to see page faults (for shadow paging and to save the
136 protection fault (in/out emulation) and device not 229 * fault address), general protection faults (in/out emulation) and
137 available (TS handling), and hypercall */ 230 * device not available (TS handling), and of course, the hypercall
231 * trap. */
138 if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY) 232 if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY)
139 return 0; 233 return 0;
140 234
141 /* Interrupt gates (0xE) or not present (0x0) can't go direct. */ 235 /* Only trap gates (type 15) can go direct to the Guest. Interrupt
236 * gates (type 14) disable interrupts as they are entered, which we
237 * never let the Guest do. Not present entries (type 0x0) also can't
238 * go direct, of course 8) */
142 return idt_type(trap->a, trap->b) == 0xF; 239 return idt_type(trap->a, trap->b) == 0xF;
143} 240}
144 241/*:*/
242
243/*M:005 The Guest has the ability to turn its interrupt gates into trap gates,
244 * if it is careful. The Host will let trap gates go directly to the
245 * Guest, but the Guest needs the interrupts atomically disabled for an
246 * interrupt gate. It can do this by pointing the trap gate at instructions
247 * within noirq_start and noirq_end, where it can safely disable interrupts. */
248
249/*M:006 The Guests do not use the sysenter (fast system call) instruction,
250 * because it's hardcoded to enter privilege level 0 and so can't go direct.
251 * It's about twice as fast as the older "int 0x80" system call, so it might
252 * still be worthwhile to handle it in the Switcher and lcall down to the
253 * Guest. The sysenter semantics are hairy tho: search for that keyword in
254 * entry.S :*/
255
256/*H:260 When we make traps go directly into the Guest, we need to make sure
257 * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the
258 * CPU trying to deliver the trap will fault while trying to push the interrupt
259 * words on the stack: this is called a double fault, and it forces us to kill
260 * the Guest.
261 *
262 * Which is deeply unfair, because (literally!) it wasn't the Guest's fault. */
145void pin_stack_pages(struct lguest *lg) 263void pin_stack_pages(struct lguest *lg)
146{ 264{
147 unsigned int i; 265 unsigned int i;
148 266
267 /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or
268 * two pages of stack space. */
149 for (i = 0; i < lg->stack_pages; i++) 269 for (i = 0; i < lg->stack_pages; i++)
270 /* The stack grows *downwards*, hence the subtraction */
150 pin_page(lg, lg->esp1 - i * PAGE_SIZE); 271 pin_page(lg, lg->esp1 - i * PAGE_SIZE);
151} 272}
152 273
274/* Direct traps also mean that we need to know whenever the Guest wants to use
275 * a different kernel stack, so we can change the IDT entries to use that
276 * stack. The IDT entries expect a virtual address, so unlike most addresses
277 * the Guest gives us, the "esp" (stack pointer) value here is virtual, not
278 * physical.
279 *
280 * In Linux each process has its own kernel stack, so this happens a lot: we
281 * change stacks on each context switch. */
153void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) 282void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
154{ 283{
155 /* You cannot have a stack segment with priv level 0. */ 284 /* You are not allowed to have a stack segment with privilege level 0: bad
285 * Guest! */
156 if ((seg & 0x3) != GUEST_PL) 286 if ((seg & 0x3) != GUEST_PL)
157 kill_guest(lg, "bad stack segment %i", seg); 287 kill_guest(lg, "bad stack segment %i", seg);
288 /* We only expect one or two stack pages. */
158 if (pages > 2) 289 if (pages > 2)
159 kill_guest(lg, "bad stack pages %u", pages); 290 kill_guest(lg, "bad stack pages %u", pages);
291 /* Save where the stack is, and how many pages */
160 lg->ss1 = seg; 292 lg->ss1 = seg;
161 lg->esp1 = esp; 293 lg->esp1 = esp;
162 lg->stack_pages = pages; 294 lg->stack_pages = pages;
295 /* Make sure the new stack pages are mapped */
163 pin_stack_pages(lg); 296 pin_stack_pages(lg);
164} 297}
165 298
166/* Set up trap in IDT. */ 299/* All this reference to mapping stacks leads us neatly into the other complex
300 * part of the Host: page table handling. */
301
302/*H:235 This is the routine which actually checks the Guest's IDT entry and
303 * transfers it into our entry in "struct lguest": */
167static void set_trap(struct lguest *lg, struct desc_struct *trap, 304static void set_trap(struct lguest *lg, struct desc_struct *trap,
168 unsigned int num, u32 lo, u32 hi) 305 unsigned int num, u32 lo, u32 hi)
169{ 306{
170 u8 type = idt_type(lo, hi); 307 u8 type = idt_type(lo, hi);
171 308
309 /* We zero-out a not-present entry */
172 if (!idt_present(lo, hi)) { 310 if (!idt_present(lo, hi)) {
173 trap->a = trap->b = 0; 311 trap->a = trap->b = 0;
174 return; 312 return;
175 } 313 }
176 314
315 /* We only support interrupt and trap gates. */
177 if (type != 0xE && type != 0xF) 316 if (type != 0xE && type != 0xF)
178 kill_guest(lg, "bad IDT type %i", type); 317 kill_guest(lg, "bad IDT type %i", type);
179 318
319 /* We only copy the handler address, present bit, privilege level and
320 * type. The privilege level controls from where the trap can be triggered
321 * manually with an "int" instruction. This is usually GUEST_PL,
322 * except for system calls which userspace can use. */
180 trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); 323 trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
181 trap->b = (hi&0xFFFFEF00); 324 trap->b = (hi&0xFFFFEF00);
182} 325}
183 326
327/*H:230 While we're here, dealing with delivering traps and interrupts to the
328 * Guest, we might as well complete the picture: how the Guest tells us where
329 * it wants them to go. This would be simple, except making traps fast
330 * requires some tricks.
331 *
332 * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
333 * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */
184void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) 334void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
185{ 335{
186 /* Guest never handles: NMI, doublefault, hypercall, spurious irq. */ 336 /* Guest never handles: NMI, doublefault, spurious interrupt or
337 * hypercall. We ignore when it tries to set them. */
187 if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) 338 if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
188 return; 339 return;
189 340
341 /* Mark the IDT as changed: next time the Guest runs we'll know we have
342 * to copy this again. */
190 lg->changed |= CHANGED_IDT; 343 lg->changed |= CHANGED_IDT;
344
345 /* The IDT which we keep in "struct lguest" only contains 32 entries
346 * for the traps and LGUEST_IRQS (32) entries for interrupts. We
347 * ignore attempts to set handlers for higher interrupt numbers, except
348 * for the system call "interrupt" at 128: we have a special IDT entry
349 * for that. */
191 if (num < ARRAY_SIZE(lg->idt)) 350 if (num < ARRAY_SIZE(lg->idt))
192 set_trap(lg, &lg->idt[num], num, lo, hi); 351 set_trap(lg, &lg->idt[num], num, lo, hi);
193 else if (num == SYSCALL_VECTOR) 352 else if (num == SYSCALL_VECTOR)
194 set_trap(lg, &lg->syscall_idt, num, lo, hi); 353 set_trap(lg, &lg->syscall_idt, num, lo, hi);
195} 354}
196 355
356/* The default entry for each interrupt points into the Switcher routines which
357 * simply return to the Host. The run_guest() loop will then call
358 * deliver_trap() to bounce it back into the Guest. */
197static void default_idt_entry(struct desc_struct *idt, 359static void default_idt_entry(struct desc_struct *idt,
198 int trap, 360 int trap,
199 const unsigned long handler) 361 const unsigned long handler)
200{ 362{
363 /* A present interrupt gate. */
201 u32 flags = 0x8e00; 364 u32 flags = 0x8e00;
202 365
203 /* They can't "int" into any of them except hypercall. */ 366 /* Set the privilege level on the entry for the hypercall: this allows
367 * the Guest to use the "int" instruction to trigger it. */
204 if (trap == LGUEST_TRAP_ENTRY) 368 if (trap == LGUEST_TRAP_ENTRY)
205 flags |= (GUEST_PL << 13); 369 flags |= (GUEST_PL << 13);
206 370
371 /* Now pack it into the IDT entry in its weird format. */
207 idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); 372 idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
208 idt->b = (handler&0xFFFF0000) | flags; 373 idt->b = (handler&0xFFFF0000) | flags;
209} 374}
210 375
376/* When the Guest first starts, we put default entries into the IDT. */
211void setup_default_idt_entries(struct lguest_ro_state *state, 377void setup_default_idt_entries(struct lguest_ro_state *state,
212 const unsigned long *def) 378 const unsigned long *def)
213{ 379{
@@ -217,19 +383,25 @@ void setup_default_idt_entries(struct lguest_ro_state *state,
217 default_idt_entry(&state->guest_idt[i], i, def[i]); 383 default_idt_entry(&state->guest_idt[i], i, def[i]);
218} 384}
219 385
386/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead
387 * we copy them into the IDT which we've set up for Guests on this CPU, just
388 * before we run the Guest. This routine does that copy. */
220void copy_traps(const struct lguest *lg, struct desc_struct *idt, 389void copy_traps(const struct lguest *lg, struct desc_struct *idt,
221 const unsigned long *def) 390 const unsigned long *def)
222{ 391{
223 unsigned int i; 392 unsigned int i;
224 393
225 /* All hardware interrupts are same whatever the guest: only the 394 /* We can simply copy the direct traps, otherwise we use the default
226 * traps might be different. */ 395 * ones in the Switcher: they will return to the Host. */
227 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) { 396 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
228 if (direct_trap(lg, &lg->idt[i], i)) 397 if (direct_trap(lg, &lg->idt[i], i))
229 idt[i] = lg->idt[i]; 398 idt[i] = lg->idt[i];
230 else 399 else
231 default_idt_entry(&idt[i], i, def[i]); 400 default_idt_entry(&idt[i], i, def[i]);
232 } 401 }
402
403 /* Don't forget the system call trap! The IDT entries for other
404 * interrupts never change, so no need to copy them. */
233 i = SYSCALL_VECTOR; 405 i = SYSCALL_VECTOR;
234 if (direct_trap(lg, &lg->syscall_idt, i)) 406 if (direct_trap(lg, &lg->syscall_idt, i))
235 idt[i] = lg->syscall_idt; 407 idt[i] = lg->syscall_idt;
diff --git a/drivers/lguest/io.c b/drivers/lguest/io.c
index c8eb79266991..ea68613b43f6 100644
--- a/drivers/lguest/io.c
+++ b/drivers/lguest/io.c
@@ -1,5 +1,9 @@
1/* Simple I/O model for guests, based on shared memory. 1/*P:300 The I/O mechanism in lguest is simple yet flexible, allowing the Guest
2 * Copyright (C) 2006 Rusty Russell IBM Corporation 2 * to talk to the Launcher or directly to another Guest. It uses familiar
3 * concepts of DMA and interrupts, plus some neat code stolen from
4 * futexes... :*/
5
6/* Copyright (C) 2006 Rusty Russell IBM Corporation
3 * 7 *
4 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 9 * it under the terms of the GNU General Public License as published by
@@ -23,8 +27,36 @@
23#include <linux/uaccess.h> 27#include <linux/uaccess.h>
24#include "lg.h" 28#include "lg.h"
25 29
30/*L:300
31 * I/O
32 *
33 * Getting data in and out of the Guest is quite an art. There are numerous
34 * ways to do it, and they all suck differently. We try to keep things fairly
35 * close to "real" hardware so our Guest's drivers don't look like an alien
36 * visitation in the middle of the Linux code, and yet make sure that Guests
37 * can talk directly to other Guests, not just the Launcher.
38 *
39 * To do this, the Guest gives us a key when it binds or sends DMA buffers.
40 * The key corresponds to a "physical" address inside the Guest (ie. a virtual
41 * address inside the Launcher process). We don't, however, use this key
42 * directly.
43 *
44 * We want Guests which share memory to be able to DMA to each other: two
45 * Launchers can mmap the same file, then the Guests can communicate.
46 * Fortunately, the futex code provides us with a way to get a "union
47 * futex_key" corresponding to the memory lying at a virtual address: if the
48 * two processes share memory, the "union futex_key" for that memory will match
49 * even if the memory is mapped at different addresses in each. So we always
50 * convert the keys to "union futex_key"s to compare them.
51 *
52 * Before we dive into this though, we need to look at another set of helper
53 * routines used throughout the Host kernel code to access Guest memory.
54 :*/
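
To see why this matters, here is a Launcher-side sketch (illustrative names, not code from this patch) of how two Launchers would arrange shared Guest memory so their futex keys match:

        #include <fcntl.h>
        #include <sys/mman.h>

        /* Both Launcher processes do the equivalent of: */
        int fd = open("guest_shared.img", O_RDWR);
        void *mem = mmap(NULL, 1 << 20, PROT_READ|PROT_WRITE,
                         MAP_SHARED, fd, 0);

        /* The mapping lands at different virtual addresses in each process,
         * but get_futex_key() on any address inside it yields the same
         * (inode, offset) key, so the two Guests find each other's DMA
         * buffers in dma_hash[]. */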
26static struct list_head dma_hash[61]; 55static struct list_head dma_hash[61];
27 56
57/* An unfortunate side effect of the Linux doubly-linked list implementation is
58 * that there's no good way to statically initialize an array of linked
59 * lists. */
28void lguest_io_init(void) 60void lguest_io_init(void)
29{ 61{
30 unsigned int i; 62 unsigned int i;
@@ -56,6 +88,19 @@ kill:
56 return 0; 88 return 0;
57} 89}
58 90
91/*L:330 This is our hash function, using the wonderful Jenkins hash.
92 *
93 * The futex key is a union with three parts: an unsigned long word, a pointer,
94 * and an int "offset". We could use jhash_2words() which takes three u32s.
95 * (Ok, the hash functions are great: the naming sucks though).
96 *
97 * It's nice to be portable to 64-bit platforms, so we use the more generic
98 * jhash2(), which takes an array of u32, the number of u32s, and an initial
99 * u32 to roll in. This is uglier, but breaks down to almost the same code on
100 * 32-bit platforms like this one.
101 *
102 * We want a position in the array, so we modulo ARRAY_SIZE(dma_hash) (ie. 61).
103 */
59static unsigned int hash(const union futex_key *key) 104static unsigned int hash(const union futex_key *key)
60{ 105{
61 return jhash2((u32*)&key->both.word, 106 return jhash2((u32*)&key->both.word,
@@ -64,6 +109,9 @@ static unsigned int hash(const union futex_key *key)
64 % ARRAY_SIZE(dma_hash); 109 % ARRAY_SIZE(dma_hash);
65} 110}
66 111
112/* This is a convenience routine to compare two keys. It's a much bemoaned C
113 * weakness that it doesn't allow '==' on structures or unions, so we have to
114 * open-code it like this. */
67static inline int key_eq(const union futex_key *a, const union futex_key *b) 115static inline int key_eq(const union futex_key *a, const union futex_key *b)
68{ 116{
69 return (a->both.word == b->both.word 117 return (a->both.word == b->both.word
@@ -71,22 +119,36 @@ static inline int key_eq(const union futex_key *a, const union futex_key *b)
71 && a->both.offset == b->both.offset); 119 && a->both.offset == b->both.offset);
72} 120}
73 121
74/* Must hold read lock on dmainfo owner's current->mm->mmap_sem */ 122/*L:360 OK, when we need to actually free up a Guest's DMA array we do several
123 * things, so we have a convenient function to do it.
124 *
125 * The caller must hold a read lock on dmainfo owner's current->mm->mmap_sem
126 * for the drop_futex_key_refs(). */
75static void unlink_dma(struct lguest_dma_info *dmainfo) 127static void unlink_dma(struct lguest_dma_info *dmainfo)
76{ 128{
129 /* You locked this too, right? */
77 BUG_ON(!mutex_is_locked(&lguest_lock)); 130 BUG_ON(!mutex_is_locked(&lguest_lock));
131 /* This is how we know that the entry is free. */
78 dmainfo->interrupt = 0; 132 dmainfo->interrupt = 0;
133 /* Remove it from the hash table. */
79 list_del(&dmainfo->list); 134 list_del(&dmainfo->list);
135 /* Drop the references we were holding (to the inode or mm). */
80 drop_futex_key_refs(&dmainfo->key); 136 drop_futex_key_refs(&dmainfo->key);
81} 137}
82 138
139/*L:350 This is the routine which we call when the Guest asks to unregister a
140 * DMA array attached to a given key. Returns true if the array was found. */
83static int unbind_dma(struct lguest *lg, 141static int unbind_dma(struct lguest *lg,
84 const union futex_key *key, 142 const union futex_key *key,
85 unsigned long dmas) 143 unsigned long dmas)
86{ 144{
87 int i, ret = 0; 145 int i, ret = 0;
88 146
147 /* We don't bother with the hash table, just look through all this
148 * Guest's DMA arrays. */
89 for (i = 0; i < LGUEST_MAX_DMA; i++) { 149 for (i = 0; i < LGUEST_MAX_DMA; i++) {
150 /* In theory it could have more than one array on the same key,
151 * or one array on multiple keys, so we check both */
90 if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) { 152 if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) {
91 unlink_dma(&lg->dma[i]); 153 unlink_dma(&lg->dma[i]);
92 ret = 1; 154 ret = 1;
@@ -96,51 +158,91 @@ static int unbind_dma(struct lguest *lg,
96 return ret; 158 return ret;
97} 159}
98 160
161/*L:340 BIND_DMA: this is the hypercall which sets up an array of "struct
162 * lguest_dma" for receiving I/O.
163 *
164 * The Guest wants to bind an array of "struct lguest_dma"s to a particular key
165 * to receive input. This only happens when the Guest is setting up a new
166 * device, so it doesn't have to be very fast.
167 *
168 * It returns 1 on a successful registration (it can fail if we hit the limit
169 * of registrations for this Guest).
170 */
99int bind_dma(struct lguest *lg, 171int bind_dma(struct lguest *lg,
100 unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt) 172 unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt)
101{ 173{
102 unsigned int i; 174 unsigned int i;
103 int ret = 0; 175 int ret = 0;
104 union futex_key key; 176 union futex_key key;
177 /* Futex code needs the mmap_sem. */
105 struct rw_semaphore *fshared = &current->mm->mmap_sem; 178 struct rw_semaphore *fshared = &current->mm->mmap_sem;
106 179
180 /* Invalid interrupt? (We could kill the Guest here). */
107 if (interrupt >= LGUEST_IRQS) 181 if (interrupt >= LGUEST_IRQS)
108 return 0; 182 return 0;
109 183
184 /* We need to grab the Big Lguest Lock, because other Guests may be
185 * trying to look through this Guest's DMAs to send something while
186 * we're doing this. */
110 mutex_lock(&lguest_lock); 187 mutex_lock(&lguest_lock);
111 down_read(fshared); 188 down_read(fshared);
112 if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { 189 if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
113 kill_guest(lg, "bad dma key %#lx", ukey); 190 kill_guest(lg, "bad dma key %#lx", ukey);
114 goto unlock; 191 goto unlock;
115 } 192 }
193
194 /* We want to keep this key valid once we drop mmap_sem, so we have to
195 * hold a reference. */
116 get_futex_key_refs(&key); 196 get_futex_key_refs(&key);
117 197
198 /* If the Guest specified an interrupt of 0, that means they want to
199 * unregister this array of "struct lguest_dma"s. */
118 if (interrupt == 0) 200 if (interrupt == 0)
119 ret = unbind_dma(lg, &key, dmas); 201 ret = unbind_dma(lg, &key, dmas);
120 else { 202 else {
203 /* Look through this Guest's dma array for an unused entry. */
121 for (i = 0; i < LGUEST_MAX_DMA; i++) { 204 for (i = 0; i < LGUEST_MAX_DMA; i++) {
205 /* If the interrupt is non-zero, the entry is already
206 * used. */
122 if (lg->dma[i].interrupt) 207 if (lg->dma[i].interrupt)
123 continue; 208 continue;
124 209
210 /* OK, a free one! Fill in our details. */
125 lg->dma[i].dmas = dmas; 211 lg->dma[i].dmas = dmas;
126 lg->dma[i].num_dmas = numdmas; 212 lg->dma[i].num_dmas = numdmas;
127 lg->dma[i].next_dma = 0; 213 lg->dma[i].next_dma = 0;
128 lg->dma[i].key = key; 214 lg->dma[i].key = key;
129 lg->dma[i].guestid = lg->guestid; 215 lg->dma[i].guestid = lg->guestid;
130 lg->dma[i].interrupt = interrupt; 216 lg->dma[i].interrupt = interrupt;
217
218 /* Now we add it to the hash table: the position
219 * depends on the futex key that we got. */
131 list_add(&lg->dma[i].list, &dma_hash[hash(&key)]); 220 list_add(&lg->dma[i].list, &dma_hash[hash(&key)]);
221 /* Success! */
132 ret = 1; 222 ret = 1;
133 goto unlock; 223 goto unlock;
134 } 224 }
135 } 225 }
226 /* If we didn't find a slot to put the key in, drop the reference
227 * again. */
136 drop_futex_key_refs(&key); 228 drop_futex_key_refs(&key);
137unlock: 229unlock:
230 /* Unlock and out. */
138 up_read(fshared); 231 up_read(fshared);
139 mutex_unlock(&lguest_lock); 232 mutex_unlock(&lguest_lock);
140 return ret; 233 return ret;
141} 234}
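
Seen from the Guest's side, a registration might look like this sketch. It follows the Guest's lguest_bind_dma() helper; the buffer names and the (num << 8) | irq packing are assumptions based on the Guest code in this patch series:

        struct lguest_dma dmas[2] = {};

        dmas[0].addr[0] = __pa(rx_buf0);  dmas[0].len[0] = PAGE_SIZE;
        dmas[1].addr[0] = __pa(rx_buf1);  dmas[1].len[0] = PAGE_SIZE;

        /* Bind both buffers to "key"; raise "irq" when one is filled. */
        hcall(LHCALL_BIND_DMA, key, __pa(dmas), (2 << 8) | irq);

        /* Passing interrupt 0 instead unregisters the array, as above. */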
142 235
143/* lgread from another guest */ 236/*L:385 Note that our routines to access a different Guest's memory are called
237 * lgread_other() and lgwrite_other(): these names emphasize that they are only
238 * used when the Guest is *not* the current Guest.
239 *
240 * The interface for copying from another process's memory is called
241 * access_process_vm(), with a final argument of 0 for a read, and 1 for a
242 * write.
243 *
244 * We need lgread_other() to read the destination Guest's "struct lguest_dma"
245 * array. */
144static int lgread_other(struct lguest *lg, 246static int lgread_other(struct lguest *lg,
145 void *buf, u32 addr, unsigned bytes) 247 void *buf, u32 addr, unsigned bytes)
146{ 248{
@@ -153,7 +255,8 @@ static int lgread_other(struct lguest *lg,
153 return 1; 255 return 1;
154} 256}
155 257
156/* lgwrite to another guest */ 258/* "lgwrite()" to another Guest: used to update the destination "used_len" once
259 * we've transferred data into the buffer. */
157static int lgwrite_other(struct lguest *lg, u32 addr, 260static int lgwrite_other(struct lguest *lg, u32 addr,
158 const void *buf, unsigned bytes) 261 const void *buf, unsigned bytes)
159{ 262{
@@ -166,6 +269,15 @@ static int lgwrite_other(struct lguest *lg, u32 addr,
166 return 1; 269 return 1;
167} 270}
168 271
272/*L:400 This is the generic engine which copies a source "struct
273 * lguest_dma" from this Guest into another Guest's "struct lguest_dma". The
274 * destination Guest's pages have already been mapped and are supplied in the
275 * pages array.
276 *
277 * If you're wondering if there's a nice "copy from one process to another"
278 * routine, so was I. But Linux isn't really set up to copy between two
279 * unrelated processes, so we have to write it ourselves.
280 */
169static u32 copy_data(struct lguest *srclg, 281static u32 copy_data(struct lguest *srclg,
170 const struct lguest_dma *src, 282 const struct lguest_dma *src,
171 const struct lguest_dma *dst, 283 const struct lguest_dma *dst,
@@ -174,33 +286,59 @@ static u32 copy_data(struct lguest *srclg,
174 unsigned int totlen, si, di, srcoff, dstoff; 286 unsigned int totlen, si, di, srcoff, dstoff;
175 void *maddr = NULL; 287 void *maddr = NULL;
176 288
289 /* We return the total length transferred. */
177 totlen = 0; 290 totlen = 0;
291
292 /* We keep indexes into the source and destination "struct lguest_dma",
293 * and an offset within each region. */
178 si = di = 0; 294 si = di = 0;
179 srcoff = dstoff = 0; 295 srcoff = dstoff = 0;
296
297 /* We loop until the source or destination is exhausted. */
180 while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si] 298 while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si]
181 && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) { 299 && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) {
300 /* We can only transfer the rest of the src buffer, or as much
301 * as will fit into the destination buffer. */
182 u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff); 302 u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff);
183 303
304 /* For systems using "highmem" we need to use kmap() to access
305 * the page we want. We often use the same page over and over,
306 * so rather than kmap() it on every loop, we set the maddr
307 * pointer to NULL when we need to move to the next
308 * destination page. */
184 if (!maddr) 309 if (!maddr)
185 maddr = kmap(pages[di]); 310 maddr = kmap(pages[di]);
186 311
187 /* FIXME: This is not completely portable, since 312 /* Copy directly from (this Guest's) source address to the
188 archs do different things for copy_to_user_page. */ 313 * destination Guest's kmap()ed buffer. Note that maddr points
314 * to the start of the page: we need to add the offset of the
315 * destination address and offset within the buffer. */
316
317 /* FIXME: This is not completely portable. I looked at
318 * copy_to_user_page(), and some archs seem to need special
319 * flushes. x86 is fine. */
189 if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE, 320 if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE,
190 (void __user *)src->addr[si], len) != 0) { 321 (void __user *)src->addr[si], len) != 0) {
322 /* If a copy failed, it's the source's fault. */
191 kill_guest(srclg, "bad address in sending DMA"); 323 kill_guest(srclg, "bad address in sending DMA");
192 totlen = 0; 324 totlen = 0;
193 break; 325 break;
194 } 326 }
195 327
328 /* Increment the total and src & dst offsets */
196 totlen += len; 329 totlen += len;
197 srcoff += len; 330 srcoff += len;
198 dstoff += len; 331 dstoff += len;
332
333 /* Presumably we reached the end of the src or dest buffers: */
199 if (srcoff == src->len[si]) { 334 if (srcoff == src->len[si]) {
335 /* Move to the next buffer at offset 0 */
200 si++; 336 si++;
201 srcoff = 0; 337 srcoff = 0;
202 } 338 }
203 if (dstoff == dst->len[di]) { 339 if (dstoff == dst->len[di]) {
340 /* We need to unmap that destination page and reset
341 * maddr ready for the next one. */
204 kunmap(pages[di]); 342 kunmap(pages[di]);
205 maddr = NULL; 343 maddr = NULL;
206 di++; 344 di++;
@@ -208,13 +346,15 @@ static u32 copy_data(struct lguest *srclg,
208 } 346 }
209 } 347 }
210 348
349 /* If we still had a page mapped at the end, unmap now. */
211 if (maddr) 350 if (maddr)
212 kunmap(pages[di]); 351 kunmap(pages[di]);
213 352
214 return totlen; 353 return totlen;
215} 354}
216 355
217/* Src is us, ie. current. */ 356/*L:390 This is how we transfer a "struct lguest_dma" from the source Guest
357 * (the current Guest which called SEND_DMA) to another Guest. */
218static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, 358static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src,
219 struct lguest *dstlg, const struct lguest_dma *dst) 359 struct lguest *dstlg, const struct lguest_dma *dst)
220{ 360{
@@ -222,23 +362,31 @@ static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src,
222 u32 ret; 362 u32 ret;
223 struct page *pages[LGUEST_MAX_DMA_SECTIONS]; 363 struct page *pages[LGUEST_MAX_DMA_SECTIONS];
224 364
365 /* We check that both source and destination "struct lguest_dma"s are
366 * within the bounds of the source and destination Guests */
225 if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src)) 367 if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src))
226 return 0; 368 return 0;
227 369
228 /* First get the destination pages */ 370 /* We need to map the pages which correspond to each part of the
371 * destination buffer. */
229 for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { 372 for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
230 if (dst->len[i] == 0) 373 if (dst->len[i] == 0)
231 break; 374 break;
375 /* get_user_pages() is a complicated function, especially since
376 * we only want a single page. But it works, and returns the
377 * number of pages. Note that we're holding the destination's
378 * mmap_sem, as get_user_pages() requires. */
232 if (get_user_pages(dstlg->tsk, dstlg->mm, 379 if (get_user_pages(dstlg->tsk, dstlg->mm,
233 dst->addr[i], 1, 1, 1, pages+i, NULL) 380 dst->addr[i], 1, 1, 1, pages+i, NULL)
234 != 1) { 381 != 1) {
382 /* This means the destination gave us a bogus buffer */
235 kill_guest(dstlg, "Error mapping DMA pages"); 383 kill_guest(dstlg, "Error mapping DMA pages");
236 ret = 0; 384 ret = 0;
237 goto drop_pages; 385 goto drop_pages;
238 } 386 }
239 } 387 }
240 388
241 /* Now copy until we run out of src or dst. */ 389 /* Now copy the data until we run out of src or dst. */
242 ret = copy_data(srclg, src, dst, pages); 390 ret = copy_data(srclg, src, dst, pages);
243 391
244drop_pages: 392drop_pages:
@@ -247,6 +395,11 @@ drop_pages:
247 return ret; 395 return ret;
248} 396}
249 397
398/*L:380 Transferring data from one Guest to another is not as simple as I'd
399 * like. Once we've found the "struct lguest_dma_info" bound to the same
400 * address as the send, we need to copy into it.
401 *
402 * This function returns true if the destination array was empty. */
250static int dma_transfer(struct lguest *srclg, 403static int dma_transfer(struct lguest *srclg,
251 unsigned long udma, 404 unsigned long udma,
252 struct lguest_dma_info *dst) 405 struct lguest_dma_info *dst)
@@ -255,15 +408,23 @@ static int dma_transfer(struct lguest *srclg,
255 struct lguest *dstlg; 408 struct lguest *dstlg;
256 u32 i, dma = 0; 409 u32 i, dma = 0;
257 410
411 /* From the "struct lguest_dma_info" we found in the hash, grab the
412 * Guest. */
258 dstlg = &lguests[dst->guestid]; 413 dstlg = &lguests[dst->guestid];
259 /* Get our dma list. */ 414 /* Read in the source "struct lguest_dma" handed to SEND_DMA. */
260 lgread(srclg, &src_dma, udma, sizeof(src_dma)); 415 lgread(srclg, &src_dma, udma, sizeof(src_dma));
261 416
262 /* We can't deadlock against them dmaing to us, because this 417 /* We need the destination's mmap_sem, and we already hold the source's
263 * is all under the lguest_lock. */ 418 * mmap_sem for the futex key lookup. Normally this would suggest that
419 * we could deadlock if the destination Guest was trying to send to
420 * this source Guest at the same time, which is another reason that all
421 * I/O is done under the big lguest_lock. */
264 down_read(&dstlg->mm->mmap_sem); 422 down_read(&dstlg->mm->mmap_sem);
265 423
424 /* Look through the destination DMA array for an available buffer. */
266 for (i = 0; i < dst->num_dmas; i++) { 425 for (i = 0; i < dst->num_dmas; i++) {
426 /* We keep a "next_dma" pointer which often helps us avoid
427 * looking at lots of previously-filled entries. */
267 dma = (dst->next_dma + i) % dst->num_dmas; 428 dma = (dst->next_dma + i) % dst->num_dmas;
268 if (!lgread_other(dstlg, &dst_dma, 429 if (!lgread_other(dstlg, &dst_dma,
269 dst->dmas + dma * sizeof(struct lguest_dma), 430 dst->dmas + dma * sizeof(struct lguest_dma),
@@ -273,30 +434,46 @@ static int dma_transfer(struct lguest *srclg,
273 if (!dst_dma.used_len) 434 if (!dst_dma.used_len)
274 break; 435 break;
275 } 436 }
437
438 /* If we found a buffer, we do the actual data copy. */
276 if (i != dst->num_dmas) { 439 if (i != dst->num_dmas) {
277 unsigned long used_lenp; 440 unsigned long used_lenp;
278 unsigned int ret; 441 unsigned int ret;
279 442
280 ret = do_dma(srclg, &src_dma, dstlg, &dst_dma); 443 ret = do_dma(srclg, &src_dma, dstlg, &dst_dma);
281 /* Put used length in src. */ 444 /* Put used length in the source "struct lguest_dma"'s used_len
445 * field. It's a little tricky to figure out where that is,
446 * though. */
282 lgwrite_u32(srclg, 447 lgwrite_u32(srclg,
283 udma+offsetof(struct lguest_dma, used_len), ret); 448 udma+offsetof(struct lguest_dma, used_len), ret);
449 /* Transferring 0 bytes is OK if the source buffer was empty. */
284 if (ret == 0 && src_dma.len[0] != 0) 450 if (ret == 0 && src_dma.len[0] != 0)
285 goto fail; 451 goto fail;
286 452
287 /* Make sure destination sees contents before length. */ 453 /* The destination Guest might be running on a different CPU:
454 * we have to make sure that it will see the "used_len" field
455 * change to non-zero *after* it sees the data we copied into
456 * the buffer. Hence a write memory barrier. */
288 wmb(); 457 wmb();
458 /* Figuring out where the destination's used_len field for this
459 * "struct lguest_dma" in the array is also a little ugly. */
289 used_lenp = dst->dmas 460 used_lenp = dst->dmas
290 + dma * sizeof(struct lguest_dma) 461 + dma * sizeof(struct lguest_dma)
291 + offsetof(struct lguest_dma, used_len); 462 + offsetof(struct lguest_dma, used_len);
292 lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret)); 463 lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret));
464 /* Move the cursor for next time. */
293 dst->next_dma++; 465 dst->next_dma++;
294 } 466 }
295 up_read(&dstlg->mm->mmap_sem); 467 up_read(&dstlg->mm->mmap_sem);
296 468
297 /* Do this last so dst doesn't simply sleep on lock. */ 469 /* We trigger the destination interrupt, even if the destination was
470 * empty and we didn't transfer anything: this gives them a chance to
471 * wake up and refill. */
298 set_bit(dst->interrupt, dstlg->irqs_pending); 472 set_bit(dst->interrupt, dstlg->irqs_pending);
473 /* Wake up the destination process. */
299 wake_up_process(dstlg->tsk); 474 wake_up_process(dstlg->tsk);
475 /* If we passed the last "struct lguest_dma", the receive had no
476 * buffers left. */
300 return i == dst->num_dmas; 477 return i == dst->num_dmas;
301 478
302fail: 479fail:
@@ -304,6 +481,8 @@ fail:
304 return 0; 481 return 0;
305} 482}
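
On the receiving end, a Guest driver depends on exactly that ordering; a sketch with illustrative names, not code from this patch:

        /* Wait until the Host reports a filled buffer... */
        while (!dma->used_len)
                cpu_relax();
        /* ...and make sure we read the data only *after* seeing used_len
         * become non-zero: this read barrier pairs with the wmb() above. */
        rmb();
        handle_input(buf, dma->used_len);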
306 483
484/*L:370 This is the counter-side to the BIND_DMA hypercall: the SEND_DMA
485 * hypercall. We find out who's listening, and send to them. */
307void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma) 486void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma)
308{ 487{
309 union futex_key key; 488 union futex_key key;
@@ -313,31 +492,43 @@ void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma)
313again: 492again:
314 mutex_lock(&lguest_lock); 493 mutex_lock(&lguest_lock);
315 down_read(fshared); 494 down_read(fshared);
495 /* Get the futex key for the key the Guest gave us */
316 if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { 496 if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
317 kill_guest(lg, "bad sending DMA key"); 497 kill_guest(lg, "bad sending DMA key");
318 goto unlock; 498 goto unlock;
319 } 499 }
320 /* Shared mapping? Look for other guests... */ 500 /* Since the key must be a multiple of 4, the futex key uses the lower
501 * bit of the "offset" field (which would always be 0) to indicate a
502 * mapping which is shared with other processes (ie. Guests). */
321 if (key.shared.offset & 1) { 503 if (key.shared.offset & 1) {
322 struct lguest_dma_info *i; 504 struct lguest_dma_info *i;
505 /* Look through the hash for other Guests. */
323 list_for_each_entry(i, &dma_hash[hash(&key)], list) { 506 list_for_each_entry(i, &dma_hash[hash(&key)], list) {
507 /* Don't send to ourselves. */
324 if (i->guestid == lg->guestid) 508 if (i->guestid == lg->guestid)
325 continue; 509 continue;
326 if (!key_eq(&key, &i->key)) 510 if (!key_eq(&key, &i->key))
327 continue; 511 continue;
328 512
513 /* If dma_transfer() tells us the destination has no
514 * available buffers, we increment "empty". */
329 empty += dma_transfer(lg, udma, i); 515 empty += dma_transfer(lg, udma, i);
330 break; 516 break;
331 } 517 }
518 /* If the destination is empty, we release our locks and
519 * give the destination Guest a brief chance to restock. */
332 if (empty == 1) { 520 if (empty == 1) {
333 /* Give any recipients one chance to restock. */ 521 /* Give any recipients one chance to restock. */
334 up_read(&current->mm->mmap_sem); 522 up_read(&current->mm->mmap_sem);
335 mutex_unlock(&lguest_lock); 523 mutex_unlock(&lguest_lock);
524 /* Next time, we won't try again. */
336 empty++; 525 empty++;
337 goto again; 526 goto again;
338 } 527 }
339 } else { 528 } else {
340 /* Private mapping: tell our userspace. */ 529 /* Private mapping: Guest is sending to its Launcher. We set
530 * the "dma_is_pending" flag so that the main loop will exit
531 * and the Launcher's read() from /dev/lguest will return. */
341 lg->dma_is_pending = 1; 532 lg->dma_is_pending = 1;
342 lg->pending_dma = udma; 533 lg->pending_dma = udma;
343 lg->pending_key = ukey; 534 lg->pending_key = ukey;
@@ -346,6 +537,7 @@ unlock:
346 up_read(fshared); 537 up_read(fshared);
347 mutex_unlock(&lguest_lock); 538 mutex_unlock(&lguest_lock);
348} 539}
540/*:*/
349 541
350void release_all_dma(struct lguest *lg) 542void release_all_dma(struct lguest *lg)
351{ 543{
@@ -361,7 +553,18 @@ void release_all_dma(struct lguest *lg)
361 up_read(&lg->mm->mmap_sem); 553 up_read(&lg->mm->mmap_sem);
362} 554}
363 555
364/* Userspace wants a dma buffer from this guest. */ 556/*M:007 We only return a single DMA buffer to the Launcher, but it would be
557 * more efficient to return a pointer to the entire array of DMA buffers, which
558 * it can cache and choose one whenever it wants.
559 *
560 * Currently the Launcher uses a write to /dev/lguest, and the return value is
561 * the address of the DMA structure with the interrupt number placed in
562 * dma->used_len. If we wanted to return the entire array, we would need to return
563 * the address, array size and interrupt number: this seems to require an
564 * ioctl(). :*/
565
566/*L:320 This routine looks for a DMA buffer registered by the Guest on the
567 * given key (using the BIND_DMA hypercall). */
365unsigned long get_dma_buffer(struct lguest *lg, 568unsigned long get_dma_buffer(struct lguest *lg,
366 unsigned long ukey, unsigned long *interrupt) 569 unsigned long ukey, unsigned long *interrupt)
367{ 570{
@@ -370,15 +573,29 @@ unsigned long get_dma_buffer(struct lguest *lg,
370 struct lguest_dma_info *i; 573 struct lguest_dma_info *i;
371 struct rw_semaphore *fshared = &current->mm->mmap_sem; 574 struct rw_semaphore *fshared = &current->mm->mmap_sem;
372 575
576 /* Take the Big Lguest Lock to stop other Guests sending this Guest DMA
577 * at the same time. */
373 mutex_lock(&lguest_lock); 578 mutex_lock(&lguest_lock);
579 /* To match between Guests sharing the same underlying memory we steal
580 * code from the futex infrastructure. This requires that we hold the
581 * "mmap_sem" for our process (the Launcher), and pass it to the futex
582 * code. */
374 down_read(fshared); 583 down_read(fshared);
584
585 /* This can fail if it's not a valid address, or if the address is not
586 * divisible by 4 (the futex code needs that, we don't really). */
375 if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { 587 if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
376 kill_guest(lg, "bad registered DMA buffer"); 588 kill_guest(lg, "bad registered DMA buffer");
377 goto unlock; 589 goto unlock;
378 } 590 }
591 /* Search the hash table for matching entries (the Launcher can only
592 * send to its own Guest for the moment, so the entry must be for this
593 * Guest) */
379 list_for_each_entry(i, &dma_hash[hash(&key)], list) { 594 list_for_each_entry(i, &dma_hash[hash(&key)], list) {
380 if (key_eq(&key, &i->key) && i->guestid == lg->guestid) { 595 if (key_eq(&key, &i->key) && i->guestid == lg->guestid) {
381 unsigned int j; 596 unsigned int j;
597 /* Look through the registered DMA array for an
598 * available buffer. */
382 for (j = 0; j < i->num_dmas; j++) { 599 for (j = 0; j < i->num_dmas; j++) {
383 struct lguest_dma dma; 600 struct lguest_dma dma;
384 601
@@ -387,6 +604,8 @@ unsigned long get_dma_buffer(struct lguest *lg,
387 if (dma.used_len == 0) 604 if (dma.used_len == 0)
388 break; 605 break;
389 } 606 }
607 /* Store the interrupt the Guest wants when the buffer
608 * is used. */
390 *interrupt = i->interrupt; 609 *interrupt = i->interrupt;
391 break; 610 break;
392 } 611 }
@@ -396,4 +615,12 @@ unlock:
396 mutex_unlock(&lguest_lock); 615 mutex_unlock(&lguest_lock);
397 return ret; 616 return ret;
398} 617}
618/*:*/
399 619
620/*L:410 This really completes the Launcher. Not only have we now finished
621 * the longest chapter in our journey, but this also means we are over halfway
622 * through!
623 *
624 * Enough prevaricating around the bush: it is time for us to dive into the
625 * core of the Host, in "make Host".
626 */
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 3e2ddfbc816e..64f0abed317c 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -58,9 +58,18 @@ struct lguest_dma_info
58 u8 interrupt; /* 0 when not registered */ 58 u8 interrupt; /* 0 when not registered */
59}; 59};
60 60
61/* We have separate types for the guest's ptes & pgds and the shadow ptes & 61/*H:310 The page-table code owes a great debt of gratitude to Andi Kleen. He
62 * pgds. Since this host might use three-level pagetables and the guest and 62 * reviewed the original code which used "u32" for all page table entries, and
63 * shadow pagetables don't, we can't use the normal pte_t/pgd_t. */ 63 * insisted that it would be far clearer with explicit typing. I thought it
64 * was overkill, but he was right: it is much clearer than it was before.
65 *
66 * We have separate types for the Guest's ptes & pgds and the shadow ptes &
67 * pgds. There's already a Linux type for these (pte_t and pgd_t) but they
68 * change depending on kernel config options (PAE). */
69
70/* Each entry is identical: lower 12 bits of flags and upper 20 bits for the
71 * "page frame number" (0 == first physical page, etc). They are different
72 * types so the compiler will warn us if we mix them improperly. */
64typedef union { 73typedef union {
65 struct { unsigned flags:12, pfn:20; }; 74 struct { unsigned flags:12, pfn:20; };
66 struct { unsigned long val; } raw; 75 struct { unsigned long val; } raw;
@@ -77,8 +86,12 @@ typedef union {
77 struct { unsigned flags:12, pfn:20; }; 86 struct { unsigned flags:12, pfn:20; };
78 struct { unsigned long val; } raw; 87 struct { unsigned long val; } raw;
79} gpte_t; 88} gpte_t;
89
90/* We have two convenient macros to convert a "raw" value as handed to us by
91 * the Guest into the correct Guest PGD or PTE type. */
80#define mkgpte(_val) ((gpte_t){.raw.val = _val}) 92#define mkgpte(_val) ((gpte_t){.raw.val = _val})
81#define mkgpgd(_val) ((gpgd_t){.raw.val = _val}) 93#define mkgpgd(_val) ((gpgd_t){.raw.val = _val})
94/*:*/
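
A quick worked example (editorial, not part of the patch) of the flags/pfn split:

        /* A raw Guest pte value of 0x00003007 decodes as: */
        gpte_t gpte = mkgpte(0x00003007);

        /* gpte.pfn   == 0x3   (the fourth physical page, ie. address 0x3000)
         * gpte.flags == 0x007 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER) */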
82 95
83struct pgdir 96struct pgdir
84{ 97{
@@ -243,7 +256,32 @@ unsigned long get_dma_buffer(struct lguest *lg, unsigned long key,
243 256
244/* hypercalls.c: */ 257/* hypercalls.c: */
245void do_hypercalls(struct lguest *lg); 258void do_hypercalls(struct lguest *lg);
246 259void write_timestamp(struct lguest *lg);
260
261/*L:035
262 * Let's step aside for the moment, to study one important routine that's used
263 * widely in the Host code.
264 *
265 * There are many cases where the Guest does something invalid, like pass crap
266 * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite
267 * acceptable to simply terminate the Guest and give the Launcher a nicely
268 * formatted reason. It's also simpler for the Guest itself, which doesn't
269 * need to check most hypercalls for "success"; if you're still running, it
270 * succeeded.
271 *
272 * Once this is called, the Guest will never run again, so most Host code can
273 * call this then continue as if nothing had happened. This means many
274 * functions don't have to explicitly return an error code, which keeps the
275 * code simple.
276 *
277 * It also means that this can be called more than once: only the first one is
278 * remembered. The only trick is that we still need to kill the Guest even if
279 * we can't allocate memory to store the reason. Linux has a neat way of
280 * packing error codes into invalid pointers, so we use that here.
281 *
282 * Like any macro which uses an "if", it is safely wrapped in a run-once "do {
283 * } while(0)".
284 */
247#define kill_guest(lg, fmt...) \ 285#define kill_guest(lg, fmt...) \
248do { \ 286do { \
249 if (!(lg)->dead) { \ 287 if (!(lg)->dead) { \
@@ -252,6 +290,7 @@ do { \
252 (lg)->dead = ERR_PTR(-ENOMEM); \ 290 (lg)->dead = ERR_PTR(-ENOMEM); \
253 } \ 291 } \
254} while(0) 292} while(0)
293/* (End of aside) :*/
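
A typical call site (a hypothetical bounds check, for illustration) looks like this:

        /* Kill the Guest if it handed us an address beyond its memory;
         * execution continues here afterwards, but the Guest never runs
         * again, so no error needs to propagate back up. */
        if (addr >= lg->pfn_limit * PAGE_SIZE)
                kill_guest(lg, "bad address %#lx", addr);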
255 294
256static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) 295static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
257{ 296{
diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c
index 18dade06d4a9..1bc1546c7fd0 100644
--- a/drivers/lguest/lguest.c
+++ b/drivers/lguest/lguest.c
@@ -1,6 +1,32 @@
1/* 1/*P:010
2 * Lguest specific paravirt-ops implementation 2 * A hypervisor allows multiple Operating Systems to run on a single machine.
3 * To quote David Wheeler: "Any problem in computer science can be solved with
4 * another layer of indirection."
5 *
6 * We keep things simple in two ways. First, we start with a normal Linux
7 * kernel and insert a module (lg.ko) which allows us to run other Linux
8 * kernels the same way we'd run processes. We call the first kernel the Host,
9 * and the others the Guests. The program which sets up and configures Guests
10 * (such as the example in Documentation/lguest/lguest.c) is called the
11 * Launcher.
12 *
13 * Secondly, we only run specially modified Guests, not normal kernels. When
14 * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets
15 * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows
16 * how to be a Guest. This means that you can use the same kernel you boot
17 * normally (ie. as a Host) as a Guest.
3 * 18 *
19 * These Guests know that they cannot do privileged operations, such as disable
20 * interrupts, and that they have to ask the Host to do such things explicitly.
21 * This file consists of all the replacements for such low-level native
22 * hardware operations: these special Guest versions call the Host.
23 *
24 * So how does the kernel know it's a Guest? The Guest starts at a special
25 * entry point marked with a magic string, which sets up a few things then
26 * calls here. We replace the native functions in "struct paravirt_ops"
27 * with our Guest versions, then boot like normal. :*/
28
29/*
4 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 30 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
5 * 31 *
6 * This program is free software; you can redistribute it and/or modify 32 * This program is free software; you can redistribute it and/or modify
@@ -40,6 +66,12 @@
40#include <asm/mce.h> 66#include <asm/mce.h>
41#include <asm/io.h> 67#include <asm/io.h>
42 68
69/*G:010 Welcome to the Guest!
70 *
71 * The Guest in our tale is a simple creature: identical to the Host but
72 * behaving in simplified, equivalent ways. In particular, the Guest is the
73 * same kernel as the Host (or at least, built from the same source code). :*/
74
43/* Declarations for definitions in lguest_guest.S */ 75/* Declarations for definitions in lguest_guest.S */
44extern char lguest_noirq_start[], lguest_noirq_end[]; 76extern char lguest_noirq_start[], lguest_noirq_end[];
45extern const char lgstart_cli[], lgend_cli[]; 77extern const char lgstart_cli[], lgend_cli[];
@@ -58,7 +90,26 @@ struct lguest_data lguest_data = {
58struct lguest_device_desc *lguest_devices; 90struct lguest_device_desc *lguest_devices;
59static cycle_t clock_base; 91static cycle_t clock_base;
60 92
61static enum paravirt_lazy_mode lazy_mode; 93/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first
94 * real optimization trick!
95 *
96 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
97 * them as a batch when lazy_mode is eventually turned off. Because hypercalls
98 * are reasonably expensive, batching them up makes sense. For example, a
99 * large mmap might update dozens of page table entries: that code calls
100 * lguest_lazy_mode(PARAVIRT_LAZY_MMU), does the dozen updates, then calls
101 * lguest_lazy_mode(PARAVIRT_LAZY_NONE).
102 *
103 * So, when we're in lazy mode, we call async_hypercall() to store the call for
104 * future processing. When lazy mode is turned off we issue a hypercall to
105 * flush the stored calls.
106 *
107 * There's also a hack where "mode" is set to "PARAVIRT_LAZY_FLUSH" which
108 * indicates we're to flush any outstanding calls immediately. This is used
109 * when an interrupt handler does a kmap_atomic(): the page table changes must
110 * happen immediately even if we're in the middle of a batch. Usually we're
111 * not, though, so there's nothing to do. */
112static enum paravirt_lazy_mode lazy_mode; /* Note: not SMP-safe! */
62static void lguest_lazy_mode(enum paravirt_lazy_mode mode) 113static void lguest_lazy_mode(enum paravirt_lazy_mode mode)
63{ 114{
64 if (mode == PARAVIRT_LAZY_FLUSH) { 115 if (mode == PARAVIRT_LAZY_FLUSH) {
@@ -82,6 +133,16 @@ static void lazy_hcall(unsigned long call,
82 async_hcall(call, arg1, arg2, arg3); 133 async_hcall(call, arg1, arg2, arg3);
83} 134}
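The hunk above shows only the tail of lazy_hcall(). To make the batching concrete, its body must be essentially the following (a sketch reconstructed from the two calls visible here, not quoted from the patch):

    static void lazy_hcall(unsigned long call, unsigned long arg1,
                           unsigned long arg2, unsigned long arg3)
    {
            /* Not batching?  Do it now.  Otherwise, queue it up. */
            if (lazy_mode == PARAVIRT_LAZY_NONE)
                    hcall(call, arg1, arg2, arg3);
            else
                    async_hcall(call, arg1, arg2, arg3);
    }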
84 135
136/* async_hcall() is pretty simple: I'm quite proud of it really. We have a
137 * ring buffer of stored hypercalls which the Host will run through next time we
138 * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall
139 * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
140 * and 255 once the Host has finished with it.
141 *
142 * If we come around to a slot which hasn't been finished, then the table is
143 * full and we just make the hypercall directly. This has the nice side
144 * effect of causing the Host to run all the stored calls in the ring buffer
145 * which empties it for next time! */
85void async_hcall(unsigned long call, 146void async_hcall(unsigned long call,
86 unsigned long arg1, unsigned long arg2, unsigned long arg3) 147 unsigned long arg1, unsigned long arg2, unsigned long arg3)
87{ 148{
@@ -89,6 +150,9 @@ void async_hcall(unsigned long call,
89 static unsigned int next_call; 150 static unsigned int next_call;
90 unsigned long flags; 151 unsigned long flags;
91 152
153 /* Disable interrupts if not already disabled: we don't want an
154 * interrupt handler making a hypercall while we're already doing
155 * one! */
92 local_irq_save(flags); 156 local_irq_save(flags);
93 if (lguest_data.hcall_status[next_call] != 0xFF) { 157 if (lguest_data.hcall_status[next_call] != 0xFF) {
94 /* Table full, so do normal hcall which will flush table. */ 158 /* Table full, so do normal hcall which will flush table. */
@@ -98,7 +162,7 @@ void async_hcall(unsigned long call,
98 lguest_data.hcalls[next_call].edx = arg1; 162 lguest_data.hcalls[next_call].edx = arg1;
99 lguest_data.hcalls[next_call].ebx = arg2; 163 lguest_data.hcalls[next_call].ebx = arg2;
100 lguest_data.hcalls[next_call].ecx = arg3; 164 lguest_data.hcalls[next_call].ecx = arg3;
101 /* Make sure host sees arguments before "valid" flag. */ 165 /* Arguments must all be written before we mark it to go */
102 wmb(); 166 wmb();
103 lguest_data.hcall_status[next_call] = 0; 167 lguest_data.hcall_status[next_call] = 0;
104 if (++next_call == LHCALL_RING_SIZE) 168 if (++next_call == LHCALL_RING_SIZE)
@@ -106,9 +170,14 @@ void async_hcall(unsigned long call,
106 } 170 }
107 local_irq_restore(flags); 171 local_irq_restore(flags);
108} 172}
173/*:*/
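The Host's side of this ring isn't in this file, but the rules above (0 means "ready to go", 255 means "done") pin down roughly what it must do. A sketch, where do_one_hcall() is a stand-in, the real code would remember its position rather than rescan, and the .eax slot carrying the call number is my assumption (only the argument stores appear in this hunk):

    /* Sketch: the Host runs every ready slot, then hands it back. */
    static void drain_hcall_ring(struct lguest_data *d)
    {
            unsigned int i;

            for (i = 0; i < LHCALL_RING_SIZE; i++) {
                    if (d->hcall_status[i] != 0)
                            continue;       /* empty, or already done */
                    do_one_hcall(d->hcalls[i].eax, d->hcalls[i].edx,
                                 d->hcalls[i].ebx, d->hcalls[i].ecx);
                    d->hcall_status[i] = 0xFF;      /* slot free again */
            }
    }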
109 174
175/* Wrappers for the SEND_DMA and BIND_DMA hypercalls. This is mainly because
176 * Jeff Garzik complained that __pa() should never appear in drivers, and this
177 * helps remove most of them. But also, it wraps some ugliness. */
110void lguest_send_dma(unsigned long key, struct lguest_dma *dma) 178void lguest_send_dma(unsigned long key, struct lguest_dma *dma)
111{ 179{
180 /* The hcall might not write this if something goes wrong */
112 dma->used_len = 0; 181 dma->used_len = 0;
113 hcall(LHCALL_SEND_DMA, key, __pa(dma), 0); 182 hcall(LHCALL_SEND_DMA, key, __pa(dma), 0);
114} 183}
@@ -116,11 +185,16 @@ void lguest_send_dma(unsigned long key, struct lguest_dma *dma)
116int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, 185int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas,
117 unsigned int num, u8 irq) 186 unsigned int num, u8 irq)
118{ 187{
188 /* This is the only hypercall which actually wants 5 arguments, and we
189 * only support 4. Fortunately the interrupt number is always less
190 * than 256, so we can pack it with the number of dmas in the final
191 * argument. */
119 if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq)) 192 if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq))
120 return -ENOMEM; 193 return -ENOMEM;
121 return 0; 194 return 0;
122} 195}
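On the receiving end the Host simply reverses the shift-and-mask. A short sketch (the helper name is mine):

    /* Sketch: undo the (num << 8) | irq packing from lguest_bind_dma(). */
    static void unpack_bind_dma_arg(unsigned long arg,
                                    unsigned int *num, u8 *irq)
    {
            *irq = arg & 0xFF;      /* interrupt numbers are always < 256 */
            *num = arg >> 8;        /* the rest is how many dmas */
    }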
123 196
197/* Unbinding is the same hypercall as binding, but with 0 num & irq. */
124void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas) 198void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas)
125{ 199{
126 hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0); 200 hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0);
@@ -138,35 +212,73 @@ void lguest_unmap(void *addr)
138 iounmap((__force void __iomem *)addr); 212 iounmap((__force void __iomem *)addr);
139} 213}
140 214
215/*G:033
216 * Here are our first native-instruction replacements: four functions for
217 * interrupt control.
218 *
219 * The simplest way of implementing these would be to have "turn interrupts
220 * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow:
221 * these are by far the most commonly called functions of those we override.
222 *
223 * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
224 * which the Guest can update with a single instruction. The Host knows to
225 * check there when it wants to deliver an interrupt.
226 */
227
228/* save_flags() is expected to return the processor state (ie. "eflags"). The
229 * eflags word contains all kinds of stuff, but in practice Linux only cares
230 * about the interrupt flag. Our "save_flags()" just returns that. */
141static unsigned long save_fl(void) 231static unsigned long save_fl(void)
142{ 232{
143 return lguest_data.irq_enabled; 233 return lguest_data.irq_enabled;
144} 234}
145 235
236/* "restore_flags" just sets the flags back to the value given. */
146static void restore_fl(unsigned long flags) 237static void restore_fl(unsigned long flags)
147{ 238{
148 /* FIXME: Check if interrupt pending... */
149 lguest_data.irq_enabled = flags; 239 lguest_data.irq_enabled = flags;
150} 240}
151 241
242/* Interrupts go off... */
152static void irq_disable(void) 243static void irq_disable(void)
153{ 244{
154 lguest_data.irq_enabled = 0; 245 lguest_data.irq_enabled = 0;
155} 246}
156 247
248/* Interrupts go on... */
157static void irq_enable(void) 249static void irq_enable(void)
158{ 250{
159 /* FIXME: Check if interrupt pending... */
160 lguest_data.irq_enabled = X86_EFLAGS_IF; 251 lguest_data.irq_enabled = X86_EFLAGS_IF;
161} 252}
162 253/*:*/
254/*M:003 Note that we don't check for outstanding interrupts when we re-enable
255 * them (or when we unmask an interrupt). This seems to work for the moment,
256 * since interrupts are rare and we'll just get the interrupt on the next timer
257 * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way
258 * would be to put the "irq_enabled" field in a page by itself, and have the
259 * Host write-protect it when an interrupt comes in when irqs are disabled.
260 * There will then be a page fault as soon as interrupts are re-enabled. :*/
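If you'd like to picture that suggested fix, here is a sketch of the Host side as I imagine it; write_protect_guest_page() and the field names are entirely hypothetical:

    /* Hypothetical: an interrupt arrives while the Guest has irqs off. */
    static void queue_interrupt(struct lguest *lg, unsigned int irq)
    {
            set_bit(irq, lg->irqs_pending);
            /* Write-protect the dedicated page holding "irq_enabled":
             * the Guest's next write to re-enable interrupts then
             * faults into the Host, which can deliver this one. */
            write_protect_guest_page(lg, lg->irq_enabled_page);
    }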
261
262/*G:034
263 * The Interrupt Descriptor Table (IDT).
264 *
265 * The IDT tells the processor what to do when an interrupt comes in. Each
266 * entry in the table is a 64-bit descriptor: this holds the privilege level,
267 * address of the handler, and... well, who cares? The Guest just asks the
268 * Host to make the change anyway, because the Host controls the real IDT.
269 */
163static void lguest_write_idt_entry(struct desc_struct *dt, 270static void lguest_write_idt_entry(struct desc_struct *dt,
164 int entrynum, u32 low, u32 high) 271 int entrynum, u32 low, u32 high)
165{ 272{
273 /* Keep the local copy up to date. */
166 write_dt_entry(dt, entrynum, low, high); 274 write_dt_entry(dt, entrynum, low, high);
275 /* Tell Host about this new entry. */
167 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); 276 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
168} 277}
169 278
279/* Changing to a different IDT is very rare: we keep the IDT up-to-date every
280 * time it is written, so we can simply loop through all entries and tell the
281 * Host about them. */
170static void lguest_load_idt(const struct Xgt_desc_struct *desc) 282static void lguest_load_idt(const struct Xgt_desc_struct *desc)
171{ 283{
172 unsigned int i; 284 unsigned int i;
@@ -176,12 +288,29 @@ static void lguest_load_idt(const struct Xgt_desc_struct *desc)
176 hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); 288 hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
177} 289}
178 290
291/*
292 * The Global Descriptor Table.
293 *
294 * The Intel architecture defines another table, called the Global Descriptor
295 * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt"
296 * instruction, and then several other instructions refer to entries in the
297 * table. There are three entries which the Switcher needs, so the Host simply
298 * controls the entire thing and the Guest asks it to make changes using the
299 * LOAD_GDT hypercall.
300 *
301 * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY
302 * hypercall and use that repeatedly to load a new IDT. I don't think it
303 * really matters, but wouldn't it be nice if they were the same?
304 */
179static void lguest_load_gdt(const struct Xgt_desc_struct *desc) 305static void lguest_load_gdt(const struct Xgt_desc_struct *desc)
180{ 306{
181 BUG_ON((desc->size+1)/8 != GDT_ENTRIES); 307 BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
182 hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0); 308 hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
183} 309}
184 310
311/* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
312 * then tell the Host to reload the entire thing. This operation is so rare
313 * that this naive implementation is reasonable. */
185static void lguest_write_gdt_entry(struct desc_struct *dt, 314static void lguest_write_gdt_entry(struct desc_struct *dt,
186 int entrynum, u32 low, u32 high) 315 int entrynum, u32 low, u32 high)
187{ 316{
@@ -189,19 +318,58 @@ static void lguest_write_gdt_entry(struct desc_struct *dt,
189 hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); 318 hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
190} 319}
191 320
321/* OK, I lied. There are three "thread local storage" GDT entries which change
322 * on every context switch (these three entries are how glibc implements
323 * __thread variables). So we have a hypercall specifically for this case. */
192static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) 324static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
193{ 325{
194 lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0); 326 lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
195} 327}
328/*:*/
196 329
330/*G:038 That's enough excitement for now, back to ploughing through each of
331 * the paravirt_ops (we're about 1/3 of the way through).
332 *
333 * This is the Local Descriptor Table, another weird Intel thingy. Linux only
334 * uses this for some strange applications like Wine. We don't do anything
335 * here, so they'll get an informative and friendly Segmentation Fault. */
197static void lguest_set_ldt(const void *addr, unsigned entries) 336static void lguest_set_ldt(const void *addr, unsigned entries)
198{ 337{
199} 338}
200 339
340/* This loads a GDT entry into the "Task Register": that entry points to a
341 * structure called the Task State Segment. Some comments scattered though the
342 * kernel code indicate that this used for task switching in ages past, along
343 * with blood sacrifice and astrology.
344 *
345 * Now there's nothing interesting in here that we don't get told elsewhere.
346 * But the native version uses the "ltr" instruction, which makes the Host
347 * complain to the Guest about a Segmentation Fault and it'll oops. So we
348 * override the native version with a do-nothing version. */
201static void lguest_load_tr_desc(void) 349static void lguest_load_tr_desc(void)
202{ 350{
203} 351}
204 352
353/* The "cpuid" instruction is a way of querying both the CPU identity
354 * (manufacturer, model, etc) and its features. It was introduced before the
355 * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you
356 * might imagine, after a decade and a half this treatment, it is now a giant
357 * ball of hair. Its entry in the current Intel manual runs to 28 pages.
358 *
359 * This instruction even has its own Wikipedia entry. The Wikipedia entry
360 * has been translated into 4 languages. I am not making this up!
361 *
362 * We could get funky here and identify ourselves as "GenuineLguest", but
363 * instead we just use the real "cpuid" instruction. Then I pretty much turned
364 * off feature bits until the Guest booted. (Don't say that: you'll damage
365 * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
366 * hardly future-proof.) No one's listening! They don't like you anyway,
367 * parenthetic weirdo!
368 *
369 * Replacing the cpuid so we can turn features off is great for the kernel, but
370 * anyone (including userspace) can just use the raw "cpuid" instruction and
371 * the Host won't even notice since it isn't privileged. So we try not to get
372 * too worked up about it. */
205static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, 373static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
206 unsigned int *ecx, unsigned int *edx) 374 unsigned int *ecx, unsigned int *edx)
207{ 375{
@@ -214,21 +382,43 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
214 *ecx &= 0x00002201; 382 *ecx &= 0x00002201;
215 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ 383 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
216 *edx &= 0x07808101; 384 *edx &= 0x07808101;
217 /* Host wants to know when we flush kernel pages: set PGE. */ 385 /* The Host can do a nice optimization if it knows that the
386 * kernel mappings (addresses above 0xC0000000 or whatever
387 * PAGE_OFFSET is set to) haven't changed. But Linux calls
388 * flush_tlb_user() for both user and kernel mappings unless
389 * the Page Global Enable (PGE) feature bit is set. */
218 *edx |= 0x00002000; 390 *edx |= 0x00002000;
219 break; 391 break;
220 case 0x80000000: 392 case 0x80000000:
221 /* Futureproof this a little: if they ask how much extended 393 /* Futureproof this a little: if they ask how much extended
222 * processor information, limit it to known fields. */ 394 * processor information there is, limit it to known fields. */
223 if (*eax > 0x80000008) 395 if (*eax > 0x80000008)
224 *eax = 0x80000008; 396 *eax = 0x80000008;
225 break; 397 break;
226 } 398 }
227} 399}
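To see why this masking is only a polite fiction, note that any program, even unprivileged userspace, can run "cpuid" itself and the Host never traps it. A throwaway userspace demo (mine, not part of the patch):

    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* Leaf 1: feature bits, straight from the CPU.  The
             * lguest_cpuid() masks above never see this. */
            asm volatile("cpuid"
                         : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                         : "a"(1));
            printf("raw edx feature bits: %#010x\n", edx);
            return 0;
    }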
228 400
401/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
402 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
403 * it. The Host needs to know when the Guest wants to change them, so we have
404 * a whole series of functions like read_cr0() and write_cr0().
405 *
406 * We start with CR0. CR0 allows you to turn on and off all kinds of basic
407 * features, but Linux only really cares about one: the horrifically-named Task
408 * Switched (TS) bit at bit 3 (ie. 8).
409 *
410 * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if
411 * the floating point unit is used. Which allows us to restore FPU state
412 * lazily after a task switch, and Linux uses that gratefully, but wouldn't a
413 * name like "FPUTRAP bit" be a little less cryptic?
414 *
415 * We store cr0 (and cr3) locally, because the Host never changes it. The
416 * Guest sometimes wants to read it and we'd prefer not to bother the Host
417 * unnecessarily. */
229static unsigned long current_cr0, current_cr3; 418static unsigned long current_cr0, current_cr3;
230static void lguest_write_cr0(unsigned long val) 419static void lguest_write_cr0(unsigned long val)
231{ 420{
421 /* 8 == TS bit. */
232 lazy_hcall(LHCALL_TS, val & 8, 0, 0); 422 lazy_hcall(LHCALL_TS, val & 8, 0, 0);
233 current_cr0 = val; 423 current_cr0 = val;
234} 424}
@@ -238,17 +428,25 @@ static unsigned long lguest_read_cr0(void)
238 return current_cr0; 428 return current_cr0;
239} 429}
240 430
431/* Intel provided a special instruction to clear the TS bit for people too cool
432 * to use write_cr0() to do it. This "clts" instruction is faster, because all
433 * the vowels have been optimized out. */
241static void lguest_clts(void) 434static void lguest_clts(void)
242{ 435{
243 lazy_hcall(LHCALL_TS, 0, 0, 0); 436 lazy_hcall(LHCALL_TS, 0, 0, 0);
244 current_cr0 &= ~8U; 437 current_cr0 &= ~8U;
245} 438}
246 439
440/* CR2 is the virtual address of the last page fault, which the Guest only ever
441 * reads. The Host kindly writes this into our "struct lguest_data", so we
442 * just read it out of there. */
247static unsigned long lguest_read_cr2(void) 443static unsigned long lguest_read_cr2(void)
248{ 444{
249 return lguest_data.cr2; 445 return lguest_data.cr2;
250} 446}
251 447
448/* CR3 is the current toplevel pagetable page: the principle is the same as
449 * cr0. Keep a local copy, and tell the Host when it changes. */
252static void lguest_write_cr3(unsigned long cr3) 450static void lguest_write_cr3(unsigned long cr3)
253{ 451{
254 lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0); 452 lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
@@ -260,7 +458,7 @@ static unsigned long lguest_read_cr3(void)
260 return current_cr3; 458 return current_cr3;
261} 459}
262 460
263/* Used to enable/disable PGE, but we don't care. */ 461/* CR4 is used to enable and disable PGE, but we don't care. */
264static unsigned long lguest_read_cr4(void) 462static unsigned long lguest_read_cr4(void)
265{ 463{
266 return 0; 464 return 0;
@@ -270,6 +468,59 @@ static void lguest_write_cr4(unsigned long val)
270{ 468{
271} 469}
272 470
471/*
472 * Page Table Handling.
473 *
474 * Now would be a good time to take a rest and grab a coffee or similarly
475 * relaxing stimulant. The easy parts are behind us, and the trek gradually
476 * winds uphill from here.
477 *
478 * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU
479 * maps virtual addresses to physical addresses using "page tables". We could
480 * use one huge index of 1 million entries: each address is 4 bytes, so that's
481 * 1024 pages just to hold the page tables. But since most virtual addresses
482 * are unused, we use a two-level index which saves space. The CR3 register
483 * contains the physical address of the top level "page directory" page, which
484 * contains physical addresses of up to 1024 second-level pages. Each of these
485 * second level pages contains up to 1024 physical addresses of actual pages,
486 * or Page Table Entries (PTEs).
487 *
488 * Here's a diagram, where arrows indicate physical addresses:
489 *
490 * CR3 ---> +---------+
491 * | --------->+---------+
492 * | | | PADDR1 |
493 * Top-level | | PADDR2 |
494 * (PMD) page | | |
495 * | | Lower-level |
496 * | | (PTE) page |
497 * | | | |
498 * .... ....
499 *
500 * So to convert a virtual address to a physical address, we look up the top
501 * level, which points us to the second level, which gives us the physical
502 * address of that page. If the top level entry was not present, or the second
503 * level entry was not present, then the virtual address is invalid (we
504 * say "the page was not mapped").
505 *
506 * Put another way, a 32-bit virtual address is divided up like so:
507 *
508 * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
509 * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
510 * Index into top Index into second Offset within page
511 * page directory page pagetable page
512 *
513 * The kernel spends a lot of time changing both the top-level page directory
514 * and lower-level pagetable pages. The Guest doesn't know physical addresses,
515 * so while it maintains these page tables exactly like normal, it also needs
516 * to keep the Host informed whenever it makes a change: the Host will create
517 * the real page tables based on the Guests'.
518 */
519
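To nail the two-level walk down, here it is in C with the diagram's bit counts spelled out; paddr_to_virt() stands in for however the walker reaches a physical page (the real kernel uses its pgd/pte accessors, so treat this as a sketch):

    /* Sketch: translate a virtual address the way the CPU does.
     * Returns 0 if "the page was not mapped". */
    static unsigned long translate(unsigned long cr3, unsigned long vaddr)
    {
            unsigned long *pgd, *pte_page, entry, pte;

            pgd = paddr_to_virt(cr3);          /* the top-level page */
            entry = pgd[vaddr >> 22];          /* top 10 bits index it */
            if (!(entry & 1))                  /* present bit clear? */
                    return 0;

            pte_page = paddr_to_virt(entry & ~0xFFFUL);
            pte = pte_page[(vaddr >> 12) & 0x3FF];  /* middle 10 bits */
            if (!(pte & 1))
                    return 0;

            return (pte & ~0xFFFUL) | (vaddr & 0xFFF); /* low 12: offset */
    }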
520/* The Guest calls this to set a second-level entry (pte), ie. to map a page
521 * into a process' address space. We set the entry then tell the Host the
522 * toplevel and address this corresponds to. The Guest uses one pagetable per
523 * process, so we need to tell the Host which one we're changing (mm->pgd). */
273static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 524static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
274 pte_t *ptep, pte_t pteval) 525 pte_t *ptep, pte_t pteval)
275{ 526{
@@ -277,7 +528,9 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
277 lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low); 528 lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
278} 529}
279 530
280/* We only support two-level pagetables at the moment. */ 531/* The Guest calls this to set a top-level entry. Again, we set the entry then
532 * tell the Host which top-level page we changed, and the index of the entry we
533 * changed. */
281static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 534static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
282{ 535{
283 *pmdp = pmdval; 536 *pmdp = pmdval;
@@ -285,7 +538,15 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
285 (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); 538 (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
286} 539}
287 540
288/* FIXME: Eliminate all callers of this. */ 541/* There are a couple of legacy places where the kernel sets a PTE, but we
542 * don't know the top level any more. This is useless for us, since we don't
543 * know which pagetable is changing or what address, so we just tell the Host
544 * to forget all of them. Fortunately, this is very rare.
545 *
546 * ... except in early boot when the kernel sets up the initial pagetables,
547 * which makes booting astonishingly slow. So we don't even tell the Host
548 * anything changed until we've done the first page table switch.
549 */
289static void lguest_set_pte(pte_t *ptep, pte_t pteval) 550static void lguest_set_pte(pte_t *ptep, pte_t pteval)
290{ 551{
291 *ptep = pteval; 552 *ptep = pteval;
@@ -294,22 +555,51 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
294 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); 555 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
295} 556}
296 557
558/* Unfortunately for Lguest, the paravirt_ops for page tables were based on
559 * native page table operations. On native hardware you can set a new page
560 * table entry whenever you want, but if you want to remove one you have to do
561 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
562 *
563 * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only
564 * called when a valid entry is written, not when it's removed (ie. marked not
565 * present). Instead, this is where we come when the Guest wants to remove a
566 * page table entry: we tell the Host to set that entry to 0 (ie. the present
567 * bit is zero). */
297static void lguest_flush_tlb_single(unsigned long addr) 568static void lguest_flush_tlb_single(unsigned long addr)
298{ 569{
299 /* Simply set it to zero, and it will fault back in. */ 570 /* Simply set it to zero: if it was not, it will fault back in. */
300 lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0); 571 lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);
301} 572}
302 573
574/* This is what happens after the Guest has removed a large number of entries.
575 * This tells the Host that any of the page table entries for userspace might
576 * have changed, ie. virtual addresses below PAGE_OFFSET. */
303static void lguest_flush_tlb_user(void) 577static void lguest_flush_tlb_user(void)
304{ 578{
305 lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0); 579 lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
306} 580}
307 581
582/* This is called when the kernel page tables have changed. That's not very
583 * common (unless the Guest is using highmem, which makes the Guest extremely
584 * slow), so it's worth separating this from the user flushing above. */
308static void lguest_flush_tlb_kernel(void) 585static void lguest_flush_tlb_kernel(void)
309{ 586{
310 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); 587 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
311} 588}
312 589
590/*
591 * The Unadvanced Programmable Interrupt Controller.
592 *
593 * This is an attempt to implement the simplest possible interrupt controller.
594 * I spent some time looking through routines like set_irq_chip_and_handler,
595 * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and
596 * I *think* this is as simple as it gets.
597 *
598 * We can tell the Host what interrupts we want blocked ready for using the
599 * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as
600 * simple as setting a bit. We don't actually "ack" interrupts as such, we
601 * just mask and unmask them. I wonder if we should be cleverer?
602 */
313static void disable_lguest_irq(unsigned int irq) 603static void disable_lguest_irq(unsigned int irq)
314{ 604{
315 set_bit(irq, lguest_data.blocked_interrupts); 605 set_bit(irq, lguest_data.blocked_interrupts);
@@ -318,9 +608,9 @@ static void disable_lguest_irq(unsigned int irq)
318static void enable_lguest_irq(unsigned int irq) 608static void enable_lguest_irq(unsigned int irq)
319{ 609{
320 clear_bit(irq, lguest_data.blocked_interrupts); 610 clear_bit(irq, lguest_data.blocked_interrupts);
321 /* FIXME: If it's pending? */
322} 611}
323 612
613/* This structure describes the lguest IRQ controller. */
324static struct irq_chip lguest_irq_controller = { 614static struct irq_chip lguest_irq_controller = {
325 .name = "lguest", 615 .name = "lguest",
326 .mask = disable_lguest_irq, 616 .mask = disable_lguest_irq,
@@ -328,6 +618,10 @@ static struct irq_chip lguest_irq_controller = {
328 .unmask = enable_lguest_irq, 618 .unmask = enable_lguest_irq,
329}; 619};
330 620
621/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
622 * interrupt (except 128, which is used for system calls), and then tells the
623 * Linux infrastructure that each interrupt is controlled by our level-based
624 * lguest interrupt controller. */
331static void __init lguest_init_IRQ(void) 625static void __init lguest_init_IRQ(void)
332{ 626{
333 unsigned int i; 627 unsigned int i;
@@ -340,20 +634,51 @@ static void __init lguest_init_IRQ(void)
340 handle_level_irq); 634 handle_level_irq);
341 } 635 }
342 } 636 }
637 /* This call is required to set up for 4k stacks, where we have
638 * separate stacks for hard and soft interrupts. */
343 irq_ctx_init(smp_processor_id()); 639 irq_ctx_init(smp_processor_id());
344} 640}
345 641
642/*
643 * Time.
644 *
645 * It would be far better for everyone if the Guest had its own clock, but
646 * until then the Host gives us the time on every interrupt.
647 */
346static unsigned long lguest_get_wallclock(void) 648static unsigned long lguest_get_wallclock(void)
347{ 649{
348 return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0); 650 return lguest_data.time.tv_sec;
349} 651}
350 652
351static cycle_t lguest_clock_read(void) 653static cycle_t lguest_clock_read(void)
352{ 654{
655 unsigned long sec, nsec;
656
657 /* If the Host tells the TSC speed, we can trust that. */
353 if (lguest_data.tsc_khz) 658 if (lguest_data.tsc_khz)
354 return native_read_tsc(); 659 return native_read_tsc();
355 else 660
356 return jiffies; 661 /* If we can't use the TSC, we read the time value written by the Host.
662 * Since it's in two parts (seconds and nanoseconds), we risk reading
663 * it just as it's changing from 99 & 0.999999999 to 100 and 0, and
664 * getting 99 and 0. As Linux tends to come apart under the stress of
665 * time travel, we must be careful: */
666 do {
667 /* First we read the seconds part. */
668 sec = lguest_data.time.tv_sec;
669 /* This read memory barrier tells the compiler and the CPU that
670 * this can't be reordered: we have to complete the above
671 * before going on. */
672 rmb();
673 /* Now we read the nanoseconds part. */
674 nsec = lguest_data.time.tv_nsec;
675 /* Make sure we've done that. */
676 rmb();
677 /* Now if the seconds part has changed, try again. */
678 } while (unlikely(lguest_data.time.tv_sec != sec));
679
680 /* Our non-TSC clock is in real nanoseconds. */
681 return sec*1000000000ULL + nsec;
357} 682}
358 683
359/* This is what we tell the kernel is our clocksource. */ 684/* This is what we tell the kernel is our clocksource. */
@@ -361,8 +686,11 @@ static struct clocksource lguest_clock = {
361 .name = "lguest", 686 .name = "lguest",
362 .rating = 400, 687 .rating = 400,
363 .read = lguest_clock_read, 688 .read = lguest_clock_read,
689 .mask = CLOCKSOURCE_MASK(64),
690 .mult = 1,
364}; 691};
365 692
693/* The "scheduler clock" is just our real clock, adjusted to start at zero */
366static unsigned long long lguest_sched_clock(void) 694static unsigned long long lguest_sched_clock(void)
367{ 695{
368 return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base); 696 return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base);
@@ -428,34 +756,55 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
428 local_irq_restore(flags); 756 local_irq_restore(flags);
429} 757}
430 758
759/* At some point in the boot process, we get asked to set up our timing
760 * infrastructure. The kernel doesn't expect timer interrupts before this, but
761 * we cleverly initialized the "blocked_interrupts" field of "struct
762 * lguest_data" so that timer interrupts were blocked until now. */
431static void lguest_time_init(void) 763static void lguest_time_init(void)
432{ 764{
765 /* Set up the timer interrupt (0) to go to our simple timer routine */
433 set_irq_handler(0, lguest_time_irq); 766 set_irq_handler(0, lguest_time_irq);
434 767
435 /* We use the TSC if the Host tells us we can, otherwise a dumb 768 /* Our clock structure looks like arch/i386/kernel/tsc.c if we can use
436 * jiffies-based clock. */ 769 * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either
770 * way, the "rating" is initialized so high that it's always chosen
771 * over any other clocksource. */
437 if (lguest_data.tsc_khz) { 772 if (lguest_data.tsc_khz) {
438 lguest_clock.shift = 22; 773 lguest_clock.shift = 22;
439 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, 774 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
440 lguest_clock.shift); 775 lguest_clock.shift);
441 lguest_clock.mask = CLOCKSOURCE_MASK(64);
442 lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS; 776 lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS;
443 } else {
444 /* To understand this, start at kernel/time/jiffies.c... */
445 lguest_clock.shift = 8;
446 lguest_clock.mult = (((u64)NSEC_PER_SEC<<8)/ACTHZ) << 8;
447 lguest_clock.mask = CLOCKSOURCE_MASK(32);
448 } 777 }
449 clock_base = lguest_clock_read(); 778 clock_base = lguest_clock_read();
450 clocksource_register(&lguest_clock); 779 clocksource_register(&lguest_clock);
451 780
452 /* We can't set cpumask in the initializer: damn C limitations! */ 781 /* Now we've set up our clock, we can use it as the scheduler clock */
782 paravirt_ops.sched_clock = lguest_sched_clock;
783
784 /* We can't set cpumask in the initializer: damn C limitations! Set it
785 * here and register our timer device. */
453 lguest_clockevent.cpumask = cpumask_of_cpu(0); 786 lguest_clockevent.cpumask = cpumask_of_cpu(0);
454 clockevents_register_device(&lguest_clockevent); 787 clockevents_register_device(&lguest_clockevent);
455 788
789 /* Finally, we unblock the timer interrupt. */
456 enable_lguest_irq(0); 790 enable_lguest_irq(0);
457} 791}
458 792
793/*
794 * Miscellaneous bits and pieces.
795 *
796 * Here is an oddball collection of functions which the Guest needs for things
797 * to work. They're pretty simple.
798 */
799
800/* The Guest needs to tell the Host what stack it expects traps to use. For
801 * native hardware, this is part of the Task State Segment mentioned above in
802 * lguest_load_tr_desc(), but to help hypervisors there's this special call.
803 *
804 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
805 * segment), the privilege level (we're privilege level 1, the Host is 0 and
806 * will not tolerate us trying to use that), the stack pointer, and the number
807 * of pages in the stack. */
459static void lguest_load_esp0(struct tss_struct *tss, 808static void lguest_load_esp0(struct tss_struct *tss,
460 struct thread_struct *thread) 809 struct thread_struct *thread)
461{ 810{
@@ -463,15 +812,31 @@ static void lguest_load_esp0(struct tss_struct *tss,
463 THREAD_SIZE/PAGE_SIZE); 812 THREAD_SIZE/PAGE_SIZE);
464} 813}
465 814
815/* Let's just say, I wouldn't do debugging under a Guest. */
466static void lguest_set_debugreg(int regno, unsigned long value) 816static void lguest_set_debugreg(int regno, unsigned long value)
467{ 817{
468 /* FIXME: Implement */ 818 /* FIXME: Implement */
469} 819}
470 820
821/* There are times when the kernel wants to make sure that no memory writes are
822 * caught in the cache (that they've all reached real hardware devices). This
823 * doesn't matter for the Guest which has virtual hardware.
824 *
825 * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush
826 * (clflush) instruction is available and the kernel uses that. Otherwise, it
827 * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction.
828 * Unlike clflush, wbinvd can only be run at privilege level 0. So we can
829 * ignore clflush, but replace wbinvd.
830 */
471static void lguest_wbinvd(void) 831static void lguest_wbinvd(void)
472{ 832{
473} 833}
474 834
835/* If the Guest expects to have an Advanced Programmable Interrupt Controller,
836 * we play dumb by ignoring writes and returning 0 for reads. So it's no
837 * longer Programmable nor Controlling anything, and I don't think 8 lines of
838 * code qualifies for Advanced. It will also never interrupt anything. It
839 * does, however, allow us to get through the Linux boot code. */
475#ifdef CONFIG_X86_LOCAL_APIC 840#ifdef CONFIG_X86_LOCAL_APIC
476static void lguest_apic_write(unsigned long reg, unsigned long v) 841static void lguest_apic_write(unsigned long reg, unsigned long v)
477{ 842{
@@ -483,19 +848,32 @@ static unsigned long lguest_apic_read(unsigned long reg)
483} 848}
484#endif 849#endif
485 850
851/* STOP! Until an interrupt comes in. */
486static void lguest_safe_halt(void) 852static void lguest_safe_halt(void)
487{ 853{
488 hcall(LHCALL_HALT, 0, 0, 0); 854 hcall(LHCALL_HALT, 0, 0, 0);
489} 855}
490 856
857/* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a
858 * message out when we're crashing as well as elegant termination like powering
859 * off.
860 *
861 * Note that the Host always prefers that the Guest speak in physical addresses
862 * rather than virtual addresses, so we use __pa() here. */
491static void lguest_power_off(void) 863static void lguest_power_off(void)
492{ 864{
493 hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); 865 hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
494} 866}
495 867
868/*
869 * Panicking.
870 *
871 * Don't. But if you did, this is what happens.
872 */
496static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) 873static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
497{ 874{
498 hcall(LHCALL_CRASH, __pa(p), 0, 0); 875 hcall(LHCALL_CRASH, __pa(p), 0, 0);
876 /* The hcall won't return, but to keep gcc happy, we're "done". */
499 return NOTIFY_DONE; 877 return NOTIFY_DONE;
500} 878}
501 879
@@ -503,15 +881,45 @@ static struct notifier_block paniced = {
503 .notifier_call = lguest_panic 881 .notifier_call = lguest_panic
504}; 882};
505 883
884/* Setting up memory is fairly easy. */
506static __init char *lguest_memory_setup(void) 885static __init char *lguest_memory_setup(void)
507{ 886{
508 /* We do this here because lockcheck barfs if before start_kernel */ 887 /* We do this here and not earlier because lockcheck barfs if we do it
888 * before start_kernel() */
509 atomic_notifier_chain_register(&panic_notifier_list, &paniced); 889 atomic_notifier_chain_register(&panic_notifier_list, &paniced);
510 890
891 /* The Linux bootloader header contains an "e820" memory map: the
892 * Launcher populated the first entry with our memory limit. */
511 add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type); 893 add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type);
894
895 /* This string is for the boot messages. */
512 return "LGUEST"; 896 return "LGUEST";
513} 897}
514 898
899/*G:050
900 * Patching (Powerfully Placating Performance Pedants)
901 *
902 * We have already seen that "struct paravirt_ops" lets us replace simple
903 * native instructions with calls to the appropriate back end all throughout
904 * the kernel. This allows the same kernel to run as a Guest and as a native
905 * kernel, but it's slow because of all the indirect branches.
906 *
907 * Remember that David Wheeler quote about "Any problem in computer science can
908 * be solved with another layer of indirection"? The rest of that quote is
909 * "... But that usually will create another problem." This is the first of
910 * those problems.
911 *
912 * Our current solution is to allow the paravirt back end to optionally patch
913 * over the indirect calls to replace them with something more efficient. We
914 * patch the four most commonly called functions: disable interrupts, enable
915 * interrupts, restore interrupts and save interrupts. We usually have 10
916 * bytes to patch into: the Guest versions of these operations are small enough
917 * that we can fit comfortably.
918 *
919 * First we need assembly templates of each of the patchable Guest operations,
920 * and these are in lguest_asm.S. */
921
922/*G:060 We construct a table from the assembler templates: */
515static const struct lguest_insns 923static const struct lguest_insns
516{ 924{
517 const char *start, *end; 925 const char *start, *end;
@@ -521,35 +929,52 @@ static const struct lguest_insns
521 [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf }, 929 [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf },
522 [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf }, 930 [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf },
523}; 931};
932
933/* Now our patch routine is fairly simple (based on the native one in
934 * paravirt.c). If we have a replacement, we copy it in and return how much of
935 * the available space we used. */
524static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len) 936static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
525{ 937{
526 unsigned int insn_len; 938 unsigned int insn_len;
527 939
528 /* Don't touch it if we don't have a replacement */ 940 /* Don't do anything special if we don't have a replacement */
529 if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) 941 if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
530 return paravirt_patch_default(type, clobber, insns, len); 942 return paravirt_patch_default(type, clobber, insns, len);
531 943
532 insn_len = lguest_insns[type].end - lguest_insns[type].start; 944 insn_len = lguest_insns[type].end - lguest_insns[type].start;
533 945
534 /* Similarly if we can't fit replacement. */ 946 /* Similarly if we can't fit the replacement (shouldn't happen, but let's
947 * be thorough). */
535 if (len < insn_len) 948 if (len < insn_len)
536 return paravirt_patch_default(type, clobber, insns, len); 949 return paravirt_patch_default(type, clobber, insns, len);
537 950
951 /* Copy in our instructions. */
538 memcpy(insns, lguest_insns[type].start, insn_len); 952 memcpy(insns, lguest_insns[type].start, insn_len);
539 return insn_len; 953 return insn_len;
540} 954}
541 955
956/*G:030 Once we get to lguest_init(), we know we're a Guest. The paravirt_ops
957 * structure in the kernel provides a single point for (almost) every routine
958 * we have to override to avoid privileged instructions. */
542__init void lguest_init(void *boot) 959__init void lguest_init(void *boot)
543{ 960{
544 /* Copy boot parameters first. */ 961 /* Copy boot parameters first: the Launcher put the physical location
962 * in %esi, and head.S converted that to a virtual address and handed
963 * it to us. */
545 memcpy(&boot_params, boot, PARAM_SIZE); 964 memcpy(&boot_params, boot, PARAM_SIZE);
965 /* The boot parameters also tell us where the command-line is: save
966 * that, too. */
546 memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr), 967 memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr),
547 COMMAND_LINE_SIZE); 968 COMMAND_LINE_SIZE);
548 969
970 /* We're under lguest, paravirt is enabled, and we're running at
971 * privilege level 1, not 0 as normal. */
549 paravirt_ops.name = "lguest"; 972 paravirt_ops.name = "lguest";
550 paravirt_ops.paravirt_enabled = 1; 973 paravirt_ops.paravirt_enabled = 1;
551 paravirt_ops.kernel_rpl = 1; 974 paravirt_ops.kernel_rpl = 1;
552 975
976 /* We set up all the lguest overrides for sensitive operations. These
977 * are detailed with the operations themselves. */
553 paravirt_ops.save_fl = save_fl; 978 paravirt_ops.save_fl = save_fl;
554 paravirt_ops.restore_fl = restore_fl; 979 paravirt_ops.restore_fl = restore_fl;
555 paravirt_ops.irq_disable = irq_disable; 980 paravirt_ops.irq_disable = irq_disable;
@@ -592,21 +1017,50 @@ __init void lguest_init(void *boot)
592 paravirt_ops.time_init = lguest_time_init; 1017 paravirt_ops.time_init = lguest_time_init;
593 paravirt_ops.set_lazy_mode = lguest_lazy_mode; 1018 paravirt_ops.set_lazy_mode = lguest_lazy_mode;
594 paravirt_ops.wbinvd = lguest_wbinvd; 1019 paravirt_ops.wbinvd = lguest_wbinvd;
595 paravirt_ops.sched_clock = lguest_sched_clock; 1020 /* Now is a good time to look at the implementations of these functions
596 1021 * before returning to the rest of lguest_init(). */
1022
1023 /*G:070 Now we've seen all the paravirt_ops, we return to
1024 * lguest_init() where the rest of the fairly chaotic boot setup
1025 * occurs.
1026 *
1027 * The Host expects our first hypercall to tell it where our "struct
1028 * lguest_data" is, so we do that first. */
597 hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0); 1029 hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
598 1030
599 /* We use top of mem for initial pagetables. */ 1031 /* The native boot code sets up initial page tables immediately after
1032 * the kernel itself, and sets init_pg_tables_end so they're not
1033 * clobbered. The Launcher places our initial pagetables somewhere at
1034 * the top of our physical memory, so we don't need extra space: set
1035 * init_pg_tables_end to the end of the kernel. */
600 init_pg_tables_end = __pa(pg0); 1036 init_pg_tables_end = __pa(pg0);
601 1037
1038 /* Load the %fs segment register (the per-cpu segment register) with
1039 * the normal data segment to get through booting. */
602 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1040 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
603 1041
1042 /* Clear the part of the kernel data which is expected to be zero.
1043 * Normally it will be anyway, but if we're loading from a bzImage with
1044 * CONFIG_RELOCATABLE=y, the relocations will be sitting here. */
1045 memset(__bss_start, 0, __bss_stop - __bss_start);
1046
1047 /* The Host uses the top of the Guest's virtual address space for the
1048 * Host<->Guest Switcher, and it tells us how much it needs in
1049 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */
604 reserve_top_address(lguest_data.reserve_mem); 1050 reserve_top_address(lguest_data.reserve_mem);
605 1051
1052 /* If we don't initialize the lock dependency checker now, it crashes
1053 * paravirt_disable_iospace. */
606 lockdep_init(); 1054 lockdep_init();
607 1055
1056 /* The IDE code spends about 3 seconds probing for disks: if we reserve
1057 * all the I/O ports up front it can't get them and so doesn't probe.
1058 * Other device drivers are similar (but less severe). This cuts the
1059 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */
608 paravirt_disable_iospace(); 1060 paravirt_disable_iospace();
609 1061
1062 /* This is messy CPU setup stuff which the native boot code does before
1063 * start_kernel, so we have to do it, too: */
610 cpu_detect(&new_cpu_data); 1064 cpu_detect(&new_cpu_data);
611 /* head.S usually sets up the first capability word, so do it here. */ 1065 /* head.S usually sets up the first capability word, so do it here. */
612 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1066 new_cpu_data.x86_capability[0] = cpuid_edx(1);
@@ -617,14 +1071,27 @@ __init void lguest_init(void *boot)
617#ifdef CONFIG_X86_MCE 1071#ifdef CONFIG_X86_MCE
618 mce_disabled = 1; 1072 mce_disabled = 1;
619#endif 1073#endif
620
621#ifdef CONFIG_ACPI 1074#ifdef CONFIG_ACPI
622 acpi_disabled = 1; 1075 acpi_disabled = 1;
623 acpi_ht = 0; 1076 acpi_ht = 0;
624#endif 1077#endif
625 1078
1079 /* We set the preferred console to "hvc". This is the "hypervisor
1080 * virtual console" driver written by the PowerPC people, which we also
1081 * adapted for lguest's use. */
626 add_preferred_console("hvc", 0, NULL); 1082 add_preferred_console("hvc", 0, NULL);
627 1083
1084 /* Last of all, we set the power management poweroff hook to point to
1085 * the Guest routine to power off. */
628 pm_power_off = lguest_power_off; 1086 pm_power_off = lguest_power_off;
1087
1088 /* Now we're set up, call start_kernel() in init/main.c and we proceed
1089 * to boot as normal. It never returns. */
629 start_kernel(); 1090 start_kernel();
630} 1091}
1092/*
1093 * This marks the end of stage II of our journey, The Guest.
1094 *
1095 * It is now time for us to explore the nooks and crannies of the three Guest
1096 * devices and complete our understanding of the Guest in "make Drivers".
1097 */
diff --git a/drivers/lguest/lguest_asm.S b/drivers/lguest/lguest_asm.S
index a3dbf22ee365..f182c6a36209 100644
--- a/drivers/lguest/lguest_asm.S
+++ b/drivers/lguest/lguest_asm.S
@@ -4,15 +4,15 @@
4#include <asm/thread_info.h> 4#include <asm/thread_info.h>
5#include <asm/processor-flags.h> 5#include <asm/processor-flags.h>
6 6
7/* 7/*G:020 This is where we begin: we have a magic signature which the launcher
8 * This is where we begin: we have a magic signature which the launcher looks 8 * looks for. The plan is that the Linux boot protocol will be extended with a
9 * for. The plan is that the Linux boot protocol will be extended with a
10 * "platform type" field which will guide us here from the normal entry point, 9 * "platform type" field which will guide us here from the normal entry point,
11 * but for the moment this suffices. We pass the virtual address of the boot 10 * but for the moment this suffices. The normal boot code uses %esi for the
12 * info to lguest_init(). 11 * boot header, so we do too. We convert it to a virtual address by adding
12 * PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax).
13 * 13 *
14 * We put it in .init.text will be discarded after boot. 14 * The .section line puts this code in .init.text so it will be discarded after
15 */ 15 * boot. */
16.section .init.text, "ax", @progbits 16.section .init.text, "ax", @progbits
17.ascii "GenuineLguest" 17.ascii "GenuineLguest"
18 /* Set up initial stack. */ 18 /* Set up initial stack. */
@@ -21,7 +21,9 @@
21 addl $__PAGE_OFFSET, %eax 21 addl $__PAGE_OFFSET, %eax
22 jmp lguest_init 22 jmp lguest_init
23 23
24/* The templates for inline patching. */ 24/*G:055 We create a macro which puts the assembler code between lgstart_ and
25 * lgend_ markers. These templates end up in the .init.text section, so they
26 * are discarded after boot. */
25#define LGUEST_PATCH(name, insns...) \ 27#define LGUEST_PATCH(name, insns...) \
26 lgstart_##name: insns; lgend_##name:; \ 28 lgstart_##name: insns; lgend_##name:; \
27 .globl lgstart_##name; .globl lgend_##name 29 .globl lgstart_##name; .globl lgend_##name
@@ -30,24 +32,61 @@ LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
30LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) 32LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
31LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) 33LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
32LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 34LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
35/*:*/
33 36
34.text 37.text
35/* These demarcate the EIP range where host should never deliver interrupts. */ 38/* These demarcate the EIP range where host should never deliver interrupts. */
36.global lguest_noirq_start 39.global lguest_noirq_start
37.global lguest_noirq_end 40.global lguest_noirq_end
38 41
39/* 42/*M:004 When the Host reflects a trap or injects an interrupt into the Guest,
40 * We move eflags word to lguest_data.irq_enabled to restore interrupt state. 43 * it sets the eflags interrupt bit on the stack based on
41 * For page faults, gpfs and virtual interrupts, the hypervisor has saved 44 * lguest_data.irq_enabled, so the Guest iret logic does the right thing when
42 * eflags manually, otherwise it was delivered directly and so eflags reflects 45 * restoring it. However, when the Host sets the Guest up for direct traps,
43 * the real machine IF state, ie. interrupts on. Since the kernel always dies 46 * such as system calls, the processor is the one to push eflags onto the
44 * if it takes such a trap with interrupts disabled anyway, turning interrupts 47 * stack, and the interrupt bit will be 1 (in reality, interrupts are always
45 * back on unconditionally here is OK. 48 * enabled in the Guest).
46 */ 49 *
50 * This turns out to be harmless: the only trap which should happen under Linux
51 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
52 * regions), which has to be reflected through the Host anyway. If another
53 * trap *does* go off when interrupts are disabled, the Guest will panic, and
54 * we'll never get to this iret! :*/
55
56/*G:045 There is one final paravirt_op that the Guest implements, and glancing
57 * at it you can see why I left it to last. It's *cool*! It's in *assembler*!
58 *
59 * The "iret" instruction is used to return from an interrupt or trap. The
60 * stack looks like this:
61 * old address
62 * old code segment & privilege level
63 * old processor flags ("eflags")
64 *
65 * The "iret" instruction pops those values off the stack and restores them all
66 * at once. The only problem is that eflags includes the Interrupt Flag which
67 * the Guest can't change: the CPU will simply ignore it when we do an "iret".
68 * So we have to copy eflags from the stack to lguest_data.irq_enabled before
69 * we do the "iret".
70 *
71 * There are two problems with this: firstly, we need to use a register to do
72 * the copy and secondly, the whole thing needs to be atomic. The first
73 * problem is easy to solve: push %eax on the stack so we can use it, and then
74 * restore it at the end just before the real "iret".
75 *
76 * The second is harder: copying eflags to lguest_data.irq_enabled will turn
77 * interrupts on before we're finished, so we could be interrupted before we
78 * return to userspace or wherever. Our solution to this is to surround the
79 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the
80 * Host that it is *never* to interrupt us there, even if interrupts seem to be
81 * enabled. */
47ENTRY(lguest_iret) 82ENTRY(lguest_iret)
48 pushl %eax 83 pushl %eax
49 movl 12(%esp), %eax 84 movl 12(%esp), %eax
50lguest_noirq_start: 85lguest_noirq_start:
86 /* Note the %ss: segment prefix here. Normal data accesses use the
87 * "ds" segment, but that will have already been restored for whatever
88 * we're returning to (such as userspace): we can't trust it. The %ss:
89 * prefix makes sure we use the stack segment, which is still valid. */
51 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled 90 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
52 popl %eax 91 popl %eax
53 iret 92 iret
diff --git a/drivers/lguest/lguest_bus.c b/drivers/lguest/lguest_bus.c
index 18d6ab21a43b..55a7940ca732 100644
--- a/drivers/lguest/lguest_bus.c
+++ b/drivers/lguest/lguest_bus.c
@@ -1,3 +1,6 @@
1/*P:050 Lguest guests use a very simple bus for devices. It's a simple array
2 * of device descriptors contained just above the top of normal memory. The
3 * lguest bus is 80% tedious boilerplate code. :*/
1#include <linux/init.h> 4#include <linux/init.h>
2#include <linux/bootmem.h> 5#include <linux/bootmem.h>
3#include <linux/lguest_bus.h> 6#include <linux/lguest_bus.h>
@@ -43,6 +46,10 @@ static struct device_attribute lguest_dev_attrs[] = {
43 __ATTR_NULL 46 __ATTR_NULL
44}; 47};
45 48
49/*D:130 The generic bus infrastructure requires a function which says whether a
50 * device matches a driver. For us, it is simple: "struct lguest_driver"
51 * contains a "device_type" field which indicates what type of device it can
52 * handle, so we just cast the args and compare: */
46static int lguest_dev_match(struct device *_dev, struct device_driver *_drv) 53static int lguest_dev_match(struct device *_dev, struct device_driver *_drv)
47{ 54{
48 struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 55 struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
@@ -50,6 +57,7 @@ static int lguest_dev_match(struct device *_dev, struct device_driver *_drv)
50 57
51 return (drv->device_type == lguest_devices[dev->index].type); 58 return (drv->device_type == lguest_devices[dev->index].type);
52} 59}
60/*:*/
53 61
54struct lguest_bus { 62struct lguest_bus {
55 struct bus_type bus; 63 struct bus_type bus;
@@ -68,11 +76,24 @@ static struct lguest_bus lguest_bus = {
68 } 76 }
69}; 77};
70 78
79/*D:140 This is the callback which occurs once the bus infrastructure matches
80 * up a device and driver, ie. in response to add_lguest_device() calling
81 * device_register(), or register_lguest_driver() calling driver_register().
82 *
83 * At the moment it's always the latter: the devices are added first, since
84 * scan_devices() is called from a "core_initcall", and the drivers are
85 * registered later as a normal "initcall". But it would work the other way too.
86 *
87 * So now we have the happy couple, we add the status bit to indicate that we
88 * found a driver. If the driver truly loves the device, it will return
89 * happiness from its probe function (ok, perhaps this wasn't my greatest
90 * analogy), and we set the final "driver ok" bit so the Host sees it's all
91 * green. */
71static int lguest_dev_probe(struct device *_dev) 92static int lguest_dev_probe(struct device *_dev)
72{ 93{
73 int ret; 94 int ret;
74 struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 95 struct lguest_device*dev = container_of(_dev,struct lguest_device,dev);
75 struct lguest_driver *drv = container_of(dev->dev.driver, 96 struct lguest_driver*drv = container_of(dev->dev.driver,
76 struct lguest_driver, drv); 97 struct lguest_driver, drv);
77 98
78 lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER; 99 lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER;
@@ -82,6 +103,10 @@ static int lguest_dev_probe(struct device *_dev)
82 return ret; 103 return ret;
83} 104}
84 105
106/* The last part of the bus infrastructure is the function lguest drivers use
107 * to register themselves. Firstly, we do nothing if there's no lguest bus
108 * (ie. this is not a Guest), otherwise we fill in the embedded generic "struct
109 * driver" fields and call the generic driver_register(). */
85int register_lguest_driver(struct lguest_driver *drv) 110int register_lguest_driver(struct lguest_driver *drv)
86{ 111{
87 if (!lguest_devices) 112 if (!lguest_devices)
@@ -94,12 +119,36 @@ int register_lguest_driver(struct lguest_driver *drv)
94 119
95 return driver_register(&drv->drv); 120 return driver_register(&drv->drv);
96} 121}
122
123/* At the moment we build all the drivers into the kernel because they're so
124 * simple: 8144 bytes for all three of them as I type this. And as the console
125 * really needs to be built in, it's actually only 3527 bytes for the network
126 * and block drivers.
127 *
128 * If they get complex it will make sense for them to be modularized, so we
129 * need to explicitly export the symbol.
130 *
131 * I don't think non-GPL modules make sense, so it's a GPL-only export.
132 */
97EXPORT_SYMBOL_GPL(register_lguest_driver); 133EXPORT_SYMBOL_GPL(register_lguest_driver);
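To see the other half of the handshake, here is a hedged sketch of a minimal lguest driver registering itself. The "struct lguest_driver" field names (name, owner, device_type, probe) are assumptions read off lguest_dev_match() and lguest_dev_probe() above, and the device_type value is made up; treat this as an illustration, not a drop-in module:

        #include <linux/module.h>
        #include <linux/lguest_bus.h>

        /* Called by lguest_dev_probe() once the bus matches us to a device. */
        static int example_probe(struct lguest_device *lgdev)
        {
                /* Returning 0 lets the bus set the "driver ok" status bit. */
                return 0;
        }

        static struct lguest_driver example_drv = {
                .name           = "lguest-example",
                .owner          = THIS_MODULE,
                .device_type    = 42,   /* hypothetical: must match the descriptor */
                .probe          = example_probe,
        };

        static int __init example_init(void)
        {
                return register_lguest_driver(&example_drv);
        }
        module_init(example_init);
        MODULE_LICENSE("GPL");  /* the export above is GPL-only */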
98 134
135/*D:120 This is the core of the lguest bus: actually adding a new device.
136 * It's a separate function because it's neater that way, and because an
137 * earlier version of the code supported hotplug and unplug. They were removed
138 * early on because they were never used.
139 *
140 * As Andrew Tridgell says, "Untested code is buggy code".
141 *
142 * It's worth reading this carefully: we start with an index into the array of
143 * "struct lguest_device_desc"s indicating the device which is new: */
99static void add_lguest_device(unsigned int index) 144static void add_lguest_device(unsigned int index)
100{ 145{
101 struct lguest_device *new; 146 struct lguest_device *new;
102 147
148 /* Each "struct lguest_device_desc" has a "status" field, which the
149 * Guest updates as the device is probed. In the worst case, the Host
150 * can look at these bits to tell what part of device setup failed,
151 * even if the console isn't available. */
103 lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE; 152 lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE;
104 new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL); 153 new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL);
105 if (!new) { 154 if (!new) {
@@ -108,12 +157,17 @@ static void add_lguest_device(unsigned int index)
108 return; 157 return;
109 } 158 }
110 159
160 /* The "struct lguest_device" setup is pretty straight-forward example
161 * code. */
111 new->index = index; 162 new->index = index;
112 new->private = NULL; 163 new->private = NULL;
113 memset(&new->dev, 0, sizeof(new->dev)); 164 memset(&new->dev, 0, sizeof(new->dev));
114 new->dev.parent = &lguest_bus.dev; 165 new->dev.parent = &lguest_bus.dev;
115 new->dev.bus = &lguest_bus.bus; 166 new->dev.bus = &lguest_bus.bus;
116 sprintf(new->dev.bus_id, "%u", index); 167 sprintf(new->dev.bus_id, "%u", index);
168
169 /* device_register() causes the bus infrastructure to look for a
170 * matching driver. */
117 if (device_register(&new->dev) != 0) { 171 if (device_register(&new->dev) != 0) {
118 printk(KERN_EMERG "Cannot register lguest device %u\n", index); 172 printk(KERN_EMERG "Cannot register lguest device %u\n", index);
119 lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; 173 lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
@@ -121,6 +175,9 @@ static void add_lguest_device(unsigned int index)
121 } 175 }
122} 176}
123 177
178/*D:110 scan_devices() simply iterates through the device array. The type 0
179 * is reserved to mean "no device", and anything else means we have found a
180 * device: add it. */
124static void scan_devices(void) 181static void scan_devices(void)
125{ 182{
126 unsigned int i; 183 unsigned int i;
@@ -130,12 +187,23 @@ static void scan_devices(void)
130 add_lguest_device(i); 187 add_lguest_device(i);
131} 188}
132 189
190/*D:100 Fairly early in boot, lguest_bus_init() is called to set up the lguest
191 * bus. We check that we are a Guest by checking paravirt_ops.name: there are
192 * other ways of checking, but this seems most obvious to me.
193 *
194 * So we can access the array of "struct lguest_device_desc"s easily, we map
195 * that memory and store the pointer in the global "lguest_devices". Then we
196 * register the bus with the core. Doing two registrations seems clunky to me,
197 * but it seems to be the correct sysfs incantation.
198 *
199 * Finally we call scan_devices() which adds all the devices found in the
200 * "struct lguest_device_desc" array. */
133static int __init lguest_bus_init(void) 201static int __init lguest_bus_init(void)
134{ 202{
135 if (strcmp(paravirt_ops.name, "lguest") != 0) 203 if (strcmp(paravirt_ops.name, "lguest") != 0)
136 return 0; 204 return 0;
137 205
138 /* Devices are in page above top of "normal" mem. */ 206 /* Devices are in a single page above top of "normal" mem */
139 lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1); 207 lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);
140 208
141 if (bus_register(&lguest_bus.bus) != 0 209 if (bus_register(&lguest_bus.bus) != 0
@@ -145,4 +213,5 @@ static int __init lguest_bus_init(void)
145 scan_devices(); 213 scan_devices();
146 return 0; 214 return 0;
147} 215}
216/* Do this after core stuff, before devices. */
148postcore_initcall(lguest_bus_init); 217postcore_initcall(lguest_bus_init);
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index e90d7a783daf..80d1b58c7698 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -1,36 +1,70 @@
1/* Userspace control of the guest, via /dev/lguest. */ 1/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher
2 * controls and communicates with the Guest. For example, the first write will
3 * tell us the memory size, pagetable, entry point and kernel address offset.
4 * A read will run the Guest until a signal is pending (-EINTR), or the Guest
5 * does a DMA out to the Launcher. Writes are also used to get a DMA buffer
6 * registered by the Guest and to send the Guest an interrupt. :*/
2#include <linux/uaccess.h> 7#include <linux/uaccess.h>
3#include <linux/miscdevice.h> 8#include <linux/miscdevice.h>
4#include <linux/fs.h> 9#include <linux/fs.h>
5#include "lg.h" 10#include "lg.h"
6 11
12/*L:030 setup_regs() doesn't really belong in this file, but it gives us an
13 * early glimpse deeper into the Host so it's worth having here.
14 *
15 * Most of the Guest's registers are left alone: we used get_zeroed_page() to
16 * allocate the structure, so they will be 0. */
7static void setup_regs(struct lguest_regs *regs, unsigned long start) 17static void setup_regs(struct lguest_regs *regs, unsigned long start)
8{ 18{
9 /* Write out stack in format lguest expects, so we can switch to it. */ 19 /* There are four "segment" registers which the Guest needs to boot:
20 * The "code segment" register (cs) refers to the kernel code segment
21 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
22 * refer to the kernel data segment __KERNEL_DS.
23 *
24 * The privilege level is packed into the lower bits. The Guest runs
25 * at privilege level 1 (GUEST_PL).*/
10 regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; 26 regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
11 regs->cs = __KERNEL_CS|GUEST_PL; 27 regs->cs = __KERNEL_CS|GUEST_PL;
12 regs->eflags = 0x202; /* Interrupts enabled. */ 28
29 /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002)
30 * is supposed to always be "1". Bit 9 (0x200) controls whether
31 * interrupts are enabled. We always leave interrupts enabled while
32 * running the Guest. */
33 regs->eflags = 0x202;
34
35 /* The "Extended Instruction Pointer" register says where the Guest is
36 * running. */
13 regs->eip = start; 37 regs->eip = start;
14 /* esi points to our boot information (physical address 0) */ 38
39 /* %esi points to our boot information, at physical address 0, so don't
40 * touch it. */
15} 41}
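As a sanity check on that 0x202, a trivial standalone snippet confirming which bits it sets:

        #include <assert.h>

        int main(void)
        {
                unsigned long eflags = 0x202;

                assert(eflags & 0x002); /* bit 1: reserved, always one */
                assert(eflags & 0x200); /* bit 9: Interrupt Flag, enabled */
                return 0;
        }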
16 42
17/* + addr */ 43/*L:310 To send DMA into the Guest, the Launcher needs to be able to ask for a
44 * DMA buffer. This is done by writing LHREQ_GETDMA and the key to
45 * /dev/lguest. */
18static long user_get_dma(struct lguest *lg, const u32 __user *input) 46static long user_get_dma(struct lguest *lg, const u32 __user *input)
19{ 47{
20 unsigned long key, udma, irq; 48 unsigned long key, udma, irq;
21 49
50 /* Fetch the key they wrote to us. */
22 if (get_user(key, input) != 0) 51 if (get_user(key, input) != 0)
23 return -EFAULT; 52 return -EFAULT;
53 /* Look for a free Guest DMA buffer bound to that key. */
24 udma = get_dma_buffer(lg, key, &irq); 54 udma = get_dma_buffer(lg, key, &irq);
25 if (!udma) 55 if (!udma)
26 return -ENOENT; 56 return -ENOENT;
27 57
28 /* We put irq number in udma->used_len. */ 58 /* We need to tell the Launcher what interrupt the Guest expects after
59 * the buffer is filled. We stash it in udma->used_len. */
29 lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq); 60 lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
61
62 /* The (guest-physical) address of the DMA buffer is returned from
63 * the write(). */
30 return udma; 64 return udma;
31} 65}
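Seen from the Launcher's side, the exchange might look like the sketch below. The LHREQ_GETDMA value is an assumption based on the request name used here, not copied from a header:

        #include <stdint.h>
        #include <unistd.h>

        #define LHREQ_GETDMA 1  /* assumed value for this sketch */

        /* Returns the guest-physical address of a free DMA buffer bound to
         * "key", or a negative result if there isn't one. */
        static long ask_for_dma_buffer(int lguest_fd, uint32_t key)
        {
                uint32_t req[2] = { LHREQ_GETDMA, key };

                return write(lguest_fd, req, sizeof(req));
        }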
32 66
33/* To force the Guest to stop running and return to the Launcher, the 67/*L:315 To force the Guest to stop running and return to the Launcher, the
34 * Waker writes LHREQ_BREAK and the value "1" to /dev/lguest. The 68 * Waker writes LHREQ_BREAK and the value "1" to /dev/lguest. The
35 * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ 69 * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */
36static int break_guest_out(struct lguest *lg, const u32 __user *input) 70static int break_guest_out(struct lguest *lg, const u32 __user *input)
@@ -54,7 +88,8 @@ static int break_guest_out(struct lguest *lg, const u32 __user *input)
54 } 88 }
55} 89}
56 90
57/* + irq */ 91/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
92 * number to /dev/lguest. */
58static int user_send_irq(struct lguest *lg, const u32 __user *input) 93static int user_send_irq(struct lguest *lg, const u32 __user *input)
59{ 94{
60 u32 irq; 95 u32 irq;
@@ -63,14 +98,19 @@ static int user_send_irq(struct lguest *lg, const u32 __user *input)
63 return -EFAULT; 98 return -EFAULT;
64 if (irq >= LGUEST_IRQS) 99 if (irq >= LGUEST_IRQS)
65 return -EINVAL; 100 return -EINVAL;
101 /* Next time the Guest runs, the core code will see if it can deliver
102 * this interrupt. */
66 set_bit(irq, lg->irqs_pending); 103 set_bit(irq, lg->irqs_pending);
67 return 0; 104 return 0;
68} 105}
69 106
107/*L:040 Once our Guest is initialized, the Launcher makes it run by reading
108 * from /dev/lguest. */
70static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) 109static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
71{ 110{
72 struct lguest *lg = file->private_data; 111 struct lguest *lg = file->private_data;
73 112
113 /* You must write LHREQ_INITIALIZE first! */
74 if (!lg) 114 if (!lg)
75 return -EINVAL; 115 return -EINVAL;
76 116
@@ -78,27 +118,52 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
78 if (current != lg->tsk) 118 if (current != lg->tsk)
79 return -EPERM; 119 return -EPERM;
80 120
121 /* If the guest is already dead, we indicate why */
81 if (lg->dead) { 122 if (lg->dead) {
82 size_t len; 123 size_t len;
83 124
125 /* lg->dead either contains an error code, or a string. */
84 if (IS_ERR(lg->dead)) 126 if (IS_ERR(lg->dead))
85 return PTR_ERR(lg->dead); 127 return PTR_ERR(lg->dead);
86 128
129 /* We can only return as much as the buffer they read with. */
87 len = min(size, strlen(lg->dead)+1); 130 len = min(size, strlen(lg->dead)+1);
88 if (copy_to_user(user, lg->dead, len) != 0) 131 if (copy_to_user(user, lg->dead, len) != 0)
89 return -EFAULT; 132 return -EFAULT;
90 return len; 133 return len;
91 } 134 }
92 135
136 /* If we returned from read() last time because the Guest sent DMA,
137 * clear the flag. */
93 if (lg->dma_is_pending) 138 if (lg->dma_is_pending)
94 lg->dma_is_pending = 0; 139 lg->dma_is_pending = 0;
95 140
141 /* Run the Guest until something interesting happens. */
96 return run_guest(lg, (unsigned long __user *)user); 142 return run_guest(lg, (unsigned long __user *)user);
97} 143}
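The Launcher's main loop implied by all this is roughly the following sketch. It assumes run_guest() deposits an unsigned long in the buffer handed to read(), and it elides the actual DMA servicing:

        #include <errno.h>
        #include <unistd.h>

        static void run_guest_loop(int lguest_fd)
        {
                unsigned long notify;

                for (;;) {
                        ssize_t r = read(lguest_fd, &notify, sizeof(notify));

                        if (r < 0 && errno == EINTR)
                                continue;  /* a signal (eg. the Waker) broke us out */
                        if (r < 0)
                                return;    /* if the Guest died, another read() yields the reason */
                        /* Otherwise the Guest sent DMA out: service it here. */
                }
        }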
98 144
99/* Take: pfnlimit, pgdir, start, pageoffset. */ 145/*L:020 The initialization write supplies 4 32-bit values (in addition to the
146 * 32-bit LHREQ_INITIALIZE value). These are:
147 *
148 * pfnlimit: The highest (Guest-physical) page number the Guest should be
149 * allowed to access. The Launcher has to live in Guest memory, so it sets
150 * this to ensure the Guest can't reach it.
151 *
152 * pgdir: The (Guest-physical) address of the top of the initial Guest
153 * pagetables (which are set up by the Launcher).
154 *
155 * start: The first instruction to execute ("eip" in x86-speak).
156 *
157 * page_offset: The PAGE_OFFSET constant in the Guest kernel. We should
158 * probably wean the code off this, but it's a very useful constant! Any
159 * address above this is within the Guest kernel, and any kernel address can
160 * quickly be converted from physical to virtual by adding PAGE_OFFSET. It's
161 * 0xC0000000 (3G) by default, but it's configurable at kernel build time.
162 */
100static int initialize(struct file *file, const u32 __user *input) 163static int initialize(struct file *file, const u32 __user *input)
101{ 164{
165 /* "struct lguest" contains everything we (the Host) know about a
166 * Guest. */
102 struct lguest *lg; 167 struct lguest *lg;
103 int err, i; 168 int err, i;
104 u32 args[4]; 169 u32 args[4];
@@ -106,7 +171,7 @@ static int initialize(struct file *file, const u32 __user *input)
106 /* We grab the Big Lguest lock, which protects the global array 171 /* We grab the Big Lguest lock, which protects the global array
107 * "lguests" and multiple simultaneous initializations. */ 172 * "lguests" and multiple simultaneous initializations. */
108 mutex_lock(&lguest_lock); 173 mutex_lock(&lguest_lock);
109 174 /* You can't initialize twice! Close the device and start again... */
110 if (file->private_data) { 175 if (file->private_data) {
111 err = -EBUSY; 176 err = -EBUSY;
112 goto unlock; 177 goto unlock;
@@ -117,37 +182,70 @@ static int initialize(struct file *file, const u32 __user *input)
117 goto unlock; 182 goto unlock;
118 } 183 }
119 184
185 /* Find an unused guest. */
120 i = find_free_guest(); 186 i = find_free_guest();
121 if (i < 0) { 187 if (i < 0) {
122 err = -ENOSPC; 188 err = -ENOSPC;
123 goto unlock; 189 goto unlock;
124 } 190 }
191 /* OK, we have an index into the "lguest" array: "lg" is a convenient
192 * pointer. */
125 lg = &lguests[i]; 193 lg = &lguests[i];
194
195 /* Populate the easy fields of our "struct lguest" */
126 lg->guestid = i; 196 lg->guestid = i;
127 lg->pfn_limit = args[0]; 197 lg->pfn_limit = args[0];
128 lg->page_offset = args[3]; 198 lg->page_offset = args[3];
199
200 /* We need a complete page for the Guest registers: they are accessible
201 * to the Guest and we can only grant it access to whole pages. */
129 lg->regs_page = get_zeroed_page(GFP_KERNEL); 202 lg->regs_page = get_zeroed_page(GFP_KERNEL);
130 if (!lg->regs_page) { 203 if (!lg->regs_page) {
131 err = -ENOMEM; 204 err = -ENOMEM;
132 goto release_guest; 205 goto release_guest;
133 } 206 }
207 /* We actually put the registers at the bottom of the page. */
134 lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs); 208 lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs);
135 209
210 /* Initialize the Guest's shadow page tables, using the toplevel
211 * address the Launcher gave us. This allocates memory, so can
212 * fail. */
136 err = init_guest_pagetable(lg, args[1]); 213 err = init_guest_pagetable(lg, args[1]);
137 if (err) 214 if (err)
138 goto free_regs; 215 goto free_regs;
139 216
217 /* Now we initialize the Guest's registers, handing it the start
218 * address. */
140 setup_regs(lg->regs, args[2]); 219 setup_regs(lg->regs, args[2]);
220
221 /* There are a couple of GDT entries the Guest expects when first
222 * booting. */
141 setup_guest_gdt(lg); 223 setup_guest_gdt(lg);
224
225 /* The timer for lguest's clock needs initialization. */
142 init_clockdev(lg); 226 init_clockdev(lg);
227
228 /* We keep a pointer to the Launcher task (ie. current task) for when
229 * other Guests want to wake this one (inter-Guest I/O). */
143 lg->tsk = current; 230 lg->tsk = current;
231 /* We need to keep a pointer to the Launcher's memory map, because if
232 * the Launcher dies we need to clean it up. If we don't keep a
233 * reference, it is destroyed before close() is called. */
144 lg->mm = get_task_mm(lg->tsk); 234 lg->mm = get_task_mm(lg->tsk);
235
236 /* Initialize the queue for the waker to wait on */
145 init_waitqueue_head(&lg->break_wq); 237 init_waitqueue_head(&lg->break_wq);
238
239 /* We remember which CPU's pages this Guest used last, for optimization
240 * when the same Guest runs on the same CPU twice. */
146 lg->last_pages = NULL; 241 lg->last_pages = NULL;
242
243 /* We keep our "struct lguest" in the file's private_data. */
147 file->private_data = lg; 244 file->private_data = lg;
148 245
149 mutex_unlock(&lguest_lock); 246 mutex_unlock(&lguest_lock);
150 247
248 /* And because this is a write() call, we return the length used. */
151 return sizeof(args); 249 return sizeof(args);
152 250
153free_regs: 251free_regs:
@@ -159,9 +257,15 @@ unlock:
159 return err; 257 return err;
160} 258}
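Pulling the L:020 description together, the Launcher's first write might look like this sketch (the LHREQ_INITIALIZE value is an assumption, as is the exact error convention):

        #include <stdint.h>
        #include <unistd.h>

        #define LHREQ_INITIALIZE 0      /* assumed value for this sketch */

        static int init_guest(int lguest_fd, uint32_t pfnlimit, uint32_t pgdir,
                              uint32_t start, uint32_t page_offset)
        {
                uint32_t args[] = { LHREQ_INITIALIZE,
                                    pfnlimit, pgdir, start, page_offset };

                return write(lguest_fd, args, sizeof(args)) < 0 ? -1 : 0;
        }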
161 259
260/*L:010 The first operation the Launcher does must be a write. All writes
261 * start with a 32 bit number: for the first write this must be
262 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use
263 * writes of other values to get DMA buffers and send interrupts. */
162static ssize_t write(struct file *file, const char __user *input, 264static ssize_t write(struct file *file, const char __user *input,
163 size_t size, loff_t *off) 265 size_t size, loff_t *off)
164{ 266{
267 /* Once the guest is initialized, we hold the "struct lguest" in the
268 * file private data. */
165 struct lguest *lg = file->private_data; 269 struct lguest *lg = file->private_data;
166 u32 req; 270 u32 req;
167 271
@@ -169,8 +273,11 @@ static ssize_t write(struct file *file, const char __user *input,
169 return -EFAULT; 273 return -EFAULT;
170 input += sizeof(req); 274 input += sizeof(req);
171 275
276 /* If you haven't initialized, you must do that first. */
172 if (req != LHREQ_INITIALIZE && !lg) 277 if (req != LHREQ_INITIALIZE && !lg)
173 return -EINVAL; 278 return -EINVAL;
279
280 /* Once the Guest is dead, all you can do is read() why it died. */
174 if (lg && lg->dead) 281 if (lg && lg->dead)
175 return -ENOENT; 282 return -ENOENT;
176 283
@@ -192,33 +299,72 @@ static ssize_t write(struct file *file, const char __user *input,
192 } 299 }
193} 300}
194 301
302/*L:060 The final piece of interface code is the close() routine. It reverses
303 * everything done in initialize(). This is usually called because the
304 * Launcher exited.
305 *
306 * Note that the close routine returns 0 or a negative error number: it can't
307 * really fail, but it can whine. I blame Sun for this wart, and K&R C for
308 * letting them do it. :*/
195static int close(struct inode *inode, struct file *file) 309static int close(struct inode *inode, struct file *file)
196{ 310{
197 struct lguest *lg = file->private_data; 311 struct lguest *lg = file->private_data;
198 312
313 /* If we never successfully initialized, there's nothing to clean up */
199 if (!lg) 314 if (!lg)
200 return 0; 315 return 0;
201 316
317 /* We need the big lock, to protect from inter-guest I/O and other
318 * Launchers initializing guests. */
202 mutex_lock(&lguest_lock); 319 mutex_lock(&lguest_lock);
203 /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ 320 /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
204 hrtimer_cancel(&lg->hrt); 321 hrtimer_cancel(&lg->hrt);
322 /* Free any DMA buffers the Guest had bound. */
205 release_all_dma(lg); 323 release_all_dma(lg);
324 /* Free up the shadow page tables for the Guest. */
206 free_guest_pagetable(lg); 325 free_guest_pagetable(lg);
326 /* Now all the memory cleanups are done, it's safe to release the
327 * Launcher's memory management structure. */
207 mmput(lg->mm); 328 mmput(lg->mm);
329 /* If lg->dead doesn't contain an error code it will be NULL or a
330 * kmalloc()ed string, either of which is ok to hand to kfree(). */
208 if (!IS_ERR(lg->dead)) 331 if (!IS_ERR(lg->dead))
209 kfree(lg->dead); 332 kfree(lg->dead);
333 /* We can free up the register page we allocated. */
210 free_page(lg->regs_page); 334 free_page(lg->regs_page);
335 /* We clear the entire structure, which also marks it as free for the
336 * next user. */
211 memset(lg, 0, sizeof(*lg)); 337 memset(lg, 0, sizeof(*lg));
338 /* Release lock and exit. */
212 mutex_unlock(&lguest_lock); 339 mutex_unlock(&lguest_lock);
340
213 return 0; 341 return 0;
214} 342}
215 343
344/*L:000
345 * Welcome to our journey through the Launcher!
346 *
347 * The Launcher is the Host userspace program which sets up, runs and services
348 * the Guest. In fact, many comments in the Drivers which refer to "the Host"
349 * doing things are inaccurate: the Launcher does all the device handling for
350 * the Guest. The Guest can't tell what's done by the Launcher and what by
351 * the Host.
352 *
353 * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we
354 * shall see more of that later.
355 *
356 * We begin our understanding with the Host kernel interface which the Launcher
357 * uses: reading and writing a character device called /dev/lguest. All the
358 * work happens in the read(), write() and close() routines: */
216static struct file_operations lguest_fops = { 359static struct file_operations lguest_fops = {
217 .owner = THIS_MODULE, 360 .owner = THIS_MODULE,
218 .release = close, 361 .release = close,
219 .write = write, 362 .write = write,
220 .read = read, 363 .read = read,
221}; 364};
365
366/* This is a textbook example of a "misc" character device. Populate a "struct
367 * miscdevice" and register it with misc_register(). */
222static struct miscdevice lguest_dev = { 368static struct miscdevice lguest_dev = {
223 .minor = MISC_DYNAMIC_MINOR, 369 .minor = MISC_DYNAMIC_MINOR,
224 .name = "lguest", 370 .name = "lguest",
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 1b0ba09b1269..b7a924ace684 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -1,5 +1,11 @@
1/* Shadow page table operations. 1/*P:700 The pagetable code, on the other hand, still shows the scars of
2 * Copyright (C) Rusty Russell IBM Corporation 2006. 2 * previous encounters. It's functional, and as neat as it can be in the
3 * circumstances, but be wary, for these things are subtle and break easily.
4 * The Guest provides a virtual to physical mapping, but we can neither trust
5 * it nor use it: we verify and convert it here to point the hardware to the
6 * actual Guest pages when running the Guest. :*/
7
8/* Copyright (C) Rusty Russell IBM Corporation 2006.
3 * GPL v2 and any later version */ 9 * GPL v2 and any later version */
4#include <linux/mm.h> 10#include <linux/mm.h>
5#include <linux/types.h> 11#include <linux/types.h>
@@ -9,38 +15,96 @@
9#include <asm/tlbflush.h> 15#include <asm/tlbflush.h>
10#include "lg.h" 16#include "lg.h"
11 17
18/*M:008 We hold references to pages, which prevents them from being swapped.
19 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
20 * to swap out. If we had this, and a shrinker callback to trim PTE pages, we
21 * could probably consider launching Guests as non-root. :*/
22
23/*H:300
24 * The Page Table Code
25 *
26 * We use two-level page tables for the Guest. If you're not entirely
27 * comfortable with virtual addresses, physical addresses and page tables then
28 * I recommend you review lguest.c's "Page Table Handling" (with diagrams!).
29 *
30 * The Guest keeps page tables, but we maintain the actual ones here: these are
31 * called "shadow" page tables. Which is a very Guest-centric name: these are
32 * the real page tables the CPU uses, although we keep them up to date to
33 * reflect the Guest's. (See what I mean about weird naming? Since when do
34 * shadows reflect anything?)
35 *
36 * Anyway, this is the most complicated part of the Host code. There are seven
37 * parts to this:
38 * (i) Setting up a page table entry for the Guest when it faults,
39 * (ii) Setting up the page table entry for the Guest stack,
40 * (iii) Setting up a page table entry when the Guest tells us it has changed,
41 * (iv) Switching page tables,
42 * (v) Flushing (throwing away) page tables,
43 * (vi) Mapping the Switcher when the Guest is about to run,
44 * (vii) Setting up the page tables initially.
45 :*/
46
47/* Pages are 4k long, and each page table entry is 4 bytes long, giving us 1024
48 * (or 2^10) entries per page. */
12#define PTES_PER_PAGE_SHIFT 10 49#define PTES_PER_PAGE_SHIFT 10
13#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT) 50#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
51
52/* 1024 entries in a page table page map 1024 pages: 4MB. The Switcher is
53 * conveniently placed at the top 4MB, so it uses a separate, complete PTE
54 * page. */
14#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1) 55#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1)
15 56
57/* We actually need a separate PTE page for each CPU. Remember that after the
58 * Switcher code itself come two pages for each CPU, and we don't want this
59 * CPU's guest to see the pages of any other CPU. */
16static DEFINE_PER_CPU(spte_t *, switcher_pte_pages); 60static DEFINE_PER_CPU(spte_t *, switcher_pte_pages);
17#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) 61#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
18 62
63/*H:320 With our shadow and Guest types established, we need to deal with
64 * them: the page table code is curly enough to need helper functions to keep
65 * it clear and clean.
66 *
67 * The first helper takes a virtual address, and says which entry in the top
68 * level page table deals with that address. Since each top level entry deals
69 * with 4M, this effectively divides by 4M. */
19static unsigned vaddr_to_pgd_index(unsigned long vaddr) 70static unsigned vaddr_to_pgd_index(unsigned long vaddr)
20{ 71{
21 return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); 72 return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
22} 73}
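Since each top level entry covers 4MB, here is a standalone check of the divide-by-4M claim (with PAGE_SHIFT being 12):

        #include <assert.h>

        int main(void)
        {
                /* 12 + 10 = 22 bits: one pgd entry per 4MB. */
                assert((1UL << 22) == 4UL * 1024 * 1024);
                /* The usual 0xC0000000 kernel boundary lands at entry 768. */
                assert((0xC0000000UL >> 22) == 768);
                return 0;
        }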
23 74
24/* These access the shadow versions (ie. the ones used by the CPU). */ 75/* There are two functions which return pointers to the shadow (aka "real")
76 * page tables.
77 *
78 * spgd_addr() takes the virtual address and returns a pointer to the top-level
79 * page directory entry for that address. Since we keep track of several page
80 * tables, the "i" argument tells us which one we're interested in (it's
81 * usually the current one). */
25static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) 82static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
26{ 83{
27 unsigned int index = vaddr_to_pgd_index(vaddr); 84 unsigned int index = vaddr_to_pgd_index(vaddr);
28 85
86 /* We kill any Guest trying to touch the Switcher addresses. */
29 if (index >= SWITCHER_PGD_INDEX) { 87 if (index >= SWITCHER_PGD_INDEX) {
30 kill_guest(lg, "attempt to access switcher pages"); 88 kill_guest(lg, "attempt to access switcher pages");
31 index = 0; 89 index = 0;
32 } 90 }
91 /* Return a pointer to the index'th pgd entry for the i'th page table. */
33 return &lg->pgdirs[i].pgdir[index]; 92 return &lg->pgdirs[i].pgdir[index];
34} 93}
35 94
95/* This routine then takes the PGD entry given above, which contains the
96 * address of the PTE page. It then returns a pointer to the PTE entry for the
97 * given address. */
36static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr) 98static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr)
37{ 99{
38 spte_t *page = __va(spgd.pfn << PAGE_SHIFT); 100 spte_t *page = __va(spgd.pfn << PAGE_SHIFT);
101 /* You should never call this if the PGD entry wasn't valid */
39 BUG_ON(!(spgd.flags & _PAGE_PRESENT)); 102 BUG_ON(!(spgd.flags & _PAGE_PRESENT));
40 return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; 103 return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
41} 104}
42 105
43/* These access the guest versions. */ 106/* These two functions are just like the above two, except they access the Guest
107 * page tables. Hence they return a Guest address. */
44static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) 108static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
45{ 109{
46 unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); 110 unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
@@ -55,12 +119,24 @@ static unsigned long gpte_addr(struct lguest *lg,
55 return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); 119 return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t);
56} 120}
57 121
58/* Do a virtual -> physical mapping on a user page. */ 122/*H:350 This routine takes a page number given by the Guest and converts it to
123 * an actual, physical page number. It can fail for several reasons: the
124 * virtual address might not be mapped by the Launcher, the write flag is set
125 * and the page is read-only, or the write flag was set and the page was
126 * shared so had to be copied, but we ran out of memory.
127 *
128 * This holds a reference to the page, so release_pte() is careful to
129 * put that back. */
59static unsigned long get_pfn(unsigned long virtpfn, int write) 130static unsigned long get_pfn(unsigned long virtpfn, int write)
60{ 131{
61 struct page *page; 132 struct page *page;
133 /* This value indicates failure. */
62 unsigned long ret = -1UL; 134 unsigned long ret = -1UL;
63 135
136 /* get_user_pages() is a complex interface: it gets the "struct
137 * vm_area_struct" and "struct page" assocated with a range of pages.
138 * It also needs the task's mmap_sem held, and is not very quick.
139 * It returns the number of pages it got. */
64 down_read(&current->mm->mmap_sem); 140 down_read(&current->mm->mmap_sem);
65 if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT, 141 if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
66 1, write, 1, &page, NULL) == 1) 142 1, write, 1, &page, NULL) == 1)
@@ -69,28 +145,47 @@ static unsigned long get_pfn(unsigned long virtpfn, int write)
69 return ret; 145 return ret;
70} 146}
71 147
148/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table
149 * entry can be a little tricky. The flags are (almost) the same, but the
150 * Guest PTE contains a virtual page number: the CPU needs the real page
151 * number. */
72static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) 152static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write)
73{ 153{
74 spte_t spte; 154 spte_t spte;
75 unsigned long pfn; 155 unsigned long pfn;
76 156
77 /* We ignore the global flag. */ 157 /* The Guest sets the global flag, because it thinks that it is using
158 * PGE. We only told it to use PGE so it would tell us whether it was
159 * flushing a kernel mapping or a userspace mapping. We don't actually
160 * use the global bit, so throw it away. */
78 spte.flags = (gpte.flags & ~_PAGE_GLOBAL); 161 spte.flags = (gpte.flags & ~_PAGE_GLOBAL);
162
163 /* We need a temporary "unsigned long" variable to hold the answer from
164 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
165 * fit in spte.pfn. get_pfn() finds the real physical number of the
166 * page, given the virtual number. */
79 pfn = get_pfn(gpte.pfn, write); 167 pfn = get_pfn(gpte.pfn, write);
80 if (pfn == -1UL) { 168 if (pfn == -1UL) {
81 kill_guest(lg, "failed to get page %u", gpte.pfn); 169 kill_guest(lg, "failed to get page %u", gpte.pfn);
82 /* Must not put_page() bogus page on cleanup. */ 170 /* When we destroy the Guest, we'll go through the shadow page
171 * tables and release_pte() them. Make sure we don't think
172 * this one is valid! */
83 spte.flags = 0; 173 spte.flags = 0;
84 } 174 }
175 /* Now we assign the page number, and our shadow PTE is complete. */
85 spte.pfn = pfn; 176 spte.pfn = pfn;
86 return spte; 177 return spte;
87} 178}
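A toy standalone model of the pfn/flags split this conversion relies on: in a 32-bit non-PAE PTE the low 12 bits are flags and the rest is the page frame number:

        #include <assert.h>
        #include <stdint.h>

        int main(void)
        {
                /* pfn 0x12345 with PRESENT|RW|USER|ACCESSED|DIRTY (0x067). */
                uint32_t pte = (0x12345 << 12) | 0x067;

                assert((pte >> 12) == 0x12345);
                assert((pte & 0xFFF) == 0x067);
                return 0;
        }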
88 179
180/*H:460 And to complete the chain, release_pte() looks like this: */
89static void release_pte(spte_t pte) 181static void release_pte(spte_t pte)
90{ 182{
183 /* Remember that get_user_pages() took a reference to the page, in
184 * get_pfn()? We have to put it back now. */
91 if (pte.flags & _PAGE_PRESENT) 185 if (pte.flags & _PAGE_PRESENT)
92 put_page(pfn_to_page(pte.pfn)); 186 put_page(pfn_to_page(pte.pfn));
93} 187}
188/*:*/
94 189
95static void check_gpte(struct lguest *lg, gpte_t gpte) 190static void check_gpte(struct lguest *lg, gpte_t gpte)
96{ 191{
@@ -104,11 +199,16 @@ static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
104 kill_guest(lg, "bad page directory entry"); 199 kill_guest(lg, "bad page directory entry");
105} 200}
106 201
107/* FIXME: We hold reference to pages, which prevents them from being 202/*H:330
108 swapped. It'd be nice to have a callback when Linux wants to swap out. */ 203 * (i) Setting up a page table entry for the Guest when it faults
109 204 *
110/* We fault pages in, which allows us to update accessed/dirty bits. 205 * We saw this call in run_guest(): when we see a page fault in the Guest, we
111 * Return true if we got page. */ 206 * come here. That's because we only set up the shadow page tables lazily as
207 * they're needed, so we get page faults all the time and quietly fix them up
208 * and return to the Guest without it knowing.
209 *
210 * If we fixed up the fault (ie. we mapped the address), this routine returns
211 * true. */
112int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) 212int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
113{ 213{
114 gpgd_t gpgd; 214 gpgd_t gpgd;
@@ -117,106 +217,161 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
117 gpte_t gpte; 217 gpte_t gpte;
118 spte_t *spte; 218 spte_t *spte;
119 219
220 /* First step: get the top-level Guest page table entry. */
120 gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); 221 gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
222 /* Toplevel not present? We can't map it in. */
121 if (!(gpgd.flags & _PAGE_PRESENT)) 223 if (!(gpgd.flags & _PAGE_PRESENT))
122 return 0; 224 return 0;
123 225
226 /* Now look at the matching shadow entry. */
124 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 227 spgd = spgd_addr(lg, lg->pgdidx, vaddr);
125 if (!(spgd->flags & _PAGE_PRESENT)) { 228 if (!(spgd->flags & _PAGE_PRESENT)) {
126 /* Get a page of PTEs for them. */ 229 /* No shadow entry: allocate a new shadow PTE page. */
127 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 230 unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
128 /* FIXME: Steal from self in this case? */ 231 /* This is not really the Guest's fault, but killing it is
232 * simple for this corner case. */
129 if (!ptepage) { 233 if (!ptepage) {
130 kill_guest(lg, "out of memory allocating pte page"); 234 kill_guest(lg, "out of memory allocating pte page");
131 return 0; 235 return 0;
132 } 236 }
237 /* We check that the Guest pgd is OK. */
133 check_gpgd(lg, gpgd); 238 check_gpgd(lg, gpgd);
239 /* And we copy the flags to the shadow PGD entry. The page
240 * number in the shadow PGD is the page we just allocated. */
134 spgd->raw.val = (__pa(ptepage) | gpgd.flags); 241 spgd->raw.val = (__pa(ptepage) | gpgd.flags);
135 } 242 }
136 243
244 /* OK, now we look at the lower level in the Guest page table: keep its
245 * address, because we might update it later. */
137 gpte_ptr = gpte_addr(lg, gpgd, vaddr); 246 gpte_ptr = gpte_addr(lg, gpgd, vaddr);
138 gpte = mkgpte(lgread_u32(lg, gpte_ptr)); 247 gpte = mkgpte(lgread_u32(lg, gpte_ptr));
139 248
140 /* No page? */ 249 /* If this page isn't in the Guest page tables, we can't page it in. */
141 if (!(gpte.flags & _PAGE_PRESENT)) 250 if (!(gpte.flags & _PAGE_PRESENT))
142 return 0; 251 return 0;
143 252
144 /* Write to read-only page? */ 253 /* Check they're not trying to write to a page the Guest wants
254 * read-only (errcode & 2 means it was a write). */
145 if ((errcode & 2) && !(gpte.flags & _PAGE_RW)) 255 if ((errcode & 2) && !(gpte.flags & _PAGE_RW))
146 return 0; 256 return 0;
147 257
148 /* User access to a non-user page? */ 258 /* User access to a kernel page? (errcode & 4 means user access) */
149 if ((errcode & 4) && !(gpte.flags & _PAGE_USER)) 259 if ((errcode & 4) && !(gpte.flags & _PAGE_USER))
150 return 0; 260 return 0;
151 261
262 /* Check that the Guest PTE flags are OK, and the page number is below
263 * the pfn_limit (ie. not mapping the Launcher binary). */
152 check_gpte(lg, gpte); 264 check_gpte(lg, gpte);
265 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
153 gpte.flags |= _PAGE_ACCESSED; 266 gpte.flags |= _PAGE_ACCESSED;
154 if (errcode & 2) 267 if (errcode & 2)
155 gpte.flags |= _PAGE_DIRTY; 268 gpte.flags |= _PAGE_DIRTY;
156 269
157 /* We're done with the old pte. */ 270 /* Get the pointer to the shadow PTE entry we're going to set. */
158 spte = spte_addr(lg, *spgd, vaddr); 271 spte = spte_addr(lg, *spgd, vaddr);
272 /* If there was a valid shadow PTE entry here before, we release it.
273 * This can happen with a write to a previously read-only entry. */
159 release_pte(*spte); 274 release_pte(*spte);
160 275
161 /* We don't make it writable if this isn't a write: later 276 /* If this is a write, we insist that the Guest page is writable (the
162 * write will fault so we can set dirty bit in guest. */ 277 * final arg to gpte_to_spte()). */
163 if (gpte.flags & _PAGE_DIRTY) 278 if (gpte.flags & _PAGE_DIRTY)
164 *spte = gpte_to_spte(lg, gpte, 1); 279 *spte = gpte_to_spte(lg, gpte, 1);
165 else { 280 else {
281 /* If this is a read, don't set the "writable" bit in the page
282 * table entry, even if the Guest says it's writable. That way
283 * we come back here when a write does actually occur, so we can
284 * update the Guest's _PAGE_DIRTY flag. */
166 gpte_t ro_gpte = gpte; 285 gpte_t ro_gpte = gpte;
167 ro_gpte.flags &= ~_PAGE_RW; 286 ro_gpte.flags &= ~_PAGE_RW;
168 *spte = gpte_to_spte(lg, ro_gpte, 0); 287 *spte = gpte_to_spte(lg, ro_gpte, 0);
169 } 288 }
170 289
171 /* Now we update dirty/accessed on guest. */ 290 /* Finally, we write the Guest PTE entry back: we've set the
291 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
172 lgwrite_u32(lg, gpte_ptr, gpte.raw.val); 292 lgwrite_u32(lg, gpte_ptr, gpte.raw.val);
293
294 /* We succeeded in mapping the page! */
173 return 1; 295 return 1;
174} 296}
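For reference, the errcode bits tested above are the standard x86 page fault error code bits; naming them makes the checks read more easily (a sketch: the code itself just uses 2 and 4):

        #define FAULT_PRESENT   0x1     /* protection fault on a present page */
        #define FAULT_WRITE     0x2     /* the faulting access was a write */
        #define FAULT_USER      0x4     /* the CPU was at user privilege */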
175 297
176/* This is much faster than the full demand_page logic. */ 298/*H:360 (ii) Setting up the page table entry for the Guest stack.
299 *
300 * Remember pin_stack_pages() which makes sure the stack is mapped? It could
301 * simply call demand_page(), but as we've seen that logic is quite long, and
302 * usually the stack pages are already mapped anyway, so the full logic isn't required.
303 *
304 * This is a quick version which answers the question: is this virtual address
305 * mapped by the shadow page tables, and is it writable? */
177static int page_writable(struct lguest *lg, unsigned long vaddr) 306static int page_writable(struct lguest *lg, unsigned long vaddr)
178{ 307{
179 spgd_t *spgd; 308 spgd_t *spgd;
180 unsigned long flags; 309 unsigned long flags;
181 310
311 /* Look at the top level entry: is it present? */
182 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 312 spgd = spgd_addr(lg, lg->pgdidx, vaddr);
183 if (!(spgd->flags & _PAGE_PRESENT)) 313 if (!(spgd->flags & _PAGE_PRESENT))
184 return 0; 314 return 0;
185 315
316 /* Check the flags on the pte entry itself: it must be present and
317 * writable. */
186 flags = spte_addr(lg, *spgd, vaddr)->flags; 318 flags = spte_addr(lg, *spgd, vaddr)->flags;
187 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 319 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
188} 320}
189 321
322/* So, when pin_stack_pages() asks us to pin a page, we check if it's already
323 * in the page tables, and if not, we call demand_page() with error code 2
324 * (meaning "write"). */
190void pin_page(struct lguest *lg, unsigned long vaddr) 325void pin_page(struct lguest *lg, unsigned long vaddr)
191{ 326{
192 if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2)) 327 if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2))
193 kill_guest(lg, "bad stack page %#lx", vaddr); 328 kill_guest(lg, "bad stack page %#lx", vaddr);
194} 329}
195 330
331/*H:450 If we chase down the release_pgd() code, it looks like this: */
196static void release_pgd(struct lguest *lg, spgd_t *spgd) 332static void release_pgd(struct lguest *lg, spgd_t *spgd)
197{ 333{
334 /* If the entry's not present, there's nothing to release. */
198 if (spgd->flags & _PAGE_PRESENT) { 335 if (spgd->flags & _PAGE_PRESENT) {
199 unsigned int i; 336 unsigned int i;
337 /* Converting the pfn to find the actual PTE page is easy: turn
338 * the page number into a physical address, then convert to a
339 * virtual address (easy for kernel pages like this one). */
200 spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT); 340 spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT);
341 /* For each entry in the page, we might need to release it. */
201 for (i = 0; i < PTES_PER_PAGE; i++) 342 for (i = 0; i < PTES_PER_PAGE; i++)
202 release_pte(ptepage[i]); 343 release_pte(ptepage[i]);
344 /* Now we can free the page of PTEs */
203 free_page((long)ptepage); 345 free_page((long)ptepage);
346 /* And zero out the PGD entry so we never release it twice. */
204 spgd->raw.val = 0; 347 spgd->raw.val = 0;
205 } 348 }
206} 349}
207 350
351/*H:440 (v) Flushing (throwing away) page tables,
352 *
353 * We saw flush_user_mappings() called when we re-used a top-level pgdir page.
354 * It simply releases every PTE page from 0 up to the kernel address. */
208static void flush_user_mappings(struct lguest *lg, int idx) 355static void flush_user_mappings(struct lguest *lg, int idx)
209{ 356{
210 unsigned int i; 357 unsigned int i;
358 /* Release every pgd entry up to the kernel's address. */
211 for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++) 359 for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++)
212 release_pgd(lg, lg->pgdirs[idx].pgdir + i); 360 release_pgd(lg, lg->pgdirs[idx].pgdir + i);
213} 361}
214 362
363/* The Guest also has a hypercall to do this manually: it's used when a large
364 * number of mappings have been changed. */
215void guest_pagetable_flush_user(struct lguest *lg) 365void guest_pagetable_flush_user(struct lguest *lg)
216{ 366{
367 /* Drop the userspace part of the current page table. */
217 flush_user_mappings(lg, lg->pgdidx); 368 flush_user_mappings(lg, lg->pgdidx);
218} 369}
370/*:*/
219 371
372/* We keep several page tables. This is a simple routine to find the page
373 * table (if any) corresponding to this top-level address the Guest has given
374 * us. */
220static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) 375static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
221{ 376{
222 unsigned int i; 377 unsigned int i;
@@ -226,21 +381,30 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
226 return i; 381 return i;
227} 382}
228 383
384/*H:435 And this is us, creating the new page directory. If we really do
385 * allocate a new one (and so the kernel parts are not there), we set
386 * blank_pgdir. */
229static unsigned int new_pgdir(struct lguest *lg, 387static unsigned int new_pgdir(struct lguest *lg,
230 unsigned long cr3, 388 unsigned long cr3,
231 int *blank_pgdir) 389 int *blank_pgdir)
232{ 390{
233 unsigned int next; 391 unsigned int next;
234 392
393 /* We pick one entry at random to throw out. Choosing the Least
394 * Recently Used might be better, but this is easy. */
235 next = random32() % ARRAY_SIZE(lg->pgdirs); 395 next = random32() % ARRAY_SIZE(lg->pgdirs);
396 /* If it's never been allocated at all before, try now. */
236 if (!lg->pgdirs[next].pgdir) { 397 if (!lg->pgdirs[next].pgdir) {
237 lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL); 398 lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL);
399 /* If the allocation fails, just keep using the one we have */
238 if (!lg->pgdirs[next].pgdir) 400 if (!lg->pgdirs[next].pgdir)
239 next = lg->pgdidx; 401 next = lg->pgdidx;
240 else 402 else
241 /* There are no mappings: you'll need to re-pin */ 403 /* This is a blank page, so there are no kernel
404 * mappings: caller must map the stack! */
242 *blank_pgdir = 1; 405 *blank_pgdir = 1;
243 } 406 }
407 /* Record which Guest toplevel this shadows. */
244 lg->pgdirs[next].cr3 = cr3; 408 lg->pgdirs[next].cr3 = cr3;
245 /* Release all the non-kernel mappings. */ 409 /* Release all the non-kernel mappings. */
246 flush_user_mappings(lg, next); 410 flush_user_mappings(lg, next);
@@ -248,82 +412,161 @@ static unsigned int new_pgdir(struct lguest *lg,
248 return next; 412 return next;
249} 413}
250 414
415/*H:430 (iv) Switching page tables
416 *
417 * This is what happens when the Guest changes page tables (ie. changes the
418 * top-level pgdir). This happens on almost every context switch. */
251void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) 419void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
252{ 420{
253 int newpgdir, repin = 0; 421 int newpgdir, repin = 0;
254 422
423 /* Look to see if we have this one already. */
255 newpgdir = find_pgdir(lg, pgtable); 424 newpgdir = find_pgdir(lg, pgtable);
425 /* If not, we allocate or mug an existing one: if it's a fresh one,
426 * repin gets set to 1. */
256 if (newpgdir == ARRAY_SIZE(lg->pgdirs)) 427 if (newpgdir == ARRAY_SIZE(lg->pgdirs))
257 newpgdir = new_pgdir(lg, pgtable, &repin); 428 newpgdir = new_pgdir(lg, pgtable, &repin);
429 /* Change the current pgd index to the new one. */
258 lg->pgdidx = newpgdir; 430 lg->pgdidx = newpgdir;
431 /* If it was completely blank, we map in the Guest kernel stack */
259 if (repin) 432 if (repin)
260 pin_stack_pages(lg); 433 pin_stack_pages(lg);
261} 434}
262 435
436/*H:470 Finally, a routine which throws away everything: all PGD entries in all
437 * the shadow page tables. This is used when we destroy the Guest. */
263static void release_all_pagetables(struct lguest *lg) 438static void release_all_pagetables(struct lguest *lg)
264{ 439{
265 unsigned int i, j; 440 unsigned int i, j;
266 441
442 /* Every shadow pagetable this Guest has */
267 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 443 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
268 if (lg->pgdirs[i].pgdir) 444 if (lg->pgdirs[i].pgdir)
445 /* Every PGD entry except the Switcher at the top */
269 for (j = 0; j < SWITCHER_PGD_INDEX; j++) 446 for (j = 0; j < SWITCHER_PGD_INDEX; j++)
270 release_pgd(lg, lg->pgdirs[i].pgdir + j); 447 release_pgd(lg, lg->pgdirs[i].pgdir + j);
271} 448}
272 449
450/* We also throw away everything when a Guest tells us it's changed a kernel
451 * mapping. Since kernel mappings are in every page table, it's easiest to
452 * throw them all away. This is amazingly slow, but thankfully rare. */
273void guest_pagetable_clear_all(struct lguest *lg) 453void guest_pagetable_clear_all(struct lguest *lg)
274{ 454{
275 release_all_pagetables(lg); 455 release_all_pagetables(lg);
456 /* We need the Guest kernel stack mapped again. */
276 pin_stack_pages(lg); 457 pin_stack_pages(lg);
277} 458}
278 459
460/*H:420 This is the routine which actually sets the page table entry for the
461 * "idx"'th shadow page table.
462 *
463 * Normally, we can just throw out the old entry and replace it with 0: if they
464 * use it demand_page() will put the new entry in. We need to do this anyway:
465 * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
466 * is read from, and _PAGE_DIRTY when it's written to.
467 *
468 * But Avi Kivity pointed out that most Operating Systems (Linux included) set
469 * these bits on PTEs immediately anyway. This is done to save the CPU from
470 * having to update them, but it helps us the same way: if they set
471 * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
472 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
473 */
279static void do_set_pte(struct lguest *lg, int idx, 474static void do_set_pte(struct lguest *lg, int idx,
280 unsigned long vaddr, gpte_t gpte) 475 unsigned long vaddr, gpte_t gpte)
281{ 476{
477 /* Look up the matching shadow page directory entry. */
282 spgd_t *spgd = spgd_addr(lg, idx, vaddr); 478 spgd_t *spgd = spgd_addr(lg, idx, vaddr);
479
480 /* If the top level isn't present, there's no entry to update. */
283 if (spgd->flags & _PAGE_PRESENT) { 481 if (spgd->flags & _PAGE_PRESENT) {
482 /* Otherwise, we start by releasing the existing entry. */
284 spte_t *spte = spte_addr(lg, *spgd, vaddr); 483 spte_t *spte = spte_addr(lg, *spgd, vaddr);
285 release_pte(*spte); 484 release_pte(*spte);
485
486 /* If they're setting this entry as dirty or accessed, we might
487 * as well put that entry they've given us in now. This shaves
488 * 10% off a copy-on-write micro-benchmark. */
286 if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 489 if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
287 check_gpte(lg, gpte); 490 check_gpte(lg, gpte);
288 *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY); 491 *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY);
289 } else 492 } else
493 /* Otherwise we can demand_page() it in later. */
290 spte->raw.val = 0; 494 spte->raw.val = 0;
291 } 495 }
292} 496}
293 497
498/*H:410 Updating a PTE entry is a little trickier.
499 *
500 * We keep track of several different page tables (the Guest uses one for each
501 * process, so it makes sense to cache at least a few). Each of these have
502 * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
503 * all processes. So when the page table above that address changes, we update
504 * all the page tables, not just the current one. This is rare.
505 *
506 * The benefit is that when we have to track a new page table, we can keep
507 * all the kernel mappings. This speeds up context switches immensely. */
294void guest_set_pte(struct lguest *lg, 508void guest_set_pte(struct lguest *lg,
295 unsigned long cr3, unsigned long vaddr, gpte_t gpte) 509 unsigned long cr3, unsigned long vaddr, gpte_t gpte)
296{ 510{
297 /* Kernel mappings must be changed on all top levels. */ 511 /* Kernel mappings must be changed on all top levels. Slow, but
512 * doesn't happen often. */
298 if (vaddr >= lg->page_offset) { 513 if (vaddr >= lg->page_offset) {
299 unsigned int i; 514 unsigned int i;
300 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 515 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
301 if (lg->pgdirs[i].pgdir) 516 if (lg->pgdirs[i].pgdir)
302 do_set_pte(lg, i, vaddr, gpte); 517 do_set_pte(lg, i, vaddr, gpte);
303 } else { 518 } else {
519 /* Is this page table one we have a shadow for? */
304 int pgdir = find_pgdir(lg, cr3); 520 int pgdir = find_pgdir(lg, cr3);
305 if (pgdir != ARRAY_SIZE(lg->pgdirs)) 521 if (pgdir != ARRAY_SIZE(lg->pgdirs))
522 /* If so, do the update. */
306 do_set_pte(lg, pgdir, vaddr, gpte); 523 do_set_pte(lg, pgdir, vaddr, gpte);
307 } 524 }
308} 525}
309 526
527/*H:400
528 * (iii) Setting up a page table entry when the Guest tells us it has changed.
529 *
530 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
531 * with the other side of page tables while we're here: what happens when the
532 * Guest asks for a page table to be updated?
533 *
534 * We already saw that demand_page() will fill in the shadow page tables when
535 * needed, so we can simply remove shadow page table entries whenever the Guest
536 * tells us they've changed. When the Guest tries to use the new entry it will
537 * fault and demand_page() will fix it up.
538 *
539 * So with that in mind, here's our code to update a (top-level) PGD entry:
540 */
310void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) 541void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
311{ 542{
312 int pgdir; 543 int pgdir;
313 544
545 /* The kernel seems to try to initialize this early on: we ignore its
546 * attempts to map over the Switcher. */
314 if (idx >= SWITCHER_PGD_INDEX) 547 if (idx >= SWITCHER_PGD_INDEX)
315 return; 548 return;
316 549
550 /* If they're talking about a page table we have a shadow for... */
317 pgdir = find_pgdir(lg, cr3); 551 pgdir = find_pgdir(lg, cr3);
318 if (pgdir < ARRAY_SIZE(lg->pgdirs)) 552 if (pgdir < ARRAY_SIZE(lg->pgdirs))
553 /* ... throw it away. */
319 release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); 554 release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
320} 555}
321 556
557/*H:500 (vii) Setting up the page tables initially.
558 *
559 * When a Guest is first created, the Launcher tells us where the toplevel of
560 * its first page table is. We set some things up here: */
322int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) 561int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
323{ 562{
324 /* We assume this in flush_user_mappings, so check now */ 563 /* In flush_user_mappings() we loop from 0 to
564 * "vaddr_to_pgd_index(lg->page_offset)". This assumes it won't hit
565 * the Switcher mappings, so check that now. */
325 if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) 566 if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
326 return -EINVAL; 567 return -EINVAL;
568 /* We start on the first shadow page table, and give it a blank PGD
569 * page. */
327 lg->pgdidx = 0; 570 lg->pgdidx = 0;
328 lg->pgdirs[lg->pgdidx].cr3 = pgtable; 571 lg->pgdirs[lg->pgdidx].cr3 = pgtable;
329 lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL); 572 lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL);
@@ -332,33 +575,48 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
332 return 0; 575 return 0;
333} 576}
334 577
578/* When a Guest dies, our cleanup is fairly simple. */
335void free_guest_pagetable(struct lguest *lg) 579void free_guest_pagetable(struct lguest *lg)
336{ 580{
337 unsigned int i; 581 unsigned int i;
338 582
583 /* Throw away all page table pages. */
339 release_all_pagetables(lg); 584 release_all_pagetables(lg);
585 /* Now free the top levels: free_page() can handle 0 just fine. */
340 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 586 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
341 free_page((long)lg->pgdirs[i].pgdir); 587 free_page((long)lg->pgdirs[i].pgdir);
342} 588}
343 589
344/* Caller must be preempt-safe */ 590/*H:480 (vi) Mapping the Switcher when the Guest is about to run.
591 *
592 * The Switcher and the two pages for this CPU need to be available to the
593 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
594 * for each CPU already set up, we just need to hook them in. */
345void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) 595void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
346{ 596{
347 spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 597 spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
348 spgd_t switcher_pgd; 598 spgd_t switcher_pgd;
349 spte_t regs_pte; 599 spte_t regs_pte;
350 600
351 /* Since switcher less that 4MB, we simply mug top pte page. */ 601 /* Make the last PGD entry for this Guest point to the Switcher's PTE
602 * page for this CPU (with appropriate flags). */
352 switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; 603 switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT;
353 switcher_pgd.flags = _PAGE_KERNEL; 604 switcher_pgd.flags = _PAGE_KERNEL;
354 lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; 605 lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
355 606
356 /* Map our regs page over stack page. */ 607 /* We also change the Switcher PTE page. When we're running the Guest,
608 * we want the Guest's "regs" page to appear where the first Switcher
609 * page for this CPU is. This is an optimization: when the Switcher
610 * saves the Guest registers, it saves them into the first page of this
611 * CPU's "struct lguest_pages": if we make sure the Guest's register
612 * page is already mapped there, we don't have to copy them out
613 * again. */
357 regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; 614 regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT;
358 regs_pte.flags = _PAGE_KERNEL; 615 regs_pte.flags = _PAGE_KERNEL;
359 switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE] 616 switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE]
360 = regs_pte; 617 = regs_pte;
361} 618}
619/*:*/
362 620
363static void free_switcher_pte_pages(void) 621static void free_switcher_pte_pages(void)
364{ 622{
@@ -368,6 +626,10 @@ static void free_switcher_pte_pages(void)
368 free_page((long)switcher_pte_page(i)); 626 free_page((long)switcher_pte_page(i));
369} 627}
370 628
629/*H:520 Setting up the Switcher PTE page for a given CPU is fairly easy, given
630 * the CPU number and the "struct page"s for the Switcher code itself.
631 *
632 * Currently the Switcher is less than a page long, so "pages" is always 1. */
371static __init void populate_switcher_pte_page(unsigned int cpu, 633static __init void populate_switcher_pte_page(unsigned int cpu,
372 struct page *switcher_page[], 634 struct page *switcher_page[],
373 unsigned int pages) 635 unsigned int pages)
@@ -375,21 +637,26 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
375 unsigned int i; 637 unsigned int i;
376 spte_t *pte = switcher_pte_page(cpu); 638 spte_t *pte = switcher_pte_page(cpu);
377 639
640 /* The first entries are easy: they map the Switcher code. */
378 for (i = 0; i < pages; i++) { 641 for (i = 0; i < pages; i++) {
379 pte[i].pfn = page_to_pfn(switcher_page[i]); 642 pte[i].pfn = page_to_pfn(switcher_page[i]);
380 pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; 643 pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
381 } 644 }
382 645
383 /* We only map this CPU's pages, so guest can't see others. */ 646 /* The only other thing we map is this CPU's pair of pages. */
384 i = pages + cpu*2; 647 i = pages + cpu*2;
385 648
386 /* First page (regs) is rw, second (state) is ro. */ 649 /* First page (Guest registers) is writable from the Guest */
387 pte[i].pfn = page_to_pfn(switcher_page[i]); 650 pte[i].pfn = page_to_pfn(switcher_page[i]);
388 pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; 651 pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW;
652 /* The second page contains the "struct lguest_ro_state", and is
653 * read-only. */
389 pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); 654 pte[i+1].pfn = page_to_pfn(switcher_page[i+1]);
390 pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; 655 pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
391} 656}
392 657
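To make that layout concrete: with a single Switcher code page (so "pages"
is 1, as the comment above notes), CPU 0's pair lands at PTE indices 1 and
2, CPU 1's at 3 and 4, and so on. A sketch of the rule, not in the patch:

	/* Index of a CPU's writable (regs) page; the read-only state page
	 * always follows at the next index. */
	static unsigned int guest_pair_index(unsigned int cpu,
					     unsigned int pages)
	{
		return pages + cpu * 2;
	}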
658/*H:510 At boot or module load time, init_pagetables() allocates and populates
659 * the Switcher PTE page for each CPU. */
393__init int init_pagetables(struct page **switcher_page, unsigned int pages) 660__init int init_pagetables(struct page **switcher_page, unsigned int pages)
394{ 661{
395 unsigned int i; 662 unsigned int i;
@@ -404,7 +671,9 @@ __init int init_pagetables(struct page **switcher_page, unsigned int pages)
404 } 671 }
405 return 0; 672 return 0;
406} 673}
674/*:*/
407 675
676/* Cleaning up simply involves freeing the PTE page for each CPU. */
408void free_pagetables(void) 677void free_pagetables(void)
409{ 678{
410 free_switcher_pte_pages(); 679 free_switcher_pte_pages();
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index 1b2cfe89dcd5..f675a41a80da 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -1,16 +1,68 @@
1/*P:600 The x86 architecture has segments, which involve a table of descriptors
2 * which can be used to do funky things with virtual address interpretation.
3 * We originally used segments so the Guest couldn't alter the
4 * Guest<->Host Switcher, and then we had to trim Guest segments, and restore
5 * them for userspace per-thread segments, but trim again on userspace->kernel
6 * transitions... This nightmarish creation was contained within this file,
7 * where we knew not to tread without heavy armament and a change of underwear.
8 *
9 * In these modern times, the segment handling code consists of simple sanity
10 * checks, and the worst you'll experience reading this code is butterfly-rash
11 * from frolicking through its parklike serenity. :*/
1#include "lg.h" 12#include "lg.h"
2 13
14/*H:600
15 * We've almost completed the Host; there's just one file to go!
16 *
17 * Segments & The Global Descriptor Table
18 *
19 * (That title sounds like a bad Nerdcore group. Not to suggest that there are
20 * any good Nerdcore groups, but in high school a friend of mine had a band
21 * called Joe Fish and the Chips, so there are definitely worse band names).
22 *
23 * To refresh: the GDT is a table of 8-byte values describing segments. Once
24 * set up, these segments can be loaded into one of the 6 "segment registers".
25 *
26 * GDT entries are passed around as "struct desc_struct"s, which, like IDT
27 * entries, are split into two 32-bit members, "a" and "b". One day, someone
28 * will clean that up, and be declared a Hero. (No pressure, I'm just saying).
29 *
30 * Anyway, the GDT entry contains a base (the start address of the segment), a
31 * limit (the size of the segment - 1), and some flags. Sounds simple, and it
32 * would be, except those zany Intel engineers decided that it was too boring
33 * to put the base at one end, the limit at the other, and the flags in
34 * between. They decided to shotgun the bits at random throughout the 8 bytes,
35 * like so:
36 *
37 * 0               16                      40       48  52  56   63
38 * [ limit part 1 ][      base part 1     ][ flags ][li][fl][base ]
39 *                                                   mit ags part 2
40 *                                                   part 2
41 *
42 * As a result, this file contains a certain amount of magic numeracy. Let's
43 * begin.
44 */
45
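Before wading into the magic numeracy, it may help to see the unscrambling
written out. A sketch (not part of the patch) which pulls the base and limit
back out of the two words, following the bit positions above:

	static unsigned long desc_base(const struct desc_struct *d)
	{
		/* Base: descriptor bits 16-39, then 56-63, reassembled. */
		return (d->a >> 16) | ((d->b & 0xFF) << 16)
		       | (d->b & 0xFF000000);
	}

	static unsigned long desc_limit(const struct desc_struct *d)
	{
		/* Limit: descriptor bits 0-15, plus bits 48-51. */
		return (d->a & 0xFFFF) | (d->b & 0x000F0000);
	}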
46/* Is the descriptor the Guest wants us to put in OK?
47 *
48 * The flag which Intel says must be zero: must be zero. The descriptor must
49 * be present (this is actually checked earlier but is here for thoroughness),
50 * and the descriptor type must be 1 (a memory segment). */
3static int desc_ok(const struct desc_struct *gdt) 51static int desc_ok(const struct desc_struct *gdt)
4{ 52{
5 /* MBZ=0, P=1, DT=1 */
6 return ((gdt->b & 0x00209000) == 0x00009000); 53 return ((gdt->b & 0x00209000) == 0x00009000);
7} 54}
8 55
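For the curious, that mask decodes as follows; a sketch (again, not part of
the patch) with the flag positions in the high word "b" spelled out:

	static int desc_ok_verbose(const struct desc_struct *gdt)
	{
		int mbz_clear = !(gdt->b & 0x00200000); /* bit 21: MBZ */
		int present   =   gdt->b & 0x00008000;  /* bit 15: P   */
		int memseg    =   gdt->b & 0x00001000;  /* bit 12: DT  */
		return mbz_clear && present && memseg;
	}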
56/* Is the segment present? (Otherwise it can't be used by the Guest). */
9static int segment_present(const struct desc_struct *gdt) 57static int segment_present(const struct desc_struct *gdt)
10{ 58{
11 return gdt->b & 0x8000; 59 return gdt->b & 0x8000;
12} 60}
13 61
62/* There are several entries we don't let the Guest set. The TSS entry is the
63 * "Task State Segment" which controls all kinds of delicate things. The
64 * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the
65 * Guest can't be trusted to deal with double faults. */
14static int ignored_gdt(unsigned int num) 66static int ignored_gdt(unsigned int num)
15{ 67{
16 return (num == GDT_ENTRY_TSS 68 return (num == GDT_ENTRY_TSS
@@ -19,9 +71,18 @@ static int ignored_gdt(unsigned int num)
19 || num == GDT_ENTRY_DOUBLEFAULT_TSS); 71 || num == GDT_ENTRY_DOUBLEFAULT_TSS);
20} 72}
21 73
22/* We don't allow removal of CS, DS or SS; it doesn't make sense. */ 74/* If the Guest asks us to remove an entry from the GDT, we have to be careful.
75 * If one of the segment registers is pointing at that entry the Switcher will
76 * crash when it tries to reload the segment registers for the Guest.
77 *
78 * It doesn't make much sense for the Guest to try to remove its own code, data
79 * or stack segments while they're in use: assume that's a Guest bug. If it's
80 * one of the lesser segment registers using the removed entry, we simply set
81 * that register to 0 (unusable). */
23static void check_segment_use(struct lguest *lg, unsigned int desc) 82static void check_segment_use(struct lguest *lg, unsigned int desc)
24{ 83{
84 /* GDT entries are 8 bytes long, so we divide to get the index and
85 * ignore the bottom bits. */
25 if (lg->regs->gs / 8 == desc) 86 if (lg->regs->gs / 8 == desc)
26 lg->regs->gs = 0; 87 lg->regs->gs = 0;
27 if (lg->regs->fs / 8 == desc) 88 if (lg->regs->fs / 8 == desc)
@@ -33,13 +94,21 @@ static void check_segment_use(struct lguest *lg, unsigned int desc)
33 || lg->regs->ss / 8 == desc) 94 || lg->regs->ss / 8 == desc)
34 kill_guest(lg, "Removed live GDT entry %u", desc); 95 kill_guest(lg, "Removed live GDT entry %u", desc);
35} 96}
36 97/*:*/
98/*M:009 We wouldn't need to check for removal of in-use segments if we handled
99 * faults in the Switcher. However, it's probably not a worthwhile
100 * optimization. :*/
101
102/*H:610 Once the GDT has been changed, we look through the changed entries and
103 * see if they're OK. If not, we'll call kill_guest() and the Guest will never
104 * get to use the invalid entries. */
37static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) 105static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
38{ 106{
39 unsigned int i; 107 unsigned int i;
40 108
41 for (i = start; i < end; i++) { 109 for (i = start; i < end; i++) {
42 /* We never copy these ones to real gdt */ 110 /* We never copy these ones to the real GDT, so we don't care what
111 * they say. */
43 if (ignored_gdt(i)) 112 if (ignored_gdt(i))
44 continue; 113 continue;
45 114
@@ -53,41 +122,57 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
53 if (!desc_ok(&lg->gdt[i])) 122 if (!desc_ok(&lg->gdt[i]))
54 kill_guest(lg, "Bad GDT descriptor %i", i); 123 kill_guest(lg, "Bad GDT descriptor %i", i);
55 124
56 /* DPL 0 presumably means "for use by guest". */ 125 /* Segment descriptors contain a privilege level: the Guest is
126 * sometimes careless and leaves this as 0, even though it's
127 * running at privilege level 1. If so, we fix it here. */
57 if ((lg->gdt[i].b & 0x00006000) == 0) 128 if ((lg->gdt[i].b & 0x00006000) == 0)
58 lg->gdt[i].b |= (GUEST_PL << 13); 129 lg->gdt[i].b |= (GUEST_PL << 13);
59 130
60 /* Set accessed bit, since gdt isn't writable. */ 131 /* Each descriptor has an "accessed" bit. If we don't set it
132 * now, the CPU will try to set it when the Guest first loads
133 * that entry into a segment register. But the GDT isn't
134 * writable by the Guest, so bad things can happen. */
61 lg->gdt[i].b |= 0x00000100; 135 lg->gdt[i].b |= 0x00000100;
62 } 136 }
63} 137}
64 138
139/* This routine is called at boot or modprobe time for each CPU to set up the
140 * "constant" GDT entries for Guests running on that CPU. */
65void setup_default_gdt_entries(struct lguest_ro_state *state) 141void setup_default_gdt_entries(struct lguest_ro_state *state)
66{ 142{
67 struct desc_struct *gdt = state->guest_gdt; 143 struct desc_struct *gdt = state->guest_gdt;
68 unsigned long tss = (unsigned long)&state->guest_tss; 144 unsigned long tss = (unsigned long)&state->guest_tss;
69 145
70 /* Hypervisor segments. */ 146 /* The hypervisor segments are full 0-4G segments, privilege level 0 */
71 gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 147 gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
72 gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 148 gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
73 149
74 /* This is the one which we *cannot* copy from guest, since tss 150 /* The TSS segment refers to the TSS entry for this CPU, so we cannot
75 is depended on this lguest_ro_state, ie. this cpu. */ 151 * copy it from the Guest. Forgive the magic flags */
76 gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); 152 gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
77 gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) 153 gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
78 | ((tss >> 16) & 0x000000FF); 154 | ((tss >> 16) & 0x000000FF);
79} 155}
80 156
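FULL_EXEC_SEGMENT and FULL_SEGMENT come from lg.h; roughly (an assumption of
this walkthrough, since the hunk doesn't show them) they are flat 0-4G
descriptors:

	/* Limit 0xFFFFF in 4K granularity = 4GB, base 0, present, DPL 0;
	 * type 0xb is code (read/exec), type 0x3 is data (read/write). */
	#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
	#define FULL_SEGMENT	  ((struct desc_struct){0x0000ffff, 0x00cf9300})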
157/* This routine is called before the Guest is run for the first time. */
81void setup_guest_gdt(struct lguest *lg) 158void setup_guest_gdt(struct lguest *lg)
82{ 159{
160 /* Start with full 0-4G segments... */
83 lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; 161 lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
84 lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; 162 lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
163 /* ...except the Guest is allowed to use them, so set the privilege
164 * level appropriately in the flags. */
85 lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); 165 lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
86 lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 166 lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
87} 167}
88 168
89/* This is a fast version for the common case where only the three TLS entries 169/* Like the IDT, we never simply use the GDT the Guest gives us. We set up the
90 * have changed. */ 170 * GDTs for each CPU, then we copy across the entries each time we want to run
171 * a different Guest on that CPU. */
172
173/* A partial GDT load, for the three "thread-local storage" entries. Otherwise
174 * it's just like load_guest_gdt(). So much so, in fact, that it would
175 * probably be neater to have a single hypercall to cover both. */
91void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) 176void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
92{ 177{
93 unsigned int i; 178 unsigned int i;
@@ -96,22 +181,31 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
96 gdt[i] = lg->gdt[i]; 181 gdt[i] = lg->gdt[i];
97} 182}
98 183
184/* This is the full version */
99void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) 185void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
100{ 186{
101 unsigned int i; 187 unsigned int i;
102 188
189 /* The default entries from setup_default_gdt_entries() are not
190 * replaced. See ignored_gdt() above. */
103 for (i = 0; i < GDT_ENTRIES; i++) 191 for (i = 0; i < GDT_ENTRIES; i++)
104 if (!ignored_gdt(i)) 192 if (!ignored_gdt(i))
105 gdt[i] = lg->gdt[i]; 193 gdt[i] = lg->gdt[i];
106} 194}
107 195
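If someone did take up that single-hypercall suggestion, these two copy
routines could likewise collapse into one range-based helper. A speculative
sketch only:

	static void copy_gdt_range(const struct lguest *lg,
				   struct desc_struct *gdt,
				   unsigned int start, unsigned int end)
	{
		unsigned int i;

		for (i = start; i < end; i++)
			if (!ignored_gdt(i))
				gdt[i] = lg->gdt[i];
	}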
196/* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */
108void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) 197void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
109{ 198{
199 /* We assume the Guest has the same number of GDT entries as the
200 * Host; otherwise we'd have to dynamically allocate the Guest GDT. */
110 if (num > ARRAY_SIZE(lg->gdt)) 201 if (num > ARRAY_SIZE(lg->gdt))
111 kill_guest(lg, "too many gdt entries %i", num); 202 kill_guest(lg, "too many gdt entries %i", num);
112 203
204 /* We read the whole thing in, then fix it up. */
113 lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0])); 205 lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0]));
114 fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt)); 206 fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt));
207 /* Mark that the GDT changed so the core knows it has to copy it again,
208 * even if the Guest is run on the same CPU. */
115 lg->changed |= CHANGED_GDT; 209 lg->changed |= CHANGED_GDT;
116} 210}
117 211
@@ -123,3 +217,13 @@ void guest_load_tls(struct lguest *lg, unsigned long gtls)
123 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); 217 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
124 lg->changed |= CHANGED_GDT_TLS; 218 lg->changed |= CHANGED_GDT_TLS;
125} 219}
220
221/*
222 * With this, we have finished the Host.
223 *
224 * Five of the seven parts of our task are complete. You have made it through
225 * the Bit of Despair (I think that's somewhere in the page table code,
226 * myself).
227 *
228 * Next, we examine "make Switcher". It's short, but intense.
229 */
diff --git a/drivers/lguest/switcher.S b/drivers/lguest/switcher.S
index eadd4cc299d2..d418179ea6b5 100644
--- a/drivers/lguest/switcher.S
+++ b/drivers/lguest/switcher.S
@@ -1,45 +1,136 @@
1/* This code sits at 0xFFC00000 to do the low-level guest<->host switch. 1/*P:900 This is the Switcher: code which sits at 0xFFC00000 to do the low-level
2 * Guest<->Host switch. It is as simple as it can be made, but it's naturally
3 * very specific to x86.
4 *
5 * You have now completed Preparation. If this has whetted your appetite; if you
6 * are feeling invigorated and refreshed then the next, more challenging stage
7 * can be found in "make Guest". :*/
2 8
3 There is are two pages above us for this CPU (struct lguest_pages). 9/*S:100
4 The second page (struct lguest_ro_state) becomes read-only after the 10 * Welcome to the Switcher itself!
5 context switch. The first page (the stack for traps) remains writable, 11 *
6 but while we're in here, the guest cannot be running. 12 * This file contains the low-level code which changes the CPU to run the Guest
7*/ 13 * code, and returns to the Host when something happens. Understand this, and
14 * you understand the heart of our journey.
15 *
16 * Because this is in assembler rather than C, our tale switches from prose to
17 * verse. First I tried limericks:
18 *
19 * There once was an eax reg,
20 * To which our pointer was fed,
21 * It needed an add,
22 * Which asm-offsets.h had
23 * But this limerick is hurting my head.
24 *
25 * Next I tried haikus, but fitting the required reference to the seasons in
26 * every stanza was quickly becoming tiresome:
27 *
28 * The %eax reg
29 * Holds "struct lguest_pages" now:
30 * Cherry blossoms fall.
31 *
32 * Then I started with Heroic Verse, but the rhyming requirement leached away
33 * the content density and led to some uniquely awful oblique rhymes:
34 *
35 * These constants are coming from struct offsets
36 * For use within the asm switcher text.
37 *
38 * Finally, I settled for something between heroic hexameter and normal prose
39 * with inappropriate linebreaks. Anyway, it ain't no Shakespeare.
40 */
41
42// Not all kernel headers work from assembler
43// But these ones are needed: the ENTRY() define
44// And constants extracted from struct offsets
45// To avoid magic numbers and breakage:
46// Should they change the compiler can't save us
47// Down here in the depths of assembler code.
8#include <linux/linkage.h> 48#include <linux/linkage.h>
9#include <asm/asm-offsets.h> 49#include <asm/asm-offsets.h>
10#include "lg.h" 50#include "lg.h"
11 51
52// We mark the start of the code to copy
53// It's placed in .text tho it's never run here
54// You'll see the trick macro at the end
55// Which interleaves data and text to effect.
12.text 56.text
13ENTRY(start_switcher_text) 57ENTRY(start_switcher_text)
14 58
15/* %eax points to lguest pages for this CPU. %ebx contains cr3 value. 59// When we reach switch_to_guest we have just left
16 All normal registers can be clobbered! */ 60// The safe and comforting shores of C code
61// %eax has the "struct lguest_pages" to use
62// Where we save state and still see it from the Guest
63// And %ebx holds the Guest shadow pagetable:
64// Once set we have truly left Host behind.
17ENTRY(switch_to_guest) 65ENTRY(switch_to_guest)
18 /* Save host segments on host stack. */ 66 // We told gcc all its regs could fade,
67 // Clobbered by our journey into the Guest
68 // We could have saved them, if we tried
69 // But time is our master and cycles count.
70
71 // Segment registers must be saved for the Host
72 // We push them on the Host stack for later
19 pushl %es 73 pushl %es
20 pushl %ds 74 pushl %ds
21 pushl %gs 75 pushl %gs
22 pushl %fs 76 pushl %fs
23 /* With CONFIG_FRAME_POINTER, gcc doesn't let us clobber this! */ 77 // But the compiler is fickle, and heeds
78 // No warning of %ebp clobbers
79 // When frame pointers are used. That register
80 // Must be saved and restored or chaos strikes.
24 pushl %ebp 81 pushl %ebp
25 /* Save host stack. */ 82 // The Host's stack is done, now save it away
83 // In our "struct lguest_pages" at offset
84 // Distilled into asm-offsets.h
26 movl %esp, LGUEST_PAGES_host_sp(%eax) 85 movl %esp, LGUEST_PAGES_host_sp(%eax)
27 /* Switch to guest stack: if we get NMI we expect to be there. */ 86
87 // All saved and there's now five steps before us:
88 // Stack, GDT, IDT, TSS
89 // And last of all the page tables are flipped.
90
91 // Yet beware that our stack pointer must be
92 // Always valid lest an NMI hit
93 // %edx does the duty here as we juggle
94 // %eax is lguest_pages: our stack lies within.
28 movl %eax, %edx 95 movl %eax, %edx
29 addl $LGUEST_PAGES_regs, %edx 96 addl $LGUEST_PAGES_regs, %edx
30 movl %edx, %esp 97 movl %edx, %esp
31 /* Switch to guest's GDT, IDT. */ 98
99 // The Guest's GDT we so carefully
100 // Placed in the "struct lguest_pages" before
32 lgdt LGUEST_PAGES_guest_gdt_desc(%eax) 101 lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
102
103 // The Guest's IDT we did partially
104 // Move to the "struct lguest_pages" as well.
33 lidt LGUEST_PAGES_guest_idt_desc(%eax) 105 lidt LGUEST_PAGES_guest_idt_desc(%eax)
34 /* Switch to guest's TSS while GDT still writable. */ 106
107 // The TSS entry which controls traps
108 // Must be loaded up with "ltr" now:
109 // For after we switch over our page tables
110 // It (as the rest) will be writable no more.
111 // (The GDT entry for the TSS
112 // Changes type when we load it: damn Intel!)
35 movl $(GDT_ENTRY_TSS*8), %edx 113 movl $(GDT_ENTRY_TSS*8), %edx
36 ltr %dx 114 ltr %dx
37 /* Set host's TSS GDT entry to available (clear byte 5 bit 2). */ 115
116 // Look back now, before we take this last step!
117 // The Host's TSS entry was also marked used;
118 // Let's clear it again, ere we return.
119 // The GDT descriptor of the Host
120 // Points to the table after two "size" bytes
38 movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx 121 movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
122 // Clear the type field of "used" (byte 5, bit 2)
39 andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) 123 andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
40 /* Switch to guest page tables: lguest_pages->state now read-only. */ 124
125 // Once our page table's switched, the Guest is live!
126 // The Host fades as we run this final step.
127 // Our "struct lguest_pages" is now read-only.
41 movl %ebx, %cr3 128 movl %ebx, %cr3
42 /* Restore guest regs */ 129
130 // The page table change did one tricky thing:
131 // The Guest's register page has been mapped
132 // Writable onto our %esp (stack) --
133 // We can simply pop off all Guest regs.
43 popl %ebx 134 popl %ebx
44 popl %ecx 135 popl %ecx
45 popl %edx 136 popl %edx
@@ -51,12 +142,27 @@ ENTRY(switch_to_guest)
51 popl %fs 142 popl %fs
52 popl %ds 143 popl %ds
53 popl %es 144 popl %es
54 /* Skip error code and trap number */ 145
146 // Near the base of the stack lurk two strange fields
147 // Which we fill as we exit the Guest
148 // These are the trap number and its error
149 // We can simply step past them on our way.
55 addl $8, %esp 150 addl $8, %esp
151
152 // The last five stack slots hold return address
153 // And everything needed to change privilege
154 // Into the Guest privilege level of 1,
155 // And the stack where the Guest had last left it.
156 // Interrupts are turned back on: we are Guest.
56 iret 157 iret
57 158
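// (A sketch of those five slots, drawn from
// The x86 manuals rather than this patch,
// In the order iret consumes them:
//	EIP	-- where the Guest will resume
//	CS	-- Guest code segment, privilege 1
//	EFLAGS	-- with IF set: interrupts on
//	ESP	-- the stack the Guest last used
//	SS	-- the Guest stack segment.)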
159// There are two paths where we switch to the Host
160// So we put the routine in a macro.
161// We are on our way home, back to the Host
162// Interrupted out of the Guest, we come here.
58#define SWITCH_TO_HOST \ 163#define SWITCH_TO_HOST \
59 /* Save guest state */ \ 164 /* We save the Guest state: all registers first \
165 * Laid out just as "struct lguest_regs" defines */ \
60 pushl %es; \ 166 pushl %es; \
61 pushl %ds; \ 167 pushl %ds; \
62 pushl %fs; \ 168 pushl %fs; \
@@ -68,58 +174,119 @@ ENTRY(switch_to_guest)
68 pushl %edx; \ 174 pushl %edx; \
69 pushl %ecx; \ 175 pushl %ecx; \
70 pushl %ebx; \ 176 pushl %ebx; \
71 /* Load lguest ds segment for convenience. */ \ 177 /* Our stack and our code are using segments \
178 * Set in the TSS and IDT \
179 * Yet if we were to touch data we'd use \
180 * Whatever data segment the Guest had. \
181 * Load the lguest ds segment for now. */ \
72 movl $(LGUEST_DS), %eax; \ 182 movl $(LGUEST_DS), %eax; \
73 movl %eax, %ds; \ 183 movl %eax, %ds; \
74 /* Figure out where we are, based on stack (at top of regs). */ \ 184 /* So where are we? Which CPU, which struct? \
185 * The stack is our clue: our TSS sets \
186 * It at the end of "struct lguest_pages" \
187 * And we then pushed and pushed and pushed Guest regs: \
188 * Now stack points atop the "struct lguest_regs". \
189 * Subtract that offset, and we find our struct. */ \
75 movl %esp, %eax; \ 190 movl %esp, %eax; \
76 subl $LGUEST_PAGES_regs, %eax; \ 191 subl $LGUEST_PAGES_regs, %eax; \
77 /* Put trap number in %ebx before we switch cr3 and lose it. */ \ 192 /* Save our trap number: the switch will obscure it \
193 * (The Guest regs are not mapped here in the Host) \
194 * %ebx holds it safe for deliver_to_host */ \
78 movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ 195 movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
79 /* Switch to host page tables (host GDT, IDT and stack are in host \ 196 /* The Host GDT, IDT and stack! \
80 mem, so need this first) */ \ 197 * All these lie safely hidden from the Guest: \
198 * We must return to the Host page tables \
199 * (Hence that was saved in struct lguest_pages) */ \
81 movl LGUEST_PAGES_host_cr3(%eax), %edx; \ 200 movl LGUEST_PAGES_host_cr3(%eax), %edx; \
82 movl %edx, %cr3; \ 201 movl %edx, %cr3; \
83 /* Set guest's TSS to available (clear byte 5 bit 2). */ \ 202 /* As before, when we looked back at the Host \
203 * As we left and marked TSS unused \
204 * So must we now for the Guest left behind. */ \
84 andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \ 205 andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
85 /* Switch to host's GDT & IDT. */ \ 206 /* Switch to Host's GDT, IDT. */ \
86 lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ 207 lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
87 lidt LGUEST_PAGES_host_idt_desc(%eax); \ 208 lidt LGUEST_PAGES_host_idt_desc(%eax); \
88 /* Switch to host's stack. */ \ 209 /* Restore the Host's stack where its saved regs lie */ \
89 movl LGUEST_PAGES_host_sp(%eax), %esp; \ 210 movl LGUEST_PAGES_host_sp(%eax), %esp; \
90 /* Switch to host's TSS */ \ 211 /* Last the TSS: our Host is complete */ \
91 movl $(GDT_ENTRY_TSS*8), %edx; \ 212 movl $(GDT_ENTRY_TSS*8), %edx; \
92 ltr %dx; \ 213 ltr %dx; \
214 /* Restore now the regs saved right at the first. */ \
93 popl %ebp; \ 215 popl %ebp; \
94 popl %fs; \ 216 popl %fs; \
95 popl %gs; \ 217 popl %gs; \
96 popl %ds; \ 218 popl %ds; \
97 popl %es 219 popl %es
98 220
99/* Return to run_guest_once. */ 221// Here's where we come when the Guest has just trapped:
222// (Which trap we'll see has been pushed on the stack).
223// We need only switch back, and the Host will decode
224// Why we came home, and what needs to be done.
100return_to_host: 225return_to_host:
101 SWITCH_TO_HOST 226 SWITCH_TO_HOST
102 iret 227 iret
103 228
229// An interrupt, with some cause external
230// Has jerked us rudely from the Guest's code
231// Again we must return home to the Host
104deliver_to_host: 232deliver_to_host:
105 SWITCH_TO_HOST 233 SWITCH_TO_HOST
106 /* Decode IDT and jump to hosts' irq handler. When that does iret, it 234 // But now we must go home via that place
107 * will return to run_guest_once. This is a feature. */ 235 // Where that interrupt was supposed to go
236 // Had we not been ensconced, running the Guest.
237 // Here we see the cleverness of our stack:
238 // The Host stack is formed like an interrupt
239 // With EIP, CS and EFLAGS layered.
240 // Interrupt handlers end with "iret"
241 // And that will take us home at long long last.
242
243 // But first we must find the handler to call!
244 // The IDT descriptor for the Host
245 // Has two bytes for size, and four for address:
246 // %edx will hold it for us for now.
108 movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx 247 movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
248 // We now know the table address we need,
249 // And saved the trap's number inside %ebx.
250 // Yet the pointer to the handler is smeared
251 // Across the bits of the table entry.
252 // What oracle can tell us how to extract
253 // From such a convoluted encoding?
254 // I consulted gcc, and it gave
255 // These instructions, which I gladly credit:
109 leal (%edx,%ebx,8), %eax 256 leal (%edx,%ebx,8), %eax
110 movzwl (%eax),%edx 257 movzwl (%eax),%edx
111 movl 4(%eax), %eax 258 movl 4(%eax), %eax
112 xorw %ax, %ax 259 xorw %ax, %ax
113 orl %eax, %edx 260 orl %eax, %edx
261 // Now the address of the handler's in %edx
262 // We call it now: its "iret" takes us home.
114 jmp *%edx 263 jmp *%edx
115 264
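// (In C, those five instructions amount to --
// A sketch, assuming the gate's two 32-bit words:
//	handler = (word0 & 0x0000FFFF)
//		| (word1 & 0xFFFF0000);
// The offset's halves live at either end.)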
116/* Real hardware interrupts are delivered straight to the host. Others 265// Every interrupt can come to us here
117 cause us to return to run_guest_once so it can decide what to do. Note 266// But we must truly tell each apart.
118 that some of these are overridden by the guest to deliver directly, and 267// They number two hundred and fifty six
119 never enter here (see load_guest_idt_entry). */ 268// And each must land in a different spot,
269// Push its number on stack, and join the stream.
270
271// And worse, a mere seven of the traps stand apart
272// And push on their stack an addition:
273// An error number, thirty two bits long
274// So we punish the other two fifty
275// And make them push a zero so they match.
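// (Which seven? An aside from the x86 manuals,
// Not from this patch: traps 8, 10 through 14
// And 17: DF, TS, NP, SS, GP, PF and AC.)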
276
277// Yet two fifty six entries is long
278// And all will look most the same as the last
279// So we create a macro which can make
280// As many entries as we need to fill.
281
282// Note the change to .data then .text:
283// We plant the address of each entry
284// Into a (data) table for the Host
285// To know where each Guest interrupt should go.
120.macro IRQ_STUB N TARGET 286.macro IRQ_STUB N TARGET
121 .data; .long 1f; .text; 1: 287 .data; .long 1f; .text; 1:
122 /* Make an error number for most traps, which don't have one. */ 288 // Trap eight, ten through fourteen and seventeen
289 // Supply an error number. Else zero.
123 .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) 290 .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
124 pushl $0 291 pushl $0
125 .endif 292 .endif
@@ -128,6 +295,8 @@ deliver_to_host:
128 ALIGN 295 ALIGN
129.endm 296.endm
130 297
298// This macro creates numerous entries
299// Using GAS macros which out-power C's.
131.macro IRQ_STUBS FIRST LAST TARGET 300.macro IRQ_STUBS FIRST LAST TARGET
132 irq=\FIRST 301 irq=\FIRST
133 .rept \LAST-\FIRST+1 302 .rept \LAST-\FIRST+1
@@ -136,24 +305,43 @@ deliver_to_host:
136 .endr 305 .endr
137.endm 306.endm
138 307
139/* We intercept every interrupt, because we may need to switch back to 308// Here's the marker for our pointer table
140 * host. Unfortunately we can't tell them apart except by entry 309// Laid in the data section just before
141 * point, so we need 256 entry points. 310// Each macro places the address of code
142 */ 311// Forming an array: each one points to text
312// Which handles interrupt in its turn.
143.data 313.data
144.global default_idt_entries 314.global default_idt_entries
145default_idt_entries: 315default_idt_entries:
146.text 316.text
147 IRQ_STUBS 0 1 return_to_host /* First two traps */ 317 // The first two traps go straight back to the Host
148 IRQ_STUB 2 handle_nmi /* NMI */ 318 IRQ_STUBS 0 1 return_to_host
149 IRQ_STUBS 3 31 return_to_host /* Rest of traps */ 319 // We'll say nothing, yet, about NMI
150 IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */ 320 IRQ_STUB 2 handle_nmi
151 IRQ_STUB 128 return_to_host /* System call (overridden) */ 321 // Other traps also return to the Host
152 IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */ 322 IRQ_STUBS 3 31 return_to_host
153 323 // All interrupts go via their handlers
154/* We ignore NMI and return. */ 324 IRQ_STUBS 32 127 deliver_to_host
325 // 'Cept system calls coming from userspace
326 // Are to go to the Guest, never the Host.
327 IRQ_STUB 128 return_to_host
328 IRQ_STUBS 129 255 deliver_to_host
329
330// The NMI, what a fabulous beast
331// Which swoops in and stops us no matter that
332// We're suspended between heaven and hell,
333// (Or more likely between the Host and Guest)
334// When in it comes! We are dazed and confused
335// So we do the simplest thing which one can.
336// Though we've pushed the trap number and zero
337// We discard them, return, and hope we live.
155handle_nmi: 338handle_nmi:
156 addl $8, %esp 339 addl $8, %esp
157 iret 340 iret
158 341
342// We are done; all that's left is Mastery
343// And "make Mastery" is a journey long
344// Designed to make your fingers itch to code.
345
346// Here ends the text, the file and poem.
159ENTRY(end_switcher_text) 347ENTRY(end_switcher_text)