Diffstat (limited to 'drivers/lguest')
 drivers/lguest/Kconfig                |  10
 drivers/lguest/Makefile               |  12
 drivers/lguest/README                 |  47
 drivers/lguest/core.c                 | 357
 drivers/lguest/hypercalls.c           | 144
 drivers/lguest/interrupts_and_traps.c | 212
 drivers/lguest/io.c                   | 265
 drivers/lguest/lg.h                   |  47
 drivers/lguest/lguest.c               | 535
 drivers/lguest/lguest_asm.S           |  71
 drivers/lguest/lguest_bus.c           |  75
 drivers/lguest/lguest_user.c          | 166
 drivers/lguest/page_tables.c          | 329
 drivers/lguest/segments.c             | 126
 drivers/lguest/switcher.S             | 284
15 files changed, 2442 insertions, 238 deletions
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 43d901fdc77f..888205c3f76b 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
 config LGUEST
 	tristate "Linux hypervisor example code"
-	depends on X86 && PARAVIRT && NET && EXPERIMENTAL && !X86_PAE
+	depends on X86 && PARAVIRT && EXPERIMENTAL && !X86_PAE
 	select LGUEST_GUEST
 	select HVC_DRIVER
 	---help---
@@ -18,3 +18,11 @@ config LGUEST_GUEST
 	The guest needs code built-in, even if the host has lguest
 	support as a module.  The drivers are tiny, so we build them
 	in too.
+
+config LGUEST_NET
+	tristate
+	depends on LGUEST_GUEST && NET
+
+config LGUEST_BLOCK
+	tristate
+	depends on LGUEST_GUEST && BLOCK
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile
index 55382c7d799c..e5047471c334 100644
--- a/drivers/lguest/Makefile
+++ b/drivers/lguest/Makefile
@@ -5,3 +5,15 @@ obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o
 obj-$(CONFIG_LGUEST)	+= lg.o
 lg-y := core.o hypercalls.o page_tables.o interrupts_and_traps.o \
 	segments.o io.o lguest_user.o switcher.o
+
+Preparation Preparation!: PREFIX=P
+Guest: PREFIX=G
+Drivers: PREFIX=D
+Launcher: PREFIX=L
+Host: PREFIX=H
+Switcher: PREFIX=S
+Mastery: PREFIX=M
+Beer:
+	@for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}"
+Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery:
+	@sh ../../Documentation/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'`
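
A note on how these Makefile targets work: the lguest sources are sprinkled
with specially marked comments, and the extract script stitches them into the
seven-part documentation named in the README below.  As a rough sketch of the
convention (the authoritative matching rules live in
Documentation/lguest/extract, which is not part of this diff):

	/*H:010 A comment opened like this belongs to the "Host" section
	 * (PREFIX=H above), sorted by the number 010; the code that follows
	 * is included with it. */
	static __init int map_switcher(void)
	{
		/* ... */
	}
	/*:*/	/* A bare marker like this ends the extracted span. */

So "make Host" runs the script with PREFIX=H over every lguest *.c, *.h and
*.S file, and "make Beer" emits all seven sections in order.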
diff --git a/drivers/lguest/README b/drivers/lguest/README
new file mode 100644
index 000000000000..b7db39a64c66
--- /dev/null
+++ b/drivers/lguest/README
@@ -0,0 +1,47 @@
+Welcome, friend reader, to lguest.
+
+Lguest is an adventure, with you, the reader, as Hero.  I can't think of many
+5000-line projects which offer both such capability and glimpses of future
+potential; it is an exciting time to be delving into the source!
+
+But be warned; this is an arduous journey of several hours or more!  And as we
+know, all true Heroes are driven by a Noble Goal.  Thus I offer a Beer (or
+equivalent) to anyone I meet who has completed this documentation.
+
+So get comfortable and keep your wits about you (both quick and humorous).
+Along your way to the Noble Goal, you will also gain masterly insight into
+lguest, and hypervisors and x86 virtualization in general.
+
+Our Quest is in seven parts: (best read with C highlighting turned on)
+
+I) Preparation
+	- In which our potential hero is flown quickly over the landscape for a
+	  taste of its scope.  Suitable for the armchair coders and other such
+	  persons of faint constitution.
+
+II) Guest
+	- Where we encounter the first tantalising wisps of code, and come to
+	  understand the details of the life of a Guest kernel.
+
+III) Drivers
+	- Whereby the Guest finds its voice and becomes useful, and our
+	  understanding of the Guest is completed.
+
+IV) Launcher
+	- Where we trace back to the creation of the Guest, and thus begin our
+	  understanding of the Host.
+
+V) Host
+	- Where we master the Host code, through a long and tortuous journey.
+	  Indeed, it is here that our hero is tested in the Bit of Despair.
+
+VI) Switcher
+	- Where our understanding of the intertwined nature of Guests and Hosts
+	  is completed.
+
+VII) Mastery
+	- Where our fully fledged hero grapples with the Great Question:
+	  "What next?"
+
+make Preparation!
+Rusty Russell.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index ce909ec57499..0a46e8837d9a 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -1,5 +1,8 @@
-/* World's simplest hypervisor, to test paravirt_ops and show
- * unbelievers that virtualization is the future.  Plus, it's fun! */
+/*P:400 This contains run_guest() which actually calls into the Host<->Guest
+ * Switcher and analyzes the return, such as determining if the Guest wants the
+ * Host to do something.  This file also contains useful helper routines, and a
+ * couple of non-obvious setup and teardown pieces which were implemented after
+ * days of debugging pain. :*/
 #include <linux/module.h>
 #include <linux/stringify.h>
 #include <linux/stddef.h>
@@ -61,11 +64,33 @@ static struct lguest_pages *lguest_pages(unsigned int cpu)
 		  (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
 }
 
+/*H:010 We need to set up the Switcher at a high virtual address.  Remember the
+ * Switcher is a few hundred bytes of assembler code which actually changes the
+ * CPU to run the Guest, and then changes back to the Host when a trap or
+ * interrupt happens.
+ *
+ * The Switcher code must be at the same virtual address in the Guest as the
+ * Host since it will be running as the switchover occurs.
+ *
+ * Trying to map memory at a particular address is an unusual thing to do, so
+ * it's not a simple one-liner.  We also set up the per-cpu parts of the
+ * Switcher here.
+ */
 static __init int map_switcher(void)
 {
 	int i, err;
 	struct page **pagep;
 
+	/*
+	 * Map the Switcher in to high memory.
+	 *
+	 * It turns out that if we choose the address 0xFFC00000 (4MB under the
+	 * top virtual address), it makes setting up the page tables really
+	 * easy.
+	 */
+
+	/* We allocate an array of "struct page"s.  map_vm_area() wants the
+	 * pages in this form, rather than just an array of pointers. */
 	switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
 				GFP_KERNEL);
 	if (!switcher_page) {
@@ -73,6 +98,8 @@ static __init int map_switcher(void)
 		goto out;
 	}
 
+	/* Now we actually allocate the pages.  The Guest will see these pages,
+	 * so we make sure they're zeroed. */
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
 		unsigned long addr = get_zeroed_page(GFP_KERNEL);
 		if (!addr) {
@@ -82,6 +109,9 @@ static __init int map_switcher(void)
 		switcher_page[i] = virt_to_page(addr);
 	}
 
+	/* Now we reserve the "virtual memory area" we want: 0xFFC00000
+	 * (SWITCHER_ADDR).  We might not get it in theory, but in practice
+	 * it's worked so far. */
 	switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
 				     VM_ALLOC, SWITCHER_ADDR, VMALLOC_END);
 	if (!switcher_vma) {
@@ -90,49 +120,105 @@ static __init int map_switcher(void)
 		goto free_pages;
 	}
 
+	/* This code actually sets up the pages we've allocated to appear at
+	 * SWITCHER_ADDR.  map_vm_area() takes the vma we allocated above, the
+	 * kind of pages we're mapping (kernel pages), and a pointer to our
+	 * array of struct pages.  It increments that pointer, but we don't
+	 * care. */
 	pagep = switcher_page;
 	err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep);
 	if (err) {
 		printk("lguest: map_vm_area failed: %i\n", err);
 		goto free_vma;
 	}
+
+	/* Now the switcher is mapped at the right address, we can't fail!
+	 * Copy in the compiled-in Switcher code (from switcher.S). */
 	memcpy(switcher_vma->addr, start_switcher_text,
 	       end_switcher_text - start_switcher_text);
 
-	/* Fix up IDT entries to point into copied text. */
+	/* Most of the switcher.S doesn't care that it's been moved; on Intel,
+	 * jumps are relative, and it doesn't access any references to external
+	 * code or data.
+	 *
+	 * The only exception is the interrupt handlers in switcher.S: their
+	 * addresses are placed in a table (default_idt_entries), so we need to
+	 * update the table with the new addresses.  switcher_offset() is a
+	 * convenience function which returns the distance between the builtin
+	 * switcher code and the high-mapped copy we just made. */
 	for (i = 0; i < IDT_ENTRIES; i++)
 		default_idt_entries[i] += switcher_offset();
 
+	/*
+	 * Set up the Switcher's per-cpu areas.
+	 *
+	 * Each CPU gets two pages of its own within the high-mapped region
+	 * (aka. "struct lguest_pages").  Much of this can be initialized now,
+	 * but some depends on what Guest we are running (which is set up in
+	 * copy_in_guest_info()).
+	 */
 	for_each_possible_cpu(i) {
+		/* lguest_pages() returns this CPU's two pages. */
 		struct lguest_pages *pages = lguest_pages(i);
+		/* This is a convenience pointer to make the code fit one
+		 * statement to a line. */
 		struct lguest_ro_state *state = &pages->state;
 
-		/* These fields are static: rest done in copy_in_guest_info */
+		/* The Global Descriptor Table: the Host has a different one
+		 * for each CPU.  We keep a descriptor for the GDT which says
+		 * where it is and how big it is (the size is actually the last
+		 * byte, not the size, hence the "-1"). */
 		state->host_gdt_desc.size = GDT_SIZE-1;
 		state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
+
+		/* All CPUs on the Host use the same Interrupt Descriptor
+		 * Table, so we just use store_idt(), which gets this CPU's IDT
+		 * descriptor. */
 		store_idt(&state->host_idt_desc);
+
+		/* The descriptors for the Guest's GDT and IDT can be filled
+		 * out now, too.  We copy the GDT & IDT into ->guest_gdt and
+		 * ->guest_idt before actually running the Guest. */
 		state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
 		state->guest_idt_desc.address = (long)&state->guest_idt;
 		state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
 		state->guest_gdt_desc.address = (long)&state->guest_gdt;
+
+		/* We know where we want the stack to be when the Guest enters
+		 * the switcher: in pages->regs.  The stack grows downwards, so
+		 * we start it at the end of that structure. */
 		state->guest_tss.esp0 = (long)(&pages->regs + 1);
+		/* And this is the GDT entry to use for the stack: we keep a
+		 * couple of special LGUEST entries. */
 		state->guest_tss.ss0 = LGUEST_DS;
-		/* No I/O for you! */
+
+		/* x86 can have a fine-grained bitmap which indicates what I/O
+		 * ports the process can use.  We set it to the end of our
+		 * structure, meaning "none". */
 		state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
+
+		/* Some GDT entries are the same across all Guests, so we can
+		 * set them up now. */
 		setup_default_gdt_entries(state);
+		/* Most IDT entries are the same for all Guests, too. */
 		setup_default_idt_entries(state, default_idt_entries);
 
-		/* Setup LGUEST segments on all cpus */
+		/* The Host needs to be able to use the LGUEST segments on this
+		 * CPU, too, so put them in the Host GDT. */
 		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
 		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
 	}
 
-	/* Initialize entry point into switcher. */
+	/* In the Switcher, we want the %cs segment register to use the
+	 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
+	 * it will be undisturbed when we switch.  To change %cs and jump we
+	 * need this structure to feed to Intel's "lcall" instruction. */
 	lguest_entry.offset = (long)switch_to_guest + switcher_offset();
 	lguest_entry.segment = LGUEST_CS;
 
 	printk(KERN_INFO "lguest: mapped switcher at %p\n",
 	       switcher_vma->addr);
+	/* And we succeeded... */
 	return 0;
 
 free_vma:
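
For orientation, the two pieces the hunk above leans on are small.  A minimal
sketch under the assumptions this patch makes (start_switcher_text brackets
the Switcher text in switcher.S, SWITCHER_ADDR is the high mapping); the real
definitions live in lg.h and core.c and may differ in detail:

	/* Distance between the built-in Switcher code and the high-mapped
	 * copy: added to any address inside the Switcher to relocate it. */
	static unsigned long switcher_offset(void)
	{
		return SWITCHER_ADDR - (unsigned long)start_switcher_text;
	}

	/* The operand for "lcall *lguest_entry": a 32-bit offset followed by
	 * a 16-bit code segment selector, which is how the far call both
	 * jumps into the Switcher and switches %cs to LGUEST_CS. */
	static struct {
		unsigned long offset;
		unsigned short segment;
	} lguest_entry;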
@@ -146,35 +232,58 @@ free_some_pages:
 out:
 	return err;
 }
+/*:*/
 
+/* Cleaning up the mapping when the module is unloaded is almost...
+ * too easy. */
 static void unmap_switcher(void)
 {
 	unsigned int i;
 
+	/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
 	vunmap(switcher_vma->addr);
+	/* Now we just need to free the pages we copied the switcher into */
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
 		__free_pages(switcher_page[i], 0);
 }
 
-/* IN/OUT insns: enough to get us past boot-time probing. */
+/*H:130 Our Guest is usually so well behaved; it never tries to do things it
+ * isn't allowed to.  Unfortunately, "struct paravirt_ops" isn't quite
+ * complete, because it doesn't contain replacements for the Intel I/O
+ * instructions.  As a result, the Guest sometimes fumbles across one during
+ * the boot process as it probes for various things which are usually attached
+ * to a PC.
+ *
+ * When the Guest uses one of these instructions, we get trap #13 (General
+ * Protection Fault) and come here.  We see if it's one of those troublesome
+ * instructions and skip over it.  We return true if we did. */
 static int emulate_insn(struct lguest *lg)
 {
 	u8 insn;
 	unsigned int insnlen = 0, in = 0, shift = 0;
+	/* The eip contains the *virtual* address of the Guest's instruction:
+	 * guest_pa just subtracts the Guest's page_offset. */
 	unsigned long physaddr = guest_pa(lg, lg->regs->eip);
 
-	/* This only works for addresses in linear mapping... */
+	/* The guest_pa() function only works for Guest kernel addresses, but
+	 * that's all we're trying to do anyway. */
 	if (lg->regs->eip < lg->page_offset)
 		return 0;
+
+	/* Decoding x86 instructions is icky. */
 	lgread(lg, &insn, physaddr, 1);
 
-	/* Operand size prefix means it's actually for ax. */
+	/* 0x66 is an "operand prefix".  It means it's using the upper 16 bits
+	   of the eax register. */
 	if (insn == 0x66) {
 		shift = 16;
+		/* The instruction is 1 byte so far, read the next byte. */
 		insnlen = 1;
 		lgread(lg, &insn, physaddr + insnlen, 1);
 	}
 
+	/* We can ignore the lower bit for the moment and decode the 4 opcodes
+	 * we need to emulate. */
 	switch (insn & 0xFE) {
 	case 0xE4: /* in     <next byte>,%al */
 		insnlen += 2;
@@ -191,9 +300,13 @@ static int emulate_insn(struct lguest *lg)
 		insnlen += 1;
 		break;
 	default:
+		/* OK, we don't know what this is, can't emulate. */
 		return 0;
 	}
 
+	/* If it was an "IN" instruction, they expect the result to be read
+	 * into %eax, so we change %eax.  We always return all-ones, which
+	 * traditionally means "there's nothing there". */
 	if (in) {
 		/* Lower bit tells us whether it's a 16 or 32 bit access */
 		if (insn & 0x1)
@@ -201,28 +314,46 @@ static int emulate_insn(struct lguest *lg)
 		else
 			lg->regs->eax |= (0xFFFF << shift);
 	}
+	/* Finally, we've "done" the instruction, so move past it. */
 	lg->regs->eip += insnlen;
+	/* Success! */
 	return 1;
 }
-
+/*:*/
+
+/*L:305
+ * Dealing With Guest Memory.
+ *
+ * When the Guest gives us (what it thinks is) a physical address, we can use
+ * the normal copy_from_user() & copy_to_user() on that address: remember,
+ * Guest physical == Launcher virtual.
+ *
+ * But we can't trust the Guest: it might be trying to access the Launcher
+ * code.  We have to check that the range is below the pfn_limit the Launcher
+ * gave us.  We have to make sure that addr + len doesn't give us a false
+ * positive by overflowing, too. */
 int lguest_address_ok(const struct lguest *lg,
 		      unsigned long addr, unsigned long len)
 {
 	return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr);
 }
 
-/* Just like get_user, but don't let guest access lguest binary. */
+/* This is a convenient routine to get a 32-bit value from the Guest (a very
+ * common operation).  Here we can see how useful the kill_lguest() routine we
+ * met in the Launcher can be: we return a random value (0) instead of needing
+ * to return an error. */
 u32 lgread_u32(struct lguest *lg, unsigned long addr)
 {
 	u32 val = 0;
 
-	/* Don't let them access lguest binary */
+	/* Don't let them access lguest binary. */
 	if (!lguest_address_ok(lg, addr, sizeof(val))
 	    || get_user(val, (u32 __user *)addr) != 0)
 		kill_guest(lg, "bad read address %#lx", addr);
 	return val;
 }
 
+/* Same thing for writing a value. */
 void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val)
 {
 	if (!lguest_address_ok(lg, addr, sizeof(val))
@@ -230,6 +361,9 @@ void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val)
 		kill_guest(lg, "bad write address %#lx", addr);
 }
 
+/* This routine is more generic, and copies a range of Guest bytes into a
+ * buffer.  If the copy_from_user() fails, we fill the buffer with zeroes, so
+ * the caller doesn't end up using uninitialized kernel memory. */
 void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
 {
 	if (!lguest_address_ok(lg, addr, bytes)
@@ -240,6 +374,7 @@ void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
 	}
 }
 
+/* Similarly, our generic routine to copy into a range of Guest bytes. */
 void lgwrite(struct lguest *lg, unsigned long addr, const void *b,
 	     unsigned bytes)
 {
@@ -247,6 +382,7 @@ void lgwrite(struct lguest *lg, unsigned long addr, const void *b,
 	    || copy_to_user((void __user *)addr, b, bytes) != 0)
 		kill_guest(lg, "bad write address %#lx len %u", addr, bytes);
 }
+/* (end of memory access helper routines) :*/
 
 static void set_ts(void)
 {
@@ -257,54 +393,108 @@ static void set_ts(void)
 	write_cr0(cr0|8);
 }
 
+/*S:010
+ * We are getting close to the Switcher.
+ *
+ * Remember that each CPU has two pages which are visible to the Guest when it
+ * runs on that CPU.  This has to contain the state for that Guest: we copy the
+ * state in just before we run the Guest.
+ *
+ * Each Guest has "changed" flags which indicate what has changed in the Guest
+ * since it last ran.  We saw this set in interrupts_and_traps.c and
+ * segments.c.
+ */
 static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
 {
+	/* Copying all this data can be quite expensive.  We usually run the
+	 * same Guest we ran last time (and that Guest hasn't run anywhere else
+	 * meanwhile).  If that's not the case, we pretend everything in the
+	 * Guest has changed. */
 	if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
 		__get_cpu_var(last_guest) = lg;
 		lg->last_pages = pages;
 		lg->changed = CHANGED_ALL;
 	}
 
-	/* These are pretty cheap, so we do them unconditionally. */
+	/* These copies are pretty cheap, so we do them unconditionally: */
+	/* Save the current Host top-level page directory. */
 	pages->state.host_cr3 = __pa(current->mm->pgd);
+	/* Set up the Guest's page tables to see this CPU's pages (and no
+	 * other CPU's pages). */
 	map_switcher_in_guest(lg, pages);
+	/* Set up the two "TSS" members which tell the CPU what stack to use
+	 * for traps which go directly into the Guest (i.e. traps at privilege
+	 * level 1). */
 	pages->state.guest_tss.esp1 = lg->esp1;
 	pages->state.guest_tss.ss1 = lg->ss1;
 
-	/* Copy direct trap entries. */
+	/* Copy direct-to-Guest trap entries. */
 	if (lg->changed & CHANGED_IDT)
 		copy_traps(lg, pages->state.guest_idt, default_idt_entries);
 
-	/* Copy all GDT entries but the TSS. */
+	/* Copy all GDT entries which the Guest can change. */
 	if (lg->changed & CHANGED_GDT)
 		copy_gdt(lg, pages->state.guest_gdt);
 	/* If only the TLS entries have changed, copy them. */
 	else if (lg->changed & CHANGED_GDT_TLS)
 		copy_gdt_tls(lg, pages->state.guest_gdt);
 
+	/* Mark the Guest as unchanged for next time. */
 	lg->changed = 0;
 }
 
+/* Finally: the code to actually call into the Switcher to run the Guest. */
 static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
 {
+	/* This is a dummy value we need for GCC's sake. */
 	unsigned int clobber;
 
+	/* Copy the guest-specific information into this CPU's "struct
+	 * lguest_pages". */
 	copy_in_guest_info(lg, pages);
 
-	/* Put eflags on stack, lcall does rest: suitable for iret return. */
+	/* Now: we push the "eflags" register on the stack, then do an "lcall".
+	 * This is how we change from using the kernel code segment to using
+	 * the dedicated lguest code segment, as well as jumping into the
+	 * Switcher.
+	 *
+	 * The lcall also pushes the old code segment (KERNEL_CS) onto the
+	 * stack, then the address of this call.  This stack layout happens to
+	 * exactly match the stack of an interrupt... */
 	asm volatile("pushf; lcall *lguest_entry"
+		     /* This is how we tell GCC that %eax ("a") and %ebx ("b")
+		      * are changed by this routine.  The "=" means output. */
 		     : "=a"(clobber), "=b"(clobber)
+		     /* %eax contains the pages pointer.  ("0" refers to the
+		      * 0-th argument above, ie "a").  %ebx contains the
+		      * physical address of the Guest's top-level page
+		      * directory. */
 		     : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
+		     /* We tell gcc that all these registers could change,
+		      * which means we don't have to save and restore them in
+		      * the Switcher. */
 		     : "memory", "%edx", "%ecx", "%edi", "%esi");
 }
+/*:*/
 
+/*H:030 Let's jump straight to the main loop which runs the Guest.
+ * Remember, this is called by the Launcher reading /dev/lguest, and we keep
+ * going around and around until something interesting happens. */
 int run_guest(struct lguest *lg, unsigned long __user *user)
 {
+	/* We stop running once the Guest is dead. */
 	while (!lg->dead) {
+		/* We need to initialize this, otherwise gcc complains.  It's
+		 * not (yet) clever enough to see that it's initialized when we
+		 * need it. */
 		unsigned int cr2 = 0; /* Damn gcc */
 
-		/* Hypercalls first: we might have been out to userspace */
+		/* First we run any hypercalls the Guest wants done: either in
+		 * the hypercall ring in "struct lguest_data", or directly by
+		 * using int 31 (LGUEST_TRAP_ENTRY). */
 		do_hypercalls(lg);
+		/* It's possible the Guest did a SEND_DMA hypercall to the
+		 * Launcher, in which case we return from the read() now. */
 		if (lg->dma_is_pending) {
 			if (put_user(lg->pending_dma, user) ||
 			    put_user(lg->pending_key, user+1))
@@ -312,6 +502,7 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
 			return sizeof(unsigned long)*2;
 		}
 
+		/* Check for signals */
 		if (signal_pending(current))
 			return -ERESTARTSYS;
 
@@ -319,77 +510,154 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
 		if (lg->break_out)
 			return -EAGAIN;
 
+		/* Check if there are any interrupts which can be delivered
+		 * now: if so, this sets up the handler to be executed when we
+		 * next run the Guest. */
 		maybe_do_interrupt(lg);
 
+		/* All long-lived kernel loops need to check with this horrible
+		 * thing called the freezer.  If the Host is trying to suspend,
+		 * it stops us. */
 		try_to_freeze();
 
+		/* Just make absolutely sure the Guest is still alive.  One of
+		 * those hypercalls could have been fatal, for example. */
 		if (lg->dead)
 			break;
 
+		/* If the Guest asked to be stopped, we sleep.  The Guest's
+		 * clock timer or LHCALL_BREAK from the Waker will wake us. */
 		if (lg->halted) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule();
 			continue;
 		}
 
+		/* OK, now we're ready to jump into the Guest.  First we put up
+		 * the "Do Not Disturb" sign: */
 		local_irq_disable();
 
-		/* Even if *we* don't want FPU trap, guest might... */
+		/* Remember the awfully-named TS bit?  If the Guest has asked
+		 * to set it we set it now, so we can trap and pass that trap
+		 * to the Guest if it uses the FPU. */
 		if (lg->ts)
 			set_ts();
 
-		/* Don't let Guest do SYSENTER: we can't handle it. */
+		/* SYSENTER is an optimized way of doing system calls.  We
+		 * can't allow it because it always jumps to privilege level 0.
+		 * A normal Guest won't try it because we don't advertise it in
+		 * CPUID, but a malicious Guest (or malicious Guest userspace
+		 * program) could, so we tell the CPU to disable it before
+		 * running the Guest. */
 		if (boot_cpu_has(X86_FEATURE_SEP))
 			wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
 
+		/* Now we actually run the Guest.  It will pop back out when
+		 * something interesting happens, and we can examine its
+		 * registers to see what it was doing. */
 		run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
 
-		/* Save cr2 now if we page-faulted. */
+		/* The "regs" pointer contains two extra entries which are not
+		 * really registers: a trap number which says what interrupt or
+		 * trap made the switcher code come back, and an error code
+		 * which some traps set. */
+
+		/* If the Guest page faulted, then the cr2 register will tell
+		 * us the bad virtual address.  We have to grab this now,
+		 * because once we re-enable interrupts an interrupt could
+		 * fault and thus overwrite cr2, or we could even move off to a
+		 * different CPU. */
 		if (lg->regs->trapnum == 14)
 			cr2 = read_cr2();
+		/* Similarly, if we took a trap because the Guest used the FPU,
+		 * we have to restore the FPU it expects to see. */
 		else if (lg->regs->trapnum == 7)
 			math_state_restore();
 
+		/* Restore SYSENTER if it's supposed to be on. */
 		if (boot_cpu_has(X86_FEATURE_SEP))
 			wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+
+		/* Now we're ready to be interrupted or moved to other CPUs */
 		local_irq_enable();
 
+		/* OK, so what happened? */
 		switch (lg->regs->trapnum) {
 		case 13: /* We've intercepted a GPF. */
+			/* Check if this was one of those annoying IN or OUT
+			 * instructions which we need to emulate.  If so, we
+			 * just go back into the Guest after we've done it. */
 			if (lg->regs->errcode == 0) {
 				if (emulate_insn(lg))
 					continue;
 			}
 			break;
 		case 14: /* We've intercepted a page fault. */
+			/* The Guest accessed a virtual address that wasn't
+			 * mapped.  This happens a lot: we don't actually set
+			 * up most of the page tables for the Guest at all when
+			 * we start: as it runs it asks for more and more, and
+			 * we set them up as required.  In this case, we don't
+			 * even tell the Guest that the fault happened.
+			 *
+			 * The errcode tells whether this was a read or a
+			 * write, and whether kernel or userspace code. */
 			if (demand_page(lg, cr2, lg->regs->errcode))
 				continue;
 
-			/* If lguest_data is NULL, this won't hurt. */
+			/* OK, it's really not there (or not OK): the Guest
+			 * needs to know.  We write out the cr2 value so it
+			 * knows where the fault occurred.
+			 *
+			 * Note that if the Guest were really messed up, this
+			 * could happen before it's done the INITIALIZE
+			 * hypercall, so lg->lguest_data will be NULL, so
+			 * &lg->lguest_data->cr2 will be address 8.  Writing
+			 * into that address won't hurt the Host at all,
+			 * though. */
 			if (put_user(cr2, &lg->lguest_data->cr2))
 				kill_guest(lg, "Writing cr2");
 			break;
 		case 7: /* We've intercepted a Device Not Available fault. */
-			/* If they don't want to know, just absorb it. */
+			/* If the Guest doesn't want to know, we already
+			 * restored the Floating Point Unit, so we just
+			 * continue without telling it. */
 			if (!lg->ts)
 				continue;
 			break;
-		case 32 ... 255: /* Real interrupt, fall thru */
+		case 32 ... 255:
+			/* These values mean a real interrupt occurred, in
+			 * which case the Host handler has already been run.
+			 * We just do a friendly check if another process
+			 * should now be run, then fall through to loop
+			 * around: */
 			cond_resched();
 		case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
 			continue;
 		}
 
+		/* If we get here, it's a trap the Guest wants to know
+		 * about. */
 		if (deliver_trap(lg, lg->regs->trapnum))
 			continue;
 
+		/* If the Guest doesn't have a handler (either it hasn't
+		 * registered any yet, or it's one of the faults we don't let
+		 * it handle), it dies with a cryptic error message. */
 		kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
 			   lg->regs->trapnum, lg->regs->eip,
 			   lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode);
 	}
+	/* The Guest is dead => "No such file or directory" */
 	return -ENOENT;
 }
 
+/* Now we can look at each of the routines this calls, in increasing order of
+ * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
+ * deliver_trap() and demand_page().  After all those, we'll be ready to
+ * examine the Switcher, and our philosophical understanding of the Host/Guest
+ * duality will be complete. :*/
+
 int find_free_guest(void)
 {
 	unsigned int i;
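
Seen from the other side of the /dev/lguest read(), the return conventions
above shape the Launcher's main loop.  A hypothetical sketch only: lguest_fd
and service_dma() are illustrative names, not part of this patch:

	#include <errno.h>
	#include <unistd.h>

	/* Illustrative only: services one SEND_DMA notification. */
	extern void service_dma(unsigned long dma, unsigned long key);

	static void launcher_loop(int lguest_fd)
	{
		unsigned long notify[2];

		for (;;) {
			ssize_t r = read(lguest_fd, notify, sizeof(notify));

			if (r == sizeof(notify)) {
				/* run_guest() handed back pending_dma and
				 * pending_key: service the DMA, then re-run
				 * the Guest by reading again. */
				service_dma(notify[0], notify[1]);
			} else if (r < 0 && errno == EAGAIN) {
				/* break_out was set (LHCALL_BREAK): the
				 * Waker wants us to look at something. */
				continue;
			} else {
				break;	/* -ENOENT: the Guest is dead. */
			}
		}
	}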
@@ -407,55 +675,96 @@ static void adjust_pge(void *on)
 		write_cr4(read_cr4() & ~X86_CR4_PGE);
 }
 
+/*H:000
+ * Welcome to the Host!
+ *
+ * By this point your brain has been tickled by the Guest code and numbed by
+ * the Launcher code; prepare for it to be stretched by the Host code.  This is
+ * the heart.  Let's begin at the initialization routine for the Host's lg
+ * module.
+ */
 static int __init init(void)
 {
 	int err;
 
+	/* Lguest can't run under Xen, VMI or itself.  It does Tricky Stuff. */
 	if (paravirt_enabled()) {
 		printk("lguest is afraid of %s\n", paravirt_ops.name);
 		return -EPERM;
 	}
 
+	/* First we put the Switcher up in very high virtual memory. */
 	err = map_switcher();
 	if (err)
 		return err;
 
+	/* Now we set up the pagetable implementation for the Guests. */
 	err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
 	if (err) {
 		unmap_switcher();
 		return err;
 	}
+
+	/* The I/O subsystem needs some things initialized. */
 	lguest_io_init();
 
+	/* /dev/lguest needs to be registered. */
 	err = lguest_device_init();
 	if (err) {
 		free_pagetables();
 		unmap_switcher();
 		return err;
 	}
+
+	/* Finally, we need to turn off "Page Global Enable".  PGE is an
+	 * optimization where page table entries are specially marked to show
+	 * they never change.  The Host kernel marks all the kernel pages this
+	 * way because it's always present, even when userspace is running.
+	 *
+	 * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
+	 * switch to the Guest kernel.  If you don't disable this on all CPUs,
+	 * you'll get really weird bugs that you'll chase for two days.
+	 *
+	 * I used to turn PGE off every time we switched to the Guest and back
+	 * on when we return, but that slowed the Switcher down noticeably. */
+
+	/* We don't need the complexity of CPUs coming and going while we're
+	 * doing this. */
 	lock_cpu_hotplug();
 	if (cpu_has_pge) { /* We have a broader idea of "global". */
+		/* Remember that this was originally set (for cleanup). */
 		cpu_had_pge = 1;
+		/* adjust_pge is a helper function which sets or unsets the PGE
+		 * bit on its CPU, depending on the argument (0 == unset). */
 		on_each_cpu(adjust_pge, (void *)0, 0, 1);
+		/* Turn off the feature in the global feature set. */
 		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
 	}
 	unlock_cpu_hotplug();
+
+	/* All good! */
 	return 0;
 }
 
+/* Cleaning up is just the same code, backwards.  With a little French. */
 static void __exit fini(void)
 {
 	lguest_device_remove();
 	free_pagetables();
 	unmap_switcher();
+
+	/* If we had PGE before we started, turn it back on now. */
 	lock_cpu_hotplug();
 	if (cpu_had_pge) {
 		set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+		/* adjust_pge's argument "1" means set PGE. */
 		on_each_cpu(adjust_pge, (void *)1, 0, 1);
 	}
 	unlock_cpu_hotplug();
 }
 
+/* The Host side of lguest can be a module.  This is a nice way for people to
+ * play with it. */
 module_init(init);
 module_exit(fini);
 MODULE_LICENSE("GPL");
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index ea52ca451f74..db6caace3b9c 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -1,5 +1,10 @@
-/* Actual hypercalls, which allow guests to actually do something.
-   Copyright (C) 2006 Rusty Russell IBM Corporation
+/*P:500 Just as userspace programs request kernel operations through a system
+ * call, the Guest requests Host operations through a "hypercall".  You might
+ * notice this nomenclature doesn't really follow any logic, but the name has
+ * been around for long enough that we're stuck with it.  As you'd expect, this
+ * code is basically one big switch statement. :*/
+
+/*  Copyright (C) 2006 Rusty Russell IBM Corporation
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -23,37 +28,55 @@
 #include <irq_vectors.h>
 #include "lg.h"
 
+/*H:120 This is the core hypercall routine: where the Guest gets what it
+ * wants.  Or gets killed.  Or, in the case of LHCALL_CRASH, both.
+ *
+ * Remember from the Guest: %eax == which call to make, and the arguments are
+ * packed into %edx, %ebx and %ecx if needed. */
 static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
 {
 	switch (regs->eax) {
 	case LHCALL_FLUSH_ASYNC:
+		/* This call does nothing, except by breaking out of the Guest
+		 * it makes us process all the asynchronous hypercalls. */
 		break;
 	case LHCALL_LGUEST_INIT:
+		/* You can't get here unless you're already initialized.  Don't
+		 * do that. */
 		kill_guest(lg, "already have lguest_data");
 		break;
 	case LHCALL_CRASH: {
+		/* Crash is such a trivial hypercall that we do it in four
+		 * lines right here. */
 		char msg[128];
+		/* If the lgread fails, it will call kill_guest() itself; the
+		 * kill_guest() with the message will be ignored. */
 		lgread(lg, msg, regs->edx, sizeof(msg));
 		msg[sizeof(msg)-1] = '\0';
 		kill_guest(lg, "CRASH: %s", msg);
 		break;
 	}
 	case LHCALL_FLUSH_TLB:
+		/* FLUSH_TLB comes in two flavors, depending on the
+		 * argument: */
 		if (regs->edx)
 			guest_pagetable_clear_all(lg);
 		else
 			guest_pagetable_flush_user(lg);
 		break;
-	case LHCALL_GET_WALLCLOCK: {
-		struct timespec ts;
-		ktime_get_real_ts(&ts);
-		regs->eax = ts.tv_sec;
-		break;
-	}
 	case LHCALL_BIND_DMA:
+		/* BIND_DMA really wants four arguments, but it's the only call
+		 * which does.  So the Guest packs the number of buffers and
+		 * the interrupt number into the final argument, and we decode
+		 * it here.  This can legitimately fail, since we currently
+		 * place a limit on the number of DMA pools a Guest can have.
+		 * So we return true or false from this call. */
 		regs->eax = bind_dma(lg, regs->edx, regs->ebx,
 				     regs->ecx >> 8, regs->ecx & 0xFF);
 		break;
+
+	/* All these calls simply pass the arguments through to the right
+	 * routines. */
 	case LHCALL_SEND_DMA:
 		send_dma(lg, regs->edx, regs->ebx);
 		break;
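
For concreteness, here is what the other end of do_hcall()'s register
convention might look like in the Guest.  A sketch only: the real Guest helper
is added elsewhere in this patch, and this version hard-codes
LGUEST_TRAP_ENTRY (int 31, i.e. 0x1f) rather than pulling in the header:

	/* Call number goes in %eax (which also carries back any return
	 * value, e.g. bind_dma()'s true/false); arguments in %edx, %ebx
	 * and %ecx. */
	static unsigned long hcall(unsigned long call, unsigned long arg1,
				   unsigned long arg2, unsigned long arg3)
	{
		asm volatile("int $0x1f"	/* LGUEST_TRAP_ENTRY == 31 */
			     : "=a"(call)
			     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
			     : "memory");
		return call;
	}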
@@ -81,10 +104,13 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
 	case LHCALL_SET_CLOCKEVENT:
 		guest_set_clockevent(lg, regs->edx);
 		break;
+
 	case LHCALL_TS:
+		/* This sets the TS flag, as we saw used in run_guest(). */
 		lg->ts = regs->edx;
 		break;
 	case LHCALL_HALT:
+		/* Similarly, this sets the halted flag for run_guest(). */
 		lg->halted = 1;
 		break;
 	default:
@@ -92,25 +118,42 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
 	}
 }
 
-/* We always do queued calls before actual hypercall. */
+/* Asynchronous hypercalls are easy: we just look in the array in the Guest's
+ * "struct lguest_data" and see if there are any new ones marked "ready".
+ *
+ * We are careful to do these in order: obviously we respect the order the
+ * Guest put them in the ring, but we also promise the Guest that they will
+ * happen before any normal hypercall (which is why we check this before
+ * checking for a normal hcall). */
 static void do_async_hcalls(struct lguest *lg)
 {
 	unsigned int i;
 	u8 st[LHCALL_RING_SIZE];
 
+	/* For simplicity, we copy the entire call status array in at once. */
 	if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
 		return;
 
+
+	/* We process "struct lguest_data"s hcalls[] ring once. */
 	for (i = 0; i < ARRAY_SIZE(st); i++) {
 		struct lguest_regs regs;
+		/* We remember where we were up to from last time.  This makes
+		 * sure that the hypercalls are done in the order the Guest
+		 * places them in the ring. */
 		unsigned int n = lg->next_hcall;
 
+		/* 0xFF means there's no call here (yet). */
 		if (st[n] == 0xFF)
 			break;
 
+		/* OK, we have a hypercall.  Increment the "next_hcall" cursor,
+		 * and wrap back to 0 if we reach the end. */
 		if (++lg->next_hcall == LHCALL_RING_SIZE)
 			lg->next_hcall = 0;
 
+		/* We copy the hypercall arguments into a fake register
+		 * structure.  This makes life simple for do_hcall(). */
 		if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax)
 		    || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx)
 		    || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx)
@@ -119,74 +162,139 @@ static void do_async_hcalls(struct lguest *lg)
 			break;
 		}
 
+		/* Do the hypercall, same as a normal one. */
 		do_hcall(lg, &regs);
+
+		/* Mark the hypercall done. */
 		if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
 			kill_guest(lg, "Writing result for async hypercall");
 			break;
 		}
 
+		/* Stop doing hypercalls if we've just done a DMA to the
+		 * Launcher: it needs to service this first. */
 		if (lg->dma_is_pending)
 			break;
 	}
 }
 
+/* Last of all, we look at what happens first of all.  The very first time the
+ * Guest makes a hypercall, we end up here to set things up: */
 static void initialize(struct lguest *lg)
 {
 	u32 tsc_speed;
 
+	/* You can't do anything until you're initialized.  The Guest knows the
+	 * rules, so we're unforgiving here. */
 	if (lg->regs->eax != LHCALL_LGUEST_INIT) {
 		kill_guest(lg, "hypercall %li before LGUEST_INIT",
 			   lg->regs->eax);
 		return;
 	}
 
-	/* We only tell the guest to use the TSC if it's reliable. */
+	/* We insist that the Time Stamp Counter exists and doesn't change with
+	 * cpu frequency.  Some devious chip manufacturers decided that TSC
+	 * changes could be handled in software.  I decided that time going
+	 * backwards might be good for benchmarks, but it's bad for users.
+	 *
+	 * We also insist that the TSC be stable: the kernel detects unreliable
+	 * TSCs for its own purposes, and we use that here. */
144 | if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) | 202 | if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) |
145 | tsc_speed = tsc_khz; | 203 | tsc_speed = tsc_khz; |
146 | else | 204 | else |
147 | tsc_speed = 0; | 205 | tsc_speed = 0; |
148 | 206 | ||
207 | /* The pointer to the Guest's "struct lguest_data" is the only | ||
208 | * argument. */ | ||
149 | lg->lguest_data = (struct lguest_data __user *)lg->regs->edx; | 209 | lg->lguest_data = (struct lguest_data __user *)lg->regs->edx; |
150 | /* We check here so we can simply copy_to_user/from_user */ | 210 | /* If we check the address they gave is OK now, we can simply |
211 | * copy_to_user/from_user from now on rather than using lgread/lgwrite. | ||
212 | * I put this in to show that I'm not immune to writing stupid | ||
213 | * optimizations. */ | ||
151 | if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) { | 214 | if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) { |
152 | kill_guest(lg, "bad guest page %p", lg->lguest_data); | 215 | kill_guest(lg, "bad guest page %p", lg->lguest_data); |
153 | return; | 216 | return; |
154 | } | 217 | } |
218 | /* The Guest tells us where we're not to deliver interrupts by putting | ||
219 | * the range of addresses into "struct lguest_data". */ | ||
155 | if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) | 220 | if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) |
156 | || get_user(lg->noirq_end, &lg->lguest_data->noirq_end) | 221 | || get_user(lg->noirq_end, &lg->lguest_data->noirq_end) |
157 | /* We reserve the top pgd entry. */ | 222 | /* We tell the Guest that it can't use the top 4MB of virtual |
223 | * addresses used by the Switcher. */ | ||
158 | || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) | 224 | || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) |
159 | || put_user(tsc_speed, &lg->lguest_data->tsc_khz) | 225 | || put_user(tsc_speed, &lg->lguest_data->tsc_khz) |
226 | /* We also give the Guest a unique id, as used in lguest_net.c. */ | ||
160 | || put_user(lg->guestid, &lg->lguest_data->guestid)) | 227 | || put_user(lg->guestid, &lg->lguest_data->guestid)) |
161 | kill_guest(lg, "bad guest page %p", lg->lguest_data); | 228 | kill_guest(lg, "bad guest page %p", lg->lguest_data); |
162 | 229 | ||
163 | /* This is the one case where the above accesses might have | 230 | /* We write the current time into the Guest's data page once now. */ |
164 | * been the first write to a Guest page. This may have caused | 231 | write_timestamp(lg); |
165 | * a copy-on-write fault, but the Guest might be referring to | 232 | |
166 | * the old (read-only) page. */ | 233 | /* This is the one case where the above accesses might have been the |
234 | * first write to a Guest page. This may have caused a copy-on-write | ||
235 | * fault, but the Guest might be referring to the old (read-only) | ||
236 | * page. */ | ||
167 | guest_pagetable_clear_all(lg); | 237 | guest_pagetable_clear_all(lg); |
168 | } | 238 | } |
239 | /* Now we've examined the hypercall code; our Guest can make requests. There | ||
240 | * is one other way we can do things for the Guest, as we see in | ||
241 | * emulate_insn(). */ | ||
169 | 242 | ||
170 | /* Even if we go out to userspace and come back, we don't want to do | 243 | /*H:110 Tricky point: we mark the hypercall as "done" once we've done it. |
171 | * the hypercall again. */ | 244 | * Normally we don't need to do this: the Guest will run again and update the |
245 | * trap number before we come back around the run_guest() loop to | ||
246 | * do_hypercalls(). | ||
247 | * | ||
248 | * However, if we are signalled or the Guest sends DMA to the Launcher, that | ||
249 | * loop will exit without running the Guest. When it comes back it would try | ||
250 | * to re-run the hypercall. */ | ||
172 | static void clear_hcall(struct lguest *lg) | 251 | static void clear_hcall(struct lguest *lg) |
173 | { | 252 | { |
174 | lg->regs->trapnum = 255; | 253 | lg->regs->trapnum = 255; |
175 | } | 254 | } |
176 | 255 | ||
256 | /*H:100 | ||
257 | * Hypercalls | ||
258 | * | ||
259 | * Remember from the Guest, hypercalls come in two flavors: normal and | ||
259 | * asynchronous. This file handles both types. | ||
261 | */ | ||
177 | void do_hypercalls(struct lguest *lg) | 262 | void do_hypercalls(struct lguest *lg) |
178 | { | 263 | { |
264 | /* Not initialized yet? */ | ||
179 | if (unlikely(!lg->lguest_data)) { | 265 | if (unlikely(!lg->lguest_data)) { |
266 | /* Did the Guest make a hypercall? We might have come back for | ||
267 | * some other reason (an interrupt, a different trap). */ | ||
180 | if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) { | 268 | if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) { |
269 | /* Set up the "struct lguest_data" */ | ||
181 | initialize(lg); | 270 | initialize(lg); |
271 | /* The hypercall is done. */ | ||
182 | clear_hcall(lg); | 272 | clear_hcall(lg); |
183 | } | 273 | } |
184 | return; | 274 | return; |
185 | } | 275 | } |
186 | 276 | ||
277 | /* The Guest has initialized. | ||
278 | * | ||
279 | * Look in the hypercall ring for the async hypercalls: */ | ||
187 | do_async_hcalls(lg); | 280 | do_async_hcalls(lg); |
281 | |||
282 | /* If we stopped reading the hypercall ring because the Guest did a | ||
283 | * SEND_DMA to the Launcher, we want to return now. Otherwise if the | ||
284 | * Guest asked us to do a hypercall, we do it. */ | ||
188 | if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) { | 285 | if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) { |
189 | do_hcall(lg, lg->regs); | 286 | do_hcall(lg, lg->regs); |
287 | /* The hypercall is done. */ | ||
190 | clear_hcall(lg); | 288 | clear_hcall(lg); |
191 | } | 289 | } |
192 | } | 290 | } |
291 | |||
292 | /* This routine supplies the Guest with time: it's used for wallclock time at | ||
293 | * initial boot and as a rough time source if the TSC isn't available. */ | ||
294 | void write_timestamp(struct lguest *lg) | ||
295 | { | ||
296 | struct timespec now; | ||
297 | ktime_get_real_ts(&now); | ||
298 | if (put_user(now, &lg->lguest_data->time)) | ||
299 | kill_guest(lg, "Writing timestamp"); | ||
300 | } | ||
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index bee029bb2c7b..49787e964a0d 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c | |||
@@ -1,100 +1,160 @@ | |||
1 | /*P:800 Interrupts (traps) are complicated enough to earn their own file. | ||
2 | * There are three classes of interrupts: | ||
3 | * | ||
4 | * 1) Real hardware interrupts which occur while we're running the Guest, | ||
5 | * 2) Interrupts for virtual devices attached to the Guest, and | ||
6 | * 3) Traps and faults from the Guest. | ||
7 | * | ||
8 | * Real hardware interrupts must be delivered to the Host, not the Guest. | ||
9 | * Virtual interrupts must be delivered to the Guest, but we make them look | ||
10 | * just like real hardware would deliver them. Traps from the Guest can be set | ||
11 | * up to go directly back into the Guest, but sometimes the Host wants to see | ||
12 | * them first, so we also have a way of "reflecting" them into the Guest as if | ||
13 | * they had been delivered to it directly. :*/ | ||
1 | #include <linux/uaccess.h> | 14 | #include <linux/uaccess.h> |
2 | #include "lg.h" | 15 | #include "lg.h" |
3 | 16 | ||
17 | /* The address of the interrupt handler is split into two bits: */ | ||
4 | static unsigned long idt_address(u32 lo, u32 hi) | 18 | static unsigned long idt_address(u32 lo, u32 hi) |
5 | { | 19 | { |
6 | return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); | 20 | return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); |
7 | } | 21 | } |
8 | 22 | ||
23 | /* The "type" of the interrupt handler is a 4 bit field: we only support a | ||
24 | * couple of types. */ | ||
9 | static int idt_type(u32 lo, u32 hi) | 25 | static int idt_type(u32 lo, u32 hi) |
10 | { | 26 | { |
11 | return (hi >> 8) & 0xF; | 27 | return (hi >> 8) & 0xF; |
12 | } | 28 | } |
13 | 29 | ||
30 | /* An IDT entry can't be used unless the "present" bit is set. */ | ||
14 | static int idt_present(u32 lo, u32 hi) | 31 | static int idt_present(u32 lo, u32 hi) |
15 | { | 32 | { |
16 | return (hi & 0x8000); | 33 | return (hi & 0x8000); |
17 | } | 34 | } |
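The two halves of an IDT entry pack up like this; here's a standalone decode using the same masks as the helpers above (the entry values themselves are invented for illustration):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t lo = 0x00080a30; /* selector 0x0008, addr low 0x0a30 */
            uint32_t hi = 0xc010ef00; /* addr high 0xc010; present, DPL 3,
                                       * trap gate (type 0xF) */

            unsigned long addr = (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
            unsigned int type = (hi >> 8) & 0xF;
            int present = !!(hi & 0x8000);

            printf("handler %#lx, type %#x, present %d\n",
                   addr, type, present);
            return 0;
    }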
18 | 35 | ||
36 | /* We need a helper to "push" a value onto the Guest's stack, since that's a | ||
37 | * big part of what delivering an interrupt does. */ | ||
19 | static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) | 38 | static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) |
20 | { | 39 | { |
40 | /* Stack grows downwards: move the stack pointer down, then write the value. */ | ||
21 | *gstack -= 4; | 41 | *gstack -= 4; |
22 | lgwrite_u32(lg, *gstack, val); | 42 | lgwrite_u32(lg, *gstack, val); |
23 | } | 43 | } |
24 | 44 | ||
45 | /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or | ||
46 | * trap. The mechanics of delivering traps and interrupts to the Guest are the | ||
47 | * same, except some traps have an "error code" which gets pushed onto the | ||
48 | * stack as well: the caller tells us if this is one. | ||
49 | * | ||
50 | * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this | ||
51 | * interrupt or trap. It's split into two parts for traditional reasons: gcc | ||
52 | * on i386 used to be frightened by 64 bit numbers. | ||
53 | * | ||
54 | * We set up the stack just like the CPU does for a real interrupt, so it's | ||
55 | * identical for the Guest (and the standard "iret" instruction will undo | ||
56 | * it). */ | ||
25 | static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) | 57 | static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) |
26 | { | 58 | { |
27 | unsigned long gstack; | 59 | unsigned long gstack; |
28 | u32 eflags, ss, irq_enable; | 60 | u32 eflags, ss, irq_enable; |
29 | 61 | ||
30 | /* If they want a ring change, we use new stack and push old ss/esp */ | 62 | /* There are two cases for interrupts: one where the Guest is already |
63 | * in the kernel, and a more complex one where the Guest is in | ||
64 | * userspace. We check the privilege level to find out. */ | ||
31 | if ((lg->regs->ss&0x3) != GUEST_PL) { | 65 | if ((lg->regs->ss&0x3) != GUEST_PL) { |
66 | /* The Guest told us their kernel stack with the SET_STACK | ||
67 | * hypercall: both the virtual address and the segment */ | ||
32 | gstack = guest_pa(lg, lg->esp1); | 68 | gstack = guest_pa(lg, lg->esp1); |
33 | ss = lg->ss1; | 69 | ss = lg->ss1; |
70 | /* We push the old stack segment and pointer onto the new | ||
71 | * stack: when the Guest does an "iret" back from the interrupt | ||
72 | * handler the CPU will notice they're dropping privilege | ||
73 | * levels and expect these here. */ | ||
34 | push_guest_stack(lg, &gstack, lg->regs->ss); | 74 | push_guest_stack(lg, &gstack, lg->regs->ss); |
35 | push_guest_stack(lg, &gstack, lg->regs->esp); | 75 | push_guest_stack(lg, &gstack, lg->regs->esp); |
36 | } else { | 76 | } else { |
77 | /* We're staying on the same Guest (kernel) stack. */ | ||
37 | gstack = guest_pa(lg, lg->regs->esp); | 78 | gstack = guest_pa(lg, lg->regs->esp); |
38 | ss = lg->regs->ss; | 79 | ss = lg->regs->ss; |
39 | } | 80 | } |
40 | 81 | ||
41 | /* We use IF bit in eflags to indicate whether irqs were enabled | 82 | /* Remember that we never let the Guest actually disable interrupts, so |
42 | (it's always 1, since irqs are enabled when guest is running). */ | 83 | * the "Interrupt Flag" bit is always set. We copy that bit from the |
84 | * Guest's "irq_enabled" field into the eflags word: the Guest copies | ||
85 | * it back in "lguest_iret". */ | ||
43 | eflags = lg->regs->eflags; | 86 | eflags = lg->regs->eflags; |
44 | if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 | 87 | if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 |
45 | && !(irq_enable & X86_EFLAGS_IF)) | 88 | && !(irq_enable & X86_EFLAGS_IF)) |
46 | eflags &= ~X86_EFLAGS_IF; | 89 | eflags &= ~X86_EFLAGS_IF; |
47 | 90 | ||
91 | /* An interrupt is expected to push three things on the stack: the old | ||
92 | * "eflags" word, the old code segment, and the old instruction | ||
93 | * pointer. */ | ||
48 | push_guest_stack(lg, &gstack, eflags); | 94 | push_guest_stack(lg, &gstack, eflags); |
49 | push_guest_stack(lg, &gstack, lg->regs->cs); | 95 | push_guest_stack(lg, &gstack, lg->regs->cs); |
50 | push_guest_stack(lg, &gstack, lg->regs->eip); | 96 | push_guest_stack(lg, &gstack, lg->regs->eip); |
51 | 97 | ||
98 | /* For the six traps which supply an error code, we push that, too. */ | ||
52 | if (has_err) | 99 | if (has_err) |
53 | push_guest_stack(lg, &gstack, lg->regs->errcode); | 100 | push_guest_stack(lg, &gstack, lg->regs->errcode); |
54 | 101 | ||
55 | /* Change the real stack so switcher returns to trap handler */ | 102 | /* Now we've pushed all the old state, we change the stack, the code |
103 | * segment and the address to execute. */ | ||
56 | lg->regs->ss = ss; | 104 | lg->regs->ss = ss; |
57 | lg->regs->esp = gstack + lg->page_offset; | 105 | lg->regs->esp = gstack + lg->page_offset; |
58 | lg->regs->cs = (__KERNEL_CS|GUEST_PL); | 106 | lg->regs->cs = (__KERNEL_CS|GUEST_PL); |
59 | lg->regs->eip = idt_address(lo, hi); | 107 | lg->regs->eip = idt_address(lo, hi); |
60 | 108 | ||
61 | /* Disable interrupts for an interrupt gate. */ | 109 | /* There are two kinds of interrupt handlers: 0xE is an "interrupt |
110 | * gate" which expects interrupts to be disabled on entry. */ | ||
62 | if (idt_type(lo, hi) == 0xE) | 111 | if (idt_type(lo, hi) == 0xE) |
63 | if (put_user(0, &lg->lguest_data->irq_enabled)) | 112 | if (put_user(0, &lg->lguest_data->irq_enabled)) |
64 | kill_guest(lg, "Disabling interrupts"); | 113 | kill_guest(lg, "Disabling interrupts"); |
65 | } | 114 | } |
66 | 115 | ||
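To recap what set_guest_interrupt() just built: this is the frame it leaves on the Guest's stack, written as an illustrative struct (highest address first, assuming the fullest case of a ring change plus an error-code trap; the code pushes words, it never declares such a struct):

    #include <stdint.h>

    struct guest_trap_frame {       /* illustrative only */
            uint32_t old_ss;        /* pushed only on a ring change */
            uint32_t old_esp;       /* pushed only on a ring change */
            uint32_t eflags;        /* IF bit copied from irq_enabled */
            uint32_t cs;
            uint32_t eip;
            uint32_t errcode;       /* pushed only for error-code traps */
    };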
116 | /*H:200 | ||
117 | * Virtual Interrupts. | ||
118 | * | ||
119 | * maybe_do_interrupt() gets called before every entry to the Guest, to see if | ||
120 | * we should divert the Guest to running an interrupt handler. */ | ||
67 | void maybe_do_interrupt(struct lguest *lg) | 121 | void maybe_do_interrupt(struct lguest *lg) |
68 | { | 122 | { |
69 | unsigned int irq; | 123 | unsigned int irq; |
70 | DECLARE_BITMAP(blk, LGUEST_IRQS); | 124 | DECLARE_BITMAP(blk, LGUEST_IRQS); |
71 | struct desc_struct *idt; | 125 | struct desc_struct *idt; |
72 | 126 | ||
127 | /* If the Guest hasn't even initialized yet, we can do nothing. */ | ||
73 | if (!lg->lguest_data) | 128 | if (!lg->lguest_data) |
74 | return; | 129 | return; |
75 | 130 | ||
76 | /* Mask out any interrupts they have blocked. */ | 131 | /* Take our "irqs_pending" array and remove any interrupts the Guest |
132 | * wants blocked: the result ends up in "blk". */ | ||
77 | if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts, | 133 | if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts, |
78 | sizeof(blk))) | 134 | sizeof(blk))) |
79 | return; | 135 | return; |
80 | 136 | ||
81 | bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); | 137 | bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); |
82 | 138 | ||
139 | /* Find the first interrupt. */ | ||
83 | irq = find_first_bit(blk, LGUEST_IRQS); | 140 | irq = find_first_bit(blk, LGUEST_IRQS); |
141 | /* None? Nothing to do */ | ||
84 | if (irq >= LGUEST_IRQS) | 142 | if (irq >= LGUEST_IRQS) |
85 | return; | 143 | return; |
86 | 144 | ||
145 | /* They may be in the middle of an iret, where they asked us never to | ||
146 | * deliver interrupts. */ | ||
87 | if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) | 147 | if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) |
88 | return; | 148 | return; |
89 | 149 | ||
90 | /* If they're halted, we re-enable interrupts. */ | 150 | /* If they're halted, interrupts restart them. */ |
91 | if (lg->halted) { | 151 | if (lg->halted) { |
92 | /* Re-enable interrupts. */ | 152 | /* Re-enable interrupts. */ |
93 | if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled)) | 153 | if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled)) |
94 | kill_guest(lg, "Re-enabling interrupts"); | 154 | kill_guest(lg, "Re-enabling interrupts"); |
95 | lg->halted = 0; | 155 | lg->halted = 0; |
96 | } else { | 156 | } else { |
97 | /* Maybe they have interrupts disabled? */ | 157 | /* Otherwise we check if they have interrupts disabled. */ |
98 | u32 irq_enabled; | 158 | u32 irq_enabled; |
99 | if (get_user(irq_enabled, &lg->lguest_data->irq_enabled)) | 159 | if (get_user(irq_enabled, &lg->lguest_data->irq_enabled)) |
100 | irq_enabled = 0; | 160 | irq_enabled = 0; |
@@ -102,112 +162,218 @@ void maybe_do_interrupt(struct lguest *lg) | |||
102 | return; | 162 | return; |
103 | } | 163 | } |
104 | 164 | ||
165 | /* Look at the IDT entry the Guest gave us for this interrupt. The | ||
166 | * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip | ||
167 | * over them. */ | ||
105 | idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq]; | 168 | idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq]; |
169 | /* If they don't have a handler (yet?), we just ignore it */ | ||
106 | if (idt_present(idt->a, idt->b)) { | 170 | if (idt_present(idt->a, idt->b)) { |
171 | /* OK, mark it no longer pending and deliver it. */ | ||
107 | clear_bit(irq, lg->irqs_pending); | 172 | clear_bit(irq, lg->irqs_pending); |
173 | /* set_guest_interrupt() takes the interrupt descriptor and a | ||
174 | * flag to say whether this interrupt pushes an error code onto | ||
175 | * the stack as well: virtual interrupts never do. */ | ||
108 | set_guest_interrupt(lg, idt->a, idt->b, 0); | 176 | set_guest_interrupt(lg, idt->a, idt->b, 0); |
109 | } | 177 | } |
178 | |||
179 | /* Every time we deliver an interrupt, we update the timestamp in the | ||
180 | * Guest's lguest_data struct. It would be better for the Guest if we | ||
181 | * did this more often, but it can actually be quite slow: doing it | ||
182 | * here is a compromise which means at least it gets updated every | ||
183 | * timer interrupt. */ | ||
184 | write_timestamp(lg); | ||
110 | } | 185 | } |
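The pending-minus-blocked selection above is plain bit arithmetic; here's a toy userspace rendering with 32-bit masks standing in for the kernel's bitmap helpers (the irq numbers are made up):

    #include <stdio.h>
    #include <strings.h>

    int main(void)
    {
            unsigned int pending = (1u << 1) | (1u << 4); /* irqs 1, 4 raised */
            unsigned int blocked = (1u << 1);             /* Guest blocks 1 */
            unsigned int blk = pending & ~blocked;        /* bitmap_andnot() */

            if (blk)                          /* like find_first_bit() */
                    printf("deliver irq %d first\n", ffs(blk) - 1);
            else
                    printf("nothing to do\n"); /* irq >= LGUEST_IRQS case */
            return 0;
    }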
111 | 186 | ||
187 | /*H:220 Now we've got the routines to deliver interrupts, delivering traps | ||
188 | * like page fault is easy. The only trick is that Intel decided that some | ||
189 | * traps should have error codes: */ | ||
112 | static int has_err(unsigned int trap) | 190 | static int has_err(unsigned int trap) |
113 | { | 191 | { |
114 | return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); | 192 | return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); |
115 | } | 193 | } |
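(For reference, those numbers are: 8 is Double Fault; 10 through 14 are Invalid TSS, Segment Not Present, Stack-Segment Fault, General Protection and Page Fault; and 17 is Alignment Check.)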
116 | 194 | ||
195 | /* deliver_trap() returns true if it could deliver the trap. */ | ||
117 | int deliver_trap(struct lguest *lg, unsigned int num) | 196 | int deliver_trap(struct lguest *lg, unsigned int num) |
118 | { | 197 | { |
119 | u32 lo = lg->idt[num].a, hi = lg->idt[num].b; | 198 | u32 lo = lg->idt[num].a, hi = lg->idt[num].b; |
120 | 199 | ||
200 | /* Early on the Guest hasn't set the IDT entries (or maybe it put a | ||
201 | * bogus one in): if we fail here, the Guest will be killed. */ | ||
121 | if (!idt_present(lo, hi)) | 202 | if (!idt_present(lo, hi)) |
122 | return 0; | 203 | return 0; |
123 | set_guest_interrupt(lg, lo, hi, has_err(num)); | 204 | set_guest_interrupt(lg, lo, hi, has_err(num)); |
124 | return 1; | 205 | return 1; |
125 | } | 206 | } |
126 | 207 | ||
208 | /*H:250 Here's the hard part: returning to the Host every time a trap happens | ||
209 | * and then calling deliver_trap() and re-entering the Guest is slow. | ||
210 | * Particularly because Guest userspace system calls are traps (trap 128). | ||
211 | * | ||
212 | * So we'd like to set up the IDT to tell the CPU to deliver traps directly | ||
213 | * into the Guest. This is possible, but the complexities cause the size of | ||
214 | * this file to double! However, 150 lines of code is worth writing for taking | ||
215 | * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all | ||
216 | * the other hypervisors would tease it. | ||
217 | * | ||
218 | * This routine determines if a trap can be delivered directly. */ | ||
127 | static int direct_trap(const struct lguest *lg, | 219 | static int direct_trap(const struct lguest *lg, |
128 | const struct desc_struct *trap, | 220 | const struct desc_struct *trap, |
129 | unsigned int num) | 221 | unsigned int num) |
130 | { | 222 | { |
131 | /* Hardware interrupts don't go to guest (except syscall). */ | 223 | /* Hardware interrupts don't go to the Guest at all (except system |
224 | * call). */ | ||
132 | if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR) | 225 | if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR) |
133 | return 0; | 226 | return 0; |
134 | 227 | ||
135 | /* We intercept page fault (demand shadow paging & cr2 saving) | 228 | /* The Host needs to see page faults (for shadow paging and to save the |
136 | protection fault (in/out emulation) and device not | 229 | * fault address), general protection faults (in/out emulation) and |
137 | available (TS handling), and hypercall */ | 230 | * device not available (TS handling), and of course, the hypercall |
231 | * trap. */ | ||
138 | if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY) | 232 | if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY) |
139 | return 0; | 233 | return 0; |
140 | 234 | ||
141 | /* Interrupt gates (0xE) or not present (0x0) can't go direct. */ | 235 | /* Only trap gates (type 15) can go direct to the Guest. Interrupt |
236 | * gates (type 14) disable interrupts as they are entered, which we | ||
237 | * never let the Guest do. Not present entries (type 0x0) also can't | ||
238 | * go direct, of course 8) */ | ||
142 | return idt_type(trap->a, trap->b) == 0xF; | 239 | return idt_type(trap->a, trap->b) == 0xF; |
143 | } | 240 | } |
144 | 241 | /*:*/ | |
242 | |||
243 | /*M:005 The Guest has the ability to turn its interrupt gates into trap gates, | ||
244 | * if it is careful. The Host will let trap gates go directly to the | ||
245 | * Guest, but the Guest needs the interrupts atomically disabled for an | ||
246 | * interrupt gate. It can do this by pointing the trap gate at instructions | ||
247 | * within noirq_start and noirq_end, where it can safely disable interrupts. */ | ||
248 | |||
249 | /*M:006 The Guests do not use the sysenter (fast system call) instruction, | ||
250 | * because it's hardcoded to enter privilege level 0 and so can't go direct. | ||
251 | * It's about twice as fast as the older "int 0x80" system call, so it might | ||
252 | * still be worthwhile to handle it in the Switcher and lcall down to the | ||
253 | * Guest. The sysenter semantics are hairy tho: search for that keyword in | ||
254 | * entry.S :*/ | ||
255 | |||
256 | /*H:260 When we make traps go directly into the Guest, we need to make sure | ||
257 | * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the | ||
258 | * CPU trying to deliver the trap will fault while trying to push the interrupt | ||
259 | * words on the stack: this is called a double fault, and it forces us to kill | ||
260 | * the Guest. | ||
261 | * | ||
262 | * Which is deeply unfair, because (literally!) it wasn't the Guest's fault. */ | ||
145 | void pin_stack_pages(struct lguest *lg) | 263 | void pin_stack_pages(struct lguest *lg) |
146 | { | 264 | { |
147 | unsigned int i; | 265 | unsigned int i; |
148 | 266 | ||
267 | /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or | ||
268 | * two pages of stack space. */ | ||
149 | for (i = 0; i < lg->stack_pages; i++) | 269 | for (i = 0; i < lg->stack_pages; i++) |
270 | /* The stack grows *downwards*, hence the subtraction */ | ||
150 | pin_page(lg, lg->esp1 - i * PAGE_SIZE); | 271 | pin_page(lg, lg->esp1 - i * PAGE_SIZE); |
151 | } | 272 | } |
152 | 273 | ||
274 | /* Direct traps also mean that we need to know whenever the Guest wants to use | ||
275 | * a different kernel stack, so we can change the IDT entries to use that | ||
276 | * stack. The IDT entries expect a virtual address, so unlike most addresses | ||
277 | * the Guest gives us, the "esp" (stack pointer) value here is virtual, not | ||
278 | * physical. | ||
279 | * | ||
280 | * In Linux each process has its own kernel stack, so this happens a lot: we | ||
281 | * change stacks on each context switch. */ | ||
153 | void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) | 282 | void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) |
154 | { | 283 | { |
155 | /* You cannot have a stack segment with priv level 0. */ | 284 | /* You are not allowed to have a stack segment with privilege level 0: bad |
285 | * Guest! */ | ||
156 | if ((seg & 0x3) != GUEST_PL) | 286 | if ((seg & 0x3) != GUEST_PL) |
157 | kill_guest(lg, "bad stack segment %i", seg); | 287 | kill_guest(lg, "bad stack segment %i", seg); |
288 | /* We only expect one or two stack pages. */ | ||
158 | if (pages > 2) | 289 | if (pages > 2) |
159 | kill_guest(lg, "bad stack pages %u", pages); | 290 | kill_guest(lg, "bad stack pages %u", pages); |
291 | /* Save where the stack is, and how many pages */ | ||
160 | lg->ss1 = seg; | 292 | lg->ss1 = seg; |
161 | lg->esp1 = esp; | 293 | lg->esp1 = esp; |
162 | lg->stack_pages = pages; | 294 | lg->stack_pages = pages; |
295 | /* Make sure the new stack pages are mapped */ | ||
163 | pin_stack_pages(lg); | 296 | pin_stack_pages(lg); |
164 | } | 297 | } |
165 | 298 | ||
166 | /* Set up trap in IDT. */ | 299 | /* All this talk of mapping stacks leads us neatly into the other complex |
300 | * part of the Host: page table handling. */ | ||
301 | |||
302 | /*H:235 This is the routine which actually checks the Guest's IDT entry and | ||
303 | * transfers it into our entry in "struct lguest": */ | ||
167 | static void set_trap(struct lguest *lg, struct desc_struct *trap, | 304 | static void set_trap(struct lguest *lg, struct desc_struct *trap, |
168 | unsigned int num, u32 lo, u32 hi) | 305 | unsigned int num, u32 lo, u32 hi) |
169 | { | 306 | { |
170 | u8 type = idt_type(lo, hi); | 307 | u8 type = idt_type(lo, hi); |
171 | 308 | ||
309 | /* We zero-out a not-present entry */ | ||
172 | if (!idt_present(lo, hi)) { | 310 | if (!idt_present(lo, hi)) { |
173 | trap->a = trap->b = 0; | 311 | trap->a = trap->b = 0; |
174 | return; | 312 | return; |
175 | } | 313 | } |
176 | 314 | ||
315 | /* We only support interrupt and trap gates. */ | ||
177 | if (type != 0xE && type != 0xF) | 316 | if (type != 0xE && type != 0xF) |
178 | kill_guest(lg, "bad IDT type %i", type); | 317 | kill_guest(lg, "bad IDT type %i", type); |
179 | 318 | ||
319 | /* We only copy the handler address, present bit, privilege level and | ||
320 | * type. The privilege level controls where the trap can be triggered | ||
321 | * manually with an "int" instruction. This is usually GUEST_PL, | ||
322 | * except for system calls which userspace can use. */ | ||
180 | trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); | 323 | trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); |
181 | trap->b = (hi&0xFFFFEF00); | 324 | trap->b = (hi&0xFFFFEF00); |
182 | } | 325 | } |
183 | 326 | ||
327 | /*H:230 While we're here, dealing with delivering traps and interrupts to the | ||
328 | * Guest, we might as well complete the picture: how the Guest tells us where | ||
329 | * it wants them to go. This would be simple, except making traps fast | ||
330 | * requires some tricks. | ||
331 | * | ||
332 | * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the | ||
333 | * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ | ||
184 | void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) | 334 | void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) |
185 | { | 335 | { |
186 | /* Guest never handles: NMI, doublefault, hypercall, spurious irq. */ | 336 | /* Guest never handles: NMI, doublefault, spurious interrupt or |
337 | * hypercall. We ignore attempts to set them. */ | ||
187 | if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) | 338 | if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) |
188 | return; | 339 | return; |
189 | 340 | ||
341 | /* Mark the IDT as changed: next time the Guest runs we'll know we have | ||
342 | * to copy this again. */ | ||
190 | lg->changed |= CHANGED_IDT; | 343 | lg->changed |= CHANGED_IDT; |
344 | |||
345 | /* The IDT which we keep in "struct lguest" only contains 32 entries | ||
346 | * for the traps and LGUEST_IRQS (32) entries for interrupts. We | ||
347 | * ignore attempts to set handlers for higher interrupt numbers, except | ||
348 | * for the system call "interrupt" at 128: we have a special IDT entry | ||
349 | * for that. */ | ||
191 | if (num < ARRAY_SIZE(lg->idt)) | 350 | if (num < ARRAY_SIZE(lg->idt)) |
192 | set_trap(lg, &lg->idt[num], num, lo, hi); | 351 | set_trap(lg, &lg->idt[num], num, lo, hi); |
193 | else if (num == SYSCALL_VECTOR) | 352 | else if (num == SYSCALL_VECTOR) |
194 | set_trap(lg, &lg->syscall_idt, num, lo, hi); | 353 | set_trap(lg, &lg->syscall_idt, num, lo, hi); |
195 | } | 354 | } |
196 | 355 | ||
356 | /* The default entry for each interrupt points into the Switcher routines which | ||
357 | * simply return to the Host. The run_guest() loop will then call | ||
358 | * deliver_trap() to bounce it back into the Guest. */ | ||
197 | static void default_idt_entry(struct desc_struct *idt, | 359 | static void default_idt_entry(struct desc_struct *idt, |
198 | int trap, | 360 | int trap, |
199 | const unsigned long handler) | 361 | const unsigned long handler) |
200 | { | 362 | { |
363 | /* A present interrupt gate. */ | ||
201 | u32 flags = 0x8e00; | 364 | u32 flags = 0x8e00; |
202 | 365 | ||
203 | /* They can't "int" into any of them except hypercall. */ | 366 | /* Set the privilege level on the entry for the hypercall: this allows |
367 | * the Guest to use the "int" instruction to trigger it. */ | ||
204 | if (trap == LGUEST_TRAP_ENTRY) | 368 | if (trap == LGUEST_TRAP_ENTRY) |
205 | flags |= (GUEST_PL << 13); | 369 | flags |= (GUEST_PL << 13); |
206 | 370 | ||
371 | /* Now pack it into the IDT entry in its weird format. */ | ||
207 | idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); | 372 | idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); |
208 | idt->b = (handler&0xFFFF0000) | flags; | 373 | idt->b = (handler&0xFFFF0000) | flags; |
209 | } | 374 | } |
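That 0x8e00 unpacks as the present bit (0x8000) plus gate type 0xE, an interrupt gate, in bits 8-11; the privilege level bits (13-14) start at zero until the LGUEST_TRAP_ENTRY case ORs in GUEST_PL.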
210 | 375 | ||
376 | /* When the Guest first starts, we put default entries into the IDT. */ | ||
211 | void setup_default_idt_entries(struct lguest_ro_state *state, | 377 | void setup_default_idt_entries(struct lguest_ro_state *state, |
212 | const unsigned long *def) | 378 | const unsigned long *def) |
213 | { | 379 | { |
@@ -217,19 +383,25 @@ void setup_default_idt_entries(struct lguest_ro_state *state, | |||
217 | default_idt_entry(&state->guest_idt[i], i, def[i]); | 383 | default_idt_entry(&state->guest_idt[i], i, def[i]); |
218 | } | 384 | } |
219 | 385 | ||
386 | /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead | ||
387 | * we copy them into the IDT which we've set up for Guests on this CPU, just | ||
388 | * before we run the Guest. This routine does that copy. */ | ||
220 | void copy_traps(const struct lguest *lg, struct desc_struct *idt, | 389 | void copy_traps(const struct lguest *lg, struct desc_struct *idt, |
221 | const unsigned long *def) | 390 | const unsigned long *def) |
222 | { | 391 | { |
223 | unsigned int i; | 392 | unsigned int i; |
224 | 393 | ||
225 | /* All hardware interrupts are same whatever the guest: only the | 394 | /* We can simply copy the direct traps, otherwise we use the default |
226 | * traps might be different. */ | 395 | * ones in the Switcher: they will return to the Host. */ |
227 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) { | 396 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) { |
228 | if (direct_trap(lg, &lg->idt[i], i)) | 397 | if (direct_trap(lg, &lg->idt[i], i)) |
229 | idt[i] = lg->idt[i]; | 398 | idt[i] = lg->idt[i]; |
230 | else | 399 | else |
231 | default_idt_entry(&idt[i], i, def[i]); | 400 | default_idt_entry(&idt[i], i, def[i]); |
232 | } | 401 | } |
402 | |||
403 | /* Don't forget the system call trap! The IDT entries for other | ||
404 | * interupts never change, so no need to copy them. */ | ||
233 | i = SYSCALL_VECTOR; | 405 | i = SYSCALL_VECTOR; |
234 | if (direct_trap(lg, &lg->syscall_idt, i)) | 406 | if (direct_trap(lg, &lg->syscall_idt, i)) |
235 | idt[i] = lg->syscall_idt; | 407 | idt[i] = lg->syscall_idt; |
diff --git a/drivers/lguest/io.c b/drivers/lguest/io.c index c8eb79266991..ea68613b43f6 100644 --- a/drivers/lguest/io.c +++ b/drivers/lguest/io.c | |||
@@ -1,5 +1,9 @@ | |||
1 | /* Simple I/O model for guests, based on shared memory. | 1 | /*P:300 The I/O mechanism in lguest is simple yet flexible, allowing the Guest |
2 | * Copyright (C) 2006 Rusty Russell IBM Corporation | 2 | * to talk to the Launcher or directly to another Guest. It uses familiar |
3 | * concepts of DMA and interrupts, plus some neat code stolen from | ||
4 | * futexes... :*/ | ||
5 | |||
6 | /* Copyright (C) 2006 Rusty Russell IBM Corporation | ||
3 | * | 7 | * |
4 | * This program is free software; you can redistribute it and/or modify | 8 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License as published by | 9 | * it under the terms of the GNU General Public License as published by |
@@ -23,8 +27,36 @@ | |||
23 | #include <linux/uaccess.h> | 27 | #include <linux/uaccess.h> |
24 | #include "lg.h" | 28 | #include "lg.h" |
25 | 29 | ||
30 | /*L:300 | ||
31 | * I/O | ||
32 | * | ||
33 | * Getting data in and out of the Guest is quite an art. There are numerous | ||
34 | * ways to do it, and they all suck differently. We try to keep things fairly | ||
35 | * close to "real" hardware so our Guest's drivers don't look like an alien | ||
36 | * visitation in the middle of the Linux code, and yet make sure that Guests | ||
37 | * can talk directly to other Guests, not just the Launcher. | ||
38 | * | ||
39 | * To do this, the Guest gives us a key when it binds or sends DMA buffers. | ||
40 | * The key corresponds to a "physical" address inside the Guest (ie. a virtual | ||
41 | * address inside the Launcher process). We don't, however, use this key | ||
42 | * directly. | ||
43 | * | ||
44 | * We want Guests which share memory to be able to DMA to each other: two | ||
45 | * Launchers can mmap the same file, then the Guests can communicate. | ||
46 | * Fortunately, the futex code provides us with a way to get a "union | ||
47 | * futex_key" corresponding to the memory lying at a virtual address: if the | ||
48 | * two processes share memory, the "union futex_key" for that memory will match | ||
49 | * even if the memory is mapped at different addresses in each. So we always | ||
50 | * convert the keys to "union futex_key"s to compare them. | ||
51 | * | ||
52 | * Before we dive into this though, we need to look at another set of helper | ||
53 | * routines used throughout the Host kernel code to access Guest memory. | ||
54 | :*/ | ||
26 | static struct list_head dma_hash[61]; | 55 | static struct list_head dma_hash[61]; |
27 | 56 | ||
57 | /* An unfortunate side effect of the Linux doubly-linked list implementation is | ||
58 | * that there's no good way to statically initialize an array of linked | ||
59 | * lists. */ | ||
28 | void lguest_io_init(void) | 60 | void lguest_io_init(void) |
29 | { | 61 | { |
30 | unsigned int i; | 62 | unsigned int i; |
@@ -56,6 +88,19 @@ kill: | |||
56 | return 0; | 88 | return 0; |
57 | } | 89 | } |
58 | 90 | ||
91 | /*L:330 This is our hash function, using the wonderful Jenkins hash. | ||
92 | * | ||
93 | * The futex key is a union with three parts: an unsigned long word, a pointer, | ||
94 | * and an int "offset". We could use jhash_2words() which takes three u32s. | ||
95 | * (Ok, the hash functions are great: the naming sucks though). | ||
96 | * | ||
97 | * It's nice to be portable to 64-bit platforms, so we use the more generic | ||
98 | * jhash2(), which takes an array of u32, the number of u32s, and an initial | ||
99 | * u32 to roll in. This is uglier, but breaks down to almost the same code on | ||
100 | * 32-bit platforms like this one. | ||
101 | * | ||
102 | * We want a position in the array, so we modulo ARRAY_SIZE(dma_hash) (ie. 61). | ||
103 | */ | ||
59 | static unsigned int hash(const union futex_key *key) | 104 | static unsigned int hash(const union futex_key *key) |
60 | { | 105 | { |
61 | return jhash2((u32*)&key->both.word, | 106 | return jhash2((u32*)&key->both.word, |
@@ -64,6 +109,9 @@ static unsigned int hash(const union futex_key *key) | |||
64 | % ARRAY_SIZE(dma_hash); | 109 | % ARRAY_SIZE(dma_hash); |
65 | } | 110 | } |
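To make the bucketing concrete, here's a userspace sketch of the same "hash the key words, then modulo a prime" step. The mixing function is a trivial stand-in for jhash2(), and the key values are invented:

    #include <stdio.h>
    #include <stdint.h>

    #define HASH_BUCKETS 61  /* prime, like ARRAY_SIZE(dma_hash) */

    /* Illustrative stand-in for jhash2(): any decent mixer works here. */
    static unsigned int mix(const uint32_t *words, unsigned int n)
    {
            unsigned int h = 0;

            while (n--)
                    h = h * 2654435761u + *words++;
            return h;
    }

    int main(void)
    {
            uint32_t key[3] = { 0xdeadbeef, 0x1000, 0x4 }; /* fake key */

            printf("bucket %u of %u\n",
                   mix(key, 3) % HASH_BUCKETS, HASH_BUCKETS);
            return 0;
    }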
66 | 111 | ||
112 | /* This is a convenience routine to compare two keys. It's a much bemoaned C | ||
113 | * weakness that it doesn't allow '==' on structures or unions, so we have to | ||
114 | * open-code it like this. */ | ||
67 | static inline int key_eq(const union futex_key *a, const union futex_key *b) | 115 | static inline int key_eq(const union futex_key *a, const union futex_key *b) |
68 | { | 116 | { |
69 | return (a->both.word == b->both.word | 117 | return (a->both.word == b->both.word |
@@ -71,22 +119,36 @@ static inline int key_eq(const union futex_key *a, const union futex_key *b) | |||
71 | && a->both.offset == b->both.offset); | 119 | && a->both.offset == b->both.offset); |
72 | } | 120 | } |
73 | 121 | ||
74 | /* Must hold read lock on dmainfo owner's current->mm->mmap_sem */ | 122 | /*L:360 OK, when we need to actually free up a Guest's DMA array we do several |
123 | * things, so we have a convenient function to do it. | ||
124 | * | ||
125 | * The caller must hold a read lock on dmainfo owner's current->mm->mmap_sem | ||
126 | * for the drop_futex_key_refs(). */ | ||
75 | static void unlink_dma(struct lguest_dma_info *dmainfo) | 127 | static void unlink_dma(struct lguest_dma_info *dmainfo) |
76 | { | 128 | { |
129 | /* You locked this too, right? */ | ||
77 | BUG_ON(!mutex_is_locked(&lguest_lock)); | 130 | BUG_ON(!mutex_is_locked(&lguest_lock)); |
131 | /* This is how we know that the entry is free. */ | ||
78 | dmainfo->interrupt = 0; | 132 | dmainfo->interrupt = 0; |
133 | /* Remove it from the hash table. */ | ||
79 | list_del(&dmainfo->list); | 134 | list_del(&dmainfo->list); |
135 | /* Drop the references we were holding (to the inode or mm). */ | ||
80 | drop_futex_key_refs(&dmainfo->key); | 136 | drop_futex_key_refs(&dmainfo->key); |
81 | } | 137 | } |
82 | 138 | ||
139 | /*L:350 This is the routine which we call when the Guest asks to unregister a | ||
140 | * DMA array attached to a given key. Returns true if the array was found. */ | ||
83 | static int unbind_dma(struct lguest *lg, | 141 | static int unbind_dma(struct lguest *lg, |
84 | const union futex_key *key, | 142 | const union futex_key *key, |
85 | unsigned long dmas) | 143 | unsigned long dmas) |
86 | { | 144 | { |
87 | int i, ret = 0; | 145 | int i, ret = 0; |
88 | 146 | ||
147 | /* We don't bother with the hash table, just look through all this | ||
148 | * Guest's DMA arrays. */ | ||
89 | for (i = 0; i < LGUEST_MAX_DMA; i++) { | 149 | for (i = 0; i < LGUEST_MAX_DMA; i++) { |
150 | /* In theory it could have more than one array on the same key, | ||
151 | * or one array on multiple keys, so we check both */ | ||
90 | if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) { | 152 | if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) { |
91 | unlink_dma(&lg->dma[i]); | 153 | unlink_dma(&lg->dma[i]); |
92 | ret = 1; | 154 | ret = 1; |
@@ -96,51 +158,91 @@ static int unbind_dma(struct lguest *lg, | |||
96 | return ret; | 158 | return ret; |
97 | } | 159 | } |
98 | 160 | ||
161 | /*L:340 BIND_DMA: this is the hypercall which sets up an array of "struct | ||
162 | * lguest_dma" for receiving I/O. | ||
163 | * | ||
164 | * The Guest wants to bind an array of "struct lguest_dma"s to a particular key | ||
165 | * to receive input. This only happens when the Guest is setting up a new | ||
166 | * device, so it doesn't have to be very fast. | ||
167 | * | ||
168 | * It returns 1 on a successful registration (it can fail if we hit the limit | ||
169 | * of registrations for this Guest). | ||
170 | */ | ||
99 | int bind_dma(struct lguest *lg, | 171 | int bind_dma(struct lguest *lg, |
100 | unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt) | 172 | unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt) |
101 | { | 173 | { |
102 | unsigned int i; | 174 | unsigned int i; |
103 | int ret = 0; | 175 | int ret = 0; |
104 | union futex_key key; | 176 | union futex_key key; |
177 | /* Futex code needs the mmap_sem. */ | ||
105 | struct rw_semaphore *fshared = ¤t->mm->mmap_sem; | 178 | struct rw_semaphore *fshared = ¤t->mm->mmap_sem; |
106 | 179 | ||
180 | /* Invalid interrupt? (We could kill the guest here). */ | ||
107 | if (interrupt >= LGUEST_IRQS) | 181 | if (interrupt >= LGUEST_IRQS) |
108 | return 0; | 182 | return 0; |
109 | 183 | ||
184 | /* We need to grab the Big Lguest Lock, because other Guests may be | ||
185 | * trying to look through this Guest's DMAs to send something while | ||
186 | * we're doing this. */ | ||
110 | mutex_lock(&lguest_lock); | 187 | mutex_lock(&lguest_lock); |
111 | down_read(fshared); | 188 | down_read(fshared); |
112 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { | 189 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { |
113 | kill_guest(lg, "bad dma key %#lx", ukey); | 190 | kill_guest(lg, "bad dma key %#lx", ukey); |
114 | goto unlock; | 191 | goto unlock; |
115 | } | 192 | } |
193 | |||
194 | /* We want to keep this key valid once we drop mmap_sem, so we have to | ||
195 | * hold a reference. */ | ||
116 | get_futex_key_refs(&key); | 196 | get_futex_key_refs(&key); |
117 | 197 | ||
198 | /* If the Guest specified an interrupt of 0, that means they want to | ||
199 | * unregister this array of "struct lguest_dma"s. */ | ||
118 | if (interrupt == 0) | 200 | if (interrupt == 0) |
119 | ret = unbind_dma(lg, &key, dmas); | 201 | ret = unbind_dma(lg, &key, dmas); |
120 | else { | 202 | else { |
203 | /* Look through this Guest's dma array for an unused entry. */ | ||
121 | for (i = 0; i < LGUEST_MAX_DMA; i++) { | 204 | for (i = 0; i < LGUEST_MAX_DMA; i++) { |
205 | /* If the interrupt is non-zero, the entry is already | ||
206 | * used. */ | ||
122 | if (lg->dma[i].interrupt) | 207 | if (lg->dma[i].interrupt) |
123 | continue; | 208 | continue; |
124 | 209 | ||
210 | /* OK, a free one! Fill in our details. */ | ||
125 | lg->dma[i].dmas = dmas; | 211 | lg->dma[i].dmas = dmas; |
126 | lg->dma[i].num_dmas = numdmas; | 212 | lg->dma[i].num_dmas = numdmas; |
127 | lg->dma[i].next_dma = 0; | 213 | lg->dma[i].next_dma = 0; |
128 | lg->dma[i].key = key; | 214 | lg->dma[i].key = key; |
129 | lg->dma[i].guestid = lg->guestid; | 215 | lg->dma[i].guestid = lg->guestid; |
130 | lg->dma[i].interrupt = interrupt; | 216 | lg->dma[i].interrupt = interrupt; |
217 | |||
218 | /* Now we add it to the hash table: the position | ||
219 | * depends on the futex key that we got. */ | ||
131 | list_add(&lg->dma[i].list, &dma_hash[hash(&key)]); | 220 | list_add(&lg->dma[i].list, &dma_hash[hash(&key)]); |
221 | /* Success! */ | ||
132 | ret = 1; | 222 | ret = 1; |
133 | goto unlock; | 223 | goto unlock; |
134 | } | 224 | } |
135 | } | 225 | } |
226 | /* If we didn't find a slot to put the key in, drop the reference | ||
227 | * again. */ | ||
136 | drop_futex_key_refs(&key); | 228 | drop_futex_key_refs(&key); |
137 | unlock: | 229 | unlock: |
230 | /* Unlock and out. */ | ||
138 | up_read(fshared); | 231 | up_read(fshared); |
139 | mutex_unlock(&lguest_lock); | 232 | mutex_unlock(&lguest_lock); |
140 | return ret; | 233 | return ret; |
141 | } | 234 | } |
142 | 235 | ||
143 | /* lgread from another guest */ | 236 | /*L:385 Note that our routines to access a different Guest's memory are called |
237 | * lgread_other() and lgwrite_other(): these names emphasize that they are only | ||
238 | * used when the Guest is *not* the current Guest. | ||
239 | * | ||
240 | * The interface for copying from another process's memory is called | ||
241 | * access_process_vm(), with a final argument of 0 for a read, and 1 for a | ||
242 | * write. | ||
243 | * | ||
244 | * We need lgread_other() to read the destination Guest's "struct lguest_dma" | ||
245 | * array. */ | ||
144 | static int lgread_other(struct lguest *lg, | 246 | static int lgread_other(struct lguest *lg, |
145 | void *buf, u32 addr, unsigned bytes) | 247 | void *buf, u32 addr, unsigned bytes) |
146 | { | 248 | { |
@@ -153,7 +255,8 @@ static int lgread_other(struct lguest *lg, | |||
153 | return 1; | 255 | return 1; |
154 | } | 256 | } |
155 | 257 | ||
156 | /* lgwrite to another guest */ | 258 | /* "lgwrite()" to another Guest: used to update the destination "used_len" once |
259 | * we've transferred data into the buffer. */ | ||
157 | static int lgwrite_other(struct lguest *lg, u32 addr, | 260 | static int lgwrite_other(struct lguest *lg, u32 addr, |
158 | const void *buf, unsigned bytes) | 261 | const void *buf, unsigned bytes) |
159 | { | 262 | { |
@@ -166,6 +269,15 @@ static int lgwrite_other(struct lguest *lg, u32 addr, | |||
166 | return 1; | 269 | return 1; |
167 | } | 270 | } |
168 | 271 | ||
272 | /*L:400 This is the generic engine which copies from a source "struct | ||
273 | * lguest_dma" from this Guest into another Guest's "struct lguest_dma". The | ||
274 | * destination Guest's pages have already been mapped, as contained in the | ||
275 | * pages array. | ||
276 | * | ||
277 | * If you're wondering if there's a nice "copy from one process to another" | ||
278 | * routine, so was I. But Linux isn't really set up to copy between two | ||
279 | * unrelated processes, so we have to write it ourselves. | ||
280 | */ | ||
169 | static u32 copy_data(struct lguest *srclg, | 281 | static u32 copy_data(struct lguest *srclg, |
170 | const struct lguest_dma *src, | 282 | const struct lguest_dma *src, |
171 | const struct lguest_dma *dst, | 283 | const struct lguest_dma *dst, |
@@ -174,33 +286,59 @@ static u32 copy_data(struct lguest *srclg, | |||
174 | unsigned int totlen, si, di, srcoff, dstoff; | 286 | unsigned int totlen, si, di, srcoff, dstoff; |
175 | void *maddr = NULL; | 287 | void *maddr = NULL; |
176 | 288 | ||
289 | /* We return the total length transferred. */ | ||
177 | totlen = 0; | 290 | totlen = 0; |
291 | |||
292 | /* We keep indexes into the source and destination "struct lguest_dma", | ||
293 | * and an offset within each region. */ | ||
178 | si = di = 0; | 294 | si = di = 0; |
179 | srcoff = dstoff = 0; | 295 | srcoff = dstoff = 0; |
296 | |||
297 | /* We loop until the source or destination is exhausted. */ | ||
180 | while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si] | 298 | while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si] |
181 | && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) { | 299 | && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) { |
300 | /* We can only transfer the rest of the src buffer, or as much | ||
301 | * as will fit into the destination buffer. */ | ||
182 | u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff); | 302 | u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff); |
183 | 303 | ||
304 | /* For systems using "highmem" we need to use kmap() to access | ||
305 | * the page we want. We often use the same page over and over, | ||
306 | * so rather than kmap() it on every loop, we set the maddr | ||
307 | * pointer to NULL when we need to move to the next | ||
308 | * destination page. */ | ||
184 | if (!maddr) | 309 | if (!maddr) |
185 | maddr = kmap(pages[di]); | 310 | maddr = kmap(pages[di]); |
186 | 311 | ||
187 | /* FIXME: This is not completely portable, since | 312 | /* Copy directly from (this Guest's) source address to the |
188 | archs do different things for copy_to_user_page. */ | 313 | * destination Guest's kmap()ed buffer. Note that maddr points |
314 | * to the start of the page: we need to add the offset of the | ||
315 | * destination address and offset within the buffer. */ | ||
316 | |||
317 | /* FIXME: This is not completely portable. I looked at | ||
318 | * copy_to_user_page(), and some archs seem to need special | ||
319 | * flushes. x86 is fine. */ | ||
189 | if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE, | 320 | if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE, |
190 | (void __user *)src->addr[si], len) != 0) { | 321 | (void __user *)src->addr[si], len) != 0) { |
322 | /* If a copy failed, it's the source's fault. */ | ||
191 | kill_guest(srclg, "bad address in sending DMA"); | 323 | kill_guest(srclg, "bad address in sending DMA"); |
192 | totlen = 0; | 324 | totlen = 0; |
193 | break; | 325 | break; |
194 | } | 326 | } |
195 | 327 | ||
328 | /* Increment the total and src & dst offsets */ | ||
196 | totlen += len; | 329 | totlen += len; |
197 | srcoff += len; | 330 | srcoff += len; |
198 | dstoff += len; | 331 | dstoff += len; |
332 | |||
333 | /* Presumably we reached the end of the src or dest buffers: */ | ||
199 | if (srcoff == src->len[si]) { | 334 | if (srcoff == src->len[si]) { |
335 | /* Move to the next buffer at offset 0 */ | ||
200 | si++; | 336 | si++; |
201 | srcoff = 0; | 337 | srcoff = 0; |
202 | } | 338 | } |
203 | if (dstoff == dst->len[di]) { | 339 | if (dstoff == dst->len[di]) { |
340 | /* We need to unmap that destination page and reset | ||
341 | * maddr ready for the next one. */ | ||
204 | kunmap(pages[di]); | 342 | kunmap(pages[di]); |
205 | maddr = NULL; | 343 | maddr = NULL; |
206 | di++; | 344 | di++; |
@@ -208,13 +346,15 @@ static u32 copy_data(struct lguest *srclg, | |||
208 | } | 346 | } |
209 | } | 347 | } |
210 | 348 | ||
349 | /* If we still had a page mapped at the end, unmap now. */ | ||
211 | if (maddr) | 350 | if (maddr) |
212 | kunmap(pages[di]); | 351 | kunmap(pages[di]); |
213 | 352 | ||
214 | return totlen; | 353 | return totlen; |
215 | } | 354 | } |
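The walk through the two scatter-gather lists is easier to see without the kmap() and copy_from_user() machinery. This is a userspace sketch of the same dual-cursor loop, assuming plain memory on both sides (the "struct sg" here is a stand-in for "struct lguest_dma"):

    #include <stdio.h>
    #include <string.h>

    #define MAX_SECTIONS 4

    struct sg {                     /* stand-in for "struct lguest_dma" */
            char *addr[MAX_SECTIONS];
            unsigned int len[MAX_SECTIONS];
    };

    static unsigned int sg_copy(const struct sg *src, const struct sg *dst)
    {
            unsigned int si = 0, di = 0, srcoff = 0, dstoff = 0, totlen = 0;

            /* Loop until the source or destination is exhausted. */
            while (si < MAX_SECTIONS && src->len[si]
                   && di < MAX_SECTIONS && dst->len[di]) {
                    /* Rest of this src section, or room left in dst. */
                    unsigned int len = src->len[si] - srcoff;

                    if (dst->len[di] - dstoff < len)
                            len = dst->len[di] - dstoff;

                    memcpy(dst->addr[di] + dstoff,
                           src->addr[si] + srcoff, len);
                    totlen += len;
                    srcoff += len;
                    dstoff += len;
                    if (srcoff == src->len[si]) { si++; srcoff = 0; }
                    if (dstoff == dst->len[di]) { di++; dstoff = 0; }
            }
            return totlen;
    }

    int main(void)
    {
            char out[16] = "0123456789abcdef", in[16];
            struct sg src = { { out, out + 8 }, { 8, 8 } };
            struct sg dst = { { in }, { sizeof(in) } };

            printf("copied %u bytes: %.16s\n", sg_copy(&src, &dst), in);
            return 0;
    }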
216 | 355 | ||
217 | /* Src is us, ie. current. */ | 356 | /*L:390 This is how we transfer a "struct lguest_dma" from the source Guest |
357 | * (the current Guest which called SEND_DMA) to another Guest. */ | ||
218 | static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, | 358 | static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, |
219 | struct lguest *dstlg, const struct lguest_dma *dst) | 359 | struct lguest *dstlg, const struct lguest_dma *dst) |
220 | { | 360 | { |
@@ -222,23 +362,31 @@ static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, | |||
222 | u32 ret; | 362 | u32 ret; |
223 | struct page *pages[LGUEST_MAX_DMA_SECTIONS]; | 363 | struct page *pages[LGUEST_MAX_DMA_SECTIONS]; |
224 | 364 | ||
365 | /* We check that both source and destination "struct lguest_dma"s are | ||
366 | * within the bounds of the source and destination Guests */ | ||
225 | if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src)) | 367 | if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src)) |
226 | return 0; | 368 | return 0; |
227 | 369 | ||
228 | /* First get the destination pages */ | 370 | /* We need to map the pages which correspond to each part of the |
371 | * destination buffer. */ |
229 | for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { | 372 | for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { |
230 | if (dst->len[i] == 0) | 373 | if (dst->len[i] == 0) |
231 | break; | 374 | break; |
375 | /* get_user_pages() is a complicated function, especially since | ||
376 | * we only want a single page. But it works, and returns the | ||
377 | * number of pages. Note that we're holding the destination's | ||
378 | * mmap_sem, as get_user_pages() requires. */ | ||
232 | if (get_user_pages(dstlg->tsk, dstlg->mm, | 379 | if (get_user_pages(dstlg->tsk, dstlg->mm, |
233 | dst->addr[i], 1, 1, 1, pages+i, NULL) | 380 | dst->addr[i], 1, 1, 1, pages+i, NULL) |
234 | != 1) { | 381 | != 1) { |
382 | /* This means the destination gave us a bogus buffer */ | ||
235 | kill_guest(dstlg, "Error mapping DMA pages"); | 383 | kill_guest(dstlg, "Error mapping DMA pages"); |
236 | ret = 0; | 384 | ret = 0; |
237 | goto drop_pages; | 385 | goto drop_pages; |
238 | } | 386 | } |
239 | } | 387 | } |
240 | 388 | ||
241 | /* Now copy until we run out of src or dst. */ | 389 | /* Now copy the data until we run out of src or dst. */ |
242 | ret = copy_data(srclg, src, dst, pages); | 390 | ret = copy_data(srclg, src, dst, pages); |
243 | 391 | ||
244 | drop_pages: | 392 | drop_pages: |
@@ -247,6 +395,11 @@ drop_pages: | |||
247 | return ret; | 395 | return ret; |
248 | } | 396 | } |
249 | 397 | ||
398 | /*L:380 Transferring data from one Guest to another is not as simple as I'd | ||
399 | * like. Once we've found the "struct lguest_dma_info" bound to the same | ||
400 | * address as the send, we need to copy into it. | ||
401 | * | ||
402 | * This function returns true if the destination array was empty. */ | ||
250 | static int dma_transfer(struct lguest *srclg, | 403 | static int dma_transfer(struct lguest *srclg, |
251 | unsigned long udma, | 404 | unsigned long udma, |
252 | struct lguest_dma_info *dst) | 405 | struct lguest_dma_info *dst) |
@@ -255,15 +408,23 @@ static int dma_transfer(struct lguest *srclg, | |||
255 | struct lguest *dstlg; | 408 | struct lguest *dstlg; |
256 | u32 i, dma = 0; | 409 | u32 i, dma = 0; |
257 | 410 | ||
411 | /* From the "struct lguest_dma_info" we found in the hash, grab the | ||
412 | * Guest. */ | ||
258 | dstlg = &lguests[dst->guestid]; | 413 | dstlg = &lguests[dst->guestid]; |
259 | /* Get our dma list. */ | 414 | /* Read in the source "struct lguest_dma" handed to SEND_DMA. */ |
260 | lgread(srclg, &src_dma, udma, sizeof(src_dma)); | 415 | lgread(srclg, &src_dma, udma, sizeof(src_dma)); |
261 | 416 | ||
262 | /* We can't deadlock against them dmaing to us, because this | 417 | /* We need the destination's mmap_sem, and we already hold the source's |
263 | * is all under the lguest_lock. */ | 418 | * mmap_sem for the futex key lookup. Normally this would suggest that |
419 | * we could deadlock if the destination Guest was trying to send to | ||
420 | * this source Guest at the same time, which is another reason that all | ||
421 | * I/O is done under the big lguest_lock. */ | ||
264 | down_read(&dstlg->mm->mmap_sem); | 422 | down_read(&dstlg->mm->mmap_sem); |
265 | 423 | ||
424 | /* Look through the destination DMA array for an available buffer. */ | ||
266 | for (i = 0; i < dst->num_dmas; i++) { | 425 | for (i = 0; i < dst->num_dmas; i++) { |
426 | /* We keep a "next_dma" pointer which often helps us avoid | ||
427 | * looking at lots of previously-filled entries. */ | ||
267 | dma = (dst->next_dma + i) % dst->num_dmas; | 428 | dma = (dst->next_dma + i) % dst->num_dmas; |
268 | if (!lgread_other(dstlg, &dst_dma, | 429 | if (!lgread_other(dstlg, &dst_dma, |
269 | dst->dmas + dma * sizeof(struct lguest_dma), | 430 | dst->dmas + dma * sizeof(struct lguest_dma), |
@@ -273,30 +434,46 @@ static int dma_transfer(struct lguest *srclg, | |||
273 | if (!dst_dma.used_len) | 434 | if (!dst_dma.used_len) |
274 | break; | 435 | break; |
275 | } | 436 | } |
437 | |||
438 | /* If we found a buffer, we do the actual data copy. */ | ||
276 | if (i != dst->num_dmas) { | 439 | if (i != dst->num_dmas) { |
277 | unsigned long used_lenp; | 440 | unsigned long used_lenp; |
278 | unsigned int ret; | 441 | unsigned int ret; |
279 | 442 | ||
280 | ret = do_dma(srclg, &src_dma, dstlg, &dst_dma); | 443 | ret = do_dma(srclg, &src_dma, dstlg, &dst_dma); |
281 | /* Put used length in src. */ | 444 | /* Put used length in the source "struct lguest_dma"'s used_len |
445 | * field. It's a little tricky to figure out where that is, | ||
446 | * though. */ | ||
282 | lgwrite_u32(srclg, | 447 | lgwrite_u32(srclg, |
283 | udma+offsetof(struct lguest_dma, used_len), ret); | 448 | udma+offsetof(struct lguest_dma, used_len), ret); |
449 | * Transferring 0 bytes is OK if the source buffer was empty. */ | ||
284 | if (ret == 0 && src_dma.len[0] != 0) | 450 | if (ret == 0 && src_dma.len[0] != 0) |
285 | goto fail; | 451 | goto fail; |
286 | 452 | ||
287 | /* Make sure destination sees contents before length. */ | 453 | /* The destination Guest might be running on a different CPU: |
454 | * we have to make sure that it will see the "used_len" field | ||
455 | * change to non-zero *after* it sees the data we copied into | ||
456 | * the buffer. Hence a write memory barrier. */ | ||
288 | wmb(); | 457 | wmb(); |
458 | /* Figuring out where the destination's used_len field for this | ||
459 | * "struct lguest_dma" in the array is also a little ugly. */ | ||
289 | used_lenp = dst->dmas | 460 | used_lenp = dst->dmas |
290 | + dma * sizeof(struct lguest_dma) | 461 | + dma * sizeof(struct lguest_dma) |
291 | + offsetof(struct lguest_dma, used_len); | 462 | + offsetof(struct lguest_dma, used_len); |
292 | lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret)); | 463 | lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret)); |
464 | /* Move the cursor for next time. */ | ||
293 | dst->next_dma++; | 465 | dst->next_dma++; |
294 | } | 466 | } |
295 | up_read(&dstlg->mm->mmap_sem); | 467 | up_read(&dstlg->mm->mmap_sem); |
296 | 468 | ||
297 | /* Do this last so dst doesn't simply sleep on lock. */ | 469 | /* We trigger the destination interrupt, even if the destination was |
470 | * empty and we didn't transfer anything: this gives them a chance to | ||
471 | * wake up and refill. */ | ||
298 | set_bit(dst->interrupt, dstlg->irqs_pending); | 472 | set_bit(dst->interrupt, dstlg->irqs_pending); |
473 | /* Wake up the destination process. */ | ||
299 | wake_up_process(dstlg->tsk); | 474 | wake_up_process(dstlg->tsk); |
475 | /* If we passed the last "struct lguest_dma", the receive had no | ||
476 | * buffers left. */ | ||
300 | return i == dst->num_dmas; | 477 | return i == dst->num_dmas; |
301 | 478 | ||
302 | fail: | 479 | fail: |
@@ -304,6 +481,8 @@ fail: | |||
304 | return 0; | 481 | return 0; |
305 | } | 482 | } |
306 | 483 | ||
484 | /*L:370 This is the counter-side to the BIND_DMA hypercall: the SEND_DMA | ||
485 | * hypercall. We find out who's listening, and send to them. */ | ||
307 | void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma) | 486 | void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma) |
308 | { | 487 | { |
309 | union futex_key key; | 488 | union futex_key key; |
@@ -313,31 +492,43 @@ void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma) | |||
313 | again: | 492 | again: |
314 | mutex_lock(&lguest_lock); | 493 | mutex_lock(&lguest_lock); |
315 | down_read(fshared); | 494 | down_read(fshared); |
495 | /* Get the futex key for the key the Guest gave us */ | ||
316 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { | 496 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { |
317 | kill_guest(lg, "bad sending DMA key"); | 497 | kill_guest(lg, "bad sending DMA key"); |
318 | goto unlock; | 498 | goto unlock; |
319 | } | 499 | } |
320 | /* Shared mapping? Look for other guests... */ | 500 | /* Since the key must be a multiple of 4, the futex key uses the lower |
501 | * bit of the "offset" field (which would always be 0) to indicate a | ||
502 | * mapping which is shared with other processes (ie. Guests). */ | ||
321 | if (key.shared.offset & 1) { | 503 | if (key.shared.offset & 1) { |
322 | struct lguest_dma_info *i; | 504 | struct lguest_dma_info *i; |
505 | /* Look through the hash for other Guests. */ | ||
323 | list_for_each_entry(i, &dma_hash[hash(&key)], list) { | 506 | list_for_each_entry(i, &dma_hash[hash(&key)], list) { |
507 | /* Don't send to ourselves. */ | ||
324 | if (i->guestid == lg->guestid) | 508 | if (i->guestid == lg->guestid) |
325 | continue; | 509 | continue; |
326 | if (!key_eq(&key, &i->key)) | 510 | if (!key_eq(&key, &i->key)) |
327 | continue; | 511 | continue; |
328 | 512 | ||
513 | /* If dma_transfer() tells us the destination has no | ||
514 | * available buffers, we increment "empty". */ | ||
329 | empty += dma_transfer(lg, udma, i); | 515 | empty += dma_transfer(lg, udma, i); |
330 | break; | 516 | break; |
331 | } | 517 | } |
518 | /* If the destination is empty, we release our locks and | ||
519 | * give the destination Guest a brief chance to restock. */ | ||
332 | if (empty == 1) { | 520 | if (empty == 1) { |
333 | /* Give any recipients one chance to restock. */ | 521 | /* Give any recipients one chance to restock. */ |
334 | up_read(¤t->mm->mmap_sem); | 522 | up_read(¤t->mm->mmap_sem); |
335 | mutex_unlock(&lguest_lock); | 523 | mutex_unlock(&lguest_lock); |
524 | /* Next time, we won't try again. */ | ||
336 | empty++; | 525 | empty++; |
337 | goto again; | 526 | goto again; |
338 | } | 527 | } |
339 | } else { | 528 | } else { |
340 | /* Private mapping: tell our userspace. */ | 529 | /* Private mapping: Guest is sending to its Launcher. We set |
530 | * the "dma_is_pending" flag so that the main loop will exit | ||
531 | * and the Launcher's read() from /dev/lguest will return. */ | ||
341 | lg->dma_is_pending = 1; | 532 | lg->dma_is_pending = 1; |
342 | lg->pending_dma = udma; | 533 | lg->pending_dma = udma; |
343 | lg->pending_key = ukey; | 534 | lg->pending_key = ukey; |
@@ -346,6 +537,7 @@ unlock: | |||
346 | up_read(fshared); | 537 | up_read(fshared); |
347 | mutex_unlock(&lguest_lock); | 538 | mutex_unlock(&lguest_lock); |
348 | } | 539 | } |
540 | /*:*/ | ||
349 | 541 | ||
350 | void release_all_dma(struct lguest *lg) | 542 | void release_all_dma(struct lguest *lg) |
351 | { | 543 | { |
@@ -361,7 +553,18 @@ void release_all_dma(struct lguest *lg) | |||
361 | up_read(&lg->mm->mmap_sem); | 553 | up_read(&lg->mm->mmap_sem); |
362 | } | 554 | } |
363 | 555 | ||
364 | /* Userspace wants a dma buffer from this guest. */ | 556 | /*M:007 We only return a single DMA buffer to the Launcher, but it would be |
557 | * more efficient to return a pointer to the entire array of DMA buffers, | ||
558 | * which it could cache, choosing one whenever it wants. | ||
559 | * | ||
560 | * Currently the Launcher uses a write to /dev/lguest, and the return value is | ||
561 | * the address of the DMA structure with the interrupt number placed in | ||
562 | * dma->used_len. If we wanted to return the entire array, we would need to | ||
563 | * return the address, array size and interrupt number: this seems to require | ||
564 | * an ioctl(). :*/ | ||
565 | |||
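For illustration, here is roughly what that ioctl() interface might look like.
This is only a sketch: LHGET_DMA_ARRAY and the struct name are invented here,
not part of lguest.

        #include <linux/ioctl.h>

        /* Hypothetical: everything the Launcher would need to cache the
         * whole array instead of asking for one buffer at a time. */
        struct lguest_dma_array_info {
                unsigned long dmas;      /* Guest address of the DMA array */
                unsigned int num_dmas;   /* number of entries in the array */
                unsigned char interrupt; /* interrupt the Guest asked for */
        };
        #define LHGET_DMA_ARRAY _IOWR('L', 0, struct lguest_dma_array_info)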
566 | /*L:320 This routine looks for a DMA buffer registered by the Guest on the | ||
567 | * given key (using the BIND_DMA hypercall). */ | ||
365 | unsigned long get_dma_buffer(struct lguest *lg, | 568 | unsigned long get_dma_buffer(struct lguest *lg, |
366 | unsigned long ukey, unsigned long *interrupt) | 569 | unsigned long ukey, unsigned long *interrupt) |
367 | { | 570 | { |
@@ -370,15 +573,29 @@ unsigned long get_dma_buffer(struct lguest *lg, | |||
370 | struct lguest_dma_info *i; | 573 | struct lguest_dma_info *i; |
371 | struct rw_semaphore *fshared = ¤t->mm->mmap_sem; | 574 | struct rw_semaphore *fshared = ¤t->mm->mmap_sem; |
372 | 575 | ||
576 | /* Take the Big Lguest Lock to stop other Guests sending this Guest DMA | ||
577 | * at the same time. */ | ||
373 | mutex_lock(&lguest_lock); | 578 | mutex_lock(&lguest_lock); |
579 | /* To match between Guests sharing the same underlying memory we steal | ||
580 | * code from the futex infrastructure. This requires that we hold the | ||
581 | * "mmap_sem" for our process (the Launcher), and pass it to the futex | ||
582 | * code. */ | ||
374 | down_read(fshared); | 583 | down_read(fshared); |
584 | |||
585 | /* This can fail if it's not a valid address, or if the address is not | ||
586 | * divisible by 4 (the futex code needs that; we don't, really). */ | ||
375 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { | 587 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { |
376 | kill_guest(lg, "bad registered DMA buffer"); | 588 | kill_guest(lg, "bad registered DMA buffer"); |
377 | goto unlock; | 589 | goto unlock; |
378 | } | 590 | } |
591 | /* Search the hash table for matching entries (the Launcher can only | ||
592 | * send to its own Guest for the moment, so the entry must be for this | ||
593 | * Guest). */ | ||
379 | list_for_each_entry(i, &dma_hash[hash(&key)], list) { | 594 | list_for_each_entry(i, &dma_hash[hash(&key)], list) { |
380 | if (key_eq(&key, &i->key) && i->guestid == lg->guestid) { | 595 | if (key_eq(&key, &i->key) && i->guestid == lg->guestid) { |
381 | unsigned int j; | 596 | unsigned int j; |
597 | /* Look through the registered DMA array for an | ||
598 | * available buffer. */ | ||
382 | for (j = 0; j < i->num_dmas; j++) { | 599 | for (j = 0; j < i->num_dmas; j++) { |
383 | struct lguest_dma dma; | 600 | struct lguest_dma dma; |
384 | 601 | ||
@@ -387,6 +604,8 @@ unsigned long get_dma_buffer(struct lguest *lg, | |||
387 | if (dma.used_len == 0) | 604 | if (dma.used_len == 0) |
388 | break; | 605 | break; |
389 | } | 606 | } |
607 | /* Store the interrupt the Guest wants when the buffer | ||
608 | * is used. */ | ||
390 | *interrupt = i->interrupt; | 609 | *interrupt = i->interrupt; |
391 | break; | 610 | break; |
392 | } | 611 | } |
@@ -396,4 +615,12 @@ unlock: | |||
396 | mutex_unlock(&lguest_lock); | 615 | mutex_unlock(&lguest_lock); |
397 | return ret; | 616 | return ret; |
398 | } | 617 | } |
618 | /*:*/ | ||
399 | 619 | ||
620 | /*L:410 That really does complete the Launcher. Not only have we now finished | ||
621 | * the longest chapter in our journey, but this also means we are over halfway | ||
622 | * through! | ||
623 | * | ||
624 | * Enough prevaricating around the bush: it is time for us to dive into the | ||
625 | * core of the Host, in "make Host". | ||
626 | */ | ||
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 3e2ddfbc816e..64f0abed317c 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h | |||
@@ -58,9 +58,18 @@ struct lguest_dma_info | |||
58 | u8 interrupt; /* 0 when not registered */ | 58 | u8 interrupt; /* 0 when not registered */ |
59 | }; | 59 | }; |
60 | 60 | ||
61 | /* We have separate types for the guest's ptes & pgds and the shadow ptes & | 61 | /*H:310 The page-table code owes a great debt of gratitude to Andi Kleen. He |
62 | * pgds. Since this host might use three-level pagetables and the guest and | 62 | * reviewed the original code which used "u32" for all page table entries, and |
63 | * shadow pagetables don't, we can't use the normal pte_t/pgd_t. */ | 63 | * insisted that it would be far clearer with explicit typing. I thought it |
64 | * was overkill, but he was right: it is much clearer than it was before. | ||
65 | * | ||
66 | * We have separate types for the Guest's ptes & pgds and the shadow ptes & | ||
67 | * pgds. There's already a Linux type for these (pte_t and pgd_t) but they | ||
68 | * change depending on kernel config options (PAE). */ | ||
69 | |||
70 | /* Each entry is identical: lower 12 bits of flags and upper 20 bits for the | ||
71 | * "page frame number" (0 == first physical page, etc). They are different | ||
72 | * types so the compiler will warn us if we mix them improperly. */ | ||
64 | typedef union { | 73 | typedef union { |
65 | struct { unsigned flags:12, pfn:20; }; | 74 | struct { unsigned flags:12, pfn:20; }; |
66 | struct { unsigned long val; } raw; | 75 | struct { unsigned long val; } raw; |
@@ -77,8 +86,12 @@ typedef union { | |||
77 | struct { unsigned flags:12, pfn:20; }; | 86 | struct { unsigned flags:12, pfn:20; }; |
78 | struct { unsigned long val; } raw; | 87 | struct { unsigned long val; } raw; |
79 | } gpte_t; | 88 | } gpte_t; |
89 | |||
90 | /* We have two convenient macros to convert a "raw" value as handed to us by | ||
91 | * the Guest into the correct Guest PGD or PTE type. */ | ||
80 | #define mkgpte(_val) ((gpte_t){.raw.val = _val}) | 92 | #define mkgpte(_val) ((gpte_t){.raw.val = _val}) |
81 | #define mkgpgd(_val) ((gpgd_t){.raw.val = _val}) | 93 | #define mkgpgd(_val) ((gpgd_t){.raw.val = _val}) |
94 | /*:*/ | ||
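To make that layout concrete, here is a quick illustration (a sketch, not part
of lg.h) of how one raw value splits into the two bitfields; it can live
inside any function:

        /* 0x00102067 splits into pfn 0x102 (the page at physical address
         * 0x102000) and flags 0x067 (PRESENT|RW|USER|ACCESSED|DIRTY). */
        gpte_t gpte = mkgpte(0x00102067UL);
        BUG_ON(gpte.pfn != 0x102 || gpte.flags != 0x067);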
82 | 95 | ||
83 | struct pgdir | 96 | struct pgdir |
84 | { | 97 | { |
@@ -243,7 +256,32 @@ unsigned long get_dma_buffer(struct lguest *lg, unsigned long key, | |||
243 | 256 | ||
244 | /* hypercalls.c: */ | 257 | /* hypercalls.c: */ |
245 | void do_hypercalls(struct lguest *lg); | 258 | void do_hypercalls(struct lguest *lg); |
246 | 259 | void write_timestamp(struct lguest *lg); | |
260 | |||
261 | /*L:035 | ||
262 | * Let's step aside for the moment, to study one important routine that's used | ||
263 | * widely in the Host code. | ||
264 | * | ||
265 | * There are many cases where the Guest does something invalid, like pass crap | ||
266 | * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite | ||
267 | * acceptable to simply terminate the Guest and give the Launcher a nicely | ||
268 | * formatted reason. It's also simpler for the Guest itself, which doesn't | ||
269 | * need to check most hypercalls for "success"; if you're still running, it | ||
270 | * succeeded. | ||
271 | * | ||
272 | * Once this is called, the Guest will never run again, so most Host code can | ||
273 | * call this then continue as if nothing had happened. This means many | ||
274 | * functions don't have to explicitly return an error code, which keeps the | ||
275 | * code simple. | ||
276 | * | ||
277 | * It also means that this can be called more than once: only the first one is | ||
278 | * remembered. The only trick is that we still need to kill the Guest even if | ||
279 | * we can't allocate memory to store the reason. Linux has a neat way of | ||
280 | * packing error codes into invalid pointers, so we use that here. | ||
281 | * | ||
282 | * Like any macro which uses an "if", it is safely wrapped in a run-once "do { | ||
283 | * } while(0)". | ||
284 | */ | ||
247 | #define kill_guest(lg, fmt...) \ | 285 | #define kill_guest(lg, fmt...) \ |
248 | do { \ | 286 | do { \ |
249 | if (!(lg)->dead) { \ | 287 | if (!(lg)->dead) { \ |
@@ -252,6 +290,7 @@ do { \ | |||
252 | (lg)->dead = ERR_PTR(-ENOMEM); \ | 290 | (lg)->dead = ERR_PTR(-ENOMEM); \ |
253 | } \ | 291 | } \ |
254 | } while(0) | 292 | } while(0) |
293 | /* (End of aside) :*/ | ||
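Here is that pointer-packing trick in miniature, using the real helpers from
include/linux/err.h (a sketch, not code from this driver):

        #include <linux/err.h>

        void *reason = ERR_PTR(-ENOMEM); /* (void *)-12: not a valid pointer */
        if (IS_ERR(reason))              /* true for the top (error) range */
                printk("Guest died: errno %ld\n", PTR_ERR(reason)); /* -12 */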
255 | 294 | ||
256 | static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) | 295 | static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) |
257 | { | 296 | { |
diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c index 18dade06d4a9..1bc1546c7fd0 100644 --- a/drivers/lguest/lguest.c +++ b/drivers/lguest/lguest.c | |||
@@ -1,6 +1,32 @@ | |||
1 | /* | 1 | /*P:010 |
2 | * Lguest specific paravirt-ops implementation | 2 | * A hypervisor allows multiple Operating Systems to run on a single machine. |
3 | * To quote David Wheeler: "Any problem in computer science can be solved with | ||
4 | * another layer of indirection." | ||
5 | * | ||
6 | * We keep things simple in two ways. First, we start with a normal Linux | ||
7 | * kernel and insert a module (lg.ko) which allows us to run other Linux | ||
8 | * kernels the same way we'd run processes. We call the first kernel the Host, | ||
9 | * and the others the Guests. The program which sets up and configures Guests | ||
10 | * (such as the example in Documentation/lguest/lguest.c) is called the | ||
11 | * Launcher. | ||
12 | * | ||
13 | * Secondly, we only run specially modified Guests, not normal kernels. When | ||
14 | * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets | ||
15 | * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows | ||
16 | * how to be a Guest. This means that you can use the same kernel you boot | ||
17 | * normally (ie. as a Host) as a Guest. | ||
3 | * | 18 | * |
19 | * These Guests know that they cannot do privileged operations, such as disable | ||
20 | * interrupts, and that they have to ask the Host to do such things explicitly. | ||
21 | * This file consists of all the replacements for such low-level native | ||
22 | * hardware operations: these special Guest versions call the Host. | ||
23 | * | ||
24 | * So how does the kernel know it's a Guest? The Guest starts at a special | ||
25 | * entry point marked with a magic string, which sets up a few things then | ||
26 | * calls here. We replace the native functions in "struct paravirt_ops" | ||
27 | * with our Guest versions, then boot like normal. :*/ | ||
28 | |||
29 | /* | ||
4 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. | 30 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. |
5 | * | 31 | * |
6 | * This program is free software; you can redistribute it and/or modify | 32 | * This program is free software; you can redistribute it and/or modify |
@@ -40,6 +66,12 @@ | |||
40 | #include <asm/mce.h> | 66 | #include <asm/mce.h> |
41 | #include <asm/io.h> | 67 | #include <asm/io.h> |
42 | 68 | ||
69 | /*G:010 Welcome to the Guest! | ||
70 | * | ||
71 | * The Guest in our tale is a simple creature: identical to the Host but | ||
72 | * behaving in simplified but equivalent ways. In particular, the Guest is the | ||
73 | * same kernel as the Host (or at least, built from the same source code). :*/ | ||
74 | |||
43 | /* Declarations for definitions in lguest_guest.S */ | 75 | /* Declarations for definitions in lguest_guest.S */ |
44 | extern char lguest_noirq_start[], lguest_noirq_end[]; | 76 | extern char lguest_noirq_start[], lguest_noirq_end[]; |
45 | extern const char lgstart_cli[], lgend_cli[]; | 77 | extern const char lgstart_cli[], lgend_cli[]; |
@@ -58,7 +90,26 @@ struct lguest_data lguest_data = { | |||
58 | struct lguest_device_desc *lguest_devices; | 90 | struct lguest_device_desc *lguest_devices; |
59 | static cycle_t clock_base; | 91 | static cycle_t clock_base; |
60 | 92 | ||
61 | static enum paravirt_lazy_mode lazy_mode; | 93 | /*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first |
94 | * real optimization trick! | ||
95 | * | ||
96 | * When lazy_mode is set, it means we're allowed to defer all hypercalls and do | ||
97 | * them as a batch when lazy_mode is eventually turned off. Because hypercalls | ||
98 | * are reasonably expensive, batching them up makes sense. For example, a | ||
99 | * large mmap might update dozens of page table entries: that code calls | ||
100 | * lguest_lazy_mode(PARAVIRT_LAZY_MMU), does the dozen updates, then calls | ||
101 | * lguest_lazy_mode(PARAVIRT_LAZY_NONE). | ||
102 | * | ||
103 | * So, when we're in lazy mode, we call async_hypercall() to store the call for | ||
104 | * future processing. When lazy mode is turned off we issue a hypercall to | ||
105 | * flush the stored calls. | ||
106 | * | ||
107 | * There's also a hack where "mode" is set to "PARAVIRT_LAZY_FLUSH" which | ||
108 | * indicates we're to flush any outstanding calls immediately. This is used | ||
109 | * when an interrupt handler does a kmap_atomic(): the page table changes must | ||
110 | * happen immediately even if we're in the middle of a batch. Usually we're | ||
111 | * not, though, so there's nothing to do. */ | ||
112 | static enum paravirt_lazy_mode lazy_mode; /* Note: not SMP-safe! */ | ||
62 | static void lguest_lazy_mode(enum paravirt_lazy_mode mode) | 113 | static void lguest_lazy_mode(enum paravirt_lazy_mode mode) |
63 | { | 114 | { |
64 | if (mode == PARAVIRT_LAZY_FLUSH) { | 115 | if (mode == PARAVIRT_LAZY_FLUSH) { |
@@ -82,6 +133,16 @@ static void lazy_hcall(unsigned long call, | |||
82 | async_hcall(call, arg1, arg2, arg3); | 133 | async_hcall(call, arg1, arg2, arg3); |
83 | } | 134 | } |
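The pattern that comment describes looks roughly like this from the caller's
side (a sketch: map_many() is an invented name, but
arch_enter_lazy_mmu_mode() and friends are the kernel's generic entry points,
which end up in lguest_lazy_mode()):

        static void map_many(struct mm_struct *mm, unsigned long addr,
                             pte_t *ptep, pte_t *ptes, int npages)
        {
                int i;

                arch_enter_lazy_mmu_mode();  /* PARAVIRT_LAZY_MMU: start batch */
                for (i = 0; i < npages; i++) /* each set_pte_at() is queued */
                        set_pte_at(mm, addr + i*PAGE_SIZE, ptep + i, ptes[i]);
                arch_leave_lazy_mmu_mode();  /* flush: Host runs the batch */
        }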
84 | 135 | ||
136 | /* async_hcall() is pretty simple: I'm quite proud of it really. We have a | ||
137 | * ring buffer of stored hypercalls which the Host will run though next time we | ||
138 | * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall | ||
139 | * arguments, and a "hcall_status" word which is 0 if the call is ready to go, | ||
140 | * and 255 once the Host has finished with it. | ||
141 | * | ||
142 | * If we come around to a slot which hasn't been finished, then the table is | ||
143 | * full and we just make the hypercall directly. This has the nice side | ||
144 | * effect of causing the Host to run all the stored calls in the ring | ||
145 | * buffer, which empties it for next time! */ | ||
85 | void async_hcall(unsigned long call, | 146 | void async_hcall(unsigned long call, |
86 | unsigned long arg1, unsigned long arg2, unsigned long arg3) | 147 | unsigned long arg1, unsigned long arg2, unsigned long arg3) |
87 | { | 148 | { |
@@ -89,6 +150,9 @@ void async_hcall(unsigned long call, | |||
89 | static unsigned int next_call; | 150 | static unsigned int next_call; |
90 | unsigned long flags; | 151 | unsigned long flags; |
91 | 152 | ||
153 | /* Disable interrupts if not already disabled: we don't want an | ||
154 | * interrupt handler making a hypercall while we're already doing | ||
155 | * one! */ | ||
92 | local_irq_save(flags); | 156 | local_irq_save(flags); |
93 | if (lguest_data.hcall_status[next_call] != 0xFF) { | 157 | if (lguest_data.hcall_status[next_call] != 0xFF) { |
94 | /* Table full, so do normal hcall which will flush table. */ | 158 | /* Table full, so do normal hcall which will flush table. */ |
@@ -98,7 +162,7 @@ void async_hcall(unsigned long call, | |||
98 | lguest_data.hcalls[next_call].edx = arg1; | 162 | lguest_data.hcalls[next_call].edx = arg1; |
99 | lguest_data.hcalls[next_call].ebx = arg2; | 163 | lguest_data.hcalls[next_call].ebx = arg2; |
100 | lguest_data.hcalls[next_call].ecx = arg3; | 164 | lguest_data.hcalls[next_call].ecx = arg3; |
101 | /* Make sure host sees arguments before "valid" flag. */ | 165 | /* Arguments must all be written before we mark it to go */ |
102 | wmb(); | 166 | wmb(); |
103 | lguest_data.hcall_status[next_call] = 0; | 167 | lguest_data.hcall_status[next_call] = 0; |
104 | if (++next_call == LHCALL_RING_SIZE) | 168 | if (++next_call == LHCALL_RING_SIZE) |
@@ -106,9 +170,14 @@ void async_hcall(unsigned long call, | |||
106 | } | 170 | } |
107 | local_irq_restore(flags); | 171 | local_irq_restore(flags); |
108 | } | 172 | } |
173 | /*:*/ | ||
109 | 174 | ||
175 | /* Wrappers for the SEND_DMA and BIND_DMA hypercalls. This is mainly because | ||
176 | * Jeff Garzik complained that __pa() should never appear in drivers, and this | ||
177 | * helps remove most of them. But also, it wraps some ugliness. */ | ||
110 | void lguest_send_dma(unsigned long key, struct lguest_dma *dma) | 178 | void lguest_send_dma(unsigned long key, struct lguest_dma *dma) |
111 | { | 179 | { |
180 | /* The hcall might not write this if something goes wrong */ | ||
112 | dma->used_len = 0; | 181 | dma->used_len = 0; |
113 | hcall(LHCALL_SEND_DMA, key, __pa(dma), 0); | 182 | hcall(LHCALL_SEND_DMA, key, __pa(dma), 0); |
114 | } | 183 | } |
@@ -116,11 +185,16 @@ void lguest_send_dma(unsigned long key, struct lguest_dma *dma) | |||
116 | int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, | 185 | int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, |
117 | unsigned int num, u8 irq) | 186 | unsigned int num, u8 irq) |
118 | { | 187 | { |
188 | /* This is the only hypercall which actually wants 5 arguments, and we | ||
189 | * only support 4. Fortunately the interrupt number is always less | ||
190 | * than 256, so we can pack it with the number of dmas in the final | ||
191 | * argument. */ | ||
119 | if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq)) | 192 | if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq)) |
120 | return -ENOMEM; | 193 | return -ENOMEM; |
121 | return 0; | 194 | return 0; |
122 | } | 195 | } |
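The Host's side of this is the mirror image; a minimal sketch of the
unpacking (unpack_bind_dma() is an invented name; the real work happens in
the Host's hypercall handling):

        static void unpack_bind_dma(unsigned long arg, unsigned int *num, u8 *irq)
        {
                *irq = arg & 0xFF; /* the interrupt fits in the low byte */
                *num = arg >> 8;   /* the number of dmas sits above it */
        }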
123 | 196 | ||
197 | /* Unbinding is the same hypercall as binding, but with 0 num & irq. */ | ||
124 | void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas) | 198 | void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas) |
125 | { | 199 | { |
126 | hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0); | 200 | hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0); |
@@ -138,35 +212,73 @@ void lguest_unmap(void *addr) | |||
138 | iounmap((__force void __iomem *)addr); | 212 | iounmap((__force void __iomem *)addr); |
139 | } | 213 | } |
140 | 214 | ||
215 | /*G:033 | ||
216 | * Here are our first native-instruction replacements: four functions for | ||
217 | * interrupt control. | ||
218 | * | ||
219 | * The simplest way of implementing these would be to have "turn interrupts | ||
220 | * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: | ||
221 | * these are by far the most commonly called functions of those we override. | ||
222 | * | ||
223 | * So instead we keep an "irq_enabled" field inside our "struct lguest_data", | ||
224 | * which the Guest can update with a single instruction. The Host knows to | ||
225 | * check there when it wants to deliver an interrupt. | ||
226 | */ | ||
227 | |||
228 | /* save_flags() is expected to return the processor state (ie. "eflags"). The | ||
229 | * eflags word contains all kind of stuff, but in practice Linux only cares | ||
230 | * about the interrupt flag. Our "save_flags()" just returns that. */ | ||
141 | static unsigned long save_fl(void) | 231 | static unsigned long save_fl(void) |
142 | { | 232 | { |
143 | return lguest_data.irq_enabled; | 233 | return lguest_data.irq_enabled; |
144 | } | 234 | } |
145 | 235 | ||
236 | /* "restore_flags" just sets the flags back to the value given. */ | ||
146 | static void restore_fl(unsigned long flags) | 237 | static void restore_fl(unsigned long flags) |
147 | { | 238 | { |
148 | /* FIXME: Check if interrupt pending... */ | ||
149 | lguest_data.irq_enabled = flags; | 239 | lguest_data.irq_enabled = flags; |
150 | } | 240 | } |
151 | 241 | ||
242 | /* Interrupts go off... */ | ||
152 | static void irq_disable(void) | 243 | static void irq_disable(void) |
153 | { | 244 | { |
154 | lguest_data.irq_enabled = 0; | 245 | lguest_data.irq_enabled = 0; |
155 | } | 246 | } |
156 | 247 | ||
248 | /* Interrupts go on... */ | ||
157 | static void irq_enable(void) | 249 | static void irq_enable(void) |
158 | { | 250 | { |
159 | /* FIXME: Check if interrupt pending... */ | ||
160 | lguest_data.irq_enabled = X86_EFLAGS_IF; | 251 | lguest_data.irq_enabled = X86_EFLAGS_IF; |
161 | } | 252 | } |
162 | 253 | /*:*/ | |
254 | /*M:003 Note that we don't check for outstanding interrupts when we re-enable | ||
255 | * them (or when we unmask an interrupt). This seems to work for the moment, | ||
256 | * since interrupts are rare and we'll just get the interrupt on the next timer | ||
257 | * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way | ||
258 | * would be to put the "irq_enabled" field in a page by itself, and have the | ||
259 | * Host write-protect it when an interrupt comes in when irqs are disabled. | ||
260 | * There will then be a page fault as soon as interrupts are re-enabled. :*/ | ||
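Purely to illustrate that suggestion (none of this is in the code), the
Guest side might look like:

        /* Give "irq_enabled" a page to itself, so the Host could
         * write-protect it while interrupts are disabled: re-enabling
         * them would then fault straight into the Host. */
        static u32 irq_enabled_page[PAGE_SIZE / sizeof(u32)]
                __attribute__((__aligned__(PAGE_SIZE)));
        #define irq_enabled (irq_enabled_page[0])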
261 | |||
262 | /*G:034 | ||
263 | * The Interrupt Descriptor Table (IDT). | ||
264 | * | ||
265 | * The IDT tells the processor what to do when an interrupt comes in. Each | ||
266 | * entry in the table is a 64-bit descriptor: this holds the privilege level, | ||
267 | * address of the handler, and... well, who cares? The Guest just asks the | ||
268 | * Host to make the change anyway, because the Host controls the real IDT. | ||
269 | */ | ||
163 | static void lguest_write_idt_entry(struct desc_struct *dt, | 270 | static void lguest_write_idt_entry(struct desc_struct *dt, |
164 | int entrynum, u32 low, u32 high) | 271 | int entrynum, u32 low, u32 high) |
165 | { | 272 | { |
273 | /* Keep the local copy up to date. */ | ||
166 | write_dt_entry(dt, entrynum, low, high); | 274 | write_dt_entry(dt, entrynum, low, high); |
275 | /* Tell Host about this new entry. */ | ||
167 | hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); | 276 | hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); |
168 | } | 277 | } |
169 | 278 | ||
279 | /* Changing to a different IDT is very rare: we keep the IDT up-to-date every | ||
280 | * time it is written, so we can simply loop through all entries and tell the | ||
281 | * Host about them. */ | ||
170 | static void lguest_load_idt(const struct Xgt_desc_struct *desc) | 282 | static void lguest_load_idt(const struct Xgt_desc_struct *desc) |
171 | { | 283 | { |
172 | unsigned int i; | 284 | unsigned int i; |
@@ -176,12 +288,29 @@ static void lguest_load_idt(const struct Xgt_desc_struct *desc) | |||
176 | hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); | 288 | hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); |
177 | } | 289 | } |
178 | 290 | ||
291 | /* | ||
292 | * The Global Descriptor Table. | ||
293 | * | ||
294 | * The Intel architecture defines another table, called the Global Descriptor | ||
295 | * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt" | ||
296 | * instruction, and then several other instructions refer to entries in the | ||
297 | * table. There are three entries which the Switcher needs, so the Host simply | ||
298 | * controls the entire thing and the Guest asks it to make changes using the | ||
299 | * LOAD_GDT hypercall. | ||
300 | * | ||
301 | * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY | ||
302 | * hypercall and use that repeatedly to load a new IDT. I don't think it | ||
303 | * really matters, but wouldn't it be nice if they were the same? | ||
304 | */ | ||
179 | static void lguest_load_gdt(const struct Xgt_desc_struct *desc) | 305 | static void lguest_load_gdt(const struct Xgt_desc_struct *desc) |
180 | { | 306 | { |
181 | BUG_ON((desc->size+1)/8 != GDT_ENTRIES); | 307 | BUG_ON((desc->size+1)/8 != GDT_ENTRIES); |
182 | hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0); | 308 | hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0); |
183 | } | 309 | } |
184 | 310 | ||
311 | /* For a single GDT entry which changes, we do the lazy thing: alter our GDT, | ||
312 | * then tell the Host to reload the entire thing. This operation is so rare | ||
313 | * that this naive implementation is reasonable. */ | ||
185 | static void lguest_write_gdt_entry(struct desc_struct *dt, | 314 | static void lguest_write_gdt_entry(struct desc_struct *dt, |
186 | int entrynum, u32 low, u32 high) | 315 | int entrynum, u32 low, u32 high) |
187 | { | 316 | { |
@@ -189,19 +318,58 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, | |||
189 | hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); | 318 | hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); |
190 | } | 319 | } |
191 | 320 | ||
321 | /* OK, I lied. There are three "thread local storage" GDT entries which change | ||
322 | * on every context switch (these three entries are how glibc implements | ||
323 | * __thread variables). So we have a hypercall specifically for this case. */ | ||
192 | static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) | 324 | static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) |
193 | { | 325 | { |
194 | lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0); | 326 | lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0); |
195 | } | 327 | } |
328 | /*:*/ | ||
196 | 329 | ||
330 | /*G:038 That's enough excitement for now, back to ploughing through each of | ||
331 | * the paravirt_ops (we're about 1/3 of the way through). | ||
332 | * | ||
333 | * This is the Local Descriptor Table, another weird Intel thingy. Linux only | ||
334 | * uses this for some strange applications like Wine. We don't do anything | ||
335 | * here, so they'll get an informative and friendly Segmentation Fault. */ | ||
197 | static void lguest_set_ldt(const void *addr, unsigned entries) | 336 | static void lguest_set_ldt(const void *addr, unsigned entries) |
198 | { | 337 | { |
199 | } | 338 | } |
200 | 339 | ||
340 | /* This loads a GDT entry into the "Task Register": that entry points to a | ||
341 | * structure called the Task State Segment. Some comments scattered through | ||
342 | * the kernel code indicate that this was used for task switching in ages | ||
343 | * past, along with blood sacrifice and astrology. | ||
344 | * | ||
345 | * Now there's nothing interesting in here that we don't get told elsewhere. | ||
346 | * But the native version uses the "ltr" instruction, which makes the Host | ||
347 | * complain to the Guest about a Segmentation Fault and it'll oops. So we | ||
348 | * override the native version with a do-nothing version. */ | ||
201 | static void lguest_load_tr_desc(void) | 349 | static void lguest_load_tr_desc(void) |
202 | { | 350 | { |
203 | } | 351 | } |
204 | 352 | ||
353 | /* The "cpuid" instruction is a way of querying both the CPU identity | ||
354 | * (manufacturer, model, etc) and its features. It was introduced before the | ||
355 | * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you | ||
356 | * might imagine, after a decade and a half of this treatment, it is now a | ||
357 | * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. | ||
358 | * | ||
359 | * This instruction even has its own Wikipedia entry. The Wikipedia entry | ||
360 | * has been translated into 4 languages. I am not making this up! | ||
361 | * | ||
362 | * We could get funky here and identify ourselves as "GenuineLguest", but | ||
363 | * instead we just use the real "cpuid" instruction. Then I pretty much turned | ||
364 | * off feature bits until the Guest booted. (Don't say that: you'll damage | ||
365 | * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is | ||
366 | * hardly future-proof.) No one's listening! They don't like you anyway, | ||
367 | * parenthetic weirdo! | ||
368 | * | ||
369 | * Replacing the cpuid so we can turn features off is great for the kernel, but | ||
370 | * anyone (including userspace) can just use the raw "cpuid" instruction and | ||
371 | * the Host won't even notice since it isn't privileged. So we try not to get | ||
372 | * too worked up about it. */ | ||
205 | static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, | 373 | static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, |
206 | unsigned int *ecx, unsigned int *edx) | 374 | unsigned int *ecx, unsigned int *edx) |
207 | { | 375 | { |
@@ -214,21 +382,43 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, | |||
214 | *ecx &= 0x00002201; | 382 | *ecx &= 0x00002201; |
215 | /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ | 383 | /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ |
216 | *edx &= 0x07808101; | 384 | *edx &= 0x07808101; |
217 | /* Host wants to know when we flush kernel pages: set PGE. */ | 385 | /* The Host can do a nice optimization if it knows that the |
386 | * kernel mappings (addresses above 0xC0000000 or whatever | ||
387 | * PAGE_OFFSET is set to) haven't changed. But Linux calls | ||
388 | * flush_tlb_user() for both user and kernel mappings unless | ||
389 | * the Page Global Enable (PGE) feature bit is set. */ | ||
218 | *edx |= 0x00002000; | 390 | *edx |= 0x00002000; |
219 | break; | 391 | break; |
220 | case 0x80000000: | 392 | case 0x80000000: |
221 | /* Futureproof this a little: if they ask how much extended | 393 | /* Futureproof this a little: if they ask how much extended |
222 | * processor information, limit it to known fields. */ | 394 | * processor information there is, limit it to known fields. */ |
223 | if (*eax > 0x80000008) | 395 | if (*eax > 0x80000008) |
224 | *eax = 0x80000008; | 396 | *eax = 0x80000008; |
225 | break; | 397 | break; |
226 | } | 398 | } |
227 | } | 399 | } |
228 | 400 | ||
401 | /* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. | ||
402 | * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother | ||
403 | * it. The Host needs to know when the Guest wants to change them, so we have | ||
404 | * a whole series of functions like read_cr0() and write_cr0(). | ||
405 | * | ||
406 | * We start with CR0. CR0 allows you to turn on and off all kinds of basic | ||
407 | * features, but Linux only really cares about one: the horrifically-named Task | ||
408 | * Switched (TS) bit at bit 3 (ie. 8) | ||
409 | * | ||
410 | * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if | ||
411 | * the floating point unit is used. Which allows us to restore FPU state | ||
412 | * lazily after a task switch, and Linux uses that gratefully, but wouldn't a | ||
413 | * name like "FPUTRAP bit" be a little less cryptic? | ||
414 | * | ||
415 | * We store cr0 (and cr3) locally, because the Host never changes it. The | ||
416 | * Guest sometimes wants to read it and we'd prefer not to bother the Host | ||
417 | * unnecessarily. */ | ||
229 | static unsigned long current_cr0, current_cr3; | 418 | static unsigned long current_cr0, current_cr3; |
230 | static void lguest_write_cr0(unsigned long val) | 419 | static void lguest_write_cr0(unsigned long val) |
231 | { | 420 | { |
421 | /* 8 == TS bit. */ | ||
232 | lazy_hcall(LHCALL_TS, val & 8, 0, 0); | 422 | lazy_hcall(LHCALL_TS, val & 8, 0, 0); |
233 | current_cr0 = val; | 423 | current_cr0 = val; |
234 | } | 424 | } |
@@ -238,17 +428,25 @@ static unsigned long lguest_read_cr0(void) | |||
238 | return current_cr0; | 428 | return current_cr0; |
239 | } | 429 | } |
240 | 430 | ||
431 | /* Intel provided a special instruction to clear the TS bit for people too cool | ||
432 | * to use write_cr0() to do it. This "clts" instruction is faster, because all | ||
433 | * the vowels have been optimized out. */ | ||
241 | static void lguest_clts(void) | 434 | static void lguest_clts(void) |
242 | { | 435 | { |
243 | lazy_hcall(LHCALL_TS, 0, 0, 0); | 436 | lazy_hcall(LHCALL_TS, 0, 0, 0); |
244 | current_cr0 &= ~8U; | 437 | current_cr0 &= ~8U; |
245 | } | 438 | } |
246 | 439 | ||
440 | /* CR2 is the virtual address of the last page fault, which the Guest only ever | ||
441 | * reads. The Host kindly writes this into our "struct lguest_data", so we | ||
442 | * just read it out of there. */ | ||
247 | static unsigned long lguest_read_cr2(void) | 443 | static unsigned long lguest_read_cr2(void) |
248 | { | 444 | { |
249 | return lguest_data.cr2; | 445 | return lguest_data.cr2; |
250 | } | 446 | } |
251 | 447 | ||
448 | /* CR3 is the current toplevel pagetable page: the principle is the same as | ||
449 | * cr0. Keep a local copy, and tell the Host when it changes. */ | ||
252 | static void lguest_write_cr3(unsigned long cr3) | 450 | static void lguest_write_cr3(unsigned long cr3) |
253 | { | 451 | { |
254 | lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0); | 452 | lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0); |
@@ -260,7 +458,7 @@ static unsigned long lguest_read_cr3(void) | |||
260 | return current_cr3; | 458 | return current_cr3; |
261 | } | 459 | } |
262 | 460 | ||
263 | /* Used to enable/disable PGE, but we don't care. */ | 461 | /* CR4 is used to enable and disable PGE, but we don't care. */ |
264 | static unsigned long lguest_read_cr4(void) | 462 | static unsigned long lguest_read_cr4(void) |
265 | { | 463 | { |
266 | return 0; | 464 | return 0; |
@@ -270,6 +468,59 @@ static void lguest_write_cr4(unsigned long val) | |||
270 | { | 468 | { |
271 | } | 469 | } |
272 | 470 | ||
471 | /* | ||
472 | * Page Table Handling. | ||
473 | * | ||
474 | * Now would be a good time to take a rest and grab a coffee or similarly | ||
475 | * relaxing stimulant. The easy parts are behind us, and the trek gradually | ||
476 | * winds uphill from here. | ||
477 | * | ||
478 | * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU | ||
479 | * maps virtual addresses to physical addresses using "page tables". We could | ||
480 | * use one huge index of 1 million entries: each address is 4 bytes, so that's | ||
481 | * 1024 pages just to hold the page tables. But since most virtual addresses | ||
482 | * are unused, we use a two level index which saves space. The CR3 register | ||
483 | * contains the physical address of the top level "page directory" page, which | ||
484 | * contains physical addresses of up to 1024 second-level pages. Each of these | ||
485 | * second level pages contains up to 1024 physical addresses of actual pages, | ||
486 | * or Page Table Entries (PTEs). | ||
487 | * | ||
488 | * Here's a diagram, where arrows indicate physical addresses: | ||
489 | * | ||
490 | * CR3 ---> +---------+ | ||
491 | * | --------->+---------+ | ||
492 | * | | | PADDR1 | | ||
493 | * Top-level | | PADDR2 | | ||
494 | * (PMD) page | | | | ||
495 | * | | Lower-level | | ||
496 | * | | (PTE) page | | ||
497 | * | | | | | ||
498 | * .... .... | ||
499 | * | ||
500 | * So to convert a virtual address to a physical address, we look up the top | ||
501 | * level, which points us to the second level, which gives us the physical | ||
502 | * address of that page. If the top level entry was not present, or the second | ||
503 | * level entry was not present, then the virtual address is invalid (we | ||
504 | * say "the page was not mapped"). | ||
505 | * | ||
506 | * Put another way, a 32-bit virtual address is divided up like so: | ||
507 | * | ||
508 | * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | ||
509 | * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>| | ||
510 | *    Index into top     Index into second      Offset within page | ||
511 | *  page directory page    pagetable page | ||
512 | * | ||
513 | * The kernel spends a lot of time changing both the top-level page directory | ||
514 | * and lower-level pagetable pages. The Guest doesn't know physical addresses, | ||
515 | * so while it maintains these page tables exactly like normal, it also needs | ||
516 | * to keep the Host informed whenever it makes a change: the Host will create | ||
517 | * the real page tables based on the Guests'. | ||
518 | */ | ||
519 | |||
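Working the 10/10/12 split by hand (a sketch): take the kernel virtual
address 0xC0001000.

        unsigned long vaddr = 0xC0001000;
        unsigned int pgd_index = vaddr >> 22;           /* == 768 */
        unsigned int pte_index = (vaddr >> 12) & 0x3FF; /* == 1 */
        unsigned int off       = vaddr & 0xFFF;         /* == 0 */

Entry 768 of the top-level page points us at a PTE page; entry 1 of that page
holds the physical address we want, and the offset within it is 0.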
520 | /* The Guest calls this to set a second-level entry (pte), ie. to map a page | ||
521 | * into a process' address space. We set the entry then tell the Host the | ||
522 | * toplevel and address this corresponds to. The Guest uses one pagetable per | ||
523 | * process, so we need to tell the Host which one we're changing (mm->pgd). */ | ||
273 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | 524 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, |
274 | pte_t *ptep, pte_t pteval) | 525 | pte_t *ptep, pte_t pteval) |
275 | { | 526 | { |
@@ -277,7 +528,9 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | |||
277 | lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low); | 528 | lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low); |
278 | } | 529 | } |
279 | 530 | ||
280 | /* We only support two-level pagetables at the moment. */ | 531 | /* The Guest calls this to set a top-level entry. Again, we set the entry then |
532 | * tell the Host which top-level page we changed, and the index of the entry we | ||
533 | * changed. */ | ||
281 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | 534 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) |
282 | { | 535 | { |
283 | *pmdp = pmdval; | 536 | *pmdp = pmdval; |
@@ -285,7 +538,15 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
285 | (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); | 538 | (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); |
286 | } | 539 | } |
287 | 540 | ||
288 | /* FIXME: Eliminate all callers of this. */ | 541 | /* There are a couple of legacy places where the kernel sets a PTE, but we |
542 | * don't know the top level any more. This is useless for us, since we don't | ||
543 | * know which pagetable is changing or what address, so we just tell the Host | ||
544 | * to forget all of them. Fortunately, this is very rare. | ||
545 | * | ||
546 | * ... except in early boot when the kernel sets up the initial pagetables, | ||
547 | * which makes booting astonishingly slow. So we don't even tell the Host | ||
548 | * anything changed until we've done the first page table switch. | ||
549 | */ | ||
289 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) | 550 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) |
290 | { | 551 | { |
291 | *ptep = pteval; | 552 | *ptep = pteval; |
@@ -294,22 +555,51 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) | |||
294 | lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); | 555 | lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); |
295 | } | 556 | } |
296 | 557 | ||
558 | /* Unfortunately for Lguest, the paravirt_ops for page tables were based on | ||
559 | * native page table operations. On native hardware you can set a new page | ||
560 | * table entry whenever you want, but if you want to remove one you have to do | ||
561 | * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). | ||
562 | * | ||
563 | * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only | ||
564 | * called when a valid entry is written, not when it's removed (ie. marked not | ||
565 | * present). Instead, this is where we come when the Guest wants to remove a | ||
566 | * page table entry: we tell the Host to set that entry to 0 (ie. the present | ||
567 | * bit is zero). */ | ||
297 | static void lguest_flush_tlb_single(unsigned long addr) | 568 | static void lguest_flush_tlb_single(unsigned long addr) |
298 | { | 569 | { |
299 | /* Simply set it to zero, and it will fault back in. */ | 570 | /* Simply set it to zero: if it was not, it will fault back in. */ |
300 | lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0); | 571 | lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0); |
301 | } | 572 | } |
302 | 573 | ||
574 | /* This is what happens after the Guest has removed a large number of entries. | ||
575 | * This tells the Host that any of the page table entries for userspace might | ||
576 | * have changed, ie. virtual addresses below PAGE_OFFSET. */ | ||
303 | static void lguest_flush_tlb_user(void) | 577 | static void lguest_flush_tlb_user(void) |
304 | { | 578 | { |
305 | lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0); | 579 | lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0); |
306 | } | 580 | } |
307 | 581 | ||
582 | /* This is called when the kernel page tables have changed. That's not very | ||
583 | * common (unless the Guest is using highmem, which makes the Guest extremely | ||
584 | * slow), so it's worth separating this from the user flushing above. */ | ||
308 | static void lguest_flush_tlb_kernel(void) | 585 | static void lguest_flush_tlb_kernel(void) |
309 | { | 586 | { |
310 | lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); | 587 | lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); |
311 | } | 588 | } |
312 | 589 | ||
590 | /* | ||
591 | * The Unadvanced Programmable Interrupt Controller. | ||
592 | * | ||
593 | * This is an attempt to implement the simplest possible interrupt controller. | ||
594 | * I spent some time looking through routines like set_irq_chip_and_handler, | ||
595 | * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and | ||
596 | * I *think* this is as simple as it gets. | ||
597 | * | ||
598 | * We can tell the Host which interrupts we want blocked, using the | ||
599 | * lguest_data.blocked_interrupts bitmap: disabling (aka "masking") them is as | ||
600 | * simple as setting a bit. We don't actually "ack" interrupts as such, we | ||
601 | * just mask and unmask them. I wonder if we should be cleverer? | ||
602 | */ | ||
313 | static void disable_lguest_irq(unsigned int irq) | 603 | static void disable_lguest_irq(unsigned int irq) |
314 | { | 604 | { |
315 | set_bit(irq, lguest_data.blocked_interrupts); | 605 | set_bit(irq, lguest_data.blocked_interrupts); |
@@ -318,9 +608,9 @@ static void disable_lguest_irq(unsigned int irq) | |||
318 | static void enable_lguest_irq(unsigned int irq) | 608 | static void enable_lguest_irq(unsigned int irq) |
319 | { | 609 | { |
320 | clear_bit(irq, lguest_data.blocked_interrupts); | 610 | clear_bit(irq, lguest_data.blocked_interrupts); |
321 | /* FIXME: If it's pending? */ | ||
322 | } | 611 | } |
323 | 612 | ||
613 | /* This structure describes the lguest IRQ controller. */ | ||
324 | static struct irq_chip lguest_irq_controller = { | 614 | static struct irq_chip lguest_irq_controller = { |
325 | .name = "lguest", | 615 | .name = "lguest", |
326 | .mask = disable_lguest_irq, | 616 | .mask = disable_lguest_irq, |
@@ -328,6 +618,10 @@ static struct irq_chip lguest_irq_controller = { | |||
328 | .unmask = enable_lguest_irq, | 618 | .unmask = enable_lguest_irq, |
329 | }; | 619 | }; |
330 | 620 | ||
621 | /* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware | ||
622 | * interrupt (except 128, which is used for system calls), and then tells the | ||
623 | * Linux infrastructure that each interrupt is controlled by our level-based | ||
624 | * lguest interrupt controller. */ | ||
331 | static void __init lguest_init_IRQ(void) | 625 | static void __init lguest_init_IRQ(void) |
332 | { | 626 | { |
333 | unsigned int i; | 627 | unsigned int i; |
@@ -340,20 +634,51 @@ static void __init lguest_init_IRQ(void) | |||
340 | handle_level_irq); | 634 | handle_level_irq); |
341 | } | 635 | } |
342 | } | 636 | } |
637 | /* This call is required to set up for 4k stacks, where we have | ||
638 | * separate stacks for hard and soft interrupts. */ | ||
343 | irq_ctx_init(smp_processor_id()); | 639 | irq_ctx_init(smp_processor_id()); |
344 | } | 640 | } |
345 | 641 | ||
642 | /* | ||
643 | * Time. | ||
644 | * | ||
645 | * It would be far better for everyone if the Guest had its own clock, but | ||
646 | * until then the Host gives us the time on every interrupt. | ||
647 | */ | ||
346 | static unsigned long lguest_get_wallclock(void) | 648 | static unsigned long lguest_get_wallclock(void) |
347 | { | 649 | { |
348 | return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0); | 650 | return lguest_data.time.tv_sec; |
349 | } | 651 | } |
350 | 652 | ||
351 | static cycle_t lguest_clock_read(void) | 653 | static cycle_t lguest_clock_read(void) |
352 | { | 654 | { |
655 | unsigned long sec, nsec; | ||
656 | |||
657 | /* If the Host tells us the TSC speed, we can trust that. */ | ||
353 | if (lguest_data.tsc_khz) | 658 | if (lguest_data.tsc_khz) |
354 | return native_read_tsc(); | 659 | return native_read_tsc(); |
355 | else | 660 | |
356 | return jiffies; | 661 | /* If we can't use the TSC, we read the time value written by the Host. |
662 | * Since it's in two parts (seconds and nanoseconds), we risk reading | ||
663 | * it just as it's changing from 99 & 0.999999999 to 100 and 0, and | ||
664 | * getting 99 and 0. As Linux tends to come apart under the stress of | ||
665 | * time travel, we must be careful: */ | ||
666 | do { | ||
667 | /* First we read the seconds part. */ | ||
668 | sec = lguest_data.time.tv_sec; | ||
669 | /* This read memory barrier tells the compiler and the CPU that | ||
670 | * this can't be reordered: we have to complete the above | ||
671 | * before going on. */ | ||
672 | rmb(); | ||
673 | /* Now we read the nanoseconds part. */ | ||
674 | nsec = lguest_data.time.tv_nsec; | ||
675 | /* Make sure we've done that. */ | ||
676 | rmb(); | ||
677 | /* Now if the seconds part has changed, try again. */ | ||
678 | } while (unlikely(lguest_data.time.tv_sec != sec)); | ||
679 | |||
680 | /* Our non-TSC clock is in real nanoseconds. */ | ||
681 | return sec*1000000000ULL + nsec; | ||
357 | } | 682 | } |
358 | 683 | ||
359 | /* This is what we tell the kernel is our clocksource. */ | 684 | /* This is what we tell the kernel is our clocksource. */ |
@@ -361,8 +686,11 @@ static struct clocksource lguest_clock = { | |||
361 | .name = "lguest", | 686 | .name = "lguest", |
362 | .rating = 400, | 687 | .rating = 400, |
363 | .read = lguest_clock_read, | 688 | .read = lguest_clock_read, |
689 | .mask = CLOCKSOURCE_MASK(64), | ||
690 | .mult = 1, | ||
364 | }; | 691 | }; |
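For reference, the kernel turns a clocksource reading into nanoseconds as
ns = (cycles * mult) >> shift. Our non-TSC clock already counts nanoseconds,
so mult = 1 and shift = 0 make that the identity; lguest_time_init() below
overrides both when the TSC is usable. A sketch of the conversion:

        static inline u64 cycles_to_ns(u64 cycles, u32 mult, u32 shift)
        {
                return (cycles * mult) >> shift;
        }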
365 | 692 | ||
693 | /* The "scheduler clock" is just our real clock, adjusted to start at zero */ | ||
366 | static unsigned long long lguest_sched_clock(void) | 694 | static unsigned long long lguest_sched_clock(void) |
367 | { | 695 | { |
368 | return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base); | 696 | return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base); |
@@ -428,34 +756,55 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) | |||
428 | local_irq_restore(flags); | 756 | local_irq_restore(flags); |
429 | } | 757 | } |
430 | 758 | ||
759 | /* At some point in the boot process, we get asked to set up our timing | ||
760 | * infrastructure. The kernel doesn't expect timer interrupts before this, but | ||
761 | * we cleverly initialized the "blocked_interrupts" field of "struct | ||
762 | * lguest_data" so that timer interrupts were blocked until now. */ | ||
431 | static void lguest_time_init(void) | 763 | static void lguest_time_init(void) |
432 | { | 764 | { |
765 | /* Set up the timer interrupt (0) to go to our simple timer routine */ | ||
433 | set_irq_handler(0, lguest_time_irq); | 766 | set_irq_handler(0, lguest_time_irq); |
434 | 767 | ||
435 | /* We use the TSC if the Host tells us we can, otherwise a dumb | 768 | /* Our clock structure looks like arch/i386/kernel/tsc.c if we can use |
436 | * jiffies-based clock. */ | 769 | * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either |
770 | * way, the "rating" is initialized so high that it's always chosen | ||
771 | * over any other clocksource. */ | ||
437 | if (lguest_data.tsc_khz) { | 772 | if (lguest_data.tsc_khz) { |
438 | lguest_clock.shift = 22; | 773 | lguest_clock.shift = 22; |
439 | lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, | 774 | lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, |
440 | lguest_clock.shift); | 775 | lguest_clock.shift); |
441 | lguest_clock.mask = CLOCKSOURCE_MASK(64); | ||
442 | lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS; | 776 | lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS; |
443 | } else { | ||
444 | /* To understand this, start at kernel/time/jiffies.c... */ | ||
445 | lguest_clock.shift = 8; | ||
446 | lguest_clock.mult = (((u64)NSEC_PER_SEC<<8)/ACTHZ) << 8; | ||
447 | lguest_clock.mask = CLOCKSOURCE_MASK(32); | ||
448 | } | 777 | } |
449 | clock_base = lguest_clock_read(); | 778 | clock_base = lguest_clock_read(); |
450 | clocksource_register(&lguest_clock); | 779 | clocksource_register(&lguest_clock); |
451 | 780 | ||
452 | /* We can't set cpumask in the initializer: damn C limitations! */ | 781 | /* Now we've set up our clock, we can use it as the scheduler clock */ |
782 | paravirt_ops.sched_clock = lguest_sched_clock; | ||
783 | |||
784 | /* We can't set cpumask in the initializer: damn C limitations! Set it | ||
785 | * here and register our timer device. */ | ||
453 | lguest_clockevent.cpumask = cpumask_of_cpu(0); | 786 | lguest_clockevent.cpumask = cpumask_of_cpu(0); |
454 | clockevents_register_device(&lguest_clockevent); | 787 | clockevents_register_device(&lguest_clockevent); |
455 | 788 | ||
789 | /* Finally, we unblock the timer interrupt. */ | ||
456 | enable_lguest_irq(0); | 790 | enable_lguest_irq(0); |
457 | } | 791 | } |
458 | 792 | ||
793 | /* | ||
794 | * Miscellaneous bits and pieces. | ||
795 | * | ||
796 | * Here is an oddball collection of functions which the Guest needs for things | ||
797 | * to work. They're pretty simple. | ||
798 | */ | ||
799 | |||
800 | /* The Guest needs to tell the Host what stack it expects traps to use. For | ||
801 | * native hardware, this is part of the Task State Segment mentioned above in | ||
802 | * lguest_load_tr_desc(), but to help hypervisors there's this special call. | ||
803 | * | ||
804 | * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data | ||
805 | * segment), the privilege level (we're privilege level 1, the Host is 0 and | ||
806 | * will not tolerate us trying to use that), the stack pointer, and the number | ||
807 | * of pages in the stack. */ | ||
459 | static void lguest_load_esp0(struct tss_struct *tss, | 808 | static void lguest_load_esp0(struct tss_struct *tss, |
460 | struct thread_struct *thread) | 809 | struct thread_struct *thread) |
461 | { | 810 | { |
@@ -463,15 +812,31 @@ static void lguest_load_esp0(struct tss_struct *tss, | |||
463 | THREAD_SIZE/PAGE_SIZE); | 812 | THREAD_SIZE/PAGE_SIZE); |
464 | } | 813 | } |
465 | 814 | ||
815 | /* Let's just say, I wouldn't do debugging under a Guest. */ | ||
466 | static void lguest_set_debugreg(int regno, unsigned long value) | 816 | static void lguest_set_debugreg(int regno, unsigned long value) |
467 | { | 817 | { |
468 | /* FIXME: Implement */ | 818 | /* FIXME: Implement */ |
469 | } | 819 | } |
470 | 820 | ||
821 | /* There are times when the kernel wants to make sure that no memory writes are | ||
822 | * caught in the cache (that they've all reached real hardware devices). This | ||
823 | * doesn't matter for the Guest which has virtual hardware. | ||
824 | * | ||
825 | * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush | ||
826 | * (clflush) instruction is available and the kernel uses that. Otherwise, it | ||
827 | * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. | ||
828 | * Unlike clflush, wbinvd can only be run at privilege level 0. So we can | ||
829 | * ignore clflush, but replace wbinvd. | ||
830 | */ | ||
471 | static void lguest_wbinvd(void) | 831 | static void lguest_wbinvd(void) |
472 | { | 832 | { |
473 | } | 833 | } |
474 | 834 | ||
835 | /* If the Guest expects to have an Advanced Programmable Interrupt Controller, | ||
836 | * we play dumb by ignoring writes and returning 0 for reads. So it's no | ||
837 | * longer Programmable nor Controlling anything, and I don't think 8 lines of | ||
838 | * code qualifies for Advanced. It will also never interrupt anything. It | ||
839 | * does, however, allow us to get through the Linux boot code. */ | ||
475 | #ifdef CONFIG_X86_LOCAL_APIC | 840 | #ifdef CONFIG_X86_LOCAL_APIC |
476 | static void lguest_apic_write(unsigned long reg, unsigned long v) | 841 | static void lguest_apic_write(unsigned long reg, unsigned long v) |
477 | { | 842 | { |
@@ -483,19 +848,32 @@ static unsigned long lguest_apic_read(unsigned long reg) | |||
483 | } | 848 | } |
484 | #endif | 849 | #endif |
485 | 850 | ||
851 | /* STOP! Until an interrupt comes in. */ | ||
486 | static void lguest_safe_halt(void) | 852 | static void lguest_safe_halt(void) |
487 | { | 853 | { |
488 | hcall(LHCALL_HALT, 0, 0, 0); | 854 | hcall(LHCALL_HALT, 0, 0, 0); |
489 | } | 855 | } |
490 | 856 | ||
857 | /* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a | ||
858 | * message out when we're crashing as well as elegant termination like powering | ||
859 | * off. | ||
860 | * | ||
861 | * Note that the Host always prefers that the Guest speak in physical addresses | ||
862 | * rather than virtual addresses, so we use __pa() here. */ | ||
491 | static void lguest_power_off(void) | 863 | static void lguest_power_off(void) |
492 | { | 864 | { |
493 | hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); | 865 | hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); |
494 | } | 866 | } |
495 | 867 | ||
868 | /* | ||
869 | * Panicing. | ||
870 | * | ||
871 | * Don't. But if you did, this is what happens. | ||
872 | */ | ||
496 | static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) | 873 | static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) |
497 | { | 874 | { |
498 | hcall(LHCALL_CRASH, __pa(p), 0, 0); | 875 | hcall(LHCALL_CRASH, __pa(p), 0, 0); |
876 | /* The hcall won't return, but to keep gcc happy, we're "done". */ | ||
499 | return NOTIFY_DONE; | 877 | return NOTIFY_DONE; |
500 | } | 878 | } |
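Both of those calls hand the Host a string via __pa(), and on i386 the conversion for directly-mapped kernel addresses is a plain subtraction. A one-line sketch, assuming the default PAGE_OFFSET of 0xC0000000 mentioned later in this commit:

	/* What __pa() boils down to for the kernel's direct mapping. */
	#define PAGE_OFFSET	0xC0000000UL
	#define __pa(x)		((unsigned long)(x) - PAGE_OFFSET)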
501 | 879 | ||
@@ -503,15 +881,45 @@ static struct notifier_block paniced = { | |||
503 | .notifier_call = lguest_panic | 881 | .notifier_call = lguest_panic |
504 | }; | 882 | }; |
505 | 883 | ||
884 | /* Setting up memory is fairly easy. */ | ||
506 | static __init char *lguest_memory_setup(void) | 885 | static __init char *lguest_memory_setup(void) |
507 | { | 886 | { |
508 | /* We do this here because lockcheck barfs if before start_kernel */ | 887 | /* We do this here and not earlier because lockcheck barfs if we do it |
888 | * before start_kernel() */ | ||
509 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); | 889 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); |
510 | 890 | ||
891 | /* The Linux bootloader header contains an "e820" memory map: the | ||
892 | * Launcher populated the first entry with our memory limit. */ | ||
511 | add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type); | 893 | add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type); |
894 | |||
895 | /* This string is for the boot messages. */ | ||
512 | return "LGUEST"; | 896 | return "LGUEST"; |
513 | } | 897 | } |
514 | 898 | ||
899 | /*G:050 | ||
900 | * Patching (Powerfully Placating Performance Pedants) | ||
901 | * | ||
902 | * We have already seen that "struct paravirt_ops" lets us replace simple | ||
903 | * native instructions with calls to the appropriate back end all throughout | ||
904 | * the kernel. This allows the same kernel to run as a Guest and as a native | ||
905 | * kernel, but it's slow because of all the indirect branches. | ||
906 | * | ||
907 | * Remember that David Wheeler quote about "Any problem in computer science can | ||
908 | * be solved with another layer of indirection"? The rest of that quote is | ||
909 | * "... But that usually will create another problem." This is the first of | ||
910 | * those problems. | ||
911 | * | ||
912 | * Our current solution is to allow the paravirt back end to optionally patch | ||
913 | * over the indirect calls to replace them with something more efficient. We | ||
914 | * patch the four most commonly called functions: disable interrupts, enable | ||
915 | * interrupts, restore interrupts and save interrupts. We usually have 10 | ||
916 | * bytes to patch into: the Guest versions of these operations are small enough | ||
917 | * that we can fit comfortably. | ||
918 | * | ||
919 | * First we need assembly templates of each of the patchable Guest operations, | ||
920 | * and these are in lguest_asm.S. */ | ||
921 | |||
922 | /*G:060 We construct a table from the assembler templates: */ | ||
515 | static const struct lguest_insns | 923 | static const struct lguest_insns |
516 | { | 924 | { |
517 | const char *start, *end; | 925 | const char *start, *end; |
@@ -521,35 +929,52 @@ static const struct lguest_insns | |||
521 | [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf }, | 929 | [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf }, |
522 | [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf }, | 930 | [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf }, |
523 | }; | 931 | }; |
932 | |||
933 | /* Now our patch routine is fairly simple (based on the native one in | ||
934 | * paravirt.c). If we have a replacement, we copy it in and return how much of | ||
935 | * the available space we used. */ | ||
524 | static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len) | 936 | static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len) |
525 | { | 937 | { |
526 | unsigned int insn_len; | 938 | unsigned int insn_len; |
527 | 939 | ||
528 | /* Don't touch it if we don't have a replacement */ | 940 | /* Don't do anything special if we don't have a replacement */ |
529 | if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) | 941 | if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) |
530 | return paravirt_patch_default(type, clobber, insns, len); | 942 | return paravirt_patch_default(type, clobber, insns, len); |
531 | 943 | ||
532 | insn_len = lguest_insns[type].end - lguest_insns[type].start; | 944 | insn_len = lguest_insns[type].end - lguest_insns[type].start; |
533 | 945 | ||
534 | /* Similarly if we can't fit replacement. */ | 946 | /* Similarly if we can't fit replacement (shouldn't happen, but let's |
947 | * be thorough). */ | ||
535 | if (len < insn_len) | 948 | if (len < insn_len) |
536 | return paravirt_patch_default(type, clobber, insns, len); | 949 | return paravirt_patch_default(type, clobber, insns, len); |
537 | 950 | ||
951 | /* Copy in our instructions. */ | ||
538 | memcpy(insns, lguest_insns[type].start, insn_len); | 952 | memcpy(insns, lguest_insns[type].start, insn_len); |
539 | return insn_len; | 953 | return insn_len; |
540 | } | 954 | } |
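Stripped of the paravirt plumbing, the patching idea fits in a few lines. A hedged userspace sketch (patch_site() and its template arguments are illustrative names, standing in for lguest_patch() and the lgstart_*/lgend_* marker pairs):

	#include <stddef.h>
	#include <string.h>

	/* If the template fits the call site, copy it over and report how
	 * many bytes were used; otherwise use none and let the caller keep
	 * its indirect call (as paravirt_patch_default() would). */
	static size_t patch_site(void *site, size_t site_len,
				 const char *tmpl_start, const char *tmpl_end)
	{
		size_t tmpl_len = tmpl_end - tmpl_start;

		if (tmpl_len > site_len)
			return 0;
		memcpy(site, tmpl_start, tmpl_len);
		return tmpl_len;
	}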
541 | 955 | ||
956 | /*G:030 Once we get to lguest_init(), we know we're a Guest. The paravirt_ops | ||
957 | * structure in the kernel provides a single point for (almost) every routine | ||
958 | * we have to override to avoid privileged instructions. */ | ||
542 | __init void lguest_init(void *boot) | 959 | __init void lguest_init(void *boot) |
543 | { | 960 | { |
544 | /* Copy boot parameters first. */ | 961 | /* Copy boot parameters first: the Launcher put the physical location |
962 | * in %esi, and head.S converted that to a virtual address and handed | ||
963 | * it to us. */ | ||
545 | memcpy(&boot_params, boot, PARAM_SIZE); | 964 | memcpy(&boot_params, boot, PARAM_SIZE); |
965 | /* The boot parameters also tell us where the command-line is: save | ||
966 | * that, too. */ | ||
546 | memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr), | 967 | memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr), |
547 | COMMAND_LINE_SIZE); | 968 | COMMAND_LINE_SIZE); |
548 | 969 | ||
970 | /* We're under lguest, paravirt is enabled, and we're running at | ||
971 | * privilege level 1, not 0 as normal. */ | ||
549 | paravirt_ops.name = "lguest"; | 972 | paravirt_ops.name = "lguest"; |
550 | paravirt_ops.paravirt_enabled = 1; | 973 | paravirt_ops.paravirt_enabled = 1; |
551 | paravirt_ops.kernel_rpl = 1; | 974 | paravirt_ops.kernel_rpl = 1; |
552 | 975 | ||
976 | /* We set up all the lguest overrides for sensitive operations. These | ||
977 | * are detailed with the operations themselves. */ | ||
553 | paravirt_ops.save_fl = save_fl; | 978 | paravirt_ops.save_fl = save_fl; |
554 | paravirt_ops.restore_fl = restore_fl; | 979 | paravirt_ops.restore_fl = restore_fl; |
555 | paravirt_ops.irq_disable = irq_disable; | 980 | paravirt_ops.irq_disable = irq_disable; |
@@ -592,21 +1017,50 @@ __init void lguest_init(void *boot) | |||
592 | paravirt_ops.time_init = lguest_time_init; | 1017 | paravirt_ops.time_init = lguest_time_init; |
593 | paravirt_ops.set_lazy_mode = lguest_lazy_mode; | 1018 | paravirt_ops.set_lazy_mode = lguest_lazy_mode; |
594 | paravirt_ops.wbinvd = lguest_wbinvd; | 1019 | paravirt_ops.wbinvd = lguest_wbinvd; |
595 | paravirt_ops.sched_clock = lguest_sched_clock; | 1020 | /* Now is a good time to look at the implementations of these functions |
596 | 1021 | * before returning to the rest of lguest_init(). */ | |
1022 | |||
1023 | /*G:070 Now we've seen all the paravirt_ops, we return to | ||
1024 | * lguest_init() where the rest of the fairly chaotic boot setup | ||
1025 | * occurs. | ||
1026 | * | ||
1027 | * The Host expects our first hypercall to tell it where our "struct | ||
1028 | * lguest_data" is, so we do that first. */ | ||
597 | hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0); | 1029 | hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0); |
598 | 1030 | ||
599 | /* We use top of mem for initial pagetables. */ | 1031 | /* The native boot code sets up initial page tables immediately after |
1032 | * the kernel itself, and sets init_pg_tables_end so they're not | ||
1033 | * clobbered. The Launcher places our initial pagetables somewhere at | ||
1034 | * the top of our physical memory, so we don't need extra space: set | ||
1035 | * init_pg_tables_end to the end of the kernel. */ | ||
600 | init_pg_tables_end = __pa(pg0); | 1036 | init_pg_tables_end = __pa(pg0); |
601 | 1037 | ||
1038 | /* Load the %fs segment register (the per-cpu segment register) with | ||
1039 | * the normal data segment to get through booting. */ | ||
602 | asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); | 1040 | asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); |
603 | 1041 | ||
1042 | /* Clear the part of the kernel data which is expected to be zero. | ||
1043 | * Normally it will be anyway, but if we're loading from a bzImage with | ||
1044 | * CONFIG_RELOCATABLE=y, the relocations will be sitting here. */ | ||
1045 | memset(__bss_start, 0, __bss_stop - __bss_start); | ||
1046 | |||
1047 | /* The Host uses the top of the Guest's virtual address space for the | ||
1048 | * Host<->Guest Switcher, and it tells us how much it needs in | ||
1049 | * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ | ||
604 | reserve_top_address(lguest_data.reserve_mem); | 1050 | reserve_top_address(lguest_data.reserve_mem); |
605 | 1051 | ||
1052 | /* If we don't initialize the lock dependency checker now, it crashes | ||
1053 | * paravirt_disable_iospace. */ | ||
606 | lockdep_init(); | 1054 | lockdep_init(); |
607 | 1055 | ||
1056 | /* The IDE code spends about 3 seconds probing for disks: if we reserve | ||
1057 | * all the I/O ports up front it can't get them and so doesn't probe. | ||
1058 | * Other device drivers are similar (but less severe). This cuts the | ||
1059 | * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ | ||
608 | paravirt_disable_iospace(); | 1060 | paravirt_disable_iospace(); |
609 | 1061 | ||
1062 | /* This is messy CPU setup stuff which the native boot code does before | ||
1063 | * start_kernel, so we have to do it too: */ | ||
610 | cpu_detect(&new_cpu_data); | 1064 | cpu_detect(&new_cpu_data); |
611 | /* head.S usually sets up the first capability word, so do it here. */ | 1065 | /* head.S usually sets up the first capability word, so do it here. */ |
612 | new_cpu_data.x86_capability[0] = cpuid_edx(1); | 1066 | new_cpu_data.x86_capability[0] = cpuid_edx(1); |
@@ -617,14 +1071,27 @@ __init void lguest_init(void *boot) | |||
617 | #ifdef CONFIG_X86_MCE | 1071 | #ifdef CONFIG_X86_MCE |
618 | mce_disabled = 1; | 1072 | mce_disabled = 1; |
619 | #endif | 1073 | #endif |
620 | |||
621 | #ifdef CONFIG_ACPI | 1074 | #ifdef CONFIG_ACPI |
622 | acpi_disabled = 1; | 1075 | acpi_disabled = 1; |
623 | acpi_ht = 0; | 1076 | acpi_ht = 0; |
624 | #endif | 1077 | #endif |
625 | 1078 | ||
1079 | /* We set the preferred console to "hvc". This is the "hypervisor | ||
1080 | * virtual console" driver written by the PowerPC people, which we also | ||
1081 | * adapted for lguest's use. */ | ||
626 | add_preferred_console("hvc", 0, NULL); | 1082 | add_preferred_console("hvc", 0, NULL); |
627 | 1083 | ||
1084 | /* Last of all, we set the power management poweroff hook to point to | ||
1085 | * the Guest routine to power off. */ | ||
628 | pm_power_off = lguest_power_off; | 1086 | pm_power_off = lguest_power_off; |
1087 | |||
1088 | /* Now we're set up, call start_kernel() in init/main.c and we proceed | ||
1089 | * to boot as normal. It never returns. */ | ||
629 | start_kernel(); | 1090 | start_kernel(); |
630 | } | 1091 | } |
1092 | /* | ||
1093 | * This marks the end of stage II of our journey, The Guest. | ||
1094 | * | ||
1095 | * It is now time for us to explore the nooks and crannies of the three Guest | ||
1096 | * devices and complete our understanding of the Guest in "make Drivers". | ||
1097 | */ | ||
diff --git a/drivers/lguest/lguest_asm.S b/drivers/lguest/lguest_asm.S index a3dbf22ee365..f182c6a36209 100644 --- a/drivers/lguest/lguest_asm.S +++ b/drivers/lguest/lguest_asm.S | |||
@@ -4,15 +4,15 @@ | |||
4 | #include <asm/thread_info.h> | 4 | #include <asm/thread_info.h> |
5 | #include <asm/processor-flags.h> | 5 | #include <asm/processor-flags.h> |
6 | 6 | ||
7 | /* | 7 | /*G:020 This is where we begin: we have a magic signature which the launcher |
8 | * This is where we begin: we have a magic signature which the launcher looks | 8 | * looks for. The plan is that the Linux boot protocol will be extended with a |
9 | * for. The plan is that the Linux boot protocol will be extended with a | ||
10 | * "platform type" field which will guide us here from the normal entry point, | 9 | * "platform type" field which will guide us here from the normal entry point, |
11 | * but for the moment this suffices. We pass the virtual address of the boot | 10 | * but for the moment this suffices. The normal boot code uses %esi for the |
12 | * info to lguest_init(). | 11 | * boot header, so we do too. We convert it to a virtual address by adding |
12 | * PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax). | ||
13 | * | 13 | * |
14 | * We put it in .init.text will be discarded after boot. | 14 | * The .section line puts this code in .init.text so it will be discarded after |
15 | */ | 15 | * boot. */ |
16 | .section .init.text, "ax", @progbits | 16 | .section .init.text, "ax", @progbits |
17 | .ascii "GenuineLguest" | 17 | .ascii "GenuineLguest" |
18 | /* Set up initial stack. */ | 18 | /* Set up initial stack. */ |
@@ -21,7 +21,9 @@ | |||
21 | addl $__PAGE_OFFSET, %eax | 21 | addl $__PAGE_OFFSET, %eax |
22 | jmp lguest_init | 22 | jmp lguest_init |
23 | 23 | ||
24 | /* The templates for inline patching. */ | 24 | /*G:055 We create a macro which puts the assembler code between lgstart_ and |
25 | * lgend_ markers. These templates end up in the .init.text section, so they | ||
26 | * are discarded after boot. */ | ||
25 | #define LGUEST_PATCH(name, insns...) \ | 27 | #define LGUEST_PATCH(name, insns...) \ |
26 | lgstart_##name: insns; lgend_##name:; \ | 28 | lgstart_##name: insns; lgend_##name:; \ |
27 | .globl lgstart_##name; .globl lgend_##name | 29 | .globl lgstart_##name; .globl lgend_##name |
@@ -30,24 +32,61 @@ LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) | |||
30 | LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) | 32 | LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) |
31 | LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) | 33 | LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) |
32 | LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) | 34 | LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) |
35 | /*:*/ | ||
33 | 36 | ||
34 | .text | 37 | .text |
35 | /* These demark the EIP range where host should never deliver interrupts. */ | 38 | /* These demark the EIP range where host should never deliver interrupts. */ |
36 | .global lguest_noirq_start | 39 | .global lguest_noirq_start |
37 | .global lguest_noirq_end | 40 | .global lguest_noirq_end |
38 | 41 | ||
39 | /* | 42 | /*M:004 When the Host reflects a trap or injects an interrupt into the Guest, |
40 | * We move eflags word to lguest_data.irq_enabled to restore interrupt state. | 43 | * it sets the eflags interrupt bit on the stack based on |
41 | * For page faults, gpfs and virtual interrupts, the hypervisor has saved | 44 | * lguest_data.irq_enabled, so the Guest iret logic does the right thing when |
42 | * eflags manually, otherwise it was delivered directly and so eflags reflects | 45 | * restoring it. However, when the Host sets the Guest up for direct traps, |
43 | * the real machine IF state, ie. interrupts on. Since the kernel always dies | 46 | * such as system calls, the processor is the one to push eflags onto the |
44 | * if it takes such a trap with interrupts disabled anyway, turning interrupts | 47 | * stack, and the interrupt bit will be 1 (in reality, interrupts are always |
45 | * back on unconditionally here is OK. | 48 | * enabled in the Guest). |
46 | */ | 49 | * |
50 | * This turns out to be harmless: the only trap which should happen under Linux | ||
51 | * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc | ||
52 | * regions), which has to be reflected through the Host anyway. If another | ||
53 | * trap *does* go off when interrupts are disabled, the Guest will panic, and | ||
54 | * we'll never get to this iret! :*/ | ||
55 | |||
56 | /*G:045 There is one final paravirt_op that the Guest implements, and glancing | ||
57 | * at it you can see why I left it to last. It's *cool*! It's in *assembler*! | ||
58 | * | ||
59 | * The "iret" instruction is used to return from an interrupt or trap. The | ||
60 | * stack looks like this: | ||
61 | * old address | ||
62 | * old code segment & privilege level | ||
63 | * old processor flags ("eflags") | ||
64 | * | ||
65 | * The "iret" instruction pops those values off the stack and restores them all | ||
66 | * at once. The only problem is that eflags includes the Interrupt Flag which | ||
67 | * the Guest can't change: the CPU will simply ignore it when we do an "iret". | ||
68 | * So we have to copy eflags from the stack to lguest_data.irq_enabled before | ||
69 | * we do the "iret". | ||
70 | * | ||
71 | * There are two problems with this: firstly, we need to use a register to do | ||
72 | * the copy and secondly, the whole thing needs to be atomic. The first | ||
73 | * problem is easy to solve: push %eax on the stack so we can use it, and then | ||
74 | * restore it at the end just before the real "iret". | ||
75 | * | ||
76 | * The second is harder: copying eflags to lguest_data.irq_enabled will turn | ||
77 | * interrupts on before we're finished, so we could be interrupted before we | ||
78 | * return to userspace or wherever. Our solution to this is to surround the | ||
79 | * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the | ||
80 | * Host that it is *never* to interrupt us there, even if interrupts seem to be | ||
81 | * enabled. */ | ||
47 | ENTRY(lguest_iret) | 82 | ENTRY(lguest_iret) |
48 | pushl %eax | 83 | pushl %eax |
49 | movl 12(%esp), %eax | 84 | movl 12(%esp), %eax |
50 | lguest_noirq_start: | 85 | lguest_noirq_start: |
86 | /* Note the %ss: segment prefix here. Normal data accesses use the | ||
87 | * "ds" segment, but that will have already been restored for whatever | ||
88 | * we're returning to (such as userspace): we can't trust it. The %ss: | ||
89 | * prefix makes sure we use the stack segment, which is still valid. */ | ||
51 | movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled | 90 | movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled |
52 | popl %eax | 91 | popl %eax |
53 | iret | 92 | iret |
diff --git a/drivers/lguest/lguest_bus.c b/drivers/lguest/lguest_bus.c index 18d6ab21a43b..55a7940ca732 100644 --- a/drivers/lguest/lguest_bus.c +++ b/drivers/lguest/lguest_bus.c | |||
@@ -1,3 +1,6 @@ | |||
1 | /*P:050 Lguest guests use a very simple bus for devices. It's a simple array | ||
2 | * of device descriptors contained just above the top of normal memory. The | ||
3 | * lguest bus is 80% tedious boilerplate code. :*/ | ||
1 | #include <linux/init.h> | 4 | #include <linux/init.h> |
2 | #include <linux/bootmem.h> | 5 | #include <linux/bootmem.h> |
3 | #include <linux/lguest_bus.h> | 6 | #include <linux/lguest_bus.h> |
@@ -43,6 +46,10 @@ static struct device_attribute lguest_dev_attrs[] = { | |||
43 | __ATTR_NULL | 46 | __ATTR_NULL |
44 | }; | 47 | }; |
45 | 48 | ||
49 | /*D:130 The generic bus infrastructure requires a function which says whether a | ||
50 | * device matches a driver. For us, it is simple: "struct lguest_driver" | ||
51 | * contains a "device_type" field which indicates what type of device it can | ||
52 | * handle, so we just cast the args and compare: */ | ||
46 | static int lguest_dev_match(struct device *_dev, struct device_driver *_drv) | 53 | static int lguest_dev_match(struct device *_dev, struct device_driver *_drv) |
47 | { | 54 | { |
48 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | 55 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); |
@@ -50,6 +57,7 @@ static int lguest_dev_match(struct device *_dev, struct device_driver *_drv) | |||
50 | 57 | ||
51 | return (drv->device_type == lguest_devices[dev->index].type); | 58 | return (drv->device_type == lguest_devices[dev->index].type); |
52 | } | 59 | } |
60 | /*:*/ | ||
53 | 61 | ||
54 | struct lguest_bus { | 62 | struct lguest_bus { |
55 | struct bus_type bus; | 63 | struct bus_type bus; |
@@ -68,11 +76,24 @@ static struct lguest_bus lguest_bus = { | |||
68 | } | 76 | } |
69 | }; | 77 | }; |
70 | 78 | ||
79 | /*D:140 This is the callback which occurs once the bus infrastructure matches | ||
80 | * up a device and driver, ie. in response to add_lguest_device() calling | ||
81 | * device_register(), or register_lguest_driver() calling driver_register(). | ||
82 | * | ||
83 | * At the moment it's always the latter: the devices are added first, since | ||
84 | * scan_devices() is called from a "core_initcall", and the drivers themselves | ||
85 | * are called later as a normal "initcall". But it would work the other way too. | ||
86 | * | ||
87 | * So now we have the happy couple, we add the status bit to indicate that we | ||
88 | * found a driver. If the driver truly loves the device, it will return | ||
89 | * happiness from its probe function (ok, perhaps this wasn't my greatest | ||
90 | * analogy), and we set the final "driver ok" bit so the Host sees it's all | ||
91 | * green. */ | ||
71 | static int lguest_dev_probe(struct device *_dev) | 92 | static int lguest_dev_probe(struct device *_dev) |
72 | { | 93 | { |
73 | int ret; | 94 | int ret; |
74 | struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); | 95 | struct lguest_device*dev = container_of(_dev,struct lguest_device,dev); |
75 | struct lguest_driver *drv = container_of(dev->dev.driver, | 96 | struct lguest_driver*drv = container_of(dev->dev.driver, |
76 | struct lguest_driver, drv); | 97 | struct lguest_driver, drv); |
77 | 98 | ||
78 | lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER; | 99 | lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER; |
@@ -82,6 +103,10 @@ static int lguest_dev_probe(struct device *_dev) | |||
82 | return ret; | 103 | return ret; |
83 | } | 104 | } |
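In other words, a successful probe walks the status byte through the handshake one bit at a time. Schematically (LGUEST_DEVICE_S_DRIVER_OK here is my shorthand for the final "driver ok" bit the comment above mentions; the others appear in the code):

	/* status == 0                            device present, untouched
	 * |= LGUEST_DEVICE_S_ACKNOWLEDGE        Guest saw it (at add time)
	 * |= LGUEST_DEVICE_S_DRIVER             a driver matched (above)
	 * |= LGUEST_DEVICE_S_DRIVER_OK          probe() returned happiness
	 * |= LGUEST_DEVICE_S_FAILED             set instead if a step failed */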
84 | 105 | ||
106 | /* The last part of the bus infrastructure is the function lguest drivers use | ||
107 | * to register themselves. Firstly, we do nothing if there's no lguest bus | ||
108 | * (ie. this is not a Guest), otherwise we fill in the embedded generic "struct | ||
109 | * driver" fields and call the generic driver_register(). */ | ||
85 | int register_lguest_driver(struct lguest_driver *drv) | 110 | int register_lguest_driver(struct lguest_driver *drv) |
86 | { | 111 | { |
87 | if (!lguest_devices) | 112 | if (!lguest_devices) |
@@ -94,12 +119,36 @@ int register_lguest_driver(struct lguest_driver *drv) | |||
94 | 119 | ||
95 | return driver_register(&drv->drv); | 120 | return driver_register(&drv->drv); |
96 | } | 121 | } |
122 | |||
123 | /* At the moment we build all the drivers into the kernel because they're so | ||
124 | * simple: 8144 bytes for all three of them as I type this. And as the console | ||
125 | * really needs to be built in, it's actually only 3527 bytes for the network | ||
126 | * and block drivers. | ||
127 | * | ||
128 | * If they get complex it will make sense for them to be modularized, so we | ||
129 | * need to explicitly export the symbol. | ||
130 | * | ||
131 | * I don't think non-GPL modules make sense, so it's a GPL-only export. | ||
132 | */ | ||
97 | EXPORT_SYMBOL_GPL(register_lguest_driver); | 133 | EXPORT_SYMBOL_GPL(register_lguest_driver); |
98 | 134 | ||
135 | /*D:120 This is the core of the lguest bus: actually adding a new device. | ||
136 | * It's a separate function because it's neater that way, and because an | ||
137 | * earlier version of the code supported hotplug and unplug. They were removed | ||
138 | * early on because they were never used. | ||
139 | * | ||
140 | * As Andrew Tridgell says, "Untested code is buggy code". | ||
141 | * | ||
142 | * It's worth reading this carefully: we start with an index into the array of | ||
143 | * "struct lguest_device_desc"s indicating the device which is new: */ | ||
99 | static void add_lguest_device(unsigned int index) | 144 | static void add_lguest_device(unsigned int index) |
100 | { | 145 | { |
101 | struct lguest_device *new; | 146 | struct lguest_device *new; |
102 | 147 | ||
148 | /* Each "struct lguest_device_desc" has a "status" field, which the | ||
149 | * Guest updates as the device is probed. In the worst case, the Host | ||
150 | * can look at these bits to tell what part of device setup failed, | ||
151 | * even if the console isn't available. */ | ||
103 | lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE; | 152 | lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE; |
104 | new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL); | 153 | new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL); |
105 | if (!new) { | 154 | if (!new) { |
@@ -108,12 +157,17 @@ static void add_lguest_device(unsigned int index) | |||
108 | return; | 157 | return; |
109 | } | 158 | } |
110 | 159 | ||
160 | /* The "struct lguest_device" setup is pretty straight-forward example | ||
161 | * code. */ | ||
111 | new->index = index; | 162 | new->index = index; |
112 | new->private = NULL; | 163 | new->private = NULL; |
113 | memset(&new->dev, 0, sizeof(new->dev)); | 164 | memset(&new->dev, 0, sizeof(new->dev)); |
114 | new->dev.parent = &lguest_bus.dev; | 165 | new->dev.parent = &lguest_bus.dev; |
115 | new->dev.bus = &lguest_bus.bus; | 166 | new->dev.bus = &lguest_bus.bus; |
116 | sprintf(new->dev.bus_id, "%u", index); | 167 | sprintf(new->dev.bus_id, "%u", index); |
168 | |||
169 | /* device_register() causes the bus infrastructure to look for a | ||
170 | * matching driver. */ | ||
117 | if (device_register(&new->dev) != 0) { | 171 | if (device_register(&new->dev) != 0) { |
118 | printk(KERN_EMERG "Cannot register lguest device %u\n", index); | 172 | printk(KERN_EMERG "Cannot register lguest device %u\n", index); |
119 | lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; | 173 | lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; |
@@ -121,6 +175,9 @@ static void add_lguest_device(unsigned int index) | |||
121 | } | 175 | } |
122 | } | 176 | } |
123 | 177 | ||
178 | /*D:110 scan_devices() simply iterates through the device array. The type 0 | ||
179 | * is reserved to mean "no device", and anything else means we have found a | ||
180 | * device: add it. */ | ||
124 | static void scan_devices(void) | 181 | static void scan_devices(void) |
125 | { | 182 | { |
126 | unsigned int i; | 183 | unsigned int i; |
@@ -130,12 +187,23 @@ static void scan_devices(void) | |||
130 | add_lguest_device(i); | 187 | add_lguest_device(i); |
131 | } | 188 | } |
132 | 189 | ||
190 | /*D:100 Fairly early in boot, lguest_bus_init() is called to set up the lguest | ||
191 | * bus. We check that we are a Guest by checking paravirt_ops.name: there are | ||
192 | * other ways of checking, but this seems most obvious to me. | ||
193 | * | ||
194 | * So we can access the array of "struct lguest_device_desc"s easily, we map | ||
195 | * that memory and store the pointer in the global "lguest_devices". Then we | ||
196 | * register the bus with the core. Doing two registrations seems clunky to me, | ||
197 | * but it seems to be the correct sysfs incantation. | ||
198 | * | ||
199 | * Finally we call scan_devices() which adds all the devices found in the | ||
200 | * "struct lguest_device_desc" array. */ | ||
133 | static int __init lguest_bus_init(void) | 201 | static int __init lguest_bus_init(void) |
134 | { | 202 | { |
135 | if (strcmp(paravirt_ops.name, "lguest") != 0) | 203 | if (strcmp(paravirt_ops.name, "lguest") != 0) |
136 | return 0; | 204 | return 0; |
137 | 205 | ||
138 | /* Devices are in page above top of "normal" mem. */ | 206 | /* Devices are in a single page above top of "normal" mem */ |
139 | lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1); | 207 | lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1); |
140 | 208 | ||
141 | if (bus_register(&lguest_bus.bus) != 0 | 209 | if (bus_register(&lguest_bus.bus) != 0 |
@@ -145,4 +213,5 @@ static int __init lguest_bus_init(void) | |||
145 | scan_devices(); | 213 | scan_devices(); |
146 | return 0; | 214 | return 0; |
147 | } | 215 | } |
216 | /* Do this after core stuff, before devices. */ | ||
148 | postcore_initcall(lguest_bus_init); | 217 | postcore_initcall(lguest_bus_init); |
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index e90d7a783daf..80d1b58c7698 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c | |||
@@ -1,36 +1,70 @@ | |||
1 | /* Userspace control of the guest, via /dev/lguest. */ | 1 | /*P:200 This contains all the /dev/lguest code, whereby the userspace launcher |
2 | * controls and communicates with the Guest. For example, the first write will | ||
3 | * tell us the memory size, pagetable, entry point and kernel address offset. | ||
4 | * A read will run the Guest until a signal is pending (-EINTR), or the Guest | ||
5 | * does a DMA out to the Launcher. Writes are also used to get a DMA buffer | ||
6 | * registered by the Guest and to send the Guest an interrupt. :*/ | ||
2 | #include <linux/uaccess.h> | 7 | #include <linux/uaccess.h> |
3 | #include <linux/miscdevice.h> | 8 | #include <linux/miscdevice.h> |
4 | #include <linux/fs.h> | 9 | #include <linux/fs.h> |
5 | #include "lg.h" | 10 | #include "lg.h" |
6 | 11 | ||
12 | /*L:030 setup_regs() doesn't really belong in this file, but it gives us an | ||
13 | * early glimpse deeper into the Host so it's worth having here. | ||
14 | * | ||
15 | * Most of the Guest's registers are left alone: we used get_zeroed_page() to | ||
16 | * allocate the structure, so they will be 0. */ | ||
7 | static void setup_regs(struct lguest_regs *regs, unsigned long start) | 17 | static void setup_regs(struct lguest_regs *regs, unsigned long start) |
8 | { | 18 | { |
9 | /* Write out stack in format lguest expects, so we can switch to it. */ | 19 | /* There are four "segment" registers which the Guest needs to boot: |
20 | * The "code segment" register (cs) refers to the kernel code segment | ||
21 | * __KERNEL_CS, and the "data", "extra" and "stack" segment registers | ||
22 | * refer to the kernel data segment __KERNEL_DS. | ||
23 | * | ||
24 | * The privilege level is packed into the lower bits. The Guest runs | ||
25 | * at privilege level 1 (GUEST_PL). */ | ||
10 | regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; | 26 | regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; |
11 | regs->cs = __KERNEL_CS|GUEST_PL; | 27 | regs->cs = __KERNEL_CS|GUEST_PL; |
12 | regs->eflags = 0x202; /* Interrupts enabled. */ | 28 | |
29 | /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) | ||
30 | * is supposed to always be "1". Bit 9 (0x200) controls whether | ||
31 | * interrupts are enabled. We always leave interrupts enabled while | ||
32 | * running the Guest. */ | ||
33 | regs->eflags = 0x202; | ||
34 | |||
35 | /* The "Extended Instruction Pointer" register says where the Guest is | ||
36 | * running. */ | ||
13 | regs->eip = start; | 37 | regs->eip = start; |
14 | /* esi points to our boot information (physical address 0) */ | 38 | |
39 | /* %esi points to our boot information, at physical address 0, so don't | ||
40 | * touch it. */ | ||
15 | } | 41 | } |
16 | 42 | ||
17 | /* + addr */ | 43 | /*L:310 To send DMA into the Guest, the Launcher needs to be able to ask for a |
44 | * DMA buffer. This is done by writing LHREQ_GETDMA and the key to | ||
45 | * /dev/lguest. */ | ||
18 | static long user_get_dma(struct lguest *lg, const u32 __user *input) | 46 | static long user_get_dma(struct lguest *lg, const u32 __user *input) |
19 | { | 47 | { |
20 | unsigned long key, udma, irq; | 48 | unsigned long key, udma, irq; |
21 | 49 | ||
50 | /* Fetch the key they wrote to us. */ | ||
22 | if (get_user(key, input) != 0) | 51 | if (get_user(key, input) != 0) |
23 | return -EFAULT; | 52 | return -EFAULT; |
53 | /* Look for a free Guest DMA buffer bound to that key. */ | ||
24 | udma = get_dma_buffer(lg, key, &irq); | 54 | udma = get_dma_buffer(lg, key, &irq); |
25 | if (!udma) | 55 | if (!udma) |
26 | return -ENOENT; | 56 | return -ENOENT; |
27 | 57 | ||
28 | /* We put irq number in udma->used_len. */ | 58 | /* We need to tell the Launcher what interrupt the Guest expects after |
59 | * the buffer is filled. We stash it in udma->used_len. */ | ||
29 | lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq); | 60 | lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq); |
61 | |||
62 | /* The (guest-physical) address of the DMA buffer is returned from | ||
63 | * the write(). */ | ||
30 | return udma; | 64 | return udma; |
31 | } | 65 | } |
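Seen from the other side of /dev/lguest, the matching Launcher call is a two-word write. A hedged sketch (LHREQ_GETDMA is assumed to come from linux/lguest_launcher.h, and lguest_fd from a prior open("/dev/lguest", O_RDWR)):

	#include <unistd.h>
	#include <stdint.h>

	/* Ask the Host for a Guest DMA buffer bound to "key".  On success
	 * write() returns the guest-physical address of the buffer; it
	 * fails with ENOENT if nothing is bound to that key. */
	static long get_guest_dma(int lguest_fd, uint32_t key)
	{
		uint32_t req[2] = { LHREQ_GETDMA, key };

		return write(lguest_fd, req, sizeof(req));
	}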
32 | 66 | ||
33 | /* To force the Guest to stop running and return to the Launcher, the | 67 | /*L:315 To force the Guest to stop running and return to the Launcher, the |
34 | * Waker writes LHREQ_BREAK and the value "1" to /dev/lguest. The | 68 | * Waker writes LHREQ_BREAK and the value "1" to /dev/lguest. The |
35 | * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ | 69 | * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ |
36 | static int break_guest_out(struct lguest *lg, const u32 __user *input) | 70 | static int break_guest_out(struct lguest *lg, const u32 __user *input) |
@@ -54,7 +88,8 @@ static int break_guest_out(struct lguest *lg, const u32 __user *input) | |||
54 | } | 88 | } |
55 | } | 89 | } |
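From userspace the same protocol is two tiny writes. A hedged sketch in the same vein as the one above (LHREQ_BREAK again assumed from linux/lguest_launcher.h):

	/* Waker: freeze the Guest so the Launcher's read() returns. */
	static int set_break(int lguest_fd, uint32_t on)
	{
		uint32_t req[2] = { LHREQ_BREAK, on };

		return write(lguest_fd, req, sizeof(req)) < 0 ? -1 : 0;
	}

The Waker calls set_break(fd, 1) to stop the Guest, and the Launcher calls set_break(fd, 0) to release it again.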
56 | 90 | ||
57 | /* + irq */ | 91 | /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt |
92 | * number to /dev/lguest. */ | ||
58 | static int user_send_irq(struct lguest *lg, const u32 __user *input) | 93 | static int user_send_irq(struct lguest *lg, const u32 __user *input) |
59 | { | 94 | { |
60 | u32 irq; | 95 | u32 irq; |
@@ -63,14 +98,19 @@ static int user_send_irq(struct lguest *lg, const u32 __user *input) | |||
63 | return -EFAULT; | 98 | return -EFAULT; |
64 | if (irq >= LGUEST_IRQS) | 99 | if (irq >= LGUEST_IRQS) |
65 | return -EINVAL; | 100 | return -EINVAL; |
101 | /* Next time the Guest runs, the core code will see if it can deliver | ||
102 | * this interrupt. */ | ||
66 | set_bit(irq, lg->irqs_pending); | 103 | set_bit(irq, lg->irqs_pending); |
67 | return 0; | 104 | return 0; |
68 | } | 105 | } |
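And the Launcher-side mirror of this one, under the same assumptions as the sketches above:

	/* Queue interrupt "irq" for delivery next time the Guest runs. */
	static int send_guest_irq(int lguest_fd, uint32_t irq)
	{
		uint32_t req[2] = { LHREQ_IRQ, irq };

		return write(lguest_fd, req, sizeof(req)) < 0 ? -1 : 0;
	}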
69 | 106 | ||
107 | /*L:040 Once our Guest is initialized, the Launcher makes it run by reading | ||
108 | * from /dev/lguest. */ | ||
70 | static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | 109 | static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) |
71 | { | 110 | { |
72 | struct lguest *lg = file->private_data; | 111 | struct lguest *lg = file->private_data; |
73 | 112 | ||
113 | /* You must write LHREQ_INITIALIZE first! */ | ||
74 | if (!lg) | 114 | if (!lg) |
75 | return -EINVAL; | 115 | return -EINVAL; |
76 | 116 | ||
@@ -78,27 +118,52 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | |||
78 | if (current != lg->tsk) | 118 | if (current != lg->tsk) |
79 | return -EPERM; | 119 | return -EPERM; |
80 | 120 | ||
121 | /* If the guest is already dead, we indicate why */ | ||
81 | if (lg->dead) { | 122 | if (lg->dead) { |
82 | size_t len; | 123 | size_t len; |
83 | 124 | ||
125 | /* lg->dead either contains an error code, or a string. */ | ||
84 | if (IS_ERR(lg->dead)) | 126 | if (IS_ERR(lg->dead)) |
85 | return PTR_ERR(lg->dead); | 127 | return PTR_ERR(lg->dead); |
86 | 128 | ||
129 | /* We can only return as much as the buffer they read with. */ | ||
87 | len = min(size, strlen(lg->dead)+1); | 130 | len = min(size, strlen(lg->dead)+1); |
88 | if (copy_to_user(user, lg->dead, len) != 0) | 131 | if (copy_to_user(user, lg->dead, len) != 0) |
89 | return -EFAULT; | 132 | return -EFAULT; |
90 | return len; | 133 | return len; |
91 | } | 134 | } |
92 | 135 | ||
136 | /* If we returned from read() last time because the Guest sent DMA, | ||
137 | * clear the flag. */ | ||
93 | if (lg->dma_is_pending) | 138 | if (lg->dma_is_pending) |
94 | lg->dma_is_pending = 0; | 139 | lg->dma_is_pending = 0; |
95 | 140 | ||
141 | /* Run the Guest until something interesting happens. */ | ||
96 | return run_guest(lg, (unsigned long __user *)user); | 142 | return run_guest(lg, (unsigned long __user *)user); |
97 | } | 143 | } |
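Put together, the Launcher's main loop is little more than read() in a loop. A hedged sketch (error handling elided; what to do with the DMA notification depends on the Launcher's device code):

	#include <errno.h>
	#include <unistd.h>

	static void run_guest_loop(int lguest_fd)
	{
		for (;;) {
			unsigned long notify;
			ssize_t r = read(lguest_fd, &notify, sizeof(notify));

			if (r < 0 && errno == EINTR)
				continue;	/* signal/Waker: service, retry */
			if (r < 0)
				break;		/* Guest died: read out why */
			/* Otherwise the Guest sent DMA: handle it, loop. */
		}
	}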
98 | 144 | ||
99 | /* Take: pfnlimit, pgdir, start, pageoffset. */ | 145 | /*L:020 The initialization write supplies 4 32-bit values (in addition to the |
146 | * 32-bit LHREQ_INITIALIZE value). These are: | ||
147 | * | ||
148 | * pfnlimit: The highest (Guest-physical) page number the Guest should be | ||
149 | * allowed to access. The Launcher has to live in Guest memory, so it sets | ||
150 | * this to ensure the Guest can't reach it. | ||
151 | * | ||
152 | * pgdir: The (Guest-physical) address of the top of the initial Guest | ||
153 | * pagetables (which are set up by the Launcher). | ||
154 | * | ||
155 | * start: The first instruction to execute ("eip" in x86-speak). | ||
156 | * | ||
157 | * page_offset: The PAGE_OFFSET constant in the Guest kernel. We should | ||
158 | * probably wean the code off this, but it's a very useful constant! Any | ||
159 | * address above this is within the Guest kernel, and any kernel address can | ||
160 | * be quickly converted from physical to virtual by adding PAGE_OFFSET. It's | ||
161 | * 0xC0000000 (3G) by default, but it's configurable at kernel build time. | ||
162 | */ | ||
100 | static int initialize(struct file *file, const u32 __user *input) | 163 | static int initialize(struct file *file, const u32 __user *input) |
101 | { | 164 | { |
165 | /* "struct lguest" contains everything we (the Host) know about a | ||
166 | * Guest. */ | ||
102 | struct lguest *lg; | 167 | struct lguest *lg; |
103 | int err, i; | 168 | int err, i; |
104 | u32 args[4]; | 169 | u32 args[4]; |
@@ -106,7 +171,7 @@ static int initialize(struct file *file, const u32 __user *input) | |||
106 | /* We grab the Big Lguest lock, which protects the global array | 171 | /* We grab the Big Lguest lock, which protects the global array |
107 | * "lguests" and multiple simultaneous initializations. */ | 172 | * "lguests" and multiple simultaneous initializations. */ |
108 | mutex_lock(&lguest_lock); | 173 | mutex_lock(&lguest_lock); |
109 | 174 | /* You can't initialize twice! Close the device and start again... */ | |
110 | if (file->private_data) { | 175 | if (file->private_data) { |
111 | err = -EBUSY; | 176 | err = -EBUSY; |
112 | goto unlock; | 177 | goto unlock; |
@@ -117,37 +182,70 @@ static int initialize(struct file *file, const u32 __user *input) | |||
117 | goto unlock; | 182 | goto unlock; |
118 | } | 183 | } |
119 | 184 | ||
185 | /* Find an unused guest. */ | ||
120 | i = find_free_guest(); | 186 | i = find_free_guest(); |
121 | if (i < 0) { | 187 | if (i < 0) { |
122 | err = -ENOSPC; | 188 | err = -ENOSPC; |
123 | goto unlock; | 189 | goto unlock; |
124 | } | 190 | } |
191 | /* OK, we have an index into the "lguest" array: "lg" is a convenient | ||
192 | * pointer. */ | ||
125 | lg = &lguests[i]; | 193 | lg = &lguests[i]; |
194 | |||
195 | /* Populate the easy fields of our "struct lguest" */ | ||
126 | lg->guestid = i; | 196 | lg->guestid = i; |
127 | lg->pfn_limit = args[0]; | 197 | lg->pfn_limit = args[0]; |
128 | lg->page_offset = args[3]; | 198 | lg->page_offset = args[3]; |
199 | |||
200 | /* We need a complete page for the Guest registers: they are accessible | ||
201 | * to the Guest and we can only grant it access to whole pages. */ | ||
129 | lg->regs_page = get_zeroed_page(GFP_KERNEL); | 202 | lg->regs_page = get_zeroed_page(GFP_KERNEL); |
130 | if (!lg->regs_page) { | 203 | if (!lg->regs_page) { |
131 | err = -ENOMEM; | 204 | err = -ENOMEM; |
132 | goto release_guest; | 205 | goto release_guest; |
133 | } | 206 | } |
207 | /* We actually put the registers at the bottom of the page. */ | ||
134 | lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs); | 208 | lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs); |
135 | 209 | ||
210 | /* Initialize the Guest's shadow page tables, using the toplevel | ||
211 | * address the Launcher gave us. This allocates memory, so can | ||
212 | * fail. */ | ||
136 | err = init_guest_pagetable(lg, args[1]); | 213 | err = init_guest_pagetable(lg, args[1]); |
137 | if (err) | 214 | if (err) |
138 | goto free_regs; | 215 | goto free_regs; |
139 | 216 | ||
217 | /* Now we initialize the Guest's registers, handing it the start | ||
218 | * address. */ | ||
140 | setup_regs(lg->regs, args[2]); | 219 | setup_regs(lg->regs, args[2]); |
220 | |||
221 | /* There are a couple of GDT entries the Guest expects when first | ||
222 | * booting. */ | ||
141 | setup_guest_gdt(lg); | 223 | setup_guest_gdt(lg); |
224 | |||
225 | /* The timer for lguest's clock needs initialization. */ | ||
142 | init_clockdev(lg); | 226 | init_clockdev(lg); |
227 | |||
228 | /* We keep a pointer to the Launcher task (ie. current task) for when | ||
229 | * other Guests want to wake this one (inter-Guest I/O). */ | ||
143 | lg->tsk = current; | 230 | lg->tsk = current; |
231 | /* We need to keep a pointer to the Launcher's memory map, because if | ||
232 | * the Launcher dies we need to clean it up. If we don't keep a | ||
233 | * reference, it is destroyed before close() is called. */ | ||
144 | lg->mm = get_task_mm(lg->tsk); | 234 | lg->mm = get_task_mm(lg->tsk); |
235 | |||
236 | /* Initialize the queue for the waker to wait on */ | ||
145 | init_waitqueue_head(&lg->break_wq); | 237 | init_waitqueue_head(&lg->break_wq); |
238 | |||
239 | /* We remember which CPU's pages this Guest used last, for optimization | ||
240 | * when the same Guest runs on the same CPU twice. */ | ||
146 | lg->last_pages = NULL; | 241 | lg->last_pages = NULL; |
242 | |||
243 | /* We keep our "struct lguest" in the file's private_data. */ | ||
147 | file->private_data = lg; | 244 | file->private_data = lg; |
148 | 245 | ||
149 | mutex_unlock(&lguest_lock); | 246 | mutex_unlock(&lguest_lock); |
150 | 247 | ||
248 | /* And because this is a write() call, we return the length used. */ | ||
151 | return sizeof(args); | 249 | return sizeof(args); |
152 | 250 | ||
153 | free_regs: | 251 | free_regs: |
@@ -159,9 +257,15 @@ unlock: | |||
159 | return err; | 257 | return err; |
160 | } | 258 | } |
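From the Launcher's end, everything initialize() just parsed arrives in one five-word write. A hedged sketch (LHREQ_INITIALIZE assumed from linux/lguest_launcher.h; the four values come from loading the kernel image and building the initial pagetables):

	/* The first write ever: LHREQ_INITIALIZE plus the four values the
	 * L:020 comment above describes. */
	static int init_guest(int lguest_fd, uint32_t pfnlimit, uint32_t pgdir,
			      uint32_t start, uint32_t page_offset)
	{
		uint32_t req[5] = { LHREQ_INITIALIZE,
				    pfnlimit, pgdir, start, page_offset };

		/* On success the Host returns the length of the arguments it
		 * consumed; on failure, a negative error. */
		return write(lguest_fd, req, sizeof(req)) < 0 ? -1 : 0;
	}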
161 | 259 | ||
260 | /*L:010 The first operation the Launcher does must be a write. All writes | ||
261 | * start with a 32 bit number: for the first write this must be | ||
262 | * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use | ||
263 | * writes of other values to get DMA buffers and send interrupts. */ | ||
162 | static ssize_t write(struct file *file, const char __user *input, | 264 | static ssize_t write(struct file *file, const char __user *input, |
163 | size_t size, loff_t *off) | 265 | size_t size, loff_t *off) |
164 | { | 266 | { |
267 | /* Once the guest is initialized, we hold the "struct lguest" in the | ||
268 | * file private data. */ | ||
165 | struct lguest *lg = file->private_data; | 269 | struct lguest *lg = file->private_data; |
166 | u32 req; | 270 | u32 req; |
167 | 271 | ||
@@ -169,8 +273,11 @@ static ssize_t write(struct file *file, const char __user *input, | |||
169 | return -EFAULT; | 273 | return -EFAULT; |
170 | input += sizeof(req); | 274 | input += sizeof(req); |
171 | 275 | ||
276 | /* If you haven't initialized, you must do that first. */ | ||
172 | if (req != LHREQ_INITIALIZE && !lg) | 277 | if (req != LHREQ_INITIALIZE && !lg) |
173 | return -EINVAL; | 278 | return -EINVAL; |
279 | |||
280 | /* Once the Guest is dead, all you can do is read() why it died. */ | ||
174 | if (lg && lg->dead) | 281 | if (lg && lg->dead) |
175 | return -ENOENT; | 282 | return -ENOENT; |
176 | 283 | ||
@@ -192,33 +299,72 @@ static ssize_t write(struct file *file, const char __user *input, | |||
192 | } | 299 | } |
193 | } | 300 | } |
194 | 301 | ||
302 | /*L:060 The final piece of interface code is the close() routine. It reverses | ||
303 | * everything done in initialize(). This is usually called because the | ||
304 | * Launcher exited. | ||
305 | * | ||
306 | * Note that the close routine returns 0 or a negative error number: it can't | ||
307 | * really fail, but it can whine. I blame Sun for this wart, and K&R C for | ||
308 | * letting them do it. :*/ | ||
195 | static int close(struct inode *inode, struct file *file) | 309 | static int close(struct inode *inode, struct file *file) |
196 | { | 310 | { |
197 | struct lguest *lg = file->private_data; | 311 | struct lguest *lg = file->private_data; |
198 | 312 | ||
313 | /* If we never successfully initialized, there's nothing to clean up */ | ||
199 | if (!lg) | 314 | if (!lg) |
200 | return 0; | 315 | return 0; |
201 | 316 | ||
317 | /* We need the big lock, to protect from inter-guest I/O and other | ||
318 | * Launchers initializing guests. */ | ||
202 | mutex_lock(&lguest_lock); | 319 | mutex_lock(&lguest_lock); |
203 | /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ | 320 | /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ |
204 | hrtimer_cancel(&lg->hrt); | 321 | hrtimer_cancel(&lg->hrt); |
322 | /* Free any DMA buffers the Guest had bound. */ | ||
205 | release_all_dma(lg); | 323 | release_all_dma(lg); |
324 | /* Free up the shadow page tables for the Guest. */ | ||
206 | free_guest_pagetable(lg); | 325 | free_guest_pagetable(lg); |
326 | /* Now all the memory cleanups are done, it's safe to release the | ||
327 | * Launcher's memory management structure. */ | ||
207 | mmput(lg->mm); | 328 | mmput(lg->mm); |
329 | /* If lg->dead doesn't contain an error code it will be NULL or a | ||
330 | * kmalloc()ed string, either of which is ok to hand to kfree(). */ | ||
208 | if (!IS_ERR(lg->dead)) | 331 | if (!IS_ERR(lg->dead)) |
209 | kfree(lg->dead); | 332 | kfree(lg->dead); |
333 | /* We can free up the register page we allocated. */ | ||
210 | free_page(lg->regs_page); | 334 | free_page(lg->regs_page); |
335 | /* We clear the entire structure, which also marks it as free for the | ||
336 | * next user. */ | ||
211 | memset(lg, 0, sizeof(*lg)); | 337 | memset(lg, 0, sizeof(*lg)); |
338 | /* Release lock and exit. */ | ||
212 | mutex_unlock(&lguest_lock); | 339 | mutex_unlock(&lguest_lock); |
340 | |||
213 | return 0; | 341 | return 0; |
214 | } | 342 | } |
215 | 343 | ||
344 | /*L:000 | ||
345 | * Welcome to our journey through the Launcher! | ||
346 | * | ||
347 | * The Launcher is the Host userspace program which sets up, runs and services | ||
348 | * the Guest. In fact, many comments in the Drivers which refer to "the Host" | ||
349 | * doing things are inaccurate: the Launcher does all the device handling for | ||
350 | * the Guest. The Guest can't tell what's done by the Launcher and what by | ||
351 | * the Host. | ||
352 | * | ||
353 | * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we | ||
354 | * shall see more of that later. | ||
355 | * | ||
356 | * We begin our understanding with the Host kernel interface which the Launcher | ||
357 | * uses: reading and writing a character device called /dev/lguest. All the | ||
358 | * work happens in the read(), write() and close() routines: */ | ||
216 | static struct file_operations lguest_fops = { | 359 | static struct file_operations lguest_fops = { |
217 | .owner = THIS_MODULE, | 360 | .owner = THIS_MODULE, |
218 | .release = close, | 361 | .release = close, |
219 | .write = write, | 362 | .write = write, |
220 | .read = read, | 363 | .read = read, |
221 | }; | 364 | }; |
365 | |||
366 | /* This is a textbook example of a "misc" character device. Populate a "struct | ||
367 | * miscdevice" and register it with misc_register(). */ | ||
222 | static struct miscdevice lguest_dev = { | 368 | static struct miscdevice lguest_dev = { |
223 | .minor = MISC_DYNAMIC_MINOR, | 369 | .minor = MISC_DYNAMIC_MINOR, |
224 | .name = "lguest", | 370 | .name = "lguest", |
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 1b0ba09b1269..b7a924ace684 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
@@ -1,5 +1,11 @@ | |||
1 | /* Shadow page table operations. | 1 | /*P:700 The pagetable code, on the other hand, still shows the scars of |
2 | * Copyright (C) Rusty Russell IBM Corporation 2006. | 2 | * previous encounters. It's functional, and as neat as it can be in the |
3 | * circumstances, but be wary, for these things are subtle and break easily. | ||
4 | * The Guest provides a virtual to physical mapping, but we can neither trust | ||
5 | * it nor use it: we verify and convert it here to point the hardware to the | ||
6 | * actual Guest pages when running the Guest. :*/ | ||
7 | |||
8 | /* Copyright (C) Rusty Russell IBM Corporation 2006. | ||
3 | * GPL v2 and any later version */ | 9 | * GPL v2 and any later version */ |
4 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
5 | #include <linux/types.h> | 11 | #include <linux/types.h> |
@@ -9,38 +15,96 @@ | |||
9 | #include <asm/tlbflush.h> | 15 | #include <asm/tlbflush.h> |
10 | #include "lg.h" | 16 | #include "lg.h" |
11 | 17 | ||
18 | /*M:008 We hold a reference to pages, which prevents them from being swapped. | ||
19 | * It'd be nice to have a callback in the "struct mm_struct" when Linux wants | ||
20 | * to swap out. If we had this, and a shrinker callback to trim PTE pages, we | ||
21 | * could probably consider launching Guests as non-root. :*/ | ||
22 | |||
23 | /*H:300 | ||
24 | * The Page Table Code | ||
25 | * | ||
26 | * We use two-level page tables for the Guest. If you're not entirely | ||
27 | * comfortable with virtual addresses, physical addresses and page tables then | ||
28 | * I recommend you review lguest.c's "Page Table Handling" (with diagrams!). | ||
29 | * | ||
30 | * The Guest keeps page tables, but we maintain the actual ones here: these are | ||
31 | * called "shadow" page tables. Which is a very Guest-centric name: these are | ||
32 | * the real page tables the CPU uses, although we keep them up to date to | ||
33 | * reflect the Guest's. (See what I mean about weird naming? Since when do | ||
34 | * shadows reflect anything?) | ||
35 | * | ||
36 | * Anyway, this is the most complicated part of the Host code. There are seven | ||
37 | * parts to this: | ||
38 | * (i) Setting up a page table entry for the Guest when it faults, | ||
39 | * (ii) Setting up the page table entry for the Guest stack, | ||
40 | * (iii) Setting up a page table entry when the Guest tells us it has changed, | ||
41 | * (iv) Switching page tables, | ||
42 | * (v) Flushing (throwing away) page tables, | ||
43 | * (vi) Mapping the Switcher when the Guest is about to run, | ||
44 | * (vii) Setting up the page tables initially. | ||
45 | :*/ | ||
46 | |||
47 | /* Pages are 4k long, and each page table entry is 4 bytes long, giving us 1024 | ||
48 | * (or 2^10) entries per page. */ | ||
12 | #define PTES_PER_PAGE_SHIFT 10 | 49 | #define PTES_PER_PAGE_SHIFT 10 |
13 | #define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT) | 50 | #define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT) |
51 | |||
52 | /* 1024 entries in a page table page map 1024 pages: 4MB. The Switcher is | ||
53 | * conveniently placed at the top 4MB, so it uses a separate, complete PTE | ||
54 | * page. */ | ||
14 | #define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1) | 55 | #define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1) |
15 | 56 | ||
57 | /* We actually need a separate PTE page for each CPU. Remember that after the | ||
58 | * Switcher code itself comes two pages for each CPU, and we don't want this | ||
59 | * CPU's guest to see the pages of any other CPU. */ | ||
16 | static DEFINE_PER_CPU(spte_t *, switcher_pte_pages); | 60 | static DEFINE_PER_CPU(spte_t *, switcher_pte_pages); |
17 | #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) | 61 | #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) |
18 | 62 | ||
63 | /*H:320 With our shadow and Guest types established, we need to deal with | ||
64 | * them: the page table code is curly enough to need helper functions to keep | ||
65 | * it clear and clean. | ||
66 | * | ||
67 | * The first helper takes a virtual address, and says which entry in the top | ||
68 | * level page table deals with that address. Since each top level entry deals | ||
69 | * with 4M, this effectively divides by 4M. */ | ||
19 | static unsigned vaddr_to_pgd_index(unsigned long vaddr) | 70 | static unsigned vaddr_to_pgd_index(unsigned long vaddr) |
20 | { | 71 | { |
21 | return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); | 72 | return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); |
22 | } | 73 | } |
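A quick worked example of the split, for the standard i386 two-level layout this file assumes:

	/* vaddr = 0xC0101234:
	 *   pgd index = vaddr >> 22           = 0x300  (which 4MB chunk)
	 *   pte index = (vaddr >> 12) & 0x3ff = 0x101  (which 4k page in it)
	 *   offset    = vaddr & 0xfff         = 0x234  (byte within the page)
	 */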
23 | 74 | ||
24 | /* These access the shadow versions (ie. the ones used by the CPU). */ | 75 | /* There are two functions which return pointers to the shadow (aka "real") |
76 | * page tables. | ||
77 | * | ||
78 | * spgd_addr() takes the virtual address and returns a pointer to the top-level | ||
79 | * page directory entry for that address. Since we keep track of several page | ||
80 | * tables, the "i" argument tells us which one we're interested in (it's | ||
81 | * usually the current one). */ | ||
25 | static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) | 82 | static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) |
26 | { | 83 | { |
27 | unsigned int index = vaddr_to_pgd_index(vaddr); | 84 | unsigned int index = vaddr_to_pgd_index(vaddr); |
28 | 85 | ||
86 | /* We kill any Guest trying to touch the Switcher addresses. */ | ||
29 | if (index >= SWITCHER_PGD_INDEX) { | 87 | if (index >= SWITCHER_PGD_INDEX) { |
30 | kill_guest(lg, "attempt to access switcher pages"); | 88 | kill_guest(lg, "attempt to access switcher pages"); |
31 | index = 0; | 89 | index = 0; |
32 | } | 90 | } |
91 | /* Return a pointer to the index'th pgd entry for the i'th page table. */ | ||
33 | return &lg->pgdirs[i].pgdir[index]; | 92 | return &lg->pgdirs[i].pgdir[index]; |
34 | } | 93 | } |
35 | 94 | ||
95 | /* This routine then takes the PGD entry given above, which contains the | ||
96 | * address of the PTE page. It then returns a pointer to the PTE entry for the | ||
97 | * given address. */ | ||
36 | static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr) | 98 | static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr) |
37 | { | 99 | { |
38 | spte_t *page = __va(spgd.pfn << PAGE_SHIFT); | 100 | spte_t *page = __va(spgd.pfn << PAGE_SHIFT); |
101 | /* You should never call this if the PGD entry wasn't valid */ | ||
39 | BUG_ON(!(spgd.flags & _PAGE_PRESENT)); | 102 | BUG_ON(!(spgd.flags & _PAGE_PRESENT)); |
40 | return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; | 103 | return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; |
41 | } | 104 | } |
42 | 105 | ||
43 | /* These access the guest versions. */ | 106 | /* These two functions are just like the above two, except they access the Guest |
107 | * page tables. Hence they return a Guest address. */ | ||
44 | static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) | 108 | static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) |
45 | { | 109 | { |
46 | unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); | 110 | unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); |
@@ -55,12 +119,24 @@ static unsigned long gpte_addr(struct lguest *lg, | |||
55 | return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); | 119 | return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); |
56 | } | 120 | } |
57 | 121 | ||
58 | /* Do a virtual -> physical mapping on a user page. */ | 122 | /*H:350 This routine takes a page number given by the Guest and converts it to |
123 | * an actual, physical page number. It can fail for several reasons: the | ||
124 | * virtual address might not be mapped by the Launcher, the write flag is set | ||
125 | * and the page is read-only, or the write flag was set and the page was | ||
126 | * shared so had to be copied, but we ran out of memory. | ||
127 | * | ||
128 | * This holds a reference to the page, so release_pte() is careful to | ||
129 | * put that back. */ | ||
59 | static unsigned long get_pfn(unsigned long virtpfn, int write) | 130 | static unsigned long get_pfn(unsigned long virtpfn, int write) |
60 | { | 131 | { |
61 | struct page *page; | 132 | struct page *page; |
133 | /* This value indicates failure. */ | ||
62 | unsigned long ret = -1UL; | 134 | unsigned long ret = -1UL; |
63 | 135 | ||
136 | /* get_user_pages() is a complex interface: it gets the "struct | ||
137 | * vm_area_struct" and "struct page" associated with a range of pages. | ||
138 | * It also needs the task's mmap_sem held, and is not very quick. | ||
139 | * It returns the number of pages it got. */ | ||
64 | down_read(¤t->mm->mmap_sem); | 140 | down_read(¤t->mm->mmap_sem); |
65 | if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT, | 141 | if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT, |
66 | 1, write, 1, &page, NULL) == 1) | 142 | 1, write, 1, &page, NULL) == 1) |
@@ -69,28 +145,47 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) | |||
69 | return ret; | 145 | return ret; |
70 | } | 146 | } |
71 | 147 | ||
148 | /*H:340 Converting a Guest page table entry to a shadow (ie. real) page table | ||
149 | * entry can be a little tricky. The flags are (almost) the same, but the | ||
150 | * Guest PTE contains a virtual page number: the CPU needs the real page | ||
151 | * number. */ | ||
72 | static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) | 152 | static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) |
73 | { | 153 | { |
74 | spte_t spte; | 154 | spte_t spte; |
75 | unsigned long pfn; | 155 | unsigned long pfn; |
76 | 156 | ||
77 | /* We ignore the global flag. */ | 157 | /* The Guest sets the global flag, because it thinks that it is using |
158 | * PGE. We only told it to use PGE so it would tell us whether it was | ||
159 | * flushing a kernel mapping or a userspace mapping. We don't actually | ||
160 | * use the global bit, so throw it away. */ | ||
78 | spte.flags = (gpte.flags & ~_PAGE_GLOBAL); | 161 | spte.flags = (gpte.flags & ~_PAGE_GLOBAL); |
162 | |||
163 | /* We need a temporary "unsigned long" variable to hold the answer from | ||
164 | * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't | ||
165 | * fit in spte.pfn. get_pfn() finds the real physical number of the | ||
166 | * page, given the virtual number. */ | ||
79 | pfn = get_pfn(gpte.pfn, write); | 167 | pfn = get_pfn(gpte.pfn, write); |
80 | if (pfn == -1UL) { | 168 | if (pfn == -1UL) { |
81 | kill_guest(lg, "failed to get page %u", gpte.pfn); | 169 | kill_guest(lg, "failed to get page %u", gpte.pfn); |
82 | /* Must not put_page() bogus page on cleanup. */ | 170 | /* When we destroy the Guest, we'll go through the shadow page |
171 | * tables and release_pte() them. Make sure we don't think | ||
172 | * this one is valid! */ | ||
83 | spte.flags = 0; | 173 | spte.flags = 0; |
84 | } | 174 | } |
175 | /* Now we assign the page number, and our shadow PTE is complete. */ | ||
85 | spte.pfn = pfn; | 176 | spte.pfn = pfn; |
86 | return spte; | 177 | return spte; |
87 | } | 178 | } |
88 | 179 | ||
180 | /*H:460 And to complete the chain, release_pte() looks like this: */ | ||
89 | static void release_pte(spte_t pte) | 181 | static void release_pte(spte_t pte) |
90 | { | 182 | { |
183 | /* Remember that get_user_pages() took a reference to the page, in | ||
184 | * get_pfn()? We have to put it back now. */ | ||
91 | if (pte.flags & _PAGE_PRESENT) | 185 | if (pte.flags & _PAGE_PRESENT) |
92 | put_page(pfn_to_page(pte.pfn)); | 186 | put_page(pfn_to_page(pte.pfn)); |
93 | } | 187 | } |
188 | /*:*/ | ||
94 | 189 | ||
95 | static void check_gpte(struct lguest *lg, gpte_t gpte) | 190 | static void check_gpte(struct lguest *lg, gpte_t gpte) |
96 | { | 191 | { |
@@ -104,11 +199,16 @@ static void check_gpgd(struct lguest *lg, gpgd_t gpgd) | |||
104 | kill_guest(lg, "bad page directory entry"); | 199 | kill_guest(lg, "bad page directory entry"); |
105 | } | 200 | } |
106 | 201 | ||
107 | /* FIXME: We hold reference to pages, which prevents them from being | 202 | /*H:330 |
108 | swapped. It'd be nice to have a callback when Linux wants to swap out. */ | 203 | * (i) Setting up a page table entry for the Guest when it faults |
109 | 204 | * | |
110 | /* We fault pages in, which allows us to update accessed/dirty bits. | 205 | * We saw this call in run_guest(): when we see a page fault in the Guest, we |
111 | * Return true if we got page. */ | 206 | * come here. That's because we only set up the shadow page tables lazily as |
207 | * they're needed, so we get page faults all the time and quietly fix them up | ||
208 | * and return to the Guest without it knowing. | ||
209 | * | ||
210 | * If we fixed up the fault (ie. we mapped the address), this routine returns | ||
211 | * true. */ | ||
112 | int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | 212 | int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) |
113 | { | 213 | { |
114 | gpgd_t gpgd; | 214 | gpgd_t gpgd; |
@@ -117,106 +217,161 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
117 | gpte_t gpte; | 217 | gpte_t gpte; |
118 | spte_t *spte; | 218 | spte_t *spte; |
119 | 219 | ||
220 | /* First step: get the top-level Guest page table entry. */ | ||
120 | gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); | 221 | gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); |
222 | /* Toplevel not present? We can't map it in. */ | ||
121 | if (!(gpgd.flags & _PAGE_PRESENT)) | 223 | if (!(gpgd.flags & _PAGE_PRESENT)) |
122 | return 0; | 224 | return 0; |
123 | 225 | ||
226 | /* Now look at the matching shadow entry. */ | ||
124 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); | 227 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); |
125 | if (!(spgd->flags & _PAGE_PRESENT)) { | 228 | if (!(spgd->flags & _PAGE_PRESENT)) { |
126 | /* Get a page of PTEs for them. */ | 229 | /* No shadow entry: allocate a new shadow PTE page. */ |
127 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | 230 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); |
128 | /* FIXME: Steal from self in this case? */ | 231 | /* This is not really the Guest's fault, but killing it is |
232 | * simple for this corner case. */ | ||
129 | if (!ptepage) { | 233 | if (!ptepage) { |
130 | kill_guest(lg, "out of memory allocating pte page"); | 234 | kill_guest(lg, "out of memory allocating pte page"); |
131 | return 0; | 235 | return 0; |
132 | } | 236 | } |
237 | /* We check that the Guest pgd is OK. */ | ||
133 | check_gpgd(lg, gpgd); | 238 | check_gpgd(lg, gpgd); |
239 | /* And we copy the flags to the shadow PGD entry. The page | ||
240 | * number in the shadow PGD is the page we just allocated. */ | ||
134 | spgd->raw.val = (__pa(ptepage) | gpgd.flags); | 241 | spgd->raw.val = (__pa(ptepage) | gpgd.flags); |
135 | } | 242 | } |
136 | 243 | ||
244 | /* OK, now we look at the lower level in the Guest page table: keep its | ||
245 | * address, because we might update it later. */ | ||
137 | gpte_ptr = gpte_addr(lg, gpgd, vaddr); | 246 | gpte_ptr = gpte_addr(lg, gpgd, vaddr); |
138 | gpte = mkgpte(lgread_u32(lg, gpte_ptr)); | 247 | gpte = mkgpte(lgread_u32(lg, gpte_ptr)); |
139 | 248 | ||
140 | /* No page? */ | 249 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
141 | if (!(gpte.flags & _PAGE_PRESENT)) | 250 | if (!(gpte.flags & _PAGE_PRESENT)) |
142 | return 0; | 251 | return 0; |
143 | 252 | ||
144 | /* Write to read-only page? */ | 253 | /* Check they're not trying to write to a page the Guest wants |
254 | * read-only (bit 2 of errcode == write). */ | ||
145 | if ((errcode & 2) && !(gpte.flags & _PAGE_RW)) | 255 | if ((errcode & 2) && !(gpte.flags & _PAGE_RW)) |
146 | return 0; | 256 | return 0; |
147 | 257 | ||
148 | /* User access to a non-user page? */ | 258 | /* User access to a kernel page? (bit 3 == user access) */ |
149 | if ((errcode & 4) && !(gpte.flags & _PAGE_USER)) | 259 | if ((errcode & 4) && !(gpte.flags & _PAGE_USER)) |
150 | return 0; | 260 | return 0; |
151 | 261 | ||
262 | /* Check that the Guest PTE flags are OK, and the page number is below | ||
263 | * the pfn_limit (ie. not mapping the Launcher binary). */ | ||
152 | check_gpte(lg, gpte); | 264 | check_gpte(lg, gpte); |
265 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ | ||
153 | gpte.flags |= _PAGE_ACCESSED; | 266 | gpte.flags |= _PAGE_ACCESSED; |
154 | if (errcode & 2) | 267 | if (errcode & 2) |
155 | gpte.flags |= _PAGE_DIRTY; | 268 | gpte.flags |= _PAGE_DIRTY; |
156 | 269 | ||
157 | /* We're done with the old pte. */ | 270 | /* Get the pointer to the shadow PTE entry we're going to set. */ |
158 | spte = spte_addr(lg, *spgd, vaddr); | 271 | spte = spte_addr(lg, *spgd, vaddr); |
272 | /* If there was a valid shadow PTE entry here before, we release it. | ||
273 | * This can happen with a write to a previously read-only entry. */ | ||
159 | release_pte(*spte); | 274 | release_pte(*spte); |
160 | 275 | ||
161 | /* We don't make it writable if this isn't a write: later | 276 | /* If this is a write, we insist that the Guest page is writable (the |
162 | * write will fault so we can set dirty bit in guest. */ | 277 | * final arg to gpte_to_spte()). */ |
163 | if (gpte.flags & _PAGE_DIRTY) | 278 | if (gpte.flags & _PAGE_DIRTY) |
164 | *spte = gpte_to_spte(lg, gpte, 1); | 279 | *spte = gpte_to_spte(lg, gpte, 1); |
165 | else { | 280 | else { |
281 | /* If this is a read, don't set the "writable" bit in the page | ||
282 | * table entry, even if the Guest says it's writable. That way | ||
283 | * we come back here when a write does actually occur, so we can | ||
284 | * update the Guest's _PAGE_DIRTY flag. */ | ||
166 | gpte_t ro_gpte = gpte; | 285 | gpte_t ro_gpte = gpte; |
167 | ro_gpte.flags &= ~_PAGE_RW; | 286 | ro_gpte.flags &= ~_PAGE_RW; |
168 | *spte = gpte_to_spte(lg, ro_gpte, 0); | 287 | *spte = gpte_to_spte(lg, ro_gpte, 0); |
169 | } | 288 | } |
170 | 289 | ||
171 | /* Now we update dirty/accessed on guest. */ | 290 | /* Finally, we write the Guest PTE entry back: we've set the |
291 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ | ||
172 | lgwrite_u32(lg, gpte_ptr, gpte.raw.val); | 292 | lgwrite_u32(lg, gpte_ptr, gpte.raw.val); |
293 | |||
294 | /* We succeeded in mapping the page! */ | ||
173 | return 1; | 295 | return 1; |
174 | } | 296 | } |
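(The errcode tests above follow the x86 page-fault error code layout: bit 0 set means the page was present and this is a protection fault, bit 1 means a write, bit 2 means a userspace access. A standalone sketch of that decoding, not lguest code:)

    #include <stdio.h>

    #define FAULT_PRESENT 1   /* 0: page missing; 1: protection fault */
    #define FAULT_WRITE   2   /* 0: read;         1: write */
    #define FAULT_USER    4   /* 0: kernel;       1: userspace access */

    static void describe_fault(int errcode)
    {
            printf("%s %s fault, page %s\n",
                   (errcode & FAULT_USER)    ? "user"    : "kernel",
                   (errcode & FAULT_WRITE)   ? "write"   : "read",
                   (errcode & FAULT_PRESENT) ? "present" : "missing");
    }

    int main(void)
    {
            describe_fault(2);   /* kernel write to an unmapped page */
            describe_fault(7);   /* userspace write protection fault */
            return 0;
    }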
175 | 297 | ||
176 | /* This is much faster than the full demand_page logic. */ | 298 | /*H:360 (ii) Setting up the page table entry for the Guest stack. |
299 | * | ||
300 | * Remember pin_stack_pages() which makes sure the stack is mapped? It could | ||
301 | * simply call demand_page(), but as we've seen that logic is quite long, and | ||
302 | * usually the stack pages are already mapped anyway, so it's not required. | ||
303 | * | ||
304 | * This is a quick version which answers the question: is this virtual address | ||
305 | * mapped by the shadow page tables, and is it writable? */ | ||
177 | static int page_writable(struct lguest *lg, unsigned long vaddr) | 306 | static int page_writable(struct lguest *lg, unsigned long vaddr) |
178 | { | 307 | { |
179 | spgd_t *spgd; | 308 | spgd_t *spgd; |
180 | unsigned long flags; | 309 | unsigned long flags; |
181 | 310 | ||
311 | /* Look at the top level entry: is it present? */ | ||
182 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); | 312 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); |
183 | if (!(spgd->flags & _PAGE_PRESENT)) | 313 | if (!(spgd->flags & _PAGE_PRESENT)) |
184 | return 0; | 314 | return 0; |
185 | 315 | ||
316 | /* Check the flags on the pte entry itself: it must be present and | ||
317 | * writable. */ | ||
186 | flags = spte_addr(lg, *spgd, vaddr)->flags; | 318 | flags = spte_addr(lg, *spgd, vaddr)->flags; |
187 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); | 319 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); |
188 | } | 320 | } |
189 | 321 | ||
322 | /* So, when pin_stack_pages() asks us to pin a page, we check if it's already | ||
323 | * in the page tables, and if not, we call demand_page() with error code 2 | ||
324 | * (meaning "write"). */ | ||
190 | void pin_page(struct lguest *lg, unsigned long vaddr) | 325 | void pin_page(struct lguest *lg, unsigned long vaddr) |
191 | { | 326 | { |
192 | if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2)) | 327 | if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2)) |
193 | kill_guest(lg, "bad stack page %#lx", vaddr); | 328 | kill_guest(lg, "bad stack page %#lx", vaddr); |
194 | } | 329 | } |
195 | 330 | ||
331 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ | ||
196 | static void release_pgd(struct lguest *lg, spgd_t *spgd) | 332 | static void release_pgd(struct lguest *lg, spgd_t *spgd) |
197 | { | 333 | { |
334 | /* If the entry's not present, there's nothing to release. */ | ||
198 | if (spgd->flags & _PAGE_PRESENT) { | 335 | if (spgd->flags & _PAGE_PRESENT) { |
199 | unsigned int i; | 336 | unsigned int i; |
337 | /* Converting the pfn to find the actual PTE page is easy: turn | ||
338 | * the page number into a physical address, then convert to a | ||
339 | * virtual address (easy for kernel pages like this one). */ | ||
200 | spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT); | 340 | spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT); |
341 | /* For each entry in the page, we might need to release it. */ | ||
201 | for (i = 0; i < PTES_PER_PAGE; i++) | 342 | for (i = 0; i < PTES_PER_PAGE; i++) |
202 | release_pte(ptepage[i]); | 343 | release_pte(ptepage[i]); |
344 | /* Now we can free the page of PTEs */ | ||
203 | free_page((long)ptepage); | 345 | free_page((long)ptepage); |
346 | /* And zero out the PGD entry so we never release it twice. */ | ||
204 | spgd->raw.val = 0; | 347 | spgd->raw.val = 0; |
205 | } | 348 | } |
206 | } | 349 | } |
207 | 350 | ||
351 | /*H:440 (v) Flushing (throwing away) page tables, | ||
352 | * | ||
353 | * We saw flush_user_mappings() called when we re-used a top-level pgdir page. | ||
354 | * It simply releases every PTE page from 0 up to the kernel address. */ | ||
208 | static void flush_user_mappings(struct lguest *lg, int idx) | 355 | static void flush_user_mappings(struct lguest *lg, int idx) |
209 | { | 356 | { |
210 | unsigned int i; | 357 | unsigned int i; |
358 | /* Release every pgd entry up to the kernel's address. */ | ||
211 | for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++) | 359 | for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++) |
212 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); | 360 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); |
213 | } | 361 | } |
214 | 362 | ||
363 | /* The Guest also has a hypercall to do this manually: it's used when a large | ||
364 | * number of mappings have been changed. */ | ||
215 | void guest_pagetable_flush_user(struct lguest *lg) | 365 | void guest_pagetable_flush_user(struct lguest *lg) |
216 | { | 366 | { |
367 | /* Drop the userspace part of the current page table. */ | ||
217 | flush_user_mappings(lg, lg->pgdidx); | 368 | flush_user_mappings(lg, lg->pgdidx); |
218 | } | 369 | } |
370 | /*:*/ | ||
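(A worked number may help here. Assuming the common i386 PAGE_OFFSET of 0xC0000000 -- an assumption, since the Guest reports its own page_offset -- the loop bound comes out as follows:)

    #include <stdio.h>

    int main(void)
    {
            /* Assumed, typical i386 value; the Guest tells us its own. */
            unsigned long page_offset = 0xC0000000UL;

            /* Same shift as vaddr_to_pgd_index(): PAGE_SHIFT (12) plus
             * PTES_PER_PAGE_SHIFT (10). */
            unsigned kernel_start = page_offset >> 22;

            /* Prints: entries 0..767 flushed, 256 kernel entries stay. */
            printf("flush releases pgd entries 0..%u; %u kernel entries stay\n",
                   kernel_start - 1, 1024 - kernel_start);
            return 0;
    }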
219 | 371 | ||
372 | /* We keep several page tables. This is a simple routine to find the page | ||
373 | * table (if any) corresponding to this top-level address the Guest has given | ||
374 | * us. */ | ||
220 | static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) | 375 | static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) |
221 | { | 376 | { |
222 | unsigned int i; | 377 | unsigned int i; |
@@ -226,21 +381,30 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) | |||
226 | return i; | 381 | return i; |
227 | } | 382 | } |
228 | 383 | ||
384 | /*H:435 And this is us, creating the new page directory. If we really do | ||
385 | * allocate a new one (and so the kernel parts are not there), we set | ||
386 | * blank_pgdir. */ | ||
229 | static unsigned int new_pgdir(struct lguest *lg, | 387 | static unsigned int new_pgdir(struct lguest *lg, |
230 | unsigned long cr3, | 388 | unsigned long cr3, |
231 | int *blank_pgdir) | 389 | int *blank_pgdir) |
232 | { | 390 | { |
233 | unsigned int next; | 391 | unsigned int next; |
234 | 392 | ||
393 | /* We pick one entry at random to throw out. Choosing the Least | ||
394 | * Recently Used might be better, but this is easy. */ | ||
235 | next = random32() % ARRAY_SIZE(lg->pgdirs); | 395 | next = random32() % ARRAY_SIZE(lg->pgdirs); |
396 | /* If it's never been allocated at all before, try now. */ | ||
236 | if (!lg->pgdirs[next].pgdir) { | 397 | if (!lg->pgdirs[next].pgdir) { |
237 | lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL); | 398 | lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL); |
399 | /* If the allocation fails, just keep using the one we have */ | ||
238 | if (!lg->pgdirs[next].pgdir) | 400 | if (!lg->pgdirs[next].pgdir) |
239 | next = lg->pgdidx; | 401 | next = lg->pgdidx; |
240 | else | 402 | else |
241 | /* There are no mappings: you'll need to re-pin */ | 403 | /* This is a blank page, so there are no kernel |
404 | * mappings: caller must map the stack! */ | ||
242 | *blank_pgdir = 1; | 405 | *blank_pgdir = 1; |
243 | } | 406 | } |
407 | /* Record which Guest toplevel this shadows. */ | ||
244 | lg->pgdirs[next].cr3 = cr3; | 408 | lg->pgdirs[next].cr3 = cr3; |
245 | /* Release all the non-kernel mappings. */ | 409 | /* Release all the non-kernel mappings. */ |
246 | flush_user_mappings(lg, next); | 410 | flush_user_mappings(lg, next); |
@@ -248,82 +412,161 @@ static unsigned int new_pgdir(struct lguest *lg, | |||
248 | return next; | 412 | return next; |
249 | } | 413 | } |
250 | 414 | ||
415 | /*H:430 (iv) Switching page tables | ||
416 | * | ||
417 | * This is what happens when the Guest changes page tables (ie. changes the | ||
418 | * top-level pgdir). This happens on almost every context switch. */ | ||
251 | void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) | 419 | void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) |
252 | { | 420 | { |
253 | int newpgdir, repin = 0; | 421 | int newpgdir, repin = 0; |
254 | 422 | ||
423 | /* Look to see if we have this one already. */ | ||
255 | newpgdir = find_pgdir(lg, pgtable); | 424 | newpgdir = find_pgdir(lg, pgtable); |
425 | /* If not, we allocate or mug an existing one: if it's a fresh one, | ||
426 | * repin gets set to 1. */ | ||
256 | if (newpgdir == ARRAY_SIZE(lg->pgdirs)) | 427 | if (newpgdir == ARRAY_SIZE(lg->pgdirs)) |
257 | newpgdir = new_pgdir(lg, pgtable, &repin); | 428 | newpgdir = new_pgdir(lg, pgtable, &repin); |
429 | /* Change the current pgd index to the new one. */ | ||
258 | lg->pgdidx = newpgdir; | 430 | lg->pgdidx = newpgdir; |
431 | /* If it was completely blank, we map in the Guest kernel stack */ | ||
259 | if (repin) | 432 | if (repin) |
260 | pin_stack_pages(lg); | 433 | pin_stack_pages(lg); |
261 | } | 434 | } |
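(The find-or-evict dance is just a tiny fully-associative cache with random replacement. A standalone sketch of the same policy -- every name below is made up for illustration, none of it is lguest's code:)

    #include <stdio.h>
    #include <stdlib.h>

    #define NCACHED 4

    static unsigned long cached_cr3[NCACHED];

    /* Return the slot shadowing cr3; on a miss, mug a random victim
     * and tell the caller it must re-populate ("repin"). */
    static unsigned find_or_evict(unsigned long cr3, int *fresh)
    {
            unsigned i;

            *fresh = 0;
            for (i = 0; i < NCACHED; i++)
                    if (cached_cr3[i] == cr3)
                            return i;          /* hit: reuse the shadow */

            i = rand() % NCACHED;              /* miss: evict at random */
            cached_cr3[i] = cr3;
            *fresh = 1;
            return i;
    }

    int main(void)
    {
            int fresh;
            unsigned slot = find_or_evict(0x1000, &fresh);

            printf("slot %u, %s\n", slot,
                   fresh ? "fresh: caller must repin" : "cached");
            return 0;
    }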
262 | 435 | ||
436 | /*H:470 Finally, a routine which throws away everything: all PGD entries in all | ||
437 | * the shadow page tables. This is used when we destroy the Guest. */ | ||
263 | static void release_all_pagetables(struct lguest *lg) | 438 | static void release_all_pagetables(struct lguest *lg) |
264 | { | 439 | { |
265 | unsigned int i, j; | 440 | unsigned int i, j; |
266 | 441 | ||
442 | /* Every shadow pagetable this Guest has */ | ||
267 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 443 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
268 | if (lg->pgdirs[i].pgdir) | 444 | if (lg->pgdirs[i].pgdir) |
445 | /* Every PGD entry except the Switcher at the top */ | ||
269 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) | 446 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) |
270 | release_pgd(lg, lg->pgdirs[i].pgdir + j); | 447 | release_pgd(lg, lg->pgdirs[i].pgdir + j); |
271 | } | 448 | } |
272 | 449 | ||
450 | /* We also throw away everything when a Guest tells us it's changed a kernel | ||
451 | * mapping. Since kernel mappings are in every page table, it's easiest to | ||
452 | * throw them all away. This is amazingly slow, but thankfully rare. */ | ||
273 | void guest_pagetable_clear_all(struct lguest *lg) | 453 | void guest_pagetable_clear_all(struct lguest *lg) |
274 | { | 454 | { |
275 | release_all_pagetables(lg); | 455 | release_all_pagetables(lg); |
456 | /* We need the Guest kernel stack mapped again. */ | ||
276 | pin_stack_pages(lg); | 457 | pin_stack_pages(lg); |
277 | } | 458 | } |
278 | 459 | ||
460 | /*H:420 This is the routine which actually sets the page table entry for the | ||
461 | * "idx"'th shadow page table. | ||
462 | * | ||
463 | * Normally, we can just throw out the old entry and replace it with 0: if they | ||
464 | * use it demand_page() will put the new entry in. We need to do this anyway: | ||
465 | * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page | ||
466 | * is read from, and _PAGE_DIRTY when it's written to. | ||
467 | * | ||
468 | * But Avi Kivity pointed out that most Operating Systems (Linux included) set | ||
469 | * these bits on PTEs immediately anyway. This is done to save the CPU from | ||
470 | * having to update them, but it helps us the same way: if they set | ||
471 | * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if | ||
472 | * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. | ||
473 | */ | ||
279 | static void do_set_pte(struct lguest *lg, int idx, | 474 | static void do_set_pte(struct lguest *lg, int idx, |
280 | unsigned long vaddr, gpte_t gpte) | 475 | unsigned long vaddr, gpte_t gpte) |
281 | { | 476 | { |
477 | /* Look up the matching shadow page directory entry. */ | ||
282 | spgd_t *spgd = spgd_addr(lg, idx, vaddr); | 478 | spgd_t *spgd = spgd_addr(lg, idx, vaddr); |
479 | |||
480 | /* If the top level isn't present, there's no entry to update. */ | ||
283 | if (spgd->flags & _PAGE_PRESENT) { | 481 | if (spgd->flags & _PAGE_PRESENT) { |
482 | /* Otherwise, we start by releasing the existing entry. */ | ||
284 | spte_t *spte = spte_addr(lg, *spgd, vaddr); | 483 | spte_t *spte = spte_addr(lg, *spgd, vaddr); |
285 | release_pte(*spte); | 484 | release_pte(*spte); |
485 | |||
486 | /* If they're setting this entry as dirty or accessed, we might | ||
487 | * as well put that entry they've given us in now. This shaves | ||
488 | * 10% off a copy-on-write micro-benchmark. */ | ||
286 | if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) { | 489 | if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) { |
287 | check_gpte(lg, gpte); | 490 | check_gpte(lg, gpte); |
288 | *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY); | 491 | *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY); |
289 | } else | 492 | } else |
493 | /* Otherwise we can demand_page() it in later. */ | ||
290 | spte->raw.val = 0; | 494 | spte->raw.val = 0; |
291 | } | 495 | } |
292 | } | 496 | } |
293 | 497 | ||
498 | /*H:410 Updating a PTE entry is a little trickier. | ||
499 | * | ||
500 | * We keep track of several different page tables (the Guest uses one for each | ||
501 | * process, so it makes sense to cache at least a few). Each of these have | ||
502 | * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for | ||
503 | * all processes. So when the page table above that address changes, we update | ||
504 | * all the page tables, not just the current one. This is rare. | ||
505 | * | ||
506 | * The benefit is that when we have to track a new page table, we can keep | ||
507 | * all the kernel mappings. This speeds up context switch immensely. */ | ||
294 | void guest_set_pte(struct lguest *lg, | 508 | void guest_set_pte(struct lguest *lg, |
295 | unsigned long cr3, unsigned long vaddr, gpte_t gpte) | 509 | unsigned long cr3, unsigned long vaddr, gpte_t gpte) |
296 | { | 510 | { |
297 | /* Kernel mappings must be changed on all top levels. */ | 511 | /* Kernel mappings must be changed on all top levels. Slow, but |
512 | * doesn't happen often. */ | ||
298 | if (vaddr >= lg->page_offset) { | 513 | if (vaddr >= lg->page_offset) { |
299 | unsigned int i; | 514 | unsigned int i; |
300 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 515 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
301 | if (lg->pgdirs[i].pgdir) | 516 | if (lg->pgdirs[i].pgdir) |
302 | do_set_pte(lg, i, vaddr, gpte); | 517 | do_set_pte(lg, i, vaddr, gpte); |
303 | } else { | 518 | } else { |
519 | /* Is this page table one we have a shadow for? */ | ||
304 | int pgdir = find_pgdir(lg, cr3); | 520 | int pgdir = find_pgdir(lg, cr3); |
305 | if (pgdir != ARRAY_SIZE(lg->pgdirs)) | 521 | if (pgdir != ARRAY_SIZE(lg->pgdirs)) |
522 | /* If so, do the update. */ | ||
306 | do_set_pte(lg, pgdir, vaddr, gpte); | 523 | do_set_pte(lg, pgdir, vaddr, gpte); |
307 | } | 524 | } |
308 | } | 525 | } |
309 | 526 | ||
527 | /*H:400 | ||
528 | * (iii) Setting up a page table entry when the Guest tells us it has changed. | ||
529 | * | ||
530 | * Just like we did in interrupts_and_traps.c, it makes sense for us to deal | ||
531 | * with the other side of page tables while we're here: what happens when the | ||
532 | * Guest asks for a page table to be updated? | ||
533 | * | ||
534 | * We already saw that demand_page() will fill in the shadow page tables when | ||
535 | * needed, so we can simply remove shadow page table entries whenever the Guest | ||
536 | * tells us they've changed. When the Guest tries to use the new entry it will | ||
537 | * fault and demand_page() will fix it up. | ||
538 | * | ||
539 | * So with that in mind, here's our code to update a (top-level) PGD entry: | ||
540 | */ | ||
310 | void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) | 541 | void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) |
311 | { | 542 | { |
312 | int pgdir; | 543 | int pgdir; |
313 | 544 | ||
545 | /* The kernel seems to try to initialize this early on: we ignore its | ||
546 | * attempts to map over the Switcher. */ | ||
314 | if (idx >= SWITCHER_PGD_INDEX) | 547 | if (idx >= SWITCHER_PGD_INDEX) |
315 | return; | 548 | return; |
316 | 549 | ||
550 | /* If they're talking about a page table we have a shadow for... */ | ||
317 | pgdir = find_pgdir(lg, cr3); | 551 | pgdir = find_pgdir(lg, cr3); |
318 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) | 552 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) |
553 | /* ... throw it away. */ | ||
319 | release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); | 554 | release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); |
320 | } | 555 | } |
321 | 556 | ||
557 | /*H:500 (vii) Setting up the page tables initially. | ||
558 | * | ||
559 | * When a Guest is first created, the Launcher tells us where the toplevel of | ||
560 | * its first page table is. We set some things up here: */ | ||
322 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) | 561 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) |
323 | { | 562 | { |
324 | /* We assume this in flush_user_mappings, so check now */ | 563 | /* In flush_user_mappings() we loop from 0 to |
564 | * "vaddr_to_pgd_index(lg->page_offset)". This assumes it won't hit | ||
565 | * the Switcher mappings, so check that now. */ | ||
325 | if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) | 566 | if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) |
326 | return -EINVAL; | 567 | return -EINVAL; |
568 | /* We start on the first shadow page table, and give it a blank PGD | ||
569 | * page. */ | ||
327 | lg->pgdidx = 0; | 570 | lg->pgdidx = 0; |
328 | lg->pgdirs[lg->pgdidx].cr3 = pgtable; | 571 | lg->pgdirs[lg->pgdidx].cr3 = pgtable; |
329 | lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL); | 572 | lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL); |
@@ -332,33 +575,48 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) | |||
332 | return 0; | 575 | return 0; |
333 | } | 576 | } |
334 | 577 | ||
578 | /* When a Guest dies, our cleanup is fairly simple. */ | ||
335 | void free_guest_pagetable(struct lguest *lg) | 579 | void free_guest_pagetable(struct lguest *lg) |
336 | { | 580 | { |
337 | unsigned int i; | 581 | unsigned int i; |
338 | 582 | ||
583 | /* Throw away all page table pages. */ | ||
339 | release_all_pagetables(lg); | 584 | release_all_pagetables(lg); |
585 | /* Now free the top levels: free_page() can handle 0 just fine. */ | ||
340 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 586 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
341 | free_page((long)lg->pgdirs[i].pgdir); | 587 | free_page((long)lg->pgdirs[i].pgdir); |
342 | } | 588 | } |
343 | 589 | ||
344 | /* Caller must be preempt-safe */ | 590 | /*H:480 (vi) Mapping the Switcher when the Guest is about to run. |
591 | * | ||
592 | * The Switcher and the two pages for this CPU need to be available to the | ||
593 | * Guest (and not the pages for other CPUs). We have the appropriate PTE pages | ||
594 | * for each CPU already set up, we just need to hook them in. */ | ||
345 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) | 595 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) |
346 | { | 596 | { |
347 | spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); | 597 | spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); |
348 | spgd_t switcher_pgd; | 598 | spgd_t switcher_pgd; |
349 | spte_t regs_pte; | 599 | spte_t regs_pte; |
350 | 600 | ||
351 | /* Since switcher less that 4MB, we simply mug top pte page. */ | 601 | /* Make the last PGD entry for this Guest point to the Switcher's PTE |
602 | * page for this CPU (with appropriate flags). */ | ||
352 | switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; | 603 | switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; |
353 | switcher_pgd.flags = _PAGE_KERNEL; | 604 | switcher_pgd.flags = _PAGE_KERNEL; |
354 | lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; | 605 | lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; |
355 | 606 | ||
356 | /* Map our regs page over stack page. */ | 607 | /* We also change the Switcher PTE page. When we're running the Guest, |
608 | * we want the Guest's "regs" page to appear where the first Switcher | ||
609 | * page for this CPU is. This is an optimization: when the Switcher | ||
610 | * saves the Guest registers, it saves them into the first page of this | ||
611 | * CPU's "struct lguest_pages": if we make sure the Guest's register | ||
612 | * page is already mapped there, we don't have to copy them out | ||
613 | * again. */ | ||
357 | regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; | 614 | regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; |
358 | regs_pte.flags = _PAGE_KERNEL; | 615 | regs_pte.flags = _PAGE_KERNEL; |
359 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE] | 616 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE] |
360 | = regs_pte; | 617 | = regs_pte; |
361 | } | 618 | } |
619 | /*:*/ | ||
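(That final index expression rewards a worked example: "pages" is a virtual address up in the top 4MB, so dividing by PAGE_SIZE and reducing modulo 1024 picks its slot in this last PTE page. The sample address below is an assumption, placed one page past the Switcher purely for illustration:)

    #include <stdio.h>

    #define PAGE_SIZE     4096UL
    #define PTES_PER_PAGE 1024UL

    int main(void)
    {
            /* Pretend this CPU's "struct lguest_pages" sits one page
             * past the Switcher code at 0xFFC00000 (illustrative). */
            unsigned long pages = 0xFFC01000UL;

            /* 0xFFC01000 / 4096 = 0xFFC01; modulo 1024 keeps the low
             * ten bits, giving slot 1. */
            unsigned long idx = pages / PAGE_SIZE % PTES_PER_PAGE;

            printf("PTE slot for %#lx: %lu\n", pages, idx);
            return 0;
    }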
362 | 620 | ||
363 | static void free_switcher_pte_pages(void) | 621 | static void free_switcher_pte_pages(void) |
364 | { | 622 | { |
@@ -368,6 +626,10 @@ static void free_switcher_pte_pages(void) | |||
368 | free_page((long)switcher_pte_page(i)); | 626 | free_page((long)switcher_pte_page(i)); |
369 | } | 627 | } |
370 | 628 | ||
629 | /*H:520 Setting up the Switcher PTE page for a given CPU is fairly easy, given | ||
630 | * the CPU number and the "struct page"s for the Switcher code itself. | ||
631 | * | ||
632 | * Currently the Switcher is less than a page long, so "pages" is always 1. */ | ||
371 | static __init void populate_switcher_pte_page(unsigned int cpu, | 633 | static __init void populate_switcher_pte_page(unsigned int cpu, |
372 | struct page *switcher_page[], | 634 | struct page *switcher_page[], |
373 | unsigned int pages) | 635 | unsigned int pages) |
@@ -375,21 +637,26 @@ static __init void populate_switcher_pte_page(unsigned int cpu, | |||
375 | unsigned int i; | 637 | unsigned int i; |
376 | spte_t *pte = switcher_pte_page(cpu); | 638 | spte_t *pte = switcher_pte_page(cpu); |
377 | 639 | ||
640 | /* The first entries are easy: they map the Switcher code. */ | ||
378 | for (i = 0; i < pages; i++) { | 641 | for (i = 0; i < pages; i++) { |
379 | pte[i].pfn = page_to_pfn(switcher_page[i]); | 642 | pte[i].pfn = page_to_pfn(switcher_page[i]); |
380 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; | 643 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; |
381 | } | 644 | } |
382 | 645 | ||
383 | /* We only map this CPU's pages, so guest can't see others. */ | 646 | /* The only other thing we map is this CPU's pair of pages. */ |
384 | i = pages + cpu*2; | 647 | i = pages + cpu*2; |
385 | 648 | ||
386 | /* First page (regs) is rw, second (state) is ro. */ | 649 | /* First page (Guest registers) is writable from the Guest */ |
387 | pte[i].pfn = page_to_pfn(switcher_page[i]); | 650 | pte[i].pfn = page_to_pfn(switcher_page[i]); |
388 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; | 651 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; |
652 | /* The second page contains the "struct lguest_ro_state", and is | ||
653 | * read-only. */ | ||
389 | pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); | 654 | pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); |
390 | pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; | 655 | pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; |
391 | } | 656 | } |
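(So each CPU's PTE page lays out as: the Switcher code first, then that CPU's own pair. A quick sketch of the indexing, assuming pages == 1 as the comment above says:)

    #include <stdio.h>

    int main(void)
    {
            unsigned pages = 1;     /* the Switcher code: one page */
            unsigned cpu;

            for (cpu = 0; cpu < 4; cpu++) {
                    unsigned i = pages + cpu * 2;
                    printf("cpu %u: regs page in slot %u (rw), "
                           "ro_state in slot %u (ro)\n", cpu, i, i + 1);
            }
            return 0;
    }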
392 | 657 | ||
658 | /*H:510 At boot or module load time, init_pagetables() allocates and populates | ||
659 | * the Switcher PTE page for each CPU. */ | ||
393 | __init int init_pagetables(struct page **switcher_page, unsigned int pages) | 660 | __init int init_pagetables(struct page **switcher_page, unsigned int pages) |
394 | { | 661 | { |
395 | unsigned int i; | 662 | unsigned int i; |
@@ -404,7 +671,9 @@ __init int init_pagetables(struct page **switcher_page, unsigned int pages) | |||
404 | } | 671 | } |
405 | return 0; | 672 | return 0; |
406 | } | 673 | } |
674 | /*:*/ | ||
407 | 675 | ||
676 | /* Cleaning up simply involves freeing the PTE page for each CPU. */ | ||
408 | void free_pagetables(void) | 677 | void free_pagetables(void) |
409 | { | 678 | { |
410 | free_switcher_pte_pages(); | 679 | free_switcher_pte_pages(); |
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 1b2cfe89dcd5..f675a41a80da 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c | |||
@@ -1,16 +1,68 @@ | |||
1 | /*P:600 The x86 architecture has segments, which involve a table of descriptors | ||
2 | * which can be used to do funky things with virtual address interpretation. | ||
3 | * We originally used to use segments so the Guest couldn't alter the | ||
4 | * Guest<->Host Switcher, and then we had to trim Guest segments, and restore | ||
5 | * for userspace per-thread segments, but trim again on userspace->kernel | ||
6 | * transitions... This nightmarish creation was contained within this file, | ||
7 | * where we knew not to tread without heavy armament and a change of underwear. | ||
8 | * | ||
9 | * In these modern times, the segment handling code consists of simple sanity | ||
10 | * checks, and the worst you'll experience reading this code is butterfly-rash | ||
11 | * from frolicking through its parklike serenity. :*/ | ||
1 | #include "lg.h" | 12 | #include "lg.h" |
2 | 13 | ||
14 | /*H:600 | ||
15 | * We've almost completed the Host; there's just one file to go! | ||
16 | * | ||
17 | * Segments & The Global Descriptor Table | ||
18 | * | ||
19 | * (That title sounds like a bad Nerdcore group. Not to suggest that there are | ||
20 | * any good Nerdcore groups, but in high school a friend of mine had a band | ||
21 | * called Joe Fish and the Chips, so there are definitely worse band names). | ||
22 | * | ||
23 | * To refresh: the GDT is a table of 8-byte values describing segments. Once | ||
24 | * set up, these segments can be loaded into one of the 6 "segment registers". | ||
25 | * | ||
26 | * GDT entries are passed around as "struct desc_struct"s, which like IDT | ||
27 | * entries are split into two 32-bit members, "a" and "b". One day, someone | ||
28 | * will clean that up, and be declared a Hero. (No pressure, I'm just saying). | ||
29 | * | ||
30 | * Anyway, the GDT entry contains a base (the start address of the segment), a | ||
31 | * limit (the size of the segment - 1), and some flags. Sounds simple, and it | ||
32 | * would be, except those zany Intel engineers decided that it was too boring | ||
33 | * to put the base at one end, the limit at the other, and the flags in | ||
34 | * between. They decided to shotgun the bits at random throughout the 8 bytes, | ||
35 | * like so: | ||
36 | * | ||
37 | * 0 16 40 48 52 56 63 | ||
38 | * [ limit part 1 ][ base part 1 ][ flags ][li][fl][base ] | ||
39 | * mit ags part 2 | ||
40 | * part 2 | ||
41 | * | ||
42 | * As a result, this file contains a certain amount of magic numeracy. Let's | ||
43 | * begin. | ||
44 | */ | ||
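(A minimal sketch of un-shotgunning a descriptor: it reassembles base and limit from the "a"/"b" halves using the standard x86 bit positions. The example descriptor value is made up:)

    #include <stdio.h>

    struct desc { unsigned int a, b; };

    int main(void)
    {
            /* A made-up flat segment: base 0, limit 0xFFFFF. */
            struct desc d = { .a = 0x0000FFFF, .b = 0x00CF9A00 };

            unsigned long base  = (d.a >> 16)           /* base 15..0  */
                                | ((d.b & 0xFF) << 16)  /* base 23..16 */
                                | (d.b & 0xFF000000);   /* base 31..24 */
            unsigned long limit = (d.a & 0xFFFF)        /* limit 15..0 */
                                | (d.b & 0x000F0000);   /* limit 19..16 */

            /* desc_ok() below tests b's bit 21 (must be zero), bit 15
             * (present) and bit 12 (memory segment): hence the
             * 0x00209000 mask. */
            printf("base %#lx, limit %#lx\n", base, limit);
            return 0;
    }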
45 | |||
46 | /* Is the descriptor the Guest wants us to put in OK? | ||
47 | * | ||
48 | * The flag which Intel says must be zero: must be zero. The descriptor must | ||
49 | * be present (this is actually checked earlier but is here for thoroughness), | ||
50 | * and the descriptor type must be 1 (a memory segment). */ | ||
3 | static int desc_ok(const struct desc_struct *gdt) | 51 | static int desc_ok(const struct desc_struct *gdt) |
4 | { | 52 | { |
5 | /* MBZ=0, P=1, DT=1 */ | ||
6 | return ((gdt->b & 0x00209000) == 0x00009000); | 53 | return ((gdt->b & 0x00209000) == 0x00009000); |
7 | } | 54 | } |
8 | 55 | ||
56 | /* Is the segment present? (Otherwise it can't be used by the Guest). */ | ||
9 | static int segment_present(const struct desc_struct *gdt) | 57 | static int segment_present(const struct desc_struct *gdt) |
10 | { | 58 | { |
11 | return gdt->b & 0x8000; | 59 | return gdt->b & 0x8000; |
12 | } | 60 | } |
13 | 61 | ||
62 | /* There are several entries we don't let the Guest set. The TSS entry is the | ||
63 | * "Task State Segment" which controls all kinds of delicate things. The | ||
64 | * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the | ||
65 | * Guest can't be trusted to deal with double faults. */ | ||
14 | static int ignored_gdt(unsigned int num) | 66 | static int ignored_gdt(unsigned int num) |
15 | { | 67 | { |
16 | return (num == GDT_ENTRY_TSS | 68 | return (num == GDT_ENTRY_TSS |
@@ -19,9 +71,18 @@ static int ignored_gdt(unsigned int num) | |||
19 | || num == GDT_ENTRY_DOUBLEFAULT_TSS); | 71 | || num == GDT_ENTRY_DOUBLEFAULT_TSS); |
20 | } | 72 | } |
21 | 73 | ||
22 | /* We don't allow removal of CS, DS or SS; it doesn't make sense. */ | 74 | /* If the Guest asks us to remove an entry from the GDT, we have to be careful. |
75 | * If one of the segment registers is pointing at that entry the Switcher will | ||
76 | * crash when it tries to reload the segment registers for the Guest. | ||
77 | * | ||
78 | * It doesn't make much sense for the Guest to try to remove its own code, data | ||
79 | * or stack segments while they're in use: assume that's a Guest bug. If it's | ||
80 | * one of the lesser segment registers using the removed entry, we simply set | ||
81 | * that register to 0 (unusable). */ | ||
23 | static void check_segment_use(struct lguest *lg, unsigned int desc) | 82 | static void check_segment_use(struct lguest *lg, unsigned int desc) |
24 | { | 83 | { |
84 | /* GDT entries are 8 bytes long, so we divide to get the index and | ||
85 | * ignore the bottom bits. */ | ||
25 | if (lg->regs->gs / 8 == desc) | 86 | if (lg->regs->gs / 8 == desc) |
26 | lg->regs->gs = 0; | 87 | lg->regs->gs = 0; |
27 | if (lg->regs->fs / 8 == desc) | 88 | if (lg->regs->fs / 8 == desc) |
@@ -33,13 +94,21 @@ static void check_segment_use(struct lguest *lg, unsigned int desc) | |||
33 | || lg->regs->ss / 8 == desc) | 94 | || lg->regs->ss / 8 == desc) |
34 | kill_guest(lg, "Removed live GDT entry %u", desc); | 95 | kill_guest(lg, "Removed live GDT entry %u", desc); |
35 | } | 96 | } |
36 | 97 | /*:*/ | |
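(The divide-by-8 works because a segment selector is not a bare index: the table index sits in the top 13 bits, above a table-indicator bit and two privilege bits. A standalone sketch:)

    #include <stdio.h>

    int main(void)
    {
            unsigned short sel = 0x0063;     /* an example selector */

            unsigned index = sel >> 3;       /* same as sel / 8 */
            unsigned ti    = (sel >> 2) & 1; /* 0 = GDT, 1 = LDT */
            unsigned rpl   = sel & 3;        /* requested priv level */

            printf("selector %#x: index %u, TI %u, RPL %u\n",
                   (unsigned)sel, index, ti, rpl);
            return 0;
    }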
98 | /*M:009 We wouldn't need to check for removal of in-use segments if we handled | ||
99 | * faults in the Switcher. However, it's probably not a worthwhile | ||
100 | * optimization. :*/ | ||
101 | |||
102 | /*H:610 Once the GDT has been changed, we look through the changed entries and | ||
103 | * see if they're OK. If not, we'll call kill_guest() and the Guest will never | ||
104 | * get to use the invalid entries. */ | ||
37 | static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) | 105 | static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) |
38 | { | 106 | { |
39 | unsigned int i; | 107 | unsigned int i; |
40 | 108 | ||
41 | for (i = start; i < end; i++) { | 109 | for (i = start; i < end; i++) { |
42 | /* We never copy these ones to real gdt */ | 110 | /* We never copy these ones to the real GDT, so we don't care what |
111 | * they say */ | ||
43 | if (ignored_gdt(i)) | 112 | if (ignored_gdt(i)) |
44 | continue; | 113 | continue; |
45 | 114 | ||
@@ -53,41 +122,57 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) | |||
53 | if (!desc_ok(&lg->gdt[i])) | 122 | if (!desc_ok(&lg->gdt[i])) |
54 | kill_guest(lg, "Bad GDT descriptor %i", i); | 123 | kill_guest(lg, "Bad GDT descriptor %i", i); |
55 | 124 | ||
56 | /* DPL 0 presumably means "for use by guest". */ | 125 | /* Segment descriptors contain a privilege level: the Guest is |
126 | * sometimes careless and leaves this as 0, even though it's | ||
127 | * running at privilege level 1. If so, we fix it here. */ | ||
57 | if ((lg->gdt[i].b & 0x00006000) == 0) | 128 | if ((lg->gdt[i].b & 0x00006000) == 0) |
58 | lg->gdt[i].b |= (GUEST_PL << 13); | 129 | lg->gdt[i].b |= (GUEST_PL << 13); |
59 | 130 | ||
60 | /* Set accessed bit, since gdt isn't writable. */ | 131 | /* Each descriptor has an "accessed" bit. If we don't set it |
132 | * now, the CPU will try to set it when the Guest first loads | ||
133 | * that entry into a segment register. But the GDT isn't | ||
134 | * writable by the Guest, so bad things can happen. */ | ||
61 | lg->gdt[i].b |= 0x00000100; | 135 | lg->gdt[i].b |= 0x00000100; |
62 | } | 136 | } |
63 | } | 137 | } |
64 | 138 | ||
139 | /* This routine is called at boot or modprobe time for each CPU to set up the | ||
140 | * "constant" GDT entries for Guests running on that CPU. */ | ||
65 | void setup_default_gdt_entries(struct lguest_ro_state *state) | 141 | void setup_default_gdt_entries(struct lguest_ro_state *state) |
66 | { | 142 | { |
67 | struct desc_struct *gdt = state->guest_gdt; | 143 | struct desc_struct *gdt = state->guest_gdt; |
68 | unsigned long tss = (unsigned long)&state->guest_tss; | 144 | unsigned long tss = (unsigned long)&state->guest_tss; |
69 | 145 | ||
70 | /* Hypervisor segments. */ | 146 | /* The hypervisor segments are full 0-4G segments, privilege level 0 */ |
71 | gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; | 147 | gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; |
72 | gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; | 148 | gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; |
73 | 149 | ||
74 | /* This is the one which we *cannot* copy from guest, since tss | 150 | /* The TSS segment refers to the TSS entry for this CPU, so we cannot |
75 | is depended on this lguest_ro_state, ie. this cpu. */ | 151 | * copy it from the Guest. Forgive the magic flags */ |
76 | gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); | 152 | gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); |
77 | gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) | 153 | gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) |
78 | | ((tss >> 16) & 0x000000FF); | 154 | | ((tss >> 16) & 0x000000FF); |
79 | } | 155 | } |
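(Those magic flags unpack as: limit 0x67, which is the 104-byte TSS minus one; type 0x9, an available 32-bit TSS, with P=1 and DPL=0 giving the 0x8900; and the base scattered in the usual three pieces. A sketch which re-runs the two assignments above on an invented tss address and checks the base reassembles:)

    #include <stdio.h>

    int main(void)
    {
            unsigned long tss = 0xC1234560UL;    /* invented address */

            /* The same two lines as setup_default_gdt_entries(): */
            unsigned int a = 0x00000067 | (tss << 16);
            unsigned int b = 0x00008900 | (tss & 0xFF000000)
                           | ((tss >> 16) & 0x000000FF);

            /* Gather the three base fragments back together. */
            unsigned long base = (a >> 16)
                               | ((b & 0xFFUL) << 16)
                               | (b & 0xFF000000UL);

            printf("base %#lx (tss was %#lx)\n", base, tss);
            return 0;
    }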
80 | 156 | ||
157 | /* This routine is called before the Guest is run for the first time. */ | ||
81 | void setup_guest_gdt(struct lguest *lg) | 158 | void setup_guest_gdt(struct lguest *lg) |
82 | { | 159 | { |
160 | /* Start with full 0-4G segments... */ | ||
83 | lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; | 161 | lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; |
84 | lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; | 162 | lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; |
163 | /* ...except the Guest is allowed to use them, so set the privilege | ||
164 | * level appropriately in the flags. */ | ||
85 | lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); | 165 | lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); |
86 | lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); | 166 | lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); |
87 | } | 167 | } |
88 | 168 | ||
89 | /* This is a fast version for the common case where only the three TLS entries | 169 | /* Like the IDT, we never simply use the GDT the Guest gives us. We set up the |
90 | * have changed. */ | 170 | * GDTs for each CPU, then we copy across the entries each time we want to run |
171 | * a different Guest on that CPU. */ | ||
172 | |||
173 | /* A partial GDT load, for the three "thread-local storage" entries. Otherwise | ||
174 | * it's just like load_guest_gdt(). So much so, in fact, that it would probably be | ||
175 | * neater to have a single hypercall to cover both. */ | ||
91 | void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) | 176 | void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) |
92 | { | 177 | { |
93 | unsigned int i; | 178 | unsigned int i; |
@@ -96,22 +181,31 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) | |||
96 | gdt[i] = lg->gdt[i]; | 181 | gdt[i] = lg->gdt[i]; |
97 | } | 182 | } |
98 | 183 | ||
184 | /* This is the full version */ | ||
99 | void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) | 185 | void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) |
100 | { | 186 | { |
101 | unsigned int i; | 187 | unsigned int i; |
102 | 188 | ||
189 | /* The default entries from setup_default_gdt_entries() are not | ||
190 | * replaced. See ignored_gdt() above. */ | ||
103 | for (i = 0; i < GDT_ENTRIES; i++) | 191 | for (i = 0; i < GDT_ENTRIES; i++) |
104 | if (!ignored_gdt(i)) | 192 | if (!ignored_gdt(i)) |
105 | gdt[i] = lg->gdt[i]; | 193 | gdt[i] = lg->gdt[i]; |
106 | } | 194 | } |
107 | 195 | ||
196 | /* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */ | ||
108 | void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) | 197 | void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) |
109 | { | 198 | { |
199 | /* We assume the Guest has the same number of GDT entries as the | ||
200 | * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ | ||
110 | if (num > ARRAY_SIZE(lg->gdt)) | 201 | if (num > ARRAY_SIZE(lg->gdt)) |
111 | kill_guest(lg, "too many gdt entries %i", num); | 202 | kill_guest(lg, "too many gdt entries %i", num); |
112 | 203 | ||
204 | /* We read the whole thing in, then fix it up. */ | ||
113 | lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0])); | 205 | lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0])); |
114 | fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt)); | 206 | fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt)); |
207 | /* Mark that the GDT changed so the core knows it has to copy it again, | ||
208 | * even if the Guest is run on the same CPU. */ | ||
115 | lg->changed |= CHANGED_GDT; | 209 | lg->changed |= CHANGED_GDT; |
116 | } | 210 | } |
117 | 211 | ||
@@ -123,3 +217,13 @@ void guest_load_tls(struct lguest *lg, unsigned long gtls) | |||
123 | fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); | 217 | fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); |
124 | lg->changed |= CHANGED_GDT_TLS; | 218 | lg->changed |= CHANGED_GDT_TLS; |
125 | } | 219 | } |
220 | |||
221 | /* | ||
222 | * With this, we have finished the Host. | ||
223 | * | ||
224 | * Five of the seven parts of our task are complete. You have made it through | ||
225 | * the Bit of Despair (I think that's somewhere in the page table code, | ||
226 | * myself). | ||
227 | * | ||
228 | * Next, we examine "make Switcher". It's short, but intense. | ||
229 | */ | ||
diff --git a/drivers/lguest/switcher.S b/drivers/lguest/switcher.S index eadd4cc299d2..d418179ea6b5 100644 --- a/drivers/lguest/switcher.S +++ b/drivers/lguest/switcher.S | |||
@@ -1,45 +1,136 @@ | |||
1 | /* This code sits at 0xFFC00000 to do the low-level guest<->host switch. | 1 | /*P:900 This is the Switcher: code which sits at 0xFFC00000 to do the low-level |
2 | * Guest<->Host switch. It is as simple as it can be made, but it's naturally | ||
3 | * very specific to x86. | ||
4 | * | ||
5 | * You have now completed Preparation. If this has whetted your appetite; if you | ||
6 | * are feeling invigorated and refreshed then the next, more challenging stage | ||
7 | * can be found in "make Guest". :*/ | ||
2 | 8 | ||
3 | There is are two pages above us for this CPU (struct lguest_pages). | 9 | /*S:100 |
4 | The second page (struct lguest_ro_state) becomes read-only after the | 10 | * Welcome to the Switcher itself! |
5 | context switch. The first page (the stack for traps) remains writable, | 11 | * |
6 | but while we're in here, the guest cannot be running. | 12 | * This file contains the low-level code which changes the CPU to run the Guest |
7 | */ | 13 | * code, and returns to the Host when something happens. Understand this, and |
14 | * you understand the heart of our journey. | ||
15 | * | ||
16 | * Because this is in assembler rather than C, our tale switches from prose to | ||
17 | * verse. First I tried limericks: | ||
18 | * | ||
19 | * There once was an eax reg, | ||
20 | * To which our pointer was fed, | ||
21 | * It needed an add, | ||
22 | * Which asm-offsets.h had | ||
23 | * But this limerick is hurting my head. | ||
24 | * | ||
25 | * Next I tried haikus, but fitting the required reference to the seasons in | ||
26 | * every stanza was quickly becoming tiresome: | ||
27 | * | ||
28 | * The %eax reg | ||
29 | * Holds "struct lguest_pages" now: | ||
30 | * Cherry blossoms fall. | ||
31 | * | ||
32 | * Then I started with Heroic Verse, but the rhyming requirement leeched away | ||
33 | * the content density and led to some uniquely awful oblique rhymes: | ||
34 | * | ||
35 | * These constants are coming from struct offsets | ||
36 | * For use within the asm switcher text. | ||
37 | * | ||
38 | * Finally, I settled for something between heroic hexameter, and normal prose | ||
39 | * with inappropriate linebreaks. Anyway, it ain't no Shakespeare. | ||
40 | */ | ||
41 | |||
42 | // Not all kernel headers work from assembler | ||
43 | // But these ones are needed: the ENTRY() define | ||
44 | // And constants extracted from struct offsets | ||
45 | // To avoid magic numbers and breakage: | ||
46 | // Should they change the compiler can't save us | ||
47 | // Down here in the depths of assembler code. | ||
8 | #include <linux/linkage.h> | 48 | #include <linux/linkage.h> |
9 | #include <asm/asm-offsets.h> | 49 | #include <asm/asm-offsets.h> |
10 | #include "lg.h" | 50 | #include "lg.h" |
11 | 51 | ||
52 | // We mark the start of the code to copy | ||
53 | // It's placed in .text tho it's never run here | ||
54 | // You'll see the trick macro at the end | ||
55 | // Which interleaves data and text to effect. | ||
12 | .text | 56 | .text |
13 | ENTRY(start_switcher_text) | 57 | ENTRY(start_switcher_text) |
14 | 58 | ||
15 | /* %eax points to lguest pages for this CPU. %ebx contains cr3 value. | 59 | // When we reach switch_to_guest we have just left |
16 | All normal registers can be clobbered! */ | 60 | // The safe and comforting shores of C code |
61 | // %eax has the "struct lguest_pages" to use | ||
62 | // Where we save state and still see it from the Guest | ||
63 | // And %ebx holds the Guest shadow pagetable: | ||
64 | // Once set we have truly left Host behind. | ||
17 | ENTRY(switch_to_guest) | 65 | ENTRY(switch_to_guest) |
18 | /* Save host segments on host stack. */ | 66 | // We told gcc all its regs could fade, |
67 | // Clobbered by our journey into the Guest | ||
68 | // We could have saved them, if we tried | ||
69 | // But time is our master and cycles count. | ||
70 | |||
71 | // Segment registers must be saved for the Host | ||
72 | // We push them on the Host stack for later | ||
19 | pushl %es | 73 | pushl %es |
20 | pushl %ds | 74 | pushl %ds |
21 | pushl %gs | 75 | pushl %gs |
22 | pushl %fs | 76 | pushl %fs |
23 | /* With CONFIG_FRAME_POINTER, gcc doesn't let us clobber this! */ | 77 | // But the compiler is fickle, and heeds |
78 | // No warning of %ebp clobbers | ||
79 | // When frame pointers are used. That register | ||
80 | // Must be saved and restored or chaos strikes. | ||
24 | pushl %ebp | 81 | pushl %ebp |
25 | /* Save host stack. */ | 82 | // The Host's stack is done, now save it away |
83 | // In our "struct lguest_pages" at offset | ||
84 | // Distilled into asm-offsets.h | ||
26 | movl %esp, LGUEST_PAGES_host_sp(%eax) | 85 | movl %esp, LGUEST_PAGES_host_sp(%eax) |
27 | /* Switch to guest stack: if we get NMI we expect to be there. */ | 86 | |
87 | // All saved and there's now five steps before us: | ||
88 | // Stack, GDT, IDT, TSS | ||
89 | // And last of all the page tables are flipped. | ||
90 | |||
91 | // Yet beware that our stack pointer must be | ||
92 | // Always valid lest an NMI hits | ||
93 | // %edx does the duty here as we juggle | ||
94 | // %eax is lguest_pages: our stack lies within. | ||
28 | movl %eax, %edx | 95 | movl %eax, %edx |
29 | addl $LGUEST_PAGES_regs, %edx | 96 | addl $LGUEST_PAGES_regs, %edx |
30 | movl %edx, %esp | 97 | movl %edx, %esp |
31 | /* Switch to guest's GDT, IDT. */ | 98 | |
99 | // The Guest's GDT we so carefully | ||
100 | // Placed in the "struct lguest_pages" before | ||
32 | lgdt LGUEST_PAGES_guest_gdt_desc(%eax) | 101 | lgdt LGUEST_PAGES_guest_gdt_desc(%eax) |
102 | |||
103 | // The Guest's IDT we did partially | ||
104 | // Move to the "struct lguest_pages" as well. | ||
33 | lidt LGUEST_PAGES_guest_idt_desc(%eax) | 105 | lidt LGUEST_PAGES_guest_idt_desc(%eax) |
34 | /* Switch to guest's TSS while GDT still writable. */ | 106 | |
107 | // The TSS entry which controls traps | ||
108 | // Must be loaded up with "ltr" now: | ||
109 | // For after we switch over our page tables | ||
110 | // It (as the rest) will be writable no more. | ||
111 | // (The GDT entry the TSS needs | ||
112 | // Changes type when we load it: damn Intel!) | ||
35 | movl $(GDT_ENTRY_TSS*8), %edx | 113 | movl $(GDT_ENTRY_TSS*8), %edx |
36 | ltr %dx | 114 | ltr %dx |
37 | /* Set host's TSS GDT entry to available (clear byte 5 bit 2). */ | 115 | |
116 | // Look back now, before we take this last step! | ||
117 | // The Host's TSS entry was also marked used; | ||
118 | // Let's clear it again, ere we return. | ||
119 | // The GDT descriptor of the Host | ||
120 | // Points to the table after two "size" bytes | ||
38 | movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx | 121 | movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx |
122 | // Clear the type field of "used" (byte 5, bit 2) | ||
39 | andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) | 123 | andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) |
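A prose footnote on that "andb": a TSS descriptor's type field lives in byte 5 of its 8-byte GDT entry, and bit 1 of that byte (mask 0x02, whose complement is the 0xFD above) is the "busy" flag. "ltr" refuses to load a busy TSS and marks a free one busy as it loads, so the flag must be cleared before the descriptor can be used again. The same operation as a C sketch, with an invented helper name:

    #include <stdint.h>

    /* Clear "busy" in a 32-bit TSS descriptor so a later "ltr"
     * will accept it (ltr itself sets the flag as it loads). */
    static void tss_set_available(uint8_t *gdt, unsigned int entry)
    {
            gdt[entry * 8 + 5] &= 0xFD;     /* type bit 1 == busy */
    }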
40 | /* Switch to guest page tables: lguest_pages->state now read-only. */ | 124 | |
125 | // Once our page table's switched, the Guest is live! | ||
126 | // The Host fades as we run this final step. | ||
127 | // Our "struct lguest_pages" is now read-only. | ||
41 | movl %ebx, %cr3 | 128 | movl %ebx, %cr3 |
42 | /* Restore guest regs */ | 129 | |
130 | // The page table change did one tricky thing: | ||
131 | // The Guest's register page has been mapped | ||
132 | // Writable onto our %esp (stack) -- | ||
133 | // We can simply pop off all Guest regs. | ||
43 | popl %ebx | 134 | popl %ebx |
44 | popl %ecx | 135 | popl %ecx |
45 | popl %edx | 136 | popl %edx |
@@ -51,12 +142,27 @@ ENTRY(switch_to_guest) | |||
51 | popl %fs | 142 | popl %fs |
52 | popl %ds | 143 | popl %ds |
53 | popl %es | 144 | popl %es |
54 | /* Skip error code and trap number */ | 145 | |
146 | // Near the base of the stack lurk two strange fields | ||
147 | // Which we fill as we exit the Guest | ||
148 | // These are the trap number and its error | ||
149 | // We can simply step past them on our way. | ||
55 | addl $8, %esp | 150 | addl $8, %esp |
151 | |||
152 | // The last five stack slots hold return address | ||
153 | // And everything needed to change privilege | ||
154 | // Into the Guest privilege level of 1, | ||
155 | // And the stack where the Guest had last left it. | ||
156 | // Interrupts are turned back on: we are Guest. | ||
56 | iret | 157 | iret |
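Those five slots, spelled out: an "iret" which returns to a less privileged ring pops all five in this order. The struct below is purely illustrative; the Switcher fills these slots by hand, not through any such type:

    #include <stdint.h>

    struct iret_frame {
            uint32_t eip;           /* where the Guest resumes         */
            uint32_t cs;            /* Guest code segment, RPL 1       */
            uint32_t eflags;        /* IF set: interrupts back on      */
            uint32_t esp;           /* the stack the Guest last had    */
            uint32_t ss;            /* Guest stack segment, RPL 1      */
    };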
57 | 158 | ||
159 | // There are two paths where we switch to the Host | ||
160 | // So we put the routine in a macro. | ||
161 | // We are on our way home, back to the Host | ||
162 | // Interrupted out of the Guest, we come here. | ||
58 | #define SWITCH_TO_HOST \ | 163 | #define SWITCH_TO_HOST \ |
59 | /* Save guest state */ \ | 164 | /* We save the Guest state: all registers first \ |
165 | * Laid out just as "struct lguest_regs" defines */ \ | ||
60 | pushl %es; \ | 166 | pushl %es; \ |
61 | pushl %ds; \ | 167 | pushl %ds; \ |
62 | pushl %fs; \ | 168 | pushl %fs; \ |
@@ -68,58 +174,119 @@ ENTRY(switch_to_guest) | |||
68 | pushl %edx; \ | 174 | pushl %edx; \ |
69 | pushl %ecx; \ | 175 | pushl %ecx; \ |
70 | pushl %ebx; \ | 176 | pushl %ebx; \ |
71 | /* Load lguest ds segment for convenience. */ \ | 177 | /* Our stack and our code are using segments \ |
178 | * Set in the TSS and IDT \ | ||
179 | * Yet if we were to touch data we'd use \ | ||
180 | * Whatever data segment the Guest had. \ | ||
181 | * Load the lguest ds segment for now. */ \ | ||
72 | movl $(LGUEST_DS), %eax; \ | 182 | movl $(LGUEST_DS), %eax; \ |
73 | movl %eax, %ds; \ | 183 | movl %eax, %ds; \ |
74 | /* Figure out where we are, based on stack (at top of regs). */ \ | 184 | /* So where are we? Which CPU, which struct? \ |
185 | * The stack is our clue: our TSS sets \ | ||
186 | * It at the end of "struct lguest_pages" \ | ||
187 | * And we then pushed and pushed and pushed Guest regs: \ | ||
188 | * Now stack points atop the "struct lguest_regs". \ | ||
189 | * Subtract that offset, and we find our struct. */ \ | ||
75 | movl %esp, %eax; \ | 190 | movl %esp, %eax; \ |
76 | subl $LGUEST_PAGES_regs, %eax; \ | 191 | subl $LGUEST_PAGES_regs, %eax; \ |
77 | /* Put trap number in %ebx before we switch cr3 and lose it. */ \ | 192 | /* Save our trap number: the switch will obscure it \ |
193 | * (The Guest regs are not mapped here in the Host) \ | ||
194 | * %ebx holds it safe for deliver_to_host */ \ | ||
78 | movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ | 195 | movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ |
79 | /* Switch to host page tables (host GDT, IDT and stack are in host \ | 196 | /* The Host GDT, IDT and stack! \ |
80 | mem, so need this first) */ \ | 197 | * All these lie safely hidden from the Guest: \ |
198 | * We must return to the Host page tables \ | ||
199 | * (Hence that was saved in struct lguest_pages) */ \ | ||
81 | movl LGUEST_PAGES_host_cr3(%eax), %edx; \ | 200 | movl LGUEST_PAGES_host_cr3(%eax), %edx; \ |
82 | movl %edx, %cr3; \ | 201 | movl %edx, %cr3; \ |
83 | /* Set guest's TSS to available (clear byte 5 bit 2). */ \ | 202 | /* As before, when we looked back at the Host \ |
203 | * As we left and marked TSS unused \ | ||
204 | * So must we now for the Guest left behind. */ \ | ||
84 | andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \ | 205 | andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \ |
85 | /* Switch to host's GDT & IDT. */ \ | 206 | /* Switch to Host's GDT, IDT. */ \ |
86 | lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ | 207 | lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ |
87 | lidt LGUEST_PAGES_host_idt_desc(%eax); \ | 208 | lidt LGUEST_PAGES_host_idt_desc(%eax); \ |
88 | /* Switch to host's stack. */ \ | 209 | /* Restore the Host's stack where its saved regs lie */ \
89 | movl LGUEST_PAGES_host_sp(%eax), %esp; \ | 210 | movl LGUEST_PAGES_host_sp(%eax), %esp; \ |
90 | /* Switch to host's TSS */ \ | 211 | /* Last the TSS: our Host is complete */ \ |
91 | movl $(GDT_ENTRY_TSS*8), %edx; \ | 212 | movl $(GDT_ENTRY_TSS*8), %edx; \ |
92 | ltr %dx; \ | 213 | ltr %dx; \ |
214 | /* Restore now the regs saved right at the first. */ \ | ||
93 | popl %ebp; \ | 215 | popl %ebp; \ |
94 | popl %fs; \ | 216 | popl %fs; \ |
95 | popl %gs; \ | 217 | popl %gs; \ |
96 | popl %ds; \ | 218 | popl %ds; \ |
97 | popl %es | 219 | popl %es |
98 | 220 | ||
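The "where are we?" arithmetic in the macro, restated in C for the prose-minded (a sketch: the helper name is invented, the offsetof is the whole point):

    #include <linux/stddef.h>
    #include "lg.h"

    /* After the pushes, %esp sits at the base of the register block,
     * i.e. at &pages->regs; step back by that member's offset and
     * the enclosing "struct lguest_pages" appears. */
    static struct lguest_pages *pages_from_stack(unsigned long esp)
    {
            return (struct lguest_pages *)
                    (esp - offsetof(struct lguest_pages, regs));
    }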
99 | /* Return to run_guest_once. */ | 221 | // Here's where we come when the Guest has just trapped: |
222 | // (Which trap we'll see has been pushed on the stack). | ||
223 | // We need only switch back, and the Host will decode | ||
224 | // Why we came home, and what needs to be done. | ||
100 | return_to_host: | 225 | return_to_host: |
101 | SWITCH_TO_HOST | 226 | SWITCH_TO_HOST |
102 | iret | 227 | iret |
103 | 228 | ||
229 | // An interrupt, with some cause external | ||
230 | // Has ajerked us rudely from the Guest's code | ||
231 | // Again we must return home to the Host | ||
104 | deliver_to_host: | 232 | deliver_to_host: |
105 | SWITCH_TO_HOST | 233 | SWITCH_TO_HOST |
106 | /* Decode IDT and jump to hosts' irq handler. When that does iret, it | 234 | // But now we must go home via that place |
107 | * will return to run_guest_once. This is a feature. */ | 235 | // Where that interrupt was supposed to go |
236 | // Had we not been ensconced, running the Guest. | ||
237 | // Here we see the cleverness of our stack: | ||
238 | // The Host stack is formed like an interrupt | ||
239 | // With EIP, CS and EFLAGS layered. | ||
240 | // Interrupt handlers end with "iret" | ||
241 | // And that will take us home at long long last. | ||
242 | |||
243 | // But first we must find the handler to call! | ||
244 | // The IDT descriptor for the Host | ||
245 | // Has two bytes for size, and four for address: | ||
246 | // %edx will hold it for us for now. | ||
108 | movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx | 247 | movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx |
248 | // We now know the table address we need, | ||
249 | // And saved the trap's number inside %ebx. | ||
250 | // Yet the pointer to the handler is smeared | ||
251 | // Across the bits of the table entry. | ||
252 | // What oracle can tell us how to extract | ||
253 | // From such a convoluted encoding? | ||
254 | // I consulted gcc, and it gave | ||
255 | // These instructions, which I gladly credit: | ||
109 | leal (%edx,%ebx,8), %eax | 256 | leal (%edx,%ebx,8), %eax |
110 | movzwl (%eax),%edx | 257 | movzwl (%eax),%edx |
111 | movl 4(%eax), %eax | 258 | movl 4(%eax), %eax |
112 | xorw %ax, %ax | 259 | xorw %ax, %ax |
113 | orl %eax, %edx | 260 | orl %eax, %edx |
261 | // Now the address of the handler's in %edx | ||
262 | // We call it now: its "iret" takes us home. | ||
114 | jmp *%edx | 263 | jmp *%edx |
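The oracle's answer, decoded: a 32-bit interrupt gate smears its handler address across the first and last words of the descriptor, with the selector and flags in between. The five instructions above do exactly this; here it is again in C (the layout is architectural, the names are ours):

    #include <stdint.h>

    struct idt_gate {
            uint16_t offset_low;    /* handler address bits 0..15  */
            uint16_t segment;       /* code segment selector       */
            uint8_t  zero;
            uint8_t  flags;         /* present, DPL, gate type     */
            uint16_t offset_high;   /* handler address bits 16..31 */
    };

    static uint32_t gate_handler(const struct idt_gate *gate)
    {
            return ((uint32_t)gate->offset_high << 16) | gate->offset_low;
    }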
115 | 264 | ||
116 | /* Real hardware interrupts are delivered straight to the host. Others | 265 | // Every interrupt can come to us here |
117 | cause us to return to run_guest_once so it can decide what to do. Note | 266 | // But we must truly tell each apart. |
118 | that some of these are overridden by the guest to deliver directly, and | 267 | // They number two hundred and fifty six |
119 | never enter here (see load_guest_idt_entry). */ | 268 | // And each must land in a different spot, |
269 | // Push its number on stack, and join the stream. | ||
270 | |||
271 | // And worse, a mere six of the traps stand apart | ||
272 | // And push on their stack an addition: | ||
273 | // An error number, thirty two bits long | ||
274 | // So we punish the other two fifty | ||
275 | // And make them push a zero so they match. | ||
276 | |||
277 | // Yet two fifty six entries is long | ||
278 | // And all will look most the same as the last | ||
279 | // So we create a macro which can make | ||
280 | // As many entries as we need to fill. | ||
281 | |||
282 | // Note the change to .data then .text: | ||
283 | // We plant the address of each entry | ||
284 | // Into a (data) table for the Host | ||
285 | // To know where each Guest interrupt should go. | ||
120 | .macro IRQ_STUB N TARGET | 286 | .macro IRQ_STUB N TARGET |
121 | .data; .long 1f; .text; 1: | 287 | .data; .long 1f; .text; 1: |
122 | /* Make an error number for most traps, which don't have one. */ | 288 | // Trap eight, ten through fourteen and seventeen |
289 | // Supply an error number. Else zero. | ||
123 | .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) | 290 | .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) |
124 | pushl $0 | 291 | pushl $0 |
125 | .endif | 292 | .endif |
@@ -128,6 +295,8 @@ deliver_to_host: | |||
128 | ALIGN | 295 | ALIGN |
129 | .endm | 296 | .endm |
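For reference, the six traps which stand apart are 8 (double fault), 10 through 14 (invalid TSS up to page fault) and 17 (alignment check): the CPU pushes an error code for those, and the stubs push a zero for everyone else so the stack layout stays uniform. The ".if" condition as a C predicate (a sketch):

    /* Which traps arrive with a hardware error code already pushed? */
    static int trap_has_error_code(unsigned int trap)
    {
            return trap == 8 || (trap >= 10 && trap <= 14) || trap == 17;
    }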
130 | 297 | ||
298 | // This macro creates numerous entries | ||
299 | // Using GAS macros which out-power C's. | ||
131 | .macro IRQ_STUBS FIRST LAST TARGET | 300 | .macro IRQ_STUBS FIRST LAST TARGET |
132 | irq=\FIRST | 301 | irq=\FIRST |
133 | .rept \LAST-\FIRST+1 | 302 | .rept \LAST-\FIRST+1 |
@@ -136,24 +305,43 @@ deliver_to_host: | |||
136 | .endr | 305 | .endr |
137 | .endm | 306 | .endm |
138 | 307 | ||
139 | /* We intercept every interrupt, because we may need to switch back to | 308 | // Here's the marker for our pointer table |
140 | * host. Unfortunately we can't tell them apart except by entry | 309 | // Laid in the data section just before |
141 | * point, so we need 256 entry points. | 310 | // Each macro places the address of code |
142 | */ | 311 | // Forming an array: each one points to text |
312 | // Which handles interrupt in its turn. | ||
143 | .data | 313 | .data |
144 | .global default_idt_entries | 314 | .global default_idt_entries |
145 | default_idt_entries: | 315 | default_idt_entries: |
146 | .text | 316 | .text |
147 | IRQ_STUBS 0 1 return_to_host /* First two traps */ | 317 | // The first two traps go straight back to the Host |
148 | IRQ_STUB 2 handle_nmi /* NMI */ | 318 | IRQ_STUBS 0 1 return_to_host |
149 | IRQ_STUBS 3 31 return_to_host /* Rest of traps */ | 319 | // We'll say nothing, yet, about NMI |
150 | IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */ | 320 | IRQ_STUB 2 handle_nmi |
151 | IRQ_STUB 128 return_to_host /* System call (overridden) */ | 321 | // Other traps also return to the Host |
152 | IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */ | 322 | IRQ_STUBS 3 31 return_to_host |
153 | 323 | // All interrupts go via their handlers | |
154 | /* We ignore NMI and return. */ | 324 | IRQ_STUBS 32 127 deliver_to_host |
325 | // 'Cept system calls coming from userspace | ||
326 | // Are to go to the Guest, never the Host. | ||
327 | IRQ_STUB 128 return_to_host | ||
328 | IRQ_STUBS 129 255 deliver_to_host | ||
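How the Host drinks from this table belongs to interrupts_and_traps.c, but a hedged sketch of the idea fits here: each slot of default_idt_entries is planted into an interrupt gate of the IDT the Guest runs under. The packing helper below is ours, not the kernel's:

    #include <stdint.h>

    extern const unsigned long default_idt_entries[];

    /* Pack one 32-bit interrupt gate: present, DPL 0, type 0xE. */
    static void set_gate(uint64_t *idt, unsigned int n,
                         unsigned long handler, uint16_t cs)
    {
            idt[n] = (handler & 0xFFFFULL)                       /* bits 0..15  */
                   | ((uint64_t)cs << 16)                        /* selector    */
                   | (0x8EULL << 40)                             /* flags byte  */
                   | ((uint64_t)(handler & 0xFFFF0000UL) << 32); /* bits 16..31 */
    }

    /* Used as: for (i = 0; i < 256; i++)
     *                  set_gate(idt, i, default_idt_entries[i], cs); */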
329 | |||
330 | // The NMI, what a fabulous beast | ||
331 | // Which swoops in and stops us no matter that | ||
332 | // We're suspended between heaven and hell, | ||
333 | // (Or more likely between the Host and Guest) | ||
334 | // When in it comes! We are dazed and confused | ||
335 | // So we do the simplest thing which one can. | ||
336 | // Though we've pushed the trap number and zero | ||
337 | // We discard them, return, and hope we live. | ||
155 | handle_nmi: | 338 | handle_nmi: |
156 | addl $8, %esp | 339 | addl $8, %esp |
157 | iret | 340 | iret |
158 | 341 | ||
342 | // We are done; all that's left is Mastery | ||
343 | // And "make Mastery" is a journey long | ||
344 | // Designed to make your fingers itch to code. | ||
345 | |||
346 | // Here ends the text, the file and poem. | ||
159 | ENTRY(end_switcher_text) | 347 | ENTRY(end_switcher_text) |