-rw-r--r--  arch/i386/kernel/tsc.c                 |    4
-rw-r--r--  arch/x86_64/kernel/tsc.c               |    2
-rw-r--r--  drivers/lguest/core.c                  |  462
-rw-r--r--  drivers/lguest/hypercalls.c            |  192
-rw-r--r--  drivers/lguest/interrupts_and_traps.c  |  268
-rw-r--r--  drivers/lguest/io.c                    |  399
-rw-r--r--  drivers/lguest/lg.h                    |  261
-rw-r--r--  drivers/lguest/lguest.c                |  125
-rw-r--r--  drivers/lguest/lguest_asm.S            |    5
-rw-r--r--  drivers/lguest/lguest_user.c           |  236
-rw-r--r--  drivers/lguest/page_tables.c           |  411
-rw-r--r--  drivers/lguest/segments.c              |  125
-rw-r--r--  drivers/lguest/switcher.S              |  159
-rw-r--r--  include/asm-i386/tsc.h                 |    1
-rw-r--r--  include/linux/lguest.h                 |   12
-rw-r--r--  include/linux/lguest_launcher.h        |   73
-rw-r--r--  kernel/fork.c                          |    1
17 files changed, 2702 insertions, 34 deletions
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 252f9010f283..debd7dbb4158 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -27,6 +27,7 @@ static int tsc_enabled;
  * an extra value to store the TSC freq
  */
 unsigned int tsc_khz;
+EXPORT_SYMBOL_GPL(tsc_khz);
 
 int tsc_disable;
 
@@ -58,10 +59,11 @@ __setup("notsc", tsc_setup);
  */
 static int tsc_unstable;
 
-static inline int check_tsc_unstable(void)
+int check_tsc_unstable(void)
 {
 	return tsc_unstable;
 }
+EXPORT_SYMBOL_GPL(check_tsc_unstable);
 
 /* Accellerators for sched_clock()
  * convert from cycles(64bits) => nanoseconds (64bits)
diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c
index 48f9a8e6aa91..e850aa01e1b3 100644
--- a/arch/x86_64/kernel/tsc.c
+++ b/arch/x86_64/kernel/tsc.c
@@ -44,7 +44,7 @@ unsigned long long sched_clock(void)
 
 static int tsc_unstable;
 
-static inline int check_tsc_unstable(void)
+inline int check_tsc_unstable(void)
 {
 	return tsc_unstable;
 }
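Why these exports matter: with tsc_khz and check_tsc_unstable() now exported GPL-only, a host-side module such as lg.ko can decide at Guest-init time whether the TSC is safe to hand through. A minimal sketch of that decision, mirroring what initialize() in hypercalls.c does further down; the helper name pick_guest_tsc_khz() is invented for illustration:

#include <linux/module.h>
#include <asm/cpufeature.h>
#include <asm/tsc.h>

/* Returns the TSC frequency the Guest may use, or 0 for "fall back to jiffies". */
static u32 pick_guest_tsc_khz(void)
{
	/* Only a constant-rate, stable TSC is worth exposing to the Guest. */
	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
		return tsc_khz;
	return 0;
}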
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
new file mode 100644
index 000000000000..ce909ec57499
--- /dev/null
+++ b/drivers/lguest/core.c
@@ -0,0 +1,462 @@
1/* World's simplest hypervisor, to test paravirt_ops and show
2 * unbelievers that virtualization is the future. Plus, it's fun! */
3#include <linux/module.h>
4#include <linux/stringify.h>
5#include <linux/stddef.h>
6#include <linux/io.h>
7#include <linux/mm.h>
8#include <linux/vmalloc.h>
9#include <linux/cpu.h>
10#include <linux/freezer.h>
11#include <asm/paravirt.h>
12#include <asm/desc.h>
13#include <asm/pgtable.h>
14#include <asm/uaccess.h>
15#include <asm/poll.h>
16#include <asm/highmem.h>
17#include <asm/asm-offsets.h>
18#include <asm/i387.h>
19#include "lg.h"
20
21/* Found in switcher.S */
22extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
23extern unsigned long default_idt_entries[];
24
25/* Every guest maps the core switcher code. */
26#define SHARED_SWITCHER_PAGES \
27 DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
28/* Pages for switcher itself, then two pages per cpu */
29#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
30
31/* We map at -4M for ease of mapping into the guest (one PTE page). */
32#define SWITCHER_ADDR 0xFFC00000
33
34static struct vm_struct *switcher_vma;
35static struct page **switcher_page;
36
37static int cpu_had_pge;
38static struct {
39 unsigned long offset;
40 unsigned short segment;
41} lguest_entry;
42
43/* This One Big lock protects all inter-guest data structures. */
44DEFINE_MUTEX(lguest_lock);
45static DEFINE_PER_CPU(struct lguest *, last_guest);
46
47/* FIXME: Make dynamic. */
48#define MAX_LGUEST_GUESTS 16
49struct lguest lguests[MAX_LGUEST_GUESTS];
50
51/* Offset from where switcher.S was compiled to where we've copied it */
52static unsigned long switcher_offset(void)
53{
54 return SWITCHER_ADDR - (unsigned long)start_switcher_text;
55}
56
57/* This cpu's struct lguest_pages. */
58static struct lguest_pages *lguest_pages(unsigned int cpu)
59{
60 return &(((struct lguest_pages *)
61 (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
62}
63
64static __init int map_switcher(void)
65{
66 int i, err;
67 struct page **pagep;
68
69 switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
70 GFP_KERNEL);
71 if (!switcher_page) {
72 err = -ENOMEM;
73 goto out;
74 }
75
76 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
77 unsigned long addr = get_zeroed_page(GFP_KERNEL);
78 if (!addr) {
79 err = -ENOMEM;
80 goto free_some_pages;
81 }
82 switcher_page[i] = virt_to_page(addr);
83 }
84
85 switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
86 VM_ALLOC, SWITCHER_ADDR, VMALLOC_END);
87 if (!switcher_vma) {
88 err = -ENOMEM;
89 printk("lguest: could not map switcher pages high\n");
90 goto free_pages;
91 }
92
93 pagep = switcher_page;
94 err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep);
95 if (err) {
96 printk("lguest: map_vm_area failed: %i\n", err);
97 goto free_vma;
98 }
99 memcpy(switcher_vma->addr, start_switcher_text,
100 end_switcher_text - start_switcher_text);
101
102 /* Fix up IDT entries to point into copied text. */
103 for (i = 0; i < IDT_ENTRIES; i++)
104 default_idt_entries[i] += switcher_offset();
105
106 for_each_possible_cpu(i) {
107 struct lguest_pages *pages = lguest_pages(i);
108 struct lguest_ro_state *state = &pages->state;
109
110 /* These fields are static: rest done in copy_in_guest_info */
111 state->host_gdt_desc.size = GDT_SIZE-1;
112 state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
113 store_idt(&state->host_idt_desc);
114 state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
115 state->guest_idt_desc.address = (long)&state->guest_idt;
116 state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
117 state->guest_gdt_desc.address = (long)&state->guest_gdt;
118 state->guest_tss.esp0 = (long)(&pages->regs + 1);
119 state->guest_tss.ss0 = LGUEST_DS;
120 /* No I/O for you! */
121 state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
122 setup_default_gdt_entries(state);
123 setup_default_idt_entries(state, default_idt_entries);
124
125 /* Setup LGUEST segments on all cpus */
126 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
127 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
128 }
129
130 /* Initialize entry point into switcher. */
131 lguest_entry.offset = (long)switch_to_guest + switcher_offset();
132 lguest_entry.segment = LGUEST_CS;
133
134 printk(KERN_INFO "lguest: mapped switcher at %p\n",
135 switcher_vma->addr);
136 return 0;
137
138free_vma:
139 vunmap(switcher_vma->addr);
140free_pages:
141 i = TOTAL_SWITCHER_PAGES;
142free_some_pages:
143 for (--i; i >= 0; i--)
144 __free_pages(switcher_page[i], 0);
145 kfree(switcher_page);
146out:
147 return err;
148}
149
150static void unmap_switcher(void)
151{
152 unsigned int i;
153
154 vunmap(switcher_vma->addr);
155 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
156 __free_pages(switcher_page[i], 0);
157}
158
159/* IN/OUT insns: enough to get us past boot-time probing. */
160static int emulate_insn(struct lguest *lg)
161{
162 u8 insn;
163 unsigned int insnlen = 0, in = 0, shift = 0;
164 unsigned long physaddr = guest_pa(lg, lg->regs->eip);
165
166 /* This only works for addresses in linear mapping... */
167 if (lg->regs->eip < lg->page_offset)
168 return 0;
169 lgread(lg, &insn, physaddr, 1);
170
171 /* Operand size prefix means it's actually for ax. */
172 if (insn == 0x66) {
173 shift = 16;
174 insnlen = 1;
175 lgread(lg, &insn, physaddr + insnlen, 1);
176 }
177
178 switch (insn & 0xFE) {
179 case 0xE4: /* in <next byte>,%al */
180 insnlen += 2;
181 in = 1;
182 break;
183 case 0xEC: /* in (%dx),%al */
184 insnlen += 1;
185 in = 1;
186 break;
187 case 0xE6: /* out %al,<next byte> */
188 insnlen += 2;
189 break;
190 case 0xEE: /* out %al,(%dx) */
191 insnlen += 1;
192 break;
193 default:
194 return 0;
195 }
196
197 if (in) {
 198		/* Lower bit tells us whether it's a 16 or 32 bit access */
199 if (insn & 0x1)
200 lg->regs->eax = 0xFFFFFFFF;
201 else
202 lg->regs->eax |= (0xFFFF << shift);
203 }
204 lg->regs->eip += insnlen;
205 return 1;
206}
207
208int lguest_address_ok(const struct lguest *lg,
209 unsigned long addr, unsigned long len)
210{
211 return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr);
212}
213
214/* Just like get_user, but don't let guest access lguest binary. */
215u32 lgread_u32(struct lguest *lg, unsigned long addr)
216{
217 u32 val = 0;
218
219 /* Don't let them access lguest binary */
220 if (!lguest_address_ok(lg, addr, sizeof(val))
221 || get_user(val, (u32 __user *)addr) != 0)
222 kill_guest(lg, "bad read address %#lx", addr);
223 return val;
224}
225
226void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val)
227{
228 if (!lguest_address_ok(lg, addr, sizeof(val))
229 || put_user(val, (u32 __user *)addr) != 0)
230 kill_guest(lg, "bad write address %#lx", addr);
231}
232
233void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
234{
235 if (!lguest_address_ok(lg, addr, bytes)
236 || copy_from_user(b, (void __user *)addr, bytes) != 0) {
237 /* copy_from_user should do this, but as we rely on it... */
238 memset(b, 0, bytes);
239 kill_guest(lg, "bad read address %#lx len %u", addr, bytes);
240 }
241}
242
243void lgwrite(struct lguest *lg, unsigned long addr, const void *b,
244 unsigned bytes)
245{
246 if (!lguest_address_ok(lg, addr, bytes)
247 || copy_to_user((void __user *)addr, b, bytes) != 0)
248 kill_guest(lg, "bad write address %#lx len %u", addr, bytes);
249}
250
251static void set_ts(void)
252{
253 u32 cr0;
254
255 cr0 = read_cr0();
256 if (!(cr0 & 8))
257 write_cr0(cr0|8);
258}
259
260static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
261{
262 if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
263 __get_cpu_var(last_guest) = lg;
264 lg->last_pages = pages;
265 lg->changed = CHANGED_ALL;
266 }
267
268 /* These are pretty cheap, so we do them unconditionally. */
269 pages->state.host_cr3 = __pa(current->mm->pgd);
270 map_switcher_in_guest(lg, pages);
271 pages->state.guest_tss.esp1 = lg->esp1;
272 pages->state.guest_tss.ss1 = lg->ss1;
273
274 /* Copy direct trap entries. */
275 if (lg->changed & CHANGED_IDT)
276 copy_traps(lg, pages->state.guest_idt, default_idt_entries);
277
278 /* Copy all GDT entries but the TSS. */
279 if (lg->changed & CHANGED_GDT)
280 copy_gdt(lg, pages->state.guest_gdt);
281 /* If only the TLS entries have changed, copy them. */
282 else if (lg->changed & CHANGED_GDT_TLS)
283 copy_gdt_tls(lg, pages->state.guest_gdt);
284
285 lg->changed = 0;
286}
287
288static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
289{
290 unsigned int clobber;
291
292 copy_in_guest_info(lg, pages);
293
294 /* Put eflags on stack, lcall does rest: suitable for iret return. */
295 asm volatile("pushf; lcall *lguest_entry"
296 : "=a"(clobber), "=b"(clobber)
297 : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
298 : "memory", "%edx", "%ecx", "%edi", "%esi");
299}
300
301int run_guest(struct lguest *lg, unsigned long __user *user)
302{
303 while (!lg->dead) {
304 unsigned int cr2 = 0; /* Damn gcc */
305
306 /* Hypercalls first: we might have been out to userspace */
307 do_hypercalls(lg);
308 if (lg->dma_is_pending) {
309 if (put_user(lg->pending_dma, user) ||
310 put_user(lg->pending_key, user+1))
311 return -EFAULT;
312 return sizeof(unsigned long)*2;
313 }
314
315 if (signal_pending(current))
316 return -ERESTARTSYS;
317
318 /* If Waker set break_out, return to Launcher. */
319 if (lg->break_out)
320 return -EAGAIN;
321
322 maybe_do_interrupt(lg);
323
324 try_to_freeze();
325
326 if (lg->dead)
327 break;
328
329 if (lg->halted) {
330 set_current_state(TASK_INTERRUPTIBLE);
331 schedule();
332 continue;
333 }
334
335 local_irq_disable();
336
337 /* Even if *we* don't want FPU trap, guest might... */
338 if (lg->ts)
339 set_ts();
340
341 /* Don't let Guest do SYSENTER: we can't handle it. */
342 if (boot_cpu_has(X86_FEATURE_SEP))
343 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
344
345 run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
346
347 /* Save cr2 now if we page-faulted. */
348 if (lg->regs->trapnum == 14)
349 cr2 = read_cr2();
350 else if (lg->regs->trapnum == 7)
351 math_state_restore();
352
353 if (boot_cpu_has(X86_FEATURE_SEP))
354 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
355 local_irq_enable();
356
357 switch (lg->regs->trapnum) {
358 case 13: /* We've intercepted a GPF. */
359 if (lg->regs->errcode == 0) {
360 if (emulate_insn(lg))
361 continue;
362 }
363 break;
364 case 14: /* We've intercepted a page fault. */
365 if (demand_page(lg, cr2, lg->regs->errcode))
366 continue;
367
368 /* If lguest_data is NULL, this won't hurt. */
369 if (put_user(cr2, &lg->lguest_data->cr2))
370 kill_guest(lg, "Writing cr2");
371 break;
372 case 7: /* We've intercepted a Device Not Available fault. */
373 /* If they don't want to know, just absorb it. */
374 if (!lg->ts)
375 continue;
376 break;
377 case 32 ... 255: /* Real interrupt, fall thru */
378 cond_resched();
379 case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
380 continue;
381 }
382
383 if (deliver_trap(lg, lg->regs->trapnum))
384 continue;
385
386 kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
387 lg->regs->trapnum, lg->regs->eip,
388 lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode);
389 }
390 return -ENOENT;
391}
392
393int find_free_guest(void)
394{
395 unsigned int i;
396 for (i = 0; i < MAX_LGUEST_GUESTS; i++)
397 if (!lguests[i].tsk)
398 return i;
399 return -1;
400}
401
402static void adjust_pge(void *on)
403{
404 if (on)
405 write_cr4(read_cr4() | X86_CR4_PGE);
406 else
407 write_cr4(read_cr4() & ~X86_CR4_PGE);
408}
409
410static int __init init(void)
411{
412 int err;
413
414 if (paravirt_enabled()) {
415 printk("lguest is afraid of %s\n", paravirt_ops.name);
416 return -EPERM;
417 }
418
419 err = map_switcher();
420 if (err)
421 return err;
422
423 err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
424 if (err) {
425 unmap_switcher();
426 return err;
427 }
428 lguest_io_init();
429
430 err = lguest_device_init();
431 if (err) {
432 free_pagetables();
433 unmap_switcher();
434 return err;
435 }
436 lock_cpu_hotplug();
437 if (cpu_has_pge) { /* We have a broader idea of "global". */
438 cpu_had_pge = 1;
439 on_each_cpu(adjust_pge, (void *)0, 0, 1);
440 clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
441 }
442 unlock_cpu_hotplug();
443 return 0;
444}
445
446static void __exit fini(void)
447{
448 lguest_device_remove();
449 free_pagetables();
450 unmap_switcher();
451 lock_cpu_hotplug();
452 if (cpu_had_pge) {
453 set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
454 on_each_cpu(adjust_pge, (void *)1, 0, 1);
455 }
456 unlock_cpu_hotplug();
457}
458
459module_init(init);
460module_exit(fini);
461MODULE_LICENSE("GPL");
462MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
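For orientation, here is a minimal, hypothetical Launcher-side loop showing what run_guest() above hands back through a read() of /dev/lguest: a pair of unsigned longs (the Guest's pending struct lguest_dma address and the key it sent to), -EAGAIN when the Waker set break_out, or an error once the Guest is dead (a later read returns the death message; see lguest_user.c below). The descriptor name lguest_fd is assumed, not part of this patch:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

static void launcher_loop(int lguest_fd)
{
	unsigned long notify[2];

	for (;;) {
		ssize_t r = read(lguest_fd, notify, sizeof(notify));

		if (r == sizeof(notify)) {
			/* notify[0]: guest address of the struct lguest_dma,
			 * notify[1]: the key it was sent to. */
			printf("DMA %#lx sent to key %#lx\n",
			       notify[0], notify[1]);
			continue;
		}
		if (r < 0 && errno == EAGAIN)
			continue;	/* The Waker broke us out of the Guest. */
		break;			/* Guest died (or another error). */
	}
}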
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
new file mode 100644
index 000000000000..ea52ca451f74
--- /dev/null
+++ b/drivers/lguest/hypercalls.c
@@ -0,0 +1,192 @@
1/* Actual hypercalls, which allow guests to actually do something.
2 Copyright (C) 2006 Rusty Russell IBM Corporation
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18#include <linux/uaccess.h>
19#include <linux/syscalls.h>
20#include <linux/mm.h>
21#include <asm/page.h>
22#include <asm/pgtable.h>
23#include <irq_vectors.h>
24#include "lg.h"
25
26static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
27{
28 switch (regs->eax) {
29 case LHCALL_FLUSH_ASYNC:
30 break;
31 case LHCALL_LGUEST_INIT:
32 kill_guest(lg, "already have lguest_data");
33 break;
34 case LHCALL_CRASH: {
35 char msg[128];
36 lgread(lg, msg, regs->edx, sizeof(msg));
37 msg[sizeof(msg)-1] = '\0';
38 kill_guest(lg, "CRASH: %s", msg);
39 break;
40 }
41 case LHCALL_FLUSH_TLB:
42 if (regs->edx)
43 guest_pagetable_clear_all(lg);
44 else
45 guest_pagetable_flush_user(lg);
46 break;
47 case LHCALL_GET_WALLCLOCK: {
48 struct timespec ts;
49 ktime_get_real_ts(&ts);
50 regs->eax = ts.tv_sec;
51 break;
52 }
53 case LHCALL_BIND_DMA:
54 regs->eax = bind_dma(lg, regs->edx, regs->ebx,
55 regs->ecx >> 8, regs->ecx & 0xFF);
56 break;
57 case LHCALL_SEND_DMA:
58 send_dma(lg, regs->edx, regs->ebx);
59 break;
60 case LHCALL_LOAD_GDT:
61 load_guest_gdt(lg, regs->edx, regs->ebx);
62 break;
63 case LHCALL_LOAD_IDT_ENTRY:
64 load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx);
65 break;
66 case LHCALL_NEW_PGTABLE:
67 guest_new_pagetable(lg, regs->edx);
68 break;
69 case LHCALL_SET_STACK:
70 guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx);
71 break;
72 case LHCALL_SET_PTE:
73 guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx));
74 break;
75 case LHCALL_SET_PMD:
76 guest_set_pmd(lg, regs->edx, regs->ebx);
77 break;
78 case LHCALL_LOAD_TLS:
79 guest_load_tls(lg, regs->edx);
80 break;
81 case LHCALL_SET_CLOCKEVENT:
82 guest_set_clockevent(lg, regs->edx);
83 break;
84 case LHCALL_TS:
85 lg->ts = regs->edx;
86 break;
87 case LHCALL_HALT:
88 lg->halted = 1;
89 break;
90 default:
91 kill_guest(lg, "Bad hypercall %li\n", regs->eax);
92 }
93}
94
95/* We always do queued calls before actual hypercall. */
96static void do_async_hcalls(struct lguest *lg)
97{
98 unsigned int i;
99 u8 st[LHCALL_RING_SIZE];
100
101 if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
102 return;
103
104 for (i = 0; i < ARRAY_SIZE(st); i++) {
105 struct lguest_regs regs;
106 unsigned int n = lg->next_hcall;
107
108 if (st[n] == 0xFF)
109 break;
110
111 if (++lg->next_hcall == LHCALL_RING_SIZE)
112 lg->next_hcall = 0;
113
114 if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax)
115 || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx)
116 || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx)
117 || get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx)) {
118 kill_guest(lg, "Fetching async hypercalls");
119 break;
120 }
121
122 do_hcall(lg, &regs);
123 if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
124 kill_guest(lg, "Writing result for async hypercall");
125 break;
126 }
127
128 if (lg->dma_is_pending)
129 break;
130 }
131}
132
133static void initialize(struct lguest *lg)
134{
135 u32 tsc_speed;
136
137 if (lg->regs->eax != LHCALL_LGUEST_INIT) {
138 kill_guest(lg, "hypercall %li before LGUEST_INIT",
139 lg->regs->eax);
140 return;
141 }
142
143 /* We only tell the guest to use the TSC if it's reliable. */
144 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
145 tsc_speed = tsc_khz;
146 else
147 tsc_speed = 0;
148
149 lg->lguest_data = (struct lguest_data __user *)lg->regs->edx;
150 /* We check here so we can simply copy_to_user/from_user */
151 if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) {
152 kill_guest(lg, "bad guest page %p", lg->lguest_data);
153 return;
154 }
155 if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
156 || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)
157 /* We reserve the top pgd entry. */
158 || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
159 || put_user(tsc_speed, &lg->lguest_data->tsc_khz)
160 || put_user(lg->guestid, &lg->lguest_data->guestid))
161 kill_guest(lg, "bad guest page %p", lg->lguest_data);
162
163 /* This is the one case where the above accesses might have
164 * been the first write to a Guest page. This may have caused
165 * a copy-on-write fault, but the Guest might be referring to
166 * the old (read-only) page. */
167 guest_pagetable_clear_all(lg);
168}
169
170/* Even if we go out to userspace and come back, we don't want to do
171 * the hypercall again. */
172static void clear_hcall(struct lguest *lg)
173{
174 lg->regs->trapnum = 255;
175}
176
177void do_hypercalls(struct lguest *lg)
178{
179 if (unlikely(!lg->lguest_data)) {
180 if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
181 initialize(lg);
182 clear_hcall(lg);
183 }
184 return;
185 }
186
187 do_async_hcalls(lg);
188 if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
189 do_hcall(lg, lg->regs);
190 clear_hcall(lg);
191 }
192}
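The async ring that do_async_hcalls() drains has a simple guest-side contract: hcall_status[n] == 0xFF means slot n is free, the Guest fills hcalls[n] and flips the status, and the Host writes 0xFF back once the call is done. A hedged guest-side sketch of that contract; the cursor variable next_call and the fallback policy are illustrative, not defined by this patch:

static unsigned int next_call;

static void async_hcall(unsigned long call, unsigned long arg1,
			unsigned long arg2, unsigned long arg3)
{
	if (lguest_data.hcall_status[next_call] != 0xFF) {
		/* Ring full: make the hypercall synchronously instead. */
		hcall(call, arg1, arg2, arg3);
		return;
	}

	lguest_data.hcalls[next_call].eax = call;
	lguest_data.hcalls[next_call].edx = arg1;
	lguest_data.hcalls[next_call].ebx = arg2;
	lguest_data.hcalls[next_call].ecx = arg3;
	/* Only mark the slot pending once the arguments are in place. */
	wmb();
	lguest_data.hcall_status[next_call] = 0;
	next_call = (next_call + 1) % LHCALL_RING_SIZE;
}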
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
new file mode 100644
index 000000000000..d9de5bbc613f
--- /dev/null
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -0,0 +1,268 @@
1#include <linux/uaccess.h>
2#include "lg.h"
3
4static unsigned long idt_address(u32 lo, u32 hi)
5{
6 return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
7}
8
9static int idt_type(u32 lo, u32 hi)
10{
11 return (hi >> 8) & 0xF;
12}
13
14static int idt_present(u32 lo, u32 hi)
15{
16 return (hi & 0x8000);
17}
18
19static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
20{
21 *gstack -= 4;
22 lgwrite_u32(lg, *gstack, val);
23}
24
25static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
26{
27 unsigned long gstack;
28 u32 eflags, ss, irq_enable;
29
30 /* If they want a ring change, we use new stack and push old ss/esp */
31 if ((lg->regs->ss&0x3) != GUEST_PL) {
32 gstack = guest_pa(lg, lg->esp1);
33 ss = lg->ss1;
34 push_guest_stack(lg, &gstack, lg->regs->ss);
35 push_guest_stack(lg, &gstack, lg->regs->esp);
36 } else {
37 gstack = guest_pa(lg, lg->regs->esp);
38 ss = lg->regs->ss;
39 }
40
41 /* We use IF bit in eflags to indicate whether irqs were disabled
42 (it's always 0, since irqs are enabled when guest is running). */
43 eflags = lg->regs->eflags;
44 if (get_user(irq_enable, &lg->lguest_data->irq_enabled))
45 irq_enable = 0;
46 eflags |= (irq_enable & X86_EFLAGS_IF);
47
48 push_guest_stack(lg, &gstack, eflags);
49 push_guest_stack(lg, &gstack, lg->regs->cs);
50 push_guest_stack(lg, &gstack, lg->regs->eip);
51
52 if (has_err)
53 push_guest_stack(lg, &gstack, lg->regs->errcode);
54
55 /* Change the real stack so switcher returns to trap handler */
56 lg->regs->ss = ss;
57 lg->regs->esp = gstack + lg->page_offset;
58 lg->regs->cs = (__KERNEL_CS|GUEST_PL);
59 lg->regs->eip = idt_address(lo, hi);
60
61 /* Disable interrupts for an interrupt gate. */
62 if (idt_type(lo, hi) == 0xE)
63 if (put_user(0, &lg->lguest_data->irq_enabled))
64 kill_guest(lg, "Disabling interrupts");
65}
66
67void maybe_do_interrupt(struct lguest *lg)
68{
69 unsigned int irq;
70 DECLARE_BITMAP(blk, LGUEST_IRQS);
71 struct desc_struct *idt;
72
73 if (!lg->lguest_data)
74 return;
75
76 /* Mask out any interrupts they have blocked. */
77 if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts,
78 sizeof(blk)))
79 return;
80
81 bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS);
82
83 irq = find_first_bit(blk, LGUEST_IRQS);
84 if (irq >= LGUEST_IRQS)
85 return;
86
87 if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end)
88 return;
89
90 /* If they're halted, we re-enable interrupts. */
91 if (lg->halted) {
92 /* Re-enable interrupts. */
93 if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled))
94 kill_guest(lg, "Re-enabling interrupts");
95 lg->halted = 0;
96 } else {
97 /* Maybe they have interrupts disabled? */
98 u32 irq_enabled;
99 if (get_user(irq_enabled, &lg->lguest_data->irq_enabled))
100 irq_enabled = 0;
101 if (!irq_enabled)
102 return;
103 }
104
105 idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq];
106 if (idt_present(idt->a, idt->b)) {
107 clear_bit(irq, lg->irqs_pending);
108 set_guest_interrupt(lg, idt->a, idt->b, 0);
109 }
110}
111
112static int has_err(unsigned int trap)
113{
114 return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
115}
116
117int deliver_trap(struct lguest *lg, unsigned int num)
118{
119 u32 lo = lg->idt[num].a, hi = lg->idt[num].b;
120
121 if (!idt_present(lo, hi))
122 return 0;
123 set_guest_interrupt(lg, lo, hi, has_err(num));
124 return 1;
125}
126
127static int direct_trap(const struct lguest *lg,
128 const struct desc_struct *trap,
129 unsigned int num)
130{
131 /* Hardware interrupts don't go to guest (except syscall). */
132 if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR)
133 return 0;
134
135 /* We intercept page fault (demand shadow paging & cr2 saving)
136 protection fault (in/out emulation) and device not
137 available (TS handling), and hypercall */
138 if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY)
139 return 0;
140
141 /* Interrupt gates (0xE) or not present (0x0) can't go direct. */
142 return idt_type(trap->a, trap->b) == 0xF;
143}
144
145void pin_stack_pages(struct lguest *lg)
146{
147 unsigned int i;
148
149 for (i = 0; i < lg->stack_pages; i++)
150 pin_page(lg, lg->esp1 - i * PAGE_SIZE);
151}
152
153void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
154{
155 /* You cannot have a stack segment with priv level 0. */
156 if ((seg & 0x3) != GUEST_PL)
157 kill_guest(lg, "bad stack segment %i", seg);
158 if (pages > 2)
159 kill_guest(lg, "bad stack pages %u", pages);
160 lg->ss1 = seg;
161 lg->esp1 = esp;
162 lg->stack_pages = pages;
163 pin_stack_pages(lg);
164}
165
166/* Set up trap in IDT. */
167static void set_trap(struct lguest *lg, struct desc_struct *trap,
168 unsigned int num, u32 lo, u32 hi)
169{
170 u8 type = idt_type(lo, hi);
171
172 if (!idt_present(lo, hi)) {
173 trap->a = trap->b = 0;
174 return;
175 }
176
177 if (type != 0xE && type != 0xF)
178 kill_guest(lg, "bad IDT type %i", type);
179
180 trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
181 trap->b = (hi&0xFFFFEF00);
182}
183
184void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
185{
186 /* Guest never handles: NMI, doublefault, hypercall, spurious irq. */
187 if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
188 return;
189
190 lg->changed |= CHANGED_IDT;
191 if (num < ARRAY_SIZE(lg->idt))
192 set_trap(lg, &lg->idt[num], num, lo, hi);
193 else if (num == SYSCALL_VECTOR)
194 set_trap(lg, &lg->syscall_idt, num, lo, hi);
195}
196
197static void default_idt_entry(struct desc_struct *idt,
198 int trap,
199 const unsigned long handler)
200{
201 u32 flags = 0x8e00;
202
203 /* They can't "int" into any of them except hypercall. */
204 if (trap == LGUEST_TRAP_ENTRY)
205 flags |= (GUEST_PL << 13);
206
207 idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
208 idt->b = (handler&0xFFFF0000) | flags;
209}
210
211void setup_default_idt_entries(struct lguest_ro_state *state,
212 const unsigned long *def)
213{
214 unsigned int i;
215
216 for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
217 default_idt_entry(&state->guest_idt[i], i, def[i]);
218}
219
220void copy_traps(const struct lguest *lg, struct desc_struct *idt,
221 const unsigned long *def)
222{
223 unsigned int i;
224
225 /* All hardware interrupts are same whatever the guest: only the
226 * traps might be different. */
227 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
228 if (direct_trap(lg, &lg->idt[i], i))
229 idt[i] = lg->idt[i];
230 else
231 default_idt_entry(&idt[i], i, def[i]);
232 }
233 i = SYSCALL_VECTOR;
234 if (direct_trap(lg, &lg->syscall_idt, i))
235 idt[i] = lg->syscall_idt;
236 else
237 default_idt_entry(&idt[i], i, def[i]);
238}
239
240void guest_set_clockevent(struct lguest *lg, unsigned long delta)
241{
242 ktime_t expires;
243
244 if (unlikely(delta == 0)) {
245 /* Clock event device is shutting down. */
246 hrtimer_cancel(&lg->hrt);
247 return;
248 }
249
250 expires = ktime_add_ns(ktime_get_real(), delta);
251 hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS);
252}
253
254static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
255{
256 struct lguest *lg = container_of(timer, struct lguest, hrt);
257
258 set_bit(0, lg->irqs_pending);
259 if (lg->halted)
260 wake_up_process(lg->tsk);
261 return HRTIMER_NORESTART;
262}
263
264void init_clockdev(struct lguest *lg)
265{
266 hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
267 lg->hrt.function = clockdev_fn;
268}
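To make set_guest_interrupt() above easier to picture, here is the frame it leaves on the Guest's stack, written as an illustrative struct (nothing in the patch defines this type). Fields are listed in push order; the stack grows down, so the last pushed value ends up at the new %esp, and the Guest's handler unwinds the whole frame with iret:

struct guest_trap_frame {
	/* Pushed only when the trap forced a switch to the ring-1 stack: */
	unsigned long old_ss;
	unsigned long old_esp;
	/* Always pushed: */
	unsigned long eflags;	/* IF bit rebuilt from lguest_data.irq_enabled */
	unsigned long cs;	/* Guest code segment at the time of the trap */
	unsigned long eip;	/* where the Guest resumes after iret */
	/* Pushed only for the traps listed in has_err() (8, 10-14, 17): */
	unsigned long errcode;
};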
diff --git a/drivers/lguest/io.c b/drivers/lguest/io.c
new file mode 100644
index 000000000000..06bdba2337ef
--- /dev/null
+++ b/drivers/lguest/io.c
@@ -0,0 +1,399 @@
1/* Simple I/O model for guests, based on shared memory.
2 * Copyright (C) 2006 Rusty Russell IBM Corporation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include <linux/types.h>
19#include <linux/futex.h>
20#include <linux/jhash.h>
21#include <linux/mm.h>
22#include <linux/highmem.h>
23#include <linux/uaccess.h>
24#include "lg.h"
25
26static struct list_head dma_hash[61];
27
28void lguest_io_init(void)
29{
30 unsigned int i;
31
32 for (i = 0; i < ARRAY_SIZE(dma_hash); i++)
33 INIT_LIST_HEAD(&dma_hash[i]);
34}
35
36/* FIXME: allow multi-page lengths. */
37static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma)
38{
39 unsigned int i;
40
41 for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
42 if (!dma->len[i])
43 return 1;
44 if (!lguest_address_ok(lg, dma->addr[i], dma->len[i]))
45 goto kill;
46 if (dma->len[i] > PAGE_SIZE)
47 goto kill;
48 /* We could do over a page, but is it worth it? */
49 if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE)
50 goto kill;
51 }
52 return 1;
53
54kill:
55 kill_guest(lg, "bad DMA entry: %u@%#lx", dma->len[i], dma->addr[i]);
56 return 0;
57}
58
59static unsigned int hash(const union futex_key *key)
60{
61 return jhash2((u32*)&key->both.word,
62 (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
63 key->both.offset)
64 % ARRAY_SIZE(dma_hash);
65}
66
67static inline int key_eq(const union futex_key *a, const union futex_key *b)
68{
69 return (a->both.word == b->both.word
70 && a->both.ptr == b->both.ptr
71 && a->both.offset == b->both.offset);
72}
73
74/* Must hold read lock on dmainfo owner's current->mm->mmap_sem */
75static void unlink_dma(struct lguest_dma_info *dmainfo)
76{
77 BUG_ON(!mutex_is_locked(&lguest_lock));
78 dmainfo->interrupt = 0;
79 list_del(&dmainfo->list);
80 drop_futex_key_refs(&dmainfo->key);
81}
82
83static int unbind_dma(struct lguest *lg,
84 const union futex_key *key,
85 unsigned long dmas)
86{
87 int i, ret = 0;
88
89 for (i = 0; i < LGUEST_MAX_DMA; i++) {
90 if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) {
91 unlink_dma(&lg->dma[i]);
92 ret = 1;
93 break;
94 }
95 }
96 return ret;
97}
98
99int bind_dma(struct lguest *lg,
100 unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt)
101{
102 unsigned int i;
103 int ret = 0;
104 union futex_key key;
105 struct rw_semaphore *fshared = &current->mm->mmap_sem;
106
107 if (interrupt >= LGUEST_IRQS)
108 return 0;
109
110 mutex_lock(&lguest_lock);
111 down_read(fshared);
112 if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
113 kill_guest(lg, "bad dma key %#lx", ukey);
114 goto unlock;
115 }
116 get_futex_key_refs(&key);
117
118 if (interrupt == 0)
119 ret = unbind_dma(lg, &key, dmas);
120 else {
121 for (i = 0; i < LGUEST_MAX_DMA; i++) {
122 if (lg->dma[i].interrupt)
123 continue;
124
125 lg->dma[i].dmas = dmas;
126 lg->dma[i].num_dmas = numdmas;
127 lg->dma[i].next_dma = 0;
128 lg->dma[i].key = key;
129 lg->dma[i].guestid = lg->guestid;
130 lg->dma[i].interrupt = interrupt;
131 list_add(&lg->dma[i].list, &dma_hash[hash(&key)]);
132 ret = 1;
133 goto unlock;
134 }
135 }
136 drop_futex_key_refs(&key);
137unlock:
138 up_read(fshared);
139 mutex_unlock(&lguest_lock);
140 return ret;
141}
142
143/* lgread from another guest */
144static int lgread_other(struct lguest *lg,
145 void *buf, u32 addr, unsigned bytes)
146{
147 if (!lguest_address_ok(lg, addr, bytes)
148 || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) {
149 memset(buf, 0, bytes);
150 kill_guest(lg, "bad address in registered DMA struct");
151 return 0;
152 }
153 return 1;
154}
155
156/* lgwrite to another guest */
157static int lgwrite_other(struct lguest *lg, u32 addr,
158 const void *buf, unsigned bytes)
159{
160 if (!lguest_address_ok(lg, addr, bytes)
161 || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1)
162 != bytes)) {
163 kill_guest(lg, "bad address writing to registered DMA");
164 return 0;
165 }
166 return 1;
167}
168
169static u32 copy_data(struct lguest *srclg,
170 const struct lguest_dma *src,
171 const struct lguest_dma *dst,
172 struct page *pages[])
173{
174 unsigned int totlen, si, di, srcoff, dstoff;
175 void *maddr = NULL;
176
177 totlen = 0;
178 si = di = 0;
179 srcoff = dstoff = 0;
180 while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si]
181 && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) {
182 u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff);
183
184 if (!maddr)
185 maddr = kmap(pages[di]);
186
187 /* FIXME: This is not completely portable, since
188 archs do different things for copy_to_user_page. */
189 if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE,
190 (void *__user)src->addr[si], len) != 0) {
191 kill_guest(srclg, "bad address in sending DMA");
192 totlen = 0;
193 break;
194 }
195
196 totlen += len;
197 srcoff += len;
198 dstoff += len;
199 if (srcoff == src->len[si]) {
200 si++;
201 srcoff = 0;
202 }
203 if (dstoff == dst->len[di]) {
204 kunmap(pages[di]);
205 maddr = NULL;
206 di++;
207 dstoff = 0;
208 }
209 }
210
211 if (maddr)
212 kunmap(pages[di]);
213
214 return totlen;
215}
216
217/* Src is us, ie. current. */
218static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src,
219 struct lguest *dstlg, const struct lguest_dma *dst)
220{
221 int i;
222 u32 ret;
223 struct page *pages[LGUEST_MAX_DMA_SECTIONS];
224
225 if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src))
226 return 0;
227
228 /* First get the destination pages */
229 for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
230 if (dst->len[i] == 0)
231 break;
232 if (get_user_pages(dstlg->tsk, dstlg->mm,
233 dst->addr[i], 1, 1, 1, pages+i, NULL)
234 != 1) {
235 kill_guest(dstlg, "Error mapping DMA pages");
236 ret = 0;
237 goto drop_pages;
238 }
239 }
240
241 /* Now copy until we run out of src or dst. */
242 ret = copy_data(srclg, src, dst, pages);
243
244drop_pages:
245 while (--i >= 0)
246 put_page(pages[i]);
247 return ret;
248}
249
250static int dma_transfer(struct lguest *srclg,
251 unsigned long udma,
252 struct lguest_dma_info *dst)
253{
254 struct lguest_dma dst_dma, src_dma;
255 struct lguest *dstlg;
256 u32 i, dma = 0;
257
258 dstlg = &lguests[dst->guestid];
259 /* Get our dma list. */
260 lgread(srclg, &src_dma, udma, sizeof(src_dma));
261
262 /* We can't deadlock against them dmaing to us, because this
263 * is all under the lguest_lock. */
264 down_read(&dstlg->mm->mmap_sem);
265
266 for (i = 0; i < dst->num_dmas; i++) {
267 dma = (dst->next_dma + i) % dst->num_dmas;
268 if (!lgread_other(dstlg, &dst_dma,
269 dst->dmas + dma * sizeof(struct lguest_dma),
270 sizeof(dst_dma))) {
271 goto fail;
272 }
273 if (!dst_dma.used_len)
274 break;
275 }
276 if (i != dst->num_dmas) {
277 unsigned long used_lenp;
278 unsigned int ret;
279
280 ret = do_dma(srclg, &src_dma, dstlg, &dst_dma);
281 /* Put used length in src. */
282 lgwrite_u32(srclg,
283 udma+offsetof(struct lguest_dma, used_len), ret);
284 if (ret == 0 && src_dma.len[0] != 0)
285 goto fail;
286
287 /* Make sure destination sees contents before length. */
288 wmb();
289 used_lenp = dst->dmas
290 + dma * sizeof(struct lguest_dma)
291 + offsetof(struct lguest_dma, used_len);
292 lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret));
293 dst->next_dma++;
294 }
295 up_read(&dstlg->mm->mmap_sem);
296
297 /* Do this last so dst doesn't simply sleep on lock. */
298 set_bit(dst->interrupt, dstlg->irqs_pending);
299 wake_up_process(dstlg->tsk);
300 return i == dst->num_dmas;
301
302fail:
303 up_read(&dstlg->mm->mmap_sem);
304 return 0;
305}
306
307void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma)
308{
309 union futex_key key;
310 int empty = 0;
311 struct rw_semaphore *fshared = &current->mm->mmap_sem;
312
313again:
314 mutex_lock(&lguest_lock);
315 down_read(fshared);
316 if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
317 kill_guest(lg, "bad sending DMA key");
318 goto unlock;
319 }
320 /* Shared mapping? Look for other guests... */
321 if (key.shared.offset & 1) {
322 struct lguest_dma_info *i;
323 list_for_each_entry(i, &dma_hash[hash(&key)], list) {
324 if (i->guestid == lg->guestid)
325 continue;
326 if (!key_eq(&key, &i->key))
327 continue;
328
329 empty += dma_transfer(lg, udma, i);
330 break;
331 }
332 if (empty == 1) {
333 /* Give any recipients one chance to restock. */
334 up_read(&current->mm->mmap_sem);
335 mutex_unlock(&lguest_lock);
336 empty++;
337 goto again;
338 }
339 } else {
340 /* Private mapping: tell our userspace. */
341 lg->dma_is_pending = 1;
342 lg->pending_dma = udma;
343 lg->pending_key = ukey;
344 }
345unlock:
346 up_read(fshared);
347 mutex_unlock(&lguest_lock);
348}
349
350void release_all_dma(struct lguest *lg)
351{
352 unsigned int i;
353
354 BUG_ON(!mutex_is_locked(&lguest_lock));
355
356 down_read(&lg->mm->mmap_sem);
357 for (i = 0; i < LGUEST_MAX_DMA; i++) {
358 if (lg->dma[i].interrupt)
359 unlink_dma(&lg->dma[i]);
360 }
361 up_read(&lg->mm->mmap_sem);
362}
363
364/* Userspace wants a dma buffer from this guest. */
365unsigned long get_dma_buffer(struct lguest *lg,
366 unsigned long ukey, unsigned long *interrupt)
367{
368 unsigned long ret = 0;
369 union futex_key key;
370 struct lguest_dma_info *i;
371 struct rw_semaphore *fshared = &current->mm->mmap_sem;
372
373 mutex_lock(&lguest_lock);
374 down_read(fshared);
375 if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
376 kill_guest(lg, "bad registered DMA buffer");
377 goto unlock;
378 }
379 list_for_each_entry(i, &dma_hash[hash(&key)], list) {
380 if (key_eq(&key, &i->key) && i->guestid == lg->guestid) {
381 unsigned int j;
382 for (j = 0; j < i->num_dmas; j++) {
383 struct lguest_dma dma;
384
385 ret = i->dmas + j * sizeof(struct lguest_dma);
386 lgread(lg, &dma, ret, sizeof(dma));
387 if (dma.used_len == 0)
388 break;
389 }
390 *interrupt = i->interrupt;
391 break;
392 }
393 }
394unlock:
395 up_read(fshared);
396 mutex_unlock(&lguest_lock);
397 return ret;
398}
399
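A hedged guest-side sketch of using this I/O model, matching how do_hcall() unpacks LHCALL_BIND_DMA (edx = futex key, ebx = physical address of the lguest_dma array, ecx = number of dmas << 8 | interrupt) and LHCALL_SEND_DMA (edx = key, ebx = dma address). It assumes a guest-side hcall() wrapper and a key both ends agree on; buf and rx_dma are illustrative variables, not part of this patch:

static struct lguest_dma rx_dma;
static char buf[PAGE_SIZE];

static void example_bind_and_send(unsigned long key)
{
	/* One single-page section; used_len == 0 marks it as still available. */
	rx_dma.addr[0] = __pa(buf);
	rx_dma.len[0] = sizeof(buf);
	rx_dma.used_len = 0;

	/* Register one receive buffer, raising interrupt 5 when data lands. */
	hcall(LHCALL_BIND_DMA, key, __pa(&rx_dma), (1 << 8) | 5);

	/* Send the same descriptor to whoever is bound to "key". */
	hcall(LHCALL_SEND_DMA, key, __pa(&rx_dma), 0);
}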
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
new file mode 100644
index 000000000000..3e2ddfbc816e
--- /dev/null
+++ b/drivers/lguest/lg.h
@@ -0,0 +1,261 @@
1#ifndef _LGUEST_H
2#define _LGUEST_H
3
4#include <asm/desc.h>
5
6#define GDT_ENTRY_LGUEST_CS 10
7#define GDT_ENTRY_LGUEST_DS 11
8#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
9#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
10
11#ifndef __ASSEMBLY__
12#include <linux/types.h>
13#include <linux/init.h>
14#include <linux/stringify.h>
15#include <linux/binfmts.h>
16#include <linux/futex.h>
17#include <linux/lguest.h>
18#include <linux/lguest_launcher.h>
19#include <linux/wait.h>
20#include <linux/err.h>
21#include <asm/semaphore.h>
22#include "irq_vectors.h"
23
24#define GUEST_PL 1
25
26struct lguest_regs
27{
28 /* Manually saved part. */
29 unsigned long ebx, ecx, edx;
30 unsigned long esi, edi, ebp;
31 unsigned long gs;
32 unsigned long eax;
33 unsigned long fs, ds, es;
34 unsigned long trapnum, errcode;
35 /* Trap pushed part */
36 unsigned long eip;
37 unsigned long cs;
38 unsigned long eflags;
39 unsigned long esp;
40 unsigned long ss;
41};
42
43void free_pagetables(void);
44int init_pagetables(struct page **switcher_page, unsigned int pages);
45
46/* Full 4G segment descriptors, suitable for CS and DS. */
47#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
48#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})
49
50struct lguest_dma_info
51{
52 struct list_head list;
53 union futex_key key;
54 unsigned long dmas;
55 u16 next_dma;
56 u16 num_dmas;
57 u16 guestid;
58 u8 interrupt; /* 0 when not registered */
59};
60
61/* We have separate types for the guest's ptes & pgds and the shadow ptes &
62 * pgds. Since this host might use three-level pagetables and the guest and
63 * shadow pagetables don't, we can't use the normal pte_t/pgd_t. */
64typedef union {
65 struct { unsigned flags:12, pfn:20; };
66 struct { unsigned long val; } raw;
67} spgd_t;
68typedef union {
69 struct { unsigned flags:12, pfn:20; };
70 struct { unsigned long val; } raw;
71} spte_t;
72typedef union {
73 struct { unsigned flags:12, pfn:20; };
74 struct { unsigned long val; } raw;
75} gpgd_t;
76typedef union {
77 struct { unsigned flags:12, pfn:20; };
78 struct { unsigned long val; } raw;
79} gpte_t;
80#define mkgpte(_val) ((gpte_t){.raw.val = _val})
81#define mkgpgd(_val) ((gpgd_t){.raw.val = _val})
82
83struct pgdir
84{
85 unsigned long cr3;
86 spgd_t *pgdir;
87};
88
89/* This is a guest-specific page (mapped ro) into the guest. */
90struct lguest_ro_state
91{
92 /* Host information we need to restore when we switch back. */
93 u32 host_cr3;
94 struct Xgt_desc_struct host_idt_desc;
95 struct Xgt_desc_struct host_gdt_desc;
96 u32 host_sp;
97
98 /* Fields which are used when guest is running. */
99 struct Xgt_desc_struct guest_idt_desc;
100 struct Xgt_desc_struct guest_gdt_desc;
101 struct i386_hw_tss guest_tss;
102 struct desc_struct guest_idt[IDT_ENTRIES];
103 struct desc_struct guest_gdt[GDT_ENTRIES];
104};
105
106/* We have two pages shared with guests, per cpu. */
107struct lguest_pages
108{
109 /* This is the stack page mapped rw in guest */
110 char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
111 struct lguest_regs regs;
112
113 /* This is the host state & guest descriptor page, ro in guest */
114 struct lguest_ro_state state;
115} __attribute__((aligned(PAGE_SIZE)));
116
117#define CHANGED_IDT 1
118#define CHANGED_GDT 2
119#define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */
120#define CHANGED_ALL 3
121
122/* The private info the thread maintains about the guest. */
123struct lguest
124{
125 /* At end of a page shared mapped over lguest_pages in guest. */
126 unsigned long regs_page;
127 struct lguest_regs *regs;
128 struct lguest_data __user *lguest_data;
129 struct task_struct *tsk;
130 struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
131 u16 guestid;
132 u32 pfn_limit;
133 u32 page_offset;
134 u32 cr2;
135 int halted;
136 int ts;
137 u32 next_hcall;
138 u32 esp1;
139 u8 ss1;
140
141 /* Do we need to stop what we're doing and return to userspace? */
142 int break_out;
143 wait_queue_head_t break_wq;
144
145 /* Bitmap of what has changed: see CHANGED_* above. */
146 int changed;
147 struct lguest_pages *last_pages;
148
149 /* We keep a small number of these. */
150 u32 pgdidx;
151 struct pgdir pgdirs[4];
152
153 /* Cached wakeup: we hold a reference to this task. */
154 struct task_struct *wake;
155
156 unsigned long noirq_start, noirq_end;
157 int dma_is_pending;
158 unsigned long pending_dma; /* struct lguest_dma */
159 unsigned long pending_key; /* address they're sending to */
160
161 unsigned int stack_pages;
162 u32 tsc_khz;
163
164 struct lguest_dma_info dma[LGUEST_MAX_DMA];
165
166 /* Dead? */
167 const char *dead;
168
169 /* The GDT entries copied into lguest_ro_state when running. */
170 struct desc_struct gdt[GDT_ENTRIES];
171
172 /* The IDT entries: some copied into lguest_ro_state when running. */
173 struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS];
174 struct desc_struct syscall_idt;
175
176 /* Virtual clock device */
177 struct hrtimer hrt;
178
179 /* Pending virtual interrupts */
180 DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
181};
182
183extern struct lguest lguests[];
184extern struct mutex lguest_lock;
185
186/* core.c: */
187u32 lgread_u32(struct lguest *lg, unsigned long addr);
188void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val);
189void lgread(struct lguest *lg, void *buf, unsigned long addr, unsigned len);
190void lgwrite(struct lguest *lg, unsigned long, const void *buf, unsigned len);
191int find_free_guest(void);
192int lguest_address_ok(const struct lguest *lg,
193 unsigned long addr, unsigned long len);
194int run_guest(struct lguest *lg, unsigned long __user *user);
195
196
197/* interrupts_and_traps.c: */
198void maybe_do_interrupt(struct lguest *lg);
199int deliver_trap(struct lguest *lg, unsigned int num);
200void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi);
201void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages);
202void pin_stack_pages(struct lguest *lg);
203void setup_default_idt_entries(struct lguest_ro_state *state,
204 const unsigned long *def);
205void copy_traps(const struct lguest *lg, struct desc_struct *idt,
206 const unsigned long *def);
207void guest_set_clockevent(struct lguest *lg, unsigned long delta);
208void init_clockdev(struct lguest *lg);
209
210/* segments.c: */
211void setup_default_gdt_entries(struct lguest_ro_state *state);
212void setup_guest_gdt(struct lguest *lg);
213void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num);
214void guest_load_tls(struct lguest *lg, unsigned long tls_array);
215void copy_gdt(const struct lguest *lg, struct desc_struct *gdt);
216void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt);
217
218/* page_tables.c: */
219int init_guest_pagetable(struct lguest *lg, unsigned long pgtable);
220void free_guest_pagetable(struct lguest *lg);
221void guest_new_pagetable(struct lguest *lg, unsigned long pgtable);
222void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i);
223void guest_pagetable_clear_all(struct lguest *lg);
224void guest_pagetable_flush_user(struct lguest *lg);
225void guest_set_pte(struct lguest *lg, unsigned long cr3,
226 unsigned long vaddr, gpte_t val);
227void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
228int demand_page(struct lguest *info, unsigned long cr2, int errcode);
229void pin_page(struct lguest *lg, unsigned long vaddr);
230
231/* lguest_user.c: */
232int lguest_device_init(void);
233void lguest_device_remove(void);
234
235/* io.c: */
236void lguest_io_init(void);
237int bind_dma(struct lguest *lg,
238 unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt);
239void send_dma(struct lguest *info, unsigned long key, unsigned long udma);
240void release_all_dma(struct lguest *lg);
241unsigned long get_dma_buffer(struct lguest *lg, unsigned long key,
242 unsigned long *interrupt);
243
244/* hypercalls.c: */
245void do_hypercalls(struct lguest *lg);
246
247#define kill_guest(lg, fmt...) \
248do { \
249 if (!(lg)->dead) { \
250 (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \
251 if (!(lg)->dead) \
252 (lg)->dead = ERR_PTR(-ENOMEM); \
253 } \
254} while(0)
255
256static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
257{
258 return vaddr - lg->page_offset;
259}
260#endif /* __ASSEMBLY__ */
261#endif /* _LGUEST_H */
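The four single-member unions above all carry the same flat x86 layout: the low 12 bits are the hardware flags, the upper 20 bits the page frame number. A small sketch of how host code can pick a gpte_t apart; the helper names are invented for illustration:

#define GPTE_FLAGS(g)	((g).flags)				/* low 12 bits */
#define GPTE_PADDR(g)	((unsigned long)(g).pfn << PAGE_SHIFT)	/* upper 20 bits */

static inline int gpte_present_and_writable(gpte_t gpte)
{
	return (GPTE_FLAGS(gpte) & (_PAGE_PRESENT|_PAGE_RW))
				== (_PAGE_PRESENT|_PAGE_RW);
}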
diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c
index b3a72bd8d6f5..b9a58b78c990 100644
--- a/drivers/lguest/lguest.c
+++ b/drivers/lguest/lguest.c
@@ -25,6 +25,8 @@
 #include <linux/screen_info.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
 #include <linux/lguest.h>
 #include <linux/lguest_launcher.h>
 #include <linux/lguest_bus.h>
@@ -37,6 +39,7 @@
 #include <asm/e820.h>
 #include <asm/mce.h>
 #include <asm/io.h>
+//#include <asm/sched-clock.h>
 
 /* Declarations for definitions in lguest_guest.S */
 extern char lguest_noirq_start[], lguest_noirq_end[];
@@ -54,7 +57,6 @@ struct lguest_data lguest_data = {
 	.blocked_interrupts = { 1 }, /* Block timer interrupts */
 };
 struct lguest_device_desc *lguest_devices;
-static __initdata const struct lguest_boot_info *boot = __va(0);
 
 static enum paravirt_lazy_mode lazy_mode;
 static void lguest_lazy_mode(enum paravirt_lazy_mode mode)
@@ -210,7 +212,7 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
 	case 1:	/* Basic feature request. */
 		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
 		*ecx &= 0x00002201;
-		/* Similarly: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
+		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
 		*edx &= 0x07808101;
 		/* Host wants to know when we flush kernel pages: set PGE. */
 		*edx |= 0x00002000;
@@ -346,24 +348,104 @@ static unsigned long lguest_get_wallclock(void)
 	return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);
 }
 
+static cycle_t lguest_clock_read(void)
+{
+	if (lguest_data.tsc_khz)
+		return native_read_tsc();
+	else
+		return jiffies;
+}
+
+/* This is what we tell the kernel is our clocksource. */
+static struct clocksource lguest_clock = {
+	.name		= "lguest",
+	.rating		= 400,
+	.read		= lguest_clock_read,
+};
+
+/* We also need a "struct clock_event_device": Linux asks us to set it to go
+ * off some time in the future. Actually, James Morris figured all this out, I
+ * just applied the patch. */
+static int lguest_clockevent_set_next_event(unsigned long delta,
+					    struct clock_event_device *evt)
+{
+	if (delta < LG_CLOCK_MIN_DELTA) {
+		if (printk_ratelimit())
+			printk(KERN_DEBUG "%s: small delta %lu ns\n",
+			       __FUNCTION__, delta);
+		return -ETIME;
+	}
+	hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
+	return 0;
+}
+
+static void lguest_clockevent_set_mode(enum clock_event_mode mode,
+				       struct clock_event_device *evt)
+{
+	switch (mode) {
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		/* A 0 argument shuts the clock down. */
+		hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0);
+		break;
+	case CLOCK_EVT_MODE_ONESHOT:
+		/* This is what we expect. */
+		break;
+	case CLOCK_EVT_MODE_PERIODIC:
+		BUG();
+	}
+}
+
+/* This describes our primitive timer chip. */
+static struct clock_event_device lguest_clockevent = {
+	.name		= "lguest",
+	.features	= CLOCK_EVT_FEAT_ONESHOT,
+	.set_next_event	= lguest_clockevent_set_next_event,
+	.set_mode	= lguest_clockevent_set_mode,
+	.rating		= INT_MAX,
+	.mult		= 1,
+	.shift		= 0,
+	.min_delta_ns	= LG_CLOCK_MIN_DELTA,
+	.max_delta_ns	= LG_CLOCK_MAX_DELTA,
+};
+
+/* This is the Guest timer interrupt handler (hardware interrupt 0). We just
+ * call the clockevent infrastructure and it does whatever needs doing. */
 static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
 {
-	do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0));
-	update_process_times(user_mode_vm(get_irq_regs()));
+	unsigned long flags;
+
+	/* Don't interrupt us while this is running. */
+	local_irq_save(flags);
+	lguest_clockevent.event_handler(&lguest_clockevent);
+	local_irq_restore(flags);
 }
 
-static u64 sched_clock_base;
 static void lguest_time_init(void)
 {
 	set_irq_handler(0, lguest_time_irq);
-	hcall(LHCALL_TIMER_READ, 0, 0, 0);
-	sched_clock_base = jiffies_64;
-	enable_lguest_irq(0);
-}
 
-static unsigned long long lguest_sched_clock(void)
-{
-	return (jiffies_64 - sched_clock_base) * (1000000000 / HZ);
+	/* We use the TSC if the Host tells us we can, otherwise a dumb
+	 * jiffies-based clock. */
+	if (lguest_data.tsc_khz) {
+		lguest_clock.shift = 22;
+		lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
+							 lguest_clock.shift);
+		lguest_clock.mask = CLOCKSOURCE_MASK(64);
+		lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS;
+	} else {
+		/* To understand this, start at kernel/time/jiffies.c... */
+		lguest_clock.shift = 8;
+		lguest_clock.mult = (((u64)NSEC_PER_SEC<<8)/ACTHZ) << 8;
+		lguest_clock.mask = CLOCKSOURCE_MASK(32);
+	}
+	clocksource_register(&lguest_clock);
+
+	/* We can't set cpumask in the initializer: damn C limitations! */
+	lguest_clockevent.cpumask = cpumask_of_cpu(0);
+	clockevents_register_device(&lguest_clockevent);
+
+	enable_lguest_irq(0);
 }
 
 static void lguest_load_esp0(struct tss_struct *tss,
@@ -418,8 +500,7 @@ static __init char *lguest_memory_setup(void)
 	/* We do this here because lockcheck barfs if before start_kernel */
 	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
 
-	e820.nr_map = 0;
-	add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM);
+	add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type);
 	return "LGUEST";
 }
 
@@ -450,8 +531,13 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
 	return insn_len;
 }
 
-__init void lguest_init(void)
+__init void lguest_init(void *boot)
 {
+	/* Copy boot parameters first. */
+	memcpy(&boot_params, boot, PARAM_SIZE);
+	memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr),
+	       COMMAND_LINE_SIZE);
+
 	paravirt_ops.name = "lguest";
 	paravirt_ops.paravirt_enabled = 1;
 	paravirt_ops.kernel_rpl = 1;
@@ -498,10 +584,8 @@ __init void lguest_init(void)
 	paravirt_ops.time_init = lguest_time_init;
 	paravirt_ops.set_lazy_mode = lguest_lazy_mode;
 	paravirt_ops.wbinvd = lguest_wbinvd;
-	paravirt_ops.sched_clock = lguest_sched_clock;
 
 	hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
-	strncpy(boot_command_line, boot->cmdline, COMMAND_LINE_SIZE);
 
 	/* We use top of mem for initial pagetables. */
 	init_pg_tables_end = __pa(pg0);
@@ -532,13 +616,6 @@ __init void lguest_init(void)
 
 	add_preferred_console("hvc", 0, NULL);
 
-	if (boot->initrd_size) {
-		/* We stash this at top of memory. */
-		INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size;
-		INITRD_SIZE = boot->initrd_size;
-		LOADER_TYPE = 0xFF;
-	}
-
 	pm_power_off = lguest_power_off;
 	start_kernel();
 }
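The clocksource registered above relies on the core's cycles-to-nanoseconds conversion, (cycles * mult) >> shift. A rough worked example of that arithmetic, under the assumption of a 2 GHz (2,000,000 kHz) TSC:

/* What the clocksource core computes for lguest_clock (sketch only). */
static u64 lguest_cycles_to_ns(u64 cycles, u32 mult, u32 shift)
{
	return (cycles * mult) >> shift;
}

/* With shift = 22, clocksource_khz2mult(2000000, 22) is roughly
 * (NSEC_PER_MSEC << 22) / 2000000 ≈ 2097152, so the 2,000,000 cycles produced
 * by one millisecond of TSC map back to about 1,000,000 ns. */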
diff --git a/drivers/lguest/lguest_asm.S b/drivers/lguest/lguest_asm.S
index 5ac3d20bb184..00046c57b5ba 100644
--- a/drivers/lguest/lguest_asm.S
+++ b/drivers/lguest/lguest_asm.S
@@ -10,7 +10,8 @@
10 * This is where we begin: we have a magic signature which the launcher looks 10 * This is where we begin: we have a magic signature which the launcher looks
11 * for. The plan is that the Linux boot protocol will be extended with a 11 * for. The plan is that the Linux boot protocol will be extended with a
12 * "platform type" field which will guide us here from the normal entry point, 12 * "platform type" field which will guide us here from the normal entry point,
13 * but for the moment this suffices. 13 * but for the moment this suffices. We pass the virtual address of the boot
14 * info to lguest_init().
14 * 15 *
15 * We put it in .init.text so it will be discarded after boot. 16 * We put it in .init.text so it will be discarded after boot.
16 */ 17 */
@@ -18,6 +19,8 @@
18.ascii "GenuineLguest" 19.ascii "GenuineLguest"
19 /* Set up initial stack. */ 20 /* Set up initial stack. */
20 movl $(init_thread_union+THREAD_SIZE),%esp 21 movl $(init_thread_union+THREAD_SIZE),%esp
22 movl %esi, %eax
23 addl $__PAGE_OFFSET, %eax
21 jmp lguest_init 24 jmp lguest_init
22 25
23/* The templates for inline patching. */ 26/* The templates for inline patching. */
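The two instructions added before the jmp convert the physical boot-info pointer the loader leaves in %esi into a virtual address, which is what lguest_init(void *boot) now expects. In C terms it is just __va(); a sketch under the assumption of the default i386 PAGE_OFFSET:

    /* Sketch: what the added movl/addl pair computes.  0xC0000000 is the
     * default i386 direct-map base; the real code uses __PAGE_OFFSET. */
    #define PAGE_OFFSET 0xC0000000UL

    static inline void *boot_info_virt(unsigned long boot_phys)
    {
        return (void *)(boot_phys + PAGE_OFFSET);   /* i.e. __va(boot_phys) */
    }

The result lands in %eax because, under the kernel's regparm(3) convention, that register carries the first argument to lguest_init().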
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
new file mode 100644
index 000000000000..e90d7a783daf
--- /dev/null
+++ b/drivers/lguest/lguest_user.c
@@ -0,0 +1,236 @@
1/* Userspace control of the guest, via /dev/lguest. */
2#include <linux/uaccess.h>
3#include <linux/miscdevice.h>
4#include <linux/fs.h>
5#include "lg.h"
6
7static void setup_regs(struct lguest_regs *regs, unsigned long start)
8{
9 /* Write out stack in format lguest expects, so we can switch to it. */
10 regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
11 regs->cs = __KERNEL_CS|GUEST_PL;
12 regs->eflags = 0x202; /* Interrupts enabled. */
13 regs->eip = start;
14 /* esi points to our boot information (physical address 0) */
15}
16
17/* + addr */
18static long user_get_dma(struct lguest *lg, const u32 __user *input)
19{
20 unsigned long key, udma, irq;
21
22 if (get_user(key, input) != 0)
23 return -EFAULT;
24 udma = get_dma_buffer(lg, key, &irq);
25 if (!udma)
26 return -ENOENT;
27
28 /* We put irq number in udma->used_len. */
29 lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
30 return udma;
31}
32
33/* To force the Guest to stop running and return to the Launcher, the
34 * Waker writes LHREQ_BREAK and the value "1" to /dev/lguest. The
35 * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */
36static int break_guest_out(struct lguest *lg, const u32 __user *input)
37{
38 unsigned long on;
39
40 /* Fetch whether they're turning break on or off.. */
41 if (get_user(on, input) != 0)
42 return -EFAULT;
43
44 if (on) {
45 lg->break_out = 1;
46 /* Pop it out (may be running on different CPU) */
47 wake_up_process(lg->tsk);
48 /* Wait for them to reset it */
49 return wait_event_interruptible(lg->break_wq, !lg->break_out);
50 } else {
51 lg->break_out = 0;
52 wake_up(&lg->break_wq);
53 return 0;
54 }
55}
56
57/* + irq */
58static int user_send_irq(struct lguest *lg, const u32 __user *input)
59{
60 u32 irq;
61
62 if (get_user(irq, input) != 0)
63 return -EFAULT;
64 if (irq >= LGUEST_IRQS)
65 return -EINVAL;
66 set_bit(irq, lg->irqs_pending);
67 return 0;
68}
69
70static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
71{
72 struct lguest *lg = file->private_data;
73
74 if (!lg)
75 return -EINVAL;
76
77 /* If you're not the task which owns the guest, go away. */
78 if (current != lg->tsk)
79 return -EPERM;
80
81 if (lg->dead) {
82 size_t len;
83
84 if (IS_ERR(lg->dead))
85 return PTR_ERR(lg->dead);
86
87 len = min(size, strlen(lg->dead)+1);
88 if (copy_to_user(user, lg->dead, len) != 0)
89 return -EFAULT;
90 return len;
91 }
92
93 if (lg->dma_is_pending)
94 lg->dma_is_pending = 0;
95
96 return run_guest(lg, (unsigned long __user *)user);
97}
98
99/* Take: pfnlimit, pgdir, start, pageoffset. */
100static int initialize(struct file *file, const u32 __user *input)
101{
102 struct lguest *lg;
103 int err, i;
104 u32 args[4];
105
106 /* We grab the Big Lguest lock, which protects the global array
107 * "lguests" and multiple simultaneous initializations. */
108 mutex_lock(&lguest_lock);
109
110 if (file->private_data) {
111 err = -EBUSY;
112 goto unlock;
113 }
114
115 if (copy_from_user(args, input, sizeof(args)) != 0) {
116 err = -EFAULT;
117 goto unlock;
118 }
119
120 i = find_free_guest();
121 if (i < 0) {
122 err = -ENOSPC;
123 goto unlock;
124 }
125 lg = &lguests[i];
126 lg->guestid = i;
127 lg->pfn_limit = args[0];
128 lg->page_offset = args[3];
129 lg->regs_page = get_zeroed_page(GFP_KERNEL);
130 if (!lg->regs_page) {
131 err = -ENOMEM;
132 goto release_guest;
133 }
134 lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs);
135
136 err = init_guest_pagetable(lg, args[1]);
137 if (err)
138 goto free_regs;
139
140 setup_regs(lg->regs, args[2]);
141 setup_guest_gdt(lg);
142 init_clockdev(lg);
143 lg->tsk = current;
144 lg->mm = get_task_mm(lg->tsk);
145 init_waitqueue_head(&lg->break_wq);
146 lg->last_pages = NULL;
147 file->private_data = lg;
148
149 mutex_unlock(&lguest_lock);
150
151 return sizeof(args);
152
153free_regs:
154 free_page(lg->regs_page);
155release_guest:
156 memset(lg, 0, sizeof(*lg));
157unlock:
158 mutex_unlock(&lguest_lock);
159 return err;
160}
161
162static ssize_t write(struct file *file, const char __user *input,
163 size_t size, loff_t *off)
164{
165 struct lguest *lg = file->private_data;
166 u32 req;
167
168 if (get_user(req, input) != 0)
169 return -EFAULT;
170 input += sizeof(req);
171
172 if (req != LHREQ_INITIALIZE && !lg)
173 return -EINVAL;
174 if (lg && lg->dead)
175 return -ENOENT;
176
177 /* If you're not the task which owns the Guest, you can only break */
178 if (lg && current != lg->tsk && req != LHREQ_BREAK)
179 return -EPERM;
180
181 switch (req) {
182 case LHREQ_INITIALIZE:
183 return initialize(file, (const u32 __user *)input);
184 case LHREQ_GETDMA:
185 return user_get_dma(lg, (const u32 __user *)input);
186 case LHREQ_IRQ:
187 return user_send_irq(lg, (const u32 __user *)input);
188 case LHREQ_BREAK:
189 return break_guest_out(lg, (const u32 __user *)input);
190 default:
191 return -EINVAL;
192 }
193}
194
195static int close(struct inode *inode, struct file *file)
196{
197 struct lguest *lg = file->private_data;
198
199 if (!lg)
200 return 0;
201
202 mutex_lock(&lguest_lock);
203 /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
204 hrtimer_cancel(&lg->hrt);
205 release_all_dma(lg);
206 free_guest_pagetable(lg);
207 mmput(lg->mm);
208 if (!IS_ERR(lg->dead))
209 kfree(lg->dead);
210 free_page(lg->regs_page);
211 memset(lg, 0, sizeof(*lg));
212 mutex_unlock(&lguest_lock);
213 return 0;
214}
215
216static struct file_operations lguest_fops = {
217 .owner = THIS_MODULE,
218 .release = close,
219 .write = write,
220 .read = read,
221};
222static struct miscdevice lguest_dev = {
223 .minor = MISC_DYNAMIC_MINOR,
224 .name = "lguest",
225 .fops = &lguest_fops,
226};
227
228int __init lguest_device_init(void)
229{
230 return misc_register(&lguest_dev);
231}
232
233void __exit lguest_device_remove(void)
234{
235 misc_deregister(&lguest_dev);
236}
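Taken together, write() and read() define the whole /dev/lguest protocol: one LHREQ_INITIALIZE write with four u32 arguments, then repeated reads to run the Guest. A hedged userspace sketch of a Launcher main loop (the numeric arguments are purely illustrative, and error handling is abbreviated):

    /* Sketch of the Launcher side of the protocol implemented above. */
    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    int main(void)
    {
        uint32_t init[5] = {
            0,          /* LHREQ_INITIALIZE */
            0x8000,     /* pfnlimit: illustrative 128MB guest */
            0x1000,     /* pgdir: guest-physical top-level pagetable */
            0x100000,   /* start: guest entry point */
            0xC0000000  /* pageoffset */
        };
        unsigned long notify;
        int fd = open("/dev/lguest", O_RDWR);

        if (fd < 0 || write(fd, init, sizeof(init)) < 0)
            return 1;

        /* Each read() runs the Guest until it needs service; run_guest()
         * fills in the value we are being notified about. */
        while (read(fd, &notify, sizeof(notify)) > 0)
            ;   /* service DMA keys, send LHREQ_IRQ replies, ... */
        return 0;
    }

A real Launcher would also spawn the Waker described above, which uses LHREQ_BREAK to yank the Guest back out of read() when there is I/O to deliver.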
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
new file mode 100644
index 000000000000..1b0ba09b1269
--- /dev/null
+++ b/drivers/lguest/page_tables.c
@@ -0,0 +1,411 @@
1/* Shadow page table operations.
2 * Copyright (C) Rusty Russell IBM Corporation 2006.
3 * GPL v2 and any later version */
4#include <linux/mm.h>
5#include <linux/types.h>
6#include <linux/spinlock.h>
7#include <linux/random.h>
8#include <linux/percpu.h>
9#include <asm/tlbflush.h>
10#include "lg.h"
11
12#define PTES_PER_PAGE_SHIFT 10
13#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
14#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1)
15
16static DEFINE_PER_CPU(spte_t *, switcher_pte_pages);
17#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
18
19static unsigned vaddr_to_pgd_index(unsigned long vaddr)
20{
21 return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
22}
23
24/* These access the shadow versions (ie. the ones used by the CPU). */
25static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
26{
27 unsigned int index = vaddr_to_pgd_index(vaddr);
28
29 if (index >= SWITCHER_PGD_INDEX) {
30 kill_guest(lg, "attempt to access switcher pages");
31 index = 0;
32 }
33 return &lg->pgdirs[i].pgdir[index];
34}
35
36static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr)
37{
38 spte_t *page = __va(spgd.pfn << PAGE_SHIFT);
39 BUG_ON(!(spgd.flags & _PAGE_PRESENT));
40 return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
41}
42
43/* These access the guest versions. */
44static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
45{
46 unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
47 return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t);
48}
49
50static unsigned long gpte_addr(struct lguest *lg,
51 gpgd_t gpgd, unsigned long vaddr)
52{
53 unsigned long gpage = gpgd.pfn << PAGE_SHIFT;
54 BUG_ON(!(gpgd.flags & _PAGE_PRESENT));
55 return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t);
56}
57
58/* Do a virtual -> physical mapping on a user page. */
59static unsigned long get_pfn(unsigned long virtpfn, int write)
60{
61 struct page *page;
62 unsigned long ret = -1UL;
63
64 down_read(&current->mm->mmap_sem);
65 if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
66 1, write, 1, &page, NULL) == 1)
67 ret = page_to_pfn(page);
68 up_read(&current->mm->mmap_sem);
69 return ret;
70}
71
72static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write)
73{
74 spte_t spte;
75 unsigned long pfn;
76
77 /* We ignore the global flag. */
78 spte.flags = (gpte.flags & ~_PAGE_GLOBAL);
79 pfn = get_pfn(gpte.pfn, write);
80 if (pfn == -1UL) {
81 kill_guest(lg, "failed to get page %u", gpte.pfn);
82 /* Must not put_page() bogus page on cleanup. */
83 spte.flags = 0;
84 }
85 spte.pfn = pfn;
86 return spte;
87}
88
89static void release_pte(spte_t pte)
90{
91 if (pte.flags & _PAGE_PRESENT)
92 put_page(pfn_to_page(pte.pfn));
93}
94
95static void check_gpte(struct lguest *lg, gpte_t gpte)
96{
97 if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit)
98 kill_guest(lg, "bad page table entry");
99}
100
101static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
102{
103 if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit)
104 kill_guest(lg, "bad page directory entry");
105}
106
107/* FIXME: We hold a reference to pages, which prevents them from being
108 swapped. It'd be nice to have a callback when Linux wants to swap out. */
109
110/* We fault pages in, which allows us to update accessed/dirty bits.
111 * Return true if we got the page. */
112int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
113{
114 gpgd_t gpgd;
115 spgd_t *spgd;
116 unsigned long gpte_ptr;
117 gpte_t gpte;
118 spte_t *spte;
119
120 gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
121 if (!(gpgd.flags & _PAGE_PRESENT))
122 return 0;
123
124 spgd = spgd_addr(lg, lg->pgdidx, vaddr);
125 if (!(spgd->flags & _PAGE_PRESENT)) {
126 /* Get a page of PTEs for them. */
127 unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
128 /* FIXME: Steal from self in this case? */
129 if (!ptepage) {
130 kill_guest(lg, "out of memory allocating pte page");
131 return 0;
132 }
133 check_gpgd(lg, gpgd);
134 spgd->raw.val = (__pa(ptepage) | gpgd.flags);
135 }
136
137 gpte_ptr = gpte_addr(lg, gpgd, vaddr);
138 gpte = mkgpte(lgread_u32(lg, gpte_ptr));
139
140 /* No page? */
141 if (!(gpte.flags & _PAGE_PRESENT))
142 return 0;
143
144 /* Write to read-only page? */
145 if ((errcode & 2) && !(gpte.flags & _PAGE_RW))
146 return 0;
147
148 /* User access to a non-user page? */
149 if ((errcode & 4) && !(gpte.flags & _PAGE_USER))
150 return 0;
151
152 check_gpte(lg, gpte);
153 gpte.flags |= _PAGE_ACCESSED;
154 if (errcode & 2)
155 gpte.flags |= _PAGE_DIRTY;
156
157 /* We're done with the old pte. */
158 spte = spte_addr(lg, *spgd, vaddr);
159 release_pte(*spte);
160
161	/* We don't make it writable if this isn't a write: a later
162	 * write will fault so we can set the dirty bit in the guest. */
163 if (gpte.flags & _PAGE_DIRTY)
164 *spte = gpte_to_spte(lg, gpte, 1);
165 else {
166 gpte_t ro_gpte = gpte;
167 ro_gpte.flags &= ~_PAGE_RW;
168 *spte = gpte_to_spte(lg, ro_gpte, 0);
169 }
170
171 /* Now we update dirty/accessed on guest. */
172 lgwrite_u32(lg, gpte_ptr, gpte.raw.val);
173 return 1;
174}
175
176/* This is much faster than the full demand_page logic. */
177static int page_writable(struct lguest *lg, unsigned long vaddr)
178{
179 spgd_t *spgd;
180 unsigned long flags;
181
182 spgd = spgd_addr(lg, lg->pgdidx, vaddr);
183 if (!(spgd->flags & _PAGE_PRESENT))
184 return 0;
185
186 flags = spte_addr(lg, *spgd, vaddr)->flags;
187 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
188}
189
190void pin_page(struct lguest *lg, unsigned long vaddr)
191{
192 if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2))
193 kill_guest(lg, "bad stack page %#lx", vaddr);
194}
195
196static void release_pgd(struct lguest *lg, spgd_t *spgd)
197{
198 if (spgd->flags & _PAGE_PRESENT) {
199 unsigned int i;
200 spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT);
201 for (i = 0; i < PTES_PER_PAGE; i++)
202 release_pte(ptepage[i]);
203 free_page((long)ptepage);
204 spgd->raw.val = 0;
205 }
206}
207
208static void flush_user_mappings(struct lguest *lg, int idx)
209{
210 unsigned int i;
211 for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++)
212 release_pgd(lg, lg->pgdirs[idx].pgdir + i);
213}
214
215void guest_pagetable_flush_user(struct lguest *lg)
216{
217 flush_user_mappings(lg, lg->pgdidx);
218}
219
220static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
221{
222 unsigned int i;
223 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
224 if (lg->pgdirs[i].cr3 == pgtable)
225 break;
226 return i;
227}
228
229static unsigned int new_pgdir(struct lguest *lg,
230 unsigned long cr3,
231 int *blank_pgdir)
232{
233 unsigned int next;
234
235 next = random32() % ARRAY_SIZE(lg->pgdirs);
236 if (!lg->pgdirs[next].pgdir) {
237 lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL);
238 if (!lg->pgdirs[next].pgdir)
239 next = lg->pgdidx;
240 else
241 /* There are no mappings: you'll need to re-pin */
242 *blank_pgdir = 1;
243 }
244 lg->pgdirs[next].cr3 = cr3;
245 /* Release all the non-kernel mappings. */
246 flush_user_mappings(lg, next);
247
248 return next;
249}
250
251void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
252{
253 int newpgdir, repin = 0;
254
255 newpgdir = find_pgdir(lg, pgtable);
256 if (newpgdir == ARRAY_SIZE(lg->pgdirs))
257 newpgdir = new_pgdir(lg, pgtable, &repin);
258 lg->pgdidx = newpgdir;
259 if (repin)
260 pin_stack_pages(lg);
261}
262
263static void release_all_pagetables(struct lguest *lg)
264{
265 unsigned int i, j;
266
267 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
268 if (lg->pgdirs[i].pgdir)
269 for (j = 0; j < SWITCHER_PGD_INDEX; j++)
270 release_pgd(lg, lg->pgdirs[i].pgdir + j);
271}
272
273void guest_pagetable_clear_all(struct lguest *lg)
274{
275 release_all_pagetables(lg);
276 pin_stack_pages(lg);
277}
278
279static void do_set_pte(struct lguest *lg, int idx,
280 unsigned long vaddr, gpte_t gpte)
281{
282 spgd_t *spgd = spgd_addr(lg, idx, vaddr);
283 if (spgd->flags & _PAGE_PRESENT) {
284 spte_t *spte = spte_addr(lg, *spgd, vaddr);
285 release_pte(*spte);
286 if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
287 check_gpte(lg, gpte);
288 *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY);
289 } else
290 spte->raw.val = 0;
291 }
292}
293
294void guest_set_pte(struct lguest *lg,
295 unsigned long cr3, unsigned long vaddr, gpte_t gpte)
296{
297 /* Kernel mappings must be changed on all top levels. */
298 if (vaddr >= lg->page_offset) {
299 unsigned int i;
300 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
301 if (lg->pgdirs[i].pgdir)
302 do_set_pte(lg, i, vaddr, gpte);
303 } else {
304 int pgdir = find_pgdir(lg, cr3);
305 if (pgdir != ARRAY_SIZE(lg->pgdirs))
306 do_set_pte(lg, pgdir, vaddr, gpte);
307 }
308}
309
310void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
311{
312 int pgdir;
313
314 if (idx >= SWITCHER_PGD_INDEX)
315 return;
316
317 pgdir = find_pgdir(lg, cr3);
318 if (pgdir < ARRAY_SIZE(lg->pgdirs))
319 release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
320}
321
322int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
323{
324 /* We assume this in flush_user_mappings, so check now */
325 if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
326 return -EINVAL;
327 lg->pgdidx = 0;
328 lg->pgdirs[lg->pgdidx].cr3 = pgtable;
329 lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL);
330 if (!lg->pgdirs[lg->pgdidx].pgdir)
331 return -ENOMEM;
332 return 0;
333}
334
335void free_guest_pagetable(struct lguest *lg)
336{
337 unsigned int i;
338
339 release_all_pagetables(lg);
340 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
341 free_page((long)lg->pgdirs[i].pgdir);
342}
343
344/* Caller must be preempt-safe */
345void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
346{
347 spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
348 spgd_t switcher_pgd;
349 spte_t regs_pte;
350
351	/* Since the switcher is less than 4MB, we simply mug the top pte page. */
352 switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT;
353 switcher_pgd.flags = _PAGE_KERNEL;
354 lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
355
356 /* Map our regs page over stack page. */
357 regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT;
358 regs_pte.flags = _PAGE_KERNEL;
359 switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE]
360 = regs_pte;
361}
362
363static void free_switcher_pte_pages(void)
364{
365 unsigned int i;
366
367 for_each_possible_cpu(i)
368 free_page((long)switcher_pte_page(i));
369}
370
371static __init void populate_switcher_pte_page(unsigned int cpu,
372 struct page *switcher_page[],
373 unsigned int pages)
374{
375 unsigned int i;
376 spte_t *pte = switcher_pte_page(cpu);
377
378 for (i = 0; i < pages; i++) {
379 pte[i].pfn = page_to_pfn(switcher_page[i]);
380 pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
381 }
382
383 /* We only map this CPU's pages, so guest can't see others. */
384 i = pages + cpu*2;
385
386 /* First page (regs) is rw, second (state) is ro. */
387 pte[i].pfn = page_to_pfn(switcher_page[i]);
388 pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW;
389 pte[i+1].pfn = page_to_pfn(switcher_page[i+1]);
390 pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
391}
392
393__init int init_pagetables(struct page **switcher_page, unsigned int pages)
394{
395 unsigned int i;
396
397 for_each_possible_cpu(i) {
398 switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL);
399 if (!switcher_pte_page(i)) {
400 free_switcher_pte_pages();
401 return -ENOMEM;
402 }
403 populate_switcher_pte_page(i, switcher_page, pages);
404 }
405 return 0;
406}
407
408void free_pagetables(void)
409{
410 free_switcher_pte_pages();
411}
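All of the shadow lookups above rely on the standard i386 two-level split of a virtual address: the top 10 bits pick the page directory entry, the next 10 bits pick the PTE, and the low 12 bits are the page offset. A small worked example of the arithmetic in vaddr_to_pgd_index() and spte_addr() (the address is arbitrary):

    /* Sketch: the address split used by the shadow page table code. */
    #include <stdio.h>

    #define PAGE_SHIFT          12
    #define PTES_PER_PAGE_SHIFT 10
    #define PTES_PER_PAGE       (1 << PTES_PER_PAGE_SHIFT)

    int main(void)
    {
        unsigned long vaddr = 0xC0123456UL;  /* an arbitrary kernel address */
        unsigned pgd_idx = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
        unsigned pte_idx = (vaddr >> PAGE_SHIFT) % PTES_PER_PAGE;

        /* Prints: pgd index 768, pte index 291, offset 0x456 */
        printf("pgd index %u, pte index %u, offset 0x%lx\n",
               pgd_idx, pte_idx, vaddr & ((1UL << PAGE_SHIFT) - 1));
        return 0;
    }

The errcode tests in demand_page() follow the usual x86 page-fault error code: bit 1 set means the access was a write, bit 2 set means it came from user mode.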
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
new file mode 100644
index 000000000000..1b2cfe89dcd5
--- /dev/null
+++ b/drivers/lguest/segments.c
@@ -0,0 +1,125 @@
1#include "lg.h"
2
3static int desc_ok(const struct desc_struct *gdt)
4{
5 /* MBZ=0, P=1, DT=1 */
6 return ((gdt->b & 0x00209000) == 0x00009000);
7}
8
9static int segment_present(const struct desc_struct *gdt)
10{
11 return gdt->b & 0x8000;
12}
13
14static int ignored_gdt(unsigned int num)
15{
16 return (num == GDT_ENTRY_TSS
17 || num == GDT_ENTRY_LGUEST_CS
18 || num == GDT_ENTRY_LGUEST_DS
19 || num == GDT_ENTRY_DOUBLEFAULT_TSS);
20}
21
22/* We don't allow removal of CS, DS or SS; it doesn't make sense. */
23static void check_segment_use(struct lguest *lg, unsigned int desc)
24{
25 if (lg->regs->gs / 8 == desc)
26 lg->regs->gs = 0;
27 if (lg->regs->fs / 8 == desc)
28 lg->regs->fs = 0;
29 if (lg->regs->es / 8 == desc)
30 lg->regs->es = 0;
31 if (lg->regs->ds / 8 == desc
32 || lg->regs->cs / 8 == desc
33 || lg->regs->ss / 8 == desc)
34 kill_guest(lg, "Removed live GDT entry %u", desc);
35}
36
37static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
38{
39 unsigned int i;
40
41 for (i = start; i < end; i++) {
42 /* We never copy these ones to real gdt */
43 if (ignored_gdt(i))
44 continue;
45
46 /* We could fault in switch_to_guest if they are using
47 * a removed segment. */
48 if (!segment_present(&lg->gdt[i])) {
49 check_segment_use(lg, i);
50 continue;
51 }
52
53 if (!desc_ok(&lg->gdt[i]))
54 kill_guest(lg, "Bad GDT descriptor %i", i);
55
56 /* DPL 0 presumably means "for use by guest". */
57 if ((lg->gdt[i].b & 0x00006000) == 0)
58 lg->gdt[i].b |= (GUEST_PL << 13);
59
60 /* Set accessed bit, since gdt isn't writable. */
61 lg->gdt[i].b |= 0x00000100;
62 }
63}
64
65void setup_default_gdt_entries(struct lguest_ro_state *state)
66{
67 struct desc_struct *gdt = state->guest_gdt;
68 unsigned long tss = (unsigned long)&state->guest_tss;
69
70 /* Hypervisor segments. */
71 gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
72 gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
73
74	/* This is the one which we *cannot* copy from the guest, since the tss
75	   depends on this lguest_ro_state, ie. this cpu. */
76 gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
77 gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
78 | ((tss >> 16) & 0x000000FF);
79}
80
81void setup_guest_gdt(struct lguest *lg)
82{
83 lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
84 lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
85 lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
86 lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
87}
88
89/* This is a fast version for the common case where only the three TLS entries
90 * have changed. */
91void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
92{
93 unsigned int i;
94
95 for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
96 gdt[i] = lg->gdt[i];
97}
98
99void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
100{
101 unsigned int i;
102
103 for (i = 0; i < GDT_ENTRIES; i++)
104 if (!ignored_gdt(i))
105 gdt[i] = lg->gdt[i];
106}
107
108void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
109{
110 if (num > ARRAY_SIZE(lg->gdt))
111 kill_guest(lg, "too many gdt entries %i", num);
112
113 lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0]));
114 fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt));
115 lg->changed |= CHANGED_GDT;
116}
117
118void guest_load_tls(struct lguest *lg, unsigned long gtls)
119{
120 struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN];
121
122 lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
123 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
124 lg->changed |= CHANGED_GDT_TLS;
125}
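desc_ok() and fixup_gdt_table() above work entirely on the descriptor's high dword, using the standard i386 segment descriptor layout. A sketch spelling out the magic constants (nothing here is lguest-specific except GUEST_PL, which the rest of the patch uses as ring 1):

    /* Sketch: the bits behind the constants used above.
     * 0x00008000 = P (present), 0x00001000 = S/"DT" (code or data segment),
     * 0x00200000 = reserved, must be zero, 0x00006000 = DPL field (bits 13-14),
     * 0x00000100 = accessed bit of the type field. */
    #include <stdint.h>

    #define GUEST_PL 1  /* guest kernel runs at privilege level 1 */

    static int desc_ok(uint32_t b)
    {
        return (b & 0x00209000) == 0x00009000;  /* MBZ=0, P=1, DT=1 */
    }

    static uint32_t fixup(uint32_t b)
    {
        if ((b & 0x00006000) == 0)   /* DPL 0: meant for guest kernel use */
            b |= GUEST_PL << 13;     /* demote it to DPL 1 */
        return b | 0x00000100;       /* pre-set the accessed bit */
    }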
diff --git a/drivers/lguest/switcher.S b/drivers/lguest/switcher.S
new file mode 100644
index 000000000000..eadd4cc299d2
--- /dev/null
+++ b/drivers/lguest/switcher.S
@@ -0,0 +1,159 @@
1/* This code sits at 0xFFC00000 to do the low-level guest<->host switch.
2
3 There are two pages above us for this CPU (struct lguest_pages).
4 The second page (struct lguest_ro_state) becomes read-only after the
5 context switch. The first page (the stack for traps) remains writable,
6 but while we're in here, the guest cannot be running.
7*/
8#include <linux/linkage.h>
9#include <asm/asm-offsets.h>
10#include "lg.h"
11
12.text
13ENTRY(start_switcher_text)
14
15/* %eax points to lguest pages for this CPU. %ebx contains cr3 value.
16 All normal registers can be clobbered! */
17ENTRY(switch_to_guest)
18 /* Save host segments on host stack. */
19 pushl %es
20 pushl %ds
21 pushl %gs
22 pushl %fs
23 /* With CONFIG_FRAME_POINTER, gcc doesn't let us clobber this! */
24 pushl %ebp
25 /* Save host stack. */
26 movl %esp, LGUEST_PAGES_host_sp(%eax)
27 /* Switch to guest stack: if we get NMI we expect to be there. */
28 movl %eax, %edx
29 addl $LGUEST_PAGES_regs, %edx
30 movl %edx, %esp
31 /* Switch to guest's GDT, IDT. */
32 lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
33 lidt LGUEST_PAGES_guest_idt_desc(%eax)
34 /* Switch to guest's TSS while GDT still writable. */
35 movl $(GDT_ENTRY_TSS*8), %edx
36 ltr %dx
37 /* Set host's TSS GDT entry to available (clear byte 5 bit 2). */
38 movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
39 andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
40 /* Switch to guest page tables: lguest_pages->state now read-only. */
41 movl %ebx, %cr3
42 /* Restore guest regs */
43 popl %ebx
44 popl %ecx
45 popl %edx
46 popl %esi
47 popl %edi
48 popl %ebp
49 popl %gs
50 popl %eax
51 popl %fs
52 popl %ds
53 popl %es
54 /* Skip error code and trap number */
55 addl $8, %esp
56 iret
57
58#define SWITCH_TO_HOST \
59 /* Save guest state */ \
60 pushl %es; \
61 pushl %ds; \
62 pushl %fs; \
63 pushl %eax; \
64 pushl %gs; \
65 pushl %ebp; \
66 pushl %edi; \
67 pushl %esi; \
68 pushl %edx; \
69 pushl %ecx; \
70 pushl %ebx; \
71 /* Load lguest ds segment for convenience. */ \
72 movl $(LGUEST_DS), %eax; \
73 movl %eax, %ds; \
74 /* Figure out where we are, based on stack (at top of regs). */ \
75 movl %esp, %eax; \
76 subl $LGUEST_PAGES_regs, %eax; \
77 /* Put trap number in %ebx before we switch cr3 and lose it. */ \
78 movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
79 /* Switch to host page tables (host GDT, IDT and stack are in host \
80 mem, so need this first) */ \
81 movl LGUEST_PAGES_host_cr3(%eax), %edx; \
82 movl %edx, %cr3; \
83 /* Set guest's TSS to available (clear byte 5 bit 2). */ \
84 andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
85 /* Switch to host's GDT & IDT. */ \
86 lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
87 lidt LGUEST_PAGES_host_idt_desc(%eax); \
88 /* Switch to host's stack. */ \
89 movl LGUEST_PAGES_host_sp(%eax), %esp; \
90 /* Switch to host's TSS */ \
91 movl $(GDT_ENTRY_TSS*8), %edx; \
92 ltr %dx; \
93 popl %ebp; \
94 popl %fs; \
95 popl %gs; \
96 popl %ds; \
97 popl %es
98
99/* Return to run_guest_once. */
100return_to_host:
101 SWITCH_TO_HOST
102 iret
103
104deliver_to_host:
105 SWITCH_TO_HOST
106	/* Decode IDT and jump to the host's irq handler. When that does iret, it
107 * will return to run_guest_once. This is a feature. */
108 movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
109 leal (%edx,%ebx,8), %eax
110 movzwl (%eax),%edx
111 movl 4(%eax), %eax
112 xorw %ax, %ax
113 orl %eax, %edx
114 jmp *%edx
115
116/* Real hardware interrupts are delivered straight to the host. Others
117 cause us to return to run_guest_once so it can decide what to do. Note
118 that some of these are overridden by the guest to deliver directly, and
119 never enter here (see load_guest_idt_entry). */
120.macro IRQ_STUB N TARGET
121 .data; .long 1f; .text; 1:
122 /* Make an error number for most traps, which don't have one. */
123 .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
124 pushl $0
125 .endif
126 pushl $\N
127 jmp \TARGET
128 ALIGN
129.endm
130
131.macro IRQ_STUBS FIRST LAST TARGET
132 irq=\FIRST
133 .rept \LAST-\FIRST+1
134 IRQ_STUB irq \TARGET
135 irq=irq+1
136 .endr
137.endm
138
139/* We intercept every interrupt, because we may need to switch back to
140 * host. Unfortunately we can't tell them apart except by entry
141 * point, so we need 256 entry points.
142 */
143.data
144.global default_idt_entries
145default_idt_entries:
146.text
147 IRQ_STUBS 0 1 return_to_host /* First two traps */
148 IRQ_STUB 2 handle_nmi /* NMI */
149 IRQ_STUBS 3 31 return_to_host /* Rest of traps */
150 IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */
151 IRQ_STUB 128 return_to_host /* System call (overridden) */
152 IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */
153
154/* We ignore NMI and return. */
155handle_nmi:
156 addl $8, %esp
157 iret
158
159ENTRY(end_switcher_text)
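The .if directive in IRQ_STUB encodes which exceptions the CPU already supplies an error code for (8, 10-14 and 17); every other vector gets a dummy one so each trap frame reaching SWITCH_TO_HOST has the same layout. The same predicate written out in C, as a sketch:

    /* Sketch: traps for which the CPU pushes an error code, mirroring the
     * .if condition in IRQ_STUB. */
    static int cpu_pushes_error_code(unsigned int trapnum)
    {
        return trapnum == 8 ||                         /* double fault */
               (trapnum >= 10 && trapnum <= 14) ||     /* invalid TSS ... page fault */
               trapnum == 17;                          /* alignment check */
    }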
diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h
index 62c091ffcccc..a4d806610b7f 100644
--- a/include/asm-i386/tsc.h
+++ b/include/asm-i386/tsc.h
@@ -63,6 +63,7 @@ extern void tsc_init(void);
63extern void mark_tsc_unstable(char *reason); 63extern void mark_tsc_unstable(char *reason);
64extern int unsynchronized_tsc(void); 64extern int unsynchronized_tsc(void);
65extern void init_tsc_clocksource(void); 65extern void init_tsc_clocksource(void);
66int check_tsc_unstable(void);
66 67
67/* 68/*
68 * Boot-time check whether the TSCs are synchronized across 69 * Boot-time check whether the TSCs are synchronized across
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
index f30c04fc22b7..500aace21ca7 100644
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -3,11 +3,6 @@
3#ifndef _ASM_LGUEST_H 3#ifndef _ASM_LGUEST_H
4#define _ASM_LGUEST_H 4#define _ASM_LGUEST_H
5 5
6/* These are randomly chosen numbers which indicate we're an lguest at boot */
7#define LGUEST_MAGIC_EBP 0x4C687970
8#define LGUEST_MAGIC_EDI 0x652D4D65
9#define LGUEST_MAGIC_ESI 0xFFFFFFFF
10
11#ifndef __ASSEMBLY__ 6#ifndef __ASSEMBLY__
12#include <asm/irq.h> 7#include <asm/irq.h>
13 8
@@ -20,7 +15,7 @@
20#define LHCALL_LOAD_IDT_ENTRY 6 15#define LHCALL_LOAD_IDT_ENTRY 6
21#define LHCALL_SET_STACK 7 16#define LHCALL_SET_STACK 7
22#define LHCALL_TS 8 17#define LHCALL_TS 8
23#define LHCALL_TIMER_READ 9 18#define LHCALL_SET_CLOCKEVENT 9
24#define LHCALL_HALT 10 19#define LHCALL_HALT 10
25#define LHCALL_GET_WALLCLOCK 11 20#define LHCALL_GET_WALLCLOCK 11
26#define LHCALL_BIND_DMA 12 21#define LHCALL_BIND_DMA 12
@@ -29,6 +24,9 @@
29#define LHCALL_SET_PMD 15 24#define LHCALL_SET_PMD 15
30#define LHCALL_LOAD_TLS 16 25#define LHCALL_LOAD_TLS 16
31 26
27#define LG_CLOCK_MIN_DELTA 100UL
28#define LG_CLOCK_MAX_DELTA ULONG_MAX
29
32#define LGUEST_TRAP_ENTRY 0x1F 30#define LGUEST_TRAP_ENTRY 0x1F
33 31
34static inline unsigned long 32static inline unsigned long
@@ -75,6 +73,8 @@ struct lguest_data
75 unsigned long reserve_mem; 73 unsigned long reserve_mem;
76 /* ID of this guest (used by network driver to set ethernet address) */ 74 /* ID of this guest (used by network driver to set ethernet address) */
77 u16 guestid; 75 u16 guestid;
76 /* KHz for the TSC clock. */
77 u32 tsc_khz;
78 78
79/* Fields initialized by the guest at boot: */ 79/* Fields initialized by the guest at boot: */
80 /* Instruction range to suppress interrupts even if enabled */ 80 /* Instruction range to suppress interrupts even if enabled */
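LHCALL_SET_CLOCKEVENT replaces the old LHCALL_TIMER_READ: the Guest asks for its next timer interrupt as a delta, bounded by LG_CLOCK_MIN_DELTA and LG_CLOCK_MAX_DELTA, and the Host arms the hrtimer that close() in lguest_user.c cancels. A hedged sketch of the Guest side, using the four-argument hcall() form seen in lguest_init(); the function shape and the clamping behaviour are assumptions, not taken from the patch:

    /* Sketch: programming the next guest timer tick via hypercall. */
    static int lguest_set_next_event(unsigned long delta)
    {
        /* Assumed: the Host rejects deltas below LG_CLOCK_MIN_DELTA. */
        if (delta < LG_CLOCK_MIN_DELTA)
            return -ETIME;
        hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
        return 0;
    }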
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
new file mode 100644
index 000000000000..0ba414a40c80
--- /dev/null
+++ b/include/linux/lguest_launcher.h
@@ -0,0 +1,73 @@
1#ifndef _ASM_LGUEST_USER
2#define _ASM_LGUEST_USER
3/* Everything the "lguest" userspace program needs to know. */
4/* They can register up to 32 arrays of lguest_dma. */
5#define LGUEST_MAX_DMA 32
6/* At most we can dma 16 lguest_dma in one op. */
7#define LGUEST_MAX_DMA_SECTIONS 16
8
9/* How many devices? Assume each device wants up to two dma arrays. */
10#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2)
11
12struct lguest_dma
13{
14 /* 0 if free to be used, filled by hypervisor. */
15 u32 used_len;
16 unsigned long addr[LGUEST_MAX_DMA_SECTIONS];
17 u16 len[LGUEST_MAX_DMA_SECTIONS];
18};
19
20struct lguest_block_page
21{
22 /* 0 is a read, 1 is a write. */
23 int type;
24 u32 sector; /* Offset in device = sector * 512. */
25 u32 bytes; /* Length expected to be read/written in bytes */
26 /* 0 = pending, 1 = done, 2 = done, error */
27 int result;
28 u32 num_sectors; /* Disk length = num_sectors * 512 */
29};
30
31/* There is a shared page of these. */
32struct lguest_net
33{
34 /* Simply the mac address (with multicast bit meaning promisc). */
35 unsigned char mac[6];
36};
37
38/* Where the Host expects the Guest to SEND_DMA console output to. */
39#define LGUEST_CONSOLE_DMA_KEY 0
40
41/* We have a page of these descriptors in the lguest_device page. */
42struct lguest_device_desc {
43 u16 type;
44#define LGUEST_DEVICE_T_CONSOLE 1
45#define LGUEST_DEVICE_T_NET 2
46#define LGUEST_DEVICE_T_BLOCK 3
47
48 u16 features;
49#define LGUEST_NET_F_NOCSUM 0x4000 /* Don't bother checksumming */
50#define LGUEST_DEVICE_F_RANDOMNESS 0x8000 /* IRQ is fairly random */
51
52 u16 status;
53/* 256 and above are device specific. */
54#define LGUEST_DEVICE_S_ACKNOWLEDGE 1 /* We have seen device. */
55#define LGUEST_DEVICE_S_DRIVER 2 /* We have found a driver */
56#define LGUEST_DEVICE_S_DRIVER_OK 4 /* Driver says OK! */
57#define LGUEST_DEVICE_S_REMOVED 8 /* Device has gone away. */
58#define LGUEST_DEVICE_S_REMOVED_ACK 16 /* Driver has been told. */
59#define LGUEST_DEVICE_S_FAILED 128 /* Something actually failed */
60
61 u16 num_pages;
62 u32 pfn;
63};
64
65/* Write command first word is a request. */
66enum lguest_req
67{
68 LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */
69 LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */
70 LHREQ_IRQ, /* + irq */
71 LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
72};
73#endif /* _ASM_LGUEST_USER */
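The Guest discovers its devices by walking the page of lguest_device_desc entries and acknowledging what it finds through the status bits. A hedged sketch of that handshake; the structure layout and flag values come from the header above, but the helper name, the descriptor count and where the page comes from are illustrative:

    /* Sketch: claiming a device from the descriptor page. */
    #include <stdint.h>

    struct lguest_device_desc {
        uint16_t type, features, status, num_pages;
        uint32_t pfn;
    };

    static struct lguest_device_desc *claim_device(struct lguest_device_desc *descs,
                                                   unsigned int n, uint16_t want_type)
    {
        unsigned int i;

        for (i = 0; i < n; i++) {
            if (descs[i].type != want_type)
                continue;
            /* ACKNOWLEDGE then DRIVER, as the status comments describe. */
            descs[i].status |= 1 | 2;
            return &descs[i];
        }
        return 0;
    }

Once its driver is fully set up, the Guest would also set LGUEST_DEVICE_S_DRIVER_OK, or LGUEST_DEVICE_S_FAILED if something went wrong.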
diff --git a/kernel/fork.c b/kernel/fork.c
index e7a2d995b087..469838998220 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -127,7 +127,6 @@ void __put_task_struct(struct task_struct *tsk)
127 if (!profile_handoff_task(tsk)) 127 if (!profile_handoff_task(tsk))
128 free_task(tsk); 128 free_task(tsk);
129} 129}
130EXPORT_SYMBOL_GPL(__put_task_struct);
131 130
132void __init fork_init(unsigned long mempages) 131void __init fork_init(unsigned long mempages)
133{ 132{