author		Linus Torvalds <torvalds@linux-foundation.org>	2011-07-22 16:45:50 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-07-22 16:45:50 -0400
commit		a7e1aabb28e8154ce987b622fd78d80a1ca39361 (patch)
tree		8671d8faf51d43665045b7362a177a23dc88921b
parent		111ad119d1765b1bbef2629a5f2bd825caeb7e74 (diff)
parent		996ba96a97f7406052486682846d68935a60e986 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus:
  lguest: Fix in/out emulation
  lguest: Fix translation count about wikipedia's cpuid page
  lguest: Fix three simple typos in comments
  lguest: update comments
  lguest: Simplify device initialization.
  lguest: don't rewrite vmcall instructions
  lguest: remove remaining vmcall
  lguest: use a special 1:1 linear pagetable mode until first switch.
  lguest: Do not exit on non-fatal errors
-rw-r--r--	Documentation/virtual/lguest/lguest.c	 47
-rw-r--r--	arch/x86/include/asm/lguest_hcall.h	  1
-rw-r--r--	arch/x86/kernel/asm-offsets_32.c	  1
-rw-r--r--	arch/x86/lguest/boot.c			 36
-rw-r--r--	arch/x86/lguest/i386_head.S		 35
-rw-r--r--	drivers/lguest/core.c			  2
-rw-r--r--	drivers/lguest/interrupts_and_traps.c	 10
-rw-r--r--	drivers/lguest/lg.h			  2
-rw-r--r--	drivers/lguest/lguest_device.c		 37
-rw-r--r--	drivers/lguest/lguest_user.c		 17
-rw-r--r--	drivers/lguest/page_tables.c		282
-rw-r--r--	drivers/lguest/x86/core.c		107
-rw-r--r--	include/linux/lguest.h			  2
13 files changed, 199 insertions, 380 deletions
diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c
index cd9d6af61d07..043bd7df3139 100644
--- a/Documentation/virtual/lguest/lguest.c
+++ b/Documentation/virtual/lguest/lguest.c
@@ -51,7 +51,7 @@
 #include <asm/bootparam.h>
 #include "../../../include/linux/lguest_launcher.h"
 /*L:110
- * We can ignore the 42 include files we need for this program, but I do want
+ * We can ignore the 43 include files we need for this program, but I do want
  * to draw attention to the use of kernel-style types.
  *
  * As Linus said, "C is a Spartan language, and so should your naming be." I
@@ -65,7 +65,6 @@ typedef uint16_t u16;
 typedef uint8_t u8;
 /*:*/
 
-#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
 #define BRIDGE_PFX "bridge:"
 #ifndef SIOCBRADDIF
 #define SIOCBRADDIF 0x89a2 /* add interface to bridge */
@@ -861,8 +860,10 @@ static void console_output(struct virtqueue *vq)
 	/* writev can return a partial write, so we loop here. */
 	while (!iov_empty(iov, out)) {
 		int len = writev(STDOUT_FILENO, iov, out);
-		if (len <= 0)
-			err(1, "Write to stdout gave %i", len);
+		if (len <= 0) {
+			warn("Write to stdout gave %i (%d)", len, errno);
+			break;
+		}
 		iov_consume(iov, out, len);
 	}
 
@@ -898,7 +899,7 @@ static void net_output(struct virtqueue *vq)
 	 * same format: what a coincidence!
 	 */
 	if (writev(net_info->tunfd, iov, out) < 0)
-		errx(1, "Write to tun failed?");
+		warnx("Write to tun failed (%d)?", errno);
 
 	/*
 	 * Done with that one; wait_for_vq_desc() will send the interrupt if
@@ -955,7 +956,7 @@ static void net_input(struct virtqueue *vq)
 	 */
 	len = readv(net_info->tunfd, iov, in);
 	if (len <= 0)
-		err(1, "Failed to read from tun.");
+		warn("Failed to read from tun (%d).", errno);
 
 	/*
 	 * Mark that packet buffer as used, but don't interrupt here. We want
@@ -1093,9 +1094,10 @@ static void update_device_status(struct device *dev)
 		warnx("Device %s configuration FAILED", dev->name);
 		if (dev->running)
 			reset_device(dev);
-	} else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
-		if (!dev->running)
-			start_device(dev);
+	} else {
+		if (dev->running)
+			err(1, "Device %s features finalized twice", dev->name);
+		start_device(dev);
 	}
 }
 
@@ -1120,25 +1122,11 @@ static void handle_output(unsigned long addr)
 			return;
 		}
 
-		/*
-		 * Devices *can* be used before status is set to DRIVER_OK.
-		 * The original plan was that they would never do this: they
-		 * would always finish setting up their status bits before
-		 * actually touching the virtqueues. In practice, we allowed
-		 * them to, and they do (eg. the disk probes for partition
-		 * tables as part of initialization).
-		 *
-		 * If we see this, we start the device: once it's running, we
-		 * expect the device to catch all the notifications.
-		 */
+		/* Devices should not be used before features are finalized. */
 		for (vq = i->vq; vq; vq = vq->next) {
 			if (addr != vq->config.pfn*getpagesize())
 				continue;
-			if (i->running)
-				errx(1, "Notification on running %s", i->name);
-			/* This just calls create_thread() for each virtqueue */
-			start_device(i);
-			return;
+			errx(1, "Notification on %s before setup!", i->name);
 		}
 	}
 
@@ -1370,7 +1358,7 @@ static void setup_console(void)
  * --sharenet=<name> option which opens or creates a named pipe. This can be
  * used to send packets to another guest in a 1:1 manner.
  *
- * More sopisticated is to use one of the tools developed for project like UML
+ * More sophisticated is to use one of the tools developed for project like UML
  * to do networking.
  *
  * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
@@ -1380,7 +1368,7 @@ static void setup_console(void)
  * multiple inter-guest channels behind one interface, although it would
  * require some manner of hotplugging new virtio channels.
  *
- * Finally, we could implement a virtio network switch in the kernel.
+ * Finally, we could use a virtio network switch in the kernel, ie. vhost.
 :*/
 
 static u32 str2ip(const char *ipaddr)
@@ -2017,10 +2005,7 @@ int main(int argc, char *argv[])
 	/* Tell the entry path not to try to reload segment registers. */
 	boot->hdr.loadflags |= KEEP_SEGMENTS;
 
-	/*
-	 * We tell the kernel to initialize the Guest: this returns the open
-	 * /dev/lguest file descriptor.
-	 */
+	/* We tell the kernel to initialize the Guest. */
 	tell_kernel(start);
 
 	/* Ensure that we terminate if a device-servicing child dies. */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index b60f2924c413..879fd7d33877 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -61,6 +61,7 @@ hcall(unsigned long call,
 		     : "memory");
 	return call;
 }
+/*:*/
 
 /* Can't use our min() macro here: needs to be a constant */
 #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index c29d631af6fc..395a10e68067 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -63,7 +63,6 @@ void foo(void)
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
 	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
 
 	BLANK();
 	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index db832fd65ecb..13ee258442ae 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -71,7 +71,8 @@
 #include <asm/stackprotector.h>
 #include <asm/reboot.h> /* for struct machine_ops */
 
-/*G:010 Welcome to the Guest!
+/*G:010
+ * Welcome to the Guest!
  *
  * The Guest in our tale is a simple creature: identical to the Host but
  * behaving in simplified but equivalent ways. In particular, the Guest is the
@@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call,
 #endif
 
 /*G:036
- * When lazy mode is turned off reset the per-cpu lazy mode variable and then
- * issue the do-nothing hypercall to flush any stored calls.
-:*/
+ * When lazy mode is turned off, we issue the do-nothing hypercall to
+ * flush any stored calls, and call the generic helper to reset the
+ * per-cpu lazy mode variable.
+ */
 static void lguest_leave_lazy_mmu_mode(void)
 {
 	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
 	paravirt_leave_lazy_mmu();
 }
 
+/*
+ * We also catch the end of context switch; we enter lazy mode for much of
+ * that too, so again we need to flush here.
+ *
+ * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
+ * mode, but unlike Xen, lguest doesn't care about the difference).
+ */
 static void lguest_end_context_switch(struct task_struct *next)
 {
 	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
@@ -391,7 +400,7 @@ static void lguest_load_tr_desc(void)
  * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
  *
  * This instruction even it has its own Wikipedia entry. The Wikipedia entry
- * has been translated into 5 languages. I am not making this up!
+ * has been translated into 6 languages. I am not making this up!
  *
  * We could get funky here and identify ourselves as "GenuineLguest", but
  * instead we just use the real "cpuid" instruction. Then I pretty much turned
@@ -458,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 	/*
 	 * PAE systems can mark pages as non-executable. Linux calls this the
 	 * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
-	 * Virus Protection). We just switch turn if off here, since we don't
+	 * Virus Protection). We just switch it off here, since we don't
 	 * support it.
 	 */
 	case 0x80000001:
@@ -520,17 +529,16 @@ static unsigned long lguest_read_cr2(void)
 
 /* See lguest_set_pte() below. */
 static bool cr3_changed = false;
+static unsigned long current_cr3;
 
 /*
  * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0. Keep a local copy, and tell the Host when it changes. The only
- * difference is that our local copy is in lguest_data because the Host needs
- * to set it upon our initial hypercall.
+ * cr0. Keep a local copy, and tell the Host when it changes.
  */
 static void lguest_write_cr3(unsigned long cr3)
 {
-	lguest_data.pgdir = cr3;
 	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
+	current_cr3 = cr3;
 
 	/* These two page tables are simple, linear, and used during boot */
 	if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
@@ -539,7 +547,7 @@ static void lguest_write_cr3(unsigned long cr3)
 
 static unsigned long lguest_read_cr3(void)
 {
-	return lguest_data.pgdir;
+	return current_cr3;
 }
 
 /* cr4 is used to enable and disable PGE, but we don't care. */
@@ -641,7 +649,7 @@ static void lguest_write_cr4(unsigned long val)
 
 /*
  * The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space. Wetell the Host the toplevel and
+ * a page into a process' address space. We tell the Host the toplevel and
  * address this corresponds to. The Guest uses one pagetable per process, so
  * we need to tell the Host which one we're changing (mm->pgd).
  */
@@ -758,7 +766,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)
 static void lguest_flush_tlb_single(unsigned long addr)
 {
 	/* Simply set it to zero: if it was not, it will fault back in. */
-	lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
+	lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
 }
 
 /*
@@ -1140,7 +1148,7 @@ static struct notifier_block paniced = {
 static __init char *lguest_memory_setup(void)
 {
 	/*
-	 *The Linux bootloader header contains an "e820" memory map: the
+	 * The Linux bootloader header contains an "e820" memory map: the
 	 * Launcher populated the first entry with our memory limit.
 	 */
 	e820_add_region(boot_params.e820_map[0].addr,
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 4f420c2f2d55..6ddfe4fc23c3 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -6,18 +6,22 @@
 #include <asm/processor-flags.h>
 
 /*G:020
- * Our story starts with the kernel booting into startup_32 in
- * arch/x86/kernel/head_32.S. It expects a boot header, which is created by
- * the bootloader (the Launcher in our case).
+
+ * Our story starts with the bzImage: booting starts at startup_32 in
+ * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real
+ * kernel in place and then jumps into it: startup_32 in
+ * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi
+ * register, which is created by the bootloader (the Launcher in our case).
  *
  * The startup_32 function does very little: it clears the uninitialized global
  * C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe. Finally it checks the
- * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen:
- * if it's set to '1' (lguest's assigned number), then it calls us here.
+ * header and kernel command line somewhere safe, and populates some initial
+ * page tables. Finally it checks the 'hardware_subarch' field. This was
+ * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
+ * assigned number), then it calls us here.
  *
  * WARNING: be very careful here! We're running at addresses equal to physical
- * addesses (around 0), not above PAGE_OFFSET as most code expectes
+ * addresses (around 0), not above PAGE_OFFSET as most code expects
  * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
  * data without remembering to subtract __PAGE_OFFSET!
  *
@@ -27,13 +31,18 @@
 .section .init.text, "ax", @progbits
 ENTRY(lguest_entry)
 	/*
-	 * We make the "initialization" hypercall now to tell the Host about
-	 * us, and also find out where it put our page tables.
+	 * We make the "initialization" hypercall now to tell the Host where
+	 * our lguest_data struct is.
 	 */
 	movl $LHCALL_LGUEST_INIT, %eax
 	movl $lguest_data - __PAGE_OFFSET, %ebx
 	int $LGUEST_TRAP_ENTRY
 
+	/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
+	movl $LHCALL_NEW_PGTABLE, %eax
+	movl $(initial_page_table - __PAGE_OFFSET), %ebx
+	int $LGUEST_TRAP_ENTRY
+
 	/* Set up the initial stack so we can run C code. */
 	movl $(init_thread_union+THREAD_SIZE),%esp
 
@@ -96,12 +105,8 @@ send_interrupts:
 	 */
 	pushl %eax
 	movl $LHCALL_SEND_INTERRUPTS, %eax
-	/*
-	 * This is a vmcall instruction (same thing that KVM uses). Older
-	 * assembler versions might not know the "vmcall" instruction, so we
-	 * create one manually here.
-	 */
-	.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+	/* This is the actual hypercall trap. */
+	int $LGUEST_TRAP_ENTRY
 	/* Put eax back the way we found it. */
 	popl %eax
 	ret
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index efa202499e37..2535933c49f8 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -117,7 +117,7 @@ static __init int map_switcher(void)
 
 	/*
 	 * Now the Switcher is mapped at the right address, we can't fail!
-	 * Copy in the compiled-in Switcher code (from <arch>_switcher.S).
+	 * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
 	 */
 	memcpy(switcher_vma->addr, start_switcher_text,
 	       end_switcher_text - start_switcher_text);
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index daaf86631647..28433a155d67 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -375,11 +375,9 @@ static bool direct_trap(unsigned int num)
 	/*
 	 * The Host needs to see page faults (for shadow paging and to save the
 	 * fault address), general protection faults (in/out emulation) and
-	 * device not available (TS handling), invalid opcode fault (kvm hcall),
-	 * and of course, the hypercall trap.
+	 * device not available (TS handling) and of course, the hypercall trap.
 	 */
-	return num != 14 && num != 13 && num != 7 &&
-		num != 6 && num != LGUEST_TRAP_ENTRY;
+	return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
 }
 /*:*/
 
@@ -429,8 +427,8 @@ void pin_stack_pages(struct lg_cpu *cpu)
 
 /*
  * Direct traps also mean that we need to know whenever the Guest wants to use
- * a different kernel stack, so we can change the IDT entries to use that
- * stack. The IDT entries expect a virtual address, so unlike most addresses
+ * a different kernel stack, so we can change the guest TSS to use that
+ * stack. The TSS entries expect a virtual address, so unlike most addresses
  * the Guest gives us, the "esp" (stack pointer) value here is virtual, not
  * physical.
  *
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 9136411fadd5..295df06e6590 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -59,6 +59,8 @@ struct lg_cpu {
 
 	struct lguest_pages *last_pages;
 
+	/* Initialization mode: linear map everything. */
+	bool linear_pages;
 	int cpu_pgd; /* Which pgd this cpu is currently using */
 
 	/* If a hypercall was asked for, this points to the arguments. */
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index 69c84a1d88ea..5289ffa2e500 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -109,6 +109,17 @@ static u32 lg_get_features(struct virtio_device *vdev)
 }
 
 /*
+ * To notify on reset or feature finalization, we (ab)use the NOTIFY
+ * hypercall, with the descriptor address of the device.
+ */
+static void status_notify(struct virtio_device *vdev)
+{
+	unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices;
+
+	hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0);
+}
+
+/*
  * The virtio core takes the features the Host offers, and copies the ones
  * supported by the driver into the vdev->features array. Once that's all
  * sorted out, this routine is called so we can tell the Host which features we
@@ -135,6 +146,9 @@ static void lg_finalize_features(struct virtio_device *vdev)
 		if (test_bit(i, vdev->features))
 			out_features[i / 8] |= (1 << (i % 8));
 	}
+
+	/* Tell Host we've finished with this device's feature negotiation */
+	status_notify(vdev);
 }
 
 /* Once they've found a field, getting a copy of it is easy. */
@@ -168,28 +182,21 @@ static u8 lg_get_status(struct virtio_device *vdev)
 	return to_lgdev(vdev)->desc->status;
 }
 
-/*
- * To notify on status updates, we (ab)use the NOTIFY hypercall, with the
- * descriptor address of the device. A zero status means "reset".
- */
-static void set_status(struct virtio_device *vdev, u8 status)
-{
-	unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices;
-
-	/* We set the status. */
-	to_lgdev(vdev)->desc->status = status;
-	hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0);
-}
-
 static void lg_set_status(struct virtio_device *vdev, u8 status)
 {
 	BUG_ON(!status);
-	set_status(vdev, status);
+	to_lgdev(vdev)->desc->status = status;
+
+	/* Tell Host immediately if we failed. */
+	if (status & VIRTIO_CONFIG_S_FAILED)
+		status_notify(vdev);
 }
 
 static void lg_reset(struct virtio_device *vdev)
 {
-	set_status(vdev, 0);
+	/* 0 status means "reset" */
+	to_lgdev(vdev)->desc->status = 0;
+	status_notify(vdev);
 }
 
 /*
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 948c547b8e9e..f97e625241ad 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -1,8 +1,10 @@
-/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher
- * controls and communicates with the Guest. For example, the first write will
- * tell us the Guest's memory layout and entry point. A read will run the
- * Guest until something happens, such as a signal or the Guest doing a NOTIFY
- * out to the Launcher.
+/*P:200 This contains all the /dev/lguest code, whereby the userspace
+ * launcher controls and communicates with the Guest. For example,
+ * the first write will tell us the Guest's memory layout and entry
+ * point. A read will run the Guest until something happens, such as
+ * a signal or the Guest doing a NOTIFY out to the Launcher. There is
+ * also a way for the Launcher to attach eventfds to particular NOTIFY
+ * values instead of returning from the read() call.
 :*/
 #include <linux/uaccess.h>
 #include <linux/miscdevice.h>
@@ -357,8 +359,8 @@ static int initialize(struct file *file, const unsigned long __user *input)
 		goto free_eventfds;
 
 	/*
-	 * Initialize the Guest's shadow page tables, using the toplevel
-	 * address the Launcher gave us. This allocates memory, so can fail.
+	 * Initialize the Guest's shadow page tables. This allocates
+	 * memory, so can fail.
 	 */
 	err = init_guest_pagetable(lg);
 	if (err)
@@ -516,6 +518,7 @@ static const struct file_operations lguest_fops = {
 	.read = read,
 	.llseek = default_llseek,
 };
+/*:*/
 
 /*
  * This is a textbook example of a "misc" character device. Populate a "struct
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index d21578ee95de..3b62be160a6e 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -17,7 +17,6 @@
 #include <linux/percpu.h>
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
-#include <asm/bootparam.h>
 #include "lg.h"
 
 /*M:008
@@ -156,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
 }
 
 /*
- * These functions are just like the above two, except they access the Guest
+ * These functions are just like the above, except they access the Guest
  * page tables. Hence they return a Guest address.
  */
 static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
@@ -196,7 +195,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu,
 #endif
 /*:*/
 
-/*M:014
+/*M:007
  * get_pfn is slow: we could probably try to grab batches of pages here as
  * an optimization (ie. pre-faulting).
 :*/
@@ -325,10 +324,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 #endif
 
 	/* First step: get the top-level Guest page table entry. */
-	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-	/* Toplevel not present? We can't map it in. */
-	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-		return false;
+	if (unlikely(cpu->linear_pages)) {
+		/* Faking up a linear mapping. */
+		gpgd = __pgd(CHECK_GPGD_MASK);
+	} else {
+		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
+		/* Toplevel not present? We can't map it in. */
+		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+			return false;
+	}
 
 	/* Now look at the matching shadow entry. */
 	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
@@ -353,10 +357,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	}
 
 #ifdef CONFIG_X86_PAE
-	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-	/* Middle level not present? We can't map it in. */
-	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-		return false;
+	if (unlikely(cpu->linear_pages)) {
+		/* Faking up a linear mapping. */
+		gpmd = __pmd(_PAGE_TABLE);
+	} else {
+		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+		/* Middle level not present? We can't map it in. */
+		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+			return false;
+	}
 
 	/* Now look at the matching shadow entry. */
 	spmd = spmd_addr(cpu, *spgd, vaddr);
@@ -397,8 +406,13 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
 #endif
 
-	/* Read the actual PTE value. */
-	gpte = lgread(cpu, gpte_ptr, pte_t);
+	if (unlikely(cpu->linear_pages)) {
+		/* Linear? Make up a PTE which points to same page. */
+		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
+	} else {
+		/* Read the actual PTE value. */
+		gpte = lgread(cpu, gpte_ptr, pte_t);
+	}
 
 	/* If this page isn't in the Guest page tables, we can't page it in. */
 	if (!(pte_flags(gpte) & _PAGE_PRESENT))
@@ -454,7 +468,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	 * Finally, we write the Guest PTE entry back: we've set the
 	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
 	 */
-	lgwrite(cpu, gpte_ptr, pte_t, gpte);
+	if (likely(!cpu->linear_pages))
+		lgwrite(cpu, gpte_ptr, pte_t, gpte);
 
 	/*
 	 * The fault is fixed, the page table is populated, the mapping
@@ -612,6 +627,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 #ifdef CONFIG_X86_PAE
 	pmd_t gpmd;
 #endif
+
+	/* Still not set up? Just map 1:1. */
+	if (unlikely(cpu->linear_pages))
+		return vaddr;
+
 	/* First step: get the top-level Guest page table entry. */
 	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
 	/* Toplevel not present? We can't map it in. */
@@ -708,32 +728,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 	return next;
 }
 
-/*H:430
- * (iv) Switching page tables
- *
- * Now we've seen all the page table setting and manipulation, let's see
- * what happens when the Guest changes page tables (ie. changes the top-level
- * pgdir). This occurs on almost every context switch.
- */
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
-{
-	int newpgdir, repin = 0;
-
-	/* Look to see if we have this one already. */
-	newpgdir = find_pgdir(cpu->lg, pgtable);
-	/*
-	 * If not, we allocate or mug an existing one: if it's a fresh one,
-	 * repin gets set to 1.
-	 */
-	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
-		newpgdir = new_pgdir(cpu, pgtable, &repin);
-	/* Change the current pgd index to the new one. */
-	cpu->cpu_pgd = newpgdir;
-	/* If it was completely blank, we map in the Guest kernel stack */
-	if (repin)
-		pin_stack_pages(cpu);
-}
-
 /*H:470
  * Finally, a routine which throws away everything: all PGD entries in all
  * the shadow page tables, including the Guest's kernel mappings. This is used
@@ -780,6 +774,44 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
 	/* We need the Guest kernel stack mapped again. */
 	pin_stack_pages(cpu);
 }
+
+/*H:430
+ * (iv) Switching page tables
+ *
+ * Now we've seen all the page table setting and manipulation, let's see
+ * what happens when the Guest changes page tables (ie. changes the top-level
+ * pgdir). This occurs on almost every context switch.
+ */
+void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
+{
+	int newpgdir, repin = 0;
+
+	/*
+	 * The very first time they call this, we're actually running without
+	 * any page tables; we've been making it up. Throw them away now.
+	 */
+	if (unlikely(cpu->linear_pages)) {
+		release_all_pagetables(cpu->lg);
+		cpu->linear_pages = false;
+		/* Force allocation of a new pgdir. */
+		newpgdir = ARRAY_SIZE(cpu->lg->pgdirs);
+	} else {
+		/* Look to see if we have this one already. */
+		newpgdir = find_pgdir(cpu->lg, pgtable);
+	}
+
+	/*
+	 * If not, we allocate or mug an existing one: if it's a fresh one,
+	 * repin gets set to 1.
+	 */
+	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
+		newpgdir = new_pgdir(cpu, pgtable, &repin);
+	/* Change the current pgd index to the new one. */
+	cpu->cpu_pgd = newpgdir;
+	/* If it was completely blank, we map in the Guest kernel stack */
+	if (repin)
+		pin_stack_pages(cpu);
+}
 /*:*/
 
 /*M:009
@@ -919,168 +951,26 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
 }
 #endif
 
-/*H:505
- * To get through boot, we construct simple identity page mappings (which
- * set virtual == physical) and linear mappings which will get the Guest far
- * enough into the boot to create its own. The linear mapping means we
- * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET,
- * as you'll see.
- *
- * We lay them out of the way, just below the initrd (which is why we need to
- * know its size here).
- */
-static unsigned long setup_pagetables(struct lguest *lg,
-				      unsigned long mem,
-				      unsigned long initrd_size)
-{
-	pgd_t __user *pgdir;
-	pte_t __user *linear;
-	unsigned long mem_base = (unsigned long)lg->mem_base;
-	unsigned int mapped_pages, i, linear_pages;
-#ifdef CONFIG_X86_PAE
-	pmd_t __user *pmds;
-	unsigned int j;
-	pgd_t pgd;
-	pmd_t pmd;
-#else
-	unsigned int phys_linear;
-#endif
-
-	/*
-	 * We have mapped_pages frames to map, so we need linear_pages page
-	 * tables to map them.
-	 */
-	mapped_pages = mem / PAGE_SIZE;
-	linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
-
-	/* We put the toplevel page directory page at the top of memory. */
-	pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE);
-
-	/* Now we use the next linear_pages pages as pte pages */
-	linear = (void *)pgdir - linear_pages * PAGE_SIZE;
-
-#ifdef CONFIG_X86_PAE
-	/*
-	 * And the single mid page goes below that. We only use one, but
-	 * that's enough to map 1G, which definitely gets us through boot.
-	 */
-	pmds = (void *)linear - PAGE_SIZE;
-#endif
-	/*
-	 * Linear mapping is easy: put every page's address into the
-	 * mapping in order.
-	 */
-	for (i = 0; i < mapped_pages; i++) {
-		pte_t pte;
-		pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
-		if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
-			return -EFAULT;
-	}
-
-#ifdef CONFIG_X86_PAE
-	/*
-	 * Make the Guest PMD entries point to the corresponding place in the
-	 * linear mapping (up to one page worth of PMD).
-	 */
-	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
-	     i += PTRS_PER_PTE, j++) {
-		pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE,
-			      __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
-		if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
-			return -EFAULT;
-	}
-
-	/* One PGD entry, pointing to that PMD page. */
-	pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT);
-	/* Copy it in as the first PGD entry (ie. addresses 0-1G). */
-	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
-		return -EFAULT;
-	/*
-	 * And the other PGD entry to make the linear mapping at PAGE_OFFSET
-	 */
-	if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd)))
-		return -EFAULT;
-#else
-	/*
-	 * The top level points to the linear page table pages above.
-	 * We setup the identity and linear mappings here.
-	 */
-	phys_linear = (unsigned long)linear - mem_base;
-	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
-		pgd_t pgd;
-		/*
-		 * Create a PGD entry which points to the right part of the
-		 * linear PTE pages.
-		 */
-		pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
-			    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
-		/*
-		 * Copy it into the PGD page at 0 and PAGE_OFFSET.
-		 */
-		if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
-		    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
-					   + i / PTRS_PER_PTE],
-				    &pgd, sizeof(pgd)))
-			return -EFAULT;
-	}
-#endif
-
-	/*
-	 * We return the top level (guest-physical) address: we remember where
-	 * this is to write it into lguest_data when the Guest initializes.
-	 */
-	return (unsigned long)pgdir - mem_base;
-}
-
 /*H:500
  * (vii) Setting up the page tables initially.
  *
- * When a Guest is first created, the Launcher tells us where the toplevel of
- * its first page table is. We set some things up here:
+ * When a Guest is first created, set initialize a shadow page table which
+ * we will populate on future faults. The Guest doesn't have any actual
+ * pagetables yet, so we set linear_pages to tell demand_page() to fake it
+ * for the moment.
  */
 int init_guest_pagetable(struct lguest *lg)
 {
-	u64 mem;
-	u32 initrd_size;
-	struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
-#ifdef CONFIG_X86_PAE
-	pgd_t *pgd;
-	pmd_t *pmd_table;
-#endif
-	/*
-	 * Get the Guest memory size and the ramdisk size from the boot header
-	 * located at lg->mem_base (Guest address 0).
-	 */
-	if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
-	    || get_user(initrd_size, &boot->hdr.ramdisk_size))
-		return -EFAULT;
+	struct lg_cpu *cpu = &lg->cpus[0];
+	int allocated = 0;
 
-	/*
-	 * We start on the first shadow page table, and give it a blank PGD
-	 * page.
-	 */
-	lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
-	if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
-		return lg->pgdirs[0].gpgdir;
-	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
-	if (!lg->pgdirs[0].pgdir)
+	/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
+	cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
+	if (!allocated)
 		return -ENOMEM;
 
-#ifdef CONFIG_X86_PAE
-	/* For PAE, we also create the initial mid-level. */
-	pgd = lg->pgdirs[0].pgdir;
-	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
-	if (!pmd_table)
-		return -ENOMEM;
-
-	set_pgd(pgd + SWITCHER_PGD_INDEX,
-		__pgd(__pa(pmd_table) | _PAGE_PRESENT));
-#endif
-
-	/* This is the current page table. */
-	lg->cpus[0].cpu_pgd = 0;
+	/* We start with a linear mapping until the initialize. */
+	cpu->linear_pages = true;
 	return 0;
 }
 
@@ -1095,10 +985,10 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
 	 * of virtual addresses used by the Switcher.
 	 */
 	    || put_user(RESERVE_MEM * 1024 * 1024,
-			&cpu->lg->lguest_data->reserve_mem)
-	    || put_user(cpu->lg->pgdirs[0].gpgdir,
-			&cpu->lg->lguest_data->pgdir))
+			&cpu->lg->lguest_data->reserve_mem)) {
 		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
+		return;
+	}
 
 	/*
 	 * In flush_user_mappings() we loop from 0 to
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 9f1659c3d1f3..65af42f2d593 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -269,10 +269,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
 static int emulate_insn(struct lg_cpu *cpu)
 {
 	u8 insn;
-	unsigned int insnlen = 0, in = 0, shift = 0;
+	unsigned int insnlen = 0, in = 0, small_operand = 0;
 	/*
 	 * The eip contains the *virtual* address of the Guest's instruction:
-	 * guest_pa just subtracts the Guest's page_offset.
+	 * walk the Guest's page tables to find the "physical" address.
 	 */
 	unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
 
@@ -300,11 +300,10 @@ static int emulate_insn(struct lg_cpu *cpu)
 	}
 
 	/*
-	 * 0x66 is an "operand prefix". It means it's using the upper 16 bits
-	 * of the eax register.
+	 * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out.
 	 */
 	if (insn == 0x66) {
-		shift = 16;
+		small_operand = 1;
 		/* The instruction is 1 byte so far, read the next byte. */
 		insnlen = 1;
 		insn = lgread(cpu, physaddr + insnlen, u8);
@@ -340,11 +339,14 @@ static int emulate_insn(struct lg_cpu *cpu)
 	 * traditionally means "there's nothing there".
 	 */
 	if (in) {
-		/* Lower bit tells is whether it's a 16 or 32 bit access */
-		if (insn & 0x1)
-			cpu->regs->eax = 0xFFFFFFFF;
-		else
-			cpu->regs->eax |= (0xFFFF << shift);
+		/* Lower bit tells means it's a 32/16 bit access */
+		if (insn & 0x1) {
+			if (small_operand)
+				cpu->regs->eax |= 0xFFFF;
+			else
+				cpu->regs->eax = 0xFFFFFFFF;
+		} else
+			cpu->regs->eax |= 0xFF;
 	}
 	/* Finally, we've "done" the instruction, so move past it. */
 	cpu->regs->eip += insnlen;
@@ -352,69 +354,6 @@ static int emulate_insn(struct lg_cpu *cpu)
 	return 1;
 }
 
-/*
- * Our hypercalls mechanism used to be based on direct software interrupts.
- * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to
- * change over to using kvm hypercalls.
- *
- * KVM_HYPERCALL is actually a "vmcall" instruction, which generates an invalid
- * opcode fault (fault 6) on non-VT cpus, so the easiest solution seemed to be
- * an *emulation approach*: if the fault was really produced by an hypercall
- * (is_hypercall() does exactly this check), we can just call the corresponding
- * hypercall host implementation function.
- *
- * But these invalid opcode faults are notably slower than software interrupts.
- * So we implemented the *patching (or rewriting) approach*: every time we hit
- * the KVM_HYPERCALL opcode in Guest code, we patch it to the old "int 0x1f"
- * opcode, so next time the Guest calls this hypercall it will use the
- * faster trap mechanism.
- *
- * Matias even benchmarked it to convince you: this shows the average cycle
- * cost of a hypercall. For each alternative solution mentioned above we've
- * made 5 runs of the benchmark:
- *
- * 1) direct software interrupt: 2915, 2789, 2764, 2721, 2898
- * 2) emulation technique: 3410, 3681, 3466, 3392, 3780
- * 3) patching (rewrite) technique: 2977, 2975, 2891, 2637, 2884
- *
- * One two-line function is worth a 20% hypercall speed boost!
- */
-static void rewrite_hypercall(struct lg_cpu *cpu)
-{
-	/*
-	 * This are the opcodes we use to patch the Guest. The opcode for "int
-	 * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we
-	 * complete the sequence with a NOP (0x90).
-	 */
-	u8 insn[3] = {0xcd, 0x1f, 0x90};
-
-	__lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn));
-	/*
-	 * The above write might have caused a copy of that page to be made
-	 * (if it was read-only). We need to make sure the Guest has
-	 * up-to-date pagetables. As this doesn't happen often, we can just
-	 * drop them all.
-	 */
-	guest_pagetable_clear_all(cpu);
-}
-
-static bool is_hypercall(struct lg_cpu *cpu)
-{
-	u8 insn[3];
-
-	/*
-	 * This must be the Guest kernel trying to do something.
-	 * The bottom two bits of the CS segment register are the privilege
-	 * level.
-	 */
-	if ((cpu->regs->cs & 3) != GUEST_PL)
-		return false;
-
-	/* Is it a vmcall? */
-	__lgread(cpu, insn, guest_pa(cpu, cpu->regs->eip), sizeof(insn));
-	return insn[0] == 0x0f && insn[1] == 0x01 && insn[2] == 0xc1;
-}
-
 /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
 void lguest_arch_handle_trap(struct lg_cpu *cpu)
 {
@@ -429,20 +368,6 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
 			if (emulate_insn(cpu))
 				return;
 		}
-		/*
-		 * If KVM is active, the vmcall instruction triggers a General
-		 * Protection Fault. Normally it triggers an invalid opcode
-		 * fault (6):
-		 */
-	case 6:
-		/*
-		 * We need to check if ring == GUEST_PL and faulting
-		 * instruction == vmcall.
-		 */
-		if (is_hypercall(cpu)) {
-			rewrite_hypercall(cpu);
-			return;
-		}
 		break;
 	case 14: /* We've intercepted a Page Fault. */
 		/*
@@ -486,7 +411,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
 		 * These values mean a real interrupt occurred, in which case
 		 * the Host handler has already been run. We just do a
 		 * friendly check if another process should now be run, then
-		 * return to run the Guest again
+		 * return to run the Guest again.
 		 */
 		cond_resched();
 		return;
@@ -536,7 +461,7 @@ void __init lguest_arch_host_init(void)
 	int i;
 
 	/*
-	 * Most of the i386/switcher.S doesn't care that it's been moved; on
+	 * Most of the x86/switcher_32.S doesn't care that it's been moved; on
 	 * Intel, jumps are relative, and it doesn't access any references to
 	 * external code or data.
 	 *
@@ -664,7 +589,7 @@ void __init lguest_arch_host_init(void)
 		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
 	}
 	put_online_cpus();
-};
+}
 /*:*/
 
 void __exit lguest_arch_host_fini(void)
@@ -747,8 +672,6 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
 /*:*/
 
 /*L:030
- * lguest_arch_setup_regs()
- *
  * Most of the Guest's registers are left alone: we used get_zeroed_page() to
  * allocate the structure, so they will be 0.
  */
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
index 2fb1dcbcb5aa..9962c6bb1311 100644
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -59,8 +59,6 @@ struct lguest_data {
 	unsigned long reserve_mem;
 	/* KHz for the TSC clock. */
 	u32 tsc_khz;
-	/* Page where the top-level pagetable is */
-	unsigned long pgdir;
 
 /* Fields initialized by the Guest at boot: */
 	/* Instruction range to suppress interrupts even if enabled */