diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-22 16:45:50 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-22 16:45:50 -0400 |
commit | a7e1aabb28e8154ce987b622fd78d80a1ca39361 (patch) | |
tree | 8671d8faf51d43665045b7362a177a23dc88921b | |
parent | 111ad119d1765b1bbef2629a5f2bd825caeb7e74 (diff) | |
parent | 996ba96a97f7406052486682846d68935a60e986 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus:
lguest: Fix in/out emulation
lguest: Fix translation count about wikipedia's cpuid page
lguest: Fix three simple typos in comments
lguest: update comments
lguest: Simplify device initialization.
lguest: don't rewrite vmcall instructions
lguest: remove remaining vmcall
lguest: use a special 1:1 linear pagetable mode until first switch.
lguest: Do not exit on non-fatal errors
-rw-r--r-- | Documentation/virtual/lguest/lguest.c | 47 | ||||
-rw-r--r-- | arch/x86/include/asm/lguest_hcall.h | 1 | ||||
-rw-r--r-- | arch/x86/kernel/asm-offsets_32.c | 1 | ||||
-rw-r--r-- | arch/x86/lguest/boot.c | 36 | ||||
-rw-r--r-- | arch/x86/lguest/i386_head.S | 35 | ||||
-rw-r--r-- | drivers/lguest/core.c | 2 | ||||
-rw-r--r-- | drivers/lguest/interrupts_and_traps.c | 10 | ||||
-rw-r--r-- | drivers/lguest/lg.h | 2 | ||||
-rw-r--r-- | drivers/lguest/lguest_device.c | 37 | ||||
-rw-r--r-- | drivers/lguest/lguest_user.c | 17 | ||||
-rw-r--r-- | drivers/lguest/page_tables.c | 282 | ||||
-rw-r--r-- | drivers/lguest/x86/core.c | 107 | ||||
-rw-r--r-- | include/linux/lguest.h | 2 |
13 files changed, 199 insertions, 380 deletions
diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c index cd9d6af61d07..043bd7df3139 100644 --- a/Documentation/virtual/lguest/lguest.c +++ b/Documentation/virtual/lguest/lguest.c | |||
@@ -51,7 +51,7 @@ | |||
51 | #include <asm/bootparam.h> | 51 | #include <asm/bootparam.h> |
52 | #include "../../../include/linux/lguest_launcher.h" | 52 | #include "../../../include/linux/lguest_launcher.h" |
53 | /*L:110 | 53 | /*L:110 |
54 | * We can ignore the 42 include files we need for this program, but I do want | 54 | * We can ignore the 43 include files we need for this program, but I do want |
55 | * to draw attention to the use of kernel-style types. | 55 | * to draw attention to the use of kernel-style types. |
56 | * | 56 | * |
57 | * As Linus said, "C is a Spartan language, and so should your naming be." I | 57 | * As Linus said, "C is a Spartan language, and so should your naming be." I |
@@ -65,7 +65,6 @@ typedef uint16_t u16; | |||
65 | typedef uint8_t u8; | 65 | typedef uint8_t u8; |
66 | /*:*/ | 66 | /*:*/ |
67 | 67 | ||
68 | #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ | ||
69 | #define BRIDGE_PFX "bridge:" | 68 | #define BRIDGE_PFX "bridge:" |
70 | #ifndef SIOCBRADDIF | 69 | #ifndef SIOCBRADDIF |
71 | #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ | 70 | #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ |
@@ -861,8 +860,10 @@ static void console_output(struct virtqueue *vq) | |||
861 | /* writev can return a partial write, so we loop here. */ | 860 | /* writev can return a partial write, so we loop here. */ |
862 | while (!iov_empty(iov, out)) { | 861 | while (!iov_empty(iov, out)) { |
863 | int len = writev(STDOUT_FILENO, iov, out); | 862 | int len = writev(STDOUT_FILENO, iov, out); |
864 | if (len <= 0) | 863 | if (len <= 0) { |
865 | err(1, "Write to stdout gave %i", len); | 864 | warn("Write to stdout gave %i (%d)", len, errno); |
865 | break; | ||
866 | } | ||
866 | iov_consume(iov, out, len); | 867 | iov_consume(iov, out, len); |
867 | } | 868 | } |
868 | 869 | ||
@@ -898,7 +899,7 @@ static void net_output(struct virtqueue *vq) | |||
898 | * same format: what a coincidence! | 899 | * same format: what a coincidence! |
899 | */ | 900 | */ |
900 | if (writev(net_info->tunfd, iov, out) < 0) | 901 | if (writev(net_info->tunfd, iov, out) < 0) |
901 | errx(1, "Write to tun failed?"); | 902 | warnx("Write to tun failed (%d)?", errno); |
902 | 903 | ||
903 | /* | 904 | /* |
904 | * Done with that one; wait_for_vq_desc() will send the interrupt if | 905 | * Done with that one; wait_for_vq_desc() will send the interrupt if |
@@ -955,7 +956,7 @@ static void net_input(struct virtqueue *vq) | |||
955 | */ | 956 | */ |
956 | len = readv(net_info->tunfd, iov, in); | 957 | len = readv(net_info->tunfd, iov, in); |
957 | if (len <= 0) | 958 | if (len <= 0) |
958 | err(1, "Failed to read from tun."); | 959 | warn("Failed to read from tun (%d).", errno); |
959 | 960 | ||
960 | /* | 961 | /* |
961 | * Mark that packet buffer as used, but don't interrupt here. We want | 962 | * Mark that packet buffer as used, but don't interrupt here. We want |
@@ -1093,9 +1094,10 @@ static void update_device_status(struct device *dev) | |||
1093 | warnx("Device %s configuration FAILED", dev->name); | 1094 | warnx("Device %s configuration FAILED", dev->name); |
1094 | if (dev->running) | 1095 | if (dev->running) |
1095 | reset_device(dev); | 1096 | reset_device(dev); |
1096 | } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { | 1097 | } else { |
1097 | if (!dev->running) | 1098 | if (dev->running) |
1098 | start_device(dev); | 1099 | err(1, "Device %s features finalized twice", dev->name); |
1100 | start_device(dev); | ||
1099 | } | 1101 | } |
1100 | } | 1102 | } |
1101 | 1103 | ||
@@ -1120,25 +1122,11 @@ static void handle_output(unsigned long addr) | |||
1120 | return; | 1122 | return; |
1121 | } | 1123 | } |
1122 | 1124 | ||
1123 | /* | 1125 | /* Devices should not be used before features are finalized. */ |
1124 | * Devices *can* be used before status is set to DRIVER_OK. | ||
1125 | * The original plan was that they would never do this: they | ||
1126 | * would always finish setting up their status bits before | ||
1127 | * actually touching the virtqueues. In practice, we allowed | ||
1128 | * them to, and they do (eg. the disk probes for partition | ||
1129 | * tables as part of initialization). | ||
1130 | * | ||
1131 | * If we see this, we start the device: once it's running, we | ||
1132 | * expect the device to catch all the notifications. | ||
1133 | */ | ||
1134 | for (vq = i->vq; vq; vq = vq->next) { | 1126 | for (vq = i->vq; vq; vq = vq->next) { |
1135 | if (addr != vq->config.pfn*getpagesize()) | 1127 | if (addr != vq->config.pfn*getpagesize()) |
1136 | continue; | 1128 | continue; |
1137 | if (i->running) | 1129 | errx(1, "Notification on %s before setup!", i->name); |
1138 | errx(1, "Notification on running %s", i->name); | ||
1139 | /* This just calls create_thread() for each virtqueue */ | ||
1140 | start_device(i); | ||
1141 | return; | ||
1142 | } | 1130 | } |
1143 | } | 1131 | } |
1144 | 1132 | ||
@@ -1370,7 +1358,7 @@ static void setup_console(void) | |||
1370 | * --sharenet=<name> option which opens or creates a named pipe. This can be | 1358 | * --sharenet=<name> option which opens or creates a named pipe. This can be |
1371 | * used to send packets to another guest in a 1:1 manner. | 1359 | * used to send packets to another guest in a 1:1 manner. |
1372 | * | 1360 | * |
1373 | * More sopisticated is to use one of the tools developed for project like UML | 1361 | * More sophisticated is to use one of the tools developed for project like UML |
1374 | * to do networking. | 1362 | * to do networking. |
1375 | * | 1363 | * |
1376 | * Faster is to do virtio bonding in kernel. Doing this 1:1 would be | 1364 | * Faster is to do virtio bonding in kernel. Doing this 1:1 would be |
@@ -1380,7 +1368,7 @@ static void setup_console(void) | |||
1380 | * multiple inter-guest channels behind one interface, although it would | 1368 | * multiple inter-guest channels behind one interface, although it would |
1381 | * require some manner of hotplugging new virtio channels. | 1369 | * require some manner of hotplugging new virtio channels. |
1382 | * | 1370 | * |
1383 | * Finally, we could implement a virtio network switch in the kernel. | 1371 | * Finally, we could use a virtio network switch in the kernel, ie. vhost. |
1384 | :*/ | 1372 | :*/ |
1385 | 1373 | ||
1386 | static u32 str2ip(const char *ipaddr) | 1374 | static u32 str2ip(const char *ipaddr) |
@@ -2017,10 +2005,7 @@ int main(int argc, char *argv[]) | |||
2017 | /* Tell the entry path not to try to reload segment registers. */ | 2005 | /* Tell the entry path not to try to reload segment registers. */ |
2018 | boot->hdr.loadflags |= KEEP_SEGMENTS; | 2006 | boot->hdr.loadflags |= KEEP_SEGMENTS; |
2019 | 2007 | ||
2020 | /* | 2008 | /* We tell the kernel to initialize the Guest. */ |
2021 | * We tell the kernel to initialize the Guest: this returns the open | ||
2022 | * /dev/lguest file descriptor. | ||
2023 | */ | ||
2024 | tell_kernel(start); | 2009 | tell_kernel(start); |
2025 | 2010 | ||
2026 | /* Ensure that we terminate if a device-servicing child dies. */ | 2011 | /* Ensure that we terminate if a device-servicing child dies. */ |
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index b60f2924c413..879fd7d33877 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h | |||
@@ -61,6 +61,7 @@ hcall(unsigned long call, | |||
61 | : "memory"); | 61 | : "memory"); |
62 | return call; | 62 | return call; |
63 | } | 63 | } |
64 | /*:*/ | ||
64 | 65 | ||
65 | /* Can't use our min() macro here: needs to be a constant */ | 66 | /* Can't use our min() macro here: needs to be a constant */ |
66 | #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) | 67 | #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) |
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index c29d631af6fc..395a10e68067 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c | |||
@@ -63,7 +63,6 @@ void foo(void) | |||
63 | BLANK(); | 63 | BLANK(); |
64 | OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); | 64 | OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); |
65 | OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); | 65 | OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); |
66 | OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); | ||
67 | 66 | ||
68 | BLANK(); | 67 | BLANK(); |
69 | OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); | 68 | OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); |
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index db832fd65ecb..13ee258442ae 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -71,7 +71,8 @@ | |||
71 | #include <asm/stackprotector.h> | 71 | #include <asm/stackprotector.h> |
72 | #include <asm/reboot.h> /* for struct machine_ops */ | 72 | #include <asm/reboot.h> /* for struct machine_ops */ |
73 | 73 | ||
74 | /*G:010 Welcome to the Guest! | 74 | /*G:010 |
75 | * Welcome to the Guest! | ||
75 | * | 76 | * |
76 | * The Guest in our tale is a simple creature: identical to the Host but | 77 | * The Guest in our tale is a simple creature: identical to the Host but |
77 | * behaving in simplified but equivalent ways. In particular, the Guest is the | 78 | * behaving in simplified but equivalent ways. In particular, the Guest is the |
@@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call, | |||
190 | #endif | 191 | #endif |
191 | 192 | ||
192 | /*G:036 | 193 | /*G:036 |
193 | * When lazy mode is turned off reset the per-cpu lazy mode variable and then | 194 | * When lazy mode is turned off, we issue the do-nothing hypercall to |
194 | * issue the do-nothing hypercall to flush any stored calls. | 195 | * flush any stored calls, and call the generic helper to reset the |
195 | :*/ | 196 | * per-cpu lazy mode variable. |
197 | */ | ||
196 | static void lguest_leave_lazy_mmu_mode(void) | 198 | static void lguest_leave_lazy_mmu_mode(void) |
197 | { | 199 | { |
198 | hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); | 200 | hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); |
199 | paravirt_leave_lazy_mmu(); | 201 | paravirt_leave_lazy_mmu(); |
200 | } | 202 | } |
201 | 203 | ||
204 | /* | ||
205 | * We also catch the end of context switch; we enter lazy mode for much of | ||
206 | * that too, so again we need to flush here. | ||
207 | * | ||
208 | * (Technically, this is lazy CPU mode, and normally we're in lazy MMU | ||
209 | * mode, but unlike Xen, lguest doesn't care about the difference). | ||
210 | */ | ||
202 | static void lguest_end_context_switch(struct task_struct *next) | 211 | static void lguest_end_context_switch(struct task_struct *next) |
203 | { | 212 | { |
204 | hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); | 213 | hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); |
@@ -391,7 +400,7 @@ static void lguest_load_tr_desc(void) | |||
391 | * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. | 400 | * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. |
392 | * | 401 | * |
393 | * This instruction even has its own Wikipedia entry. The Wikipedia entry | 402 | * This instruction even has its own Wikipedia entry. The Wikipedia entry |
394 | * has been translated into 5 languages. I am not making this up! | 403 | * has been translated into 6 languages. I am not making this up! |
395 | * | 404 | * |
396 | * We could get funky here and identify ourselves as "GenuineLguest", but | 405 | * We could get funky here and identify ourselves as "GenuineLguest", but |
397 | * instead we just use the real "cpuid" instruction. Then I pretty much turned | 406 | * instead we just use the real "cpuid" instruction. Then I pretty much turned |
@@ -458,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
458 | /* | 467 | /* |
459 | * PAE systems can mark pages as non-executable. Linux calls this the | 468 | * PAE systems can mark pages as non-executable. Linux calls this the |
460 | * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced | 469 | * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced |
461 | * Virus Protection). We just switch turn if off here, since we don't | 470 | * Virus Protection). We just switch it off here, since we don't |
462 | * support it. | 471 | * support it. |
463 | */ | 472 | */ |
464 | case 0x80000001: | 473 | case 0x80000001: |
@@ -520,17 +529,16 @@ static unsigned long lguest_read_cr2(void) | |||
520 | 529 | ||
521 | /* See lguest_set_pte() below. */ | 530 | /* See lguest_set_pte() below. */ |
522 | static bool cr3_changed = false; | 531 | static bool cr3_changed = false; |
532 | static unsigned long current_cr3; | ||
523 | 533 | ||
524 | /* | 534 | /* |
525 | * cr3 is the current toplevel pagetable page: the principle is the same as | 535 | * cr3 is the current toplevel pagetable page: the principle is the same as |
526 | * cr0. Keep a local copy, and tell the Host when it changes. The only | 536 | * cr0. Keep a local copy, and tell the Host when it changes. |
527 | * difference is that our local copy is in lguest_data because the Host needs | ||
528 | * to set it upon our initial hypercall. | ||
529 | */ | 537 | */ |
530 | static void lguest_write_cr3(unsigned long cr3) | 538 | static void lguest_write_cr3(unsigned long cr3) |
531 | { | 539 | { |
532 | lguest_data.pgdir = cr3; | ||
533 | lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); | 540 | lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); |
541 | current_cr3 = cr3; | ||
534 | 542 | ||
535 | /* These two page tables are simple, linear, and used during boot */ | 543 | /* These two page tables are simple, linear, and used during boot */ |
536 | if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) | 544 | if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) |
@@ -539,7 +547,7 @@ static void lguest_write_cr3(unsigned long cr3) | |||
539 | 547 | ||
540 | static unsigned long lguest_read_cr3(void) | 548 | static unsigned long lguest_read_cr3(void) |
541 | { | 549 | { |
542 | return lguest_data.pgdir; | 550 | return current_cr3; |
543 | } | 551 | } |
544 | 552 | ||
545 | /* cr4 is used to enable and disable PGE, but we don't care. */ | 553 | /* cr4 is used to enable and disable PGE, but we don't care. */ |
@@ -641,7 +649,7 @@ static void lguest_write_cr4(unsigned long val) | |||
641 | 649 | ||
642 | /* | 650 | /* |
643 | * The Guest calls this after it has set a second-level entry (pte), ie. to map | 651 | * The Guest calls this after it has set a second-level entry (pte), ie. to map |
644 | * a page into a process' address space. Wetell the Host the toplevel and | 652 | * a page into a process' address space. We tell the Host the toplevel and |
645 | * address this corresponds to. The Guest uses one pagetable per process, so | 653 | * address this corresponds to. The Guest uses one pagetable per process, so |
646 | * we need to tell the Host which one we're changing (mm->pgd). | 654 | * we need to tell the Host which one we're changing (mm->pgd). |
647 | */ | 655 | */ |
@@ -758,7 +766,7 @@ static void lguest_pmd_clear(pmd_t *pmdp) | |||
758 | static void lguest_flush_tlb_single(unsigned long addr) | 766 | static void lguest_flush_tlb_single(unsigned long addr) |
759 | { | 767 | { |
760 | /* Simply set it to zero: if it was not, it will fault back in. */ | 768 | /* Simply set it to zero: if it was not, it will fault back in. */ |
761 | lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); | 769 | lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); |
762 | } | 770 | } |
763 | 771 | ||
764 | /* | 772 | /* |
@@ -1140,7 +1148,7 @@ static struct notifier_block paniced = { | |||
1140 | static __init char *lguest_memory_setup(void) | 1148 | static __init char *lguest_memory_setup(void) |
1141 | { | 1149 | { |
1142 | /* | 1150 | /* |
1143 | *The Linux bootloader header contains an "e820" memory map: the | 1151 | * The Linux bootloader header contains an "e820" memory map: the |
1144 | * Launcher populated the first entry with our memory limit. | 1152 | * Launcher populated the first entry with our memory limit. |
1145 | */ | 1153 | */ |
1146 | e820_add_region(boot_params.e820_map[0].addr, | 1154 | e820_add_region(boot_params.e820_map[0].addr, |
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 4f420c2f2d55..6ddfe4fc23c3 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S | |||
@@ -6,18 +6,22 @@ | |||
6 | #include <asm/processor-flags.h> | 6 | #include <asm/processor-flags.h> |
7 | 7 | ||
8 | /*G:020 | 8 | /*G:020 |
9 | * Our story starts with the kernel booting into startup_32 in | 9 | |
10 | * arch/x86/kernel/head_32.S. It expects a boot header, which is created by | 10 | * Our story starts with the bzImage: booting starts at startup_32 in |
11 | * the bootloader (the Launcher in our case). | 11 | * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real |
12 | * kernel in place and then jumps into it: startup_32 in | ||
13 | * arch/x86/kernel/head_32.S. Both routines expect a boot header in the %esi | ||
14 | * register, which is created by the bootloader (the Launcher in our case). | ||
12 | * | 15 | * |
13 | * The startup_32 function does very little: it clears the uninitialized global | 16 | * The startup_32 function does very little: it clears the uninitialized global |
14 | * C variables which we expect to be zero (ie. BSS) and then copies the boot | 17 | * C variables which we expect to be zero (ie. BSS) and then copies the boot |
15 | * header and kernel command line somewhere safe. Finally it checks the | 18 | * header and kernel command line somewhere safe, and populates some initial |
16 | * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen: | 19 | * page tables. Finally it checks the 'hardware_subarch' field. This was |
17 | * if it's set to '1' (lguest's assigned number), then it calls us here. | 20 | * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's |
21 | * assigned number), then it calls us here. | ||
18 | * | 22 | * |
19 | * WARNING: be very careful here! We're running at addresses equal to physical | 23 | * WARNING: be very careful here! We're running at addresses equal to physical |
20 | * addesses (around 0), not above PAGE_OFFSET as most code expectes | 24 | * addresses (around 0), not above PAGE_OFFSET as most code expects |
21 | * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any | 25 | * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any |
22 | * data without remembering to subtract __PAGE_OFFSET! | 26 | * data without remembering to subtract __PAGE_OFFSET! |
23 | * | 27 | * |
@@ -27,13 +31,18 @@ | |||
27 | .section .init.text, "ax", @progbits | 31 | .section .init.text, "ax", @progbits |
28 | ENTRY(lguest_entry) | 32 | ENTRY(lguest_entry) |
29 | /* | 33 | /* |
30 | * We make the "initialization" hypercall now to tell the Host about | 34 | * We make the "initialization" hypercall now to tell the Host where |
31 | * us, and also find out where it put our page tables. | 35 | * our lguest_data struct is. |
32 | */ | 36 | */ |
33 | movl $LHCALL_LGUEST_INIT, %eax | 37 | movl $LHCALL_LGUEST_INIT, %eax |
34 | movl $lguest_data - __PAGE_OFFSET, %ebx | 38 | movl $lguest_data - __PAGE_OFFSET, %ebx |
35 | int $LGUEST_TRAP_ENTRY | 39 | int $LGUEST_TRAP_ENTRY |
36 | 40 | ||
41 | /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */ | ||
42 | movl $LHCALL_NEW_PGTABLE, %eax | ||
43 | movl $(initial_page_table - __PAGE_OFFSET), %ebx | ||
44 | int $LGUEST_TRAP_ENTRY | ||
45 | |||
37 | /* Set up the initial stack so we can run C code. */ | 46 | /* Set up the initial stack so we can run C code. */ |
38 | movl $(init_thread_union+THREAD_SIZE),%esp | 47 | movl $(init_thread_union+THREAD_SIZE),%esp |
39 | 48 | ||
@@ -96,12 +105,8 @@ send_interrupts: | |||
96 | */ | 105 | */ |
97 | pushl %eax | 106 | pushl %eax |
98 | movl $LHCALL_SEND_INTERRUPTS, %eax | 107 | movl $LHCALL_SEND_INTERRUPTS, %eax |
99 | /* | 108 | /* This is the actual hypercall trap. */ |
100 | * This is a vmcall instruction (same thing that KVM uses). Older | 109 | int $LGUEST_TRAP_ENTRY |
101 | * assembler versions might not know the "vmcall" instruction, so we | ||
102 | * create one manually here. | ||
103 | */ | ||
104 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ | ||
105 | /* Put eax back the way we found it. */ | 110 | /* Put eax back the way we found it. */ |
106 | popl %eax | 111 | popl %eax |
107 | ret | 112 | ret |
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index efa202499e37..2535933c49f8 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c | |||
@@ -117,7 +117,7 @@ static __init int map_switcher(void) | |||
117 | 117 | ||
118 | /* | 118 | /* |
119 | * Now the Switcher is mapped at the right address, we can't fail! | 119 | * Now the Switcher is mapped at the right address, we can't fail! |
120 | * Copy in the compiled-in Switcher code (from <arch>_switcher.S). | 120 | * Copy in the compiled-in Switcher code (from x86/switcher_32.S). |
121 | */ | 121 | */ |
122 | memcpy(switcher_vma->addr, start_switcher_text, | 122 | memcpy(switcher_vma->addr, start_switcher_text, |
123 | end_switcher_text - start_switcher_text); | 123 | end_switcher_text - start_switcher_text); |
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index daaf86631647..28433a155d67 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c | |||
@@ -375,11 +375,9 @@ static bool direct_trap(unsigned int num) | |||
375 | /* | 375 | /* |
376 | * The Host needs to see page faults (for shadow paging and to save the | 376 | * The Host needs to see page faults (for shadow paging and to save the |
377 | * fault address), general protection faults (in/out emulation) and | 377 | * fault address), general protection faults (in/out emulation) and |
378 | * device not available (TS handling), invalid opcode fault (kvm hcall), | 378 | * device not available (TS handling) and of course, the hypercall trap. |
379 | * and of course, the hypercall trap. | ||
380 | */ | 379 | */ |
381 | return num != 14 && num != 13 && num != 7 && | 380 | return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY; |
382 | num != 6 && num != LGUEST_TRAP_ENTRY; | ||
383 | } | 381 | } |
384 | /*:*/ | 382 | /*:*/ |
385 | 383 | ||
@@ -429,8 +427,8 @@ void pin_stack_pages(struct lg_cpu *cpu) | |||
429 | 427 | ||
430 | /* | 428 | /* |
431 | * Direct traps also mean that we need to know whenever the Guest wants to use | 429 | * Direct traps also mean that we need to know whenever the Guest wants to use |
432 | * a different kernel stack, so we can change the IDT entries to use that | 430 | * a different kernel stack, so we can change the guest TSS to use that |
433 | * stack. The IDT entries expect a virtual address, so unlike most addresses | 431 | * stack. The TSS entries expect a virtual address, so unlike most addresses |
434 | * the Guest gives us, the "esp" (stack pointer) value here is virtual, not | 432 | * the Guest gives us, the "esp" (stack pointer) value here is virtual, not |
435 | * physical. | 433 | * physical. |
436 | * | 434 | * |
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 9136411fadd5..295df06e6590 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h | |||
@@ -59,6 +59,8 @@ struct lg_cpu { | |||
59 | 59 | ||
60 | struct lguest_pages *last_pages; | 60 | struct lguest_pages *last_pages; |
61 | 61 | ||
62 | /* Initialization mode: linear map everything. */ | ||
63 | bool linear_pages; | ||
62 | int cpu_pgd; /* Which pgd this cpu is currently using */ | 64 | int cpu_pgd; /* Which pgd this cpu is currently using */ |
63 | 65 | ||
64 | /* If a hypercall was asked for, this points to the arguments. */ | 66 | /* If a hypercall was asked for, this points to the arguments. */ |
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 69c84a1d88ea..5289ffa2e500 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c | |||
@@ -109,6 +109,17 @@ static u32 lg_get_features(struct virtio_device *vdev) | |||
109 | } | 109 | } |
110 | 110 | ||
111 | /* | 111 | /* |
112 | * To notify on reset or feature finalization, we (ab)use the NOTIFY | ||
113 | * hypercall, with the descriptor address of the device. | ||
114 | */ | ||
115 | static void status_notify(struct virtio_device *vdev) | ||
116 | { | ||
117 | unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; | ||
118 | |||
119 | hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); | ||
120 | } | ||
121 | |||
122 | /* | ||
112 | * The virtio core takes the features the Host offers, and copies the ones | 123 | * The virtio core takes the features the Host offers, and copies the ones |
113 | * supported by the driver into the vdev->features array. Once that's all | 124 | * supported by the driver into the vdev->features array. Once that's all |
114 | * sorted out, this routine is called so we can tell the Host which features we | 125 | * sorted out, this routine is called so we can tell the Host which features we |
@@ -135,6 +146,9 @@ static void lg_finalize_features(struct virtio_device *vdev) | |||
135 | if (test_bit(i, vdev->features)) | 146 | if (test_bit(i, vdev->features)) |
136 | out_features[i / 8] |= (1 << (i % 8)); | 147 | out_features[i / 8] |= (1 << (i % 8)); |
137 | } | 148 | } |
149 | |||
150 | /* Tell Host we've finished with this device's feature negotiation */ | ||
151 | status_notify(vdev); | ||
138 | } | 152 | } |
139 | 153 | ||
140 | /* Once they've found a field, getting a copy of it is easy. */ | 154 | /* Once they've found a field, getting a copy of it is easy. */ |
@@ -168,28 +182,21 @@ static u8 lg_get_status(struct virtio_device *vdev) | |||
168 | return to_lgdev(vdev)->desc->status; | 182 | return to_lgdev(vdev)->desc->status; |
169 | } | 183 | } |
170 | 184 | ||
171 | /* | ||
172 | * To notify on status updates, we (ab)use the NOTIFY hypercall, with the | ||
173 | * descriptor address of the device. A zero status means "reset". | ||
174 | */ | ||
175 | static void set_status(struct virtio_device *vdev, u8 status) | ||
176 | { | ||
177 | unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; | ||
178 | |||
179 | /* We set the status. */ | ||
180 | to_lgdev(vdev)->desc->status = status; | ||
181 | hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); | ||
182 | } | ||
183 | |||
184 | static void lg_set_status(struct virtio_device *vdev, u8 status) | 185 | static void lg_set_status(struct virtio_device *vdev, u8 status) |
185 | { | 186 | { |
186 | BUG_ON(!status); | 187 | BUG_ON(!status); |
187 | set_status(vdev, status); | 188 | to_lgdev(vdev)->desc->status = status; |
189 | |||
190 | /* Tell Host immediately if we failed. */ | ||
191 | if (status & VIRTIO_CONFIG_S_FAILED) | ||
192 | status_notify(vdev); | ||
188 | } | 193 | } |
189 | 194 | ||
190 | static void lg_reset(struct virtio_device *vdev) | 195 | static void lg_reset(struct virtio_device *vdev) |
191 | { | 196 | { |
192 | set_status(vdev, 0); | 197 | /* 0 status means "reset" */ |
198 | to_lgdev(vdev)->desc->status = 0; | ||
199 | status_notify(vdev); | ||
193 | } | 200 | } |
194 | 201 | ||
195 | /* | 202 | /* |
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 948c547b8e9e..f97e625241ad 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c | |||
@@ -1,8 +1,10 @@ | |||
1 | /*P:200 This contains all the /dev/lguest code, whereby the userspace launcher | 1 | /*P:200 This contains all the /dev/lguest code, whereby the userspace |
2 | * controls and communicates with the Guest. For example, the first write will | 2 | * launcher controls and communicates with the Guest. For example, |
3 | * tell us the Guest's memory layout and entry point. A read will run the | 3 | * the first write will tell us the Guest's memory layout and entry |
4 | * Guest until something happens, such as a signal or the Guest doing a NOTIFY | 4 | * point. A read will run the Guest until something happens, such as |
5 | * out to the Launcher. | 5 | * a signal or the Guest doing a NOTIFY out to the Launcher. There is |
6 | * also a way for the Launcher to attach eventfds to particular NOTIFY | ||
7 | * values instead of returning from the read() call. | ||
6 | :*/ | 8 | :*/ |
7 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
8 | #include <linux/miscdevice.h> | 10 | #include <linux/miscdevice.h> |
@@ -357,8 +359,8 @@ static int initialize(struct file *file, const unsigned long __user *input) | |||
357 | goto free_eventfds; | 359 | goto free_eventfds; |
358 | 360 | ||
359 | /* | 361 | /* |
360 | * Initialize the Guest's shadow page tables, using the toplevel | 362 | * Initialize the Guest's shadow page tables. This allocates |
361 | * address the Launcher gave us. This allocates memory, so can fail. | 363 | * memory, so can fail. |
362 | */ | 364 | */ |
363 | err = init_guest_pagetable(lg); | 365 | err = init_guest_pagetable(lg); |
364 | if (err) | 366 | if (err) |
@@ -516,6 +518,7 @@ static const struct file_operations lguest_fops = { | |||
516 | .read = read, | 518 | .read = read, |
517 | .llseek = default_llseek, | 519 | .llseek = default_llseek, |
518 | }; | 520 | }; |
521 | /*:*/ | ||
519 | 522 | ||
520 | /* | 523 | /* |
521 | * This is a textbook example of a "misc" character device. Populate a "struct | 524 | * This is a textbook example of a "misc" character device. Populate a "struct |
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index d21578ee95de..3b62be160a6e 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/percpu.h> | 17 | #include <linux/percpu.h> |
18 | #include <asm/tlbflush.h> | 18 | #include <asm/tlbflush.h> |
19 | #include <asm/uaccess.h> | 19 | #include <asm/uaccess.h> |
20 | #include <asm/bootparam.h> | ||
21 | #include "lg.h" | 20 | #include "lg.h" |
22 | 21 | ||
23 | /*M:008 | 22 | /*M:008 |
@@ -156,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) | |||
156 | } | 155 | } |
157 | 156 | ||
158 | /* | 157 | /* |
159 | * These functions are just like the above two, except they access the Guest | 158 | * These functions are just like the above, except they access the Guest |
160 | * page tables. Hence they return a Guest address. | 159 | * page tables. Hence they return a Guest address. |
161 | */ | 160 | */ |
162 | static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) | 161 | static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) |
@@ -196,7 +195,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, | |||
196 | #endif | 195 | #endif |
197 | /*:*/ | 196 | /*:*/ |
198 | 197 | ||
199 | /*M:014 | 198 | /*M:007 |
200 | * get_pfn is slow: we could probably try to grab batches of pages here as | 199 | * get_pfn is slow: we could probably try to grab batches of pages here as |
201 | * an optimization (ie. pre-faulting). | 200 | * an optimization (ie. pre-faulting). |
202 | :*/ | 201 | :*/ |
@@ -325,10 +324,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
325 | #endif | 324 | #endif |
326 | 325 | ||
327 | /* First step: get the top-level Guest page table entry. */ | 326 | /* First step: get the top-level Guest page table entry. */ |
328 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | 327 | if (unlikely(cpu->linear_pages)) { |
329 | /* Toplevel not present? We can't map it in. */ | 328 | /* Faking up a linear mapping. */ |
330 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) | 329 | gpgd = __pgd(CHECK_GPGD_MASK); |
331 | return false; | 330 | } else { |
331 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | ||
332 | /* Toplevel not present? We can't map it in. */ | ||
333 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) | ||
334 | return false; | ||
335 | } | ||
332 | 336 | ||
333 | /* Now look at the matching shadow entry. */ | 337 | /* Now look at the matching shadow entry. */ |
334 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); | 338 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); |
@@ -353,10 +357,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
353 | } | 357 | } |
354 | 358 | ||
355 | #ifdef CONFIG_X86_PAE | 359 | #ifdef CONFIG_X86_PAE |
356 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | 360 | if (unlikely(cpu->linear_pages)) { |
357 | /* Middle level not present? We can't map it in. */ | 361 | /* Faking up a linear mapping. */ |
358 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | 362 | gpmd = __pmd(_PAGE_TABLE); |
359 | return false; | 363 | } else { |
364 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
365 | /* Middle level not present? We can't map it in. */ | ||
366 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
367 | return false; | ||
368 | } | ||
360 | 369 | ||
361 | /* Now look at the matching shadow entry. */ | 370 | /* Now look at the matching shadow entry. */ |
362 | spmd = spmd_addr(cpu, *spgd, vaddr); | 371 | spmd = spmd_addr(cpu, *spgd, vaddr); |
@@ -397,8 +406,13 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
397 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); | 406 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); |
398 | #endif | 407 | #endif |
399 | 408 | ||
400 | /* Read the actual PTE value. */ | 409 | if (unlikely(cpu->linear_pages)) { |
401 | gpte = lgread(cpu, gpte_ptr, pte_t); | 410 | /* Linear? Make up a PTE which points to same page. */ |
411 | gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); | ||
412 | } else { | ||
413 | /* Read the actual PTE value. */ | ||
414 | gpte = lgread(cpu, gpte_ptr, pte_t); | ||
415 | } | ||
402 | 416 | ||
403 | /* If this page isn't in the Guest page tables, we can't page it in. */ | 417 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
404 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) | 418 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) |
@@ -454,7 +468,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
454 | * Finally, we write the Guest PTE entry back: we've set the | 468 | * Finally, we write the Guest PTE entry back: we've set the |
455 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. | 469 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. |
456 | */ | 470 | */ |
457 | lgwrite(cpu, gpte_ptr, pte_t, gpte); | 471 | if (likely(!cpu->linear_pages)) |
472 | lgwrite(cpu, gpte_ptr, pte_t, gpte); | ||
458 | 473 | ||
459 | /* | 474 | /* |
460 | * The fault is fixed, the page table is populated, the mapping | 475 | * The fault is fixed, the page table is populated, the mapping |
@@ -612,6 +627,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) | |||
612 | #ifdef CONFIG_X86_PAE | 627 | #ifdef CONFIG_X86_PAE |
613 | pmd_t gpmd; | 628 | pmd_t gpmd; |
614 | #endif | 629 | #endif |
630 | |||
631 | /* Still not set up? Just map 1:1. */ | ||
632 | if (unlikely(cpu->linear_pages)) | ||
633 | return vaddr; | ||
634 | |||
615 | /* First step: get the top-level Guest page table entry. */ | 635 | /* First step: get the top-level Guest page table entry. */ |
616 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | 636 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); |
617 | /* Toplevel not present? We can't map it in. */ | 637 | /* Toplevel not present? We can't map it in. */ |
@@ -708,32 +728,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
708 | return next; | 728 | return next; |
709 | } | 729 | } |
710 | 730 | ||
711 | /*H:430 | ||
712 | * (iv) Switching page tables | ||
713 | * | ||
714 | * Now we've seen all the page table setting and manipulation, let's see | ||
715 | * what happens when the Guest changes page tables (ie. changes the top-level | ||
716 | * pgdir). This occurs on almost every context switch. | ||
717 | */ | ||
718 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) | ||
719 | { | ||
720 | int newpgdir, repin = 0; | ||
721 | |||
722 | /* Look to see if we have this one already. */ | ||
723 | newpgdir = find_pgdir(cpu->lg, pgtable); | ||
724 | /* | ||
725 | * If not, we allocate or mug an existing one: if it's a fresh one, | ||
726 | * repin gets set to 1. | ||
727 | */ | ||
728 | if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) | ||
729 | newpgdir = new_pgdir(cpu, pgtable, &repin); | ||
730 | /* Change the current pgd index to the new one. */ | ||
731 | cpu->cpu_pgd = newpgdir; | ||
732 | /* If it was completely blank, we map in the Guest kernel stack */ | ||
733 | if (repin) | ||
734 | pin_stack_pages(cpu); | ||
735 | } | ||
736 | |||
737 | /*H:470 | 731 | /*H:470 |
738 | * Finally, a routine which throws away everything: all PGD entries in all | 732 | * Finally, a routine which throws away everything: all PGD entries in all |
739 | * the shadow page tables, including the Guest's kernel mappings. This is used | 733 | * the shadow page tables, including the Guest's kernel mappings. This is used |
@@ -780,6 +774,44 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) | |||
780 | /* We need the Guest kernel stack mapped again. */ | 774 | /* We need the Guest kernel stack mapped again. */ |
781 | pin_stack_pages(cpu); | 775 | pin_stack_pages(cpu); |
782 | } | 776 | } |
777 | |||
778 | /*H:430 | ||
779 | * (iv) Switching page tables | ||
780 | * | ||
781 | * Now we've seen all the page table setting and manipulation, let's see | ||
782 | * what happens when the Guest changes page tables (ie. changes the top-level | ||
783 | * pgdir). This occurs on almost every context switch. | ||
784 | */ | ||
785 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) | ||
786 | { | ||
787 | int newpgdir, repin = 0; | ||
788 | |||
789 | /* | ||
790 | * The very first time they call this, we're actually running without | ||
791 | * any page tables; we've been making it up. Throw them away now. | ||
792 | */ | ||
793 | if (unlikely(cpu->linear_pages)) { | ||
794 | release_all_pagetables(cpu->lg); | ||
795 | cpu->linear_pages = false; | ||
796 | /* Force allocation of a new pgdir. */ | ||
797 | newpgdir = ARRAY_SIZE(cpu->lg->pgdirs); | ||
798 | } else { | ||
799 | /* Look to see if we have this one already. */ | ||
800 | newpgdir = find_pgdir(cpu->lg, pgtable); | ||
801 | } | ||
802 | |||
803 | /* | ||
804 | * If not, we allocate or mug an existing one: if it's a fresh one, | ||
805 | * repin gets set to 1. | ||
806 | */ | ||
807 | if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) | ||
808 | newpgdir = new_pgdir(cpu, pgtable, &repin); | ||
809 | /* Change the current pgd index to the new one. */ | ||
810 | cpu->cpu_pgd = newpgdir; | ||
811 | /* If it was completely blank, we map in the Guest kernel stack */ | ||
812 | if (repin) | ||
813 | pin_stack_pages(cpu); | ||
814 | } | ||
783 | /*:*/ | 815 | /*:*/ |
784 | 816 | ||
785 | /*M:009 | 817 | /*M:009 |
@@ -919,168 +951,26 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) | |||
919 | } | 951 | } |
920 | #endif | 952 | #endif |
921 | 953 | ||
922 | /*H:505 | ||
923 | * To get through boot, we construct simple identity page mappings (which | ||
924 | * set virtual == physical) and linear mappings which will get the Guest far | ||
925 | * enough into the boot to create its own. The linear mapping means we | ||
926 | * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, | ||
927 | * as you'll see. | ||
928 | * | ||
929 | * We lay them out of the way, just below the initrd (which is why we need to | ||
930 | * know its size here). | ||
931 | */ | ||
932 | static unsigned long setup_pagetables(struct lguest *lg, | ||
933 | unsigned long mem, | ||
934 | unsigned long initrd_size) | ||
935 | { | ||
936 | pgd_t __user *pgdir; | ||
937 | pte_t __user *linear; | ||
938 | unsigned long mem_base = (unsigned long)lg->mem_base; | ||
939 | unsigned int mapped_pages, i, linear_pages; | ||
940 | #ifdef CONFIG_X86_PAE | ||
941 | pmd_t __user *pmds; | ||
942 | unsigned int j; | ||
943 | pgd_t pgd; | ||
944 | pmd_t pmd; | ||
945 | #else | ||
946 | unsigned int phys_linear; | ||
947 | #endif | ||
948 | |||
949 | /* | ||
950 | * We have mapped_pages frames to map, so we need linear_pages page | ||
951 | * tables to map them. | ||
952 | */ | ||
953 | mapped_pages = mem / PAGE_SIZE; | ||
954 | linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; | ||
955 | |||
956 | /* We put the toplevel page directory page at the top of memory. */ | ||
957 | pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE); | ||
958 | |||
959 | /* Now we use the next linear_pages pages as pte pages */ | ||
960 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; | ||
961 | |||
962 | #ifdef CONFIG_X86_PAE | ||
963 | /* | ||
964 | * And the single mid page goes below that. We only use one, but | ||
965 | * that's enough to map 1G, which definitely gets us through boot. | ||
966 | */ | ||
967 | pmds = (void *)linear - PAGE_SIZE; | ||
968 | #endif | ||
969 | /* | ||
970 | * Linear mapping is easy: put every page's address into the | ||
971 | * mapping in order. | ||
972 | */ | ||
973 | for (i = 0; i < mapped_pages; i++) { | ||
974 | pte_t pte; | ||
975 | pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); | ||
976 | if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0) | ||
977 | return -EFAULT; | ||
978 | } | ||
979 | |||
980 | #ifdef CONFIG_X86_PAE | ||
981 | /* | ||
982 | * Make the Guest PMD entries point to the corresponding place in the | ||
983 | * linear mapping (up to one page worth of PMD). | ||
984 | */ | ||
985 | for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; | ||
986 | i += PTRS_PER_PTE, j++) { | ||
987 | pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE, | ||
988 | __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | ||
989 | |||
990 | if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) | ||
991 | return -EFAULT; | ||
992 | } | ||
993 | |||
994 | /* One PGD entry, pointing to that PMD page. */ | ||
995 | pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT); | ||
996 | /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ | ||
997 | if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) | ||
998 | return -EFAULT; | ||
999 | /* | ||
1000 | * And the other PGD entry to make the linear mapping at PAGE_OFFSET | ||
1001 | */ | ||
1002 | if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd))) | ||
1003 | return -EFAULT; | ||
1004 | #else | ||
1005 | /* | ||
1006 | * The top level points to the linear page table pages above. | ||
1007 | * We setup the identity and linear mappings here. | ||
1008 | */ | ||
1009 | phys_linear = (unsigned long)linear - mem_base; | ||
1010 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { | ||
1011 | pgd_t pgd; | ||
1012 | /* | ||
1013 | * Create a PGD entry which points to the right part of the | ||
1014 | * linear PTE pages. | ||
1015 | */ | ||
1016 | pgd = __pgd((phys_linear + i * sizeof(pte_t)) | | ||
1017 | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | ||
1018 | |||
1019 | /* | ||
1020 | * Copy it into the PGD page at 0 and PAGE_OFFSET. | ||
1021 | */ | ||
1022 | if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) | ||
1023 | || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) | ||
1024 | + i / PTRS_PER_PTE], | ||
1025 | &pgd, sizeof(pgd))) | ||
1026 | return -EFAULT; | ||
1027 | } | ||
1028 | #endif | ||
1029 | |||
1030 | /* | ||
1031 | * We return the top level (guest-physical) address: we remember where | ||
1032 | * this is to write it into lguest_data when the Guest initializes. | ||
1033 | */ | ||
1034 | return (unsigned long)pgdir - mem_base; | ||
1035 | } | ||
1036 | |||
1037 | /*H:500 | 954 | /*H:500 |
1038 | * (vii) Setting up the page tables initially. | 955 | * (vii) Setting up the page tables initially. |
1039 | * | 956 | * |
1040 | * When a Guest is first created, the Launcher tells us where the toplevel of | 957 | * When a Guest is first created, we initialize a shadow page table which |
1041 | * its first page table is. We set some things up here: | 958 | * we will populate on future faults. The Guest doesn't have any actual |
959 | * pagetables yet, so we set linear_pages to tell demand_page() to fake it | ||
960 | * for the moment. | ||
1042 | */ | 961 | */ |
1043 | int init_guest_pagetable(struct lguest *lg) | 962 | int init_guest_pagetable(struct lguest *lg) |
1044 | { | 963 | { |
1045 | u64 mem; | 964 | struct lg_cpu *cpu = &lg->cpus[0]; |
1046 | u32 initrd_size; | 965 | int allocated = 0; |
1047 | struct boot_params __user *boot = (struct boot_params *)lg->mem_base; | ||
1048 | #ifdef CONFIG_X86_PAE | ||
1049 | pgd_t *pgd; | ||
1050 | pmd_t *pmd_table; | ||
1051 | #endif | ||
1052 | /* | ||
1053 | * Get the Guest memory size and the ramdisk size from the boot header | ||
1054 | * located at lg->mem_base (Guest address 0). | ||
1055 | */ | ||
1056 | if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) | ||
1057 | || get_user(initrd_size, &boot->hdr.ramdisk_size)) | ||
1058 | return -EFAULT; | ||
1059 | 966 | ||
1060 | /* | 967 | /* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */ |
1061 | * We start on the first shadow page table, and give it a blank PGD | 968 | cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated); |
1062 | * page. | 969 | if (!allocated) |
1063 | */ | ||
1064 | lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); | ||
1065 | if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) | ||
1066 | return lg->pgdirs[0].gpgdir; | ||
1067 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); | ||
1068 | if (!lg->pgdirs[0].pgdir) | ||
1069 | return -ENOMEM; | 970 | return -ENOMEM; |
1070 | 971 | ||
1071 | #ifdef CONFIG_X86_PAE | 972 | /* We start with a linear mapping until the first page table switch. */ |
1072 | /* For PAE, we also create the initial mid-level. */ | 973 | cpu->linear_pages = true; |
1073 | pgd = lg->pgdirs[0].pgdir; | ||
1074 | pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); | ||
1075 | if (!pmd_table) | ||
1076 | return -ENOMEM; | ||
1077 | |||
1078 | set_pgd(pgd + SWITCHER_PGD_INDEX, | ||
1079 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
1080 | #endif | ||
1081 | |||
1082 | /* This is the current page table. */ | ||
1083 | lg->cpus[0].cpu_pgd = 0; | ||
1084 | return 0; | 974 | return 0; |
1085 | } | 975 | } |
1086 | 976 | ||
@@ -1095,10 +985,10 @@ void page_table_guest_data_init(struct lg_cpu *cpu) | |||
1095 | * of virtual addresses used by the Switcher. | 985 | * of virtual addresses used by the Switcher. |
1096 | */ | 986 | */ |
1097 | || put_user(RESERVE_MEM * 1024 * 1024, | 987 | || put_user(RESERVE_MEM * 1024 * 1024, |
1098 | &cpu->lg->lguest_data->reserve_mem) | 988 | &cpu->lg->lguest_data->reserve_mem)) { |
1099 | || put_user(cpu->lg->pgdirs[0].gpgdir, | ||
1100 | &cpu->lg->lguest_data->pgdir)) | ||
1101 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); | 989 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); |
990 | return; | ||
991 | } | ||
1102 | 992 | ||
1103 | /* | 993 | /* |
1104 | * In flush_user_mappings() we loop from 0 to | 994 | * In flush_user_mappings() we loop from 0 to |
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 9f1659c3d1f3..65af42f2d593 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
@@ -269,10 +269,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) | |||
269 | static int emulate_insn(struct lg_cpu *cpu) | 269 | static int emulate_insn(struct lg_cpu *cpu) |
270 | { | 270 | { |
271 | u8 insn; | 271 | u8 insn; |
272 | unsigned int insnlen = 0, in = 0, shift = 0; | 272 | unsigned int insnlen = 0, in = 0, small_operand = 0; |
273 | /* | 273 | /* |
274 | * The eip contains the *virtual* address of the Guest's instruction: | 274 | * The eip contains the *virtual* address of the Guest's instruction: |
275 | * guest_pa just subtracts the Guest's page_offset. | 275 | * walk the Guest's page tables to find the "physical" address. |
276 | */ | 276 | */ |
277 | unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); | 277 | unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); |
278 | 278 | ||
@@ -300,11 +300,10 @@ static int emulate_insn(struct lg_cpu *cpu) | |||
300 | } | 300 | } |
301 | 301 | ||
302 | /* | 302 | /* |
303 | * 0x66 is an "operand prefix". It means it's using the upper 16 bits | 303 | * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. |
304 | * of the eax register. | ||
305 | */ | 304 | */ |
306 | if (insn == 0x66) { | 305 | if (insn == 0x66) { |
307 | shift = 16; | 306 | small_operand = 1; |
308 | /* The instruction is 1 byte so far, read the next byte. */ | 307 | /* The instruction is 1 byte so far, read the next byte. */ |
309 | insnlen = 1; | 308 | insnlen = 1; |
310 | insn = lgread(cpu, physaddr + insnlen, u8); | 309 | insn = lgread(cpu, physaddr + insnlen, u8); |
@@ -340,11 +339,14 @@ static int emulate_insn(struct lg_cpu *cpu) | |||
340 | * traditionally means "there's nothing there". | 339 | * traditionally means "there's nothing there". |
341 | */ | 340 | */ |
342 | if (in) { | 341 | if (in) { |
343 | /* Lower bit tells is whether it's a 16 or 32 bit access */ | 342 | /* Lower bit set means it's a 32/16 bit access */ |
344 | if (insn & 0x1) | 343 | if (insn & 0x1) { |
345 | cpu->regs->eax = 0xFFFFFFFF; | 344 | if (small_operand) |
346 | else | 345 | cpu->regs->eax |= 0xFFFF; |
347 | cpu->regs->eax |= (0xFFFF << shift); | 346 | else |
347 | cpu->regs->eax = 0xFFFFFFFF; | ||
348 | } else | ||
349 | cpu->regs->eax |= 0xFF; | ||
348 | } | 350 | } |
349 | /* Finally, we've "done" the instruction, so move past it. */ | 351 | /* Finally, we've "done" the instruction, so move past it. */ |
350 | cpu->regs->eip += insnlen; | 352 | cpu->regs->eip += insnlen; |
@@ -352,69 +354,6 @@ static int emulate_insn(struct lg_cpu *cpu) | |||
352 | return 1; | 354 | return 1; |
353 | } | 355 | } |
354 | 356 | ||
355 | /* | ||
356 | * Our hypercalls mechanism used to be based on direct software interrupts. | ||
357 | * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to | ||
358 | * change over to using kvm hypercalls. | ||
359 | * | ||
360 | * KVM_HYPERCALL is actually a "vmcall" instruction, which generates an invalid | ||
361 | * opcode fault (fault 6) on non-VT cpus, so the easiest solution seemed to be | ||
362 | * an *emulation approach*: if the fault was really produced by an hypercall | ||
363 | * (is_hypercall() does exactly this check), we can just call the corresponding | ||
364 | * hypercall host implementation function. | ||
365 | * | ||
366 | * But these invalid opcode faults are notably slower than software interrupts. | ||
367 | * So we implemented the *patching (or rewriting) approach*: every time we hit | ||
368 | * the KVM_HYPERCALL opcode in Guest code, we patch it to the old "int 0x1f" | ||
369 | * opcode, so next time the Guest calls this hypercall it will use the | ||
370 | * faster trap mechanism. | ||
371 | * | ||
372 | * Matias even benchmarked it to convince you: this shows the average cycle | ||
373 | * cost of a hypercall. For each alternative solution mentioned above we've | ||
374 | * made 5 runs of the benchmark: | ||
375 | * | ||
376 | * 1) direct software interrupt: 2915, 2789, 2764, 2721, 2898 | ||
377 | * 2) emulation technique: 3410, 3681, 3466, 3392, 3780 | ||
378 | * 3) patching (rewrite) technique: 2977, 2975, 2891, 2637, 2884 | ||
379 | * | ||
380 | * One two-line function is worth a 20% hypercall speed boost! | ||
381 | */ | ||
382 | static void rewrite_hypercall(struct lg_cpu *cpu) | ||
383 | { | ||
384 | /* | ||
385 | * This are the opcodes we use to patch the Guest. The opcode for "int | ||
386 | * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we | ||
387 | * complete the sequence with a NOP (0x90). | ||
388 | */ | ||
389 | u8 insn[3] = {0xcd, 0x1f, 0x90}; | ||
390 | |||
391 | __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); | ||
392 | /* | ||
393 | * The above write might have caused a copy of that page to be made | ||
394 | * (if it was read-only). We need to make sure the Guest has | ||
395 | * up-to-date pagetables. As this doesn't happen often, we can just | ||
396 | * drop them all. | ||
397 | */ | ||
398 | guest_pagetable_clear_all(cpu); | ||
399 | } | ||
400 | |||
401 | static bool is_hypercall(struct lg_cpu *cpu) | ||
402 | { | ||
403 | u8 insn[3]; | ||
404 | |||
405 | /* | ||
406 | * This must be the Guest kernel trying to do something. | ||
407 | * The bottom two bits of the CS segment register are the privilege | ||
408 | * level. | ||
409 | */ | ||
410 | if ((cpu->regs->cs & 3) != GUEST_PL) | ||
411 | return false; | ||
412 | |||
413 | /* Is it a vmcall? */ | ||
414 | __lgread(cpu, insn, guest_pa(cpu, cpu->regs->eip), sizeof(insn)); | ||
415 | return insn[0] == 0x0f && insn[1] == 0x01 && insn[2] == 0xc1; | ||
416 | } | ||
417 | |||
418 | /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ | 357 | /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ |
419 | void lguest_arch_handle_trap(struct lg_cpu *cpu) | 358 | void lguest_arch_handle_trap(struct lg_cpu *cpu) |
420 | { | 359 | { |
@@ -429,20 +368,6 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) | |||
429 | if (emulate_insn(cpu)) | 368 | if (emulate_insn(cpu)) |
430 | return; | 369 | return; |
431 | } | 370 | } |
432 | /* | ||
433 | * If KVM is active, the vmcall instruction triggers a General | ||
434 | * Protection Fault. Normally it triggers an invalid opcode | ||
435 | * fault (6): | ||
436 | */ | ||
437 | case 6: | ||
438 | /* | ||
439 | * We need to check if ring == GUEST_PL and faulting | ||
440 | * instruction == vmcall. | ||
441 | */ | ||
442 | if (is_hypercall(cpu)) { | ||
443 | rewrite_hypercall(cpu); | ||
444 | return; | ||
445 | } | ||
446 | break; | 371 | break; |
447 | case 14: /* We've intercepted a Page Fault. */ | 372 | case 14: /* We've intercepted a Page Fault. */ |
448 | /* | 373 | /* |
@@ -486,7 +411,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) | |||
486 | * These values mean a real interrupt occurred, in which case | 411 | * These values mean a real interrupt occurred, in which case |
487 | * the Host handler has already been run. We just do a | 412 | * the Host handler has already been run. We just do a |
488 | * friendly check if another process should now be run, then | 413 | * friendly check if another process should now be run, then |
489 | * return to run the Guest again | 414 | * return to run the Guest again. |
490 | */ | 415 | */ |
491 | cond_resched(); | 416 | cond_resched(); |
492 | return; | 417 | return; |
@@ -536,7 +461,7 @@ void __init lguest_arch_host_init(void) | |||
536 | int i; | 461 | int i; |
537 | 462 | ||
538 | /* | 463 | /* |
539 | * Most of the i386/switcher.S doesn't care that it's been moved; on | 464 | * Most of the x86/switcher_32.S doesn't care that it's been moved; on |
540 | * Intel, jumps are relative, and it doesn't access any references to | 465 | * Intel, jumps are relative, and it doesn't access any references to |
541 | * external code or data. | 466 | * external code or data. |
542 | * | 467 | * |
@@ -664,7 +589,7 @@ void __init lguest_arch_host_init(void) | |||
664 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); | 589 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); |
665 | } | 590 | } |
666 | put_online_cpus(); | 591 | put_online_cpus(); |
667 | }; | 592 | } |
668 | /*:*/ | 593 | /*:*/ |
669 | 594 | ||
670 | void __exit lguest_arch_host_fini(void) | 595 | void __exit lguest_arch_host_fini(void) |
@@ -747,8 +672,6 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) | |||
747 | /*:*/ | 672 | /*:*/ |
748 | 673 | ||
749 | /*L:030 | 674 | /*L:030 |
750 | * lguest_arch_setup_regs() | ||
751 | * | ||
752 | * Most of the Guest's registers are left alone: we used get_zeroed_page() to | 675 | * Most of the Guest's registers are left alone: we used get_zeroed_page() to |
753 | * allocate the structure, so they will be 0. | 676 | * allocate the structure, so they will be 0. |
754 | */ | 677 | */ |
diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 2fb1dcbcb5aa..9962c6bb1311 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h | |||
@@ -59,8 +59,6 @@ struct lguest_data { | |||
59 | unsigned long reserve_mem; | 59 | unsigned long reserve_mem; |
60 | /* KHz for the TSC clock. */ | 60 | /* KHz for the TSC clock. */ |
61 | u32 tsc_khz; | 61 | u32 tsc_khz; |
62 | /* Page where the top-level pagetable is */ | ||
63 | unsigned long pgdir; | ||
64 | 62 | ||
65 | /* Fields initialized by the Guest at boot: */ | 63 | /* Fields initialized by the Guest at boot: */ |
66 | /* Instruction range to suppress interrupts even if enabled */ | 64 | /* Instruction range to suppress interrupts even if enabled */ |