diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-03-27 21:52:43 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-03-27 21:52:43 -0400 |
commit | 8536bbaff44addff8d2ac66da1156c95b1e00c4e (patch) | |
tree | 56b11bc2107e6683b40d9eceb47a3242348ec43e | |
parent | 7529963cb9c5db6821f0a00fc8426ebed79fc2e0 (diff) | |
parent | a6bd8e13034dd7d60b6f14217096efa192d0adc1 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus:
lguest: comment documentation update.
lguest: Don't need comment terminator before disk section.
lguest: lguest.txt documentation fix
lguest: Add puppies which where previously missing.
virtio_pci: unregister virtio device at device remove
-rw-r--r-- | Documentation/lguest/lguest.c | 70 | ||||
-rw-r--r-- | Documentation/lguest/lguest.txt | 19 | ||||
-rw-r--r-- | arch/x86/lguest/boot.c | 108 | ||||
-rw-r--r-- | arch/x86/lguest/i386_head.S | 15 | ||||
-rw-r--r-- | drivers/lguest/Makefile | 8 | ||||
-rw-r--r-- | drivers/lguest/core.c | 18 | ||||
-rw-r--r-- | drivers/lguest/hypercalls.c | 11 | ||||
-rw-r--r-- | drivers/lguest/interrupts_and_traps.c | 7 | ||||
-rw-r--r-- | drivers/lguest/lguest_device.c | 11 | ||||
-rw-r--r-- | drivers/lguest/lguest_user.c | 30 | ||||
-rw-r--r-- | drivers/lguest/page_tables.c | 32 | ||||
-rw-r--r-- | drivers/lguest/x86/core.c | 33 | ||||
-rw-r--r-- | drivers/lguest/x86/switcher_32.S | 8 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci.c | 1 | ||||
-rw-r--r-- | include/asm-x86/lguest_hcall.h | 2 | ||||
-rw-r--r-- | include/linux/lguest_launcher.h | 6 |
16 files changed, 229 insertions, 150 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index bec5a32e4095..4c1fc65a8b3d 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /*P:100 This is the Launcher code, a simple program which lays out the | 1 | /*P:100 This is the Launcher code, a simple program which lays out the |
2 | * "physical" memory for the new Guest by mapping the kernel image and the | 2 | * "physical" memory for the new Guest by mapping the kernel image and |
3 | * virtual devices, then reads repeatedly from /dev/lguest to run the Guest. | 3 | * the virtual devices, then opens /dev/lguest to tell the kernel |
4 | :*/ | 4 | * about the Guest and control it. :*/ |
5 | #define _LARGEFILE64_SOURCE | 5 | #define _LARGEFILE64_SOURCE |
6 | #define _GNU_SOURCE | 6 | #define _GNU_SOURCE |
7 | #include <stdio.h> | 7 | #include <stdio.h> |
@@ -43,7 +43,7 @@ | |||
43 | #include "linux/virtio_console.h" | 43 | #include "linux/virtio_console.h" |
44 | #include "linux/virtio_ring.h" | 44 | #include "linux/virtio_ring.h" |
45 | #include "asm-x86/bootparam.h" | 45 | #include "asm-x86/bootparam.h" |
46 | /*L:110 We can ignore the 38 include files we need for this program, but I do | 46 | /*L:110 We can ignore the 39 include files we need for this program, but I do |
47 | * want to draw attention to the use of kernel-style types. | 47 | * want to draw attention to the use of kernel-style types. |
48 | * | 48 | * |
49 | * As Linus said, "C is a Spartan language, and so should your naming be." I | 49 | * As Linus said, "C is a Spartan language, and so should your naming be." I |
@@ -320,7 +320,7 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) | |||
320 | err(1, "Reading program headers"); | 320 | err(1, "Reading program headers"); |
321 | 321 | ||
322 | /* Try all the headers: there are usually only three. A read-only one, | 322 | /* Try all the headers: there are usually only three. A read-only one, |
323 | * a read-write one, and a "note" section which isn't loadable. */ | 323 | * a read-write one, and a "note" section which we don't load. */ |
324 | for (i = 0; i < ehdr->e_phnum; i++) { | 324 | for (i = 0; i < ehdr->e_phnum; i++) { |
325 | /* If this isn't a loadable segment, we ignore it */ | 325 | /* If this isn't a loadable segment, we ignore it */ |
326 | if (phdr[i].p_type != PT_LOAD) | 326 | if (phdr[i].p_type != PT_LOAD) |
@@ -387,7 +387,7 @@ static unsigned long load_kernel(int fd) | |||
387 | if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) | 387 | if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) |
388 | return map_elf(fd, &hdr); | 388 | return map_elf(fd, &hdr); |
389 | 389 | ||
390 | /* Otherwise we assume it's a bzImage, and try to unpack it */ | 390 | /* Otherwise we assume it's a bzImage, and try to load it. */ |
391 | return load_bzimage(fd); | 391 | return load_bzimage(fd); |
392 | } | 392 | } |
393 | 393 | ||
@@ -433,12 +433,12 @@ static unsigned long load_initrd(const char *name, unsigned long mem) | |||
433 | return len; | 433 | return len; |
434 | } | 434 | } |
435 | 435 | ||
436 | /* Once we know how much memory we have, we can construct simple linear page | 436 | /* Once we know how much memory we have we can construct simple linear page |
437 | * tables which set virtual == physical which will get the Guest far enough | 437 | * tables which set virtual == physical which will get the Guest far enough |
438 | * into the boot to create its own. | 438 | * into the boot to create its own. |
439 | * | 439 | * |
440 | * We lay them out of the way, just below the initrd (which is why we need to | 440 | * We lay them out of the way, just below the initrd (which is why we need to |
441 | * know its size). */ | 441 | * know its size here). */ |
442 | static unsigned long setup_pagetables(unsigned long mem, | 442 | static unsigned long setup_pagetables(unsigned long mem, |
443 | unsigned long initrd_size) | 443 | unsigned long initrd_size) |
444 | { | 444 | { |
@@ -850,7 +850,8 @@ static void handle_console_output(int fd, struct virtqueue *vq) | |||
850 | * | 850 | * |
851 | * Handling output for network is also simple: we get all the output buffers | 851 | * Handling output for network is also simple: we get all the output buffers |
852 | * and write them (ignoring the first element) to this device's file descriptor | 852 | * and write them (ignoring the first element) to this device's file descriptor |
853 | * (stdout). */ | 853 | * (/dev/net/tun). |
854 | */ | ||
854 | static void handle_net_output(int fd, struct virtqueue *vq) | 855 | static void handle_net_output(int fd, struct virtqueue *vq) |
855 | { | 856 | { |
856 | unsigned int head, out, in; | 857 | unsigned int head, out, in; |
@@ -924,7 +925,7 @@ static void enable_fd(int fd, struct virtqueue *vq) | |||
924 | write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); | 925 | write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); |
925 | } | 926 | } |
926 | 927 | ||
927 | /* Resetting a device is fairly easy. */ | 928 | /* When the Guest asks us to reset a device, it's is fairly easy. */ |
928 | static void reset_device(struct device *dev) | 929 | static void reset_device(struct device *dev) |
929 | { | 930 | { |
930 | struct virtqueue *vq; | 931 | struct virtqueue *vq; |
@@ -1003,8 +1004,8 @@ static void handle_input(int fd) | |||
1003 | if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) | 1004 | if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) |
1004 | break; | 1005 | break; |
1005 | 1006 | ||
1006 | /* Otherwise, call the device(s) which have readable | 1007 | /* Otherwise, call the device(s) which have readable file |
1007 | * file descriptors and a method of handling them. */ | 1008 | * descriptors and a method of handling them. */ |
1008 | for (i = devices.dev; i; i = i->next) { | 1009 | for (i = devices.dev; i; i = i->next) { |
1009 | if (i->handle_input && FD_ISSET(i->fd, &fds)) { | 1010 | if (i->handle_input && FD_ISSET(i->fd, &fds)) { |
1010 | int dev_fd; | 1011 | int dev_fd; |
@@ -1015,8 +1016,7 @@ static void handle_input(int fd) | |||
1015 | * should no longer service it. Networking and | 1016 | * should no longer service it. Networking and |
1016 | * console do this when there's no input | 1017 | * console do this when there's no input |
1017 | * buffers to deliver into. Console also uses | 1018 | * buffers to deliver into. Console also uses |
1018 | * it when it discovers that stdin is | 1019 | * it when it discovers that stdin is closed. */ |
1019 | * closed. */ | ||
1020 | FD_CLR(i->fd, &devices.infds); | 1020 | FD_CLR(i->fd, &devices.infds); |
1021 | /* Tell waker to ignore it too, by sending a | 1021 | /* Tell waker to ignore it too, by sending a |
1022 | * negative fd number (-1, since 0 is a valid | 1022 | * negative fd number (-1, since 0 is a valid |
@@ -1033,7 +1033,8 @@ static void handle_input(int fd) | |||
1033 | * | 1033 | * |
1034 | * All devices need a descriptor so the Guest knows it exists, and a "struct | 1034 | * All devices need a descriptor so the Guest knows it exists, and a "struct |
1035 | * device" so the Launcher can keep track of it. We have common helper | 1035 | * device" so the Launcher can keep track of it. We have common helper |
1036 | * routines to allocate and manage them. */ | 1036 | * routines to allocate and manage them. |
1037 | */ | ||
1037 | 1038 | ||
1038 | /* The layout of the device page is a "struct lguest_device_desc" followed by a | 1039 | /* The layout of the device page is a "struct lguest_device_desc" followed by a |
1039 | * number of virtqueue descriptors, then two sets of feature bits, then an | 1040 | * number of virtqueue descriptors, then two sets of feature bits, then an |
@@ -1078,7 +1079,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
1078 | struct virtqueue **i, *vq = malloc(sizeof(*vq)); | 1079 | struct virtqueue **i, *vq = malloc(sizeof(*vq)); |
1079 | void *p; | 1080 | void *p; |
1080 | 1081 | ||
1081 | /* First we need some pages for this virtqueue. */ | 1082 | /* First we need some memory for this virtqueue. */ |
1082 | pages = (vring_size(num_descs, getpagesize()) + getpagesize() - 1) | 1083 | pages = (vring_size(num_descs, getpagesize()) + getpagesize() - 1) |
1083 | / getpagesize(); | 1084 | / getpagesize(); |
1084 | p = get_pages(pages); | 1085 | p = get_pages(pages); |
@@ -1122,7 +1123,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
1122 | } | 1123 | } |
1123 | 1124 | ||
1124 | /* The first half of the feature bitmask is for us to advertise features. The | 1125 | /* The first half of the feature bitmask is for us to advertise features. The |
1125 | * second half if for the Guest to accept features. */ | 1126 | * second half is for the Guest to accept features. */ |
1126 | static void add_feature(struct device *dev, unsigned bit) | 1127 | static void add_feature(struct device *dev, unsigned bit) |
1127 | { | 1128 | { |
1128 | u8 *features = get_feature_bits(dev); | 1129 | u8 *features = get_feature_bits(dev); |
@@ -1151,7 +1152,9 @@ static void set_config(struct device *dev, unsigned len, const void *conf) | |||
1151 | } | 1152 | } |
1152 | 1153 | ||
1153 | /* This routine does all the creation and setup of a new device, including | 1154 | /* This routine does all the creation and setup of a new device, including |
1154 | * calling new_dev_desc() to allocate the descriptor and device memory. */ | 1155 | * calling new_dev_desc() to allocate the descriptor and device memory. |
1156 | * | ||
1157 | * See what I mean about userspace being boring? */ | ||
1155 | static struct device *new_device(const char *name, u16 type, int fd, | 1158 | static struct device *new_device(const char *name, u16 type, int fd, |
1156 | bool (*handle_input)(int, struct device *)) | 1159 | bool (*handle_input)(int, struct device *)) |
1157 | { | 1160 | { |
@@ -1383,7 +1386,6 @@ struct vblk_info | |||
1383 | * Launcher triggers interrupt to Guest. */ | 1386 | * Launcher triggers interrupt to Guest. */ |
1384 | int done_fd; | 1387 | int done_fd; |
1385 | }; | 1388 | }; |
1386 | /*:*/ | ||
1387 | 1389 | ||
1388 | /*L:210 | 1390 | /*L:210 |
1389 | * The Disk | 1391 | * The Disk |
@@ -1493,7 +1495,10 @@ static int io_thread(void *_dev) | |||
1493 | while (read(vblk->workpipe[0], &c, 1) == 1) { | 1495 | while (read(vblk->workpipe[0], &c, 1) == 1) { |
1494 | /* We acknowledge each request immediately to reduce latency, | 1496 | /* We acknowledge each request immediately to reduce latency, |
1495 | * rather than waiting until we've done them all. I haven't | 1497 | * rather than waiting until we've done them all. I haven't |
1496 | * measured to see if it makes any difference. */ | 1498 | * measured to see if it makes any difference. |
1499 | * | ||
1500 | * That would be an interesting test, wouldn't it? You could | ||
1501 | * also try having more than one I/O thread. */ | ||
1497 | while (service_io(dev)) | 1502 | while (service_io(dev)) |
1498 | write(vblk->done_fd, &c, 1); | 1503 | write(vblk->done_fd, &c, 1); |
1499 | } | 1504 | } |
@@ -1501,7 +1506,7 @@ static int io_thread(void *_dev) | |||
1501 | } | 1506 | } |
1502 | 1507 | ||
1503 | /* Now we've seen the I/O thread, we return to the Launcher to see what happens | 1508 | /* Now we've seen the I/O thread, we return to the Launcher to see what happens |
1504 | * when the thread tells us it's completed some I/O. */ | 1509 | * when that thread tells us it's completed some I/O. */ |
1505 | static bool handle_io_finish(int fd, struct device *dev) | 1510 | static bool handle_io_finish(int fd, struct device *dev) |
1506 | { | 1511 | { |
1507 | char c; | 1512 | char c; |
@@ -1573,11 +1578,12 @@ static void setup_block_file(const char *filename) | |||
1573 | * more work. */ | 1578 | * more work. */ |
1574 | pipe(vblk->workpipe); | 1579 | pipe(vblk->workpipe); |
1575 | 1580 | ||
1576 | /* Create stack for thread and run it */ | 1581 | /* Create stack for thread and run it. Since stack grows upwards, we |
1582 | * point the stack pointer to the end of this region. */ | ||
1577 | stack = malloc(32768); | 1583 | stack = malloc(32768); |
1578 | /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from | 1584 | /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from |
1579 | * becoming a zombie. */ | 1585 | * becoming a zombie. */ |
1580 | if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) | 1586 | if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) |
1581 | err(1, "Creating clone"); | 1587 | err(1, "Creating clone"); |
1582 | 1588 | ||
1583 | /* We don't need to keep the I/O thread's end of the pipes open. */ | 1589 | /* We don't need to keep the I/O thread's end of the pipes open. */ |
@@ -1587,14 +1593,14 @@ static void setup_block_file(const char *filename) | |||
1587 | verbose("device %u: virtblock %llu sectors\n", | 1593 | verbose("device %u: virtblock %llu sectors\n", |
1588 | devices.device_num, le64_to_cpu(conf.capacity)); | 1594 | devices.device_num, le64_to_cpu(conf.capacity)); |
1589 | } | 1595 | } |
1590 | /* That's the end of device setup. :*/ | 1596 | /* That's the end of device setup. */ |
1591 | 1597 | ||
1592 | /* Reboot */ | 1598 | /*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ |
1593 | static void __attribute__((noreturn)) restart_guest(void) | 1599 | static void __attribute__((noreturn)) restart_guest(void) |
1594 | { | 1600 | { |
1595 | unsigned int i; | 1601 | unsigned int i; |
1596 | 1602 | ||
1597 | /* Closing pipes causes the waker thread and io_threads to die, and | 1603 | /* Closing pipes causes the Waker thread and io_threads to die, and |
1598 | * closing /dev/lguest cleans up the Guest. Since we don't track all | 1604 | * closing /dev/lguest cleans up the Guest. Since we don't track all |
1599 | * open fds, we simply close everything beyond stderr. */ | 1605 | * open fds, we simply close everything beyond stderr. */ |
1600 | for (i = 3; i < FD_SETSIZE; i++) | 1606 | for (i = 3; i < FD_SETSIZE; i++) |
@@ -1603,7 +1609,7 @@ static void __attribute__((noreturn)) restart_guest(void) | |||
1603 | err(1, "Could not exec %s", main_args[0]); | 1609 | err(1, "Could not exec %s", main_args[0]); |
1604 | } | 1610 | } |
1605 | 1611 | ||
1606 | /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves | 1612 | /*L:220 Finally we reach the core of the Launcher which runs the Guest, serves |
1607 | * its input and output, and finally, lays it to rest. */ | 1613 | * its input and output, and finally, lays it to rest. */ |
1608 | static void __attribute__((noreturn)) run_guest(int lguest_fd) | 1614 | static void __attribute__((noreturn)) run_guest(int lguest_fd) |
1609 | { | 1615 | { |
@@ -1644,7 +1650,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) | |||
1644 | err(1, "Resetting break"); | 1650 | err(1, "Resetting break"); |
1645 | } | 1651 | } |
1646 | } | 1652 | } |
1647 | /* | 1653 | /*L:240 |
1648 | * This is the end of the Launcher. The good news: we are over halfway | 1654 | * This is the end of the Launcher. The good news: we are over halfway |
1649 | * through! The bad news: the most fiendish part of the code still lies ahead | 1655 | * through! The bad news: the most fiendish part of the code still lies ahead |
1650 | * of us. | 1656 | * of us. |
@@ -1691,8 +1697,8 @@ int main(int argc, char *argv[]) | |||
1691 | * device receive input from a file descriptor, we keep an fdset | 1697 | * device receive input from a file descriptor, we keep an fdset |
1692 | * (infds) and the maximum fd number (max_infd) with the head of the | 1698 | * (infds) and the maximum fd number (max_infd) with the head of the |
1693 | * list. We also keep a pointer to the last device. Finally, we keep | 1699 | * list. We also keep a pointer to the last device. Finally, we keep |
1694 | * the next interrupt number to hand out (1: remember that 0 is used by | 1700 | * the next interrupt number to use for devices (1: remember that 0 is |
1695 | * the timer). */ | 1701 | * used by the timer). */ |
1696 | FD_ZERO(&devices.infds); | 1702 | FD_ZERO(&devices.infds); |
1697 | devices.max_infd = -1; | 1703 | devices.max_infd = -1; |
1698 | devices.lastdev = NULL; | 1704 | devices.lastdev = NULL; |
@@ -1793,8 +1799,8 @@ int main(int argc, char *argv[]) | |||
1793 | lguest_fd = tell_kernel(pgdir, start); | 1799 | lguest_fd = tell_kernel(pgdir, start); |
1794 | 1800 | ||
1795 | /* We fork off a child process, which wakes the Launcher whenever one | 1801 | /* We fork off a child process, which wakes the Launcher whenever one |
1796 | * of the input file descriptors needs attention. Otherwise we would | 1802 | * of the input file descriptors needs attention. We call this the |
1797 | * run the Guest until it tries to output something. */ | 1803 | * Waker, and we'll cover it in a moment. */ |
1798 | waker_fd = setup_waker(lguest_fd); | 1804 | waker_fd = setup_waker(lguest_fd); |
1799 | 1805 | ||
1800 | /* Finally, run the Guest. This doesn't return. */ | 1806 | /* Finally, run the Guest. This doesn't return. */ |
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt index 722d4e7fbebe..29510dc51510 100644 --- a/Documentation/lguest/lguest.txt +++ b/Documentation/lguest/lguest.txt | |||
@@ -1,6 +1,7 @@ | |||
1 | Rusty's Remarkably Unreliable Guide to Lguest | 1 | __ |
2 | - or, A Young Coder's Illustrated Hypervisor | 2 | (___()'`; Rusty's Remarkably Unreliable Guide to Lguest |
3 | http://lguest.ozlabs.org | 3 | /, /` - or, A Young Coder's Illustrated Hypervisor |
4 | \\"--\\ http://lguest.ozlabs.org | ||
4 | 5 | ||
5 | Lguest is designed to be a minimal hypervisor for the Linux kernel, for | 6 | Lguest is designed to be a minimal hypervisor for the Linux kernel, for |
6 | Linux developers and users to experiment with virtualization with the | 7 | Linux developers and users to experiment with virtualization with the |
@@ -41,12 +42,16 @@ Running Lguest: | |||
41 | CONFIG_PHYSICAL_ALIGN=0x100000) | 42 | CONFIG_PHYSICAL_ALIGN=0x100000) |
42 | 43 | ||
43 | "Device Drivers": | 44 | "Device Drivers": |
45 | "Block devices" | ||
46 | "Virtio block driver (EXPERIMENTAL)" = M/Y | ||
44 | "Network device support" | 47 | "Network device support" |
45 | "Universal TUN/TAP device driver support" = M/Y | 48 | "Universal TUN/TAP device driver support" = M/Y |
46 | (CONFIG_TUN=m) | 49 | "Virtio network driver (EXPERIMENTAL)" = M/Y |
47 | "Virtualization" | 50 | (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m) |
48 | "Linux hypervisor example code" = M/Y | 51 | |
49 | (CONFIG_LGUEST=m) | 52 | "Virtualization" |
53 | "Linux hypervisor example code" = M/Y | ||
54 | (CONFIG_LGUEST=m) | ||
50 | 55 | ||
51 | - A tool called "lguest" is available in this directory: type "make" | 56 | - A tool called "lguest" is available in this directory: type "make" |
52 | to build it. If you didn't build your kernel in-tree, use "make | 57 | to build it. If you didn't build your kernel in-tree, use "make |
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a104c532ff70..3335b4595efd 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -10,21 +10,19 @@ | |||
10 | * (such as the example in Documentation/lguest/lguest.c) is called the | 10 | * (such as the example in Documentation/lguest/lguest.c) is called the |
11 | * Launcher. | 11 | * Launcher. |
12 | * | 12 | * |
13 | * Secondly, we only run specially modified Guests, not normal kernels. When | 13 | * Secondly, we only run specially modified Guests, not normal kernels: setting |
14 | * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets | 14 | * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows |
15 | * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows | 15 | * how to be a Guest at boot time. This means that you can use the same kernel |
16 | * how to be a Guest. This means that you can use the same kernel you boot | 16 | * you boot normally (ie. as a Host) as a Guest. |
17 | * normally (ie. as a Host) as a Guest. | ||
18 | * | 17 | * |
19 | * These Guests know that they cannot do privileged operations, such as disable | 18 | * These Guests know that they cannot do privileged operations, such as disable |
20 | * interrupts, and that they have to ask the Host to do such things explicitly. | 19 | * interrupts, and that they have to ask the Host to do such things explicitly. |
21 | * This file consists of all the replacements for such low-level native | 20 | * This file consists of all the replacements for such low-level native |
22 | * hardware operations: these special Guest versions call the Host. | 21 | * hardware operations: these special Guest versions call the Host. |
23 | * | 22 | * |
24 | * So how does the kernel know it's a Guest? The Guest starts at a special | 23 | * So how does the kernel know it's a Guest? We'll see that later, but let's |
25 | * entry point marked with a magic string, which sets up a few things then | 24 | * just say that we end up here where we replace the native functions various |
26 | * calls here. We replace the native functions various "paravirt" structures | 25 | * "paravirt" structures with our Guest versions, then boot like normal. :*/ |
27 | * with our Guest versions, then boot like normal. :*/ | ||
28 | 26 | ||
29 | /* | 27 | /* |
30 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. | 28 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. |
@@ -134,7 +132,7 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
134 | * lguest_leave_lazy_mode(). | 132 | * lguest_leave_lazy_mode(). |
135 | * | 133 | * |
136 | * So, when we're in lazy mode, we call async_hcall() to store the call for | 134 | * So, when we're in lazy mode, we call async_hcall() to store the call for |
137 | * future processing. */ | 135 | * future processing: */ |
138 | static void lazy_hcall(unsigned long call, | 136 | static void lazy_hcall(unsigned long call, |
139 | unsigned long arg1, | 137 | unsigned long arg1, |
140 | unsigned long arg2, | 138 | unsigned long arg2, |
@@ -147,7 +145,7 @@ static void lazy_hcall(unsigned long call, | |||
147 | } | 145 | } |
148 | 146 | ||
149 | /* When lazy mode is turned off reset the per-cpu lazy mode variable and then | 147 | /* When lazy mode is turned off reset the per-cpu lazy mode variable and then |
150 | * issue a hypercall to flush any stored calls. */ | 148 | * issue the do-nothing hypercall to flush any stored calls. */ |
151 | static void lguest_leave_lazy_mode(void) | 149 | static void lguest_leave_lazy_mode(void) |
152 | { | 150 | { |
153 | paravirt_leave_lazy(paravirt_get_lazy_mode()); | 151 | paravirt_leave_lazy(paravirt_get_lazy_mode()); |
@@ -164,7 +162,7 @@ static void lguest_leave_lazy_mode(void) | |||
164 | * | 162 | * |
165 | * So instead we keep an "irq_enabled" field inside our "struct lguest_data", | 163 | * So instead we keep an "irq_enabled" field inside our "struct lguest_data", |
166 | * which the Guest can update with a single instruction. The Host knows to | 164 | * which the Guest can update with a single instruction. The Host knows to |
167 | * check there when it wants to deliver an interrupt. | 165 | * check there before it tries to deliver an interrupt. |
168 | */ | 166 | */ |
169 | 167 | ||
170 | /* save_flags() is expected to return the processor state (ie. "flags"). The | 168 | /* save_flags() is expected to return the processor state (ie. "flags"). The |
@@ -196,10 +194,15 @@ static void irq_enable(void) | |||
196 | /*M:003 Note that we don't check for outstanding interrupts when we re-enable | 194 | /*M:003 Note that we don't check for outstanding interrupts when we re-enable |
197 | * them (or when we unmask an interrupt). This seems to work for the moment, | 195 | * them (or when we unmask an interrupt). This seems to work for the moment, |
198 | * since interrupts are rare and we'll just get the interrupt on the next timer | 196 | * since interrupts are rare and we'll just get the interrupt on the next timer |
199 | * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way | 197 | * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way |
200 | * would be to put the "irq_enabled" field in a page by itself, and have the | 198 | * would be to put the "irq_enabled" field in a page by itself, and have the |
201 | * Host write-protect it when an interrupt comes in when irqs are disabled. | 199 | * Host write-protect it when an interrupt comes in when irqs are disabled. |
202 | * There will then be a page fault as soon as interrupts are re-enabled. :*/ | 200 | * There will then be a page fault as soon as interrupts are re-enabled. |
201 | * | ||
202 | * A better method is to implement soft interrupt disable generally for x86: | ||
203 | * instead of disabling interrupts, we set a flag. If an interrupt does come | ||
204 | * in, we then disable them for real. This is uncommon, so we could simply use | ||
205 | * a hypercall for interrupt control and not worry about efficiency. :*/ | ||
203 | 206 | ||
204 | /*G:034 | 207 | /*G:034 |
205 | * The Interrupt Descriptor Table (IDT). | 208 | * The Interrupt Descriptor Table (IDT). |
@@ -212,6 +215,10 @@ static void irq_enable(void) | |||
212 | static void lguest_write_idt_entry(gate_desc *dt, | 215 | static void lguest_write_idt_entry(gate_desc *dt, |
213 | int entrynum, const gate_desc *g) | 216 | int entrynum, const gate_desc *g) |
214 | { | 217 | { |
218 | /* The gate_desc structure is 8 bytes long: we hand it to the Host in | ||
219 | * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors | ||
220 | * around like this; typesafety wasn't a big concern in Linux's early | ||
221 | * years. */ | ||
215 | u32 *desc = (u32 *)g; | 222 | u32 *desc = (u32 *)g; |
216 | /* Keep the local copy up to date. */ | 223 | /* Keep the local copy up to date. */ |
217 | native_write_idt_entry(dt, entrynum, g); | 224 | native_write_idt_entry(dt, entrynum, g); |
@@ -243,7 +250,8 @@ static void lguest_load_idt(const struct desc_ptr *desc) | |||
243 | * | 250 | * |
244 | * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY | 251 | * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY |
245 | * hypercall and use that repeatedly to load a new IDT. I don't think it | 252 | * hypercall and use that repeatedly to load a new IDT. I don't think it |
246 | * really matters, but wouldn't it be nice if they were the same? | 253 | * really matters, but wouldn't it be nice if they were the same? Wouldn't |
254 | * it be even better if you were the one to send the patch to fix it? | ||
247 | */ | 255 | */ |
248 | static void lguest_load_gdt(const struct desc_ptr *desc) | 256 | static void lguest_load_gdt(const struct desc_ptr *desc) |
249 | { | 257 | { |
@@ -298,9 +306,9 @@ static void lguest_load_tr_desc(void) | |||
298 | 306 | ||
299 | /* The "cpuid" instruction is a way of querying both the CPU identity | 307 | /* The "cpuid" instruction is a way of querying both the CPU identity |
300 | * (manufacturer, model, etc) and its features. It was introduced before the | 308 | * (manufacturer, model, etc) and its features. It was introduced before the |
301 | * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you | 309 | * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. |
302 | * might imagine, after a decade and a half this treatment, it is now a giant | 310 | * As you might imagine, after a decade and a half this treatment, it is now a |
303 | * ball of hair. Its entry in the current Intel manual runs to 28 pages. | 311 | * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. |
304 | * | 312 | * |
305 | * This instruction even it has its own Wikipedia entry. The Wikipedia entry | 313 | * This instruction even it has its own Wikipedia entry. The Wikipedia entry |
306 | * has been translated into 4 languages. I am not making this up! | 314 | * has been translated into 4 languages. I am not making this up! |
@@ -594,17 +602,17 @@ static unsigned long lguest_get_wallclock(void) | |||
594 | return lguest_data.time.tv_sec; | 602 | return lguest_data.time.tv_sec; |
595 | } | 603 | } |
596 | 604 | ||
597 | /* The TSC is a Time Stamp Counter. The Host tells us what speed it runs at, | 605 | /* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us |
598 | * or 0 if it's unusable as a reliable clock source. This matches what we want | 606 | * what speed it runs at, or 0 if it's unusable as a reliable clock source. |
599 | * here: if we return 0 from this function, the x86 TSC clock will not register | 607 | * This matches what we want here: if we return 0 from this function, the x86 |
600 | * itself. */ | 608 | * TSC clock will give up and not register itself. */ |
601 | static unsigned long lguest_cpu_khz(void) | 609 | static unsigned long lguest_cpu_khz(void) |
602 | { | 610 | { |
603 | return lguest_data.tsc_khz; | 611 | return lguest_data.tsc_khz; |
604 | } | 612 | } |
605 | 613 | ||
606 | /* If we can't use the TSC, the kernel falls back to our "lguest_clock", where | 614 | /* If we can't use the TSC, the kernel falls back to our lower-priority |
607 | * we read the time value given to us by the Host. */ | 615 | * "lguest_clock", where we read the time value given to us by the Host. */ |
608 | static cycle_t lguest_clock_read(void) | 616 | static cycle_t lguest_clock_read(void) |
609 | { | 617 | { |
610 | unsigned long sec, nsec; | 618 | unsigned long sec, nsec; |
@@ -648,12 +656,16 @@ static struct clocksource lguest_clock = { | |||
648 | static int lguest_clockevent_set_next_event(unsigned long delta, | 656 | static int lguest_clockevent_set_next_event(unsigned long delta, |
649 | struct clock_event_device *evt) | 657 | struct clock_event_device *evt) |
650 | { | 658 | { |
659 | /* FIXME: I don't think this can ever happen, but James tells me he had | ||
660 | * to put this code in. Maybe we should remove it now. Anyone? */ | ||
651 | if (delta < LG_CLOCK_MIN_DELTA) { | 661 | if (delta < LG_CLOCK_MIN_DELTA) { |
652 | if (printk_ratelimit()) | 662 | if (printk_ratelimit()) |
653 | printk(KERN_DEBUG "%s: small delta %lu ns\n", | 663 | printk(KERN_DEBUG "%s: small delta %lu ns\n", |
654 | __FUNCTION__, delta); | 664 | __FUNCTION__, delta); |
655 | return -ETIME; | 665 | return -ETIME; |
656 | } | 666 | } |
667 | |||
668 | /* Please wake us this far in the future. */ | ||
657 | hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0); | 669 | hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0); |
658 | return 0; | 670 | return 0; |
659 | } | 671 | } |
@@ -738,7 +750,7 @@ static void lguest_time_init(void) | |||
738 | * will not tolerate us trying to use that), the stack pointer, and the number | 750 | * will not tolerate us trying to use that), the stack pointer, and the number |
739 | * of pages in the stack. */ | 751 | * of pages in the stack. */ |
740 | static void lguest_load_sp0(struct tss_struct *tss, | 752 | static void lguest_load_sp0(struct tss_struct *tss, |
741 | struct thread_struct *thread) | 753 | struct thread_struct *thread) |
742 | { | 754 | { |
743 | lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0, | 755 | lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0, |
744 | THREAD_SIZE/PAGE_SIZE); | 756 | THREAD_SIZE/PAGE_SIZE); |
@@ -786,9 +798,8 @@ static void lguest_safe_halt(void) | |||
786 | hcall(LHCALL_HALT, 0, 0, 0); | 798 | hcall(LHCALL_HALT, 0, 0, 0); |
787 | } | 799 | } |
788 | 800 | ||
789 | /* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a | 801 | /* The SHUTDOWN hypercall takes a string to describe what's happening, and |
790 | * message out when we're crashing as well as elegant termination like powering | 802 | * an argument which says whether this to restart (reboot) the Guest or not. |
791 | * off. | ||
792 | * | 803 | * |
793 | * Note that the Host always prefers that the Guest speak in physical addresses | 804 | * Note that the Host always prefers that the Guest speak in physical addresses |
794 | * rather than virtual addresses, so we use __pa() here. */ | 805 | * rather than virtual addresses, so we use __pa() here. */ |
@@ -816,8 +827,9 @@ static struct notifier_block paniced = { | |||
816 | /* Setting up memory is fairly easy. */ | 827 | /* Setting up memory is fairly easy. */ |
817 | static __init char *lguest_memory_setup(void) | 828 | static __init char *lguest_memory_setup(void) |
818 | { | 829 | { |
819 | /* We do this here and not earlier because lockcheck barfs if we do it | 830 | /* We do this here and not earlier because lockcheck used to barf if we |
820 | * before start_kernel() */ | 831 | * did it before start_kernel(). I think we fixed that, so it'd be |
832 | * nice to move it back to lguest_init. Patch welcome... */ | ||
821 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); | 833 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); |
822 | 834 | ||
823 | /* The Linux bootloader header contains an "e820" memory map: the | 835 | /* The Linux bootloader header contains an "e820" memory map: the |
@@ -850,12 +862,19 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) | |||
850 | return len; | 862 | return len; |
851 | } | 863 | } |
852 | 864 | ||
865 | /* Rebooting also tells the Host we're finished, but the RESTART flag tells the | ||
866 | * Launcher to reboot us. */ | ||
867 | static void lguest_restart(char *reason) | ||
868 | { | ||
869 | hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); | ||
870 | } | ||
871 | |||
853 | /*G:050 | 872 | /*G:050 |
854 | * Patching (Powerfully Placating Performance Pedants) | 873 | * Patching (Powerfully Placating Performance Pedants) |
855 | * | 874 | * |
856 | * We have already seen that pv_ops structures let us replace simple | 875 | * We have already seen that pv_ops structures let us replace simple native |
857 | * native instructions with calls to the appropriate back end all throughout | 876 | * instructions with calls to the appropriate back end all throughout the |
858 | * the kernel. This allows the same kernel to run as a Guest and as a native | 877 | * kernel. This allows the same kernel to run as a Guest and as a native |
859 | * kernel, but it's slow because of all the indirect branches. | 878 | * kernel, but it's slow because of all the indirect branches. |
860 | * | 879 | * |
861 | * Remember that David Wheeler quote about "Any problem in computer science can | 880 | * Remember that David Wheeler quote about "Any problem in computer science can |
@@ -908,14 +927,9 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | |||
908 | return insn_len; | 927 | return insn_len; |
909 | } | 928 | } |
910 | 929 | ||
911 | static void lguest_restart(char *reason) | 930 | /*G:030 Once we get to lguest_init(), we know we're a Guest. The various |
912 | { | 931 | * pv_ops structures in the kernel provide points for (almost) every routine we |
913 | hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); | 932 | * have to override to avoid privileged instructions. */ |
914 | } | ||
915 | |||
916 | /*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops | ||
917 | * structures in the kernel provide points for (almost) every routine we have | ||
918 | * to override to avoid privileged instructions. */ | ||
919 | __init void lguest_init(void) | 933 | __init void lguest_init(void) |
920 | { | 934 | { |
921 | /* We're under lguest, paravirt is enabled, and we're running at | 935 | /* We're under lguest, paravirt is enabled, and we're running at |
@@ -1003,9 +1017,9 @@ __init void lguest_init(void) | |||
1003 | * the normal data segment to get through booting. */ | 1017 | * the normal data segment to get through booting. */ |
1004 | asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); | 1018 | asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); |
1005 | 1019 | ||
1006 | /* The Host uses the top of the Guest's virtual address space for the | 1020 | /* The Host<->Guest Switcher lives at the top of our address space, and |
1007 | * Host<->Guest Switcher, and it tells us how big that is in | 1021 | * the Host told us how big it is when we made LGUEST_INIT hypercall: |
1008 | * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ | 1022 | * it put the answer in lguest_data.reserve_mem */ |
1009 | reserve_top_address(lguest_data.reserve_mem); | 1023 | reserve_top_address(lguest_data.reserve_mem); |
1010 | 1024 | ||
1011 | /* If we don't initialize the lock dependency checker now, it crashes | 1025 | /* If we don't initialize the lock dependency checker now, it crashes |
@@ -1027,6 +1041,7 @@ __init void lguest_init(void) | |||
1027 | /* Math is always hard! */ | 1041 | /* Math is always hard! */ |
1028 | new_cpu_data.hard_math = 1; | 1042 | new_cpu_data.hard_math = 1; |
1029 | 1043 | ||
1044 | /* We don't have features. We have puppies! Puppies! */ | ||
1030 | #ifdef CONFIG_X86_MCE | 1045 | #ifdef CONFIG_X86_MCE |
1031 | mce_disabled = 1; | 1046 | mce_disabled = 1; |
1032 | #endif | 1047 | #endif |
@@ -1044,10 +1059,11 @@ __init void lguest_init(void) | |||
1044 | virtio_cons_early_init(early_put_chars); | 1059 | virtio_cons_early_init(early_put_chars); |
1045 | 1060 | ||
1046 | /* Last of all, we set the power management poweroff hook to point to | 1061 | /* Last of all, we set the power management poweroff hook to point to |
1047 | * the Guest routine to power off. */ | 1062 | * the Guest routine to power off, and the reboot hook to our restart |
1063 | * routine. */ | ||
1048 | pm_power_off = lguest_power_off; | 1064 | pm_power_off = lguest_power_off; |
1049 | |||
1050 | machine_ops.restart = lguest_restart; | 1065 | machine_ops.restart = lguest_restart; |
1066 | |||
1051 | /* Now we're set up, call start_kernel() in init/main.c and we proceed | 1067 | /* Now we're set up, call start_kernel() in init/main.c and we proceed |
1052 | * to boot as normal. It never returns. */ | 1068 | * to boot as normal. It never returns. */ |
1053 | start_kernel(); | 1069 | start_kernel(); |
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 95b6fbcded63..5c7cef34c9e7 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S | |||
@@ -5,13 +5,20 @@ | |||
5 | #include <asm/thread_info.h> | 5 | #include <asm/thread_info.h> |
6 | #include <asm/processor-flags.h> | 6 | #include <asm/processor-flags.h> |
7 | 7 | ||
8 | /*G:020 This is where we begin: head.S notes that the boot header's platform | 8 | /*G:020 Our story starts with the kernel booting into startup_32 in |
9 | * type field is "1" (lguest), so calls us here. | 9 | * arch/x86/kernel/head_32.S. It expects a boot header, which is created by |
10 | * the bootloader (the Launcher in our case). | ||
11 | * | ||
12 | * The startup_32 function does very little: it clears the uninitialized global | ||
13 | * C variables which we expect to be zero (ie. BSS) and then copies the boot | ||
14 | * header and kernel command line somewhere safe. Finally it checks the | ||
15 | * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen: | ||
16 | * if it's set to '1' (lguest's assigned number), then it calls us here. | ||
10 | * | 17 | * |
11 | * WARNING: be very careful here! We're running at addresses equal to physical | 18 | * WARNING: be very careful here! We're running at addresses equal to physical |
12 | * addesses (around 0), not above PAGE_OFFSET as most code expectes | 19 | * addesses (around 0), not above PAGE_OFFSET as most code expectes |
13 | * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any | 20 | * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any |
14 | * data. | 21 | * data without remembering to subtract __PAGE_OFFSET! |
15 | * | 22 | * |
16 | * The .section line puts this code in .init.text so it will be discarded after | 23 | * The .section line puts this code in .init.text so it will be discarded after |
17 | * boot. */ | 24 | * boot. */ |
@@ -24,7 +31,7 @@ ENTRY(lguest_entry) | |||
24 | int $LGUEST_TRAP_ENTRY | 31 | int $LGUEST_TRAP_ENTRY |
25 | 32 | ||
26 | /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl | 33 | /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl |
27 | * instruction uses %esi implicitly as the source for the copy we' | 34 | * instruction uses %esi implicitly as the source for the copy we're |
28 | * about to do. */ | 35 | * about to do. */ |
29 | movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi | 36 | movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi |
30 | 37 | ||
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile index 5e8272d296d8..7d463c26124f 100644 --- a/drivers/lguest/Makefile +++ b/drivers/lguest/Makefile | |||
@@ -19,3 +19,11 @@ Beer: | |||
19 | @for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}" | 19 | @for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}" |
20 | Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery: | 20 | Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery: |
21 | @sh ../../Documentation/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` | 21 | @sh ../../Documentation/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` |
22 | Puppy: | ||
23 | @clear | ||
24 | @printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n" | ||
25 | @sleep 2; clear; printf "\n\n Sit!\n\n"; sleep 1; clear | ||
26 | @printf " __ \n ()'\`; \n /\\|\` \n / | \n(/_)_|_ \n" | ||
27 | @sleep 2; clear; printf "\n\n Stand!\n\n"; sleep 1; clear | ||
28 | @printf " __ \n ()'\`; \n /\\|\` \n /._.= \n /| / \n(_\_)_ \n" | ||
29 | @sleep 2; clear; printf "\n\n Good puppy!\n\n"; sleep 1; clear | ||
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index c632c08cbbdc..5eea4356d703 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c | |||
@@ -1,8 +1,6 @@ | |||
1 | /*P:400 This contains run_guest() which actually calls into the Host<->Guest | 1 | /*P:400 This contains run_guest() which actually calls into the Host<->Guest |
2 | * Switcher and analyzes the return, such as determining if the Guest wants the | 2 | * Switcher and analyzes the return, such as determining if the Guest wants the |
3 | * Host to do something. This file also contains useful helper routines, and a | 3 | * Host to do something. This file also contains useful helper routines. :*/ |
4 | * couple of non-obvious setup and teardown pieces which were implemented after | ||
5 | * days of debugging pain. :*/ | ||
6 | #include <linux/module.h> | 4 | #include <linux/module.h> |
7 | #include <linux/stringify.h> | 5 | #include <linux/stringify.h> |
8 | #include <linux/stddef.h> | 6 | #include <linux/stddef.h> |
@@ -49,8 +47,8 @@ static __init int map_switcher(void) | |||
49 | * easy. | 47 | * easy. |
50 | */ | 48 | */ |
51 | 49 | ||
52 | /* We allocate an array of "struct page"s. map_vm_area() wants the | 50 | /* We allocate an array of struct page pointers. map_vm_area() wants |
53 | * pages in this form, rather than just an array of pointers. */ | 51 | * this, rather than just an array of pages. */ |
54 | switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, | 52 | switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, |
55 | GFP_KERNEL); | 53 | GFP_KERNEL); |
56 | if (!switcher_page) { | 54 | if (!switcher_page) { |
@@ -172,7 +170,7 @@ void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) | |||
172 | } | 170 | } |
173 | } | 171 | } |
174 | 172 | ||
175 | /* This is the write (copy into guest) version. */ | 173 | /* This is the write (copy into Guest) version. */ |
176 | void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, | 174 | void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, |
177 | unsigned bytes) | 175 | unsigned bytes) |
178 | { | 176 | { |
@@ -209,9 +207,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) | |||
209 | if (cpu->break_out) | 207 | if (cpu->break_out) |
210 | return -EAGAIN; | 208 | return -EAGAIN; |
211 | 209 | ||
212 | /* Check if there are any interrupts which can be delivered | 210 | /* Check if there are any interrupts which can be delivered now: |
213 | * now: if so, this sets up the hander to be executed when we | 211 | * if so, this sets up the hander to be executed when we next |
214 | * next run the Guest. */ | 212 | * run the Guest. */ |
215 | maybe_do_interrupt(cpu); | 213 | maybe_do_interrupt(cpu); |
216 | 214 | ||
217 | /* All long-lived kernel loops need to check with this horrible | 215 | /* All long-lived kernel loops need to check with this horrible |
@@ -246,8 +244,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) | |||
246 | lguest_arch_handle_trap(cpu); | 244 | lguest_arch_handle_trap(cpu); |
247 | } | 245 | } |
248 | 246 | ||
247 | /* Special case: Guest is 'dead' but wants a reboot. */ | ||
249 | if (cpu->lg->dead == ERR_PTR(-ERESTART)) | 248 | if (cpu->lg->dead == ERR_PTR(-ERESTART)) |
250 | return -ERESTART; | 249 | return -ERESTART; |
250 | |||
251 | /* The Guest is dead => "No such file or directory" */ | 251 | /* The Guest is dead => "No such file or directory" */ |
252 | return -ENOENT; | 252 | return -ENOENT; |
253 | } | 253 | } |
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 0f2cb4fd7c69..54d66f05fefa 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c | |||
@@ -29,7 +29,7 @@ | |||
29 | #include "lg.h" | 29 | #include "lg.h" |
30 | 30 | ||
31 | /*H:120 This is the core hypercall routine: where the Guest gets what it wants. | 31 | /*H:120 This is the core hypercall routine: where the Guest gets what it wants. |
32 | * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ | 32 | * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. */ |
33 | static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) | 33 | static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) |
34 | { | 34 | { |
35 | switch (args->arg0) { | 35 | switch (args->arg0) { |
@@ -190,6 +190,13 @@ static void initialize(struct lg_cpu *cpu) | |||
190 | * pagetable. */ | 190 | * pagetable. */ |
191 | guest_pagetable_clear_all(cpu); | 191 | guest_pagetable_clear_all(cpu); |
192 | } | 192 | } |
193 | /*:*/ | ||
194 | |||
195 | /*M:013 If a Guest reads from a page (so creates a mapping) that it has never | ||
196 | * written to, and then the Launcher writes to it (ie. the output of a virtual | ||
197 | * device), the Guest will still see the old page. In practice, this never | ||
198 | * happens: why would the Guest read a page which it has never written to? But | ||
199 | * a similar scenario might one day bite us, so it's worth mentioning. :*/ | ||
193 | 200 | ||
194 | /*H:100 | 201 | /*H:100 |
195 | * Hypercalls | 202 | * Hypercalls |
@@ -227,7 +234,7 @@ void do_hypercalls(struct lg_cpu *cpu) | |||
227 | * However, if we are signalled or the Guest sends I/O to the | 234 | * However, if we are signalled or the Guest sends I/O to the |
228 | * Launcher, the run_guest() loop will exit without running the | 235 | * Launcher, the run_guest() loop will exit without running the |
229 | * Guest. When it comes back it would try to re-run the | 236 | * Guest. When it comes back it would try to re-run the |
230 | * hypercall. */ | 237 | * hypercall. Finding that bug sucked. */ |
231 | cpu->hcall = NULL; | 238 | cpu->hcall = NULL; |
232 | } | 239 | } |
233 | } | 240 | } |
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 32e97c1858e5..0414ddf87587 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c | |||
@@ -144,7 +144,6 @@ void maybe_do_interrupt(struct lg_cpu *cpu) | |||
144 | if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, | 144 | if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, |
145 | sizeof(blk))) | 145 | sizeof(blk))) |
146 | return; | 146 | return; |
147 | |||
148 | bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); | 147 | bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); |
149 | 148 | ||
150 | /* Find the first interrupt. */ | 149 | /* Find the first interrupt. */ |
@@ -237,9 +236,9 @@ void free_interrupts(void) | |||
237 | clear_bit(syscall_vector, used_vectors); | 236 | clear_bit(syscall_vector, used_vectors); |
238 | } | 237 | } |
239 | 238 | ||
240 | /*H:220 Now we've got the routines to deliver interrupts, delivering traps | 239 | /*H:220 Now we've got the routines to deliver interrupts, delivering traps like |
241 | * like page fault is easy. The only trick is that Intel decided that some | 240 | * page fault is easy. The only trick is that Intel decided that some traps |
242 | * traps should have error codes: */ | 241 | * should have error codes: */ |
243 | static int has_err(unsigned int trap) | 242 | static int has_err(unsigned int trap) |
244 | { | 243 | { |
245 | return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); | 244 | return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); |
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 1b2ec0bf5eb1..2bc9bf7e88e5 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c | |||
@@ -1,10 +1,10 @@ | |||
1 | /*P:050 Lguest guests use a very simple method to describe devices. It's a | 1 | /*P:050 Lguest guests use a very simple method to describe devices. It's a |
2 | * series of device descriptors contained just above the top of normal | 2 | * series of device descriptors contained just above the top of normal Guest |
3 | * memory. | 3 | * memory. |
4 | * | 4 | * |
5 | * We use the standard "virtio" device infrastructure, which provides us with a | 5 | * We use the standard "virtio" device infrastructure, which provides us with a |
6 | * console, a network and a block driver. Each one expects some configuration | 6 | * console, a network and a block driver. Each one expects some configuration |
7 | * information and a "virtqueue" mechanism to send and receive data. :*/ | 7 | * information and a "virtqueue" or two to send and receive data. :*/ |
8 | #include <linux/init.h> | 8 | #include <linux/init.h> |
9 | #include <linux/bootmem.h> | 9 | #include <linux/bootmem.h> |
10 | #include <linux/lguest_launcher.h> | 10 | #include <linux/lguest_launcher.h> |
@@ -53,7 +53,7 @@ struct lguest_device { | |||
53 | * Device configurations | 53 | * Device configurations |
54 | * | 54 | * |
55 | * The configuration information for a device consists of one or more | 55 | * The configuration information for a device consists of one or more |
56 | * virtqueues, a feature bitmaks, and some configuration bytes. The | 56 | * virtqueues, a feature bitmap, and some configuration bytes. The |
57 | * configuration bytes don't really matter to us: the Launcher sets them up, and | 57 | * configuration bytes don't really matter to us: the Launcher sets them up, and |
58 | * the driver will look at them during setup. | 58 | * the driver will look at them during setup. |
59 | * | 59 | * |
@@ -179,7 +179,7 @@ struct lguest_vq_info | |||
179 | }; | 179 | }; |
180 | 180 | ||
181 | /* When the virtio_ring code wants to prod the Host, it calls us here and we | 181 | /* When the virtio_ring code wants to prod the Host, it calls us here and we |
182 | * make a hypercall. We hand the page number of the virtqueue so the Host | 182 | * make a hypercall. We hand the physical address of the virtqueue so the Host |
183 | * knows which virtqueue we're talking about. */ | 183 | * knows which virtqueue we're talking about. */ |
184 | static void lg_notify(struct virtqueue *vq) | 184 | static void lg_notify(struct virtqueue *vq) |
185 | { | 185 | { |
@@ -199,7 +199,8 @@ static void lg_notify(struct virtqueue *vq) | |||
199 | * allocate its own pages and tell the Host where they are, but for lguest it's | 199 | * allocate its own pages and tell the Host where they are, but for lguest it's |
200 | * simpler for the Host to simply tell us where the pages are. | 200 | * simpler for the Host to simply tell us where the pages are. |
201 | * | 201 | * |
202 | * So we provide devices with a "find virtqueue and set it up" function. */ | 202 | * So we provide drivers with a "find the Nth virtqueue and set it up" |
203 | * function. */ | ||
203 | static struct virtqueue *lg_find_vq(struct virtio_device *vdev, | 204 | static struct virtqueue *lg_find_vq(struct virtio_device *vdev, |
204 | unsigned index, | 205 | unsigned index, |
205 | void (*callback)(struct virtqueue *vq)) | 206 | void (*callback)(struct virtqueue *vq)) |
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 2221485b0773..564e425d71dd 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c | |||
@@ -73,7 +73,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | |||
73 | if (current != cpu->tsk) | 73 | if (current != cpu->tsk) |
74 | return -EPERM; | 74 | return -EPERM; |
75 | 75 | ||
76 | /* If the guest is already dead, we indicate why */ | 76 | /* If the Guest is already dead, we indicate why */ |
77 | if (lg->dead) { | 77 | if (lg->dead) { |
78 | size_t len; | 78 | size_t len; |
79 | 79 | ||
@@ -88,7 +88,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | |||
88 | return len; | 88 | return len; |
89 | } | 89 | } |
90 | 90 | ||
91 | /* If we returned from read() last time because the Guest notified, | 91 | /* If we returned from read() last time because the Guest sent I/O, |
92 | * clear the flag. */ | 92 | * clear the flag. */ |
93 | if (cpu->pending_notify) | 93 | if (cpu->pending_notify) |
94 | cpu->pending_notify = 0; | 94 | cpu->pending_notify = 0; |
@@ -97,14 +97,20 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | |||
97 | return run_guest(cpu, (unsigned long __user *)user); | 97 | return run_guest(cpu, (unsigned long __user *)user); |
98 | } | 98 | } |
99 | 99 | ||
100 | /*L:025 This actually initializes a CPU. For the moment, a Guest is only | ||
101 | * uniprocessor, so "id" is always 0. */ | ||
100 | static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) | 102 | static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) |
101 | { | 103 | { |
104 | /* We have a limited number the number of CPUs in the lguest struct. */ | ||
102 | if (id >= NR_CPUS) | 105 | if (id >= NR_CPUS) |
103 | return -EINVAL; | 106 | return -EINVAL; |
104 | 107 | ||
108 | /* Set up this CPU's id, and pointer back to the lguest struct. */ | ||
105 | cpu->id = id; | 109 | cpu->id = id; |
106 | cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); | 110 | cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); |
107 | cpu->lg->nr_cpus++; | 111 | cpu->lg->nr_cpus++; |
112 | |||
113 | /* Each CPU has a timer it can set. */ | ||
108 | init_clockdev(cpu); | 114 | init_clockdev(cpu); |
109 | 115 | ||
110 | /* We need a complete page for the Guest registers: they are accessible | 116 | /* We need a complete page for the Guest registers: they are accessible |
@@ -120,11 +126,11 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) | |||
120 | * address. */ | 126 | * address. */ |
121 | lguest_arch_setup_regs(cpu, start_ip); | 127 | lguest_arch_setup_regs(cpu, start_ip); |
122 | 128 | ||
123 | /* Initialize the queue for the waker to wait on */ | 129 | /* Initialize the queue for the Waker to wait on */ |
124 | init_waitqueue_head(&cpu->break_wq); | 130 | init_waitqueue_head(&cpu->break_wq); |
125 | 131 | ||
126 | /* We keep a pointer to the Launcher task (ie. current task) for when | 132 | /* We keep a pointer to the Launcher task (ie. current task) for when |
127 | * other Guests want to wake this one (inter-Guest I/O). */ | 133 | * other Guests want to wake this one (eg. console input). */ |
128 | cpu->tsk = current; | 134 | cpu->tsk = current; |
129 | 135 | ||
130 | /* We need to keep a pointer to the Launcher's memory map, because if | 136 | /* We need to keep a pointer to the Launcher's memory map, because if |
@@ -136,6 +142,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) | |||
136 | * when the same Guest runs on the same CPU twice. */ | 142 | * when the same Guest runs on the same CPU twice. */ |
137 | cpu->last_pages = NULL; | 143 | cpu->last_pages = NULL; |
138 | 144 | ||
145 | /* No error == success. */ | ||
139 | return 0; | 146 | return 0; |
140 | } | 147 | } |
141 | 148 | ||
@@ -185,14 +192,13 @@ static int initialize(struct file *file, const unsigned long __user *input) | |||
185 | lg->mem_base = (void __user *)(long)args[0]; | 192 | lg->mem_base = (void __user *)(long)args[0]; |
186 | lg->pfn_limit = args[1]; | 193 | lg->pfn_limit = args[1]; |
187 | 194 | ||
188 | /* This is the first cpu */ | 195 | /* This is the first cpu (cpu 0) and it will start booting at args[3] */ |
189 | err = lg_cpu_start(&lg->cpus[0], 0, args[3]); | 196 | err = lg_cpu_start(&lg->cpus[0], 0, args[3]); |
190 | if (err) | 197 | if (err) |
191 | goto release_guest; | 198 | goto release_guest; |
192 | 199 | ||
193 | /* Initialize the Guest's shadow page tables, using the toplevel | 200 | /* Initialize the Guest's shadow page tables, using the toplevel |
194 | * address the Launcher gave us. This allocates memory, so can | 201 | * address the Launcher gave us. This allocates memory, so can fail. */ |
195 | * fail. */ | ||
196 | err = init_guest_pagetable(lg, args[2]); | 202 | err = init_guest_pagetable(lg, args[2]); |
197 | if (err) | 203 | if (err) |
198 | goto free_regs; | 204 | goto free_regs; |
@@ -218,11 +224,16 @@ unlock: | |||
218 | /*L:010 The first operation the Launcher does must be a write. All writes | 224 | /*L:010 The first operation the Launcher does must be a write. All writes |
219 | * start with an unsigned long number: for the first write this must be | 225 | * start with an unsigned long number: for the first write this must be |
220 | * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use | 226 | * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use |
221 | * writes of other values to send interrupts. */ | 227 | * writes of other values to send interrupts. |
228 | * | ||
229 | * Note that we overload the "offset" in the /dev/lguest file to indicate what | ||
230 | * CPU number we're dealing with. Currently this is always 0, since we only | ||
231 | * support uniprocessor Guests, but you can see the beginnings of SMP support | ||
232 | * here. */ | ||
222 | static ssize_t write(struct file *file, const char __user *in, | 233 | static ssize_t write(struct file *file, const char __user *in, |
223 | size_t size, loff_t *off) | 234 | size_t size, loff_t *off) |
224 | { | 235 | { |
225 | /* Once the guest is initialized, we hold the "struct lguest" in the | 236 | /* Once the Guest is initialized, we hold the "struct lguest" in the |
226 | * file private data. */ | 237 | * file private data. */ |
227 | struct lguest *lg = file->private_data; | 238 | struct lguest *lg = file->private_data; |
228 | const unsigned long __user *input = (const unsigned long __user *)in; | 239 | const unsigned long __user *input = (const unsigned long __user *)in; |
@@ -230,6 +241,7 @@ static ssize_t write(struct file *file, const char __user *in, | |||
230 | struct lg_cpu *uninitialized_var(cpu); | 241 | struct lg_cpu *uninitialized_var(cpu); |
231 | unsigned int cpu_id = *off; | 242 | unsigned int cpu_id = *off; |
232 | 243 | ||
244 | /* The first value tells us what this request is. */ | ||
233 | if (get_user(req, input) != 0) | 245 | if (get_user(req, input) != 0) |
234 | return -EFAULT; | 246 | return -EFAULT; |
235 | input++; | 247 | input++; |
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a7f64a9d67e0..d93500f24fbb 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
@@ -2,8 +2,8 @@ | |||
2 | * previous encounters. It's functional, and as neat as it can be in the | 2 | * previous encounters. It's functional, and as neat as it can be in the |
3 | * circumstances, but be wary, for these things are subtle and break easily. | 3 | * circumstances, but be wary, for these things are subtle and break easily. |
4 | * The Guest provides a virtual to physical mapping, but we can neither trust | 4 | * The Guest provides a virtual to physical mapping, but we can neither trust |
5 | * it nor use it: we verify and convert it here to point the hardware to the | 5 | * it nor use it: we verify and convert it here then point the CPU to the |
6 | * actual Guest pages when running the Guest. :*/ | 6 | * converted Guest pages when running the Guest. :*/ |
7 | 7 | ||
8 | /* Copyright (C) Rusty Russell IBM Corporation 2006. | 8 | /* Copyright (C) Rusty Russell IBM Corporation 2006. |
9 | * GPL v2 and any later version */ | 9 | * GPL v2 and any later version */ |
@@ -106,6 +106,11 @@ static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) | |||
106 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); | 106 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); |
107 | return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); | 107 | return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); |
108 | } | 108 | } |
109 | /*:*/ | ||
110 | |||
111 | /*M:014 get_pfn is slow; it takes the mmap sem and calls get_user_pages. We | ||
112 | * could probably try to grab batches of pages here as an optimization | ||
113 | * (ie. pre-faulting). :*/ | ||
109 | 114 | ||
110 | /*H:350 This routine takes a page number given by the Guest and converts it to | 115 | /*H:350 This routine takes a page number given by the Guest and converts it to |
111 | * an actual, physical page number. It can fail for several reasons: the | 116 | * an actual, physical page number. It can fail for several reasons: the |
@@ -113,8 +118,8 @@ static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) | |||
113 | * and the page is read-only, or the write flag was set and the page was | 118 | * and the page is read-only, or the write flag was set and the page was |
114 | * shared so had to be copied, but we ran out of memory. | 119 | * shared so had to be copied, but we ran out of memory. |
115 | * | 120 | * |
116 | * This holds a reference to the page, so release_pte() is careful to | 121 | * This holds a reference to the page, so release_pte() is careful to put that |
117 | * put that back. */ | 122 | * back. */ |
118 | static unsigned long get_pfn(unsigned long virtpfn, int write) | 123 | static unsigned long get_pfn(unsigned long virtpfn, int write) |
119 | { | 124 | { |
120 | struct page *page; | 125 | struct page *page; |
@@ -532,13 +537,13 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, | |||
532 | * all processes. So when the page table above that address changes, we update | 537 | * all processes. So when the page table above that address changes, we update |
533 | * all the page tables, not just the current one. This is rare. | 538 | * all the page tables, not just the current one. This is rare. |
534 | * | 539 | * |
535 | * The benefit is that when we have to track a new page table, we can copy keep | 540 | * The benefit is that when we have to track a new page table, we can keep all |
536 | * all the kernel mappings. This speeds up context switch immensely. */ | 541 | * the kernel mappings. This speeds up context switch immensely. */ |
537 | void guest_set_pte(struct lg_cpu *cpu, | 542 | void guest_set_pte(struct lg_cpu *cpu, |
538 | unsigned long gpgdir, unsigned long vaddr, pte_t gpte) | 543 | unsigned long gpgdir, unsigned long vaddr, pte_t gpte) |
539 | { | 544 | { |
540 | /* Kernel mappings must be changed on all top levels. Slow, but | 545 | /* Kernel mappings must be changed on all top levels. Slow, but doesn't |
541 | * doesn't happen often. */ | 546 | * happen often. */ |
542 | if (vaddr >= cpu->lg->kernel_address) { | 547 | if (vaddr >= cpu->lg->kernel_address) { |
543 | unsigned int i; | 548 | unsigned int i; |
544 | for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) | 549 | for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) |
@@ -704,12 +709,11 @@ static __init void populate_switcher_pte_page(unsigned int cpu, | |||
704 | /* We've made it through the page table code. Perhaps our tired brains are | 709 | /* We've made it through the page table code. Perhaps our tired brains are |
705 | * still processing the details, or perhaps we're simply glad it's over. | 710 | * still processing the details, or perhaps we're simply glad it's over. |
706 | * | 711 | * |
707 | * If nothing else, note that all this complexity in juggling shadow page | 712 | * If nothing else, note that all this complexity in juggling shadow page tables |
708 | * tables in sync with the Guest's page tables is for one reason: for most | 713 | * in sync with the Guest's page tables is for one reason: for most Guests this |
709 | * Guests this page table dance determines how bad performance will be. This | 714 | * page table dance determines how bad performance will be. This is why Xen |
710 | * is why Xen uses exotic direct Guest pagetable manipulation, and why both | 715 | * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD |
711 | * Intel and AMD have implemented shadow page table support directly into | 716 | * have implemented shadow page table support directly into hardware. |
712 | * hardware. | ||
713 | * | 717 | * |
714 | * There is just one file remaining in the Host. */ | 718 | * There is just one file remaining in the Host. */ |
715 | 719 | ||
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 635187812d52..5126d5d9ea0e 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
@@ -17,6 +17,13 @@ | |||
17 | * along with this program; if not, write to the Free Software | 17 | * along with this program; if not, write to the Free Software |
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
19 | */ | 19 | */ |
20 | /*P:450 This file contains the x86-specific lguest code. It used to be all | ||
21 | * mixed in with drivers/lguest/core.c but several foolhardy code slashers | ||
22 | * wrestled most of the dependencies out to here in preparation for porting | ||
23 | * lguest to other architectures (see what I mean by foolhardy?). | ||
24 | * | ||
25 | * This also contains a couple of non-obvious setup and teardown pieces which | ||
26 | * were implemented after days of debugging pain. :*/ | ||
20 | #include <linux/kernel.h> | 27 | #include <linux/kernel.h> |
21 | #include <linux/start_kernel.h> | 28 | #include <linux/start_kernel.h> |
22 | #include <linux/string.h> | 29 | #include <linux/string.h> |
@@ -157,6 +164,8 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) | |||
157 | * also simplify copy_in_guest_info(). Note that we'd still need to restore | 164 | * also simplify copy_in_guest_info(). Note that we'd still need to restore |
158 | * things when we exit to Launcher userspace, but that's fairly easy. | 165 | * things when we exit to Launcher userspace, but that's fairly easy. |
159 | * | 166 | * |
167 | * We could also try using this hooks for PGE, but that might be too expensive. | ||
168 | * | ||
160 | * The hooks were designed for KVM, but we can also put them to good use. :*/ | 169 | * The hooks were designed for KVM, but we can also put them to good use. :*/ |
161 | 170 | ||
162 | /*H:040 This is the i386-specific code to setup and run the Guest. Interrupts | 171 | /*H:040 This is the i386-specific code to setup and run the Guest. Interrupts |
@@ -182,7 +191,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) | |||
182 | * was doing. */ | 191 | * was doing. */ |
183 | run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); | 192 | run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); |
184 | 193 | ||
185 | /* Note that the "regs" pointer contains two extra entries which are | 194 | /* Note that the "regs" structure contains two extra entries which are |
186 | * not really registers: a trap number which says what interrupt or | 195 | * not really registers: a trap number which says what interrupt or |
187 | * trap made the switcher code come back, and an error code which some | 196 | * trap made the switcher code come back, and an error code which some |
188 | * traps set. */ | 197 | * traps set. */ |
@@ -293,11 +302,10 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) | |||
293 | break; | 302 | break; |
294 | case 14: /* We've intercepted a Page Fault. */ | 303 | case 14: /* We've intercepted a Page Fault. */ |
295 | /* The Guest accessed a virtual address that wasn't mapped. | 304 | /* The Guest accessed a virtual address that wasn't mapped. |
296 | * This happens a lot: we don't actually set up most of the | 305 | * This happens a lot: we don't actually set up most of the page |
297 | * page tables for the Guest at all when we start: as it runs | 306 | * tables for the Guest at all when we start: as it runs it asks |
298 | * it asks for more and more, and we set them up as | 307 | * for more and more, and we set them up as required. In this |
299 | * required. In this case, we don't even tell the Guest that | 308 | * case, we don't even tell the Guest that the fault happened. |
300 | * the fault happened. | ||
301 | * | 309 | * |
302 | * The errcode tells whether this was a read or a write, and | 310 | * The errcode tells whether this was a read or a write, and |
303 | * whether kernel or userspace code. */ | 311 | * whether kernel or userspace code. */ |
@@ -342,7 +350,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) | |||
342 | if (!deliver_trap(cpu, cpu->regs->trapnum)) | 350 | if (!deliver_trap(cpu, cpu->regs->trapnum)) |
343 | /* If the Guest doesn't have a handler (either it hasn't | 351 | /* If the Guest doesn't have a handler (either it hasn't |
344 | * registered any yet, or it's one of the faults we don't let | 352 | * registered any yet, or it's one of the faults we don't let |
345 | * it handle), it dies with a cryptic error message. */ | 353 | * it handle), it dies with this cryptic error message. */ |
346 | kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", | 354 | kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", |
347 | cpu->regs->trapnum, cpu->regs->eip, | 355 | cpu->regs->trapnum, cpu->regs->eip, |
348 | cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault | 356 | cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault |
@@ -375,8 +383,8 @@ void __init lguest_arch_host_init(void) | |||
375 | * The only exception is the interrupt handlers in switcher.S: their | 383 | * The only exception is the interrupt handlers in switcher.S: their |
376 | * addresses are placed in a table (default_idt_entries), so we need to | 384 | * addresses are placed in a table (default_idt_entries), so we need to |
377 | * update the table with the new addresses. switcher_offset() is a | 385 | * update the table with the new addresses. switcher_offset() is a |
378 | * convenience function which returns the distance between the builtin | 386 | * convenience function which returns the distance between the |
379 | * switcher code and the high-mapped copy we just made. */ | 387 | * compiled-in switcher code and the high-mapped copy we just made. */ |
380 | for (i = 0; i < IDT_ENTRIES; i++) | 388 | for (i = 0; i < IDT_ENTRIES; i++) |
381 | default_idt_entries[i] += switcher_offset(); | 389 | default_idt_entries[i] += switcher_offset(); |
382 | 390 | ||
@@ -416,7 +424,7 @@ void __init lguest_arch_host_init(void) | |||
416 | state->guest_gdt_desc.address = (long)&state->guest_gdt; | 424 | state->guest_gdt_desc.address = (long)&state->guest_gdt; |
417 | 425 | ||
418 | /* We know where we want the stack to be when the Guest enters | 426 | /* We know where we want the stack to be when the Guest enters |
419 | * the switcher: in pages->regs. The stack grows upwards, so | 427 | * the Switcher: in pages->regs. The stack grows upwards, so |
420 | * we start it at the end of that structure. */ | 428 | * we start it at the end of that structure. */ |
421 | state->guest_tss.sp0 = (long)(&pages->regs + 1); | 429 | state->guest_tss.sp0 = (long)(&pages->regs + 1); |
422 | /* And this is the GDT entry to use for the stack: we keep a | 430 | /* And this is the GDT entry to use for the stack: we keep a |
@@ -513,8 +521,8 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) | |||
513 | { | 521 | { |
514 | u32 tsc_speed; | 522 | u32 tsc_speed; |
515 | 523 | ||
516 | /* The pointer to the Guest's "struct lguest_data" is the only | 524 | /* The pointer to the Guest's "struct lguest_data" is the only argument. |
517 | * argument. We check that address now. */ | 525 | * We check that address now. */ |
518 | if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, | 526 | if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, |
519 | sizeof(*cpu->lg->lguest_data))) | 527 | sizeof(*cpu->lg->lguest_data))) |
520 | return -EFAULT; | 528 | return -EFAULT; |
@@ -546,6 +554,7 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) | |||
546 | 554 | ||
547 | return 0; | 555 | return 0; |
548 | } | 556 | } |
557 | /*:*/ | ||
549 | 558 | ||
550 | /*L:030 lguest_arch_setup_regs() | 559 | /*L:030 lguest_arch_setup_regs() |
551 | * | 560 | * |
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 0af8baaa0d4a..3fc15318a80f 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S | |||
@@ -1,6 +1,6 @@ | |||
1 | /*P:900 This is the Switcher: code which sits at 0xFFC00000 to do the low-level | 1 | /*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the |
2 | * Guest<->Host switch. It is as simple as it can be made, but it's naturally | 2 | * Host and Guest to do the low-level Guest<->Host switch. It is as simple as |
3 | * very specific to x86. | 3 | * it can be made, but it's naturally very specific to x86. |
4 | * | 4 | * |
5 | * You have now completed Preparation. If this has whet your appetite; if you | 5 | * You have now completed Preparation. If this has whet your appetite; if you |
6 | * are feeling invigorated and refreshed then the next, more challenging stage | 6 | * are feeling invigorated and refreshed then the next, more challenging stage |
@@ -189,7 +189,7 @@ ENTRY(switch_to_guest) | |||
189 | // Interrupts are turned back on: we are Guest. | 189 | // Interrupts are turned back on: we are Guest. |
190 | iret | 190 | iret |
191 | 191 | ||
192 | // We treat two paths to switch back to the Host | 192 | // We tread two paths to switch back to the Host |
193 | // Yet both must save Guest state and restore Host | 193 | // Yet both must save Guest state and restore Host |
194 | // So we put the routine in a macro. | 194 | // So we put the routine in a macro. |
195 | #define SWITCH_TO_HOST \ | 195 | #define SWITCH_TO_HOST \ |
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 59a8f73dec73..6c8ecde6aad1 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c | |||
@@ -388,6 +388,7 @@ static void __devexit virtio_pci_remove(struct pci_dev *pci_dev) | |||
388 | { | 388 | { |
389 | struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); | 389 | struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); |
390 | 390 | ||
391 | unregister_virtio_device(&vp_dev->vdev); | ||
391 | free_irq(pci_dev->irq, vp_dev); | 392 | free_irq(pci_dev->irq, vp_dev); |
392 | pci_set_drvdata(pci_dev, NULL); | 393 | pci_set_drvdata(pci_dev, NULL); |
393 | pci_iounmap(pci_dev, vp_dev->ioaddr); | 394 | pci_iounmap(pci_dev, vp_dev->ioaddr); |
diff --git a/include/asm-x86/lguest_hcall.h b/include/asm-x86/lguest_hcall.h index 758b9a5d4539..f239e7069cab 100644 --- a/include/asm-x86/lguest_hcall.h +++ b/include/asm-x86/lguest_hcall.h | |||
@@ -27,7 +27,7 @@ | |||
27 | #ifndef __ASSEMBLY__ | 27 | #ifndef __ASSEMBLY__ |
28 | #include <asm/hw_irq.h> | 28 | #include <asm/hw_irq.h> |
29 | 29 | ||
30 | /*G:031 First, how does our Guest contact the Host to ask for privileged | 30 | /*G:031 But first, how does our Guest contact the Host to ask for privileged |
31 | * operations? There are two ways: the direct way is to make a "hypercall", | 31 | * operations? There are two ways: the direct way is to make a "hypercall", |
32 | * to make requests of the Host Itself. | 32 | * to make requests of the Host Itself. |
33 | * | 33 | * |
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index 589be3e1f3ac..e7217dc58f39 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h | |||
@@ -16,6 +16,10 @@ | |||
16 | * a new device, we simply need to write a new virtio driver and create support | 16 | * a new device, we simply need to write a new virtio driver and create support |
17 | * for it in the Launcher: this code won't need to change. | 17 | * for it in the Launcher: this code won't need to change. |
18 | * | 18 | * |
19 | * Virtio devices are also used by kvm, so we can simply reuse their optimized | ||
20 | * device drivers. And one day when everyone uses virtio, my plan will be | ||
21 | * complete. Bwahahahah! | ||
22 | * | ||
19 | * Devices are described by a simplified ID, a status byte, and some "config" | 23 | * Devices are described by a simplified ID, a status byte, and some "config" |
20 | * bytes which describe this device's configuration. This is placed by the | 24 | * bytes which describe this device's configuration. This is placed by the |
21 | * Launcher just above the top of physical memory: | 25 | * Launcher just above the top of physical memory: |
@@ -26,7 +30,7 @@ struct lguest_device_desc { | |||
26 | /* The number of virtqueues (first in config array) */ | 30 | /* The number of virtqueues (first in config array) */ |
27 | __u8 num_vq; | 31 | __u8 num_vq; |
28 | /* The number of bytes of feature bits. Multiply by 2: one for host | 32 | /* The number of bytes of feature bits. Multiply by 2: one for host |
29 | * features and one for guest acknowledgements. */ | 33 | * features and one for Guest acknowledgements. */ |
30 | __u8 feature_len; | 34 | __u8 feature_len; |
31 | /* The number of bytes of the config array after virtqueues. */ | 35 | /* The number of bytes of the config array after virtqueues. */ |
32 | __u8 config_len; | 36 | __u8 config_len; |