-rw-r--r--  Documentation/lguest/lguest.c        | 184
-rw-r--r--  arch/x86/include/asm/lguest_hcall.h  |   8
-rw-r--r--  arch/x86/lguest/boot.c               |  99
-rw-r--r--  arch/x86/lguest/i386_head.S          |   2
-rw-r--r--  drivers/lguest/core.c                |   7
-rw-r--r--  drivers/lguest/hypercalls.c          |   6
-rw-r--r--  drivers/lguest/lguest_device.c       |  11
-rw-r--r--  drivers/lguest/lguest_user.c         | 100
-rw-r--r--  drivers/lguest/page_tables.c         |  84
-rw-r--r--  drivers/lguest/x86/core.c            |   2
-rw-r--r--  drivers/lguest/x86/switcher_32.S     |   6
11 files changed, 398 insertions(+), 111 deletions(-)
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index aa66a52b73e9..45163651b519 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -49,7 +49,7 @@
49#include "linux/virtio_ring.h" 49#include "linux/virtio_ring.h"
50#include "asm/bootparam.h" 50#include "asm/bootparam.h"
51/*L:110 51/*L:110
52 * We can ignore the 39 include files we need for this program, but I do want 52 * We can ignore the 42 include files we need for this program, but I do want
53 * to draw attention to the use of kernel-style types. 53 * to draw attention to the use of kernel-style types.
54 * 54 *
55 * As Linus said, "C is a Spartan language, and so should your naming be." I 55 * As Linus said, "C is a Spartan language, and so should your naming be." I
@@ -305,6 +305,11 @@ static void *map_zeroed_pages(unsigned int num)
305 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); 305 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
306 if (addr == MAP_FAILED) 306 if (addr == MAP_FAILED)
307 err(1, "Mmaping %u pages of /dev/zero", num); 307 err(1, "Mmaping %u pages of /dev/zero", num);
308
309 /*
310 * One neat mmap feature is that you can close the fd, and it
311 * stays mapped.
312 */
308 close(fd); 313 close(fd);
309 314
310 return addr; 315 return addr;
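
One neat mmap feature, indeed: a private mapping of /dev/zero survives the close(). A minimal standalone sketch of the same map_zeroed_pages() pattern (not part of the patch, just an illustration):

    #include <err.h>
    #include <fcntl.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static void *map_zeroed_pages(unsigned int num)
    {
        int fd = open("/dev/zero", O_RDONLY);
        void *addr;

        if (fd < 0)
            err(1, "Opening /dev/zero");
        addr = mmap(NULL, num * getpagesize(),
                    PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
        if (addr == MAP_FAILED)
            err(1, "Mmaping %u pages of /dev/zero", num);
        /* One neat mmap feature: close the fd, the mapping stays. */
        close(fd);
        return addr;
    }

    int main(void)
    {
        char *p = map_zeroed_pages(1);
        strcpy(p, "still mapped after close()");
        return !p[0];
    }
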
@@ -557,7 +562,7 @@ static void tell_kernel(unsigned long start)
557} 562}
558/*:*/ 563/*:*/
559 564
560/* 565/*L:200
561 * Device Handling. 566 * Device Handling.
562 * 567 *
563 * When the Guest gives us a buffer, it sends an array of addresses and sizes. 568 * When the Guest gives us a buffer, it sends an array of addresses and sizes.
@@ -608,7 +613,10 @@ static unsigned next_desc(struct vring_desc *desc,
608 return next; 613 return next;
609} 614}
610 615
611/* This actually sends the interrupt for this virtqueue */ 616/*
617 * This actually sends the interrupt for this virtqueue, if we've used a
618 * buffer.
619 */
612static void trigger_irq(struct virtqueue *vq) 620static void trigger_irq(struct virtqueue *vq)
613{ 621{
614 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; 622 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
@@ -629,12 +637,12 @@ static void trigger_irq(struct virtqueue *vq)
629} 637}
630 638
631/* 639/*
632 * This looks in the virtqueue and for the first available buffer, and converts 640 * This looks in the virtqueue for the first available buffer, and converts
633 * it to an iovec for convenient access. Since descriptors consist of some 641 * it to an iovec for convenient access. Since descriptors consist of some
634 * number of output then some number of input descriptors, it's actually two 642 * number of output then some number of input descriptors, it's actually two
635 * iovecs, but we pack them into one and note how many of each there were. 643 * iovecs, but we pack them into one and note how many of each there were.
636 * 644 *
637 * This function returns the descriptor number found. 645 * This function waits if necessary, and returns the descriptor number found.
638 */ 646 */
639static unsigned wait_for_vq_desc(struct virtqueue *vq, 647static unsigned wait_for_vq_desc(struct virtqueue *vq,
640 struct iovec iov[], 648 struct iovec iov[],
@@ -644,10 +652,14 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
644 struct vring_desc *desc; 652 struct vring_desc *desc;
645 u16 last_avail = lg_last_avail(vq); 653 u16 last_avail = lg_last_avail(vq);
646 654
655 /* There's nothing available? */
647 while (last_avail == vq->vring.avail->idx) { 656 while (last_avail == vq->vring.avail->idx) {
648 u64 event; 657 u64 event;
649 658
650 /* OK, tell Guest about progress up to now. */ 659 /*
660 * Since we're about to sleep, now is a good time to tell the
661 * Guest about what we've used up to now.
662 */
651 trigger_irq(vq); 663 trigger_irq(vq);
652 664
653 /* OK, now we need to know about added descriptors. */ 665 /* OK, now we need to know about added descriptors. */
@@ -734,8 +746,9 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
734} 746}
735 747
736/* 748/*
737 * After we've used one of their buffers, we tell them about it. We'll then 749 * After we've used one of their buffers, we tell the Guest about it. Sometime
738 * want to send them an interrupt, using trigger_irq(). 750 * later we'll want to send them an interrupt using trigger_irq(); note that
751 * wait_for_vq_desc() does that for us if it has to wait.
739 */ 752 */
740static void add_used(struct virtqueue *vq, unsigned int head, int len) 753static void add_used(struct virtqueue *vq, unsigned int head, int len)
741{ 754{
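
The body of add_used() is elided by the diff. A sketch of the bookkeeping it has to do, going by the ring layout in linux/virtio_ring.h (the field names come from that header; the function name and the pending counter are illustrative):

    #include "linux/virtio_ring.h"

    static void add_used_sketch(struct vring *vring, unsigned int head,
                                unsigned int len, unsigned int *pending_used)
    {
        struct vring_used_elem *used;

        /* The used ring is as big as the descriptor ring, so no overflow. */
        used = &vring->used->ring[vring->used->idx % vring->num];
        used->id = head; /* Which descriptor chain we finished with. */
        used->len = len; /* How much of their buffer we wrote. */

        /* The entry must be visible before the index moves past it. */
        __sync_synchronize();
        vring->used->idx++;

        /* Remember that the Guest is owed an interrupt for this. */
        (*pending_used)++;
    }
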
@@ -782,12 +795,12 @@ static void console_input(struct virtqueue *vq)
782 struct console_abort *abort = vq->dev->priv; 795 struct console_abort *abort = vq->dev->priv;
783 struct iovec iov[vq->vring.num]; 796 struct iovec iov[vq->vring.num];
784 797
785 /* Make sure there's a descriptor waiting. */ 798 /* Make sure there's a descriptor available. */
786 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 799 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
787 if (out_num) 800 if (out_num)
788 errx(1, "Output buffers in console in queue?"); 801 errx(1, "Output buffers in console in queue?");
789 802
790 /* Read it in. */ 803 /* Read into it. This is where we usually wait. */
791 len = readv(STDIN_FILENO, iov, in_num); 804 len = readv(STDIN_FILENO, iov, in_num);
792 if (len <= 0) { 805 if (len <= 0) {
793 /* Ran out of input? */ 806 /* Ran out of input? */
@@ -800,6 +813,7 @@ static void console_input(struct virtqueue *vq)
800 pause(); 813 pause();
801 } 814 }
802 815
816 /* Tell the Guest we used a buffer. */
803 add_used_and_trigger(vq, head, len); 817 add_used_and_trigger(vq, head, len);
804 818
805 /* 819 /*
@@ -834,15 +848,23 @@ static void console_output(struct virtqueue *vq)
834 unsigned int head, out, in; 848 unsigned int head, out, in;
835 struct iovec iov[vq->vring.num]; 849 struct iovec iov[vq->vring.num];
836 850
851 /* We usually wait in here, for the Guest to give us something. */
837 head = wait_for_vq_desc(vq, iov, &out, &in); 852 head = wait_for_vq_desc(vq, iov, &out, &in);
838 if (in) 853 if (in)
839 errx(1, "Input buffers in console output queue?"); 854 errx(1, "Input buffers in console output queue?");
855
856 /* writev can return a partial write, so we loop here. */
840 while (!iov_empty(iov, out)) { 857 while (!iov_empty(iov, out)) {
841 int len = writev(STDOUT_FILENO, iov, out); 858 int len = writev(STDOUT_FILENO, iov, out);
842 if (len <= 0) 859 if (len <= 0)
843 err(1, "Write to stdout gave %i", len); 860 err(1, "Write to stdout gave %i", len);
844 iov_consume(iov, out, len); 861 iov_consume(iov, out, len);
845 } 862 }
863
864 /*
865 * We're finished with that buffer: if we're going to sleep,
866 * wait_for_vq_desc() will prod the Guest with an interrupt.
867 */
846 add_used(vq, head, 0); 868 add_used(vq, head, 0);
847} 869}
848 870
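
iov_consume() and iov_empty() don't appear in these hunks; a plausible pair of implementations (an assumption, matching how the loop above uses them) advances the iovec past whatever a partial writev() managed to write:

    #include <stdbool.h>
    #include <sys/uio.h>

    /* Skip 'len' already-consumed bytes of the iovec. */
    static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len)
    {
        unsigned int i;

        for (i = 0; i < num_iov && len; i++) {
            unsigned int used = len < iov[i].iov_len ? len : iov[i].iov_len;

            iov[i].iov_base = (char *)iov[i].iov_base + used;
            iov[i].iov_len -= used;
            len -= used;
        }
    }

    /* True once every element has been consumed. */
    static bool iov_empty(const struct iovec iov[], unsigned num_iov)
    {
        unsigned int i;

        for (i = 0; i < num_iov; i++)
            if (iov[i].iov_len)
                return false;
        return true;
    }
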
@@ -862,15 +884,30 @@ static void net_output(struct virtqueue *vq)
862 unsigned int head, out, in; 884 unsigned int head, out, in;
863 struct iovec iov[vq->vring.num]; 885 struct iovec iov[vq->vring.num];
864 886
887 /* We usually wait in here for the Guest to give us a packet. */
865 head = wait_for_vq_desc(vq, iov, &out, &in); 888 head = wait_for_vq_desc(vq, iov, &out, &in);
866 if (in) 889 if (in)
867 errx(1, "Input buffers in net output queue?"); 890 errx(1, "Input buffers in net output queue?");
891 /*
892 * Send the whole thing through to /dev/net/tun. It expects the exact
893 * same format: what a coincidence!
894 */
868 if (writev(net_info->tunfd, iov, out) < 0) 895 if (writev(net_info->tunfd, iov, out) < 0)
869 errx(1, "Write to tun failed?"); 896 errx(1, "Write to tun failed?");
897
898 /*
899 * Done with that one; wait_for_vq_desc() will send the interrupt if
900 * all packets are processed.
901 */
870 add_used(vq, head, 0); 902 add_used(vq, head, 0);
871} 903}
872 904
873/* Will reading from this file descriptor block? */ 905/*
906 * Handling network input is a bit trickier, because I've tried to optimize it.
907 *
 908 * First we have a helper routine which tells us if reading from this file
 909 * descriptor (ie. the /dev/net/tun device) will block:
910 */
874static bool will_block(int fd) 911static bool will_block(int fd)
875{ 912{
876 fd_set fdset; 913 fd_set fdset;
@@ -880,7 +917,11 @@ static bool will_block(int fd)
880 return select(fd+1, &fdset, NULL, NULL, &zero) != 1; 917 return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
881} 918}
882 919
883/* This handles packets coming in from the tun device to our Guest. */ 920/*
921 * This handles packets coming in from the tun device to our Guest. Like all
922 * service routines, it gets called again as soon as it returns, so you don't
923 * see a while(1) loop here.
924 */
884static void net_input(struct virtqueue *vq) 925static void net_input(struct virtqueue *vq)
885{ 926{
886 int len; 927 int len;
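
The middle of will_block() is elided above; filled out with the usual select() setup (an assumption consistent with the return statement shown), the whole helper is:

    #include <stdbool.h>
    #include <sys/select.h>

    /* Will reading from this file descriptor block? */
    static bool will_block(int fd)
    {
        fd_set fdset;
        struct timeval zero = { 0, 0 };

        FD_ZERO(&fdset);
        FD_SET(fd, &fdset);
        /* A zero timeout makes select() a pure poll: no waiting. */
        return select(fd + 1, &fdset, NULL, NULL, &zero) != 1;
    }
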
@@ -888,21 +929,38 @@ static void net_input(struct virtqueue *vq)
888 struct iovec iov[vq->vring.num]; 929 struct iovec iov[vq->vring.num];
889 struct net_info *net_info = vq->dev->priv; 930 struct net_info *net_info = vq->dev->priv;
890 931
932 /*
933 * Get a descriptor to write an incoming packet into. This will also
934 * send an interrupt if they're out of descriptors.
935 */
891 head = wait_for_vq_desc(vq, iov, &out, &in); 936 head = wait_for_vq_desc(vq, iov, &out, &in);
892 if (out) 937 if (out)
893 errx(1, "Output buffers in net input queue?"); 938 errx(1, "Output buffers in net input queue?");
894 939
895 /* Deliver interrupt now, since we're about to sleep. */ 940 /*
941 * If it looks like we'll block reading from the tun device, send them
942 * an interrupt.
943 */
896 if (vq->pending_used && will_block(net_info->tunfd)) 944 if (vq->pending_used && will_block(net_info->tunfd))
897 trigger_irq(vq); 945 trigger_irq(vq);
898 946
947 /*
948 * Read in the packet. This is where we normally wait (when there's no
949 * incoming network traffic).
950 */
899 len = readv(net_info->tunfd, iov, in); 951 len = readv(net_info->tunfd, iov, in);
900 if (len <= 0) 952 if (len <= 0)
901 err(1, "Failed to read from tun."); 953 err(1, "Failed to read from tun.");
954
955 /*
956 * Mark that packet buffer as used, but don't interrupt here. We want
957 * to wait until we've done as much work as we can.
958 */
902 add_used(vq, head, len); 959 add_used(vq, head, len);
903} 960}
961/*:*/
904 962
905/* This is the helper to create threads. */ 963/* This is the helper to create threads: run the service routine in a loop. */
906static int do_thread(void *_vq) 964static int do_thread(void *_vq)
907{ 965{
908 struct virtqueue *vq = _vq; 966 struct virtqueue *vq = _vq;
@@ -950,11 +1008,14 @@ static void reset_device(struct device *dev)
950 signal(SIGCHLD, (void *)kill_launcher); 1008 signal(SIGCHLD, (void *)kill_launcher);
951} 1009}
952 1010
1011/*L:216
1012 * This actually creates the thread which services the virtqueue for a device.
1013 */
953static void create_thread(struct virtqueue *vq) 1014static void create_thread(struct virtqueue *vq)
954{ 1015{
955 /* 1016 /*
 956 * Create stack for thread and run it. Since the stack grows upwards, 1017 * Create stack for thread. Since the stack grows downwards on x86, we
 957 * we point the stack pointer to the end of this region. 1018 * point the stack pointer to the end of this region.
958 */ 1019 */
959 char *stack = malloc(32768); 1020 char *stack = malloc(32768);
960 unsigned long args[] = { LHREQ_EVENTFD, 1021 unsigned long args[] = { LHREQ_EVENTFD,
@@ -966,17 +1027,22 @@ static void create_thread(struct virtqueue *vq)
966 err(1, "Creating eventfd"); 1027 err(1, "Creating eventfd");
967 args[2] = vq->eventfd; 1028 args[2] = vq->eventfd;
968 1029
969 /* Attach an eventfd to this virtqueue: it will go off 1030 /*
970 * when the Guest does an LHCALL_NOTIFY for this vq. */ 1031 * Attach an eventfd to this virtqueue: it will go off when the Guest
1032 * does an LHCALL_NOTIFY for this vq.
1033 */
971 if (write(lguest_fd, &args, sizeof(args)) != 0) 1034 if (write(lguest_fd, &args, sizeof(args)) != 0)
972 err(1, "Attaching eventfd"); 1035 err(1, "Attaching eventfd");
973 1036
974 /* CLONE_VM: because it has to access the Guest memory, and 1037 /*
975 * SIGCHLD so we get a signal if it dies. */ 1038 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
1039 * we get a signal if it dies.
1040 */
976 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); 1041 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
977 if (vq->thread == (pid_t)-1) 1042 if (vq->thread == (pid_t)-1)
978 err(1, "Creating clone"); 1043 err(1, "Creating clone");
979 /* We close our local copy, now the child has it. */ 1044
1045 /* We close our local copy now the child has it. */
980 close(vq->eventfd); 1046 close(vq->eventfd);
981} 1047}
982 1048
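
Outside lguest, the same clone() pattern is only a few lines; a minimal self-contained sketch (stack from malloc, stack pointer at the top, since x86 stacks grow downwards):

    #define _GNU_SOURCE
    #include <err.h>
    #include <sched.h>
    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/wait.h>

    static int do_thread(void *arg)
    {
        printf("service thread running: %s\n", (char *)arg);
        return 0;
    }

    int main(void)
    {
        char *stack = malloc(32768);
        pid_t pid;

        if (!stack)
            err(1, "allocating stack");
        /*
         * CLONE_VM shares our memory (the Guest's memory, for lguest);
         * SIGCHLD means we get a signal when the child dies.
         */
        pid = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, "hello");
        if (pid == (pid_t)-1)
            err(1, "Creating clone");
        waitpid(pid, NULL, 0);
        free(stack);
        return 0;
    }
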
@@ -1028,7 +1094,10 @@ static void update_device_status(struct device *dev)
1028 } 1094 }
1029} 1095}
1030 1096
1031/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ 1097/*L:215
1098 * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In
1099 * particular, it's used to notify us of device status changes during boot.
1100 */
1032static void handle_output(unsigned long addr) 1101static void handle_output(unsigned long addr)
1033{ 1102{
1034 struct device *i; 1103 struct device *i;
@@ -1037,18 +1106,32 @@ static void handle_output(unsigned long addr)
1037 for (i = devices.dev; i; i = i->next) { 1106 for (i = devices.dev; i; i = i->next) {
1038 struct virtqueue *vq; 1107 struct virtqueue *vq;
1039 1108
1040 /* Notifications to device descriptors update device status. */ 1109 /*
1110 * Notifications to device descriptors mean they updated the
1111 * device status.
1112 */
1041 if (from_guest_phys(addr) == i->desc) { 1113 if (from_guest_phys(addr) == i->desc) {
1042 update_device_status(i); 1114 update_device_status(i);
1043 return; 1115 return;
1044 } 1116 }
1045 1117
1046 /* Devices *can* be used before status is set to DRIVER_OK. */ 1118 /*
1119 * Devices *can* be used before status is set to DRIVER_OK.
1120 * The original plan was that they would never do this: they
1121 * would always finish setting up their status bits before
1122 * actually touching the virtqueues. In practice, we allowed
1123 * them to, and they do (eg. the disk probes for partition
1124 * tables as part of initialization).
1125 *
1126 * If we see this, we start the device: once it's running, we
1127 * expect the device to catch all the notifications.
1128 */
1047 for (vq = i->vq; vq; vq = vq->next) { 1129 for (vq = i->vq; vq; vq = vq->next) {
1048 if (addr != vq->config.pfn*getpagesize()) 1130 if (addr != vq->config.pfn*getpagesize())
1049 continue; 1131 continue;
1050 if (i->running) 1132 if (i->running)
1051 errx(1, "Notification on running %s", i->name); 1133 errx(1, "Notification on running %s", i->name);
1134 /* This just calls create_thread() for each virtqueue */
1052 start_device(i); 1135 start_device(i);
1053 return; 1136 return;
1054 } 1137 }
@@ -1132,6 +1215,11 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1132 vq->next = NULL; 1215 vq->next = NULL;
1133 vq->last_avail_idx = 0; 1216 vq->last_avail_idx = 0;
1134 vq->dev = dev; 1217 vq->dev = dev;
1218
1219 /*
1220 * This is the routine the service thread will run, and its Process ID
1221 * once it's running.
1222 */
1135 vq->service = service; 1223 vq->service = service;
1136 vq->thread = (pid_t)-1; 1224 vq->thread = (pid_t)-1;
1137 1225
@@ -1202,7 +1290,8 @@ static void set_config(struct device *dev, unsigned len, const void *conf)
1202 1290
1203/* 1291/*
1204 * This routine does all the creation and setup of a new device, including 1292 * This routine does all the creation and setup of a new device, including
1205 * calling new_dev_desc() to allocate the descriptor and device memory. 1293 * calling new_dev_desc() to allocate the descriptor and device memory. We
1294 * don't actually start the service threads until later.
1206 * 1295 *
1207 * See what I mean about userspace being boring? 1296 * See what I mean about userspace being boring?
1208 */ 1297 */
@@ -1478,19 +1567,7 @@ static void setup_tun_net(char *arg)
1478 verbose("device %u: tun %s: %s\n", 1567 verbose("device %u: tun %s: %s\n",
1479 devices.device_num, tapif, arg); 1568 devices.device_num, tapif, arg);
1480} 1569}
1481 1570/*:*/
1482/*
1483 * Our block (disk) device should be really simple: the Guest asks for a block
1484 * number and we read or write that position in the file. Unfortunately, that
1485 * was amazingly slow: the Guest waits until the read is finished before
1486 * running anything else, even if it could have been doing useful work.
1487 *
1488 * We could use async I/O, except it's reputed to suck so hard that characters
1489 * actually go missing from your code when you try to use it.
1490 *
1491 * So this was one reason why lguest now does all virtqueue servicing in
1492 * separate threads: it's more efficient and more like a real device.
1493 */
1494 1571
1495/* This hangs off device->priv. */ 1572/* This hangs off device->priv. */
1496struct vblk_info 1573struct vblk_info
@@ -1512,8 +1589,16 @@ struct vblk_info
1512/*L:210 1589/*L:210
1513 * The Disk 1590 * The Disk
1514 * 1591 *
1515 * Remember that the block device is handled by a separate I/O thread. We head 1592 * The disk only has one virtqueue, so it only has one thread. It is really
1516 * straight into the core of that thread here: 1593 * simple: the Guest asks for a block number and we read or write that position
1594 * in the file.
1595 *
 1596 * Before we serviced each virtqueue in a separate thread, this was unacceptably
 1597 * slow: the Guest waited until the read was finished before running anything
 1598 * else, even if it could have been doing useful work.
1599 *
1600 * We could have used async I/O, except it's reputed to suck so hard that
1601 * characters actually go missing from your code when you try to use it.
1517 */ 1602 */
1518static void blk_request(struct virtqueue *vq) 1603static void blk_request(struct virtqueue *vq)
1519{ 1604{
@@ -1525,7 +1610,10 @@ static void blk_request(struct virtqueue *vq)
1525 struct iovec iov[vq->vring.num]; 1610 struct iovec iov[vq->vring.num];
1526 off64_t off; 1611 off64_t off;
1527 1612
1528 /* Get the next request. */ 1613 /*
1614 * Get the next request, where we normally wait. It triggers the
1615 * interrupt to acknowledge previously serviced requests (if any).
1616 */
1529 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 1617 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1530 1618
1531 /* 1619 /*
@@ -1539,6 +1627,10 @@ static void blk_request(struct virtqueue *vq)
1539 1627
1540 out = convert(&iov[0], struct virtio_blk_outhdr); 1628 out = convert(&iov[0], struct virtio_blk_outhdr);
1541 in = convert(&iov[out_num+in_num-1], u8); 1629 in = convert(&iov[out_num+in_num-1], u8);
1630 /*
1631 * For historical reasons, block operations are expressed in 512 byte
1632 * "sectors".
1633 */
1542 off = out->sector * 512; 1634 off = out->sector * 512;
1543 1635
1544 /* 1636 /*
@@ -1614,6 +1706,7 @@ static void blk_request(struct virtqueue *vq)
1614 if (out->type & VIRTIO_BLK_T_BARRIER) 1706 if (out->type & VIRTIO_BLK_T_BARRIER)
1615 fdatasync(vblk->fd); 1707 fdatasync(vblk->fd);
1616 1708
1709 /* Finished that request. */
1617 add_used(vq, head, wlen); 1710 add_used(vq, head, wlen);
1618} 1711}
1619 1712
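
The sector arithmetic is the whole address calculation. A sketch of the resulting data path, collapsing the iovec handling to a single pread64()/pwrite64() for clarity (the real code works through the iovecs; the names here are illustrative):

    #define _GNU_SOURCE
    #include <stdbool.h>
    #include <stdint.h>
    #include <unistd.h>

    static ssize_t service_block_io(int fd, uint64_t sector,
                                    void *buf, size_t len, bool is_write)
    {
        /* For historical reasons, requests come in 512-byte "sectors". */
        off64_t off = sector * 512;

        return is_write ? pwrite64(fd, buf, len, off)
                        : pread64(fd, buf, len, off);
    }
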
@@ -1682,9 +1775,8 @@ static void rng_input(struct virtqueue *vq)
1682 errx(1, "Output buffers in rng?"); 1775 errx(1, "Output buffers in rng?");
1683 1776
1684 /* 1777 /*
1685 * This is why we convert to iovecs: the readv() call uses them, and so 1778 * Just like the console write, we loop to cover the whole iovec.
1686 * it reads straight into the Guest's buffer. We loop to make sure we 1779 * In this case, short reads actually happen quite a bit.
1687 * fill it.
1688 */ 1780 */
1689 while (!iov_empty(iov, in_num)) { 1781 while (!iov_empty(iov, in_num)) {
1690 len = readv(rng_info->rfd, iov, in_num); 1782 len = readv(rng_info->rfd, iov, in_num);
@@ -1818,7 +1910,9 @@ int main(int argc, char *argv[])
1818 devices.lastdev = NULL; 1910 devices.lastdev = NULL;
1819 devices.next_irq = 1; 1911 devices.next_irq = 1;
1820 1912
1913 /* We're CPU 0. In fact, that's the only CPU possible right now. */
1821 cpu_id = 0; 1914 cpu_id = 0;
1915
1822 /* 1916 /*
1823 * We need to know how much memory so we can set up the device 1917 * We need to know how much memory so we can set up the device
1824 * descriptor and memory pages for the devices as we parse the command 1918 * descriptor and memory pages for the devices as we parse the command
@@ -1926,7 +2020,7 @@ int main(int argc, char *argv[])
1926 */ 2020 */
1927 tell_kernel(start); 2021 tell_kernel(start);
1928 2022
1929 /* Ensure that we terminate if a child dies. */ 2023 /* Ensure that we terminate if a device-servicing child dies. */
1930 signal(SIGCHLD, kill_launcher); 2024 signal(SIGCHLD, kill_launcher);
1931 2025
1932 /* If we exit via err(), this kills all the threads, restores tty. */ 2026 /* If we exit via err(), this kills all the threads, restores tty. */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index cceb73e12e50..ba0eed8aa1a6 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -35,10 +35,10 @@
35 * operations? There are two ways: the direct way is to make a "hypercall", 35 * operations? There are two ways: the direct way is to make a "hypercall",
36 * to make requests of the Host Itself. 36 * to make requests of the Host Itself.
37 * 37 *
38 * We use the KVM hypercall mechanism. Seventeen hypercalls are 38 * We use the KVM hypercall mechanism, though completely different hypercall
39 * available: the hypercall number is put in the %eax register, and the 39 * numbers. Seventeen hypercalls are available: the hypercall number is put in
40 * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. 40 * the %eax register, and the arguments (when required) are placed in %ebx,
41 * If a return value makes sense, it's returned in %eax. 41 * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax.
42 * 42 *
43 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 43 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
44 * Host, rather than returning failure. This reflects Winston Churchill's 44 * Host, rather than returning failure. This reflects Winston Churchill's
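
In code, that register convention is a few lines of inline assembly; a sketch of a one-argument hypercall in the style of kvm_hypercall1() (the .byte sequence is the KVM_HYPERCALL encoding quoted in i386_head.S below):

    static inline unsigned long hcall1(unsigned long call, unsigned long arg1)
    {
        unsigned long ret;

        /* Number in %eax, first argument in %ebx, result back in %eax. */
        asm volatile(".byte 0x0f,0x01,0xc1" /* KVM_HYPERCALL */
                     : "=a"(ret)
                     : "a"(call), "b"(arg1)
                     : "memory");
        return ret;
    }
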
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 025c04d18f2b..d677fa9ca650 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -154,6 +154,7 @@ static void lazy_hcall1(unsigned long call,
154 async_hcall(call, arg1, 0, 0, 0); 154 async_hcall(call, arg1, 0, 0, 0);
155} 155}
156 156
157/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
157static void lazy_hcall2(unsigned long call, 158static void lazy_hcall2(unsigned long call,
158 unsigned long arg1, 159 unsigned long arg1,
159 unsigned long arg2) 160 unsigned long arg2)
@@ -189,8 +190,10 @@ static void lazy_hcall4(unsigned long call,
189} 190}
190#endif 191#endif
191 192
192/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 193/*G:036
 193 * issue the do-nothing hypercall to flush any stored calls. */ 194 * When lazy mode is turned off, reset the per-cpu lazy mode variable and then
195 * issue the do-nothing hypercall to flush any stored calls.
196:*/
194static void lguest_leave_lazy_mmu_mode(void) 197static void lguest_leave_lazy_mmu_mode(void)
195{ 198{
196 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 199 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
@@ -250,13 +253,11 @@ extern void lg_irq_enable(void);
250extern void lg_restore_fl(unsigned long flags); 253extern void lg_restore_fl(unsigned long flags);
251 254
252/*M:003 255/*M:003
253 * Note that we don't check for outstanding interrupts when we re-enable them 256 * We could be more efficient in our checking of outstanding interrupts, rather
254 * (or when we unmask an interrupt). This seems to work for the moment, since 257 * than using a branch. One way would be to put the "irq_enabled" field in a
255 * interrupts are rare and we'll just get the interrupt on the next timer tick, 258 * page by itself, and have the Host write-protect it when an interrupt comes
256 * but now we can run with CONFIG_NO_HZ, we should revisit this. One way would 259 * in when irqs are disabled. There will then be a page fault as soon as
257 * be to put the "irq_enabled" field in a page by itself, and have the Host 260 * interrupts are re-enabled.
258 * write-protect it when an interrupt comes in when irqs are disabled. There
259 * will then be a page fault as soon as interrupts are re-enabled.
260 * 261 *
261 * A better method is to implement soft interrupt disable generally for x86: 262 * A better method is to implement soft interrupt disable generally for x86:
262 * instead of disabling interrupts, we set a flag. If an interrupt does come 263 * instead of disabling interrupts, we set a flag. If an interrupt does come
@@ -568,7 +569,7 @@ static void lguest_write_cr4(unsigned long val)
568 * cr3 ---> +---------+ 569 * cr3 ---> +---------+
569 * | --------->+---------+ 570 * | --------->+---------+
570 * | | | PADDR1 | 571 * | | | PADDR1 |
571 * Top-level | | PADDR2 | 572 * Mid-level | | PADDR2 |
572 * (PMD) page | | | 573 * (PMD) page | | |
573 * | | Lower-level | 574 * | | Lower-level |
574 * | | (PTE) page | 575 * | | (PTE) page |
@@ -588,23 +589,62 @@ static void lguest_write_cr4(unsigned long val)
588 * Index into top Index into second Offset within page 589 * Index into top Index into second Offset within page
589 * page directory page pagetable page 590 * page directory page pagetable page
590 * 591 *
591 * The kernel spends a lot of time changing both the top-level page directory 592 * Now, unfortunately, this isn't the whole story: Intel added Physical Address
592 * and lower-level pagetable pages. The Guest doesn't know physical addresses, 593 * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
593 * so while it maintains these page tables exactly like normal, it also needs 594 * These are held in 64-bit page table entries, so we can now only fit 512
594 * to keep the Host informed whenever it makes a change: the Host will create 595 * entries in a page, and the neat three-level tree breaks down.
595 * the real page tables based on the Guests'. 596 *
597 * The result is a four level page table:
598 *
599 * cr3 --> [ 4 Upper ]
600 * [ Level ]
601 * [ Entries ]
602 * [(PUD Page)]---> +---------+
603 * | --------->+---------+
604 * | | | PADDR1 |
605 * Mid-level | | PADDR2 |
606 * (PMD) page | | |
607 * | | Lower-level |
608 * | | (PTE) page |
609 * | | | |
610 * .... ....
611 *
612 *
613 * And the virtual address is decoded as:
614 *
615 * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
616 * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
617 * Index into Index into mid Index into lower Offset within page
618 * top entries directory page pagetable page
619 *
620 * It's too hard to switch between these two formats at runtime, so Linux only
621 * supports one or the other depending on whether CONFIG_X86_PAE is set. Many
622 * distributions turn it on, and not just for people with silly amounts of
623 * memory: the larger PTE entries allow room for the NX bit, which lets the
624 * kernel disable execution of pages and increase security.
625 *
626 * This was a problem for lguest, which couldn't run on these distributions;
627 * then Matias Zabaljauregui figured it all out and implemented it, and only a
628 * handful of puppies were crushed in the process!
629 *
630 * Back to our point: the kernel spends a lot of time changing both the
631 * top-level page directory and lower-level pagetable pages. The Guest doesn't
632 * know physical addresses, so while it maintains these page tables exactly
633 * like normal, it also needs to keep the Host informed whenever it makes a
634 * change: the Host will create the real page tables based on the Guests'.
596 */ 635 */
597 636
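
The decoding is just shifts and masks; a standalone sketch following the PAE diagram above:

    #include <stdint.h>
    #include <stdio.h>

    /* Split a 32-bit virtual address per the PAE diagram above. */
    int main(void)
    {
        uint32_t vaddr = 0xC0100000; /* An example kernel-ish address. */

        unsigned pud = vaddr >> 30;           /* top 2 bits */
        unsigned pmd = (vaddr >> 21) & 0x1FF; /* next 9 bits */
        unsigned pte = (vaddr >> 12) & 0x1FF; /* next 9 bits */
        unsigned off = vaddr & 0xFFF;         /* final 12 bits */

        printf("PUD %u, PMD %u, PTE %u, offset 0x%x\n", pud, pmd, pte, off);
        return 0;
    }
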
598/* 637/*
599 * The Guest calls this to set a second-level entry (pte), ie. to map a page 638 * The Guest calls this after it has set a second-level entry (pte), ie. to map
 600 * into a process' address space. We set the entry then tell the Host the 639 * a page into a process' address space. We tell the Host the toplevel and
601 * toplevel and address this corresponds to. The Guest uses one pagetable per 640 * address this corresponds to. The Guest uses one pagetable per process, so
602 * process, so we need to tell the Host which one we're changing (mm->pgd). 641 * we need to tell the Host which one we're changing (mm->pgd).
603 */ 642 */
604static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 643static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
605 pte_t *ptep) 644 pte_t *ptep)
606{ 645{
607#ifdef CONFIG_X86_PAE 646#ifdef CONFIG_X86_PAE
 647 /* PAE needs to hand over a 64-bit page table entry, so it uses two args. */
608 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, 648 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
609 ptep->pte_low, ptep->pte_high); 649 ptep->pte_low, ptep->pte_high);
610#else 650#else
@@ -612,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
612#endif 652#endif
613} 653}
614 654
655/* This is the "set and update" combo-meal-deal version. */
615static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 656static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
616 pte_t *ptep, pte_t pteval) 657 pte_t *ptep, pte_t pteval)
617{ 658{
@@ -672,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
672} 713}
673 714
674#ifdef CONFIG_X86_PAE 715#ifdef CONFIG_X86_PAE
716/*
717 * With 64-bit PTE values, we need to be careful setting them: if we set 32
718 * bits at a time, the hardware could see a weird half-set entry. These
719 * versions ensure we update all 64 bits at once.
720 */
675static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) 721static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
676{ 722{
677 native_set_pte_atomic(ptep, pte); 723 native_set_pte_atomic(ptep, pte);
@@ -679,13 +725,14 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
679 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 725 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
680} 726}
681 727
682void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 728static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
729 pte_t *ptep)
683{ 730{
684 native_pte_clear(mm, addr, ptep); 731 native_pte_clear(mm, addr, ptep);
685 lguest_pte_update(mm, addr, ptep); 732 lguest_pte_update(mm, addr, ptep);
686} 733}
687 734
688void lguest_pmd_clear(pmd_t *pmdp) 735static void lguest_pmd_clear(pmd_t *pmdp)
689{ 736{
690 lguest_set_pmd(pmdp, __pmd(0)); 737 lguest_set_pmd(pmdp, __pmd(0));
691} 738}
@@ -784,6 +831,14 @@ static void __init lguest_init_IRQ(void)
784 irq_ctx_init(smp_processor_id()); 831 irq_ctx_init(smp_processor_id());
785} 832}
786 833
834/*
835 * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
836 * rather than set them in lguest_init_IRQ we are called here every time an
837 * lguest device needs an interrupt.
838 *
839 * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should
840 * pass that up!
841 */
787void lguest_setup_irq(unsigned int irq) 842void lguest_setup_irq(unsigned int irq)
788{ 843{
789 irq_to_desc_alloc_node(irq, 0); 844 irq_to_desc_alloc_node(irq, 0);
@@ -1298,7 +1353,7 @@ __init void lguest_init(void)
1298 */ 1353 */
1299 switch_to_new_gdt(0); 1354 switch_to_new_gdt(0);
1300 1355
1301 /* As described in head_32.S, we map the first 128M of memory. */ 1356 /* We actually boot with all memory mapped, but let's say 128MB. */
1302 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1357 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1303 1358
1304 /* 1359 /*
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index db6aa95eb054..27eac0faee48 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -102,6 +102,7 @@ send_interrupts:
102 * create one manually here. 102 * create one manually here.
103 */ 103 */
104 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 104 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
105 /* Put eax back the way we found it. */
105 popl %eax 106 popl %eax
106 ret 107 ret
107 108
@@ -125,6 +126,7 @@ ENTRY(lg_restore_fl)
125 jnz send_interrupts 126 jnz send_interrupts
126 /* Again, the normal path has used no extra registers. Clever, huh? */ 127 /* Again, the normal path has used no extra registers. Clever, huh? */
127 ret 128 ret
129/*:*/
128 130
129/* These demark the EIP range where host should never deliver interrupts. */ 131/* These demark the EIP range where host should never deliver interrupts. */
130.global lguest_noirq_start 132.global lguest_noirq_start
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index cd058bc903ff..1e2cb846b3c9 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -217,10 +217,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
217 217
218 /* 218 /*
219 * It's possible the Guest did a NOTIFY hypercall to the 219 * It's possible the Guest did a NOTIFY hypercall to the
220 * Launcher, in which case we return from the read() now. 220 * Launcher.
221 */ 221 */
222 if (cpu->pending_notify) { 222 if (cpu->pending_notify) {
223 /*
 224 * Does it just need to write to a registered
225 * eventfd (ie. the appropriate virtqueue thread)?
226 */
223 if (!send_notify_to_eventfd(cpu)) { 227 if (!send_notify_to_eventfd(cpu)) {
 228 /* OK, we tell the main Launcher. */
224 if (put_user(cpu->pending_notify, user)) 229 if (put_user(cpu->pending_notify, user))
225 return -EFAULT; 230 return -EFAULT;
226 return sizeof(cpu->pending_notify); 231 return sizeof(cpu->pending_notify);
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 787ab4bc09f0..83511eb0923d 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -59,7 +59,7 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
59 case LHCALL_SHUTDOWN: { 59 case LHCALL_SHUTDOWN: {
60 char msg[128]; 60 char msg[128];
61 /* 61 /*
62 * Shutdown is such a trivial hypercall that we do it in four 62 * Shutdown is such a trivial hypercall that we do it in five
63 * lines right here. 63 * lines right here.
64 * 64 *
65 * If the lgread fails, it will call kill_guest() itself; the 65 * If the lgread fails, it will call kill_guest() itself; the
@@ -245,6 +245,10 @@ static void initialize(struct lg_cpu *cpu)
245 * device), the Guest will still see the old page. In practice, this never 245 * device), the Guest will still see the old page. In practice, this never
246 * happens: why would the Guest read a page which it has never written to? But 246 * happens: why would the Guest read a page which it has never written to? But
247 * a similar scenario might one day bite us, so it's worth mentioning. 247 * a similar scenario might one day bite us, so it's worth mentioning.
248 *
249 * Note that if we used a shared anonymous mapping in the Launcher instead of
 250 * mapping /dev/zero private, we wouldn't worry about copy-on-write. And we
251 * need that to switch the Launcher to processes (away from threads) anyway.
248:*/ 252:*/
249 253
250/*H:100 254/*H:100
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index cc000e79c3d1..1401c1ace1ec 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -236,7 +236,7 @@ static void lg_notify(struct virtqueue *vq)
236extern void lguest_setup_irq(unsigned int irq); 236extern void lguest_setup_irq(unsigned int irq);
237 237
238/* 238/*
239 * This routine finds the first virtqueue described in the configuration of 239 * This routine finds the Nth virtqueue described in the configuration of
240 * this device and sets it up. 240 * this device and sets it up.
241 * 241 *
242 * This is kind of an ugly duckling. It'd be nicer to have a standard 242 * This is kind of an ugly duckling. It'd be nicer to have a standard
@@ -244,9 +244,6 @@ extern void lguest_setup_irq(unsigned int irq);
244 * everyone wants to do it differently. The KVM coders want the Guest to 244 * everyone wants to do it differently. The KVM coders want the Guest to
245 * allocate its own pages and tell the Host where they are, but for lguest it's 245 * allocate its own pages and tell the Host where they are, but for lguest it's
246 * simpler for the Host to simply tell us where the pages are. 246 * simpler for the Host to simply tell us where the pages are.
247 *
248 * So we provide drivers with a "find the Nth virtqueue and set it up"
249 * function.
250 */ 247 */
251static struct virtqueue *lg_find_vq(struct virtio_device *vdev, 248static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
252 unsigned index, 249 unsigned index,
@@ -422,7 +419,11 @@ static void add_lguest_device(struct lguest_device_desc *d,
422 419
 423 /* This device's parent is the lguest/ dir. */ 420 /* This device's parent is the lguest/ dir. */
424 ldev->vdev.dev.parent = lguest_root; 421 ldev->vdev.dev.parent = lguest_root;
425 /* We have a unique device index thanks to the dev_index counter. */ 422 /*
423 * The device type comes straight from the descriptor. There's also a
424 * device vendor field in the virtio_device struct, which we leave as
425 * 0.
426 */
426 ldev->vdev.id.device = d->type; 427 ldev->vdev.id.device = d->type;
427 /* 428 /*
428 * We have a simple set of routines for querying the device's 429 * We have a simple set of routines for querying the device's
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 7e92017103dc..b4d3f7ca554f 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -1,9 +1,8 @@
1/*P:200 1/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher
2 * This contains all the /dev/lguest code, whereby the userspace launcher
3 * controls and communicates with the Guest. For example, the first write will 2 * controls and communicates with the Guest. For example, the first write will
4 * tell us the Guest's memory layout, pagetable, entry point and kernel address 3 * tell us the Guest's memory layout and entry point. A read will run the
5 * offset. A read will run the Guest until something happens, such as a signal 4 * Guest until something happens, such as a signal or the Guest doing a NOTIFY
6 * or the Guest doing a NOTIFY out to the Launcher. 5 * out to the Launcher.
7:*/ 6:*/
8#include <linux/uaccess.h> 7#include <linux/uaccess.h>
9#include <linux/miscdevice.h> 8#include <linux/miscdevice.h>
@@ -13,14 +12,41 @@
13#include <linux/file.h> 12#include <linux/file.h>
14#include "lg.h" 13#include "lg.h"
15 14
15/*L:056
16 * Before we move on, let's jump ahead and look at what the kernel does when
17 * it needs to look up the eventfds. That will complete our picture of how we
18 * use RCU.
19 *
20 * The notification value is in cpu->pending_notify: we return true if it went
21 * to an eventfd.
22 */
16bool send_notify_to_eventfd(struct lg_cpu *cpu) 23bool send_notify_to_eventfd(struct lg_cpu *cpu)
17{ 24{
18 unsigned int i; 25 unsigned int i;
19 struct lg_eventfd_map *map; 26 struct lg_eventfd_map *map;
20 27
21 /* lg->eventfds is RCU-protected */ 28 /*
29 * This "rcu_read_lock()" helps track when someone is still looking at
30 * the (RCU-using) eventfds array. It's not actually a lock at all;
31 * indeed it's a noop in many configurations. (You didn't expect me to
32 * explain all the RCU secrets here, did you?)
33 */
22 rcu_read_lock(); 34 rcu_read_lock();
35 /*
36 * rcu_dereference is the counter-side of rcu_assign_pointer(); it
37 * makes sure we don't access the memory pointed to by
38 * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy,
39 * but Alpha allows this! Paul McKenney points out that a really
40 * aggressive compiler could have the same effect:
41 * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html
42 *
43 * So play safe, use rcu_dereference to get the rcu-protected pointer:
44 */
23 map = rcu_dereference(cpu->lg->eventfds); 45 map = rcu_dereference(cpu->lg->eventfds);
46 /*
47 * Simple array search: even if they add an eventfd while we do this,
48 * we'll continue to use the old array and just won't see the new one.
49 */
24 for (i = 0; i < map->num; i++) { 50 for (i = 0; i < map->num; i++) {
25 if (map->map[i].addr == cpu->pending_notify) { 51 if (map->map[i].addr == cpu->pending_notify) {
26 eventfd_signal(map->map[i].event, 1); 52 eventfd_signal(map->map[i].event, 1);
@@ -28,14 +54,43 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu)
28 break; 54 break;
29 } 55 }
30 } 56 }
57 /* We're done with the rcu-protected variable cpu->lg->eventfds. */
31 rcu_read_unlock(); 58 rcu_read_unlock();
59
60 /* If we cleared the notification, it's because we found a match. */
32 return cpu->pending_notify == 0; 61 return cpu->pending_notify == 0;
33} 62}
34 63
64/*L:055
65 * One of the more tricksy tricks in the Linux Kernel is a technique called
66 * Read Copy Update. Since one point of lguest is to teach lguest journeyers
67 * about kernel coding, I use it here. (In case you're curious, other purposes
68 * include learning about virtualization and instilling a deep appreciation for
69 * simplicity and puppies).
70 *
71 * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we
72 * add new eventfds without ever blocking readers from accessing the array.
73 * The current Launcher only does this during boot, so that never happens. But
74 * Read Copy Update is cool, and adding a lock risks damaging even more puppies
75 * than this code does.
76 *
77 * We allocate a brand new one-larger array, copy the old one and add our new
78 * element. Then we make the lg eventfd pointer point to the new array.
79 * That's the easy part: now we need to free the old one, but we need to make
80 * sure no slow CPU somewhere is still looking at it. That's what
 81 * synchronize_rcu does for us: it waits until every CPU has indicated that
 82 * it has moved on, so we know the old one is no longer in use.
83 *
84 * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update.
85 */
35static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) 86static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
36{ 87{
37 struct lg_eventfd_map *new, *old = lg->eventfds; 88 struct lg_eventfd_map *new, *old = lg->eventfds;
38 89
90 /*
91 * We don't allow notifications on value 0 anyway (pending_notify of
92 * 0 means "nothing pending").
93 */
39 if (!addr) 94 if (!addr)
40 return -EINVAL; 95 return -EINVAL;
41 96
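
Stripped of the eventfd details, the whole copy-update dance is short; a kernel-style sketch of the pattern (names invented for illustration):

    #include <linux/errno.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    struct int_array {
        unsigned int num;
        int vals[];
    };

    /* Readers use rcu_read_lock() + rcu_dereference() on this. */
    static struct int_array *array;

    static int add_val_rcu(int val)
    {
        struct int_array *new, *old = array;

        /* A one-larger copy: readers keep using 'old', undisturbed. */
        new = kmalloc(sizeof(*new) + (old->num + 1) * sizeof(int), GFP_KERNEL);
        if (!new)
            return -ENOMEM;
        memcpy(new->vals, old->vals, old->num * sizeof(int));
        new->vals[old->num] = val;
        new->num = old->num + 1;

        /* Publish: barriers make 'new' complete before the pointer flips. */
        rcu_assign_pointer(array, new);

        /* Wait until no CPU can still be reading 'old', then free it. */
        synchronize_rcu();
        kfree(old);
        return 0;
    }
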
@@ -62,12 +117,20 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
62 } 117 }
63 new->num++; 118 new->num++;
64 119
65 /* Now put new one in place. */ 120 /*
121 * Now put new one in place: rcu_assign_pointer() is a fancy way of
122 * doing "lg->eventfds = new", but it uses memory barriers to make
123 * absolutely sure that the contents of "new" written above is nailed
124 * down before we actually do the assignment.
125 *
126 * We have to think about these kinds of things when we're operating on
127 * live data without locks.
128 */
66 rcu_assign_pointer(lg->eventfds, new); 129 rcu_assign_pointer(lg->eventfds, new);
67 130
68 /* 131 /*
69 * We're not in a big hurry. Wait until noone's looking at old 132 * We're not in a big hurry. Wait until noone's looking at old
70 * version, then delete it. 133 * version, then free it.
71 */ 134 */
72 synchronize_rcu(); 135 synchronize_rcu();
73 kfree(old); 136 kfree(old);
@@ -75,6 +138,14 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
75 return 0; 138 return 0;
76} 139}
77 140
141/*L:052
142 * Receiving notifications from the Guest is usually done by attaching a
 143 * particular LHCALL_NOTIFY value to an event file descriptor. The eventfd will
144 * become readable when the Guest does an LHCALL_NOTIFY with that value.
145 *
146 * This is really convenient for processing each virtqueue in a separate
147 * thread.
148 */
78static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) 149static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
79{ 150{
80 unsigned long addr, fd; 151 unsigned long addr, fd;
@@ -86,6 +157,11 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
86 if (get_user(fd, input) != 0) 157 if (get_user(fd, input) != 0)
87 return -EFAULT; 158 return -EFAULT;
88 159
160 /*
161 * Just make sure two callers don't add eventfds at once. We really
162 * only need to lock against callers adding to the same Guest, so using
163 * the Big Lguest Lock is overkill. But this is setup, not a fast path.
164 */
89 mutex_lock(&lguest_lock); 165 mutex_lock(&lguest_lock);
90 err = add_eventfd(lg, addr, fd); 166 err = add_eventfd(lg, addr, fd);
91 mutex_unlock(&lguest_lock); 167 mutex_unlock(&lguest_lock);
@@ -106,6 +182,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
106 if (irq >= LGUEST_IRQS) 182 if (irq >= LGUEST_IRQS)
107 return -EINVAL; 183 return -EINVAL;
108 184
185 /*
186 * Next time the Guest runs, the core code will see if it can deliver
187 * this interrupt.
188 */
109 set_interrupt(cpu, irq); 189 set_interrupt(cpu, irq);
110 return 0; 190 return 0;
111} 191}
@@ -307,10 +387,10 @@ unlock:
307 * The first operation the Launcher does must be a write. All writes 387 * The first operation the Launcher does must be a write. All writes
308 * start with an unsigned long number: for the first write this must be 388 * start with an unsigned long number: for the first write this must be
309 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 389 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use
310 * writes of other values to send interrupts. 390 * writes of other values to send interrupts or set up receipt of notifications.
311 * 391 *
312 * Note that we overload the "offset" in the /dev/lguest file to indicate what 392 * Note that we overload the "offset" in the /dev/lguest file to indicate what
313 * CPU number we're dealing with. Currently this is always 0, since we only 393 * CPU number we're dealing with. Currently this is always 0 since we only
314 * support uniprocessor Guests, but you can see the beginnings of SMP support 394 * support uniprocessor Guests, but you can see the beginnings of SMP support
315 * here. 395 * here.
316 */ 396 */
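
Pulling the protocol together, the Launcher's side of /dev/lguest is roughly this (a sketch: the LHREQ_* values come from linux/lguest_launcher.h, and the INITIALIZE arguments are simplified relative to tell_kernel()):

    #include <err.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include "linux/lguest_launcher.h" /* LHREQ_* values. */

    /*
     * One INITIALIZE write, then read()s that each run the Guest until it
     * NOTIFYs us or takes a signal.
     */
    static void launch_guest(unsigned long base, unsigned long pfnlimit,
                             unsigned long start)
    {
        unsigned long args[] = { LHREQ_INITIALIZE, base, pfnlimit, start };
        unsigned long notify_addr;
        int fd = open("/dev/lguest", O_RDWR);

        if (fd < 0)
            err(1, "Opening /dev/lguest");
        if (write(fd, args, sizeof(args)) < 0)
            err(1, "LHREQ_INITIALIZE");

        for (;;) {
            /* The file offset (here 0) selects the virtual CPU. */
            if (read(fd, &notify_addr, sizeof(notify_addr)) < 0)
                err(1, "Running guest");
            /* ... handle the notification, as handle_output() does ... */
        }
    }
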
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 3da902e4b4cb..a8d0aee3bc0e 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -29,10 +29,10 @@
29/*H:300 29/*H:300
30 * The Page Table Code 30 * The Page Table Code
31 * 31 *
32 * We use two-level page tables for the Guest. If you're not entirely 32 * We use two-level page tables for the Guest, or three-level with PAE. If
33 * comfortable with virtual addresses, physical addresses and page tables then 33 * you're not entirely comfortable with virtual addresses, physical addresses
34 * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with 34 * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page
35 * diagrams!). 35 * Table Handling" (with diagrams!).
36 * 36 *
37 * The Guest keeps page tables, but we maintain the actual ones here: these are 37 * The Guest keeps page tables, but we maintain the actual ones here: these are
38 * called "shadow" page tables. Which is a very Guest-centric name: these are 38 * called "shadow" page tables. Which is a very Guest-centric name: these are
@@ -52,9 +52,8 @@
52:*/ 52:*/
53 53
54/* 54/*
55 * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is 55 * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB)
56 * conveniently placed at the top 4MB, so it uses a separate, complete PTE 56 * or 512 PTE entries with PAE (2MB).
57 * page.
58 */ 57 */
59#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) 58#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
60 59
@@ -81,7 +80,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
81 80
82/*H:320 81/*H:320
83 * The page table code is curly enough to need helper functions to keep it 82 * The page table code is curly enough to need helper functions to keep it
 84 * clear and clean. 83 * clear and clean. The kernel itself provides many of them; that's one
 84 * advantage of insisting that the Guest and Host use the same CONFIG_PAE setting.
85 * 85 *
86 * There are two functions which return pointers to the shadow (aka "real") 86 * There are two functions which return pointers to the shadow (aka "real")
87 * page tables. 87 * page tables.
@@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
155} 155}
156 156
157/* 157/*
158 * These two functions just like the above two, except they access the Guest 158 * These functions are just like the above two, except they access the Guest
159 * page tables. Hence they return a Guest address. 159 * page tables. Hence they return a Guest address.
160 */ 160 */
161static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) 161static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
@@ -165,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
165} 165}
166 166
167#ifdef CONFIG_X86_PAE 167#ifdef CONFIG_X86_PAE
168/* Follow the PGD to the PMD. */
168static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) 169static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
169{ 170{
170 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; 171 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
@@ -172,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
172 return gpage + pmd_index(vaddr) * sizeof(pmd_t); 173 return gpage + pmd_index(vaddr) * sizeof(pmd_t);
173} 174}
174 175
176/* Follow the PMD to the PTE. */
175static unsigned long gpte_addr(struct lg_cpu *cpu, 177static unsigned long gpte_addr(struct lg_cpu *cpu,
176 pmd_t gpmd, unsigned long vaddr) 178 pmd_t gpmd, unsigned long vaddr)
177{ 179{
@@ -181,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu,
181 return gpage + pte_index(vaddr) * sizeof(pte_t); 183 return gpage + pte_index(vaddr) * sizeof(pte_t);
182} 184}
183#else 185#else
186/* Follow the PGD to the PTE (no mid-level for !PAE). */
184static unsigned long gpte_addr(struct lg_cpu *cpu, 187static unsigned long gpte_addr(struct lg_cpu *cpu,
185 pgd_t gpgd, unsigned long vaddr) 188 pgd_t gpgd, unsigned long vaddr)
186{ 189{
@@ -314,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
314 pte_t gpte; 317 pte_t gpte;
315 pte_t *spte; 318 pte_t *spte;
316 319
320 /* Mid level for PAE. */
317#ifdef CONFIG_X86_PAE 321#ifdef CONFIG_X86_PAE
318 pmd_t *spmd; 322 pmd_t *spmd;
319 pmd_t gpmd; 323 pmd_t gpmd;
@@ -391,6 +395,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
391 */ 395 */
392 gpte_ptr = gpte_addr(cpu, gpgd, vaddr); 396 gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
393#endif 397#endif
398
399 /* Read the actual PTE value. */
394 gpte = lgread(cpu, gpte_ptr, pte_t); 400 gpte = lgread(cpu, gpte_ptr, pte_t);
395 401
396 /* If this page isn't in the Guest page tables, we can't page it in. */ 402 /* If this page isn't in the Guest page tables, we can't page it in. */
@@ -507,6 +513,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
507 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) 513 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
508 kill_guest(cpu, "bad stack page %#lx", vaddr); 514 kill_guest(cpu, "bad stack page %#lx", vaddr);
509} 515}
516/*:*/
510 517
511#ifdef CONFIG_X86_PAE 518#ifdef CONFIG_X86_PAE
512static void release_pmd(pmd_t *spmd) 519static void release_pmd(pmd_t *spmd)
@@ -543,7 +550,11 @@ static void release_pgd(pgd_t *spgd)
543} 550}
544 551
545#else /* !CONFIG_X86_PAE */ 552#else /* !CONFIG_X86_PAE */
546/*H:450 If we chase down the release_pgd() code, it looks like this: */ 553/*H:450
554 * If we chase down the release_pgd() code, the non-PAE version looks like
555 * this. The PAE version is almost identical, but instead of calling
556 * release_pte it calls release_pmd(), which looks much like this.
557 */
547static void release_pgd(pgd_t *spgd) 558static void release_pgd(pgd_t *spgd)
548{ 559{
549 /* If the entry's not present, there's nothing to release. */ 560 /* If the entry's not present, there's nothing to release. */
@@ -898,17 +909,21 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
898 /* ... throw it away. */ 909 /* ... throw it away. */
899 release_pgd(lg->pgdirs[pgdir].pgdir + idx); 910 release_pgd(lg->pgdirs[pgdir].pgdir + idx);
900} 911}
912
901#ifdef CONFIG_X86_PAE 913#ifdef CONFIG_X86_PAE
914/* For setting a mid-level, we just throw everything away. It's easy. */
902void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) 915void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
903{ 916{
904 guest_pagetable_clear_all(&lg->cpus[0]); 917 guest_pagetable_clear_all(&lg->cpus[0]);
905} 918}
906#endif 919#endif
907 920
908/* 921/*H:505
909 * Once we know how much memory we have we can construct simple identity (which 922 * To get through boot, we construct simple identity page mappings (which
910 * set virtual == physical) and linear mappings which will get the Guest far 923 * set virtual == physical) and linear mappings which will get the Guest far
 911 * enough into the boot to create its own. 924 * enough into the boot to create its own. The linear mapping simplifies
 925 * the Guest boot, but it makes assumptions about the Guest's PAGE_OFFSET,
 926 * as you'll see.
912 * 927 *
913 * We lay them out of the way, just below the initrd (which is why we need to 928 * We lay them out of the way, just below the initrd (which is why we need to
914 * know its size here). 929 * know its size here).
@@ -944,6 +959,10 @@ static unsigned long setup_pagetables(struct lguest *lg,
944 linear = (void *)pgdir - linear_pages * PAGE_SIZE; 959 linear = (void *)pgdir - linear_pages * PAGE_SIZE;
945 960
946#ifdef CONFIG_X86_PAE 961#ifdef CONFIG_X86_PAE
962 /*
963 * And the single mid page goes below that. We only use one, but
964 * that's enough to map 1G, which definitely gets us through boot.
965 */
947 pmds = (void *)linear - PAGE_SIZE; 966 pmds = (void *)linear - PAGE_SIZE;
948#endif 967#endif
949 /* 968 /*
@@ -957,13 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg,
957 return -EFAULT; 976 return -EFAULT;
958 } 977 }
959 978
979#ifdef CONFIG_X86_PAE
960 /* 980 /*
961 * The top level points to the linear page table pages above. 981 * Make the Guest PMD entries point to the corresponding place in the
962 * We setup the identity and linear mappings here. 982 * linear mapping (up to one page worth of PMD).
963 */ 983 */
964#ifdef CONFIG_X86_PAE
965 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; 984 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
966 i += PTRS_PER_PTE, j++) { 985 i += PTRS_PER_PTE, j++) {
986 /* FIXME: native_set_pmd is overkill here. */
967 native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) 987 native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
968 - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); 988 - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
969 989
@@ -971,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg,
971 return -EFAULT; 991 return -EFAULT;
972 } 992 }
973 993
994 /* One PGD entry, pointing to that PMD page. */
974 set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); 995 set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
996 /* Copy it in as the first PGD entry (ie. addresses 0-1G). */
975 if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) 997 if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
976 return -EFAULT; 998 return -EFAULT;
999 /*
1000 * And the third PGD entry (ie. addresses 3G-4G).
1001 *
1002 * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000.
1003 */
977 if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) 1004 if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
978 return -EFAULT; 1005 return -EFAULT;
979#else 1006#else
1007 /*
1008 * The top level points to the linear page table pages above.
1009 * We setup the identity and linear mappings here.
1010 */
980 phys_linear = (unsigned long)linear - mem_base; 1011 phys_linear = (unsigned long)linear - mem_base;
981 for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { 1012 for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
982 pgd_t pgd; 1013 pgd_t pgd;
1014 /*
1015 * Create a PGD entry which points to the right part of the
1016 * linear PTE pages.
1017 */
983 pgd = __pgd((phys_linear + i * sizeof(pte_t)) | 1018 pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
984 (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); 1019 (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
985 1020
1021 /*
1022 * Copy it into the PGD page at 0 and PAGE_OFFSET.
1023 */
986 if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) 1024 if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
987 || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) 1025 || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
988 + i / PTRS_PER_PTE], 1026 + i / PTRS_PER_PTE],
@@ -992,8 +1030,8 @@ static unsigned long setup_pagetables(struct lguest *lg,
992#endif 1030#endif
993 1031
994 /* 1032 /*
995 * We return the top level (guest-physical) address: remember where 1033 * We return the top level (guest-physical) address: we remember where
996 * this is. 1034 * this is to write it into lguest_data when the Guest initializes.
997 */ 1035 */
998 return (unsigned long)pgdir - mem_base; 1036 return (unsigned long)pgdir - mem_base;
999} 1037}
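
The sizing arithmetic is worth making explicit; a sketch for the non-PAE case described above (1024 PTEs per page, so each linear PTE page covers 4MB and costs one PGD entry):

    #define PTRS_PER_PTE 1024 /* Non-PAE: 1024 4-byte entries per page. */

    /* How many linear PTE pages does it take to map 'mapped_pages' pages? */
    static unsigned long linear_pages_needed(unsigned long mapped_pages)
    {
        return (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
    }
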
@@ -1031,7 +1069,9 @@ int init_guest_pagetable(struct lguest *lg)
1031 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); 1069 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
1032 if (!lg->pgdirs[0].pgdir) 1070 if (!lg->pgdirs[0].pgdir)
1033 return -ENOMEM; 1071 return -ENOMEM;
1072
1034#ifdef CONFIG_X86_PAE 1073#ifdef CONFIG_X86_PAE
1074 /* For PAE, we also create the initial mid-level. */
1035 pgd = lg->pgdirs[0].pgdir; 1075 pgd = lg->pgdirs[0].pgdir;
1036 pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); 1076 pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
1037 if (!pmd_table) 1077 if (!pmd_table)
@@ -1040,11 +1080,13 @@ int init_guest_pagetable(struct lguest *lg)
1040 set_pgd(pgd + SWITCHER_PGD_INDEX, 1080 set_pgd(pgd + SWITCHER_PGD_INDEX,
1041 __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 1081 __pgd(__pa(pmd_table) | _PAGE_PRESENT));
1042#endif 1082#endif
1083
1084 /* This is the current page table. */
1043 lg->cpus[0].cpu_pgd = 0; 1085 lg->cpus[0].cpu_pgd = 0;
1044 return 0; 1086 return 0;
1045} 1087}
1046 1088
1047/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 1089/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
1048void page_table_guest_data_init(struct lg_cpu *cpu) 1090void page_table_guest_data_init(struct lg_cpu *cpu)
1049{ 1091{
1050 /* We get the kernel address: above this is all kernel memory. */ 1092 /* We get the kernel address: above this is all kernel memory. */
@@ -1105,12 +1147,16 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
1105 pmd_t switcher_pmd; 1147 pmd_t switcher_pmd;
1106 pmd_t *pmd_table; 1148 pmd_t *pmd_table;
1107 1149
1150 /* FIXME: native_set_pmd is overkill here. */
1108 native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> 1151 native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
1109 PAGE_SHIFT, PAGE_KERNEL_EXEC)); 1152 PAGE_SHIFT, PAGE_KERNEL_EXEC));
1110 1153
1154 /* Figure out where the pmd page is, by reading the PGD, and converting
1155 * it to a virtual address. */
1111 pmd_table = __va(pgd_pfn(cpu->lg-> 1156 pmd_table = __va(pgd_pfn(cpu->lg->
1112 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) 1157 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
1113 << PAGE_SHIFT); 1158 << PAGE_SHIFT);
1159 /* Now write it into the shadow page table. */
1114 native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); 1160 native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
1115#else 1161#else
1116 pgd_t switcher_pgd; 1162 pgd_t switcher_pgd;
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 96f7d88ec7f8..6ae388849a3b 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -187,7 +187,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
187 * also simplify copy_in_guest_info(). Note that we'd still need to restore 187 * also simplify copy_in_guest_info(). Note that we'd still need to restore
188 * things when we exit to Launcher userspace, but that's fairly easy. 188 * things when we exit to Launcher userspace, but that's fairly easy.
189 * 189 *
190 * We could also try using this hooks for PGE, but that might be too expensive. 190 * We could also try using these hooks for PGE, but that might be too expensive.
191 * 191 *
192 * The hooks were designed for KVM, but we can also put them to good use. 192 * The hooks were designed for KVM, but we can also put them to good use.
193:*/ 193:*/
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S
index 6dec09793836..40634b0db9f7 100644
--- a/drivers/lguest/x86/switcher_32.S
+++ b/drivers/lguest/x86/switcher_32.S
@@ -1,7 +1,7 @@
1/*P:900 1/*P:900
2 * This is the Switcher: code which sits at 0xFFC00000 astride both the 2 * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride
3 * Host and Guest to do the low-level Guest<->Host switch. It is as simple as 3 * both the Host and Guest to do the low-level Guest<->Host switch. It is as
4 * it can be made, but it's naturally very specific to x86. 4 * simple as it can be made, but it's naturally very specific to x86.
5 * 5 *
6 * You have now completed Preparation. If this has whet your appetite; if you 6 * You have now completed Preparation. If this has whet your appetite; if you
7 * are feeling invigorated and refreshed then the next, more challenging stage 7 * are feeling invigorated and refreshed then the next, more challenging stage